Introduction to Correlation
Correlation is one of the most fundamental concepts in statistics and data science. It measures the strength and direction of the relationship between two variables. Understanding correlation helps data scientists identify patterns, make predictions, and uncover insights hidden within data.
Key Concepts
- Correlation: A statistical measure that describes the extent to which two variables change together
- Direction: Positive (move together) or negative (move opposite)
- Strength: How strongly the variables are related — the coefficient ranges from -1 to +1, and strength is its absolute value (0 = no relationship, 1 = perfect)
- Causation: Correlation does not imply causation!
1. What is Correlation?
The Concept
Correlation measures how two variables move in relation to each other:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# --- Visual tour of the correlation spectrum: six scatter plots from r = +1 to r = -1 ---
# Generate example data
np.random.seed(42)
n = 100
# Perfect positive correlation
x_pos = np.linspace(0, 10, n)
y_pos = 2 * x_pos + 1
# Strong positive correlation (with noise)
x_strong = np.random.randn(n)
y_strong = 2 * x_strong + np.random.randn(n) * 0.5
# Weak positive correlation
x_weak = np.random.randn(n)
y_weak = 0.3 * x_weak + np.random.randn(n) * 1.2
# No correlation
x_none = np.random.randn(n)
y_none = np.random.randn(n)
# Strong negative correlation
x_neg = np.random.randn(n)
y_neg = -2 * x_neg + np.random.randn(n) * 0.5
# Perfect negative correlation
x_perfect_neg = np.linspace(0, 10, n)
y_perfect_neg = -2 * x_perfect_neg + 10
# Visualize different correlation strengths
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# Panel [0, 0]: noiseless line, so r is exactly 1 and the title hard-codes it
axes[0, 0].scatter(x_pos, y_pos, alpha=0.7)
axes[0, 0].set_title(f'Perfect Positive Correlation (r = 1.00)')
axes[0, 0].set_xlabel('X')
axes[0, 0].set_ylabel('Y')
axes[0, 0].grid(True, alpha=0.3)
# Panel [0, 1]: strong positive — r computed from the noisy sample
axes[0, 1].scatter(x_strong, y_strong, alpha=0.7)
corr_strong = np.corrcoef(x_strong, y_strong)[0, 1]
axes[0, 1].set_title(f'Strong Positive Correlation (r = {corr_strong:.2f})')
axes[0, 1].set_xlabel('X')
axes[0, 1].set_ylabel('Y')
axes[0, 1].grid(True, alpha=0.3)
# Panel [0, 2]: weak positive — small slope, large noise
axes[0, 2].scatter(x_weak, y_weak, alpha=0.7)
corr_weak = np.corrcoef(x_weak, y_weak)[0, 1]
axes[0, 2].set_title(f'Weak Positive Correlation (r = {corr_weak:.2f})')
axes[0, 2].set_xlabel('X')
axes[0, 2].set_ylabel('Y')
axes[0, 2].grid(True, alpha=0.3)
# Panel [1, 0]: independent samples, so r is near 0
axes[1, 0].scatter(x_none, y_none, alpha=0.7)
corr_none = np.corrcoef(x_none, y_none)[0, 1]
axes[1, 0].set_title(f'No Correlation (r = {corr_none:.2f})')
axes[1, 0].set_xlabel('X')
axes[1, 0].set_ylabel('Y')
axes[1, 0].grid(True, alpha=0.3)
# Panel [1, 1]: strong negative — mirrors the strong positive case
axes[1, 1].scatter(x_neg, y_neg, alpha=0.7)
corr_neg = np.corrcoef(x_neg, y_neg)[0, 1]
axes[1, 1].set_title(f'Strong Negative Correlation (r = {corr_neg:.2f})')
axes[1, 1].set_xlabel('X')
axes[1, 1].set_ylabel('Y')
axes[1, 1].grid(True, alpha=0.3)
# Panel [1, 2]: noiseless negative line, r hard-coded as -1
axes[1, 2].scatter(x_perfect_neg, y_perfect_neg, alpha=0.7)
axes[1, 2].set_title(f'Perfect Negative Correlation (r = -1.00)')
axes[1, 2].set_xlabel('X')
axes[1, 2].set_ylabel('Y')
axes[1, 2].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Understanding Correlation Values
Correlation coefficient (r) ranges from -1 to +1:
def explain_correlation_values():
    """Print a reference table mapping correlation coefficients to plain-language interpretations."""
    # (coefficient, meaning) rows, ordered from r = +1 down to r = -1
    reference_rows = (
        (1.00, "Perfect positive correlation: Variables move exactly together"),
        (0.80, "Strong positive correlation: Clear upward trend"),
        (0.50, "Moderate positive correlation: Noticeable positive relationship"),
        (0.20, "Weak positive correlation: Slight tendency to move together"),
        (0.00, "No correlation: Variables are independent"),
        (-0.20, "Weak negative correlation: Slight tendency to move opposite"),
        (-0.50, "Moderate negative correlation: Noticeable negative relationship"),
        (-0.80, "Strong negative correlation: Clear downward trend"),
        (-1.00, "Perfect negative correlation: Variables move exactly opposite"),
    )
    header_lines = (
        "Correlation Coefficient Interpretation",
        "=" * 60,
        "| r-value | Interpretation",
        "|---------|----------------",
    )
    for header_line in header_lines:
        print(header_line)
    for coefficient, meaning in reference_rows:
        print(f"| {coefficient:7.2f} | {meaning}")
explain_correlation_values()
2. Types of Correlation
Pearson Correlation
Pearson correlation measures linear relationships between continuous variables:
def pearson_correlation(x, y):
    """Calculate the Pearson correlation coefficient between two samples.

    Parameters
    ----------
    x, y : array-like of equal length
        Numeric samples; any sequence convertible by ``np.asarray``
        (the original implementation required NumPy arrays — plain
        lists failed at the element-wise subtraction).

    Returns
    -------
    float
        r in [-1, 1]; ``nan`` when either sample has zero variance
        (correlation is undefined for a constant input).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x_dev = x - x.mean()
    y_dev = y - y.mean()
    numerator = np.sum(x_dev * y_dev)
    denominator = np.sqrt(np.sum(x_dev**2) * np.sum(y_dev**2))
    if denominator == 0:
        # Explicit nan instead of a 0/0 runtime warning.
        return float('nan')
    return numerator / denominator
# Demonstrate Pearson correlation with various relationships
np.random.seed(42)
n = 200
# Generate different types of relationships
x = np.linspace(-3, 3, n)
linear = 2 * x + 1 + np.random.randn(n) * 0.5
quadratic = x**2 + np.random.randn(n) * 0.5
exponential = np.exp(x/2) + np.random.randn(n) * 0.5
sine = np.sin(x * 2) + np.random.randn(n) * 0.2
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Pair each generated series with a display title for the 2x2 grid
relationships = [
    (linear, "Linear Relationship"),
    (quadratic, "Quadratic Relationship"),
    (exponential, "Exponential Relationship"),
    (sine, "Periodic Relationship")
]
for idx, (y_data, title) in enumerate(relationships):
    # Map the flat index onto the 2x2 axes grid
    ax = axes[idx // 2, idx % 2]
    ax.scatter(x, y_data, alpha=0.6)
    r_pearson = pearson_correlation(x, y_data)
    # NOTE(review): r_scipy is computed but never used — presumably meant as a
    # cross-check of the manual implementation; confirm or remove.
    r_scipy = stats.pearsonr(x, y_data)[0]
    ax.set_title(f'{title}\nPearson r = {r_pearson:.3f}')
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Important Note:")
print("Pearson correlation only captures LINEAR relationships!")
print("Quadratic, exponential, and periodic relationships can have low Pearson correlation")
print("even though there is a strong relationship.")
Spearman Rank Correlation
Spearman correlation measures monotonic relationships (not necessarily linear):
def spearman_correlation(x, y):
    """Calculate the Spearman rank correlation coefficient.

    Spearman's rho is, by definition, the Pearson correlation computed on
    the ranks of the data. Uses ``np.corrcoef`` on the ranks instead of
    the hand-rolled ``pearson_correlation`` helper, so this function is
    self-contained (standard library over hand-rolled).

    Parameters
    ----------
    x, y : array-like of equal length
        Numeric samples.

    Returns
    -------
    float
        rho in [-1, 1].
    """
    # Rank the data; ties get the average of their ranks.
    x_rank = stats.rankdata(x)
    y_rank = stats.rankdata(y)
    # Pearson correlation of the ranks == Spearman's rho.
    return np.corrcoef(x_rank, y_rank)[0, 1]
# Generate data with monotonic but non-linear relationships
np.random.seed(42)
n = 200
x = np.linspace(0, 3, n)
# Exponential relationship
y_exp = np.exp(x) + np.random.randn(n) * 0.5
# Power relationship
y_power = x**3 + np.random.randn(n) * 0.5
# Logarithmic relationship (the +0.5 offset keeps the log argument positive)
y_log = np.log(x + 0.5) + np.random.randn(n) * 0.1
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
monotonic_relationships = [
    (y_exp, "Exponential (Monotonic)"),
    (y_power, "Power (Monotonic)"),
    (y_log, "Logarithmic (Monotonic)")
]
for idx, (y_data, title) in enumerate(monotonic_relationships):
    ax = axes[idx]
    ax.scatter(x, y_data, alpha=0.6)
    # Show both coefficients so the reader can compare them side by side
    r_pearson = stats.pearsonr(x, y_data)[0]
    r_spearman = stats.spearmanr(x, y_data)[0]
    ax.set_title(f'{title}\nPearson r = {r_pearson:.3f}\nSpearman ρ = {r_spearman:.3f}')
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("\nPearson vs Spearman Correlation:")
print("=" * 50)
print("Pearson: Captures LINEAR relationships")
print("Spearman: Captures MONOTONIC relationships (increasing or decreasing)")
print("Spearman is more robust to outliers and non-linear monotonic relationships")
Kendall's Tau
Kendall's Tau is another rank-based correlation measure:
def kendall_tau(x, y):
    """Calculate Kendall's tau-a correlation coefficient.

    Counts concordant minus discordant pairs over all n*(n-1)/2 pairs.
    Tied pairs contribute zero; because the denominator is the total pair
    count, this is tau-a (scipy's ``kendalltau`` computes tau-b, which
    differs when ties are present — the two agree on tie-free data).

    Replaces the original O(n^2) pure-Python double loop with a vectorized
    NumPy computation: identical result, but the pairwise comparisons run
    in C. Still O(n^2) time and memory — fine for the small n used here.

    Parameters
    ----------
    x, y : array-like of equal length n >= 2
        Numeric samples.

    Returns
    -------
    float
        tau in [-1, 1].
    """
    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x)
    # sign(xi - xj) * sign(yi - yj) is +1 for a concordant pair,
    # -1 for a discordant pair, and 0 for a tie in either variable.
    pair_signs = np.sign(np.subtract.outer(x, x)) * np.sign(np.subtract.outer(y, y))
    # Sum the strict upper triangle so each unordered pair counts once;
    # this equals (concordant - discordant) from the original loop.
    net = np.triu(pair_signs, k=1).sum()
    tau = net / (n * (n - 1) / 2)
    return tau
# --- Compare the three correlation measures on a cubic (monotonic) relationship ---
# Compare correlation measures
np.random.seed(42)
n = 50
x = np.random.randn(n)
y = x**3 + np.random.randn(n) * 0.3
print("Correlation Comparison")
print("=" * 50)
print(f"Pearson r: {stats.pearsonr(x, y)[0]:.4f}")
print(f"Spearman ρ: {stats.spearmanr(x, y)[0]:.4f}")
print(f"Kendall τ: {stats.kendalltau(x, y)[0]:.4f}")
# Cross-check the hand-written implementation against scipy's
print(f"Custom Kendall: {kendall_tau(x, y):.4f}")
print("\nWhen to use each:")
print(" • Pearson: Linear relationships, normally distributed data")
print(" • Spearman: Monotonic relationships, ordinal data, outliers")
print(" • Kendall: Small samples, ordinal data, more interpretable")
3. Correlation Matrix
Understanding Correlation Matrices
# Create a correlation matrix for multiple variables
np.random.seed(42)
n = 500
# Create correlated data — temperature drives the other three variables
temperature = np.random.normal(20, 5, n)
ice_cream_sales = 100 + 5 * temperature + np.random.normal(0, 10, n)
crime_rate = 50 - 0.5 * temperature + np.random.normal(0, 5, n)
beach_visitors = 200 + 8 * temperature + np.random.normal(0, 20, n)
# Create DataFrame
df = pd.DataFrame({
    'Temperature': temperature,
    'Ice_Cream_Sales': ice_cream_sales,
    'Crime_Rate': crime_rate,
    'Beach_Visitors': beach_visitors
})
# Calculate correlation matrix (pairwise Pearson by default)
correlation_matrix = df.corr()
print("Correlation Matrix")
print("=" * 50)
print(correlation_matrix.round(4))
# Visualize correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.3f', cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix Heatmap')
plt.tight_layout()
plt.show()
Interpreting Correlation Matrices
def interpret_correlation_matrix(df, corr_matrix):
    """Print a plain-language reading of a correlation matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data (not used directly; kept for interface compatibility).
    corr_matrix : pandas.DataFrame
        Square correlation matrix, e.g. ``df.corr()``.

    Prints the three strongest positive and negative pairs (each with a
    strength label) and flags pairs with |r| > 0.8 as potential
    multicollinearity issues. Safe when one of the pair lists is empty
    (the original re-scanned the matrix three times and referenced the
    loop variable after the loop, which raised NameError for an empty
    category).
    """
    print("Correlation Matrix Interpretation")
    print("=" * 60)
    print()
    cols = corr_matrix.columns
    # Single scan of the strict upper triangle: each unordered pair once.
    positive_pairs = []
    negative_pairs = []
    high_corr_pairs = []
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            corr = corr_matrix.iloc[i, j]
            if corr > 0:
                positive_pairs.append((corr, cols[i], cols[j]))
            elif corr < 0:
                negative_pairs.append((corr, cols[i], cols[j]))
            if abs(corr) > 0.8:
                high_corr_pairs.append((corr, cols[i], cols[j]))
    # Largest r first.
    positive_pairs.sort(reverse=True)
    print("Strongest Positive Correlations:")
    for corr, var1, var2 in positive_pairs[:3]:
        print(f" {var1} ↔ {var2}: {corr:.3f}")
        # Label the strength of this specific pair.
        if corr > 0.7:
            print(" → Very strong positive relationship")
        elif corr > 0.5:
            print(" → Strong positive relationship")
        elif corr > 0.3:
            print(" → Moderate positive relationship")
        else:
            print(" → Weak positive relationship")
    # Most negative r first.
    negative_pairs.sort()
    print("\nStrongest Negative Correlations:")
    for corr, var1, var2 in negative_pairs[:3]:
        print(f" {var1} ↔ {var2}: {corr:.3f}")
        if corr < -0.7:
            print(" → Very strong negative relationship")
        elif corr < -0.5:
            print(" → Strong negative relationship")
        elif corr < -0.3:
            print(" → Moderate negative relationship")
        else:
            print(" → Weak negative relationship")
    # Check for multicollinearity (|r| > 0.8 collected during the scan).
    print("\nPotential Multicollinearity Issues:")
    for corr, var1, var2 in high_corr_pairs:
        print(f" ⚠️ {var1} and {var2}: {corr:.3f}")
# Run the interpretation on the matrix computed in the previous section
interpret_correlation_matrix(df, correlation_matrix)
4. Real-World Examples
Example 1: Economic Indicators
def economic_correlation_demo():
    """Analyze correlations between economic indicators"""
    # Create synthetic economic data
    np.random.seed(42)
    years = np.arange(2010, 2025)
    n = len(years)
    # Economic indicators — unemployment falls, inflation and stocks rise
    # with GDP growth by construction, so the correlations below are built in.
    gdp_growth = np.random.normal(2.5, 1, n)
    unemployment = 6 - 0.8 * gdp_growth + np.random.normal(0, 0.5, n)
    inflation = 2 + 0.5 * gdp_growth + np.random.normal(0, 0.3, n)
    stock_market = 100 + 15 * gdp_growth + np.random.normal(0, 5, n)
    # Create DataFrame
    df = pd.DataFrame({
        'Year': years,
        'GDP_Growth': gdp_growth,
        'Unemployment': unemployment,
        'Inflation': inflation,
        'Stock_Market': stock_market
    })
    # Correlation matrix (Year excluded — it is a label, not an indicator)
    corr = df[['GDP_Growth', 'Unemployment', 'Inflation', 'Stock_Market']].corr()
    print("Economic Indicators Correlation Analysis")
    print("=" * 60)
    print("\nCorrelation Matrix:")
    print(corr.round(4))
    # Visualize
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    # GDP vs Unemployment
    axes[0, 0].scatter(df['GDP_Growth'], df['Unemployment'], alpha=0.7)
    axes[0, 0].set_xlabel('GDP Growth (%)')
    axes[0, 0].set_ylabel('Unemployment (%)')
    axes[0, 0].set_title(f'GDP vs Unemployment\nr = {corr.loc["GDP_Growth", "Unemployment"]:.3f}')
    axes[0, 0].grid(True, alpha=0.3)
    # GDP vs Stock Market
    axes[0, 1].scatter(df['GDP_Growth'], df['Stock_Market'], alpha=0.7)
    axes[0, 1].set_xlabel('GDP Growth (%)')
    axes[0, 1].set_ylabel('Stock Market Index')
    axes[0, 1].set_title(f'GDP vs Stock Market\nr = {corr.loc["GDP_Growth", "Stock_Market"]:.3f}')
    axes[0, 1].grid(True, alpha=0.3)
    # Unemployment vs Inflation (Phillips Curve)
    axes[1, 0].scatter(df['Unemployment'], df['Inflation'], alpha=0.7)
    axes[1, 0].set_xlabel('Unemployment (%)')
    axes[1, 0].set_ylabel('Inflation (%)')
    axes[1, 0].set_title(f'Phillips Curve\nr = {corr.loc["Unemployment", "Inflation"]:.3f}')
    axes[1, 0].grid(True, alpha=0.3)
    # Correlation heatmap
    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0,
                ax=axes[1, 1], fmt='.3f')
    axes[1, 1].set_title('Correlation Heatmap')
    plt.tight_layout()
    plt.show()
    # Economic interpretation
    print("\nEconomic Interpretations:")
    print("-" * 40)
    print("• GDP Growth vs Unemployment: Negative correlation")
    print(" (When economy grows, unemployment tends to decrease)")
    print()
    print("• GDP Growth vs Stock Market: Positive correlation")
    print(" (Economic growth typically boosts stock markets)")
    print()
    print("• Phillips Curve: Negative correlation between unemployment and inflation")
    print(" (Lower unemployment often leads to higher inflation)")
economic_correlation_demo()
Example 2: Health and Lifestyle Data
def health_correlation_demo():
    """Analyze correlations between health and lifestyle factors"""
    np.random.seed(42)
    n = 1000
    # Generate health data — exponential gives a right-skewed exercise
    # distribution; clip keeps values in a plausible range.
    exercise_hours = np.random.exponential(2, n)
    exercise_hours = np.clip(exercise_hours, 0, 15)
    calories_daily = 2000 + 100 * exercise_hours + np.random.normal(0, 200, n)
    # BMI falls with exercise and rises slightly with calorie intake
    bmi = 25 - 0.5 * exercise_hours + 0.0005 * calories_daily + np.random.normal(0, 2, n)
    bmi = np.clip(bmi, 18, 35)
    blood_pressure_systolic = 120 - 0.5 * exercise_hours + 0.2 * bmi + np.random.normal(0, 5, n)
    blood_pressure_systolic = np.clip(blood_pressure_systolic, 100, 160)
    sleep_hours = 7 + 0.1 * exercise_hours - 0.05 * bmi + np.random.normal(0, 1, n)
    sleep_hours = np.clip(sleep_hours, 4, 10)
    # Create DataFrame
    df = pd.DataFrame({
        'Exercise_Hours': exercise_hours,
        'Calories_Daily': calories_daily,
        'BMI': bmi,
        'Blood_Pressure': blood_pressure_systolic,
        'Sleep_Hours': sleep_hours
    })
    # Calculate correlations
    corr = df.corr()
    print("Health and Lifestyle Correlation Analysis")
    print("=" * 60)
    print("\nCorrelation Matrix:")
    print(corr.round(3))
    # Visualize
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    # Exercise vs BMI
    axes[0, 0].scatter(df['Exercise_Hours'], df['BMI'], alpha=0.5)
    axes[0, 0].set_xlabel('Exercise Hours per Week')
    axes[0, 0].set_ylabel('BMI')
    axes[0, 0].set_title(f'Exercise vs BMI\nr = {corr.loc["Exercise_Hours", "BMI"]:.3f}')
    axes[0, 0].grid(True, alpha=0.3)
    # Exercise vs Blood Pressure
    axes[0, 1].scatter(df['Exercise_Hours'], df['Blood_Pressure'], alpha=0.5)
    axes[0, 1].set_xlabel('Exercise Hours per Week')
    axes[0, 1].set_ylabel('Blood Pressure (Systolic)')
    axes[0, 1].set_title(f'Exercise vs Blood Pressure\nr = {corr.loc["Exercise_Hours", "Blood_Pressure"]:.3f}')
    axes[0, 1].grid(True, alpha=0.3)
    # BMI vs Blood Pressure
    axes[1, 0].scatter(df['BMI'], df['Blood_Pressure'], alpha=0.5)
    axes[1, 0].set_xlabel('BMI')
    axes[1, 0].set_ylabel('Blood Pressure (Systolic)')
    axes[1, 0].set_title(f'BMI vs Blood Pressure\nr = {corr.loc["BMI", "Blood_Pressure"]:.3f}')
    axes[1, 0].grid(True, alpha=0.3)
    # Exercise vs Sleep
    axes[1, 1].scatter(df['Exercise_Hours'], df['Sleep_Hours'], alpha=0.5)
    axes[1, 1].set_xlabel('Exercise Hours per Week')
    axes[1, 1].set_ylabel('Sleep Hours per Night')
    axes[1, 1].set_title(f'Exercise vs Sleep\nr = {corr.loc["Exercise_Hours", "Sleep_Hours"]:.3f}')
    axes[1, 1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    # Health insights
    print("\nHealth Insights:")
    print("-" * 40)
    print("• Exercise is negatively correlated with BMI")
    print(" (More exercise → Lower BMI)")
    print()
    print("• Exercise is negatively correlated with blood pressure")
    print(" (Regular exercise helps maintain healthy blood pressure)")
    print()
    print("• BMI is positively correlated with blood pressure")
    print(" (Higher BMI → Higher blood pressure risk)")
    print()
    print("• Exercise shows positive correlation with sleep quality")
    print(" (Active people tend to sleep better)")
health_correlation_demo()
Example 3: Marketing and Sales Data
def marketing_correlation_demo():
    """Analyze correlations between marketing spend and sales"""
    np.random.seed(42)
    n = 200
    # Marketing spends (uniform and independent across channels)
    tv_spend = np.random.uniform(0, 100, n)
    social_spend = np.random.uniform(0, 80, n)
    email_spend = np.random.uniform(0, 50, n)
    search_spend = np.random.uniform(0, 60, n)
    # Sales with diminishing returns and interactions
    # (negative quadratic terms = diminishing returns; tv*social = synergy)
    sales = (2.5 * tv_spend +
             3.2 * social_spend +
             1.8 * email_spend +
             2.1 * search_spend -
             0.01 * tv_spend**2 -
             0.02 * social_spend**2 +
             0.005 * tv_spend * social_spend +
             100 +
             np.random.normal(0, 15, n))
    # Create DataFrame
    df = pd.DataFrame({
        'TV_Spend': tv_spend,
        'Social_Spend': social_spend,
        'Email_Spend': email_spend,
        'Search_Spend': search_spend,
        'Sales': sales
    })
    # Calculate correlations
    corr = df.corr()
    print("Marketing Spend Correlation Analysis")
    print("=" * 60)
    print("\nCorrelation with Sales:")
    print(corr['Sales'].drop('Sales').sort_values(ascending=False).round(3))
    # Visualize
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    # TV Spend vs Sales
    axes[0, 0].scatter(df['TV_Spend'], df['Sales'], alpha=0.6)
    axes[0, 0].set_xlabel('TV Advertising Spend ($)')
    axes[0, 0].set_ylabel('Sales ($)')
    axes[0, 0].set_title(f'TV Spend vs Sales\nr = {corr.loc["TV_Spend", "Sales"]:.3f}')
    axes[0, 0].grid(True, alpha=0.3)
    # Social Spend vs Sales
    axes[0, 1].scatter(df['Social_Spend'], df['Sales'], alpha=0.6)
    axes[0, 1].set_xlabel('Social Media Spend ($)')
    axes[0, 1].set_ylabel('Sales ($)')
    axes[0, 1].set_title(f'Social Media vs Sales\nr = {corr.loc["Social_Spend", "Sales"]:.3f}')
    axes[0, 1].grid(True, alpha=0.3)
    # Email Spend vs Sales
    axes[1, 0].scatter(df['Email_Spend'], df['Sales'], alpha=0.6)
    axes[1, 0].set_xlabel('Email Marketing Spend ($)')
    axes[1, 0].set_ylabel('Sales ($)')
    axes[1, 0].set_title(f'Email Marketing vs Sales\nr = {corr.loc["Email_Spend", "Sales"]:.3f}')
    axes[1, 0].grid(True, alpha=0.3)
    # Search Spend vs Sales
    axes[1, 1].scatter(df['Search_Spend'], df['Sales'], alpha=0.6)
    axes[1, 1].set_xlabel('Search Engine Spend ($)')
    axes[1, 1].set_ylabel('Sales ($)')
    axes[1, 1].set_title(f'Search Spend vs Sales\nr = {corr.loc["Search_Spend", "Sales"]:.3f}')
    axes[1, 1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    # Marketing insights
    print("\nMarketing Insights:")
    print("-" * 40)
    # Find best channel
    best_channel = corr['Sales'].drop('Sales').idxmax()
    best_corr = corr['Sales'].drop('Sales').max()
    print(f"• Most correlated channel: {best_channel} (r = {best_corr:.3f})")
    # Check for multicollinearity
    print("\n• Potential synergy effects:")
    # Iterate spend columns only — "- 1" skips the last column, Sales
    for i in range(len(df.columns) - 1):
        for j in range(i + 1, len(df.columns) - 1):
            var1 = df.columns[i]
            var2 = df.columns[j]
            channel_corr = corr.loc[var1, var2]
            if abs(channel_corr) > 0.6:
                print(f" {var1} and {var2} are correlated (r = {channel_corr:.3f})")
                print(f" → Consider combined campaigns")
marketing_correlation_demo()
5. Correlation vs Causation
The Fundamental Principle
def correlation_vs_causation_demo():
    """Demonstrate that correlation does not imply causation"""
    np.random.seed(42)
    n = 365 # One year of daily data
    # Common confounding variable: summer (temperature)
    day_of_year = np.arange(1, n + 1)
    # Seasonal sinusoid plus daily noise
    temperature = 15 + 10 * np.sin(2 * np.pi * day_of_year / 365) + np.random.randn(n) * 2
    # Variables that are both caused by summer
    ice_cream_sales = 50 + 5 * temperature + np.random.randn(n) * 10
    sunburn_cases = 10 + 0.5 * temperature + np.random.randn(n) * 5
    # Correlation without causation — both series share the temperature driver
    corr_ice_sunburn = np.corrcoef(ice_cream_sales, sunburn_cases)[0, 1]
    print("Spurious Correlation Example")
    print("=" * 60)
    print(f"Ice Cream Sales vs Sunburn Cases")
    print(f"Correlation: r = {corr_ice_sunburn:.3f}")
    print()
    print("Does ice cream cause sunburn?")
    print("NO! Both are caused by a third factor: SUMMER")
    print("Summer = more ice cream AND more sunburns")
    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    # Ice Cream vs Sunburn
    axes[0].scatter(ice_cream_sales, sunburn_cases, alpha=0.6)
    axes[0].set_xlabel('Ice Cream Sales')
    axes[0].set_ylabel('Sunburn Cases')
    axes[0].set_title(f'Spurious Correlation (r = {corr_ice_sunburn:.3f})')
    axes[0].grid(True, alpha=0.3)
    # Both vs Temperature (confounding variable)
    axes[1].scatter(temperature, ice_cream_sales, alpha=0.6, label='Ice Cream')
    axes[1].scatter(temperature, sunburn_cases, alpha=0.6, label='Sunburns')
    axes[1].set_xlabel('Temperature')
    axes[1].set_ylabel('Value')
    axes[1].set_title('Both Caused by Temperature')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    # Famous spurious correlations
    print("\nFamous Spurious Correlations:")
    print("-" * 40)
    spurious = [
        "Ice cream sales → Drowning deaths (both caused by summer)",
        "Number of Nicolas Cage movies → Drowning in pools (coincidence)",
        "US spending on science → Suicide by hanging (coincidence)",
        "Per capita cheese consumption → Death by bedsheet tangling (coincidence)",
        "Divorce rate in Maine → US margarine consumption (coincidence)"
    ]
    for example in spurious:
        print(f" • {example}")
    # Bradford-Hill-style checklist for arguing causation
    print("\nCausation Criteria:")
    print(" • Temporal precedence (cause comes before effect)")
    print(" • Strength of association (strong correlation helps but not enough)")
    print(" • Consistency across studies")
    print(" • Specificity (specific cause → specific effect)")
    print(" • Dose-response relationship (more cause → more effect)")
    print(" • Biological plausibility")
    print(" • Experimental evidence")
correlation_vs_causation_demo()
Identifying Confounding Variables
def confounding_variable_demo():
    """Demonstrate how to identify confounding variables"""
    np.random.seed(42)
    n = 500
    # Confounding variable: age
    age = np.random.uniform(20, 80, n)
    # Variables affected by age: coffee falls with age, risk rises with age
    coffee_consumption = 3 - 0.03 * (age - 50) + np.random.randn(n) * 0.5
    coffee_consumption = np.clip(coffee_consumption, 0.5, 5)
    heart_disease_risk = 0.1 + 0.02 * (age - 20) + np.random.randn(n) * 0.1
    heart_disease_risk = np.clip(heart_disease_risk, 0, 1)
    # Create DataFrame
    df = pd.DataFrame({
        'Age': age,
        'Coffee_Consumption': coffee_consumption,
        'Heart_Disease_Risk': heart_disease_risk
    })
    # Simple correlation (ignoring confounder)
    simple_corr = df['Coffee_Consumption'].corr(df['Heart_Disease_Risk'])
    # Partial correlation (controlling for age)
    # First, regress both variables on age
    # (function-scope import keeps sklearn optional for the rest of the file)
    from sklearn.linear_model import LinearRegression
    X_age = df[['Age']]
    y_coffee = df['Coffee_Consumption']
    y_heart = df['Heart_Disease_Risk']
    model_coffee = LinearRegression().fit(X_age, y_coffee)
    model_heart = LinearRegression().fit(X_age, y_heart)
    # Get residuals (age removed)
    coffee_residuals = y_coffee - model_coffee.predict(X_age)
    heart_residuals = y_heart - model_heart.predict(X_age)
    # Partial correlation = correlation of the age-free residuals
    partial_corr = np.corrcoef(coffee_residuals, heart_residuals)[0, 1]
    print("Confounding Variable Analysis")
    print("=" * 60)
    print(f"Simple correlation (ignoring age): r = {simple_corr:.3f}")
    print(f"Partial correlation (controlling for age): r = {partial_corr:.3f}")
    print()
    if abs(simple_corr) > abs(partial_corr):
        print("⚠️ Age is likely a confounding variable!")
        print("The relationship between coffee and heart disease is largely")
        print("explained by age (older people drink less coffee AND have higher risk)")
    # Visualize
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    # Coffee vs Heart Disease, colored by age to expose the confounder
    axes[0].scatter(df['Coffee_Consumption'], df['Heart_Disease_Risk'],
                    c=df['Age'], cmap='viridis', alpha=0.6)
    axes[0].set_xlabel('Coffee Consumption')
    axes[0].set_ylabel('Heart Disease Risk')
    axes[0].set_title(f'Simple Correlation\nr = {simple_corr:.3f}')
    axes[0].grid(True, alpha=0.3)
    # Age vs Coffee
    axes[1].scatter(df['Age'], df['Coffee_Consumption'], alpha=0.6)
    axes[1].set_xlabel('Age')
    axes[1].set_ylabel('Coffee Consumption')
    axes[1].set_title('Age vs Coffee')
    axes[1].grid(True, alpha=0.3)
    # Age vs Heart Disease
    axes[2].scatter(df['Age'], df['Heart_Disease_Risk'], alpha=0.6)
    axes[2].set_xlabel('Age')
    axes[2].set_ylabel('Heart Disease Risk')
    axes[2].set_title('Age vs Heart Disease')
    axes[2].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    print("\nKey Insight:")
    print("After controlling for age, the correlation between coffee and")
    print("heart disease becomes much weaker. This suggests age is a")
    print("confounding variable explaining the observed relationship.")
confounding_variable_demo()
6. Advanced Correlation Concepts
Partial Correlation
def partial_correlation_demo():
    """Calculate and interpret partial correlations.

    Builds three correlated variables, contrasts their simple (pairwise)
    correlations with partial correlations that control for the remaining
    variable, and visualizes one pair both ways.
    """
    # Bug fix: LinearRegression was only imported inside
    # confounding_variable_demo()'s scope, so this function raised
    # NameError when run on its own. Import it locally here as well,
    # matching the file's function-scope import convention for sklearn.
    from sklearn.linear_model import LinearRegression
    np.random.seed(42)
    n = 200
    # Create three correlated variables (X1 drives X2; both drive X3)
    x1 = np.random.randn(n)
    x2 = 0.6 * x1 + 0.4 * np.random.randn(n)
    x3 = 0.5 * x1 + 0.3 * x2 + 0.2 * np.random.randn(n)
    df = pd.DataFrame({
        'X1': x1,
        'X2': x2,
        'X3': x3
    })
    # Simple correlations
    simple_corr = df.corr()
    # Calculate partial correlations
    def partial_correlation(df, var1, var2, control):
        """Correlation of var1 and var2 after regressing both on `control` columns."""
        # Residuals of var1 regressed on control
        model1 = LinearRegression().fit(df[control], df[var1])
        resid1 = df[var1] - model1.predict(df[control])
        # Residuals of var2 regressed on control
        model2 = LinearRegression().fit(df[control], df[var2])
        resid2 = df[var2] - model2.predict(df[control])
        # Pearson correlation of the residuals = partial correlation
        return np.corrcoef(resid1, resid2)[0, 1]
    print("Simple vs Partial Correlations")
    print("=" * 60)
    print("\nSimple Correlations:")
    print(simple_corr.round(3))
    print("\nPartial Correlations:")
    print(f"X1 vs X2 (controlling for X3): {partial_correlation(df, 'X1', 'X2', ['X3']):.3f}")
    print(f"X1 vs X3 (controlling for X2): {partial_correlation(df, 'X1', 'X3', ['X2']):.3f}")
    print(f"X2 vs X3 (controlling for X1): {partial_correlation(df, 'X2', 'X3', ['X1']):.3f}")
    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    # Simple correlation scatter plot
    axes[0].scatter(df['X1'], df['X2'], alpha=0.6)
    axes[0].set_xlabel('X1')
    axes[0].set_ylabel('X2')
    axes[0].set_title(f'Simple Correlation\nr = {simple_corr.loc["X1", "X2"]:.3f}')
    axes[0].grid(True, alpha=0.3)
    # Partial correlation residual plot (same residuals as the helper computes)
    model1 = LinearRegression().fit(df[['X3']], df['X1'])
    resid1 = df['X1'] - model1.predict(df[['X3']])
    model2 = LinearRegression().fit(df[['X3']], df['X2'])
    resid2 = df['X2'] - model2.predict(df[['X3']])
    axes[1].scatter(resid1, resid2, alpha=0.6)
    axes[1].set_xlabel('X1 residuals (after removing X3)')
    axes[1].set_ylabel('X2 residuals (after removing X3)')
    axes[1].set_title(f'Partial Correlation\nr = {partial_correlation(df, "X1", "X2", ["X3"]):.3f}')
    axes[1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
partial_correlation_demo()
Point-Biserial Correlation
def point_biserial_correlation_demo():
    """Correlation between binary and continuous variables"""
    np.random.seed(42)
    n = 300
    # Binary variable (e.g., gender, treatment group)
    binary = np.random.binomial(1, 0.5, n)
    # Continuous variable (e.g., test scores)
    # Higher scores for group 1
    continuous = 50 + 15 * binary + np.random.randn(n) * 10
    df = pd.DataFrame({
        'Group': binary,
        'Score': continuous
    })
    # Point-biserial correlation
    # Equivalent to Pearson correlation with binary variable
    pbs_corr = df['Score'].corr(df['Group'])
    # Calculate using formula
    n1 = df[df['Group'] == 1]['Score'].count()
    n0 = df[df['Group'] == 0]['Score'].count()
    mean1 = df[df['Group'] == 1]['Score'].mean()
    mean0 = df[df['Group'] == 0]['Score'].mean()
    # NOTE(review): pandas .std() is the sample estimator (ddof=1), while the
    # textbook point-biserial formula uses the population std — the two printed
    # values can differ by a factor of sqrt((n-1)/n); confirm intent.
    std_all = df['Score'].std()
    pbs_formula = (mean1 - mean0) / std_all * np.sqrt(n1 * n0 / (n1 + n0)**2)
    print("Point-Biserial Correlation")
    print("=" * 50)
    print(f"Correlation between Group and Score: {pbs_corr:.3f}")
    print(f"Formula calculation: {pbs_formula:.3f}")
    print()
    print("Interpretation:")
    print(f" Group 0 mean: {mean0:.2f}")
    print(f" Group 1 mean: {mean1:.2f}")
    print(f" Difference: {mean1 - mean0:.2f}")
    print(f" Effect size: {pbs_corr:.3f}")
    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    # Box plot
    df.boxplot(column='Score', by='Group', ax=axes[0])
    axes[0].set_title('Box Plot by Group')
    axes[0].set_xlabel('Group')
    axes[0].set_ylabel('Score')
    # Scatter plot (jittered so the two vertical bands of points are visible)
    jitter = np.random.randn(n) * 0.05
    axes[1].scatter(df['Group'] + jitter, df['Score'], alpha=0.6)
    axes[1].set_xlabel('Group')
    axes[1].set_ylabel('Score')
    axes[1].set_title(f'Point-Biserial Correlation (r = {pbs_corr:.3f})')
    axes[1].set_xticks([0, 1])
    axes[1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
point_biserial_correlation_demo()
7. Correlation in Machine Learning
Feature Selection Using Correlation
def correlation_feature_selection():
    """Use correlation for feature selection"""
    # Function-scope imports keep sklearn optional for the rest of the file
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    # Create dataset with correlated features
    np.random.seed(42)
    X, y = make_classification(n_samples=500, n_features=20,
                               n_informative=10, n_redundant=5,
                               n_repeated=2, random_state=42)
    # Create DataFrame
    feature_names = [f'Feature_{i}' for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)
    df['Target'] = y
    # Calculate correlations with target
    target_corr = df.corr()['Target'].drop('Target').sort_values(ascending=False)
    print("Feature Selection Using Correlation")
    print("=" * 60)
    print("\nTop 10 features correlated with target:")
    print(target_corr.head(10).round(3))
    print("\nBottom 5 features correlated with target:")
    print(target_corr.tail(5).round(3))
    # Identify highly correlated features (multicollinearity)
    feature_corr = df[feature_names].corr()
    # Find pairs with correlation > 0.8
    high_corr_pairs = []
    for i in range(len(feature_names)):
        for j in range(i + 1, len(feature_names)):
            corr = feature_corr.iloc[i, j]
            if abs(corr) > 0.8:
                high_corr_pairs.append((feature_names[i], feature_names[j], corr))
    print("\nHighly Correlated Feature Pairs (|r| > 0.8):")
    if high_corr_pairs:
        for f1, f2, corr in high_corr_pairs[:5]:
            print(f" {f1} ↔ {f2}: {corr:.3f}")
    else:
        print(" No highly correlated pairs found")
    # Train model with and without correlated features
    # With all features
    model_all = RandomForestClassifier(n_estimators=100, random_state=42)
    model_all.fit(X, y)
    # NOTE(review): .score on the training data measures fit, not
    # generalization — a held-out split would be needed for a fair comparison.
    score_all = model_all.score(X, y)
    # Remove one feature from each highly correlated pair
    if high_corr_pairs:
        features_to_drop = set()
        for f1, f2, _ in high_corr_pairs:
            # NOTE(review): compares signed correlations with the target;
            # presumably absolute values were intended — a strong negative
            # correlation is as informative as a strong positive one.
            if target_corr[f1] < target_corr[f2]:
                features_to_drop.add(f1)
            else:
                features_to_drop.add(f2)
        keep_features = [f for f in feature_names if f not in features_to_drop]
        X_reduced = df[keep_features].values
        model_reduced = RandomForestClassifier(n_estimators=100, random_state=42)
        model_reduced.fit(X_reduced, y)
        score_reduced = model_reduced.score(X_reduced, y)
        print("\nModel Performance:")
        print(f" With all features ({len(feature_names)}): {score_all:.4f}")
        print(f" Without redundant features ({len(keep_features)}): {score_reduced:.4f}")
        print(f" Removed {len(features_to_drop)} features")
correlation_feature_selection()
8. Practical Applications
Example: Customer Segmentation
def customer_segmentation_correlation():
    """Use correlation analysis on synthetic customer data for segmentation.

    Generates 500 customers with dependent attributes (income rises with
    age, spending tracks income, loyalty tracks spending, satisfaction
    tracks loyalty), prints the correlation matrix, plots three scatter
    panels plus a heatmap, and prints segmentation insights.
    """
    np.random.seed(42)
    n = 500
    # Generate customer data with built-in dependencies between columns.
    age = np.random.randint(18, 70, n)
    income = 30000 + 500 * age + np.random.randn(n) * 10000
    income = np.maximum(income, 20000)  # floor income at $20k
    spending = 0.1 * income + np.random.randn(n) * 2000
    spending = np.maximum(spending, 0)  # spending cannot be negative
    loyalty = 50 + 0.5 * spending - 0.2 * age + np.random.randn(n) * 10
    loyalty = np.clip(loyalty, 0, 100)  # bounded score 0-100
    satisfaction = 60 + 0.3 * loyalty - 0.1 * age + np.random.randn(n) * 10
    satisfaction = np.clip(satisfaction, 0, 100)  # bounded score 0-100
    # Assemble into a DataFrame for correlation analysis
    df = pd.DataFrame({
        'Age': age,
        'Income': income,
        'Annual_Spending': spending,
        'Loyalty_Score': loyalty,
        'Satisfaction': satisfaction
    })
    # Pairwise Pearson correlation matrix
    corr = df.corr()
    print("Customer Segmentation Analysis")
    print("=" * 60)
    print("\nCorrelation Matrix:")
    print(corr.round(3))
    # Visualize three key relationships plus the full heatmap
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    # Age vs Spending
    axes[0, 0].scatter(df['Age'], df['Annual_Spending'], alpha=0.6)
    axes[0, 0].set_xlabel('Age')
    axes[0, 0].set_ylabel('Annual Spending ($)')
    axes[0, 0].set_title(f'Age vs Spending\nr = {corr.loc["Age", "Annual_Spending"]:.3f}')
    axes[0, 0].grid(True, alpha=0.3)
    # Income vs Spending
    axes[0, 1].scatter(df['Income'], df['Annual_Spending'], alpha=0.6)
    axes[0, 1].set_xlabel('Income ($)')
    axes[0, 1].set_ylabel('Annual Spending ($)')
    axes[0, 1].set_title(f'Income vs Spending\nr = {corr.loc["Income", "Annual_Spending"]:.3f}')
    axes[0, 1].grid(True, alpha=0.3)
    # Loyalty vs Satisfaction
    axes[1, 0].scatter(df['Loyalty_Score'], df['Satisfaction'], alpha=0.6)
    axes[1, 0].set_xlabel('Loyalty Score')
    axes[1, 0].set_ylabel('Satisfaction Score')
    axes[1, 0].set_title(f'Loyalty vs Satisfaction\nr = {corr.loc["Loyalty_Score", "Satisfaction"]:.3f}')
    axes[1, 0].grid(True, alpha=0.3)
    # Correlation heatmap of all five variables
    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0,
                ax=axes[1, 1], fmt='.3f')
    axes[1, 1].set_title('Correlation Heatmap')
    plt.tight_layout()
    plt.show()
    # Segmentation insights
    print("\nCustomer Segmentation Insights:")
    print("-" * 40)
    print("• Strong positive correlation between income and spending")
    print(" → Higher income customers spend more")
    print()
    # NOTE: income is generated as 30000 + 500*age, and spending tracks
    # income, so in this data the age/spending correlation is POSITIVE
    # (the original text claimed a negative correlation, which the
    # generated data contradicts).
    print("• Positive correlation between age and spending")
    print(" → Older (higher-income) customers tend to spend more")
    print()
    print("• Strong positive correlation between loyalty and satisfaction")
    print(" → Satisfied customers are more loyal")
    print()
    print("Recommended segments:")
    print(" • High-value: High income, high spending")
    print(" • Young shoppers: Low age, high spending")
    print(" • Loyal customers: High loyalty, high satisfaction")
    print(" • Growth potential: High income, low loyalty")
customer_segmentation_correlation()
9. Correlation and Machine Learning Models
def correlation_impact_on_models():
    """Demonstrate how feature correlation (multicollinearity) affects
    linear-model coefficients.

    Builds two nearly collinear features (X2 ≈ 0.9·X1) plus one
    independent feature, fits OLS and Ridge at increasing regularization
    strengths, and prints coefficients and R² for each model.
    """
    # Local import keeps sklearn optional for readers of the rest of the
    # tutorial. (Unused StandardScaler import removed.)
    from sklearn.linear_model import LinearRegression, Ridge
    np.random.seed(42)
    n = 200
    # Create correlated features: X2 is almost a copy of X1
    X1 = np.random.randn(n)
    X2 = 0.9 * X1 + 0.1 * np.random.randn(n)  # Highly correlated with X1
    X3 = np.random.randn(n)  # Independent
    # Target depends on all three features plus small noise
    y = 2 * X1 + 1 * X2 + 3 * X3 + np.random.randn(n) * 0.5
    X = np.column_stack([X1, X2, X3])
    # OLS vs Ridge at increasing regularization strengths
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge (α=0.1)': Ridge(alpha=0.1),
        'Ridge (α=1)': Ridge(alpha=1),
        'Ridge (α=10)': Ridge(alpha=10)
    }
    print("Impact of Correlation on Model Coefficients")
    print("=" * 70)
    for name, model in models.items():
        model.fit(X, y)
        print(f"\n{name}:")
        print(f" Coefficients: [{model.coef_[0]:.3f}, {model.coef_[1]:.3f}, {model.coef_[2]:.3f}]")
        print(f" R²: {model.score(X, y):.4f}")
    print("\n" + "=" * 70)
    print("Observations:")
    print("• Linear Regression coefficients for X1 and X2 are unstable")
    print(" because they are highly correlated")
    print("• Ridge regression shrinks coefficients to handle correlation")
    print("• Higher regularization (α) leads to more shrinkage")
correlation_impact_on_models()
10. Best Practices and Pitfalls
Correlation Best Practices
def correlation_best_practices():
    """Print a checklist of best practices for correlation analysis,
    then run a small worked example of significance testing with
    ``scipy.stats.pearsonr``.

    Returns:
        None. Output goes to stdout.
    """
    # Checklist grouped by workflow stage
    practices = {
        "Data Preparation": [
            "Check for outliers that can inflate/deflate correlation",
            "Ensure data is appropriate for the correlation type",
            "Handle missing values appropriately",
            "Consider data transformations if needed"
        ],
        "Interpretation": [
            "Always visualize relationships, don't rely solely on numbers",
            "Remember: correlation ≠ causation",
            "Consider the context and domain knowledge",
            "Report confidence intervals, not just point estimates"
        ],
        "Statistical Significance": [
            "Use hypothesis testing to check significance",
            "Consider sample size when interpreting results",
            "Be aware of multiple testing issues",
            "Check p-values for statistical significance"
        ],
        "Limitations": [
            "Pearson only captures linear relationships",
            "Spearman captures monotonic but not non-monotonic",
            "Correlation is sensitive to outliers",
            "Range restriction can attenuate correlation"
        ]
    }
    print("Correlation Best Practices")
    print("=" * 60)
    for category, items in practices.items():
        print(f"\n📌 {category}")
        for item in items:
            print(f" • {item}")
    # Worked example: test H0 "no correlation" on a small sample (n=30)
    print("\n" + "=" * 60)
    print("Statistical Significance Example:")
    np.random.seed(42)
    x = np.random.randn(30)
    y = 0.5 * x + np.random.randn(30)
    corr, p_value = stats.pearsonr(x, y)
    print(f"Correlation: r = {corr:.3f}")
    print(f"P-value: {p_value:.4f}")
    # Conventional α = 0.05 threshold
    if p_value < 0.05:
        print("✓ Statistically significant at α = 0.05")
    else:
        print("✗ Not statistically significant at α = 0.05")
correlation_best_practices()
Common Pitfalls
def correlation_pitfalls():
    """Illustrate six common correlation pitfalls with scatter plots.

    Panels: (1) a single outlier distorting r, (2) a quadratic
    relationship with r ≈ 0, (3) range restriction attenuating r,
    (4) Simpson's paradox across two groups, (5) a circular pattern with
    zero linear correlation, (6) heteroscedastic noise. Prints a summary
    of each pitfall afterwards.
    """
    np.random.seed(42)
    n = 100
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    # 1. Outliers: a single extreme point drags r toward itself
    x_out = np.random.randn(n)
    y_out = 2 * x_out + np.random.randn(n)
    y_out[0] = 50  # Outlier
    corr_out = np.corrcoef(x_out, y_out)[0, 1]
    axes[0, 0].scatter(x_out, y_out, alpha=0.6)
    axes[0, 0].scatter(x_out[0], y_out[0], color='red', s=100, label='Outlier')
    axes[0, 0].set_title(f'Outlier Effect\nr = {corr_out:.3f}')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)
    # 2. Non-linear relationship: symmetric parabola gives r near zero
    x_non = np.linspace(-3, 3, n)
    y_non = x_non**2 + np.random.randn(n) * 0.2
    corr_non = np.corrcoef(x_non, y_non)[0, 1]
    axes[0, 1].scatter(x_non, y_non, alpha=0.6)
    axes[0, 1].set_title(f'Non-linear Relationship\nr = {corr_non:.3f}')
    axes[0, 1].grid(True, alpha=0.3)
    # 3. Restricted range: subsetting x to (-1, 1) attenuates r
    x_full = np.random.randn(n)
    y_full = x_full + np.random.randn(n) * 0.5
    mask = (x_full > -1) & (x_full < 1)
    x_restricted = x_full[mask]
    y_restricted = y_full[mask]
    corr_full = np.corrcoef(x_full, y_full)[0, 1]
    corr_restricted = np.corrcoef(x_restricted, y_restricted)[0, 1]
    axes[0, 2].scatter(x_full, y_full, alpha=0.3, label='Full data')
    axes[0, 2].scatter(x_restricted, y_restricted, alpha=0.6, label='Restricted')
    axes[0, 2].set_title(f'Restricted Range\nFull r = {corr_full:.3f}\nRestricted r = {corr_restricted:.3f}')
    axes[0, 2].legend()
    axes[0, 2].grid(True, alpha=0.3)
    # 4. Simpson's paradox: group offsets inflate the pooled correlation
    group1_x = np.random.normal(2, 1, 50)
    group1_y = group1_x * 0.5 + np.random.randn(50)
    group2_x = np.random.normal(8, 1, 50)
    group2_y = group2_x * 0.5 + 2 + np.random.randn(50)
    x_all = np.concatenate([group1_x, group2_x])
    y_all = np.concatenate([group1_y, group2_y])
    corr_all = np.corrcoef(x_all, y_all)[0, 1]
    corr_group1 = np.corrcoef(group1_x, group1_y)[0, 1]
    corr_group2 = np.corrcoef(group2_x, group2_y)[0, 1]
    axes[1, 0].scatter(group1_x, group1_y, alpha=0.6, label='Group 1')
    axes[1, 0].scatter(group2_x, group2_y, alpha=0.6, label='Group 2')
    axes[1, 0].set_title(f"Simpson's Paradox\nOverall r = {corr_all:.3f}\nGroup r1 = {corr_group1:.3f}, r2 = {corr_group2:.3f}")
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)
    # 5. Zero correlation ≠ no relationship: points on a circle
    x_circle = np.linspace(-1, 1, n)
    y_circle = np.sqrt(1 - x_circle**2) * np.random.choice([-1, 1], n)
    corr_circle = np.corrcoef(x_circle, y_circle)[0, 1]
    axes[1, 1].scatter(x_circle, y_circle, alpha=0.6)
    axes[1, 1].set_title(f'No Linear Correlation\nr = {corr_circle:.3f}')
    axes[1, 1].set_xlabel('X')
    axes[1, 1].set_ylabel('Y')
    axes[1, 1].grid(True, alpha=0.3)
    # 6. Heteroscedasticity: noise variance grows with |x|
    x_hetero = np.linspace(-3, 3, n)
    y_hetero = 2 * x_hetero + x_hetero * np.random.randn(n) * 0.5
    corr_hetero = np.corrcoef(x_hetero, y_hetero)[0, 1]
    axes[1, 2].scatter(x_hetero, y_hetero, alpha=0.6)
    axes[1, 2].set_title(f'Heteroscedasticity\nr = {corr_hetero:.3f}')
    axes[1, 2].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    print("\nCommon Correlation Pitfalls:")
    print("=" * 60)
    print("1. Outliers: Can dramatically affect correlation coefficient")
    print("2. Non-linear relationships: Pearson correlation may be close to 0")
    print("3. Restricted range: Can underestimate true correlation")
    print("4. Simpson's paradox: Overall correlation may not represent subgroups")
    print("5. Zero correlation ≠ independence")
    print("6. Heteroscedasticity: Correlation may not capture relationship")
correlation_pitfalls()
Conclusion
Correlation is a powerful tool in data science, but it must be used wisely:
Key Takeaways
- Types of Correlation: Pearson (linear), Spearman (monotonic), Kendall (rank-based)
- Range: -1 to +1, where 0 indicates no linear relationship
- Correlation ≠ Causation: Always be skeptical of causal claims
- Visualization: Always plot your data before calculating correlations
- Confounding Variables: Consider third variables that may explain relationships
- Assumptions: Each correlation type has specific assumptions
- Outliers: Can dramatically affect correlation coefficients
Quick Reference
| Correlation Type | Use Case | Data Type | Assumptions |
|---|---|---|---|
| Pearson | Linear relationships | Continuous | Normal, linear, no outliers |
| Spearman | Monotonic relationships | Ordinal or continuous | Monotonic, robust to outliers |
| Kendall | Small samples, ordinal | Ordinal | Monotonic, interpretable |
When to Use
- Feature Selection: Identify relationships with target
- Exploratory Analysis: Understand data structure
- Multicollinearity: Detect redundant features
- Validation: Check model assumptions
Remember: Correlation is a starting point, not an end point. It helps us ask better questions, but it doesn't provide definitive answers about causality or real-world mechanisms.