Introduction to Correlation Matrix
A correlation matrix is a powerful statistical tool that reveals relationships between multiple variables in a dataset. It's fundamental to data science for understanding patterns, feature selection, and initial exploratory data analysis (EDA). This comprehensive guide covers everything from basic concepts to advanced applications of correlation matrices.
Key Concepts
- Correlation: Statistical measure of relationship between variables
- Correlation Coefficient: Numerical value between -1 and +1 indicating the strength and direction of a linear relationship
- Correlation Matrix: Table showing correlation coefficients between all variable pairs
- Multicollinearity: High correlation between predictor variables
- Feature Selection: Using correlation to identify redundant variables
1. Understanding Correlation
Types of Correlation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

# Silence library warnings so the tutorial output stays readable.
warnings.filterwarnings('ignore')

# Set style
# NOTE(review): the 'seaborn-v0_8-*' style names exist only in newer
# matplotlib releases — confirm the pinned matplotlib version supports it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
def plot_correlation_types():
    """Visualize different kinds of correlation with scatter plots and fitted lines."""
    np.random.seed(42)
    n = 200
    x = np.linspace(-3, 3, n)

    # Each entry maps a descriptive label to a synthetic y-series built
    # from x plus noise (or, for the last entry, a quadratic of x).
    patterns = {
        'Strong Positive (r = 0.95)': x + np.random.normal(0, 0.3, n),
        'Moderate Positive (r = 0.70)': x + np.random.normal(0, 0.8, n),
        'No Correlation (r = 0.02)': np.random.normal(0, 1, n),
        'Moderate Negative (r = -0.70)': -x + np.random.normal(0, 0.8, n),
        'Strong Negative (r = -0.95)': -x + np.random.normal(0, 0.3, n),
        'Non-linear (r = 0.01)': x**2 + np.random.normal(0, 0.5, n)
    }

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    for ax, (title, y) in zip(axes.flatten(), patterns.items()):
        # Pearson r for the panel title.
        r, p_value = stats.pearsonr(x, y)
        ax.scatter(x, y, alpha=0.6, s=20)
        ax.set_title(f'{title}\n(r = {r:.2f})', fontsize=12)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.grid(True, alpha=0.3)
        # Overlay an OLS fit so linear trends (or their absence) stand out.
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
        ax.plot(x, slope * x + intercept, 'r-', linewidth=2, alpha=0.7)

    plt.suptitle('Types of Correlation', fontsize=16)
    plt.tight_layout()
    plt.show()


plot_correlation_types()
Correlation Coefficients
def explain_correlation_coefficients():
    """Show what different correlation coefficient magnitudes look like as scatter plots."""
    np.random.seed(42)
    x = np.linspace(-3, 3, 100)

    # Labels pair a nominal strength with data generated to roughly match it
    # (more noise -> weaker observed correlation).
    correlations = {
        'Perfect Positive (r = 1.0)': x,
        'Very Strong (r = 0.9)': x + np.random.normal(0, 0.2, 100),
        'Strong (r = 0.7)': x + np.random.normal(0, 0.5, 100),
        'Moderate (r = 0.5)': x + np.random.normal(0, 0.8, 100),
        'Weak (r = 0.3)': x + np.random.normal(0, 1.1, 100),
        'Very Weak (r = 0.1)': x + np.random.normal(0, 1.5, 100),
        'Zero (r = 0.0)': np.random.normal(0, 2, 100),
        'Perfect Negative (r = -1.0)': -x
    }

    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    for ax, (title, y) in zip(axes.flatten(), correlations.items()):
        # Report the realized r next to the nominal one in the label.
        r, p_value = stats.pearsonr(x, y)
        ax.scatter(x, y, alpha=0.6, s=15)
        ax.set_title(f'{title}\n(r = {r:.2f})', fontsize=10)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.grid(True, alpha=0.3)
        # Fixed limits keep the panels visually comparable.
        ax.set_xlim(-3.5, 3.5)
        ax.set_ylim(-4, 4)

    plt.suptitle('Correlation Coefficient Strengths', fontsize=16)
    plt.tight_layout()
    plt.show()


explain_correlation_coefficients()
2. Creating Correlation Matrices
Basic Correlation Matrix
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def create_correlation_matrix():
    """
    Create a basic correlation matrix from synthetic personal-finance data.

    Returns:
        pd.DataFrame: the pairwise Pearson correlation matrix of the data.
    """
    # Generate independent base columns.
    np.random.seed(42)
    n_samples = 1000
    data = {
        'age': np.random.normal(40, 15, n_samples),
        'income': np.random.normal(50000, 20000, n_samples),
        'education_years': np.random.normal(14, 3, n_samples),
        'credit_score': np.random.normal(700, 50, n_samples),
        'debt_ratio': np.random.normal(0.3, 0.2, n_samples),
        'savings': np.random.normal(10000, 5000, n_samples),
        'spending': np.random.normal(3000, 1000, n_samples)
    }
    df = pd.DataFrame(data)

    # Inject dependencies so the matrix shows non-trivial correlations.
    # NOTE: order matters — 'savings' is computed from the *original*
    # 'spending', which is itself adjusted on the following line.
    df['income'] = df['income'] + df['education_years'] * 3000
    df['credit_score'] = df['credit_score'] - df['debt_ratio'] * 200
    df['savings'] = df['savings'] + df['income'] * 0.1 - df['spending'] * 0.5
    df['spending'] = df['spending'] + df['income'] * 0.05

    corr_matrix = df.corr()
    print("Correlation Matrix:")
    print(corr_matrix.round(3))

    # Lower-triangle heatmap on its own figure (the upper triangle is
    # redundant because the matrix is symmetric).
    fig, ax1 = plt.subplots(figsize=(7, 6))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
                cmap='coolwarm', center=0, square=True,
                linewidths=0.5, ax=ax1)
    ax1.set_title('Correlation Matrix Heatmap', fontsize=14)
    plt.tight_layout()

    # Bug fix: sns.clustermap always draws its own figure, so it cannot
    # share axes with the heatmap above — the original created a second
    # subplot (ax2) that was left blank, and its suptitle/tight_layout
    # calls acted on the clustermap figure instead of the heatmap one.
    cg = sns.clustermap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                        center=0, figsize=(8, 8))
    cg.fig.suptitle('Clustered Correlation Matrix', fontsize=14)
    plt.show()
    return corr_matrix


corr_matrix = create_correlation_matrix()
Correlation Matrix with p-values
def correlation_with_pvalues(df=None):
    """
    Calculate and visualize a correlation matrix together with p-values.

    Args:
        df: optional DataFrame of numeric columns. If None or empty, a
            synthetic 5-column dataset with injected correlations is used.
            (Bug fix: the original unconditionally discarded the caller's
            DataFrame and regenerated its own.)

    Prints every variable pair whose correlation is significant at p < 0.05.
    """
    def calculate_correlation_pvalues(data):
        """Return (correlation matrix, p-value matrix) as DataFrames."""
        corr = data.corr().values
        p_values = np.zeros_like(corr)
        for i in range(data.shape[1]):
            for j in range(data.shape[1]):
                if i == j:
                    p_values[i, j] = 0  # self-correlation: r = 1 by definition
                else:
                    _, p_values[i, j] = stats.pearsonr(data.iloc[:, i], data.iloc[:, j])
        return (pd.DataFrame(corr, index=data.columns, columns=data.columns),
                pd.DataFrame(p_values, index=data.columns, columns=data.columns))

    if df is None or df.empty:
        # Generate demo data with known relationships.
        np.random.seed(42)
        n = 200
        df = pd.DataFrame({
            'A': np.random.randn(n),
            'B': np.random.randn(n),
            'C': np.random.randn(n),
            'D': np.random.randn(n),
            'E': np.random.randn(n)
        })
        df['B'] = df['A'] * 0.8 + np.random.randn(n) * 0.3
        df['C'] = df['A'] * -0.6 + np.random.randn(n) * 0.4
        df['D'] = df['B'] * 0.5 + df['C'] * 0.3 + np.random.randn(n) * 0.2

    corr_matrix, p_matrix = calculate_correlation_pvalues(df)
    # (Removed an unused `sig_mask` local that the original computed.)

    # Side-by-side heatmaps: coefficients and their p-values.
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, ax=axes[0])
    axes[0].set_title('Correlation Matrix', fontsize=14)
    # Centering the colormap at 0.05 highlights the significance boundary.
    sns.heatmap(p_matrix, annot=True, fmt='.3f', cmap='RdYlGn_r',
                center=0.05, square=True, ax=axes[1])
    axes[1].set_title('p-values Matrix', fontsize=14)
    plt.tight_layout()
    plt.show()

    print("\nSignificant correlations (p < 0.05):")
    for i in range(len(df.columns)):
        for j in range(i+1, len(df.columns)):
            if p_matrix.iloc[i, j] < 0.05:
                print(f"{df.columns[i]} vs {df.columns[j]}: "
                      f"r = {corr_matrix.iloc[i, j]:.3f}, "
                      f"p = {p_matrix.iloc[i, j]:.4f}")


correlation_with_pvalues(pd.DataFrame())
3. Visualizing Correlation Matrices
Advanced Heatmap Visualizations
def advanced_correlation_visualizations(df=None):
    """
    Create several correlation-matrix visualizations.

    Args:
        df: optional DataFrame of numeric columns. If None or empty, a
            synthetic business dataset is generated. (Bug fix: the
            original ignored the argument and always regenerated data.)

    Produces three figures: a 2x2 grid (heatmap variants plus a
    correlation network), a clustered heatmap, and a scatter-plot
    matrix. seaborn's clustermap and pandas' scatter_matrix each manage
    their own figure, so they cannot live inside the subplot grid — the
    original tried, which left blank subplot slots and attached later
    `plt.subplot` calls to the wrong (newly-current) figures.
    """
    if df is None or df.empty:
        np.random.seed(42)
        n = 500
        df = pd.DataFrame({
            'sales': np.random.normal(1000, 200, n),
            'marketing': np.random.normal(100, 30, n),
            'advertising': np.random.normal(50, 15, n),
            'price': np.random.normal(50, 10, n),
            'competition': np.random.normal(5, 2, n),
            'customer_satisfaction': np.random.normal(80, 10, n),
            'returns': np.random.normal(5, 2, n),
            'employee_count': np.random.normal(50, 15, n)
        })
        # Make 'sales' depend on the other drivers.
        df['sales'] = (df['sales'] +
                       df['marketing'] * 2 +
                       df['advertising'] * 3 -
                       df['price'] * 5 -
                       df['competition'] * 10 +
                       df['customer_satisfaction'] * 2)

    corr = df.corr()

    fig, axes = plt.subplots(2, 2, figsize=(14, 12))

    # 1. Standard heatmap
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, ax=axes[0, 0], cbar_kws={'shrink': 0.8})
    axes[0, 0].set_title('Standard Correlation Heatmap', fontsize=12)

    # 2. Masked upper triangle (the matrix is symmetric)
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, ax=axes[0, 1], cbar_kws={'shrink': 0.8})
    axes[0, 1].set_title('Lower Triangle Heatmap', fontsize=12)

    # 3. Diverging palette
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r',
                center=0, square=True, ax=axes[1, 0], cbar_kws={'shrink': 0.8})
    axes[1, 0].set_title('Diverging Color Scheme', fontsize=12)

    # 4. Correlation network: edges connect variables with |r| > 0.5
    import networkx as nx
    ax_net = axes[1, 1]
    G = nx.Graph()
    G.add_nodes_from(corr.columns)
    for i in range(len(corr.columns)):
        for j in range(i + 1, len(corr.columns)):
            if abs(corr.iloc[i, j]) > 0.5:
                G.add_edge(corr.columns[i], corr.columns[j],
                           weight=abs(corr.iloc[i, j]))
    pos = nx.spring_layout(G, k=2, iterations=50)
    nx.draw(G, pos, ax=ax_net, with_labels=True, node_color='lightblue',
            node_size=3000, font_size=10, font_weight='bold',
            edge_color=[G[u][v]['weight'] for u, v in G.edges()],
            edge_cmap=plt.cm.RdYlGn, edge_vmin=0, edge_vmax=1)
    ax_net.set_title('Correlation Network (>0.5)', fontsize=12)
    ax_net.axis('off')
    plt.tight_layout()

    # 5. Hierarchical clustering — clustermap draws its own figure.
    sns.clustermap(corr, annot=True, fmt='.2f', cmap='coolwarm',
                   center=0, figsize=(6, 5))

    # 6. Scatter-plot matrix — also a standalone figure.
    from pandas.plotting import scatter_matrix
    scatter_matrix(df, alpha=0.2, figsize=(10, 10), diagonal='hist')

    plt.show()


# Generate sample data for demonstration
np.random.seed(42)
n = 500
df = pd.DataFrame({
    'sales': np.random.normal(1000, 200, n),
    'marketing': np.random.normal(100, 30, n),
    'advertising': np.random.normal(50, 15, n),
    'price': np.random.normal(50, 10, n)
})
df['sales'] = df['sales'] + df['marketing'] * 2 + df['advertising'] * 3 - df['price'] * 5
advanced_correlation_visualizations(df)
4. Correlation Matrix Interpretation
Analyzing Correlation Patterns
def analyze_correlation_patterns(df, corr_matrix):
    """
    Analyze and interpret correlation patterns.

    Args:
        df: source DataFrame (kept for API compatibility; every statistic
            below is derived from corr_matrix alone).
        corr_matrix: pairwise correlation matrix as a DataFrame.

    Prints the strongest pairs, flags multicollinearity risks and isolated
    variables, and shows a hierarchical-clustering dendrogram of variables.
    """
    print("=" * 60)
    print("CORRELATION MATRIX ANALYSIS")
    print("=" * 60)

    # 1. Rank all unique variable pairs by absolute correlation.
    print("\n1. STRONGEST CORRELATIONS:")
    print("-" * 40)
    cols = corr_matrix.columns
    corr_pairs = [
        {'var1': cols[i], 'var2': cols[j], 'correlation': corr_matrix.iloc[i, j]}
        for i in range(len(cols))
        for j in range(i + 1, len(cols))
    ]
    corr_pairs.sort(key=lambda x: abs(x['correlation']), reverse=True)
    for pair in corr_pairs[:5]:
        strength = "Strong" if abs(pair['correlation']) > 0.7 else \
                   "Moderate" if abs(pair['correlation']) > 0.5 else "Weak"
        direction = "positive" if pair['correlation'] > 0 else "negative"
        print(f" {pair['var1']} ↔ {pair['var2']}: {pair['correlation']:.3f} "
              f"({strength} {direction})")

    # 2. Flag pairs that threaten downstream modeling.
    print("\n2. POTENTIAL ISSUES:")
    print("-" * 40)
    high_corr = [p for p in corr_pairs if abs(p['correlation']) > 0.8]
    if high_corr:
        print(" ⚠ High correlations (>0.8) - potential multicollinearity:")
        for p in high_corr:
            print(f" - {p['var1']} and {p['var2']}: {p['correlation']:.3f}")
    else:
        print(" ✓ No high correlations detected")
    perfect_corr = [p for p in corr_pairs if abs(p['correlation']) == 1.0]
    if perfect_corr:
        print("\n ⚠ Perfect correlations detected - redundant variables:")
        for p in perfect_corr:
            print(f" - {p['var1']} and {p['var2']}: {p['correlation']:.3f}")

    # 3. Variables weakly correlated with everything else.
    print("\n3. ISOLATED VARIABLES:")
    print("-" * 40)
    for var in corr_matrix.columns:
        max_corr = corr_matrix[var].drop(var).abs().max()
        if max_corr < 0.3:
            print(f" {var}: Weakly correlated with all others (max r = {max_corr:.3f})")

    # 4. Hierarchical clustering of variables.
    print("\n4. VARIABLE CLUSTERS:")
    print("-" * 40)
    from scipy.cluster.hierarchy import dendrogram, linkage
    from scipy.spatial.distance import squareform
    # Bug fix: linkage() expects pairwise distances (or raw observations),
    # not a correlation matrix; the original fed it the correlation matrix
    # directly, clustering matrix *rows* as feature vectors. Convert to the
    # distance 1 - |r| and condense it instead.
    dist = (1 - corr_matrix.abs()).values
    np.fill_diagonal(dist, 0)  # zero out tiny floating-point residue
    linkage_matrix = linkage(squareform(dist, checks=False), method='average')
    plt.figure(figsize=(10, 6))
    dendrogram(linkage_matrix, labels=list(corr_matrix.columns),
               orientation='top', leaf_rotation=90)
    plt.title('Hierarchical Clustering of Variables', fontsize=14)
    plt.xlabel('Variables')
    plt.ylabel('Distance')
    plt.tight_layout()
    plt.show()


# Generate sample data for analysis
np.random.seed(42)
n = 1000
df = pd.DataFrame({
    'feature_1': np.random.randn(n),
    'feature_2': np.random.randn(n),
    'feature_3': np.random.randn(n),
    'feature_4': np.random.randn(n),
    'feature_5': np.random.randn(n),
    'target': np.random.randn(n)
})
# Add relationships
df['feature_2'] = df['feature_1'] * 0.85 + np.random.randn(n) * 0.3
df['feature_3'] = df['feature_1'] * 0.9 + np.random.randn(n) * 0.2
df['feature_4'] = df['feature_2'] * 0.7 + df['feature_3'] * 0.3
df['target'] = df['feature_1'] * 0.5 + df['feature_2'] * 0.3 + np.random.randn(n) * 0.2
corr_matrix = df.corr()
analyze_correlation_patterns(df, corr_matrix)
5. Practical Applications
Feature Selection with Correlation
def feature_selection_correlation(df, target_col, threshold=0.95):
    """
    Select features based on correlation with target and between features.

    Greedy pass: walk candidates in descending |corr with target|, keeping a
    feature unless it correlates above `threshold` with one already kept.

    Args:
        df: DataFrame containing the target plus candidate feature columns.
        target_col: name of the target column.
        threshold: absolute between-feature correlation above which the
            later (less target-correlated) feature is dropped.

    Returns:
        tuple[list, list]: (selected feature names, dropped feature names).
    """
    corr_with_target = df.corr()[target_col].drop(target_col).abs().sort_values(ascending=False)
    full_corr = df.corr()

    print("=" * 60)
    print("FEATURE SELECTION USING CORRELATION")
    print("=" * 60)
    print(f"\n1. CORRELATION WITH TARGET ({target_col}):")
    print("-" * 40)
    for feat, corr in corr_with_target.head(10).items():
        strength = "Very Strong" if abs(corr) > 0.7 else \
                   "Strong" if abs(corr) > 0.5 else \
                   "Moderate" if abs(corr) > 0.3 else "Weak"
        print(f" {feat}: {corr:.3f} ({strength})")

    selected_features = []
    correlated_features = []
    for feature in corr_with_target.index:
        if feature in correlated_features:
            continue
        # First already-kept feature this candidate is redundant with, if any.
        redundant_with = next(
            (kept for kept in selected_features
             if abs(full_corr.loc[feature, kept]) > threshold),
            None)
        if redundant_with is None:
            selected_features.append(feature)
        else:
            correlated_features.append(feature)

    print(f"\n2. RECOMMENDED FEATURES (threshold = {threshold}):")
    print("-" * 40)
    for feat in selected_features[:10]:
        print(f" ✓ {feat}")
    print(f"\n3. REMOVED FEATURES (highly correlated with others):")
    print("-" * 40)
    for feat in correlated_features[:10]:
        print(f" ✗ {feat}")
    return selected_features, correlated_features


# Build a demo dataset whose features overlap with the target and each other.
np.random.seed(42)
n = 1000
df = pd.DataFrame({
    'target': np.random.randn(n),
    'x1': np.random.randn(n),
    'x2': np.random.randn(n),
    'x3': np.random.randn(n),
    'x4': np.random.randn(n),
    'x5': np.random.randn(n)
})
df['x1'] = df['target'] * 0.8 + np.random.randn(n) * 0.3
df['x2'] = df['target'] * 0.6 + np.random.randn(n) * 0.4
df['x3'] = df['x1'] * 0.9 + np.random.randn(n) * 0.1
df['x4'] = df['x2'] * 0.85 + np.random.randn(n) * 0.15
df['x5'] = np.random.randn(n)  # Noise
selected, removed = feature_selection_correlation(df, 'target')
Multicollinearity Detection
def detect_multicollinearity(df, threshold=0.8):
    """
    Detect multicollinearity with three complementary diagnostics:
    pairwise correlations, variance inflation factors (VIF), and the
    condition number of the data matrix.

    Args:
        df: DataFrame of numeric predictor columns.
        threshold: absolute pairwise correlation above which a pair is flagged.

    Returns:
        pd.DataFrame: 'variable'/'VIF' table sorted by VIF descending.
    """
    # (Removed an unused `sklearn.linear_model.LinearRegression` import
    # that the original carried.)
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    print("=" * 60)
    print("MULTICOLLINEARITY DETECTION")
    print("=" * 60)

    # 1. Pairwise correlations above the threshold.
    print("\n1. HIGH CORRELATION PAIRS:")
    print("-" * 40)
    corr_matrix = df.corr()
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr_pairs.append({
                    'var1': corr_matrix.columns[i],
                    'var2': corr_matrix.columns[j],
                    'correlation': corr_matrix.iloc[i, j]
                })
    if high_corr_pairs:
        for pair in high_corr_pairs:
            print(f" {pair['var1']} ↔ {pair['var2']}: {pair['correlation']:.3f}")
    else:
        print(" No high correlation pairs detected")

    # 2. VIF: how much each variable is explained by the others.
    print("\n2. VARIANCE INFLATION FACTOR (VIF):")
    print("-" * 40)
    X = df.values
    vif_data = pd.DataFrame()
    vif_data["variable"] = df.columns
    # NOTE(review): VIFs are computed on the raw columns with no intercept
    # column; statsmodels' VIF is normally applied to a design matrix that
    # includes a constant — confirm this is intended.
    vif_data["VIF"] = [variance_inflation_factor(X, i) for i in range(X.shape[1])]
    vif_data = vif_data.sort_values('VIF', ascending=False)
    print("\n VIF > 10 indicates multicollinearity:")
    for _, row in vif_data.iterrows():
        indicator = "⚠" if row['VIF'] > 10 else " "
        print(f" {indicator} {row['variable']}: {row['VIF']:.2f}")

    # 3. Condition number of the data matrix.
    print("\n3. CONDITION NUMBER:")
    print("-" * 40)
    from numpy.linalg import cond
    condition_number = cond(X)
    print(f" Condition Number: {condition_number:.2f}")
    if condition_number > 30:
        print(" ⚠ High condition number indicates multicollinearity")
    else:
        print(" ✓ Acceptable condition number")
    return vif_data


# Generate data with multicollinearity: X2 and X3 are near-copies of X1.
np.random.seed(42)
n = 200
X1 = np.random.randn(n)
X2 = X1 * 0.9 + np.random.randn(n) * 0.1
X3 = X1 * 0.8 + np.random.randn(n) * 0.2
X4 = np.random.randn(n)
df_multicollinear = pd.DataFrame({
    'X1': X1,
    'X2': X2,
    'X3': X3,
    'X4': X4
})
detect_multicollinearity(df_multicollinear)
6. Correlation in Machine Learning
Correlation with Target Variable
def plot_target_correlations(df, target):
    """
    Visualize correlations between features and a target column.

    Args:
        df: DataFrame containing `target` plus numeric feature columns.
        target: name of the target column.

    Draws a horizontal bar chart of feature/target correlations, then a
    scatter-plot matrix of the target against its top-4 features as a
    separate figure. (Bug fix: the original passed a single Axes to
    pandas' scatter_matrix, which manages its own grid of axes and
    cannot be drawn into one subplot.)
    """
    correlations = df.corr()[target].drop(target).sort_values()

    # Bar chart: green for positive, red for negative correlations.
    fig, ax = plt.subplots(figsize=(7, 6))
    colors = ['red' if x < 0 else 'green' for x in correlations.values]
    ax.barh(correlations.index, correlations.values, color=colors, alpha=0.7)
    ax.axvline(x=0, color='black', linestyle='-', linewidth=1)
    ax.set_xlabel('Correlation with Target')
    ax.set_title(f'Feature Correlations with {target}')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()

    # Scatter-plot matrix for the most correlated features (own figure).
    top_features = correlations.abs().nlargest(4).index
    cols = [target] + list(top_features)
    from pandas.plotting import scatter_matrix
    scatter_matrix(df[cols], alpha=0.5, figsize=(12, 12), diagonal='hist')
    plt.suptitle('Scatter Plots: Target vs Top Features', fontsize=12)
    plt.show()

    # Text summary of every feature's correlation with the target.
    print("\nCorrelation with Target Variable:")
    print("-" * 40)
    for feature, corr in correlations.items():
        strength = "Strong" if abs(corr) > 0.5 else \
                   "Moderate" if abs(corr) > 0.3 else "Weak"
        direction = "positive" if corr > 0 else "negative"
        print(f" {feature}: {corr:.3f} ({strength} {direction})")


# Generate sample data
np.random.seed(42)
n = 500
df = pd.DataFrame({
    'target': np.random.randn(n),
    'feature_1': np.random.randn(n),
    'feature_2': np.random.randn(n),
    'feature_3': np.random.randn(n),
    'feature_4': np.random.randn(n)
})
# Add relationships
df['feature_1'] = df['target'] * 0.8 + np.random.randn(n) * 0.3
df['feature_2'] = df['target'] * -0.6 + np.random.randn(n) * 0.4
df['feature_3'] = df['target'] * 0.4 + np.random.randn(n) * 0.5
df['feature_4'] = np.random.randn(n)  # Noise
plot_target_correlations(df, 'target')
Correlation-based Feature Engineering
def correlation_feature_engineering(df):
    """
    Create new features based on correlation patterns.

    Args:
        df: DataFrame of numeric columns (mutated in place: interaction,
            ratio, and cluster-average columns are appended).

    Returns:
        pd.DataFrame: the same DataFrame with the engineered columns added.
    """
    # Correlations of the *original* columns, captured before any new
    # columns are added.
    original_corr = df.corr()
    original_columns = list(original_corr.columns)
    print("=" * 60)
    print("FEATURE ENGINEERING WITH CORRELATION")
    print("=" * 60)

    # 1. Interaction (product) features for strongly related pairs.
    print("\n1. CREATING INTERACTION FEATURES:")
    print("-" * 40)
    high_corr_pairs = []
    for i in range(len(original_columns)):
        for j in range(i+1, len(original_columns)):
            corr = abs(original_corr.iloc[i, j])
            if corr > 0.6:
                high_corr_pairs.append((original_columns[i], original_columns[j], corr))
    for col1, col2, corr in high_corr_pairs[:3]:
        interaction = f"{col1}_x_{col2}"
        df[interaction] = df[col1] * df[col2]
        print(f" Created {interaction} (correlation {corr:.3f})")

    # 2. Ratio features for the same pairs.
    print("\n2. CREATING RATIO FEATURES:")
    print("-" * 40)
    for col1, col2, _ in high_corr_pairs[:3]:
        ratio = f"{col1}_div_{col2}"
        # Small epsilon keeps the division finite when col2 is near zero.
        df[ratio] = df[col1] / (df[col2] + 1e-8)
        print(f" Created {ratio}")

    # 3. Average of each correlation-based cluster of original features.
    print("\n3. AGGREGATING CORRELATED FEATURES:")
    print("-" * 40)
    from sklearn.cluster import KMeans
    # 1 - |r| behaves like a distance: perfectly correlated features are 0 apart.
    corr_dist = 1 - abs(original_corr)
    kmeans = KMeans(n_clusters=3, random_state=42)
    clusters = kmeans.fit_predict(corr_dist)
    for cluster_id in range(3):
        # Bug fix: `clusters` has one label per ORIGINAL column, but df has
        # since gained engineered columns, so masking df.columns with it
        # raised a boolean-mask length mismatch. Mask the original column
        # index instead.
        cluster_features = original_corr.columns[clusters == cluster_id]
        if len(cluster_features) > 1:
            cluster_name = f"cluster_{cluster_id}_avg"
            df[cluster_name] = df[cluster_features].mean(axis=1)
            print(f" Created {cluster_name} from {list(cluster_features)}")

    # 4. Compare each original column's strongest correlation before/after.
    print("\n4. IMPROVEMENT ANALYSIS:")
    print("-" * 40)
    new_corr = df.corr()
    improvements = []
    for col in original_corr.columns:
        old_max = original_corr[col].drop(col).abs().max()
        # NOTE: engineered columns correlate with their parent columns by
        # construction, so "improvements" reported here are largely mechanical.
        new_max = new_corr[col].drop(col).abs().max()
        if new_max > old_max:
            improvements.append((col, old_max, new_max))
    if improvements:
        print(" Features with improved correlation to target:")
        for feat, old, new in improvements[:5]:
            print(f" {feat}: {old:.3f} → {new:.3f}")
    else:
        print(" No significant improvements")
    return df


# Generate sample data
np.random.seed(42)
n = 500
df = pd.DataFrame({
    'feature_A': np.random.randn(n),
    'feature_B': np.random.randn(n),
    'feature_C': np.random.randn(n),
    'target': np.random.randn(n)
})
# Add relationships
df['feature_B'] = df['feature_A'] * 0.7 + np.random.randn(n) * 0.3
df['feature_C'] = df['feature_A'] * 0.5 + df['feature_B'] * 0.3 + np.random.randn(n) * 0.2
df['target'] = df['feature_A'] * 0.8 + df['feature_B'] * 0.4 + np.random.randn(n) * 0.1
correlation_feature_engineering(df)
7. Real-World Case Studies
Customer Segmentation with Correlation
def customer_segmentation_correlation():
    """
    Customer segmentation using correlation analysis.

    Generates a synthetic customer dataset, examines the correlation
    structure of its metrics, then clusters customers into four K-means
    segments and summarizes each one.

    Returns:
        pd.DataFrame: the customer table with an added 'segment' column.
    """
    # Generate customer data
    np.random.seed(42)
    n_customers = 500
    customers = pd.DataFrame({
        'customer_id': range(1, n_customers + 1),
        'age': np.random.normal(40, 12, n_customers),
        'income': np.random.normal(60000, 25000, n_customers),
        'purchase_frequency': np.random.poisson(5, n_customers),
        'avg_order_value': np.random.normal(100, 50, n_customers),
        'customer_tenure': np.random.normal(24, 18, n_customers),
        'support_tickets': np.random.poisson(1, n_customers),
        'satisfaction_score': np.random.normal(7, 1.5, n_customers),
        'promo_response_rate': np.random.beta(2, 5, n_customers)
    })
    # Add realistic relationships
    # NOTE: these updates are order-dependent — promo_response_rate uses
    # purchase_frequency *after* it has been income-adjusted above.
    customers['purchase_frequency'] = (customers['purchase_frequency'] +
                                       customers['income'] / 20000)
    customers['avg_order_value'] = (customers['avg_order_value'] +
                                    customers['income'] / 1000)
    customers['satisfaction_score'] = (customers['satisfaction_score'] -
                                       customers['support_tickets'] * 0.5)
    customers['promo_response_rate'] = (customers['promo_response_rate'] +
                                        customers['purchase_frequency'] * 0.05)
    # Calculate RFM-style metrics
    customers['total_spend'] = customers['purchase_frequency'] * customers['avg_order_value']
    # +1 guards against division by zero for zero-tenure customers.
    customers['spend_per_month'] = customers['total_spend'] / (customers['customer_tenure'] + 1)
    # Correlation analysis over the numeric behavioral metrics only.
    customer_metrics = ['age', 'income', 'purchase_frequency', 'avg_order_value',
                        'customer_tenure', 'support_tickets', 'satisfaction_score',
                        'promo_response_rate', 'total_spend', 'spend_per_month']
    corr_matrix = customers[customer_metrics].corr()
    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(16, 8))
    # Correlation heatmap
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, ax=axes[0])
    axes[0].set_title('Customer Metrics Correlation Matrix', fontsize=14)
    # Customer segmentation based on key metrics (standardized first so
    # income's scale does not dominate the distance metric).
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler
    features = ['income', 'purchase_frequency', 'avg_order_value', 'satisfaction_score']
    X = StandardScaler().fit_transform(customers[features])
    kmeans = KMeans(n_clusters=4, random_state=42)
    customers['segment'] = kmeans.fit_predict(X)
    # Segment analysis: mean of each feature per segment, shown as a table.
    segment_means = customers.groupby('segment')[features].mean()
    axes[1].axis('off')
    table_data = []
    for segment in range(4):
        row = [f'Segment {segment + 1}']
        for feat in features:
            # NOTE(review): '.0f' drops decimals, so satisfaction_score
            # (~7.x) renders as a whole number — confirm intended.
            row.append(f"{segment_means.loc[segment, feat]:.0f}")
        table_data.append(row)
    # NOTE(review): rowLabels duplicate the first cell of every row — the
    # table shows the segment label twice.
    table = axes[1].table(cellText=table_data,
                          rowLabels=[f'Segment {i+1}' for i in range(4)],
                          colLabels=['Segment'] + features,
                          cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    axes[1].set_title('Segment Characteristics', fontsize=14, pad=20)
    plt.suptitle('Customer Segmentation Analysis', fontsize=16)
    plt.tight_layout()
    plt.show()
    # Print insights
    print("\nCUSTOMER SEGMENTATION INSIGHTS")
    print("=" * 40)
    for segment in range(4):
        seg_data = customers[customers['segment'] == segment]
        print(f"\nSegment {segment + 1} (n={len(seg_data)}):")
        print(f" Avg Income: ${seg_data['income'].mean():,.0f}")
        print(f" Avg Purchases: {seg_data['purchase_frequency'].mean():.1f}")
        print(f" Avg Order Value: ${seg_data['avg_order_value'].mean():.0f}")
        print(f" Satisfaction: {seg_data['satisfaction_score'].mean():.1f}/10")
    return customers


customer_segmentation_correlation()
Financial Portfolio Correlation
def financial_portfolio_correlation():
    """
    Analyze correlation in a financial portfolio.

    Builds synthetic daily returns for eight asset classes with an imposed
    correlation structure, then shows: the correlation heatmap, a
    hierarchical clustering of assets, a rolling Tech/Energy correlation,
    and an equal-weight vs. minimum-variance allocation comparison.

    Returns:
        tuple: (daily returns DataFrame, correlation matrix DataFrame).
    """
    # Generate synthetic asset returns
    np.random.seed(42)
    n_days = 500
    n_assets = 8
    # Asset categories
    asset_names = ['Tech Stock', 'Energy Stock', 'Bonds', 'Real Estate',
                   'Gold', 'Commodities', 'International', 'Cash']
    # Generate correlated returns: start from i.i.d. noise, then mix columns.
    returns = np.random.randn(n_days, n_assets)
    # Create correlation structure
    # Tech and Energy are correlated
    returns[:, 1] = returns[:, 0] * 0.6 + returns[:, 1] * 0.8
    # Bonds negatively correlated with stocks
    returns[:, 2] = -returns[:, 0] * 0.3 + returns[:, 2] * 0.9
    # Real Estate correlated with both stocks
    returns[:, 3] = returns[:, 0] * 0.4 + returns[:, 1] * 0.3 + returns[:, 3] * 0.8
    # Gold as hedge
    returns[:, 4] = -returns[:, 0] * 0.2 + returns[:, 4] * 0.95
    df_returns = pd.DataFrame(returns, columns=asset_names)
    # Calculate correlation
    corr_matrix = df_returns.corr()
    # Create visualization
    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    # 1. Correlation heatmap
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, ax=axes[0, 0])
    axes[0, 0].set_title('Asset Return Correlations', fontsize=12)
    # 2. Hierarchical clustering
    # NOTE(review): linkage() is fed the correlation matrix directly, so
    # each matrix row is treated as a feature vector rather than first
    # converting correlations to distances — confirm this is intended.
    from scipy.cluster.hierarchy import dendrogram, linkage
    linkage_matrix = linkage(corr_matrix, method='ward')
    dendrogram(linkage_matrix, labels=corr_matrix.columns,
               orientation='left', ax=axes[0, 1])
    axes[0, 1].set_title('Asset Clustering', fontsize=12)
    # 3. Rolling correlation (time-varying), 60-day window
    window = 60
    rolling_corr = df_returns['Tech Stock'].rolling(window).corr(df_returns['Energy Stock'])
    axes[1, 0].plot(rolling_corr, color='blue', linewidth=2)
    axes[1, 0].axhline(y=0, color='black', linestyle='-', alpha=0.5)
    axes[1, 0].set_title('Rolling Correlation: Tech vs Energy')
    axes[1, 0].set_ylabel('Correlation')
    axes[1, 0].grid(True, alpha=0.3)
    # 4. Portfolio optimization based on correlation
    from scipy.optimize import minimize
    # Calculate expected returns and covariance, annualized by 252 trading days.
    expected_returns = df_returns.mean() * 252  # Annualized
    cov_matrix = df_returns.cov() * 252

    def portfolio_volatility(weights):
        # Annualized portfolio standard deviation: sqrt(w' Σ w).
        return np.sqrt(np.dot(weights.T, np.dot(cov_matrix, weights)))

    def portfolio_return(weights):
        # Weighted sum of annualized expected returns.
        return np.sum(weights * expected_returns)

    # Equal weight portfolio
    n = len(asset_names)
    equal_weights = np.ones(n) / n
    equal_vol = portfolio_volatility(equal_weights)
    equal_return = portfolio_return(equal_weights)
    # Minimum variance portfolio: weights sum to 1, long-only (each in [0, 1]).
    constraints = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})
    bounds = tuple((0, 1) for _ in range(n))
    result = minimize(portfolio_volatility, equal_weights,
                      method='SLSQP', bounds=bounds, constraints=constraints)
    min_var_weights = result.x
    min_var_vol = portfolio_volatility(min_var_weights)
    min_var_return = portfolio_return(min_var_weights)
    # Create comparison table of the two allocations.
    comp_data = []
    for asset, weight_eq, weight_mv in zip(asset_names, equal_weights, min_var_weights):
        comp_data.append([asset, f"{weight_eq:.1%}", f"{weight_mv:.1%}"])
    axes[1, 1].axis('off')
    table = axes[1, 1].table(cellText=comp_data,
                             colLabels=['Asset', 'Equal Weight', 'Min Variance'],
                             cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    axes[1, 1].set_title('Portfolio Allocation Comparison', fontsize=12, pad=20)
    # Add portfolio statistics below the table.
    stats_text = f"Equal Weight Portfolio:\n Return: {equal_return:.1%}\n Volatility: {equal_vol:.1%}\n\n"
    stats_text += f"Min Variance Portfolio:\n Return: {min_var_return:.1%}\n Volatility: {min_var_vol:.1%}\n"
    axes[1, 1].text(0.5, -0.3, stats_text, transform=axes[1, 1].transAxes,
                    fontsize=10, ha='center')
    plt.suptitle('Financial Portfolio Correlation Analysis', fontsize=16)
    plt.tight_layout()
    plt.show()
    return df_returns, corr_matrix


financial_portfolio_correlation()
8. Correlation Matrix Tools
Correlation Matrix Dashboard
def correlation_dashboard(df):
    """
    Create an interactive correlation dashboard.

    Jupyter-only: requires ipywidgets for the threshold slider and the
    view/target dropdowns.

    Args:
        df: DataFrame of numeric columns to explore.
    """
    from ipywidgets import interact, FloatSlider, Dropdown, VBox, HBox
    import matplotlib.pyplot as plt
    # Calculate the correlation matrix once; widget callbacks only re-render.
    corr_matrix = df.corr()

    def update_view(threshold=0.5, view_type='heatmap', target_var=None):
        # Called by ipywidgets each time a control changes.
        fig, ax = plt.subplots(figsize=(10, 8))
        if view_type == 'heatmap':
            # Hide cells whose |r| falls below the chosen threshold.
            mask = np.abs(corr_matrix) < threshold
            masked_corr = corr_matrix.mask(mask)
            sns.heatmap(masked_corr, annot=True, fmt='.2f', cmap='coolwarm',
                        center=0, square=True, ax=ax, cbar_kws={'shrink': 0.8})
            ax.set_title(f'Correlation Matrix (|r| > {threshold})', fontsize=14)
        elif view_type == 'network' and target_var:
            # Create correlation network: nodes are variables, edges are
            # correlations above the threshold.
            import networkx as nx
            G = nx.Graph()
            # Add nodes
            for col in df.columns:
                G.add_node(col)
            # Add edges for correlations above threshold
            for i in range(len(df.columns)):
                for j in range(i+1, len(df.columns)):
                    corr = corr_matrix.iloc[i, j]
                    if abs(corr) > threshold:
                        G.add_edge(df.columns[i], df.columns[j],
                                   weight=abs(corr), sign=np.sign(corr))
            # Layout and draw
            pos = nx.spring_layout(G, k=2, iterations=50)
            # Node colors based on correlation with the chosen target.
            if target_var in df.columns:
                target_corr = corr_matrix[target_var].drop(target_var)
                node_colors = [target_corr.get(node, 0) for node in G.nodes()]
                nx.draw(G, pos, with_labels=True, node_color=node_colors,
                        cmap='coolwarm', node_size=3000, font_size=10,
                        font_weight='bold', ax=ax)
            else:
                # e.g. the 'None' sentinel option from the dropdown.
                nx.draw(G, pos, with_labels=True, node_color='lightblue',
                        node_size=3000, font_size=10, font_weight='bold', ax=ax)
            ax.set_title(f'Correlation Network (|r| > {threshold})', fontsize=14)
            ax.axis('off')
        plt.tight_layout()
        plt.show()

    # Interactive controls
    threshold_slider = FloatSlider(min=0, max=0.9, step=0.05, value=0.5,
                                   description='Threshold:', continuous_update=False)
    view_selector = Dropdown(options=['heatmap', 'network'], value='heatmap',
                             description='View:')
    target_selector = Dropdown(options=['None'] + list(df.columns), value='None',
                               description='Target:')
    interact(update_view, threshold=threshold_slider, view_type=view_selector,
             target_var=target_selector)


# Example data
np.random.seed(42)
n = 200
df = pd.DataFrame({
    'A': np.random.randn(n),
    'B': np.random.randn(n),
    'C': np.random.randn(n),
    'D': np.random.randn(n),
    'E': np.random.randn(n)
})
df['B'] = df['A'] * 0.8 + np.random.randn(n) * 0.2
df['C'] = df['A'] * 0.6 + df['B'] * 0.3 + np.random.randn(n) * 0.1
df['D'] = df['A'] * -0.5 + np.random.randn(n) * 0.3
df['E'] = np.random.randn(n)  # Noise
# Uncomment to run interactive dashboard
# correlation_dashboard(df)
9. Best Practices and Guidelines
Correlation Analysis Checklist
def correlation_best_practices():
    """
    Print a categorized checklist of correlation-analysis best practices.

    Returns
    -------
    None
        Output is written to stdout as a side effect.
    """
    # (category, items) pairs; order determines print order.
    sections = [
        ("Data Preparation", [
            "Handle missing values appropriately",
            "Check for outliers that may skew correlation",
            "Ensure data types are numeric",
            "Consider standardizing variables if needed",
            "Remove duplicate or near-duplicate columns",
        ]),
        ("Correlation Calculation", [
            "Use appropriate correlation method (Pearson/Spearman/Kendall)",
            "Check assumptions for parametric correlation",
            "Consider robust correlation methods for outliers",
            "Calculate p-values for significance testing",
            "Use confidence intervals when possible",
        ]),
        ("Interpretation", [
            "Don't confuse correlation with causation",
            "Consider the domain context",
            "Watch for spurious correlations",
            "Check for non-linear relationships",
            "Consider practical significance vs statistical significance",
        ]),
        ("Visualization", [
            "Use appropriate color schemes (diverging for correlation)",
            "Include annotations for important values",
            "Use clustering to reveal patterns",
            "Consider interactive visualizations for exploration",
            "Include sample sizes and significance indicators",
        ]),
        ("Reporting", [
            "Highlight key findings",
            "Explain implications for analysis/modeling",
            "Document feature selection decisions",
            "Note any limitations or caveats",
            "Provide actionable insights",
        ]),
    ]

    banner = "=" * 60
    print(banner)
    print("CORRELATION ANALYSIS BEST PRACTICES")
    print(banner)
    for heading, entries in sections:
        print(f"\n{heading}:")
        print("-" * 40)
        for entry in entries:
            print(f" ✓ {entry}")

correlation_best_practices()
Common Pitfalls and Solutions
def correlation_pitfalls():
    """
    Illustrate common correlation pitfalls with synthetic examples.

    Draws a 2x3 grid of scatter plots, one per pitfall: spurious
    correlation, non-linear relationship, outlier influence, restricted
    range, Simpson's paradox, and heteroscedasticity.

    Returns
    -------
    None
        Displays a matplotlib figure as a side effect.
    """
    np.random.seed(42)
    n = 100
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    # 1. Spurious correlation: two independent random variables can still
    # show a small non-zero sample correlation.
    x = np.random.randn(n)
    y = np.random.randn(n)
    r, p = stats.pearsonr(x, y)
    axes[0].scatter(x, y, alpha=0.6)
    axes[0].set_title(f'Spurious Correlation\nr = {r:.3f}, p = {p:.3f}')
    axes[0].set_xlabel('Random X')
    axes[0].set_ylabel('Random Y')

    # 2. Non-linear relationship: Pearson r is near zero despite a clear
    # quadratic dependence.
    x = np.linspace(-3, 3, n)
    y = x**2 + np.random.randn(n) * 0.5
    r, p = stats.pearsonr(x, y)
    axes[1].scatter(x, y, alpha=0.6)
    axes[1].set_title(f'Non-linear Relationship\nr = {r:.3f}')
    axes[1].set_xlabel('X')
    axes[1].set_ylabel('Y')

    # 3. Outliers: a single extreme point can distort r.
    x = np.random.randn(n)
    y = x + np.random.randn(n) * 0.2
    x[0] = 5
    y[0] = 5
    r, p = stats.pearsonr(x, y)
    axes[2].scatter(x, y, alpha=0.6)
    axes[2].scatter([5], [5], color='red', s=100, label='Outlier')
    axes[2].set_title(f'Outlier Influence\nr = {r:.3f}')
    axes[2].legend()

    # 4. Restricted range: truncating the sample attenuates r.
    # BUG FIX: the original filtered x and y with independent masks
    # (x = x[x > -1]; y = y[y > -1]), which yields arrays of different
    # lengths and makes stats.pearsonr raise ValueError. Apply one mask
    # (on x) to BOTH arrays so the (x, y) pairs stay aligned.
    x = np.random.randn(n)
    y = x + np.random.randn(n) * 0.2
    mask = x > -1
    x = x[mask]
    y = y[mask]
    r, p = stats.pearsonr(x, y)
    axes[3].scatter(x, y, alpha=0.6)
    axes[3].set_title(f'Restricted Range\nr = {r:.3f}')

    # 5. Simpson's paradox: within-group trends can differ from the
    # trend of the pooled data.
    groups = 3
    for i in range(groups):
        x = np.random.randn(30) + i
        y = x + np.random.randn(30) * 0.3
        axes[4].scatter(x, y, alpha=0.6, label=f'Group {i+1}')
    all_x = np.concatenate([np.random.randn(30) + i for i in range(groups)])
    all_y = all_x + np.random.randn(90) * 0.5
    axes[4].scatter(all_x, all_y, alpha=0.3, color='gray', s=50)
    axes[4].set_title('Simpson\'s Paradox\nOverall vs Group Correlations')
    axes[4].legend()

    # 6. Heteroscedasticity: the spread of y grows with |x|, so a single
    # r value understates how the relationship varies across the range.
    x = np.linspace(-3, 3, n)
    y = x + np.random.randn(n) * np.abs(x)
    r, p = stats.pearsonr(x, y)
    axes[5].scatter(x, y, alpha=0.6)
    axes[5].set_title(f'Heteroscedasticity\nr = {r:.3f}')

    plt.suptitle('Common Correlation Pitfalls', fontsize=16)
    plt.tight_layout()
    plt.show()

correlation_pitfalls()
Conclusion
Correlation matrices are essential tools in data science for understanding relationships between variables, detecting multicollinearity, and guiding feature selection.
Key Takeaways
- Understanding Correlation: Know the difference between Pearson, Spearman, and Kendall correlation
- Visualization: Use appropriate color schemes and clustering to reveal patterns
- Interpretation: Don't confuse correlation with causation
- Feature Selection: Use correlation to identify redundant features
- Multicollinearity: Detect and address high correlations between predictors
- Domain Knowledge: Always interpret correlations in context
- Robustness: Be aware of outliers and non-linear relationships
Quick Reference
| Correlation Type | Use Case | Range | Interpretation |
|---|---|---|---|
| Pearson | Linear relationships | -1 to +1 | Strength of linear association |
| Spearman | Monotonic relationships | -1 to +1 | Rank correlation |
| Kendall | Small samples, ties | -1 to +1 | Concordant/discordant pairs |
Correlation Strength Interpretation
| r | Strength |
|---|---|
| 0.0 - 0.3 | Weak |
| 0.3 - 0.5 | Moderate |
| 0.5 - 0.7 | Strong |
| 0.7 - 1.0 | Very Strong |
Mastering correlation matrices is fundamental to exploratory data analysis and building robust machine learning models!