What is Variance?
Variance measures the spread or dispersion of data points around their mean. It quantifies how far a set of numbers is spread out from its average value.
"Variance tells you how much your data is bouncing around."
The Intuition
Low variance: all values close to the mean, e.g. [10, 11, 9, 10, 10] (mean = 10, variance = 0.4).
High variance: values widely scattered, e.g. [0, 20, 5, 15, 10] (mean = 10, variance = 50).
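A quick way to verify these numbers (a minimal sketch using NumPy; note that np.var defaults to the population formula):

import numpy as np

low_spread = [10, 11, 9, 10, 10]
high_spread = [0, 20, 5, 15, 10]

print(np.mean(low_spread), np.var(low_spread))    # 10.0 0.4
print(np.mean(high_spread), np.var(high_spread))  # 10.0 50.0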
Part 1: Mathematical Foundations
1.1 Population Variance (σ²)
For a complete population of N values:
σ² = Σ(xᵢ - μ)² / N

Where:
- σ² = population variance (sigma squared)
- xᵢ = each individual value
- μ = population mean
- N = population size
1.2 Sample Variance (s²)
For a sample from a population (uses n-1 for unbiased estimation):
s² = Σ(xᵢ - x̄)² / (n - 1)

Where:
- s² = sample variance
- x̄ = sample mean
- n = sample size
- (n - 1) = degrees of freedom
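As a small worked example (values chosen here purely for illustration), take the three numbers 2, 4, 6. Their mean is 4, and the squared deviations sum to (2 - 4)² + (4 - 4)² + (6 - 4)² = 4 + 0 + 4 = 8. The population variance is 8 / 3 ≈ 2.67, while the sample variance is 8 / (3 - 1) = 4.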
1.3 Why n-1? (Bessel's Correction)
Because the sample mean is computed from the same data, deviations from it are on average slightly smaller than deviations from the true population mean, so dividing by n systematically underestimates the variance; dividing by n - 1 corrects for this bias.
import numpy as np
import pandas as pd
# Demonstration of Bessel's Correction
np.random.seed(42)
population = np.random.normal(50, 10, 10000) # True variance = 100
# Take many samples and calculate variance with and without correction
sample_sizes = [10, 30, 100, 300]
results = []
for n in sample_sizes:
biased_variances = []
unbiased_variances = []
for _ in range(1000):
sample = np.random.choice(population, n)
biased_variances.append(np.var(sample, ddof=0)) # Uses n
unbiased_variances.append(np.var(sample, ddof=1)) # Uses n-1
results.append({
'n': n,
'biased_mean': np.mean(biased_variances),
'unbiased_mean': np.mean(unbiased_variances),
'true_variance': np.var(population)
})
df_results = pd.DataFrame(results)
print(df_results)
# Biased underestimates true variance, unbiased is closer
Part 2: Calculating Variance
2.1 Basic Implementation
import numpy as np
import pandas as pd
# Sample data
data = [12, 15, 14, 10, 13, 16, 11, 14, 15, 12]
# Method 1: Manual calculation
def calculate_variance(data, ddof=0):
"""
Calculate variance manually
ddof=0: population variance
ddof=1: sample variance
"""
n = len(data)
mean = sum(data) / n
squared_deviations = [(x - mean) ** 2 for x in data]
variance = sum(squared_deviations) / (n - ddof)
return variance
pop_var = calculate_variance(data, ddof=0)
sample_var = calculate_variance(data, ddof=1)
print(f"Population variance: {pop_var:.2f}")
print(f"Sample variance: {sample_var:.2f}")
# Method 2: Using numpy
pop_var_np = np.var(data, ddof=0)
sample_var_np = np.var(data, ddof=1)
# Method 3: Using pandas
series = pd.Series(data)
pop_var_pd = series.var(ddof=0)
sample_var_pd = series.var(ddof=1)
2.2 Vectorized Operations
# For large datasets, use vectorized operations
def fast_variance(data, ddof=0):
"""Efficient variance calculation for large arrays"""
data = np.asarray(data)
n = len(data)
mean = np.mean(data)
squared_deviations = (data - mean) ** 2
return np.sum(squared_deviations) / (n - ddof)
# Performance comparison
import time
large_data = np.random.randn(1000000)
start = time.time()
var1 = np.var(large_data)
print(f"NumPy: {time.time() - start:.4f} seconds")
start = time.time()
var2 = fast_variance(large_data)
print(f"Custom: {time.time() - start:.4f} seconds")
Part 3: Properties of Variance
3.1 Mathematical Properties
# Property 1: Variance is always non-negative
variance = np.var([1, 2, 3, 4, 5]) # Always >= 0
# Property 2: Adding a constant doesn't change variance
data = [1, 2, 3, 4, 5]
var1 = np.var(data) # 2.0
var2 = np.var([x + 10 for x in data]) # Still 2.0
# Property 3: Scaling by constant multiplies variance by constant²
data = [1, 2, 3, 4, 5]
var_original = np.var(data) # 2.0
var_scaled = np.var([x * 2 for x in data]) # 8.0 (2² * 2)
# Property 4: Variance of sum of independent variables
x = np.random.randn(1000)
y = np.random.randn(1000)
var_sum = np.var(x + y)
var_x = np.var(x)
var_y = np.var(y)
print(f"Var(X+Y) ≈ Var(X) + Var(Y): {var_sum:.3f} ≈ {var_x + var_y:.3f}")
# Property 5: Computational formula
# Var(X) = E[X²] - (E[X])²
def variance_computational(data):
n = len(data)
sum_x = sum(data)
sum_x2 = sum(x**2 for x in data)
return (sum_x2 / n) - (sum_x / n) ** 2
3.2 Variance Decomposition
# Total variance = Explained variance + Unexplained variance
from sklearn.linear_model import LinearRegression
# Generate data with relationship
np.random.seed(42)
X = np.random.randn(100, 1)
y = 2 * X.flatten() + np.random.randn(100) * 0.5
# Fit model
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
# Variance decomposition
total_var = np.var(y)
explained_var = np.var(y_pred)
unexplained_var = np.var(y - y_pred)
print(f"Total variance: {total_var:.3f}")
print(f"Explained variance: {explained_var:.3f}")
print(f"Unexplained variance: {unexplained_var:.3f}")
print(f"R² = {explained_var/total_var:.3f}") # Proportion explained
Part 4: Variance in Context
4.1 Variance vs. Standard Deviation
# Standard deviation is more interpretable (same units as data)
variance = np.var(data)
std_dev = np.std(data)
print(f"Variance: {variance:.2f} (squared units)")
print(f"Standard deviation: {std_dev:.2f} (original units)")
# When to use each:
# - Variance: Mathematical derivations, ANOVA, statistical tests
# - Std Dev: Reporting spread, confidence intervals, interpretability
4.2 Coefficient of Variation (Relative Variance)
def coefficient_of_variation(data):
"""CV = σ/μ - allows comparison across different scales"""
mean = np.mean(data)
std = np.std(data)
return (std / mean) * 100 # As percentage
# Compare datasets with different scales
heights_cm = [160, 165, 170, 175, 180] # Mean ~170
weights_kg = [55, 65, 75, 85, 95] # Mean ~75
cv_height = coefficient_of_variation(heights_cm) # ~4.2%
cv_weight = coefficient_of_variation(weights_kg) # ~18.9%
print(f"Heights CV: {cv_height:.1f}% (less relative spread)")
print(f"Weights CV: {cv_weight:.1f}% (more relative spread)")
Part 5: Variance in Different Distributions
5.1 Common Distributions and Their Variance
from scipy import stats
import matplotlib.pyplot as plt
# Normal Distribution: variance = σ²
normal_data = np.random.normal(loc=0, scale=2, size=1000) # σ=2, σ²=4
# Uniform Distribution: variance = (b-a)²/12
uniform_data = np.random.uniform(low=0, high=10, size=1000) # variance = 100/12 ≈ 8.33
# Poisson Distribution: variance = λ (mean = variance)
poisson_data = np.random.poisson(lam=5, size=1000) # variance ≈ 5
# Exponential Distribution: variance = 1/λ²
exp_data = np.random.exponential(scale=2, size=1000) # scale=1/λ, variance=4
# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
distributions = [
(normal_data, 'Normal (σ²=4)'),
(uniform_data, 'Uniform (σ²=8.33)'),
(poisson_data, 'Poisson (σ²≈5)'),
(exp_data, 'Exponential (σ²=4)')
]
for ax, (data, title) in zip(axes.flat, distributions):
ax.hist(data, bins=50, alpha=0.7, edgecolor='black')
ax.axvline(np.mean(data), color='red', linestyle='--', label=f'Mean: {np.mean(data):.2f}')
ax.axvline(np.mean(data) + np.std(data), color='orange', linestyle=':', label='±1σ')
ax.axvline(np.mean(data) - np.std(data), color='orange', linestyle=':')
ax.set_title(f'{title}\nVariance: {np.var(data):.2f}')
ax.legend()
plt.tight_layout()
Part 6: Variance in Data Science Applications
6.1 Feature Variance for Feature Selection
from sklearn.feature_selection import VarianceThreshold
# Remove constant or near-constant features
def select_by_variance(df, threshold=0.01):
    """
    Remove features whose variance falls below threshold.
    Useful for dropping constant or near-constant features that carry little information.
    Note: variance is scale-dependent, so put features on comparable scales
    (e.g. min-max scaling) before applying a single threshold across them;
    standardizing to unit variance would make the threshold meaningless.
    For binary features, variance = p(1-p); the maximum is 0.25 (when p = 0.5).
    """
    numeric = df.select_dtypes(include=[np.number])
    selector = VarianceThreshold(threshold=threshold)
    X_selected = selector.fit_transform(numeric)
    # Get selected and removed feature names
    selected_mask = selector.get_support()
    selected_features = numeric.columns[selected_mask]
    removed_features = numeric.columns[~selected_mask]
    print(f"Removed {len(removed_features)} low-variance features:")
    print(list(removed_features))
    return X_selected, selected_features
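A usage sketch: the DataFrame below is synthetic and exists only to illustrate the call.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df_demo = pd.DataFrame({
    'constant': np.ones(200),                    # zero variance
    'rare_flag': rng.binomial(1, 0.005, 200),    # near-zero-variance binary feature
    'informative': rng.normal(0, 1, 200)
})
X_sel, kept_features = select_by_variance(df_demo, threshold=0.01)
print(list(kept_features))  # 'constant' (and likely 'rare_flag') should be dropped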
6.2 Bias-Variance Tradeoff
from sklearn.metrics import mean_squared_error
def demonstrate_bias_variance_tradeoff(X, y):
"""
Demonstrate how model complexity affects bias and variance
"""
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
degrees = range(1, 10)
train_errors = []
test_errors = []
for degree in degrees:
# Create polynomial features
poly = PolynomialFeatures(degree=degree)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)
# Fit model
model = LinearRegression()
model.fit(X_poly_train, y_train)
# Calculate errors
train_pred = model.predict(X_poly_train)
test_pred = model.predict(X_poly_test)
train_errors.append(mean_squared_error(y_train, train_pred))
test_errors.append(mean_squared_error(y_test, test_pred))
# Plot
plt.figure(figsize=(10, 6))
plt.plot(degrees, train_errors, label='Training Error (Bias)', marker='o')
plt.plot(degrees, test_errors, label='Test Error (Bias + Variance)', marker='o')
plt.xlabel('Model Complexity (Polynomial Degree)')
plt.ylabel('Mean Squared Error')
plt.title('Bias-Variance Tradeoff')
plt.legend()
plt.grid(True, alpha=0.3)
# Find optimal complexity
optimal_degree = degrees[np.argmin(test_errors)]
plt.axvline(optimal_degree, color='red', linestyle='--',
label=f'Optimal: degree={optimal_degree}')
plt.legend()
plt.show()
return optimal_degree
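A quick way to exercise this function is on synthetic data; the noisy sine curve below is made up purely for illustration.

np.random.seed(0)
X_demo = np.random.uniform(-3, 3, size=(200, 1))
y_demo = np.sin(X_demo).ravel() + np.random.normal(0, 0.3, size=200)
best_degree = demonstrate_bias_variance_tradeoff(X_demo, y_demo)
print(f"Selected polynomial degree: {best_degree}")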
6.3 Analysis of Variance (ANOVA)
from scipy.stats import f_oneway
def anova_analysis(df, target, categorical_feature):
"""
Perform ANOVA to test if group means are significantly different
"""
# Group statistics
groups = df.groupby(categorical_feature)[target]
print(f"=== ANOVA: {target} by {categorical_feature} ===\n")
for name, group in groups:
print(f"{name}: n={len(group)}, mean={group.mean():.3f}, var={group.var():.3f}")
# Perform one-way ANOVA
group_data = [group.values for name, group in groups]
f_stat, p_value = f_oneway(*group_data)
print(f"\nF-statistic: {f_stat:.3f}")
print(f"p-value: {p_value:.4f}")
if p_value < 0.05:
print("Significant differences exist between groups")
# Post-hoc test (Tukey's HSD)
from statsmodels.stats.multicomp import pairwise_tukeyhsd
tukey = pairwise_tukeyhsd(df[target], df[categorical_feature])
print("\nTukey's HSD Post-hoc Test:")
print(tukey)
else:
print("No significant differences between groups")
# Calculate effect size (eta-squared)
ss_between = sum(groups.count() * (groups.mean() - df[target].mean())**2)
ss_total = sum((df[target] - df[target].mean())**2)
eta_squared = ss_between / ss_total
print(f"\nEffect size (η²): {eta_squared:.3f}")
return f_stat, p_value, eta_squared
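A usage sketch with synthetic groups (the data below is generated purely for illustration):

np.random.seed(0)
df_demo = pd.DataFrame({
    'group': np.repeat(['A', 'B', 'C'], 50),
    'score': np.concatenate([
        np.random.normal(10, 2, 50),
        np.random.normal(12, 2, 50),
        np.random.normal(11, 2, 50)
    ])
})
anova_analysis(df_demo, target='score', categorical_feature='group')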
6.4 Variance in Time Series
def analyze_time_series_variance(data, window=30):
"""
Analyze variance in time series data
"""
# Rolling variance
rolling_var = data.rolling(window=window).var()
rolling_std = data.rolling(window=window).std()
# Volatility clustering
plt.figure(figsize=(12, 8))
plt.subplot(3, 1, 1)
plt.plot(data, label='Original Series')
plt.title('Time Series')
plt.legend()
plt.subplot(3, 1, 2)
plt.plot(rolling_var, color='orange', label=f'{window}-period Rolling Variance')
plt.title('Variance Over Time')
plt.legend()
plt.subplot(3, 1, 3)
# Squared returns for financial data
returns = data.pct_change().dropna()
squared_returns = returns ** 2
plt.plot(squared_returns, color='green', alpha=0.7)
plt.plot(squared_returns.rolling(window=window).mean(),
color='red', linewidth=2, label='Volatility')
plt.title('Squared Returns (Volatility Proxy)')
plt.legend()
plt.tight_layout()
plt.show()
# Heteroscedasticity test
from scipy.stats import bartlett
# Split data into segments
n = len(returns)
segment1 = returns[:n//2]
segment2 = returns[n//2:]
_, p_value = bartlett(segment1, segment2)
print(f"Bartlett test for equal variance: p-value = {p_value:.4f}")
return rolling_var, rolling_std
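The function above assumes a pandas Series (it relies on .rolling and .pct_change). A usage sketch on a synthetic price series:

np.random.seed(0)
log_returns = np.random.normal(0, 0.01, 500)
prices = pd.Series(100 * np.exp(np.cumsum(log_returns)))  # synthetic random-walk "prices"
rolling_var, rolling_std = analyze_time_series_variance(prices, window=30)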
Part 7: Advanced Variance Concepts
7.1 Variance in Machine Learning Models
from sklearn.ensemble import RandomForestRegressor
def model_variance_analysis(X, y):
"""
Analyze variance in model predictions
"""
# Train multiple models on bootstrap samples
n_models = 100
predictions = []
for i in range(n_models):
# Bootstrap sampling
indices = np.random.choice(len(X), len(X), replace=True)
X_boot = X[indices]
y_boot = y[indices]
# Train model
model = RandomForestRegressor(n_estimators=100, random_state=i)
model.fit(X_boot, y_boot)
# Predict on full dataset
pred = model.predict(X)
predictions.append(pred)
predictions = np.array(predictions)
# Calculate prediction variance for each sample
pred_variance = np.var(predictions, axis=0)
pred_mean = np.mean(predictions, axis=0)
# Visualize
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.hist(pred_variance, bins=50, edgecolor='black')
plt.xlabel('Prediction Variance')
plt.ylabel('Frequency')
plt.title('Distribution of Prediction Variance')
plt.subplot(1, 2, 2)
plt.scatter(pred_mean, pred_variance, alpha=0.5)
plt.xlabel('Mean Prediction')
plt.ylabel('Prediction Variance')
plt.title('Variance vs. Mean Prediction')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
return pred_mean, pred_variance
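A usage sketch; it assumes X and y are NumPy arrays (the function indexes X[indices] directly), and training 100 bootstrap forests can take a little while.

np.random.seed(0)
X_demo = np.random.randn(300, 3)
y_demo = X_demo @ np.array([1.5, -2.0, 0.5]) + np.random.normal(0, 0.5, 300)
pred_mean, pred_var = model_variance_analysis(X_demo, y_demo)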
7.2 Variance Inflation Factor (VIF) for Multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns  # used below for the correlation heatmap
def calculate_vif(X):
    """
    Calculate the Variance Inflation Factor for each feature.
    VIF > 10 is a common rule of thumb for high multicollinearity.
    Note: VIFs are conventionally computed on a design matrix that includes
    an intercept column (e.g. statsmodels' add_constant); without one, the
    values can be misleading for uncentered features.
    """
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
for i in range(len(X.columns))]
return vif_data.sort_values('VIF', ascending=False)
# Example usage
def diagnose_multicollinearity(df, features):
"""
Diagnose and handle multicollinearity
"""
X = df[features]
vif_df = calculate_vif(X)
print("Variance Inflation Factors:")
print(vif_df)
# Identify problematic features
high_vif = vif_df[vif_df['VIF'] > 10]
if len(high_vif) > 0:
print(f"\nHigh multicollinearity detected in: {list(high_vif['feature'])}")
# Recommend removal strategy
print("\nRecommendations:")
for feature in high_vif['feature']:
print(f" - Consider removing {feature} or combining with related features")
# Correlation matrix for high VIF features
high_vif_features = high_vif['feature'].values
corr_matrix = X[high_vif_features].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of High VIF Features')
plt.show()
return vif_df
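To see the VIF diagnosis in action, here is a sketch with two nearly collinear synthetic features:

np.random.seed(0)
x1 = np.random.randn(200)
x2 = x1 + np.random.normal(0, 0.05, 200)   # nearly collinear with x1
x3 = np.random.randn(200)
df_demo = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3})
vif_table = diagnose_multicollinearity(df_demo, features=['x1', 'x2', 'x3'])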
7.3 Bias-Variance Decomposition
def bias_variance_decomposition(model, X_train, y_train, X_test, y_test, n_bootstrap=100):
    """
    Empirically decompose average test MSE into bias² and variance.
    Note: without access to the noise-free targets, the irreducible (noise)
    error cannot be separated out; it is absorbed into the bias² term here.
    """
    predictions = []
    for i in range(n_bootstrap):
        # Bootstrap sample
        indices = np.random.choice(len(X_train), len(X_train), replace=True)
        X_boot = X_train[indices]
        y_boot = y_train[indices]
        # Train a fresh copy of the model on the bootstrap sample
        model_clone = model.__class__(**model.get_params())
        model_clone.fit(X_boot, y_boot)
        # Predict on the test set
        pred = model_clone.predict(X_test)
        predictions.append(pred)
    predictions = np.array(predictions)
    # Average prediction per test point across bootstrap models
    mean_prediction = np.mean(predictions, axis=0)
    # Bias²: squared gap between the average prediction and the observed targets
    bias_squared = np.mean((mean_prediction - y_test) ** 2)
    # Variance: spread of the predictions across bootstrap models
    variance = np.mean(np.var(predictions, axis=0))
    # Average test MSE across bootstrap models equals bias² + variance
    avg_test_mse = np.mean((predictions - y_test) ** 2)
    print(f"Bias² (includes irreducible noise): {bias_squared:.4f}")
    print(f"Variance: {variance:.4f}")
    print(f"Bias² + Variance: {bias_squared + variance:.4f}")
    print(f"Average Test MSE: {avg_test_mse:.4f}")
    return {
        'bias_squared': bias_squared,
        'variance': variance,
        'avg_test_mse': avg_test_mse
    }
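A usage sketch with a decision tree on synthetic data (the model and data are arbitrary choices for illustration; any scikit-learn regressor exposing get_params works here):

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

np.random.seed(0)
X_all = np.random.uniform(-3, 3, size=(500, 1))
y_all = np.sin(X_all).ravel() + np.random.normal(0, 0.3, 500)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.3, random_state=0)

# A deep, unpruned tree: typically low bias but higher variance
deep_tree = DecisionTreeRegressor(max_depth=None)
results = bias_variance_decomposition(deep_tree, X_tr, y_tr, X_te, y_te, n_bootstrap=50)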
Part 8: Practical Tips and Best Practices
8.1 Variance Interpretation Guidelines
# Context matters for variance interpretation
def interpret_variance(data, context):
"""
Provide context-specific variance interpretation
"""
variance = np.var(data)
std = np.std(data)
mean = np.mean(data)
cv = (std / mean) * 100 if mean != 0 else np.inf
print(f"Variance: {variance:.2f}")
print(f"Standard deviation: {std:.2f}")
print(f"Coefficient of Variation: {cv:.1f}%")
if context == 'manufacturing':
if cv < 5:
print("✓ Excellent process control (low variation)")
elif cv < 10:
print("✓ Acceptable process variation")
else:
print("⚠ High variation - process needs improvement")
elif context == 'finance':
if cv > 30:
print("⚠ High risk investment (high volatility)")
elif cv > 15:
print("✓ Moderate risk")
else:
print("✓ Low risk investment")
elif context == 'scientific':
if cv < 10:
print("✓ Low experimental variation (precise measurements)")
else:
print("⚠ High experimental variation (need more replicates)")
return cv
8.2 Common Pitfalls and Solutions
# Pitfall 1: Using sample variance formula on population
def avoid_pitfall_1(data, is_sample=True):
"""Use correct ddof parameter"""
if is_sample:
variance = np.var(data, ddof=1) # Sample variance
else:
variance = np.var(data, ddof=0) # Population variance
return variance
# Pitfall 2: Comparing variances across different scales
def avoid_pitfall_2(data1, data2):
"""Use coefficient of variation for comparison"""
cv1 = (np.std(data1) / np.mean(data1)) * 100
cv2 = (np.std(data2) / np.mean(data2)) * 100
return cv1, cv2
# Pitfall 3: Ignoring outliers
def avoid_pitfall_3(data):
"""Use robust variance measures"""
# Median Absolute Deviation (MAD)
median = np.median(data)
mad = np.median(np.abs(data - median))
# IQR-based spread
q75, q25 = np.percentile(data, [75, 25])
iqr = q75 - q25
print(f"Standard deviation (sensitive): {np.std(data):.2f}")
print(f"MAD (robust): {mad:.2f}")
print(f"IQR (robust): {iqr:.2f}")
return mad, iqr
# Pitfall 4: Non-normal distributions
def avoid_pitfall_4(data):
"""Check assumptions before using variance-based methods"""
from scipy.stats import normaltest
_, p_value = normaltest(data)
if p_value < 0.05:
print("Data is not normally distributed")
print("Consider using:")
print(" - Median Absolute Deviation for spread")
print(" - Interquartile Range for spread")
print(" - Non-parametric tests for group comparisons")
else:
print("Data appears normally distributed")
print("Standard methods using variance are appropriate")
Part 9: Variance in Statistical Tests
def variance_based_tests(data, group_col, value_col):
"""
Demonstrate variance-based statistical tests
"""
from scipy.stats import levene, bartlett
# Get groups
groups = data[group_col].unique()
group_data = [data[data[group_col] == g][value_col].values for g in groups]
# Test for equal variance (homoscedasticity)
# Levene's test (robust)
levene_stat, levene_p = levene(*group_data)
# Bartlett's test (sensitive to normality)
bartlett_stat, bartlett_p = bartlett(*group_data)
print("=== Variance Homogeneity Tests ===\n")
print(f"Levene's test: statistic={levene_stat:.3f}, p={levene_p:.4f}")
print(f"Bartlett's test: statistic={bartlett_stat:.3f}, p={bartlett_p:.4f}")
    if levene_p < 0.05:
        print("\n⚠ Variances are significantly different")
        print("Consider Welch's ANOVA instead of standard ANOVA")
        # For two groups, Welch's t-test handles unequal variances;
        # for more than two groups, use a Welch-type ANOVA from a dedicated package
        if len(group_data) == 2:
            from scipy.stats import ttest_ind
            t_stat, t_p = ttest_ind(group_data[0], group_data[1], equal_var=False)
            print(f"Welch's t-test: t={t_stat:.3f}, p={t_p:.4f}")
    else:
        print("\n✓ Variances are homogeneous")
        print("Standard ANOVA is appropriate")
return levene_p, bartlett_p
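A usage sketch on synthetic data with deliberately unequal spreads (all values below are made up):

np.random.seed(0)
df_demo = pd.DataFrame({
    'group': np.repeat(['A', 'B'], 100),
    'value': np.concatenate([
        np.random.normal(0, 1, 100),   # smaller spread
        np.random.normal(0, 3, 100)    # larger spread
    ])
})
variance_based_tests(df_demo, group_col='group', value_col='value')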
Summary: Key Takeaways
# Quick Reference Card
variance_concepts = {
"Definition": "Average squared deviation from the mean",
"Population": "σ² = Σ(x - μ)² / N",
"Sample": "s² = Σ(x - x̄)² / (n - 1)",
"Properties": [
"Always non-negative",
"Scale-sensitive (multiplies by c²)",
"Additive for independent variables"
],
"Applications": [
"Feature selection (low variance removal)",
"Bias-variance tradeoff",
"ANOVA and hypothesis testing",
"Risk assessment (finance)",
"Quality control (manufacturing)"
],
"Interpretation": {
"Low variance": "Data clustered tightly around mean",
"High variance": "Data widely spread, more uncertainty",
"Zero variance": "All values identical (constant feature)"
}
}
Key Takeaway: Variance is the foundation of statistical thinking in data science. It quantifies uncertainty, drives model selection through the bias-variance tradeoff, and appears in nearly every statistical test and machine learning algorithm. Mastery of variance concepts enables better feature engineering, model evaluation, and interpretation of results. Remember: understanding variance means understanding the uncertainty in your data and models.