Introduction to Linear Regression
Linear regression is one of the most fundamental and widely used statistical techniques in data science and machine learning. It models the relationship between a dependent variable (target) and one or more independent variables (features) by fitting a linear equation to observed data. This technique forms the foundation for many more complex algorithms and provides interpretable insights into variable relationships.
Key Concepts
- Simple Linear Regression: Models relationship between one independent variable and the dependent variable
- Multiple Linear Regression: Models relationship between multiple independent variables and the dependent variable
- Best Fit Line: Minimizes the sum of squared residuals (errors)
- Coefficients: Represent the strength and direction of relationships
- R-squared: Measures how well the model explains the variance in the data
- Assumptions: Linearity, independence, homoscedasticity, normality of residuals
1. Simple Linear Regression
Theoretical Foundation
The simple linear regression model is expressed as:
y = β₀ + β₁x + ε
Where:
- y = dependent variable (target)
- x = independent variable (feature)
- β₀ = y-intercept (value of y when x = 0)
- β₁ = slope (change in y per unit change in x)
- ε = error term (residual)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
# --- Generate and visualize synthetic data for simple linear regression ---
np.random.seed(42)

# One feature, 100 samples; true model: y = 15 + 2.5*x + N(0, 5) noise.
X = np.random.randn(100, 1) * 10
true_slope = 2.5
true_intercept = 15
y = true_intercept + true_slope * X.flatten() + np.random.randn(100) * 5

# Collect the feature/target pair in a tidy frame for inspection.
df = pd.DataFrame({'X': X.flatten(), 'y': y})
print("Data Sample:")
print(df.head())

# Scatter plot of the raw data.
plt.figure(figsize=(10, 6))
plt.scatter(df['X'], df['y'], alpha=0.6, edgecolors='k')
plt.xlabel('X (Independent Variable)')
plt.ylabel('y (Dependent Variable)')
plt.title('Simple Linear Regression Data')
plt.grid(True, alpha=0.3)
plt.show()
Implementing Simple Linear Regression
# Method 1: fit with scikit-learn's OLS estimator.
from sklearn.linear_model import LinearRegression

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    df[['X']], df['y'], test_size=0.2, random_state=42
)

# Fit on the training split, predict on the held-out split.
model_sklearn = LinearRegression()
model_sklearn.fit(X_train, y_train)
y_pred_sklearn = model_sklearn.predict(X_test)

# Report the fitted parameters.
print("Scikit-learn Model:")
print(f" Intercept (β₀): {model_sklearn.intercept_:.4f}")
print(f" Slope (β₁): {model_sklearn.coef_[0]:.4f}")
# Method 2: Manual calculation via least squares
def manual_linear_regression(X, y):
    """
    Estimate simple linear regression coefficients directly.

    Solves the least-squares problem min ||Aβ - y||² where A = [1, X].
    Uses `np.linalg.lstsq` rather than explicitly inverting XᵀX (the
    textbook normal equation β = (XᵀX)⁻¹Xᵀy): forming the inverse is
    numerically unstable when XᵀX is ill-conditioned, while lstsq solves
    the same problem via a stable SVD-based routine.

    Parameters
    ----------
    X : array-like, shape (n,) or (n, 1)
        Single-feature values.
    y : array-like, shape (n,)
        Target values.

    Returns
    -------
    (intercept, slope) : tuple of floats
    """
    X = np.asarray(X, dtype=float).reshape(len(X), -1)
    # Prepend a column of ones so the first coefficient is the intercept.
    design = np.column_stack([np.ones(len(X)), X])
    beta, *_ = np.linalg.lstsq(design, np.asarray(y, dtype=float), rcond=None)
    return beta[0], beta[1]  # intercept, slope
# Recover the coefficients by hand and compare against scikit-learn.
intercept_manual, slope_manual = manual_linear_regression(X_train.values, y_train.values)
print(f"\nManual Calculation:")
print(f" Intercept (β₀): {intercept_manual:.4f}")
print(f" Slope (β₁): {slope_manual:.4f}")

# Method 3: statsmodels OLS adds full inference (std errors, p-values, CIs).
X_with_const = sm.add_constant(X_train)  # Add intercept term
model_stats = sm.OLS(y_train, X_with_const).fit()
print("\nStatsmodels Results:")
print(model_stats.summary())
Visualization of Regression Line
def plot_regression_results(X, y, model, title="Linear Regression Results"):
    """
    Plot the fitted line, residuals, and residual histogram, then print
    R²/MSE/RMSE for the data passed in.
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Panel 1: data with the fitted regression line.
    X_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
    y_pred_range = model.predict(X_range)
    axes[0].scatter(X, y, alpha=0.6, label='Actual')
    axes[0].plot(X_range, y_pred_range, 'r-', linewidth=2, label='Regression Line')
    axes[0].set_xlabel('X')
    axes[0].set_ylabel('y')
    axes[0].set_title(title)
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Panel 2: residuals vs predictions (should show no structure).
    y_pred = model.predict(X)
    residuals = y - y_pred
    axes[1].scatter(y_pred, residuals, alpha=0.6)
    axes[1].axhline(y=0, color='r', linestyle='--')
    axes[1].set_xlabel('Predicted Values')
    axes[1].set_ylabel('Residuals')
    axes[1].set_title('Residual Plot')
    axes[1].grid(True, alpha=0.3)

    # Panel 3: residual distribution (should look roughly normal).
    axes[2].hist(residuals, bins=20, edgecolor='black', alpha=0.7)
    axes[2].set_xlabel('Residuals')
    axes[2].set_ylabel('Frequency')
    axes[2].set_title('Residual Distribution')
    axes[2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Summary metrics.
    r2 = r2_score(y, y_pred)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)
    print(f"\nModel Performance:")
    print(f" R² Score: {r2:.4f}")
    print(f" MSE: {mse:.4f}")
    print(f" RMSE: {rmse:.4f}")


# Plot results
plot_regression_results(X_train.values, y_train.values, model_sklearn)
2. Multiple Linear Regression
Extending to Multiple Features
# --- Build a synthetic multi-feature dataset with known coefficients ---
np.random.seed(42)
n_samples = 200

# X2 and X4 are deliberately derived from X1/X2 so that the dataset
# exhibits multicollinearity for the later diagnostics section.
X1 = np.random.randn(n_samples) * 10
X2 = 0.7 * X1 + np.random.randn(n_samples) * 3  # Correlated with X1
X3 = np.random.randn(n_samples) * 8
X4 = 0.3 * X1 + 0.5 * X2 + np.random.randn(n_samples) * 2

# True relationship: y = 10 + 2*X1 + 1.5*X2 - X3 + 0.5*X4 + noise
true_coef = [10, 2, 1.5, -1, 0.5]
y = (true_coef[0]
     + true_coef[1] * X1
     + true_coef[2] * X2
     + true_coef[3] * X3
     + true_coef[4] * X4
     + np.random.randn(n_samples) * 5)

# Assemble everything into one frame for inspection.
df_multi = pd.DataFrame({'X1': X1, 'X2': X2, 'X3': X3, 'X4': X4, 'y': y})

print("Multiple Linear Regression Data:")
print(df_multi.head())
print(f"\nCorrelation Matrix:")
print(df_multi.corr().round(3))
Multiple Regression Implementation
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Feature matrix and target.
X_multi = df_multi[['X1', 'X2', 'X3', 'X4']]
y_multi = df_multi['y']

# Hold out 20% for testing.
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# Standardize features so coefficient magnitudes are directly comparable.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_multi)
X_test_scaled = scaler.transform(X_test_multi)

# Fit OLS on the scaled features and predict on the held-out split.
model_multi = LinearRegression()
model_multi.fit(X_train_scaled, y_train_multi)
y_pred_multi = model_multi.predict(X_test_scaled)

# Fitted vs true parameters.
print("Multiple Linear Regression Results:")
print("=" * 50)
print(f"Intercept (β₀): {model_multi.intercept_:.4f}")
print("\nCoefficients:")
for i, feature in enumerate(['X1', 'X2', 'X3', 'X4']):
    print(f" {feature} (β_{i+1}): {model_multi.coef_[i]:.4f}")
print(f"\nTrue Coefficients:")
for feature, coef in zip(['X1', 'X2', 'X3', 'X4'], true_coef[1:]):
    print(f" {feature}: {coef:.4f}")

# Held-out performance.
r2_multi = r2_score(y_test_multi, y_pred_multi)
mse_multi = mean_squared_error(y_test_multi, y_pred_multi)
rmse_multi = np.sqrt(mse_multi)
print(f"\nModel Performance:")
print(f" R² Score: {r2_multi:.4f}")
print(f" MSE: {mse_multi:.4f}")
print(f" RMSE: {rmse_multi:.4f}")
Feature Importance Analysis
def analyze_feature_importance(model, feature_names, X_train, y_train):
    """
    Tabulate sklearn coefficients next to statsmodels inference statistics.

    Refits an OLS model via statsmodels to obtain standard errors, p-values
    and t-statistics for each feature; the intercept row carries NaN for the
    inference columns.
    """
    # Refit with an explicit intercept column so statsmodels reports inference.
    design = sm.add_constant(X_train)
    ols_fit = sm.OLS(y_train, design).fit()

    results = pd.DataFrame({
        'Feature': ['Intercept'] + feature_names,
        'Coefficient': [model.intercept_] + list(model.coef_),
        'Std Error': [np.nan] + list(ols_fit.bse[1:]),
        'p-value': [np.nan] + list(ols_fit.pvalues[1:]),
        't-statistic': [np.nan] + list(ols_fit.tvalues[1:])
    })

    # Magnitude column for importance ranking.
    results['abs_coefficient'] = np.abs(results['Coefficient'])
    return results
# Tabulate importance for the fitted multi-feature model.
importance_results = analyze_feature_importance(
    model_multi, ['X1', 'X2', 'X3', 'X4'], X_train_scaled, y_train_multi
)
print("\nFeature Importance Analysis:")
print(importance_results.round(4).to_string(index=False))

# Bar chart of signed coefficients (green positive, red negative).
plt.figure(figsize=(10, 6))
features = ['X1', 'X2', 'X3', 'X4']
coefs = model_multi.coef_
colors = ['green' if c > 0 else 'red' for c in coefs]
plt.barh(features, coefs, color=colors, alpha=0.7)
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.xlabel('Coefficient Value')
plt.title('Feature Coefficients (Multiple Linear Regression)')
plt.grid(True, alpha=0.3)
plt.show()
3. Model Diagnostics
Residual Analysis
def comprehensive_residual_analysis(model, X, y, feature_names):
    """
    Run a full residual diagnostic: plots plus formal statistical tests.

    Produces six panels (residuals vs fitted, Q-Q, scale-location, residuals
    vs each predictor) and prints Shapiro-Wilk, Durbin-Watson and
    Breusch-Pagan test results.

    Parameters
    ----------
    model : fitted estimator with a `predict` method
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix the model was fitted on. (Fix: the original crashed
        when given an ndarray — as the call site below does — because it
        indexed `X[feature]` by column name; ndarrays now fall back to
        positional column indexing.)
    y : array-like of targets
    feature_names : list of str, in the same column order as X

    Returns
    -------
    (residuals, standardized_residuals)
    """
    y_pred = model.predict(X)
    residuals = y - y_pred
    standardized_residuals = (residuals - residuals.mean()) / residuals.std()

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))

    # 1. Residuals vs fitted: curvature here indicates non-linearity.
    axes[0, 0].scatter(y_pred, residuals, alpha=0.6)
    axes[0, 0].axhline(y=0, color='r', linestyle='--')
    axes[0, 0].set_xlabel('Fitted Values')
    axes[0, 0].set_ylabel('Residuals')
    axes[0, 0].set_title('Residuals vs Fitted')
    axes[0, 0].grid(True, alpha=0.3)

    # 2. Q-Q plot: departures from the line indicate non-normal residuals.
    from scipy import stats
    stats.probplot(residuals, dist="norm", plot=axes[0, 1])
    axes[0, 1].set_title('Q-Q Plot')
    axes[0, 1].grid(True, alpha=0.3)

    # 3. Scale-location: a trend indicates heteroscedasticity.
    sqrt_abs_residuals = np.sqrt(np.abs(standardized_residuals))
    axes[0, 2].scatter(y_pred, sqrt_abs_residuals, alpha=0.6)
    axes[0, 2].set_xlabel('Fitted Values')
    axes[0, 2].set_ylabel('√|Standardized Residuals|')
    axes[0, 2].set_title('Scale-Location Plot')
    axes[0, 2].grid(True, alpha=0.3)

    # 4. Residuals vs each predictor. With more than 3 features the extra
    # plots share axes[1, 2] and overdraw each other (kept as before).
    for i, feature in enumerate(feature_names):
        ax = axes[1, i] if i < 3 else axes[1, 2]
        # Fix: accept either a DataFrame (name lookup) or ndarray (position).
        col = X[feature] if hasattr(X, 'columns') else X[:, i]
        ax.scatter(col, residuals, alpha=0.6)
        ax.axhline(y=0, color='r', linestyle='--')
        ax.set_xlabel(feature)
        ax.set_ylabel('Residuals')
        ax.set_title(f'Residuals vs {feature}')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Formal tests.
    shapiro_stat, shapiro_p = stats.shapiro(residuals)  # normality
    from statsmodels.stats.stattools import durbin_watson
    dw_stat = durbin_watson(residuals)  # autocorrelation of residuals
    from statsmodels.stats.diagnostic import het_breuschpagan
    bp_test = het_breuschpagan(residuals, sm.add_constant(X))  # heteroscedasticity

    print("Residual Diagnostics:")
    print("=" * 50)
    print(f"Shapiro-Wilk Test (Normality):")
    print(f" Statistic: {shapiro_stat:.4f}")
    print(f" p-value: {shapiro_p:.4f}")
    print(f" {'✓ Normal' if shapiro_p > 0.05 else '✗ Non-normal'}")
    print(f"\nDurbin-Watson Test (Autocorrelation):")
    print(f" Statistic: {dw_stat:.4f}")
    print(f" {'✓ No autocorrelation' if 1.5 < dw_stat < 2.5 else '✗ Possible autocorrelation'}")
    print(f"\nBreusch-Pagan Test (Homoscedasticity):")
    print(f" LM Statistic: {bp_test[0]:.4f}")
    print(f" p-value: {bp_test[1]:.4f}")
    print(f" {'✓ Homoscedastic' if bp_test[1] > 0.05 else '✗ Heteroscedastic'}")

    return residuals, standardized_residuals


# Perform residual analysis
residuals, std_residuals = comprehensive_residual_analysis(
    model_multi, X_train_scaled, y_train_multi, ['X1', 'X2', 'X3', 'X4']
)
Multicollinearity Detection
def detect_multicollinearity(X):
    """
    Compute the Variance Inflation Factor (VIF) for each feature.

    VIF ≈ 1 means a feature is uncorrelated with the others; values above
    5-10 indicate problematic multicollinearity.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (columns are features).

    Returns
    -------
    pandas.DataFrame with 'Feature' and 'VIF' columns, highest VIF first.
    """
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # Include an intercept column; feature i of X is column i+1 of `design`.
    design = sm.add_constant(X)
    vif_data = pd.DataFrame({
        'Feature': X.columns,
        'VIF': [variance_inflation_factor(design.values, i + 1)
                for i in range(len(X.columns))],
    })
    return vif_data.sort_values('VIF', ascending=False)
# Report VIFs for the multi-feature dataset.
vif_results = detect_multicollinearity(X_multi)
print("Multicollinearity Detection (VIF):")
print("=" * 40)
print(vif_results.to_string(index=False))
print("\nInterpretation:")
print(" VIF = 1: No correlation")
print(" VIF 1-5: Moderate correlation")
print(" VIF > 5: High correlation (multicollinearity)")
print(" VIF > 10: Very high correlation (severe multicollinearity)")

# Heatmap of pairwise feature correlations.
plt.figure(figsize=(8, 6))
corr_matrix = X_multi.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()
4. Regularized Linear Regression
Ridge Regression (L2 Regularization)
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
def compare_regularization_models(X_train, y_train, X_test, y_test):
    """
    Grid-search Ridge, Lasso and ElasticNet and report held-out metrics.

    Returns (results_dict, best_ridge, best_lasso, best_elasticnet).
    """
    alpha_grid = [0.001, 0.01, 0.1, 1, 10, 100]

    def _fit_cv(estimator, param_grid):
        # 5-fold CV grid search maximizing R²; returns the fitted search,
        # its best estimator, and that estimator's test-set predictions.
        search = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
        search.fit(X_train, y_train)
        best = search.best_estimator_
        return search, best, best.predict(X_test)

    results = {}

    # Ridge (L2): shrinks coefficients toward zero, never exactly to zero.
    ridge_cv, ridge_best, ridge_pred = _fit_cv(Ridge(), {'alpha': alpha_grid})
    results['Ridge'] = {
        'best_alpha': ridge_cv.best_params_['alpha'],
        'r2': r2_score(y_test, ridge_pred),
        'rmse': np.sqrt(mean_squared_error(y_test, ridge_pred))
    }

    # Lasso (L1): can zero out coefficients — report how many survive.
    lasso_cv, lasso_best, lasso_pred = _fit_cv(Lasso(), {'alpha': alpha_grid})
    results['Lasso'] = {
        'best_alpha': lasso_cv.best_params_['alpha'],
        'r2': r2_score(y_test, lasso_pred),
        'rmse': np.sqrt(mean_squared_error(y_test, lasso_pred)),
        'nonzero_features': sum(abs(lasso_best.coef_) > 1e-6)
    }

    # ElasticNet: blend of L1 and L2, tuned over alpha and l1_ratio.
    elastic_cv, elastic_best, elastic_pred = _fit_cv(
        ElasticNet(),
        {'alpha': [0.001, 0.01, 0.1, 1, 10],
         'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}
    )
    results['ElasticNet'] = {
        'best_alpha': elastic_cv.best_params_['alpha'],
        'best_l1_ratio': elastic_cv.best_params_['l1_ratio'],
        'r2': r2_score(y_test, elastic_pred),
        'rmse': np.sqrt(mean_squared_error(y_test, elastic_pred))
    }

    return results, ridge_best, lasso_best, elastic_best
# Run the regularization comparison on the scaled multi-feature data.
reg_results, ridge_model, lasso_model, elastic_model = compare_regularization_models(
    X_train_scaled, y_train_multi, X_test_scaled, y_test_multi
)

print("Regularization Model Comparison:")
print("=" * 50)
for model_name, metrics in reg_results.items():
    print(f"\n{model_name}:")
    for metric, value in metrics.items():
        # Floats get 4 decimal places; everything else prints verbatim.
        if isinstance(value, float):
            print(f" {metric}: {value:.4f}")
        else:
            print(f" {metric}: {value}")
Visualizing Regularization Effects
def plot_regularization_path(X_train, y_train):
    """
    Plot how Ridge and Lasso coefficients shrink as alpha grows.
    """
    alphas = np.logspace(-3, 2, 100)

    def _coef_path(make_model):
        # Refit across the alpha grid and stack the coefficient vectors.
        return np.array([make_model(a).fit(X_train, y_train).coef_ for a in alphas])

    ridge_coefs = _coef_path(lambda a: Ridge(alpha=a))
    # Lasso needs extra iterations to converge at small alphas.
    lasso_coefs = _coef_path(lambda a: Lasso(alpha=a, max_iter=10000))

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    n_features = len(X_train[0])

    for ax, coefs, title in (
        (axes[0], ridge_coefs, 'Ridge (L2) Regularization Path'),
        (axes[1], lasso_coefs, 'Lasso (L1) Regularization Path'),
    ):
        for i in range(n_features):
            ax.plot(alphas, coefs[:, i], label=f'Feature {i+1}')
        ax.set_xscale('log')
        ax.set_xlabel('Alpha (Regularization Strength)')
        ax.set_ylabel('Coefficient Value')
        ax.set_title(title)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


# Plot regularization paths
plot_regularization_path(X_train_scaled, y_train_multi)
5. Polynomial Regression
Capturing Non-Linear Relationships
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
def polynomial_regression_example():
    """
    Fit polynomial models of degree 1-4 to quadratic data and plot each fit.

    Returns the generated (X_poly, y_poly) data for reuse downstream.
    """
    # Quadratic ground truth: y = 3x² + 2x + 1 + noise.
    np.random.seed(42)
    X_poly = np.random.randn(100, 1) * 2
    y_poly = 3 * X_poly.flatten()**2 + 2 * X_poly.flatten() + 1 + np.random.randn(100) * 2

    degrees = [1, 2, 3, 4]
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    X_range = np.linspace(X_poly.min(), X_poly.max(), 100).reshape(-1, 1)

    for i, degree in enumerate(degrees):
        ax = axes[i // 2, i % 2]

        # Polynomial feature expansion followed by OLS.
        model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        model.fit(X_poly, y_poly)
        y_pred = model.predict(X_range)

        ax.scatter(X_poly, y_poly, alpha=0.6, label='Data')
        ax.plot(X_range, y_pred, 'r-', linewidth=2, label=f'Degree {degree}')
        ax.set_xlabel('X')
        ax.set_ylabel('y')
        ax.set_title(f'Polynomial Regression (Degree {degree})')
        ax.legend()
        ax.grid(True, alpha=0.3)

        # Training-set fit quality, annotated on each panel.
        y_pred_train = model.predict(X_poly)
        r2 = r2_score(y_poly, y_pred_train)
        mse = mean_squared_error(y_poly, y_pred_train)
        ax.text(0.05, 0.95, f'R² = {r2:.4f}\nMSE = {mse:.4f}',
                transform=ax.transAxes,
                verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.tight_layout()
    plt.show()
    return X_poly, y_poly


# Run polynomial regression example
X_poly, y_poly = polynomial_regression_example()
Cross-Validation for Degree Selection
from sklearn.model_selection import cross_val_score
def select_optimal_degree(X, y, max_degree=10):
    """
    Pick the polynomial degree with the best 5-fold CV R² score.

    Plots mean ± std of the CV score per degree and returns the best degree.
    """
    degrees = range(1, max_degree + 1)
    cv_scores, cv_stds = [], []

    for degree in degrees:
        pipeline = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
        cv_scores.append(scores.mean())
        cv_stds.append(scores.std())

    # Highest mean CV score wins.
    optimal_degree = degrees[np.argmax(cv_scores)]

    plt.figure(figsize=(10, 6))
    plt.errorbar(degrees, cv_scores, yerr=cv_stds, capsize=5, marker='o')
    plt.axvline(x=optimal_degree, color='r', linestyle='--',
                label=f'Optimal Degree = {optimal_degree}')
    plt.xlabel('Polynomial Degree')
    plt.ylabel('Cross-Validation R² Score')
    plt.title('Cross-Validation for Polynomial Degree Selection')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    print(f"Optimal Polynomial Degree: {optimal_degree}")
    print(f"Best CV R² Score: {cv_scores[optimal_degree-1]:.4f} (+/- {cv_stds[optimal_degree-1]:.4f})")
    return optimal_degree


# Select optimal degree
optimal_deg = select_optimal_degree(X_poly, y_poly)
6. Real-World Examples
House Price Prediction
def house_price_prediction():
    """
    End-to-end example: predict synthetic house prices from five features.

    Generates 500 houses, fits a linear model on standardized features,
    reports metrics and feature importance, and plots actual vs predicted.

    Returns (fitted_model, fitted_scaler).
    """
    # --- Synthetic data --------------------------------------------------
    np.random.seed(42)
    n_houses = 500

    sqft = np.random.normal(2000, 500, n_houses).clip(800, 5000)
    bedrooms = np.random.randint(1, 5, n_houses)
    bathrooms = np.random.uniform(1, 4, n_houses).round(1)
    age = np.random.randint(0, 50, n_houses)
    location_score = np.random.uniform(0, 10, n_houses)

    # Known generating formula plus Gaussian noise.
    base_price = 50000
    price = (base_price +
             150 * sqft +
             10000 * bedrooms +
             8000 * bathrooms -
             500 * age +
             15000 * location_score +
             np.random.normal(0, 30000, n_houses))

    house_df = pd.DataFrame({
        'sqft': sqft,
        'bedrooms': bedrooms,
        'bathrooms': bathrooms,
        'age': age,
        'location_score': location_score,
        'price': price
    })

    print("House Price Dataset:")
    print(house_df.head())
    print(f"\nDataset Shape: {house_df.shape}")

    # --- Train / evaluate ------------------------------------------------
    X = house_df[['sqft', 'bedrooms', 'bathrooms', 'age', 'location_score']]
    y = house_df['price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize so coefficient magnitudes are comparable across features.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    print("\nModel Performance:")
    print(f" R² Score: {r2_score(y_test, y_pred):.4f}")
    print(f" RMSE: ${np.sqrt(mean_squared_error(y_test, y_pred)):,.0f}")

    # Rank features by absolute coefficient on the standardized scale.
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': model.coef_,
        'Abs_Importance': np.abs(model.coef_)
    }).sort_values('Abs_Importance', ascending=False)
    print("\nFeature Importance:")
    print(feature_importance.to_string(index=False))

    # --- Actual vs predicted plot ----------------------------------------
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    # Diagonal = perfect prediction.
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Actual Price')
    plt.ylabel('Predicted Price')
    plt.title('House Price Predictions')
    plt.grid(True, alpha=0.3)
    plt.show()

    return model, scaler


# Run house price prediction
house_model, house_scaler = house_price_prediction()
Sales Forecasting
def sales_forecasting():
    """
    Forecast monthly sales that combine trend, yearly seasonality and noise.

    Builds trend / Fourier / lag features, trains on the first 80% of the
    series (chronological split), and plots the forecast against actuals.

    Returns (fitted_model, feature_importance_frame).
    """
    # --- Synthetic monthly series ----------------------------------------
    np.random.seed(42)
    months = 36  # 3 years
    time = np.arange(months)

    trend = 1000 + 50 * time
    seasonal = 200 * np.sin(2 * np.pi * time / 12)  # Yearly seasonality
    random_noise = np.random.randn(months) * 100
    sales = trend + seasonal + random_noise

    # Fourier terms encode month-of-year cyclically; time/time² capture trend.
    sales_df = pd.DataFrame({
        'month': time,
        'sales': sales,
        'month_sin': np.sin(2 * np.pi * time / 12),
        'month_cos': np.cos(2 * np.pi * time / 12),
        'time': time,
        'time_squared': time ** 2
    })

    # Lagged sales give the model autoregressive signal.
    for lag in [1, 2, 3, 6, 12]:
        sales_df[f'lag_{lag}'] = sales_df['sales'].shift(lag)
    sales_df = sales_df.dropna()  # the first 12 rows lack lag_12

    feature_cols = ['time', 'time_squared', 'month_sin', 'month_cos',
                    'lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_12']
    X_sales = sales_df[feature_cols]
    y_sales = sales_df['sales']

    # Chronological split — no shuffling for time series.
    split_idx = int(len(X_sales) * 0.8)
    X_train_sales, X_test_sales = X_sales[:split_idx], X_sales[split_idx:]
    y_train_sales, y_test_sales = y_sales[:split_idx], y_sales[split_idx:]

    model_sales = LinearRegression()
    model_sales.fit(X_train_sales, y_train_sales)
    y_pred_sales = model_sales.predict(X_test_sales)

    print("Sales Forecasting Results:")
    print("=" * 50)
    print(f"R² Score: {r2_score(y_test_sales, y_pred_sales):.4f}")
    print(f"RMSE: ${np.sqrt(mean_squared_error(y_test_sales, y_pred_sales)):,.2f}")

    importance = pd.DataFrame({
        'Feature': feature_cols,
        'Coefficient': model_sales.coef_,
        'Abs_Importance': np.abs(model_sales.coef_)
    }).sort_values('Abs_Importance', ascending=False)
    print("\nFeature Importance:")
    print(importance.to_string(index=False))

    # Forecast vs actuals on the held-out tail of the series.
    plt.figure(figsize=(12, 6))
    plt.plot(y_test_sales.index, y_test_sales.values, label='Actual', alpha=0.7)
    plt.plot(y_test_sales.index, y_pred_sales, label='Predicted', alpha=0.7)
    plt.xlabel('Time Period')
    plt.ylabel('Sales')
    plt.title('Sales Forecasting with Linear Regression')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    return model_sales, importance


# Run sales forecasting
sales_model, sales_importance = sales_forecasting()
7. Advanced Topics
Weighted Linear Regression
def weighted_linear_regression(X, y, weights=None):
    """
    Fit (optionally weighted) least squares and return (intercept, coefs).

    Solves β = argmin Σ wᵢ (yᵢ - xᵢβ)². With ``weights=None`` every
    observation gets weight 1 (ordinary least squares).

    Improvements over the original normal-equation version:
    - uses `np.linalg.lstsq` on a row-scaled design matrix instead of
      forming (XᵀWX)⁻¹ explicitly, which is numerically safer;
    - avoids building the dense n×n ``np.diag(weights)`` matrix
      (O(n²) memory for what is conceptually a per-row scale).

    Parameters
    ----------
    X : array-like, shape (n,) or (n, k)
        Feature matrix (a 1-D array is treated as a single feature).
    y : array-like, shape (n,)
        Target values.
    weights : array-like of shape (n,), optional
        Non-negative per-observation weights.

    Returns
    -------
    (intercept, coefs) where ``coefs`` has shape (k,).
    """
    X = np.asarray(X, dtype=float).reshape(len(X), -1)
    y = np.asarray(y, dtype=float)
    # Prepend an all-ones column so beta[0] is the intercept.
    design = np.column_stack([np.ones(len(X)), X])

    if weights is None:
        beta, *_ = np.linalg.lstsq(design, y, rcond=None)
    else:
        # Scaling each row by √wᵢ makes plain least squares equivalent to
        # the weighted normal equations β = (XᵀWX)⁻¹XᵀWy.
        w_sqrt = np.sqrt(np.asarray(weights, dtype=float))
        beta, *_ = np.linalg.lstsq(design * w_sqrt[:, None], y * w_sqrt, rcond=None)

    return beta[0], beta[1:]
# Compare an unweighted fit against a recency-weighted fit on the same data.
np.random.seed(42)
X_weighted = np.random.randn(100, 1) * 10
y_weighted = 3 * X_weighted.flatten() + 5 + np.random.randn(100) * 3

# Linearly increasing weights emphasize later observations.
weights = np.linspace(0.1, 1, len(X_weighted))

intercept_unweighted, coef_unweighted = weighted_linear_regression(X_weighted, y_weighted)
intercept_weighted, coef_weighted = weighted_linear_regression(X_weighted, y_weighted, weights)

print("Weighted Linear Regression Comparison:")
print("=" * 50)
print(f"Unweighted: y = {intercept_unweighted:.4f} + {coef_unweighted[0]:.4f}x")
print(f"Weighted: y = {intercept_weighted:.4f} + {coef_weighted[0]:.4f}x")
Robust Regression
from sklearn.linear_model import HuberRegressor, RANSACRegressor
def robust_regression_comparison(X, y):
    """
    Fit OLS, Huber and RANSAC on the same data and plot how they differ.

    Returns a dict of the three fitted models keyed by name.
    """
    # Plain OLS — sensitive to outliers.
    lr = LinearRegression()
    lr.fit(X, y)

    # Huber loss down-weights large residuals.
    huber = HuberRegressor(epsilon=1.35)
    huber.fit(X, y)

    # RANSAC fits on random inlier subsets and ignores outliers entirely.
    ransac = RANSACRegressor(min_samples=0.5, residual_threshold=5)
    ransac.fit(X, y)

    X_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)

    plt.figure(figsize=(12, 5))

    # Left panel: all three fitted lines over the data.
    plt.subplot(1, 2, 1)
    plt.scatter(X, y, alpha=0.6, label='Data')
    plt.plot(X_range, lr.predict(X_range), label='Linear Regression', linewidth=2)
    plt.plot(X_range, huber.predict(X_range), label='Huber Regression', linewidth=2)
    plt.plot(X_range, ransac.predict(X_range), label='RANSAC', linewidth=2)
    plt.xlabel('X')
    plt.ylabel('y')
    plt.title('Robust Regression Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Right panel: residuals of OLS vs Huber.
    plt.subplot(1, 2, 2)
    plt.scatter(lr.predict(X), y - lr.predict(X), alpha=0.5, label='Linear', s=30)
    plt.scatter(huber.predict(X), y - huber.predict(X), alpha=0.5, label='Huber', s=30)
    plt.axhline(y=0, color='k', linestyle='--')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return {'Linear': lr, 'Huber': huber, 'RANSAC': ransac}


# Build a clean linear relationship, then corrupt 10 points with large noise.
np.random.seed(42)
X_robust = np.random.randn(100, 1) * 10
y_robust = 2 * X_robust.flatten() + 3 + np.random.randn(100) * 2
outlier_indices = np.random.choice(len(X_robust), 10, replace=False)
y_robust[outlier_indices] = y_robust[outlier_indices] + np.random.randn(10) * 50

# Run comparison
robust_models = robust_regression_comparison(X_robust, y_robust)
8. Model Interpretation
Coefficient Interpretation
def interpret_coefficients(model, feature_names, target_name):
    """
    Print a plain-English reading of a fitted linear model's parameters.

    `model` needs `intercept_` and `coef_` attributes (sklearn convention).
    """
    intercept = model.intercept_
    coefficients = model.coef_

    # Spell out the fitted equation term by term.
    print(f"Linear Regression Model Interpretation")
    print("=" * 60)
    print(f"Equation: {target_name} = {intercept:.4f}")
    for name, coef in zip(feature_names, coefficients):
        print(f" + ({coef:.4f}) * {name}")

    # Per-term interpretation, holding the other features constant.
    print(f"\nInterpretation:")
    print(f" Intercept ({intercept:.4f}):")
    print(f" Predicted value when all features are zero.")
    for name, coef in zip(feature_names, coefficients):
        if coef > 0:
            print(f"\n {name} (+{coef:.4f}):")
            print(f" For each 1-unit increase in {name}, {target_name}")
            print(f" increases by {coef:.4f} units, holding all else constant.")
        else:
            print(f"\n {name} ({coef:.4f}):")
            print(f" For each 1-unit increase in {name}, {target_name}")
            print(f" decreases by {abs(coef):.4f} units, holding all else constant.")
# Interpret the fitted house price model in business terms.
feature_names = ['sqft', 'bedrooms', 'bathrooms', 'age', 'location_score']
interpret_coefficients(house_model, feature_names, 'price')
Partial Dependence Plots
def plot_partial_dependence(model, X, feature_names, feature_idx, n_points=100):
    """
    Draw one-way partial dependence curves for the selected feature indices.
    """
    from sklearn.inspection import partial_dependence

    fig, axes = plt.subplots(1, len(feature_idx), figsize=(15, 4))
    # A single subplot comes back as a bare Axes; normalize to a list.
    if len(feature_idx) == 1:
        axes = [axes]

    for ax, idx in zip(axes, feature_idx):
        # Average model prediction as feature `idx` sweeps its grid.
        pdp = partial_dependence(model, X, [idx], kind='average', grid_resolution=n_points)
        # NOTE(review): newer sklearn releases expose this result under
        # 'grid_values' instead of 'values' — confirm against the installed
        # version before relying on this key.
        ax.plot(pdp['values'][0], pdp['average'][0], linewidth=2)
        ax.set_xlabel(feature_names[idx])
        ax.set_ylabel('Partial Dependence')
        ax.set_title(f'Effect of {feature_names[idx]} on Prediction')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


# Plot partial dependence for house price model
# Note: This requires the model to be fitted on original (unscaled) data
# For scaled data, we need to transform back or use the scaler
9. Model Selection and Comparison
def compare_regression_models(X_train, y_train, X_test, y_test):
    """
    Fit a panel of regressors and rank them by held-out R².

    Returns a DataFrame with one row per model (R², RMSE, MAE), best first.
    """
    from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

    models = {
        'Linear Regression': LinearRegression(),
        'Ridge': Ridge(alpha=1.0),
        'Lasso': Lasso(alpha=1.0),
        'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
        'Decision Tree': DecisionTreeRegressor(max_depth=5),
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=5),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=3)
    }

    rows = []
    for name, model in models.items():
        # Same train/evaluate cycle for every candidate.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rows.append({
            'Model': name,
            'R² Score': r2_score(y_test, y_pred),
            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
            'MAE': np.mean(np.abs(y_test - y_pred)),
        })

    # Best model first.
    return pd.DataFrame(rows).sort_values('R² Score', ascending=False)
# Rank the candidate models on the scaled multi-feature problem.
model_comparison = compare_regression_models(
    X_train_scaled, y_train_multi, X_test_scaled, y_test_multi
)
print("Model Comparison Results:")
print("=" * 70)
print(model_comparison.round(4).to_string(index=False))
10. Best Practices and Common Pitfalls
Assumption Checking Checklist
def regression_assumption_checklist(model, X, y):
    """
    Check the five classical OLS assumptions for a fitted model.

    Runs linearity, homoscedasticity (Breusch-Pagan), normality
    (Shapiro-Wilk), independence (Durbin-Watson) and multicollinearity (VIF)
    checks, prints a pass/fail summary, and returns the raw results.

    `X` must be a DataFrame (the VIF step uses `.values` and `.shape`).
    """
    y_pred = model.predict(X)
    residuals = y - y_pred

    results = {'assumptions': {}, 'tests': {}}

    # 1. Linearity: residuals should be uncorrelated with fitted values.
    from scipy.stats import pearsonr
    corr, p_value = pearsonr(y_pred, residuals)
    results['assumptions']['linearity'] = abs(corr) < 0.1

    # 2. Homoscedasticity: Breusch-Pagan null = constant residual variance.
    from statsmodels.stats.diagnostic import het_breuschpagan
    X_const = sm.add_constant(X)
    _, bp_pvalue, _, _ = het_breuschpagan(residuals, X_const)
    results['assumptions']['homoscedasticity'] = bp_pvalue > 0.05
    results['tests']['Breusch-Pagan p-value'] = bp_pvalue

    # 3. Normality of residuals (Shapiro-Wilk).
    from scipy.stats import shapiro
    _, shapiro_p = shapiro(residuals)
    results['assumptions']['normality'] = shapiro_p > 0.05
    results['tests']['Shapiro-Wilk p-value'] = shapiro_p

    # 4. Independence: Durbin-Watson near 2 means no autocorrelation.
    from statsmodels.stats.stattools import durbin_watson
    dw = durbin_watson(residuals)
    results['assumptions']['independence'] = 1.5 < dw < 2.5
    results['tests']['Durbin-Watson'] = dw

    # 5. Multicollinearity: worst-feature VIF should stay below 10.
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    vifs = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    results['assumptions']['no_multicollinearity'] = max(vifs) < 10
    results['tests']['max_VIF'] = max(vifs)

    # Human-readable summary.
    print("Regression Assumptions Check")
    print("=" * 50)
    for assumption, satisfied in results['assumptions'].items():
        status = "✓" if satisfied else "✗"
        print(f"{status} {assumption}: {satisfied}")
    print("\nDetailed Tests:")
    for test, value in results['tests'].items():
        print(f" {test}: {value:.4f}")

    return results


# Check assumptions for house price model
# Note: We need to use unscaled data for this check
# NOTE(review): X_train and y_train_multi come from *different* earlier
# datasets than the one house_model was fitted on — verify the intended
# inputs for this call.
assumptions = regression_assumption_checklist(
    house_model, X_train, y_train_multi
)
Common Pitfalls and Solutions
def demonstrate_pitfalls():
    """
    Illustrate two classic linear-regression failure modes side by side:
    a degree-19 polynomial that overfits noisy linear data, and the danger
    of predicting outside the training range (extrapolation). Finishes by
    printing a short list of remedies.
    """
    np.random.seed(42)

    # --- Pitfall 1: overfitting with a very high-degree polynomial ---
    x_vals = np.linspace(0, 1, 50)
    y_vals = 2 * x_vals + 1 + np.random.randn(50) * 0.2
    # Design matrix with columns x^1 .. x^19 (equivalent to vstack(...).T).
    design_deg19 = np.column_stack([x_vals**p for p in range(1, 20)])
    overfit_lr = LinearRegression()
    overfit_lr.fit(design_deg19, y_vals)
    wiggly_fit = overfit_lr.predict(design_deg19)

    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.scatter(x_vals, y_vals, alpha=0.6)
    plt.plot(x_vals, wiggly_fit, 'r-', label='Overfit (Degree 19)')
    plt.plot(x_vals, 2 * x_vals + 1, 'g--', label='True', linewidth=2)
    plt.xlabel('X')
    plt.ylabel('y')
    plt.title('Overfitting')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # --- Pitfall 2: extrapolating beyond the training range ---
    xtr = np.random.rand(80) * 10
    ytr = 2 * xtr + 1 + np.random.randn(80)
    xte = np.random.rand(20) * 10 + 10  # deliberately outside [0, 10]
    extrap_lr = LinearRegression()
    extrap_lr.fit(xtr.reshape(-1, 1), ytr)
    yhat_outside = extrap_lr.predict(xte.reshape(-1, 1))

    plt.subplot(1, 2, 2)
    plt.scatter(xtr, ytr, alpha=0.6, label='Training')
    plt.scatter(xte, yhat_outside, alpha=0.6, label='Extrapolation', color='red')
    grid_x = np.linspace(0, 20, 100)
    plt.plot(grid_x, 2 * grid_x + 1, 'g--', label='True', linewidth=2)
    plt.xlabel('X')
    plt.ylabel('y')
    plt.title('Extrapolation Danger')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\nCommon Pitfall Solutions:")
    print("1. Overfitting: Use regularization (Ridge/Lasso) or cross-validation")
    print("2. Extrapolation: Never predict outside training range")
    print("3. Multicollinearity: Use VIF to detect, remove correlated features")
    print("4. Non-linearity: Consider polynomial features or non-linear models")
    print("5. Outliers: Use robust regression (Huber, RANSAC)")
# Run the pitfall demonstrations (renders plots and prints guidance).
demonstrate_pitfalls()
11. Model Deployment Ready Code
class LinearRegressionModel:
    """
    Deployment-ready linear regression wrapper.

    Bundles feature scaling, training, prediction, evaluation, and
    coefficient reporting behind one small interface.

    NOTE(review): relies on ``StandardScaler`` being imported at module
    level — it is not in the file's visible import block; confirm upstream.
    Coefficients are expressed on the *scaled* feature space.
    """

    def __init__(self, name="Linear Regression Model"):
        """Create an untrained model with its own (unfitted) scaler."""
        self.name = name
        self.model = LinearRegression()
        self.scaler = StandardScaler()
        self.feature_names = None  # populated by fit()
        self.target_name = None    # populated by fit()

    def fit(self, X, y, feature_names=None, target_name="target"):
        """
        Train the linear regression model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
        feature_names : list of str, optional
            Auto-generated as "feature_0", ... when omitted.
        target_name : str
            Label for the target, used only in summaries.

        Returns
        -------
        self, enabling method chaining.
        """
        self.feature_names = feature_names or [f"feature_{i}" for i in range(X.shape[1])]
        self.target_name = target_name
        # Fit the scaler on training data only; predict() reuses the same
        # fitted scaler, avoiding train/test leakage.
        X_scaled = self.scaler.fit_transform(X)
        self.model.fit(X_scaled, y)
        return self

    def predict(self, X):
        """
        Make predictions on new data (scaled with the training-time scaler).
        """
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance on held-out data.

        Returns
        -------
        dict with 'r2', 'rmse', 'mae', and the raw 'predictions'.
        """
        y_pred = self.predict(X_test)
        # Bug fix: the original called mean_absolute_error, which is not in
        # the file's visible sklearn.metrics imports; compute MAE directly
        # (numerically identical).
        mae = float(np.mean(np.abs(np.asarray(y_test) - y_pred)))
        return {
            'r2': r2_score(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'mae': mae,
            'predictions': y_pred
        }

    def get_coefficients(self):
        """
        Return the intercept and a {feature_name: coefficient} mapping.
        Coefficients refer to standardized (scaled) features.
        """
        return {
            'intercept': self.model.intercept_,
            'coefficients': dict(zip(self.feature_names, self.model.coef_))
        }

    def summary(self):
        """
        Print a human-readable summary of features, target, and coefficients.
        """
        print(f"\n{self.name} Summary")
        print("=" * 50)
        print(f"Features: {self.feature_names}")
        print(f"Target: {self.target_name}")
        print(f"\nIntercept: {self.model.intercept_:.4f}")
        print("\nCoefficients:")
        for name, coef in zip(self.feature_names, self.model.coef_):
            print(f" {name}: {coef:.4f}")
# Example usage
# Train on the multi-feature house data (defined earlier in the file) and
# report the learned coefficients.
model = LinearRegressionModel("House Price Model")
model.fit(X_train, y_train_multi, feature_names=['sqft', 'bedrooms', 'bathrooms', 'age', 'location_score'])
model.summary()
# Evaluate on the held-out split; prints R² and RMSE.
metrics = model.evaluate(X_test, y_test_multi)
print(f"\nModel Performance:")
print(f" R²: {metrics['r2']:.4f}")
print(f" RMSE: {metrics['rmse']:.2f}")
Conclusion
Linear regression is a fundamental tool in data science that provides interpretable insights into variable relationships.
Key Takeaways
- Simplicity: Linear regression is easy to understand and implement
- Interpretability: Coefficients directly show the impact of each feature
- Foundation: Many advanced algorithms build upon linear regression concepts
- Assumptions: Linear regression has important assumptions that need checking
- Regularization: Ridge, Lasso, and ElasticNet improve generalization
- Extensions: Polynomial regression captures non-linear relationships
Best Practices Checklist
- [ ] Visualize data before modeling
- [ ] Check for linear relationships
- [ ] Handle missing values appropriately
- [ ] Scale features for regularized models
- [ ] Test for multicollinearity
- [ ] Validate model assumptions
- [ ] Use cross-validation for model selection
- [ ] Monitor for overfitting
- [ ] Interpret coefficients carefully
- [ ] Document model limitations
Linear regression remains a powerful, interpretable tool in the data scientist's toolkit. While more complex models may offer higher predictive power, linear regression provides unmatched transparency and ease of interpretation.