Complete Guide to Linear Regression in Data Science

Introduction to Linear Regression

Linear regression is one of the most fundamental and widely used statistical techniques in data science and machine learning. It models the relationship between a dependent variable (target) and one or more independent variables (features) by fitting a linear equation to observed data. This technique forms the foundation for many more complex algorithms and provides interpretable insights into variable relationships.

Key Concepts

  • Simple Linear Regression: Models relationship between one independent variable and the dependent variable
  • Multiple Linear Regression: Models relationship between multiple independent variables and the dependent variable
  • Best Fit Line: Minimizes the sum of squared residuals (errors)
  • Coefficients: Represent the strength and direction of relationships
  • R-squared: Measures how well the model explains the variance in the data
  • Assumptions: Linearity, independence, homoscedasticity, normality of residuals

1. Simple Linear Regression

Theoretical Foundation

The simple linear regression model is expressed as:

y = β₀ + β₁x + ε

Where:

  • y = dependent variable (target)
  • x = independent variable (feature)
  • β₀ = y-intercept (value of y when x = 0)
  • β₁ = slope (change in y per unit change in x)
  • ε = error term (residual)
# --- Setup: imports and synthetic data for the simple-regression walkthrough ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
# Generate synthetic data
np.random.seed(42)  # fixed seed so results are reproducible
X = np.random.randn(100, 1) * 10  # one feature, shape (100, 1)
true_slope = 2.5
true_intercept = 15
# y = β₀ + β₁x + Gaussian noise (sd = 5); these "true" values are what
# the fitted models should approximately recover
y = true_intercept + true_slope * X.flatten() + np.random.randn(100) * 5
# Create DataFrame
df = pd.DataFrame({'X': X.flatten(), 'y': y})
print("Data Sample:")
print(df.head())
# Visualize data
plt.figure(figsize=(10, 6))
plt.scatter(df['X'], df['y'], alpha=0.6, edgecolors='k')
plt.xlabel('X (Independent Variable)')
plt.ylabel('y (Dependent Variable)')
plt.title('Simple Linear Regression Data')
plt.grid(True, alpha=0.3)
plt.show()

Implementing Simple Linear Regression

# Method 1: Using scikit-learn
from sklearn.linear_model import LinearRegression

# Split data (80/20 hold-out; fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(df[['X']], df['y'],
                                                    test_size=0.2, random_state=42)
# Create and train model
model_sklearn = LinearRegression()
model_sklearn.fit(X_train, y_train)
# Make predictions on the hold-out set
y_pred_sklearn = model_sklearn.predict(X_test)
# Get coefficients
print("Scikit-learn Model:")
print(f"  Intercept (β₀): {model_sklearn.intercept_:.4f}")
print(f"  Slope (β₁): {model_sklearn.coef_[0]:.4f}")


# Method 2: Manual calculation using the normal equation
def manual_linear_regression(X, y):
    """Compute OLS coefficients via the normal equation.

    Formula: β = (XᵀX)⁻¹Xᵀy

    Parameters
    ----------
    X : array-like of shape (n_samples,) or (n_samples, 1)
        Single predictor column (without an intercept column).
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    tuple of (float, float)
        (intercept, slope).
    """
    # Prepend a column of ones so the first coefficient is the intercept.
    X_with_intercept = np.column_stack([np.ones(len(X)), X])
    # Normal equation: β = (XᵀX)⁻¹Xᵀy
    beta = np.linalg.inv(X_with_intercept.T @ X_with_intercept) @ X_with_intercept.T @ y
    return beta[0], beta[1]  # intercept, slope


intercept_manual, slope_manual = manual_linear_regression(X_train.values, y_train.values)
print(f"\nManual Calculation:")
print(f"  Intercept (β₀): {intercept_manual:.4f}")
print(f"  Slope (β₁): {slope_manual:.4f}")

# Method 3: Using statsmodels for statistical inference (std errors, p-values, CIs)
X_with_const = sm.add_constant(X_train)  # Add intercept term
model_stats = sm.OLS(y_train, X_with_const).fit()
print("\nStatsmodels Results:")
print(model_stats.summary())

Visualization of Regression Line

def plot_regression_results(X, y, model, title="Linear Regression Results"):
    """Visualize a fitted regression model and report basic metrics.

    Draws three panels (fit line over the data, residuals vs. predictions,
    residual histogram), then prints R², MSE, and RMSE computed on (X, y).

    Parameters
    ----------
    X : ndarray of shape (n_samples, 1)
        Feature matrix.
    y : ndarray of shape (n_samples,)
        Target values.
    model : fitted estimator with a ``predict`` method.
    title : str, optional
        Title for the scatter/fit-line panel.
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # 1. Regression line over the observed data range
    X_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
    y_pred_range = model.predict(X_range)
    axes[0].scatter(X, y, alpha=0.6, label='Actual')
    axes[0].plot(X_range, y_pred_range, 'r-', linewidth=2, label='Regression Line')
    axes[0].set_xlabel('X')
    axes[0].set_ylabel('y')
    axes[0].set_title(title)
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # 2. Residuals vs. predicted values (should show no visible pattern)
    y_pred = model.predict(X)
    residuals = y - y_pred
    axes[1].scatter(y_pred, residuals, alpha=0.6)
    axes[1].axhline(y=0, color='r', linestyle='--')
    axes[1].set_xlabel('Predicted Values')
    axes[1].set_ylabel('Residuals')
    axes[1].set_title('Residual Plot')
    axes[1].grid(True, alpha=0.3)

    # 3. Distribution of residuals (should look roughly normal)
    axes[2].hist(residuals, bins=20, edgecolor='black', alpha=0.7)
    axes[2].set_xlabel('Residuals')
    axes[2].set_ylabel('Frequency')
    axes[2].set_title('Residual Distribution')
    axes[2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Goodness-of-fit metrics on the same data the plots use
    r2 = r2_score(y, y_pred)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)
    print(f"\nModel Performance:")
    print(f"  R² Score: {r2:.4f}")
    print(f"  MSE: {mse:.4f}")
    print(f"  RMSE: {rmse:.4f}")


# Plot results for the training split
plot_regression_results(X_train.values, y_train.values, model_sklearn)

2. Multiple Linear Regression

Extending to Multiple Features

# Generate multiple-features data (fixed seed for reproducibility)
np.random.seed(42)
n_samples = 200
# Create correlated features: X2 and X4 are built from X1/X2, so the
# design matrix deliberately contains multicollinearity (used later to
# demonstrate VIF-based detection)
X1 = np.random.randn(n_samples) * 10
X2 = 0.7 * X1 + np.random.randn(n_samples) * 3  # Correlated with X1
X3 = np.random.randn(n_samples) * 8
X4 = 0.3 * X1 + 0.5 * X2 + np.random.randn(n_samples) * 2
# True relationship: y = 10 + 2*X1 + 1.5*X2 - X3 + 0.5*X4 + noise
true_coef = [10, 2, 1.5, -1, 0.5]  # [intercept, β1, β2, β3, β4]
y = (true_coef[0] + 
true_coef[1] * X1 + 
true_coef[2] * X2 + 
true_coef[3] * X3 + 
true_coef[4] * X4 + 
np.random.randn(n_samples) * 5)
# Create DataFrame
df_multi = pd.DataFrame({
'X1': X1,
'X2': X2,
'X3': X3,
'X4': X4,
'y': y
})
print("Multiple Linear Regression Data:")
print(df_multi.head())
print(f"\nCorrelation Matrix:")
print(df_multi.corr().round(3))

Multiple Regression Implementation

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Prepare data: four (partly correlated) predictors and the target
X_multi = df_multi[['X1', 'X2', 'X3', 'X4']]
y_multi = df_multi['y']

# Split data
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# Scale features (important for comparing coefficient magnitudes)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_multi)
X_test_scaled = scaler.transform(X_test_multi)

# Train model
model_multi = LinearRegression()
model_multi.fit(X_train_scaled, y_train_multi)

# Make predictions on the hold-out set
y_pred_multi = model_multi.predict(X_test_scaled)

# Results
print("Multiple Linear Regression Results:")
print("=" * 50)
print(f"Intercept (β₀): {model_multi.intercept_:.4f}")
print("\nCoefficients:")
for i, feature in enumerate(['X1', 'X2', 'X3', 'X4']):
    print(f"  {feature} (β_{i+1}): {model_multi.coef_[i]:.4f}")
# Fitted coefficients are on the scaled features, so they will differ in
# magnitude from the true (unscaled) coefficients shown below.
print(f"\nTrue Coefficients:")
for feature, coef in zip(['X1', 'X2', 'X3', 'X4'], true_coef[1:]):
    print(f"  {feature}: {coef:.4f}")

# Model performance on the hold-out set
r2_multi = r2_score(y_test_multi, y_pred_multi)
mse_multi = mean_squared_error(y_test_multi, y_pred_multi)
rmse_multi = np.sqrt(mse_multi)
print(f"\nModel Performance:")
print(f"  R² Score: {r2_multi:.4f}")
print(f"  MSE: {mse_multi:.4f}")
print(f"  RMSE: {rmse_multi:.4f}")

Feature Importance Analysis

def analyze_feature_importance(model, feature_names, X_train, y_train):
    """Pair sklearn coefficients with statsmodels significance statistics.

    Refits an OLS model on (X_train, y_train) to obtain standard errors,
    t-statistics and p-values for each coefficient.

    Parameters
    ----------
    model : fitted sklearn ``LinearRegression``
    feature_names : list of str
    X_train : array-like of shape (n_samples, n_features)
    y_train : array-like of shape (n_samples,)

    Returns
    -------
    pandas.DataFrame
        One row per term, intercept first, with an ``abs_coefficient``
        column for ranking.
    """
    # Add constant column so statsmodels estimates an intercept too
    X_with_const = sm.add_constant(X_train)
    model_stats = sm.OLS(y_train, X_with_const).fit()
    # Combine sklearn coefficients with statsmodels inference; the
    # intercept row gets NaN in the statsmodels-only columns.
    results = pd.DataFrame({
        'Feature': ['Intercept'] + feature_names,
        'Coefficient': [model.intercept_] + list(model.coef_),
        'Std Error': [np.nan] + list(model_stats.bse[1:]),
        'p-value': [np.nan] + list(model_stats.pvalues[1:]),
        't-statistic': [np.nan] + list(model_stats.tvalues[1:])
    })
    # Absolute magnitude for importance ranking
    results['abs_coefficient'] = np.abs(results['Coefficient'])
    return results


# Analyze feature importance
importance_results = analyze_feature_importance(
    model_multi, ['X1', 'X2', 'X3', 'X4'], X_train_scaled, y_train_multi
)
print("\nFeature Importance Analysis:")
print(importance_results.round(4).to_string(index=False))

# Visualize coefficients (green = positive effect, red = negative)
plt.figure(figsize=(10, 6))
features = ['X1', 'X2', 'X3', 'X4']
coefs = model_multi.coef_
colors = ['green' if c > 0 else 'red' for c in coefs]
plt.barh(features, coefs, color=colors, alpha=0.7)
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.xlabel('Coefficient Value')
plt.title('Feature Coefficients (Multiple Linear Regression)')
plt.grid(True, alpha=0.3)
plt.show()

3. Model Diagnostics

Residual Analysis

def comprehensive_residual_analysis(model, X, y, feature_names):
    """Run graphical and statistical regression diagnostics.

    Produces six diagnostic panels and runs Shapiro-Wilk (normality),
    Durbin-Watson (autocorrelation) and Breusch-Pagan (heteroscedasticity)
    tests on the residuals.

    Parameters
    ----------
    model : fitted estimator with ``predict``
    X : array-like of shape (n_samples, n_features)
        Either a DataFrame or a plain array (e.g. scaled features).
    y : array-like of shape (n_samples,)
    feature_names : list of str
        Labels for the residual-vs-predictor panels (one per column of X).

    Returns
    -------
    tuple of (residuals, standardized_residuals)
    """
    y_pred = model.predict(X)
    residuals = y - y_pred
    standardized_residuals = (residuals - residuals.mean()) / residuals.std()
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))

    # 1. Residuals vs Fitted (checks linearity / zero-mean errors)
    axes[0, 0].scatter(y_pred, residuals, alpha=0.6)
    axes[0, 0].axhline(y=0, color='r', linestyle='--')
    axes[0, 0].set_xlabel('Fitted Values')
    axes[0, 0].set_ylabel('Residuals')
    axes[0, 0].set_title('Residuals vs Fitted')
    axes[0, 0].grid(True, alpha=0.3)

    # 2. Q-Q plot for normality of residuals
    from scipy import stats
    stats.probplot(residuals, dist="norm", plot=axes[0, 1])
    axes[0, 1].set_title('Q-Q Plot')
    axes[0, 1].grid(True, alpha=0.3)

    # 3. Scale-Location plot (checks homoscedasticity)
    sqrt_abs_residuals = np.sqrt(np.abs(standardized_residuals))
    axes[0, 2].scatter(y_pred, sqrt_abs_residuals, alpha=0.6)
    axes[0, 2].set_xlabel('Fitted Values')
    axes[0, 2].set_ylabel('√|Standardized Residuals|')
    axes[0, 2].set_title('Scale-Location Plot')
    axes[0, 2].grid(True, alpha=0.3)

    # 4. Residuals vs each predictor.
    # NOTE(review): only three bottom-row panels exist, so a fourth (or
    # later) feature is drawn over axes[1, 2] — confirm this overplot is
    # intended.
    for i, feature in enumerate(feature_names):
        ax = axes[1, i] if i < 3 else axes[1, 2]
        # Index by column position so both DataFrames and plain numpy
        # arrays work (the original X[feature] lookup raised for arrays,
        # and the call below passes the scaled ndarray).
        ax.scatter(np.asarray(X)[:, i], residuals, alpha=0.6)
        ax.axhline(y=0, color='r', linestyle='--')
        ax.set_xlabel(feature)
        ax.set_ylabel('Residuals')
        ax.set_title(f'Residuals vs {feature}')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Statistical tests
    # Shapiro-Wilk test for normality
    shapiro_stat, shapiro_p = stats.shapiro(residuals)
    # Durbin-Watson test for autocorrelation
    from statsmodels.stats.stattools import durbin_watson
    dw_stat = durbin_watson(residuals)
    # Breusch-Pagan test for heteroscedasticity
    from statsmodels.stats.diagnostic import het_breuschpagan
    bp_test = het_breuschpagan(residuals, sm.add_constant(X))

    print("Residual Diagnostics:")
    print("=" * 50)
    print(f"Shapiro-Wilk Test (Normality):")
    print(f"  Statistic: {shapiro_stat:.4f}")
    print(f"  p-value: {shapiro_p:.4f}")
    print(f"  {'✓ Normal' if shapiro_p > 0.05 else '✗ Non-normal'}")
    print(f"\nDurbin-Watson Test (Autocorrelation):")
    print(f"  Statistic: {dw_stat:.4f}")
    print(f"  {'✓ No autocorrelation' if 1.5 < dw_stat < 2.5 else '✗ Possible autocorrelation'}")
    print(f"\nBreusch-Pagan Test (Homoscedasticity):")
    print(f"  LM Statistic: {bp_test[0]:.4f}")
    print(f"  p-value: {bp_test[1]:.4f}")
    print(f"  {'✓ Homoscedastic' if bp_test[1] > 0.05 else '✗ Heteroscedastic'}")
    return residuals, standardized_residuals


# Perform residual analysis on the multiple-regression training data
residuals, std_residuals = comprehensive_residual_analysis(
    model_multi, X_train_scaled, y_train_multi, ['X1', 'X2', 'X3', 'X4']
)

Multicollinearity Detection

def detect_multicollinearity(X):
    """Compute the Variance Inflation Factor (VIF) for each feature.

    VIF above roughly 5-10 indicates problematic multicollinearity.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns, without a constant column.

    Returns
    -------
    pandas.DataFrame
        'Feature' and 'VIF' columns, sorted with the highest VIF first.
    """
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    # Add a constant so VIFs are computed against an intercept-augmented
    # design matrix (column 0 is the constant).
    X_with_const = sm.add_constant(X)
    vif_data = pd.DataFrame()
    vif_data['Feature'] = X.columns
    # i + 1 skips the constant column added above
    vif_data['VIF'] = [variance_inflation_factor(X_with_const.values, i + 1)
                       for i in range(len(X.columns))]
    return vif_data.sort_values('VIF', ascending=False)


# Check multicollinearity
vif_results = detect_multicollinearity(X_multi)
print("Multicollinearity Detection (VIF):")
print("=" * 40)
print(vif_results.to_string(index=False))
print("\nInterpretation:")
print("  VIF = 1: No correlation")
print("  VIF 1-5: Moderate correlation")
print("  VIF > 5: High correlation (multicollinearity)")
print("  VIF > 10: Very high correlation (severe multicollinearity)")

# Visualize correlation matrix
plt.figure(figsize=(8, 6))
corr_matrix = X_multi.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

4. Regularized Linear Regression

Ridge Regression (L2 Regularization)

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV


def compare_regularization_models(X_train, y_train, X_test, y_test):
    """Grid-search Ridge, Lasso and ElasticNet; report test-set metrics.

    Each model's alpha (and l1_ratio for ElasticNet) is tuned with
    5-fold cross-validation on the training data, then scored on the
    held-out test split.

    Parameters
    ----------
    X_train, X_test : array-like feature matrices (scaled)
    y_train, y_test : array-like targets

    Returns
    -------
    tuple
        (results_dict, ridge_best, lasso_best, elastic_best)
    """
    results = {}

    # 1. Ridge Regression (L2 penalty shrinks coefficients smoothly)
    ridge_params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
    ridge = Ridge()
    ridge_cv = GridSearchCV(ridge, ridge_params, cv=5, scoring='r2')
    ridge_cv.fit(X_train, y_train)
    ridge_best = ridge_cv.best_estimator_
    ridge_pred = ridge_best.predict(X_test)
    results['Ridge'] = {
        'best_alpha': ridge_cv.best_params_['alpha'],
        'r2': r2_score(y_test, ridge_pred),
        'rmse': np.sqrt(mean_squared_error(y_test, ridge_pred))
    }

    # 2. Lasso Regression (L1 penalty can zero coefficients out entirely)
    lasso_params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
    lasso = Lasso()
    lasso_cv = GridSearchCV(lasso, lasso_params, cv=5, scoring='r2')
    lasso_cv.fit(X_train, y_train)
    lasso_best = lasso_cv.best_estimator_
    lasso_pred = lasso_best.predict(X_test)
    results['Lasso'] = {
        'best_alpha': lasso_cv.best_params_['alpha'],
        'r2': r2_score(y_test, lasso_pred),
        'rmse': np.sqrt(mean_squared_error(y_test, lasso_pred)),
        # Coefficients surviving the L1 penalty (implicit feature selection)
        'nonzero_features': sum(abs(lasso_best.coef_) > 1e-6)
    }

    # 3. ElasticNet (convex mix of L1 and L2, weighted by l1_ratio)
    elastic_params = {
        'alpha': [0.001, 0.01, 0.1, 1, 10],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
    }
    elastic = ElasticNet()
    elastic_cv = GridSearchCV(elastic, elastic_params, cv=5, scoring='r2')
    elastic_cv.fit(X_train, y_train)
    elastic_best = elastic_cv.best_estimator_
    elastic_pred = elastic_best.predict(X_test)
    results['ElasticNet'] = {
        'best_alpha': elastic_cv.best_params_['alpha'],
        'best_l1_ratio': elastic_cv.best_params_['l1_ratio'],
        'r2': r2_score(y_test, elastic_pred),
        'rmse': np.sqrt(mean_squared_error(y_test, elastic_pred))
    }
    return results, ridge_best, lasso_best, elastic_best


# Compare regularization models
reg_results, ridge_model, lasso_model, elastic_model = compare_regularization_models(
    X_train_scaled, y_train_multi, X_test_scaled, y_test_multi
)
print("Regularization Model Comparison:")
print("=" * 50)
for model_name, metrics in reg_results.items():
    print(f"\n{model_name}:")
    for metric, value in metrics.items():
        # Floats get fixed precision; ints/others print as-is
        if isinstance(value, float):
            print(f"  {metric}: {value:.4f}")
        else:
            print(f"  {metric}: {value}")

Visualizing Regularization Effects

def plot_regularization_path(X_train, y_train):
    """Plot Ridge and Lasso coefficient paths over a grid of alphas.

    Illustrates how the L2 penalty shrinks coefficients smoothly while
    the L1 penalty drives them exactly to zero as alpha grows.

    Parameters
    ----------
    X_train : ndarray of shape (n_samples, n_features)
    y_train : array-like of shape (n_samples,)
    """
    alphas = np.logspace(-3, 2, 100)

    # Refit Ridge across the alpha grid and record the coefficients
    ridge_coefs = []
    for alpha in alphas:
        ridge = Ridge(alpha=alpha)
        ridge.fit(X_train, y_train)
        ridge_coefs.append(ridge.coef_)
    ridge_coefs = np.array(ridge_coefs)

    # Same for Lasso (raised max_iter so small alphas still converge)
    lasso_coefs = []
    for alpha in alphas:
        lasso = Lasso(alpha=alpha, max_iter=10000)
        lasso.fit(X_train, y_train)
        lasso_coefs.append(lasso.coef_)
    lasso_coefs = np.array(lasso_coefs)

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    n_features = X_train.shape[1]  # more robust than len(X_train[0])

    # Ridge path
    for i in range(n_features):
        axes[0].plot(alphas, ridge_coefs[:, i], label=f'Feature {i+1}')
    axes[0].set_xscale('log')
    axes[0].set_xlabel('Alpha (Regularization Strength)')
    axes[0].set_ylabel('Coefficient Value')
    axes[0].set_title('Ridge (L2) Regularization Path')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Lasso path
    for i in range(n_features):
        axes[1].plot(alphas, lasso_coefs[:, i], label=f'Feature {i+1}')
    axes[1].set_xscale('log')
    axes[1].set_xlabel('Alpha (Regularization Strength)')
    axes[1].set_ylabel('Coefficient Value')
    axes[1].set_title('Lasso (L1) Regularization Path')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


# Plot regularization paths
plot_regularization_path(X_train_scaled, y_train_multi)

5. Polynomial Regression

Capturing Non-Linear Relationships

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


def polynomial_regression_example():
    """Fit polynomial regressions of degree 1-4 to quadratic data.

    Generates y = 3x² + 2x + 1 + noise, fits one pipeline per degree,
    and plots each fit annotated with its training-set R² and MSE.

    Returns
    -------
    tuple of (ndarray, ndarray)
        (X_poly, y_poly) — the generated data, for reuse downstream.
    """
    # Generate non-linear (quadratic) data
    np.random.seed(42)
    X_poly = np.random.randn(100, 1) * 2
    y_poly = 3 * X_poly.flatten()**2 + 2 * X_poly.flatten() + 1 + np.random.randn(100) * 2

    degrees = [1, 2, 3, 4]
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    X_range = np.linspace(X_poly.min(), X_poly.max(), 100).reshape(-1, 1)

    for i, degree in enumerate(degrees):
        row, col = divmod(i, 2)
        # Pipeline: expand features to the given degree, then fit OLS
        model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        model.fit(X_poly, y_poly)
        y_pred = model.predict(X_range)

        axes[row, col].scatter(X_poly, y_poly, alpha=0.6, label='Data')
        axes[row, col].plot(X_range, y_pred, 'r-', linewidth=2, label=f'Degree {degree}')
        axes[row, col].set_xlabel('X')
        axes[row, col].set_ylabel('y')
        axes[row, col].set_title(f'Polynomial Regression (Degree {degree})')
        axes[row, col].legend()
        axes[row, col].grid(True, alpha=0.3)

        # Training-set metrics annotated on each panel
        y_pred_train = model.predict(X_poly)
        r2 = r2_score(y_poly, y_pred_train)
        mse = mean_squared_error(y_poly, y_pred_train)
        axes[row, col].text(0.05, 0.95, f'R² = {r2:.4f}\nMSE = {mse:.4f}',
                            transform=axes[row, col].transAxes,
                            verticalalignment='top',
                            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    plt.tight_layout()
    plt.show()
    return X_poly, y_poly


# Run polynomial regression example
X_poly, y_poly = polynomial_regression_example()

Cross-Validation for Degree Selection

from sklearn.model_selection import cross_val_score


def select_optimal_degree(X, y, max_degree=10):
    """Pick the polynomial degree that maximizes 5-fold CV R².

    Parameters
    ----------
    X : ndarray of shape (n_samples, 1)
    y : ndarray of shape (n_samples,)
    max_degree : int, optional
        Highest degree to evaluate (inclusive).

    Returns
    -------
    int
        The degree with the best mean cross-validation score.
    """
    degrees = range(1, max_degree + 1)
    cv_scores = []
    cv_stds = []
    for degree in degrees:
        model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        scores = cross_val_score(model, X, y, cv=5, scoring='r2')
        cv_scores.append(scores.mean())
        cv_stds.append(scores.std())
    # Degree with the highest mean CV score (range supports indexing)
    optimal_degree = degrees[np.argmax(cv_scores)]

    # Plot mean ± std of CV score per degree
    plt.figure(figsize=(10, 6))
    plt.errorbar(degrees, cv_scores, yerr=cv_stds, capsize=5, marker='o')
    plt.axvline(x=optimal_degree, color='r', linestyle='--', label=f'Optimal Degree = {optimal_degree}')
    plt.xlabel('Polynomial Degree')
    plt.ylabel('Cross-Validation R² Score')
    plt.title('Cross-Validation for Polynomial Degree Selection')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    print(f"Optimal Polynomial Degree: {optimal_degree}")
    print(f"Best CV R² Score: {cv_scores[optimal_degree-1]:.4f} (+/- {cv_stds[optimal_degree-1]:.4f})")
    return optimal_degree


# Select optimal degree
optimal_deg = select_optimal_degree(X_poly, y_poly)

6. Real-World Examples

House Price Prediction

def house_price_prediction():
    """End-to-end example: predict house prices from five features.

    Generates a synthetic dataset, fits a linear regression on
    standardized features, reports hold-out metrics and coefficient
    importance, and plots predicted vs. actual prices.

    Returns
    -------
    tuple
        (fitted LinearRegression, fitted StandardScaler)
    """
    # Generate synthetic house data
    np.random.seed(42)
    n_houses = 500
    # Features (sqft clipped to a plausible 800-5000 range)
    sqft = np.random.normal(2000, 500, n_houses).clip(800, 5000)
    bedrooms = np.random.randint(1, 5, n_houses)
    bathrooms = np.random.uniform(1, 4, n_houses).round(1)
    age = np.random.randint(0, 50, n_houses)
    location_score = np.random.uniform(0, 10, n_houses)
    # Price formula (linear in the features, with Gaussian noise)
    base_price = 50000
    price = (base_price +
             150 * sqft +
             10000 * bedrooms +
             8000 * bathrooms -
             500 * age +
             15000 * location_score +
             np.random.normal(0, 30000, n_houses))
    # Create DataFrame
    house_df = pd.DataFrame({
        'sqft': sqft,
        'bedrooms': bedrooms,
        'bathrooms': bathrooms,
        'age': age,
        'location_score': location_score,
        'price': price
    })
    print("House Price Dataset:")
    print(house_df.head())
    print(f"\nDataset Shape: {house_df.shape}")

    # Prepare data
    X = house_df[['sqft', 'bedrooms', 'bathrooms', 'age', 'location_score']]
    y = house_df['price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale features so coefficient magnitudes are comparable across units
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train and evaluate
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print("\nModel Performance:")
    print(f"  R² Score: {r2_score(y_test, y_pred):.4f}")
    print(f"  RMSE: ${np.sqrt(mean_squared_error(y_test, y_pred)):,.0f}")

    # Feature importance via absolute standardized coefficients
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': model.coef_,
        'Abs_Importance': np.abs(model.coef_)
    }).sort_values('Abs_Importance', ascending=False)
    print("\nFeature Importance:")
    print(feature_importance.to_string(index=False))

    # Visualize predictions against the ideal y = x reference line
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Actual Price')
    plt.ylabel('Predicted Price')
    plt.title('House Price Predictions')
    plt.grid(True, alpha=0.3)
    plt.show()
    return model, scaler


# Run house price prediction
house_model, house_scaler = house_price_prediction()

Sales Forecasting

def sales_forecasting():
    """Forecast monthly sales using trend, seasonality and lag features.

    Builds 3 years of synthetic monthly sales (linear trend + yearly sine
    seasonality + noise), engineers trend/seasonal/lag features, trains a
    linear model on the first 80% of the timeline, and evaluates on the
    final 20% — a chronological split, not a random one, so the model
    never sees the future.

    Returns
    -------
    tuple
        (fitted LinearRegression, importance DataFrame)
    """
    # Generate time series sales data
    np.random.seed(42)
    months = 36  # 3 years
    time = np.arange(months)
    # Components
    trend = 1000 + 50 * time
    seasonal = 200 * np.sin(2 * np.pi * time / 12)  # Yearly seasonality
    random_noise = np.random.randn(months) * 100
    sales = trend + seasonal + random_noise

    # Create features (sin/cos encode the cyclic month-of-year)
    sales_df = pd.DataFrame({
        'month': time,
        'sales': sales,
        'month_sin': np.sin(2 * np.pi * time / 12),
        'month_cos': np.cos(2 * np.pi * time / 12),
        'time': time,
        'time_squared': time ** 2
    })
    # Add lag features; rows before the longest lag become NaN
    for lag in [1, 2, 3, 6, 12]:
        sales_df[f'lag_{lag}'] = sales_df['sales'].shift(lag)
    # Drop the NaN warm-up rows introduced by the lags
    sales_df = sales_df.dropna()

    # Prepare features
    feature_cols = ['time', 'time_squared', 'month_sin', 'month_cos',
                    'lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_12']
    X_sales = sales_df[feature_cols]
    y_sales = sales_df['sales']

    # Chronological split: train on the past, test on the future
    split_idx = int(len(X_sales) * 0.8)
    X_train_sales, X_test_sales = X_sales[:split_idx], X_sales[split_idx:]
    y_train_sales, y_test_sales = y_sales[:split_idx], y_sales[split_idx:]

    # Train model
    model_sales = LinearRegression()
    model_sales.fit(X_train_sales, y_train_sales)
    y_pred_sales = model_sales.predict(X_test_sales)

    # Results
    print("Sales Forecasting Results:")
    print("=" * 50)
    print(f"R² Score: {r2_score(y_test_sales, y_pred_sales):.4f}")
    print(f"RMSE: ${np.sqrt(mean_squared_error(y_test_sales, y_pred_sales)):,.2f}")

    # Feature importance
    importance = pd.DataFrame({
        'Feature': feature_cols,
        'Coefficient': model_sales.coef_,
        'Abs_Importance': np.abs(model_sales.coef_)
    }).sort_values('Abs_Importance', ascending=False)
    print("\nFeature Importance:")
    print(importance.to_string(index=False))

    # Plot predictions against actuals on the test window
    plt.figure(figsize=(12, 6))
    plt.plot(y_test_sales.index, y_test_sales.values, label='Actual', alpha=0.7)
    plt.plot(y_test_sales.index, y_pred_sales, label='Predicted', alpha=0.7)
    plt.xlabel('Time Period')
    plt.ylabel('Sales')
    plt.title('Sales Forecasting with Linear Regression')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    return model_sales, importance


# Run sales forecasting
sales_model, sales_importance = sales_forecasting()

7. Advanced Topics

Weighted Linear Regression

def weighted_linear_regression(X, y, weights=None):
    """Fit (optionally weighted) least squares via the normal equations.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix without an intercept column.
    y : ndarray of shape (n_samples,)
        Target values.
    weights : ndarray of shape (n_samples,), optional
        Per-observation weights; ``None`` gives ordinary least squares.

    Returns
    -------
    tuple
        (intercept, coefficient array of shape (n_features,)).
    """
    # Prepend a column of ones so beta[0] is the intercept
    X_with_intercept = np.column_stack([np.ones(len(X)), X])
    if weights is None:
        # Unweighted: β = (XᵀX)⁻¹Xᵀy
        beta = np.linalg.inv(X_with_intercept.T @ X_with_intercept) @ X_with_intercept.T @ y
    else:
        # Weighted: β = (XᵀWX)⁻¹XᵀWy with W = diag(weights)
        W = np.diag(weights)
        beta = np.linalg.inv(X_with_intercept.T @ W @ X_with_intercept) @ X_with_intercept.T @ W @ y
    return beta[0], beta[1:]


# Example with different weights
np.random.seed(42)
X_weighted = np.random.randn(100, 1) * 10
y_weighted = 3 * X_weighted.flatten() + 5 + np.random.randn(100) * 3
# Weights increase linearly: later observations count more
weights = np.linspace(0.1, 1, len(X_weighted))
# Compare unweighted vs weighted fits
intercept_unweighted, coef_unweighted = weighted_linear_regression(X_weighted, y_weighted)
intercept_weighted, coef_weighted = weighted_linear_regression(X_weighted, y_weighted, weights)
print("Weighted Linear Regression Comparison:")
print("=" * 50)
print(f"Unweighted: y = {intercept_unweighted:.4f} + {coef_unweighted[0]:.4f}x")
print(f"Weighted:   y = {intercept_weighted:.4f} + {coef_weighted[0]:.4f}x")

Robust Regression

from sklearn.linear_model import HuberRegressor, RANSACRegressor


def robust_regression_comparison(X, y):
    """Fit OLS, Huber and RANSAC on (X, y) and plot them side by side.

    Parameters
    ----------
    X : ndarray of shape (n_samples, 1)
    y : ndarray of shape (n_samples,)

    Returns
    -------
    dict
        Mapping from method name to the fitted estimator.
    """
    # Standard Linear Regression (sensitive to outliers)
    lr = LinearRegression()
    lr.fit(X, y)
    # Huber loss: quadratic near zero, linear in the tails
    huber = HuberRegressor(epsilon=1.35)
    huber.fit(X, y)
    # RANSAC: repeatedly fits on random inlier subsets, ignoring outliers
    ransac = RANSACRegressor(min_samples=0.5, residual_threshold=5)
    ransac.fit(X, y)

    X_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
    plt.figure(figsize=(12, 5))

    # Fitted lines over the data
    plt.subplot(1, 2, 1)
    plt.scatter(X, y, alpha=0.6, label='Data')
    plt.plot(X_range, lr.predict(X_range), label='Linear Regression', linewidth=2)
    plt.plot(X_range, huber.predict(X_range), label='Huber Regression', linewidth=2)
    plt.plot(X_range, ransac.predict(X_range), label='RANSAC', linewidth=2)
    plt.xlabel('X')
    plt.ylabel('y')
    plt.title('Robust Regression Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Residual plot for the two parametric fits
    plt.subplot(1, 2, 2)
    plt.scatter(lr.predict(X), y - lr.predict(X), alpha=0.5, label='Linear', s=30)
    plt.scatter(huber.predict(X), y - huber.predict(X), alpha=0.5, label='Huber', s=30)
    plt.axhline(y=0, color='k', linestyle='--')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    return {'Linear': lr, 'Huber': huber, 'RANSAC': ransac}


# Generate data with outliers
np.random.seed(42)
X_robust = np.random.randn(100, 1) * 10
y_robust = 2 * X_robust.flatten() + 3 + np.random.randn(100) * 2
# Corrupt 10 random points with large noise to create outliers
outlier_indices = np.random.choice(len(X_robust), 10, replace=False)
y_robust[outlier_indices] = y_robust[outlier_indices] + np.random.randn(10) * 50
# Run comparison
robust_models = robust_regression_comparison(X_robust, y_robust)

8. Model Interpretation

Coefficient Interpretation

def interpret_coefficients(model, feature_names, target_name):
    """Print a plain-English interpretation of a linear model's terms.

    Parameters
    ----------
    model : fitted sklearn ``LinearRegression``
    feature_names : list of str
        One name per coefficient, in training-column order.
    target_name : str
        Name of the dependent variable, used in the printed sentences.
    """
    intercept = model.intercept_
    coefficients = model.coef_
    print(f"Linear Regression Model Interpretation")
    print("=" * 60)
    # Print the fitted equation term by term
    print(f"Equation: {target_name} = {intercept:.4f}")
    for name, coef in zip(feature_names, coefficients):
        print(f"           + ({coef:.4f}) * {name}")
    print(f"\nInterpretation:")
    print(f"  Intercept ({intercept:.4f}):")
    print(f"    Predicted value when all features are zero.")
    # One ceteris-paribus statement per feature
    for name, coef in zip(feature_names, coefficients):
        if coef > 0:
            print(f"\n  {name} (+{coef:.4f}):")
            print(f"    For each 1-unit increase in {name}, {target_name}")
            print(f"    increases by {coef:.4f} units, holding all else constant.")
        else:
            print(f"\n  {name} ({coef:.4f}):")
            print(f"    For each 1-unit increase in {name}, {target_name}")
            print(f"    decreases by {abs(coef):.4f} units, holding all else constant.")


# Interpret house price model
feature_names = ['sqft', 'bedrooms', 'bathrooms', 'age', 'location_score']
interpret_coefficients(house_model, feature_names, 'price')

Partial Dependence Plots

def plot_partial_dependence(model, X, feature_names, feature_idx, n_points=100):
    """Plot one-way partial dependence curves for selected features.

    Parameters
    ----------
    model : fitted estimator
    X : array-like of shape (n_samples, n_features)
        Data in the same feature space/scaling the model was fitted on.
    feature_names : list of str
    feature_idx : list of int
        Column indices of the features to plot.
    n_points : int, optional
        Grid resolution per feature.
    """
    from sklearn.inspection import partial_dependence
    fig, axes = plt.subplots(1, len(feature_idx), figsize=(15, 4))
    if len(feature_idx) == 1:
        axes = [axes]  # keep indexing uniform when there is one panel
    for i, idx in enumerate(feature_idx):
        # Average model prediction as the feature varies over its grid
        pdp = partial_dependence(model, X, [idx], kind='average', grid_resolution=n_points)
        # NOTE(review): newer scikit-learn releases expose the grid under
        # 'grid_values' rather than 'values' — confirm against the
        # installed version before relying on this key.
        axes[i].plot(pdp['values'][0], pdp['average'][0], linewidth=2)
        axes[i].set_xlabel(feature_names[idx])
        axes[i].set_ylabel('Partial Dependence')
        axes[i].set_title(f'Effect of {feature_names[idx]} on Prediction')
        axes[i].grid(True, alpha=0.3)
    # Confidence intervals via bootstrap could be added here.
    plt.tight_layout()
    plt.show()


# Plot partial dependence for house price model
# Note: This requires the model to be fitted on original (unscaled) data
# For scaled data, we need to transform back or use the scaler

9. Model Selection and Comparison

def compare_regression_models(X_train, y_train, X_test, y_test):
    """Train several regressors and rank them by hold-out R².

    Fits linear, regularized-linear and tree-based models with fixed
    hyperparameters, then scores each on the test split.

    Parameters
    ----------
    X_train, X_test : array-like feature matrices
    y_train, y_test : array-like targets

    Returns
    -------
    pandas.DataFrame
        One row per model with R², RMSE and MAE, best R² first.
    """
    from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge': Ridge(alpha=1.0),
        'Lasso': Lasso(alpha=1.0),
        'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
        'Decision Tree': DecisionTreeRegressor(max_depth=5),
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=5),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=3)
    }
    results = []
    for name, model in models.items():
        # Train, predict, score
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = np.mean(np.abs(y_test - y_pred))
        results.append({
            'Model': name,
            'R² Score': r2,
            'RMSE': rmse,
            'MAE': mae
        })
    # Rank models by hold-out R², best first
    comparison = pd.DataFrame(results).sort_values('R² Score', ascending=False)
    return comparison


# Compare models
model_comparison = compare_regression_models(
    X_train_scaled, y_train_multi, X_test_scaled, y_test_multi
)
print("Model Comparison Results:")
print("=" * 70)
print(model_comparison.round(4).to_string(index=False))

10. Best Practices and Common Pitfalls

Assumption Checking Checklist

def regression_assumption_checklist(model, X, y):
    """Check the classical linear-regression assumptions for a fitted model.

    Parameters
    ----------
    model : fitted estimator with ``predict``
    X : pandas.DataFrame
        Predictors the model was trained on (the VIF step needs
        ``.values``/``.shape``, so a DataFrame is required).
    y : array-like of shape (n_samples,)

    Returns
    -------
    dict
        ``'assumptions'`` (bool per assumption) and ``'tests'``
        (numeric test statistics/p-values).
    """
    y_pred = model.predict(X)
    residuals = y - y_pred
    results = {'assumptions': {}, 'tests': {}}

    # 1. Linearity: residuals should be uncorrelated with predictions
    from scipy.stats import pearsonr
    corr, p_value = pearsonr(y_pred, residuals)
    results['assumptions']['linearity'] = abs(corr) < 0.1

    # 2. Homoscedasticity (Breusch-Pagan test)
    from statsmodels.stats.diagnostic import het_breuschpagan
    X_const = sm.add_constant(X)
    _, bp_pvalue, _, _ = het_breuschpagan(residuals, X_const)
    results['assumptions']['homoscedasticity'] = bp_pvalue > 0.05
    results['tests']['Breusch-Pagan p-value'] = bp_pvalue

    # 3. Normality of residuals (Shapiro-Wilk test)
    from scipy.stats import shapiro
    _, shapiro_p = shapiro(residuals)
    results['assumptions']['normality'] = shapiro_p > 0.05
    results['tests']['Shapiro-Wilk p-value'] = shapiro_p

    # 4. Independence of errors (Durbin-Watson; ~2 means no autocorrelation)
    from statsmodels.stats.stattools import durbin_watson
    dw = durbin_watson(residuals)
    results['assumptions']['independence'] = 1.5 < dw < 2.5
    results['tests']['Durbin-Watson'] = dw

    # 5. No multicollinearity (max VIF below 10)
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    vifs = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    results['assumptions']['no_multicollinearity'] = max(vifs) < 10
    results['tests']['max_VIF'] = max(vifs)

    # Summary
    print("Regression Assumptions Check")
    print("=" * 50)
    for assumption, satisfied in results['assumptions'].items():
        status = "✓" if satisfied else "✗"
        print(f"{status} {assumption}: {satisfied}")
    print("\nDetailed Tests:")
    for test, value in results['tests'].items():
        print(f"  {test}: {value:.4f}")
    return results


# Check assumptions for the multiple-regression model.
# Bug fix: the original passed house_model (trained on 5 house features)
# together with the single-feature X_train and the unrelated y_train_multi,
# which raises a shape mismatch at predict time. Use model_multi with the
# data it was actually trained on, wrapped in a DataFrame because the
# checklist needs .values / .shape column access.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=['X1', 'X2', 'X3', 'X4'])
assumptions = regression_assumption_checklist(
    model_multi, X_train_scaled_df, y_train_multi
)

Common Pitfalls and Solutions

def _plot_overfitting():
    """Pitfall 1: fit a degree-19 polynomial to noisy linear data (overfits)."""
    np.random.seed(42)
    X = np.linspace(0, 1, 50)
    y = 2 * X + 1 + np.random.randn(50) * 0.2
    # Design matrix with columns x^1 .. x^19; LinearRegression supplies
    # the intercept, so no constant column is needed.
    X_poly_high = np.vstack([X**i for i in range(1, 20)]).T
    model_overfit = LinearRegression()
    model_overfit.fit(X_poly_high, y)
    y_pred_overfit = model_overfit.predict(X_poly_high)
    plt.scatter(X, y, alpha=0.6)
    plt.plot(X, y_pred_overfit, 'r-', label='Overfit (Degree 19)')
    plt.plot(X, 2*X + 1, 'g--', label='True', linewidth=2)
    plt.xlabel('X')
    plt.ylabel('y')
    plt.title('Overfitting')
    plt.legend()
    plt.grid(True, alpha=0.3)


def _plot_extrapolation():
    """Pitfall 2: predict on inputs outside the training range (unreliable)."""
    X_train = np.random.rand(80) * 10
    y_train = 2 * X_train + 1 + np.random.randn(80)
    X_test = np.random.rand(20) * 10 + 10  # Outside training range
    model = LinearRegression()
    model.fit(X_train.reshape(-1, 1), y_train)
    y_pred = model.predict(X_test.reshape(-1, 1))
    plt.scatter(X_train, y_train, alpha=0.6, label='Training')
    plt.scatter(X_test, y_pred, alpha=0.6, label='Extrapolation', color='red')
    X_line = np.linspace(0, 20, 100)
    plt.plot(X_line, 2*X_line + 1, 'g--', label='True', linewidth=2)
    plt.xlabel('X')
    plt.ylabel('y')
    plt.title('Extrapolation Danger')
    plt.legend()
    plt.grid(True, alpha=0.3)


def demonstrate_pitfalls():
    """
    Demonstrate common pitfalls in linear regression.

    Draws a two-panel figure (overfitting, extrapolation) and prints a
    short list of remedies. The helpers are called in the same order the
    original inline code ran, so the RNG stream — and hence the data and
    plots — are unchanged.
    """
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    _plot_overfitting()
    plt.subplot(1, 2, 2)
    _plot_extrapolation()
    plt.tight_layout()
    plt.show()
    print("\nCommon Pitfall Solutions:")
    print("1. Overfitting: Use regularization (Ridge/Lasso) or cross-validation")
    print("2. Extrapolation: Never predict outside training range")
    print("3. Multicollinearity: Use VIF to detect, remove correlated features")
    print("4. Non-linearity: Consider polynomial features or non-linear models")
    print("5. Outliers: Use robust regression (Huber, RANSAC)")


demonstrate_pitfalls()

11. Model Deployment Ready Code

class LinearRegressionModel:
    """
    Deployment-ready linear regression wrapper.

    Bundles feature scaling and an sklearn ``LinearRegression`` behind a
    small fit/predict/evaluate API so the scaler and model are always
    applied together and cannot drift apart.
    """

    def __init__(self, name="Linear Regression Model"):
        # Local import: StandardScaler is not imported in the visible
        # module header, so importing it here avoids a NameError without
        # touching file-level imports. (Harmless if it is also imported
        # elsewhere in the file.)
        from sklearn.preprocessing import StandardScaler
        self.name = name
        self.model = LinearRegression()
        self.scaler = StandardScaler()
        self.feature_names = None  # populated by fit()
        self.target_name = None    # populated by fit()

    def fit(self, X, y, feature_names=None, target_name="target"):
        """
        Train the linear regression model.

        Fits the scaler on ``X``, then the regression on the scaled
        features. Returns ``self`` so calls can be chained.
        """
        self.feature_names = feature_names or [f"feature_{i}" for i in range(X.shape[1])]
        self.target_name = target_name
        X_scaled = self.scaler.fit_transform(X)
        self.model.fit(X_scaled, y)
        return self

    def predict(self, X):
        """
        Make predictions on new data.

        Applies the fitted scaler before predicting, so callers pass
        features on the original scale.
        """
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance on a held-out set.

        Returns a dict with 'r2', 'rmse', 'mae' and the raw 'predictions'.
        """
        y_pred = self.predict(X_test)
        # MAE is computed with NumPy because ``mean_absolute_error`` is not
        # imported in the visible module header (the original code would
        # raise a NameError here).
        mae = float(np.mean(np.abs(np.asarray(y_test) - np.asarray(y_pred))))
        return {
            'r2': r2_score(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'mae': mae,
            'predictions': y_pred
        }

    def get_coefficients(self):
        """
        Get model coefficients.

        Returns the intercept and a feature-name → coefficient mapping.
        Only meaningful after ``fit`` has been called.
        """
        return {
            'intercept': self.model.intercept_,
            'coefficients': dict(zip(self.feature_names, self.model.coef_))
        }

    def summary(self):
        """
        Print a human-readable summary of the fitted model.
        """
        print(f"\n{self.name} Summary")
        print("=" * 50)
        print(f"Features: {self.feature_names}")
        print(f"Target: {self.target_name}")
        print(f"\nIntercept: {self.model.intercept_:.4f}")
        print("\nCoefficients:")
        for name, coef in zip(self.feature_names, self.model.coef_):
            print(f"  {name}: {coef:.4f}")
# --- Example usage -------------------------------------------------------
# Train the deployment wrapper on the house-price split prepared earlier
# (`X_train`, `y_train_multi`, `X_test`, `y_test_multi` come from a
# previous section of this file).
model = LinearRegressionModel("House Price Model")
model.fit(X_train, y_train_multi, feature_names=['sqft', 'bedrooms', 'bathrooms', 'age', 'location_score'])
model.summary()
# Evaluate on the held-out split and report the headline metrics.
metrics = model.evaluate(X_test, y_test_multi)
print(f"\nModel Performance:")
print(f"  R²: {metrics['r2']:.4f}")
print(f"  RMSE: {metrics['rmse']:.2f}")

Conclusion

Linear regression is a fundamental tool in data science that provides interpretable insights into variable relationships:

Key Takeaways

  1. Simplicity: Linear regression is easy to understand and implement
  2. Interpretability: Coefficients directly show the impact of each feature
  3. Foundation: Many advanced algorithms build upon linear regression concepts
  4. Assumptions: Linear regression has important assumptions that need checking
  5. Regularization: Ridge, Lasso, and ElasticNet improve generalization
  6. Extensions: Polynomial regression captures non-linear relationships

Best Practices Checklist

  • [ ] Visualize data before modeling
  • [ ] Check for linear relationships
  • [ ] Handle missing values appropriately
  • [ ] Scale features for regularized models
  • [ ] Test for multicollinearity
  • [ ] Validate model assumptions
  • [ ] Use cross-validation for model selection
  • [ ] Monitor for overfitting
  • [ ] Interpret coefficients carefully
  • [ ] Document model limitations

Linear regression remains a powerful, interpretable tool in the data scientist's toolkit. While more complex models may offer higher predictive power, linear regression provides unmatched transparency and ease of interpretation.

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper