Complete Guide to Regression Tables in Data Science

Introduction to Regression Tables

Regression tables are essential outputs in data science that summarize the results of regression analysis. They provide a comprehensive view of the relationships between variables, model performance, and statistical significance. Understanding how to read, interpret, and create regression tables is crucial for any data scientist.

Key Concepts

  • Coefficients: Estimates of relationships between variables
  • Standard Errors: Measure of coefficient precision
  • p-values: Statistical significance of coefficients
  • R-squared: Proportion of variance explained
  • F-statistic: Overall model significance
  • Confidence Intervals: Range of likely coefficient values

1. Understanding Regression Table Components

Basic Linear Regression Table

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Set style: apply the seaborn-v0_8 dark-grid matplotlib theme and a
# categorical "husl" seaborn palette globally, so every figure below
# uses a consistent look.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
def create_regression_table():
    """Fit an OLS model on synthetic data and display its regression table.

    Generates three normally distributed predictors with known true
    coefficients (2.5, -1.8, 0.5 plus intercept 10), fits an OLS model
    with statsmodels, prints the full summary and a condensed coefficient
    table, and returns both.

    Returns:
        tuple: (fitted statsmodels OLS results, coefficient DataFrame).
    """
    # Fixed seed so the printed table is reproducible.
    np.random.seed(42)
    n = 200

    # Create features
    X1 = np.random.normal(50, 15, n)
    X2 = np.random.normal(100, 25, n)
    X3 = np.random.normal(30, 8, n)

    # Create target with known relationships
    y = 10 + 2.5 * X1 - 1.8 * X2 + 0.5 * X3 + np.random.normal(0, 15, n)

    df = pd.DataFrame({'X1': X1, 'X2': X2, 'X3': X3, 'y': y})

    # Fit regression model (add_constant supplies the intercept column).
    X = sm.add_constant(df[['X1', 'X2', 'X3']])
    model = sm.OLS(df['y'], X).fit()

    # Display the full statsmodels regression table.
    print("=" * 80)
    print("REGRESSION RESULTS")
    print("=" * 80)
    print(model.summary())

    # Extract key components into a tidy DataFrame. Use .values for every
    # column so all columns align positionally; mixing raw ndarrays with
    # label-indexed Series relies on implicit index alignment.
    conf = model.conf_int()  # DataFrame: column 0 = lower, 1 = upper bound
    results = {
        'Variable': ['const'] + list(df.columns[:-1]),
        'Coefficient': model.params.values,
        'Std Error': model.bse.values,
        't-statistic': model.tvalues.values,
        'p-value': model.pvalues.values,
        'CI Lower': conf[0].values,
        'CI Upper': conf[1].values
    }
    results_df = pd.DataFrame(results)

    print("\n" + "=" * 80)
    print("KEY COEFFICIENTS TABLE")
    print("=" * 80)
    print(results_df.round(4).to_string(index=False))
    return model, results_df


model, results_df = create_regression_table()

Understanding Regression Output Components

def explain_regression_components(model):
    """Print a field-by-field explanation of a fitted OLS model's output.

    For each regression-table component prints a description, how to
    interpret it, and a concrete value taken from the supplied model.

    Args:
        model: Fitted statsmodels OLS results whose params/bse/tvalues/
            pvalues are pandas Series (position 0 = intercept, 1 = X1).
    """
    print("=" * 80)
    print("REGRESSION OUTPUT COMPONENTS EXPLAINED")
    print("=" * 80)

    # Use .iloc for positional access: plain integer keys on a
    # label-indexed Series are deprecated/removed in modern pandas.
    ci = model.conf_int()
    components = {
        'Coefficients': {
            'description': 'Estimated relationship between predictors and target',
            'interpretation': 'A 1-unit increase in X is associated with β change in Y',
            'example': f"X1 coefficient = {model.params.iloc[1]:.3f}"
        },
        'Standard Errors': {
            'description': 'Measure of coefficient precision',
            'interpretation': 'Smaller SE = more precise estimate',
            'example': f"X1 SE = {model.bse.iloc[1]:.3f}"
        },
        't-statistic': {
            'description': 'Coefficient divided by standard error',
            'interpretation': 'Larger |t| indicates stronger evidence against null hypothesis',
            'example': f"X1 t = {model.tvalues.iloc[1]:.3f}"
        },
        'p-value': {
            'description': 'Probability of observing data if null hypothesis is true',
            'interpretation': 'p < 0.05 indicates statistical significance',
            'example': f"X1 p = {model.pvalues.iloc[1]:.4f}"
        },
        'Confidence Intervals': {
            'description': 'Range of likely coefficient values',
            'interpretation': '95% CI: We are 95% confident true value lies within',
            # Row 1 of conf_int() is X1; columns 0/1 are the lower/upper
            # bounds. (The original mixed rows and columns, printing the
            # upper bounds of const and X1 instead of X1's own interval.)
            'example': f"X1 95% CI: [{ci.iloc[1, 0]:.3f}, {ci.iloc[1, 1]:.3f}]"
        },
        'R-squared': {
            'description': 'Proportion of variance explained by model',
            'interpretation': 'Higher R² indicates better fit',
            'example': f"R² = {model.rsquared:.3f}"
        },
        'Adj. R-squared': {
            'description': 'R² penalized for number of predictors',
            'interpretation': 'Prevents overfitting by adjusting for model complexity',
            'example': f"Adj R² = {model.rsquared_adj:.3f}"
        },
        'F-statistic': {
            'description': 'Overall model significance test',
            'interpretation': 'Tests if any predictors are significant',
            'example': f"F = {model.fvalue:.3f}, p = {model.f_pvalue:.4f}"
        },
        'AIC/BIC': {
            'description': 'Information criteria for model comparison',
            'interpretation': 'Lower values indicate better model fit with fewer parameters',
            'example': f"AIC = {model.aic:.1f}, BIC = {model.bic:.1f}"
        }
    }

    for comp, info in components.items():
        print(f"\n{comp}:")
        print(f"  {info['description']}")
        print(f"  → {info['interpretation']}")
        print(f"  → {info['example']}")


explain_regression_components(model)

2. Types of Regression Tables

Simple Linear Regression

def simple_linear_regression_table():
    """Fit a one-predictor OLS model and show its regression table.

    Because the model is fitted on plain NumPy arrays, statsmodels
    returns results as positional ndarrays (no variable-name labels),
    so components must be indexed by position, not by name.

    Returns:
        tuple: (coefficient table DataFrame, dict of model-fit statistics).
    """
    np.random.seed(42)
    n = 150

    # Generate data: true intercept 25, true slope 1.5.
    x = np.random.uniform(0, 100, n)
    y = 25 + 1.5 * x + np.random.normal(0, 15, n)

    # Fit model
    X = sm.add_constant(x)
    model = sm.OLS(y, X).fit()

    # Build the coefficient table. With array inputs, model.params etc.
    # are plain ndarrays, so use positional indices (0 = const, 1 = slope);
    # string keys like model.params['x'] would raise here.
    conf = model.conf_int()  # ndarray of shape (2, 2): one row per parameter
    results = []
    for idx, var in enumerate(['const', 'x']):
        results.append({
            'Variable': var,
            'Coefficient': model.params[idx],
            'Std. Error': model.bse[idx],
            't-stat': model.tvalues[idx],
            'p-value': model.pvalues[idx],
            'CI Lower': conf[idx, 0],
            'CI Upper': conf[idx, 1]
        })
    table = pd.DataFrame(results)

    # Model-level fit statistics.
    fit_stats = {
        'R-squared': model.rsquared,
        'Adj. R-squared': model.rsquared_adj,
        'F-statistic': model.fvalue,
        'F p-value': model.f_pvalue,
        'AIC': model.aic,
        'BIC': model.bic,
        'RMSE': np.sqrt(model.mse_resid)
    }

    # Visualize: scatter + fitted line on the left, text table on the right.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    axes[0].scatter(x, y, alpha=0.6, label='Data')
    axes[0].plot(x, model.fittedvalues, 'r-', linewidth=2, label='Regression line')
    axes[0].set_xlabel('X')
    axes[0].set_ylabel('Y')
    axes[0].set_title(f'Simple Linear Regression\nY = {model.params[0]:.2f} + {model.params[1]:.2f}X')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Table display: render the results as monospace text on a blank axis.
    axes[1].axis('off')
    table_text = "REGRESSION RESULTS\n" + "="*40 + "\n\n"
    table_text += table.round(4).to_string(index=False) + "\n\n"
    table_text += "="*40 + "\n"
    table_text += f"R² = {model.rsquared:.4f}\n"
    table_text += f"Adj. R² = {model.rsquared_adj:.4f}\n"
    table_text += f"F = {model.fvalue:.2f} (p = {model.f_pvalue:.4f})\n"
    table_text += f"RMSE = {np.sqrt(model.mse_resid):.2f}"
    axes[1].text(0.05, 0.95, table_text, transform=axes[1].transAxes,
                 fontsize=10, verticalalignment='top', fontfamily='monospace')

    plt.suptitle('Simple Linear Regression Analysis', fontsize=14)
    plt.tight_layout()
    plt.show()
    return table, fit_stats


simple_linear_regression_table()

Multiple Linear Regression

def multiple_regression_table():
    """Fit five nested OLS models on synthetic data and compare them.

    Builds four predictors (X2 correlated with X1, plus a weak X1·X3
    interaction in the true data-generating process), fits five nested
    formula-API models, and plots fit metrics, coefficient estimates,
    and actual-vs-predicted values for the richest model.

    Returns:
        tuple: (dict of fitted models keyed 'Model 1'..'Model 5',
                DataFrame of per-model fit statistics).
    """
    np.random.seed(42)
    n = 250

    # Generate correlated features (X2 is partly driven by X1).
    X1 = np.random.normal(50, 15, n)
    X2 = X1 * 0.6 + np.random.normal(0, 8, n)
    X3 = np.random.normal(30, 10, n)
    X4 = np.random.normal(100, 25, n)

    # Create target with interactions
    y = (20 +
         2.5 * X1 +
         -1.2 * X2 +
         0.8 * X3 +
         0.3 * X4 +
         0.5 * X1 * X3 / 100 +  # Interaction
         np.random.normal(0, 12, n))

    df = pd.DataFrame({'X1': X1, 'X2': X2, 'X3': X3, 'X4': X4, 'y': y})

    # Fit multiple models for comparison.
    models = {}
    formulas = {
        'Model 1': 'y ~ X1',
        'Model 2': 'y ~ X1 + X2',
        'Model 3': 'y ~ X1 + X2 + X3',
        'Model 4': 'y ~ X1 + X2 + X3 + X4',
        'Model 5': 'y ~ X1 * X3 + X2 + X4'  # With interaction
    }
    for name, formula in formulas.items():
        models[name] = smf.ols(formula, data=df).fit()

    # Create comparison table of fit statistics.
    comparison = []
    for name, model in models.items():
        comparison.append({
            'Model': name,
            'R²': model.rsquared,
            'Adj R²': model.rsquared_adj,
            'AIC': model.aic,
            'BIC': model.bic,
            'F-stat': model.fvalue,
            'F p-val': model.f_pvalue
        })
    comp_df = pd.DataFrame(comparison)

    # Visualize
    fig, axes = plt.subplots(2, 2, figsize=(14, 12))

    # 1. Model comparison bar plot
    metrics = ['R²', 'Adj R²']
    x = np.arange(len(models))
    width = 0.35
    for i, metric in enumerate(metrics):
        axes[0, 0].bar(x + i*width, comp_df[metric], width, label=metric)
    axes[0, 0].set_xlabel('Model')
    axes[0, 0].set_ylabel('Value')
    axes[0, 0].set_title('Model Performance Comparison')
    axes[0, 0].set_xticks(x + width/2)
    axes[0, 0].set_xticklabels(comp_df['Model'], rotation=45)
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)

    # 2. Coefficient plot for best model (the formula API names the
    # intercept 'Intercept', which we exclude from the plot).
    best_model = models['Model 5']
    coeffs = best_model.params.drop('Intercept')
    errors = best_model.bse.drop('Intercept')
    axes[0, 1].errorbar(coeffs.values, range(len(coeffs)),
                        xerr=1.96*errors.values, fmt='o', capsize=5)
    axes[0, 1].axvline(x=0, color='red', linestyle='--', alpha=0.5)
    axes[0, 1].set_yticks(range(len(coeffs)))
    axes[0, 1].set_yticklabels(coeffs.index)
    axes[0, 1].set_xlabel('Coefficient Value')
    axes[0, 1].set_title('Coefficient Estimates with 95% CI (Model 5)')
    axes[0, 1].grid(True, alpha=0.3)

    # 3. Actual vs Predicted
    axes[1, 0].scatter(df['y'], best_model.fittedvalues, alpha=0.5)
    axes[1, 0].plot([df['y'].min(), df['y'].max()],
                    [df['y'].min(), df['y'].max()], 'r--', linewidth=2)
    axes[1, 0].set_xlabel('Actual Values')
    axes[1, 0].set_ylabel('Predicted Values')
    axes[1, 0].set_title('Actual vs Predicted (Model 5)')
    axes[1, 0].grid(True, alpha=0.3)

    # 4. Model comparison table
    axes[1, 1].axis('off')
    comp_table = comp_df.round(4)
    axes[1, 1].table(cellText=comp_table.values,
                     colLabels=comp_table.columns,
                     cellLoc='center', loc='center')
    axes[1, 1].set_title('Model Comparison Table', fontsize=12)

    plt.suptitle('Multiple Linear Regression Analysis', fontsize=16)
    plt.tight_layout()
    plt.show()
    return models, comp_df


# Keep the returned objects: the model-comparison example in section 4 looks
# up fitted models by name, so discarding the return value (as the original
# did) would leave `models` undefined at module level.
models, comp_df = multiple_regression_table()

3. Advanced Regression Tables

Regularized Regression (Ridge/Lasso)

from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
def regularized_regression_table():
    """Compare Ridge and Lasso fits across a grid of penalty strengths.

    Builds a 10-feature dataset where only the first five features carry
    signal, then fits Ridge and Lasso at several alphas, recording train
    and test R² plus the number of surviving (non-zero) coefficients.

    Returns:
        tuple: (ridge results DataFrame, lasso results DataFrame).
    """
    np.random.seed(42)
    n = 300
    p = 10

    # Generate data with many features; only the first 5 are informative.
    X = np.random.randn(n, p)
    true_coef = np.zeros(p)
    true_coef[:5] = [2, -1.5, 1, -0.5, 0.8]
    y = X @ true_coef + np.random.randn(n) * 0.5

    # Hold out 20% so test-set R² reflects generalization.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize so the penalty treats every feature on the same scale.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit models across a log-spaced grid of penalty strengths.
    alphas = [0.01, 0.1, 1, 10, 100]
    ridge_results = []
    lasso_results = []
    for alpha in alphas:
        ridge = Ridge(alpha=alpha)
        ridge.fit(X_train_scaled, y_train)
        ridge_results.append({
            'alpha': alpha,
            'train_r2': ridge.score(X_train_scaled, y_train),
            'test_r2': ridge.score(X_test_scaled, y_test),
            'nonzero_coef': np.sum(np.abs(ridge.coef_) > 1e-6),
            'coefs': ridge.coef_
        })
        lasso = Lasso(alpha=alpha)
        lasso.fit(X_train_scaled, y_train)
        lasso_results.append({
            'alpha': alpha,
            'train_r2': lasso.score(X_train_scaled, y_train),
            'test_r2': lasso.score(X_test_scaled, y_test),
            'nonzero_coef': np.sum(np.abs(lasso.coef_) > 1e-6),
            'coefs': lasso.coef_
        })
    ridge_df = pd.DataFrame(ridge_results)
    lasso_df = pd.DataFrame(lasso_results)

    # Visualize
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Ridge coefficients path: coefficients shrink smoothly toward zero.
    for i in range(p):
        coef_path = [res['coefs'][i] for res in ridge_results]
        axes[0, 0].plot(alphas, coef_path, label=f'Feature {i+1}')
    axes[0, 0].set_xscale('log')
    axes[0, 0].set_xlabel('Alpha (log scale)')
    axes[0, 0].set_ylabel('Coefficient Value')
    axes[0, 0].set_title('Ridge Regression: Coefficient Paths')
    axes[0, 0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    axes[0, 0].grid(True, alpha=0.3)

    # Lasso coefficients path: coefficients hit exactly zero as alpha grows.
    for i in range(p):
        coef_path = [res['coefs'][i] for res in lasso_results]
        axes[0, 1].plot(alphas, coef_path, label=f'Feature {i+1}')
    axes[0, 1].set_xscale('log')
    axes[0, 1].set_xlabel('Alpha (log scale)')
    axes[0, 1].set_ylabel('Coefficient Value')
    axes[0, 1].set_title('Lasso Regression: Coefficient Paths')
    # Fix: this panel was missing the legend its Ridge counterpart has.
    axes[0, 1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    axes[0, 1].grid(True, alpha=0.3)

    # Ridge performance
    axes[1, 0].plot(alphas, ridge_df['train_r2'], 'o-', label='Train', linewidth=2)
    axes[1, 0].plot(alphas, ridge_df['test_r2'], 's-', label='Test', linewidth=2)
    axes[1, 0].set_xscale('log')
    axes[1, 0].set_xlabel('Alpha (log scale)')
    axes[1, 0].set_ylabel('R² Score')
    axes[1, 0].set_title('Ridge Regression Performance')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # Lasso performance
    axes[1, 1].plot(alphas, lasso_df['train_r2'], 'o-', label='Train', linewidth=2)
    axes[1, 1].plot(alphas, lasso_df['test_r2'], 's-', label='Test', linewidth=2)
    axes[1, 1].set_xscale('log')
    axes[1, 1].set_xlabel('Alpha (log scale)')
    axes[1, 1].set_ylabel('R² Score')
    axes[1, 1].set_title('Lasso Regression Performance')
    axes[1, 1].legend()
    axes[1, 1].grid(True, alpha=0.3)

    plt.suptitle('Regularized Regression Analysis', fontsize=14)
    plt.tight_layout()
    plt.show()

    print("\nRidge Regression Results:")
    print(ridge_df.round(4).to_string(index=False))
    print("\nLasso Regression Results:")
    print(lasso_df.round(4).to_string(index=False))
    return ridge_df, lasso_df


regularized_regression_table()

Logistic Regression Table

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
def logistic_regression_table():
    """Fit a Logit model on synthetic binary data and report its tables.

    Simulates three standard-normal predictors with known log-odds
    coefficients (intercept -2; slopes +1.5, -1.2, +0.8), fits a
    statsmodels Logit, and reports coefficients, odds ratios with 95%
    CIs, an ROC curve, and a confusion matrix.

    Returns:
        tuple: (fitted Logit results object, DataFrame with one row per
        parameter including odds ratios and their confidence bounds).
    """
    np.random.seed(42)
    n = 500
    # Generate binary classification data
    X1 = np.random.normal(0, 1, n)
    X2 = np.random.normal(0, 1, n)
    X3 = np.random.normal(0, 1, n)
    # Create log-odds, then map to probabilities with the inverse logit.
    log_odds = -2 + 1.5 * X1 - 1.2 * X2 + 0.8 * X3
    prob = 1 / (1 + np.exp(-log_odds))
    # Bernoulli draws: label is 1 with the computed probability.
    y = (np.random.rand(n) < prob).astype(int)
    # Create DataFrame
    df = pd.DataFrame({
        'X1': X1,
        'X2': X2,
        'X3': X3,
        'y': y
    })
    # Fit logistic regression (add_constant supplies the intercept column).
    X = sm.add_constant(df[['X1', 'X2', 'X3']])
    model = sm.Logit(df['y'], X).fit()
    # Odds ratios are exp(coef); exponentiating the CI bounds gives the
    # odds-ratio CI because exp is monotonic.
    odds_ratios = np.exp(model.params)
    odds_ratios_ci = np.exp(model.conf_int())
    # Create results table, one row per parameter (constant included).
    results = []
    for var in model.params.index:
        results.append({
            'Variable': var,
            'Coefficient': model.params[var],
            'Std Error': model.bse[var],
            'z-stat': model.tvalues[var],
            'p-value': model.pvalues[var],
            'Odds Ratio': odds_ratios[var],
            'OR 95% CI Lower': odds_ratios_ci[0][var],
            'OR 95% CI Upper': odds_ratios_ci[1][var]
        })
    results_df = pd.DataFrame(results)
    # Predictions: classify as 1 when fitted probability exceeds 0.5.
    y_pred_prob = model.predict(X)
    y_pred = (y_pred_prob > 0.5).astype(int)
    # Visualize
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    # Coefficient plot (log-odds scale) with 95% normal-approximation CIs.
    coefs = model.params.drop('const')
    errors = model.bse.drop('const')
    axes[0, 0].errorbar(coefs.values, range(len(coefs)),
                        xerr=1.96*errors.values, fmt='o', capsize=5, color='blue')
    axes[0, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)
    axes[0, 0].set_yticks(range(len(coefs)))
    axes[0, 0].set_yticklabels(coefs.index)
    axes[0, 0].set_xlabel('Coefficient')
    axes[0, 0].set_title('Logistic Regression Coefficients (95% CI)')
    axes[0, 0].grid(True, alpha=0.3)
    # Odds ratios plot; whiskers are asymmetric because exp is non-linear.
    or_values = odds_ratios.drop('const')
    or_ci_lower = odds_ratios_ci[0].drop('const')
    or_ci_upper = odds_ratios_ci[1].drop('const')
    axes[0, 1].errorbar(or_values.values, range(len(or_values)),
                        xerr=[or_values.values - or_ci_lower.values,
                              or_ci_upper.values - or_values.values],
                        fmt='o', capsize=5, color='green')
    # OR = 1 means "no effect" on the odds scale (hence the reference line).
    axes[0, 1].axvline(x=1, color='red', linestyle='--', alpha=0.5)
    axes[0, 1].set_yticks(range(len(or_values)))
    axes[0, 1].set_yticklabels(or_values.index)
    axes[0, 1].set_xlabel('Odds Ratio')
    axes[0, 1].set_title('Odds Ratios (95% CI)')
    axes[0, 1].set_xscale('log')
    axes[0, 1].grid(True, alpha=0.3)
    # ROC Curve
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, _ = roc_curve(y, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    axes[1, 0].plot(fpr, tpr, color='darkorange', lw=2,
                    label=f'ROC curve (AUC = {roc_auc:.3f})')
    axes[1, 0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    axes[1, 0].set_xlabel('False Positive Rate')
    axes[1, 0].set_ylabel('True Positive Rate')
    axes[1, 0].set_title('ROC Curve')
    axes[1, 0].legend(loc="lower right")
    axes[1, 0].grid(True, alpha=0.3)
    # Confusion Matrix
    cm = confusion_matrix(y, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1, 1])
    axes[1, 1].set_xlabel('Predicted')
    axes[1, 1].set_ylabel('Actual')
    axes[1, 1].set_title('Confusion Matrix')
    plt.suptitle('Logistic Regression Analysis', fontsize=16)
    plt.tight_layout()
    plt.show()
    # Print results
    print("\n" + "=" * 80)
    print("LOGISTIC REGRESSION RESULTS")
    print("=" * 80)
    print(results_df.round(4).to_string(index=False))
    print("\n" + "=" * 80)
    print("MODEL PERFORMANCE")
    print("=" * 80)
    print(classification_report(y, y_pred))
    return model, results_df


logistic_regression_table()

4. Creating Publication-Ready Tables

Formatting Regression Tables

def format_regression_table(model, var_names=None, decimals=3, add_stars=True):
    """Create a publication-ready formatted regression table.

    Args:
        model: Fitted statsmodels results object whose params/bse/pvalues
            are pandas Series and whose conf_int() returns a two-column
            DataFrame (column 0 = lower bound, 1 = upper bound).
        var_names: Optional display names for the rows; defaults to the
            model's own parameter index.
        decimals: Decimal places for coefficients, SEs, and CI bounds.
        add_stars: If True, append conventional significance stars.

    Returns:
        tuple: (formatted table DataFrame, dict of model statistics).
    """
    coefs = model.params
    std_err = model.bse
    pvals = model.pvalues

    # conf_int() returns a DataFrame indexed by variable. NOTE: the
    # original unpacked `model.conf_int().T`, but unpacking a DataFrame
    # yields its column *labels* (the variable names), not the bound
    # Series — select the lower/upper columns explicitly instead.
    conf = model.conf_int()
    ci_low, ci_high = conf[0], conf[1]

    if var_names is None:
        var_names = coefs.index

    # Build one formatted row per parameter.
    table_rows = []
    for var, coef, se, p, ci_l, ci_h in zip(var_names, coefs, std_err,
                                            pvals, ci_low, ci_high):
        coef_str = f"{coef:.{decimals}f}"
        # Conventional significance stars.
        if add_stars:
            if p < 0.001:
                coef_str += "***"
            elif p < 0.01:
                coef_str += "**"
            elif p < 0.05:
                coef_str += "*"
        # Standard error in parentheses, CI in brackets (journal style).
        se_str = f"({se:.{decimals}f})"
        ci_str = f"[{ci_l:.{decimals}f}, {ci_h:.{decimals}f}]"
        table_rows.append({
            'Variable': var,
            'Coefficient': coef_str,
            'Std.Error': se_str,
            'p-value': f"{p:.4f}",
            '95% CI': ci_str
        })
    table = pd.DataFrame(table_rows)

    # Model-level statistics; attributes missing on some result types
    # (e.g. Logit has no rsquared) are recorded as None.
    model_stats = {
        'Observations': int(model.nobs),
        'R-squared': model.rsquared if hasattr(model, 'rsquared') else None,
        'Adj. R-squared': model.rsquared_adj if hasattr(model, 'rsquared_adj') else None,
        'F-statistic': model.fvalue if hasattr(model, 'fvalue') else None,
        'AIC': model.aic,
        'BIC': model.bic
    }

    # Print the formatted table with footnotes.
    print("=" * 100)
    print("REGRESSION RESULTS")
    print("=" * 100)
    print(table.to_string(index=False))
    print("-" * 100)
    print(f"Observations: {model_stats['Observations']}")
    # Compare against None explicitly so a legitimate R² of 0.0 still prints.
    if model_stats['R-squared'] is not None:
        print(f"R²: {model_stats['R-squared']:.4f}")
        print(f"Adj. R²: {model_stats['Adj. R-squared']:.4f}")
    print(f"AIC: {model_stats['AIC']:.2f}")
    print(f"BIC: {model_stats['BIC']:.2f}")
    print("-" * 100)
    print("Significance codes: *** p<0.001, ** p<0.01, * p<0.05")
    print("=" * 100)
    return table, model_stats
# Example usage: format the OLS model fitted in section 1
# (the module-level `model` assigned by create_regression_table()).
format_regression_table(model)

Multi-model Comparison Table

def multi_model_comparison_table(models_dict, metrics=['r2', 'adj_r2', 'aic', 'bic']):
    """Build and print a side-by-side comparison table for fitted models.

    Args:
        models_dict: Mapping of display name -> fitted results object
            exposing rsquared/rsquared_adj/aic/bic (and optionally
            fvalue/f_pvalue).
        metrics: Which metric keys to include; unrecognized keys are
            silently skipped.

    Returns:
        DataFrame of formatted (string) metric values, one row per model.
    """
    # Dispatch table: metric key -> (column label, accessor).
    metric_map = {
        'r2': ('R²', lambda m: m.rsquared),
        'adj_r2': ('Adj R²', lambda m: m.rsquared_adj),
        'aic': ('AIC', lambda m: m.aic),
        'bic': ('BIC', lambda m: m.bic),
        'f_stat': ('F-stat', lambda m: m.fvalue),
        'f_pval': ('F p-val', lambda m: m.f_pvalue),
    }

    rows = []
    for name, fitted in models_dict.items():
        row = {'Model': name}
        for metric in metrics:
            if metric in metric_map:
                label, getter = metric_map[metric]
                row[label] = getter(fitted)
        rows.append(row)
    comparison_df = pd.DataFrame(rows)

    # Render every numeric column as a fixed 4-decimal string for display.
    for col in comparison_df.columns:
        if col != 'Model':
            comparison_df[col] = comparison_df[col].map(
                lambda x: f"{x:.4f}" if pd.notnull(x) else "")

    print("=" * 80)
    print("MODEL COMPARISON")
    print("=" * 80)
    print(comparison_df.to_string(index=False))
    return comparison_df
# Example with multiple models. NOTE: in the original article `models` was a
# local variable inside multiple_regression_table() whose return value was
# discarded, so these lookups raised NameError. Re-run the function here and
# keep the dict of fitted models it returns.
models, _ = multiple_regression_table()
models_dict = {
    'Model 1': models['Model 1'],
    'Model 2': models['Model 2'],
    'Model 3': models['Model 3'],
    'Model 4': models['Model 4'],
    'Model 5': models['Model 5']
}
multi_model_comparison_table(models_dict)

5. Interpreting Regression Tables

Statistical Significance

def interpret_significance(model):
    """Print a significance verdict for each coefficient of a fitted model.

    Args:
        model: Fitted results object with pandas-Series `pvalues` and
            `params` sharing the same variable index.
    """
    print("=" * 80)
    print("INTERPRETING STATISTICAL SIGNIFICANCE")
    print("=" * 80)

    # Ordered (cutoff, verdict) pairs; the first matching cutoff wins.
    thresholds = (
        (0.001, "*** Highly significant"),
        (0.01, "** Very significant"),
        (0.05, "* Significant"),
        (0.1, "† Marginally significant"),
    )

    print("\nVariable Significance:")
    print("-" * 40)
    for name, p in model.pvalues.items():
        verdict = "Not significant"
        for cutoff, label in thresholds:
            if p < cutoff:
                verdict = label
                break
        beta = model.params[name]
        sign = "positive" if beta > 0 else "negative"
        print(f"  {name}: {verdict}")
        print(f"    → Coefficient = {beta:.4f} ({sign})")
        print(f"    → p-value = {p:.4f}")

    print("\nInterpretation Guidelines:")
    print("-" * 40)
    print("  p < 0.001 (***): Very strong evidence against null hypothesis")
    print("  p < 0.01 (**): Strong evidence against null hypothesis")
    print("  p < 0.05 (*): Moderate evidence against null hypothesis")
    print("  p < 0.10 (†): Weak evidence, consider cautiously")
    print("  p ≥ 0.10: Insufficient evidence of relationship")
# Demonstrate on the module-level OLS model fitted in section 1.
interpret_significance(model)

Coefficient Interpretation

def interpret_coefficients(model, feature_names=None):
    """Print a plain-English reading of each fitted coefficient.

    Args:
        model: Fitted results object with pandas-Series `params` and a
            `conf_int()` DataFrame (column 0 = lower, 1 = upper bound).
        feature_names: Unused; retained for backward compatibility.
    """
    print("=" * 80)
    print("COEFFICIENT INTERPRETATION")
    print("=" * 80)

    bounds = model.conf_int()
    for name, value in model.params.items():
        if name == 'const':
            # The intercept gets its own wording.
            print(f"\n{name} (Intercept):")
            print(f"  Value: {value:.4f}")
            print(f"  Interpretation: Predicted value when all predictors are zero")
            print(f"  Confidence: {bounds[0][name]:.4f} to {bounds[1][name]:.4f}")
            continue

        print(f"\n{name}:")
        print(f"  Coefficient: {value:.4f}")
        print(f"  Interpretation: A 1-unit increase in {name} is associated with")
        print(f"    a {value:.4f} unit change in the outcome")
        print(f"  Confidence: {bounds[0][name]:.4f} to {bounds[1][name]:.4f}")

        # Rough effect-size buckets, only reported for OLS-style results.
        if not hasattr(model, 'rsquared'):
            continue
        magnitude = abs(value)
        if magnitude > 0.5:
            strength = "strong"
        elif magnitude > 0.2:
            strength = "moderate"
        else:
            strength = "weak"
        print(f"  Effect size: {strength} effect")


interpret_coefficients(model)

6. Real-World Applications

House Price Prediction

def house_price_regression():
    """Fit and report an OLS model of house prices on property features.

    Simulates 1000 houses (size, rooms, age, location quality) with known
    price effects, fits an OLS model, plots coefficients, relative
    importance, fit quality, and residuals, then prints the table.

    Returns:
        tuple: (fitted model, coefficient table DataFrame).
    """
    np.random.seed(42)
    n = 1000

    # Simulated property characteristics (drawn in a fixed order so the
    # seeded stream reproduces the article's numbers).
    features = pd.DataFrame({
        'sqft': np.random.normal(2000, 500, n),
        'bedrooms': np.random.poisson(3, n),
        'bathrooms': np.random.poisson(2, n),
        'age': np.random.exponential(20, n),
        'location_score': np.random.normal(70, 15, n),
    })

    # Price with known effects plus Gaussian noise.
    noise = np.random.normal(0, 50000, n)
    df = features.copy()
    df['price'] = (50000
                   + 150 * features['sqft']
                   + 15000 * features['bedrooms']
                   + 20000 * features['bathrooms']
                   - 1000 * features['age']
                   + 500 * features['location_score']
                   + noise)

    # Fit the model (add_constant supplies the intercept).
    X = sm.add_constant(df.drop('price', axis=1))
    model = sm.OLS(df['price'], X).fit()

    # One formatted row per parameter.
    conf = model.conf_int()
    results_df = pd.DataFrame([
        {
            'Variable': var,
            'Coefficient': model.params[var],
            'Std. Error': model.bse[var],
            'p-value': model.pvalues[var],
            '95% CI Lower': conf[0][var],
            '95% CI Upper': conf[1][var],
        }
        for var in model.params.index
    ])

    # Visualize
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    slopes = model.params.drop('const')
    slope_se = model.bse.drop('const')

    # Coefficient estimates with 95% confidence whiskers.
    axes[0, 0].errorbar(slopes.values, range(len(slopes)),
                        xerr=1.96 * slope_se.values, fmt='o', capsize=5)
    axes[0, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)
    axes[0, 0].set_yticks(range(len(slopes)))
    axes[0, 0].set_yticklabels(slopes.index)
    axes[0, 0].set_xlabel('Coefficient Value')
    axes[0, 0].set_title('House Price Determinants')
    axes[0, 0].grid(True, alpha=0.3)

    # Relative importance = |coefficient| normalized to sum to one.
    importance = np.abs(slopes.values)
    importance = importance / importance.sum()
    axes[0, 1].barh(slopes.index, importance, color='steelblue')
    axes[0, 1].set_xlabel('Relative Importance')
    axes[0, 1].set_title('Variable Importance')
    axes[0, 1].grid(True, alpha=0.3)

    # Actual vs predicted with a 45° reference line.
    lo, hi = df['price'].min(), df['price'].max()
    axes[1, 0].scatter(df['price'], model.fittedvalues, alpha=0.3)
    axes[1, 0].plot([lo, hi], [lo, hi], 'r--', linewidth=2)
    axes[1, 0].set_xlabel('Actual Price ($)')
    axes[1, 0].set_ylabel('Predicted Price ($)')
    axes[1, 0].set_title('Model Performance')
    axes[1, 0].grid(True, alpha=0.3)

    # Residuals vs fitted values (look for structure/heteroscedasticity).
    axes[1, 1].scatter(model.fittedvalues, model.resid, alpha=0.3)
    axes[1, 1].axhline(y=0, color='red', linestyle='--', linewidth=2)
    axes[1, 1].set_xlabel('Fitted Values')
    axes[1, 1].set_ylabel('Residuals')
    axes[1, 1].set_title('Residual Plot')
    axes[1, 1].grid(True, alpha=0.3)

    plt.suptitle('House Price Prediction Model', fontsize=16)
    plt.tight_layout()
    plt.show()

    print("\n" + "=" * 80)
    print("HOUSE PRICE REGRESSION RESULTS")
    print("=" * 80)
    print(results_df.round(2).to_string(index=False))
    print("-" * 80)
    print(f"R² = {model.rsquared:.4f}")
    print(f"Adj. R² = {model.rsquared_adj:.4f}")
    print(f"RMSE = ${np.sqrt(model.mse_resid):,.0f}")
    return model, results_df


house_price_regression()

Customer Lifetime Value Prediction

def customer_clv_regression():
    """Customer Lifetime Value (CLV) regression with an interaction term.

    Simulates RFM-style customer data (tenure, purchase frequency,
    monetary value, recency, satisfaction), fits an OLS model via the
    formula API including a frequency:satisfaction interaction, plots the
    results, and prints a point-elasticity analysis evaluated at the
    sample means.

    Returns:
        tuple: (fitted model, coefficient DataFrame).
    """
    np.random.seed(42)
    n = 500
    # Generate customer data. Units are implied, not enforced — presumably
    # tenure in months and recency in days; confirm before reuse.
    tenure = np.random.exponential(24, n)
    frequency = np.random.poisson(5, n)
    monetary = np.random.gamma(2, 100, n)
    recency = np.random.uniform(0, 90, n)
    satisfaction = np.random.normal(7, 1.5, n)
    # Create CLV with interactions: log terms give diminishing returns on
    # tenure/recency; frequency and satisfaction reinforce each other.
    clv = (100 + 
           50 * np.log(tenure + 1) + 
           30 * frequency + 
           0.5 * monetary - 
           10 * np.log(recency + 1) + 
           50 * satisfaction + 
           2 * frequency * satisfaction + 
           np.random.normal(0, 100, n))
    # Create DataFrame
    df = pd.DataFrame({
        'tenure': tenure,
        'frequency': frequency,
        'monetary': monetary,
        'recency': recency,
        'satisfaction': satisfaction,
        'clv': clv
    })
    # Fit model with interaction (':' in the formula adds the product term).
    formula = 'clv ~ tenure + frequency + monetary + recency + satisfaction + frequency:satisfaction'
    model = smf.ols(formula, data=df).fit()
    # Create coefficient table; the Series columns align on the parameter
    # index, so rows line up with the Variable column positionally.
    coef_df = pd.DataFrame({
        'Variable': model.params.index,
        'Coefficient': model.params.values,
        'Std Error': model.bse.values,
        'p-value': model.pvalues.values,
        '95% CI Lower': model.conf_int()[0],
        '95% CI Upper': model.conf_int()[1]
    })
    # Calculate point elasticity at the means: beta * mean(x) / mean(y).
    elasticity = {}
    for var in ['tenure', 'frequency', 'monetary', 'recency', 'satisfaction']:
        elasticity[var] = model.params[var] * df[var].mean() / df['clv'].mean()
    # Visualize
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    # Coefficient plot (intercept excluded — its scale would dwarf the rest).
    coefs = coef_df[coef_df['Variable'] != 'Intercept']
    axes[0, 0].barh(coefs['Variable'], coefs['Coefficient'], 
                    xerr=coefs['Std Error'] * 1.96, color='steelblue')
    axes[0, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)
    axes[0, 0].set_xlabel('Coefficient')
    axes[0, 0].set_title('Effect on Customer Lifetime Value')
    axes[0, 0].grid(True, alpha=0.3)
    # Elasticity plot
    axes[1, 0].barh(list(elasticity.keys()), list(elasticity.values()), color='coral')
    axes[1, 0].set_xlabel('Elasticity')
    axes[1, 0].set_title('Variable Elasticity')
    axes[1, 0].grid(True, alpha=0.3)
    # Actual vs Predicted with a 45° reference line.
    axes[1, 1].scatter(df['clv'], model.fittedvalues, alpha=0.5)
    axes[1, 1].plot([df['clv'].min(), df['clv'].max()], 
                    [df['clv'].min(), df['clv'].max()], 'r--')
    axes[1, 1].set_xlabel('Actual CLV')
    axes[1, 1].set_ylabel('Predicted CLV')
    axes[1, 1].set_title('Model Fit')
    axes[1, 1].grid(True, alpha=0.3)
    # Model statistics table (upper-right panel is text-only; the f-string
    # content must stay at column 0 to render flush-left in the figure).
    axes[0, 1].axis('off')
    stats_text = f"""
MODEL STATISTICS
================
R²: {model.rsquared:.4f}
Adj. R²: {model.rsquared_adj:.4f}
F-statistic: {model.fvalue:.2f}
F p-value: {model.f_pvalue:.4f}
AIC: {model.aic:.1f}
BIC: {model.bic:.1f}
Observations: {int(model.nobs)}
"""
    axes[0, 1].text(0.05, 0.95, stats_text, transform=axes[0, 1].transAxes,
                    fontsize=10, verticalalignment='top', fontfamily='monospace')
    plt.suptitle('Customer Lifetime Value Model', fontsize=16)
    plt.tight_layout()
    plt.show()
    print("\n" + "=" * 80)
    print("CUSTOMER LIFETIME VALUE REGRESSION")
    print("=" * 80)
    print(coef_df.round(4).to_string(index=False))
    print("\n" + "-" * 40)
    print("ELASTICITY ANALYSIS:")
    for var, el in elasticity.items():
        print(f"  {var}: {el:.3f} (1% increase in {var} → {el:.1f}% change in CLV)")
    return model, coef_df


customer_clv_regression()

7. Best Practices and Tips

Regression Table Checklist

def regression_table_checklist():
    """Print a best-practices checklist for building regression tables."""
    print("=" * 80)
    print("REGRESSION TABLE BEST PRACTICES")
    print("=" * 80)

    # (category, recommendations) pairs, printed in order.
    sections = (
        ("Coefficients", (
            "Include standard errors in parentheses",
            "Add significance stars (*, **, ***)",
            "Provide confidence intervals",
            "Consider standardized coefficients for comparison",
        )),
        ("Model Fit", (
            "Report R-squared and adjusted R-squared",
            "Include F-statistic and p-value",
            "Report AIC/BIC for model comparison",
            "Include number of observations",
        )),
        ("Diagnostics", (
            "Check multicollinearity (VIF)",
            "Test residual normality",
            "Check heteroscedasticity",
            "Consider influence diagnostics",
        )),
        ("Presentation", (
            "Use consistent decimal places",
            "Format large numbers appropriately",
            "Provide clear variable labels",
            "Include footnotes for significance codes",
        )),
        ("Interpretation", (
            "Discuss practical significance",
            "Explain confidence intervals",
            "Address limitations",
            "Provide context for coefficients",
        )),
    )

    for category, items in sections:
        print(f"\n{category}:")
        print("-" * 40)
        for item in items:
            print(f"  ✓ {item}")


regression_table_checklist()

Common Mistakes to Avoid

def common_regression_mistakes():
    """Illustrate six common regression pitfalls in a 2x3 panel of plots.

    Panels: overfitting, multicollinearity, misspecified non-linearity,
    outlier influence, heteroscedasticity, and non-normal residuals.
    """
    np.random.seed(42)
    n = 200
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    # 1. Overfitting: 20 noisy features, only the first carries signal.
    X = np.random.randn(n, 20)
    y = X[:, 0] * 2 + np.random.randn(n) * 0.5
    model_over = LinearRegression().fit(X, y)
    axes[0].bar(range(len(model_over.coef_)), model_over.coef_)
    axes[0].set_title('Overfitting: Many Coefficients')
    axes[0].set_xlabel('Feature')
    axes[0].set_ylabel('Coefficient')
    axes[0].grid(True, alpha=0.3)

    # 2. Multicollinearity: X2 is nearly a copy of X1, inflating SEs.
    X1 = np.random.randn(n)
    X2 = X1 * 0.95 + np.random.randn(n) * 0.1
    y = X1 * 2 + np.random.randn(n) * 0.5
    X_mc = sm.add_constant(np.column_stack([X1, X2]))
    model_mc = sm.OLS(y, X_mc).fit()
    axes[1].errorbar(model_mc.params, range(3), xerr=1.96*model_mc.bse, fmt='o')
    axes[1].axvline(x=0, color='red', linestyle='--')
    axes[1].set_title('Multicollinearity: Large Standard Errors')
    # Fix the tick positions before labeling them; calling set_yticklabels
    # alone leaves the labels misaligned with auto-chosen ticks.
    axes[1].set_yticks(range(3))
    axes[1].set_yticklabels(['const', 'X1', 'X2'])
    axes[1].grid(True, alpha=0.3)

    # 3. Non-linearity: fit a straight line to a quadratic relationship.
    x = np.linspace(-3, 3, n)
    y_nonlinear = 2 * x**2 + np.random.randn(n) * 2
    axes[2].scatter(x, y_nonlinear, alpha=0.5)
    axes[2].plot(x, 2*x + 0, 'r-', label='Linear Fit')
    axes[2].set_title('Non-linear Relationship Misspecified')
    axes[2].legend()
    axes[2].grid(True, alpha=0.3)

    # 4. Outlier: a single extreme point drags the fitted line.
    x = np.random.randn(n)
    y = 2 * x + np.random.randn(n) * 0.5
    x[0] = 5
    y[0] = -10
    model_out = sm.OLS(y, sm.add_constant(x)).fit()
    axes[3].scatter(x, y, alpha=0.5)
    # Sort by x so the fitted line plots left-to-right without zigzags.
    axes[3].plot(np.sort(x), model_out.fittedvalues[np.argsort(x)], 'r-')
    axes[3].scatter(x[0], y[0], color='red', s=100, label='Outlier')
    axes[3].set_title('Outlier Influence')
    axes[3].legend()
    axes[3].grid(True, alpha=0.3)

    # 5. Heteroscedasticity: residual spread grows with the fitted value.
    x = np.linspace(0, 10, n)
    y = 2 * x + np.random.randn(n) * x
    model_het = sm.OLS(y, sm.add_constant(x)).fit()
    axes[4].scatter(model_het.fittedvalues, model_het.resid, alpha=0.5)
    axes[4].axhline(y=0, color='red', linestyle='--')
    axes[4].set_title('Heteroscedasticity')
    axes[4].set_xlabel('Fitted Values')
    axes[4].set_ylabel('Residuals')
    axes[4].grid(True, alpha=0.3)

    # 6. Non-normal residuals: skewed (exponential) noise on a Q-Q plot.
    x = np.random.randn(n)
    y = 2 * x + np.random.exponential(1, n) - 1
    model_norm = sm.OLS(y, sm.add_constant(x)).fit()
    from scipy import stats
    stats.probplot(model_norm.resid, dist="norm", plot=axes[5])
    axes[5].set_title('Non-normal Residuals')
    axes[5].grid(True, alpha=0.3)

    plt.suptitle('Common Regression Mistakes to Avoid', fontsize=16)
    plt.tight_layout()
    plt.show()


common_regression_mistakes()

Conclusion

Regression tables are essential tools for communicating model results in data science. Understanding how to create, interpret, and present them effectively is crucial for any data scientist.

Key Takeaways

  1. Components: Know the meaning of coefficients, standard errors, p-values, and confidence intervals
  2. Interpretation: Focus on both statistical and practical significance
  3. Presentation: Create clear, publication-ready tables with proper formatting
  4. Model Comparison: Use multiple metrics to evaluate and compare models
  5. Diagnostics: Always check model assumptions and potential issues
  6. Context: Interpret results within the domain context

Quick Reference

| Component | What It Tells You | How to Interpret |
| --- | --- | --- |
| Coefficient | Direction and magnitude of relationship | Unit change in Y per unit change in X |
| Standard Error | Precision of estimate | Smaller = more precise |
| p-value | Statistical significance | p < 0.05 indicates a significant relationship |
| Confidence Interval | Range of likely values | 95% confident the true value lies within |
| R-squared | Model fit | Proportion of variance explained |
| F-statistic | Overall model significance | Tests whether any predictors are significant |

Mastering regression tables is fundamental to data science and statistical modeling!

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper