Complete Guide to Linear Functions in Data Science

Introduction to Linear Functions

Linear functions are the foundation of many data science techniques. From simple predictions to complex neural networks, understanding linear relationships is essential for anyone working with data. This guide explores linear functions from both mathematical and practical perspectives, with applications in data science.

Key Concepts

  • Linear Relationship: A relationship that can be represented by a straight line
  • Slope: Rate of change (steepness of the line)
  • Intercept: The value of y where the line crosses the y-axis (i.e., the output when x = 0)
  • Linearity: The property of constant rate of change
  • Linear Models: Statistical models that assume linear relationships

1. What is a Linear Function?

Definition and Basic Form

A linear function is a function that creates a straight line when graphed. It has the general form:

y = mx + b

Where:

  • y is the dependent variable (output)
  • x is the independent variable (input)
  • m is the slope (rate of change)
  • b is the y-intercept (value when x = 0)
import numpy as np
import matplotlib.pyplot as plt
def linear_function(x, slope, intercept):
    """Evaluate the line y = slope * x + intercept.

    Works elementwise when ``x`` is a NumPy array, and on plain scalars.
    """
    rise = x * slope
    return rise + intercept
# Example: evaluate y = 2x + 1 at the integer points 0..5.
x_values = np.array([0, 1, 2, 3, 4, 5])
y_values = linear_function(x_values, slope=2, intercept=1)
print("Linear Function: y = 2x + 1")
for x, y in zip(x_values, y_values):
    print(f"x = {x} → y = {y}")
# Visualize: line plus the sampled points, with the coordinate axes drawn in.
plt.figure(figsize=(8, 5))
plt.plot(x_values, y_values, 'b-', linewidth=2, label='y = 2x + 1')
plt.scatter(x_values, y_values, color='red', s=50)
plt.xlabel('x (independent variable)')
plt.ylabel('y (dependent variable)')
plt.title('Linear Function Example')
plt.grid(True, alpha=0.3)
plt.legend()
# Thin black lines mark the x- and y-axes for reference.
plt.axhline(y=0, color='k', linewidth=0.5)
plt.axvline(x=0, color='k', linewidth=0.5)
plt.show()

Understanding Slope and Intercept

def explore_linear_parameters():
    """Explore how slope and intercept affect the line"""
    x = np.linspace(-5, 5, 100)
    # Different slopes: negative = falling, 0 = horizontal, positive = rising
    slopes = [-2, -0.5, 0, 0.5, 2]
    intercept = 0
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    for slope in slopes:
        y = linear_function(x, slope, intercept)
        plt.plot(x, y, label=f'slope = {slope}')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Effect of Slope (m)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.axhline(y=0, color='k', linewidth=0.5)
    plt.axvline(x=0, color='k', linewidth=0.5)
    # Different intercepts: the same slope-1 line shifted up/down
    slope = 1
    intercepts = [-3, -1, 0, 1, 3]
    plt.subplot(1, 2, 2)
    for intercept in intercepts:
        y = linear_function(x, slope, intercept)
        plt.plot(x, y, label=f'intercept = {intercept}')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Effect of Intercept (b)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.axhline(y=0, color='k', linewidth=0.5)
    plt.axvline(x=0, color='k', linewidth=0.5)
    plt.tight_layout()
    plt.show()
explore_linear_parameters()

2. Linear Relationships in Data

Identifying Linear Relationships

import pandas as pd
import seaborn as sns
def demonstrate_linear_relationships():
    """Show examples of linear vs non-linear relationships"""
    # Generate sample data (fixed seed so the noise is reproducible)
    np.random.seed(42)
    x = np.linspace(0, 10, 100)
    # Linear relationships: same true line y = 2x + 1, different noise levels
    linear_strong = 2 * x + 1 + np.random.normal(0, 1, 100)
    linear_weak = 2 * x + 1 + np.random.normal(0, 3, 100)
    # Non-linear relationships
    quadratic = x**2 - 10*x + 25 + np.random.normal(0, 2, 100)
    exponential = np.exp(x/3) + np.random.normal(0, 2, 100)
    # 2x2 grid: linear cases on the top row, non-linear on the bottom
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes[0, 0].scatter(x, linear_strong, alpha=0.6)
    axes[0, 0].plot(x, 2*x + 1, 'r-', label='True line')
    axes[0, 0].set_title('Strong Linear Relationship')
    axes[0, 0].set_xlabel('x')
    axes[0, 0].set_ylabel('y')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)
    axes[0, 1].scatter(x, linear_weak, alpha=0.6)
    axes[0, 1].plot(x, 2*x + 1, 'r-', label='True line')
    axes[0, 1].set_title('Weak Linear Relationship')
    axes[0, 1].set_xlabel('x')
    axes[0, 1].set_ylabel('y')
    axes[0, 1].legend()
    axes[0, 1].grid(True, alpha=0.3)
    axes[1, 0].scatter(x, quadratic, alpha=0.6)
    axes[1, 0].plot(x, x**2 - 10*x + 25, 'r-', label='Quadratic')
    axes[1, 0].set_title('Quadratic (Non-Linear)')
    axes[1, 0].set_xlabel('x')
    axes[1, 0].set_ylabel('y')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)
    axes[1, 1].scatter(x, exponential, alpha=0.6)
    axes[1, 1].plot(x, np.exp(x/3), 'r-', label='Exponential')
    axes[1, 1].set_title('Exponential (Non-Linear)')
    axes[1, 1].set_xlabel('x')
    axes[1, 1].set_ylabel('y')
    axes[1, 1].legend()
    axes[1, 1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
demonstrate_linear_relationships()

Measuring Linear Relationships: Correlation

def calculate_correlation(x, y):
    """Pearson correlation coefficient between two paired samples.

    Returns a value in [-1, 1]: +1 for a perfect rising line, -1 for a
    perfect falling line, near 0 when there is no linear relationship.
    """
    dx = x - np.mean(x)
    dy = y - np.mean(y)
    covariance_sum = np.sum(dx * dy)
    scale = np.sqrt(np.sum(dx ** 2) * np.sum(dy ** 2))
    return covariance_sum / scale
# Generate data with different correlation strengths
np.random.seed(42)
n_samples = 100
# Strong positive correlation: noise small relative to the 2x signal
x_strong = np.random.randn(n_samples)
y_strong = 2 * x_strong + np.random.randn(n_samples) * 0.5
# Weak positive correlation: noise dominates the 0.3x signal
x_weak = np.random.randn(n_samples)
y_weak = 0.3 * x_weak + np.random.randn(n_samples) * 1.2
# No correlation: two independent samples
x_none = np.random.randn(n_samples)
y_none = np.random.randn(n_samples)
# Strong negative correlation
x_neg = np.random.randn(n_samples)
y_neg = -2 * x_neg + np.random.randn(n_samples) * 0.5
# Calculate correlations
corr_strong = calculate_correlation(x_strong, y_strong)
corr_weak = calculate_correlation(x_weak, y_weak)
corr_none = calculate_correlation(x_none, y_none)
corr_neg = calculate_correlation(x_neg, y_neg)
# Visualize each case with its correlation coefficient in the title
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes[0, 0].scatter(x_strong, y_strong, alpha=0.6)
axes[0, 0].set_title(f'Strong Positive Correlation (r = {corr_strong:.2f})')
axes[0, 0].set_xlabel('x')
axes[0, 0].set_ylabel('y')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 1].scatter(x_weak, y_weak, alpha=0.6)
axes[0, 1].set_title(f'Weak Positive Correlation (r = {corr_weak:.2f})')
axes[0, 1].set_xlabel('x')
axes[0, 1].set_ylabel('y')
axes[0, 1].grid(True, alpha=0.3)
axes[1, 0].scatter(x_none, y_none, alpha=0.6)
axes[1, 0].set_title(f'No Correlation (r = {corr_none:.2f})')
axes[1, 0].set_xlabel('x')
axes[1, 0].set_ylabel('y')
axes[1, 0].grid(True, alpha=0.3)
axes[1, 1].scatter(x_neg, y_neg, alpha=0.6)
axes[1, 1].set_title(f'Strong Negative Correlation (r = {corr_neg:.2f})')
axes[1, 1].set_xlabel('x')
axes[1, 1].set_ylabel('y')
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

3. Simple Linear Regression

Fitting a Line to Data

Linear regression finds the "best fit" line that minimizes the errors between predictions and actual values.

from scipy import stats
class SimpleLinearRegression:
    """Ordinary least squares fit of a single-feature line y = m*x + b.

    Attributes:
        slope: Fitted slope m (None until ``fit`` is called).
        intercept: Fitted intercept b (None until ``fit`` is called).
    """

    def __init__(self):
        self.slope = None
        self.intercept = None

    def fit(self, x, y):
        """Fit the line by least squares and return ``self`` for chaining.

        Accepts any equal-length array-likes (lists, tuples, ndarrays);
        the original implementation raised TypeError on plain sequences.
        """
        # Coerce to float arrays so plain Python sequences work too.
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        # Calculate means
        x_mean = np.mean(x)
        y_mean = np.mean(y)
        # Closed-form OLS slope: cov(x, y) / var(x)
        numerator = np.sum((x - x_mean) * (y - y_mean))
        denominator = np.sum((x - x_mean) ** 2)
        self.slope = numerator / denominator
        # The least-squares line always passes through (x_mean, y_mean)
        self.intercept = y_mean - self.slope * x_mean
        return self

    def predict(self, x):
        """Predict y for a scalar or array-like x using the fitted line."""
        return self.slope * np.asarray(x, dtype=float) + self.intercept

    def r_squared(self, x, y):
        """Coefficient of determination R² on the given data.

        Computed as 1 - SS_res / SS_tot; 1.0 indicates a perfect fit.
        """
        y = np.asarray(y, dtype=float)
        y_pred = self.predict(x)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        return 1 - (ss_res / ss_tot)
# Generate sample data around a known line so the fitted parameters can be
# compared against the truth.
np.random.seed(42)
x = np.linspace(0, 10, 50)
true_slope = 1.5
true_intercept = 2
y = true_slope * x + true_intercept + np.random.normal(0, 1.5, 50)
# Fit the model
model = SimpleLinearRegression()
model.fit(x, y)
# Make predictions
y_pred = model.predict(x)
# Calculate R²
r2 = model.r_squared(x, y)
print("Simple Linear Regression Results")
print("=" * 50)
print(f"True slope: {true_slope}")
print(f"Estimated slope: {model.slope:.3f}")
print(f"True intercept: {true_intercept}")
print(f"Estimated intercept: {model.intercept:.3f}")
print(f"R² score: {r2:.3f}")
print(f"Interpretation: {r2*100:.1f}% of variance explained")
# Visualize: scattered data, fitted line (red), and the true line (green)
plt.figure(figsize=(10, 6))
plt.scatter(x, y, alpha=0.6, label='Actual data')
plt.plot(x, y_pred, 'r-', linewidth=2, label=f'Fit: y = {model.slope:.2f}x + {model.intercept:.2f}')
plt.plot(x, true_slope * x + true_intercept, 'g--', linewidth=2, label='True line')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Simple Linear Regression')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

Understanding the Error

def analyze_regression_errors(x, y, y_pred):
    """Analyze and visualize regression errors (residuals)"""
    # Residual = actual minus predicted; a good fit leaves only noise here
    residuals = y - y_pred
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    # Residual plot: should show no pattern around the zero line
    axes[0].scatter(x, residuals, alpha=0.6)
    axes[0].axhline(y=0, color='r', linestyle='--', linewidth=2)
    axes[0].set_xlabel('x')
    axes[0].set_ylabel('Residuals (Actual - Predicted)')
    axes[0].set_title('Residual Plot')
    axes[0].grid(True, alpha=0.3)
    # Histogram of residuals: should look roughly normal, centered at zero
    axes[1].hist(residuals, bins=15, edgecolor='black', alpha=0.7)
    axes[1].axvline(x=0, color='r', linestyle='--', linewidth=2)
    axes[1].set_xlabel('Residuals')
    axes[1].set_ylabel('Frequency')
    axes[1].set_title('Distribution of Residuals')
    axes[1].grid(True, alpha=0.3)
    # Print statistics
    print("Residual Analysis")
    print("=" * 40)
    print(f"Mean of residuals: {np.mean(residuals):.4f}")
    print(f"Std of residuals: {np.std(residuals):.4f}")
    print(f"Min residual: {np.min(residuals):.4f}")
    print(f"Max residual: {np.max(residuals):.4f}")
    plt.tight_layout()
    plt.show()
# Using the previous model's module-level x, y, y_pred from the section above
analyze_regression_errors(x, y, y_pred)

4. Multiple Linear Regression

When we have multiple input variables, we use multiple linear regression:

y = b₀ + b₁x₁ + b₂x₂ + ... + bₙxₙ
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
def demonstrate_multiple_regression():
    """Show multiple linear regression with multiple features"""
    # Generate sample data with 3 features (fixed seed for reproducibility)
    np.random.seed(42)
    n_samples = 200
    # Features (independent standard-normal draws, so they share a scale)
    x1 = np.random.randn(n_samples)  # Feature 1
    x2 = np.random.randn(n_samples)  # Feature 2
    x3 = np.random.randn(n_samples)  # Feature 3
    # True coefficients used to generate the target
    true_b0 = 5
    true_b1 = 2.5
    true_b2 = -1.5
    true_b3 = 0.5
    # Target with noise
    y = true_b0 + true_b1 * x1 + true_b2 * x2 + true_b3 * x3 + np.random.randn(n_samples) * 0.5
    # Combine features into an (n_samples, 3) design matrix
    X = np.column_stack([x1, x2, x3])
    # Fit multiple linear regression
    model = LinearRegression()
    model.fit(X, y)
    # Predictions (in-sample; this demo has no train/test split)
    y_pred = model.predict(X)
    print("Multiple Linear Regression Results")
    print("=" * 50)
    print(f"True coefficients: b0={true_b0}, b1={true_b1}, b2={true_b2}, b3={true_b3}")
    print(f"Estimated coefficients:")
    print(f"  b0 (intercept): {model.intercept_:.3f}")
    print(f"  b1: {model.coef_[0]:.3f}")
    print(f"  b2: {model.coef_[1]:.3f}")
    print(f"  b3: {model.coef_[2]:.3f}")
    print(f"R² Score: {r2_score(y, y_pred):.4f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y, y_pred)):.4f}")
    # Feature importance (absolute coefficient values — comparable here only
    # because all three features were drawn on the same scale)
    feature_importance = np.abs(model.coef_)
    features = ['Feature 1', 'Feature 2', 'Feature 3']
    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    # Actual vs Predicted: points on the diagonal mean perfect predictions
    axes[0].scatter(y, y_pred, alpha=0.6)
    axes[0].plot([y.min(), y.max()], [y.min(), y.max()], 'r--', linewidth=2)
    axes[0].set_xlabel('Actual Values')
    axes[0].set_ylabel('Predicted Values')
    axes[0].set_title('Actual vs Predicted')
    axes[0].grid(True, alpha=0.3)
    # Feature importance bar chart
    axes[1].bar(features, feature_importance, color='skyblue', edgecolor='black')
    axes[1].set_xlabel('Features')
    axes[1].set_ylabel('Absolute Coefficient Value')
    axes[1].set_title('Feature Importance')
    axes[1].grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()
demonstrate_multiple_regression()

5. Linear Regression in Real Data

Example: House Price Prediction

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def house_price_prediction():
    """Predict house prices using linear regression.

    Builds a synthetic dataset, fits a scaled linear model, reports
    train/test metrics, and returns ``(model, scaler)``.
    """
    # Create synthetic house price dataset
    np.random.seed(42)
    n_houses = 500
    # Features
    sqft = np.random.normal(2000, 500, n_houses)  # Square footage
    bedrooms = np.random.randint(1, 5, n_houses)  # Number of bedrooms
    age = np.random.randint(0, 50, n_houses)      # Age of house
    location_score = np.random.uniform(0, 10, n_houses)  # Location quality
    # Generate price with a known linear relationship + Gaussian noise
    price = (150 * sqft +
             20000 * bedrooms -
             500 * age +
             30000 * location_score +
             50000 +
             np.random.normal(0, 30000, n_houses))
    # Create DataFrame
    df = pd.DataFrame({
        'sqft': sqft,
        'bedrooms': bedrooms,
        'age': age,
        'location_score': location_score,
        'price': price
    })
    print("House Price Dataset")
    print("=" * 50)
    print(df.head())
    print("\nSummary Statistics:")
    print(df.describe())
    # Split data (80/20 train/test)
    X = df[['sqft', 'bedrooms', 'age', 'location_score']]
    y = df['price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Scale features: fit the scaler on the training set only so test-set
    # statistics never leak into training
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # Train model
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    # Evaluate on both splits to gauge overfitting
    y_pred_train = model.predict(X_train_scaled)
    y_pred_test = model.predict(X_test_scaled)
    print("\nModel Performance")
    print("=" * 50)
    print(f"Training R²: {r2_score(y_train, y_pred_train):.4f}")
    print(f"Test R²: {r2_score(y_test, y_pred_test):.4f}")
    print(f"Training RMSE: ${np.sqrt(mean_squared_error(y_train, y_pred_train)):,.0f}")
    print(f"Test RMSE: ${np.sqrt(mean_squared_error(y_test, y_pred_test)):,.0f}")
    # NOTE(review): because the features were standardized, each coefficient
    # is the price change per one standard deviation of the feature, not per
    # raw unit as the printed text suggests — confirm intended wording.
    print("\nFeature Coefficients (scaled):")
    for feature, coef in zip(X.columns, model.coef_):
        print(f"  {feature}: ${coef:,.0f} per unit change")
    # Visualize
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    # Actual vs Predicted on the held-out test set
    axes[0, 0].scatter(y_test, y_pred_test, alpha=0.6)
    axes[0, 0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    axes[0, 0].set_xlabel('Actual Price')
    axes[0, 0].set_ylabel('Predicted Price')
    axes[0, 0].set_title('Test Set: Actual vs Predicted')
    axes[0, 0].grid(True, alpha=0.3)
    # Residuals vs predictions (should be patternless)
    residuals = y_test - y_pred_test
    axes[0, 1].scatter(y_pred_test, residuals, alpha=0.6)
    axes[0, 1].axhline(y=0, color='r', linestyle='--')
    axes[0, 1].set_xlabel('Predicted Price')
    axes[0, 1].set_ylabel('Residuals')
    axes[0, 1].set_title('Residual Plot')
    axes[0, 1].grid(True, alpha=0.3)
    # Feature importance (signed coefficients)
    axes[1, 0].barh(X.columns, model.coef_)
    axes[1, 0].set_xlabel('Coefficient Value')
    axes[1, 0].set_title('Feature Importance')
    axes[1, 0].grid(True, alpha=0.3, axis='x')
    # Price distribution
    axes[1, 1].hist(y, bins=30, edgecolor='black', alpha=0.7)
    axes[1, 1].set_xlabel('Price')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].set_title('House Price Distribution')
    axes[1, 1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    return model, scaler
model, scaler = house_price_prediction()

6. Assumptions of Linear Regression

Checking Assumptions

Linear regression makes several key assumptions. Here's how to check them:

def check_linear_regression_assumptions(X_train, y_train, model):
    """Check assumptions of linear regression.

    Produces four diagnostic plots (linearity, normality, homoscedasticity,
    independence) plus Shapiro-Wilk and Durbin-Watson statistics computed
    on the residuals of ``model`` evaluated on the given training data.
    """
    y_pred = model.predict(X_train)
    residuals = y_train - y_pred
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    # 1. Linearity: residuals vs predicted should have no pattern
    axes[0, 0].scatter(y_pred, residuals, alpha=0.6)
    axes[0, 0].axhline(y=0, color='r', linestyle='--')
    axes[0, 0].set_xlabel('Predicted Values')
    axes[0, 0].set_ylabel('Residuals')
    axes[0, 0].set_title('1. Linearity Check: No pattern in residuals')
    axes[0, 0].grid(True, alpha=0.3)
    # 2. Normality of residuals: Q-Q plot against the normal distribution
    from scipy import stats
    stats.probplot(residuals, dist="norm", plot=axes[0, 1])
    axes[0, 1].set_title('2. Normality Check: Q-Q Plot')
    axes[0, 1].grid(True, alpha=0.3)
    # 3. Homoscedasticity: residual spread should not grow with predictions
    axes[1, 0].scatter(y_pred, np.abs(residuals), alpha=0.6)
    axes[1, 0].set_xlabel('Predicted Values')
    axes[1, 0].set_ylabel('Absolute Residuals')
    axes[1, 0].set_title('3. Homoscedasticity: Constant spread of residuals')
    axes[1, 0].grid(True, alpha=0.3)
    # 4. Independence: residuals vs observation order (for time series data)
    axes[1, 1].scatter(range(len(residuals)), residuals, alpha=0.6)
    axes[1, 1].axhline(y=0, color='r', linestyle='--')
    axes[1, 1].set_xlabel('Observation Order')
    axes[1, 1].set_ylabel('Residuals')
    axes[1, 1].set_title('4. Independence Check: No pattern in residuals')
    axes[1, 1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    # Statistical tests
    print("Statistical Tests for Assumptions")
    print("=" * 50)
    # Shapiro-Wilk is unreliable/slow on huge samples; cap at 5000 points
    shapiro_stat, shapiro_p = stats.shapiro(residuals[:5000] if len(residuals) > 5000 else residuals)
    print(f"Shapiro-Wilk Normality Test: p-value = {shapiro_p:.4f}")
    if shapiro_p > 0.05:
        print("  → Residuals appear normally distributed")
    else:
        print("  → Residuals may not be normally distributed")
    # Durbin-Watson ≈ 2 indicates no first-order autocorrelation
    from statsmodels.stats.stattools import durbin_watson
    dw = durbin_watson(residuals)
    print(f"Durbin-Watson Statistic: {dw:.4f}")
    if 1.5 < dw < 2.5:
        print("  → No significant autocorrelation")
    else:
        print("  → Potential autocorrelation detected")


# BUG FIX: the original driver called this with X_train_scaled / y_train,
# which are locals of house_price_prediction() and do not exist at module
# scope (NameError at runtime). Build a small standalone training set and
# reuse the already-imported LinearRegression instead.
_rng = np.random.RandomState(0)
_X_check = _rng.randn(200, 3)
_y_check = _X_check @ np.array([2.0, -1.0, 0.5]) + _rng.randn(200) * 0.3
_check_model = LinearRegression().fit(_X_check, _y_check)
check_linear_regression_assumptions(_X_check, _y_check, _check_model)

7. Beyond Simple Linearity

Polynomial Regression

When data isn't linear, we can add polynomial terms:

def polynomial_regression_demo():
    """Demonstrate polynomial regression for non-linear data"""
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import Pipeline
    # Generate noisy cubic data — a straight line cannot fit this well
    np.random.seed(42)
    x = np.linspace(-3, 3, 100)
    y = 0.5 * x**3 - 2 * x**2 + x + 3 + np.random.normal(0, 1.5, 100)
    # Try different polynomial degrees (degree 1 = ordinary linear fit)
    degrees = [1, 2, 3, 4]
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    for idx, degree in enumerate(degrees):
        ax = axes[idx // 2, idx % 2]
        # Pipeline: expand x into polynomial features, then fit an ordinary
        # linear model on the expanded features
        poly_model = Pipeline([
            ('poly', PolynomialFeatures(degree=degree)),
            ('linear', LinearRegression())
        ])
        # Fit model (sklearn expects a 2-D feature matrix)
        X_reshaped = x.reshape(-1, 1)
        poly_model.fit(X_reshaped, y)
        # Predict on a denser grid so the fitted curve looks smooth
        x_smooth = np.linspace(-3, 3, 200)
        X_smooth = x_smooth.reshape(-1, 1)
        y_smooth = poly_model.predict(X_smooth)
        # Plot data and fitted curve
        ax.scatter(x, y, alpha=0.6, label='Data')
        ax.plot(x_smooth, y_smooth, 'r-', linewidth=2, label=f'Degree {degree}')
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        ax.set_title(f'Polynomial Regression (Degree {degree})')
        ax.legend()
        ax.grid(True, alpha=0.3)
        # Annotate the panel with the in-sample R² for this degree
        y_pred = poly_model.predict(X_reshaped)
        r2 = r2_score(y, y_pred)
        ax.text(0.05, 0.95, f'R² = {r2:.3f}', transform=ax.transAxes,
                fontsize=10, verticalalignment='top')
    plt.tight_layout()
    plt.show()
polynomial_regression_demo()

Regularization: Ridge and Lasso

from sklearn.linear_model import Ridge, Lasso
def regularization_demo():
    """Compare Ridge and Lasso regression for feature selection.

    Builds a dataset where only the first 5 of 20 features carry signal,
    then shows how L2 (Ridge) shrinks coefficients while L1 (Lasso) can
    zero-out the irrelevant ones entirely.
    """
    # Create data with many features, only a few relevant
    np.random.seed(42)
    n_samples = 200
    n_features = 20
    X = np.random.randn(n_samples, n_features)
    # Only first 5 features actually matter; the rest have zero coefficients
    true_coefs = np.zeros(n_features)
    true_coefs[:5] = [2, 1.5, -1, 0.8, 0.3]
    y = X @ true_coefs + np.random.randn(n_samples) * 0.5
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Models to compare (larger alpha = stronger regularization)
    models = {
        'Linear': LinearRegression(),
        'Ridge (α=1)': Ridge(alpha=1.0),
        'Lasso (α=0.1)': Lasso(alpha=0.1),
        'Lasso (α=1)': Lasso(alpha=1.0)
    }
    results = {}
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    for (name, model), ax in zip(models.items(), axes.flat):
        # Train
        model.fit(X_train, y_train)
        # Evaluate on the held-out split
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        # Store results for the printed summary below
        results[name] = {'r2': r2, 'coefs': model.coef_}
        # Plot the fitted coefficient for every feature index
        ax.bar(range(n_features), model.coef_, alpha=0.7)
        ax.axhline(y=0, color='black', linewidth=0.5)
        ax.set_xlabel('Feature Index')
        ax.set_ylabel('Coefficient Value')
        ax.set_title(f'{name}\nR² = {r2:.3f}')
        ax.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()
    # Print summary
    print("Model Comparison")
    print("=" * 50)
    for name, result in results.items():
        print(f"\n{name}:")
        print(f"  R² Score: {result['r2']:.4f}")
        print(f"  Non-zero coefficients: {np.sum(np.abs(result['coefs']) > 0.01)}")
        # BUG FIX: coefs[:5] are the FIRST five coefficients (the truly
        # relevant features), not the largest — label them accurately.
        print(f"  First 5 coefficients: {result['coefs'][:5]}")
regularization_demo()

8. Practical Applications

Time Series Forecasting

def time_series_linear_demo():
    """Use linear regression for trend forecasting"""
    # Generate one year of synthetic daily data: trend + seasonality + noise
    np.random.seed(42)
    days = np.arange(1, 366)  # One year of daily data
    trend = 0.05 * days  # Linear trend
    seasonal = 10 * np.sin(2 * np.pi * days / 30)  # Monthly seasonality (30-day period)
    noise = np.random.normal(0, 2, 365)
    values = 50 + trend + seasonal + noise
    # Create features: sine/cosine terms capture periodic behavior while the
    # model stays linear in its parameters
    X = np.column_stack([
        days,  # Time trend
        np.sin(2 * np.pi * days / 30),  # Monthly cycle
        np.cos(2 * np.pi * days / 30),
        np.sin(2 * np.pi * days / 365),  # Yearly cycle
        np.cos(2 * np.pi * days / 365)
    ])
    # Split chronologically — never shuffle time-series data
    train_size = 300
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = values[:train_size], values[train_size:]
    # Fit model
    model = LinearRegression()
    model.fit(X_train, y_train)
    # Predict on both the training window and the future (test) window
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    # Visualize
    plt.figure(figsize=(12, 6))
    plt.plot(days, values, 'b-', alpha=0.7, label='Actual')
    plt.plot(days[:train_size], y_pred_train, 'g-', label='Training Predictions')
    plt.plot(days[train_size:], y_pred_test, 'r-', label='Test Predictions')
    plt.axvline(x=train_size, color='k', linestyle='--', label='Train/Test Split')
    plt.xlabel('Days')
    plt.ylabel('Value')
    plt.title('Time Series Forecasting with Linear Regression')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    print("Forecast Results")
    print("=" * 50)
    print(f"Training R²: {r2_score(y_train, y_pred_train):.4f}")
    print(f"Test R²: {r2_score(y_test, y_pred_test):.4f}")
    print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_test)):.2f}")
time_series_linear_demo()

Marketing Spend Optimization

def marketing_optimization():
    """Optimize marketing spend using linear regression.

    Fits a linear model of sales on three channel spends and reports the
    channel with the highest per-dollar coefficient. Returns the model.
    """
    # Generate weekly marketing data
    np.random.seed(42)
    n_weeks = 100
    # Marketing spends per channel
    tv_spend = np.random.uniform(0, 50, n_weeks)
    social_spend = np.random.uniform(0, 30, n_weeks)
    email_spend = np.random.uniform(0, 20, n_weeks)
    # Sales with diminishing returns (not purely linear)
    sales = (2.5 * tv_spend +
             3.2 * social_spend +
             1.8 * email_spend -
             0.02 * tv_spend**2 +  # Diminishing returns
             100 +
             np.random.normal(0, 20, n_weeks))
    # Ensure positive sales
    sales = np.maximum(sales, 0)
    # Create DataFrame
    df = pd.DataFrame({
        'tv_spend': tv_spend,
        'social_spend': social_spend,
        'email_spend': email_spend,
        'sales': sales
    })
    # Fit linear model. NOTE(review): the generated data includes a quadratic
    # tv_spend term, so a purely linear model misattributes part of that effect.
    X = df[['tv_spend', 'social_spend', 'email_spend']]
    y = df['sales']
    model = LinearRegression()
    model.fit(X, y)
    # Optimal allocation (simplified)
    # For linear model, spend all on highest coefficient
    coefficients = model.coef_
    best_channel = X.columns[np.argmax(coefficients)]
    best_coef = np.max(coefficients)
    print("Marketing Spend Analysis")
    print("=" * 50)
    print("\nChannel Effectiveness:")
    for channel, coef in zip(X.columns, coefficients):
        print(f"  {channel}: ${coef:.2f} sales per $1 spent")
    print(f"\nOptimal Strategy:")
    print(f"  Focus on: {best_channel} (highest ROI: ${best_coef:.2f} per $1)")
    # Visualize the spend-vs-sales relationship for each channel
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for idx, channel in enumerate(X.columns):
        axes[idx].scatter(df[channel], df['sales'], alpha=0.6)
        axes[idx].set_xlabel(f'{channel} Spend ($)')
        axes[idx].set_ylabel('Sales ($)')
        axes[idx].set_title(f'{channel} vs Sales')
        axes[idx].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    # Simulate optimization: projected sales if the entire budget goes to the
    # single best channel (a linear model ignores diminishing returns)
    budget = 100
    optimal_sales = budget * best_coef + model.intercept_
    print(f"\nWith ${budget} budget:")
    print(f"  Optimized sales: ${optimal_sales:.0f}")
    return model
marketing_optimization()

9. Linear Functions in Neural Networks

Neuron as Linear Function + Activation

def neuron_demo():
    """Demonstrate a single neuron as linear function + activation.

    Here the activation is the identity (none is applied), so the neuron
    reduces to simple linear regression trained by batch gradient descent.
    """
    # Generate noisy data around a known line
    np.random.seed(42)
    x = np.linspace(-5, 5, 100)
    y = 2 * x + 1 + np.random.normal(0, 0.5, 100)  # True linear relationship

    class SimpleNeuron:
        # Single-input neuron: one weight and one bias, trained on all data
        # at once (full-batch gradient descent).
        def __init__(self):
            # Random initialization, as in larger networks
            self.weight = np.random.randn()
            self.bias = np.random.randn()

        def forward(self, x):
            """Linear function: w * x + b"""
            return self.weight * x + self.bias

        def loss(self, y_pred, y_true):
            """Mean squared error"""
            return np.mean((y_pred - y_true) ** 2)

        def train(self, x, y, learning_rate=0.01, epochs=100):
            """Train using gradient descent"""
            losses = []
            for epoch in range(epochs):
                # Forward pass
                y_pred = self.forward(x)
                # Calculate loss
                loss = self.loss(y_pred, y)
                losses.append(loss)
                # Analytic gradients of the MSE w.r.t. weight and bias
                grad_weight = 2 * np.mean((y_pred - y) * x)
                grad_bias = 2 * np.mean(y_pred - y)
                # Update parameters by stepping against the gradient
                self.weight -= learning_rate * grad_weight
                self.bias -= learning_rate * grad_bias
            return losses

    # Train neuron
    neuron = SimpleNeuron()
    print("Initial parameters:")
    print(f"  Weight: {neuron.weight:.4f}")
    print(f"  Bias: {neuron.bias:.4f}")
    losses = neuron.train(x, y, learning_rate=0.01, epochs=500)
    print("\nTrained parameters:")
    print(f"  Weight: {neuron.weight:.4f} (true: 2.0)")
    print(f"  Bias: {neuron.bias:.4f} (true: 1.0)")
    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    # Learning curve: loss should fall steadily toward the noise floor
    axes[0].plot(losses)
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Loss')
    axes[0].set_title('Training Loss Over Time')
    axes[0].grid(True, alpha=0.3)
    # Fitted line vs the true generating line
    axes[1].scatter(x, y, alpha=0.6, label='Data')
    axes[1].plot(x, neuron.forward(x), 'r-', linewidth=2,
                 label=f'Learned: y = {neuron.weight:.2f}x + {neuron.bias:.2f}')
    axes[1].plot(x, 2*x + 1, 'g--', linewidth=2, label='True: y = 2x + 1')
    axes[1].set_xlabel('x')
    axes[1].set_ylabel('y')
    axes[1].set_title('Learned vs True Relationship')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
neuron_demo()

10. Summary and Best Practices

Linear Models Cheat Sheet

def linear_models_cheatsheet():
    """Print a quick-reference table of common linear models.

    Each entry lists the model's formula, its leading use cases, a pro,
    a con, and a rule of thumb for when to reach for it.
    """
    entries = [
        ("Simple Linear Regression", "y = β₀ + β₁x",
         ["Predicting continuous values", "Finding linear relationships", "Trend analysis"],
         "Simple, interpretable, fast",
         "Only captures linear relationships",
         "When relationship appears linear"),
        ("Multiple Linear Regression", "y = β₀ + β₁x₁ + β₂x₂ + ... + βₙxₙ",
         ["Multi-feature prediction", "Feature importance analysis", "Controlling for confounding variables"],
         "Handles multiple features, interpretable coefficients",
         "Multicollinearity issues",
         "Multiple predictors, independent effects"),
        ("Polynomial Regression", "y = β₀ + β₁x + β₂x² + ... + βₙxⁿ",
         ["Curved relationships", "Non-linear trends", "Flexible fitting"],
         "Captures non-linearity",
         "Risk of overfitting",
         "When relationship is clearly non-linear"),
        ("Ridge Regression", "L2 regularization added to loss",
         ["Multicollinearity", "Preventing overfitting", "Feature selection (soft)"],
         "Handles correlated features, stable",
         "All features remain",
         "Many features, potential overfitting"),
        ("Lasso Regression", "L1 regularization added to loss",
         ["Feature selection", "Sparse models", "High-dimensional data"],
         "Automatic feature selection",
         "Can be unstable with correlated features",
         "Want sparse model, feature selection"),
    ]
    print("Linear Models Cheat Sheet")
    print("=" * 70)
    for name, formula, use_cases, pros, cons, when in entries:
        print(f"\n📊 {name}")
        print(f"   Formula: {formula}")
        # Only the first two use cases fit on the one-line summary
        print(f"   Use Cases: {', '.join(use_cases[:2])}")
        print(f"   ✓ {pros}")
        print(f"   ✗ {cons}")
        print(f"   💡 {when}")
linear_models_cheatsheet()

Best Practices for Linear Models

def linear_model_best_practices():
    """Print a categorized checklist of best practices for linear models."""
    checklist = (
        ("Data Preparation", (
            "Handle missing values appropriately",
            "Scale/normalize features for regularization",
            "Remove or treat outliers",
            "Encode categorical variables properly",
            "Check for multicollinearity",
        )),
        ("Model Validation", (
            "Split data into train/test/validation sets",
            "Use cross-validation for parameter tuning",
            "Check residuals for patterns",
            "Validate assumptions (linearity, normality, etc.)",
        )),
        ("Feature Engineering", (
            "Create interaction terms if needed",
            "Consider polynomial features for non-linearity",
            "Transform skewed features",
            "Remove or combine highly correlated features",
        )),
        ("Model Selection", (
            "Start with simple model, add complexity if needed",
            "Use regularization to prevent overfitting",
            "Compare multiple models using cross-validation",
            "Consider interpretability vs performance trade-off",
        )),
        ("Interpretation", (
            "Report coefficients with confidence intervals",
            "Explain feature importance in business terms",
            "Be careful with causal claims (correlation ≠ causation)",
            "Validate predictions with domain experts",
        )),
    )
    print("Best Practices for Linear Models")
    print("=" * 60)
    for category, tips in checklist:
        print(f"\n📌 {category}")
        for tip in tips:
            print(f"   • {tip}")
linear_model_best_practices()

Conclusion

Linear functions are the foundation of many data science techniques. Understanding them deeply enables:

  1. Simple, interpretable models that explain relationships
  2. Building blocks for more complex algorithms
  3. Baseline models to benchmark against
  4. Understanding of core machine learning concepts

Key Takeaways

  • Linear functions are the simplest form of relationship between variables
  • Slope tells us the rate of change
  • Intercept is the starting value
  • Linear regression finds the best-fit line through data
  • Multiple regression handles multiple input variables
  • Assumptions must be checked for valid inference
  • Regularization helps prevent overfitting
  • Neural networks use linear functions as building blocks

The Journey Continues

From simple straight lines to complex neural networks, the principles of linearity remain fundamental. Master these concepts, and you'll have a solid foundation for understanding more advanced topics in data science and machine learning.

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper