Complete Guide to Data Science Functions

Introduction to Data Science Functions

Data science functions are the building blocks of data analysis, providing reusable tools for common tasks like data cleaning, transformation, modeling, and visualization. Mastering these functions—both built-in and custom—is essential for efficient and reproducible data science work.

Key Concepts

  • Vectorization: Functions that operate on entire arrays/DataFrames at once
  • Pure Functions: Deterministic functions with no side effects
  • Function Composition: Combining simple functions to build complex pipelines
  • Reusability: Creating functions that work across different datasets
  • Documentation: Clear docstrings and type hints for maintainability
  • Testing: Unit tests for critical data transformations

1. Python Functions for Data Science

Basic Function Structure

import pandas as pd
import numpy as np
from typing import List, Dict, Union, Optional, Tuple
import warnings
def calculate_summary_statistics(data: pd.DataFrame,
                                 numeric_columns: Optional[List[str]] = None) -> pd.DataFrame:
    """Compute summary statistics for the numeric columns of *data*.

    Parameters
    ----------
    data : pd.DataFrame
        Input DataFrame.
    numeric_columns : List[str], optional
        Columns to analyze. If None, every numeric column is used.

    Returns
    -------
    pd.DataFrame
        One row per statistic (mean, median, std, min, max, count,
        skew, kurtosis, missing), one column per analyzed column.

    Raises
    ------
    ValueError
        If no numeric columns are available.
    """
    cols = numeric_columns
    if cols is None:
        cols = data.select_dtypes(include=[np.number]).columns.tolist()
    if not cols:
        raise ValueError("No numeric columns found in the DataFrame")

    subset = data[cols]
    stats = subset.agg(['mean', 'median', 'std', 'min', 'max', 'count'])

    # Append the higher-order moments and the missing-value counts as
    # extra rows below the built-in aggregations.
    extra_rows = {
        'skew': subset.skew(),
        'kurtosis': subset.kurtosis(),
        'missing': subset.isnull().sum(),
    }
    for label, values in extra_rows.items():
        stats.loc[label] = values
    return stats


# Example usage
df = pd.DataFrame({
    'age': [25, 30, 35, 28, 32],
    'salary': [50000, 60000, 70000, 55000, 65000],
    'department': ['IT', 'Sales', 'IT', 'HR', 'Sales'],
})
stats = calculate_summary_statistics(df)
print("Summary Statistics:")
print(stats)

Function for Data Cleaning

def clean_dataframe(df: pd.DataFrame,
                    handle_missing: str = 'drop',
                    fill_value: Optional[Union[int, float, str]] = None,
                    remove_duplicates: bool = True,
                    standardize_text: bool = True) -> pd.DataFrame:
    """Clean a DataFrame: de-duplicate, handle missing values, normalize text.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame (never modified; a cleaned copy is returned).
    handle_missing : str, default='drop'
        One of 'drop', 'fill', or 'interpolate'.
    fill_value : optional
        Explicit fill value. When None and handle_missing='fill', numeric
        columns are filled with their median and others with their mode.
    remove_duplicates : bool, default=True
        Drop duplicate rows before anything else.
    standardize_text : bool, default=True
        Strip/lowercase object columns; 'nan'/'none'/'' strings become NaN.

    Returns
    -------
    pd.DataFrame
        Cleaned copy of the input.
    """
    result = df.copy()

    if remove_duplicates:
        before = len(result)
        result = result.drop_duplicates()
        print(f"Removed {before - len(result)} duplicate rows")

    if handle_missing == 'drop':
        before = len(result)
        result = result.dropna()
        print(f"Removed {before - len(result)} rows with missing values")
    elif handle_missing == 'fill':
        if fill_value is not None:
            result = result.fillna(fill_value)
        else:
            # Median for numeric columns, mode (or 'Unknown') otherwise.
            for col in result.columns:
                series = result[col]
                if series.dtype in ['int64', 'float64']:
                    result[col] = series.fillna(series.median())
                else:
                    modes = series.mode()
                    fallback = modes[0] if not modes.empty else 'Unknown'
                    result[col] = series.fillna(fallback)
    elif handle_missing == 'interpolate':
        result = result.interpolate()

    if standardize_text:
        for col in result.select_dtypes(include=['object']).columns:
            normalized = result[col].astype(str).str.strip().str.lower()
            # Stringified missing markers are turned back into real NaN.
            result[col] = normalized.replace(['nan', 'none', ''], np.nan)
    return result


# Example
df_messy = pd.DataFrame({
    'name': ['Alice', 'BOB', 'Charlie', 'david', 'EMMA', 'Alice'],
    'age': [25, -5, 35, 28, 32, 25],
    'salary': [50000, 60000, None, 55000, 65000, 50000],
    'dept': ['IT', 'SALES', 'it', 'hr', 'Sales', 'IT'],
})
cleaned = clean_dataframe(df_messy, handle_missing='fill')
print("\nCleaned DataFrame:")
print(cleaned)

2. Statistical Functions

Descriptive Statistics

def descriptive_stats(df: pd.DataFrame,
                      group_by: Optional[str] = None,
                      percentiles: Optional[List[float]] = None) -> pd.DataFrame:
    """Calculate comprehensive descriptive statistics.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    group_by : str, optional
        Column name to group by. When given, the result has MultiIndex
        columns of (group value, column name).
    percentiles : List[float], optional
        Percentiles to calculate; defaults to [0.25, 0.5, 0.75].
        (The original version accepted this argument but ignored it,
        and used a mutable default list.)

    Returns
    -------
    pd.DataFrame
        Statistics as rows, analyzed columns as columns.
    """
    if percentiles is None:
        percentiles = [0.25, 0.5, 0.75]

    def _stats_series(s: pd.Series) -> pd.Series:
        """One column's statistics as a Series (rows of the result)."""
        stats = {
            'count': s.count(),
            'missing': s.isnull().sum(),
            'mean': s.mean(),
            'median': s.median(),
            'std': s.std(),
            'min': s.min(),
        }
        for p in percentiles:
            stats[f'q{int(p * 100)}'] = s.quantile(p)
        stats['max'] = s.max()
        stats['skew'] = s.skew()
        stats['kurtosis'] = s.kurtosis()
        return pd.Series(stats)

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if group_by:
        # Build one statistics frame per group; the original fed a
        # dict-returning function to groupby().agg(), which yields
        # dict-valued cells (and fails on recent pandas).
        frames = {
            name: pd.DataFrame({col: _stats_series(grp[col]) for col in numeric_cols})
            for name, grp in df.groupby(group_by)
        }
        result = pd.concat(frames, axis=1)
    else:
        result = pd.DataFrame({col: _stats_series(df[col]) for col in numeric_cols})
    return result


# Example
df_sample = pd.DataFrame({
    'value': np.random.normal(100, 15, 1000),
    'group': np.random.choice(['A', 'B', 'C'], 1000),
})
stats = descriptive_stats(df_sample, group_by='group')
print("\nDescriptive Statistics by Group:")
print(stats)

Correlation Analysis

def correlation_analysis(df: pd.DataFrame,
                         method: str = 'pearson',
                         threshold: float = 0.7,
                         plot: bool = True) -> Dict:
    """Correlate the numeric columns and flag strongly correlated pairs.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    method : str
        Correlation method: 'pearson', 'spearman', or 'kendall'.
    threshold : float
        Absolute correlation above which a pair is reported.
    plot : bool
        Whether to draw a heatmap of the correlation matrix.

    Returns
    -------
    Dict
        {'correlation_matrix': ..., 'high_correlations': [...]} where each
        entry has keys 'var1', 'var2', 'correlation', sorted by |corr| desc.
    """
    numeric_df = df.select_dtypes(include=[np.number])
    corr_matrix = numeric_df.corr(method=method)
    cols = corr_matrix.columns

    # Scan the upper triangle only — each unordered pair once.
    pairs = []
    for i, left in enumerate(cols):
        for j in range(i + 1, len(cols)):
            value = corr_matrix.iloc[i, j]
            if abs(value) > threshold:
                pairs.append({'var1': left, 'var2': cols[j], 'correlation': value})
    pairs.sort(key=lambda item: abs(item['correlation']), reverse=True)

    if plot:
        import matplotlib.pyplot as plt
        import seaborn as sns
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                    square=True, linewidths=0.5)
        plt.title(f'Correlation Matrix ({method})')
        plt.tight_layout()
        plt.show()

    return {
        'correlation_matrix': corr_matrix,
        'high_correlations': pairs,
    }
# Example: synthetic features where x4 and x5 are linear mixes of x1/x2,
# so they correlate strongly with their source columns.
df_corr = pd.DataFrame({
    'x1': np.random.randn(100),
    'x2': np.random.randn(100),
    'x3': np.random.randn(100),
})
df_corr['x4'] = df_corr['x1'] * 0.8 + df_corr['x2'] * 0.2 + np.random.randn(100) * 0.1
df_corr['x5'] = df_corr['x2'] * 0.9 + np.random.randn(100) * 0.2
results = correlation_analysis(df_corr, threshold=0.6)
print("\nHigh Correlations:")
for pair in results['high_correlations']:
    print(f"  {pair['var1']} vs {pair['var2']}: {pair['correlation']:.3f}")

3. Data Transformation Functions

Feature Scaling

def scale_features(df: pd.DataFrame,
                   method: str = 'standard',
                   columns: Optional[List[str]] = None) -> pd.DataFrame:
    """Scale numerical features using various methods.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame (a scaled copy is returned).
    method : str
        'standard' (z-score), 'minmax', 'robust' (median/IQR), or 'maxabs'.
        An unrecognized method leaves the data unchanged.
    columns : List[str], optional
        Columns to scale; defaults to all numeric columns.

    Returns
    -------
    pd.DataFrame
        Copy of *df* with the selected columns scaled.
    """
    # Each scaler maps a Series to its scaled version.
    def _standard(s: pd.Series) -> pd.Series:
        return (s - s.mean()) / s.std()

    def _minmax(s: pd.Series) -> pd.Series:
        lo, hi = s.min(), s.max()
        return (s - lo) / (hi - lo)

    def _robust(s: pd.Series) -> pd.Series:
        iqr = s.quantile(0.75) - s.quantile(0.25)
        return (s - s.median()) / iqr

    def _maxabs(s: pd.Series) -> pd.Series:
        return s / s.abs().max()

    scalers = {
        'standard': _standard,
        'minmax': _minmax,
        'robust': _robust,
        'maxabs': _maxabs,
    }

    df_scaled = df.copy()
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()
    scaler = scalers.get(method)
    if scaler is not None:
        for col in columns:
            df_scaled[col] = scaler(df[col])
    return df_scaled


# Example
df_scale = pd.DataFrame({
    'age': [25, 30, 35, 28, 32],
    'income': [50000, 60000, 70000, 55000, 65000],
    'children': [0, 2, 1, 0, 3],
})
print("Original Data:")
print(df_scale)
print("\nStandard Scaled:")
print(scale_features(df_scale, method='standard'))
print("\nMin-Max Scaled:")
print(scale_features(df_scale, method='minmax'))

Encoding Categorical Variables

def encode_categorical(df: pd.DataFrame,
                       method: str = 'onehot',
                       columns: Optional[List[str]] = None,
                       drop_first: bool = False) -> pd.DataFrame:
    """Encode categorical variables using various methods.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame (an encoded copy is returned).
    method : str
        Encoding method: 'onehot', 'label', or 'frequency'.
        (The original docstring also advertised a 'target' method and a
        ``target`` parameter that were never implemented.)
    columns : List[str], optional
        Columns to encode. If None, all object/category columns are encoded.
    drop_first : bool
        For one-hot encoding, whether to drop the first category.

    Returns
    -------
    pd.DataFrame
        Encoded DataFrame.

    Raises
    ------
    ValueError
        If *method* is not one of the supported encodings (the original
        silently returned the data unchanged).
    """
    df_encoded = df.copy()
    if columns is None:
        columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    if method == 'onehot':
        for col in columns:
            dummies = pd.get_dummies(df[col], prefix=col, drop_first=drop_first)
            df_encoded = pd.concat([df_encoded.drop(columns=[col]), dummies], axis=1)
    elif method == 'label':
        for col in columns:
            # factorize(sort=True) assigns integer codes by sorted category,
            # matching sklearn's LabelEncoder without the extra dependency.
            codes, _ = pd.factorize(df[col].astype(str), sort=True)
            df_encoded[col] = codes
    elif method == 'frequency':
        for col in columns:
            freq_map = df[col].value_counts(normalize=True).to_dict()
            df_encoded[col] = df[col].map(freq_map)
    else:
        raise ValueError(f"Unknown encoding method: {method}")
    return df_encoded


# Example
df_cat = pd.DataFrame({
    'color': ['red', 'blue', 'red', 'green', 'blue'],
    'size': ['S', 'M', 'L', 'M', 'S'],
    'value': [10, 20, 30, 40, 50],
})
print("Original Data:")
print(df_cat)
print("\nOne-Hot Encoding:")
print(encode_categorical(df_cat, method='onehot'))
print("\nLabel Encoding:")
print(encode_categorical(df_cat, method='label'))
print("\nFrequency Encoding:")
print(encode_categorical(df_cat, method='frequency'))

4. Feature Engineering Functions

Creating Polynomial Features

def create_polynomial_features(df: pd.DataFrame,
                               columns: List[str],
                               degree: int = 2,
                               include_interaction: bool = True) -> pd.DataFrame:
    """Create polynomial and pairwise interaction features.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame (an augmented copy is returned).
    columns : List[str]
        Columns to use for creating polynomial features.
    degree : int
        Maximum polynomial degree; powers 2..degree are added per column.
    include_interaction : bool
        Whether to include pairwise products of distinct columns.

    Returns
    -------
    pd.DataFrame
        Copy of *df* with "col^d" power columns and "a×b" interaction columns.
    """
    # Only pairwise interactions are generated, so only `combinations` is
    # needed (the original also imported combinations_with_replacement unused).
    from itertools import combinations

    df_poly = df.copy()
    for col in columns:
        for d in range(2, degree + 1):
            df_poly[f"{col}^{d}"] = df[col] ** d

    if include_interaction and len(columns) > 1:
        for left, right in combinations(columns, 2):
            df_poly[f"{left}×{right}"] = df[left] * df[right]
    return df_poly


# Example
df_feat = pd.DataFrame({
    'x': [1, 2, 3, 4, 5],
    'y': [2, 4, 6, 8, 10],
    'z': [1, 1, 2, 2, 3],
})
poly_features = create_polynomial_features(df_feat, ['x', 'y', 'z'], degree=3)
print("Polynomial Features:")
print(poly_features)

Date Feature Extraction

def extract_date_features(df: pd.DataFrame,
                          date_column: str,
                          features: Optional[List[str]] = None) -> pd.DataFrame:
    """Extract various features from a datetime column.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame (an augmented copy is returned).
    date_column : str
        Name of the datetime column (coerced with pd.to_datetime).
    features : List[str], optional
        Features to extract: 'year', 'month', 'day', 'dayofweek', 'quarter',
        'is_weekend', 'is_month_start', 'is_month_end', 'dayofyear', 'week'.
        Defaults to the first six. Unknown names are silently skipped,
        matching the original behavior.
        (The original annotated this parameter as List[str] while
        defaulting to None; it is Optional.)

    Returns
    -------
    pd.DataFrame
        Copy of *df* with '<date_column>_<feature>' columns added.
    """
    df_date = df.copy()
    df_date[date_column] = pd.to_datetime(df_date[date_column])

    if features is None:
        features = ['year', 'month', 'day', 'dayofweek', 'quarter', 'is_weekend']

    dt = df_date[date_column].dt
    # Dispatch table replaces the original long if/elif ladder.
    extractors = {
        'year': lambda: dt.year,
        'month': lambda: dt.month,
        'day': lambda: dt.day,
        'dayofweek': lambda: dt.dayofweek,
        'quarter': lambda: dt.quarter,
        'is_weekend': lambda: (dt.dayofweek >= 5).astype(int),
        'is_month_start': lambda: dt.is_month_start.astype(int),
        'is_month_end': lambda: dt.is_month_end.astype(int),
        'dayofyear': lambda: dt.dayofyear,
        'week': lambda: dt.isocalendar().week,
    }
    for feature in features:
        extractor = extractors.get(feature)
        if extractor is not None:
            df_date[f'{date_column}_{feature}'] = extractor()
    return df_date


# Example
df_dates = pd.DataFrame({
    'date': pd.date_range('2023-01-01', periods=10, freq='D'),
    'value': np.random.randn(10),
})
date_features = extract_date_features(df_dates, 'date')
print("Date Features:")
print(date_features)

5. Model Evaluation Functions

Classification Metrics

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
def evaluate_classification(y_true: np.ndarray,
                            y_pred: np.ndarray,
                            y_proba: Optional[np.ndarray] = None) -> Dict:
    """Comprehensive classification model evaluation.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    y_proba : array-like, optional
        Predicted probabilities for ROC-AUC. Accepts a 1-D vector, an
        (n, 1) column, or an (n, 2) predict_proba-style array.

    Returns
    -------
    Dict
        accuracy, weighted precision/recall/f1, and roc_auc when
        probabilities are supplied.
    """
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }
    if y_proba is not None:
        proba = np.asarray(y_proba)
        if proba.ndim > 1:
            # Use the positive-class column for (n, 2) arrays; flatten a
            # single-column (n, 1) array — the original proba[:, 1] indexing
            # raised IndexError on that shape.
            proba = proba[:, 1] if proba.shape[1] > 1 else proba.ravel()
        metrics['roc_auc'] = roc_auc_score(y_true, proba)
    return metrics
def confusion_matrix_analysis(y_true: np.ndarray, y_pred: np.ndarray, labels: List[str]) -> None:
    """Print the confusion matrix and per-class precision/recall/F1.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    labels : List[str]
        Display names for the classes, in label order.
    """
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print("=" * 50)
    # Header row, then one row of counts per true class.
    print(f"{'':>12}" + "".join(f"{label:>10}" for label in labels))
    for i, label in enumerate(labels):
        print(f"{label:>12}" + "".join(f"{cm[i, j]:>10}" for j in range(len(labels))))

    print("\nPer-Class Metrics:")
    print("=" * 50)
    for i, label in enumerate(labels):
        # One-vs-rest counts derived from row/column sums.
        tp = cm[i, i]
        fp = cm[:, i].sum() - tp
        fn = cm[i, :].sum() - tp
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        print(f"{label:>12}: precision={precision:.3f}, recall={recall:.3f}, f1={f1:.3f}")
# Example
y_true = [0, 1, 0, 1, 0, 1, 0, 1]
y_pred = [0, 1, 0, 0, 0, 1, 1, 1]
y_proba = [0.9, 0.8, 0.7, 0.4, 0.6, 0.3, 0.5, 0.9]
# Pass the 1-D probability vector directly: the original reshaped it to a
# single-column (8, 1) array, which made evaluate_classification's
# y_proba[:, 1] lookup fail with an IndexError.
metrics = evaluate_classification(y_true, y_pred, np.array(y_proba))
print("Classification Metrics:")
for metric, value in metrics.items():
    print(f"  {metric}: {value:.4f}")
print()
confusion_matrix_analysis(y_true, y_pred, ['Class 0', 'Class 1'])

Regression Metrics

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error
def evaluate_regression(y_true: np.ndarray,
                        y_pred: np.ndarray,
                        y_proba: Optional[np.ndarray] = None) -> Dict:
    """Comprehensive regression model evaluation.

    Parameters
    ----------
    y_true : array-like
        True values.
    y_pred : array-like
        Predicted values.
    y_proba : array-like, optional
        Unused; kept so the signature mirrors evaluate_classification.

    Returns
    -------
    Dict
        RMSE, MAE, R2, and MAPE (expressed as a percentage).
    """
    return {
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
        'MAPE': mean_absolute_percentage_error(y_true, y_pred) * 100,
    }


# Example
y_true = [10, 20, 30, 40, 50]
y_pred = [12, 18, 32, 38, 52]
metrics = evaluate_regression(y_true, y_pred)
print("Regression Metrics:")
for name, score in metrics.items():
    if name == 'MAPE':
        print(f"  {name}: {score:.2f}%")
    else:
        print(f"  {name}: {score:.4f}")

6. Data Visualization Functions

Custom Plotting Functions

import matplotlib.pyplot as plt
import seaborn as sns
def plot_distribution(data: pd.Series,
                      bins: int = 30,
                      title: str = 'Distribution',
                      kde: bool = True) -> None:
    """Show a histogram and box plot of a single variable, side by side.

    Parameters
    ----------
    data : pd.Series
        Input data.
    bins : int
        Number of histogram bins.
    title : str
        Base title; each subplot appends its own suffix.
    kde : bool
        Whether to overlay a KDE curve on the histogram.
    """
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: histogram (optionally with KDE overlay).
    sns.histplot(data, bins=bins, kde=kde, ax=hist_ax)
    hist_ax.set_title(f'{title} - Histogram')
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

    # Right panel: box plot of the same values.
    sns.boxplot(y=data, ax=box_ax)
    box_ax.set_title(f'{title} - Box Plot')
    box_ax.set_ylabel('Value')

    plt.tight_layout()
    plt.show()
def plot_correlation_heatmap(df: pd.DataFrame,
                             figsize: Tuple[int, int] = (10, 8),
                             annot: bool = True) -> None:
    """Draw a lower-triangle correlation heatmap of the numeric columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame; non-numeric columns are ignored.
    figsize : tuple
        Figure size in inches.
    annot : bool
        Whether to print the correlation values in the cells.
    """
    corr = df.select_dtypes(include=[np.number]).corr()
    plt.figure(figsize=figsize)
    # Mask the upper triangle — it mirrors the lower one.
    upper_mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr, mask=upper_mask, annot=annot, cmap='coolwarm',
                center=0, square=True, linewidths=0.5)
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    plt.show()
def plot_feature_importance(feature_names: List[str],
                            importances: List[float],
                            title: str = 'Feature Importance',
                            top_n: Optional[int] = None) -> None:
    """Draw a horizontal bar chart of feature importances.

    Parameters
    ----------
    feature_names : List[str]
        Names of the features.
    importances : List[float]
        Importance value for each feature, in the same order.
    title : str
        Plot title.
    top_n : int, optional
        If given, show only the top N most important features.
    """
    # Sort ascending so the most important feature ends up on top of the chart.
    ranked = pd.DataFrame({'feature': feature_names, 'importance': importances})
    ranked = ranked.sort_values('importance', ascending=True)
    if top_n:
        ranked = ranked.tail(top_n)

    plt.figure(figsize=(10, 6))
    plt.barh(ranked['feature'], ranked['importance'])
    plt.xlabel('Importance')
    plt.title(title)
    plt.tight_layout()
    plt.show()

7. Data Validation Functions

def validate_dataframe(df: pd.DataFrame,
                       required_columns: Optional[List[str]] = None,
                       numeric_columns: Optional[List[str]] = None,
                       categorical_columns: Optional[List[str]] = None,
                       date_columns: Optional[List[str]] = None) -> Dict:
    """Validate a DataFrame against structural requirements.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    required_columns : List[str], optional
        Columns that must exist (missing ones are errors).
    numeric_columns : List[str], optional
        Columns that must be numeric (violations are errors).
    categorical_columns : List[str], optional
        Columns expected to be non-numeric (violations are warnings).
    date_columns : List[str], optional
        Columns expected to parse as datetimes (violations are warnings).

    Returns
    -------
    Dict
        {'valid': bool, 'errors': [...], 'warnings': [...]}; missing-value
        counts per column are always reported as warnings.
    """
    results = {
        'valid': True,
        'errors': [],
        'warnings': [],
    }

    # Check required columns
    if required_columns:
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            results['valid'] = False
            results['errors'].append(f"Missing required columns: {missing}")

    # Check numeric columns
    if numeric_columns:
        for col in numeric_columns:
            if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
                results['valid'] = False
                results['errors'].append(f"Column '{col}' should be numeric")

    # Check categorical columns
    if categorical_columns:
        for col in categorical_columns:
            if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
                results['warnings'].append(f"Column '{col}' is numeric but should be categorical")

    # Check date columns
    if date_columns:
        for col in date_columns:
            if col in df.columns:
                try:
                    pd.to_datetime(df[col])
                except Exception:
                    # Narrowed from a bare `except:` — that also swallowed
                    # KeyboardInterrupt/SystemExit.
                    results['warnings'].append(f"Column '{col}' may not be valid datetime")

    # Report missing values per column
    missing_counts = df.isnull().sum()
    for col, count in missing_counts.items():
        if count > 0:
            pct = count / len(df) * 100
            results['warnings'].append(f"Column '{col}' has {count} missing values ({pct:.1f}%)")
    return results


# Example
df_test = pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 'thirty', 35],
    'date': ['2023-01-01', 'invalid', '2023-03-01'],
})
validation = validate_dataframe(
    df_test,
    required_columns=['id', 'name', 'age'],
    numeric_columns=['age'],
    date_columns=['date']
)
print("Validation Results:")
print(f"Valid: {validation['valid']}")
print("\nErrors:")
for error in validation['errors']:
    print(f"  ✗ {error}")
print("\nWarnings:")
for warning in validation['warnings']:
    print(f"  ⚠ {warning}")

8. Machine Learning Pipeline Functions

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer that keeps a fixed subset of columns.

    Parameters
    ----------
    columns : list
        Column names to select when transforming.
    """

    def __init__(self, columns):
        # Stored as-is so sklearn's get_params/set_params work.
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless: nothing to learn, return self for chaining."""
        return self

    def transform(self, X):
        """Return only the configured columns of *X*."""
        return X[self.columns]
def create_preprocessing_pipeline(numeric_features: List[str],
                                  categorical_features: List[str]) -> Pipeline:
    """Build a preprocessing transformer for mixed-type feature tables.

    Parameters
    ----------
    numeric_features : List[str]
        Names of numeric feature columns.
    categorical_features : List[str]
        Names of categorical feature columns.

    Returns
    -------
    ColumnTransformer
        Median-imputes and standardizes numerics; constant-imputes and
        one-hot encodes categoricals (unknown categories ignored).
    """
    numeric_steps = Pipeline([
        ('selector', DataFrameSelector(numeric_features)),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    categorical_steps = Pipeline([
        ('selector', DataFrameSelector(categorical_features)),
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    # Route each column group through its own sub-pipeline.
    return ColumnTransformer([
        ('numeric', numeric_steps, numeric_features),
        ('categorical', categorical_steps, categorical_features),
    ])
def train_model(X_train, y_train, X_test, y_test, model):
    """Fit *model* on the training split and report test-set metrics.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features.
    y_train : pd.Series
        Training target.
    X_test : pd.DataFrame
        Test features.
    y_test : pd.Series
        Test target.
    model : sklearn estimator
        Classifier to train.

    Returns
    -------
    tuple
        (trained model, metrics dict from evaluate_classification).
    """
    print("Training model...")
    model.fit(X_train, y_train)

    print("Making predictions...")
    y_pred = model.predict(X_test)
    # Probabilities feed ROC-AUC when the estimator supports them.
    y_proba = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None

    metrics = evaluate_classification(y_test, y_pred, y_proba)
    print("\nModel Performance:")
    for name, score in metrics.items():
        print(f"  {name}: {score:.4f}")
    return model, metrics


# Example usage
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# preprocessor = create_preprocessing_pipeline(numeric_cols, categorical_cols)
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', RandomForestClassifier())
# ])
# pipeline.fit(X_train, y_train)

9. Utility Functions

Logging Functions

import logging
import sys
from datetime import datetime
import functools


def setup_logger(name: str, level: str = 'INFO', log_file: Optional[str] = None) -> logging.Logger:
    """Set up a logger with console and optional file output.

    Parameters
    ----------
    name : str
        Logger name.
    level : str
        Logging level: 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
    log_file : str, optional
        Path to a log file; when given, messages also go there.

    Returns
    -------
    logging.Logger
        Configured logger. Reconfiguring the same name replaces its
        handlers (the original appended a new handler on every call,
        duplicating each log line).
    """
    logger = logging.getLogger(name)
    logger.setLevel(getattr(logging, level.upper()))
    # Drop any handlers from previous calls so lines are not emitted twice.
    logger.handlers.clear()

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    if log_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger


# Decorator for timing functions
def timer(func):
    """Decorator to measure and print function execution time."""
    @functools.wraps(func)  # preserve the wrapped function's name/docstring
    def wrapper(*args, **kwargs):
        start = datetime.now()
        result = func(*args, **kwargs)
        elapsed = (datetime.now() - start).total_seconds()
        print(f"{func.__name__} took {elapsed:.4f} seconds")
        return result
    return wrapper


# Decorator for data validation
def validate_input(func):
    """Decorator ensuring the first argument is a non-empty DataFrame."""
    @functools.wraps(func)
    def wrapper(df, *args, **kwargs):
        if not isinstance(df, pd.DataFrame):
            raise TypeError(f"Expected DataFrame, got {type(df).__name__}")
        if df.empty:
            raise ValueError("DataFrame is empty")
        return func(df, *args, **kwargs)
    return wrapper


# Example usage
@timer
@validate_input
def process_data(df):
    """Process data with timing and validation."""
    logger = setup_logger('data_processor', 'INFO')
    logger.info(f"Processing data with shape {df.shape}")
    return df.mean()


# Test
df_test = pd.DataFrame({'value': np.random.randn(1000)})
result = process_data(df_test)

10. Advanced Functions

Automatic Feature Selection

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
def auto_feature_selection(X: pd.DataFrame,
                           y: pd.Series,
                           method: str = 'mutual_info',
                           k: Optional[int] = None) -> pd.DataFrame:
    """Automatically select the best features using various methods.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix.
    y : pd.Series
        Target variable.
    method : str
        Selection method: 'mutual_info', 'f_stat', or 'random_forest'.
    k : int, optional
        Number of features to keep. Defaults to the top 50% (at least 1).

    Returns
    -------
    pd.DataFrame
        Subset of X restricted to the selected columns.

    Raises
    ------
    ValueError
        If *method* is unknown (the original fell through to an
        undefined-variable NameError).
    """
    if k is None:
        k = max(1, X.shape[1] // 2)

    if method in ('mutual_info', 'f_stat'):
        score_func = mutual_info_classif if method == 'mutual_info' else f_classif
        selector = SelectKBest(score_func, k=k)
        selector.fit(X, y)
        selected_cols = X.columns[selector.get_support(indices=True)]
    elif method == 'random_forest':
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X, y)
        # Keep the k features with the highest impurity-based importance
        # (the original also carried a dead `selector = ...` assignment here).
        indices = np.argsort(rf.feature_importances_)[-k:]
        selected_cols = X.columns[indices]
    else:
        raise ValueError(f"Unknown selection method: {method}")
    return X[selected_cols]


# Example
X = pd.DataFrame({
    'f1': np.random.randn(100),
    'f2': np.random.randn(100),
    'f3': np.random.randn(100),
    'f4': np.random.randn(100),
})
y = (X['f1'] + X['f2'] + np.random.randn(100) * 0.5 > 0).astype(int)
selected = auto_feature_selection(X, y, method='random_forest')
print("Selected Features:", selected.columns.tolist())

Cross-Validation Wrapper

from sklearn.model_selection import cross_val_score, StratifiedKFold
def cross_validate_model(model, X: pd.DataFrame, y: pd.Series,
                         cv: int = 5, scoring: str = 'accuracy') -> Dict:
    """Run k-fold cross-validation and print a summary of the scores.

    Parameters
    ----------
    model : sklearn estimator
        Model to evaluate.
    X : pd.DataFrame
        Feature matrix.
    y : pd.Series
        Target variable.
    cv : int
        Number of folds.
    scoring : str
        Scoring metric name understood by sklearn.

    Returns
    -------
    Dict
        mean_score, std_score, the per-fold scores, and the fold count.
    """
    # Use stratified folds when the target looks categorical (few classes).
    if len(np.unique(y)) <= 10:
        splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    else:
        splitter = cv

    scores = cross_val_score(model, X, y, cv=splitter, scoring=scoring)
    results = {
        'mean_score': scores.mean(),
        'std_score': scores.std(),
        'scores': scores,
        'cv_folds': cv,
    }

    per_fold = ', '.join(f'{s:.4f}' for s in scores)
    print(f"Cross-Validation Results ({cv} folds):")
    print(f"  Mean {scoring}: {results['mean_score']:.4f}")
    print(f"  Std Dev: {results['std_score']:.4f}")
    print(f"  Individual scores: {per_fold}")
    return results

11. Best Practices

Function Design Guidelines

# 1. Use type hints
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with lowercase, underscore-separated column names.

    Works on a copy so the caller's DataFrame is untouched — the original
    mutated its input in place, the very pitfall this article warns about.
    """
    df = df.copy()
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    return df
# 2. Provide default values for flexibility
def plot_histogram(data, bins=30, color='blue', alpha=0.7):
    """Plot a histogram of *data*; bin count, color, and opacity are tunable."""
    plt.hist(data, bins=bins, color=color, alpha=alpha)
    plt.show()
# 3. Handle errors gracefully
def safe_read_csv(filepath, **kwargs):
    """Read a CSV file, returning None (after printing a message) on failure."""
    try:
        return pd.read_csv(filepath, **kwargs)
    except FileNotFoundError:
        print(f"File not found: {filepath}")
    except Exception as e:
        print(f"Error reading file: {e}")
    return None
# 4. Document functions with docstrings
def calculate_outliers(data, method='iqr', threshold=1.5):
    """Detect outliers in data using the specified method.

    Parameters
    ----------
    data : array-like
        Input data.
    method : str, default='iqr'
        Method to use: 'iqr' or 'zscore'.
    threshold : float, default=1.5
        IQR multiplier (for 'iqr') or Z-score cutoff (for 'zscore').

    Returns
    -------
    array-like
        Boolean mask that is True where a value is an outlier.

    Raises
    ------
    ValueError
        If *method* is not recognized.
    """
    if method == 'iqr':
        q1, q3 = np.percentile(data, [25, 75])
        margin = threshold * (q3 - q1)
        return (data < q1 - margin) | (data > q3 + margin)
    if method == 'zscore':
        scores = np.abs((data - np.mean(data)) / np.std(data))
        return scores > threshold
    raise ValueError(f"Unknown method: {method}")
# 5. Use assertions for validation
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features/target into train and test sets after sanity checks."""
    # Fail fast on nonsense inputs before touching sklearn.
    assert 0 < test_size < 1, "test_size must be between 0 and 1"
    assert X.shape[0] == y.shape[0], "X and y must have same number of rows"
    from sklearn.model_selection import train_test_split
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

12. Common Pitfalls and Solutions

Pitfall 1: Modifying Input Data

# Bad: writes the new column straight into the caller's DataFrame
def scale_data(df):
    df['scaled'] = (df['value'] - df['value'].mean()) / df['value'].std()
    return df


# Good: works on a copy, leaving the caller's DataFrame untouched
def scale_data(df):
    out = df.copy()
    out['scaled'] = (df['value'] - df['value'].mean()) / df['value'].std()
    return out

Pitfall 2: Ignoring Data Types

# Bad: assumes the column holds numbers
def process_numeric(df, column):
    return df[column] * 2  # fine for numbers, silently misbehaves for strings


# Good: checks the dtype up front and fails loudly
def process_numeric(df, column):
    if not pd.api.types.is_numeric_dtype(df[column]):
        raise TypeError(f"Column '{column}' must be numeric")
    return df[column] * 2

Pitfall 3: Hardcoding Values

# Bad: the filtering rules are frozen into the function body
def process_data(df):
    df = df.dropna()
    df = df[df['value'] > 0]
    return df


# Good: the same rules, exposed as parameters with sensible defaults
def process_data(df, dropna=True, min_value=None):
    if dropna:
        df = df.dropna()
    if min_value is not None:
        df = df[df['value'] > min_value]
    return df

Conclusion

Functions are the building blocks of data science workflows, enabling reusable, testable, and maintainable code:

Key Takeaways

  1. Modularity: Break complex analysis into smaller, focused functions
  2. Reusability: Write functions that work across different datasets
  3. Documentation: Always include docstrings with parameters and returns
  4. Validation: Validate inputs early and handle errors gracefully
  5. Type Hints: Use type hints for better code understanding
  6. Testing: Write unit tests for critical functions
  7. Performance: Use vectorized operations when possible

Function Checklist

  • [ ] Clear, descriptive function name
  • [ ] Complete docstring with parameters and returns
  • [ ] Type hints for inputs and outputs
  • [ ] Input validation and error handling
  • [ ] Default values for optional parameters
  • [ ] Returns a copy rather than modifying input
  • [ ] Unit tests for edge cases
  • [ ] Documentation of assumptions and limitations
  • [ ] Performance considerations for large datasets
  • [ ] Consistent naming and style

Functions are the foundation of effective data science work. Mastering function design will make your analyses more reproducible, maintainable, and shareable!

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper