Introduction to Percentiles
Percentiles are fundamental statistical measures that divide a dataset into 100 equal parts, providing insights into the distribution and relative standing of data points. They are essential tools in data science for understanding data distribution, detecting outliers, and making data-driven decisions.
Key Concepts
- Definition: A percentile indicates the value below which a given percentage of observations fall
- Interpretation: The kth percentile is the value below which k% of the data lies
- Common Percentiles: 25th (Q1), 50th (median), 75th (Q3), 90th, 95th, 99th
- Applications: Outlier detection, performance benchmarking, data normalization
- Relationship: Percentiles are closely related to quartiles, quintiles, and deciles
1. Understanding Percentiles
Basic Concept
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Generate sample data
np.random.seed(42)  # fixed seed so the printed numbers are reproducible
data = np.random.normal(100, 15, 1000) # 1000 data points, mean=100, std=15
# Calculate basic percentiles
percentiles = [0, 10, 25, 50, 75, 90, 95, 99, 100]
values = np.percentile(data, percentiles)
print("Percentile Analysis:")
print("=" * 50)
for p, v in zip(percentiles, values):
    print(f"{p:3d}th percentile: {v:.2f}")
# Cross-check against the classic summary statistics
print(f"Min: {data.min():.2f}")
print(f"Max: {data.max():.2f}")
print(f"Mean: {data.mean():.2f}")
print(f"Median: {np.median(data):.2f}")
Visualizing Percentiles
# Create visualization: histogram and box plot side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Histogram with percentiles
# values[2], values[4], values[6] index into the `percentiles` list above
# (25th, 50th and 75th percentile respectively).
axes[0].hist(data, bins=50, edgecolor='black', alpha=0.7)
axes[0].axvline(values[2], color='red', linestyle='--', label='25th percentile')
axes[0].axvline(values[4], color='green', linestyle='--', label='50th percentile (median)')
axes[0].axvline(values[6], color='blue', linestyle='--', label='75th percentile')
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Histogram with Percentiles')
axes[0].legend()
# Box plot with percentiles overlaid for comparison with the quartile whiskers
axes[1].boxplot(data, vert=False)
axes[1].set_xlabel('Value')
axes[1].set_title('Box Plot (Quartiles)')
axes[1].axvline(values[2], color='red', linestyle='--', alpha=0.5)
axes[1].axvline(values[4], color='green', linestyle='--', alpha=0.5)
axes[1].axvline(values[6], color='blue', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
2. Calculation Methods
Different Methods for Percentile Calculation
def calculate_percentiles_manual(data, percentile):
    """
    Calculate a percentile using several textbook methods.

    Parameters
    ----------
    data : array-like
        Input data
    percentile : float
        Percentile to calculate (0-100)

    Returns
    -------
    dict
        Percentile values using different methods
        ('nearest_rank', 'linear_interpolation', 'excel_method', 'numpy')
    """
    sorted_data = np.sort(data)
    n = len(sorted_data)
    # Method 1: Nearest rank.  Clamp the rank to [1, n] so percentile=0
    # returns the minimum instead of wrapping around to sorted_data[-1].
    rank_nearest = min(max(int(np.ceil(percentile / 100 * n)), 1), n)
    value_nearest = sorted_data[rank_nearest - 1]
    # Method 2: Linear interpolation (like numpy default)
    position = (percentile / 100) * (n - 1)
    index_low = int(np.floor(position))
    index_high = int(np.ceil(position))
    if index_low == index_high:
        value_linear = sorted_data[index_low]
    else:
        weight_high = position - index_low
        weight_low = 1 - weight_high
        value_linear = (sorted_data[index_low] * weight_low +
                        sorted_data[index_high] * weight_high)
    # Method 3: Weighted average (Excel method, (n + 1)-based rank).
    # Clamp to [1, n] so percentile=0 no longer wraps to the maximum and
    # percentile=100 no longer indexes out of range.
    rank_excel = min(max((percentile / 100) * (n + 1), 1), n)
    index_low_excel = int(np.floor(rank_excel))
    index_high_excel = int(np.ceil(rank_excel))
    if index_low_excel == index_high_excel:
        value_excel = sorted_data[index_low_excel - 1]
    else:
        weight_high_excel = rank_excel - index_low_excel
        weight_low_excel = 1 - weight_high_excel
        value_excel = (sorted_data[index_low_excel - 1] * weight_low_excel +
                       sorted_data[index_high_excel - 1] * weight_high_excel)
    return {
        'nearest_rank': value_nearest,
        'linear_interpolation': value_linear,
        'excel_method': value_excel,
        'numpy': np.percentile(data, percentile)
    }
# Test with sample data: for the median the three methods agree (8.5)
# except nearest-rank, which picks an actual data point (8).
test_data = [3, 5, 7, 8, 9, 11, 13, 15]
print("Data:", test_data)
print("\nPercentile 50 (Median) Calculations:")
results = calculate_percentiles_manual(test_data, 50)
for method, value in results.items():
    print(f" {method}: {value:.2f}")
3. Quartiles and IQR
Understanding Quartiles
def analyze_quartiles(data):
    """
    Summarize a dataset's quartiles and flag IQR-based outliers.

    Parameters
    ----------
    data : array-like
        Input data

    Returns
    -------
    dict
        Quartile values, Tukey fences, and the flagged outliers.
    """
    first_q, median, third_q = (np.percentile(data, p) for p in (25, 50, 75))
    spread = third_q - first_q
    # Tukey's rule: anything beyond 1.5 * IQR from the quartiles is an outlier.
    low_fence = first_q - 1.5 * spread
    high_fence = third_q + 1.5 * spread
    flagged = [v for v in data if not (low_fence <= v <= high_fence)]
    return {
        'Q1 (25th)': first_q,
        'Q2 (50th/Median)': median,
        'Q3 (75th)': third_q,
        'IQR': spread,
        'Lower Bound': low_fence,
        'Upper Bound': high_fence,
        'Outliers': flagged,
        'Outlier Count': len(flagged),
        'Outlier Percentage': len(flagged) / len(data) * 100
    }
# Generate data with outliers
np.random.seed(42)
data_with_outliers = np.random.normal(100, 15, 1000)
data_with_outliers = np.append(data_with_outliers, [200, 220, 10, 5]) # Add outliers
quartile_stats = analyze_quartiles(data_with_outliers)
print("Quartile Analysis:")
print("=" * 50)
for key, value in quartile_stats.items():
    # The 'Outliers' entry is a list; everything else is a scalar.
    if isinstance(value, list):
        print(f"{key}: {value[:5]}... (showing first 5)")
    else:
        print(f"{key}: {value:.2f}")
Visualizing Quartiles
def plot_quartiles(data):
    """
    Create comprehensive quartile visualization.

    Draws four panels: box plot, histogram with quartile lines,
    empirical cumulative distribution, and a violin plot.
    Displays the figure via plt.show(); returns nothing.
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    # 1. Box plot
    axes[0, 0].boxplot(data)
    axes[0, 0].set_title('Box Plot (Quartiles Visualization)')
    axes[0, 0].set_ylabel('Value')
    axes[0, 0].grid(True, alpha=0.3)
    # 2. Histogram with quartiles
    axes[0, 1].hist(data, bins=50, edgecolor='black', alpha=0.7)
    q1 = np.percentile(data, 25)
    q2 = np.percentile(data, 50)
    q3 = np.percentile(data, 75)
    axes[0, 1].axvline(q1, color='red', linestyle='--', label='Q1')
    axes[0, 1].axvline(q2, color='green', linestyle='--', label='Q2 (Median)')
    axes[0, 1].axvline(q3, color='blue', linestyle='--', label='Q3')
    axes[0, 1].legend()
    axes[0, 1].set_title('Histogram with Quartiles')
    axes[0, 1].set_xlabel('Value')
    # 3. Cumulative distribution (empirical CDF scaled to percent)
    sorted_data = np.sort(data)
    cumulative = np.arange(1, len(sorted_data) + 1) / len(sorted_data) * 100
    axes[1, 0].plot(sorted_data, cumulative)
    axes[1, 0].axhline(25, color='red', linestyle='--', alpha=0.5)
    axes[1, 0].axhline(50, color='green', linestyle='--', alpha=0.5)
    axes[1, 0].axhline(75, color='blue', linestyle='--', alpha=0.5)
    axes[1, 0].set_xlabel('Value')
    axes[1, 0].set_ylabel('Cumulative Percentage (%)')
    axes[1, 0].set_title('Cumulative Distribution')
    axes[1, 0].grid(True, alpha=0.3)
    # 4. Violin plot
    sns.violinplot(data=data, ax=axes[1, 1])
    axes[1, 1].set_title('Violin Plot (Distribution Shape)')
    axes[1, 1].set_ylabel('Value')
    plt.tight_layout()
    plt.show()
# Plot quartiles for the sample that includes the injected outliers
plot_quartiles(data_with_outliers)
4. Percentiles in Data Analysis
Descriptive Statistics with Percentiles
def comprehensive_percentile_report(data, percentiles=(1, 5, 10, 25, 50, 75, 90, 95, 99)):
    """
    Generate a comprehensive percentile report plus basic statistics.

    Parameters
    ----------
    data : array-like
        Input data
    percentiles : sequence of float, optional
        Percentiles to calculate.  The default is a tuple rather than a
        list to avoid the shared mutable-default pitfall.

    Returns
    -------
    tuple of (pd.DataFrame, dict)
        The percentile table and a dict of basic summary statistics.
        (The original docstring claimed a bare DataFrame, but the
        function has always returned this pair.)
    """
    # Calculate basic statistics
    basic_stats = {
        'Count': len(data),
        'Mean': np.mean(data),
        'Std Dev': np.std(data),
        'Min': np.min(data),
        'Max': np.max(data),
        'Range': np.max(data) - np.min(data),
        'Skewness': pd.Series(data).skew(),
        'Kurtosis': pd.Series(data).kurtosis()
    }
    # One row per requested percentile
    percentile_rows = [
        {'Percentile': f'{p}th', 'Value': np.percentile(data, p)}
        for p in percentiles
    ]
    percentile_df = pd.DataFrame(percentile_rows)
    # Fraction of observations falling below each percentile value
    percentile_df['Below'] = [f"{p}%" for p in percentiles]
    return percentile_df, basic_stats
# Generate report on a right-skewed (exponential) sample
data_skewed = np.random.exponential(scale=50, size=1000) # Skewed distribution
percentile_df, basic_stats = comprehensive_percentile_report(data_skewed)
print("Basic Statistics:")
print("=" * 40)
for key, value in basic_stats.items():
    print(f"{key}: {value:.2f}")
print("\nPercentile Report:")
print("=" * 40)
print(percentile_df.to_string(index=False))
Percentile-based Segmentation
def create_percentile_segments(data, segments=(0, 25, 50, 75, 100)):
    """
    Create percentile-based segments for data.

    Parameters
    ----------
    data : array-like
        Input data
    segments : sequence of float, optional
        Percentile breakpoints.  The default is a tuple rather than a
        list to avoid the shared mutable-default pitfall.

    Returns
    -------
    tuple of (pd.DataFrame, dict)
        DataFrame with per-value segment labels, and a dict mapping each
        segment name to its (low, high) value bounds.
    """
    df = pd.DataFrame({'value': data})
    # Compute each segment's value bounds ONCE.  The original version
    # re-evaluated np.percentile inside the per-value loop, costing
    # O(n * k) percentile computations for identical results.
    bounds = []
    percentiles = {}
    for p_low, p_high in zip(segments[:-1], segments[1:]):
        low_val = np.percentile(data, p_low)
        high_val = np.percentile(data, p_high)
        bounds.append((p_low, p_high, low_val, high_val))
        percentiles[f"{p_low}-{p_high}"] = (low_val, high_val)
    # Label each value with the first segment whose bounds contain it
    # (boundary values fall into the earlier segment, as before).
    labels = []
    for val in data:
        for p_low, p_high, low_val, high_val in bounds:
            if low_val <= val <= high_val:
                labels.append(f"{p_low}-{p_high}th percentile")
                break
    df['segment'] = labels
    return df, percentiles
# Create segments with the default quartile breakpoints
segment_df, percentile_ranges = create_percentile_segments(data_skewed)
print("Segment Distribution:")
print(segment_df['segment'].value_counts())
print("\nPercentile Ranges:")
for segment, (low, high) in percentile_ranges.items():
    print(f" {segment}: {low:.2f} - {high:.2f}")
5. Percentiles for Outlier Detection
IQR Method for Outlier Detection
def detect_outliers_iqr(data, multiplier=1.5):
    """
    Detect outliers using the IQR (Tukey fence) method.

    Parameters
    ----------
    data : array-like
        Input data
    multiplier : float
        IQR multiplier for outlier detection (default 1.5)

    Returns
    -------
    dict
        Quartiles, fences, and the flagged outliers with their indices.
    """
    first_q, third_q = np.percentile(data, [25, 75])
    spread = third_q - first_q
    low_cut = first_q - multiplier * spread
    high_cut = third_q + multiplier * spread
    # Single pass collects indices; values are recovered from them.
    flagged_idx = [i for i, v in enumerate(data) if v < low_cut or v > high_cut]
    flagged = [data[i] for i in flagged_idx]
    return {
        'Q1': first_q,
        'Q3': third_q,
        'IQR': spread,
        'lower_bound': low_cut,
        'upper_bound': high_cut,
        'outliers': flagged,
        'outlier_indices': flagged_idx,
        'outlier_count': len(flagged),
        'outlier_percentage': len(flagged) / len(data) * 100
    }
# Test with different multipliers: larger multipliers widen the fences,
# so fewer points are flagged as outliers.
for mult in [1.5, 2.0, 3.0]:
    results = detect_outliers_iqr(data_with_outliers, multiplier=mult)
    print(f"\nIQR Multiplier = {mult}")
    print(f" Outliers detected: {results['outlier_count']} ({results['outlier_percentage']:.1f}%)")
    print(f" Lower bound: {results['lower_bound']:.2f}")
    print(f" Upper bound: {results['upper_bound']:.2f}")
Percentile-based Outlier Detection
def detect_outliers_percentile(data, lower_percentile=1, upper_percentile=99):
    """
    Detect outliers using fixed percentile thresholds.

    Parameters
    ----------
    data : array-like
        Input data
    lower_percentile : float
        Lower percentile threshold
    upper_percentile : float
        Upper percentile threshold

    Returns
    -------
    dict
        Thresholds plus low-tail / high-tail outliers and their shares.
    """
    low_cut, high_cut = np.percentile(data, [lower_percentile, upper_percentile])
    below = [v for v in data if v < low_cut]
    above = [v for v in data if v > high_cut]
    n = len(data)
    total_flagged = len(below) + len(above)
    return {
        'lower_threshold': low_cut,
        'upper_threshold': high_cut,
        'outliers_low': below,
        'outliers_high': above,
        'outlier_count': total_flagged,
        'outlier_percentage': total_flagged / n * 100,
        'low_outlier_percentage': len(below) / n * 100,
        'high_outlier_percentage': len(above) / n * 100
    }
# Test different percentile thresholds: tighter thresholds flag more points
thresholds = [(1, 99), (5, 95), (10, 90)]
for low, high in thresholds:
    results = detect_outliers_percentile(data_with_outliers, low, high)
    print(f"\nPercentile thresholds: {low}th - {high}th")
    print(f" Lower threshold: {results['lower_threshold']:.2f}")
    print(f" Upper threshold: {results['upper_threshold']:.2f}")
    print(f" Outliers detected: {results['outlier_count']} ({results['outlier_percentage']:.1f}%)")
6. Percentiles for Performance Analysis
Percentile-based Performance Metrics
def performance_analysis(response_times):
    """
    Analyze system performance using percentiles and SLA compliance.

    Parameters
    ----------
    response_times : array-like
        Response time data in milliseconds

    Returns
    -------
    dict
        Key latency percentiles, per-tier SLA compliance, and moments.
    """
    # Tail percentiles are the standard latency SLO metrics.
    pct_levels = [50, 75, 90, 95, 99, 99.5, 99.9, 100]
    pct_values = np.percentile(response_times, pct_levels)
    # Per-tier SLA thresholds in milliseconds.
    sla_thresholds = {
        'Standard': 1000, # 1 second
        'Premium': 500, # 0.5 seconds
        'Critical': 200 # 0.2 seconds
    }
    n = len(response_times)
    sla_compliance = {
        tier: sum(t <= limit for t in response_times) / n * 100
        for tier, limit in sla_thresholds.items()
    }
    return {
        'percentiles': dict(zip(pct_levels, pct_values)),
        'sla_compliance': sla_compliance,
        'mean': np.mean(response_times),
        'median': np.median(response_times),
        'std': np.std(response_times)
    }
# Generate sample response time data (exponential latencies, clipped to
# a realistic 10 ms - 5 s range)
np.random.seed(42)
response_times = np.random.exponential(scale=200, size=1000)
response_times = np.clip(response_times, 10, 5000)
performance = performance_analysis(response_times)
print("Performance Analysis:")
print("=" * 50)
print("\nResponse Time Percentiles (ms):")
for p, value in performance['percentiles'].items():
    print(f" {p}th percentile: {value:.0f} ms")
print("\nSLA Compliance:")
for tier, compliance in performance['sla_compliance'].items():
    print(f" {tier}: {compliance:.1f}%")
print(f"\nStatistics:")
print(f" Mean: {performance['mean']:.0f} ms")
print(f" Median: {performance['median']:.0f} ms")
print(f" Std Dev: {performance['std']:.0f} ms")
Percentile Visualization
def plot_performance_percentiles(response_times):
    """
    Create performance percentile visualization.

    Draws four panels: latency histogram with percentile markers,
    empirical CDF, box plot, and the full percentile curve.
    Displays the figure via plt.show(); returns nothing.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    # 1. Histogram with percentiles (color encodes severity of the tail)
    axes[0, 0].hist(response_times, bins=50, edgecolor='black', alpha=0.7)
    percentiles = [50, 90, 95, 99]
    colors = ['green', 'orange', 'red', 'darkred']
    for p, color in zip(percentiles, colors):
        value = np.percentile(response_times, p)
        axes[0, 0].axvline(value, color=color, linestyle='--', label=f'{p}th: {value:.0f}ms')
    axes[0, 0].set_xlabel('Response Time (ms)')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_title('Response Time Distribution')
    axes[0, 0].legend()
    # 2. Cumulative distribution with crosshairs at each percentile
    sorted_times = np.sort(response_times)
    cumulative = np.arange(1, len(sorted_times) + 1) / len(sorted_times) * 100
    axes[0, 1].plot(sorted_times, cumulative)
    for p, color in zip(percentiles, colors):
        value = np.percentile(response_times, p)
        axes[0, 1].axhline(p, color=color, linestyle='--', alpha=0.5)
        axes[0, 1].axvline(value, color=color, linestyle='--', alpha=0.5)
    axes[0, 1].set_xlabel('Response Time (ms)')
    axes[0, 1].set_ylabel('Cumulative Percentage (%)')
    axes[0, 1].set_title('Cumulative Distribution')
    axes[0, 1].grid(True, alpha=0.3)
    # 3. Box plot
    axes[1, 0].boxplot(response_times, vert=False)
    axes[1, 0].set_xlabel('Response Time (ms)')
    axes[1, 0].set_title('Box Plot with Outliers')
    axes[1, 0].grid(True, alpha=0.3)
    # 4. Percentile trend: latency as a function of percentile rank
    p_range = np.arange(0, 101, 1)
    p_values = np.percentile(response_times, p_range)
    axes[1, 1].plot(p_range, p_values)
    axes[1, 1].set_xlabel('Percentile')
    axes[1, 1].set_ylabel('Response Time (ms)')
    axes[1, 1].set_title('Percentile Curve')
    axes[1, 1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
# Render the latency dashboard for the simulated response times
plot_performance_percentiles(response_times)
7. Percentiles for Score Normalization
Percentile Rank Calculation
def calculate_percentile_rank(data, value):
    """
    Return the percentile rank of *value* relative to *data*.

    Parameters
    ----------
    data : array-like
        Reference data
    value : float
        Value to rank

    Returns
    -------
    float
        Percentile rank (ties contribute half their count, per the
        standard mid-rank formula).
    """
    arr = np.sort(data)
    total = len(arr)
    below = int(np.count_nonzero(arr < value))
    ties = int(np.count_nonzero(arr == value))
    # Standard percentile-rank formula with half-weight for ties.
    return (below + 0.5 * ties) / total * 100
# Example: Student test scores (normal scores clipped to the 0-100 scale)
test_scores = np.random.normal(75, 10, 1000)
test_scores = np.clip(test_scores, 0, 100)
# Calculate percentile rank for a specific score
score = 85
percentile_rank = calculate_percentile_rank(test_scores, score)
print(f"Test Score: {score}")
print(f"Percentile Rank: {percentile_rank:.1f}th percentile")
print(f"Interpretation: Scored better than {percentile_rank:.1f}% of students")
# Calculate percentile ranks for all scores (O(n log n) per score here;
# fine for a demo of this size)
percentile_ranks = [calculate_percentile_rank(test_scores, s) for s in test_scores]
Score Normalization to Percentiles
def normalize_to_percentiles(data):
    """
    Normalize data to percentile scores (0-100).

    Each value's score is rank / (n - 1) * 100, where rank is its
    position in the sorted order (ties keep their argsort order, as in
    the original implementation).

    Parameters
    ----------
    data : array-like
        Input data

    Returns
    -------
    np.ndarray
        Percentile scores.  A single-element input returns [0.0]
        (the original raised ZeroDivisionError); an empty input
        returns an empty array.
    """
    arr = np.asarray(data)
    n = len(arr)
    scores = np.zeros(n, dtype=float)
    if n > 1:
        # Vectorized equivalent of the original per-element loop:
        # scatter evenly spaced ranks back to the original positions.
        scores[np.argsort(arr)] = np.arange(n) / (n - 1) * 100
    return scores
# Normalize test scores to a uniform 0-100 percentile scale
percentile_scores = normalize_to_percentiles(test_scores)
# Compare original vs normalized for a small sample
comparison_df = pd.DataFrame({
    'Original Score': test_scores[:20],
    'Percentile Score': percentile_scores[:20]
})
print("\nNormalized Scores (first 20):")
print(comparison_df.round(1))
8. Percentiles in Machine Learning
Feature Engineering with Percentiles
def create_percentile_features(X, columns, percentiles=(25, 50, 75)):
    """
    Create percentile-based features.

    Parameters
    ----------
    X : pd.DataFrame
        Input features
    columns : list
        Columns to transform
    percentiles : sequence of float, optional
        Percentiles to use.  The default is a tuple rather than a list
        to avoid the shared mutable-default pitfall.

    Returns
    -------
    pd.DataFrame
        Copy of X with additional percentile features: one binary
        below-threshold indicator per (column, percentile) and one
        continuous 0-100 percentile-rank column per column.
    """
    X_new = X.copy()
    for col in columns:
        # Add percentile bucket features
        for p in percentiles:
            threshold = np.percentile(X[col], p)
            # 1 when the value falls strictly below this percentile cut.
            X_new[f"{col}_below_{p}p"] = (X[col] < threshold).astype(int)
        # Add percentile rank feature (uses the module-level helper).
        X_new[f"{col}_percentile_rank"] = normalize_to_percentiles(X[col])
    return X_new
# Example: enrich a small frame with percentile features for three columns
sample_data = pd.DataFrame({
    'score': test_scores[:100],
    'age': np.random.randint(20, 60, 100),
    'time': np.random.exponential(100, 100)
})
enhanced_data = create_percentile_features(sample_data, ['score', 'age', 'time'])
print("Enhanced Features:")
print(enhanced_data.head())
Quantile Regression
from sklearn.linear_model import QuantileRegressor
def quantile_regression_demo(X, y, quantiles=(0.25, 0.5, 0.75)):
    """
    Demonstrate quantile regression for different percentiles.

    Parameters
    ----------
    X : array-like
        Features
    y : array-like
        Target
    quantiles : sequence of float, optional
        Quantiles to predict.  The default is a tuple rather than a
        list to avoid the shared mutable-default pitfall.

    Returns
    -------
    tuple of (dict, dict)
        Fitted models and in-sample predictions, both keyed by quantile.
        (The original docstring claimed a single dict, but the function
        has always returned this pair.)
    """
    models = {}
    predictions = {}
    for q in quantiles:
        # alpha=0 disables L1 regularization: pure pinball-loss fit.
        model = QuantileRegressor(quantile=q, alpha=0)
        model.fit(X, y)
        models[q] = model
        predictions[q] = model.predict(X)
    return models, predictions
# Generate sample data with heteroscedastic noise (spread grows with X),
# which is exactly where quantile regression shines
np.random.seed(42)
X = np.random.rand(500, 1) * 10
y = 2 * X.squeeze() + np.random.randn(500) * (1 + X.squeeze())
# Fit quantile regressions for the lower tail, median and upper tail
quantiles = [0.1, 0.5, 0.9]
models, predictions = quantile_regression_demo(X, y, quantiles)
# Visualize: the fan of lines shows the widening conditional distribution
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.5, label='Data')
X_sorted = np.sort(X, axis=0)
for q in quantiles:
    y_pred = models[q].predict(X_sorted)
    plt.plot(X_sorted, y_pred, label=f'{int(q*100)}th percentile')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Quantile Regression')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
9. Real-World Applications
Customer Segmentation by Spending
def customer_spending_analysis(spending_data):
    """
    Segment customers into percentile-based spending tiers.

    Parameters
    ----------
    spending_data : array-like
        Customer spending amounts

    Returns
    -------
    dict
        Per-tier spending range, member count, share, and mean spend.
    """
    tier_bounds = {
        'Low': (0, 25),
        'Medium-Low': (25, 50),
        'Medium': (50, 75),
        'Medium-High': (75, 90),
        'High': (90, 95),
        'VIP': (95, 100)
    }
    n = len(spending_data)
    summary = {}
    for tier, (lo_p, hi_p) in tier_bounds.items():
        lo_v = np.percentile(spending_data, lo_p)
        hi_v = np.percentile(spending_data, hi_p)
        # Edge tiers are closed at the data's min/max; interior tiers are
        # half-open (lo, hi] so boundary values land in the lower tier.
        if lo_p == 0:
            members = [s for s in spending_data if s <= hi_v]
        elif hi_p == 100:
            members = [s for s in spending_data if s >= lo_v]
        else:
            members = [s for s in spending_data if lo_v < s <= hi_v]
        summary[tier] = {
            'range': (lo_v, hi_v),
            'count': len(members),
            'percentage': len(members) / n * 100,
            'mean': np.mean(members) if members else 0
        }
    return summary
# Generate customer spending data (long-tailed, clipped to $10-$5000)
customer_spending = np.random.exponential(scale=500, size=10000)
customer_spending = np.clip(customer_spending, 10, 5000)
segments = customer_spending_analysis(customer_spending)
print("Customer Spending Analysis:")
print("=" * 60)
for segment, data in segments.items():
    print(f"\n{segment} Segment:")
    print(f" Spending Range: ${data['range'][0]:.0f} - ${data['range'][1]:.0f}")
    print(f" Customers: {data['count']} ({data['percentage']:.1f}%)")
    print(f" Average Spending: ${data['mean']:.0f}")
Salary Benchmarking
def salary_benchmarking(salaries, role):
    """
    Build percentile-based salary benchmarks for a role.

    Parameters
    ----------
    salaries : array-like
        Salary data for the role
    role : str
        Job role name

    Returns
    -------
    dict
        Named benchmarks, negotiation ranges, and summary statistics.
    """
    # One vectorized percentile call, then unpack by name for clarity.
    p10, p25, p50, p75, p90, p95, p99 = np.percentile(
        salaries, [10, 25, 50, 75, 90, 95, 99]
    )
    benchmarks = {
        'Market Entry': p10,
        'Below Average': p25,
        'Market Average': p50,
        'Above Average': p75,
        'Top Performer': p90,
        'Elite': p95,
        'Top 1%': p99
    }
    # Salary ranges for negotiation
    negotiation_ranges = {
        'Low Range': (p10, p25),
        'Fair Range': (p25, p75),
        'Good Range': (p75, p90),
        'Excellent Range': (p90, p95),
        'Exceptional': (p95, p99)
    }
    return {
        'role': role,
        'benchmarks': benchmarks,
        'negotiation_ranges': negotiation_ranges,
        'statistics': {
            'mean': np.mean(salaries),
            'median': np.median(salaries),
            'std': np.std(salaries),
            'min': np.min(salaries),
            'max': np.max(salaries)
        }
    }
# Generate salaries for different roles (normal approximations)
data_engineer_salaries = np.random.normal(120000, 20000, 1000)
data_scientist_salaries = np.random.normal(130000, 25000, 1000)
data_analyst_salaries = np.random.normal(80000, 15000, 1000)
# Analyze data engineer salaries
benchmark = salary_benchmarking(data_engineer_salaries, "Data Engineer")
print(f"{benchmark['role']} Salary Benchmark:")
print("=" * 50)
print("\nSalary Benchmarks:")
for level, salary in benchmark['benchmarks'].items():
    print(f" {level}: ${salary:,.0f}")
print("\nNegotiation Ranges:")
for range_name, (low, high) in benchmark['negotiation_ranges'].items():
    print(f" {range_name}: ${low:,.0f} - ${high:,.0f}")
print(f"\nStatistics:")
for stat, value in benchmark['statistics'].items():
    print(f" {stat}: ${value:,.0f}")
10. Advanced Percentile Techniques
Weighted Percentiles
def weighted_percentile(data, weights, percentile):
    """
    Calculate weighted percentile.

    Parameters
    ----------
    data : array-like
        Data values.  Converted with np.asarray, so plain Python lists
        now work as the docstring promises (the original required
        ndarrays because of the fancy-index sort).
    weights : array-like
        Weights for each data point
    percentile : float
        Percentile to calculate (0-100)

    Returns
    -------
    float
        Weighted percentile value
    """
    data = np.asarray(data)
    weights = np.asarray(weights)
    # Sort by data values
    sorted_indices = np.argsort(data)
    sorted_data = data[sorted_indices]
    sorted_weights = weights[sorted_indices]
    # Calculate cumulative weights
    cum_weights = np.cumsum(sorted_weights)
    total_weight = cum_weights[-1]
    # Find target weight
    target_weight = percentile / 100 * total_weight
    # Find the data point where cumulative weight reaches the target
    idx = np.searchsorted(cum_weights, target_weight)
    if idx == 0:
        return sorted_data[0]
    elif idx >= len(sorted_data):
        return sorted_data[-1]
    else:
        # Linear interpolation between neighboring cumulative weights
        weight_before = cum_weights[idx - 1]
        weight_after = cum_weights[idx]
        if weight_after == weight_before:
            # Zero-weight point: nothing to interpolate across.
            return sorted_data[idx]
        ratio = (target_weight - weight_before) / (weight_after - weight_before)
        return sorted_data[idx - 1] + ratio * (sorted_data[idx] - sorted_data[idx - 1])
# Example: Sales with product importance weights; weighting shifts the
# percentile toward the heavily weighted values
sales = np.array([100, 200, 300, 400, 500, 600, 700, 800, 900, 1000])
importance = np.array([5, 3, 4, 2, 5, 1, 3, 4, 2, 5])
print("Weighted Percentile Analysis:")
print("=" * 50)
for p in [25, 50, 75, 90]:
    wp = weighted_percentile(sales, importance, p)
    up = np.percentile(sales, p)
    print(f"{p}th percentile - Unweighted: {up:.0f}, Weighted: {wp:.0f}")
Moving Percentiles
def moving_percentile(data, window, percentile):
    """
    Calculate trailing moving percentile for time series data.

    Parameters
    ----------
    data : array-like
        Time series data
    window : int
        Window size for moving calculation; the first window-1 points
        use a shorter (expanding) window
    percentile : float
        Percentile to calculate

    Returns
    -------
    np.ndarray
        Moving percentile values, always float dtype.  The original
        used np.zeros_like(data), which inherits an integer dtype from
        integer input and silently truncated fractional percentiles.
    """
    data = np.asarray(data)
    moving_pct = np.zeros(len(data), dtype=float)
    for i in range(len(data)):
        start = max(0, i - window + 1)
        moving_pct[i] = np.percentile(data[start:i + 1], percentile)
    return moving_pct
# Generate sample time series data (random walk around 100)
np.random.seed(42)
time_series = np.cumsum(np.random.randn(200)) + 100
# Calculate moving percentiles over a 20-point trailing window
window = 20
pct_50 = moving_percentile(time_series, window, 50)
pct_90 = moving_percentile(time_series, window, 90)
pct_10 = moving_percentile(time_series, window, 10)
# Visualize: the shaded band is the rolling 10th-90th percentile envelope
plt.figure(figsize=(12, 6))
plt.plot(time_series, label='Original', alpha=0.7)
plt.plot(pct_50, label='50th Percentile (Median)', linewidth=2)
plt.plot(pct_90, label='90th Percentile', linestyle='--')
plt.plot(pct_10, label='10th Percentile', linestyle='--')
plt.fill_between(range(len(time_series)), pct_10, pct_90, alpha=0.3)
plt.xlabel('Time')
plt.ylabel('Value')
plt.title('Moving Percentiles')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
11. Statistical Inference with Percentiles
Confidence Intervals Using Percentiles
def bootstrap_confidence_interval(data, statistic, confidence=0.95, n_bootstrap=1000):
    """
    Calculate a percentile-bootstrap confidence interval.

    Parameters
    ----------
    data : array-like
        Sample data
    statistic : function
        Statistic to calculate on each resample
    confidence : float
        Confidence level (0-1)
    n_bootstrap : int
        Number of bootstrap samples

    Returns
    -------
    tuple
        Confidence interval (lower, upper)
    """
    sample_size = len(data)
    # Statistic evaluated on n_bootstrap resamples drawn with replacement.
    resampled_stats = np.array([
        statistic(np.random.choice(data, size=sample_size, replace=True))
        for _ in range(n_bootstrap)
    ])
    # Symmetric tails: e.g. 95% confidence -> 2.5th and 97.5th percentiles.
    tail = (1 - confidence) / 2 * 100
    lower_bound = np.percentile(resampled_stats, tail)
    upper_bound = np.percentile(resampled_stats, 100 - tail)
    return lower_bound, upper_bound
# Example: Confidence interval for median (and mean) of a skewed sample
sample_data = np.random.exponential(scale=50, size=100)
median_ci = bootstrap_confidence_interval(sample_data, np.median, confidence=0.95)
mean_ci = bootstrap_confidence_interval(sample_data, np.mean, confidence=0.95)
print("Bootstrap Confidence Intervals (95%):")
print(f" Median: {median_ci[0]:.2f} - {median_ci[1]:.2f}")
print(f" Mean: {mean_ci[0]:.2f} - {mean_ci[1]:.2f}")
print(f" Sample Median: {np.median(sample_data):.2f}")
print(f" Sample Mean: {np.mean(sample_data):.2f}")
12. Best Practices and Common Pitfalls
Best Practices
# 1. Use appropriate methods for your data distribution
def choose_percentile_method(data, n):
    """
    Choose a percentile calculation method based on sample size.

    Note: *data* is currently unused; the choice depends only on *n*
    (parameter kept for interface compatibility).
    """
    if n < 100:
        # Small dataset - exact ranks are affordable and unambiguous
        return "nearest_rank"
    if n > 10000:
        # Large dataset - interpolation is smooth and cheap
        return "linear_interpolation"
    return "standard"
# 2. Handle edge cases
def safe_percentile(data, percentile):
    """
    Calculate a percentile with explicit edge-case handling.

    Returns NaN for empty input, and the exact min/max for the
    0th/100th percentile without going through interpolation.
    """
    if not len(data):
        return np.nan
    if percentile == 0:
        return np.min(data)
    if percentile == 100:
        return np.max(data)
    return np.percentile(data, percentile)
# 3. Document your percentile choices
def calculate_percentile_with_context(data, percentile, context=""):
    """
    Calculate a percentile and print it with a usage label.

    context: e.g., "p95_response_time", "q1_salary"
    """
    result = np.percentile(data, percentile)
    print(f"{context}: {result:.2f} ({percentile}th percentile)")
    return result
Common Pitfalls
# Pitfall 1: Not handling missing values
def fix_missing_values(data):
    """Always drop NaNs before a percentile calculation (here: median)."""
    # Boolean mask keeps only the finite (non-NaN) entries.
    return np.percentile(data[~np.isnan(data)], 50)
# Pitfall 2: Ignoring data distribution
def consider_distribution(data):
    """
    Compare median and mean, warning when the data is highly skewed.

    Returns
    -------
    dict
        {'median': ..., 'mean': ...}

    Note: the original also computed Q1/Q3/IQR but never used them;
    that dead code has been removed.
    """
    skew = pd.Series(data).skew()
    if abs(skew) > 1:
        print("Warning: Highly skewed distribution - percentiles may be more meaningful than mean")
    return {'median': np.percentile(data, 50), 'mean': np.mean(data)}
# Pitfall 3: Misinterpreting percentiles
def interpret_percentile(score, percentile_rank):
    """Print the correct interpretation of a percentile rank."""
    # A percentile rank of k means the score beat k% of the comparison group.
    messages = (
        f"Score {score} is in the {percentile_rank}th percentile",
        f"This means {percentile_rank}% of the comparison group scored lower",
    )
    for message in messages:
        print(message)
Conclusion
Percentiles are powerful statistical tools that provide deep insights into data distribution and relative positioning:
Key Takeaways
- Interpretation: The kth percentile is the value below which k% of observations fall
- Quartiles: Q1 (25th), Q2 (50th/median), Q3 (75th) divide data into four equal parts
- IQR: Interquartile range (Q3 - Q1) is a robust measure of spread
- Outlier Detection: IQR method (1.5 * IQR) is standard for identifying outliers
- Performance Analysis: High percentiles (95th, 99th) capture worst-case scenarios
- Normalization: Percentile ranks transform data to a uniform 0-100 scale
- Robustness: Percentiles are less sensitive to outliers than mean
Common Percentiles Reference
| Percentile | Name | Use Case |
|---|---|---|
| 0 | Minimum | Range boundaries |
| 25 | Q1 | Lower quartile |
| 50 | Q2 | Median, central tendency |
| 75 | Q3 | Upper quartile |
| 90 | P90 | Performance monitoring |
| 95 | P95 | SLA targets |
| 99 | P99 | Critical thresholds |
| 100 | Maximum | Range boundaries |
Best Practices
- Use median (50th percentile) for skewed distributions
- Combine percentiles with IQR for robust outlier detection
- Document which percentile method you're using
- Consider sample size when interpreting percentiles
- Use bootstrap for confidence intervals
- Visualize percentiles with box plots or cumulative distributions
Percentiles are essential for understanding data distributions, detecting anomalies, and making data-driven decisions. Master these concepts to become a more effective data scientist!