Skip to content
Unverified — AI-generated content. Help verify this page

SciPy Statistics for EDA

SciPy's stats module provides the statistical backbone for rigorous EDA: distribution fitting, hypothesis testing, normality checks, and goodness-of-fit assessments. This page covers everything you need to move from "this looks normal" to "this is statistically confirmed to be normal."


Probability Distributions

Common Distributions for EDA

python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

np.random.seed(42)

# Key distributions and when you encounter them in EDA
distributions = {
    'Normal':      stats.norm(loc=50, scale=10),
    'Log-Normal':  stats.lognorm(s=0.5, loc=0, scale=np.exp(4)),
    'Exponential': stats.expon(scale=10),
    'Gamma':       stats.gamma(a=2, scale=5),
    'Beta':        stats.beta(a=2, b=5),
    'Poisson':     stats.poisson(mu=5),
    'Uniform':     stats.uniform(loc=0, scale=100),
    'Weibull':     stats.weibull_min(c=1.5, scale=10),
}

fig, axes = plt.subplots(2, 4, figsize=(20, 8))
axes = axes.flatten()

for i, (name, dist) in enumerate(distributions.items()):
    ax = axes[i]
    if hasattr(dist, 'pdf'):  # continuous
        x = np.linspace(dist.ppf(0.001), dist.ppf(0.999), 200)
        ax.plot(x, dist.pdf(x), 'b-', linewidth=2)
        ax.fill_between(x, dist.pdf(x), alpha=0.2)
    else:  # discrete
        x = np.arange(dist.ppf(0.001), dist.ppf(0.999))
        ax.bar(x, dist.pmf(x), color='steelblue', alpha=0.7)
    ax.set_title(name, fontweight='bold')
    ax.set_ylabel('Density')

plt.tight_layout()
plt.show()

Distribution Properties

python
dist = stats.norm(loc=100, scale=15)

print(f"Mean:     {dist.mean():.2f}")
print(f"Median:   {dist.median():.2f}")
print(f"Variance: {dist.var():.2f}")
print(f"Std dev:  {dist.std():.2f}")
print(f"Skewness: {dist.stats(moments='s')[0]:.4f}")
print(f"Kurtosis: {dist.stats(moments='k')[0]:.4f}")

# Percentiles / quantiles
print(f"\nPercentiles:")
for p in [1, 5, 25, 50, 75, 95, 99]:
    print(f"  P{p}: {dist.ppf(p/100):.2f}")

# Probability calculations
print(f"\nP(X > 120): {1 - dist.cdf(120):.4f}")
print(f"P(80 < X < 120): {dist.cdf(120) - dist.cdf(80):.4f}")
print(f"P(X < 70): {dist.cdf(70):.4f}")

# Generate random samples
samples = dist.rvs(size=10000)

Distribution Fitting

Fitting Data to Known Distributions

python
# Generate synthetic right-skewed data (common in EDA: prices, incomes)
data = np.random.lognormal(mean=3, sigma=0.8, size=5000)

# Fit multiple distributions
candidates = {
    'norm':       stats.norm,
    'lognorm':    stats.lognorm,
    'gamma':      stats.gamma,
    'expon':      stats.expon,
    'weibull_min': stats.weibull_min,
}

fit_results = {}
for name, dist_class in candidates.items():
    try:
        params = dist_class.fit(data)
        # KS test for goodness of fit
        ks_stat, ks_p = stats.kstest(data, dist_class.cdf, args=params)
        # Log-likelihood
        ll = np.sum(dist_class.logpdf(data, *params))
        # AIC (lower is better)
        k = len(params)
        aic = 2 * k - 2 * ll

        fit_results[name] = {
            'params': params,
            'ks_stat': ks_stat,
            'ks_p': ks_p,
            'log_likelihood': ll,
            'aic': aic,
        }
    except Exception as e:
        print(f"  {name}: fit failed ({e})")

# Rank by AIC
ranked = sorted(fit_results.items(), key=lambda x: x[1]['aic'])
print("Distribution Fit Rankings (by AIC):")
print(f"{'Distribution':<15} {'AIC':>12} {'KS Stat':>10} {'KS p-value':>12}")
print("-" * 52)
for name, r in ranked:
    print(f"{name:<15} {r['aic']:>12.1f} {r['ks_stat']:>10.4f} {r['ks_p']:>12.4f}")

Visualizing Fitted Distributions

python
best_name, best_result = ranked[0]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Histogram + fitted PDF
x = np.linspace(data.min(), np.percentile(data, 99), 200)
axes[0].hist(data, bins=60, density=True, alpha=0.5, edgecolor='white', label='Data')
for name, r in ranked[:3]:
    dist_class = candidates[name]
    pdf = dist_class.pdf(x, *r['params'])
    axes[0].plot(x, pdf, linewidth=2, label=f'{name} (AIC={r["aic"]:.0f})')
axes[0].set_title('Data vs Fitted Distributions')
axes[0].legend(fontsize=9)
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Density')

# CDF comparison
axes[1].hist(data, bins=60, density=True, cumulative=True, alpha=0.3, label='Empirical CDF')
for name, r in ranked[:3]:
    dist_class = candidates[name]
    cdf = dist_class.cdf(x, *r['params'])
    axes[1].plot(x, cdf, linewidth=2, label=name)
axes[1].set_title('CDF Comparison')
axes[1].legend(fontsize=9)

# PP-plot for best fit
dist_class = candidates[best_name]
theoretical_cdf = dist_class.cdf(np.sort(data), *best_result['params'])
empirical_cdf = np.arange(1, len(data) + 1) / len(data)
axes[2].scatter(theoretical_cdf, empirical_cdf, alpha=0.1, s=5)
axes[2].plot([0, 1], [0, 1], 'r--', linewidth=2)
axes[2].set_title(f'PP-Plot ({best_name})')
axes[2].set_xlabel('Theoretical CDF')
axes[2].set_ylabel('Empirical CDF')
axes[2].set_aspect('equal')

plt.tight_layout()
plt.show()

QQ Plots (Quantile-Quantile)

Normality Assessment

python
def qq_plot_analysis(data, title="QQ Plot"):
    """Create QQ plot with interpretation guide."""
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # Standard QQ plot
    stats.probplot(data, dist='norm', plot=axes[0])
    axes[0].set_title(f'{title} — Normal QQ')
    axes[0].get_lines()[0].set_markersize(3)
    axes[0].get_lines()[0].set_alpha(0.5)

    # Log-transformed QQ (for skewed data)
    positive_data = data[data > 0]
    stats.probplot(np.log(positive_data), dist='norm', plot=axes[1])
    axes[1].set_title(f'{title} — Log-Normal QQ')
    axes[1].get_lines()[0].set_markersize(3)

    # Distribution of residuals from QQ line
    osm, osr = stats.probplot(data, dist='norm')[:2]
    slope, intercept = np.polyfit(osm[0], osm[1], 1)
    residuals = osm[1] - (slope * osm[0] + intercept)
    axes[2].hist(residuals, bins=40, edgecolor='white', alpha=0.7)
    axes[2].set_title('QQ Residuals (should be ~0)')
    axes[2].axvline(0, color='red', linestyle='--')

    plt.tight_layout()
    plt.show()

# Test with different distributions
normal_data = np.random.normal(100, 15, 2000)
skewed_data = np.random.lognormal(4, 0.8, 2000)
heavy_tails = np.random.standard_t(3, 2000) * 10 + 100

qq_plot_analysis(normal_data, "Normal Data")
qq_plot_analysis(skewed_data, "Skewed Data")
qq_plot_analysis(heavy_tails, "Heavy-Tailed Data")

QQ Plot Interpretation Guide


Normality Tests

Comprehensive Normality Testing

python
def test_normality(data, name="data", alpha=0.05):
    """Run multiple normality tests and report results."""
    data = data[~np.isnan(data)]

    print(f"\nNormality Tests for '{name}' (n={len(data)}, alpha={alpha})")
    print("=" * 65)

    results = {}

    # 1. Shapiro-Wilk (best for n < 5000)
    if len(data) <= 5000:
        stat, p = stats.shapiro(data)
        results['Shapiro-Wilk'] = (stat, p)
        verdict = "NORMAL" if p > alpha else "NOT normal"
        print(f"  Shapiro-Wilk:    W={stat:.4f}, p={p:.4f} --> {verdict}")
    else:
        sample = np.random.choice(data, 5000, replace=False)
        stat, p = stats.shapiro(sample)
        results['Shapiro-Wilk (sample)'] = (stat, p)
        print(f"  Shapiro-Wilk*:   W={stat:.4f}, p={p:.4f} (sampled 5000)")

    # 2. D'Agostino-Pearson (good for n > 20)
    if len(data) >= 20:
        stat, p = stats.normaltest(data)
        results["D'Agostino-Pearson"] = (stat, p)
        verdict = "NORMAL" if p > alpha else "NOT normal"
        print(f"  D'Agostino-Pearson: K2={stat:.4f}, p={p:.4f} --> {verdict}")

    # 3. Anderson-Darling
    result = stats.anderson(data, dist='norm')
    ad_stat = result.statistic
    # Use 5% significance level
    idx = list(result.significance_level).index(5.0)
    critical = result.critical_values[idx]
    is_normal = ad_stat < critical
    results['Anderson-Darling'] = (ad_stat, critical)
    verdict = "NORMAL" if is_normal else "NOT normal"
    print(f"  Anderson-Darling: A2={ad_stat:.4f}, critical(5%)={critical:.4f} --> {verdict}")

    # 4. Kolmogorov-Smirnov
    stat, p = stats.kstest(data, 'norm', args=(data.mean(), data.std()))
    results['KS test'] = (stat, p)
    verdict = "NORMAL" if p > alpha else "NOT normal"
    print(f"  Kolmogorov-Smirnov: D={stat:.4f}, p={p:.4f} --> {verdict}")

    # 5. Jarque-Bera
    stat, p = stats.jarque_bera(data)
    results['Jarque-Bera'] = (stat, p)
    verdict = "NORMAL" if p > alpha else "NOT normal"
    print(f"  Jarque-Bera:     JB={stat:.4f}, p={p:.4f} --> {verdict}")

    # Summary stats
    print(f"\n  Descriptive: mean={data.mean():.2f}, std={data.std():.2f}")
    print(f"  Skewness: {stats.skew(data):.4f}")
    print(f"  Kurtosis (excess): {stats.kurtosis(data):.4f}")

    return results

# Test
normal_data = np.random.normal(100, 15, 1000)
test_normality(normal_data, "Normal Sample")

skewed_data = np.random.lognormal(4, 0.8, 1000)
test_normality(skewed_data, "Log-Normal Sample")

Statistical Tests for EDA

Comparing Two Groups

python
def compare_two_groups(group_a, group_b, name_a="A", name_b="B", alpha=0.05):
    """Comprehensive two-sample comparison."""
    print(f"\nComparing {name_a} (n={len(group_a)}) vs {name_b} (n={len(group_b)})")
    print("=" * 60)

    # Descriptive
    print(f"  {name_a}: mean={group_a.mean():.3f}, std={group_a.std():.3f}, median={np.median(group_a):.3f}")
    print(f"  {name_b}: mean={group_b.mean():.3f}, std={group_b.std():.3f}, median={np.median(group_b):.3f}")

    # Check normality
    _, p_norm_a = stats.shapiro(group_a[:5000])
    _, p_norm_b = stats.shapiro(group_b[:5000])
    both_normal = p_norm_a > alpha and p_norm_b > alpha

    # Check equal variance
    _, p_levene = stats.levene(group_a, group_b)
    equal_var = p_levene > alpha

    print(f"\n  Normality: {name_a} p={p_norm_a:.4f}, {name_b} p={p_norm_b:.4f}")
    print(f"  Equal variance (Levene): p={p_levene:.4f} {'(equal)' if equal_var else '(unequal)'}")

    # Parametric tests
    print(f"\n  Parametric Tests:")
    if equal_var:
        stat, p = stats.ttest_ind(group_a, group_b)
        print(f"    Student's t-test: t={stat:.4f}, p={p:.4f}")
    stat, p = stats.ttest_ind(group_a, group_b, equal_var=False)
    print(f"    Welch's t-test:   t={stat:.4f}, p={p:.4f}")

    # Non-parametric tests
    print(f"\n  Non-Parametric Tests:")
    stat, p = stats.mannwhitneyu(group_a, group_b, alternative='two-sided')
    print(f"    Mann-Whitney U:   U={stat:.1f}, p={p:.4f}")

    stat, p = stats.ks_2samp(group_a, group_b)
    print(f"    KS 2-sample:      D={stat:.4f}, p={p:.4f}")

    # Effect size (Cohen's d)
    pooled_std = np.sqrt((group_a.std()**2 + group_b.std()**2) / 2)
    d = (group_a.mean() - group_b.mean()) / pooled_std
    magnitude = "negligible" if abs(d) < 0.2 else "small" if abs(d) < 0.5 else "medium" if abs(d) < 0.8 else "large"
    print(f"\n  Effect Size: Cohen's d = {d:.4f} ({magnitude})")

# Example
group_a = np.random.normal(50, 10, 500)
group_b = np.random.normal(52, 12, 500)
compare_two_groups(group_a, group_b, "Control", "Treatment")

Comparing Multiple Groups

python
def compare_multiple_groups(groups, group_names, alpha=0.05):
    """ANOVA / Kruskal-Wallis for k-group comparison."""
    print(f"\nComparing {len(groups)} groups")
    print("=" * 60)

    for name, g in zip(group_names, groups):
        print(f"  {name}: n={len(g)}, mean={g.mean():.3f}, std={g.std():.3f}")

    # Check normality for all groups
    all_normal = all(stats.shapiro(g[:5000])[1] > alpha for g in groups)

    if all_normal:
        # One-way ANOVA
        stat, p = stats.f_oneway(*groups)
        print(f"\n  One-way ANOVA: F={stat:.4f}, p={p:.6f}")
    else:
        print(f"\n  Non-normal data detected.")

    # Kruskal-Wallis (non-parametric alternative)
    stat, p = stats.kruskal(*groups)
    print(f"  Kruskal-Wallis: H={stat:.4f}, p={p:.6f}")

    # Effect size (eta-squared for ANOVA)
    all_data = np.concatenate(groups)
    grand_mean = all_data.mean()
    ss_between = sum(len(g) * (g.mean() - grand_mean)**2 for g in groups)
    ss_total = np.sum((all_data - grand_mean)**2)
    eta_sq = ss_between / ss_total
    print(f"  Eta-squared: {eta_sq:.4f}")

    if p < alpha:
        print(f"\n  Post-hoc pairwise comparisons (Mann-Whitney with Bonferroni):")
        n_tests = len(groups) * (len(groups) - 1) // 2
        adjusted_alpha = alpha / n_tests
        for i in range(len(groups)):
            for j in range(i + 1, len(groups)):
                stat, p_val = stats.mannwhitneyu(groups[i], groups[j])
                sig = "***" if p_val < adjusted_alpha else ""
                print(f"    {group_names[i]} vs {group_names[j]}: "
                      f"U={stat:.1f}, p={p_val:.4f} {sig}")

groups = [
    np.random.normal(50, 10, 300),
    np.random.normal(52, 10, 300),
    np.random.normal(55, 12, 300),
    np.random.normal(48, 9, 300),
]
compare_multiple_groups(groups, ['North', 'South', 'East', 'West'])

Correlation Tests

python
def correlation_analysis(x, y, name_x="X", name_y="Y"):
    """Comprehensive correlation analysis."""
    print(f"\nCorrelation: {name_x} vs {name_y} (n={len(x)})")
    print("=" * 55)

    # Pearson (linear, assumes normality)
    r, p = stats.pearsonr(x, y)
    print(f"  Pearson r:    {r:+.4f}, p={p:.4f}")

    # Spearman (monotonic, rank-based)
    rho, p = stats.spearmanr(x, y)
    print(f"  Spearman rho: {rho:+.4f}, p={p:.4f}")

    # Kendall (ordinal, robust)
    tau, p = stats.kendalltau(x, y)
    print(f"  Kendall tau:  {tau:+.4f}, p={p:.4f}")

    # Point-biserial (if one variable is binary)
    # rpb, p = stats.pointbiserialr(binary, continuous)

    print(f"\n  Interpretation:")
    mag = abs(r)
    if mag < 0.1:
        print(f"    Negligible linear relationship")
    elif mag < 0.3:
        print(f"    Weak linear relationship")
    elif mag < 0.5:
        print(f"    Moderate linear relationship")
    elif mag < 0.7:
        print(f"    Strong linear relationship")
    else:
        print(f"    Very strong linear relationship")

    if abs(rho - r) > 0.1:
        print(f"    Spearman differs from Pearson: possible nonlinear relationship")

x = np.random.randn(500)
y = x**2 + np.random.randn(500) * 0.5  # nonlinear relationship
correlation_analysis(x, y, "X", "X-squared+noise")

Goodness-of-Fit Tests

Kolmogorov-Smirnov Test

python
def ks_test_suite(data, name="data"):
    """Test data against multiple reference distributions."""
    print(f"\nKS Goodness-of-Fit Tests for '{name}' (n={len(data)})")
    print("=" * 60)

    tests = [
        ('Normal', 'norm', (data.mean(), data.std())),
        ('Exponential', 'expon', (data.min(), data.std())),
        ('Uniform', 'uniform', (data.min(), data.max() - data.min())),
    ]

    # Also fit and test lognormal (positive data only)
    if np.all(data > 0):
        params = stats.lognorm.fit(data)
        tests.append(('Log-Normal (fitted)', 'lognorm', params))

    for name_dist, dist_name, params in tests:
        stat, p = stats.kstest(data, dist_name, args=params)
        verdict = "PASS" if p > 0.05 else "FAIL"
        print(f"  {name_dist:<25} D={stat:.4f}, p={p:.4f} [{verdict}]")

# Test
income_data = np.random.lognormal(10, 0.8, 2000)
ks_test_suite(income_data, "Income")

Chi-Squared Goodness-of-Fit

python
# Test if observed frequencies match expected
observed = np.array([48, 35, 42, 30, 45])
expected_probs = np.array([0.2, 0.2, 0.2, 0.2, 0.2])  # uniform
expected = expected_probs * observed.sum()

stat, p = stats.chisquare(observed, f_exp=expected)
print(f"Chi-squared test: X2={stat:.4f}, p={p:.4f}")
print(f"Uniform distribution: {'plausible' if p > 0.05 else 'rejected'}")

# Chi-squared test of independence (contingency table)
contingency = np.array([
    [50, 30, 20],  # Group A
    [35, 45, 20],  # Group B
    [40, 25, 35],  # Group C
])
chi2, p, dof, expected = stats.chi2_contingency(contingency)
print(f"\nIndependence test: X2={chi2:.4f}, p={p:.4f}, dof={dof}")
print(f"Groups and categories are {'dependent' if p < 0.05 else 'independent'}")

# Cramers V (effect size for chi-squared)
n = contingency.sum()
min_dim = min(contingency.shape) - 1
cramers_v = np.sqrt(chi2 / (n * min_dim))
print(f"Cramer's V: {cramers_v:.4f}")

Statistical Test Decision Guide


Automated Statistical Report

python
def statistical_eda_report(df, numeric_cols=None, categorical_cols=None, target=None):
    """Generate a comprehensive statistical EDA report."""
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include='number').columns.tolist()
    if categorical_cols is None:
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    print("=" * 70)
    print("STATISTICAL EDA REPORT")
    print("=" * 70)

    # 1. Normality tests for all numeric columns
    print("\n[1] NORMALITY TESTS")
    print("-" * 50)
    for col in numeric_cols:
        data = df[col].dropna().values
        if len(data) < 20:
            continue
        sample = data[:5000]
        _, p_shapiro = stats.shapiro(sample)
        skew = stats.skew(data)
        kurt = stats.kurtosis(data)
        is_normal = p_shapiro > 0.05
        print(f"  {col:<25} Shapiro p={p_shapiro:.4f} "
              f"skew={skew:+.2f} kurt={kurt:+.2f} "
              f"{'[NORMAL]' if is_normal else '[NOT NORMAL]'}")

    # 2. Correlation significance
    if len(numeric_cols) >= 2:
        print(f"\n[2] SIGNIFICANT CORRELATIONS (p < 0.05)")
        print("-" * 50)
        for i in range(len(numeric_cols)):
            for j in range(i + 1, len(numeric_cols)):
                x = df[numeric_cols[i]].dropna()
                y = df[numeric_cols[j]].dropna()
                common = x.index.intersection(y.index)
                r, p = stats.pearsonr(x[common], y[common])
                if p < 0.05:
                    print(f"  {numeric_cols[i]:>20} x {numeric_cols[j]:<20} "
                          f"r={r:+.3f} p={p:.4f}")

    # 3. Group comparisons (if target is categorical)
    if target and target in categorical_cols:
        print(f"\n[3] GROUP COMPARISONS (target: {target})")
        print("-" * 50)
        groups = df[target].unique()
        for col in numeric_cols:
            group_data = [df[df[target] == g][col].dropna().values for g in groups]
            if len(group_data) == 2:
                stat, p = stats.mannwhitneyu(group_data[0], group_data[1])
                print(f"  {col:<25} Mann-Whitney p={p:.4f} "
                      f"{'***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ''}")
            else:
                stat, p = stats.kruskal(*[g for g in group_data if len(g) > 0])
                print(f"  {col:<25} Kruskal-Wallis p={p:.4f} "
                      f"{'***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ''}")

    # 4. Categorical independence
    if len(categorical_cols) >= 2:
        print(f"\n[4] CATEGORICAL INDEPENDENCE (Chi-squared)")
        print("-" * 50)
        for i in range(len(categorical_cols)):
            for j in range(i + 1, len(categorical_cols)):
                ct = pd.crosstab(df[categorical_cols[i]], df[categorical_cols[j]])
                chi2, p, dof, _ = stats.chi2_contingency(ct)
                n = ct.sum().sum()
                cramers = np.sqrt(chi2 / (n * (min(ct.shape) - 1)))
                if p < 0.05:
                    print(f"  {categorical_cols[i]:>20} x {categorical_cols[j]:<20} "
                          f"X2={chi2:.1f} p={p:.4f} V={cramers:.3f}")

# Usage with sample data
np.random.seed(42)
report_df = pd.DataFrame({
    'revenue':    np.random.lognormal(4, 1, 1000),
    'age':        np.random.normal(40, 12, 1000),
    'score':      np.random.normal(700, 60, 1000),
    'tenure':     np.random.exponential(5, 1000),
    'segment':    np.random.choice(['A', 'B', 'C'], 1000),
    'region':     np.random.choice(['N', 'S', 'E', 'W'], 1000),
})

statistical_eda_report(report_df, target='segment')

Key Takeaways

  • QQ plots are the visual gold standard for assessing distributional fit — always plot before testing
  • Use Shapiro-Wilk for n < 5000, Anderson-Darling for larger samples
  • Choose the right test based on normality, sample size, and number of groups (see decision flowchart)
  • Effect sizes (Cohen's d, eta-squared, Cramer's V) matter more than p-values for practical significance
  • Fit candidate distributions with AIC ranking, not just KS p-values
  • With large samples (n > 10000), nearly everything will be "statistically significant" — focus on effect size
  • Always report confidence intervals alongside point estimates and p-values

"What I cannot create, I do not understand." — Richard Feynman