Skip to content
Unverified — AI-generated content. Help verify this page

Statistical Power & Sample Size

Statistical power determines whether your analysis can detect real effects. An underpowered EDA might miss important patterns; an overpowered one might flag trivially small effects as "significant." Understanding power is essential for trustworthy conclusions.


Power Analysis Fundamentals

| Quantity | Typical Value | Description |
| --- | --- | --- |
| Effect size (d) | Determined by domain | How big is the difference/relationship? |
| Sample size (n) | Solve for this | How many observations do you need? |
| Significance (alpha) | 0.05 | Probability of false positive |
| Power (1-beta) | 0.80 | Probability of detecting a real effect |

Effect Size Measures

python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Seed NumPy's global random generator so the samples drawn below are reproducible.
np.random.seed(42)

def cohens_d(group_a, group_b):
    """Compute Cohen's d effect size for two independent groups.

    d is the difference in means divided by the pooled standard deviation,
    where the pooled variance weights each group's *sample* variance by its
    degrees of freedom (n - 1).

    Args:
        group_a: 1-D array-like of numeric observations for the first group.
        group_b: 1-D array-like of numeric observations for the second group.

    Returns:
        The signed effect size ``(mean(group_a) - mean(group_b)) / pooled_std``.
        It is negative when ``group_b`` has the larger mean.
    """
    group_a = np.asarray(group_a, dtype=float).ravel()
    group_b = np.asarray(group_b, dtype=float).ravel()
    na, nb = group_a.size, group_b.size
    # ddof=1 gives the unbiased sample variance. NumPy defaults to ddof=0, which
    # does not match the (n - 1) weights below and inflates d for small samples.
    pooled_var = (
        (na - 1) * group_a.var(ddof=1) + (nb - 1) * group_b.var(ddof=1)
    ) / (na + nb - 2)
    return (group_a.mean() - group_b.mean()) / np.sqrt(pooled_var)

def cohens_h(p1, p2):
    """Return Cohen's h, the effect size between two proportions.

    Each proportion is mapped through the arcsine-square-root transform and
    h is twice the difference of the transformed values (signed: positive
    when ``p1`` exceeds ``p2``).
    """
    phi_first = np.arcsin(np.sqrt(p1))
    phi_second = np.arcsin(np.sqrt(p2))
    return 2 * (phi_first - phi_second)

def eta_squared(groups):
    """Return eta-squared for a one-way layout.

    Eta-squared is the between-group sum of squares divided by the total sum
    of squares, i.e. the share of overall variance explained by group
    membership.

    Args:
        groups: sequence of 1-D NumPy arrays, one per group.
    """
    pooled = np.concatenate(groups)
    overall_mean = pooled.mean()

    # Between-group variation: each group's squared mean offset, weighted by its size.
    between = 0
    for grp in groups:
        between += len(grp) * (grp.mean() - overall_mean) ** 2

    # Total variation of every observation around the grand mean.
    total = np.sum((pooled - overall_mean) ** 2)
    return between / total

# Demonstrate effect sizes
# `small` is the baseline group (mean 50, sd 10). The other groups shift the
# mean by 2, 5 and 8 points so their true effects are d = 0.2, 0.5 and 0.8.
np.random.seed(42)
small = np.random.normal(50, 10, 200)
medium = np.random.normal(55, 10, 200)
# Mean 58, not 60: (58 - 50) / 10 = 0.8 matches the "Large (d=0.8)" label.
large = np.random.normal(58, 10, 200)
small_shift = np.random.normal(52, 10, 200)

# cohens_d is signed (baseline minus shifted is negative); print the magnitude
# so the values are directly comparable to the conventional thresholds.
print("Cohen's d Interpretation:")
print(f"  Small  (d=0.2):  {abs(cohens_d(small, small_shift)):.3f}")
print(f"  Medium (d=0.5):  {abs(cohens_d(small, medium)):.3f}")
print(f"  Large  (d=0.8):  {abs(cohens_d(small, large)):.3f}")

# Visualize effect sizes
# One panel per conventional effect size: a baseline group (mean 50) is overlaid
# with a group whose mean is shifted by 2, 5 or 8 points (sd 10 in both).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
panels = [
    ('Small (d=0.2)', 52),
    ('Medium (d=0.5)', 55),
    ('Large (d=0.8)', 58),
]
for ax, (label, shifted_mean) in zip(axes, panels):
    baseline = np.random.normal(50, 10, 1000)
    shifted = np.random.normal(shifted_mean, 10, 1000)

    ax.hist(baseline, bins=40, alpha=0.5, label='Group A', color='steelblue', density=True)
    ax.hist(shifted, bins=40, alpha=0.5, label='Group B', color='coral', density=True)

    # Report the magnitude of the observed effect; the sign only reflects group order.
    d = cohens_d(np.array(baseline), np.array(shifted))
    ax.set_title(f'{label}\nActual d = {abs(d):.2f}')
    ax.legend()

plt.tight_layout()
plt.show()

Effect Size Reference Table

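| Effect Size | Measure | Small | Medium | Large |
| --- | --- | --- | --- | --- |
| Mean difference | Cohen's d | 0.2 | 0.5 | 0.8 |
| Correlation | r | 0.1 | 0.3 | 0.5 |
| ANOVA | Eta-squared | 0.01 | 0.06 | 0.14 |
| Proportions | Cohen's h | 0.2 | 0.5 | 0.8 |
| Chi-squared | Cramer's V | 0.1 | 0.3 | 0.5 |
| Regression | f-squared | 0.02 | 0.15 | 0.35 |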

Sample Size Calculations

Two-Sample t-test

python
def sample_size_ttest(effect_size, alpha=0.05, power=0.80, ratio=1.0):
    """Return the group sizes needed for a two-sided, two-sample comparison.

    Uses the normal approximation: n1 = (z_alpha + z_beta)^2 * (1 + 1/ratio) / d^2.

    Args:
        effect_size: standardized mean difference (Cohen's d) to detect.
        alpha: two-sided significance level.
        power: desired probability of detecting the effect.
        ratio: allocation ratio n2/n1 (1.0 means equal groups).

    Returns:
        Tuple ``(n1, n2)`` of integer group sizes, each rounded up.
    """
    from scipy.stats import norm

    # Normal quantiles for the two-sided test level and the target power.
    crit_alpha = norm.ppf(1 - alpha / 2)
    crit_power = norm.ppf(power)

    allocation = 1 + 1 / ratio
    n_first = ((crit_alpha + crit_power) ** 2 * allocation) / effect_size ** 2
    n_second = ratio * n_first

    return tuple(int(np.ceil(size)) for size in (n_first, n_second))

# Sample size table
# One row per effect size; columns give the per-group n at 80%, 90% and 95% power.
header = f"{'Effect Size':<15} {'Power=0.80':>12} {'Power=0.90':>12} {'Power=0.95':>12}"
print("Sample Size per Group (two-sample t-test, alpha=0.05)")
print(header)
print("-" * 53)
for d in [0.1, 0.2, 0.3, 0.5, 0.8, 1.0, 1.5]:
    # Only the first group's size is shown; groups are equal at the default ratio.
    n80, n90, n95 = (
        sample_size_ttest(d, power=target)[0] for target in (0.80, 0.90, 0.95)
    )
    print(f"d = {d:<11} {n80:>12,} {n90:>12,} {n95:>12,}")

Comparing Proportions (A/B Test)

python
def sample_size_proportions(p1, p2, alpha=0.05, power=0.80):
    """Return the per-group n for a two-sided test of two proportions.

    The significance term uses the pooled proportion (variance under the null
    of equal rates); the power term uses each group's own variance.

    Args:
        p1: proportion in the first group (e.g. baseline conversion rate).
        p2: proportion in the second group (e.g. target conversion rate).
        alpha: two-sided significance level.
        power: desired probability of detecting the difference.

    Returns:
        Integer sample size per group, rounded up.
    """
    from scipy.stats import norm

    crit_alpha = norm.ppf(1 - alpha / 2)
    crit_power = norm.ppf(power)

    pooled = (p1 + p2) / 2
    null_spread = np.sqrt(2 * pooled * (1 - pooled))
    alt_spread = np.sqrt(p1 * (1 - p1) + p2 * (1 - p2))

    numerator = (crit_alpha * null_spread + crit_power * alt_spread) ** 2
    per_group = numerator / (p1 - p2) ** 2

    return int(np.ceil(per_group))

# A/B test scenarios
# Each (baseline, target) pair is shown with its relative lift and the
# per-group / total sample size required to detect it.
scenarios = [
    (0.05, 0.06), (0.05, 0.07),
    (0.10, 0.12), (0.10, 0.13),
    (0.20, 0.24), (0.20, 0.25),
]
print("\nA/B Test Sample Sizes (alpha=0.05, power=0.80):")
print(f"{'Baseline':>10} {'Target':>10} {'Lift':>10} {'n/group':>12} {'Total':>12}")
print("-" * 58)
for p1, p2 in scenarios:
    lift = (p2 - p1) / p1 * 100  # relative lift over baseline, in percent
    n = sample_size_proportions(p1, p2)
    print(f"{p1:>10.1%} {p2:>10.1%} {lift:>9.0f}% {n:>12,} {2*n:>12,}")

Correlation

python
def sample_size_correlation(r, alpha=0.05, power=0.80):
    """Return the n needed to detect correlation ``r`` with a two-sided test.

    Applies the Fisher z transform to ``r`` and solves
    n = ((z_alpha + z_beta) / z_r)^2 + 3, rounding up.

    Args:
        r: correlation coefficient to detect.
        alpha: two-sided significance level.
        power: desired probability of detecting the correlation.
    """
    from scipy.stats import norm

    crit_sum = norm.ppf(1 - alpha / 2) + norm.ppf(power)

    # Fisher z transform of the target correlation.
    fisher_z = 0.5 * np.log((1 + r) / (1 - r))
    required = (crit_sum / fisher_z) ** 2 + 3

    return int(np.ceil(required))

# List the required n for a range of correlations, weakest first.
print("\nSample Size to Detect Correlation (alpha=0.05, power=0.80):")
for r in (0.05, 0.10, 0.15, 0.20, 0.30, 0.50):
    print(f"  r = {r}: n = {sample_size_correlation(r):,}")

Power Curves

python
def compute_power(effect_size, n, alpha=0.05):
    """Approximate the power of a two-sided, two-sample t-test.

    Uses the normal approximation power = Phi(|d| * sqrt(n / 2) - z_{1-alpha/2}),
    which keeps only the rejection tail on the side of the effect.

    Args:
        effect_size: standardized mean difference (Cohen's d). The sign is
            ignored because a two-sided test detects effects in either direction.
        n: sample size per group (equal groups).
        alpha: two-sided significance level.

    Returns:
        Power as a probability between 0 and 1. Scalars give a scalar;
        array inputs are handled elementwise.
    """
    from scipy.stats import norm
    z_alpha = norm.ppf(1 - alpha / 2)
    # Use the magnitude: a signed d (e.g. a negative value from cohens_d when the
    # second group has the larger mean) would otherwise give near-zero power.
    z_power = np.abs(effect_size) * np.sqrt(n / 2) - z_alpha
    return norm.cdf(z_power)

# Power curves for different sample sizes
# Left panel: power as a function of effect size, one curve per group size.
# Right panel: power as a function of group size, one curve per effect size.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_by_effect, ax_by_n = axes

# Power vs effect size
effect_sizes = np.linspace(0.05, 1.5, 100)
for n in [30, 50, 100, 200, 500, 1000]:
    curve = [compute_power(d, n) for d in effect_sizes]
    ax_by_effect.plot(effect_sizes, curve, label=f'n={n}', linewidth=2)

# Power vs sample size
sample_sizes = np.arange(10, 500, 5)
for d in [0.2, 0.3, 0.5, 0.8]:
    curve = [compute_power(d, n) for n in sample_sizes]
    ax_by_n.plot(sample_sizes, curve, label=f'd={d}', linewidth=2)

# Shared decoration: the 80% power reference line, labels, legend and grid.
# The left legend is shrunk because it holds more entries.
decorations = [
    (ax_by_effect, "Effect Size (Cohen's d)", 'Power vs Effect Size', {'fontsize': 8}),
    (ax_by_n, 'Sample Size per Group', 'Power vs Sample Size', {}),
]
for ax, x_label, title, legend_options in decorations:
    ax.axhline(y=0.80, color='red', linestyle='--', alpha=0.5, label='80% power')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Power')
    ax.set_title(title)
    ax.legend(**legend_options)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Minimum Detectable Effect (MDE)

python
def minimum_detectable_effect(n, alpha=0.05, power=0.80):
    """Return the smallest standardized effect detectable with ``n`` per group.

    Inverts the two-sample sample-size formula:
    MDE = (z_alpha + z_beta) * sqrt(2 / n).

    Args:
        n: sample size per group.
        alpha: two-sided significance level.
        power: desired probability of detecting the effect.
    """
    from scipy.stats import norm

    crit_sum = norm.ppf(1 - alpha / 2) + norm.ppf(power)
    return crit_sum * np.sqrt(2 / n)

# What can your current data detect?
# For each group size, show the MDE and bucket it by the 0.2 / 0.5 / 0.8 cut-offs.
print("Minimum Detectable Effect Size (alpha=0.05, power=0.80):")
print(f"{'n per group':>15} {'MDE (d)':>10} {'Interpretation':>20}")
print("-" * 50)
for n in [20, 50, 100, 200, 500, 1000, 5000, 10000]:
    mde = minimum_detectable_effect(n)
    if mde > 0.8:
        interp = 'very large'
    elif mde > 0.5:
        interp = 'large'
    elif mde > 0.2:
        interp = 'medium'
    else:
        interp = 'small'
    print(f"{n:>15,} {mde:>10.3f} {interp:>20}")

When Your Data is Too Small

python
def too_small_data_strategies(n, effect_size_needed):
    """Print a power diagnosis and mitigation options for a two-sample study.

    Reports the power achieved with ``n`` observations per group, the per-group
    size needed for 80% power at alpha=0.05, and a list of strategies for
    working with an undersized sample.

    Args:
        n: current sample size per group.
        effect_size_needed: standardized effect (Cohen's d) the study must detect.

    Returns:
        None. All output is printed.
    """
    actual_power = compute_power(effect_size_needed, n)
    n_needed = sample_size_ttest(effect_size_needed)[0]
    # Clamp at zero so an already-sufficient sample never reports a negative
    # shortfall; the same value is used in the summary and in strategy 1.
    shortfall = max(0, n_needed - n)

    print("INSUFFICIENT SAMPLE SIZE ANALYSIS")
    print(f"{'='*50}")
    print(f"Current n per group: {n}")
    print(f"Target effect size: d={effect_size_needed}")
    print(f"Current power: {actual_power:.1%}")
    print(f"Needed n per group: {n_needed:,}")
    print(f"Shortfall: {shortfall:,} additional per group")

    strategies = []

    if actual_power < 0.50:
        strategies.append("CRITICAL: Power < 50%. Results are essentially a coin flip.")
    elif actual_power < 0.80:
        strategies.append(f"UNDERPOWERED: Power = {actual_power:.0%}. High risk of false negatives.")

    # Raising alpha from 0.05 to 0.10 loosens the test, trading more false
    # positives for more power.
    relaxed_alpha_power = compute_power(effect_size_needed, n, 0.10)
    # compute_power takes its critical value at 1 - alpha/2, so alpha=0.10 uses
    # z_0.95, which is the critical value of a one-sided test at alpha=0.05.
    one_sided_power = compute_power(effect_size_needed, n, 0.10)

    strategies.extend([
        "",
        "STRATEGIES:",
        "1. Collect more data (if possible)",
        f"   Need {shortfall:,} more per group for 80% power",
        "",
        "2. Accept larger effect sizes only",
        f"   Current MDE: d={minimum_detectable_effect(n):.3f}",
        "",
        "3. Relax alpha (accept more false positives)",
        f"   At alpha=0.10: power={relaxed_alpha_power:.1%}",
        "",
        "4. Use one-sided test (if justified)",
        f"   One-sided power: {one_sided_power:.1%}",
        "",
        "5. Use Bayesian methods (credible intervals instead of p-values)",
        "",
        "6. Pool data across time periods or related subgroups",
        "",
        "7. Report confidence intervals instead of binary significance",
        "   Wide CIs honestly communicate uncertainty",
    ])

    for s in strategies:
        print(f"  {s}")

# Example: 50 observations per group when the effect to detect is d = 0.3.
too_small_data_strategies(n=50, effect_size_needed=0.3)

Statistical vs Practical Significance

python
def significance_analysis(group_a, group_b, practical_threshold=None):
    """Print a report separating statistical from practical significance.

    The p-value comes from a two-sided Mann-Whitney U test (rank-based), while
    the reported difference and its 95% confidence interval are for the
    difference in means, using a normal approximation (1.96 standard errors).

    Args:
        group_a: 1-D array-like of numeric observations for the first group.
        group_b: 1-D array-like of numeric observations for the second group.
        practical_threshold: smallest absolute mean difference that matters in
            practice, in the data's own units. ``None`` skips the
            practical-significance verdict; ``0`` is a valid threshold.

    Returns:
        None. All output is printed.
    """
    group_a = np.asarray(group_a, dtype=float)
    group_b = np.asarray(group_b, dtype=float)
    n_a, n_b = len(group_a), len(group_b)

    # Statistical significance. The alternative is spelled out so the result
    # does not depend on the installed SciPy version's default.
    _, p = stats.mannwhitneyu(group_a, group_b, alternative='two-sided')
    d = cohens_d(group_a, group_b)

    # Confidence interval for the difference in means. ddof=1 gives the sample
    # variance, which is what the standard error of a mean requires.
    diff = group_a.mean() - group_b.mean()
    se = np.sqrt(group_a.var(ddof=1) / n_a + group_b.var(ddof=1) / n_b)
    ci_lower = diff - 1.96 * se
    ci_upper = diff + 1.96 * se

    print("SIGNIFICANCE ANALYSIS")
    print("=" * 50)
    print(f"Group A: n={n_a}, mean={group_a.mean():.3f}")
    print(f"Group B: n={n_b}, mean={group_b.mean():.3f}")
    print(f"Difference: {diff:.3f} (95% CI: [{ci_lower:.3f}, {ci_upper:.3f}])")
    print(f"Cohen's d: {d:.3f}")
    print(f"p-value: {p:.6f}")

    # Classification. Compare against None so a threshold of 0 is still evaluated.
    stat_sig = p < 0.05
    if practical_threshold is not None:
        prac_sig = abs(diff) > practical_threshold
        print(f"\nPractical threshold: {practical_threshold}")
        print(f"Statistical significance: {'Yes' if stat_sig else 'No'}")
        print(f"Practical significance:   {'Yes' if prac_sig else 'No'}")

        if stat_sig and prac_sig:
            print("VERDICT: Real and meaningful difference")
        elif stat_sig and not prac_sig:
            print("VERDICT: Statistically significant but too small to matter")
            print("  (common with large samples)")
        elif not stat_sig and prac_sig:
            print("VERDICT: Possibly meaningful but not yet confirmed")
            print("  (may need more data)")
        else:
            print("VERDICT: No evidence of meaningful difference")

# Two contrasting scenarios, both judged against a practical threshold of 2.0
# (each entry is: mean of A, mean of B, observations per group; sd is 10).
scenarios = [
    # Large sample: statistically significant but trivially small
    (100.0, 100.3, 50000),
    # Small sample: a practically large difference that a small study may fail to confirm
    (100, 108, 20),
]
for position, (mean_a, mean_b, per_group) in enumerate(scenarios):
    if position:
        # Blank separator between the two reports.
        print("\n")
    sample_a = np.random.normal(mean_a, 10, per_group)
    sample_b = np.random.normal(mean_b, 10, per_group)
    significance_analysis(sample_a, sample_b, practical_threshold=2.0)

Power Analysis Decision Guide


Key Takeaways

  • Always compute power before interpreting "not significant" as "no effect" — the study may be underpowered
  • With large samples, even trivially small effects become statistically significant — focus on effect size and practical significance
  • With small samples, even sizeable effects can fail to reach significance — report confidence intervals and effect sizes instead
  • The minimum detectable effect (MDE) tells you the smallest effect your data can reliably find
  • For A/B tests, small lifts (e.g., 1pp on a 5% baseline) require thousands of observations per group — roughly 8,000 per group, about 16,000 in total, at alpha=0.05 and 80% power
  • Statistical significance (p < 0.05) does not mean practical significance (the effect matters in the real world)
  • When data is too small: use Bayesian methods, report CIs, pool data, or accept that you can only detect large effects
  • Power analysis should be done before collecting data, not after finding non-significant results

"What I cannot create, I do not understand." — Richard Feynman