Skip to content
Unverified — AI-generated content. Help verify this page

Project: Healthcare EDA

Healthcare data requires special care: strict privacy regulations, domain-specific distributions, survival outcomes, and ethical considerations. This project demonstrates a complete EDA pipeline for clinical data with emphasis on responsible analysis practices.


Ethical Framework

Key principles:

  • Never report individual patient records
  • Suppress cells with counts < 5 (statistical disclosure control)
  • Avoid combinations of quasi-identifiers that could re-identify patients
  • All dates should be shifted or generalized to year/quarter

Dataset Setup

python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

sns.set_theme(style='whitegrid')
np.random.seed(42)

n_patients = 10_000

# Simulated de-identified patient data
patients = pd.DataFrame({
    'patient_id': [f'P{i:06d}' for i in range(n_patients)],
    'age': np.random.normal(58, 15, n_patients).clip(18, 95).astype(int),
    'sex': np.random.choice(['M', 'F'], n_patients, p=[0.52, 0.48]),
    'bmi': np.random.normal(28, 5, n_patients).clip(15, 55).round(1),
    'systolic_bp': np.random.normal(130, 18, n_patients).clip(80, 220).astype(int),
    'diastolic_bp': np.random.normal(80, 12, n_patients).clip(50, 140).astype(int),
    'cholesterol': np.random.normal(200, 40, n_patients).clip(100, 400).astype(int),
    'glucose': np.random.lognormal(4.5, 0.3, n_patients).clip(60, 500).round(0),
    'hba1c': np.random.normal(6.2, 1.2, n_patients).clip(3.5, 14).round(1),
    'creatinine': np.random.lognormal(0.1, 0.4, n_patients).clip(0.4, 10).round(2),
    'hemoglobin': np.random.normal(13.5, 1.8, n_patients).clip(5, 20).round(1),
    'smoker': np.random.choice(['Never', 'Former', 'Current'], n_patients, p=[0.45, 0.35, 0.2]),
    'diabetes': np.random.choice([0, 1], n_patients, p=[0.7, 0.3]),
    'hypertension': np.random.choice([0, 1], n_patients, p=[0.6, 0.4]),
    'heart_disease': np.random.choice([0, 1], n_patients, p=[0.85, 0.15]),
    'admission_type': np.random.choice(['Emergency', 'Elective', 'Urgent'], n_patients, p=[0.4, 0.35, 0.25]),
    'los_days': np.random.lognormal(1.2, 0.8, n_patients).clip(0, 90).round(0).astype(int),
    'readmitted_30d': np.random.choice([0, 1], n_patients, p=[0.85, 0.15]),
    'mortality': np.random.choice([0, 1], n_patients, p=[0.92, 0.08]),
    'time_to_event_days': np.random.exponential(365, n_patients).clip(1, 1825).astype(int),
    'event_observed': np.random.choice([0, 1], n_patients, p=[0.7, 0.3]),
})

# Clinical logic: higher mortality for older, sicker patients
patients.loc[(patients['age'] > 75) & (patients['heart_disease'] == 1), 'mortality'] = \
    np.random.choice([0, 1], ((patients['age'] > 75) & (patients['heart_disease'] == 1)).sum(), p=[0.7, 0.3])

# Inject some missing values (realistic pattern)
for col in ['bmi', 'cholesterol', 'hba1c', 'hemoglobin']:
    mask = np.random.rand(n_patients) < 0.05
    patients.loc[mask, col] = np.nan

print(f"Patients: {n_patients:,}")
print(f"Columns: {patients.shape[1]}")

Demographics Analysis

python
# Age distribution by sex
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Age pyramid
age_bins = np.arange(15, 100, 5)
male_ages = patients[patients['sex'] == 'M']['age']
female_ages = patients[patients['sex'] == 'F']['age']

male_hist, _ = np.histogram(male_ages, bins=age_bins)
female_hist, _ = np.histogram(female_ages, bins=age_bins)

y_pos = range(len(age_bins) - 1)
labels = [f'{age_bins[i]}-{age_bins[i+1]-1}' for i in range(len(age_bins)-1)]

axes[0, 0].barh(y_pos, -male_hist, height=0.8, color='steelblue', label='Male')
axes[0, 0].barh(y_pos, female_hist, height=0.8, color='coral', label='Female')
axes[0, 0].set_yticks(y_pos)
axes[0, 0].set_yticklabels(labels)
axes[0, 0].set_title('Age-Sex Pyramid')
axes[0, 0].legend()
axes[0, 0].set_xlabel('Count')

# BMI distribution
bmi_data = patients['bmi'].dropna()
axes[0, 1].hist(bmi_data, bins=40, edgecolor='white', color='steelblue', alpha=0.7)
axes[0, 1].axvline(18.5, color='green', linestyle='--', label='Underweight')
axes[0, 1].axvline(25, color='orange', linestyle='--', label='Overweight')
axes[0, 1].axvline(30, color='red', linestyle='--', label='Obese')
axes[0, 1].set_title('BMI Distribution')
axes[0, 1].legend(fontsize=8)

# Comorbidity prevalence
comorbidities = ['diabetes', 'hypertension', 'heart_disease']
prevalence = [patients[c].mean() * 100 for c in comorbidities]
axes[1, 0].barh(comorbidities, prevalence, color='steelblue')
axes[1, 0].set_xlabel('Prevalence (%)')
axes[1, 0].set_title('Comorbidity Prevalence')
for i, v in enumerate(prevalence):
    axes[1, 0].text(v + 0.5, i, f'{v:.1f}%', va='center')

# Smoking status
smoke_counts = patients['smoker'].value_counts()
axes[1, 1].pie(smoke_counts.values, labels=smoke_counts.index,
               autopct='%1.1f%%', colors=['#16a34a', '#f59e0b', '#dc2626'])
axes[1, 1].set_title('Smoking Status')

plt.tight_layout()
plt.show()

Clinical Values Analysis

python
# Clinical reference ranges
reference_ranges = {
    'systolic_bp':  (90, 120, 'mmHg'),
    'diastolic_bp': (60, 80, 'mmHg'),
    'cholesterol':  (0, 200, 'mg/dL'),
    'glucose':      (70, 100, 'mg/dL'),
    'hba1c':        (4.0, 5.7, '%'),
    'creatinine':   (0.6, 1.2, 'mg/dL'),
    'hemoglobin':   (12.0, 17.5, 'g/dL'),
}

fig, axes = plt.subplots(2, 4, figsize=(20, 8))
axes = axes.flatten()

for i, (col, (low, high, unit)) in enumerate(reference_ranges.items()):
    if i >= len(axes):
        break
    ax = axes[i]
    data = patients[col].dropna()

    ax.hist(data, bins=40, edgecolor='white', alpha=0.7, color='steelblue')
    ax.axvline(low, color='green', linestyle='--', linewidth=2, label=f'Normal low ({low})')
    ax.axvline(high, color='red', linestyle='--', linewidth=2, label=f'Normal high ({high})')

    pct_abnormal = ((data < low) | (data > high)).mean() * 100
    ax.set_title(f'{col} ({pct_abnormal:.0f}% abnormal)')
    ax.set_xlabel(unit)
    ax.legend(fontsize=7)

axes[-1].set_visible(False)
plt.tight_layout()
plt.show()

Outcome Analysis

python
# Mortality analysis
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Mortality rate by age group
patients['age_group'] = pd.cut(patients['age'], bins=[18, 30, 45, 60, 75, 95],
                                labels=['18-29', '30-44', '45-59', '60-74', '75+'])
mort_by_age = patients.groupby('age_group')['mortality'].mean() * 100
axes[0, 0].bar(mort_by_age.index.astype(str), mort_by_age.values, color='coral')
axes[0, 0].set_title('Mortality Rate by Age Group')
axes[0, 0].set_ylabel('Mortality Rate (%)')
for i, v in enumerate(mort_by_age.values):
    axes[0, 0].text(i, v + 0.3, f'{v:.1f}%', ha='center')

# LOS distribution
axes[0, 1].hist(patients['los_days'], bins=50, edgecolor='white', color='steelblue')
axes[0, 1].axvline(patients['los_days'].median(), color='red', linestyle='--',
                    label=f"Median: {patients['los_days'].median():.0f} days")
axes[0, 1].set_title('Length of Stay Distribution')
axes[0, 1].set_xlabel('Days')
axes[0, 1].legend()

# Readmission by admission type
readmit_by_type = patients.groupby('admission_type')['readmitted_30d'].mean() * 100
axes[1, 0].barh(readmit_by_type.index, readmit_by_type.values, color='steelblue')
axes[1, 0].set_xlabel('30-Day Readmission Rate (%)')
axes[1, 0].set_title('Readmission by Admission Type')

# Mortality by comorbidity count
patients['n_comorbidities'] = (
    patients['diabetes'] + patients['hypertension'] + patients['heart_disease']
)
mort_by_comorbid = patients.groupby('n_comorbidities')['mortality'].mean() * 100
axes[1, 1].bar(mort_by_comorbid.index, mort_by_comorbid.values, color='coral')
axes[1, 1].set_title('Mortality by Comorbidity Count')
axes[1, 1].set_xlabel('Number of Comorbidities')
axes[1, 1].set_ylabel('Mortality Rate (%)')

plt.tight_layout()
plt.show()

Survival Analysis

python
# Kaplan-Meier survival estimation (manual implementation)
def kaplan_meier(time, event):
    """Compute Kaplan-Meier survival curve."""
    df_km = pd.DataFrame({'time': time, 'event': event}).sort_values('time')

    unique_times = df_km['time'].unique()
    n_at_risk = len(df_km)
    survival = 1.0
    results = [{'time': 0, 'survival': 1.0, 'n_at_risk': n_at_risk}]

    for t in sorted(unique_times):
        at_time = df_km[df_km['time'] == t]
        d_i = at_time['event'].sum()    # events at time t
        c_i = (at_time['event'] == 0).sum()  # censored at time t

        if n_at_risk > 0 and d_i > 0:
            survival *= (1 - d_i / n_at_risk)

        results.append({
            'time': t,
            'survival': survival,
            'n_at_risk': n_at_risk,
            'events': d_i,
        })

        n_at_risk -= (d_i + c_i)

    return pd.DataFrame(results)

# Overall survival
km_all = kaplan_meier(patients['time_to_event_days'], patients['event_observed'])

# Survival by heart disease status
km_hd = kaplan_meier(
    patients[patients['heart_disease'] == 1]['time_to_event_days'],
    patients[patients['heart_disease'] == 1]['event_observed']
)
km_no_hd = kaplan_meier(
    patients[patients['heart_disease'] == 0]['time_to_event_days'],
    patients[patients['heart_disease'] == 0]['event_observed']
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.step(km_all['time'], km_all['survival'], where='post', label='Overall', color='black', linewidth=2)
ax.step(km_hd['time'], km_hd['survival'], where='post', label='Heart Disease', color='red')
ax.step(km_no_hd['time'], km_no_hd['survival'], where='post', label='No Heart Disease', color='green')
ax.set_xlabel('Days')
ax.set_ylabel('Survival Probability')
ax.set_title('Kaplan-Meier Survival Curves')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_xlim(0, 1825)
ax.set_ylim(0, 1.05)
plt.tight_layout()
plt.show()

# Log-rank test (manual)
from scipy.stats import chi2

def log_rank_test(time1, event1, time2, event2):
    """Simplified log-rank test."""
    all_times = np.unique(np.concatenate([time1[event1 == 1], time2[event2 == 1]]))
    O1, E1 = 0, 0

    for t in all_times:
        n1 = (time1 >= t).sum()
        n2 = (time2 >= t).sum()
        d1 = ((time1 == t) & (event1 == 1)).sum()
        d2 = ((time2 == t) & (event2 == 1)).sum()
        n = n1 + n2
        d = d1 + d2

        if n > 0:
            e1 = d * n1 / n
            O1 += d1
            E1 += e1

    V = E1 * (1 - E1 / O1) if O1 > 0 else 0.001
    chi2_stat = (O1 - E1)**2 / max(V, 0.001)
    p_value = 1 - chi2.cdf(chi2_stat, df=1)
    return chi2_stat, p_value

stat, p = log_rank_test(
    patients[patients['heart_disease'] == 1]['time_to_event_days'].values,
    patients[patients['heart_disease'] == 1]['event_observed'].values,
    patients[patients['heart_disease'] == 0]['time_to_event_days'].values,
    patients[patients['heart_disease'] == 0]['event_observed'].values,
)
print(f"Log-rank test: chi2={stat:.3f}, p={p:.4f}")

Comorbidity Analysis

python
# Comorbidity co-occurrence
comorbid_cols = ['diabetes', 'hypertension', 'heart_disease']
comorbid_matrix = patients[comorbid_cols].T.dot(patients[comorbid_cols])

# Normalize by total patients
comorbid_pct = (comorbid_matrix / n_patients * 100).round(1)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(comorbid_pct, annot=True, fmt='.1f', cmap='Reds', ax=ax)
ax.set_title('Comorbidity Co-occurrence (% of patients)')
plt.tight_layout()
plt.show()

# Odds ratio: diabetes and heart disease
a = ((patients['diabetes'] == 1) & (patients['heart_disease'] == 1)).sum()
b = ((patients['diabetes'] == 1) & (patients['heart_disease'] == 0)).sum()
c = ((patients['diabetes'] == 0) & (patients['heart_disease'] == 1)).sum()
d = ((patients['diabetes'] == 0) & (patients['heart_disease'] == 0)).sum()

odds_ratio = (a * d) / (b * c)
se_log_or = np.sqrt(1/a + 1/b + 1/c + 1/d)
ci_lower = np.exp(np.log(odds_ratio) - 1.96 * se_log_or)
ci_upper = np.exp(np.log(odds_ratio) + 1.96 * se_log_or)

print(f"\nDiabetes-Heart Disease Association:")
print(f"  Odds Ratio: {odds_ratio:.3f}")
print(f"  95% CI: ({ci_lower:.3f}, {ci_upper:.3f})")

Risk Factor Analysis

python
# Compare clinical values: mortality vs survival
outcomes = {0: 'Survived', 1: 'Died'}
clinical_cols = ['age', 'bmi', 'systolic_bp', 'cholesterol', 'glucose', 'hba1c', 'creatinine']

fig, axes = plt.subplots(2, 4, figsize=(20, 8))
axes = axes.flatten()

for i, col in enumerate(clinical_cols):
    survived = patients[patients['mortality'] == 0][col].dropna()
    died = patients[patients['mortality'] == 1][col].dropna()

    axes[i].hist(survived, bins=30, alpha=0.5, label='Survived', color='green', density=True)
    axes[i].hist(died, bins=30, alpha=0.5, label='Died', color='red', density=True)

    # Statistical test
    stat, p = stats.mannwhitneyu(survived, died, alternative='two-sided')
    d = (survived.mean() - died.mean()) / np.sqrt((survived.std()**2 + died.std()**2) / 2)

    axes[i].set_title(f'{col}\np={p:.3f}, d={d:+.2f}')
    axes[i].legend(fontsize=8)

axes[-1].set_visible(False)
plt.suptitle('Clinical Values: Survived vs Died', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

Privacy-Preserving Reporting

python
def safe_report(df, group_col, value_col, min_cell_size=5):
    """Generate a report that suppresses small cell counts."""
    grouped = df.groupby(group_col)[value_col].agg(['count', 'mean', 'std', 'median'])

    # Suppress cells with count < min_cell_size
    mask = grouped['count'] < min_cell_size
    if mask.any():
        print(f"WARNING: {mask.sum()} groups suppressed (n < {min_cell_size})")
        grouped.loc[mask, ['mean', 'std', 'median']] = np.nan
        grouped.loc[mask, 'count'] = '<5'

    return grouped

# Safe reporting example
report = safe_report(patients, 'age_group', 'los_days')
print("\nLength of Stay by Age Group (privacy-preserving):")
print(report)

Key Takeaways

  • Ethics first: always confirm data is de-identified and analysis is approved before starting EDA
  • Clinical reference ranges provide natural thresholds for flagging abnormal values — more meaningful than statistical outliers
  • Survival analysis (Kaplan-Meier, log-rank test) is the correct framework for time-to-event data with censoring
  • Comorbidity co-occurrence and odds ratios reveal disease relationships beyond simple prevalence
  • Privacy-preserving reporting with minimum cell size suppression prevents re-identification
  • Always stratify outcomes by age and sex before drawing conclusions — these are the strongest confounders in healthcare
  • Length of stay is typically right-skewed (lognormal) — use median, not mean, for central tendency
  • Domain knowledge is essential: a statistically significant finding may be clinically irrelevant, and vice versa

"What I cannot create, I do not understand." — Richard Feynman