Skip to content
Unverified — AI-generated content. Help verify this page

Streamlit EDA App

This page contains a complete, working Streamlit EDA application. Copy the code below into eda_app.py and run with streamlit run eda_app.py. It provides a full interactive pipeline: upload data, inspect types, view statistics, analyze missing data, explore distributions, examine correlations, and export results.


Application Architecture


Complete Application Code

python
"""
Streamlit EDA App — Complete Interactive Explorer
Run: streamlit run eda_app.py
"""

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from io import BytesIO

# ─────────────────────────────────────────────
# Page Configuration
# ─────────────────────────────────────────────
# Browser-tab title and icon, wide layout, sidebar open on first load.
_PAGE_SETTINGS = dict(
    page_title="Interactive EDA Tool",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS: a light card background for metrics and spacing/rounding for tabs.
_CUSTOM_CSS = """
<style>
    .stMetric { background-color: #f8fafc; padding: 10px; border-radius: 8px; }
    .stTabs [data-baseweb="tab-list"] { gap: 8px; }
    .stTabs [data-baseweb="tab"] {
        padding: 8px 16px;
        border-radius: 4px;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)


# ─────────────────────────────────────────────
# Caching
# ─────────────────────────────────────────────
@st.cache_data
def load_csv(file):
    """Parse *file* as CSV with ``pd.read_csv`` and return the DataFrame."""
    frame = pd.read_csv(file)
    return frame

@st.cache_data
def load_excel(file):
    """Parse an Excel workbook (first sheet) and return it as a DataFrame.

    The reader engine is deliberately not pinned. Forcing ``openpyxl`` makes
    legacy ``.xls`` workbooks fail, because openpyxl only reads the XML-based
    formats. With ``engine`` left unset, pandas chooses the reader from the
    file content: openpyxl for ``.xlsx``, and the optional ``xlrd`` package
    for ``.xls``.
    """
    return pd.read_excel(file)

@st.cache_data
def compute_profile(df):
    """Build a dictionary of summary statistics for *df*.

    Keys: ``describe``, ``dtypes``, ``missing``, ``missing_pct``, ``nunique``,
    ``skewness``, ``kurtosis`` and, only when at least two numeric columns
    exist, ``correlation``. ``skewness`` and ``kurtosis`` are empty dicts when
    the frame has no numeric columns.
    """
    numeric_part = df.select_dtypes(include='number')
    has_numeric = len(numeric_part.columns) > 0

    summary = {}
    summary['describe'] = df.describe(include='all').round(3)
    summary['dtypes'] = df.dtypes.astype(str).to_dict()
    summary['missing'] = df.isna().sum().to_dict()
    summary['missing_pct'] = (df.isna().mean() * 100).round(2).to_dict()
    summary['nunique'] = df.nunique().to_dict()

    # Shape statistics are only defined for numeric columns.
    if has_numeric:
        summary['skewness'] = numeric_part.skew().round(3).to_dict()
        summary['kurtosis'] = numeric_part.kurtosis().round(3).to_dict()
    else:
        summary['skewness'] = {}
        summary['kurtosis'] = {}

    # A correlation matrix needs at least one pair of numeric columns.
    if len(numeric_part.columns) >= 2:
        summary['correlation'] = numeric_part.corr().round(3)

    return summary


# ─────────────────────────────────────────────
# Sidebar: File Upload & Global Controls
# ─────────────────────────────────────────────
# Sidebar: choose the data source (upload or synthetic sample), then
# optionally down-sample the rows. Leaves `df` set to a DataFrame, or to None
# when "Upload File" is selected and nothing has been uploaded yet.
with st.sidebar:
    st.header("Data Source")
    upload_method = st.radio("Load data from:", ["Upload File", "Sample Dataset"])

    if upload_method == "Upload File":
        uploaded = st.file_uploader("Upload CSV or Excel", type=['csv', 'xlsx', 'xls'])
        if uploaded is None:
            df = None
        elif uploaded.name.lower().endswith('.csv'):
            # Compare the extension case-insensitively so that a file named
            # e.g. "DATA.CSV" is not mistakenly routed to the Excel loader.
            df = load_csv(uploaded)
        else:
            df = load_excel(uploaded)
    else:
        dataset_choice = st.selectbox("Choose sample", [
            "E-Commerce Orders", "Customer Churn", "Sales Performance"
        ])
        # Fixed seed: the synthetic data is regenerated on every rerun and
        # must come out identical each time.
        np.random.seed(42)
        n = 5000
        if dataset_choice == "E-Commerce Orders":
            df = pd.DataFrame({
                'order_id': range(1, n+1),
                'customer_id': np.random.randint(100, 2000, n),
                'order_date': pd.date_range('2023-01-01', periods=n, freq='2h'),
                'product': np.random.choice(['Widget', 'Gadget', 'Doohickey', 'Gizmo'], n),
                'category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Books'], n),
                'quantity': np.random.randint(1, 10, n),
                'unit_price': np.round(np.random.lognormal(3, 0.8, n), 2),
                'region': np.random.choice(['North', 'South', 'East', 'West'], n),
                'discount': np.random.choice([0, 0.05, 0.1, 0.15, 0.2], n),
            })
            # `total` is derived before the gaps are injected, so it stays
            # complete even where its inputs are later blanked out.
            df['total'] = (df['quantity'] * df['unit_price'] * (1 - df['discount'])).round(2)
            # Inject some missing values (about 3% of rows per column)
            for col in ['unit_price', 'region', 'discount']:
                mask = np.random.rand(n) < 0.03
                df.loc[mask, col] = np.nan
        elif dataset_choice == "Customer Churn":
            df = pd.DataFrame({
                'customer_id': range(1, n+1),
                'age': np.random.normal(42, 12, n).clip(18, 85).astype(int),
                'income': np.round(np.random.lognormal(10.8, 0.7, n), 0),
                'tenure_months': np.random.exponential(24, n).clip(1, 120).astype(int),
                'n_products': np.random.poisson(3, n).clip(1, 10),
                'credit_score': np.random.normal(680, 70, n).clip(300, 850).astype(int),
                'region': np.random.choice(['North', 'South', 'East', 'West'], n),
                'plan': np.random.choice(['Basic', 'Standard', 'Premium'], n, p=[0.4, 0.4, 0.2]),
                'churned': np.random.choice([0, 1], n, p=[0.82, 0.18]),
            })
        else:
            # "Sales Performance"
            df = pd.DataFrame({
                'rep_id': np.random.randint(1, 50, n),
                'date': pd.date_range('2023-01-01', periods=n, freq='3h'),
                'region': np.random.choice(['North', 'South', 'East', 'West'], n),
                'deal_size': np.round(np.random.lognormal(8, 1.2, n), 2),
                'stage': np.random.choice(['Lead', 'Qualified', 'Proposal', 'Closed Won', 'Closed Lost'], n),
                'days_in_pipeline': np.random.exponential(30, n).astype(int),
                'customer_type': np.random.choice(['New', 'Existing', 'Upsell'], n),
            })

    if df is not None:
        st.divider()
        st.header("Filters")
        # Row sampling: at 100% the frame is left untouched.
        sample_pct = st.slider("Sample %", 10, 100, 100, 10)
        if sample_pct < 100:
            df = df.sample(frac=sample_pct/100, random_state=42)
            st.info(f"Sampled to {len(df):,} rows")

# ─────────────────────────────────────────────
# Main Content
# ─────────────────────────────────────────────
st.title("Interactive EDA Explorer")

# Without a dataset there is nothing to analyze: show a hint and end this run.
if df is None:
    st.info("Upload a dataset or select a sample from the sidebar to begin.")
    st.stop()

profile = compute_profile(df)

# Column names grouped by dtype family; the tabs below pick from these lists.
numeric_cols = list(df.select_dtypes(include='number').columns)
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)
datetime_cols = list(df.select_dtypes(include='datetime').columns)

# Tab navigation
_TAB_LABELS = [
    "Overview", "Missing Data", "Distributions",
    "Correlations", "Categorical", "Bivariate", "Export",
]
tabs = st.tabs(_TAB_LABELS)


# ─────────────────────────────────── TAB 1: Overview
with tabs[0]:
    st.header("Dataset Overview")

    # Key metrics: one headline figure per slot of a five-column row.
    row_count, column_count = df.shape
    headline_metrics = [
        ("Rows", f"{row_count:,}"),
        ("Columns", column_count),
        ("Numeric", len(numeric_cols)),
        ("Categorical", len(categorical_cols)),
        ("Missing Cells", f"{df.isna().sum().sum():,}"),
    ]
    for slot, (label, value) in zip(st.columns(5), headline_metrics):
        slot.metric(label, value)

    # Memory usage (deep=True also counts the contents of object columns)
    total_bytes = df.memory_usage(deep=True).sum()
    st.metric("Memory Usage", f"{total_bytes / 1024**2:.2f} MB")

    # Data types on the left, descriptive statistics on the right
    types_pane, stats_pane = st.columns(2)

    with types_pane:
        st.subheader("Column Types")
        null_share = df.isna().mean() * 100
        dtype_df = pd.DataFrame({
            'Column': df.columns,
            'Type': df.dtypes.astype(str).values,
            'Non-Null': df.notna().sum().values,
            'Null %': null_share.round(1).values,
            'Unique': df.nunique().values,
        })
        st.dataframe(dtype_df, use_container_width=True, hide_index=True)

    with stats_pane:
        st.subheader("Descriptive Statistics")
        st.dataframe(profile['describe'], use_container_width=True)

    # Sample data, collapsed by default
    raw_view = st.expander("View Raw Data", expanded=False)
    with raw_view:
        st.dataframe(df, use_container_width=True, height=400)


# ─────────────────────────────────── TAB 2: Missing Data
with tabs[1]:
    st.header("Missing Data Analysis")

    # One row per column, ordered from most to least incomplete.
    null_mask = df.isna()
    missing_df = pd.DataFrame({
        'Column': df.columns,
        'Missing': null_mask.sum().values,
        'Percent': (null_mask.mean() * 100).round(2).values,
        'Type': df.dtypes.astype(str).values,
    })
    missing_df = missing_df.sort_values('Percent', ascending=False)

    has_missing = missing_df.loc[missing_df['Missing'] > 0]
    affected_columns = has_missing['Column'].tolist()

    if not affected_columns:
        st.success("No missing values found.")
    else:
        st.warning(f"{len(has_missing)} columns have missing values")

        # Bar chart of missing percentages
        bar_fig = px.bar(
            has_missing,
            x='Column',
            y='Percent',
            color='Percent',
            color_continuous_scale='Reds',
            title='Missing Data by Column',
            labels={'Percent': 'Missing %'},
        )
        bar_fig.update_layout(height=400)
        st.plotly_chart(bar_fig, use_container_width=True)

        # Missing data pattern (nullity matrix) over at most 200 sampled rows
        st.subheader("Missing Data Pattern")
        pattern_rows = min(200, len(df))
        sample_for_pattern = df[affected_columns].sample(pattern_rows, random_state=42)
        nullity = sample_for_pattern.isna().astype(int).T
        matrix_fig = px.imshow(
            nullity,
            color_continuous_scale=['white', 'crimson'],
            title='Nullity Matrix (white=present, red=missing)',
            labels={'x': 'Row', 'y': 'Column'},
            aspect='auto',
        )
        matrix_fig.update_layout(height=300)
        st.plotly_chart(matrix_fig, use_container_width=True)

        # Missing data table
        st.dataframe(has_missing, use_container_width=True, hide_index=True)


# ─────────────────────────────────── TAB 3: Distributions
def _count_iqr_outliers(values):
    """Return how many entries of *values* fall outside the 1.5 * IQR fences."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    reach = 1.5 * (q3 - q1)
    return int(((values < q1 - reach) | (values > q3 + reach)).sum())


def _qq_plot_points(values, max_points=500):
    """Return ``(theoretical, observed)`` arrays for a normal QQ plot.

    The observed values are sorted and thinned to at most *max_points* evenly
    spaced order statistics. The theoretical values are quantiles of a normal
    distribution with the sample's mean and standard deviation, evaluated at
    the same number of probabilities between 0.001 and 0.999.
    """
    observed = np.sort(values.values)
    if len(observed) > max_points:
        picks = np.linspace(0, len(observed) - 1, max_points).astype(int)
        observed = observed[picks]
    probabilities = np.linspace(0.001, 0.999, len(observed))
    theoretical = stats.norm.ppf(probabilities, values.mean(), values.std())
    return theoretical, observed


with tabs[2]:
    st.header("Distribution Explorer")

    if not numeric_cols:
        st.warning("No numeric columns found.")
    else:
        col_select = st.selectbox("Select Numeric Column", numeric_cols, key='dist_col')
        series = df[col_select].dropna()

        if series.empty:
            # A column with only missing values has nothing to summarise;
            # the QQ plot and the normality test would raise on empty input.
            st.warning(f"Column '{col_select}' has no non-missing values to analyze.")
        else:
            # Stats summary
            c1, c2, c3, c4, c5, c6 = st.columns(6)
            c1.metric("Mean", f"{series.mean():.3f}")
            c2.metric("Median", f"{series.median():.3f}")
            c3.metric("Std Dev", f"{series.std():.3f}")
            c4.metric("Skewness", f"{series.skew():.3f}")
            c5.metric("Kurtosis", f"{series.kurtosis():.3f}")
            c6.metric("Outliers (IQR)", f"{_count_iqr_outliers(series)}")

            plot_col1, plot_col2 = st.columns(2)

            with plot_col1:
                n_bins = st.slider("Bins", 10, 200, 50, key='hist_bins')
                fig = px.histogram(
                    df, x=col_select, nbins=n_bins,
                    marginal='box',
                    title=f'{col_select} Distribution',
                )
                mean_val = series.mean()
                median_val = series.median()
                fig.add_vline(x=mean_val, line_dash='dash', line_color='red',
                              annotation_text=f'Mean: {mean_val:.2f}')
                fig.add_vline(x=median_val, line_dash='dash', line_color='green',
                              annotation_text=f'Median: {median_val:.2f}')
                st.plotly_chart(fig, use_container_width=True)

            with plot_col2:
                # QQ plot against a normal with the sample's mean and std
                theoretical_vals, sorted_data = _qq_plot_points(series)

                fig = go.Figure()
                fig.add_trace(go.Scatter(x=theoretical_vals, y=sorted_data,
                                          mode='markers', marker=dict(size=3, opacity=0.5),
                                          name='Data'))
                # y = x reference line spanning both axes' ranges
                min_v = min(theoretical_vals.min(), sorted_data.min())
                max_v = max(theoretical_vals.max(), sorted_data.max())
                fig.add_trace(go.Scatter(x=[min_v, max_v], y=[min_v, max_v],
                                          mode='lines', line=dict(color='red', dash='dash'),
                                          name='Normal Reference'))
                fig.update_layout(title=f'QQ Plot: {col_select}',
                                  xaxis_title='Theoretical Quantiles',
                                  yaxis_title='Sample Quantiles')
                st.plotly_chart(fig, use_container_width=True)

            # Normality test on at most the first 5000 values
            sample_for_test = series.values[:5000]
            if len(sample_for_test) < 3:
                # scipy's Shapiro-Wilk implementation rejects fewer than 3 points.
                st.info("Shapiro-Wilk test skipped: at least 3 values are required.")
            else:
                shapiro_stat, shapiro_p = stats.shapiro(sample_for_test)
                if shapiro_p > 0.05:
                    st.success(f"Shapiro-Wilk: W={shapiro_stat:.4f}, p={shapiro_p:.4f} — Data appears normally distributed")
                else:
                    st.warning(f"Shapiro-Wilk: W={shapiro_stat:.4f}, p={shapiro_p:.4f} — Data is NOT normally distributed")

            # Percentile table
            with st.expander("Percentile Table"):
                pcts = [1, 5, 10, 25, 50, 75, 90, 95, 99]
                pct_vals = series.quantile([p/100 for p in pcts]).values
                pct_df = pd.DataFrame({'Percentile': [f'P{p}' for p in pcts], 'Value': pct_vals.round(3)})
                st.dataframe(pct_df, use_container_width=True, hide_index=True)

        # All distributions overview: a grid of up to three histograms per row
        if st.checkbox("Show All Distributions", key='all_dist'):
            n_cols_plot = min(3, len(numeric_cols))
            n_rows_plot = (len(numeric_cols) + n_cols_plot - 1) // n_cols_plot
            fig = make_subplots(rows=n_rows_plot, cols=n_cols_plot,
                                subplot_titles=numeric_cols)
            for idx, col in enumerate(numeric_cols):
                # Subplot positions are 1-based.
                r = idx // n_cols_plot + 1
                c = idx % n_cols_plot + 1
                fig.add_trace(go.Histogram(x=df[col].dropna(), nbinsx=40, name=col,
                                            showlegend=False), row=r, col=c)
            fig.update_layout(height=300*n_rows_plot, title='All Numeric Distributions')
            st.plotly_chart(fig, use_container_width=True)


# ─────────────────────────────────── TAB 4: Correlations
with tabs[3]:
    st.header("Correlation Analysis")

    if len(numeric_cols) >= 2:
        corr_method = st.selectbox("Method", ['pearson', 'spearman', 'kendall'], key='corr_method')
        corr = df[numeric_cols].corr(method=corr_method)

        # Heatmap, growing in height with the number of numeric columns
        heatmap_height = max(500, len(numeric_cols) * 60)
        fig = px.imshow(
            corr,
            text_auto='.2f',
            color_continuous_scale='RdBu_r',
            zmin=-1,
            zmax=1,
            title=f'{corr_method.title()} Correlation Matrix',
            aspect='equal',
        )
        fig.update_layout(height=heatmap_height)
        st.plotly_chart(fig, use_container_width=True)

        # Top correlations table: every unordered pair (upper triangle only),
        # ranked by absolute strength.
        st.subheader("Top Correlations")
        size = len(corr)
        index_pairs = [(i, j) for i in range(size) for j in range(i + 1, size)]
        pair_records = [
            {
                'Feature 1': corr.columns[i],
                'Feature 2': corr.columns[j],
                'Correlation': corr.iloc[i, j],
                'Abs Correlation': abs(corr.iloc[i, j]),
            }
            for i, j in index_pairs
        ]
        corr_df = pd.DataFrame(pair_records)
        corr_df = corr_df.sort_values('Abs Correlation', ascending=False)
        top_pairs = corr_df.head(20).round(4)
        st.dataframe(top_pairs, use_container_width=True, hide_index=True)
    else:
        st.warning("Need at least 2 numeric columns for correlation analysis.")


# ─────────────────────────────────── TAB 5: Categorical
with tabs[4]:
    st.header("Categorical Analysis")

    if not categorical_cols:
        st.warning("No categorical columns found.")
    else:
        cat_col = st.selectbox("Select Column", categorical_cols, key='cat_col')
        # Frequency of each value, most common first.
        vc = df[cat_col].value_counts()

        c1, c2 = st.columns(2)
        with c1:
            fig = px.bar(x=vc.index[:20], y=vc.values[:20],
                         title=f'{cat_col} — Top 20 Values',
                         labels={'x': cat_col, 'y': 'Count'})
            st.plotly_chart(fig, use_container_width=True)

        with c2:
            fig = px.pie(values=vc.values[:10], names=vc.index[:10],
                         title=f'{cat_col} — Composition (Top 10)',
                         hole=0.4)
            st.plotly_chart(fig, use_container_width=True)

        # Categorical vs numeric
        if numeric_cols:
            st.subheader("Category vs Numeric")
            num_col = st.selectbox("Numeric Column", numeric_cols, key='cat_num')
            chart = st.radio("Chart", ['Box', 'Violin', 'Strip'], key='cat_chart', horizontal=True)
            chart_title = f'{num_col} by {cat_col}'

            if chart == 'Box':
                fig = px.box(df, x=cat_col, y=num_col, color=cat_col, title=chart_title)
            elif chart == 'Violin':
                fig = px.violin(df, x=cat_col, y=num_col, color=cat_col, box=True, title=chart_title)
            else:
                # Cap the strip chart at 1000 points. The seed is fixed so the
                # same points are drawn on every rerun instead of reshuffling
                # on each widget interaction; this matches the other sampled
                # views in the app.
                strip_sample = df.sample(min(1000, len(df)), random_state=42)
                fig = px.strip(strip_sample, x=cat_col, y=num_col, color=cat_col, title=chart_title)
            st.plotly_chart(fig, use_container_width=True)

            # Group statistics of the numeric column within each category
            group_stats = df.groupby(cat_col)[num_col].agg(['count', 'mean', 'median', 'std']).round(3)
            st.dataframe(group_stats, use_container_width=True)


# ─────────────────────────────────── TAB 6: Bivariate
import importlib.util

with tabs[5]:
    st.header("Bivariate Explorer")

    if len(numeric_cols) < 2:
        st.warning("Need at least 2 numeric columns.")
    else:
        bv1, bv2 = st.columns(2)
        with bv1:
            x_col = st.selectbox("X Axis", numeric_cols, index=0, key='bv_x')
        with bv2:
            y_col = st.selectbox("Y Axis", numeric_cols, index=min(1, len(numeric_cols)-1), key='bv_y')

        color_col = st.selectbox("Color by (optional)",
                                  ['None'] + categorical_cols, key='bv_color')

        color = None if color_col == 'None' else color_col

        # The slider's lower bound is 100, so it is only a valid widget when
        # the data has more rows than that; smaller datasets are plotted whole.
        n_rows = len(df)
        if n_rows > 100:
            sample_size = min(
                st.slider("Sample size", 100, n_rows, min(2000, n_rows), key='bv_sample'),
                n_rows,
            )
        else:
            sample_size = n_rows
        sample = df.sample(sample_size, random_state=42)

        # Plotly Express delegates OLS trendlines to statsmodels, which this
        # app does not otherwise depend on; draw the scatter without a
        # trendline when the package is not installed instead of failing.
        has_statsmodels = importlib.util.find_spec('statsmodels') is not None
        fig = px.scatter(
            sample, x=x_col, y=y_col, color=color,
            opacity=0.5, trendline='ols' if has_statsmodels else None,
            title=f'{x_col} vs {y_col}',
            marginal_x='histogram', marginal_y='histogram',
        )
        fig.update_layout(height=600)
        st.plotly_chart(fig, use_container_width=True)
        if not has_statsmodels:
            st.caption("Install the statsmodels package to overlay an OLS trendline.")

        # Correlation stats over the rows where both variables are present
        both_present = df[x_col].notna() & df[y_col].notna()
        x_data = df.loc[both_present, x_col]
        y_data = df.loc[both_present, y_col]

        if len(x_data) < 2:
            # pearsonr/spearmanr cannot be computed from fewer than two pairs.
            st.info("Need at least 2 rows with both values present to compute correlations.")
        else:
            r_pearson, p_pearson = stats.pearsonr(x_data, y_data)
            r_spearman, p_spearman = stats.spearmanr(x_data, y_data)

            mc1, mc2, mc3, mc4 = st.columns(4)
            mc1.metric("Pearson r", f"{r_pearson:.4f}")
            mc2.metric("Pearson p", f"{p_pearson:.4f}")
            mc3.metric("Spearman rho", f"{r_spearman:.4f}")
            mc4.metric("Spearman p", f"{p_spearman:.4f}")


# ─────────────────────────────────── TAB 7: Export
with tabs[6]:
    st.header("Export Analysis")

    # Summary report
    st.subheader("Download Data")
    csv_pane, excel_pane = st.columns(2)

    with csv_pane:
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            "Download CSV",
            data=csv,
            file_name="eda_data.csv",
            mime="text/csv",
        )

    with excel_pane:
        # Build the workbook in memory: the data on one sheet, the
        # descriptive statistics on another.
        buffer = BytesIO()
        with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='Data')
            profile['describe'].to_excel(writer, sheet_name='Statistics')
        workbook_bytes = buffer.getvalue()
        st.download_button(
            "Download Excel",
            data=workbook_bytes,
            file_name="eda_report.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )

    # Text report
    st.subheader("Text Summary")
    n_rows, n_columns = df.shape
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    missing_cells = df.isna().sum().sum()
    duplicate_rows = df.duplicated().sum()
    report_lines = [
        "EDA Report",
        '=' * 40,
        f"Shape: {n_rows:,} rows x {n_columns} columns",
        f"Memory: {memory_mb:.2f} MB",
        f"Missing cells: {missing_cells:,}",
        f"Duplicate rows: {duplicate_rows:,}",
        "",
        "Numeric columns: " + ', '.join(numeric_cols),
        "Categorical columns: " + ', '.join(categorical_cols),
    ]
    st.code('\n'.join(report_lines), language='text')

# Footer
st.divider()
st.caption("Built with Streamlit. Upload your own CSV/Excel file to explore.")

Running the App

bash
# Install dependencies
# (statsmodels is required by the Bivariate tab's trendline='ols' scatter plot)
pip install streamlit pandas numpy plotly scipy openpyxl statsmodels

# Run the app
streamlit run eda_app.py

# Run with custom port
streamlit run eda_app.py --server.port 8080

Features Summary

| Feature | Description |
| --- | --- |
| File upload | CSV and Excel support with auto-detection |
| Sample datasets | 3 built-in datasets for demonstration |
| Overview tab | Shape, types, memory, descriptive stats |
| Missing data | Bar chart, nullity matrix, pattern analysis |
| Distributions | Histogram, box plot, QQ plot, normality test |
| Correlations | Heatmap, ranked pairs, 3 correlation methods |
| Categorical | Value counts, pie charts, group comparisons |
| Bivariate | Scatter with trendline, marginals, correlation stats |
| Export | CSV, Excel with multiple sheets, text summary |

Key Takeaways

  • The entire app is a single Python file of roughly 500 lines that provides a complete EDA workflow
  • @st.cache_data ensures expensive computations run only once per dataset
  • Tabs organize the analysis into logical sections without overwhelming the user
  • Plotly provides interactive charts where users can zoom, hover, and filter
  • Sample datasets allow instant demonstration without requiring a file upload
  • The Export tab generates downloadable CSV and Excel reports for stakeholders
  • This template can be extended with additional tabs for time series, geospatial, or ML-assisted analysis

"What I cannot create, I do not understand." — Richard Feynman