Skip to content
Unverified — AI-generated content. Help verify this page

Streamlit for EDA

Streamlit turns Python scripts into interactive web apps in minutes. For EDA, it replaces static Jupyter notebooks with shareable, interactive dashboards where stakeholders can explore data themselves.


Quickstart

python
# app.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px

st.set_page_config(
    page_title="EDA Dashboard",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded",
)

st.title("Exploratory Data Analysis Dashboard")
st.markdown("Upload a CSV file to begin exploring your data.")

# File upload
uploaded = st.file_uploader("Upload CSV", type=['csv', 'xlsx'])

if uploaded is not None:
    df = pd.read_csv(uploaded)
    st.success(f"Loaded {df.shape[0]:,} rows and {df.shape[1]} columns")
    st.dataframe(df.head(20), use_container_width=True)
else:
    st.info("Please upload a file to get started.")

Run with: streamlit run app.py


Layout and Organization

Columns and Containers

python
# Two-column layout
col1, col2 = st.columns(2)
with col1:
    st.metric("Total Rows", f"{len(df):,}")
with col2:
    st.metric("Total Columns", df.shape[1])

# Three columns with custom ratios
left, center, right = st.columns([1, 2, 1])
with center:
    st.plotly_chart(fig, use_container_width=True)

# Expander for collapsible sections
with st.expander("View Raw Data", expanded=False):
    st.dataframe(df, use_container_width=True)

# Tabs
tab1, tab2, tab3 = st.tabs(["Overview", "Distributions", "Correlations"])
with tab1:
    st.write("Dataset overview here")
with tab2:
    st.write("Distribution plots here")
with tab3:
    st.write("Correlation analysis here")
python
with st.sidebar:
    st.header("Controls")

    # Column selector
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    selected_col = st.selectbox("Select Column", numeric_cols)

    # Range filter
    min_val, max_val = float(df[selected_col].min()), float(df[selected_col].max())
    range_filter = st.slider(
        "Value Range",
        min_value=min_val, max_value=max_val,
        value=(min_val, max_val)
    )

    # Categorical filter
    if 'category' in df.columns:
        categories = st.multiselect(
            "Filter Categories",
            options=df['category'].unique().tolist(),
            default=df['category'].unique().tolist()
        )

    # Number input
    n_bins = st.number_input("Histogram Bins", min_value=5, max_value=200, value=50)

    # Checkbox
    show_outliers = st.checkbox("Show Outliers", value=True)

Essential Widgets for EDA

python
# Text input
search = st.text_input("Search Column Names", "")
matching = [c for c in df.columns if search.lower() in c.lower()]
st.write(f"Matching columns: {matching}")

# Radio buttons
chart_type = st.radio("Chart Type", ["Histogram", "Box Plot", "Violin"])

# Select slider (discrete steps)
confidence = st.select_slider("Confidence Level", options=[0.90, 0.95, 0.99], value=0.95)

# Date input
start_date = st.date_input("Start Date", value=pd.Timestamp('2024-01-01'))

# Download button
csv_data = df.to_csv(index=False).encode('utf-8')
st.download_button(
    "Download Filtered Data",
    data=csv_data,
    file_name="filtered_data.csv",
    mime="text/csv"
)

# Progress bar for long operations
progress = st.progress(0)
for i in range(100):
    progress.progress(i + 1)

# Display code
with st.echo():
    result = df.describe()
    st.write(result)

Caching for Performance

python
@st.cache_data
def load_data(filepath):
    """Load and cache data. Re-runs only if filepath changes."""
    df = pd.read_csv(filepath)
    # Preprocessing
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
    for col in df.select_dtypes(include='object').columns:
        if df[col].nunique() < 50:
            df[col] = df[col].astype('category')
    return df

@st.cache_data
def compute_statistics(df):
    """Cache expensive statistical computations."""
    stats = {
        'describe': df.describe(),
        'missing': df.isna().sum(),
        'correlations': df.select_dtypes(include='number').corr(),
        'skewness': df.select_dtypes(include='number').skew(),
    }
    return stats

@st.cache_resource
def get_model():
    """Cache ML models or database connections."""
    # from sklearn.ensemble import IsolationForest
    # return IsolationForest(random_state=42)
    pass

# Clear cache when needed
# st.cache_data.clear()

Cache with TTL and Hashing

python
@st.cache_data(ttl=3600)  # expire after 1 hour
def load_from_api(url):
    """Cache API responses with time-to-live."""
    return pd.read_json(url)

@st.cache_data(show_spinner="Computing statistics...")
def expensive_computation(df):
    """Show custom spinner during computation."""
    import time
    time.sleep(2)  # simulate slow computation
    return df.describe()

Session State

python
# Initialize session state
if 'analysis_history' not in st.session_state:
    st.session_state.analysis_history = []
if 'current_filter' not in st.session_state:
    st.session_state.current_filter = {}

# Use session state for stateful interactions
if st.button("Run Analysis"):
    result = df.describe()
    st.session_state.analysis_history.append({
        'timestamp': pd.Timestamp.now(),
        'type': 'describe',
        'result': result,
    })
    st.success(f"Analysis #{len(st.session_state.analysis_history)} completed!")

# Display history
if st.session_state.analysis_history:
    st.subheader(f"Analysis History ({len(st.session_state.analysis_history)} runs)")
    for i, entry in enumerate(st.session_state.analysis_history[-5:]):
        with st.expander(f"Run {i+1}{entry['timestamp'].strftime('%H:%M:%S')}"):
            st.write(entry['result'])

# Callback-based state updates
def on_column_change():
    st.session_state.current_filter['column'] = st.session_state.col_select

st.selectbox("Column", numeric_cols, key='col_select', on_change=on_column_change)

Display Methods

python
# DataFrame display
st.dataframe(df.head(50), use_container_width=True, height=400)

# Static table (no scrolling)
st.table(df.describe().round(2))

# Metrics row
col1, col2, col3, col4 = st.columns(4)
col1.metric("Mean", f"${df['revenue'].mean():,.0f}", delta="+5.2%")
col2.metric("Median", f"${df['revenue'].median():,.0f}", delta="-1.3%")
col3.metric("Std Dev", f"${df['revenue'].std():,.0f}")
col4.metric("Missing", f"{df['revenue'].isna().sum()}")

# JSON display
st.json({'shape': list(df.shape), 'dtypes': {c: str(d) for c, d in df.dtypes.items()}})

# Matplotlib figure
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.hist(df['revenue'], bins=50)
st.pyplot(fig)

# Plotly chart
fig = px.histogram(df, x='revenue', color='category')
st.plotly_chart(fig, use_container_width=True)

Multi-Page Apps

my_app/
  app.py              # Main entry point
  pages/
    1_Overview.py      # Page 1
    2_Distributions.py # Page 2
    3_Correlations.py  # Page 3
    4_Export.py         # Page 4
python
# pages/1_Overview.py
import streamlit as st
import pandas as pd

st.header("Dataset Overview")

# Access shared data from session state
if 'df' not in st.session_state:
    st.warning("Please upload data on the main page first.")
    st.stop()

df = st.session_state.df

col1, col2, col3 = st.columns(3)
col1.metric("Rows", f"{len(df):,}")
col2.metric("Columns", df.shape[1])
col3.metric("Missing Cells", f"{df.isna().sum().sum():,}")

st.subheader("Column Types")
dtype_counts = df.dtypes.value_counts()
for dtype, count in dtype_counts.items():
    st.write(f"- **{dtype}**: {count} columns")

st.subheader("Sample Data")
n_rows = st.slider("Rows to display", 5, 100, 20)
st.dataframe(df.head(n_rows), use_container_width=True)

Deployment

Streamlit Community Cloud

python
# requirements.txt
# streamlit>=1.30.0
# pandas>=2.0.0
# plotly>=5.18.0
# numpy>=1.24.0
# seaborn>=0.13.0
# scipy>=1.11.0

# .streamlit/config.toml
# [theme]
# primaryColor = "#2563eb"
# backgroundColor = "#ffffff"
# secondaryBackgroundColor = "#f0f2f6"
# textColor = "#1f2937"
# font = "sans serif"

Docker Deployment

dockerfile
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 8501
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]

python
def eda_widget_gallery(df):
    """Reusable EDA widget gallery component."""

    st.subheader("Quick Statistics")
    numeric = df.select_dtypes(include='number')

    # Stats grid
    cols = st.columns(min(4, len(numeric.columns)))
    for i, col_name in enumerate(numeric.columns[:4]):
        with cols[i]:
            st.metric(col_name, f"{numeric[col_name].mean():.2f}",
                      delta=f"std: {numeric[col_name].std():.2f}")

    st.subheader("Column Explorer")
    col_choice = st.selectbox("Select column to explore", df.columns)
    series = df[col_choice]

    if series.dtype in ['int64', 'float64']:
        tab1, tab2, tab3 = st.tabs(["Distribution", "Statistics", "Outliers"])
        with tab1:
            fig = px.histogram(df, x=col_choice, nbins=50, marginal='box')
            st.plotly_chart(fig, use_container_width=True)
        with tab2:
            st.write(series.describe())
        with tab3:
            q1, q3 = series.quantile(0.25), series.quantile(0.75)
            iqr = q3 - q1
            n_outliers = ((series < q1 - 1.5*iqr) | (series > q3 + 1.5*iqr)).sum()
            st.metric("Outliers (IQR)", n_outliers)
    else:
        vc = series.value_counts().head(20)
        fig = px.bar(x=vc.index, y=vc.values, labels={'x': col_choice, 'y': 'Count'})
        st.plotly_chart(fig, use_container_width=True)

# Usage: eda_widget_gallery(df)

Key Takeaways

  • Streamlit converts Python scripts into interactive web dashboards with streamlit run app.py
  • Use @st.cache_data for expensive computations and data loading; it re-runs only when inputs change
  • Session state (st.session_state) persists data across widget interactions and reruns
  • Layout with st.columns(), st.tabs(), and st.expander() organizes complex EDA into digestible sections
  • Sidebar is ideal for filters and controls that affect the entire dashboard
  • Multi-page apps via the pages/ directory scale to full EDA toolkits
  • Deploy to Streamlit Community Cloud for free, or Docker for enterprise environments
  • For EDA, Streamlit replaces static notebooks with shareable, interactive analysis that non-technical stakeholders can use

"What I cannot create, I do not understand." — Richard Feynman