Skip to content
Unverified — AI-generated content. Help verify this page

Data Profiling

Data profiling is the first thing you do with any dataset. It takes 15 minutes and prevents 15 hours of debugging later. Profiling is not analysis — it is reconnaissance. You are not looking for insights yet. You are looking for land mines: wrong types, missing data, duplicates, impossible values, and unexpected distributions.

This page covers a systematic 15-minute profiling protocol, the data quality dimensions you must check, and automated profiling tools that do most of the work for you.


The 15-Minute Profiling Protocol

python
# profiling_protocol.py — Complete 15-minute profiling
import pandas as pd
import numpy as np
import sys

# Load dataset
df = pd.read_csv(
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)

# ==================================================================
# MINUTE 0-2: Shape & Size
# ==================================================================
print("=" * 60)
print("STEP 1: Shape & Size (Minute 0-2)")
print("=" * 60)
print(f"Rows: {len(df):,}")
print(f"Columns: {df.shape[1]}")
print(f"Total cells: {df.size:,}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"\nColumn names:\n  {df.columns.tolist()}")

# ==================================================================
# MINUTE 2-4: Types & Memory
# ==================================================================
print(f"\n{'=' * 60}")
print("STEP 2: Types & Memory (Minute 2-4)")
print("=" * 60)
type_summary = pd.DataFrame({
    'dtype': df.dtypes,
    'memory_KB': df.memory_usage(deep=True)[1:] / 1024,
    'non_null': df.notna().sum(),
    'null': df.isnull().sum(),
})
print(type_summary.round(1))
print(f"\nType distribution:")
print(f"  Numeric: {len(df.select_dtypes(include='number').columns)}")
print(f"  Object:  {len(df.select_dtypes(include='object').columns)}")
print(f"  Bool:    {len(df.select_dtypes(include='bool').columns)}")

# ==================================================================
# MINUTE 4-6: Missing Data
# ==================================================================
print(f"\n{'=' * 60}")
print("STEP 3: Missing Data (Minute 4-6)")
print("=" * 60)
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(1)
missing_report = pd.DataFrame({
    'missing_count': missing,
    'missing_pct': missing_pct,
}).sort_values('missing_pct', ascending=False)
missing_report = missing_report[missing_report['missing_count'] > 0]
print(missing_report)

if len(missing_report) > 0:
    print(f"\nTotal missing cells: {df.isnull().sum().sum()} "
          f"({df.isnull().sum().sum() / df.size * 100:.1f}%)")
    print(f"Columns with missing: {len(missing_report)} of {df.shape[1]}")
    print(f"Rows with ANY missing: {df.isnull().any(axis=1).sum()} "
          f"({df.isnull().any(axis=1).mean():.1%})")
else:
    print("No missing data!")

# ==================================================================
# MINUTE 6-8: Unique Values
# ==================================================================
print(f"\n{'=' * 60}")
print("STEP 4: Unique Values (Minute 6-8)")
print("=" * 60)
for col in df.columns:
    n_unique = df[col].nunique()
    pct_unique = n_unique / len(df) * 100
    likely_type = "ID/Key" if pct_unique > 90 else \
                  "Categorical" if n_unique <= 20 else \
                  "High-cardinality" if n_unique <= 100 else "Continuous"
    print(f"  {col:>15}: {n_unique:5d} unique ({pct_unique:5.1f}%) -> {likely_type}")

# ==================================================================
# MINUTE 8-11: Descriptive Statistics
# ==================================================================
print(f"\n{'=' * 60}")
print("STEP 5: Descriptive Statistics (Minute 8-11)")
print("=" * 60)

# Numeric columns
print("\nNumeric columns:")
numeric_desc = df.describe().round(2)
print(numeric_desc)

# Check for suspicious numeric values
print("\nSuspicious values check:")
for col in df.select_dtypes(include='number').columns:
    s = df[col]
    issues = []
    if s.min() == s.max():
        issues.append("CONSTANT column")
    if (s == 0).sum() / len(s) > 0.5:
        issues.append(f"{(s==0).mean():.0%} zeros")
    if s.skew() > 2:
        issues.append(f"highly right-skewed ({s.skew():.1f})")
    if s.skew() < -2:
        issues.append(f"highly left-skewed ({s.skew():.1f})")
    if len(issues) > 0:
        print(f"  {col}: {', '.join(issues)}")

# Categorical columns
print(f"\nCategorical columns:")
for col in df.select_dtypes(include='object').columns:
    vc = df[col].value_counts()
    print(f"\n  {col} ({vc.shape[0]} unique):")
    for val, count in vc.head(5).items():
        print(f"    {val}: {count} ({count/len(df)*100:.1f}%)")
    if vc.shape[0] > 5:
        print(f"    ... and {vc.shape[0] - 5} more")

# ==================================================================
# MINUTE 11-13: Samples & Outliers
# ==================================================================
print(f"\n{'=' * 60}")
print("STEP 6: Samples & Outliers (Minute 11-13)")
print("=" * 60)

# Head and tail
print("First 3 rows:")
print(df.head(3).to_string())
print(f"\nLast 3 rows:")
print(df.tail(3).to_string())

# Random sample
print(f"\nRandom sample of 3 rows:")
print(df.sample(3, random_state=42).to_string())

# Quick outlier scan
print(f"\nOutlier scan (IQR method):")
for col in df.select_dtypes(include='number').columns:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_outliers = ((df[col] < lower) | (df[col] > upper)).sum()
    if n_outliers > 0:
        pct = n_outliers / len(df) * 100
        print(f"  {col}: {n_outliers} outliers ({pct:.1f}%)")

# ==================================================================
# MINUTE 13-15: Duplicates & Quality Score
# ==================================================================
print(f"\n{'=' * 60}")
print("STEP 7: Duplicates & Quality Score (Minute 13-15)")
print("=" * 60)

# Duplicates
n_exact_dupes = df.duplicated().sum()
print(f"Exact duplicate rows: {n_exact_dupes}")
if 'PassengerId' in df.columns:
    n_key_dupes = df.duplicated(subset=['PassengerId']).sum()
    print(f"Duplicate PassengerIds: {n_key_dupes}")

# Data quality score
completeness = 1 - df.isnull().sum().sum() / df.size
uniqueness = 1 - n_exact_dupes / len(df)
validity = 1  # Would need domain-specific rules

print(f"\nData Quality Score:")
print(f"  Completeness: {completeness:.1%}")
print(f"  Uniqueness:   {uniqueness:.1%}")
print(f"  Validity:     (requires domain rules)")
print(f"  Overall:      {(completeness + uniqueness) / 2:.1%}")

Data Quality Dimensions

The six dimensions of data quality from the DAMA framework:

python
# quality_dimensions.py — Measuring all six dimensions
import pandas as pd
import numpy as np

df = pd.read_csv(
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)

def measure_quality(df, rules=None):
    """Compute data quality scores across all six dimensions."""
    report = {}

    # 1. Completeness: % of non-null values
    report['completeness'] = df.notna().sum().sum() / df.size

    # Per-column completeness
    col_completeness = df.notna().mean()
    report['completeness_by_col'] = col_completeness[col_completeness < 1.0]

    # 2. Uniqueness: % of distinct rows
    report['uniqueness'] = 1 - df.duplicated().sum() / len(df)

    # 3. Validity: % of values conforming to rules
    validity_checks = {}
    if rules:
        for col, rule in rules.items():
            if col in df.columns:
                valid = rule(df[col])
                validity_checks[col] = valid.mean()
    report['validity'] = validity_checks

    # 4. Consistency: cross-column checks
    consistency_checks = {}
    # Example: SibSp + Parch should not be negative
    if 'SibSp' in df.columns and 'Parch' in df.columns:
        family = df['SibSp'] + df['Parch']
        consistency_checks['family_size_non_negative'] = (family >= 0).mean()
    report['consistency'] = consistency_checks

    # 5. Timeliness: metadata check (no auto-check possible)
    report['timeliness'] = "Manual check required — when was data collected?"

    # 6. Accuracy: can only be checked against ground truth
    report['accuracy'] = "Requires external validation source"

    return report

# Define validity rules
validity_rules = {
    'Age': lambda s: s.between(0, 120) | s.isna(),
    'Fare': lambda s: s >= 0,
    'Survived': lambda s: s.isin([0, 1]),
    'Pclass': lambda s: s.isin([1, 2, 3]),
    'Sex': lambda s: s.isin(['male', 'female']),
}

quality = measure_quality(df, rules=validity_rules)

print("=== DATA QUALITY REPORT ===\n")
print(f"Completeness: {quality['completeness']:.1%}")
if len(quality['completeness_by_col']) > 0:
    print(f"  Incomplete columns:")
    for col, pct in quality['completeness_by_col'].items():
        print(f"    {col}: {pct:.1%} complete")

print(f"\nUniqueness: {quality['uniqueness']:.1%}")

print(f"\nValidity:")
for col, pct in quality['validity'].items():
    status = "PASS" if pct >= 0.99 else "WARN" if pct >= 0.95 else "FAIL"
    print(f"  [{status}] {col}: {pct:.1%} valid")

print(f"\nConsistency:")
for check, pct in quality['consistency'].items():
    status = "PASS" if pct >= 0.99 else "WARN" if pct >= 0.95 else "FAIL"
    print(f"  [{status}] {check}: {pct:.1%}")

Automated Profiling with ydata-profiling

python
# ydata_profiling_demo.py — Auto-generate a complete EDA report
# pip install ydata-profiling

import pandas as pd

df = pd.read_csv(
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)

# Generate a full profiling report in one line
from ydata_profiling import ProfileReport

profile = ProfileReport(
    df,
    title="Titanic Dataset Profiling Report",
    explorative=True,
    correlations={
        "pearson": {"calculate": True},
        "spearman": {"calculate": True},
        "phi_k": {"calculate": True},     # Works for categorical too
    },
    missing_diagrams={
        "bar": True,
        "matrix": True,
        "heatmap": True,
    },
)

# Save as HTML (interactive report)
profile.to_file("titanic_profile_report.html")

# Or get key stats as a dict
description = profile.get_description()

print("=== YDATA-PROFILING OUTPUT SECTIONS ===")
sections = [
    "Overview: dataset stats, variable types, warnings",
    "Variables: per-column distributions, stats, histograms",
    "Interactions: scatter matrix, correlation heatmaps",
    "Correlations: Pearson, Spearman, Phi_k, Cramér's V",
    "Missing values: bar chart, matrix, heatmap",
    "Sample: first/last rows",
    "Duplicates: exact duplicate rows",
]
for s in sections:
    print(f"  - {s}")

print("\nReport saved to: titanic_profile_report.html")

When to Use ydata-profiling vs Manual EDA

Use ydata-profiling for datasets under 100K rows as a first-pass overview. It takes seconds and catches obvious issues. But always follow up with manual EDA for domain-specific checks, feature engineering ideas, and questions that require human judgment.

ydata-profiling for Large Datasets

python
# ydata_large_datasets.py — Handling large datasets
import pandas as pd
# from ydata_profiling import ProfileReport

# For large datasets, use minimal mode
# profile = ProfileReport(
#     large_df,
#     minimal=True,         # Skip expensive computations
#     samples=None,         # Skip sample display
#     correlations=None,    # Skip correlation matrix
#     explorative=False,    # Skip advanced analysis
#     progress_bar=True,
# )

# Or sample first, profile the sample
# sample = large_df.sample(10000, random_state=42)
# profile = ProfileReport(sample, title="Sample Profile (n=10000)")

# Scale guide
print("=== YDATA-PROFILING SCALE GUIDE ===")
scale = pd.DataFrame({
    'Dataset Size': ['< 10K rows', '10K - 100K rows', '100K - 1M rows', '> 1M rows'],
    'Mode': ['Full', 'Full', 'Minimal', 'Sample first'],
    'Time': ['< 30 sec', '1-5 min', '5-15 min', 'Sample to 100K'],
    'Recommendation': [
        'Full report with all features',
        'Full report, maybe skip interactions',
        'Minimal mode, skip correlations',
        'Sample 100K rows, then minimal mode',
    ],
})
print(scale.to_string(index=False))

Automated Profiling with Sweetviz

python
# sweetviz_demo.py — Sweetviz for dataset comparison
# pip install sweetviz

import pandas as pd
import sweetviz as sv

df = pd.read_csv(
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)

# Basic analysis
report = sv.analyze(df, target_feat='Survived')
report.show_html('titanic_sweetviz.html')

# Compare two datasets (e.g., train vs test)
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2, random_state=42)

comparison = sv.compare(train, test, target_feat='Survived')
comparison.show_html('train_vs_test_comparison.html')

# Compare subgroups
survived = df[df['Survived'] == 1]
died = df[df['Survived'] == 0]

group_compare = sv.compare(survived, died)
group_compare.show_html('survived_vs_died.html')

print("=== SWEETVIZ vs YDATA-PROFILING ===")
comparison_table = pd.DataFrame({
    'Feature': ['Auto-report', 'Dataset comparison', 'Target analysis',
                'Speed', 'Customization', 'Large datasets'],
    'ydata-profiling': ['Full HTML report', 'Manual comparison',
                         'Correlation with target', 'Slower (more thorough)',
                         'Many config options', 'Minimal mode'],
    'sweetviz': ['Full HTML report', 'Side-by-side comparison',
                  'Built-in target feature', 'Faster (lighter)',
                  'Fewer options', 'Better performance'],
})
print(comparison_table.to_string(index=False))

Custom Profiling Function

For repeated use, build your own profiling function tailored to your domain:

python
# custom_profiler.py — Reusable profiling function
import pandas as pd
import numpy as np
from datetime import datetime

def profile_dataset(df, name="Dataset", key_column=None, date_column=None):
    """
    Comprehensive dataset profiling in a single function call.

    Parameters:
    -----------
    df : pd.DataFrame
    name : str — dataset name for the report header
    key_column : str — expected unique identifier column
    date_column : str — datetime column for temporal checks
    """
    report = []
    report.append(f"{'=' * 60}")
    report.append(f"PROFILING REPORT: {name}")
    report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    report.append(f"{'=' * 60}")

    # Section 1: Overview
    report.append(f"\n--- OVERVIEW ---")
    report.append(f"Rows: {len(df):,}")
    report.append(f"Columns: {df.shape[1]}")
    report.append(f"Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Section 2: Column Types
    report.append(f"\n--- COLUMN TYPES ---")
    for dtype, cols in df.columns.to_series().groupby(df.dtypes):
        report.append(f"  {dtype}: {len(cols)} columns — {cols.tolist()}")

    # Section 3: Missing Data
    report.append(f"\n--- MISSING DATA ---")
    missing = df.isnull().sum()
    if missing.sum() == 0:
        report.append("  No missing values!")
    else:
        for col in missing[missing > 0].sort_values(ascending=False).index:
            pct = missing[col] / len(df) * 100
            severity = "LOW" if pct < 5 else "MEDIUM" if pct < 30 else "HIGH"
            report.append(f"  [{severity:>6}] {col}: {missing[col]:,} ({pct:.1f}%)")

    # Section 4: Uniqueness
    report.append(f"\n--- UNIQUENESS ---")
    n_dupes = df.duplicated().sum()
    report.append(f"Exact duplicate rows: {n_dupes}")
    if key_column and key_column in df.columns:
        key_dupes = df.duplicated(subset=[key_column]).sum()
        report.append(f"Duplicate {key_column}: {key_dupes}")

    # Section 5: Numeric Summary
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        report.append(f"\n--- NUMERIC SUMMARY ---")
        for col in numeric_cols:
            s = df[col].dropna()
            skew = s.skew()
            report.append(f"  {col}: range=[{s.min():.2f}, {s.max():.2f}], "
                         f"mean={s.mean():.2f}, median={s.median():.2f}, "
                         f"skew={skew:.2f}")

    # Section 6: Categorical Summary
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) > 0:
        report.append(f"\n--- CATEGORICAL SUMMARY ---")
        for col in cat_cols:
            n_unique = df[col].nunique()
            top = df[col].value_counts().head(3)
            top_str = ', '.join([f"{v}({c})" for v, c in top.items()])
            report.append(f"  {col}: {n_unique} unique, top: {top_str}")

    # Section 7: Temporal Check
    if date_column and date_column in df.columns:
        report.append(f"\n--- TEMPORAL CHECK ---")
        dates = pd.to_datetime(df[date_column], errors='coerce')
        report.append(f"  Range: {dates.min()} to {dates.max()}")
        report.append(f"  Span: {dates.max() - dates.min()}")
        report.append(f"  Parse errors: {dates.isnull().sum() - df[date_column].isnull().sum()}")

    # Section 8: Warnings
    report.append(f"\n--- WARNINGS ---")
    warnings = []
    for col in df.columns:
        if df[col].nunique() == 1:
            warnings.append(f"  CONSTANT: {col} has only one value")
        if df[col].nunique() == len(df) and df[col].dtype == 'object':
            warnings.append(f"  UNIQUE: {col} is 100% unique (possible ID)")
        if df[col].dtype in ['float64', 'int64'] and df[col].nunique() <= 5:
            warnings.append(f"  LOW-CARDINALITY NUMERIC: {col} ({df[col].nunique()} values) — maybe categorical?")
    if not warnings:
        report.append("  No warnings.")
    for w in warnings:
        report.append(w)

    # Print report
    full_report = '\n'.join(report)
    print(full_report)
    return full_report

# Demo
df = pd.read_csv(
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)
profile_dataset(df, name="Titanic", key_column="PassengerId")

Profiling Quick Reference

CheckCommandWhat to Look For
Shapedf.shapeExpected row/column count
Typesdf.dtypesNumbers as objects, objects that should be numeric
Infodf.info()Non-null counts, dtypes, memory
Missingdf.isnull().sum()Columns with > 5% missing
Describedf.describe()Min/max outliers, zero std (constant)
Uniquedf.nunique()1 unique (constant), all unique (ID)
Value countsdf['col'].value_counts()Rare categories, unexpected values
Memorydf.memory_usage(deep=True)Object columns consuming too much memory
Duplicatesdf.duplicated().sum()Any exact duplicate rows
Sampledf.sample(5)"Does this look reasonable?"

Summary

ConceptKey Takeaway
15-minute protocolShape, types, missing, uniques, stats, samples, duplicates — in that order
Quality dimensionsCompleteness, uniqueness, validity, accuracy, timeliness, consistency
ydata-profilingFull auto-generated HTML report; use minimal mode for > 100K rows
sweetvizBest for comparing two datasets side by side
Custom profilerBuild a reusable function for your domain's specific checks
Common warningsConstant columns, numeric IDs, low-cardinality numerics

What's Next

PageWhat You'll Learn
Understanding DistributionsNormal, log-normal, exponential, and how to test them
Missing DataMCAR/MAR/MNAR, imputation strategies
Data Quality ValidationPandera, Great Expectations for systematic validation

"What I cannot create, I do not understand." — Richard Feynman