Text Preprocessing for NLP
Text preprocessing transforms raw human language into a format that algorithms can process. The same sentence — "I can't believe it's NOT working!!!" — needs different preprocessing depending on whether you are building a sentiment classifier (keep the emphasis), a search index (remove punctuation), or a topic model (keep only meaningful words). This page covers every preprocessing technique and when to apply each.
The NLP Preprocessing Pipeline
HTML and Markup Removal
# html_removal.py — Strip HTML, Markdown, and markup from text
import re
from html import unescape
from bs4 import BeautifulSoup
def strip_html(text: str) -> str:
"""Remove HTML tags and decode entities."""
# Decode HTML entities first: & -> &, < -> <
text = unescape(text)
# Remove HTML tags
text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
# Clean up whitespace
text = re.sub(r"\s+", " ", text).strip()
return text
def strip_markdown(text: str) -> str:
"""Remove common Markdown formatting."""
# Headers: ## Title -> Title
text = re.sub(r"#{1,6}\s*", "", text)
# Bold/italic: **bold**, *italic*, __bold__, _italic_
text = re.sub(r"\*{1,3}(.*?)\*{1,3}", r"\1", text)
text = re.sub(r"_{1,3}(.*?)_{1,3}", r"\1", text)
# Links: [text](url) -> text
text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)
# Images:  -> alt
text = re.sub(r"!\[([^\]]*)\]\([^\)]+\)", r"\1", text)
# Code blocks: ```code``` or `code`
text = re.sub(r"```[\s\S]*?```", "", text)
text = re.sub(r"`([^`]+)`", r"\1", text)
# Blockquotes: > text -> text
text = re.sub(r"^\s*>\s*", "", text, flags=re.MULTILINE)
# Lists: - item or * item or 1. item
text = re.sub(r"^\s*[-*+]\s+", "", text, flags=re.MULTILINE)
text = re.sub(r"^\s*\d+\.\s+", "", text, flags=re.MULTILINE)
# Horizontal rules
text = re.sub(r"^\s*[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
return text.strip()
def strip_urls(text: str) -> str:
"""Remove URLs from text."""
return re.sub(r"https?://\S+|www\.\S+", "", text)
def strip_emails(text: str) -> str:
"""Remove email addresses."""
return re.sub(r"\S+@\S+\.\S+", "", text)
def strip_special_characters(text: str, keep: str = "") -> str:
"""Remove special characters, optionally keeping some."""
pattern = f"[^a-zA-Z0-9\\s{re.escape(keep)}]"
return re.sub(pattern, " ", text)Tokenization Strategies
# tokenization.py — Different tokenization approaches
import re
from typing import Iterator
import logging
logger = logging.getLogger(__name__)
class Tokenizer:
"""Multiple tokenization strategies for different NLP tasks."""
@staticmethod
def whitespace_tokenize(text: str) -> list[str]:
"""Split on whitespace. Simplest and fastest."""
return text.split()
@staticmethod
def word_tokenize(text: str) -> list[str]:
"""
Regex-based word tokenization.
Handles contractions and hyphenated words better than split().
"""
return re.findall(r"\b\w+(?:'\w+)?\b", text.lower())
@staticmethod
def nltk_tokenize(text: str) -> list[str]:
"""NLTK's word tokenizer — handles English contractions properly."""
from nltk.tokenize import word_tokenize
return word_tokenize(text)
@staticmethod
def spacy_tokenize(text: str, nlp=None) -> list[str]:
"""spaCy tokenizer — linguistically informed."""
if nlp is None:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
return [token.text for token in doc]
@staticmethod
def sentence_tokenize(text: str) -> list[str]:
"""Split text into sentences."""
from nltk.tokenize import sent_tokenize
return sent_tokenize(text)
@staticmethod
def subword_tokenize(text: str, vocab_size: int = 10000) -> list[str]:
"""
Byte-Pair Encoding (BPE) tokenization.
Used by modern LLMs (GPT, BERT variants).
Handles out-of-vocabulary words by splitting into subwords.
"""
from tokenizers import Tokenizer as HFTokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
tokenizer = HFTokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
# In practice, load a pre-trained tokenizer
# tokenizer = HFTokenizer.from_pretrained("bert-base-uncased")
output = tokenizer.encode(text)
return output.tokens
@staticmethod
def character_ngrams(text: str, n: int = 3) -> list[str]:
"""Character n-grams — useful for fuzzy matching and language detection."""
text = text.lower().strip()
return [text[i:i + n] for i in range(len(text) - n + 1)]
@staticmethod
def word_ngrams(tokens: list[str], n: int = 2) -> list[str]:
"""Word n-grams from a token list."""
return [
" ".join(tokens[i:i + n])
for i in range(len(tokens) - n + 1)
]
# Usage comparison
text = "I can't believe it's not working! Dr. Smith's algorithm isn't great."
print("Whitespace:", Tokenizer.whitespace_tokenize(text))
# ["I", "can't", "believe", "it's", "not", "working!", "Dr.", "Smith's", ...]
print("Word regex:", Tokenizer.word_tokenize(text))
# ["i", "can't", "believe", "it's", "not", "working", "dr", "smith's", ...]
print("NLTK:", Tokenizer.nltk_tokenize(text))
# ["I", "ca", "n't", "believe", "it", "'s", "not", "working", "!", ...]Stopword Removal
# stopwords.py — Intelligent stopword handling
import re
from typing import Set
class StopwordRemover:
"""Remove stopwords with domain-aware customization."""
# English stopwords (subset — full list has ~180 words)
ENGLISH_STOPWORDS = {
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to",
"for", "of", "with", "by", "from", "is", "are", "was", "were",
"be", "been", "being", "have", "has", "had", "do", "does",
"did", "will", "would", "could", "should", "may", "might",
"shall", "can", "this", "that", "these", "those", "it", "its",
"i", "me", "my", "we", "our", "you", "your", "he", "she",
"him", "her", "his", "they", "them", "their", "what", "which",
"who", "whom", "where", "when", "how", "not", "no", "nor",
"if", "then", "than", "too", "very", "just", "about", "above",
"after", "again", "all", "also", "am", "any", "because",
"before", "between", "both", "each", "few", "more", "most",
"other", "own", "same", "so", "some", "such", "only", "into",
"over", "under", "up", "down", "out", "off", "here", "there",
}
def __init__(
self,
language: str = "english",
custom_stopwords: set | None = None,
keep_words: set | None = None,
):
if language == "english":
self.stopwords = self.ENGLISH_STOPWORDS.copy()
else:
try:
from nltk.corpus import stopwords as nltk_sw
self.stopwords = set(nltk_sw.words(language))
except Exception:
self.stopwords = set()
if custom_stopwords:
self.stopwords.update(custom_stopwords)
if keep_words:
self.stopwords -= keep_words
def remove(self, tokens: list[str]) -> list[str]:
"""Remove stopwords from token list."""
return [t for t in tokens if t.lower() not in self.stopwords]
def remove_from_text(self, text: str) -> str:
"""Remove stopwords from raw text string."""
tokens = text.lower().split()
filtered = self.remove(tokens)
return " ".join(filtered)
# Domain-specific stopword handling
# IMPORTANT: "not" is a stopword, but removing it flips sentiment!
# "This product is NOT good" -> "product good" (wrong!)
# For sentiment analysis, keep negation words:
sentiment_remover = StopwordRemover(
keep_words={"not", "no", "nor", "never", "neither", "nobody",
"nothing", "nowhere", "hardly", "barely", "scarcely"},
)
# For medical text, add domain stopwords:
medical_remover = StopwordRemover(
custom_stopwords={"patient", "presented", "history", "examination"},
)Stemming vs Lemmatization
# stem_lemma.py — Reduce words to base forms
import logging
logger = logging.getLogger(__name__)
class TextNormalizer:
"""Stemming and lemmatization with trade-off analysis."""
@staticmethod
def porter_stem(tokens: list[str]) -> list[str]:
"""
Porter stemmer: aggressive, rule-based suffix stripping.
Pros: Fast, no dictionary needed
Cons: Produces non-words ("studies" -> "studi", "university" -> "univers")
"""
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
return [stemmer.stem(t) for t in tokens]
@staticmethod
def snowball_stem(tokens: list[str], language: str = "english") -> list[str]:
"""
Snowball stemmer: improved Porter, supports multiple languages.
Slightly less aggressive than Porter.
"""
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer(language)
return [stemmer.stem(t) for t in tokens]
@staticmethod
def wordnet_lemmatize(tokens: list[str]) -> list[str]:
"""
WordNet lemmatizer: dictionary lookup for base forms.
Pros: Produces real words ("studies" -> "study")
Cons: Slower, needs POS tagging for best results
"""
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(treebank_tag):
if treebank_tag.startswith("J"):
return wordnet.ADJ
elif treebank_tag.startswith("V"):
return wordnet.VERB
elif treebank_tag.startswith("N"):
return wordnet.NOUN
elif treebank_tag.startswith("R"):
return wordnet.ADV
return wordnet.NOUN # Default
tagged = pos_tag(tokens)
return [
lemmatizer.lemmatize(word, get_wordnet_pos(pos))
for word, pos in tagged
]
@staticmethod
def spacy_lemmatize(tokens: list[str], nlp=None) -> list[str]:
"""
spaCy lemmatizer: uses trained models for accurate lemmatization.
Best accuracy but slowest.
"""
import spacy
if nlp is None:
nlp = spacy.load("en_core_web_sm")
text = " ".join(tokens)
doc = nlp(text)
return [token.lemma_ for token in doc]
@staticmethod
def compare(tokens: list[str]) -> dict:
"""Compare all methods on the same input."""
return {
"original": tokens,
"porter": TextNormalizer.porter_stem(tokens),
"snowball": TextNormalizer.snowball_stem(tokens),
"wordnet": TextNormalizer.wordnet_lemmatize(tokens),
}
# Comparison example
tokens = ["running", "studies", "better", "feet", "geese", "universities"]
comparison = TextNormalizer.compare(tokens)
for method, result in comparison.items():
print(f" {method:12s}: {result}")
# original : ['running', 'studies', 'better', 'feet', 'geese', 'universities']
# porter : ['run', 'studi', 'better', 'feet', 'gees', 'univers']
# snowball : ['run', 'studi', 'better', 'feet', 'gees', 'univers']
# wordnet : ['run', 'study', 'good', 'foot', 'goose', 'university']Language Detection
# language_detection.py — Detect text language
from langdetect import detect, detect_langs, DetectorFactory
import pandas as pd
import logging
# Make detection deterministic
DetectorFactory.seed = 42
logger = logging.getLogger(__name__)
def detect_language(text: str) -> dict:
"""Detect language of a text string."""
if not text or len(text.strip()) < 10:
return {"language": "unknown", "confidence": 0.0}
try:
langs = detect_langs(text)
top = langs[0]
return {
"language": top.lang,
"confidence": top.prob,
"alternatives": [
{"lang": l.lang, "prob": l.prob} for l in langs[1:3]
],
}
except Exception:
return {"language": "unknown", "confidence": 0.0}
def filter_by_language(
df: pd.DataFrame,
column: str,
target_language: str = "en",
min_confidence: float = 0.8,
) -> pd.DataFrame:
"""Filter DataFrame to rows matching target language."""
detections = df[column].apply(
lambda x: detect_language(str(x)) if pd.notna(x)
else {"language": "unknown", "confidence": 0.0}
)
df_result = df.copy()
df_result["_detected_language"] = detections.apply(lambda x: x["language"])
df_result["_language_confidence"] = detections.apply(lambda x: x["confidence"])
mask = (
(df_result["_detected_language"] == target_language) &
(df_result["_language_confidence"] >= min_confidence)
)
filtered = df_result[mask].drop(columns=["_detected_language", "_language_confidence"])
removed = len(df) - len(filtered)
logger.info(f"Filtered {removed} rows (not '{target_language}' or low confidence)")
return filteredSpell Correction
# spell_correction.py — Fix spelling errors in text
from spellchecker import SpellChecker
import re
import pandas as pd
import logging
logger = logging.getLogger(__name__)
class TextSpellCorrector:
"""Spell correction for text data."""
def __init__(self, language: str = "en", custom_words: set | None = None):
self.checker = SpellChecker(language=language)
if custom_words:
self.checker.word_frequency.load_words(list(custom_words))
def correct_text(self, text: str, max_corrections: int = 50) -> str:
"""Correct spelling in a text string."""
words = text.split()
corrections = 0
corrected = []
for word in words:
# Skip non-alphabetic tokens
if not word.isalpha() or len(word) <= 2:
corrected.append(word)
continue
if word.lower() in self.checker:
corrected.append(word)
else:
correction = self.checker.correction(word.lower())
if correction and corrections < max_corrections:
corrected.append(correction)
corrections += 1
else:
corrected.append(word)
return " ".join(corrected)
def correct_column(
self,
series: pd.Series,
min_word_count: int = 3,
) -> pd.Series:
"""Correct spelling in a text column."""
return series.apply(
lambda x: self.correct_text(str(x))
if pd.notna(x) and len(str(x).split()) >= min_word_count
else x
)
def get_misspelled(self, text: str) -> list[dict]:
"""Find misspelled words and their suggestions."""
words = [w for w in text.split() if w.isalpha() and len(w) > 2]
misspelled = self.checker.unknown(words)
return [
{
"word": word,
"correction": self.checker.correction(word),
"candidates": list(self.checker.candidates(word))[:5],
}
for word in misspelled
]
# Usage
corrector = TextSpellCorrector(
custom_words={"sklearn", "pytorch", "tensorflow", "numpy", "pandas"}
)
text = "The machne lerning modle was trainned on teh datset"
corrected = corrector.correct_text(text)
print(f"Original: {text}")
print(f"Corrected: {corrected}")Complete Text Preprocessing Pipeline
# text_pipeline.py — Composable NLP preprocessing pipeline
import pandas as pd
import re
import unicodedata
from typing import Callable
from html import unescape
class NLPPreprocessor:
"""Composable text preprocessing pipeline for NLP tasks."""
def __init__(self):
self.steps: list[tuple[str, Callable]] = []
def add(self, name: str, fn: Callable) -> "NLPPreprocessor":
"""Add a preprocessing step."""
self.steps.append((name, fn))
return self
def process(self, text: str) -> str:
"""Apply all steps to a single text."""
if not isinstance(text, str) or not text:
return ""
for _, fn in self.steps:
text = fn(text)
return text
def process_series(self, series: pd.Series) -> pd.Series:
"""Apply pipeline to a pandas Series."""
return series.apply(lambda x: self.process(x) if pd.notna(x) else "")
# --- Factory methods for common pipelines ---
@classmethod
def for_classification(cls) -> "NLPPreprocessor":
"""Pipeline optimized for text classification."""
return (
cls()
.add("html", lambda t: unescape(re.sub(r"<[^>]+>", " ", t)))
.add("urls", lambda t: re.sub(r"https?://\S+", " URL ", t))
.add("emails", lambda t: re.sub(r"\S+@\S+", " EMAIL ", t))
.add("unicode", lambda t: unicodedata.normalize("NFKC", t))
.add("lowercase", str.lower)
.add("numbers", lambda t: re.sub(r"\d+", " NUM ", t))
.add("punctuation", lambda t: re.sub(r"[^\w\s]", " ", t))
.add("whitespace", lambda t: re.sub(r"\s+", " ", t).strip())
)
@classmethod
def for_search(cls) -> "NLPPreprocessor":
"""Pipeline optimized for search/retrieval."""
return (
cls()
.add("html", lambda t: re.sub(r"<[^>]+>", " ", t))
.add("unicode", lambda t: unicodedata.normalize("NFKC", t))
.add("lowercase", str.lower)
.add("special_chars", lambda t: re.sub(r"[^\w\s\-]", " ", t))
.add("whitespace", lambda t: re.sub(r"\s+", " ", t).strip())
)
@classmethod
def for_embeddings(cls) -> "NLPPreprocessor":
"""Minimal cleaning for transformer-based embeddings."""
return (
cls()
.add("html", lambda t: re.sub(r"<[^>]+>", " ", t))
.add("unicode", lambda t: unicodedata.normalize("NFKC", t))
.add("whitespace", lambda t: re.sub(r"\s+", " ", t).strip())
# Keep casing and punctuation — transformers use them
)
@classmethod
def for_topic_modeling(cls) -> "NLPPreprocessor":
"""Aggressive cleaning for topic modeling (LDA, NMF)."""
return (
cls()
.add("html", lambda t: re.sub(r"<[^>]+>", " ", t))
.add("urls", lambda t: re.sub(r"https?://\S+", "", t))
.add("unicode", lambda t: unicodedata.normalize("NFKC", t))
.add("lowercase", str.lower)
.add("numbers", lambda t: re.sub(r"\d+", "", t))
.add("punctuation", lambda t: re.sub(r"[^\w\s]", "", t))
.add("short_words", lambda t: " ".join(
w for w in t.split() if len(w) > 2
))
.add("whitespace", lambda t: re.sub(r"\s+", " ", t).strip())
)
# Usage
classifier_pipeline = NLPPreprocessor.for_classification()
embedding_pipeline = NLPPreprocessor.for_embeddings()
text = "<p>Check out https://example.com for 50% off! Contact info@test.com</p>"
print("Classification:", classifier_pipeline.process(text))
# "check out URL for NUM off contact EMAIL"
print("Embeddings:", embedding_pipeline.process(text))
# "Check out https://example.com for 50% off! Contact info@test.com"
# Apply to DataFrame
df["text_clean"] = classifier_pipeline.process_series(df["text"])Quick Reference
| Task | Preprocessing | Keep | Remove |
|---|---|---|---|
| Sentiment Analysis | Minimal | Negations, punctuation, case | HTML, URLs |
| Text Classification | Moderate | Content words | Stopwords, numbers, special chars |
| Topic Modeling | Aggressive | Nouns, meaningful words | Everything else |
| Search/Retrieval | Moderate | Hyphenated words | HTML, special chars |
| Named Entity Recognition | Minimal | Case, punctuation | HTML only |
| Transformer Embeddings | Very Minimal | Almost everything | HTML tags |
| Technique | Speed | Accuracy | Produces Real Words |
|---|---|---|---|
| Porter Stemming | Fast | Low | No |
| Snowball Stemming | Fast | Medium | No |
| WordNet Lemmatization | Medium | High | Yes |
| spaCy Lemmatization | Slow | Highest | Yes |
Key Takeaway
- The right preprocessing depends entirely on the NLP task: sentiment analysis needs punctuation and negations, while topic modeling aggressively removes everything but content words.
- Lemmatization produces real words and is more accurate than stemming, but stemming is 10x faster and sufficient for information retrieval tasks.
- Transformer-based models (BERT, GPT) need minimal preprocessing -- they were trained on raw text with casing and punctuation, so aggressive cleaning degrades their performance.
Exercise
Build Task-Specific Text Preprocessors
Create three preprocessing pipelines and compare their output on the sentence: "<p>I can't believe it's NOT working! Check https://help.com for $50 off.</p>":
- Sentiment analysis pipeline: keep negations and emphasis, remove HTML and URLs.
- Search index pipeline: lowercase, remove special characters, keep hyphens.
- Topic modeling pipeline: remove everything except meaningful words longer than 2 characters.
Solution Sketch
import re
from html import unescape
text = "<p>I can't believe it's NOT working! Check https://help.com for $50 off.</p>"
# Sentiment: keep negation, emphasis
sent = re.sub(r"<[^>]+>", " ", text)
sent = re.sub(r"https?://\S+", "", sent)
sent = re.sub(r"\s+", " ", sent).strip()
# "I can't believe it's NOT working! Check for $50 off."
# Search: lowercase, remove special
search = re.sub(r"<[^>]+>", " ", text)
search = search.lower()
search = re.sub(r"[^\w\s\-]", " ", search)
search = re.sub(r"\s+", " ", search).strip()
# "i can t believe it s not working check https help com for 50 off"
# Topic: only long meaningful words
topic = re.sub(r"<[^>]+>", " ", text)
topic = re.sub(r"https?://\S+", "", topic)
topic = topic.lower()
topic = re.sub(r"[^\w\s]", "", topic)
topic = " ".join(w for w in topic.split() if len(w) > 2)
# "can believe not working check for off"Debugging Scenario
Your sentiment classifier achieves 95% accuracy in development but only 60% in production. The production text contains HTML email content with inline CSS and base64 images.
Diagnose and fix it.
Answer
The model was trained on clean text but production data contains raw HTML that was not stripped. The HTML tags, CSS, and base64 strings are treated as "words" by the tokenizer, drowning out the actual text content.
Fix:
- Add HTML stripping to the production preprocessing pipeline: use
BeautifulSoup(text, "html.parser").get_text(separator=" ")followed byhtml.unescape(). - Remove base64 content:
re.sub(r"data:[^;]+;base64,[A-Za-z0-9+/=]+", "", text). - Strip inline CSS:
re.sub(r"style=\"[^\"]*\"", "", text)before HTML parsing. - Retrain or fine-tune the model on data preprocessed with the production pipeline to ensure train-time and inference-time preprocessing match exactly.
The root cause is training-serving skew: the preprocessing pipeline used during training differed from production.
Common Misconceptions
- "Removing stopwords always improves NLP models." Removing "not" from "not good" flips the sentiment. Keep negation words for sentiment tasks. For topic modeling, removing stopwords is essential.
- "Stemming and lemmatization are interchangeable." Stemming produces non-words ("studies" becomes "studi"), which is acceptable for search but breaks readability. Lemmatization produces real base forms ("studies" becomes "study").
- "Transformers do not need any preprocessing." Transformers need minimal preprocessing, but HTML tags, base64 content, and excessive whitespace should still be removed. Over-cleaning (lowercasing, removing punctuation) hurts transformer performance.
- "Spell correction improves all NLP tasks." Spell correction changes domain-specific terms, product names, and intentional misspellings (slang). Only use it when misspelling genuinely introduces noise.
Quiz
1. Why should you keep punctuation for sentiment analysis but remove it for topic modeling?
Punctuation carries emotional signal in sentiment ("amazing!!!" vs "amazing"), but adds noise in topic modeling where only content words matter for identifying themes.
2. What is the difference between character n-grams and word n-grams?
Character n-grams are substrings of N characters ("hello" with n=3 produces "hel", "ell", "llo"). Word n-grams are sequences of N consecutive words. Character n-grams are useful for language detection and fuzzy matching; word n-grams capture phrases.
3. Why does spaCy's lemmatizer outperform WordNet's?
spaCy uses trained statistical models that understand context, so "better" as an adjective lemmatizes to "good" but "better" as a verb lemmatizes to "better." WordNet relies on explicit POS tags that may be wrong.
4. When should you use subword tokenization (BPE) instead of word tokenization?
When handling out-of-vocabulary words, morphologically rich languages, or training/using transformer models. BPE splits unknown words into known subword pieces, ensuring every input can be tokenized.
5. Why is langdetect unreliable on short texts?
Language detection relies on character n-gram frequency distributions, which need enough text to produce stable statistics. Short texts (under 20 characters) have too few n-grams for reliable classification.
One-Liner Summary: Text preprocessing is task-dependent: sentiment analysis needs negations and emphasis, topic modeling needs only content words, and transformers need almost nothing -- the wrong pipeline for the wrong task silently destroys model performance.