Welcome to this comprehensive guide on text preprocessing with NLTK (Natural Language Toolkit)! 🚀 This notebook will walk you through various essential text preprocessing techniques, explained in simple terms with easy-to-follow code examples. Whether you're just starting out in NLP (Natural Language Processing) or looking to brush up on your skills, you're in the right place!
NLTK provides a powerful suite of tools for processing and analyzing unstructured text data. Let’s dive into the essential preprocessing steps:
Tokenization splits text into individual words or sentences.
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
text = "Hello World. This is NLTK. It is great for text processing."
sentences = sent_tokenize(text)
print("Sentence Tokenization:", sentences)
from nltk.tokenize import word_tokenize
words = word_tokenize(text)
print("Word Tokenization:", words)
Stop words are common words that might not be useful for analysis (e.g., "is", "the", "and").
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]
print("Filtered Words:", filtered_words)
Stemming reduces words to their root form by heuristically chopping off word endings, so the result is not always a real dictionary word.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_words]
print("Stemmed Words:", stemmed_words)
Lemmatization reduces words to their dictionary base form (lemma), taking the word's part of speech and meaning into account, so the output is typically a valid word.
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
print("Lemmatized Words:", lemmatized_words)
Tagging words with their parts of speech (POS) helps understand grammatical structure.
nltk.download('averaged_perceptron_tagger')
pos_tags = nltk.pos_tag(lemmatized_words)
print("POS Tags:", pos_tags)
Identify named entities such as names of people, organizations, and locations.
%pip install numpy
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.chunk import ne_chunk
named_entities = ne_chunk(pos_tags)
print("Named Entities:", named_entities)
Count the frequency of each word in the text.
from nltk.probability import FreqDist
freq_dist = FreqDist(lemmatized_words)
print("Most Common Words:", freq_dist.most_common(5))
Remove punctuation from the text.
import string
no_punct = [word for word in lemmatized_words if word not in string.punctuation]
print("Words Without Punctuation:", no_punct)
Convert all words to lowercase.
lowercased = [word.lower() for word in no_punct]
print("Lowercased Words:", lowercased)
Correct the spelling of words.
%pip install pyspellchecker
from nltk.corpus import wordnet
from spellchecker import SpellChecker
spell = SpellChecker()
def correct_spelling(word):
    # Consult the spell checker only for words WordNet does not recognize
    if not wordnet.synsets(word):
        return spell.correction(word)
    return word
lemmatized_words = ['hello', 'world', '.', 'klown', 'taxt', 'procass', '.']
words_with_corrected_spelling = [correct_spelling(word) for word in lemmatized_words]
print("Words with Corrected Spelling:", words_with_corrected_spelling)
Remove numerical values from the text.
lemmatized_words = ['hello', 'world', '88', 'text', 'process', '.']
no_numbers = [word for word in lemmatized_words if not word.isdigit()]
print("Words Without Numbers:", no_numbers)
Replace specific words with other words (e.g., slang with formal words).
lemmatized_words = ['hello', 'world', 'gr8', 'text', 'NLTK', '.']
replacements = {'NLTK': 'Natural Language Toolkit', 'gr8': 'great'}
replaced_words = [replacements.get(word, word) for word in lemmatized_words]
print("Words with Replacements:", replaced_words)
Replace words with their synonyms.
from nltk.corpus import wordnet
lemmatized_words = ['hello', 'world', 'awesome', 'text', 'great', '.']
def get_synonym(word):
    # Replace the word with the first lemma of its first WordNet synset, if any
    synonyms = wordnet.synsets(word)
    if synonyms:
        return synonyms[0].lemmas()[0].name()
    return word
synonym_replaced = [get_synonym(word) for word in lemmatized_words]
print("Words with Synonyms:", synonym_replaced)
Extract bigrams (pairs of consecutive words) and trigrams (triplets of consecutive words).
from nltk import bigrams, trigrams
bigrams_list = list(bigrams(lemmatized_words))
print("Bigrams:", bigrams_list)
trigrams_list = list(trigrams(lemmatized_words))
print("Trigrams:", trigrams_list)
Split text into sentences while considering abbreviations and other punctuation complexities.
import nltk.data
text = 'Hello World. This is NLTK. It is great for text preprocessing.'
# Load the sentence tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Tokenize the text into sentences
sentences = tokenizer.tokenize(text)
# Print the tokenized sentences
print("Segmented Sentences:", sentences)
Identify and display the frequency of words in a text.
from nltk.probability import FreqDist
lemmatized_words = ['hello', 'hello', 'awesome', 'text', 'great', '.', '.', '.']
word_freq = FreqDist(lemmatized_words)
for word, freq in word_freq.items():
    print(f"{word}: {freq}")
Remove HTML tags from the text.
%pip install beautifulsoup4
from bs4 import BeautifulSoup
html_text = "<p>Hello World. This is NLTK.</p>"
soup = BeautifulSoup(html_text, "html.parser")
cleaned_text = soup.get_text()
print("Cleaned Text:", cleaned_text)
Detect the language of the text.
%pip install langdetect
from langdetect import detect
language = detect(text)
print("Detected Language:", language) # `en` (for English)
Use Regular Expressions to tokenize text.
text = 'Hello World. This is NLTK. It is great for text preprocessing.'
from nltk.tokenize import regexp_tokenize
pattern = r'\w+'
regex_tokens = regexp_tokenize(text, pattern)
print("Regex Tokens:", regex_tokens)
Remove frequent words (high-frequency words) from a list of tokens.
import nltk
# Input text
text = "Natural language processing is a field of AI. I love AI."
# Tokenize the text
tokens = nltk.word_tokenize(text)
# Calculate the frequency of each word
fdist = nltk.FreqDist(tokens)
# Keep only tokens whose count is below 10% of the total number of tokens
filtered_tokens = [token for token in tokens if fdist[token] < fdist.N() * 0.1]
print("Tokens Without Frequent Words:", filtered_tokens)
Tokenize the input string into individual sentences and remove leading or trailing whitespace from each sentence.
import nltk.data
# Text data
text = 'Hello World. This is NLTK. It is great for text preprocessing.'
# Load the sentence tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Tokenize the text into sentences
sentences = tokenizer.tokenize(text)
# Remove extra whitespace from each sentence
sentences = [sentence.strip() for sentence in sentences]
# Print the tokenized sentences
print("Sentences Without Extra Whitespace:", sentences)
Text preprocessing is a crucial step in natural language processing (NLP) and can significantly impact the performance of your models and applications. With NLTK, you have a powerful toolset that simplifies and streamlines these tasks.
Happy coding! 💻