Text processing, sentiment analysis, LLMs, and NLP frameworks. Use for text classification, named entity recognition, or language modeling.
Provides NLP tools for text preprocessing, sentiment analysis, named entity recognition, and classification using libraries such as NLTK, spaCy, and Hugging Face transformers. Use it when you need to process, analyze, or generate text: cleaning documents, extracting entities, or training and fine-tuning language models.
/plugin marketplace add pluginagentmarketplace/custom-plugin-ai-data-scientist
/plugin install ai-data-scientist-plugin@pluginagentmarketplace-ai-data-scientist
This skill inherits all available tools. When active, it can use any tool Claude has access to.
assets/nlp_config.yaml
references/NLP_MODELS_GUIDE.md
scripts/text_preprocessor.py
Process, analyze, and understand text data with modern NLP techniques.
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# One-time setup: nltk.download('punkt'), nltk.download('stopwords'), nltk.download('wordnet')
# (newer NLTK versions may also require 'punkt_tab' for word_tokenize)
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return ' '.join(tokens)
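A quick sanity check on a sample sentence (the NLTK downloads above must have run; note that lemmatize defaults to noun POS, so "running" is kept as-is):

sample = "The quick brown foxes were running faster than the dogs!"
print(preprocess_text(sample))
# -> quick brown fox running faster dog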
from transformers import pipeline
# Pre-trained model
sentiment_analyzer = pipeline("sentiment-analysis")
result = sentiment_analyzer("I love this product!")
# [{'label': 'POSITIVE', 'score': 0.9998}]
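# The pipeline also accepts a list of strings for batch scoring:
results = sentiment_analyzer(["Great support team.", "The update broke everything."])
# -> one {'label': ..., 'score': ...} dict per input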
# Custom model (documents: list of raw training strings, labels: matching class labels)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(documents)
model = LogisticRegression()
model.fit(X, labels)
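Scoring new text reuses the fitted vectorizer; a minimal sketch with made-up inputs:

new_docs = ["This is terrible", "Absolutely fantastic"]
X_new = vectorizer.transform(new_docs)  # transform, not fit_transform
print(model.predict(X_new))             # predicted labels
print(model.predict_proba(X_new))       # class probabilities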
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2,            # ignore terms appearing in fewer than 2 documents
    max_df=0.8           # ignore terms appearing in more than 80% of documents
)
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()
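To inspect which terms dominate a given document, the sparse row can be densified and sorted; a small sketch:

import numpy as np
row = X[0].toarray().ravel()          # TF-IDF weights for the first document
top_idx = np.argsort(row)[-5:][::-1]  # indices of the 5 highest weights
print([(feature_names[i], round(row[i], 3)) for i in top_idx])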
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple Inc. was founded by Steve Jobs in California.")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")
# Apple Inc.: ORG
# Steve Jobs: PERSON
# California: GPE
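The entities can be collected into structured form (with character offsets) for downstream use:

entities = [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]
# [('Apple Inc.', 'ORG', 0, 10), ('Steve Jobs', 'PERSON', 26, 36), ...]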
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    Trainer, TrainingArguments
)
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)
# Tokenize (`dataset` is assumed to be a Hugging Face datasets.DatasetDict with a 'text' column)
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128
    )
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Train
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy='epoch'  # newer transformers versions rename this to eval_strategy
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test']
)
trainer.train()
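After training, the Trainer reports metrics on the eval split, and the fine-tuned model can classify new text directly; a minimal PyTorch sketch (the example sentence is made up):

import torch

metrics = trainer.evaluate()  # eval loss (plus any compute_metrics output)
inputs = tokenizer("This movie was wonderful", return_tensors='pt', truncation=True)
inputs = {k: v.to(model.device) for k, v in inputs.items()}  # match the model's device
with torch.no_grad():
    logits = model(**inputs).logits
pred = torch.argmax(logits, dim=-1).item()  # 0 or 1 for num_labels=2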
from transformers import GPT2LMHeadModel, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
input_text = "The future of AI is"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(
    input_ids,
    max_length=50,
    num_return_sequences=1,
    do_sample=True,  # required for temperature/top_k/top_p to take effect
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id  # GPT-2 has no pad token; silences a warning
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
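The same loop is available through the higher-level text-generation pipeline, which handles tokenization and decoding; a brief sketch:

from transformers import pipeline
generator = pipeline("text-generation", model="gpt2")
print(generator("The future of AI is", max_new_tokens=40, do_sample=True, top_p=0.95)[0]['generated_text'])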
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1000, max_df=0.8, min_df=2)
X = vectorizer.fit_transform(documents)
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)
# Display topics
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_words = [feature_names[i] for i in topic.argsort()[-10:]]
    print(f"Topic {topic_idx}: {', '.join(top_words)}")
from gensim.models import Word2Vec
# Train Word2Vec
sentences = [word_tokenize(doc) for doc in documents]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
# Get vector
vector = model.wv['king']
# Find similar words
similar = model.wv.most_similar('king', topn=5)
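Pairwise similarity and the classic analogy query use the same API (assuming 'queen', 'man', and 'woman' occur in the training corpus; results are only meaningful on reasonably large corpora):

print(model.wv.similarity('king', 'queen'))  # cosine similarity in [-1, 1]
# Analogy: king - man + woman ≈ queen on large corpora
print(model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=3))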
Common model choices by task (all demonstrated above):
Text Classification: a fine-tuned BERT-style encoder, or TF-IDF with LogisticRegression as a fast baseline
Sequence Labeling: spaCy pipelines or transformer token-classification models (e.g. NER)
Generation: causal language models such as GPT-2 (see the pipeline sketch below)
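Each of these tasks also has a ready-made transformers pipeline; a minimal sketch (the default checkpoints are downloaded on first use):

from transformers import pipeline

classifier = pipeline("sentiment-analysis")              # text classification
tagger = pipeline("ner", aggregation_strategy="simple")  # sequence labeling
generator = pipeline("text-generation", model="gpt2")    # generation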
This skill should be used when the user asks to "create an agent", "add an agent", "write a subagent", "agent frontmatter", "when to use description", "agent examples", "agent tools", "agent colors", "autonomous agent", or needs guidance on agent structure, system prompts, triggering conditions, or agent development best practices for Claude Code plugins.
This skill should be used when the user asks to "create a slash command", "add a command", "write a custom command", "define command arguments", "use command frontmatter", "organize commands", "create command with file references", "interactive command", "use AskUserQuestion in command", or needs guidance on slash command structure, YAML frontmatter fields, dynamic arguments, bash execution in commands, user interaction patterns, or command development best practices for Claude Code.
This skill should be used when the user asks to "create a hook", "add a PreToolUse/PostToolUse/Stop hook", "validate tool use", "implement prompt-based hooks", "use ${CLAUDE_PLUGIN_ROOT}", "set up event-driven automation", "block dangerous commands", or mentions hook events (PreToolUse, PostToolUse, Stop, SubagentStop, SessionStart, SessionEnd, UserPromptSubmit, PreCompact, Notification). Provides comprehensive guidance for creating and implementing Claude Code plugin hooks with focus on advanced prompt-based hooks API.