Process and analyze text using modern NLP techniques - preprocessing, embeddings, and transformers
Processes and analyzes text using NLP techniques like preprocessing, embeddings, and transformers. Triggers when you need to extract insights from unstructured text, perform semantic search, or classify content.
/plugin marketplace add pluginagentmarketplace/custom-plugin-machine-learning
/plugin install machine-learning-assistant@pluginagentmarketplace-machine-learning

This skill inherits all available tools. When active, it can use any tool Claude has access to.
assets/config.yaml
assets/schema.json
references/GUIDE.md
references/PATTERNS.md
scripts/validate.py

Transform unstructured text into structured insights.
```python
from transformers import AutoTokenizer, AutoModel
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Tokenize
text = "Machine learning is transforming industries."
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

# Get embeddings
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # mean pooling over tokens

print(f"Embedding shape: {embeddings.shape}")
```
```python
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

class TextPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean(self, text):
        # Lowercase
        text = text.lower()
        # Remove URLs
        text = re.sub(r'http\S+', '', text)
        # Remove special chars
        text = re.sub(r'[^\w\s]', '', text)
        # Tokenize and filter
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(t) for t in tokens
                  if t not in self.stop_words]
        return ' '.join(tokens)
```
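The tokenizer, stopword list, and lemmatizer each rely on downloadable NLTK resources. A quick setup-and-usage sketch (resource names are the standard NLTK identifiers; the sample sentence and expected output are illustrative):

```python
import nltk

# One-time downloads for tokenization, stopwords, and lemmatization.
nltk.download('punkt')
nltk.download('punkt_tab')   # required by word_tokenize on newer NLTK releases
nltk.download('stopwords')
nltk.download('wordnet')

preprocessor = TextPreprocessor()
print(preprocessor.clean("Dogs are barking at https://example.com!"))
# roughly: "dog barking"
```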
| Type | Model | Use Case |
|---|---|---|
| Static | Word2Vec, GloVe | Simple, fast |
| Contextual | BERT, RoBERTa | SOTA accuracy |
| Sentence | all-MiniLM | Similarity, search |
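For the static row, pretrained GloVe vectors can be loaded through gensim's downloader; a minimal sketch, assuming the published `glove-wiki-gigaword-100` model name. The fenced example after it covers the sentence-embedding row.

```python
import gensim.downloader as api

# Load 100-dimensional GloVe vectors (Wikipedia + Gigaword).
glove = api.load('glove-wiki-gigaword-100')

# Static embeddings: one fixed vector per word, independent of context.
vector = glove['language']                      # shape (100,)
print(glove.most_similar('language', topn=3))   # nearest neighbors by cosine
```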
```python
from sentence_transformers import SentenceTransformer

# Sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(['Hello world', 'Hi there'])

# Similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
```
```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

# Load pretrained model
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)
trainer.train()
```
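The `train_dataset` and `eval_dataset` above are assumed to already exist. One way to build them with the Hugging Face `datasets` library, e.g. for the IMDB exercise later in this skill (column names follow the public `imdb` dataset; subset sizes are arbitrary):

```python
from datasets import load_dataset

# Load IMDB and tokenize the review text; labels are already 0/1.
raw = load_dataset('imdb')

def tokenize_batch(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=256)

tokenized = raw.map(tokenize_batch, batched=True)

# Small subsets keep the example fast; drop .select() for a full run.
train_dataset = tokenized['train'].shuffle(seed=42).select(range(2000))
eval_dataset = tokenized['test'].shuffle(seed=42).select(range(500))
```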
```python
from transformers import pipeline

# NER pipeline
ner = pipeline('ner', aggregation_strategy='simple')

text = "Apple Inc. was founded by Steve Jobs in Cupertino, California."
entities = ner(text)

for entity in entities:
    print(f"{entity['word']}: {entity['entity_group']} ({entity['score']:.2f})")
```
```python
from sentence_transformers import SentenceTransformer
import numpy as np

class SemanticSearch:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.corpus_embeddings = None
        self.corpus = None

    def index(self, documents):
        self.corpus = documents
        # Normalize so the dot product below equals cosine similarity
        self.corpus_embeddings = self.model.encode(documents, normalize_embeddings=True)

    def search(self, query, top_k=5):
        query_embedding = self.model.encode([query], normalize_embeddings=True)[0]
        scores = np.dot(self.corpus_embeddings, query_embedding)
        top_indices = np.argsort(scores)[-top_k:][::-1]
        return [(self.corpus[i], scores[i]) for i in top_indices]
```
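A quick usage sketch of the class above, with a few toy documents (document texts are illustrative only):

```python
search = SemanticSearch()
search.index([
    "Transformers use self-attention to model token interactions.",
    "Gradient boosting works well on tabular data.",
    "BERT produces contextual word embeddings.",
])

# Top matches for a natural-language query, with cosine scores.
for doc, score in search.search("contextual embeddings from transformers", top_k=2):
    print(f"{score:.3f}  {doc}")
```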
```python
# TODO: Fine-tune BERT for sentiment classification
# Use the IMDB dataset

# TODO: Build a semantic search engine
# Index 1000 documents and search by query
```
```python
import pytest
from sentence_transformers import SentenceTransformer

def test_preprocessing():
    """Test text preprocessing."""
    preprocessor = TextPreprocessor()
    text = "Hello World! Visit https://example.com"
    cleaned = preprocessor.clean(text)
    assert 'http' not in cleaned
    assert cleaned.islower()

def test_embeddings_shape():
    """Test embedding dimensions."""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(['test'])
    assert embeddings.shape == (1, 384)
```
| Problem | Cause | Solution |
|---|---|---|
| OOV tokens | Rare words | Use subword tokenization |
| Slow inference | Large model | Use distilled model |
| Poor accuracy | Small dataset | Data augmentation |
| Memory error | Long sequences | Reduce max_length |
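Two of the fixes above in code form, as a sketch: swapping in a distilled checkpoint for faster inference and capping sequence length to bound memory. The model names are standard Hugging Face Hub checkpoints.

```python
from transformers import pipeline, AutoTokenizer

# Distilled model: smaller and faster than bert-base, with a small accuracy cost.
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english'
)
print(classifier("Fast and accurate enough for most workloads."))

# A shorter max_length keeps long documents from exhausting memory.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
inputs = tokenizer(["a very long document ..."], truncation=True, max_length=128, return_tensors='pt')
```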
Version: 1.4.0 | Status: Production Ready