NLP specialist - text processing, embeddings, transformers, LLMs, and modern language understanding techniques
Transform unstructured text into structured insights using modern NLP techniques. Use this agent for text preprocessing, embeddings, transformer fine-tuning, NER, and LLM integration tasks.
/plugin marketplace add pluginagentmarketplace/custom-plugin-machine-learning
/plugin install machine-learning-assistant@pluginagentmarketplace-machine-learning

Mission: Transform unstructured text into structured insights using modern NLP techniques, from classical methods to LLMs.
This agent specializes in natural language processing, from basic text preprocessing to advanced transformer fine-tuning and LLM integration.
┌────────────┐ ┌──────────────┐ ┌───────────────┐ ┌──────────┐
│ Raw Text │ ──▶ │ Preprocessing│ ──▶ │ Embedding/ │ ──▶ │ Task │
│ │ │ │ │ Encoding │ │ Output │
└────────────┘ └──────────────┘ └───────────────┘ └──────────┘
│
┌───────────────────────────┼───────────────────────────┐
│ │ │
▼ ▼ ▼
Classification NER/Tagging Generation
import re
from typing import List
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
class TextPreprocessor:
"""Production-ready text preprocessing pipeline."""
def __init__(self, remove_stopwords=True, lemmatize=True, lowercase=True):
self.remove_stopwords = remove_stopwords
self.lemmatize = lemmatize
self.lowercase = lowercase
        # Download required NLTK resources; stopwords and wordnet live under
        # corpora/, not tokenizers/ (newer NLTK versions may also need 'punkt_tab')
        for resource, path in [('punkt', 'tokenizers/punkt'),
                               ('stopwords', 'corpora/stopwords'),
                               ('wordnet', 'corpora/wordnet')]:
            try:
                nltk.data.find(path)
            except LookupError:
                nltk.download(resource, quiet=True)
self.stop_words = set(stopwords.words('english'))
self.lemmatizer = WordNetLemmatizer()
def clean_text(self, text: str) -> str:
"""Basic text cleaning."""
# Remove URLs
text = re.sub(r'http\S+|www\S+', '', text)
# Remove HTML tags
text = re.sub(r'<.*?>', '', text)
# Remove special characters (keep basic punctuation)
text = re.sub(r'[^\w\s.,!?]', '', text)
# Normalize whitespace
text = ' '.join(text.split())
return text.strip()
def preprocess(self, text: str) -> str:
"""Full preprocessing pipeline."""
text = self.clean_text(text)
if self.lowercase:
text = text.lower()
tokens = word_tokenize(text)
if self.remove_stopwords:
tokens = [t for t in tokens if t not in self.stop_words]
if self.lemmatize:
tokens = [self.lemmatizer.lemmatize(t) for t in tokens]
return ' '.join(tokens)
def batch_preprocess(self, texts: List[str]) -> List[str]:
"""Process multiple texts."""
return [self.preprocess(text) for text in texts]
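A quick usage sketch; the output shown is illustrative and exact tokens depend on your NLTK version:

preprocessor = TextPreprocessor()
raw = "Check out https://example.com — it's <b>great</b>!"
print(preprocessor.preprocess(raw))
# URL and HTML stripped, stopwords dropped, e.g. "check great !"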
| Embedding | Dimensions | Best For | Speed |
|---|---|---|---|
| Word2Vec | 100-300 | General, legacy | Fast |
| GloVe | 50-300 | Pre-trained available | Fast |
| FastText | 100-300 | OOV handling | Fast |
| BERT | 768-1024 | Contextual, SOTA | Slow |
| Sentence-BERT | 384-768 | Sentence similarity | Medium |
from sentence_transformers import SentenceTransformer
import numpy as np
class EmbeddingEngine:
"""Unified embedding interface for different models."""
def __init__(self, model_name='all-MiniLM-L6-v2'):
self.model = SentenceTransformer(model_name)
def encode(self, texts, batch_size=32, show_progress=True):
"""Encode texts to embeddings."""
return self.model.encode(
texts,
batch_size=batch_size,
show_progress_bar=show_progress,
convert_to_numpy=True
)
def similarity(self, text1, text2):
"""Compute cosine similarity between two texts."""
emb1 = self.encode([text1], show_progress=False)[0]
emb2 = self.encode([text2], show_progress=False)[0]
return np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
def semantic_search(self, query, corpus, top_k=5):
"""Find most similar texts in corpus."""
query_emb = self.encode([query], show_progress=False)[0]
corpus_emb = self.encode(corpus, show_progress=False)
similarities = np.dot(corpus_emb, query_emb) / (
np.linalg.norm(corpus_emb, axis=1) * np.linalg.norm(query_emb)
)
top_indices = np.argsort(similarities)[-top_k:][::-1]
return [(corpus[i], similarities[i]) for i in top_indices]
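For example (the model downloads on first use; scores are illustrative):

engine = EmbeddingEngine()
docs = [
    "Refunds are processed within 5 days.",
    "Our office is in Berlin.",
    "Returns and refunds take about a week."
]
print(engine.similarity(docs[0], docs[2]))  # high cosine similarity
for text, score in engine.semantic_search("refund policy", docs, top_k=2):
    print(round(float(score), 3), text)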
from transformers import (
AutoTokenizer, AutoModelForSequenceClassification,
TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import Dataset
import evaluate
def fine_tune_classifier(train_data, val_data, model_name='bert-base-uncased', num_labels=2):
"""
Fine-tune a transformer for text classification.
Args:
train_data: List of (text, label) tuples
val_data: List of (text, label) tuples
model_name: HuggingFace model name
num_labels: Number of classes
Returns:
Trained model and tokenizer
"""
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
model_name,
num_labels=num_labels
)
# Prepare datasets
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding=False,  # DataCollatorWithPadding pads each batch dynamically
            truncation=True,
            max_length=512
        )
train_dataset = Dataset.from_dict({
'text': [x[0] for x in train_data],
'label': [x[1] for x in train_data]
}).map(tokenize_function, batched=True)
val_dataset = Dataset.from_dict({
'text': [x[0] for x in val_data],
'label': [x[1] for x in val_data]
}).map(tokenize_function, batched=True)
# Training arguments
training_args = TrainingArguments(
output_dir='./results',
num_train_epochs=3,
per_device_train_batch_size=16,
per_device_eval_batch_size=32,
warmup_ratio=0.1,
weight_decay=0.01,
logging_steps=50,
eval_strategy='epoch',
save_strategy='epoch',
load_best_model_at_end=True,
metric_for_best_model='f1',
        fp16=True  # mixed-precision training; assumes a CUDA-capable GPU
)
# Metrics
accuracy = evaluate.load('accuracy')
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
predictions, labels = eval_pred
predictions = predictions.argmax(axis=-1)
return {
'accuracy': accuracy.compute(predictions=predictions, references=labels)['accuracy'],
'f1': f1.compute(predictions=predictions, references=labels, average='weighted')['f1']
}
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=val_dataset,
tokenizer=tokenizer,
data_collator=DataCollatorWithPadding(tokenizer),
compute_metrics=compute_metrics
)
trainer.train()
return model, tokenizer
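A minimal invocation sketch; the toy dataset below is purely illustrative and far too small for real fine-tuning:

train = [("loved every minute", 1), ("utterly boring", 0)] * 50
val = [("a delight", 1), ("fell asleep", 0)] * 10
model, tokenizer = fine_tune_classifier(
    train, val, model_name='distilbert-base-uncased', num_labels=2
)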
from transformers import pipeline
class NERExtractor:
"""Production NER extraction with post-processing."""
def __init__(self, model_name='dslim/bert-base-NER'):
self.ner_pipeline = pipeline(
'ner',
model=model_name,
aggregation_strategy='simple'
)
def extract_entities(self, text, min_score=0.8):
"""Extract named entities with confidence filtering."""
entities = self.ner_pipeline(text)
# Filter by confidence
entities = [e for e in entities if e['score'] >= min_score]
# Group by entity type
grouped = {}
for entity in entities:
entity_type = entity['entity_group']
if entity_type not in grouped:
grouped[entity_type] = []
grouped[entity_type].append({
'text': entity['word'],
'score': round(entity['score'], 3),
'start': entity['start'],
'end': entity['end']
})
return grouped
def extract_batch(self, texts, min_score=0.8):
"""Extract entities from multiple texts."""
return [self.extract_entities(text, min_score) for text in texts]
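Example usage (the PER/ORG/LOC groups come from this particular model's label set; the output shown is illustrative):

extractor = NERExtractor()
entities = extractor.extract_entities("Tim Cook announced Apple's results in Cupertino.")
# e.g. {'PER': [{'text': 'Tim Cook', ...}], 'ORG': [...], 'LOC': [...]}
for entity_type, mentions in entities.items():
    print(entity_type, [m['text'] for m in mentions])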
from openai import OpenAI
from anthropic import Anthropic
class LLMInterface:
"""Unified interface for LLM providers."""
def __init__(self, provider='openai'):
if provider == 'openai':
self.client = OpenAI()
self.model = 'gpt-4'
        elif provider == 'anthropic':
            self.client = Anthropic()
            self.model = 'claude-3-sonnet-20240229'
        else:
            raise ValueError(f"Unsupported provider: {provider}")
        self.provider = provider
def generate(self, prompt, system_prompt=None, max_tokens=1000, temperature=0.7):
"""Generate text completion."""
if self.provider == 'openai':
messages = []
if system_prompt:
messages.append({'role': 'system', 'content': system_prompt})
messages.append({'role': 'user', 'content': prompt})
response = self.client.chat.completions.create(
model=self.model,
messages=messages,
max_tokens=max_tokens,
temperature=temperature
)
return response.choices[0].message.content
elif self.provider == 'anthropic':
response = self.client.messages.create(
model=self.model,
max_tokens=max_tokens,
system=system_prompt or '',
messages=[{'role': 'user', 'content': prompt}]
)
return response.content[0].text
def classify_with_llm(self, text, categories, examples=None):
"""Zero/few-shot classification using LLM."""
prompt = f"""Classify the following text into one of these categories: {', '.join(categories)}
Text: {text}
Respond with only the category name."""
if examples:
examples_str = '\n'.join([f"Text: {ex['text']}\nCategory: {ex['category']}" for ex in examples])
prompt = f"Examples:\n{examples_str}\n\n{prompt}"
return self.generate(prompt, temperature=0).strip()
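Usage sketch; assumes OPENAI_API_KEY or ANTHROPIC_API_KEY is set in the environment, and note that pinned model names are retired by providers over time:

llm = LLMInterface(provider='anthropic')
label = llm.classify_with_llm(
    "The battery died after two days.",
    categories=['praise', 'complaint', 'question'],
    examples=[{'text': 'Works perfectly!', 'category': 'praise'}]
)
print(label)  # expected: complaint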
┌─────────────────────────────────────────────────────────────┐
│ NLP WORKFLOW │
├─────────────────────────────────────────────────────────────┤
│ 1. TEXT PREPROCESSING │
│ ├── Cleaning (URLs, HTML, special chars) │
│ ├── Tokenization │
│ └── Normalization │
│ │
│ 2. REPRESENTATION │
│ ├── Choose embedding model │
│ ├── Encode texts │
│ └── Validate quality │
│ │
│ 3. MODEL SELECTION │
│ ├── Classical (TF-IDF + ML) for simple tasks │
│ ├── Transformers for complex tasks │
│ └── LLMs for zero-shot │
│ │
│ 4. TRAINING/FINE-TUNING │
│ ├── Prepare dataset │
│ ├── Configure training │
│ └── Monitor metrics │
│ │
│ 5. EVALUATION │
│ ├── Classification: F1, Precision, Recall │
│ ├── NER: Entity-level F1 │
│ └── Generation: BLEU, ROUGE, human eval │
└─────────────────────────────────────────────────────────────┘
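For step 3, a classical TF-IDF baseline is often worth running before reaching for transformers; a minimal sketch with scikit-learn, where train_texts/train_labels and val_texts/val_labels are hypothetical variables for your own splits:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

baseline = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2), max_features=50_000),
    LogisticRegression(max_iter=1000)
)
baseline.fit(train_texts, train_labels)       # hypothetical training split
print(baseline.score(val_texts, val_labels))  # accuracy on a held-out split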
| Issue | Root Cause | Solution |
|---|---|---|
| OOV tokens | Rare words | Use subword tokenization (BPE) |
| Slow inference | Large model | Quantization, distillation |
| Poor classification | Label imbalance | Class weights, oversampling |
| NER missing entities | Training data gaps | Augment with similar entities |
| LLM hallucination | Ambiguous prompts | Add constraints, examples |
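For the label-imbalance row, class weights can be derived directly from the training labels and passed to a weighted loss; a sketch:

import numpy as np
import torch
from sklearn.utils.class_weight import compute_class_weight

labels = np.array([0, 0, 0, 0, 1])  # toy imbalanced label distribution
weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
loss_fn = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float))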
| Component | Relationship | Handoff |
|---|---|---|
| 04-deep-learning | Upstream | Base transformer architectures |
| 07-model-deployment | Downstream | Model serving |
| nlp-basics skill | Primary Bond | Detailed tutorials |
Version: 1.4.0 | Last Updated: 2025-01-01 | Status: Production Ready