PyTorch, TensorFlow, neural networks, CNNs, transformers, and deep learning for production
Provides production-grade PyTorch code for training neural networks, implementing architectures like transformers and CNNs, and exporting models to ONNX/TorchScript. Use when users need to build, train, or deploy deep learning models.
This skill inherits all available tools. When active, it can use any tool Claude has access to.
Bundled files: assets/config.yaml, assets/schema.json, references/GUIDE.md, references/PATTERNS.md, scripts/validate.py

Production-grade deep learning with PyTorch, neural network architectures, and modern training practices.
```python
# PyTorch Production Training Loop
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import wandb  # optional experiment tracking

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size: int, d_model: int = 256, n_heads: int = 8, n_classes: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, dim_feedforward=1024, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
        self.classifier = nn.Linear(d_model, n_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        x = self.dropout(x)
        x = self.transformer(x, src_key_padding_mask=mask)
        x = x.mean(dim=1)  # Global average pooling
        return self.classifier(x)

# Training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size=30000).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
criterion = nn.CrossEntropyLoss()

# Training loop with mixed precision
# (train_loader: a DataLoader yielding {"input_ids", "labels"} batches, defined elsewhere)
scaler = torch.cuda.amp.GradScaler()
for epoch in range(10):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(batch["input_ids"].to(device))
            loss = criterion(logits, batch["labels"].to(device))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    scheduler.step()  # CosineAnnealingLR with T_max=10 steps once per epoch
```
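The loop above covers only the training pass. A per-epoch validation and checkpointing step is usually added alongside it; the sketch below is one reasonable way to wire that in, assuming a `val_loader` DataLoader with the same batch format (it is not part of the original snippet).

```python
# Hypothetical validation/checkpoint step, run at the end of each epoch.
# Assumes val_loader yields the same {"input_ids", "labels"} batches as train_loader.
best_val_loss = float("inf")

def evaluate(model, val_loader):
    model.eval()
    total_loss, total_correct, total_samples = 0.0, 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            logits = model(input_ids)
            total_loss += criterion(logits, labels).item() * labels.size(0)
            total_correct += (logits.argmax(dim=-1) == labels).sum().item()
            total_samples += labels.size(0)
    return total_loss / total_samples, total_correct / total_samples

val_loss, val_acc = evaluate(model, val_loader)
if val_loss < best_val_loss:
    best_val_loss = val_loss
    torch.save({"model": model.state_dict(), "optimizer": optimizer.state_dict()}, "checkpoint.pt")
```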
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """Residual block with skip connection."""
    def __init__(self, channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        residual = x
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        return F.relu(x + residual)

class AttentionBlock(nn.Module):
    """Multi-head self-attention with a feed-forward sub-layer."""
    def __init__(self, d_model: int, n_heads: int = 8):
        super().__init__()
        self.attention = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norm = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        attn_out, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.norm(x + attn_out)
        return self.norm2(x + self.ffn(x))
```
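Neither block is exercised in the snippet above; a quick smoke test with dummy tensors (shapes chosen here purely for illustration) confirms the expected input and output contracts.

```python
# Illustrative shapes only: a 4-image batch of 64-channel 32x32 feature maps,
# and a 4-sequence batch of length 128 with d_model=256.
res_block = ResidualBlock(channels=64)
attn_block = AttentionBlock(d_model=256, n_heads=8)

feature_maps = torch.randn(4, 64, 32, 32)
tokens = torch.randn(4, 128, 256)

print(res_block(feature_maps).shape)   # torch.Size([4, 64, 32, 32])
print(attn_block(tokens).shape)        # torch.Size([4, 128, 256])
```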
```python
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR

# Gradient clipping and accumulation
# Assumes `scaler` is the global torch.cuda.amp.GradScaler from the training setup above,
# and that the model returns a loss directly for the given batch.
def train_epoch(model, loader, optimizer, accumulation_steps=4):
    model.train()
    optimizer.zero_grad()
    for i, batch in enumerate(loader):
        with torch.cuda.amp.autocast():
            loss = model(batch) / accumulation_steps  # scale loss for accumulation
        scaler.scale(loss).backward()
        if (i + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)  # unscale before clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
```
```python
# Early stopping
class EarlyStopping:
    def __init__(self, patience: int = 5, min_delta: float = 0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')

    def __call__(self, val_loss: float) -> bool:
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience
```
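A typical way to wire EarlyStopping into the epoch loop, assuming a `compute_val_loss` helper (not defined in this skill) that returns the mean validation loss:

```python
# Hypothetical usage: stop when validation loss stops improving.
early_stopping = EarlyStopping(patience=5, min_delta=0.001)

for epoch in range(100):
    train_epoch(model, train_loader, optimizer)
    val_loss = compute_val_loss(model, val_loader)  # assumed helper
    if early_stopping(val_loss):
        print(f"Stopping early at epoch {epoch}, best val loss {early_stopping.best_loss:.4f}")
        break
```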
```python
# Learning rate finder
# Assumes a train_step(model, batch, optimizer) helper that runs one optimization step
# and returns the scalar loss.
def find_lr(model, loader, optimizer, start_lr=1e-7, end_lr=10, num_iter=100):
    lrs, losses = [], []
    lr_mult = (end_lr / start_lr) ** (1 / num_iter)
    for i, batch in enumerate(loader):
        if i >= num_iter:
            break
        lr = start_lr * (lr_mult ** i)
        for pg in optimizer.param_groups:
            pg['lr'] = lr
        loss = train_step(model, batch, optimizer)
        lrs.append(lr)
        losses.append(loss)
    return lrs, losses
```
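find_lr only returns raw (lr, loss) pairs. A common heuristic, sketched below with matplotlib (the plotting code is illustrative, not part of the skill), is to plot loss against a log-scaled learning rate and pick a value roughly an order of magnitude below the point where the loss blows up.

```python
import matplotlib.pyplot as plt

lrs, losses = find_lr(model, train_loader, optimizer)
plt.plot(lrs, losses)
plt.xscale("log")
plt.xlabel("learning rate")
plt.ylabel("loss")
plt.savefig("lr_finder.png")
```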
```python
import torch.onnx
import onnxruntime as ort

# Export to ONNX
def export_to_onnx(model, sample_input, path="model.onnx"):
    model.eval()
    torch.onnx.export(
        model,
        sample_input,
        path,
        export_params=True,
        opset_version=17,
        do_constant_folding=True,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
    )

# ONNX Runtime inference
class ONNXPredictor:
    def __init__(self, model_path: str):
        self.session = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])

    def predict(self, input_data):
        return self.session.run(None, {'input': input_data})[0]

# TorchScript for production
scripted_model = torch.jit.script(model)
scripted_model.save("model_scripted.pt")
```
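Putting the two pieces together, one possible round trip looks like this; the token shape and vocabulary size mirror the TransformerClassifier above and are illustrative only.

```python
import numpy as np

# Export a trained model on CPU, then serve it with ONNX Runtime.
sample_input = torch.randint(0, 30000, (1, 128))  # dummy token IDs matching the classifier's input
export_to_onnx(model.cpu(), sample_input, path="model.onnx")

predictor = ONNXPredictor("model.onnx")
batch = np.random.randint(0, 30000, size=(8, 128), dtype=np.int64)
logits = predictor.predict(batch)
print(logits.shape)  # (8, 2) for the 2-class head
```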
| Tool | Purpose | Version (2025) |
|---|---|---|
| PyTorch | Deep learning framework | 2.2+ |
| PyTorch Lightning | Training framework | 2.2+ |
| Hugging Face | Transformers, datasets | 4.38+ |
| ONNX Runtime | Model inference | 1.17+ |
| TensorRT | GPU optimization | 8.6+ |
| Weights & Biases | Experiment tracking | Latest |
| Ray | Distributed training | 2.9+ |
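The training loop earlier imports wandb but never logs anything. A minimal experiment-tracking sketch is shown below; the project name and config values are placeholders, and `loss`/`scheduler` refer to the variables defined in that loop.

```python
import wandb

# Hypothetical experiment-tracking setup; project name and config values are placeholders.
wandb.init(project="transformer-classifier", config={"lr": 1e-4, "weight_decay": 0.01, "epochs": 10})

# Inside the training loop above, log scalar metrics per step or per epoch:
wandb.log({"train_loss": loss.item(), "lr": scheduler.get_last_lr()[0]})

# Close the run when training finishes.
wandb.finish()
```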
| Issue | Symptoms | Root Cause | Fix |
|---|---|---|---|
| Vanishing Gradient | Loss not decreasing | Deep network, wrong activation | Use ReLU/GELU, residual connections |
| Exploding Gradient | NaN loss | Learning rate too high | Gradient clipping, lower LR |
| Overfitting | Train >> Val accuracy | Model too complex | Dropout, regularization, data aug |
| OOM Error | CUDA out of memory | Batch too large | Reduce batch, gradient accumulation |
| Slow Training | Low GPU utilization | Data loading bottleneck | More workers, prefetch |
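For the last row (data loading bottleneck), the usual first fix is tuning the DataLoader itself; the values below are common starting points, not recommendations from this skill.

```python
from torch.utils.data import DataLoader

# Illustrative DataLoader settings for keeping the GPU fed; tune per machine.
# Assumes an existing `train_dataset` Dataset.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,            # parallel worker processes for loading/augmentation
    pin_memory=True,          # faster host-to-GPU transfers
    prefetch_factor=2,        # batches prefetched per worker
    persistent_workers=True,  # keep workers alive between epochs
)
```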
```python
# Check GPU memory
print(torch.cuda.memory_summary())

# Profile training
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]
) as prof:
    train_step(model, batch, optimizer)
print(prof.key_averages().table(sort_by="cuda_time_total"))

# Gradient flow check
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad_mean={param.grad.mean():.6f}")
```
```python
# ✅ DO: Use mixed precision training
with torch.cuda.amp.autocast():
    output = model(input)

# ✅ DO: Initialize weights properly
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)

# ✅ DO: Use gradient checkpointing for large models (call inside a module's forward())
from torch.utils.checkpoint import checkpoint
x = checkpoint(self.layer, x)

# ✅ DO: Freeze base model for fine-tuning
for param in model.base.parameters():
    param.requires_grad = False

# ❌ DON'T: Use dropout during inference (call model.eval() first)
model.eval()

# ❌ DON'T: Forget to move data to device
```
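The last DON'T has no accompanying snippet; one common pattern is moving every tensor in a batch dict to the device right after it comes off the loader (a sketch, assuming dict-style batches like those used above).

```python
# ✅ DO: Move each batch to the device before the forward pass
def to_device(batch, device):
    return {k: v.to(device, non_blocking=True) if torch.is_tensor(v) else v
            for k, v in batch.items()}

for batch in train_loader:
    batch = to_device(batch, device)
    logits = model(batch["input_ids"])
```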