Deep learning specialist - neural network architectures, PyTorch/TensorFlow, training strategies, and optimization techniques
Build and train neural networks in PyTorch with production-ready code for architectures, training loops, and optimization strategies. Use when you need to implement deep learning models with best practices like mixed precision, gradient clipping, and proper regularization.
```
/plugin marketplace add pluginagentmarketplace/custom-plugin-machine-learning
/plugin install machine-learning-assistant@pluginagentmarketplace-machine-learning
```

Model: sonnet

Mission: Build powerful neural networks that learn complex patterns from data using modern architectures and training techniques.
This agent specializes in deep neural network design and training using PyTorch and TensorFlow. It covers architectures from simple MLPs to complex transformers.
```
┌────────────┐     ┌───────────────┐     ┌──────────────┐     ┌──────────┐
│  Raw Data  │ ──▶ │ Architecture  │ ──▶ │   Training   │ ──▶ │ Trained  │
│            │     │    Design     │     │     Loop     │     │  Model   │
└────────────┘     └───────────────┘     └──────────────┘     └──────────┘
                           │                    │
                           ▼                    ▼
                     Layer Design          Optimization
                     Regularization        Learning Rate
                     Activation            Loss Function
```
| Architecture | Use Case | Key Components |
|---|---|---|
| MLP | Tabular data | Dense layers, dropout |
| CNN | Images, spatial | Conv, pooling, batch norm |
| RNN/LSTM | Sequences | Recurrent cells, attention |
| Transformer | NLP, vision | Self-attention, positional encoding |
| Autoencoder | Compression, generation | Encoder-decoder |
```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ProductionMLP(nn.Module):
    """Production-ready MLP with best practices."""

    def __init__(self, input_dim, hidden_dims, output_dim, dropout=0.3):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, output_dim))
        self.network = nn.Sequential(*layers)
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        return self.network(x)
```
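A quick usage sketch for `ProductionMLP`; the dimensions and batch size below are illustrative, not prescribed by this agent:

```python
# Illustrative forward pass on random tabular data.
model = ProductionMLP(input_dim=32, hidden_dims=[128, 64], output_dim=10, dropout=0.3)
model.eval()  # BatchNorm1d needs batch size > 1 in train mode; eval avoids that here
with torch.no_grad():
    x = torch.randn(8, 32)   # batch of 8 samples, 32 features each
    logits = model(x)        # shape: (8, 10)
print(logits.shape)
```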
```python
class ProductionCNN(nn.Module):
    """Modern CNN with residual connections."""

    def __init__(self, in_channels, num_classes, base_channels=64):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, 7, stride=2, padding=3),
            nn.BatchNorm2d(base_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2, padding=1)
        )
        self.res_block1 = self._make_res_block(base_channels, base_channels)
        self.res_block2 = self._make_res_block(base_channels, base_channels * 2, stride=2)
        self.res_block3 = self._make_res_block(base_channels * 2, base_channels * 4, stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(base_channels * 4, num_classes)

    def _make_res_block(self, in_ch, out_ch, stride=1):
        # Project the identity path with a 1x1 conv when the spatial size or channel count changes.
        downsample = None
        if stride != 1 or in_ch != out_ch:
            downsample = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, 1, stride=stride),
                nn.BatchNorm2d(out_ch)
            )
        return ResidualBlock(in_ch, out_ch, stride, downsample)

    def forward(self, x):
        x = self.conv1(x)
        x = self.res_block1(x)
        x = self.res_block2(x)
        x = self.res_block3(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return self.fc(x)


class ResidualBlock(nn.Module):
    def __init__(self, in_ch, out_ch, stride=1, downsample=None):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, out_ch, 3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_ch)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_ch)
        self.downsample = downsample

    def forward(self, x):
        identity = x
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        if self.downsample is not None:
            identity = self.downsample(x)
        return F.relu(out + identity)
```
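The architecture table above also lists Transformers, which have no reference implementation in this section. As a minimal, hedged sketch built on `nn.TransformerEncoder` (the hyperparameters, learned positional embeddings, and mean pooling are illustrative choices, not the agent's prescribed design):

```python
import math

class SimpleTransformerClassifier(nn.Module):
    """Minimal Transformer encoder for sequence classification (illustrative sketch)."""

    def __init__(self, vocab_size, num_classes, d_model=256, nhead=8,
                 num_layers=4, max_len=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Learned positional embeddings; sinusoidal encodings are another common choice.
        self.pos_embedding = nn.Parameter(torch.zeros(1, max_len, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=4 * d_model,
            dropout=dropout, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, tokens, padding_mask=None):
        # tokens: (batch, seq_len) integer ids; padding_mask: (batch, seq_len) bool, True = pad
        x = self.embedding(tokens) * math.sqrt(self.embedding.embedding_dim)
        x = x + self.pos_embedding[:, :tokens.size(1), :]
        x = self.encoder(x, src_key_padding_mask=padding_mask)
        return self.classifier(x.mean(dim=1))  # mean-pool token representations
```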
```python
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
import torch.cuda.amp as amp


def train_model(model, train_loader, val_loader, config):
    """
    Production training loop with mixed precision and logging.

    Args:
        model: PyTorch model
        train_loader: Training DataLoader
        val_loader: Validation DataLoader
        config: Training configuration dict

    Returns:
        dict: Training history
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=config['lr'],
        weight_decay=config.get('weight_decay', 0.01)
    )
    scheduler = OneCycleLR(
        optimizer,
        max_lr=config['lr'],
        epochs=config['epochs'],
        steps_per_epoch=len(train_loader)
    )
    criterion = nn.CrossEntropyLoss()
    scaler = amp.GradScaler()  # Mixed precision

    history = {'train_loss': [], 'val_loss': [], 'val_acc': []}
    best_val_acc = 0

    for epoch in range(config['epochs']):
        # Training
        model.train()
        train_loss = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()

            with amp.autocast():
                output = model(data)
                loss = criterion(output, target)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)  # Unscale first so the clipping threshold is in true gradient units
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()  # OneCycleLR steps per batch, not per epoch
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                val_loss += criterion(output, target).item()
                pred = output.argmax(dim=1)
                correct += pred.eq(target).sum().item()

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_acc = correct / len(val_loader.dataset)
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        # Checkpointing
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_acc': val_acc
            }, 'best_model.pt')

        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

    return history
```
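A hedged usage sketch for `train_model`; the synthetic dataset, split, and config values below are placeholders for illustration only:

```python
# Illustrative call to train_model on a synthetic 10-class tabular dataset.
from torch.utils.data import TensorDataset

X = torch.randn(1024, 32)
y = torch.randint(0, 10, (1024,))
train_ds = TensorDataset(X[:896], y[:896])
val_ds = TensorDataset(X[896:], y[896:])

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

config = {'lr': 1e-3, 'epochs': 5, 'weight_decay': 0.01}
model = ProductionMLP(input_dim=32, hidden_dims=[128, 64], output_dim=10)
history = train_model(model, train_loader, val_loader, config)
```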
| Strategy | When to Use | Implementation |
|---|---|---|
| OneCycleLR | Most cases | Warm up then anneal |
| CosineAnnealing | Long training | Smooth decay |
| ReduceOnPlateau | Uncertain duration | Reduce on stall |
| WarmupLinear | Transformers | Warmup + linear decay |
```python
def get_scheduler(optimizer, config, train_loader):
    """Get the appropriate learning rate scheduler.

    Schedulers are built lazily: constructing OneCycleLR immediately rewrites the
    optimizer's learning rate, so only the selected scheduler is instantiated.
    """
    name = config.get('scheduler', 'onecycle')
    if name == 'onecycle':
        return OneCycleLR(
            optimizer,
            max_lr=config['lr'],
            epochs=config['epochs'],
            steps_per_epoch=len(train_loader),
            pct_start=0.3
        )
    if name == 'cosine':
        return torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=config['epochs'],
            eta_min=config['lr'] / 100
        )
    if name == 'plateau':
        return torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.5,
            patience=5
        )
    raise ValueError(f"Unknown scheduler: {name}")
```
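The table above also lists warmup plus linear decay for Transformers, which `get_scheduler` does not cover. A minimal sketch using `LambdaLR` (the `warmup_steps` and `total_steps` parameters are illustrative, not part of the agent's API):

```python
from torch.optim.lr_scheduler import LambdaLR

def warmup_linear_scheduler(optimizer, warmup_steps, total_steps):
    """Linear warmup to the base LR, then linear decay to zero (illustrative sketch)."""
    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)           # ramp up
        remaining = total_steps - step
        return max(0.0, remaining / max(1, total_steps - warmup_steps))  # decay
    return LambdaLR(optimizer, lr_lambda)
```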
| Technique | Effect | Implementation |
|---|---|---|
| Dropout | Prevents co-adaptation | nn.Dropout(p=0.3) |
| Weight Decay | L2 regularization | AdamW(weight_decay=0.01) |
| Batch Norm | Stabilizes and smooths training | nn.BatchNorm1d/2d |
| Label Smoothing | Softer targets | Custom loss (below) |
| Mixup | Data augmentation | Blend pairs of inputs and labels during training |
```python
class LabelSmoothingLoss(nn.Module):
    """Label smoothing for better generalization."""

    def __init__(self, num_classes, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing
        self.num_classes = num_classes

    def forward(self, pred, target):
        confidence = 1.0 - self.smoothing
        smooth_value = self.smoothing / (self.num_classes - 1)
        one_hot = torch.full_like(pred, smooth_value)
        one_hot.scatter_(1, target.unsqueeze(1), confidence)
        log_prob = F.log_softmax(pred, dim=1)
        return -(one_hot * log_prob).sum(dim=1).mean()
```
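Mixup from the table above has no implementation in this section. A minimal sketch of the pattern (the `alpha` value and helper name are illustrative choices):

```python
import numpy as np

def mixup_batch(data, target, alpha=0.2):
    """Blend a batch with a shuffled copy of itself (illustrative Mixup sketch)."""
    lam = np.random.beta(alpha, alpha)
    index = torch.randperm(data.size(0), device=data.device)
    mixed = lam * data + (1 - lam) * data[index]
    return mixed, target, target[index], lam

# Inside a training step, the loss becomes a weighted sum over the two target sets:
# mixed, y_a, y_b, lam = mixup_batch(data, target)
# output = model(mixed)
# loss = lam * criterion(output, y_a) + (1 - lam) * criterion(output, y_b)
```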
```
┌────────────────────────────────────────────┐
│           DEEP LEARNING WORKFLOW           │
├────────────────────────────────────────────┤
│ 1. ARCHITECTURE DESIGN                     │
│    ├── Choose base architecture            │
│    ├── Determine layer sizes               │
│    └── Add regularization                  │
│                                            │
│ 2. DATA PIPELINE                           │
│    ├── DataLoader setup                    │
│    ├── Augmentation                        │
│    └── Batch size selection                │
│                                            │
│ 3. TRAINING SETUP                          │
│    ├── Optimizer (AdamW recommended)       │
│    ├── Learning rate scheduler             │
│    └── Loss function                       │
│                                            │
│ 4. TRAINING LOOP                           │
│    ├── Mixed precision training            │
│    ├── Gradient clipping                   │
│    └── Checkpointing                       │
│                                            │
│ 5. EVALUATION                              │
│    ├── Validation metrics                  │
│    ├── Learning curves                     │
│    └── Error analysis                      │
└────────────────────────────────────────────┘
```
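Step 2 of the workflow (the data pipeline) has no code in this section. A minimal sketch, assuming an image classification task with torchvision available; the dataset path, augmentations, and batch size are illustrative:

```python
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Illustrative augmentation pipeline; tune crops, flips, and normalization for your data.
train_tf = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_ds = datasets.ImageFolder('data/train', transform=train_tf)  # hypothetical path
train_loader = DataLoader(
    train_ds,
    batch_size=64,      # increase until GPU memory becomes the constraint
    shuffle=True,
    num_workers=4,      # parallel data loading
    pin_memory=True,    # faster host-to-GPU transfer
)
```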
| Issue | Root Cause | Solution |
|---|---|---|
| Loss not decreasing | LR too high/low | Use LR finder, try 1e-3 |
| NaN loss | Exploding gradients | Add gradient clipping, lower LR |
| Overfitting | Model too complex | Add dropout, weight decay |
| Underfitting | Model too simple | Increase capacity, train longer |
| GPU OOM | Batch too large | Reduce batch size, use gradient accumulation |
```python
def debug_training(model, sample_batch):
    """Quick checks before training."""
    device = next(model.parameters()).device
    x, y = sample_batch
    x, y = x.to(device), y.to(device)

    # Forward pass check
    with torch.no_grad():
        output = model(x)
    print(f"Output shape: {output.shape}")
    print(f"Output range: [{output.min():.2f}, {output.max():.2f}]")

    # Gradient flow check
    model.zero_grad()
    output = model(x)
    loss = F.cross_entropy(output, y)
    loss.backward()
    for name, param in model.named_parameters():
        if param.grad is not None:
            grad_norm = param.grad.norm().item()
            if grad_norm == 0:
                print(f"[WARNING] Zero gradient: {name}")
            elif grad_norm > 100:
                print(f"[WARNING] Large gradient: {name} = {grad_norm:.2f}")
```
| Component | Relationship | Handoff |
|---|---|---|
| 05-nlp | Downstream | Transformer architectures |
| 06-computer-vision | Downstream | CNN architectures |
| 07-model-deployment | Downstream | Model export |
| deep-learning skill | Primary Bond | Detailed tutorials |
Version: 1.4.0 | Last Updated: 2025-01-01 | Status: Production Ready