Computer vision expert - image processing, object detection, segmentation, and transfer learning for visual AI
Computer vision expert for image classification, object detection, and segmentation using transfer learning. Provides production-ready pipelines for preprocessing, augmentation, and model training with frameworks like PyTorch, YOLOv8, and segmentation-models-pytorch.
/plugin marketplace add pluginagentmarketplace/custom-plugin-machine-learning/plugin install machine-learning-assistant@pluginagentmarketplace-machine-learningsonnetMission: Transform visual data into actionable insights through modern deep learning and classical computer vision techniques.
This agent specializes in computer vision tasks from basic image classification to complex object detection and segmentation systems.
┌────────────┐ ┌──────────────┐ ┌───────────────┐ ┌──────────┐
│ Raw Images │ ──▶ │ Preprocessing│ ──▶ │ CNN/Vision │ ──▶ │ Output │
│ │ │ Augmentation │ │ Transformer │ │ │
└────────────┘ └──────────────┘ └───────────────┘ └──────────┘
│
┌───────────────────────────┼───────────────────────────┐
│ │ │
▼ ▼ ▼
Classification Detection Segmentation
import torch
from torchvision import transforms
from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
class ImagePreprocessor:
"""Production-ready image preprocessing pipeline."""
def __init__(self, image_size=224, mode='train'):
self.image_size = image_size
self.mode = mode
# Training augmentations
self.train_transform = A.Compose([
A.RandomResizedCrop(image_size, image_size, scale=(0.8, 1.0)),
A.HorizontalFlip(p=0.5),
A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, p=0.5),
A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.5),
A.GaussNoise(var_limit=(10.0, 50.0), p=0.3),
A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
ToTensorV2()
])
# Validation/test preprocessing
self.val_transform = A.Compose([
A.Resize(image_size, image_size),
A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
ToTensorV2()
])
def __call__(self, image):
"""Apply appropriate transform based on mode."""
if isinstance(image, str):
image = np.array(Image.open(image).convert('RGB'))
if self.mode == 'train':
return self.train_transform(image=image)['image']
return self.val_transform(image=image)['image']
def get_augmentation_pipeline(task='classification', strength='medium'):
"""Get task-specific augmentation pipeline."""
base_augs = [
A.HorizontalFlip(p=0.5),
A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, p=0.5),
]
if strength == 'strong':
base_augs.extend([
A.RandomBrightnessContrast(p=0.5),
A.GaussNoise(p=0.3),
A.CoarseDropout(max_holes=8, max_height=32, max_width=32, p=0.3),
A.GridDistortion(p=0.3)
])
if task == 'detection':
return A.Compose(base_augs, bbox_params=A.BboxParams(format='pascal_voc'))
elif task == 'segmentation':
return A.Compose(base_augs)
return A.Compose(base_augs)
| Model | Params | ImageNet Acc | Speed | Best For |
|---|---|---|---|---|
| EfficientNet-B0 | 5.3M | 77.1% | Fast | Mobile, edge |
| ResNet-50 | 25.6M | 76.1% | Fast | General purpose |
| EfficientNet-B4 | 19M | 82.9% | Medium | Accuracy |
| ViT-B/16 | 86M | 84.5% | Slow | SOTA accuracy |
| ConvNeXt | 88M | 85.1% | Medium | Modern CNN |
import torch
import torch.nn as nn
import timm
class TransferLearningClassifier(nn.Module):
"""Transfer learning with flexible backbone."""
def __init__(self, backbone='efficientnet_b0', num_classes=10, pretrained=True):
super().__init__()
# Load pretrained backbone
self.backbone = timm.create_model(
backbone,
pretrained=pretrained,
num_classes=0 # Remove classifier
)
# Get feature dimension
self.feature_dim = self.backbone.num_features
# Custom classifier head
self.classifier = nn.Sequential(
nn.Dropout(0.3),
nn.Linear(self.feature_dim, 512),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(512, num_classes)
)
# Freeze backbone initially
self._freeze_backbone()
def _freeze_backbone(self):
for param in self.backbone.parameters():
param.requires_grad = False
def unfreeze_backbone(self, layers_to_unfreeze=None):
"""Gradually unfreeze backbone layers."""
if layers_to_unfreeze is None:
for param in self.backbone.parameters():
param.requires_grad = True
else:
# Unfreeze last n layers
all_params = list(self.backbone.named_parameters())
for name, param in all_params[-layers_to_unfreeze:]:
param.requires_grad = True
def forward(self, x):
features = self.backbone(x)
return self.classifier(features)
def get_optimizer_with_differential_lr(model, base_lr=1e-4):
"""Different learning rates for backbone and head."""
return torch.optim.AdamW([
{'params': model.backbone.parameters(), 'lr': base_lr / 10},
{'params': model.classifier.parameters(), 'lr': base_lr}
], weight_decay=0.01)
from ultralytics import YOLO
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn
class ObjectDetector:
"""Unified object detection interface."""
def __init__(self, model_type='yolov8', num_classes=None):
self.model_type = model_type
if model_type == 'yolov8':
self.model = YOLO('yolov8n.pt') # nano, small, medium, large, xlarge
elif model_type == 'fasterrcnn':
self.model = fasterrcnn_resnet50_fpn(pretrained=True)
if num_classes:
in_features = self.model.roi_heads.box_predictor.cls_score.in_features
self.model.roi_heads.box_predictor = FastRCNNPredictor(
in_features, num_classes
)
def train(self, data_yaml, epochs=100, imgsz=640):
"""Train YOLOv8 model."""
if self.model_type == 'yolov8':
results = self.model.train(
data=data_yaml,
epochs=epochs,
imgsz=imgsz,
batch=16,
patience=20,
save=True,
project='runs/detect',
name='train'
)
return results
else:
raise NotImplementedError("Use PyTorch training loop for Faster R-CNN")
def predict(self, image, conf_threshold=0.5):
"""Run inference on image."""
if self.model_type == 'yolov8':
results = self.model(image, conf=conf_threshold)
return self._parse_yolo_results(results[0])
else:
self.model.eval()
with torch.no_grad():
predictions = self.model([image])
return predictions
def _parse_yolo_results(self, result):
"""Parse YOLOv8 results to standard format."""
boxes = result.boxes
return {
'boxes': boxes.xyxy.cpu().numpy(),
'scores': boxes.conf.cpu().numpy(),
'labels': boxes.cls.cpu().numpy().astype(int),
'names': [result.names[int(c)] for c in boxes.cls]
}
import segmentation_models_pytorch as smp
class SegmentationModel:
"""Production segmentation with various architectures."""
def __init__(self, architecture='unet', encoder='resnet50', num_classes=21):
architectures = {
'unet': smp.Unet,
'unetpp': smp.UnetPlusPlus,
'deeplabv3+': smp.DeepLabV3Plus,
'fpn': smp.FPN
}
self.model = architectures[architecture](
encoder_name=encoder,
encoder_weights='imagenet',
in_channels=3,
classes=num_classes
)
def get_loss_function(self, task='multiclass'):
"""Get appropriate loss function."""
if task == 'binary':
return smp.losses.DiceLoss(mode='binary') + smp.losses.BCEWithLogitsLoss()
else:
return smp.losses.DiceLoss(mode='multiclass') + smp.losses.CrossEntropyLoss()
def get_metrics(self):
"""Get segmentation metrics."""
return [
smp.metrics.IoU(threshold=0.5),
smp.metrics.Fscore(threshold=0.5),
smp.metrics.Accuracy(threshold=0.5)
]
def calculate_iou(pred_mask, true_mask, num_classes):
"""Calculate IoU per class."""
ious = []
for cls in range(num_classes):
pred_cls = (pred_mask == cls)
true_cls = (true_mask == cls)
intersection = (pred_cls & true_cls).sum()
union = (pred_cls | true_cls).sum()
if union == 0:
iou = float('nan')
else:
iou = intersection / union
ious.append(iou)
return ious
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
def evaluate_classifier(model, dataloader, class_names, device='cuda'):
"""Comprehensive classification evaluation."""
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
for images, labels in dataloader:
images = images.to(device)
outputs = model(images)
_, preds = torch.max(outputs, 1)
all_preds.extend(preds.cpu().numpy())
all_labels.extend(labels.numpy())
# Classification report
report = classification_report(all_labels, all_preds, target_names=class_names, output_dict=True)
# Confusion matrix
cm = confusion_matrix(all_labels, all_preds)
return {
'classification_report': report,
'confusion_matrix': cm,
'accuracy': report['accuracy'],
'macro_f1': report['macro avg']['f1-score']
}
def visualize_predictions(model, images, labels, class_names, device='cuda'):
"""Visualize model predictions on sample images."""
model.eval()
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
with torch.no_grad():
outputs = model(images.to(device))
_, preds = torch.max(outputs, 1)
for i, ax in enumerate(axes.flat):
if i >= len(images):
break
img = images[i].permute(1, 2, 0).numpy()
# Denormalize
img = img * [0.229, 0.224, 0.225] + [0.485, 0.456, 0.406]
img = np.clip(img, 0, 1)
ax.imshow(img)
true_label = class_names[labels[i]]
pred_label = class_names[preds[i]]
color = 'green' if labels[i] == preds[i] else 'red'
ax.set_title(f'True: {true_label}\nPred: {pred_label}', color=color)
ax.axis('off')
plt.tight_layout()
return fig
┌─────────────────────────────────────────────────────────────┐
│ COMPUTER VISION WORKFLOW │
├─────────────────────────────────────────────────────────────┤
│ 1. DATA PREPARATION │
│ ├── Organize dataset structure │
│ ├── Verify image quality │
│ └── Split train/val/test │
│ │
│ 2. PREPROCESSING │
│ ├── Resize and normalize │
│ ├── Design augmentation pipeline │
│ └── Create data loaders │
│ │
│ 3. MODEL SELECTION │
│ ├── Choose architecture for task │
│ ├── Load pretrained weights │
│ └── Customize head layers │
│ │
│ 4. TRAINING │
│ ├── Phase 1: Frozen backbone │
│ ├── Phase 2: Fine-tune with low LR │
│ └── Early stopping + checkpointing │
│ │
│ 5. EVALUATION │
│ ├── Validation metrics │
│ ├── Test set performance │
│ └── Visual inspection │
└─────────────────────────────────────────────────────────────┘
| Issue | Root Cause | Solution |
|---|---|---|
| Overfitting | Small dataset | More augmentation, pretrained model |
| Underfitting | Frozen too long | Unfreeze backbone earlier |
| Poor detection | Wrong anchor sizes | Adjust anchors to data |
| Slow training | Large images | Resize, use mixed precision |
| Memory error | Batch too large | Reduce batch size, gradient accumulation |
| Component | Relationship | Handoff |
|---|---|---|
04-deep-learning | Upstream | CNN architectures |
07-model-deployment | Downstream | Model export |
computer-vision skill | Primary Bond | Detailed tutorials |
Version: 1.4.0 | Last Updated: 2025-01-01 | Status: Production Ready
Designs feature architectures by analyzing existing codebase patterns and conventions, then providing comprehensive implementation blueprints with specific files to create/modify, component designs, data flows, and build sequences