Computer Vision Skill

Build visual AI systems from classification to detection.

Quick Start

import torch
import timm
from PIL import Image
from torchvision import transforms

# Load pretrained model
model = timm.create_model('efficientnet_b0', pretrained=True, num_classes=10)
model.eval()

# Preprocessing
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Inference
image = Image.open('image.jpg').convert('RGB')
input_tensor = transform(image).unsqueeze(0)

with torch.no_grad():
    output = model(input_tensor)
    predicted_class = output.argmax(dim=1).item()

Key Topics

1. Data Augmentation

import albumentations as A
from albumentations.pytorch import ToTensorV2

train_transform = A.Compose([
    A.RandomResizedCrop(224, 224, scale=(0.8, 1.0)),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15),
    A.ColorJitter(brightness=0.2, contrast=0.2),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2()
])

val_transform = A.Compose([
    A.Resize(256, 256),
    A.CenterCrop(224, 224),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2()
])

2. Transfer Learning

Model	Params	ImageNet Acc	Speed
EfficientNet-B0	5.3M	77%	Fast
ResNet-50	25.6M	76%	Fast
ViT-B/16	86M	84%	Slow

import timm

class TransferClassifier(torch.nn.Module):
    def __init__(self, backbone='efficientnet_b0', num_classes=10):
        super().__init__()
        self.backbone = timm.create_model(backbone, pretrained=True, num_classes=0)
        self.classifier = torch.nn.Linear(self.backbone.num_features, num_classes)

        # Freeze backbone
        for param in self.backbone.parameters():
            param.requires_grad = False

    def unfreeze(self):
        for param in self.backbone.parameters():
            param.requires_grad = True

    def forward(self, x):
        features = self.backbone(x)
        return self.classifier(features)

3. Object Detection (YOLOv8)

from ultralytics import YOLO

# Load model
model = YOLO('yolov8n.pt')

# Train
results = model.train(
    data='dataset.yaml',
    epochs=100,
    imgsz=640,
    batch=16
)

# Inference
results = model('image.jpg')
for r in results:
    boxes = r.boxes
    for box in boxes:
        print(f"Class: {r.names[int(box.cls)]}, Conf: {box.conf:.2f}")

4. Image Segmentation

import segmentation_models_pytorch as smp

# Create U-Net model
model = smp.Unet(
    encoder_name='resnet50',
    encoder_weights='imagenet',
    in_channels=3,
    classes=21
)

# Loss function
loss_fn = smp.losses.DiceLoss(mode='multiclass')

5. Model Evaluation

from sklearn.metrics import classification_report, confusion_matrix

def evaluate_classifier(model, dataloader, device):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for images, labels in dataloader:
            outputs = model(images.to(device))
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())

    return {
        'report': classification_report(all_labels, all_preds),
        'confusion_matrix': confusion_matrix(all_labels, all_preds)
    }

Best Practices

DO

Use pretrained models
Apply consistent augmentation
Use mixed precision training
Normalize with ImageNet stats
Visualize predictions

DON'T

Don't train from scratch on small data
Don't use same augmentations for val
Don't ignore class imbalance
Don't skip visual error analysis

Exercises

Exercise 1: Transfer Learning

# TODO: Fine-tune EfficientNet on CIFAR-10
# Freeze backbone first, then unfreeze

Exercise 2: Object Detection

# TODO: Train YOLOv8 on custom dataset
# Create dataset.yaml and train

Unit Test Template

import pytest
import torch

def test_model_output_shape():
    """Test model output dimensions."""
    model = TransferClassifier(num_classes=10)
    x = torch.randn(4, 3, 224, 224)

    output = model(x)

    assert output.shape == (4, 10)

def test_augmentation_preserves_shape():
    """Test augmentation output shape."""
    import numpy as np
    image = np.random.randint(0, 255, (256, 256, 3), dtype=np.uint8)

    augmented = train_transform(image=image)['image']

    assert augmented.shape == (3, 224, 224)

Troubleshooting

Problem	Cause	Solution
Overfitting	Small dataset	More augmentation
Slow training	Large images	Resize, use AMP
Poor detection	Wrong anchors	Adjust anchor sizes
Memory error	Batch too large	Reduce batch size

Related Resources

Agent: 06-computer-vision
Previous: nlp-basics
Next: ml-deployment
Docs: timm

Version: 1.4.0 | Status: Production Ready

computer-vision

Computer Vision Skill

Quick Start

Key Topics

1. Data Augmentation

2. Transfer Learning

3. Object Detection (YOLOv8)

4. Image Segmentation

5. Model Evaluation

Best Practices

DO

DON'T

Exercises

Exercise 1: Transfer Learning

Exercise 2: Object Detection

Unit Test Template

Troubleshooting

Related Resources

Similar Skills