deep-learning

PyTorch, TensorFlow, neural networks, CNNs, transformers, and deep learning for production

Safety Notice

This listing is imported from skills.sh public index metadata. Review upstream SKILL.md and repository scripts before running.

Copy the command below and send it to your AI assistant to install this skill:

Install skill "deep-learning" with this command: npx skills add pluginagentmarketplace/custom-plugin-data-engineer/pluginagentmarketplace-custom-plugin-data-engineer-deep-learning

Deep Learning

Production-grade deep learning with PyTorch, neural network architectures, and modern training practices.

Quick Start

# PyTorch Production Training Loop
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import wandb  # optional: experiment tracking (not exercised in this minimal loop)

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size: int, d_model: int = 256, n_heads: int = 8, n_classes: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))  # learned positions, max sequence length 512
        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, dim_feedforward=1024, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
        self.classifier = nn.Linear(d_model, n_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        x = self.dropout(x)
        x = self.transformer(x, src_key_padding_mask=mask)
        x = x.mean(dim=1)  # Global average pooling
        return self.classifier(x)

# Training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size=30000).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
criterion = nn.CrossEntropyLoss()

# Training loop with mixed precision
scaler = torch.cuda.amp.GradScaler()

# train_loader (defined elsewhere) is assumed to yield dict batches with "input_ids" and "labels"
for epoch in range(10):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(batch["input_ids"].to(device))
            loss = criterion(logits, batch["labels"].to(device))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    scheduler.step()
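
After each epoch you will typically want a validation pass. A minimal sketch, assuming a val_loader (not part of the snippet above) that yields the same {"input_ids", "labels"} batch format:

# Validation pass (sketch; val_loader is an assumed DataLoader mirroring train_loader's format)
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in val_loader:
        with torch.cuda.amp.autocast():
            logits = model(batch["input_ids"].to(device))
        labels = batch["labels"].to(device)
        correct += (logits.argmax(dim=-1) == labels).sum().item()
        total += labels.size(0)
print(f"val accuracy: {correct / total:.4f}")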

Core Concepts

1. Modern Neural Network Architectures

import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """Residual block with skip connection."""
    def __init__(self, channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        residual = x
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        return F.relu(x + residual)

class AttentionBlock(nn.Module):
    """Multi-head self-attention."""
    def __init__(self, d_model: int, n_heads: int = 8):
        super().__init__()
        self.attention = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norm = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        attn_out, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.norm(x + attn_out)
        return self.norm2(x + self.ffn(x))
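
A quick shape check is a cheap way to validate these blocks before wiring them into a full model. A minimal sketch (the tensor sizes here are arbitrary):

# Smoke test: both blocks should preserve their input shapes
res_block = ResidualBlock(channels=64)
img = torch.randn(8, 64, 32, 32)   # (batch, channels, height, width)
assert res_block(img).shape == img.shape

attn_block = AttentionBlock(d_model=256, n_heads=8)
seq = torch.randn(8, 128, 256)     # (batch, seq_len, d_model)
assert attn_block(seq).shape == seq.shape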

2. Training Best Practices

import torch
from torch.cuda.amp import GradScaler, autocast

# Gradient clipping and accumulation
scaler = GradScaler()

def train_epoch(model, loader, optimizer, accumulation_steps=4):
    model.train()
    optimizer.zero_grad()

    for i, batch in enumerate(loader):
        with autocast():
            # Assumes the model returns a scalar loss for the batch
            loss = model(batch) / accumulation_steps

        scaler.scale(loss).backward()

        if (i + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)  # unscale before clipping so the norm is measured in real units
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

# Early stopping
class EarlyStopping:
    def __init__(self, patience: int = 5, min_delta: float = 0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')

    def __call__(self, val_loss: float) -> bool:
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience
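
A sketch of how EarlyStopping slots into a training loop. Here evaluate(model, val_loader) is a hypothetical helper assumed to return the mean validation loss:

# Using EarlyStopping around a hypothetical evaluate() helper
early_stopping = EarlyStopping(patience=5)
for epoch in range(100):
    train_epoch(model, train_loader, optimizer)
    val_loss = evaluate(model, val_loader)  # assumed helper: returns mean validation loss
    if early_stopping(val_loss):
        print(f"stopping early at epoch {epoch}")
        break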

# Learning rate finder
def find_lr(model, loader, optimizer, start_lr=1e-7, end_lr=10, num_iter=100):
    lrs, losses = [], []
    lr_mult = (end_lr / start_lr) ** (1 / num_iter)

    for i, batch in enumerate(loader):
        if i >= num_iter:
            break

        lr = start_lr * (lr_mult ** i)
        for pg in optimizer.param_groups:
            pg['lr'] = lr

        loss = train_step(model, batch, optimizer)  # assumed helper: one forward/backward/step, returns the loss
        lrs.append(lr)
        losses.append(loss)

    return lrs, losses
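
The result is usually read off a log-scale plot: pick a learning rate roughly an order of magnitude below the point where the loss starts climbing. A minimal plotting sketch, assuming matplotlib is available:

# Plot the LR sweep on a log axis and eyeball the steepest-descent region
import matplotlib.pyplot as plt

lrs, losses = find_lr(model, train_loader, optimizer)
plt.plot(lrs, losses)
plt.xscale("log")
plt.xlabel("learning rate")
plt.ylabel("loss")
plt.show()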

3. Model Deployment

import torch.onnx
import onnxruntime as ort

# Export to ONNX
def export_to_onnx(model, sample_input, path="model.onnx"):
    model.eval()
    torch.onnx.export(
        model,
        sample_input,
        path,
        export_params=True,
        opset_version=17,
        do_constant_folding=True,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
    )

# ONNX Runtime inference
class ONNXPredictor:
    def __init__(self, model_path: str):
        self.session = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])

    def predict(self, input_data):
        return self.session.run(None, {'input': input_data})[0]

# TorchScript for production
scripted_model = torch.jit.script(model)
scripted_model.save("model_scripted.pt")
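
Before shipping an exported model, it is worth confirming that ONNX Runtime reproduces the PyTorch outputs. A sketch under the assumption that sample_input is a single tensor and the export above has already run:

# Parity check: compare PyTorch and ONNX Runtime outputs on the same input
import numpy as np

model.eval()
with torch.no_grad():
    torch_out = model(sample_input).cpu().numpy()

predictor = ONNXPredictor("model.onnx")
onnx_out = predictor.predict(sample_input.cpu().numpy())
np.testing.assert_allclose(torch_out, onnx_out, rtol=1e-3, atol=1e-5)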

Tools & Technologies

| Tool | Purpose | Version (2025) |
|------|---------|----------------|
| PyTorch | Deep learning framework | 2.2+ |
| PyTorch Lightning | Training framework | 2.2+ |
| Hugging Face | Transformers, datasets | 4.38+ |
| ONNX Runtime | Model inference | 1.17+ |
| TensorRT | GPU optimization | 8.6+ |
| Weights & Biases | Experiment tracking | Latest |
| Ray | Distributed training | 2.9+ |

Troubleshooting Guide

| Issue | Symptoms | Root Cause | Fix |
|-------|----------|------------|-----|
| Vanishing gradient | Loss not decreasing | Deep network, wrong activation | Use ReLU/GELU, residual connections |
| Exploding gradient | NaN loss | Learning rate too high | Gradient clipping, lower LR |
| Overfitting | Train >> val accuracy | Model too complex | Dropout, regularization, data augmentation |
| OOM error | CUDA out of memory | Batch too large | Reduce batch size, gradient accumulation |
| Slow training | Low GPU utilization | Data loading bottleneck | More workers, prefetch (see the sketch below) |
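
For the data-loading bottleneck in the last row, the usual first move is tuning the DataLoader itself. A sketch; train_dataset is an assumed Dataset instance, and the worker counts are machine-dependent starting points rather than prescriptions:

# DataLoader tuning for GPU-bound training (values are starting points to profile)
from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset,            # assumed Dataset instance
    batch_size=64,
    shuffle=True,
    num_workers=4,            # parallel CPU workers for decoding/augmentation
    pin_memory=True,          # faster host-to-GPU copies
    prefetch_factor=2,        # batches pre-fetched per worker
    persistent_workers=True,  # avoid re-forking workers every epoch
)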

Debug Commands

# Check GPU memory
print(torch.cuda.memory_summary())

# Profile training
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]
) as prof:
    train_step(model, batch, optimizer)
print(prof.key_averages().table(sort_by="cuda_time_total"))

# Gradient flow check
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad_mean={param.grad.mean():.6f}")

Best Practices

# ✅ DO: Use mixed precision training
with torch.cuda.amp.autocast():
    output = model(inputs)  # `inputs` avoids shadowing Python's built-in input()

# ✅ DO: Initialize weights properly
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)

# ✅ DO: Use gradient checkpointing for large models
from torch.utils.checkpoint import checkpoint
x = checkpoint(self.layer, x, use_reentrant=False)  # inside a module's forward; recomputes activations to save memory

# ✅ DO: Freeze base model for fine-tuning
for param in model.base.parameters():
    param.requires_grad = False

# ❌ DON'T: Use dropout during inference
model.eval()  # eval() disables dropout and switches norm layers to inference statistics

# ❌ DON'T: Forget to move data to device
batch = {k: v.to(device) for k, v in batch.items()}  # inputs must live on the same device as the model

Skill Certification Checklist:

  • Can build and train neural networks in PyTorch
  • Can implement attention mechanisms and transformers
  • Can use mixed precision and gradient accumulation
  • Can export models to ONNX/TorchScript
  • Can debug training issues (gradients, memory)

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.

Related Skills

Related by shared tags or category signals.

All entries below come from the repository source index, are flagged Needs Review, and have no summary provided by the upstream source.

  • data-engineering (Automation)
  • machine learning (Coding)
  • python-programming (Coding)
  • statistics-math (Automation)