Efficient AI
Techniques for building resource-efficient ML systems.
Model Compression Overview
```
┌──────────────────────────────────────────────────────────────┐
│                 MODEL COMPRESSION TECHNIQUES                  │
├──────────────────────────────────────────────────────────────┤
│                                                              │
│  QUANTIZATION       PRUNING           DISTILLATION           │
│  ─────────────      ──────────        ────────────           │
│  FP32 → INT8        Remove weights    Teacher→Student        │
│  2-4x smaller       50-90% sparse     10-100x smaller        │
│  1.5-3x faster      2-4x faster       Same accuracy          │
│                                                              │
│  ARCHITECTURE       LOW-RANK          NEURAL ARCH            │
│  ─────────────      ──────────        ────────────           │
│  MobileNet          Matrix decomp     AutoML search          │
│  EfficientNet       LoRA adapters     Hardware-aware         │
│  Depth-separable    Rank reduction    Latency targets        │
│                                                              │
└──────────────────────────────────────────────────────────────┘
```
Quantization
Post-Training Quantization
```python
import torch
from torch.quantization import quantize_dynamic

# Dynamic quantization (weights only, activations quantized on the fly)
model_dynamic = quantize_dynamic(
    model,
    {torch.nn.Linear, torch.nn.LSTM},
    dtype=torch.qint8
)

# Static quantization (weights + activations)
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
model_prepared = torch.quantization.prepare(model)

# Calibrate with representative data
with torch.no_grad():
    for batch in calibration_loader:
        model_prepared(batch)

model_static = torch.quantization.convert(model_prepared)
```
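As a rough check of the claimed 2-4x size reduction, both models can be serialized and compared. A minimal sketch; the `file_size_mb` helper below is illustrative, not part of torch:

```python
import os
import torch

def file_size_mb(m, path="tmp_model.pt"):
    # Serialize the state dict and report its on-disk size in MB
    torch.save(m.state_dict(), path)
    size = os.path.getsize(path) / 1024 / 1024
    os.remove(path)
    return size

print(f"FP32 model:             {file_size_mb(model):.2f} MB")
print(f"INT8 dynamic-quantized: {file_size_mb(model_dynamic):.2f} MB")
```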
Quantization-Aware Training
```python
import torch.nn as nn
import torch.quantization as quant

class QuantizedModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = quant.QuantStub()
        self.dequant = quant.DeQuantStub()
        self.layers = nn.Sequential(
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Linear(256, 10)
        )

    def forward(self, x):
        x = self.quant(x)
        x = self.layers(x)
        x = self.dequant(x)
        return x

# Enable QAT
model = QuantizedModel()
model.qconfig = quant.get_default_qat_qconfig('fbgemm')
model = quant.prepare_qat(model)

# Train normally (fake-quant ops simulate int8 during training)
for epoch in range(epochs):
    train(model, train_loader)

# Convert to quantized
model.eval()
model = quant.convert(model)
```
Pruning
Magnitude Pruning
```python
import torch.nn.utils.prune as prune

# Unstructured pruning (individual weights)
prune.l1_unstructured(model.layer1, name='weight', amount=0.3)

# Structured pruning (entire channels)
prune.ln_structured(
    model.conv1,
    name='weight',
    amount=0.5,   # Prune 50% of output channels
    n=2,
    dim=0
)

# Global pruning (across layers)
parameters_to_prune = [
    (model.layer1, 'weight'),
    (model.layer2, 'weight'),
]
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.4
)

# Make pruning permanent
for module, name in parameters_to_prune:
    prune.remove(module, name)
```
Iterative Pruning with Fine-tuning
```python
import torch.nn as nn
import torch.nn.utils.prune as prune

def iterative_pruning(model, train_loader, fine_tune_epochs=2):
    current_sparsity = 0.0
    sparsity_schedule = [0.5, 0.75, 0.9]

    for target in sparsity_schedule:
        # Repeated l1_unstructured calls prune a fraction of the *remaining*
        # weights, so convert the cumulative target into an incremental amount
        amount = 1 - (1 - target) / (1 - current_sparsity)

        # Prune
        for name, module in model.named_modules():
            if isinstance(module, nn.Linear):
                prune.l1_unstructured(module, 'weight', amount=amount)

        # Fine-tune
        for epoch in range(fine_tune_epochs):
            train_epoch(model, train_loader)

        # Measure sparsity on the effective weights (module.weight, not weight_orig)
        weights = [m.weight for m in model.modules() if isinstance(m, nn.Linear)]
        total_zeros = sum((w == 0).sum().item() for w in weights)
        total_params = sum(w.numel() for w in weights)
        current_sparsity = total_zeros / total_params
        print(f"Sparsity: {current_sparsity:.2%}")

    return model
```
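Once the schedule finishes, the masks can be folded into the weights with `prune.remove`, as in the previous example. A sketch of the full call, continuing from the block above (`train_epoch` is assumed to come from your training setup):

```python
model = iterative_pruning(model, train_loader, fine_tune_epochs=2)

# Bake the masks in so the modules no longer carry weight_orig / weight_mask
for module in model.modules():
    if isinstance(module, nn.Linear):
        prune.remove(module, 'weight')
```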
Knowledge Distillation
```python
import torch.nn as nn
import torch.nn.functional as F

class DistillationLoss(nn.Module):
    def __init__(self, temperature=4.0, alpha=0.5):
        super().__init__()
        self.temperature = temperature
        self.alpha = alpha
        self.ce_loss = nn.CrossEntropyLoss()
        self.kl_loss = nn.KLDivLoss(reduction='batchmean')

    def forward(self, student_logits, teacher_logits, labels):
        # Hard label loss
        hard_loss = self.ce_loss(student_logits, labels)

        # Soft label loss (distillation)
        soft_student = F.log_softmax(student_logits / self.temperature, dim=1)
        soft_teacher = F.softmax(teacher_logits / self.temperature, dim=1)
        soft_loss = self.kl_loss(soft_student, soft_teacher) * (self.temperature ** 2)

        return self.alpha * hard_loss + (1 - self.alpha) * soft_loss
```
```python
# Training loop
distill_loss = DistillationLoss(temperature=4.0, alpha=0.5)

teacher.eval()
for batch in train_loader:
    x, y = batch
    with torch.no_grad():
        teacher_logits = teacher(x)
    student_logits = student(x)
    loss = distill_loss(student_logits, teacher_logits, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```
Efficient Architectures
Depthwise-Separable Convolutions
```python
import torch.nn as nn

class DepthSeparableConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super().__init__()
        # Depthwise: one filter per input channel (groups=in_channels)
        self.depthwise = nn.Conv2d(
            in_channels, in_channels, kernel_size,
            padding=kernel_size // 2, groups=in_channels
        )
        # Pointwise: 1x1 conv mixes channels
        self.pointwise = nn.Conv2d(in_channels, out_channels, 1)

    def forward(self, x):
        x = self.depthwise(x)
        x = self.pointwise(x)
        return x
```
Parameter comparison for a regular 3x3 conv with C_in=64, C_out=128 versus its depthwise-separable equivalent (bias terms ignored):

- Regular: 64 * 128 * 3 * 3 = 73,728 params
- DepthSep: 64 * 3 * 3 + 64 * 128 = 576 + 8,192 = 8,768 params (about 8.4x fewer)
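These counts can be checked directly by instantiating the individual convolutions and summing their parameters; a quick sketch with bias disabled to match the arithmetic above:

```python
import torch.nn as nn

regular = nn.Conv2d(64, 128, 3, padding=1, bias=False)
depthwise = nn.Conv2d(64, 64, 3, padding=1, groups=64, bias=False)
pointwise = nn.Conv2d(64, 128, 1, bias=False)

count = lambda m: sum(p.numel() for p in m.parameters())
print(count(regular))                       # 73728
print(count(depthwise) + count(pointwise))  # 576 + 8192 = 8768
```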
MobileNet Inverted Residual Block
```python
import torch.nn as nn

class InvertedResidual(nn.Module):
    def __init__(self, in_ch, out_ch, stride, expand_ratio):
        super().__init__()
        hidden_dim = in_ch * expand_ratio
        self.use_residual = stride == 1 and in_ch == out_ch

        self.conv = nn.Sequential(
            # Expand
            nn.Conv2d(in_ch, hidden_dim, 1, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # Depthwise
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # Project
            nn.Conv2d(hidden_dim, out_ch, 1, bias=False),
            nn.BatchNorm2d(out_ch),
        )

    def forward(self, x):
        if self.use_residual:
            return x + self.conv(x)
        return self.conv(x)
```
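Blocks like this are stacked to form the backbone. A toy sketch of a few stacked blocks (not the full MobileNetV2 configuration), showing how stride and expand_ratio are used:

```python
import torch
import torch.nn as nn

features = nn.Sequential(
    InvertedResidual(32, 16, stride=1, expand_ratio=1),
    InvertedResidual(16, 24, stride=2, expand_ratio=6),
    InvertedResidual(24, 24, stride=1, expand_ratio=6),  # residual connection active here
)

x = torch.randn(1, 32, 56, 56)
print(features(x).shape)  # torch.Size([1, 24, 28, 28])
```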
Low-Rank Factorization
```python
import torch.nn as nn

class LowRankLinear(nn.Module):
    # Factorizes an in_features x out_features weight into two rank-r matrices
    def __init__(self, in_features, out_features, rank):
        super().__init__()
        self.A = nn.Linear(in_features, rank, bias=False)
        self.B = nn.Linear(rank, out_features, bias=True)

    def forward(self, x):
        return self.B(self.A(x))
```
```python
# LoRA-style adaptation: keep the original weights frozen, learn a low-rank update
class LoRALayer(nn.Module):
    def __init__(self, original_layer, rank=8, alpha=16):
        super().__init__()
        self.original = original_layer
        self.lora_A = nn.Linear(original_layer.in_features, rank, bias=False)
        self.lora_B = nn.Linear(rank, original_layer.out_features, bias=False)
        self.scaling = alpha / rank

        nn.init.kaiming_uniform_(self.lora_A.weight)
        nn.init.zeros_(self.lora_B.weight)  # adapted layer starts identical to the original

    def forward(self, x):
        return self.original(x) + self.scaling * self.lora_B(self.lora_A(x))
```
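In a typical LoRA fine-tuning setup the base weights are frozen and only the adapter parameters are trained. A minimal sketch of wrapping one layer; the base model here is hypothetical and stands in for a pretrained network:

```python
import torch
import torch.nn as nn

# Hypothetical base model; in practice this would be a pretrained network
base_model = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 10))

# Freeze every original parameter
for p in base_model.parameters():
    p.requires_grad = False

# Wrap the first linear layer with the LoRA adapter defined above
base_model[0] = LoRALayer(base_model[0], rank=8, alpha=16)

# Only lora_A / lora_B have requires_grad=True, so only they are updated
trainable = [p for p in base_model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable, lr=1e-4)
```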
Efficiency Metrics
```python
import time

import torch

def measure_efficiency(model, input_shape, device='cuda'):
    model = model.to(device)
    model.eval()

    # Model size
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    size_mb = (param_size + buffer_size) / 1024 / 1024

    # FLOPs (using thop, which counts multiply-accumulate ops)
    from thop import profile
    dummy_input = torch.randn(1, *input_shape).to(device)
    flops, params = profile(model, inputs=(dummy_input,))

    # Latency
    warmup = 10
    iterations = 100
    with torch.no_grad():
        for _ in range(warmup):
            model(dummy_input)
        if device == 'cuda':
            torch.cuda.synchronize()

        start = time.time()
        for _ in range(iterations):
            model(dummy_input)
        if device == 'cuda':
            torch.cuda.synchronize()
    latency_ms = (time.time() - start) / iterations * 1000

    return {
        "size_mb": size_mb,
        "params": params,
        "flops": flops,
        "latency_ms": latency_ms,
        "throughput": 1000 / latency_ms,  # samples/sec at batch size 1
    }
```
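A typical use is comparing two variants of a model, for example the distillation teacher and student from earlier, on the same input shape. A sketch, assuming the `thop` package is installed and a CUDA device is available:

```python
teacher_stats = measure_efficiency(teacher, input_shape=(3, 224, 224))
student_stats = measure_efficiency(student, input_shape=(3, 224, 224))

for key in ("size_mb", "flops", "latency_ms"):
    print(f"{key}: {teacher_stats[key]:,.1f} -> {student_stats[key]:,.1f}")
```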
Commands
- `/omgoptim:quantize` - Apply quantization
- `/omgoptim:prune` - Apply pruning
- `/omgoptim:distill` - Knowledge distillation
- `/omgoptim:profile` - Profile efficiency
Best Practices
- Start with the largest model that works
- Quantize first (usually little to no accuracy loss)
- Prune iteratively with fine-tuning
- Use distillation for maximum compression
- Profile on target hardware