Monitoring & Logging
Overview
Application observability through logging, metrics collection, monitoring dashboards, and alerting systems.
Structured Logging
Pino Logger (Node.js)
import pino from 'pino';
/**
 * Base logger configuration.
 *
 * - Level is read from `LOG_LEVEL` and falls back to `info`.
 * - The `level` formatter emits the level label instead of the numeric value.
 * - The `bindings` formatter returns an empty object, which removes the
 *   default `pid` and `hostname` fields.
 * - Timestamps are ISO-8601 strings.
 * - Credentials are replaced with `[REDACTED]`.
 */
const logger = pino({
  // `||` (not `??`) so an empty LOG_LEVEL also falls back to 'info'.
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    level: (label) => ({ level: label }),
    bindings: () => ({}), // Remove pid and hostname
  },
  timestamp: pino.stdTimeFunctions.isoTime,
  redact: {
    // Top-level keys plus the same keys one level down. The nested form needs
    // the `*` wildcard: a path with a bare leading dot is not a valid redact path.
    paths: ['password', 'token', 'authorization', '*.password', '*.token'],
    censor: '[REDACTED]',
  },
});
/**
 * Child logger with context.
 *
 * Derives a logger from the base `logger` that carries per-request fields:
 * the request id (taken from the `x-request-id` header, or a freshly
 * generated UUID when the header is absent or empty), HTTP method, path,
 * user agent and the id of `req.user` when one is set.
 */
function createRequestLogger(req: Request) {
  const { headers, method, path } = req;
  const requestId = headers['x-request-id'] || crypto.randomUUID();

  const bindings = {
    requestId,
    method,
    path,
    userAgent: headers['user-agent'],
    userId: req.user?.id,
  };

  return logger.child(bindings);
}
// Express middleware: attaches a request-scoped logger as `req.log` and emits
// one 'request completed' entry when the response finishes.
app.use((req, res, next) => {
  req.log = createRequestLogger(req);
  const startedAt = Date.now();

  const logCompletion = () => {
    const summary = {
      statusCode: res.statusCode,
      duration: Date.now() - startedAt, // elapsed milliseconds
      contentLength: res.get('content-length'),
    };
    req.log.info(summary, 'request completed');
  };

  res.on('finish', logCompletion);
  next();
});
// Usage in handlers: log through the request-scoped logger so every entry
// carries the request context bound by createRequestLogger.
app.get('/api/users/:id', async (req, res) => {
  const requestedUserId = req.params.id;

  // Logged under its own key: the request logger already binds `userId`
  // (taken from `req.user`), and repeating that key here would produce a
  // duplicate `userId` field in the emitted JSON line.
  req.log.info({ requestedUserId }, 'fetching user');

  try {
    const user = await getUser(requestedUserId);
    req.log.debug({ user: user.id }, 'user found');
    res.json(user);
  } catch (error) {
    // pino applies its Error serializer to the `err` key. Under any other
    // key an Error is JSON-stringified as `{}` because `message` and `stack`
    // are non-enumerable.
    req.log.error({ err: error }, 'failed to fetch user');
    res.status(500).json({ error: 'Internal error' });
  }
});
Log Levels
// Log level guidelines (numeric level, then intended use)

// 10 - Very verbose
logger.trace('Detailed debugging info');
// 20 - Debug mode only
logger.debug('Debugging information');
// 30 - Default level
logger.info('Normal operation events');
// 40 - Potential issues
logger.warn('Warning conditions');
// 50 - Errors that need attention
logger.error('Error conditions');
// 60 - System failure
logger.fatal('System-critical errors');

// Contextual logging: structured fields go first, the message second.
logger.info({ orderId, userId, amount }, 'order placed');
logger.error(
  {
    error: err.message,
    stack: err.stack,
  },
  'payment failed',
);
logger.warn({ retryCount, maxRetries }, 'retry attempt');
Log Aggregation Format
{ "timestamp": "2024-01-15T10:30:00.000Z", "level": "info", "message": "request completed", "service": "api", "version": "1.2.3", "environment": "production", "requestId": "abc-123", "traceId": "xyz-789", "method": "GET", "path": "/api/users/123", "statusCode": 200, "duration": 45, "userId": "user-456" }
Metrics Collection
Prometheus Metrics
import { Counter, Histogram, Gauge, Registry, collectDefaultMetrics } from 'prom-client';
/** Registry that every metric below is registered on. */
const register = new Registry();

// Collect default Node.js metrics
collectDefaultMetrics({ register });

/** Histogram bucket upper bounds for request latency, in seconds. */
const HTTP_DURATION_BUCKETS_SECONDS = [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10];

// HTTP request metrics

/** Request count, labelled by method, path and response status. */
const httpRequestsTotal = new Counter({
  registers: [register],
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'path', 'status'],
});

/** Request latency distribution, labelled by method and path. */
const httpRequestDuration = new Histogram({
  registers: [register],
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'path'],
  buckets: HTTP_DURATION_BUCKETS_SECONDS,
});
// Business metrics

/** Histogram bucket upper bounds for order totals. */
const ORDER_AMOUNT_BUCKETS = [10, 50, 100, 250, 500, 1000, 5000];

/** Order count, labelled by order status and payment method. */
const ordersTotal = new Counter({
  registers: [register],
  name: 'orders_total',
  help: 'Total number of orders',
  labelNames: ['status', 'payment_method'],
});

/** Gauge for the number of currently active users. */
const activeUsers = new Gauge({
  registers: [register],
  name: 'active_users',
  help: 'Number of currently active users',
});

/** Distribution of order amounts. */
const orderAmount = new Histogram({
  registers: [register],
  name: 'order_amount_dollars',
  help: 'Distribution of order amounts',
  buckets: ORDER_AMOUNT_BUCKETS,
});
// Middleware to collect metrics: times every request and counts it by
// method, path and status once the response has finished.
app.use((req, res, next) => {
  // Start timing now, but attach labels only when the timer is stopped.
  const stopTimer = httpRequestDuration.startTimer();

  res.on('finish', () => {
    // `req.route` is only set after a route has matched, which happens after
    // this middleware runs. Reading it here (instead of when the timer is
    // started) lets the histogram use the route pattern (e.g. '/api/users/:id'),
    // the same label value the counter gets, rather than the raw URL.
    // NOTE(review): unmatched requests still fall back to the raw `req.path`.
    const path = req.route?.path || req.path;

    stopTimer({ method: req.method, path });
    httpRequestsTotal.inc({
      method: req.method,
      path,
      status: res.statusCode.toString(),
    });
  });

  next();
});
// Metrics endpoint: serves the contents of `register` in the registry's
// exposition content type.
app.get('/metrics', async (req, res) => {
  try {
    res.set('Content-Type', register.contentType);
    res.send(await register.metrics());
  } catch (error) {
    // Without this catch, a rejection inside the async handler would leave
    // the scrape request without a response.
    const reason = error instanceof Error ? error.message : String(error);
    res.status(500).end(reason);
  }
});
// Business metric usage: record one order in the order counter and in the
// order-amount histogram.
async function createOrder(order: Order) {
  // ... create order
  const { status, paymentMethod, total } = order;

  ordersTotal.inc({ status, payment_method: paymentMethod });
  orderAmount.observe(total);
}
Custom Metrics Patterns
// All metrics below are registered on `register`, the registry served by the
// /metrics endpoint. Without an explicit `registers` entry prom-client puts a
// metric on its global default registry, which that endpoint does not expose.

// Rate limiting metrics
const rateLimitHits = new Counter({
  name: 'rate_limit_hits_total',
  help: 'Number of rate limit hits',
  labelNames: ['endpoint', 'user_tier'],
  registers: [register],
});

// Cache metrics
const cacheHits = new Counter({
  name: 'cache_hits_total',
  help: 'Number of cache hits',
  labelNames: ['cache_name'],
  registers: [register],
});

const cacheMisses = new Counter({
  name: 'cache_misses_total',
  help: 'Number of cache misses',
  labelNames: ['cache_name'],
  registers: [register],
});

// Database metrics
const dbQueryDuration = new Histogram({
  name: 'db_query_duration_seconds',
  help: 'Database query duration',
  labelNames: ['operation', 'table'],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
  registers: [register],
});

const dbConnectionPool = new Gauge({
  name: 'db_connection_pool_size',
  help: 'Database connection pool size',
  labelNames: ['state'], // active, idle, waiting
  registers: [register],
});

// Queue metrics
const queueSize = new Gauge({
  name: 'queue_size',
  help: 'Number of items in queue',
  labelNames: ['queue_name'],
  registers: [register],
});

// No explicit buckets: prom-client's default histogram buckets apply.
const jobDuration = new Histogram({
  name: 'job_duration_seconds',
  help: 'Job processing duration',
  labelNames: ['job_type', 'status'],
  registers: [register],
});
Alerting
Alert Rules (Prometheus)
prometheus/alerts.yml
groups:
-
name: application rules:
High error rate
- alert: HighErrorRate expr: | sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05 for: 5m labels: severity: critical annotations: summary: "High error rate detected" description: "Error rate is {{ $value | humanizePercentage }}"
High latency
- alert: HighLatency
expr: |
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 1
for: 5m labels: severity: warning annotations: summary: "High latency detected" description: "95th percentile latency is {{ $value }}s"
Service down
- alert: ServiceDown expr: up == 0 for: 1m labels: severity: critical annotations: summary: "Service {{ $labels.instance }} is down"
High memory usage
- alert: HighMemoryUsage expr: | process_resident_memory_bytes / 1024 / 1024 / 1024 > 4 for: 10m labels: severity: warning annotations: summary: "High memory usage" description: "Memory usage is {{ $value | humanize }}GB"
-
name: business rules:
Low order rate
- alert: LowOrderRate expr: | sum(rate(orders_total[1h])) < 10 for: 30m labels: severity: warning annotations: summary: "Order rate is below normal"
Payment failures
- alert: HighPaymentFailures expr: | sum(rate(orders_total{status="failed"}[15m])) / sum(rate(orders_total[15m])) > 0.1 for: 10m labels: severity: critical annotations: summary: "High payment failure rate"
PagerDuty Integration
import axios from 'axios';
/** An alert to be forwarded to PagerDuty. */
interface Alert {
  /** Severity forwarded as the event payload's `severity`. */
  severity: 'critical' | 'error' | 'warning' | 'info';
  /** Short description; also used as part of the deduplication key. */
  summary: string;
  /** Origin of the alert; also used as part of the deduplication key. */
  source: string;
  /** Optional free-form context, forwarded as `custom_details`. */
  details?: Record<string, unknown>;
}
/**
 * Triggers a PagerDuty incident for `alert` through the Events API v2
 * enqueue endpoint.
 *
 * The deduplication key is `<source>-<summary>`, so the same key must be
 * passed to `resolvePagerDutyAlert` to resolve the incident.
 *
 * @param alert - Alert to forward; `details` is sent as `custom_details`.
 * @throws Whatever `axios.post` rejects with.
 */
async function sendPagerDutyAlert(alert: Alert) {
  const event = {
    routing_key: process.env.PAGERDUTY_ROUTING_KEY,
    event_action: 'trigger',
    // Template literal: the interpolation is only valid inside backticks.
    dedup_key: `${alert.source}-${alert.summary}`,
    payload: {
      summary: alert.summary,
      severity: alert.severity,
      source: alert.source,
      custom_details: alert.details,
      timestamp: new Date().toISOString(),
    },
  };

  await axios.post('https://events.pagerduty.com/v2/enqueue', event);
}
/**
 * Resolve alert.
 *
 * Posts a `resolve` event for the given deduplication key to the PagerDuty
 * enqueue endpoint.
 *
 * @param dedupKey - Deduplication key of the alert to resolve.
 */
async function resolvePagerDutyAlert(dedupKey: string) {
  const resolveEvent = {
    routing_key: process.env.PAGERDUTY_ROUTING_KEY,
    event_action: 'resolve',
    dedup_key: dedupKey,
  };

  await axios.post('https://events.pagerduty.com/v2/enqueue', resolveEvent);
}
Dashboards
Grafana Dashboard JSON
{ "title": "Application Overview", "panels": [ { "title": "Request Rate", "type": "graph", "targets": [ { "expr": "sum(rate(http_requests_total[5m])) by (status)", "legendFormat": "{{status}}" } ] }, { "title": "Latency (p95)", "type": "graph", "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path))", "legendFormat": "{{path}}" } ] }, { "title": "Error Rate", "type": "stat", "targets": [ { "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100" } ], "fieldConfig": { "defaults": { "unit": "percent", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 5 } ] } } } }, { "title": "Active Users", "type": "stat", "targets": [ { "expr": "active_users" } ] } ] }
Health Checks
import { Router } from 'express';
/** Router that hosts the health-check endpoints. */
const healthRouter = Router();

// Liveness probe - is the app running?
// Always answers { status: 'ok' } and checks no dependencies.
healthRouter.get('/health/live', (_req, res) => {
  const body = { status: 'ok' };
  res.json(body);
});
// Readiness probe - is the app ready to serve traffic?
// Runs all dependency checks concurrently and waits for every one to settle.
// Responds 200 with status 'ok' when all succeed, otherwise 503 with status
// 'degraded'; the body lists each dependency as 'ok' or 'error'.
healthRouter.get('/health/ready', async (_req, res) => {
  const [dbOutcome, redisOutcome, apiOutcome] = await Promise.allSettled([
    checkDatabase(),
    checkRedis(),
    checkExternalApi(),
  ]);

  const toStatus = (outcome: PromiseSettledResult<unknown>) =>
    outcome.status === 'fulfilled' ? 'ok' : 'error';

  const results = {
    database: toStatus(dbOutcome),
    redis: toStatus(redisOutcome),
    externalApi: toStatus(apiOutcome),
  };

  const allHealthy = !Object.values(results).some((state) => state !== 'ok');
  const httpStatus = allHealthy ? 200 : 503;
  const overall = allHealthy ? 'ok' : 'degraded';

  res.status(httpStatus).json({
    status: overall,
    checks: results,
    timestamp: new Date().toISOString(),
  });
});
/**
 * Runs `SELECT 1` against the database.
 *
 * @returns The elapsed time of the query in milliseconds.
 */
async function checkDatabase() {
  const startedAt = Date.now();
  await db.query('SELECT 1');
  const latency = Date.now() - startedAt;
  return { latency };
}
/**
 * Pings Redis.
 *
 * @returns The elapsed time of the ping in milliseconds.
 */
async function checkRedis() {
  const startedAt = Date.now();
  await redis.ping();
  const latency = Date.now() - startedAt;
  return { latency };
}
/**
 * Calls the external API's health endpoint.
 *
 * @returns The elapsed time of the request in milliseconds.
 * @throws If the request fails, takes longer than 5 seconds, or the response
 *   status is not 2xx.
 */
async function checkExternalApi() {
  const start = Date.now();

  // fetch() has no `timeout` option; the request is cancelled through an
  // AbortSignal instead.
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 5000);

  try {
    const response = await fetch('https://api.example.com/health', {
      signal: controller.signal,
    });

    // fetch() resolves for any HTTP status, so an unhealthy upstream has to
    // be turned into a rejection explicitly.
    if (!response.ok) {
      throw new Error(`External API health check failed with status ${response.status}`);
    }
  } finally {
    clearTimeout(timer);
  }

  return { latency: Date.now() - start };
}
Related Skills
- [[reliability-engineering]] - SRE practices
- [[devops-cicd]] - CI/CD monitoring
- [[cloud-platforms]] - Cloud monitoring