SQL-Patterns
Advanced SQL patterns for data engineering beyond basic SELECT/JOIN.
Common Table Expressions (CTEs)
-- Chain transformations readably WITH active_users AS ( SELECT user_id, email FROM users WHERE status = 'active' ), user_orders AS ( SELECT u.user_id, COUNT(*) as order_count FROM active_users u JOIN orders o ON u.user_id = o.user_id GROUP BY u.user_id ) SELECT * FROM user_orders WHERE order_count > 5;
Window Functions
-- Row numbering within groups SELECT *, ROW_NUMBER() OVER (PARTITION BY category ORDER BY created_at DESC) as rn FROM products;
-- Running totals SELECT date, revenue, SUM(revenue) OVER (ORDER BY date) as cumulative_revenue FROM daily_sales;
-- Percent of total SELECT category, sales, sales * 100.0 / SUM(sales) OVER () as pct_of_total FROM category_sales;
-- Lead/Lag for time series SELECT date, value, LAG(value, 1) OVER (ORDER BY date) as prev_value, value - LAG(value, 1) OVER (ORDER BY date) as change FROM metrics;
-- Ranking with ties SELECT *, RANK() OVER (ORDER BY score DESC) as rank, -- 1,2,2,4 DENSE_RANK() OVER (ORDER BY score DESC) as drank -- 1,2,2,3 FROM scores;
Recursive CTEs
-- Hierarchical data (org chart, categories) WITH RECURSIVE org_tree AS ( -- Base case: top-level managers SELECT id, name, manager_id, 1 as depth FROM employees WHERE manager_id IS NULL
UNION ALL
-- Recursive case: subordinates SELECT e.id, e.name, e.manager_id, t.depth + 1 FROM employees e JOIN org_tree t ON e.manager_id = t.id ) SELECT * FROM org_tree;
-- Generate date series WITH RECURSIVE dates AS ( SELECT DATE '2024-01-01' as dt UNION ALL SELECT dt + INTERVAL '1 day' FROM dates WHERE dt < DATE '2024-12-31' ) SELECT * FROM dates;
CASE Expressions
-- Simple CASE SELECT CASE status WHEN 'A' THEN 'Active' WHEN 'I' THEN 'Inactive' ELSE 'Unknown' END as status_label FROM users;
-- Searched CASE for ranges SELECT CASE WHEN age < 18 THEN 'Minor' WHEN age < 65 THEN 'Adult' ELSE 'Senior' END as age_group FROM users;
-- Conditional aggregation SELECT COUNT() as total, COUNT() FILTER (WHERE status = 'active') as active_count, -- PostgreSQL SUM(CASE WHEN status = 'active' THEN 1 ELSE 0 END) as active_count -- ANSI FROM users;
UPSERT Patterns
-- PostgreSQL: INSERT ON CONFLICT INSERT INTO inventory (sku, quantity, updated_at) VALUES ('ABC123', 100, NOW()) ON CONFLICT (sku) DO UPDATE SET quantity = EXCLUDED.quantity, updated_at = EXCLUDED.updated_at;
-- MySQL: INSERT ON DUPLICATE KEY INSERT INTO inventory (sku, quantity, updated_at) VALUES ('ABC123', 100, NOW()) ON DUPLICATE KEY UPDATE quantity = VALUES(quantity), updated_at = VALUES(updated_at);
-- SQLite: INSERT OR REPLACE INSERT OR REPLACE INTO inventory (sku, quantity, updated_at) VALUES ('ABC123', 100, datetime('now'));
Efficient Pagination
-- BAD: OFFSET for large pages SELECT * FROM orders ORDER BY id LIMIT 20 OFFSET 10000;
-- GOOD: Keyset pagination SELECT * FROM orders WHERE id > 10000 -- last seen id ORDER BY id LIMIT 20;
Batch Operations
-- Batch DELETE with limit (avoid long locks) DELETE FROM logs WHERE created_at < NOW() - INTERVAL '90 days' LIMIT 10000;
-- Batch UPDATE UPDATE orders SET status = 'archived' WHERE id IN ( SELECT id FROM orders WHERE status = 'completed' AND completed_at < NOW() - INTERVAL '1 year' LIMIT 1000 );
Index-Friendly Queries
-- BAD: Function on indexed column SELECT * FROM users WHERE LOWER(email) = 'test@example.com';
-- GOOD: Store lowercase or use expression index SELECT * FROM users WHERE email_lower = 'test@example.com'; -- Or: CREATE INDEX idx_email_lower ON users (LOWER(email));
-- BAD: Leading wildcard SELECT * FROM products WHERE name LIKE '%widget%';
-- GOOD: Full-text search or prefix match SELECT * FROM products WHERE name LIKE 'widget%';
NULL Handling
-- COALESCE for defaults SELECT COALESCE(nickname, first_name, 'Anonymous') as display_name FROM users;
-- NULLIF to convert values to NULL SELECT NULLIF(status, '') as status -- empty string -> NULL FROM records;
-- IS DISTINCT FROM (NULL-safe comparison) SELECT * FROM a WHERE a.value IS DISTINCT FROM b.value; -- treats NULL != NULL as false
LATERAL Joins
-- Top N per group SELECT d.name, t.product, t.revenue FROM departments d CROSS JOIN LATERAL ( SELECT product, revenue FROM sales WHERE sales.dept_id = d.id ORDER BY revenue DESC LIMIT 3 ) t;
Materialized Views
-- Create for expensive aggregations CREATE MATERIALIZED VIEW daily_stats AS SELECT DATE_TRUNC('day', created_at) as date, COUNT(*) as total_orders, SUM(amount) as revenue FROM orders GROUP BY 1;
-- Refresh periodically REFRESH MATERIALIZED VIEW CONCURRENTLY daily_stats;
Query Optimization Checklist
-
Check EXPLAIN ANALYZE - Look for sequential scans on large tables
-
Add missing indexes - Columns in WHERE, JOIN, ORDER BY
-
**Avoid SELECT *** - Fetch only needed columns
-
Use EXISTS over IN - For correlated subqueries
-
Batch large operations - Avoid long-running transactions
-
Partition large tables - By date or category
-
Use connection pooling - Avoid connection overhead