SQL for Data Analysis
Exploratory Data Analysis (EDA)
Data Profiling
-- Understand data structure and quality SELECT COUNT() as record_count FROM employees; SELECT COUNT(DISTINCT department) as unique_departments FROM employees; SELECT COUNT() - COUNT(email) as missing_emails FROM employees;
-- Column value distribution SELECT salary, COUNT(*) as frequency FROM employees GROUP BY salary ORDER BY frequency DESC;
-- Missing data analysis SELECT COUNT() as total_records, COUNT(phone) as non_null_phone, COUNT() - COUNT(phone) as missing_phone, ROUND(100.0 * (COUNT() - COUNT(phone)) / COUNT(), 2) as missing_percentage FROM employees;
-- Data type and range checks SELECT MIN(salary) as min_salary, MAX(salary) as max_salary, ROUND(AVG(salary), 2) as avg_salary, ROUND(STDDEV(salary), 2) as salary_stddev FROM employees;
Distribution Analysis
-- Value frequency distribution SELECT department, COUNT() as emp_count, ROUND(100.0 * COUNT() / SUM(COUNT(*)) OVER (), 2) as percentage FROM employees GROUP BY department ORDER BY emp_count DESC;
-- Salary ranges and distribution SELECT CASE WHEN salary < 50000 THEN 'Under 50K' WHEN salary < 75000 THEN '50K-75K' WHEN salary < 100000 THEN '75K-100K' ELSE '100K+' END as salary_range, COUNT(*) as emp_count, MIN(salary) as min_sal, MAX(salary) as max_sal, ROUND(AVG(salary), 2) as avg_sal FROM employees GROUP BY salary_range ORDER BY MIN(salary);
-- Distribution visualization data SELECT salary, COUNT() as frequency, ROUND(100.0 * COUNT() / SUM(COUNT()) OVER (), 2) as pct, RPAD('', COUNT() / 10, '') as bar_chart FROM employees GROUP BY salary ORDER BY salary;
Statistical Analysis
Summary Statistics
-- Comprehensive statistics by group SELECT department, COUNT(*) as count, ROUND(AVG(salary), 2) as mean_salary, ROUND(MIN(salary), 2) as min_salary, ROUND(MAX(salary), 2) as max_salary, ROUND(STDDEV(salary), 2) as stddev_salary, ROUND(AVG(ABS(salary - (SELECT AVG(salary) FROM employees WHERE department = e.department))), 2) as avg_deviation FROM employees e GROUP BY department ORDER BY mean_salary DESC;
-- Percentile analysis SELECT department, ROUND(PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary), 2) as q1, ROUND(PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY salary), 2) as median, ROUND(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary), 2) as q3, ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY salary), 2) as p95 FROM employees GROUP BY department;
Outlier Detection
-- Find outliers using standard deviation SELECT emp_id, first_name, salary, ROUND(AVG(salary) OVER (), 2) as avg_salary, ROUND(STDDEV(salary) OVER (), 2) as stddev_salary, ROUND(ABS(salary - AVG(salary) OVER ()) / NULLIF(STDDEV(salary) OVER (), 0), 2) as z_score FROM employees HAVING ABS(salary - AVG(salary) OVER ()) / NULLIF(STDDEV(salary) OVER (), 0) > 3 ORDER BY z_score DESC;
-- IQR method for outliers WITH salary_stats AS ( SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary) as q1, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary) as q3 FROM employees ) SELECT emp_id, salary, CASE WHEN salary < (SELECT q1 FROM salary_stats) - 1.5 * ((SELECT q3 FROM salary_stats) - (SELECT q1 FROM salary_stats)) OR salary > (SELECT q3 FROM salary_stats) + 1.5 * ((SELECT q3 FROM salary_stats) - (SELECT q1 FROM salary_stats)) THEN 'Outlier' ELSE 'Normal' END as outlier_status FROM employees;
Comparative Analysis
Period-over-Period Comparison
-- Year-over-year sales comparison SELECT EXTRACT(QUARTER FROM order_date) as quarter, EXTRACT(YEAR FROM order_date) as year, ROUND(SUM(amount), 2) as total_sales, ROUND(LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as prev_period, ROUND(SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as yoy_change, ROUND(100.0 * (SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date))) / LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as yoy_pct_change FROM orders GROUP BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date) ORDER BY year, quarter;
Cohort Analysis
-- User cohort analysis WITH user_cohorts AS ( SELECT DATE_TRUNC('month', first_order_date)::DATE as cohort_month, user_id, DATE_TRUNC('month', order_date)::DATE as order_month FROM users u LEFT JOIN orders o ON u.id = o.user_id ) SELECT cohort_month, DATE_PART('month', order_month - cohort_month) / 1 as months_since_cohort, COUNT(DISTINCT user_id) as users, ROUND(100.0 * COUNT(DISTINCT user_id) / (SELECT COUNT(DISTINCT user_id) FROM user_cohorts WHERE order_month = cohort_month), 2) as retention_rate FROM user_cohorts WHERE order_month >= cohort_month GROUP BY cohort_month, months_since_cohort ORDER BY cohort_month, months_since_cohort;
Correlation & Relationship Analysis
-- Correlation between variables WITH salary_data AS ( SELECT years_experience, salary, AVG(salary) OVER () as avg_salary, AVG(years_experience) OVER () as avg_experience, STDDEV(salary) OVER () as stddev_salary, STDDEV(years_experience) OVER () as stddev_experience FROM employees ) SELECT ROUND( SUM((years_experience - avg_experience) * (salary - avg_salary)) / (COUNT(*) * stddev_salary * stddev_experience), 4 ) as correlation FROM salary_data;
-- Segment analysis SELECT CASE WHEN years_experience < 2 THEN 'Junior' WHEN years_experience < 5 THEN 'Mid-level' WHEN years_experience < 10 THEN 'Senior' ELSE 'Expert' END as experience_level, COUNT() as count, ROUND(AVG(salary), 2) as avg_salary, ROUND(AVG(performance_rating), 2) as avg_rating FROM employees GROUP BY experience_level ORDER BY COUNT() DESC;
Data Quality Validation
-- Check for invalid values SELECT CASE WHEN salary < 0 THEN 'Negative salary' WHEN salary > 1000000 THEN 'Unusually high salary' WHEN email NOT LIKE '%@%' THEN 'Invalid email' WHEN hire_date > CURRENT_DATE THEN 'Future hire date' WHEN years_experience > 70 THEN 'Impossible experience' ELSE NULL END as data_quality_issue, COUNT(*) as count FROM employees WHERE salary < 0 OR salary > 1000000 OR email NOT LIKE '%@%' OR hire_date > CURRENT_DATE OR years_experience > 70 GROUP BY data_quality_issue;
-- Duplicate detection SELECT email, COUNT() as occurrence_count, STRING_AGG(DISTINCT emp_id::text, ', ') as emp_ids FROM employees WHERE email IS NOT NULL GROUP BY email HAVING COUNT() > 1 ORDER BY occurrence_count DESC;
Trend Analysis
-- Moving average SELECT order_date, amount, ROUND(AVG(amount) OVER ( ORDER BY order_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW ), 2) as moving_avg_7day, ROUND(AVG(amount) OVER ( ORDER BY order_date ROWS BETWEEN 29 PRECEDING AND CURRENT ROW ), 2) as moving_avg_30day FROM daily_orders ORDER BY order_date;
-- Growth rate SELECT DATE_TRUNC('month', order_date)::DATE as month, ROUND(SUM(amount), 2) as monthly_revenue, ROUND((SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY DATE_TRUNC('month', order_date))) / LAG(SUM(amount)) OVER (ORDER BY DATE_TRUNC('month', order_date)) * 100, 2) as growth_rate_pct FROM orders GROUP BY DATE_TRUNC('month', order_date) ORDER BY month;
Next Steps
Learn advanced SQL concepts and optimization techniques in the advanced-sql skill.