data-analysis-sql

SQL for Data Analysis

Safety Notice

This listing is imported from skills.sh public index metadata. Review upstream SKILL.md and repository scripts before running.

Copy this and send it to your AI assistant to learn

Install skill "data-analysis-sql" with this command: npx skills add pluginagentmarketplace/custom-plugin-sql/pluginagentmarketplace-custom-plugin-sql-data-analysis-sql

SQL for Data Analysis

Exploratory Data Analysis (EDA)

Data Profiling

-- Understand data structure and quality SELECT COUNT() as record_count FROM employees; SELECT COUNT(DISTINCT department) as unique_departments FROM employees; SELECT COUNT() - COUNT(email) as missing_emails FROM employees;

-- Column value distribution SELECT salary, COUNT(*) as frequency FROM employees GROUP BY salary ORDER BY frequency DESC;

-- Missing data analysis SELECT COUNT() as total_records, COUNT(phone) as non_null_phone, COUNT() - COUNT(phone) as missing_phone, ROUND(100.0 * (COUNT() - COUNT(phone)) / COUNT(), 2) as missing_percentage FROM employees;

-- Data type and range checks SELECT MIN(salary) as min_salary, MAX(salary) as max_salary, ROUND(AVG(salary), 2) as avg_salary, ROUND(STDDEV(salary), 2) as salary_stddev FROM employees;

Distribution Analysis

-- Value frequency distribution SELECT department, COUNT() as emp_count, ROUND(100.0 * COUNT() / SUM(COUNT(*)) OVER (), 2) as percentage FROM employees GROUP BY department ORDER BY emp_count DESC;

-- Salary ranges and distribution SELECT CASE WHEN salary < 50000 THEN 'Under 50K' WHEN salary < 75000 THEN '50K-75K' WHEN salary < 100000 THEN '75K-100K' ELSE '100K+' END as salary_range, COUNT(*) as emp_count, MIN(salary) as min_sal, MAX(salary) as max_sal, ROUND(AVG(salary), 2) as avg_sal FROM employees GROUP BY salary_range ORDER BY MIN(salary);

-- Distribution visualization data SELECT salary, COUNT() as frequency, ROUND(100.0 * COUNT() / SUM(COUNT()) OVER (), 2) as pct, RPAD('', COUNT() / 10, '') as bar_chart FROM employees GROUP BY salary ORDER BY salary;

Statistical Analysis

Summary Statistics

-- Comprehensive statistics by group SELECT department, COUNT(*) as count, ROUND(AVG(salary), 2) as mean_salary, ROUND(MIN(salary), 2) as min_salary, ROUND(MAX(salary), 2) as max_salary, ROUND(STDDEV(salary), 2) as stddev_salary, ROUND(AVG(ABS(salary - (SELECT AVG(salary) FROM employees WHERE department = e.department))), 2) as avg_deviation FROM employees e GROUP BY department ORDER BY mean_salary DESC;

-- Percentile analysis SELECT department, ROUND(PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary), 2) as q1, ROUND(PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY salary), 2) as median, ROUND(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary), 2) as q3, ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY salary), 2) as p95 FROM employees GROUP BY department;

Outlier Detection

-- Find outliers using standard deviation SELECT emp_id, first_name, salary, ROUND(AVG(salary) OVER (), 2) as avg_salary, ROUND(STDDEV(salary) OVER (), 2) as stddev_salary, ROUND(ABS(salary - AVG(salary) OVER ()) / NULLIF(STDDEV(salary) OVER (), 0), 2) as z_score FROM employees HAVING ABS(salary - AVG(salary) OVER ()) / NULLIF(STDDEV(salary) OVER (), 0) > 3 ORDER BY z_score DESC;

-- IQR method for outliers WITH salary_stats AS ( SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary) as q1, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary) as q3 FROM employees ) SELECT emp_id, salary, CASE WHEN salary < (SELECT q1 FROM salary_stats) - 1.5 * ((SELECT q3 FROM salary_stats) - (SELECT q1 FROM salary_stats)) OR salary > (SELECT q3 FROM salary_stats) + 1.5 * ((SELECT q3 FROM salary_stats) - (SELECT q1 FROM salary_stats)) THEN 'Outlier' ELSE 'Normal' END as outlier_status FROM employees;

Comparative Analysis

Period-over-Period Comparison

-- Year-over-year sales comparison SELECT EXTRACT(QUARTER FROM order_date) as quarter, EXTRACT(YEAR FROM order_date) as year, ROUND(SUM(amount), 2) as total_sales, ROUND(LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as prev_period, ROUND(SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as yoy_change, ROUND(100.0 * (SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date))) / LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as yoy_pct_change FROM orders GROUP BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date) ORDER BY year, quarter;

Cohort Analysis

-- User cohort analysis WITH user_cohorts AS ( SELECT DATE_TRUNC('month', first_order_date)::DATE as cohort_month, user_id, DATE_TRUNC('month', order_date)::DATE as order_month FROM users u LEFT JOIN orders o ON u.id = o.user_id ) SELECT cohort_month, DATE_PART('month', order_month - cohort_month) / 1 as months_since_cohort, COUNT(DISTINCT user_id) as users, ROUND(100.0 * COUNT(DISTINCT user_id) / (SELECT COUNT(DISTINCT user_id) FROM user_cohorts WHERE order_month = cohort_month), 2) as retention_rate FROM user_cohorts WHERE order_month >= cohort_month GROUP BY cohort_month, months_since_cohort ORDER BY cohort_month, months_since_cohort;

Correlation & Relationship Analysis

-- Correlation between variables WITH salary_data AS ( SELECT years_experience, salary, AVG(salary) OVER () as avg_salary, AVG(years_experience) OVER () as avg_experience, STDDEV(salary) OVER () as stddev_salary, STDDEV(years_experience) OVER () as stddev_experience FROM employees ) SELECT ROUND( SUM((years_experience - avg_experience) * (salary - avg_salary)) / (COUNT(*) * stddev_salary * stddev_experience), 4 ) as correlation FROM salary_data;

-- Segment analysis SELECT CASE WHEN years_experience < 2 THEN 'Junior' WHEN years_experience < 5 THEN 'Mid-level' WHEN years_experience < 10 THEN 'Senior' ELSE 'Expert' END as experience_level, COUNT() as count, ROUND(AVG(salary), 2) as avg_salary, ROUND(AVG(performance_rating), 2) as avg_rating FROM employees GROUP BY experience_level ORDER BY COUNT() DESC;

Data Quality Validation

-- Check for invalid values SELECT CASE WHEN salary < 0 THEN 'Negative salary' WHEN salary > 1000000 THEN 'Unusually high salary' WHEN email NOT LIKE '%@%' THEN 'Invalid email' WHEN hire_date > CURRENT_DATE THEN 'Future hire date' WHEN years_experience > 70 THEN 'Impossible experience' ELSE NULL END as data_quality_issue, COUNT(*) as count FROM employees WHERE salary < 0 OR salary > 1000000 OR email NOT LIKE '%@%' OR hire_date > CURRENT_DATE OR years_experience > 70 GROUP BY data_quality_issue;

-- Duplicate detection SELECT email, COUNT() as occurrence_count, STRING_AGG(DISTINCT emp_id::text, ', ') as emp_ids FROM employees WHERE email IS NOT NULL GROUP BY email HAVING COUNT() > 1 ORDER BY occurrence_count DESC;

Trend Analysis

-- Moving average SELECT order_date, amount, ROUND(AVG(amount) OVER ( ORDER BY order_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW ), 2) as moving_avg_7day, ROUND(AVG(amount) OVER ( ORDER BY order_date ROWS BETWEEN 29 PRECEDING AND CURRENT ROW ), 2) as moving_avg_30day FROM daily_orders ORDER BY order_date;

-- Growth rate SELECT DATE_TRUNC('month', order_date)::DATE as month, ROUND(SUM(amount), 2) as monthly_revenue, ROUND((SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY DATE_TRUNC('month', order_date))) / LAG(SUM(amount)) OVER (ORDER BY DATE_TRUNC('month', order_date)) * 100, 2) as growth_rate_pct FROM orders GROUP BY DATE_TRUNC('month', order_date) ORDER BY month;

Next Steps

Learn advanced SQL concepts and optimization techniques in the advanced-sql skill.

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.

Related Skills

Related by shared tags or category signals.