SQL Advanced Core Knowledge
Deep Knowledge: Use
mcp__documentation__fetch_docswith technology:sqlfor comprehensive documentation.
Common Table Expressions (CTEs)
Basic CTE
WITH active_users AS (
SELECT id, name, email
FROM users
WHERE status = 'active'
)
SELECT u.name, COUNT(o.id) as order_count
FROM active_users u
LEFT JOIN orders o ON o.user_id = u.id
GROUP BY u.id, u.name;
Multiple CTEs
WITH
active_users AS (
SELECT id, name FROM users WHERE status = 'active'
),
user_orders AS (
SELECT user_id, COUNT(*) as order_count, SUM(total) as total_spent
FROM orders
WHERE status = 'completed'
GROUP BY user_id
),
high_value_users AS (
SELECT u.*, uo.order_count, uo.total_spent
FROM active_users u
JOIN user_orders uo ON uo.user_id = u.id
WHERE uo.total_spent > 10000
)
SELECT * FROM high_value_users ORDER BY total_spent DESC;
Recursive CTEs
-- Hierarchical data (org chart, categories)
WITH RECURSIVE org_chart AS (
-- Base case: top-level employees
SELECT id, name, manager_id, 1 as level, ARRAY[name] as path
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- Recursive case
SELECT e.id, e.name, e.manager_id, oc.level + 1, oc.path || e.name
FROM employees e
JOIN org_chart oc ON e.manager_id = oc.id
)
SELECT * FROM org_chart ORDER BY path;
-- Generate series
WITH RECURSIVE numbers AS (
SELECT 1 as n
UNION ALL
SELECT n + 1 FROM numbers WHERE n < 100
)
SELECT * FROM numbers;
-- Date range
WITH RECURSIVE dates AS (
SELECT DATE '2024-01-01' as date
UNION ALL
SELECT date + INTERVAL '1 day' FROM dates WHERE date < '2024-12-31'
)
SELECT * FROM dates;
Materialized CTE (PostgreSQL 12+)
-- Force CTE to be materialized (evaluated once)
WITH active_users AS MATERIALIZED (
SELECT * FROM users WHERE status = 'active'
)
SELECT * FROM active_users WHERE id = 1
UNION ALL
SELECT * FROM active_users WHERE id = 2;
-- Force CTE to be inlined (not materialized)
WITH active_users AS NOT MATERIALIZED (
SELECT * FROM users WHERE status = 'active'
)
SELECT * FROM active_users WHERE id = 1;
Window Functions Deep Dive
Partitioned Calculations
SELECT
department,
name,
salary,
-- Within department
SUM(salary) OVER (PARTITION BY department) as dept_total,
AVG(salary) OVER (PARTITION BY department) as dept_avg,
salary - AVG(salary) OVER (PARTITION BY department) as diff_from_avg,
-- Percentage of department total
ROUND(100.0 * salary / SUM(salary) OVER (PARTITION BY department), 2) as pct_of_dept
FROM employees;
Running Calculations
SELECT
date,
amount,
-- Running totals
SUM(amount) OVER (ORDER BY date) as running_total,
SUM(amount) OVER (ORDER BY date ROWS UNBOUNDED PRECEDING) as same_as_above,
-- Moving averages
AVG(amount) OVER (ORDER BY date ROWS 6 PRECEDING) as moving_avg_7d,
AVG(amount) OVER (ORDER BY date ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING) as centered_avg,
-- Cumulative stats
COUNT(*) OVER (ORDER BY date) as cumulative_count,
MIN(amount) OVER (ORDER BY date) as running_min,
MAX(amount) OVER (ORDER BY date) as running_max
FROM daily_transactions;
Gap and Island Analysis
-- Find consecutive sequences (islands)
WITH numbered AS (
SELECT
date,
value,
ROW_NUMBER() OVER (ORDER BY date) as rn,
date - (ROW_NUMBER() OVER (ORDER BY date) * INTERVAL '1 day') as grp
FROM daily_data
)
SELECT
MIN(date) as island_start,
MAX(date) as island_end,
COUNT(*) as days_in_sequence
FROM numbered
GROUP BY grp
ORDER BY island_start;
-- Find gaps in sequence
SELECT
id,
LEAD(id) OVER (ORDER BY id) as next_id,
LEAD(id) OVER (ORDER BY id) - id - 1 as gap_size
FROM items
WHERE LEAD(id) OVER (ORDER BY id) - id > 1;
First/Last in Group
-- Get first and last values per group
SELECT DISTINCT ON (department)
department,
name as highest_paid,
salary
FROM employees
ORDER BY department, salary DESC;
-- With window functions
SELECT DISTINCT
department,
FIRST_VALUE(name) OVER w as highest_paid,
LAST_VALUE(name) OVER w as lowest_paid
FROM employees
WINDOW w AS (
PARTITION BY department
ORDER BY salary DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
);
Query Optimization
EXPLAIN Basics
-- Show query plan
EXPLAIN SELECT * FROM users WHERE email = 'test@example.com';
-- Show actual execution stats
EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'test@example.com';
-- More details
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT * FROM users WHERE email = 'test@example.com';
-- JSON output for tools
EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON)
SELECT * FROM users WHERE email = 'test@example.com';
Reading EXPLAIN Output
Seq Scan on users (cost=0.00..155.00 rows=1 width=100) (actual time=0.015..0.842 rows=1 loops=1)
Filter: (email = 'test@example.com'::text)
Rows Removed by Filter: 4999
| Term | Meaning |
|------|---------|
| Seq Scan | Full table scan (often bad) |
| Index Scan | Using index (good) |
| Index Only Scan | Using covering index (best) |
| Bitmap Index Scan | Using multiple indexes |
| cost=0.00..155.00 | Estimated startup..total cost |
| rows=1 | Estimated rows returned |
| actual time=0.015..0.842 | Real startup..total time (ms) |
| Rows Removed by Filter | Rows read but not returned |
Common Performance Issues
Missing Index
-- Problem: Seq Scan
EXPLAIN SELECT * FROM users WHERE email = 'test@example.com';
-- Solution: Add index
CREATE INDEX idx_users_email ON users(email);
Index Not Used
-- Problem: Function on column prevents index use
SELECT * FROM users WHERE LOWER(email) = 'test@example.com';
-- Solution: Expression index
CREATE INDEX idx_users_email_lower ON users(LOWER(email));
N+1 Query Problem
-- Problem: Querying in a loop (application code)
-- For each user: SELECT * FROM orders WHERE user_id = ?
-- Solution: Single query with JOIN
SELECT u.*, o.*
FROM users u
LEFT JOIN orders o ON o.user_id = u.id
WHERE u.id IN (1, 2, 3, 4, 5);
Index Strategies
-- Composite index (column order matters!)
-- Good for: WHERE a = ? AND b = ?
-- Good for: WHERE a = ? ORDER BY b
CREATE INDEX idx_orders_user_date ON orders(user_id, created_at DESC);
-- Covering index (all needed columns in index)
CREATE INDEX idx_orders_user_covering ON orders(user_id) INCLUDE (status, total);
-- Partial index (filtered)
CREATE INDEX idx_active_orders ON orders(user_id) WHERE status = 'active';
-- Expression index
CREATE INDEX idx_orders_year ON orders(EXTRACT(YEAR FROM created_at));
Query Rewriting
-- Avoid: Subquery in SELECT
SELECT
name,
(SELECT COUNT(*) FROM orders WHERE user_id = users.id) as order_count
FROM users;
-- Better: JOIN with aggregation
SELECT u.name, COALESCE(o.order_count, 0) as order_count
FROM users u
LEFT JOIN (
SELECT user_id, COUNT(*) as order_count
FROM orders GROUP BY user_id
) o ON o.user_id = u.id;
-- Avoid: OR conditions on different columns
SELECT * FROM users WHERE email = 'a@b.com' OR phone = '123';
-- Better: UNION (can use indexes)
SELECT * FROM users WHERE email = 'a@b.com'
UNION
SELECT * FROM users WHERE phone = '123';
-- Avoid: NOT IN with NULLs (tricky behavior)
SELECT * FROM users WHERE id NOT IN (SELECT user_id FROM banned);
-- Better: NOT EXISTS
SELECT * FROM users u WHERE NOT EXISTS (
SELECT 1 FROM banned b WHERE b.user_id = u.id
);
Advanced Patterns
Pivot/Unpivot
-- Pivot: Rows to columns
SELECT
product_id,
SUM(CASE WHEN month = 1 THEN sales END) as jan,
SUM(CASE WHEN month = 2 THEN sales END) as feb,
SUM(CASE WHEN month = 3 THEN sales END) as mar
FROM monthly_sales
GROUP BY product_id;
-- PostgreSQL: crosstab
SELECT * FROM crosstab(
'SELECT product_id, month, sales FROM monthly_sales ORDER BY 1,2'
) AS ct(product_id INT, jan INT, feb INT, mar INT);
-- Unpivot: Columns to rows (PostgreSQL)
SELECT product_id, month, sales
FROM products,
LATERAL (VALUES
('jan', jan_sales),
('feb', feb_sales),
('mar', mar_sales)
) AS t(month, sales);
De-duplication
-- Keep first occurrence
DELETE FROM users a USING users b
WHERE a.id > b.id AND a.email = b.email;
-- With CTE (safer)
WITH duplicates AS (
SELECT id, ROW_NUMBER() OVER (PARTITION BY email ORDER BY created_at) as rn
FROM users
)
DELETE FROM users WHERE id IN (
SELECT id FROM duplicates WHERE rn > 1
);
Temporal Queries
-- Events active at specific time
SELECT * FROM events
WHERE start_time <= '2024-06-15 10:00:00'
AND end_time > '2024-06-15 10:00:00';
-- Overlapping periods
SELECT a.*, b.*
FROM reservations a, reservations b
WHERE a.id < b.id
AND a.room_id = b.room_id
AND a.start_time < b.end_time
AND a.end_time > b.start_time;
-- Fill gaps with generate_series
SELECT
d.date,
COALESCE(s.revenue, 0) as revenue
FROM generate_series(
'2024-01-01'::date,
'2024-12-31'::date,
'1 day'
) d(date)
LEFT JOIN daily_sales s ON s.date = d.date;
When NOT to Use This Skill
- Basic SQL (SELECT, JOIN, INSERT) - Use
sql-fundamentalsskill - PostgreSQL specifics (arrays, JSONB) - Use
postgresqlskill - MySQL specifics (stored procedures) - Use
mysqlskill - ORM queries - Use
prisma,typeorm, or relevant ORM skill
Anti-Patterns
| Anti-Pattern | Problem | Solution | |--------------|---------|----------| | Correlated subqueries | Slow performance, N+1 | Use JOINs or window functions | | Functions on indexed columns | Index not used | Use functional indexes | | Deep recursion without limit | Stack overflow | Add recursion depth limit | | Missing WHERE in CTEs | Processes unnecessary data | Filter early in CTEs | | Over-using window functions | Memory pressure | Limit result set first | | Not analyzing EXPLAIN output | Slow queries go unnoticed | Always check execution plans |
Quick Troubleshooting
| Problem | Diagnostic | Fix | |---------|------------|-----| | Slow CTE execution | EXPLAIN ANALYZE | Add MATERIALIZED hint or rewrite | | High memory usage | Check sort/hash operations | Increase work_mem, optimize query | | Recursion limit exceeded | Check recursion depth | Add LIMIT, redesign query | | Window function slow | Check PARTITION BY cardinality | Add indexes on partition columns | | Query plan changes | Compare EXPLAIN outputs | Update statistics, pin plan |
Scan to join WeChat group