Anomaly Detection in SQL: Finding Outliers with Z-Score and IQR
SOTAAZ·

Anomaly Detection in SQL: Finding Outliers with Z-Score and IQR
Automatically detect abnormal data with SQL. Implement Z-Score, IQR, and percentile-based outlier detection.
TL;DR
- Outlier: Data point outside the normal range
- Z-Score: Standard deviations from mean (|Z| > 3 = outlier)
- IQR: Quartile-based (outside Q1-1.5×IQR to Q3+1.5×IQR = outlier)
- Use Cases: Fraud detection, abnormal traffic, data errors
1. Why Anomaly Detection?
Real Example
Daily revenue: $100K, $105K, $98K, $102K, $950K(??), $101KIs $950K normal?
- System error?
- Bulk order?
- Fraudulent transaction?
→ Need automatic detection and alerts!
Types of Anomalies
2. Z-Score Method
Basic Concept
Z = (x - μ) / σ
x: Individual value
μ: Mean
σ: Standard deviationSQL Implementation
WITH stats AS (
SELECT
AVG(amount) as mean_amount,
STDDEV(amount) as std_amount
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
),
z_scores AS (
SELECT
o.order_id,
o.user_id,
o.amount,
o.created_at,
(o.amount - s.mean_amount) / NULLIF(s.std_amount, 0) as z_score
FROM orders o
CROSS JOIN stats s
WHERE o.created_at >= CURRENT_DATE - INTERVAL '30 days'
)
SELECT
order_id,
user_id,
amount,
created_at,
ROUND(z_score, 2) as z_score,
CASE
WHEN z_score > 3 THEN 'HIGH_OUTLIER'
WHEN z_score < -3 THEN 'LOW_OUTLIER'
ELSE 'NORMAL'
END as status
FROM z_scores
WHERE ABS(z_score) > 3
ORDER BY ABS(z_score) DESC;Result:
3. IQR (Interquartile Range) Method
Basic Concept
IQR = Q3 - Q1
Lower bound = Q1 - 1.5 × IQR
Upper bound = Q3 + 1.5 × IQRSQL Implementation
WITH quartiles AS (
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY amount) as q1,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY amount) as q2,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY amount) as q3
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
),
bounds AS (
SELECT
q1,
q2,
q3,
q3 - q1 as iqr,
q1 - 1.5 * (q3 - q1) as lower_bound,
q3 + 1.5 * (q3 - q1) as upper_bound
FROM quartiles
)
SELECT
o.order_id,
o.user_id,
o.amount,
o.created_at,
b.lower_bound,
b.upper_bound,
CASE
WHEN o.amount > b.upper_bound THEN 'HIGH_OUTLIER'
WHEN o.amount < b.lower_bound THEN 'LOW_OUTLIER'
ELSE 'NORMAL'
END as status
FROM orders o
CROSS JOIN bounds b
WHERE o.created_at >= CURRENT_DATE - INTERVAL '30 days'
AND (o.amount > b.upper_bound OR o.amount < b.lower_bound)
ORDER BY o.amount DESC;4. Modified Z-Score (MAD-based)
Problem with Standard Z-Score
- Extreme outliers skew mean/std
- Solution: Use Median and MAD
SQL Implementation
WITH median_calc AS (
SELECT
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY amount) as median_amount
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
),
mad_calc AS (
SELECT
m.median_amount,
PERCENTILE_CONT(0.5) WITHIN GROUP (
ORDER BY ABS(o.amount - m.median_amount)
) as mad
FROM orders o
CROSS JOIN median_calc m
WHERE o.created_at >= CURRENT_DATE - INTERVAL '30 days'
),
modified_z AS (
SELECT
o.order_id,
o.amount,
m.median_amount,
m.mad,
0.6745 * (o.amount - m.median_amount) / NULLIF(m.mad, 0) as modified_z_score
FROM orders o
CROSS JOIN mad_calc m
WHERE o.created_at >= CURRENT_DATE - INTERVAL '30 days'
)
SELECT
order_id,
amount,
ROUND(modified_z_score, 2) as modified_z,
CASE
WHEN ABS(modified_z_score) > 3.5 THEN 'OUTLIER'
ELSE 'NORMAL'
END as status
FROM modified_z
WHERE ABS(modified_z_score) > 3.5
ORDER BY ABS(modified_z_score) DESC;5. Time Series Anomaly Detection
Moving Average Based
WITH daily_metrics AS (
SELECT
DATE(created_at) as date,
SUM(amount) as daily_revenue,
COUNT(*) as daily_orders
FROM orders
GROUP BY DATE(created_at)
),
with_moving_stats AS (
SELECT
date,
daily_revenue,
daily_orders,
AVG(daily_revenue) OVER (
ORDER BY date
ROWS BETWEEN 7 PRECEDING AND 1 PRECEDING
) as ma_7d,
STDDEV(daily_revenue) OVER (
ORDER BY date
ROWS BETWEEN 7 PRECEDING AND 1 PRECEDING
) as std_7d
FROM daily_metrics
)
SELECT
date,
daily_revenue,
ROUND(ma_7d, 0) as ma_7d,
ROUND(std_7d, 0) as std_7d,
ROUND((daily_revenue - ma_7d) / NULLIF(std_7d, 0), 2) as z_score,
CASE
WHEN (daily_revenue - ma_7d) / NULLIF(std_7d, 0) > 2 THEN 'SPIKE'
WHEN (daily_revenue - ma_7d) / NULLIF(std_7d, 0) < -2 THEN 'DROP'
ELSE 'NORMAL'
END as status
FROM with_moving_stats
WHERE ma_7d IS NOT NULL
ORDER BY date DESC;6. User-Level Anomaly Detection
Individual Pattern-Based
WITH user_stats AS (
SELECT
user_id,
AVG(amount) as user_avg,
STDDEV(amount) as user_std,
COUNT(*) as order_count
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '90 days'
GROUP BY user_id
HAVING COUNT(*) >= 5 -- Minimum 5 orders
),
recent_orders AS (
SELECT
o.order_id,
o.user_id,
o.amount,
o.created_at,
us.user_avg,
us.user_std,
(o.amount - us.user_avg) / NULLIF(us.user_std, 0) as user_z_score
FROM orders o
JOIN user_stats us ON o.user_id = us.user_id
WHERE o.created_at >= CURRENT_DATE - INTERVAL '7 days'
)
SELECT
order_id,
user_id,
amount,
ROUND(user_avg, 0) as user_avg,
ROUND(user_z_score, 2) as z_vs_user_pattern,
CASE
WHEN user_z_score > 3 THEN 'UNUSUAL_HIGH'
WHEN user_z_score < -2 THEN 'UNUSUAL_LOW'
ELSE 'NORMAL'
END as anomaly_flag
FROM recent_orders
WHERE ABS(user_z_score) > 2
ORDER BY ABS(user_z_score) DESC;7. Multi-Metric Anomalies
Composite Anomaly Detection
WITH order_features AS (
SELECT
order_id,
user_id,
amount,
items_count,
created_at,
EXTRACT(HOUR FROM created_at) as order_hour,
EXTRACT(DOW FROM created_at) as order_dow
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
),
global_stats AS (
SELECT
AVG(amount) as avg_amount,
STDDEV(amount) as std_amount,
AVG(items_count) as avg_items,
STDDEV(items_count) as std_items
FROM order_features
),
anomaly_scores AS (
SELECT
of.order_id,
of.user_id,
of.amount,
of.items_count,
of.created_at,
(of.amount - gs.avg_amount) / NULLIF(gs.std_amount, 0) as z_amount,
(of.items_count - gs.avg_items) / NULLIF(gs.std_items, 0) as z_items,
CASE WHEN of.order_hour BETWEEN 2 AND 5 THEN 1 ELSE 0 END as late_night_flag
FROM order_features of
CROSS JOIN global_stats gs
)
SELECT
order_id,
user_id,
amount,
items_count,
created_at,
ROUND(z_amount, 2) as z_amount,
ROUND(z_items, 2) as z_items,
late_night_flag,
ROUND(ABS(z_amount) + ABS(z_items) + late_night_flag * 2, 2) as anomaly_score
FROM anomaly_scores
WHERE ABS(z_amount) > 2 OR ABS(z_items) > 2 OR late_night_flag = 1
ORDER BY anomaly_score DESC
LIMIT 100;8. Percentile-Based Detection
Top/Bottom N% Detection
WITH percentile_bounds AS (
SELECT
PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY amount) as p1,
PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY amount) as p99
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
)
SELECT
o.order_id,
o.user_id,
o.amount,
o.created_at,
CASE
WHEN o.amount >= pb.p99 THEN 'TOP_1%'
WHEN o.amount <= pb.p1 THEN 'BOTTOM_1%'
ELSE 'NORMAL'
END as percentile_status,
ROUND(PERCENT_RANK() OVER (ORDER BY o.amount) * 100, 2) as percentile
FROM orders o
CROSS JOIN percentile_bounds pb
WHERE o.created_at >= CURRENT_DATE - INTERVAL '7 days'
AND (o.amount >= pb.p99 OR o.amount <= pb.p1)
ORDER BY o.amount DESC;9. Real-Time Alert Query
Immediate Detection
WITH recent_baseline AS (
SELECT
AVG(amount) as avg_amount,
STDDEV(amount) as std_amount,
PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY amount) as p99
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '7 days'
AND created_at < CURRENT_TIMESTAMP - INTERVAL '1 hour'
),
last_hour_orders AS (
SELECT
order_id,
user_id,
amount,
created_at
FROM orders
WHERE created_at >= CURRENT_TIMESTAMP - INTERVAL '1 hour'
)
SELECT
o.order_id,
o.user_id,
o.amount,
o.created_at,
ROUND((o.amount - b.avg_amount) / NULLIF(b.std_amount, 0), 2) as z_score,
CASE
WHEN o.amount > b.p99 * 2 THEN 'CRITICAL'
WHEN o.amount > b.p99 THEN 'WARNING'
WHEN (o.amount - b.avg_amount) / NULLIF(b.std_amount, 0) > 3 THEN 'WARNING'
ELSE 'OK'
END as alert_level
FROM last_hour_orders o
CROSS JOIN recent_baseline b
WHERE o.amount > b.p99
OR (o.amount - b.avg_amount) / NULLIF(b.std_amount, 0) > 3
ORDER BY o.amount DESC;10. Anomaly Summary Dashboard
Daily Outlier Summary
WITH daily_orders AS (
SELECT
DATE(created_at) as date,
order_id,
amount
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
),
daily_stats AS (
SELECT
date,
COUNT(*) as total_orders,
AVG(amount) as avg_amount,
STDDEV(amount) as std_amount
FROM daily_orders
GROUP BY date
),
outlier_counts AS (
SELECT
DATE(o.created_at) as date,
COUNT(*) as outlier_count,
SUM(o.amount) as outlier_amount
FROM orders o
JOIN daily_stats ds ON DATE(o.created_at) = ds.date
WHERE (o.amount - ds.avg_amount) / NULLIF(ds.std_amount, 0) > 3
AND o.created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(o.created_at)
)
SELECT
ds.date,
ds.total_orders,
COALESCE(oc.outlier_count, 0) as outliers,
ROUND(COALESCE(oc.outlier_count, 0) * 100.0 / ds.total_orders, 2) as outlier_pct,
ROUND(ds.avg_amount, 0) as avg_amount,
COALESCE(ROUND(oc.outlier_amount, 0), 0) as outlier_amount
FROM daily_stats ds
LEFT JOIN outlier_counts oc ON ds.date = oc.date
ORDER BY ds.date DESC;Conclusion
Anomaly Detection Key Points:
- Use composite conditions over single methods
- Compare global stats + individual patterns
- Build real-time alerting systems
References
- NIST - Guidelines for Outlier Detection
- Scikit-learn - Outlier Detection Methods
- PostgreSQL - Statistical Functions