이상치 탐지를 SQL로: Z-Score와 IQR 기반 아웃라이어 찾기
SOTAAZ·

이상치 탐지를 SQL로: Z-Score와 IQR 기반 아웃라이어 찾기
비정상 데이터를 SQL로 자동 탐지. Z-Score, IQR, 백분위 기반 이상치 탐지 구현.
TL;DR
- 이상치(Outlier): 정상 범위를 벗어난 데이터 포인트
- Z-Score: 평균에서 몇 표준편차 떨어졌는지 (|Z| > 3이면 이상치)
- IQR: 사분위 범위 기반 (Q1-1.5×IQR ~ Q3+1.5×IQR 벗어나면 이상치)
- 활용: 이상 거래, 비정상 트래픽, 데이터 오류 탐지
1. 왜 이상치 탐지가 필요한가?
실제 사례
일별 매출: 100만, 105만, 98만, 102만, 950만(??), 101만950만원은 정상인가?
- 시스템 오류?
- 대량 주문?
- 사기 거래?
→ 자동으로 감지해서 알림 필요!
이상치 유형
2. Z-Score 방식
기본 개념
Z = (x - μ) / σ
x: 개별 값
μ: 평균
σ: 표준편차SQL 구현
WITH stats AS (
SELECT
AVG(amount) as mean_amount,
STDDEV(amount) as std_amount
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
),
z_scores AS (
SELECT
o.order_id,
o.user_id,
o.amount,
o.created_at,
(o.amount - s.mean_amount) / NULLIF(s.std_amount, 0) as z_score
FROM orders o
CROSS JOIN stats s
WHERE o.created_at >= CURRENT_DATE - INTERVAL '30 days'
)
SELECT
order_id,
user_id,
amount,
created_at,
ROUND(z_score, 2) as z_score,
CASE
WHEN z_score > 3 THEN 'HIGH_OUTLIER'
WHEN z_score < -3 THEN 'LOW_OUTLIER'
ELSE 'NORMAL'
END as status
FROM z_scores
WHERE ABS(z_score) > 3
ORDER BY ABS(z_score) DESC;결과:
3. IQR (사분위 범위) 방식
기본 개념
IQR = Q3 - Q1
하한 = Q1 - 1.5 × IQR
상한 = Q3 + 1.5 × IQRSQL 구현
WITH quartiles AS (
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY amount) as q1,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY amount) as q2,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY amount) as q3
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
),
bounds AS (
SELECT
q1,
q2,
q3,
q3 - q1 as iqr,
q1 - 1.5 * (q3 - q1) as lower_bound,
q3 + 1.5 * (q3 - q1) as upper_bound
FROM quartiles
)
SELECT
o.order_id,
o.user_id,
o.amount,
o.created_at,
b.lower_bound,
b.upper_bound,
CASE
WHEN o.amount > b.upper_bound THEN 'HIGH_OUTLIER'
WHEN o.amount < b.lower_bound THEN 'LOW_OUTLIER'
ELSE 'NORMAL'
END as status
FROM orders o
CROSS JOIN bounds b
WHERE o.created_at >= CURRENT_DATE - INTERVAL '30 days'
AND (o.amount > b.upper_bound OR o.amount < b.lower_bound)
ORDER BY o.amount DESC;4. Modified Z-Score (MAD 기반)
일반 Z-Score의 문제
- 극단적 이상치에 평균/표준편차가 왜곡됨
- 해결: 중앙값(Median)과 MAD 사용
SQL 구현
WITH median_calc AS (
SELECT
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY amount) as median_amount
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
),
mad_calc AS (
SELECT
m.median_amount,
PERCENTILE_CONT(0.5) WITHIN GROUP (
ORDER BY ABS(o.amount - m.median_amount)
) as mad
FROM orders o
CROSS JOIN median_calc m
WHERE o.created_at >= CURRENT_DATE - INTERVAL '30 days'
),
modified_z AS (
SELECT
o.order_id,
o.amount,
m.median_amount,
m.mad,
0.6745 * (o.amount - m.median_amount) / NULLIF(m.mad, 0) as modified_z_score
FROM orders o
CROSS JOIN mad_calc m
WHERE o.created_at >= CURRENT_DATE - INTERVAL '30 days'
)
SELECT
order_id,
amount,
ROUND(modified_z_score, 2) as modified_z,
CASE
WHEN ABS(modified_z_score) > 3.5 THEN 'OUTLIER'
ELSE 'NORMAL'
END as status
FROM modified_z
WHERE ABS(modified_z_score) > 3.5
ORDER BY ABS(modified_z_score) DESC;5. 시계열 이상치 탐지
이동평균 기반 이상치
WITH daily_metrics AS (
SELECT
DATE(created_at) as date,
SUM(amount) as daily_revenue,
COUNT(*) as daily_orders
FROM orders
GROUP BY DATE(created_at)
),
with_moving_stats AS (
SELECT
date,
daily_revenue,
daily_orders,
AVG(daily_revenue) OVER (
ORDER BY date
ROWS BETWEEN 7 PRECEDING AND 1 PRECEDING
) as ma_7d,
STDDEV(daily_revenue) OVER (
ORDER BY date
ROWS BETWEEN 7 PRECEDING AND 1 PRECEDING
) as std_7d
FROM daily_metrics
)
SELECT
date,
daily_revenue,
ROUND(ma_7d, 0) as ma_7d,
ROUND(std_7d, 0) as std_7d,
ROUND((daily_revenue - ma_7d) / NULLIF(std_7d, 0), 2) as z_score,
CASE
WHEN (daily_revenue - ma_7d) / NULLIF(std_7d, 0) > 2 THEN 'SPIKE'
WHEN (daily_revenue - ma_7d) / NULLIF(std_7d, 0) < -2 THEN 'DROP'
ELSE 'NORMAL'
END as status
FROM with_moving_stats
WHERE ma_7d IS NOT NULL
ORDER BY date DESC;6. 사용자별 이상 행동 탐지
개인 패턴 대비 이상치
WITH user_stats AS (
SELECT
user_id,
AVG(amount) as user_avg,
STDDEV(amount) as user_std,
COUNT(*) as order_count
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '90 days'
GROUP BY user_id
HAVING COUNT(*) >= 5 -- 최소 5건 이상
),
recent_orders AS (
SELECT
o.order_id,
o.user_id,
o.amount,
o.created_at,
us.user_avg,
us.user_std,
(o.amount - us.user_avg) / NULLIF(us.user_std, 0) as user_z_score
FROM orders o
JOIN user_stats us ON o.user_id = us.user_id
WHERE o.created_at >= CURRENT_DATE - INTERVAL '7 days'
)
SELECT
order_id,
user_id,
amount,
ROUND(user_avg, 0) as user_avg,
ROUND(user_z_score, 2) as z_vs_user_pattern,
CASE
WHEN user_z_score > 3 THEN 'UNUSUAL_HIGH'
WHEN user_z_score < -2 THEN 'UNUSUAL_LOW'
ELSE 'NORMAL'
END as anomaly_flag
FROM recent_orders
WHERE ABS(user_z_score) > 2
ORDER BY ABS(user_z_score) DESC;7. 다중 지표 이상치
복합 조건으로 이상 탐지
WITH order_features AS (
SELECT
order_id,
user_id,
amount,
items_count,
created_at,
-- 시간대
EXTRACT(HOUR FROM created_at) as order_hour,
-- 요일
EXTRACT(DOW FROM created_at) as order_dow
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
),
global_stats AS (
SELECT
AVG(amount) as avg_amount,
STDDEV(amount) as std_amount,
AVG(items_count) as avg_items,
STDDEV(items_count) as std_items
FROM order_features
),
anomaly_scores AS (
SELECT
of.order_id,
of.user_id,
of.amount,
of.items_count,
of.created_at,
(of.amount - gs.avg_amount) / NULLIF(gs.std_amount, 0) as z_amount,
(of.items_count - gs.avg_items) / NULLIF(gs.std_items, 0) as z_items,
-- 새벽 시간 주문
CASE WHEN of.order_hour BETWEEN 2 AND 5 THEN 1 ELSE 0 END as late_night_flag
FROM order_features of
CROSS JOIN global_stats gs
)
SELECT
order_id,
user_id,
amount,
items_count,
created_at,
ROUND(z_amount, 2) as z_amount,
ROUND(z_items, 2) as z_items,
late_night_flag,
-- 복합 이상 점수
ROUND(ABS(z_amount) + ABS(z_items) + late_night_flag * 2, 2) as anomaly_score
FROM anomaly_scores
WHERE ABS(z_amount) > 2 OR ABS(z_items) > 2 OR late_night_flag = 1
ORDER BY anomaly_score DESC
LIMIT 100;8. 백분위 기반 탐지
상위/하위 N% 탐지
WITH percentile_bounds AS (
SELECT
PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY amount) as p1,
PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY amount) as p99
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
)
SELECT
o.order_id,
o.user_id,
o.amount,
o.created_at,
CASE
WHEN o.amount >= pb.p99 THEN 'TOP_1%'
WHEN o.amount <= pb.p1 THEN 'BOTTOM_1%'
ELSE 'NORMAL'
END as percentile_status,
ROUND(PERCENT_RANK() OVER (ORDER BY o.amount) * 100, 2) as percentile
FROM orders o
CROSS JOIN percentile_bounds pb
WHERE o.created_at >= CURRENT_DATE - INTERVAL '7 days'
AND (o.amount >= pb.p99 OR o.amount <= pb.p1)
ORDER BY o.amount DESC;9. 실시간 알림 쿼리
즉시 감지해야 할 이상치
WITH recent_baseline AS (
SELECT
AVG(amount) as avg_amount,
STDDEV(amount) as std_amount,
PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY amount) as p99
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '7 days'
AND created_at < CURRENT_TIMESTAMP - INTERVAL '1 hour'
),
last_hour_orders AS (
SELECT
order_id,
user_id,
amount,
created_at
FROM orders
WHERE created_at >= CURRENT_TIMESTAMP - INTERVAL '1 hour'
)
SELECT
o.order_id,
o.user_id,
o.amount,
o.created_at,
ROUND((o.amount - b.avg_amount) / NULLIF(b.std_amount, 0), 2) as z_score,
CASE
WHEN o.amount > b.p99 * 2 THEN 'CRITICAL'
WHEN o.amount > b.p99 THEN 'WARNING'
WHEN (o.amount - b.avg_amount) / NULLIF(b.std_amount, 0) > 3 THEN 'WARNING'
ELSE 'OK'
END as alert_level
FROM last_hour_orders o
CROSS JOIN recent_baseline b
WHERE o.amount > b.p99
OR (o.amount - b.avg_amount) / NULLIF(b.std_amount, 0) > 3
ORDER BY o.amount DESC;10. 이상치 요약 대시보드
일별 이상치 현황
WITH daily_orders AS (
SELECT
DATE(created_at) as date,
order_id,
amount
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
),
daily_stats AS (
SELECT
date,
COUNT(*) as total_orders,
AVG(amount) as avg_amount,
STDDEV(amount) as std_amount
FROM daily_orders
GROUP BY date
),
outlier_counts AS (
SELECT
DATE(o.created_at) as date,
COUNT(*) as outlier_count,
SUM(o.amount) as outlier_amount
FROM orders o
JOIN daily_stats ds ON DATE(o.created_at) = ds.date
WHERE (o.amount - ds.avg_amount) / NULLIF(ds.std_amount, 0) > 3
AND o.created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(o.created_at)
)
SELECT
ds.date,
ds.total_orders,
COALESCE(oc.outlier_count, 0) as outliers,
ROUND(COALESCE(oc.outlier_count, 0) * 100.0 / ds.total_orders, 2) as outlier_pct,
ROUND(ds.avg_amount, 0) as avg_amount,
COALESCE(ROUND(oc.outlier_amount, 0), 0) as outlier_amount
FROM daily_stats ds
LEFT JOIN outlier_counts oc ON ds.date = oc.date
ORDER BY ds.date DESC;결론
이상치 탐지 핵심:
- 단일 방법보다 복합 조건 사용
- 전역 통계 + 개인 패턴 비교
- 실시간 알림 체계 구축
References
- NIST - Guidelines for Outlier Detection
- Scikit-learn - Outlier Detection Methods
- PostgreSQL - Statistical Functions