Analyzing A/B Test Results with SQL: Calculating Statistical Significance Yourself

Analyzing A/B test results with SQL alone, no external tools: Z-tests, confidence intervals, and sample size calculations.
TL;DR
- A/B test: compare two versions and decide which one performs better
- Statistical significance: confirm the result is not just chance (p-value < 0.05)
- Computed in SQL: conversion rates, Z-scores, and confidence intervals can all be calculated in SQL
- Caveats: sample size, MDE, and test duration must be taken into account
1. A/B Test Basics
Basic Concepts
Control (A): the existing version - 10% conversion rate
Treatment (B): the new version - 12% conversion rate
Question: Is B actually better, or is the difference just chance?
Key metrics: conversion rate, absolute/relative lift, p-value, and confidence interval.
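To make the question concrete, here is a quick back-of-the-envelope version of the two-proportion z-test used in section 4, with per-group sample sizes chosen purely for illustration (0.11 is the pooled conversion rate):

\[
z = \frac{0.12 - 0.10}{\sqrt{0.11 \cdot 0.89 \cdot \left(\frac{1}{5000} + \frac{1}{5000}\right)}} \approx 3.20
\]

With 5,000 users per group, z ≈ 3.20 > 1.96, so a 10% vs 12% gap would be significant at the 95% level; with only 500 users per group the same rates give z ≈ 1.01, which is not significant. The rest of this post computes these quantities directly in SQL.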
2. Basic Table Structure
-- Experiment assignment table
CREATE TABLE experiment_assignments (
user_id INT,
experiment_name VARCHAR(100),
variant VARCHAR(50), -- 'control' or 'treatment'
assigned_at TIMESTAMP
);
-- Conversion event table
CREATE TABLE conversions (
user_id INT,
conversion_type VARCHAR(100),
converted_at TIMESTAMP,
revenue DECIMAL(10, 2)
);
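For reference, here are a couple of hypothetical rows (the values are invented for illustration) showing how an assignment and a later conversion line up across the two tables:

-- Hypothetical example: user 101 is assigned to treatment, then converts two days later
INSERT INTO experiment_assignments (user_id, experiment_name, variant, assigned_at)
VALUES (101, 'checkout_button_color', 'treatment', '2024-01-01 10:00:00');
INSERT INTO conversions (user_id, conversion_type, converted_at, revenue)
VALUES (101, 'purchase', '2024-01-03 14:30:00', 49.90);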
3. Basic Conversion Rate Comparison
Calculating each group's conversion rate
WITH experiment_data AS (
SELECT
ea.variant,
COUNT(DISTINCT ea.user_id) as users,
COUNT(DISTINCT c.user_id) as conversions
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
AND c.converted_at < ea.assigned_at + INTERVAL '7 days'
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY ea.variant
)
SELECT
variant,
users,
conversions,
ROUND(conversions * 100.0 / users, 4) as conversion_rate,
ROUND(conversions * 1.0 / users, 6) as cr_decimal
FROM experiment_data;
Result:
4. Calculating Significance with a Z-Test
Testing the difference between two proportions
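The query below implements the standard pooled two-proportion z-test:

\[
\hat{p} = \frac{x_c + x_t}{n_c + n_t}, \qquad
SE = \sqrt{\hat{p}(1-\hat{p})\left(\frac{1}{n_c} + \frac{1}{n_t}\right)}, \qquad
z = \frac{p_t - p_c}{SE}
\]

where n is the number of assigned users and x the number of converters in each group, matching the n, x, p, p_pooled, and se columns in the SQL.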
WITH experiment_stats AS (
SELECT
variant,
COUNT(DISTINCT ea.user_id) as n,
COUNT(DISTINCT c.user_id) as x,
COUNT(DISTINCT c.user_id) * 1.0 / COUNT(DISTINCT ea.user_id) as p
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY ea.variant
),
pooled_stats AS (
SELECT
SUM(x) * 1.0 / SUM(n) as p_pooled,
MAX(CASE WHEN variant = 'control' THEN n END) as n_control,
MAX(CASE WHEN variant = 'control' THEN p END) as p_control,
MAX(CASE WHEN variant = 'treatment' THEN n END) as n_treatment,
MAX(CASE WHEN variant = 'treatment' THEN p END) as p_treatment
FROM experiment_stats
),
z_calculation AS (
SELECT
p_control,
p_treatment,
n_control,
n_treatment,
p_pooled,
-- Standard Error
SQRT(p_pooled * (1 - p_pooled) * (1.0/n_control + 1.0/n_treatment)) as se,
-- Difference
p_treatment - p_control as diff
FROM pooled_stats
)
SELECT
ROUND(p_control * 100, 2) as control_rate_pct,
ROUND(p_treatment * 100, 2) as treatment_rate_pct,
ROUND(diff * 100, 2) as absolute_lift_pct,
ROUND((diff / p_control) * 100, 2) as relative_lift_pct,
ROUND(diff / se, 4) as z_score,
-- Approximate p-value (two-sided test)
CASE
WHEN ABS(diff / se) > 2.576 THEN '< 0.01 ***'
WHEN ABS(diff / se) > 1.96 THEN '< 0.05 **'
WHEN ABS(diff / se) > 1.645 THEN '< 0.10 *'
ELSE '> 0.10 (not significant)'
END as p_value_approx
FROM z_calculation;
Result:
→ Z-score 3.51 > 2.576, so the result is significant at the 99% confidence level!
5. Calculating Confidence Intervals
Estimating the range of the effect
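The interval below is built from the unpooled standard error for a difference of two proportions:

\[
(p_t - p_c) \pm z_{\alpha/2} \sqrt{\frac{p_c(1-p_c)}{n_c} + \frac{p_t(1-p_t)}{n_t}}
\]

with z_{α/2} = 1.96 for a 95% interval and 2.576 for a 99% interval.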
WITH experiment_stats AS (
SELECT
variant,
COUNT(DISTINCT ea.user_id) as n,
COUNT(DISTINCT c.user_id) * 1.0 / COUNT(DISTINCT ea.user_id) as p
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY ea.variant
),
stats AS (
SELECT
MAX(CASE WHEN variant = 'control' THEN n END) as n_c,
MAX(CASE WHEN variant = 'control' THEN p END) as p_c,
MAX(CASE WHEN variant = 'treatment' THEN n END) as n_t,
MAX(CASE WHEN variant = 'treatment' THEN p END) as p_t
FROM experiment_stats
),
confidence_interval AS (
SELECT
p_t - p_c as diff,
-- SE for difference of two proportions
SQRT(
(p_c * (1 - p_c) / n_c) +
(p_t * (1 - p_t) / n_t)
) as se
FROM stats
)
SELECT
ROUND(diff * 100, 2) as lift_pct,
ROUND((diff - 1.96 * se) * 100, 2) as ci_lower_95,
ROUND((diff + 1.96 * se) * 100, 2) as ci_upper_95,
ROUND((diff - 2.576 * se) * 100, 2) as ci_lower_99,
ROUND((diff + 2.576 * se) * 100, 2) as ci_upper_99,
CASE
WHEN (diff - 1.96 * se) > 0 THEN 'Positive effect (95% confident)'
WHEN (diff + 1.96 * se) < 0 THEN 'Negative effect (95% confident)'
ELSE 'Inconclusive'
END as conclusion
FROM confidence_interval;
Result:
→ 95% confidence interval: 0.66% to 2.34% (it does not include 0, so the effect is significant)
6. Calculating the Required Sample Size
How many samples you need before starting the test
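The query below uses the standard per-group sample size formula for comparing two proportions at significance level α and power 1−β:

\[
n_{\text{per group}} = \frac{\left(z_{\alpha/2}\sqrt{2\bar{p}(1-\bar{p})} + z_{\beta}\sqrt{p_1(1-p_1) + p_2(1-p_2)}\right)^2}{(p_2 - p_1)^2},
\qquad \bar{p} = \frac{p_1 + p_2}{2}
\]

With p1 = 0.10, p2 = 0.12, z_{α/2} = 1.96, and z_β = 0.84, this works out to roughly 3,800 users per group, or about 7,700 in total.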
-- Parameter settings
WITH params AS (
SELECT
0.10 as baseline_rate, -- baseline conversion rate of 10%
0.02 as mde, -- minimum detectable effect: 2 percentage points (10% → 12%)
1.96 as z_alpha, -- 95% confidence level (two-sided)
0.84 as z_beta -- 80% power
),
sample_size AS (
SELECT
baseline_rate as p1,
baseline_rate + mde as p2,
(baseline_rate + baseline_rate + mde) / 2 as p_avg,
z_alpha,
z_beta
FROM params
)
SELECT
-- Per-group sample size formula (two-proportion z-test)
CEIL(
POWER(
(z_alpha * SQRT(2 * p_avg * (1 - p_avg)) +
z_beta * SQRT(p1 * (1 - p1) + p2 * (1 - p2)))
/ (p2 - p1),
2
)
) as sample_per_group,
-- Total samples needed (both groups)
2 * CEIL(
POWER(
(z_alpha * SQRT(2 * p_avg * (1 - p_avg)) +
z_beta * SQRT(p1 * (1 - p1) + p2 * (1 - p2)))
/ (p2 - p1),
2
)
) as total_sample
FROM sample_size;
7. Revenue-Based A/B Testing (t-Test)
Comparing average revenue per user
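The query below uses Welch's t-statistic for the difference in mean revenue per user; with large samples, the normal cutoffs (1.96, 2.576) used for significance are a reasonable approximation to the t distribution:

\[
t = \frac{\bar{x}_t - \bar{x}_c}{\sqrt{\dfrac{s_c^2}{n_c} + \dfrac{s_t^2}{n_t}}}
\]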
-- Aggregate revenue per user first, so users with multiple conversions are not double-counted
WITH user_revenue AS (
SELECT
ea.variant,
ea.user_id,
COALESCE(SUM(c.revenue), 0) as revenue
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
WHERE ea.experiment_name = 'pricing_test'
GROUP BY ea.variant, ea.user_id
),
revenue_stats AS (
SELECT
variant,
COUNT(*) as n,
AVG(revenue) as mean_revenue,
STDDEV(revenue) as std_revenue
FROM user_revenue
GROUP BY variant
),
t_test AS (
SELECT
MAX(CASE WHEN variant = 'control' THEN mean_revenue END) as mean_c,
MAX(CASE WHEN variant = 'control' THEN std_revenue END) as std_c,
MAX(CASE WHEN variant = 'control' THEN n END) as n_c,
MAX(CASE WHEN variant = 'treatment' THEN mean_revenue END) as mean_t,
MAX(CASE WHEN variant = 'treatment' THEN std_revenue END) as std_t,
MAX(CASE WHEN variant = 'treatment' THEN n END) as n_t
FROM revenue_stats
)
SELECT
ROUND(mean_c, 2) as control_arpu,
ROUND(mean_t, 2) as treatment_arpu,
ROUND(mean_t - mean_c, 2) as difference,
ROUND((mean_t - mean_c) / mean_c * 100, 2) as lift_pct,
-- Welch's t-test
ROUND(
(mean_t - mean_c) /
SQRT(POWER(std_c, 2) / n_c + POWER(std_t, 2) / n_t),
4
) as t_statistic,
CASE
WHEN ABS((mean_t - mean_c) / SQRT(POWER(std_c, 2) / n_c + POWER(std_t, 2) / n_t)) > 2.576 THEN '< 0.01 ***'
WHEN ABS((mean_t - mean_c) / SQRT(POWER(std_c, 2) / n_c + POWER(std_t, 2) / n_t)) > 1.96 THEN '< 0.05 **'
ELSE '> 0.05'
END as significance
FROM t_test;
8. Segment-Level Analysis
How the effect differs across user groups
WITH segment_results AS (
SELECT
u.country,
ea.variant,
COUNT(DISTINCT ea.user_id) as users,
COUNT(DISTINCT c.user_id) as conversions,
COUNT(DISTINCT c.user_id) * 1.0 / COUNT(DISTINCT ea.user_id) as conversion_rate
FROM experiment_assignments ea
JOIN users u ON ea.user_id = u.user_id
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY u.country, ea.variant
),
segment_comparison AS (
SELECT
country,
MAX(CASE WHEN variant = 'control' THEN conversion_rate END) as cr_control,
MAX(CASE WHEN variant = 'treatment' THEN conversion_rate END) as cr_treatment,
MAX(CASE WHEN variant = 'control' THEN users END) as n_control,
MAX(CASE WHEN variant = 'treatment' THEN users END) as n_treatment
FROM segment_results
GROUP BY country
)
SELECT
country,
ROUND(cr_control * 100, 2) as control_pct,
ROUND(cr_treatment * 100, 2) as treatment_pct,
ROUND((cr_treatment - cr_control) * 100, 2) as lift_pct,
n_control + n_treatment as total_users,
CASE
WHEN n_control + n_treatment < 1000 THEN 'Low sample - inconclusive'
WHEN cr_treatment > cr_control * 1.1 THEN 'Strong positive'
WHEN cr_treatment > cr_control THEN 'Slight positive'
ELSE 'Negative or no effect'
END as segment_result
FROM segment_comparison
ORDER BY total_users DESC;
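The labels above are heuristics. For a per-segment significance check, the pooled z-test from section 4 can be applied within each country; here is a minimal sketch against the same tables (treat it as illustrative, and keep in mind that slicing into many segments inflates the chance of false positives):

WITH segment_stats AS (
SELECT
u.country,
ea.variant,
COUNT(DISTINCT ea.user_id) as n,
COUNT(DISTINCT c.user_id) as x
FROM experiment_assignments ea
JOIN users u ON ea.user_id = u.user_id
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY u.country, ea.variant
),
per_country AS (
SELECT
country,
MAX(CASE WHEN variant = 'control' THEN n END) as n_c,
MAX(CASE WHEN variant = 'control' THEN x END) as x_c,
MAX(CASE WHEN variant = 'treatment' THEN n END) as n_t,
MAX(CASE WHEN variant = 'treatment' THEN x END) as x_t
FROM segment_stats
GROUP BY country
)
SELECT
country,
ROUND((x_t * 1.0 / n_t - x_c * 1.0 / n_c) * 100, 2) as lift_pct,
-- Pooled two-proportion z-score, as in section 4, computed per country
ROUND(
(x_t * 1.0 / n_t - x_c * 1.0 / n_c) /
SQRT(
((x_c + x_t) * 1.0 / (n_c + n_t)) * (1 - (x_c + x_t) * 1.0 / (n_c + n_t)) *
(1.0 / n_c + 1.0 / n_t)
),
2
) as z_score
FROM per_country
ORDER BY n_c + n_t DESC;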
9. Tracking the Effect Over Time
Detecting a novelty effect
WITH daily_results AS (
SELECT
DATE(ea.assigned_at) as date,
ea.variant,
COUNT(DISTINCT ea.user_id) as users,
COUNT(DISTINCT c.user_id) as conversions
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
AND c.converted_at < ea.assigned_at + INTERVAL '1 day'
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY DATE(ea.assigned_at), ea.variant
)
SELECT
date,
MAX(CASE WHEN variant = 'control' THEN conversions * 100.0 / users END) as control_rate,
MAX(CASE WHEN variant = 'treatment' THEN conversions * 100.0 / users END) as treatment_rate,
MAX(CASE WHEN variant = 'treatment' THEN conversions * 100.0 / users END) -
MAX(CASE WHEN variant = 'control' THEN conversions * 100.0 / users END) as daily_lift
FROM daily_results
GROUP BY date
ORDER BY date;
Check whether the lift holds up over time!
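A complementary view is the cumulative conversion rate per variant, which smooths out day-to-day noise and makes a decaying lift easier to spot. A sketch using the same tables (window-function syntax as in PostgreSQL):

WITH daily AS (
SELECT
DATE(ea.assigned_at) as date,
ea.variant,
COUNT(DISTINCT ea.user_id) as users,
COUNT(DISTINCT c.user_id) as conversions
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
AND c.converted_at < ea.assigned_at + INTERVAL '1 day'
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY DATE(ea.assigned_at), ea.variant
)
SELECT
date,
variant,
-- Running conversion rate from the start of the experiment up to each date
ROUND(
SUM(conversions) OVER (PARTITION BY variant ORDER BY date) * 100.0 /
SUM(users) OVER (PARTITION BY variant ORDER BY date),
2
) as cumulative_rate_pct
FROM daily
ORDER BY date, variant;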
10. SRM (Sample Ratio Mismatch) Check
Verifying that the assignment ratio is as expected
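For a 50/50 split, the check below is a chi-square goodness-of-fit test with one degree of freedom:

\[
\chi^2 = \frac{(O_c - E)^2}{E} + \frac{(O_t - E)^2}{E}, \qquad E = \frac{N}{2}
\]

where O_c and O_t are the observed assignment counts and N their total. The cutoff 3.84 is the 95th percentile of the chi-square distribution with 1 degree of freedom, so a larger value means the split deviates from 50/50 by more than chance would explain.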
WITH assignment_counts AS (
SELECT
variant,
COUNT(*) as assigned
FROM experiment_assignments
WHERE experiment_name = 'checkout_button_color'
GROUP BY variant
),
srm_check AS (
SELECT
SUM(assigned) as total,
MAX(CASE WHEN variant = 'control' THEN assigned END) as control,
MAX(CASE WHEN variant = 'treatment' THEN assigned END) as treatment
FROM assignment_counts
)
SELECT
control,
treatment,
total,
ROUND(control * 100.0 / total, 2) as control_pct,
ROUND(treatment * 100.0 / total, 2) as treatment_pct,
-- Chi-square test for 50/50 split
ROUND(
POWER(control - total * 0.5, 2) / (total * 0.5) +
POWER(treatment - total * 0.5, 2) / (total * 0.5),
4
) as chi_square,
CASE
WHEN POWER(control - total * 0.5, 2) / (total * 0.5) +
POWER(treatment - total * 0.5, 2) / (total * 0.5) > 3.84
THEN 'SRM DETECTED - Check implementation!'
ELSE 'OK - Ratio looks fine'
END as srm_result
FROM srm_check;
11. Practical Checklist
Before the test
-- 1. Enough samples available?
SELECT COUNT(*) FROM daily_active_users WHERE date >= CURRENT_DATE - 7;
-- 2. Baseline conversion rate?
SELECT AVG(converted) FROM historical_conversions WHERE date >= CURRENT_DATE - 30;
During the test
-- 3. SRM check (section 10)
-- 4. Check daily trends (section 9)
After the test
-- 5. Significance tests (sections 4-5)
-- 6. Segment analysis (section 8)
-- 7. Track long-term effects
Conclusion
A/B testing essentials:
- p-value < 0.05 is the minimum bar
- If the confidence interval includes 0, the result is inconclusive
- Don't draw conclusions when the sample size is insufficient
- SRM is a signal of an implementation bug