A/B Test Analysis in SQL: Calculating Statistical Significance Yourself

Analyze A/B test results with SQL alone. Z-test, confidence intervals, and sample size calculation.
TL;DR
- A/B Testing: Compare two versions to determine which performs better
- Statistical Significance: Confirm results aren't due to chance (p-value < 0.05)
- SQL Calculation: Conversion rates, Z-scores, and confidence intervals are all computable in SQL
- Caution: Consider sample size, MDE, and test duration
1. A/B Testing Basics
Core Concept
Control (A): Original version - 10% conversion rate
Treatment (B): New version - 12% conversion rate
Question: Is B really better, or is it just chance?
Key Metrics
- Conversion rate per variant
- Absolute and relative lift
- Z-score / p-value
- Confidence interval
- Sample size and MDE (minimum detectable effect)
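Whether a 10% vs 12% difference is real depends almost entirely on sample size. A quick sanity check with hypothetical group sizes (the pooled rate for equal groups is (0.10 + 0.12) / 2 = 0.11):
-- Two-proportion z-score for 10% vs 12% at two hypothetical sample sizes
SELECT
ROUND((0.12 - 0.10) / SQRT(0.11 * 0.89 * (1.0/1000 + 1.0/1000)), 2) as z_1k_per_group,    -- ~1.43: not significant
ROUND((0.12 - 0.10) / SQRT(0.11 * 0.89 * (1.0/10000 + 1.0/10000)), 2) as z_10k_per_group; -- ~4.52: highly significant
The same 2-point lift is indistinguishable from noise at 1,000 users per group and clearly significant at 10,000. Sections 4 and 6 make this precise.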
2. Basic Table Structure
-- Experiment assignment table
CREATE TABLE experiment_assignments (
user_id INT,
experiment_name VARCHAR(100),
variant VARCHAR(50), -- 'control' or 'treatment'
assigned_at TIMESTAMP
);
-- Conversion events table
CREATE TABLE conversions (
user_id INT,
conversion_type VARCHAR(100),
converted_at TIMESTAMP,
revenue DECIMAL(10, 2)
);
3. Basic Conversion Rate Comparison
Calculate Conversion Rate per Group
WITH experiment_data AS (
SELECT
ea.variant,
COUNT(DISTINCT ea.user_id) as users,
COUNT(DISTINCT c.user_id) as conversions
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
AND c.converted_at < ea.assigned_at + INTERVAL '7 days'
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY ea.variant
)
SELECT
variant,
users,
conversions,
ROUND(conversions * 100.0 / users, 4) as conversion_rate,
ROUND(conversions * 1.0 / users, 6) as cr_decimal
FROM experiment_data;
4. Z-Test for Significance
Testing Difference Between Two Proportions
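The query below implements the standard two-proportion z-test. With x conversions out of n users in each variant:

$$p_{pooled} = \frac{x_c + x_t}{n_c + n_t}, \qquad z = \frac{p_t - p_c}{\sqrt{p_{pooled}(1 - p_{pooled})\left(\frac{1}{n_c} + \frac{1}{n_t}\right)}}$$

|z| > 1.96 corresponds to p < 0.05 and |z| > 2.576 to p < 0.01 (two-tailed), which is what the CASE expression at the end approximates.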
WITH experiment_stats AS (
SELECT
variant,
COUNT(DISTINCT ea.user_id) as n,
COUNT(DISTINCT c.user_id) as x,
COUNT(DISTINCT c.user_id) * 1.0 / COUNT(DISTINCT ea.user_id) as p
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY ea.variant
),
pooled_stats AS (
SELECT
SUM(x) * 1.0 / SUM(n) as p_pooled,
MAX(CASE WHEN variant = 'control' THEN n END) as n_control,
MAX(CASE WHEN variant = 'control' THEN p END) as p_control,
MAX(CASE WHEN variant = 'treatment' THEN n END) as n_treatment,
MAX(CASE WHEN variant = 'treatment' THEN p END) as p_treatment
FROM experiment_stats
),
z_calculation AS (
SELECT
p_control,
p_treatment,
n_control,
n_treatment,
p_pooled,
-- Standard Error
SQRT(p_pooled * (1 - p_pooled) * (1.0/n_control + 1.0/n_treatment)) as se,
-- Difference
p_treatment - p_control as diff
FROM pooled_stats
)
SELECT
ROUND(p_control * 100, 2) as control_rate_pct,
ROUND(p_treatment * 100, 2) as treatment_rate_pct,
ROUND(diff * 100, 2) as absolute_lift_pct,
ROUND((diff / p_control) * 100, 2) as relative_lift_pct,
ROUND(diff / se, 4) as z_score,
-- p-value approximation (two-tailed)
CASE
WHEN ABS(diff / se) > 2.576 THEN '< 0.01 ***'
WHEN ABS(diff / se) > 1.96 THEN '< 0.05 **'
WHEN ABS(diff / se) > 1.645 THEN '< 0.10 *'
ELSE '> 0.10 (not significant)'
END as p_value_approx
FROM z_calculation;
→ Z-score 3.51 > 2.576, significant at 99% confidence level!
5. Confidence Interval Calculation
Estimate Effect Range
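The interval is built from the unpooled standard error of a difference of two proportions:

$$SE = \sqrt{\frac{p_c(1-p_c)}{n_c} + \frac{p_t(1-p_t)}{n_t}}, \qquad CI_{95\%} = (p_t - p_c) \pm 1.96 \cdot SE$$

Unlike the z-test above, the standard error here is not pooled, which is why the query recomputes it from the per-group rates.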
WITH experiment_stats AS (
SELECT
variant,
COUNT(DISTINCT ea.user_id) as n,
COUNT(DISTINCT c.user_id) * 1.0 / COUNT(DISTINCT ea.user_id) as p
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY ea.variant
),
stats AS (
SELECT
MAX(CASE WHEN variant = 'control' THEN n END) as n_c,
MAX(CASE WHEN variant = 'control' THEN p END) as p_c,
MAX(CASE WHEN variant = 'treatment' THEN n END) as n_t,
MAX(CASE WHEN variant = 'treatment' THEN p END) as p_t
FROM experiment_stats
),
confidence_interval AS (
SELECT
p_t - p_c as diff,
-- SE for difference of two proportions
SQRT(
(p_c * (1 - p_c) / n_c) +
(p_t * (1 - p_t) / n_t)
) as se
FROM stats
)
SELECT
ROUND(diff * 100, 2) as lift_pct,
ROUND((diff - 1.96 * se) * 100, 2) as ci_lower_95,
ROUND((diff + 1.96 * se) * 100, 2) as ci_upper_95,
ROUND((diff - 2.576 * se) * 100, 2) as ci_lower_99,
ROUND((diff + 2.576 * se) * 100, 2) as ci_upper_99,
CASE
WHEN (diff - 1.96 * se) > 0 THEN 'Positive effect (95% confident)'
WHEN (diff + 1.96 * se) < 0 THEN 'Negative effect (95% confident)'
ELSE 'Inconclusive'
END as conclusion
FROM confidence_interval;
→ 95% CI: 0.66% to 2.34% (doesn't include 0, so significant)
6. Sample Size Calculation
Required Samples Before Starting Test
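The query encodes the standard per-group formula for detecting a difference between two proportions, with p̄ the average of the baseline and target rates:

$$n_{\text{per group}} = \frac{\left(z_{\alpha/2}\sqrt{2\bar{p}(1-\bar{p})} + z_{\beta}\sqrt{p_1(1-p_1) + p_2(1-p_2)}\right)^2}{(p_2 - p_1)^2}$$

For a 10% baseline, a 2-point MDE, 95% confidence, and 80% power this works out to roughly 3,800 users per group, about 7,700 in total.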
-- Parameter setup
WITH params AS (
SELECT
0.10 as baseline_rate, -- Baseline 10%
0.02 as mde, -- Minimum detectable effect 2pp
1.96 as z_alpha, -- 95% confidence (two-tailed)
0.84 as z_beta -- 80% power
),
sample_size AS (
SELECT
baseline_rate as p1,
baseline_rate + mde as p2,
(baseline_rate + baseline_rate + mde) / 2 as p_avg,
z_alpha,
z_beta
FROM params
)
SELECT
-- Sample per group (the formula already accounts for both groups; no extra factor of 2)
CEIL(
POWER(
(z_alpha * SQRT(2 * p_avg * (1 - p_avg)) +
z_beta * SQRT(p1 * (1 - p1) + p2 * (1 - p2)))
/ (p2 - p1),
2
)
) as sample_per_group,
-- Total sample across both groups
2 * CEIL(
POWER(
(z_alpha * SQRT(2 * p_avg * (1 - p_avg)) +
z_beta * SQRT(p1 * (1 - p1) + p2 * (1 - p2)))
/ (p2 - p1),
2
)
) as total_sample
FROM sample_size;
7. Revenue-Based A/B Test (t-test)
Comparing Average Revenue
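Revenue is a continuous metric, so the comparison below uses Welch's unequal-variance t-statistic:

$$t = \frac{\bar{x}_t - \bar{x}_c}{\sqrt{\frac{s_c^2}{n_c} + \frac{s_t^2}{n_t}}}$$

The query compares t against normal critical values (1.96, 2.576), which is a reasonable large-sample shortcut; with small groups you would also need the Welch-Satterthwaite degrees of freedom, which is easier to compute outside SQL.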
WITH user_revenue AS (
SELECT
ea.variant,
ea.user_id,
-- One row per assigned user; non-converters contribute 0 revenue
COALESCE(SUM(c.revenue), 0) as revenue
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
WHERE ea.experiment_name = 'pricing_test'
GROUP BY ea.variant, ea.user_id
),
revenue_stats AS (
SELECT
variant,
COUNT(*) as n,
-- Mean and standard deviation over per-user revenue (ARPU), not over conversion rows
AVG(revenue) as mean_revenue,
STDDEV(revenue) as std_revenue
FROM user_revenue
GROUP BY variant
),
t_test AS (
SELECT
MAX(CASE WHEN variant = 'control' THEN mean_revenue END) as mean_c,
MAX(CASE WHEN variant = 'control' THEN std_revenue END) as std_c,
MAX(CASE WHEN variant = 'control' THEN n END) as n_c,
MAX(CASE WHEN variant = 'treatment' THEN mean_revenue END) as mean_t,
MAX(CASE WHEN variant = 'treatment' THEN std_revenue END) as std_t,
MAX(CASE WHEN variant = 'treatment' THEN n END) as n_t
FROM revenue_stats
)
SELECT
ROUND(mean_c, 2) as control_arpu,
ROUND(mean_t, 2) as treatment_arpu,
ROUND(mean_t - mean_c, 2) as difference,
ROUND((mean_t - mean_c) / mean_c * 100, 2) as lift_pct,
-- Welch's t-test
ROUND(
(mean_t - mean_c) /
SQRT(POWER(std_c, 2) / n_c + POWER(std_t, 2) / n_t),
4
) as t_statistic,
CASE
WHEN ABS((mean_t - mean_c) / SQRT(POWER(std_c, 2) / n_c + POWER(std_t, 2) / n_t)) > 2.576 THEN '< 0.01 ***'
WHEN ABS((mean_t - mean_c) / SQRT(POWER(std_c, 2) / n_c + POWER(std_t, 2) / n_t)) > 1.96 THEN '< 0.05 **'
ELSE '> 0.05'
END as significance
FROM t_test;
8. Segment Analysis
Effect Differences by User Group
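The query joins a users dimension table that was not part of the schema in section 2. A minimal sketch of the assumed structure, limited to the columns used here:
-- Assumed user attribute table (hypothetical; extend with whatever segments you track)
CREATE TABLE users (
user_id INT,
country VARCHAR(50)
);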
WITH segment_results AS (
SELECT
u.country,
ea.variant,
COUNT(DISTINCT ea.user_id) as users,
COUNT(DISTINCT c.user_id) as conversions,
COUNT(DISTINCT c.user_id) * 1.0 / COUNT(DISTINCT ea.user_id) as conversion_rate
FROM experiment_assignments ea
JOIN users u ON ea.user_id = u.user_id
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY u.country, ea.variant
),
segment_comparison AS (
SELECT
country,
MAX(CASE WHEN variant = 'control' THEN conversion_rate END) as cr_control,
MAX(CASE WHEN variant = 'treatment' THEN conversion_rate END) as cr_treatment,
MAX(CASE WHEN variant = 'control' THEN users END) as n_control,
MAX(CASE WHEN variant = 'treatment' THEN users END) as n_treatment
FROM segment_results
GROUP BY country
)
SELECT
country,
ROUND(cr_control * 100, 2) as control_pct,
ROUND(cr_treatment * 100, 2) as treatment_pct,
ROUND((cr_treatment - cr_control) * 100, 2) as lift_pct,
n_control + n_treatment as total_users,
CASE
WHEN n_control + n_treatment < 1000 THEN 'Low sample - inconclusive'
WHEN cr_treatment > cr_control * 1.1 THEN 'Strong positive'
WHEN cr_treatment > cr_control THEN 'Slight positive'
ELSE 'Negative or no effect'
END as segment_result
FROM segment_comparison
ORDER BY total_users DESC;
9. Time-Based Effect Tracking
Detecting Novelty Effect
WITH daily_results AS (
SELECT
DATE(ea.assigned_at) as date,
ea.variant,
COUNT(DISTINCT ea.user_id) as users,
COUNT(DISTINCT c.user_id) as conversions
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
AND c.converted_at < ea.assigned_at + INTERVAL '1 day'
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY DATE(ea.assigned_at), ea.variant
)
SELECT
date,
MAX(CASE WHEN variant = 'control' THEN conversions * 100.0 / users END) as control_rate,
MAX(CASE WHEN variant = 'treatment' THEN conversions * 100.0 / users END) as treatment_rate,
MAX(CASE WHEN variant = 'treatment' THEN conversions * 100.0 / users END) -
MAX(CASE WHEN variant = 'control' THEN conversions * 100.0 / users END) as daily_lift
FROM daily_results
GROUP BY date
ORDER BY date;
Verify that lift is sustained over time!
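Daily rates are noisy when per-day samples are small. A cumulative variant (a sketch reusing the same tables) makes it easier to see whether the lift stabilizes or decays as data accumulates:
WITH daily_results AS (
SELECT
DATE(ea.assigned_at) as date,
ea.variant,
COUNT(DISTINCT ea.user_id) as users,
COUNT(DISTINCT c.user_id) as conversions
FROM experiment_assignments ea
LEFT JOIN conversions c
ON ea.user_id = c.user_id
AND c.converted_at > ea.assigned_at
AND c.converted_at < ea.assigned_at + INTERVAL '1 day'
WHERE ea.experiment_name = 'checkout_button_color'
GROUP BY DATE(ea.assigned_at), ea.variant
)
SELECT
date,
variant,
-- Running conversion rate over all users assigned up to each day
SUM(conversions) OVER (PARTITION BY variant ORDER BY date) * 100.0 /
SUM(users) OVER (PARTITION BY variant ORDER BY date) as cumulative_rate_pct
FROM daily_results
ORDER BY date, variant;
A novelty effect typically shows up as an early spike in the treatment's cumulative rate that fades back toward the control over the following days.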
10. SRM (Sample Ratio Mismatch) Check
Verify Assignment Ratio is Correct
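For a 50/50 split the expected count per variant is half of the total, and the check is a one-degree-of-freedom chi-square test:

$$\chi^2 = \frac{(O_{control} - N/2)^2}{N/2} + \frac{(O_{treatment} - N/2)^2}{N/2}$$

The 3.84 threshold in the query is the 95th percentile of the chi-square distribution with 1 degree of freedom; many teams use a stricter cutoff (around 10.83, p < 0.001) before flagging SRM, since this check runs on every experiment.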
WITH assignment_counts AS (
SELECT
variant,
COUNT(*) as assigned
FROM experiment_assignments
WHERE experiment_name = 'checkout_button_color'
GROUP BY variant
),
srm_check AS (
SELECT
SUM(assigned) as total,
MAX(CASE WHEN variant = 'control' THEN assigned END) as control,
MAX(CASE WHEN variant = 'treatment' THEN assigned END) as treatment
FROM assignment_counts
)
SELECT
control,
treatment,
total,
ROUND(control * 100.0 / total, 2) as control_pct,
ROUND(treatment * 100.0 / total, 2) as treatment_pct,
-- Chi-square test for 50/50 split
ROUND(
POWER(control - total * 0.5, 2) / (total * 0.5) +
POWER(treatment - total * 0.5, 2) / (total * 0.5),
4
) as chi_square,
CASE
WHEN POWER(control - total * 0.5, 2) / (total * 0.5) +
POWER(treatment - total * 0.5, 2) / (total * 0.5) > 3.84
THEN 'SRM DETECTED - Check implementation!'
ELSE 'OK - Ratio looks fine'
END as srm_result
FROM srm_check;
11. Practical Checklist
Before Test
-- 1. Sufficient sample?
SELECT COUNT(*) FROM daily_active_users WHERE date >= CURRENT_DATE - 7;
-- 2. Baseline conversion rate?
SELECT AVG(converted) FROM historical_conversions WHERE date >= CURRENT_DATE - 30;
During Test
-- 3. SRM check
-- 4. Daily trend monitoring
After Test
-- 5. Significance testing
-- 6. Segment analysis
-- 7. Long-term effect tracking
Conclusion
A/B Testing Key Points:
- p-value < 0.05 is the minimum bar
- If CI includes 0, result is uncertain
- Don't conclude with insufficient samples
- SRM is a sign of implementation bugs
References
- Evan Miller - Sample Size Calculator
- Optimizely - Statistics Engine
- VWO - A/B Testing Statistical Significance