Building Cohort Analysis in SQL: The Complete Guide to Retention
SOTAAZ·

Building Cohort Analysis in SQL: The Complete Guide to Retention
Build cohort analysis without GA4. Implement monthly retention and N-day retention directly in SQL.
TL;DR
- Cohort: A group of users who performed the same action at a specific time
- Retention: The percentage of a cohort that continues using the service over time
- SQL Implementation: Window functions and CTEs enable analysis without GA4
- Key Insights: Which cohorts retain better, when do users churn
1. What is Cohort Analysis?
Why Do You Need Cohort Analysis?
Wrong Analysis:
This month's DAU: 10,000 → "Great!"Right Analysis:
January signups still active after 3 months: 20%
February signups still active after 3 months: 35%
→ "Something changed in February!"Cohort analysis tracks user behavior changes over time.
Types of Cohorts
2. Basic Table Structure
-- Users table
CREATE TABLE users (
user_id INT PRIMARY KEY,
created_at TIMESTAMP, -- Signup date
signup_channel VARCHAR(50)
);
-- Activity log table
CREATE TABLE user_activities (
id INT PRIMARY KEY,
user_id INT,
activity_type VARCHAR(50),
created_at TIMESTAMP
);
-- Orders table
CREATE TABLE orders (
order_id INT PRIMARY KEY,
user_id INT,
amount DECIMAL(10, 2),
created_at TIMESTAMP
);3. Monthly Signup Cohort Retention
The Most Basic Cohort Analysis
WITH user_cohorts AS (
-- Each user's signup month (cohort)
SELECT
user_id,
DATE_TRUNC('month', created_at) as cohort_month
FROM users
),
user_activities_monthly AS (
-- Each user's monthly activity
SELECT DISTINCT
user_id,
DATE_TRUNC('month', created_at) as activity_month
FROM user_activities
),
cohort_data AS (
-- Combine cohort and activity data
SELECT
uc.cohort_month,
ua.activity_month,
-- Months since signup
DATE_PART('year', ua.activity_month) * 12 + DATE_PART('month', ua.activity_month)
- DATE_PART('year', uc.cohort_month) * 12 - DATE_PART('month', uc.cohort_month)
as months_since_signup,
COUNT(DISTINCT uc.user_id) as active_users
FROM user_cohorts uc
JOIN user_activities_monthly ua ON uc.user_id = ua.user_id
WHERE ua.activity_month >= uc.cohort_month
GROUP BY uc.cohort_month, ua.activity_month
),
cohort_sizes AS (
-- Total users in each cohort
SELECT
cohort_month,
COUNT(*) as cohort_size
FROM user_cohorts
GROUP BY cohort_month
)
SELECT
cd.cohort_month,
cs.cohort_size,
cd.months_since_signup,
cd.active_users,
ROUND(cd.active_users * 100.0 / cs.cohort_size, 2) as retention_rate
FROM cohort_data cd
JOIN cohort_sizes cs ON cd.cohort_month = cs.cohort_month
ORDER BY cd.cohort_month, cd.months_since_signup;Result:
4. Pivot Table Format
Cohort Retention Matrix
WITH user_cohorts AS (
SELECT
user_id,
DATE_TRUNC('month', created_at) as cohort_month
FROM users
),
user_activities_monthly AS (
SELECT DISTINCT
user_id,
DATE_TRUNC('month', created_at) as activity_month
FROM user_activities
),
retention_data AS (
SELECT
uc.cohort_month,
DATE_PART('year', ua.activity_month) * 12 + DATE_PART('month', ua.activity_month)
- DATE_PART('year', uc.cohort_month) * 12 - DATE_PART('month', uc.cohort_month)
as month_number,
COUNT(DISTINCT uc.user_id) as users
FROM user_cohorts uc
LEFT JOIN user_activities_monthly ua
ON uc.user_id = ua.user_id
AND ua.activity_month >= uc.cohort_month
GROUP BY uc.cohort_month, month_number
),
cohort_sizes AS (
SELECT cohort_month, COUNT(*) as size
FROM user_cohorts
GROUP BY cohort_month
)
SELECT
TO_CHAR(r.cohort_month, 'YYYY-MM') as cohort,
cs.size as cohort_size,
MAX(CASE WHEN month_number = 0 THEN ROUND(users * 100.0 / cs.size, 1) END) as "M0",
MAX(CASE WHEN month_number = 1 THEN ROUND(users * 100.0 / cs.size, 1) END) as "M1",
MAX(CASE WHEN month_number = 2 THEN ROUND(users * 100.0 / cs.size, 1) END) as "M2",
MAX(CASE WHEN month_number = 3 THEN ROUND(users * 100.0 / cs.size, 1) END) as "M3",
MAX(CASE WHEN month_number = 4 THEN ROUND(users * 100.0 / cs.size, 1) END) as "M4",
MAX(CASE WHEN month_number = 5 THEN ROUND(users * 100.0 / cs.size, 1) END) as "M5",
MAX(CASE WHEN month_number = 6 THEN ROUND(users * 100.0 / cs.size, 1) END) as "M6"
FROM retention_data r
JOIN cohort_sizes cs ON r.cohort_month = cs.cohort_month
GROUP BY r.cohort_month, cs.size
ORDER BY r.cohort_month;Result (Retention Matrix):
5. N-Day Retention (Day 1, Day 7, Day 30)
Common Metrics for Mobile Apps
WITH user_first_activity AS (
-- Each user's first activity date
SELECT
user_id,
DATE(MIN(created_at)) as first_activity_date
FROM user_activities
GROUP BY user_id
),
daily_activities AS (
-- User activity dates
SELECT DISTINCT
user_id,
DATE(created_at) as activity_date
FROM user_activities
),
retention_flags AS (
SELECT
ufa.user_id,
ufa.first_activity_date,
-- Check Day N activity
MAX(CASE WHEN da.activity_date = ufa.first_activity_date + INTERVAL '1 day'
THEN 1 ELSE 0 END) as day_1,
MAX(CASE WHEN da.activity_date = ufa.first_activity_date + INTERVAL '7 days'
THEN 1 ELSE 0 END) as day_7,
MAX(CASE WHEN da.activity_date = ufa.first_activity_date + INTERVAL '14 days'
THEN 1 ELSE 0 END) as day_14,
MAX(CASE WHEN da.activity_date = ufa.first_activity_date + INTERVAL '30 days'
THEN 1 ELSE 0 END) as day_30
FROM user_first_activity ufa
LEFT JOIN daily_activities da ON ufa.user_id = da.user_id
GROUP BY ufa.user_id, ufa.first_activity_date
)
SELECT
DATE_TRUNC('week', first_activity_date) as cohort_week,
COUNT(*) as cohort_size,
ROUND(SUM(day_1) * 100.0 / COUNT(*), 2) as day_1_retention,
ROUND(SUM(day_7) * 100.0 / COUNT(*), 2) as day_7_retention,
ROUND(SUM(day_14) * 100.0 / COUNT(*), 2) as day_14_retention,
ROUND(SUM(day_30) * 100.0 / COUNT(*), 2) as day_30_retention
FROM retention_flags
GROUP BY DATE_TRUNC('week', first_activity_date)
ORDER BY cohort_week;Result:
6. Purchase Cohort: Repeat Purchase Analysis
Repurchase Patterns After First Order
WITH first_purchase AS (
-- Each customer's first purchase
SELECT
user_id,
DATE_TRUNC('month', MIN(created_at)) as first_purchase_month
FROM orders
GROUP BY user_id
),
monthly_purchases AS (
-- Customer monthly purchase activity
SELECT DISTINCT
user_id,
DATE_TRUNC('month', created_at) as purchase_month
FROM orders
),
repurchase_data AS (
SELECT
fp.first_purchase_month as cohort,
DATE_PART('year', mp.purchase_month) * 12 + DATE_PART('month', mp.purchase_month)
- DATE_PART('year', fp.first_purchase_month) * 12 - DATE_PART('month', fp.first_purchase_month)
as months_since_first,
COUNT(DISTINCT fp.user_id) as customers
FROM first_purchase fp
JOIN monthly_purchases mp ON fp.user_id = mp.user_id
WHERE mp.purchase_month >= fp.first_purchase_month
GROUP BY fp.first_purchase_month, months_since_first
),
cohort_sizes AS (
SELECT first_purchase_month, COUNT(*) as size
FROM first_purchase
GROUP BY first_purchase_month
)
SELECT
TO_CHAR(rd.cohort, 'YYYY-MM') as cohort,
cs.size,
rd.months_since_first,
rd.customers,
ROUND(rd.customers * 100.0 / cs.size, 2) as repurchase_rate
FROM repurchase_data rd
JOIN cohort_sizes cs ON rd.cohort = cs.first_purchase_month
WHERE rd.months_since_first <= 6
ORDER BY rd.cohort, rd.months_since_first;7. Comparing Cohorts by Channel
Which Channel's Users Retain Better?
WITH user_cohorts AS (
SELECT
user_id,
DATE_TRUNC('month', created_at) as cohort_month,
signup_channel
FROM users
),
user_activities_monthly AS (
SELECT DISTINCT
user_id,
DATE_TRUNC('month', created_at) as activity_month
FROM user_activities
),
channel_retention AS (
SELECT
uc.signup_channel,
DATE_PART('year', ua.activity_month) * 12 + DATE_PART('month', ua.activity_month)
- DATE_PART('year', uc.cohort_month) * 12 - DATE_PART('month', uc.cohort_month)
as month_number,
COUNT(DISTINCT uc.user_id) as active_users,
COUNT(DISTINCT CASE WHEN ua.activity_month = uc.cohort_month
THEN uc.user_id END) as cohort_size
FROM user_cohorts uc
LEFT JOIN user_activities_monthly ua
ON uc.user_id = ua.user_id
AND ua.activity_month >= uc.cohort_month
GROUP BY uc.signup_channel, month_number
)
SELECT
signup_channel,
month_number,
active_users,
ROUND(active_users * 100.0 / NULLIF(MAX(cohort_size) OVER (PARTITION BY signup_channel), 0), 2) as retention_rate
FROM channel_retention
WHERE month_number BETWEEN 0 AND 6
ORDER BY signup_channel, month_number;Result:
→ Organic traffic has 2x better retention than paid ads!
8. Rolling Retention vs Classic Retention
Classic Retention
"Users who logged in exactly on Day 7"
Rolling Retention (More Forgiving)
"Users who logged in any time on or after Day 7"
WITH user_first_activity AS (
SELECT
user_id,
DATE(MIN(created_at)) as first_date
FROM user_activities
GROUP BY user_id
),
daily_activities AS (
SELECT DISTINCT user_id, DATE(created_at) as activity_date
FROM user_activities
),
retention_comparison AS (
SELECT
ufa.user_id,
ufa.first_date,
-- Classic: exactly Day 7
MAX(CASE WHEN da.activity_date = ufa.first_date + 7 THEN 1 ELSE 0 END) as classic_d7,
-- Rolling: any time Day 7 or later
MAX(CASE WHEN da.activity_date >= ufa.first_date + 7 THEN 1 ELSE 0 END) as rolling_d7
FROM user_first_activity ufa
LEFT JOIN daily_activities da ON ufa.user_id = da.user_id
GROUP BY ufa.user_id, ufa.first_date
)
SELECT
DATE_TRUNC('week', first_date) as cohort_week,
COUNT(*) as users,
ROUND(SUM(classic_d7) * 100.0 / COUNT(*), 2) as classic_d7_retention,
ROUND(SUM(rolling_d7) * 100.0 / COUNT(*), 2) as rolling_d7_retention
FROM retention_comparison
GROUP BY DATE_TRUNC('week', first_date)
ORDER BY cohort_week;9. LTV by Cohort
Customer Lifetime Value by Cohort
WITH user_cohorts AS (
SELECT
user_id,
DATE_TRUNC('month', created_at) as cohort_month
FROM users
),
user_revenue AS (
SELECT
user_id,
SUM(amount) as total_revenue,
COUNT(*) as order_count
FROM orders
GROUP BY user_id
)
SELECT
TO_CHAR(uc.cohort_month, 'YYYY-MM') as cohort,
COUNT(DISTINCT uc.user_id) as users,
COUNT(DISTINCT ur.user_id) as paying_users,
ROUND(COUNT(DISTINCT ur.user_id) * 100.0 / COUNT(DISTINCT uc.user_id), 2) as conversion_rate,
ROUND(AVG(ur.total_revenue), 2) as avg_revenue_per_paying_user,
ROUND(SUM(ur.total_revenue) / COUNT(DISTINCT uc.user_id), 2) as revenue_per_user
FROM user_cohorts uc
LEFT JOIN user_revenue ur ON uc.user_id = ur.user_id
GROUP BY uc.cohort_month
ORDER BY uc.cohort_month;10. Pro Tips
1. Filter by Cohort Size
Small cohorts are statistically unreliable.
HAVING COUNT(*) >= 100 -- Minimum 100 users2. Data Format for Visualization
-- Long format for heatmap visualization
SELECT
cohort_month,
month_number,
retention_rate
FROM retention_data
ORDER BY cohort_month, month_number;3. Churn Rate = 100 - Retention
SELECT
cohort,
month_number,
retention_rate,
100 - retention_rate as churn_rate,
LAG(retention_rate) OVER (PARTITION BY cohort ORDER BY month_number) - retention_rate as monthly_churn
FROM retention_matrix;Conclusion
Cohort Analysis Key Points:
- Track user behavior over time
- Understand trends by group, not averages
- Quantitatively measure the impact of product changes
References
- Amplitude - Retention Analysis Guide
- Mixpanel - Cohort Analysis Best Practices
- Mode Analytics - SQL for Cohort Analysis