Automating Data Quality Checks: SQL Templates for NULL, Duplicates, and Consistency
SOTAAZ·

Data Quality Checks in SQL: Building Trustworthy Data
Essential before any analysis! Automate NULL, duplicate, range, and consistency checks with SQL.
TL;DR
- Data Quality: Core factor determining analysis reliability
- Check Types: Completeness, uniqueness, validity, consistency, timeliness
- Automation: Regular validation catches issues early
- Reporting: Quality score dashboards for visibility
1. Why Data Quality Matters
Garbage In, Garbage Out
Revenue Report: $1.2M
Actual Revenue: $800K
→ NULL values not handled
→ Duplicate orders included
→ Cancelled orders not excludedBad data → Bad decisions → Financial loss
The 5 Dimensions of Data Quality
2. Completeness Check (NULL Inspection)
Column-Level NULL Rates
-- NULL rate per column
SELECT
'user_id' as column_name,
COUNT(*) as total_rows,
SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) as null_count,
ROUND(SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as null_pct
FROM orders
UNION ALL
SELECT
'email',
COUNT(*),
SUM(CASE WHEN email IS NULL THEN 1 ELSE 0 END),
ROUND(SUM(CASE WHEN email IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2)
FROM orders
UNION ALL
SELECT
'amount',
COUNT(*),
SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END),
ROUND(SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2)
FROM orders
ORDER BY null_pct DESC;Result:
Dynamic NULL Check (PostgreSQL)
-- Auto-check all columns using information_schema
DO $$
DECLARE
col RECORD;
null_count INTEGER;
total_count INTEGER;
BEGIN
SELECT COUNT(*) INTO total_count FROM orders;
FOR col IN
SELECT column_name
FROM information_schema.columns
WHERE table_name = 'orders'
LOOP
EXECUTE format('SELECT COUNT(*) FROM orders WHERE %I IS NULL', col.column_name)
INTO null_count;
RAISE NOTICE '% : % / % (%.2f%%)',
col.column_name, null_count, total_count,
null_count * 100.0 / total_count;
END LOOP;
END $$;Check Empty Strings Too
-- NULL + empty string + whitespace only
SELECT
COUNT(*) as total,
SUM(CASE
WHEN email IS NULL
OR TRIM(email) = ''
THEN 1 ELSE 0
END) as empty_count,
ROUND(SUM(CASE
WHEN email IS NULL
OR TRIM(email) = ''
THEN 1 ELSE 0
END) * 100.0 / COUNT(*), 2) as empty_pct
FROM users;3. Uniqueness Check (Duplicate Detection)
Primary Key Duplicates
-- Check if order_id is unique
SELECT
order_id,
COUNT(*) as duplicate_count
FROM orders
GROUP BY order_id
HAVING COUNT(*) > 1
ORDER BY duplicate_count DESC;Business Key Duplicates
-- Same user, same product, same minute (suspicious duplicate)
SELECT
user_id,
product_id,
DATE_TRUNC('minute', created_at) as order_minute,
COUNT(*) as count
FROM orders
GROUP BY user_id, product_id, DATE_TRUNC('minute', created_at)
HAVING COUNT(*) > 1
ORDER BY count DESC;Duplicate Rate Report
WITH duplicate_analysis AS (
SELECT
order_id,
COUNT(*) as occurrence
FROM orders
GROUP BY order_id
)
SELECT
COUNT(*) as total_unique_ids,
SUM(occurrence) as total_rows,
SUM(CASE WHEN occurrence > 1 THEN occurrence ELSE 0 END) as duplicate_rows,
ROUND(
SUM(CASE WHEN occurrence > 1 THEN occurrence ELSE 0 END) * 100.0 / SUM(occurrence),
4
) as duplicate_pct
FROM duplicate_analysis;4. Validity Check (Range/Format Inspection)
Numeric Range Check
-- Find abnormal values
SELECT
'amount' as field,
COUNT(*) as invalid_count,
MIN(amount) as min_value,
MAX(amount) as max_value
FROM orders
WHERE amount < 0 OR amount > 100000000 -- Over $100M
UNION ALL
SELECT
'quantity',
COUNT(*),
MIN(quantity),
MAX(quantity)
FROM order_items
WHERE quantity <= 0 OR quantity > 1000
UNION ALL
SELECT
'discount_rate',
COUNT(*),
MIN(discount_rate),
MAX(discount_rate)
FROM promotions
WHERE discount_rate < 0 OR discount_rate > 1; -- 0-100% rangeDate Validity Check
-- Future dates, too-old dates
SELECT
order_id,
created_at,
CASE
WHEN created_at > CURRENT_TIMESTAMP THEN 'FUTURE_DATE'
WHEN created_at < '2020-01-01' THEN 'TOO_OLD'
ELSE 'VALID'
END as date_status
FROM orders
WHERE created_at > CURRENT_TIMESTAMP
OR created_at < '2020-01-01';Format Check (Regex)
-- Email format validation
SELECT
email,
CASE
WHEN email IS NULL THEN 'NULL'
WHEN email !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 'INVALID_FORMAT'
ELSE 'VALID'
END as email_status
FROM users
WHERE email IS NULL
OR email !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$';
-- Phone number format validation
SELECT
phone,
CASE
WHEN phone IS NULL THEN 'NULL'
WHEN phone !~ '^\d{2,3}-\d{3,4}-\d{4}$' THEN 'INVALID_FORMAT'
ELSE 'VALID'
END as phone_status
FROM users
WHERE phone IS NOT NULL
AND phone !~ '^\d{2,3}-\d{3,4}-\d{4}$';ENUM Value Check
-- Verify only allowed values exist
SELECT
status,
COUNT(*) as count
FROM orders
WHERE status NOT IN ('pending', 'confirmed', 'shipped', 'delivered', 'cancelled')
GROUP BY status;
-- Compare against allowed values list
WITH allowed_values AS (
SELECT unnest(ARRAY['pending', 'confirmed', 'shipped', 'delivered', 'cancelled']) as status
)
SELECT DISTINCT o.status as invalid_status
FROM orders o
LEFT JOIN allowed_values av ON o.status = av.status
WHERE av.status IS NULL;5. Consistency Check (Logical Integrity)
Amount Consistency
-- Order total = item total - discount + shipping
WITH order_calculated AS (
SELECT
o.order_id,
o.total_amount as recorded_amount,
COALESCE(SUM(oi.quantity * oi.unit_price), 0) as items_total,
COALESCE(o.discount_amount, 0) as discount,
COALESCE(o.shipping_fee, 0) as shipping
FROM orders o
LEFT JOIN order_items oi ON o.order_id = oi.order_id
GROUP BY o.order_id, o.total_amount, o.discount_amount, o.shipping_fee
)
SELECT
order_id,
recorded_amount,
items_total - discount + shipping as calculated_amount,
recorded_amount - (items_total - discount + shipping) as difference
FROM order_calculated
WHERE ABS(recorded_amount - (items_total - discount + shipping)) > 1 -- >$1 difference
ORDER BY ABS(difference) DESC
LIMIT 100;Date Sequence Consistency
-- Verify order_date < payment_date < shipped_date < delivered_date
SELECT
order_id,
order_date,
payment_date,
shipped_date,
delivered_date,
CASE
WHEN payment_date < order_date THEN 'PAYMENT_BEFORE_ORDER'
WHEN shipped_date < payment_date THEN 'SHIPPED_BEFORE_PAYMENT'
WHEN delivered_date < shipped_date THEN 'DELIVERED_BEFORE_SHIPPED'
ELSE 'VALID'
END as date_consistency
FROM orders
WHERE payment_date < order_date
OR shipped_date < payment_date
OR delivered_date < shipped_date;Foreign Key Referential Integrity
-- Check if orders.user_id exists in users table
SELECT
o.order_id,
o.user_id as orphan_user_id
FROM orders o
LEFT JOIN users u ON o.user_id = u.user_id
WHERE u.user_id IS NULL;
-- Check for non-existent product_id references
SELECT
oi.order_item_id,
oi.product_id as orphan_product_id
FROM order_items oi
LEFT JOIN products p ON oi.product_id = p.product_id
WHERE p.product_id IS NULL;State Transition Consistency
-- Find invalid status changes (e.g., pending directly to delivered)
WITH status_transitions AS (
SELECT
order_id,
status,
LAG(status) OVER (PARTITION BY order_id ORDER BY updated_at) as prev_status,
updated_at
FROM order_status_history
)
SELECT *
FROM status_transitions
WHERE prev_status IS NOT NULL
AND NOT (
(prev_status = 'pending' AND status IN ('confirmed', 'cancelled'))
OR (prev_status = 'confirmed' AND status IN ('shipped', 'cancelled'))
OR (prev_status = 'shipped' AND status IN ('delivered', 'returned'))
OR (prev_status = 'delivered' AND status = 'returned')
);6. Timeliness Check (Data Freshness)
Latest Data Check
-- Last update time per table
SELECT
'orders' as table_name,
MAX(created_at) as latest_record,
CURRENT_TIMESTAMP - MAX(created_at) as data_lag
FROM orders
UNION ALL
SELECT
'events',
MAX(event_time),
CURRENT_TIMESTAMP - MAX(event_time)
FROM events
UNION ALL
SELECT
'users',
MAX(updated_at),
CURRENT_TIMESTAMP - MAX(updated_at)
FROM users
ORDER BY data_lag DESC;Result:
Hourly Data Ingestion Check
-- Hourly data volume (0 indicates problem)
SELECT
DATE_TRUNC('hour', created_at) as hour,
COUNT(*) as record_count
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '3 days'
GROUP BY DATE_TRUNC('hour', created_at)
ORDER BY hour DESC;Find Missing Dates
WITH RECURSIVE date_series AS (
SELECT DATE '2024-01-01' as date
UNION ALL
SELECT date + 1
FROM date_series
WHERE date < CURRENT_DATE - 1
),
daily_counts AS (
SELECT DATE(created_at) as date, COUNT(*) as cnt
FROM orders
WHERE created_at >= '2024-01-01'
GROUP BY DATE(created_at)
)
SELECT
ds.date as missing_date
FROM date_series ds
LEFT JOIN daily_counts dc ON ds.date = dc.date
WHERE dc.date IS NULL
ORDER BY ds.date;7. Comprehensive Quality Report
Table Quality Score
WITH completeness AS (
SELECT
'completeness' as dimension,
100 - ROUND(
(SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) +
SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) +
SUM(CASE WHEN created_at IS NULL THEN 1 ELSE 0 END)) * 100.0 /
(COUNT(*) * 3), 2
) as score
FROM orders
),
uniqueness AS (
SELECT
'uniqueness' as dimension,
100 - ROUND(
(SELECT COUNT(*) FROM (
SELECT order_id FROM orders GROUP BY order_id HAVING COUNT(*) > 1
) dups) * 100.0 / COUNT(*), 2
) as score
FROM orders
),
validity AS (
SELECT
'validity' as dimension,
100 - ROUND(
SUM(CASE
WHEN amount < 0 OR amount > 100000000
OR created_at > CURRENT_TIMESTAMP
THEN 1 ELSE 0
END) * 100.0 / COUNT(*), 2
) as score
FROM orders
),
timeliness AS (
SELECT
'timeliness' as dimension,
CASE
WHEN MAX(created_at) > CURRENT_TIMESTAMP - INTERVAL '1 hour' THEN 100
WHEN MAX(created_at) > CURRENT_TIMESTAMP - INTERVAL '1 day' THEN 80
WHEN MAX(created_at) > CURRENT_TIMESTAMP - INTERVAL '7 days' THEN 50
ELSE 0
END as score
FROM orders
)
SELECT * FROM completeness
UNION ALL SELECT * FROM uniqueness
UNION ALL SELECT * FROM validity
UNION ALL SELECT * FROM timeliness;Result:
Daily Quality Trend
WITH daily_quality AS (
SELECT
DATE(created_at) as date,
COUNT(*) as total_records,
-- Completeness
ROUND(100 - SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as completeness,
-- Validity
ROUND(100 - SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as validity
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
)
SELECT
date,
total_records,
completeness,
validity,
ROUND((completeness + validity) / 2, 2) as avg_quality_score
FROM daily_quality
ORDER BY date DESC;8. Automated Quality Checks
Quality Rules Table
-- Quality rule definition table
CREATE TABLE data_quality_rules (
rule_id SERIAL PRIMARY KEY,
table_name VARCHAR(100),
rule_name VARCHAR(200),
rule_type VARCHAR(50), -- completeness, uniqueness, validity, consistency
check_query TEXT,
threshold DECIMAL(5,2), -- Pass threshold (e.g., 99.0 = 99% minimum)
is_active BOOLEAN DEFAULT TRUE
);
-- Insert sample rules
INSERT INTO data_quality_rules (table_name, rule_name, rule_type, check_query, threshold) VALUES
('orders', 'order_id_not_null', 'completeness',
'SELECT 100 - COUNT(*) FILTER (WHERE order_id IS NULL) * 100.0 / COUNT(*) FROM orders', 100),
('orders', 'order_id_unique', 'uniqueness',
'SELECT 100 - (SELECT COUNT(*) FROM (SELECT order_id FROM orders GROUP BY order_id HAVING COUNT(*) > 1) d) * 100.0 / COUNT(*) FROM orders', 100),
('orders', 'amount_positive', 'validity',
'SELECT 100 - COUNT(*) FILTER (WHERE amount < 0) * 100.0 / COUNT(*) FROM orders', 99.9);Execute Quality Checks and Store Results
-- Quality check results table
CREATE TABLE data_quality_results (
result_id SERIAL PRIMARY KEY,
rule_id INT REFERENCES data_quality_rules(rule_id),
check_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
score DECIMAL(5,2),
passed BOOLEAN,
details JSONB
);
-- Execution example (in practice, run via procedure or scheduler)
INSERT INTO data_quality_results (rule_id, score, passed)
SELECT
1 as rule_id,
100 - COUNT(*) FILTER (WHERE order_id IS NULL) * 100.0 / COUNT(*) as score,
(100 - COUNT(*) FILTER (WHERE order_id IS NULL) * 100.0 / COUNT(*)) >= 100 as passed
FROM orders;Quality Alert Query
-- Alert on failed checks
SELECT
r.table_name,
r.rule_name,
r.rule_type,
r.threshold as required_score,
qr.score as actual_score,
qr.check_timestamp
FROM data_quality_rules r
JOIN data_quality_results qr ON r.rule_id = qr.rule_id
WHERE qr.check_timestamp = (
SELECT MAX(check_timestamp)
FROM data_quality_results qr2
WHERE qr2.rule_id = qr.rule_id
)
AND qr.passed = FALSE
ORDER BY qr.score ASC;9. Data Profiling
Column Statistics Summary
-- Numeric column profiling
SELECT
'amount' as column_name,
COUNT(*) as total_count,
COUNT(DISTINCT amount) as distinct_count,
SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) as null_count,
MIN(amount) as min_value,
MAX(amount) as max_value,
ROUND(AVG(amount), 2) as avg_value,
ROUND(STDDEV(amount), 2) as stddev_value,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY amount) as median_value
FROM orders;Categorical Column Distribution
-- Top value distribution
WITH value_counts AS (
SELECT
status,
COUNT(*) as count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM orders
GROUP BY status
)
SELECT
status,
count,
percentage,
REPEAT('█', (percentage / 5)::INT) as bar
FROM value_counts
ORDER BY count DESC;Result:
10. Fixing Quality Issues
Handling NULL Values
-- Replace with default (backup recommended before UPDATE)
UPDATE orders
SET shipping_fee = 0
WHERE shipping_fee IS NULL;
-- Pull value from another table
UPDATE orders o
SET user_email = u.email
FROM users u
WHERE o.user_id = u.user_id
AND o.user_email IS NULL;Removing Duplicates
-- Keep only the most recent among duplicates
WITH duplicates AS (
SELECT
order_id,
ROW_NUMBER() OVER (
PARTITION BY user_id, product_id, DATE_TRUNC('minute', created_at)
ORDER BY created_at DESC
) as rn
FROM orders
)
DELETE FROM orders
WHERE order_id IN (
SELECT order_id FROM duplicates WHERE rn > 1
);Flagging Invalid Data
-- Flag instead of delete
ALTER TABLE orders ADD COLUMN data_quality_flag VARCHAR(50);
UPDATE orders
SET data_quality_flag = 'INVALID_AMOUNT'
WHERE amount < 0 OR amount > 100000000;
UPDATE orders
SET data_quality_flag = 'FUTURE_DATE'
WHERE created_at > CURRENT_TIMESTAMP;Conclusion
Data Quality Management Key Points:
- Regular monitoring catches issues early
- Automated rules ensure consistent validation
- Quality score dashboards provide visibility
- Always backup before fixing issues
References
- DAMA - Data Quality Dimensions
- Great Expectations - Data Validation Framework
- dbt - Data Build Tool Testing