데이터 품질 체크 자동화: NULL, 중복, 일관성 검증 SQL 템플릿
SOTAAZ·

데이터 품질 체크를 SQL로: 신뢰할 수 있는 데이터 만들기
데이터 분석 전 필수! NULL, 중복, 범위, 일관성 체크를 SQL로 자동화하기.
TL;DR
- 데이터 품질: 분석 결과의 신뢰성을 결정하는 핵심 요소
- 체크 항목: 완전성, 유일성, 유효성, 일관성, 적시성
- 자동화: 정기적 품질 검증으로 문제 조기 발견
- 리포팅: 품질 점수 대시보드로 현황 파악
1. 왜 데이터 품질이 중요한가?
Garbage In, Garbage Out
매출 리포트: 1억 2천만원
실제 매출: 8천만원
→ NULL 값을 0으로 처리 안 함
→ 중복 주문이 포함됨
→ 취소 건이 제외 안 됨잘못된 데이터 → 잘못된 의사결정 → 비용 손실
데이터 품질의 5가지 차원
2. 완전성 체크 (NULL 검사)
테이블별 NULL 비율 확인
-- 각 컬럼별 NULL 비율
SELECT
'user_id' as column_name,
COUNT(*) as total_rows,
SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) as null_count,
ROUND(SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as null_pct
FROM orders
UNION ALL
SELECT
'email',
COUNT(*),
SUM(CASE WHEN email IS NULL THEN 1 ELSE 0 END),
ROUND(SUM(CASE WHEN email IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2)
FROM orders
UNION ALL
SELECT
'amount',
COUNT(*),
SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END),
ROUND(SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2)
FROM orders
ORDER BY null_pct DESC;결과:
동적 NULL 체크 (PostgreSQL)
-- 모든 컬럼 자동 검사 (information_schema 활용)
DO $$
DECLARE
col RECORD;
null_count INTEGER;
total_count INTEGER;
BEGIN
SELECT COUNT(*) INTO total_count FROM orders;
FOR col IN
SELECT column_name
FROM information_schema.columns
WHERE table_name = 'orders'
LOOP
EXECUTE format('SELECT COUNT(*) FROM orders WHERE %I IS NULL', col.column_name)
INTO null_count;
RAISE NOTICE '% : % / % (%.2f%%)',
col.column_name, null_count, total_count,
null_count * 100.0 / total_count;
END LOOP;
END $$;빈 문자열도 체크
-- NULL + 빈 문자열 + 공백만 있는 경우
SELECT
COUNT(*) as total,
SUM(CASE
WHEN email IS NULL
OR TRIM(email) = ''
THEN 1 ELSE 0
END) as empty_count,
ROUND(SUM(CASE
WHEN email IS NULL
OR TRIM(email) = ''
THEN 1 ELSE 0
END) * 100.0 / COUNT(*), 2) as empty_pct
FROM users;3. 유일성 체크 (중복 검사)
PK 중복 확인
-- order_id가 유일해야 하는 경우
SELECT
order_id,
COUNT(*) as duplicate_count
FROM orders
GROUP BY order_id
HAVING COUNT(*) > 1
ORDER BY duplicate_count DESC;비즈니스 키 중복
-- 같은 사용자가 같은 시간에 같은 상품을 주문 (중복 의심)
SELECT
user_id,
product_id,
DATE_TRUNC('minute', created_at) as order_minute,
COUNT(*) as count
FROM orders
GROUP BY user_id, product_id, DATE_TRUNC('minute', created_at)
HAVING COUNT(*) > 1
ORDER BY count DESC;중복률 리포트
WITH duplicate_analysis AS (
SELECT
order_id,
COUNT(*) as occurrence
FROM orders
GROUP BY order_id
)
SELECT
COUNT(*) as total_unique_ids,
SUM(occurrence) as total_rows,
SUM(CASE WHEN occurrence > 1 THEN occurrence ELSE 0 END) as duplicate_rows,
ROUND(
SUM(CASE WHEN occurrence > 1 THEN occurrence ELSE 0 END) * 100.0 / SUM(occurrence),
4
) as duplicate_pct
FROM duplicate_analysis;4. 유효성 체크 (범위/형식 검사)
숫자 범위 검사
-- 비정상적인 값 찾기
SELECT
'amount' as field,
COUNT(*) as invalid_count,
MIN(amount) as min_value,
MAX(amount) as max_value
FROM orders
WHERE amount < 0 OR amount > 100000000 -- 1억 초과
UNION ALL
SELECT
'quantity',
COUNT(*),
MIN(quantity),
MAX(quantity)
FROM order_items
WHERE quantity <= 0 OR quantity > 1000
UNION ALL
SELECT
'discount_rate',
COUNT(*),
MIN(discount_rate),
MAX(discount_rate)
FROM promotions
WHERE discount_rate < 0 OR discount_rate > 1; -- 0~100% 범위날짜 유효성 검사
-- 미래 날짜, 너무 오래된 날짜
SELECT
order_id,
created_at,
CASE
WHEN created_at > CURRENT_TIMESTAMP THEN 'FUTURE_DATE'
WHEN created_at < '2020-01-01' THEN 'TOO_OLD'
ELSE 'VALID'
END as date_status
FROM orders
WHERE created_at > CURRENT_TIMESTAMP
OR created_at < '2020-01-01';형식 검사 (정규식)
-- 이메일 형식 검사
SELECT
email,
CASE
WHEN email IS NULL THEN 'NULL'
WHEN email !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 'INVALID_FORMAT'
ELSE 'VALID'
END as email_status
FROM users
WHERE email IS NULL
OR email !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$';
-- 전화번호 형식 검사
SELECT
phone,
CASE
WHEN phone IS NULL THEN 'NULL'
WHEN phone !~ '^\d{2,3}-\d{3,4}-\d{4}$' THEN 'INVALID_FORMAT'
ELSE 'VALID'
END as phone_status
FROM users
WHERE phone IS NOT NULL
AND phone !~ '^\d{2,3}-\d{3,4}-\d{4}$';ENUM 값 검사
-- 허용된 값만 있는지 확인
SELECT
status,
COUNT(*) as count
FROM orders
WHERE status NOT IN ('pending', 'confirmed', 'shipped', 'delivered', 'cancelled')
GROUP BY status;
-- 허용 값 목록과 비교
WITH allowed_values AS (
SELECT unnest(ARRAY['pending', 'confirmed', 'shipped', 'delivered', 'cancelled']) as status
)
SELECT DISTINCT o.status as invalid_status
FROM orders o
LEFT JOIN allowed_values av ON o.status = av.status
WHERE av.status IS NULL;5. 일관성 체크 (논리적 정합성)
금액 일관성
-- 주문 금액 = 상품 금액 합계 - 할인 + 배송비
WITH order_calculated AS (
SELECT
o.order_id,
o.total_amount as recorded_amount,
COALESCE(SUM(oi.quantity * oi.unit_price), 0) as items_total,
COALESCE(o.discount_amount, 0) as discount,
COALESCE(o.shipping_fee, 0) as shipping
FROM orders o
LEFT JOIN order_items oi ON o.order_id = oi.order_id
GROUP BY o.order_id, o.total_amount, o.discount_amount, o.shipping_fee
)
SELECT
order_id,
recorded_amount,
items_total - discount + shipping as calculated_amount,
recorded_amount - (items_total - discount + shipping) as difference
FROM order_calculated
WHERE ABS(recorded_amount - (items_total - discount + shipping)) > 1 -- 1원 이상 차이
ORDER BY ABS(difference) DESC
LIMIT 100;날짜 순서 일관성
-- 주문일 < 결제일 < 배송일 < 완료일 순서 확인
SELECT
order_id,
order_date,
payment_date,
shipped_date,
delivered_date,
CASE
WHEN payment_date < order_date THEN 'PAYMENT_BEFORE_ORDER'
WHEN shipped_date < payment_date THEN 'SHIPPED_BEFORE_PAYMENT'
WHEN delivered_date < shipped_date THEN 'DELIVERED_BEFORE_SHIPPED'
ELSE 'VALID'
END as date_consistency
FROM orders
WHERE payment_date < order_date
OR shipped_date < payment_date
OR delivered_date < shipped_date;FK 참조 무결성
-- orders.user_id가 users 테이블에 존재하는지
SELECT
o.order_id,
o.user_id as orphan_user_id
FROM orders o
LEFT JOIN users u ON o.user_id = u.user_id
WHERE u.user_id IS NULL;
-- 존재하지 않는 product_id 참조
SELECT
oi.order_item_id,
oi.product_id as orphan_product_id
FROM order_items oi
LEFT JOIN products p ON oi.product_id = p.product_id
WHERE p.product_id IS NULL;상태 전이 일관성
-- 잘못된 상태 변경 찾기 (예: pending에서 바로 delivered)
WITH status_transitions AS (
SELECT
order_id,
status,
LAG(status) OVER (PARTITION BY order_id ORDER BY updated_at) as prev_status,
updated_at
FROM order_status_history
)
SELECT *
FROM status_transitions
WHERE prev_status IS NOT NULL
AND NOT (
(prev_status = 'pending' AND status IN ('confirmed', 'cancelled'))
OR (prev_status = 'confirmed' AND status IN ('shipped', 'cancelled'))
OR (prev_status = 'shipped' AND status IN ('delivered', 'returned'))
OR (prev_status = 'delivered' AND status = 'returned')
);6. 적시성 체크 (데이터 신선도)
최신 데이터 확인
-- 테이블별 마지막 업데이트 시간
SELECT
'orders' as table_name,
MAX(created_at) as latest_record,
CURRENT_TIMESTAMP - MAX(created_at) as data_lag
FROM orders
UNION ALL
SELECT
'events',
MAX(event_time),
CURRENT_TIMESTAMP - MAX(event_time)
FROM events
UNION ALL
SELECT
'users',
MAX(updated_at),
CURRENT_TIMESTAMP - MAX(updated_at)
FROM users
ORDER BY data_lag DESC;결과:
시간대별 데이터 유입 확인
-- 시간별 데이터 유입량 (갑자기 0이면 문제)
SELECT
DATE_TRUNC('hour', created_at) as hour,
COUNT(*) as record_count
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '3 days'
GROUP BY DATE_TRUNC('hour', created_at)
ORDER BY hour DESC;누락된 날짜 찾기
WITH RECURSIVE date_series AS (
SELECT DATE '2024-01-01' as date
UNION ALL
SELECT date + 1
FROM date_series
WHERE date < CURRENT_DATE - 1
),
daily_counts AS (
SELECT DATE(created_at) as date, COUNT(*) as cnt
FROM orders
WHERE created_at >= '2024-01-01'
GROUP BY DATE(created_at)
)
SELECT
ds.date as missing_date
FROM date_series ds
LEFT JOIN daily_counts dc ON ds.date = dc.date
WHERE dc.date IS NULL
ORDER BY ds.date;7. 종합 품질 리포트
테이블별 품질 점수
WITH completeness AS (
SELECT
'completeness' as dimension,
100 - ROUND(
(SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) +
SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) +
SUM(CASE WHEN created_at IS NULL THEN 1 ELSE 0 END)) * 100.0 /
(COUNT(*) * 3), 2
) as score
FROM orders
),
uniqueness AS (
SELECT
'uniqueness' as dimension,
100 - ROUND(
(SELECT COUNT(*) FROM (
SELECT order_id FROM orders GROUP BY order_id HAVING COUNT(*) > 1
) dups) * 100.0 / COUNT(*), 2
) as score
FROM orders
),
validity AS (
SELECT
'validity' as dimension,
100 - ROUND(
SUM(CASE
WHEN amount < 0 OR amount > 100000000
OR created_at > CURRENT_TIMESTAMP
THEN 1 ELSE 0
END) * 100.0 / COUNT(*), 2
) as score
FROM orders
),
timeliness AS (
SELECT
'timeliness' as dimension,
CASE
WHEN MAX(created_at) > CURRENT_TIMESTAMP - INTERVAL '1 hour' THEN 100
WHEN MAX(created_at) > CURRENT_TIMESTAMP - INTERVAL '1 day' THEN 80
WHEN MAX(created_at) > CURRENT_TIMESTAMP - INTERVAL '7 days' THEN 50
ELSE 0
END as score
FROM orders
)
SELECT * FROM completeness
UNION ALL SELECT * FROM uniqueness
UNION ALL SELECT * FROM validity
UNION ALL SELECT * FROM timeliness;결과:
일별 품질 추이
WITH daily_quality AS (
SELECT
DATE(created_at) as date,
COUNT(*) as total_records,
-- 완전성
ROUND(100 - SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as completeness,
-- 유효성
ROUND(100 - SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as validity
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
)
SELECT
date,
total_records,
completeness,
validity,
ROUND((completeness + validity) / 2, 2) as avg_quality_score
FROM daily_quality
ORDER BY date DESC;8. 자동화된 품질 체크
품질 규칙 테이블
-- 품질 규칙 정의 테이블
CREATE TABLE data_quality_rules (
rule_id SERIAL PRIMARY KEY,
table_name VARCHAR(100),
rule_name VARCHAR(200),
rule_type VARCHAR(50), -- completeness, uniqueness, validity, consistency
check_query TEXT,
threshold DECIMAL(5,2), -- 허용 기준 (예: 99.0 = 99% 이상)
is_active BOOLEAN DEFAULT TRUE
);
-- 규칙 예시 삽입
INSERT INTO data_quality_rules (table_name, rule_name, rule_type, check_query, threshold) VALUES
('orders', 'order_id_not_null', 'completeness',
'SELECT 100 - COUNT(*) FILTER (WHERE order_id IS NULL) * 100.0 / COUNT(*) FROM orders', 100),
('orders', 'order_id_unique', 'uniqueness',
'SELECT 100 - (SELECT COUNT(*) FROM (SELECT order_id FROM orders GROUP BY order_id HAVING COUNT(*) > 1) d) * 100.0 / COUNT(*) FROM orders', 100),
('orders', 'amount_positive', 'validity',
'SELECT 100 - COUNT(*) FILTER (WHERE amount < 0) * 100.0 / COUNT(*) FROM orders', 99.9);품질 체크 실행 및 결과 저장
-- 품질 체크 결과 테이블
CREATE TABLE data_quality_results (
result_id SERIAL PRIMARY KEY,
rule_id INT REFERENCES data_quality_rules(rule_id),
check_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
score DECIMAL(5,2),
passed BOOLEAN,
details JSONB
);
-- 실행 예시 (실제로는 프로시저나 스케줄러로 실행)
INSERT INTO data_quality_results (rule_id, score, passed)
SELECT
1 as rule_id,
100 - COUNT(*) FILTER (WHERE order_id IS NULL) * 100.0 / COUNT(*) as score,
(100 - COUNT(*) FILTER (WHERE order_id IS NULL) * 100.0 / COUNT(*)) >= 100 as passed
FROM orders;품질 알림 쿼리
-- 기준 미달 항목 알림
SELECT
r.table_name,
r.rule_name,
r.rule_type,
r.threshold as required_score,
qr.score as actual_score,
qr.check_timestamp
FROM data_quality_rules r
JOIN data_quality_results qr ON r.rule_id = qr.rule_id
WHERE qr.check_timestamp = (
SELECT MAX(check_timestamp)
FROM data_quality_results qr2
WHERE qr2.rule_id = qr.rule_id
)
AND qr.passed = FALSE
ORDER BY qr.score ASC;9. 데이터 프로파일링
컬럼 통계 요약
-- 숫자 컬럼 프로파일링
SELECT
'amount' as column_name,
COUNT(*) as total_count,
COUNT(DISTINCT amount) as distinct_count,
SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) as null_count,
MIN(amount) as min_value,
MAX(amount) as max_value,
ROUND(AVG(amount), 2) as avg_value,
ROUND(STDDEV(amount), 2) as stddev_value,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY amount) as median_value
FROM orders;카테고리 컬럼 분포
-- 상위 값 분포
WITH value_counts AS (
SELECT
status,
COUNT(*) as count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM orders
GROUP BY status
)
SELECT
status,
count,
percentage,
REPEAT('█', (percentage / 5)::INT) as bar
FROM value_counts
ORDER BY count DESC;결과:
10. 품질 이슈 수정
NULL 값 처리
-- 기본값으로 대체 (UPDATE 전 백업 권장)
UPDATE orders
SET shipping_fee = 0
WHERE shipping_fee IS NULL;
-- 다른 테이블에서 값 가져오기
UPDATE orders o
SET user_email = u.email
FROM users u
WHERE o.user_id = u.user_id
AND o.user_email IS NULL;중복 제거
-- 중복 중 최신 것만 남기기
WITH duplicates AS (
SELECT
order_id,
ROW_NUMBER() OVER (
PARTITION BY user_id, product_id, DATE_TRUNC('minute', created_at)
ORDER BY created_at DESC
) as rn
FROM orders
)
DELETE FROM orders
WHERE order_id IN (
SELECT order_id FROM duplicates WHERE rn > 1
);이상값 플래그
-- 삭제 대신 플래그로 표시
ALTER TABLE orders ADD COLUMN data_quality_flag VARCHAR(50);
UPDATE orders
SET data_quality_flag = 'INVALID_AMOUNT'
WHERE amount < 0 OR amount > 100000000;
UPDATE orders
SET data_quality_flag = 'FUTURE_DATE'
WHERE created_at > CURRENT_TIMESTAMP;결론
데이터 품질 관리 핵심:
- 정기적 모니터링으로 문제 조기 발견
- 자동화된 규칙으로 일관된 검증
- 품질 점수 대시보드로 현황 파악
- 이슈 수정 전 반드시 백업
References
- DAMA - Data Quality Dimensions
- Great Expectations - Data Validation Framework
- dbt - Data Build Tool Testing