(Applies hypothesis testing from ch277; uses multiple testing correction from ch278)
1. What A/B Testing Is¶
An A/B test is a controlled experiment in which users are randomly assigned to one of two variants and a metric is measured on each group. It is hypothesis testing applied to live systems with three practical complications:
You cannot always wait until the experiment is over before looking at results (peeking problem)
You often run many experiments simultaneously (multiple comparisons)
The metric of interest may not be normally distributed
2. The Standard A/B Test¶
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
rng = np.random.default_rng(42)
def two_proportion_z_test(
    control: np.ndarray,
    treatment: np.ndarray,
    alpha: float = 0.05,
) -> dict:
    """
    Two-sided two-proportion z-test.

    Parameters
    ----------
    control, treatment : binary (0/1) arrays, one entry per user.
    alpha : significance level for the test and the (1 - alpha) CI.

    Returns
    -------
    dict with group rates, relative lift (%), absolute difference and its
    CI (unpooled SE), z statistic, p-value, and a significance flag.

    Raises
    ------
    ValueError if either group is empty.
    """
    n_c, n_t = len(control), len(treatment)
    if n_c == 0 or n_t == 0:
        raise ValueError("both groups must be non-empty")
    p_c, p_t = control.mean(), treatment.mean()
    # Pooled SE under H0 (p_c == p_t) -- used for the test statistic.
    p_pool = (control.sum() + treatment.sum()) / (n_c + n_t)
    se = np.sqrt(p_pool * (1 - p_pool) * (1/n_c + 1/n_t))
    diff = p_t - p_c
    # Degenerate data (all 0s or all 1s overall) makes se == 0; report
    # "no evidence" instead of a NaN from 0/0.
    if se == 0:
        z, p_val = 0.0, 1.0
    else:
        z = diff / se
        p_val = 2 * stats.norm.sf(abs(z))
    # Unpooled SE for the CI of the difference (does not assume H0).
    se_diff = np.sqrt(p_c*(1-p_c)/n_c + p_t*(1-p_t)/n_t)
    z_crit = stats.norm.ppf(1 - alpha/2)
    # Relative lift is undefined for a zero baseline; report NaN, not a crash.
    lift_pct = (diff / p_c * 100) if p_c > 0 else float('nan')
    return {
        'control_rate': p_c,
        'treatment_rate': p_t,
        'lift_pct': lift_pct,
        'diff': diff,
        'ci_lower': diff - z_crit * se_diff,
        'ci_upper': diff + z_crit * se_diff,
        'z_stat': z,
        'p_value': p_val,
        'significant': p_val < alpha,
    }
# Experiment: new checkout UI, true effect +1.5pp (12.0% -> 13.5%).
n_per_group = 2000
control_conv = rng.binomial(1, 0.120, n_per_group)
treatment_conv = rng.binomial(1, 0.135, n_per_group)
result = two_proportion_z_test(control_conv, treatment_conv)

print("=== A/B Test Report ===")
for k, v in result.items():
    if isinstance(v, float):
        # Signed 4-dp format for effect-size fields; plain 6-dp otherwise.
        if 'pct' in k or 'diff' in k or 'ci' in k:
            print(f" {k:<20}: {v:+.4f}")
        else:
            print(f" {k:<20}: {v:.6f}")
    else:
        print(f" {k:<20}: {v}")
control_rate : 0.123000
treatment_rate : 0.134000
lift_pct : +8.9431
diff : +0.0110
ci_lower : -0.0097
ci_upper : +0.0317
z_stat : 1.039459
p_value : 0.298591
significant : False
3. Sample Size Planning¶
def required_sample_size(
    baseline_rate: float,
    mde: float,
    alpha: float = 0.05,
    power: float = 0.80,
) -> int:
    """Per-group sample size for a two-sided two-proportion z-test.

    Parameters
    ----------
    baseline_rate : control conversion rate p1, strictly in (0, 1).
    mde : minimum detectable effect, absolute (p2 = p1 + mde); must be
          nonzero and keep p2 strictly in (0, 1).
    alpha : two-sided significance level.
    power : desired power (1 - beta).

    Returns
    -------
    int : required sample size per group, rounded up.

    Raises
    ------
    ValueError for a zero effect or rates outside (0, 1).
    """
    p1 = baseline_rate
    p2 = baseline_rate + mde
    if mde == 0:
        raise ValueError("mde must be nonzero")
    if not (0 < p1 < 1) or not (0 < p2 < 1):
        raise ValueError("both rates must lie strictly in (0, 1)")
    z_a = stats.norm.ppf(1 - alpha / 2)
    z_b = stats.norm.ppf(power)
    # Standard formula: pooled variance under H0, unpooled under H1.
    p_bar = (p1 + p2) / 2
    num = (z_a * np.sqrt(2 * p_bar * (1 - p_bar)) +
           z_b * np.sqrt(p1 * (1 - p1) + p2 * (1 - p2)))**2
    return int(np.ceil(num / (p2 - p1)**2))
baseline = 0.12
effects = [0.005, 0.01, 0.015, 0.02, 0.03, 0.05]

print(f"Baseline={baseline:.2f} | α=0.05 | power=0.80")
print(f"{'MDE':>7} {'n/group':>10} {'n total':>10} {'Weeks @1k/day':>14}")
print("-" * 50)
for mde in effects:
    n = required_sample_size(baseline, mde)
    # Total sample (both groups) at ~1,000 eligible users/day, 7 days/week.
    weeks = 2 * n / (1000 * 7)
    print(f"{mde:>7.3f} {n:>10,d} {2*n:>10,d} {weeks:>14.1f}")
MDE n/group n total Weeks @1k/day
--------------------------------------------------
0.005 67,496 134,992 19.3
0.010 17,169 34,338 4.9
0.015 7,761 15,522 2.2
0.020 4,438 8,876 1.3
0.030 2,036 4,072 0.6
0.050 778 1,556 0.2
4. The Peeking Problem¶
# Simulate A/A tests (H0 true) to compare one final look against peeking
# at every interim checkpoint.
n_simulations = 3000
max_n = 500
check_points = np.arange(50, max_n + 1, 10)

fp_end = 0  # false positives when testing once, at n = max_n
fp_any = 0  # false positives when testing at every checkpoint
for _ in range(n_simulations):
    a = rng.binomial(1, 0.12, max_n)
    b = rng.binomial(1, 0.12, max_n)  # H0 true: identical rates
    _, p_end = stats.ttest_ind(a, b)
    if p_end < 0.05:
        fp_end += 1
    # Peeking: stop at the FIRST checkpoint whose p-value crosses 0.05.
    if any(stats.ttest_ind(a[:n], b[:n]).pvalue < 0.05 for n in check_points):
        fp_any += 1

print("Under H0 (no real effect):")
print(f" Check at end only: FP rate = {fp_end/n_simulations:.3f} (nominal 0.05)")
print(f" Peek at every step: FP rate = {fp_any/n_simulations:.3f} <- severely inflated")
print()
print("Fix: pre-register n and check ONCE. Or use sequential testing methods.")
Check at end only: FP rate = 0.054 (nominal 0.05)
Peek at every step: FP rate = 0.269 <- severely inflated
Fix: pre-register n and check ONCE. Or use sequential testing methods.
5. Multiple Experiments — FDR Control¶
def bh_correction(pvals: np.ndarray, alpha: float = 0.05) -> np.ndarray:
    """Benjamini-Hochberg FDR correction."""
    m = len(pvals)
    # Step-up procedure: compare the sorted p-values against the ramp
    # of per-rank cutoffs (i/m) * alpha.
    order = np.argsort(pvals)
    sorted_p = pvals[order]
    cutoffs = alpha * np.arange(1, m + 1) / m
    passing = np.flatnonzero(sorted_p <= cutoffs)
    # Reject every hypothesis up to the LARGEST rank that passes, even if
    # some intermediate ranks did not.
    reject_sorted = np.zeros(m, dtype=bool)
    if passing.size:
        reject_sorted[: passing[-1] + 1] = True
    # Scatter the decisions back to the original p-value positions.
    decisions = np.zeros(m, dtype=bool)
    decisions[order] = reject_sorted
    return decisions
# 50 simultaneous experiments: 10 with a real +2pp effect, 40 null.
n_exp = 50
n_real = 10
effects = np.concatenate([np.full(n_real, 0.02), np.zeros(n_exp - n_real)])
truth = effects != 0
pvals = np.array([
    stats.ttest_ind(
        rng.binomial(1, 0.12, 1000),
        rng.binomial(1, 0.12 + eff, 1000)
    ).pvalue
    for eff in effects
])

naive_rej = pvals < 0.05            # no correction
bonf_rej = pvals < (0.05 / n_exp)   # Bonferroni: FWER control
bh_rej = bh_correction(pvals)       # Benjamini-Hochberg: FDR control

print(f"{'Method':<14} {'Rejected':>8} {'True Pos':>9} {'False Pos':>10}")
print("-" * 48)
for name, rej in [('Naive', naive_rej), ('Bonferroni', bonf_rej), ('B-H FDR', bh_rej)]:
    tp = int((rej & truth).sum())
    fp = int((rej & ~truth).sum())
    print(f"{name:<14} {int(rej.sum()):>8d} {tp:>9d} {fp:>10d}")
------------------------------------------------
Naive 3 3 0
Bonferroni 1 1 0
B-H FDR 1 1 0
6. What Comes Next¶
A/B testing gives a binary decision: reject or not. ch286 — Bayesian Statistics computes a posterior distribution over the effect size — more informative, naturally sequential, and does not require pre-registering sample size.