Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

(Applies hypothesis testing from ch277; uses multiple testing correction from ch278)

1. What A/B Testing Is

An A/B test is a controlled experiment in which users are randomly assigned to one of two variants and a metric is measured on each group. It is hypothesis testing applied to live systems with three practical complications:

  1. Results are often examined before the experiment ends, and stopping the moment a result looks significant inflates the false-positive rate (the peeking problem)

  2. You often run many experiments simultaneously (multiple comparisons)

  3. The metric of interest may not be normally distributed


2. The Standard A/B Test

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Fixed seed so every simulation in this chapter is reproducible.
rng = np.random.default_rng(42)


def two_proportion_z_test(
    control: np.ndarray,
    treatment: np.ndarray,
    alpha: float = 0.05,
) -> dict:
    """
    Two-sided two-proportion z-test.
    Inputs are binary (0/1) arrays.
    """
    size_c = len(control)
    size_t = len(treatment)
    rate_c = control.mean()
    rate_t = treatment.mean()
    delta = rate_t - rate_c

    # Test statistic uses the pooled rate, i.e. the common rate under H0.
    pooled = (control.sum() + treatment.sum()) / (size_c + size_t)
    se_pooled = np.sqrt(pooled * (1 - pooled) * (1 / size_c + 1 / size_t))
    z_stat = delta / se_pooled
    p_value = 2 * stats.norm.sf(abs(z_stat))

    # Confidence interval for the difference uses the unpooled standard error.
    se_unpooled = np.sqrt(rate_c * (1 - rate_c) / size_c
                          + rate_t * (1 - rate_t) / size_t)
    half_width = stats.norm.ppf(1 - alpha / 2) * se_unpooled

    return {
        'control_rate':   rate_c,
        'treatment_rate': rate_t,
        'lift_pct':       delta / rate_c * 100,
        'diff':           delta,
        'ci_lower':       delta - half_width,
        'ci_upper':       delta + half_width,
        'z_stat':         z_stat,
        'p_value':        p_value,
        'significant':    p_value < alpha,
    }


# Experiment: new checkout UI, true effect +1.5pp
n_per_group    = 2000
control_conv   = rng.binomial(1, 0.120, n_per_group)
treatment_conv = rng.binomial(1, 0.135, n_per_group)

result = two_proportion_z_test(control_conv, treatment_conv)

print("=== A/B Test Report ===")
for key, value in result.items():
    if not isinstance(value, float):
        # Non-float entries (e.g. the 'significant' flag) print verbatim.
        print(f"  {key:<20}: {value}")
    elif any(tag in key for tag in ('pct', 'diff', 'ci')):
        # Signed format for lift / differences / interval bounds.
        print(f"  {key:<20}: {value:+.4f}")
    else:
        print(f"  {key:<20}: {value:.6f}")
=== A/B Test Report ===
  control_rate        : 0.123000
  treatment_rate      : 0.134000
  lift_pct            : +8.9431
  diff                : +0.0110
  ci_lower            : -0.0097
  ci_upper            : +0.0317
  z_stat              : 1.039459
  p_value             : 0.298591
  significant         : False

3. Sample Size Planning

def required_sample_size(
    baseline_rate: float,
    mde: float,
    alpha: float = 0.05,
    power: float = 0.80,
) -> int:
    """Per-group sample size for two-proportion z-test."""
    rate_a = baseline_rate
    rate_b = baseline_rate + mde
    # Critical values for the two-sided test and the desired power.
    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_power = stats.norm.ppf(power)
    avg_rate = (rate_a + rate_b) / 2
    numerator = (
        z_alpha * np.sqrt(2 * avg_rate * (1 - avg_rate))
        + z_power * np.sqrt(rate_a * (1 - rate_a) + rate_b * (1 - rate_b))
    ) ** 2
    # Round up: sample size must be an integer that meets the power target.
    return int(np.ceil(numerator / (rate_b - rate_a) ** 2))


baseline = 0.12
effects  = [0.005, 0.01, 0.015, 0.02, 0.03, 0.05]

print(f"Baseline={baseline:.2f} | α=0.05 | power=0.80")
print(f"{'MDE':>7}  {'n/group':>10}  {'n total':>10}  {'Weeks @1k/day':>14}")
print("-" * 50)
for effect in effects:
    per_group = required_sample_size(baseline, effect)
    total = 2 * per_group
    # Calendar time assuming 1,000 users/day split across both arms.
    duration_weeks = total / (1000 * 7)
    print(f"{effect:>7.3f}  {per_group:>10,d}  {total:>10,d}  {duration_weeks:>14.1f}")
Baseline=0.12 | α=0.05 | power=0.80
    MDE     n/group     n total   Weeks @1k/day
--------------------------------------------------
  0.005      67,496     134,992            19.3
  0.010      17,169      34,338             4.9
  0.015       7,761      15,522             2.2
  0.020       4,438       8,876             1.3
  0.030       2,036       4,072             0.6
  0.050         778       1,556             0.2

4. The Peeking Problem

n_simulations = 3000
max_n         = 500
check_points  = np.arange(50, max_n + 1, 10)

fp_end = 0
fp_any = 0

for _ in range(n_simulations):
    # Both arms draw from the same rate, so any rejection is a false positive.
    sample_a = rng.binomial(1, 0.12, max_n)
    sample_b = rng.binomial(1, 0.12, max_n)  # H0 true

    if stats.ttest_ind(sample_a, sample_b).pvalue < 0.05:
        fp_end += 1

    # any() short-circuits at the first "significant" interim look,
    # mimicking an experimenter who stops as soon as p < 0.05.
    if any(stats.ttest_ind(sample_a[:n], sample_b[:n]).pvalue < 0.05
           for n in check_points):
        fp_any += 1

print("Under H0 (no real effect):")
print(f"  Check at end only:  FP rate = {fp_end/n_simulations:.3f}  (nominal 0.05)")
print(f"  Peek at every step: FP rate = {fp_any/n_simulations:.3f}  <- severely inflated")
print()
print("Fix: pre-register n and check ONCE. Or use sequential testing methods.")
Under H0 (no real effect):
  Check at end only:  FP rate = 0.054  (nominal 0.05)
  Peek at every step: FP rate = 0.269  <- severely inflated

Fix: pre-register n and check ONCE. Or use sequential testing methods.

5. Multiple Experiments — FDR Control

def bh_correction(pvals: np.ndarray, alpha: float = 0.05) -> np.ndarray:
    """Benjamini-Hochberg FDR correction."""
    m = len(pvals)
    order = np.argsort(pvals)
    sorted_p = pvals[order]
    # Step-up thresholds: the k-th smallest p-value is compared to (k/m)*alpha.
    cutoffs = (np.arange(1, m + 1) / m) * alpha
    passing = sorted_p <= cutoffs
    reject = np.zeros(m, dtype=bool)
    if passing.any():
        # Reject every hypothesis up to the LARGEST rank that passes,
        # even if some smaller ranks individually failed.
        last_pass = int(np.flatnonzero(passing)[-1])
        reject[order[:last_pass + 1]] = True
    return reject


n_exp  = 50
n_real = 10
effects = np.concatenate([np.full(n_real, 0.02), np.zeros(n_exp - n_real)])
truth   = effects != 0

# One simulated experiment per hypothesis; only the first n_real have a lift.
collected = []
for eff in effects:
    arm_a = rng.binomial(1, 0.12, 1000)
    arm_b = rng.binomial(1, 0.12 + eff, 1000)
    collected.append(stats.ttest_ind(arm_a, arm_b).pvalue)
pvals = np.array(collected)

naive_rej = pvals < 0.05
bonf_rej  = pvals < (0.05 / n_exp)
bh_rej    = bh_correction(pvals)

print(f"{'Method':<14}  {'Rejected':>8}  {'True Pos':>9}  {'False Pos':>10}")
print("-" * 48)
for name, rej in (('Naive', naive_rej), ('Bonferroni', bonf_rej), ('B-H FDR', bh_rej)):
    true_pos  = int((rej & truth).sum())
    false_pos = int((rej & ~truth).sum())
    print(f"{name:<14}  {int(rej.sum()):>8d}  {true_pos:>9d}  {false_pos:>10d}")
Method          Rejected   True Pos   False Pos
------------------------------------------------
Naive                  3          3           0
Bonferroni             1          1           0
B-H FDR                1          1           0

6. What Comes Next

A/B testing gives a binary decision: reject or not. ch286 — Bayesian Statistics computes a posterior distribution over the effect size — more informative, naturally sequential, and does not require pre-registering sample size.