Part VIII — Probability
Generating functions are one of probability’s most elegant tools: they encode an entire distribution inside a single function, turning convolution into multiplication and moment computation into differentiation. This chapter builds the machinery from first principles and shows where it surfaces in statistics and ML.
1. Probability Generating Functions (PGFs)¶
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson, binom
# For a discrete rv X with P(X=k) = p_k, the PGF is G(z) = E[z^X] = sum_k p_k z^k
# Evaluated at z=1: G(1) = 1 (normalization)
# G'(1) = E[X] (first moment)
# G''(1) = E[X(X-1)] (factorial moment)
def pgf_bernoulli(z, p):
"""G(z) = (1-p) + p*z"""
return (1 - p) + p * z
def pgf_binomial(z, n, p):
"""G(z) = ((1-p) + p*z)^n — product of n Bernoulli PGFs"""
return ((1 - p) + p * z) ** n
def pgf_poisson(z, lam):
"""G(z) = exp(lambda*(z-1))"""
return np.exp(lam * (z - 1))
z = np.linspace(0, 1, 300)
fig, ax = plt.subplots(figsize=(9, 4))
ax.plot(z, pgf_binomial(z, 10, 0.3), label='Binomial(10, 0.3)')
ax.plot(z, pgf_poisson(z, 3.0), label='Poisson(3)')
ax.plot(z, pgf_bernoulli(z, 0.4), label='Bernoulli(0.4)')
ax.axvline(1, color='k', lw=0.8, ls='--', label='z = 1')
ax.set_xlabel('z'); ax.set_ylabel('G(z)')
ax.set_title('Probability Generating Functions'); ax.legend(); ax.grid(alpha=0.3)
plt.tight_layout(); plt.show()
print('G_binom(1) =', pgf_binomial(1, 10, 0.3)) # must be 1
print('G_pois(1) =', pgf_poisson(1, 3.0))

G_binom(1) = 1.0
G_pois(1) = 1.0
2. Extracting Moments from PGFs¶
# E[X] = G'(1), Var[X] = G''(1) + G'(1) - [G'(1)]^2
# Use numerical differentiation so the pattern is clear
def numerical_deriv(f, z, h=1e-5):
return (f(z + h) - f(z - h)) / (2 * h)
def numerical_deriv2(f, z, h=1e-5):
return (f(z + h) - 2*f(z) + f(z - h)) / h**2
for name, G_fn in [
('Binomial(10,0.3)', lambda z: pgf_binomial(z, 10, 0.3)),
('Poisson(3)', lambda z: pgf_poisson(z, 3.0)),
]:
g1 = numerical_deriv(G_fn, 1.0 - 1e-4) # approach from left to stay in domain
g2 = numerical_deriv2(G_fn, 1.0 - 1e-4)
mean_pgf = g1
var_pgf = g2 + g1 - g1**2
print(f'{name}: E[X] ≈ {mean_pgf:.4f}, Var[X] ≈ {var_pgf:.4f}')
print()
print('Reference — Binomial(10,0.3): E=3.0, Var=2.1')
print('Reference — Poisson(3): E=3.0, Var=3.0')
Binomial(10,0.3): E[X] ≈ 2.9992, Var[X] ≈ 2.1021
Poisson(3): E[X] ≈ 2.9991, Var[X] ≈ 3.0018
Reference — Binomial(10,0.3): E=3.0, Var=2.1
Reference — Poisson(3): E=3.0, Var=3.0
3. Moment Generating Functions (MGFs)¶
# M_X(t) = E[e^{tX}]
# For continuous distributions this is more natural than PGF.
# E[X^k] = M^(k)(0) — k-th derivative at t=0
def mgf_normal(t, mu, sigma):
return np.exp(mu * t + 0.5 * sigma**2 * t**2)
def mgf_exponential(t, lam):
# Defined only for t < lambda
valid = t < lam
out = np.where(valid, lam / (lam - t), np.nan)
return out
t = np.linspace(-2, 2, 400)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(t, mgf_normal(t, 0, 1), label='N(0,1)')
axes[0].plot(t, mgf_normal(t, 1, 2), label='N(1,2)')
axes[0].set_title('MGF — Normal'); axes[0].legend(); axes[0].grid(alpha=0.3)
axes[0].set_ylim(0, 20)
t_exp = np.linspace(-2, 0.9, 400)
axes[1].plot(t_exp, mgf_exponential(t_exp, 1.0), label='Exp(1)')
axes[1].set_title('MGF — Exponential(1)'); axes[1].legend(); axes[1].grid(alpha=0.3)
plt.suptitle('Moment Generating Functions'); plt.tight_layout(); plt.show()
# Extract moments: M'(0) = E[X], M''(0) = E[X^2]
h = 1e-5
mu, sigma = 2.0, 3.0
M = lambda t: mgf_normal(t, mu, sigma)
mean_mgf = numerical_deriv(M, 0.0)
ex2 = numerical_deriv2(M, 0.0) + numerical_deriv(M, 0.0) # second raw moment needs care
var_mgf = numerical_deriv2(M, 0.0) # for MGF: Var = M''(0) - [M'(0)]^2
print(f'Normal({mu},{sigma}): E[X] from MGF ≈ {mean_mgf:.4f} (true {mu})')
print(f'Normal({mu},{sigma}): Var from MGF ≈ {var_mgf:.4f} (true {sigma**2})')

Normal(2.0,3.0): E[X] from MGF ≈ 2.0000 (true 2.0)
Normal(2.0,3.0): Var from MGF ≈ 13.0000 (true 9.0)
4. MGF Convolution — Why Independence Makes Products¶
# If X and Y are independent, M_{X+Y}(t) = M_X(t) * M_Y(t)
# This is the MGF equivalent of the convolution theorem.
# Direct consequence: sum of independent normals is normal.
# Demonstrate numerically: X~N(1,1), Y~N(2,1), X+Y~N(3,sqrt(2))
rng = np.random.default_rng(42)
X = rng.normal(1, 1, 100_000)
Y = rng.normal(2, 1, 100_000)
Z = X + Y
t_vals = np.array([-1.0, -0.5, 0.0, 0.5, 1.0])
print(f'{'t':>6} {'M_X*M_Y':>12} {'M_{X+Y} (exact)':>16} {'match':>6}')
for t in t_vals:
product = mgf_normal(t, 1, 1) * mgf_normal(t, 2, 1)
exact = mgf_normal(t, 3, np.sqrt(2))
print(f'{t:6.2f} {product:12.6f} {exact:16.6f} {abs(product-exact)<1e-10!s:>6}')
# Visualize empirical Z vs N(3, sqrt(2))
xs = np.linspace(-2, 8, 300)
from scipy.stats import norm
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(Z, bins=80, density=True, alpha=0.5, label='Empirical X+Y')
ax.plot(xs, norm.pdf(xs, 3, np.sqrt(2)), 'r-', lw=2, label='N(3, √2) predicted')
ax.set_title('Sum of Two Normals — MGF Convolution Confirmed')
ax.legend(); ax.grid(alpha=0.3); plt.tight_layout(); plt.show()
t M_X*M_Y M_{X+Y} (exact) match
-1.00 0.135335 0.135335 True
-0.50 0.286505 0.286505 True
0.00 1.000000 1.000000 True
0.50 5.754603 5.754603 True
1.00 54.598150 54.598150 True

5. Characteristic Functions — MGF Without Convergence Worries¶
# phi_X(t) = E[e^{itX}] (i = imaginary unit)
# Always exists; |phi(t)| <= 1.
# CLT proof: phi_{S_n}(t) -> exp(-t^2/2)
# For Normal(mu, sigma): phi(t) = exp(i*mu*t - sigma^2*t^2/2)
# For Cauchy: phi(t) = exp(-|t|) — no finite moments, MGF doesn't exist
t_arr = np.linspace(-5, 5, 1000)
def char_fn_normal(t, mu, sigma):
return np.exp(1j * mu * t - 0.5 * sigma**2 * t**2)
def char_fn_cauchy(t):
return np.exp(-np.abs(t))
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for mu, sigma, col in [(0,1,'C0'), (2,0.5,'C1')]:
phi = char_fn_normal(t_arr, mu, sigma)
axes[0].plot(t_arr, np.real(phi), color=col, label=f'Re, N({mu},{sigma})')
axes[0].plot(t_arr, np.imag(phi), color=col, ls='--', alpha=0.6)
axes[0].set_title('Characteristic Function — Normal'); axes[0].legend(); axes[0].grid(alpha=0.3)
phi_c = char_fn_cauchy(t_arr)
axes[1].plot(t_arr, np.real(phi_c), label='Cauchy Re (=exp(-|t|))')
axes[1].set_title('Characteristic Function — Cauchy'); axes[1].legend(); axes[1].grid(alpha=0.3)
plt.suptitle('Characteristic Functions'); plt.tight_layout(); plt.show()
# CLT via characteristic functions
# phi_{mean of n iid X}(t) = [phi_X(t/sqrt(n))]^n -> exp(-t^2/2)
print('CLT characteristic function convergence:')
phi_X = lambda t: char_fn_normal(t, 0, 1) # standardized
for n in [1, 5, 20, 100]:
t0 = 1.0
approx = phi_X(t0 / np.sqrt(n)) ** n
exact = np.exp(-t0**2 / 2)
print(f' n={n:4d}: phi_Sn({t0}) = {approx:.6f}+{approx.imag:.6f}j (exact {exact:.6f})')

CLT characteristic function convergence:
n= 1: phi_Sn(1.0) = 0.606531+0.000000j+0.000000j (exact 0.606531)
n= 5: phi_Sn(1.0) = 0.606531+0.000000j+0.000000j (exact 0.606531)
n= 20: phi_Sn(1.0) = 0.606531+0.000000j+0.000000j (exact 0.606531)
n= 100: phi_Sn(1.0) = 0.606531+0.000000j+0.000000j (exact 0.606531)
6. Summary¶
| Transform | Domain | Key property |
|---|---|---|
| PGF G(z) | Discrete | G^(k)(0)/k! = P(X=k) |
| MGF M(t) | Continuous/discrete | M^(k)(0) = E[X^k] |
| Characteristic φ(t) | Any | Always exists; |
Convolution → multiplication: independence turns the hard operation into the easy one.
Moment extraction: differentiate at 0.
CLT: characteristic functions make the proof rigorous (ch254 — Central Limit Theorem).
(introduced in ch249 — Expected Value, ch250 — Variance)
7. Forward References¶
ch271 — Data and Measurement: sampling distributions derived from MGF arguments.
ch280 — Hypothesis Testing: test statistics whose distributions follow from characteristic functions.
ch295 — Information Theory: entropy as the log of a normalizing constant in exponential families — same transform machinery.