(Synthesizes entropy from ch288 and KL divergence from ch289; connects to feature selection in ch291)
1. Definition¶
Mutual Information (MI) measures how much knowing $Y$ reduces uncertainty about $X$: $I(X;Y) = H(X) - H(X \mid Y) = H(Y) - H(Y \mid X)$
Equivalently: $I(X;Y) = \sum_{x,y} p(x,y)\,\log\frac{p(x,y)}{p(x)\,p(y)} = D_{\mathrm{KL}}\big(p(x,y)\,\|\,p(x)\,p(y)\big)$
It is zero iff $X$ and $Y$ are independent, and positive otherwise. Unlike correlation (ch280), MI detects any statistical dependence, not just linear.
2. Discrete Mutual Information¶
import numpy as np
import matplotlib.pyplot as plt

# Fixed seed so every run reproduces the same samples, figures, and printed numbers.
rng = np.random.default_rng(42)
def entropy_discrete(p: np.ndarray) -> float:
    """Shannon entropy H(p) = -sum p*log(p) in nats, skipping zero-probability entries."""
    probs = np.asarray(p, dtype=float)
    nonzero = probs[probs > 0]  # 0*log(0) is taken as 0 by convention
    return -(nonzero * np.log(nonzero)).sum()
def mutual_information_discrete(
    x: np.ndarray,
    y: np.ndarray,
) -> float:
    """
    Estimate MI from samples using joint frequency counts.

    x, y: 1D integer arrays of equal length.
    Returns MI in nats, via the identity I(X;Y) = H(X) + H(Y) - H(X,Y).
    Raises ValueError if x and y differ in length (the previous zip-based
    loop silently truncated to the shorter array while still dividing the
    counts by len(x), yielding a wrong distribution).
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if len(x) != len(y):
        raise ValueError("x and y must have equal length")
    # Map raw values to contiguous codes 0..k-1 in one C-level pass.
    x_vals, x_codes = np.unique(x, return_inverse=True)
    y_vals, y_codes = np.unique(y, return_inverse=True)
    # Accumulate joint counts without a Python loop; np.add.at handles
    # repeated (row, col) index pairs correctly (unlike fancy-index +=).
    joint = np.zeros((len(x_vals), len(y_vals)))
    np.add.at(joint, (x_codes, y_codes), 1)
    joint /= len(x)  # counts -> empirical joint distribution
    p_x = joint.sum(axis=1)
    p_y = joint.sum(axis=0)
    h_x = entropy_discrete(p_x)
    h_y = entropy_discrete(p_y)
    h_xy = entropy_discrete(joint.flatten())
    return h_x + h_y - h_xy
# Compare MI vs Pearson r across relationship types
n = 1000
x_cont = rng.normal(0, 1, n)
# Four targets built from the same x plus noise: only 'Linear' is linear in x,
# and 'Independent' is generated without reference to x at all.
relationships = {
    'Linear': x_cont * 2 + rng.normal(0, 0.5, n),
    'Quadratic': x_cont**2 + rng.normal(0, 0.5, n),
    'Sinusoidal': np.sin(3 * x_cont) + rng.normal(0, 0.2, n),
    'Independent': rng.normal(0, 1, n),
}
# Discretize for MI estimate
def discretize(arr, bins=20):
return np.digitize(arr, np.percentile(arr, np.linspace(0, 100, bins)))
x_disc = discretize(x_cont)  # discretize x once; reused for every panel
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, (name, y) in zip(axes, relationships.items()):
    r = np.corrcoef(x_cont, y)[0, 1]  # Pearson r: captures linear dependence only
    mi = mutual_information_discrete(x_disc, discretize(y))  # MI on binned data
    ax.scatter(x_cont, y, s=5, alpha=0.3, color='steelblue')
    ax.set_title(f'{name}\nPearson r={r:+.3f}\nMI={mi:.3f} nats')
    ax.set_xlabel('X')
plt.suptitle('Mutual Information detects non-linear dependence; Pearson r does not', fontsize=11)
plt.tight_layout()
plt.show()

3. Normalized Mutual Information¶
def normalized_mi(
    x: np.ndarray,
    y: np.ndarray,
    method: str = 'arithmetic',
) -> float:
    """
    Normalize MI to [0, 1].

    method='arithmetic': NMI = 2*MI / (H(X)+H(Y))
    method='geometric':  NMI = MI / sqrt(H(X)*H(Y))
    method='max':        NMI = MI / max(H(X), H(Y))

    Raises ValueError for an unrecognized method (previously the function
    fell off the end and returned None, poisoning downstream arithmetic).
    """
    x_disc = discretize(x)
    y_disc = discretize(y)
    mi = mutual_information_discrete(x_disc, y_disc)
    # Marginal entropies from the empirical frequencies of each discretized variable.
    hx = entropy_discrete(np.bincount(x_disc) / len(x_disc))
    hy = entropy_discrete(np.bincount(y_disc) / len(y_disc))
    # The 1e-15 epsilon guards against division by zero when an input is constant.
    if method == 'arithmetic':
        return 2 * mi / (hx + hy + 1e-15)
    if method == 'geometric':
        return mi / (np.sqrt(hx * hy) + 1e-15)
    if method == 'max':
        return mi / (max(hx, hy) + 1e-15)
    raise ValueError(f"unknown method: {method!r}")
# Use NMI for clustering evaluation
# NMI is symmetric and scale-invariant: perfect for comparing cluster labels
n_cls = 300
true_labels = np.repeat([0, 1, 2], 100)  # ground truth: 3 clusters of 100
# Perfect clustering
perfect = true_labels.copy()
# Random clustering
random = rng.integers(0, 3, n_cls)
# Near-perfect (5% errors)
near = true_labels.copy()
err_idx = rng.choice(n_cls, 15, replace=False)  # 15/300 = 5% of entries
near[err_idx] = rng.integers(0, 3, 15)  # corrupted entries may coincide with the true label
print("Clustering quality via NMI (0=random, 1=perfect):")
for name, pred in [('Perfect', perfect), ('Near-perfect', near), ('Random', random)]:
    # Labels are cast to float because normalized_mi re-discretizes its inputs.
    nmi = normalized_mi(true_labels.astype(float), pred.astype(float))
    print(f" {name:<15}: NMI = {nmi:.4f}")

4. What Comes Next¶
Information gain (a form of MI) is the splitting criterion in decision trees. NMI is the standard metric for comparing clustering results. ch291 — Feature Engineering uses MI-based selection to identify which features contain the most information about the target — a principled, model-free alternative to correlation-based selection.
The KL divergence expression reappears in ch295 — Neural Network Math Review as the objective function in variational autoencoders, where the ELBO loss contains exactly this KL term.