(Synthesizes entropy from ch288 and KL divergence from ch289; connects to feature selection in ch291)
1. Definition¶
Mutual Information (MI) measures how much knowing $Y$ reduces uncertainty about $X$: $I(X;Y) = H(X) - H(X \mid Y) = H(Y) - H(Y \mid X)$
Equivalently: $I(X;Y) = \sum_{x,y} p(x,y)\,\log\frac{p(x,y)}{p(x)\,p(y)} = D_{\mathrm{KL}}\big(p(x,y)\,\|\,p(x)\,p(y)\big)$
It is zero iff $X$ and $Y$ are independent, and positive otherwise. Unlike correlation (ch280), MI detects any statistical dependence, not just linear.
2. Discrete Mutual Information¶
import numpy as np
import matplotlib.pyplot as plt

# Fixed seed so every run reproduces the same samples, figures, and printed numbers.
rng = np.random.default_rng(42)
def entropy_discrete(p: np.ndarray) -> float:
    """Shannon entropy H(p) = -sum p*log(p) in nats, skipping zero-probability entries."""
    probs = np.asarray(p, dtype=float)
    nonzero = probs[probs > 0]  # 0*log(0) is taken as 0 by convention
    return -(nonzero * np.log(nonzero)).sum()
def mutual_information_discrete(
    x: np.ndarray,
    y: np.ndarray,
) -> float:
    """
    Estimate MI from samples using joint frequency counts.

    x, y: 1D integer arrays of equal length.
    Returns MI in nats, via the identity I(X;Y) = H(X) + H(Y) - H(X,Y).
    Raises ValueError if x and y differ in length (the previous zip-based
    loop silently truncated to the shorter array while still dividing the
    counts by len(x), yielding a wrong distribution).
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if len(x) != len(y):
        raise ValueError("x and y must have equal length")
    # Map raw values to contiguous codes 0..k-1 in one C-level pass.
    x_vals, x_codes = np.unique(x, return_inverse=True)
    y_vals, y_codes = np.unique(y, return_inverse=True)
    # Accumulate joint counts without a Python loop; np.add.at handles
    # repeated (row, col) index pairs correctly (unlike fancy-index +=).
    joint = np.zeros((len(x_vals), len(y_vals)))
    np.add.at(joint, (x_codes, y_codes), 1)
    joint /= len(x)  # counts -> empirical joint distribution
    p_x = joint.sum(axis=1)
    p_y = joint.sum(axis=0)
    h_x = entropy_discrete(p_x)
    h_y = entropy_discrete(p_y)
    h_xy = entropy_discrete(joint.flatten())
    return h_x + h_y - h_xy
# Compare MI vs Pearson r across relationship types
n = 1000
x_cont = rng.normal(0, 1, n)
# Four targets built from the same x plus noise: only 'Linear' is linear in x,
# and 'Independent' is generated without reference to x at all.
relationships = {
    'Linear': x_cont * 2 + rng.normal(0, 0.5, n),
    'Quadratic': x_cont**2 + rng.normal(0, 0.5, n),
    'Sinusoidal': np.sin(3 * x_cont) + rng.normal(0, 0.2, n),
    'Independent': rng.normal(0, 1, n),
}
# Discretize for MI estimate
def discretize(arr, bins=20):
return np.digitize(arr, np.percentile(arr, np.linspace(0, 100, bins)))
x_disc = discretize(x_cont)  # discretize x once; reused for every panel
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, (name, y) in zip(axes, relationships.items()):
    r = np.corrcoef(x_cont, y)[0, 1]  # Pearson r: captures linear dependence only
    mi = mutual_information_discrete(x_disc, discretize(y))  # MI on binned data
    ax.scatter(x_cont, y, s=5, alpha=0.3, color='steelblue')
    ax.set_title(f'{name}\nPearson r={r:+.3f}\nMI={mi:.3f} nats')
    ax.set_xlabel('X')
plt.suptitle('Mutual Information detects non-linear dependence; Pearson r does not', fontsize=11)
plt.tight_layout()
plt.show()

3. Normalized Mutual Information¶
def normalized_mi(
    x: np.ndarray,
    y: np.ndarray,
    method: str = 'arithmetic',
) -> float:
    """
    Normalize MI to [0, 1].

    method='arithmetic': NMI = 2*MI / (H(X)+H(Y))
    method='geometric':  NMI = MI / sqrt(H(X)*H(Y))
    method='max':        NMI = MI / max(H(X), H(Y))

    Raises ValueError for an unrecognized method (previously the function
    fell off the end and returned None, poisoning downstream arithmetic).
    """
    x_disc = discretize(x)
    y_disc = discretize(y)
    mi = mutual_information_discrete(x_disc, y_disc)
    # Marginal entropies from the empirical frequencies of each discretized variable.
    hx = entropy_discrete(np.bincount(x_disc) / len(x_disc))
    hy = entropy_discrete(np.bincount(y_disc) / len(y_disc))
    # The 1e-15 epsilon guards against division by zero when an input is constant.
    if method == 'arithmetic':
        return 2 * mi / (hx + hy + 1e-15)
    if method == 'geometric':
        return mi / (np.sqrt(hx * hy) + 1e-15)
    if method == 'max':
        return mi / (max(hx, hy) + 1e-15)
    raise ValueError(f"unknown method: {method!r}")
# Use NMI for clustering evaluation
# NMI is symmetric and scale-invariant: perfect for comparing cluster labels
n_cls = 300
true_labels = np.repeat([0, 1, 2], 100)  # ground truth: 3 clusters of 100
# Perfect clustering
perfect = true_labels.copy()
# Random clustering
random = rng.integers(0, 3, n_cls)
# Near-perfect (5% errors)
near = true_labels.copy()
err_idx = rng.choice(n_cls, 15, replace=False)  # 15/300 = 5% of entries
near[err_idx] = rng.integers(0, 3, 15)  # corrupted entries may coincide with the true label
print("Clustering quality via NMI (0=random, 1=perfect):")
for name, pred in [('Perfect', perfect), ('Near-perfect', near), ('Random', random)]:
    # Labels are cast to float because normalized_mi re-discretizes its inputs.
    nmi = normalized_mi(true_labels.astype(float), pred.astype(float))
    print(f" {name:<15}: NMI = {nmi:.4f}")

4. What Comes Next¶
Information gain (a form of MI) is the splitting criterion in decision trees. NMI is the standard metric for comparing clustering results. ch291 — Feature Engineering uses MI-based selection to identify which features contain the most information about the target — a principled, model-free alternative to correlation-based selection.
The KL divergence expression reappears in ch295 — Neural Network Math Review as the objective function in variational autoencoders, where the ELBO loss contains exactly this KL term.