Chapter 269 - Hidden Markov Models - Mathematics for Programmers

Part VIII - Probability

A Markov chain (ch257) lets us model sequences of states. But in many real problems the states are hidden: we observe noisy measurements but not the underlying process directly. Hidden Markov Models (HMMs) combine the Markov structure with probabilistic observations, enabling three fundamental inference tasks: computing sequence likelihood, decoding the most probable state sequence, and learning parameters from data.

1. HMM Structure and Parameters¶

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(99)

# Hidden Markov Model structure:
# - Hidden states: S_t in {0, 1, ..., K-1}  (Markov chain, ch257)
# - Observations: O_t depends only on S_t
#
# Parameters:
# - pi: initial state distribution (K,)
# - A: transition matrix (K, K), A[i,j] = P(S_{t+1}=j | S_t=i)
# - B: emission matrix (K, V) for discrete obs, or emission parameters for continuous

K = 2    # number of hidden states (e.g., "bull" and "bear" market)
V = 3    # observation vocabulary (e.g., "up", "flat", "down")

# Define a simple HMM for market states
pi = np.array([0.6, 0.4])   # initial: more likely in state 0

A = np.array([
    [0.9, 0.1],   # state 0 (bull): tends to stay bull
    [0.3, 0.7],   # state 1 (bear): tends to stay bear
])

B = np.array([
    [0.7, 0.2, 0.1],  # bull -> likely "up"
    [0.1, 0.3, 0.6],  # bear -> likely "down"
])

labels_states = ["Bull", "Bear"]
labels_obs    = ["Up", "Flat", "Down"]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
im0 = axes[0].imshow(A, cmap="Blues", vmin=0, vmax=1)
axes[0].set_title("Transition Matrix A"); plt.colorbar(im0, ax=axes[0])
axes[0].set_xticks([0,1]); axes[0].set_yticks([0,1])
axes[0].set_xticklabels(labels_states); axes[0].set_yticklabels(labels_states)
for i in range(K):
    for j in range(K):
        axes[0].text(j, i, f"{A[i,j]:.1f}", ha="center", va="center", fontsize=13)

im1 = axes[1].imshow(B, cmap="Oranges", vmin=0, vmax=1)
axes[1].set_title("Emission Matrix B"); plt.colorbar(im1, ax=axes[1])
axes[1].set_xticks(range(V)); axes[1].set_yticks(range(K))
axes[1].set_xticklabels(labels_obs); axes[1].set_yticklabels(labels_states)
for i in range(K):
    for j in range(V):
        axes[1].text(j, i, f"{B[i,j]:.1f}", ha="center", va="center", fontsize=11)

plt.suptitle("HMM Parameters: Transition and Emission Matrices")
plt.tight_layout(); plt.show()

2. Generating Sequences from an HMM¶

def hmm_sample(pi, A, B, T, rng):
    K = len(pi)
    states = np.zeros(T, dtype=int)
    obs    = np.zeros(T, dtype=int)
    states[0] = rng.choice(K, p=pi)
    obs[0]    = rng.choice(B.shape[1], p=B[states[0]])
    for t in range(1, T):
        states[t] = rng.choice(K, p=A[states[t-1]])
        obs[t]    = rng.choice(B.shape[1], p=B[states[t]])
    return states, obs

T = 80
true_states, observations = hmm_sample(pi, A, B, T, rng)

fig, axes = plt.subplots(2, 1, figsize=(14, 5), sharex=True)
axes[0].plot(true_states, drawstyle="steps-post", color="steelblue", lw=2)
axes[0].set_yticks([0, 1]); axes[0].set_yticklabels(labels_states)
axes[0].set_title("Hidden States (unobserved)"); axes[0].grid(alpha=0.3)

obs_colors = ["green", "gray", "red"]
for t, (o, s) in enumerate(zip(observations, true_states)):
    axes[1].bar(t, 1, color=obs_colors[o], alpha=0.7, width=0.9)
axes[1].set_title("Observations (visible)")
axes[1].set_yticks([]); axes[1].set_xlabel("Time step")
from matplotlib.patches import Patch
axes[1].legend(handles=[Patch(color=obs_colors[i], label=labels_obs[i]) for i in range(V)])
plt.suptitle("HMM: Hidden States and Noisy Observations")
plt.tight_layout(); plt.show()
print(f"Bull (state 0): {np.sum(true_states==0)} steps | Bear (state 1): {np.sum(true_states==1)} steps")

Bull (state 0): 66 steps | Bear (state 1): 14 steps

3. The Forward Algorithm - Computing Sequence Likelihood¶

Naive computation of P(O) requires summing over all K^T state sequences: exponential cost. Dynamic programming reduces this to O(K^2 T).

# Forward algorithm: compute P(O_{1:t}, S_t = k) efficiently via dynamic programming
# alpha[t, k] = P(O_1,...,O_t, S_t = k)
# Base: alpha[0, k] = pi[k] * B[k, O_0]
# Recursion: alpha[t, k] = B[k, O_t] * sum_j(alpha[t-1, j] * A[j, k])

def forward(pi, A, B, obs):
    T = len(obs)
    K = len(pi)
    alpha = np.zeros((T, K))
    alpha[0] = pi * B[:, obs[0]]
    for t in range(1, T):
        alpha[t] = B[:, obs[t]] * (alpha[t-1] @ A)
    return alpha

alpha = forward(pi, A, B, observations)
likelihood = alpha[-1].sum()
print(f"P(observations) = {likelihood:.6e}")
print(f"log P(observations) = {np.log(likelihood):.4f}")

# Visualize forward probabilities over time
fig, ax = plt.subplots(figsize=(14, 4))
ax.stackplot(range(T), alpha[:, 0], alpha[:, 1],
             labels=labels_states, alpha=0.7, colors=["steelblue", "darkorange"])
ax.set_title("Forward Probabilities alpha[t, k] = P(O_1..O_t, S_t=k)")
ax.set_xlabel("Time"); ax.legend(loc="upper right"); ax.grid(alpha=0.3)
plt.tight_layout(); plt.show()

P(observations) = 2.601230e-34
log P(observations) = -77.3319

4. The Viterbi Algorithm - Decoding Hidden States¶

Viterbi finds the single most probable state sequence using the same DP structure as forward, but with max instead of sum. Log-domain computation prevents underflow. (Dynamic programming from ch257 - Markov Chains)

# Viterbi algorithm: find most likely state sequence
# delta[t, k] = max_{s_{1:t-1}} P(s_{1:t-1}, S_t=k, O_{1:t})
# psi[t, k] = argmax_j delta[t-1, j] * A[j,k]  (backpointer)

def viterbi(pi, A, B, obs):
    T = len(obs)
    K = len(pi)
    log_A = np.log(A + 1e-300)
    log_B = np.log(B + 1e-300)
    log_pi = np.log(pi + 1e-300)

    delta = np.full((T, K), -np.inf)
    psi   = np.zeros((T, K), dtype=int)

    delta[0] = log_pi + log_B[:, obs[0]]
    for t in range(1, T):
        for k in range(K):
            scores = delta[t-1] + log_A[:, k]
            psi[t, k]   = np.argmax(scores)
            delta[t, k] = scores[psi[t, k]] + log_B[k, obs[t]]

    # Backtrack
    path = np.zeros(T, dtype=int)
    path[-1] = np.argmax(delta[-1])
    for t in range(T-2, -1, -1):
        path[t] = psi[t+1, path[t+1]]
    return path, delta

viterbi_path, delta = viterbi(pi, A, B, observations)

# Compare Viterbi path vs true hidden states
accuracy = np.mean(viterbi_path == true_states)
print(f"Viterbi state accuracy: {accuracy:.3f}")

fig, axes = plt.subplots(2, 1, figsize=(14, 5), sharex=True)
axes[0].step(range(T), true_states, where="post", label="True", lw=2, color="steelblue")
axes[0].step(range(T), viterbi_path, where="post", label="Viterbi", lw=2,
             ls="--", color="darkorange")
axes[0].set_yticks([0, 1]); axes[0].set_yticklabels(labels_states)
axes[0].legend(); axes[0].set_title(f"True vs Viterbi Path (accuracy={accuracy:.2f})")
axes[0].grid(alpha=0.3)

errors = (viterbi_path != true_states).astype(int)
axes[1].bar(range(T), errors, color="red", alpha=0.6, label="Error")
axes[1].set_title("Classification Errors"); axes[1].legend(); axes[1].grid(alpha=0.3)
plt.tight_layout(); plt.show()

Viterbi state accuracy: 0.887

5. Gaussian HMMs - Continuous Observations¶

Replacing discrete emission distributions with Gaussians extends HMMs to continuous data: speech features, financial returns, sensor readings.

# Gaussian HMM: continuous observations from Gaussian emission distributions
# Each state k emits: X_t | S_t=k ~ N(mu_k, sigma_k^2)
# Common in speech recognition, finance, bioinformatics

K_g = 3
mu_emit   = np.array([-2.0, 0.0, 3.0])
sig_emit  = np.array([0.5, 1.0, 0.8])

A_g = np.array([
    [0.8, 0.15, 0.05],
    [0.1, 0.8,  0.1 ],
    [0.05, 0.15, 0.8],
])
pi_g = np.array([0.4, 0.4, 0.2])

def gaussian_hmm_sample(pi, A, mu, sig, T, rng):
    K = len(pi)
    states = np.zeros(T, dtype=int)
    obs    = np.zeros(T)
    states[0] = rng.choice(K, p=pi)
    obs[0]    = rng.normal(mu[states[0]], sig[states[0]])
    for t in range(1, T):
        states[t] = rng.choice(K, p=A[states[t-1]])
        obs[t]    = rng.normal(mu[states[t]], sig[states[t]])
    return states, obs

T_g = 200
g_states, g_obs = gaussian_hmm_sample(pi_g, A_g, mu_emit, sig_emit, T_g, rng)

fig, axes = plt.subplots(2, 1, figsize=(14, 6), sharex=True)
colors_g = ["steelblue", "darkorange", "seagreen"]
for k in range(K_g):
    mask = g_states == k
    t_idx = np.where(mask)[0]
    axes[0].scatter(t_idx, g_obs[t_idx], s=10, color=colors_g[k],
                    label=f"State {k} (mu={mu_emit[k]})", zorder=3)
axes[0].set_title("Gaussian HMM: Continuous Observations Colored by True State")
axes[0].legend(fontsize=9); axes[0].grid(alpha=0.3)

axes[1].step(range(T_g), g_states, where="post", lw=1.5)
axes[1].set_yticks(range(K_g)); axes[1].set_title("Hidden States")
axes[1].set_xlabel("Time"); axes[1].grid(alpha=0.3)
plt.tight_layout(); plt.show()
print("In practice, HMM parameters are estimated with the Baum-Welch EM algorithm.")
print("This connects to ch246 (Bayes) and ch257 (Markov Chains).")

In practice, HMM parameters are estimated with the Baum-Welch EM algorithm.
This connects to ch246 (Bayes) and ch257 (Markov Chains).

6. Summary¶

HMM inference algorithms:

Algorithm	Question answered	Complexity
Forward	P(observations)	O(K^2 T)
Forward-Backward	P(S_t	obs) marginals
Viterbi	argmax P(states, obs)	O(K^2 T)
Baum-Welch	MLE parameter estimation	O(K^2 T) per iteration

(builds on ch257 - Markov Chains, ch246 - Bayes Theorem, ch245 - Conditional Probability)

7. Forward References¶

ch271 - Data and Measurement: sequential data preprocessing for HMM inputs.
ch288 - Classification: HMMs as sequence classifiers vs discriminative models.
ch299 - Capstone: sequence modeling as a component of end-to-end AI pipelines.