1. Why Transformers replaced RNNs¶
RNNs process sequences sequentially: $h_t = f(h_{t-1}, x_t)$ — each hidden state depends on $h_{t-1}$. This prevents parallelism during training — each step must wait for the previous one.
Transformers (Vaswani et al., 2017) process the entire sequence in parallel using self-attention. Every position attends to every other position simultaneously. Training parallelism enables scaling to billions of parameters on GPU/TPU clusters.
2. The Transformer block¶
Each layer has two sublayers with residual connections and Layer Norm (pre-LN variant here): $x' = x + \mathrm{MHA}(\mathrm{LN}(x))$, then $y = x' + \mathrm{FFN}(\mathrm{LN}(x'))$,
where FFN expands to $d_{ff}$ with GELU, then contracts back to $d_{model}$: $\mathrm{FFN}(z) = W_2\,\mathrm{GELU}(W_1 z + b_1) + b_2$.
(Multi-Head Attention: ch321. Layer Norm: ch310. Residual connections: ch316. GELU: ch309.)
import numpy as np
import matplotlib.pyplot as plt
def gelu(x):
    """Gaussian Error Linear Unit — tanh approximation (Hendrycks & Gimpel, 2016)."""
    inner = np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)
    return 0.5 * x * (1.0 + np.tanh(inner))
def layer_norm(x: np.ndarray, gamma: np.ndarray, beta: np.ndarray, eps: float = 1e-6):
    """Normalise x over its last axis to zero mean / unit variance, then apply
    the learned affine transform gamma * x_hat + beta. eps guards against
    division by zero for constant inputs."""
    mu = x.mean(axis=-1, keepdims=True)
    sigma2 = x.var(axis=-1, keepdims=True)
    x_hat = (x - mu) / np.sqrt(sigma2 + eps)
    return gamma * x_hat + beta
def softmax_last(z: np.ndarray) -> np.ndarray:
    """Numerically stable softmax over the trailing axis."""
    # Subtracting the row max leaves the result unchanged but prevents
    # overflow in exp for large logits.
    shifted = z - z.max(axis=-1, keepdims=True)
    expz = np.exp(shifted)
    return expz / expz.sum(axis=-1, keepdims=True)
def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray,
                                 V: np.ndarray, mask: np.ndarray = None):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Q, K: (..., T, d_k); V: (..., T, d_v) — any number of leading batch/head
    dimensions, including none (the hard-coded 3-D transpose previously
    required exactly (H, T, d_k)).
    mask: boolean, broadcastable to the score shape; True = position blocked.
    Returns (output of shape (..., T, d_v), attention weights (..., T, T)).
    """
    d_k = Q.shape[-1]
    # swapaxes(-2, -1) generalises the original transpose(0, 2, 1): matmul
    # broadcasts over all leading dimensions.
    scores = Q @ K.swapaxes(-2, -1) / np.sqrt(d_k)
    if mask is not None:
        # Large negative rather than -inf: avoids NaN rows if a whole row is
        # masked, and becomes ~0 after softmax.
        scores = np.where(mask, -1e9, scores)
    w = softmax_last(scores)
    return w @ V, w
class TransformerBlock:
    """Single Transformer encoder block — pre-LayerNorm variant.

    Sublayer 1: x + MHA(LN(x)); Sublayer 2: x + FFN(LN(x)),
    where FFN(z) = W2 @ GELU(W1 @ z + b1) + b2 expands d_model → d_ff → d_model.
    """
    def __init__(self, d_model: int, n_heads: int, d_ff: int, seed: int = 0):
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        rng = np.random.default_rng(seed)
        self.d_model = d_model
        self.H = n_heads
        self.d_k = d_model // n_heads  # per-head query/key/value width
        scale = np.sqrt(1.0 / d_model)  # ~1/sqrt(fan_in) init for d_model inputs
        # Attention projections, each (d_model, d_model); right-multiplied in forward.
        self.W_q = rng.normal(0, scale, (d_model, d_model))
        self.W_k = rng.normal(0, scale, (d_model, d_model))
        self.W_v = rng.normal(0, scale, (d_model, d_model))
        self.W_o = rng.normal(0, scale, (d_model, d_model))
        # FFN weights stored (out_features, in_features); applied via W.T in forward.
        # NOTE(review): W2 has fan-in d_ff but uses the d_model-based scale — confirm intended.
        self.W1 = rng.normal(0, scale, (d_ff, d_model))
        self.b1 = np.zeros(d_ff)
        self.W2 = rng.normal(0, scale, (d_model, d_ff))
        self.b2 = np.zeros(d_model)
        # LayerNorm affine parameters, one pair per sublayer.
        self.gamma1 = np.ones(d_model); self.beta1 = np.zeros(d_model)
        self.gamma2 = np.ones(d_model); self.beta2 = np.zeros(d_model)
        self.d_ff = d_ff
    def _split_heads(self, X: np.ndarray) -> np.ndarray:
        """(T, d_model) → (H, T, d_k)"""
        T = X.shape[0]
        return X.reshape(T, self.H, self.d_k).transpose(1, 0, 2)
    def _merge_heads(self, X: np.ndarray) -> np.ndarray:
        """(H, T, d_k) → (T, d_model)"""
        return X.transpose(1, 0, 2).reshape(X.shape[1], self.d_model)
    def forward(self, X: np.ndarray, mask: np.ndarray = None) -> np.ndarray:
        """X: (T, d_model). Optional boolean mask (True = blocked).
        Returns (T, d_model)."""
        # ── Self-attention sublayer (pre-LN: normalise before the sublayer) ──
        X_ln = layer_norm(X, self.gamma1, self.beta1)
        Q = self._split_heads(X_ln @ self.W_q)
        K = self._split_heads(X_ln @ self.W_k)
        V = self._split_heads(X_ln @ self.W_v)
        attn_out, _ = scaled_dot_product_attention(Q, K, V, mask)
        attn_out = self._merge_heads(attn_out) @ self.W_o
        X = X + attn_out  # residual
        # ── FFN sublayer ──
        X_ln2 = layer_norm(X, self.gamma2, self.beta2)
        ffn_out = gelu(X_ln2 @ self.W1.T + self.b1) @ self.W2.T + self.b2
        X = X + ffn_out  # residual
        return X
    def param_count(self) -> int:
        """Exact number of learnable scalars in this block.

        4 attention matrices (d*d each) + FFN weights (2*d*ff) +
        FFN biases (ff + d) + two LayerNorms' gamma/beta (4*d).
        The previous formula used `2*d + 2*ff` for the bias/LayerNorm terms,
        under-counting the true `ff + 5*d` by (3*d - ff).
        """
        d, ff = self.d_model, self.d_ff
        return 4 * d * d + 2 * d * ff + ff + d + 4 * d
# ── Demo: stack multiple blocks, measure output statistics ──
T, d_model, n_heads, d_ff = 16, 64, 4, 256
encoder = [TransformerBlock(d_model, n_heads, d_ff, seed=i) for i in range(6)]
X = np.random.default_rng(0).normal(0, 1, (T, d_model))
print("Forward pass through 6-layer Transformer encoder:")
print(f" Input: {X.shape}, std={X.std():.3f}")
# Track how the residual stream's scale evolves with depth.
for depth, layer in enumerate(encoder, start=1):
    X = layer.forward(X)
    print(f" Layer {depth}: std={X.std():.3f}")
total_params = sum(blk.param_count() for blk in encoder)
print(f"\nTotal parameters (6 layers): {total_params:,}")
print(f"Per-layer params: {encoder[0].param_count():,}")
# Visualise attention patterns across heads
T = 12; d_model = 32; n_heads = 4; d_ff = 128
viz_block = TransformerBlock(d_model, n_heads, d_ff, seed=42)
# Causal mask for autoregressive decoding: True marks blocked (future)
# positions; the leading singleton axis broadcasts over heads.
causal_mask = np.triu(np.ones((1, T, T), dtype=bool), k=1)
tokens = np.random.default_rng(5).normal(0, 1, (T, d_model))
# Re-run just the attention sublayer to capture per-head weights.
normed = layer_norm(tokens, viz_block.gamma1, viz_block.beta1)
Q = viz_block._split_heads(normed @ viz_block.W_q)
K = viz_block._split_heads(normed @ viz_block.W_k)
V = viz_block._split_heads(normed @ viz_block.W_v)
_, attn_weights = scaled_dot_product_attention(Q, K, V, causal_mask)
fig, axes = plt.subplots(1, n_heads, figsize=(14, 3))
for h in range(n_heads):
    ax = axes[h]
    im = ax.imshow(attn_weights[h], cmap='Blues', vmin=0, vmax=1)
    ax.set_title(f'Head {h+1}')
    ax.set_xlabel('Key pos')
    ax.set_ylabel('Query pos')
    plt.colorbar(im, ax=ax, fraction=0.046)
plt.suptitle('Causal self-attention weights per head\n'
             '(lower triangular: each position attends to itself and past)', fontsize=11)
plt.tight_layout()
plt.savefig('ch322_transformer_attention.png', dpi=120)
plt.show()
3. Encoder vs Decoder vs Encoder-Decoder¶
| Architecture | Self-attention | Cross-attention | Use case |
|---|---|---|---|
| Encoder-only (BERT) | Bidirectional | — | Classification, NER, embeddings |
| Decoder-only (GPT) | Causal (masked) | — | Text generation, LLMs |
| Encoder-Decoder (T5) | Bidirectional enc + causal dec | Enc→Dec | Translation, summarisation |
4. Parameter count scaling¶
For a Transformer with $L$ layers, model width $d_{model}$, and FFN width $d_{ff} = 4\,d_{model}$: $N \approx 12\,L\,d_{model}^2$ parameters (ignoring embeddings and biases).
GPT-3 (175B params): $L = 96$, $d_{model} = 12288$. Doubling $d_{model}$ quadruples parameters.
5. Summary¶
Transformer = Multi-Head Self-Attention + FFN + Residual + LayerNorm, repeated $L$ times.
Processes entire sequence in parallel → training is GPU-parallelisable.
Three variants: encoder-only (bidirectional), decoder-only (causal), encoder-decoder.
Scales predictably with depth and width; scaling laws (ch332) govern this.
6. Forward and backward references¶
Used here: attention (ch321), layer norm (ch310), residual connections (ch316), GELU (ch309), positional encoding (ch323).
This will reappear in ch337 — Project: Transformer Block from Scratch, where the full Transformer is implemented with training on a toy sequence task.