0. Overview
Problem: Train a deep neural network to classify the two-spiral dataset from the Part X introduction — a benchmark that no linear model can solve.
Concepts used: forward pass (ch304), backpropagation (ch306), SGD with momentum (ch307), He initialisation (ch308), activation functions (ch309), batch normalisation (ch310), dropout (ch311), Adam optimiser (ch312), learning rate scheduling (ch313).
Expected output: trained classifier with >95% accuracy on held-out spiral data, plus training curve and decision boundary visualisation.
Difficulty: ★★★☆☆ | Estimated time: 60–90 minutes
1. Setup
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
np.random.seed(42)
# ── Data: two-spiral dataset ──
def make_spirals(n: int = 300, noise: float = 0.08, seed: int = 42) -> tuple:
    """Generate the two-arm spiral benchmark dataset.

    Each arm is an Archimedean-style spiral (radius grows linearly with
    angle over two full turns) with Gaussian jitter added per coordinate.

    Args:
        n: points per arm (total dataset size is 2*n).
        noise: std-dev of the Gaussian jitter applied to x and y.
        seed: seed for the generator driving both jitter and the shuffle.

    Returns:
        (X, y): X is (2n, 2) float coordinates, y is (2n,) int labels
        in {0, 1}, jointly shuffled.
    """
    rng = np.random.default_rng(seed)

    def _arm(count, phase):
        # Angle runs over two turns; radius is normalised to [0, 1] at phase 0.
        angles = np.linspace(0, 4 * np.pi, count) + phase
        radii = angles / (4 * np.pi)
        jitter_x = rng.normal(0, noise, count)
        jitter_y = rng.normal(0, noise, count)
        return np.stack([radii * np.cos(angles) + jitter_x,
                         radii * np.sin(angles) + jitter_y], axis=1)

    arm_a = _arm(n, 0)
    arm_b = _arm(n, np.pi)      # second arm is rotated half a turn
    points = np.vstack([arm_a, arm_b])
    labels = np.array([0] * n + [1] * n)
    order = rng.permutation(len(labels))
    return points[order], labels[order]
X, y = make_spirals(300)

# Train / test split (80/20; rows were already shuffled inside make_spirals,
# so a contiguous split is a valid random split)
split = int(0.8 * len(y))
X_tr, y_tr = X[:split], y[:split]
X_te, y_te = X[split:], y[split:]
print(f"Train: {X_tr.shape}, Test: {X_te.shape}")

# Visualise the training set, one colour per class
fig, ax = plt.subplots(figsize=(5, 5))
for cls, col in [(0, '#e74c3c'), (1, '#3498db')]:
    m = y_tr == cls
    ax.scatter(X_tr[m, 0], X_tr[m, 1], c=col, s=15, alpha=0.7, label=f'Class {cls}')
ax.set_title('Two-spiral training data'); ax.set_aspect('equal'); ax.legend()
plt.tight_layout()
plt.savefig('ch334_data.png', dpi=100)
plt.show()

# 2. Stage 1 — Build the Network
# ── Activation functions ──
def relu(z):
    """Rectified linear unit: elementwise max(z, 0)."""
    return np.maximum(z, 0)

def relu_grad(z):
    """Derivative of ReLU w.r.t. its pre-activation: 1 where z > 0, else 0."""
    return np.where(z > 0, 1.0, 0.0)

def sigmoid(z):
    """Numerically safe logistic function; clips z to avoid exp overflow."""
    safe = np.clip(z, -500, 500)
    return 1.0 / (1.0 + np.exp(-safe))
# ── Batch Normalisation ──
class BatchNorm:
    """Per-feature batch normalisation with learnable scale (gamma) and
    shift (beta).

    Training mode normalises by batch statistics and updates exponential
    running averages; inference mode normalises by the running averages.
    """

    def __init__(self, n, eps=1e-5, momentum=0.1):
        self.gamma = np.ones(n)          # learnable scale
        self.beta = np.zeros(n)          # learnable shift
        self.running_mean = np.zeros(n)  # EMA of batch means (inference)
        self.running_var = np.ones(n)    # EMA of batch variances (inference)
        self.eps = eps
        self.momentum = momentum
        self.cache = None                # populated by a training forward pass

    def forward(self, Z, training=True):
        """Normalise Z (batch, n); returns gamma * Z_hat + beta."""
        if not training:
            # Inference: use tracked population statistics.
            norm = (Z - self.running_mean) / np.sqrt(self.running_var + self.eps)
            return self.gamma * norm + self.beta
        batch_mean = Z.mean(0)
        batch_var = Z.var(0)
        norm = (Z - batch_mean) / np.sqrt(batch_var + self.eps)
        m = self.momentum
        self.running_mean = (1 - m) * self.running_mean + m * batch_mean
        self.running_var = (1 - m) * self.running_var + m * batch_var
        self.cache = (Z, norm, batch_mean, batch_var)
        return self.gamma * norm + self.beta

    def backward(self, dY):
        """Backprop through the last training forward.

        Returns (dZ, dgamma, dbeta) for upstream gradient dY.
        """
        Z, norm, batch_mean, batch_var = self.cache
        B = Z.shape[0]
        dgamma = (dY * norm).sum(0)
        dbeta = dY.sum(0)
        dnorm = dY * self.gamma
        inv_std = 1.0 / np.sqrt(batch_var + self.eps)
        # Standard closed-form BN gradient (mean and variance terms folded in).
        dZ = (inv_std / B) * (B * dnorm - dnorm.sum(0) - norm * (dnorm * norm).sum(0))
        return dZ, dgamma, dbeta
# ── Adam Optimiser ──
class Adam:
    """Adam optimiser: per-parameter adaptive steps with bias-corrected
    first/second moment estimates. Updates params in place.
    """

    def __init__(self, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
        self.lr = lr
        self.b1 = b1      # decay for first-moment EMA
        self.b2 = b2      # decay for second-moment EMA
        self.eps = eps
        self.t = 0        # step counter (drives bias correction)
        self.m = {}       # first moments, keyed like params
        self.v = {}       # second moments, keyed like params

    def update(self, params, grads):
        """One Adam step: for each key in params, apply grads[key]."""
        self.t += 1
        bc1 = 1 - self.b1 ** self.t   # bias-correction denominators,
        bc2 = 1 - self.b2 ** self.t   # shared across all parameters
        for key in params:
            g = grads[key]
            if key not in self.m:
                # Lazy moment initialisation on first sighting of a key.
                self.m[key] = np.zeros_like(params[key])
                self.v[key] = np.zeros_like(params[key])
            self.m[key] = self.b1 * self.m[key] + (1 - self.b1) * g
            self.v[key] = self.b2 * self.v[key] + (1 - self.b2) * np.square(g)
            m_hat = self.m[key] / bc1
            v_hat = self.v[key] / bc2
            params[key] -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
# ── Network definition ──
class SpiralNet:
    """3-hidden-layer MLP: Linear → BatchNorm → ReLU → Dropout per hidden
    layer, plus a sigmoid output layer. Weights use He initialisation.

    Bug fixed vs. original: forward() never cached the input activation of
    the *output* layer, so backward() fell back to the previous hidden
    layer's input — the wrong tensor with the wrong shape — when forming
    the final layer's weight gradient. Every layer's input is now cached
    uniformly under 'A{i}'.
    """

    def __init__(self, dims=(2, 128, 128, 64, 1), p_keep=0.8, seed=0):
        """dims: layer widths (input, hidden..., output). p_keep: dropout
        keep-probability for hidden activations."""
        rng = np.random.default_rng(seed)
        self.params = {}
        self.bns = []            # one BatchNorm per hidden layer
        self.p_keep = p_keep
        for i in range(len(dims) - 1):
            fan_in, fan_out = dims[i], dims[i + 1]
            # He init: std = sqrt(2 / fan_in), suited to ReLU layers.
            self.params[f'W{i}'] = rng.normal(0, np.sqrt(2.0 / fan_in), (fan_out, fan_in))
            self.params[f'b{i}'] = np.zeros(fan_out)
            if i < len(dims) - 2:    # no BN on output layer
                self.bns.append(BatchNorm(fan_out))

    def forward(self, X, training=True):
        """X: (B, 2). Returns ((B, 1) sigmoid output, cache for backward).

        NOTE(review): dropout masks come from an unseeded generator, so
        training passes are not reproducible run-to-run (original behavior,
        kept intentionally).
        """
        rng = np.random.default_rng()
        cache = {}
        A = X
        n_hidden = len(self.bns)
        for i in range(n_hidden + 1):
            cache[f'A{i}'] = A    # input to layer i — cached for ALL layers
            Z = A @ self.params[f'W{i}'].T + self.params[f'b{i}']
            if i < n_hidden:
                Z_bn = self.bns[i].forward(Z, training)
                A = relu(Z_bn)
                if training:
                    # Inverted dropout: scale kept units by 1/p_keep so
                    # inference needs no rescaling.
                    mask = (rng.random(A.shape) < self.p_keep) / self.p_keep
                    A = A * mask
                else:
                    mask = None
                cache[f'Z{i}'] = Z
                cache[f'Zbn{i}'] = Z_bn
                cache[f'mask{i}'] = mask
            else:
                A = sigmoid(Z)
        cache['out'] = A
        return A, cache

    def backward(self, y, cache):
        """Backprop from BCE loss. y: (B,) labels in {0, 1}.

        Returns (grads keyed 'dW{i}'/'db{i}', bn_grads keyed by hidden-layer
        index with (dgamma, dbeta) tuples). Gradients are batch-averaged.
        """
        B = y.shape[0]
        grads = {}
        bn_grads = {}
        n_hidden = len(self.bns)
        # Sigmoid + BCE combine to dL/dZ = p - y at the output layer.
        delta = cache['out'] - y[:, None]
        for i in range(n_hidden, -1, -1):
            if i < n_hidden:
                # delta holds dL/dA_i (layer i's post-dropout output);
                # undo dropout, ReLU, then BatchNorm, in that order.
                mask = cache[f'mask{i}']
                if mask is not None:
                    delta = delta * mask
                delta = delta * relu_grad(cache[f'Zbn{i}'])
                delta, dg, db = self.bns[i].backward(delta)
                bn_grads[i] = (dg, db)
            # delta is now dL/dZ_i for layer i.
            grads[f'dW{i}'] = delta.T @ cache[f'A{i}'] / B
            grads[f'db{i}'] = delta.mean(0)
            delta = delta @ self.params[f'W{i}']   # → dL/dA_{i-1}
        return grads, bn_grads
print("Network ready. Architecture: 2 → 128 → 128 → 64 → 1")

# 3. Stage 2 — Training Loop
net = SpiralNet(dims=(2, 128, 128, 64, 1), p_keep=0.85, seed=7)
opt = Adam(lr=3e-3)

# Training constants
EPOCHS = 500
BATCH = 64

def bce(p, y, eps=1e-10):
    """Binary cross-entropy; clips p away from 0/1 so log() stays finite."""
    p = np.clip(p, eps, 1 - eps)
    return float(-np.mean(y * np.log(p) + (1 - y) * np.log(1 - p)))

train_losses, test_losses, test_accs = [], [], []
rng = np.random.default_rng(0)
for epoch in range(EPOCHS):
    perm = rng.permutation(len(X_tr))   # fresh shuffle every epoch
    epoch_loss = 0.0
    n_batches = 0
    for start in range(0, len(X_tr), BATCH):
        idx = perm[start:start + BATCH]
        Xb, yb = X_tr[idx], y_tr[idx]
        out, cache = net.forward(Xb, training=True)
        cache['X_input'] = Xb   # kept for backward()'s no-hidden-layer fallback
        epoch_loss += bce(out.ravel(), yb)
        n_batches += 1
        grads, bn_grads = net.backward(yb, cache)
        # Single Adam step. grads is keyed 'dW0'/'db0'... while net.params
        # is keyed 'W0'/'b0'..., so strip the leading 'd'. (The original
        # passed the 'd'-keyed dict straight in — a guaranteed KeyError —
        # then duplicated the whole Adam update by hand.)
        opt.update(net.params, {k[1:]: g for k, g in grads.items()})
        # Plain SGD step on BatchNorm scale/shift parameters.
        for li, (dg, db_bn) in bn_grads.items():
            net.bns[li].gamma -= opt.lr * dg
            net.bns[li].beta -= opt.lr * db_bn
    train_losses.append(epoch_loss / n_batches)
    # Held-out evaluation: BN running stats, dropout disabled.
    out_te, _ = net.forward(X_te, training=False)
    test_losses.append(bce(out_te.ravel(), y_te))
    test_accs.append(float(np.mean((out_te.ravel() > 0.5) == y_te)))
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1:4d}: train_loss={train_losses[-1]:.4f} "
              f"test_loss={test_losses[-1]:.4f} test_acc={test_accs[-1]:.1%}")

# 4. Stage 3 — Visualise Results
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Panel 0: loss curves
axes[0].plot(train_losses, label='Train', color='#e74c3c', lw=2)
axes[0].plot(test_losses, label='Test', color='#3498db', lw=2)
axes[0].set_title('BCE Loss vs Epoch'); axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss'); axes[0].legend()

# Panel 1: test accuracy against the 95% target line
axes[1].plot(test_accs, color='#2ecc71', lw=2)
axes[1].axhline(0.95, color='gray', linestyle='--', label='95% target')
axes[1].set_title('Test Accuracy vs Epoch'); axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy'); axes[1].legend()

# Panel 2: decision boundary — evaluate the net on a dense grid
xx, yy = np.meshgrid(np.linspace(-1.2, 1.2, 300), np.linspace(-1.2, 1.2, 300))
grid = np.c_[xx.ravel(), yy.ravel()]
Z, _ = net.forward(grid, training=False)
Z = Z.reshape(xx.shape)
cf = axes[2].contourf(xx, yy, Z, levels=50, cmap='RdBu_r', alpha=0.85)
axes[2].contour(xx, yy, Z, levels=[0.5], colors='k', linewidths=2)  # p=0.5 boundary
for cls, col in [(0, '#e74c3c'), (1, '#3498db')]:
    m = y_te == cls
    axes[2].scatter(X_te[m, 0], X_te[m, 1], c=col, s=20, alpha=0.8, edgecolors='white', lw=0.5)
axes[2].set_title(f'Decision boundary\n(Test acc: {test_accs[-1]:.1%})')
axes[2].set_aspect('equal')
plt.colorbar(cf, ax=axes[2])

plt.suptitle('ch334: Neural Net from Scratch — Spiral Classification', fontsize=12)
plt.tight_layout()
plt.savefig('ch334_results.png', dpi=120)
plt.show()

# 5. Results & Reflection
What was built: A four-layer MLP with BatchNorm and Dropout, trained with Adam from scratch in NumPy — zero framework dependencies.
What math made it possible:
Forward pass matrix multiplications (ch304)
Chain rule through activation functions and BN (ch306, ch215)
Adam’s adaptive learning rates (ch312)
BatchNorm’s stable gradient flow (ch310)
Extension challenges:
Replace BatchNorm with Layer Norm and measure the difference.
Add L2 weight decay to Adam and compare final test accuracy.
Replace the fixed architecture with a search: try 5 different hidden sizes and report the one with lowest test loss.