Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

ch334 — Project: Neural Net from Scratch (NumPy only)

0. Overview

Problem: Train a deep neural network to classify the two-spiral dataset from the Part X introduction — a benchmark that no linear model can solve.

Concepts used: forward pass (ch304), backpropagation (ch306), He initialisation (ch308), activation functions (ch309), batch normalisation (ch310), dropout (ch311), Adam optimiser (ch312). SGD with momentum (ch307) and learning rate scheduling (ch313) are related background; the project itself trains with Adam at a fixed learning rate.

Expected output: trained classifier with >95% accuracy on held-out spiral data, plus training curve and decision boundary visualisation.

Difficulty: ★★★☆☆ | Estimated time: 60–90 minutes

1. Setup

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Seed NumPy's legacy global RNG for reproducibility; note the data generator
# and the network below each use their own seeded np.random.default_rng.
np.random.seed(42)

# ── Data: two-spiral dataset ──
def make_spirals(n: int = 300, noise: float = 0.08, seed: int = 42) -> tuple:
    """Return a shuffled two-spiral dataset: X of shape (2n, 2) and 0/1 labels y.

    Each class is one spiral arm of n points; the second arm is the first
    rotated by pi. Independent Gaussian noise is added to each coordinate.
    """
    rng = np.random.default_rng(seed)

    def spiral_arm(count, phase):
        # Angle sweeps two full turns; radius grows linearly with the angle.
        angle = np.linspace(0, 4 * np.pi, count) + phase
        radius = angle / (4 * np.pi)
        xs = radius * np.cos(angle) + rng.normal(0, noise, count)
        ys = radius * np.sin(angle) + rng.normal(0, noise, count)
        return np.stack([xs, ys], axis=1)

    arm_a = spiral_arm(n, 0)
    arm_b = spiral_arm(n, np.pi)
    X = np.vstack([arm_a, arm_b])
    y = np.array([0] * n + [1] * n)
    order = rng.permutation(len(y))
    return X[order], y[order]

X, y = make_spirals(300)
# Hold out the final 20% as a test set (rows are already shuffled by make_spirals).
split = int(0.8 * len(y))
X_tr, y_tr = X[:split], y[:split]
X_te, y_te = X[split:], y[split:]
print(f"Train: {X_tr.shape}, Test: {X_te.shape}")

# Scatter the training points, one colour per class.
fig, ax = plt.subplots(figsize=(5, 5))
for cls, col in [(0, '#e74c3c'), (1, '#3498db')]:
    sel = y_tr == cls
    ax.scatter(X_tr[sel, 0], X_tr[sel, 1], c=col, s=15, alpha=0.7,
               label=f'Class {cls}')
ax.set_title('Two-spiral training data')
ax.set_aspect('equal')
ax.legend()
plt.tight_layout()
plt.savefig('ch334_data.png', dpi=100)
plt.show()

2. Stage 1 — Build the Network

# ── Activation functions ──
def relu(z):
    """Element-wise rectified linear unit: max(0, z)."""
    return np.maximum(0, z)

def relu_grad(z):
    """Derivative of ReLU w.r.t. its pre-activation input: 1 where z > 0, else 0."""
    return (z > 0).astype(float)

def sigmoid(z):
    """Numerically safe logistic function; clips z so exp cannot overflow."""
    return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

# ── Batch Normalisation ──
class BatchNorm:
    """Per-feature batch normalisation with a learnable affine (gamma, beta).

    Training mode normalises with batch statistics and updates running
    estimates; eval mode normalises with the running estimates instead.
    """

    def __init__(self, n, eps=1e-5, momentum=0.1):
        self.gamma = np.ones(n)          # learnable scale
        self.beta = np.zeros(n)          # learnable shift
        self.running_mean = np.zeros(n)  # EMA of batch means, used in eval mode
        self.running_var = np.ones(n)    # EMA of batch variances, used in eval mode
        self.eps = eps
        self.momentum = momentum
        self.cache = None                # intermediates saved for backward()

    def forward(self, Z, training=True):
        """Normalise Z of shape (batch, n); caches intermediates when training."""
        if not training:
            normed = (Z - self.running_mean) / np.sqrt(self.running_var + self.eps)
            return self.gamma * normed + self.beta
        batch_mean = Z.mean(0)
        batch_var = Z.var(0)
        normed = (Z - batch_mean) / np.sqrt(batch_var + self.eps)
        # Exponential moving averages of the batch statistics.
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var
        self.cache = (Z, normed, batch_mean, batch_var)
        return self.gamma * normed + self.beta

    def backward(self, dY):
        """Return (dZ, dgamma, dbeta) given the upstream gradient dY."""
        Z, Z_hat, mu, var = self.cache
        B = Z.shape[0]
        dgamma = (dY * Z_hat).sum(0)
        dbeta = dY.sum(0)
        dZ_hat = dY * self.gamma
        inv_std = 1. / np.sqrt(var + self.eps)
        # Fused BN backward: mean/variance chain-rule terms folded into one expression.
        dZ = inv_std / B * (B * dZ_hat - dZ_hat.sum(0) - Z_hat * (dZ_hat * Z_hat).sum(0))
        return dZ, dgamma, dbeta

# ── Adam Optimiser ──
class Adam:
    """Adam optimiser with bias-corrected first and second moment estimates."""

    def __init__(self, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
        self.lr = lr      # base step size
        self.b1 = b1      # first-moment (mean) decay rate
        self.b2 = b2      # second-moment (uncentred variance) decay rate
        self.eps = eps    # denominator stabiliser
        self.t = 0        # global step counter for bias correction
        self.m = {}       # per-parameter first moments, lazily created
        self.v = {}       # per-parameter second moments, lazily created

    def update(self, params, grads):
        """Apply one Adam step in place; grads must be keyed like params."""
        self.t += 1
        for name in params:
            if name not in self.m:
                self.m[name] = np.zeros_like(params[name])
                self.v[name] = np.zeros_like(params[name])
            g = grads[name]
            self.m[name] = self.b1 * self.m[name] + (1 - self.b1) * g
            self.v[name] = self.b2 * self.v[name] + (1 - self.b2) * g ** 2
            m_hat = self.m[name] / (1 - self.b1 ** self.t)
            v_hat = self.v[name] / (1 - self.b2 ** self.t)
            params[name] -= self.lr / (np.sqrt(v_hat) + self.eps) * m_hat

# ── Network definition ──
class SpiralNet:
    """3-hidden-layer MLP with BatchNorm, ReLU, and Dropout.

    Hidden layers: Linear → BatchNorm → ReLU → inverted Dropout.
    Output layer: Linear → sigmoid (binary classification).
    Weights use He initialisation (std = sqrt(2 / fan_in)), suited to ReLU.
    """

    def __init__(self, dims=(2, 128, 128, 64, 1), p_keep=0.8, seed=0):
        rng = np.random.default_rng(seed)
        self.params = {}
        self.bns = []          # one BatchNorm per hidden layer
        self.p_keep = p_keep   # dropout keep probability
        for i in range(len(dims) - 1):
            fi, fo = dims[i], dims[i + 1]
            self.params[f'W{i}'] = rng.normal(0, np.sqrt(2.0 / fi), (fo, fi))
            self.params[f'b{i}'] = np.zeros(fo)
            if i < len(dims) - 2:  # no BN on output layer
                self.bns.append(BatchNorm(fo))

    def forward(self, X, training=True):
        """X: (B, 2). Returns (B, 1) output and a cache for backward().

        The cache stores A{i}, the input activation of EVERY layer i
        (cache['A0'] is X itself) — including the output layer, which the
        original version failed to cache, making its weight gradient wrong.
        Hidden layers additionally cache pre-BN logits, post-BN values and
        dropout masks.
        """
        rng = np.random.default_rng()  # unseeded: a fresh dropout mask per call
        cache = {}
        A = X
        n_hidden = len(self.bns)
        for i in range(n_hidden + 1):
            W = self.params[f'W{i}']; b = self.params[f'b{i}']
            cache[f'A{i}'] = A  # input to layer i, consumed by dW{i} in backward
            Z = A @ W.T + b
            if i < n_hidden:
                Z_bn = self.bns[i].forward(Z, training)
                A_new = relu(Z_bn)
                if training:
                    # Inverted dropout: scale kept units by 1/p_keep so the
                    # expected activation matches eval mode (no rescale needed there).
                    mask = (rng.random(A_new.shape) < self.p_keep) / self.p_keep
                    A_new = A_new * mask
                else:
                    mask = None
                cache[f'Z{i}'] = Z; cache[f'Zbn{i}'] = Z_bn
                cache[f'mask{i}'] = mask
            else:
                A_new = sigmoid(Z)
            A = A_new
        cache['out'] = A
        return A, cache

    def backward(self, y, cache):
        """Backpropagate BCE loss; y: (B,) of 0/1 labels.

        Returns (grads, bn_grads): grads maps 'dW{i}'/'db{i}' to batch-mean
        gradients; bn_grads maps hidden-layer index to (dgamma, dbeta).
        """
        B = y.shape[0]
        grads = {}; bn_grads = {}
        # Sigmoid + BCE: the output-layer gradient simplifies to p - y.
        dA = cache['out'] - y[:, None]
        n_hidden = len(self.bns)
        for i in range(n_hidden, -1, -1):
            W = self.params[f'W{i}']
            A_prev = cache[f'A{i}']  # the activation that fed this layer
            if i < n_hidden:
                # Reverse dropout, ReLU and BatchNorm in that order.
                mask = cache[f'mask{i}']
                if mask is not None:
                    dA = dA * mask
                dZ_act = dA * relu_grad(cache[f'Zbn{i}'])
                dZ, dg, db_bn = self.bns[i].backward(dZ_act)
                bn_grads[i] = (dg, db_bn)
            else:
                dZ = dA  # output layer has no BN/ReLU/dropout
            grads[f'dW{i}'] = dZ.T @ A_prev / B
            grads[f'db{i}'] = dZ.mean(0)
            dA = dZ @ W  # gradient w.r.t. this layer's input activations
        return grads, bn_grads

print("Network ready. Architecture: 2 → 128 → 128 → 64 → 1")

3. Stage 2 — Training Loop

net = SpiralNet(dims=(2,128,128,64,1), p_keep=0.85, seed=7)
opt = Adam(lr=3e-3)

# Training constants
EPOCHS = 500; BATCH = 64

def bce(p, y, eps=1e-10):
    """Binary cross-entropy; clips p away from {0, 1} so log stays finite."""
    p = np.clip(p, eps, 1-eps)
    return float(-np.mean(y*np.log(p) + (1-y)*np.log(1-p)))

train_losses, test_losses, test_accs = [], [], []
rng = np.random.default_rng(0)

for epoch in range(EPOCHS):
    # Fresh shuffle each epoch; slice the permutation into mini-batches.
    perm = rng.permutation(len(X_tr))
    epoch_loss = 0.0; n_batches = 0
    for start in range(0, len(X_tr), BATCH):
        idx = perm[start:start+BATCH]
        Xb, yb = X_tr[idx], y_tr[idx]
        out, cache = net.forward(Xb, training=True)
        cache['X_input'] = Xb  # input batch, kept alongside the layer caches
        loss = bce(out.ravel(), yb)
        epoch_loss += loss; n_batches += 1
        grads, bn_grads = net.backward(yb, cache)
        # Single Adam step. Gradients come back keyed 'dW{i}'/'db{i}' while
        # Adam.update indexes by the parameter names ('W{i}'/'b{i}'), so remap.
        # (The original code passed `grads` straight through — a KeyError —
        # and then duplicated the whole Adam update by hand.)
        opt.update(net.params, {k: grads['d' + k] for k in net.params})
        # BatchNorm affine parameters use plain SGD at the same base lr.
        for li, (dg, db_bn) in bn_grads.items():
            net.bns[li].gamma -= opt.lr * dg
            net.bns[li].beta  -= opt.lr * db_bn

    train_losses.append(epoch_loss / n_batches)
    # Evaluate on the held-out split: BN running stats, no dropout.
    out_te, _ = net.forward(X_te, training=False)
    test_losses.append(bce(out_te.ravel(), y_te))
    test_accs.append(float(np.mean((out_te.ravel() > 0.5) == y_te)))

    if (epoch+1) % 100 == 0:
        print(f"Epoch {epoch+1:4d}: train_loss={train_losses[-1]:.4f} "
              f"test_loss={test_losses[-1]:.4f} test_acc={test_accs[-1]:.1%}")

4. Stage 3 — Visualise Results

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
loss_ax, acc_ax, bnd_ax = axes

# Panel 1: loss curves for both splits.
loss_ax.plot(train_losses, label='Train', color='#e74c3c', lw=2)
loss_ax.plot(test_losses, label='Test', color='#3498db', lw=2)
loss_ax.set_title('BCE Loss vs Epoch')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Panel 2: test accuracy against the 95% target line.
acc_ax.plot(test_accs, color='#2ecc71', lw=2)
acc_ax.axhline(0.95, color='gray', linestyle='--', label='95% target')
acc_ax.set_title('Test Accuracy vs Epoch')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

# Panel 3: decision boundary — evaluate the net on a dense grid in eval mode.
axis_pts = np.linspace(-1.2, 1.2, 300)
xx, yy = np.meshgrid(axis_pts, axis_pts)
grid = np.c_[xx.ravel(), yy.ravel()]
Z, _ = net.forward(grid, training=False)
Z = Z.reshape(xx.shape)

cf = bnd_ax.contourf(xx, yy, Z, levels=50, cmap='RdBu_r', alpha=0.85)
bnd_ax.contour(xx, yy, Z, levels=[0.5], colors='k', linewidths=2)
for cls, col in [(0, '#e74c3c'), (1, '#3498db')]:
    sel = y_te == cls
    bnd_ax.scatter(X_te[sel, 0], X_te[sel, 1], c=col, s=20, alpha=0.8,
                   edgecolors='white', lw=0.5)
bnd_ax.set_title(f'Decision boundary\n(Test acc: {test_accs[-1]:.1%})')
bnd_ax.set_aspect('equal')
plt.colorbar(cf, ax=bnd_ax)

plt.suptitle('ch334: Neural Net from Scratch — Spiral Classification', fontsize=12)
plt.tight_layout()
plt.savefig('ch334_results.png', dpi=120)
plt.show()

5. Results & Reflection

What was built: A four-layer MLP with BatchNorm and Dropout, trained with Adam from scratch in NumPy — zero framework dependencies.

What math made it possible:

  • Forward pass matrix multiplications (ch304)

  • Chain rule through activation functions and BN (ch306, ch215)

  • Adam’s adaptive learning rates (ch312)

  • BatchNorm’s stable gradient flow (ch310)

Extension challenges:

  1. Replace BatchNorm with Layer Norm and measure the difference.

  2. Add L2 weight decay to Adam and compare final test accuracy.

  3. Replace the fixed architecture with a search: try 5 different hidden sizes and report the one with lowest test loss.