0. Overview¶
Problem statement: Gradient descent is the engine of all learned models. But watching a loss curve decline tells you little about why it behaves as it does — why certain learning rates diverge, why momentum helps in ravines, why Adam adapts. This project builds a self-contained visualizer that makes the geometry of gradient descent legible.
Concepts applied:
Gradient computation and backpropagation (ch209, ch216)
Gradient descent variants (ch212, ch227)
Optimization landscapes: saddle points, ravines (ch213, ch214)
Taylor series: quadratic approximation of loss (ch219)
Expected output: Animated trajectory plots on several 2D loss surfaces, with side-by-side optimizer comparison.
Difficulty: Intermediate. Estimated time: 60–90 min.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
np.random.seed(42)
print("Setup complete.")
Setup complete.
1. Setup — Loss Surface Library¶
# Define a set of canonical 2D loss surfaces for visualisation
def quadratic(w):
w1, w2 = w
return (w1 - 1)**2 + 5*(w2 - 1)**2
def grad_quadratic(w):
w1, w2 = w
return np.array([2*(w1 - 1), 10*(w2 - 1)])
def rosenbrock(w):
x, y = w
return (1 - x)**2 + 100*(y - x**2)**2
def grad_rosenbrock(w):
x, y = w
dx = -2*(1 - x) - 400*x*(y - x**2)
dy = 200*(y - x**2)
return np.array([dx, dy])
def saddle(w):
x, y = w
return x**2 - y**2
def grad_saddle(w):
x, y = w
return np.array([2*x, -2*y])
surfaces = {
'Quadratic (ideal)': (quadratic, grad_quadratic, [2.5, 2.5], (-0.5, 3.5), (-0.5, 3.5)),
'Rosenbrock (valley)': (rosenbrock, grad_rosenbrock, [-1.0, 2.0], (-2, 2), (-0.5, 3.5)),
'Saddle Point': (saddle, grad_saddle, [1.5, 0.1], (-2.5, 2.5), (-2.5, 2.5)),
}
print("Loss surfaces defined:", list(surfaces.keys()))
Loss surfaces defined: ['Quadratic (ideal)', 'Rosenbrock (valley)', 'Saddle Point']
2. Stage 1 — Optimizer Implementations¶
class OptimizerBase:
def __init__(self, lr):
self.lr = lr
self.state = {}
def step(self, w, grad):
raise NotImplementedError
class SGD(OptimizerBase):
def step(self, w, grad):
return w - self.lr * grad
class MomentumSGD(OptimizerBase):
def __init__(self, lr, mu=0.9):
super().__init__(lr); self.mu = mu
def step(self, w, grad):
v = self.state.get('v', np.zeros_like(w))
v = self.mu * v + grad
self.state['v'] = v
return w - self.lr * v
class Adam(OptimizerBase):
def __init__(self, lr=0.1, beta1=0.9, beta2=0.999, eps=1e-8):
super().__init__(lr)
self.beta1, self.beta2, self.eps = beta1, beta2, eps
def step(self, w, grad):
t = self.state.get('t', 0) + 1
m = self.beta1 * self.state.get('m', np.zeros_like(w)) + (1 - self.beta1) * grad
v = self.beta2 * self.state.get('v', np.zeros_like(w)) + (1 - self.beta2) * grad**2
self.state.update({'t': t, 'm': m, 'v': v})
m_hat = m / (1 - self.beta1**t)
v_hat = v / (1 - self.beta2**t)
return w - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
def run_optimizer(opt, grad_fn, w0, n_steps=200):
w = w0.copy().astype(float)
path = [w.copy()]
for _ in range(n_steps):
g = grad_fn(w)
w = opt.step(w, g)
path.append(w.copy())
return np.array(path)
print("Optimizers: SGD, MomentumSGD, Adam")
Optimizers: SGD, MomentumSGD, Adam
3. Stage 2 — Trajectory Visualisation¶
def plot_trajectories(surface_name, n_steps=150, n_contours=30):
loss_fn, grad_fn, w0, xlim, ylim = surfaces[surface_name]
# Build contour grid
xs = np.linspace(*xlim, 200)
ys = np.linspace(*ylim, 200)
X, Y = np.meshgrid(xs, ys)
Z = np.vectorize(lambda x, y: loss_fn([x, y]))(X, Y)
Z_log = np.log1p(np.clip(Z, 0, None)) # log scale for better visual
optimizers = {
'SGD (lr=0.05)': SGD(lr=0.05),
'Momentum (lr=0.05)': MomentumSGD(lr=0.05),
'Adam (lr=0.1)': Adam(lr=0.1),
}
colors = ['red', 'blue', 'green']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax in axes:
ax.contour(X, Y, Z_log, levels=n_contours, cmap='plasma', alpha=0.5)
ax.set_xlim(xlim); ax.set_ylim(ylim)
ax.set_xlabel('w1'); ax.set_ylabel('w2')
axes[0].set_title(f'{surface_name} — Trajectories')
axes[1].set_title('Loss over Iterations')
for (name, opt), color in zip(optimizers.items(), colors):
path = run_optimizer(opt, grad_fn, np.array(w0), n_steps)
losses = [loss_fn(w) for w in path]
axes[0].plot(path[:, 0], path[:, 1], '-', color=color, lw=1.5, alpha=0.8, label=name)
axes[0].plot(path[0, 0], path[0, 1], 'o', color=color, ms=8)
axes[1].semilogy(losses, color=color, lw=2, label=name)
axes[0].legend(fontsize=8); axes[1].legend(fontsize=8)
axes[1].set_xlabel('Step'); axes[1].set_ylabel('Loss (log)')
axes[1].grid(True, which='both', alpha=0.3)
plt.suptitle(surface_name, fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig(f"ch228_{surface_name.replace(' ', '_').replace('(', '').replace(')', '')}.png", dpi=100)
plt.show()
for name in surfaces:
plot_trajectories(name)

C:\Users\user\AppData\Local\Temp\ipykernel_24116\2162303551.py:17: RuntimeWarning: overflow encountered in scalar power
dx = -2*(1 - x) - 400*x*(y - x**2)
C:\Users\user\AppData\Local\Temp\ipykernel_24116\2162303551.py:18: RuntimeWarning: overflow encountered in scalar power
dy = 200*(y - x**2)
C:\Users\user\AppData\Local\Temp\ipykernel_24116\2162303551.py:17: RuntimeWarning: invalid value encountered in scalar subtract
dx = -2*(1 - x) - 400*x*(y - x**2)
C:\Users\user\AppData\Local\Temp\ipykernel_24116\2162303551.py:18: RuntimeWarning: invalid value encountered in scalar subtract
dy = 200*(y - x**2)
C:\Users\user\AppData\Local\Temp\ipykernel_24116\2162303551.py:13: RuntimeWarning: overflow encountered in scalar power
return (1 - x)**2 + 100*(y - x**2)**2
C:\Users\user\AppData\Local\Temp\ipykernel_24116\2162303551.py:13: RuntimeWarning: invalid value encountered in scalar subtract
return (1 - x)**2 + 100*(y - x**2)**2


4. Stage 3 — Learning Rate Sensitivity Study¶
# How does learning rate affect convergence on a simple quadratic?
loss_fn, grad_fn, w0_quad, xlim, ylim = surfaces['Quadratic (ideal)']
lr_values = [0.01, 0.05, 0.1, 0.18, 0.22, 0.25]
n_steps = 100
fig, axes = plt.subplots(2, 3, figsize=(14, 8))
xs = np.linspace(*xlim, 200)
ys = np.linspace(*ylim, 200)
X, Y = np.meshgrid(xs, ys)
Z = np.vectorize(lambda x, y: loss_fn([x, y]))(X, Y)
for ax, lr in zip(axes.flat, lr_values):
ax.contour(X, Y, np.log1p(Z), levels=20, cmap='plasma', alpha=0.5)
path = run_optimizer(SGD(lr=lr), grad_fn, np.array(w0_quad), n_steps)
ax.plot(path[:, 0], path[:, 1], 'r.-', ms=3, lw=1)
ax.plot(path[0, 0], path[0, 1], 'go', ms=8, label='Start')
ax.set_title(f'lr = {lr}'); ax.set_xlim(xlim); ax.set_ylim(ylim)
ax.legend(fontsize=8); ax.grid(True, alpha=0.2)
plt.suptitle('Learning Rate Sensitivity — SGD on Quadratic', fontsize=12, fontweight='bold')
plt.tight_layout(); plt.savefig('ch228_lr_sensitivity.png', dpi=100); plt.show()

5. Results & Reflection¶
What was built: A trajectory visualizer that shows gradient descent paths on three canonical loss surfaces — a well-conditioned quadratic, the notoriously difficult Rosenbrock valley, and a saddle point.
What math made it possible:
Gradients (ch209) give the direction of steepest ascent at each point
The chain rule (ch215) lets us compute those gradients analytically
Momentum and Adam (ch227) accumulate gradient history to overcome oscillation
Quadratic approximation (ch219) explains why learning rate > 2/L causes divergence
Extension challenges:
Add Nesterov momentum (look-ahead gradient evaluation) and compare its path on the Rosenbrock surface to standard momentum.
Implement a learning rate scheduler (e.g., cosine annealing) and observe its effect on final convergence.
Add a 3D surface plot that rotates in sync with the 2D trajectory animation.