ch231 — Newton’s Method - Mathematics for Programmers

Advanced Calculus Experiment 1.

Gradient descent uses only first-order information (ch205). Newton’s method uses the second derivative (ch217) to jump directly toward the minimum of a quadratic approximation. For well-conditioned problems it converges quadratically — the number of correct digits doubles each iteration.

import numpy as np
import matplotlib.pyplot as plt

# Newton's method for minimisation: w_{t+1} = w_t - f''(w)^{-1} * f'(w)
# Example: minimise f(x) = x^4 - 2x^2 + 0.5x

def f(x): return x**4 - 2*x**2 + 0.5*x
def f1(x): return 4*x**3 - 4*x + 0.5
def f2(x): return 12*x**2 - 4

# Newton vs Gradient Descent from x=2.0
x0 = 2.0
n_steps = 10

x_n, x_g = x0, x0
newtons_path, gd_path = [x0], [x0]

lr_gd = 0.05
for _ in range(n_steps):
    # Newton step
    x_n = x_n - f1(x_n) / f2(x_n)
    newtons_path.append(x_n)
    # Gradient descent step
    x_g = x_g - lr_gd * f1(x_g)
    gd_path.append(x_g)

# True minimum
from scipy.optimize import minimize_scalar
result = minimize_scalar(f)
x_min = result.x

x_plot = np.linspace(-2, 2.2, 300)
fig, axes = plt.subplots(1, 2, figsize=(13, 4))
axes[0].plot(x_plot, f(x_plot), 'k', lw=2, label='f(x)')
axes[0].axvline(x_min, color='gray', ls=':', label=f'Minimum x={x_min:.4f}')
for path, label, color in [(newtons_path, "Newton's", 'blue'), (gd_path, 'GD', 'red')]:
    axes[0].plot(path, f(np.array(path)), 'o-', color=color, ms=5, lw=1.5, label=label)
axes[0].legend(); axes[0].grid(True, alpha=0.3); axes[0].set_title('Trajectories')

# Convergence in error
n_err = [abs(x - x_min) for x in newtons_path]
g_err = [abs(x - x_min) for x in gd_path]
axes[1].semilogy(n_err, 'b-o', label="Newton's (quadratic)")
axes[1].semilogy(g_err, 'r-s', label='GD (linear)')
axes[1].set_xlabel('Iteration'); axes[1].set_ylabel('|x - x*|'); axes[1].set_title('Convergence Rate')
axes[1].legend(); axes[1].grid(True, which='both', alpha=0.3)
plt.tight_layout(); plt.savefig('ch231_newton.png', dpi=100); plt.show()

print(f"True minimum: {x_min:.10f}")
print(f"Newton after {n_steps} steps: {newtons_path[-1]:.10f}")
print(f"GD after {n_steps} steps: {gd_path[-1]:.10f}")

True minimum: 0.9304029273
Newton after 10 steps: 0.9304029266
GD after 10 steps: 0.9229711790

Newton’s Method in Multiple Dimensions¶

In multiple dimensions, the scalar second derivative becomes the Hessian matrix H:

w_{t+1} = w_t - H^{-1} * gradient

This is exact for quadratic losses. For non-quadratic losses it is approximate but still converges much faster than gradient descent near the minimum.

# 2D Newton's method
def loss_2d(w): return (w[0]-1)**2 + 10*(w[1]-2)**2 + 0.5*w[0]*w[1]
def grad_2d(w): return np.array([2*(w[0]-1) + 0.5*w[1], 20*(w[1]-2) + 0.5*w[0]])
def hessian_2d(w): return np.array([[2, 0.5], [0.5, 20]])

w = np.array([-1.0, 4.0])
path_newton = [w.copy()]
for _ in range(20):
    g = grad_2d(w); H = hessian_2d(w)
    w = w - np.linalg.solve(H, g)
    path_newton.append(w.copy())

w_gd = np.array([-1.0, 4.0])
path_gd = [w_gd.copy()]
for _ in range(200):
    w_gd = w_gd - 0.04 * grad_2d(w_gd)
    path_gd.append(w_gd.copy())

w1r = np.linspace(-2, 3, 200); w2r = np.linspace(0, 5, 200)
W1, W2 = np.meshgrid(w1r, w2r)
Z = (W1-1)**2 + 10*(W2-2)**2 + 0.5*W1*W2

plt.figure(figsize=(8, 6))
plt.contour(W1, W2, np.log1p(Z), levels=20, cmap='viridis', alpha=0.5)
pn = np.array(path_newton); pg = np.array(path_gd)
plt.plot(pn[:, 0], pn[:, 1], 'b-o', ms=5, lw=2, label=f"Newton's (20 steps)")
plt.plot(pg[:, 0], pg[:, 1], 'r-', lw=1, alpha=0.7, label='GD (200 steps)')
plt.xlabel('w1'); plt.ylabel('w2'); plt.title("Newton's Method vs GD in 2D")
plt.legend(); plt.grid(True, alpha=0.2)
plt.tight_layout(); plt.savefig('ch231_newton_2d.png', dpi=100); plt.show()

Summary¶

Method	Convergence	Cost per Step	Best for
GD	Linear	O(d)	Large-scale, non-convex
Newton’s	Quadratic	O(d^3) (Hessian inversion)	Small-d, convex
L-BFGS	Super-linear	O(d·m) (approximation)	Medium-scale, smooth

Forward reference: Quasi-Newton methods and L-BFGS are the practical compromise used in ch291 — Optimisation Methods (Part IX).