Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

(Extends logistic regression from ch229; uses sigmoid from ch063; applies cross-entropy from ch287)

1. The Classification Problem

Given a feature vector, predict a class label from a finite set. Classification can be framed as regression whose output is constrained to a probability distribution over the classes — a point on the probability simplex.


2. Logistic Regression from Scratch

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(42)  # NOTE(review): seeded generator — unused in the code shown here; confirm before removing


def sigmoid(z):
    """Numerically stable element-wise sigmoid.

    Uses the identity sigmoid(z) = exp(-logaddexp(0, -z)). Unlike the
    np.where(z >= 0, ...) formulation, this never evaluates np.exp on a
    large positive argument, so no overflow warnings are emitted for
    large-magnitude inputs (np.where computes BOTH branches for every
    element before selecting).

    Parameters
    ----------
    z : array_like
        Logits, any shape (scalars are accepted and return a 0-d array,
        matching the original np.where behavior).

    Returns
    -------
    np.ndarray
        sigmoid(z) in (0, 1), same shape as z.
    """
    return np.exp(-np.logaddexp(0, -z))


class LogisticRegressionScratch:
    """Binary logistic regression trained by full-batch gradient descent.

    Minimizes mean binary cross-entropy plus an L2 penalty on the
    weights (the bias is not regularized).

    Parameters
    ----------
    lr : float
        Gradient-descent step size.
    n_iter : int
        Number of full-batch gradient steps.
    lam : float
        L2 regularization strength applied to the weight vector.
    """

    def __init__(self, lr=0.1, n_iter=500, lam=0.01):
        self.lr = lr
        self.n_iter = n_iter
        self.lam = lam
        self.w = None      # weight vector (p,), set by fit()
        self.b = 0.0       # intercept
        self.losses = []   # per-iteration training-loss history

    def fit(self, X, y):
        """Fit on X of shape (n, p) and binary labels y in {0, 1}.

        Returns self to allow chained calls.
        """
        n, p = X.shape
        self.w = np.zeros(p)
        self.b = 0.0
        # Bug fix: reset the history so a second fit() call does not
        # append onto the losses recorded by a previous fit.
        self.losses = []
        eps = 1e-15  # clip floor for log(); loop-invariant, hoisted
        for _ in range(self.n_iter):
            y_hat = sigmoid(X @ self.w + self.b)
            err = y_hat - y  # dBCE/dlogit for the sigmoid + BCE pairing
            # Loss of the current iterate (same y_hat used for the step,
            # identical values to the original ordering).
            self.losses.append(-np.mean(
                y * np.log(np.clip(y_hat, eps, 1)) +
                (1 - y) * np.log(np.clip(1 - y_hat, eps, 1))
            ))
            self.w -= self.lr * (X.T @ err / n + self.lam * self.w)
            self.b -= self.lr * err.mean()
        return self

    def predict_proba(self, X):
        """Return P(y = 1 | x) for each row of X."""
        return sigmoid(X @ self.w + self.b)

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 labels by thresholding predict_proba."""
        return (self.predict_proba(X) >= threshold).astype(int)


# Synthetic binary problem: 10 features, 5 of them informative.
X, y = make_classification(n_samples=500, n_features=10, n_informative=5, random_state=42)
# Standardize to zero mean / unit variance so one learning rate suits all weights.
# NOTE(review): statistics are computed on the FULL dataset before the split
# below — mild train/test leakage; fine for a demo, not for real evaluation.
X = (X - X.mean(0)) / X.std(0)
# Hold out the last 100 rows as a test set (make_classification shuffles rows).
X_tr, X_te = X[:-100], X[-100:]
y_tr, y_te = y[:-100], y[-100:]

model = LogisticRegressionScratch(lr=0.5, n_iter=300, lam=0.01)
model.fit(X_tr, y_tr)

# Compare held-out accuracy of the scratch model against sklearn's solver.
acc = (model.predict(X_te) == y_te).mean()
# NOTE(review): C = 1/lam is only an approximate correspondence — sklearn
# penalizes the summed (not mean) loss, so an exact match would be C = 1/(lam*n).
lr_sk = LogisticRegression(C=1/0.01, max_iter=1000)
lr_sk.fit(X_tr, y_tr)
acc_sk = lr_sk.score(X_te, y_te)

print(f'Manual accuracy:  {acc:.4f}')
print(f'Sklearn accuracy: {acc_sk:.4f}')

# Training-loss curve: should decrease monotonically for a well-chosen lr.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(model.losses, color='steelblue', lw=2)
ax.set_xlabel('Iteration'); ax.set_ylabel('Binary Cross-Entropy')
ax.set_title('Training Loss Convergence')
plt.tight_layout()
plt.show()
Manual accuracy:  0.8500
Sklearn accuracy: 0.8500
<Figure size 800x400 with 1 Axes>

3. Decision Boundaries

# 2-D dataset so decision boundaries can be drawn directly in feature space.
X_2d, y_2d = make_classification(
    n_samples=300, n_features=2, n_redundant=0, n_informative=2,
    random_state=0, n_clusters_per_class=2
)

# Three models with increasingly flexible boundaries: linear (logistic
# regression), local/non-parametric (k-NN), axis-aligned splits (tree).
classifiers = [
    ('Logistic Regression', LogisticRegression(C=1, max_iter=1000)),
    ('k-NN (k=5)',          KNeighborsClassifier(n_neighbors=5)),
    ('Decision Tree (d=4)', DecisionTreeClassifier(max_depth=4, random_state=0)),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# 300x300 evaluation grid padded by 0.5 around the data's bounding box.
x0_min, x0_max = X_2d[:, 0].min() - 0.5, X_2d[:, 0].max() + 0.5
x1_min, x1_max = X_2d[:, 1].min() - 0.5, X_2d[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 300),
                      np.linspace(x1_min, x1_max, 300))

for ax, (name, clf) in zip(axes, classifiers):
    clf.fit(X_2d, y_2d)
    # Predict a label at every grid point, then reshape back to the grid
    # so contourf can shade each classifier's decision regions.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdBu')
    ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap='RdBu',
               edgecolors='k', s=20, lw=0.5)
    # NOTE(review): this is TRAINING accuracy — the more flexible models
    # (k-NN, tree) will look better here than they would on held-out data.
    ax.set_title(f'{name}\nTrain acc={clf.score(X_2d, y_2d):.3f}')

plt.suptitle('Decision Boundaries — Different Classifiers', fontsize=11)
plt.tight_layout()
plt.show()
<Figure size 1500x400 with 3 Axes>

4. Softmax for Multiclass

def softmax(Z):
    """Row-wise softmax, using the max-shift trick for numerical stability.

    Subtracting each row's maximum before exponentiating leaves the
    result unchanged (softmax is shift-invariant per row) but prevents
    overflow for large logits.
    """
    exp_shifted = np.exp(Z - np.max(Z, axis=1, keepdims=True))
    return exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)

# Two rows of logits -> two independent probability distributions.
Z = np.array([[2.0, 1.0, 0.5],
              [1.0, 3.0, 0.1]])
probs = softmax(Z)

print('Softmax output:')
print(np.round(probs, 4))
print('Row sums (must be 1):', probs.sum(axis=1))
print()
print('Softmax + cross-entropy = standard multiclass classification loss.')
print('This is the output layer of any classification neural network.')
Softmax output:
[[0.6285 0.2312 0.1402]
 [0.1137 0.8401 0.0462]]
Row sums (must be 1): [1. 1.]

Softmax + cross-entropy = standard multiclass classification loss.
This is the output layer of any classification neural network.

5. What Comes Next

Logistic regression is a single linear layer followed by sigmoid/softmax. ch295 — Neural Network Math Review stacks these layers with non-linear activations and trains via backpropagation — the chain rule applied through the computation graph. The softmax + cross-entropy combination here is exactly the final layer of a classification neural network.