Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

ch336 — Project: Character-Level Language Model

0. Overview

Problem: Train an LSTM to model character-level sequences from a small text corpus, then generate new text by sampling from the learned distribution.

Concepts used: RNN/LSTM (ch317–318), embeddings (ch320), cross-entropy loss (ch305), backprop through time (ch317), Adam (ch312), temperature sampling.

Expected output: a trained character-level language model that generates plausible English-like text, plus training curve and per-character loss analysis.

Difficulty: ★★★★☆ | Estimated time: 90 minutes

1. Setup

import numpy as np

# Tiny corpus — Pride and Prejudice opening (public domain)
corpus = """It is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood this truth is so well fixed in the minds of the surrounding families that he is considered as the rightful property of some one or other of their daughters. My dear Mr Bennet said his lady to him one day have you heard that Netherfield Park is let at last. Mr Bennet replied that he had not. But it is returned she for Mrs Long has just been here and she told me all about it. Mr Bennet made no answer. Do you not want to know who has taken it cried his wife impatiently. You want to tell me and I have no objection to hearing it. This was invitation enough. Why my dear you must know Mrs Long says that Netherfield is taken by a young man of large fortune from the north of England that he came down on Monday in a chaise and four to see the place and was so much delighted with it that he agreed with Mr Morris immediately that it is to be taken for Michaelmas and some of his servants are to be in the house by the end of next week."""

# Character vocabulary
# sorted(set(...)) yields a deterministic ordering, so the same corpus
# always produces the same character->index assignment across runs.
chars = sorted(set(corpus))
V = len(chars)  # vocabulary size (number of distinct characters)
c2i = {c:i for i,c in enumerate(chars)}  # character -> integer id
i2c = {i:c for i,c in enumerate(chars)}  # integer id -> character
data = np.array([c2i[c] for c in corpus])  # corpus encoded as an id sequence

print(f"Corpus length: {len(corpus)} chars")
print(f"Vocabulary size: {V}")
print(f"First 50 chars: {corpus[:50]!r}")

2. Stage 1 — LSTM Language Model

def sigmoid(z):
    """Logistic function; inputs are clipped to [-500, 500] so np.exp never overflows."""
    z_safe = np.clip(z, -500, 500)
    return 1.0 / (1.0 + np.exp(-z_safe))
def softmax(z):
    """Numerically stable softmax over a 1-D logit vector (shift by the max before exp)."""
    exp_shifted = np.exp(z - z.max())
    return exp_shifted / exp_shifted.sum()

class CharLSTM:
    """Character-level LSTM language model.

    One LSTM layer over learned character embeddings, followed by a linear
    projection to vocabulary logits. The four gate weight matrices
    (input, forget, output, candidate) are stored concatenated row-wise in
    ``W`` with shape (4*hidden_size, embed_dim + hidden_size).

    NOTE(review): ``generate`` reads the module-level ``c2i``/``i2c``
    lookup tables rather than receiving them as arguments — it assumes
    they match the vocabulary this model was constructed with.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, seed=0):
        """Initialise embedding, LSTM, and output-projection parameters.

        The forget-gate bias is set to 1 so the cell initially retains its
        state — the standard LSTM initialisation trick for trainability.
        """
        rng=np.random.default_rng(seed)
        self.V=vocab_size; self.E=embed_dim; self.H=hidden_size
        # Embedding table: one row per vocabulary character.
        self.embed=rng.normal(0,0.1,(vocab_size,embed_dim))
        # LSTM weights (all 4 gates concatenated, in order i, f, o, g)
        concat=embed_dim+hidden_size
        self.W=rng.normal(0,np.sqrt(1./concat),(4*hidden_size,concat))
        self.b=np.zeros(4*hidden_size); self.b[hidden_size:2*hidden_size]=1.  # forget gate bias
        # Output projection: hidden state -> vocabulary logits.
        self.W_out=rng.normal(0,np.sqrt(1./hidden_size),(vocab_size,hidden_size))
        self.b_out=np.zeros(vocab_size)

    def lstm_step(self, x_embed, h_prev, c_prev):
        """One LSTM time step.

        Returns (h, c, cache) where cache holds the intermediate values
        needed for a manual backward pass.
        """
        H=self.H
        xh=np.concatenate([x_embed,h_prev])
        gates=self.W@xh+self.b
        # Gate order matches the row layout of W: input, forget, output, candidate.
        i=sigmoid(gates[:H]); f=sigmoid(gates[H:2*H])
        o=sigmoid(gates[2*H:3*H]); g=np.tanh(gates[3*H:])
        c=f*c_prev+i*g; h=o*np.tanh(c)
        return h,c,(xh,i,f,o,g,c_prev,c,h)

    def forward(self, token_ids):
        """Run the LSTM over token_ids (list of ints) from a zero state.

        Returns (logits per step, caches); each logits entry has shape (V,).
        """
        h=np.zeros(self.H); c=np.zeros(self.H)
        logits=[]; caches=[]
        for tid in token_ids:
            x=self.embed[tid]
            h,c,cache=self.lstm_step(x,h,c)
            logit=self.W_out@h+self.b_out
            logits.append(logit); caches.append((tid,cache,h))
        return logits,caches

    def loss(self, token_ids):
        """Mean cross-entropy on next-character prediction (teacher forcing).

        Inputs are token_ids[:-1]; targets are token_ids[1:]. Returns 0.0
        for sequences too short to form a prediction pair.
        """
        if len(token_ids)<2: return 0.0
        logits,_=self.forward(token_ids[:-1])
        targets=token_ids[1:]
        total=0.
        for logit,target in zip(logits,targets):
            p=softmax(logit)
            total-=np.log(p[target]+1e-10)  # epsilon guards log(0)
        return total/len(targets)

    def generate(self, seed_text, n_chars=100, temperature=1.0, rng=None):
        """Sample n_chars characters, continuing seed_text.

        temperature < 1 sharpens the distribution (more conservative);
        temperature > 1 flattens it (more diverse).

        Bug fix: the priming loop previously consumed ALL of seed_text and
        the sampling loop then fed the final seed character a second time,
        corrupting the hidden state before the first sample. Now the loop
        primes with seed_text[:-1] and the final character is fed exactly
        once inside the sampling loop. Also guards against an empty seed,
        which previously raised IndexError.
        """
        if rng is None: rng=np.random.default_rng(0)
        h=np.zeros(self.H); c=np.zeros(self.H)
        # Prime the LSTM with all but the final seed character; characters
        # outside the vocabulary are skipped, as before.
        for ch in seed_text[:-1]:
            if ch in c2i:
                x=self.embed[c2i[ch]]
                h,c,_=self.lstm_step(x,h,c)
        out=seed_text
        # Sampling starts from the final seed character (id 0 as fallback
        # for an unknown or empty seed).
        tid=c2i.get(seed_text[-1],0) if seed_text else 0
        for _ in range(n_chars):
            x=self.embed[tid]
            h,c,_=self.lstm_step(x,h,c)
            logit=self.W_out@h+self.b_out
            p=softmax(logit/temperature)  # temperature-scaled distribution
            tid=rng.choice(self.V,p=p)
            out+=i2c[tid]
        return out

# Build the model and report its parameter budget, grouped by component.
model = CharLSTM(V, embed_dim=32, hidden_size=128, seed=0)
_n_embed = model.embed.size
_n_lstm = model.W.size + model.b.size
_n_out = model.W_out.size + model.b_out.size
print(f"Parameters: embed={_n_embed}, LSTM={_n_lstm}, "
      f"out={_n_out}, "
      f"total={_n_embed + _n_lstm + _n_out:,}")

3. Stage 2 — Training with Truncated BPTT

import matplotlib.pyplot as plt

# Training loop: truncated BPTT windows with a sparse NUMERICAL gradient
# update (central differences on a random subset of parameters). This is
# deliberately slow — it demonstrates correctness without an analytic
# backward pass. Each parameter update performs two full forward passes.
rng=np.random.default_rng(42)
SEQ_LEN=40; EPOCHS=200; lr=0.005  # window length, update steps, learning rate
losses=[]

for epoch in range(EPOCHS):
    # Sample random starting position
    start=rng.integers(0,len(data)-SEQ_LEN-1)
    # SEQ_LEN+1 ids so loss() gets SEQ_LEN (input, target) pairs.
    seq=data[start:start+SEQ_LEN+1].tolist()

    # Forward pass
    loss=model.loss(seq)
    losses.append(loss)

    # Numerical gradient update (sparse, for correctness demonstration)
    eps=1e-4
    all_params=[('embed',model.embed),('W',model.W),('b',model.b),
                ('W_out',model.W_out),('b_out',model.b_out)]

    for pname,P in all_params:
        # ravel() on these contiguous arrays returns a VIEW, so writes to
        # `flat` update the model parameters in place.
        flat=P.ravel()
        # Sample fraction of parameters for speed
        n_sample=max(1,len(flat)//50)
        indices=rng.choice(len(flat),n_sample,replace=False)
        for idx in indices:
            # Central difference: (L(w+eps) - L(w-eps)) / (2*eps).
            flat[idx]+=eps; lp=model.loss(seq)
            flat[idx]-=2*eps; lm=model.loss(seq)
            flat[idx]+=eps
            # Apply the SGD step immediately (coordinate-descent style:
            # later indices see the effect of earlier updates).
            flat[idx]-=lr*(lp-lm)/(2*eps)

    if (epoch+1)%50==0:
        # Periodic qualitative check: sample text from the current model.
        sample=model.generate('It is',n_chars=80,temperature=0.8,rng=rng)
        print(f"Epoch {epoch+1:3d}: loss={loss:.4f}")
        print(f"  Sample: {sample!r}\n")

# Plot the raw per-step loss plus a moving average, save, and display.
fig, ax = plt.subplots(figsize=(10, 4))
win = 10
loss_ma = np.convolve(losses, np.ones(win) / win, mode='valid')
ax.plot(losses, alpha=0.3, color='#3498db', lw=1)
ax.plot(loss_ma, color='#e74c3c', lw=2, label=f'{win}-step avg')
ax.set_title('Character-level LM training loss')
ax.set_xlabel('Step')
ax.set_ylabel('Cross-entropy')
ax.legend()
plt.tight_layout()
plt.savefig('ch336_charlm.png', dpi=120)
plt.show()

4. Stage 3 — Generation at Different Temperatures

# Show how sampling temperature trades determinism for diversity.
print("Generated text at different temperatures:\n")
rng2=np.random.default_rng(1)

for sampling_temp in (0.5, 1.0, 1.5):
    text = model.generate('Mr Bennet', n_chars=120, temperature=sampling_temp, rng=rng2)
    print(f"Temperature {sampling_temp}:")
    print(f"  {text!r}")
    print()

# Per-character loss analysis: score the model's prediction of each
# character given up to 20 characters of preceding context, then plot the
# 15 hardest-to-predict characters.
char_losses={}
for pos in range(len(data) - 1):
    window_ids = data[max(0, pos - 20):pos + 2].tolist()
    if len(window_ids) < 2:
        continue
    step_logits, _ = model.forward(window_ids[:-1])
    probs = softmax(step_logits[-1])
    target = window_ids[-1]
    nll = float(-np.log(probs[target] + 1e-10))
    char_losses.setdefault(i2c[target], []).append(nll)

avg_losses={ch: np.mean(v) for ch, v in char_losses.items()}
sorted_chars=sorted(avg_losses.items(), key=lambda kv: -kv[1])[:15]

fig, ax = plt.subplots(figsize=(10, 4))
labels = [repr(pair[0]) for pair in sorted_chars]
heights = [pair[1] for pair in sorted_chars]
ax.bar(labels, heights, color='#3498db', alpha=0.8)
ax.set_title('Per-character average loss (hardest to predict)')
ax.set_xlabel('Character')
ax.set_ylabel('Cross-entropy')
plt.tight_layout()
plt.savefig('ch336_per_char_loss.png', dpi=120)
plt.show()

5. Results & Reflection

What was built: A character-level LSTM language model trained via truncated BPTT, capable of generating text sequences that mimic the style of the training corpus.

What math made it possible:

  • LSTM gating for long-term memory (ch318)

  • Cross-entropy as the prediction loss (ch305)

  • Softmax temperature to control generation diversity

  • Embeddings mapping character indices to continuous vectors (ch320)

Extension challenges:

  1. Implement scheduled sampling: during training, occasionally feed the model's own sampled prediction as the next input instead of the true previous character, and compare convergence against pure teacher forcing (which the loss function above already uses).

  2. Train on a larger corpus (e.g., Project Gutenberg text) and observe when the loss plateaus.

  3. Visualise the LSTM’s hidden state over a sentence using a heatmap.