Complete Code
Here is the complete source code for text generation, along with all files from previous chapters.
generate.ts
/**
* Inference / Text Generation
*
* Starting from the BOS token, the model predicts one token at a time:
* 1. Forward the current token through the model to get logits
* 2. Apply temperature scaling (lower = more conservative, higher = more creative)
* 3. Apply top-k filtering (keep only the k most likely tokens)
* 4. Apply top-p / nucleus filtering (keep smallest set summing to p)
* 5. Convert to probabilities via softmax
* 6. Randomly sample the next token from that distribution
* 7. Stop when BOS is produced (end of sequence) or max length is reached
*/
import { type Model, gpt, createKVCache } from "./model.js";
import { weightedChoice } from "./rng.js";
import type { Tokenizer } from "./tokenizer.js";
export interface GenerateOptions {
  /** Softmax temperature: lower = more conservative, higher = more creative. Default 0.8. */
  temperature?: number;
  /** Keep only the k most likely tokens before sampling; 0 disables the filter. Default 0. */
  topK?: number;
  /** Nucleus sampling: keep the smallest token set whose probability mass exceeds p; 1.0 disables. */
  topP?: number;
}
/**
 * Numerically stable softmax over plain numbers.
 * Entries masked to -Infinity get probability exactly 0.
 */
function stableSoftmax(scores: number[]): number[] {
  const maxS = Math.max(...scores.filter((s) => s !== -Infinity));
  const exps = scores.map((s) => (s === -Infinity ? 0 : Math.exp(s - maxS)));
  const total = exps.reduce((a, b) => a + b, 0);
  return exps.map((e) => e / total);
}

/**
 * Generate new text samples from a trained model.
 *
 * @param model - Trained model to sample from.
 * @param tokenizer - Supplies the BOS id and decodes sampled token ids.
 * @param numSamples - How many independent samples to draw.
 * @param options - Sampling knobs (temperature / top-k / top-p).
 * @returns One decoded string per sample.
 */
export function generate(
  model: Model,
  tokenizer: Tokenizer,
  numSamples: number,
  options: GenerateOptions = {},
): string[] {
  const { temperature = 0.8, topK = 0, topP = 1.0 } = options;
  // Guard: temperature <= 0 would divide by zero and fill the scores with
  // NaN/Infinity. Clamping to a tiny positive value makes sampling
  // effectively greedy (argmax) instead of silently degenerate.
  const temp = Math.max(temperature, 1e-9);
  const samples: string[] = [];
  for (let i = 0; i < numSamples; i++) {
    // Fresh KV cache per sequence — gpt() appends to it at every position
    const { keys, values } = createKVCache(model);
    let tokenId = tokenizer.BOS;
    const tokens: number[] = [];
    for (let posId = 0; posId < model.config.blockSize; posId++) {
      const logits = gpt(model, tokenId, posId, keys, values);
      // Temperature scaling: lower = sharper distribution
      let scores = logits.map((l) => l.data / temp);
      // Top-k: keep only the k highest scores.
      // NOTE: ties at the cutoff are all kept, so slightly more than k
      // candidates may survive when scores are exactly equal.
      if (topK > 0 && topK < scores.length) {
        const sorted = [...scores].sort((a, b) => b - a);
        const cutoff = sorted[topK - 1];
        scores = scores.map((s) => (s >= cutoff ? s : -Infinity));
      }
      // Top-p (nucleus): keep the smallest high-probability set whose
      // cumulative mass exceeds p; everything after the crossing point
      // (in descending-score order) is masked out.
      if (topP < 1.0) {
        const indices = scores.map((_, idx) => idx);
        indices.sort((a, b) => scores[b] - scores[a]);
        const probs = stableSoftmax(scores);
        let cumSum = 0;
        let exceeded = false;
        for (const idx of indices) {
          if (exceeded) {
            scores[idx] = -Infinity;
          }
          cumSum += probs[idx];
          if (cumSum > topP) {
            exceeded = true;
          }
        }
      }
      // Final softmax and sample (plain numbers, not Value, for efficiency)
      tokenId = weightedChoice(stableSoftmax(scores));
      // BOS doubles as the end-of-sequence marker
      if (tokenId === tokenizer.BOS) break;
      tokens.push(tokenId);
    }
    samples.push(tokenizer.decode(tokens));
  }
  return samples;
}
train.ts
/**
* Training Loop
*
* Each step:
* 1. Pick a sentence and tokenize it
* 2. Forward each token through the model, predicting the next token
* 3. Compute cross-entropy loss (how surprised the model is at the correct answer)
* 4. Backward pass: compute gradients via the chain rule
* 5. Adam optimizer: update parameters to reduce loss
*
* Loss starts at ~6.39 (random guessing among 597 tokens = -log(1/597))
* and decreases to ~1.76 over 5,000 steps.
*/
import { Value, vsum } from "./autograd.js";
import { softmax } from "./nn.js";
import { type Model, gpt, createKVCache } from "./model.js";
import type { Tokenizer } from "./tokenizer.js";
export interface TrainConfig {
  /** Total number of optimization steps (one sentence is processed per step). */
  numSteps: number;
  /** Initial Adam learning rate; decays linearly to zero over numSteps. */
  learningRate: number;
  /** Adam exponential-decay rate for the first-moment (mean) estimate. */
  beta1: number;
  /** Adam exponential-decay rate for the second-moment (variance) estimate. */
  beta2: number;
  /** Small constant added to the Adam denominator for numerical stability. */
  epsAdam: number;
}
/** Train the model on the dataset and return the trained model. */
export function train(
  model: Model,
  trainConfig: TrainConfig,
  sentences: string[],
  tokenizer: Tokenizer,
): Model {
  const { numSteps, learningRate, beta1, beta2, epsAdam } = trainConfig;
  const { params } = model;
  // Adam first/second moment buffers, one slot per parameter
  const firstMoment = new Float64Array(params.length);
  const secondMoment = new Float64Array(params.length);
  for (let step = 0; step < numSteps; step++) {
    // Cycle through the corpus one sentence per step; encode() wraps the
    // sentence in BOS on both sides.
    const tokens = tokenizer.encode(sentences[step % sentences.length]);
    const seqLen = Math.min(model.config.blockSize, tokens.length - 1);
    // Forward pass: build the computation graph from tokens to loss,
    // predicting each next token in turn.
    const { keys, values } = createKVCache(model);
    const losses: Value[] = [];
    for (let posId = 0; posId < seqLen; posId++) {
      const logits = gpt(model, tokens[posId], posId, keys, values);
      const probs = softmax(logits);
      // -log p(correct next token): 0 when certain, large when surprised
      losses.push(probs[tokens[posId + 1]].log().neg());
    }
    // Average cross-entropy loss over the sequence
    const loss = vsum(losses).div(seqLen);
    // Backward pass: walks the graph and fills in .grad on every Value node.
    // Gradients land on the same param objects the Adam loop reads below.
    loss.backward();
    // Adam update with linear learning-rate decay toward zero
    const lrT = learningRate * (1 - step / numSteps);
    const mCorrection = 1 - beta1 ** (step + 1);
    const vCorrection = 1 - beta2 ** (step + 1);
    for (let i = 0; i < params.length; i++) {
      const g = params[i].grad;
      firstMoment[i] = beta1 * firstMoment[i] + (1 - beta1) * g;
      secondMoment[i] = beta2 * secondMoment[i] + (1 - beta2) * g ** 2;
      const mHat = firstMoment[i] / mCorrection;
      const vHat = secondMoment[i] / vCorrection;
      params[i].data -= lrT * mHat / (Math.sqrt(vHat) + epsAdam);
      params[i].grad = 0;
    }
    process.stdout.write(
      `\rstep ${String(step + 1).padStart(4)} / ${numSteps} | loss ${loss.data.toFixed(4)}`
    );
  }
  return model;
}
model.ts
/**
* GPT Model
*
* The transformer architecture: a function that maps input tokens to a
* probability distribution over what comes next.
*
* Follows GPT-2 with minor simplifications:
* - RMSNorm instead of LayerNorm
* - No biases
* - ReLU instead of GeLU
*
* Config: 32 embedding dims, 4 attention heads, 2 layers, 16 max context
* → 63,296 parameters total
*/
import { readFileSync, writeFileSync } from "node:fs";
import { Value, vsum } from "./autograd.js";
import { type Matrix, linear, softmax, rmsnorm } from "./nn.js";
import { gauss } from "./rng.js";
// --- Configuration ---
export interface GPTConfig {
  /** Number of transformer layers. */
  nLayer: number;
  /** Embedding (hidden-state) dimension. */
  nEmbd: number;
  /** Maximum context length (number of positions). */
  blockSize: number;
  /** Number of attention heads per layer. */
  nHead: number;
  /** Dimension of each head's slice; nHead * headDim must equal nEmbd (see gpt()'s head slicing). */
  headDim: number;
  /** Number of distinct token ids, including BOS. */
  vocabSize: number;
}
/** Weight matrices for one attention block (all nEmbd x nEmbd, no biases). */
interface Attention {
  query: Matrix;
  key: Matrix;
  value: Matrix;
  output: Matrix;
}
/** Weight matrices for one MLP block: expand to 4*nEmbd, then compress back to nEmbd. */
interface MLP {
  hidden: Matrix;
  output: Matrix;
}
/** One transformer layer: an attention block followed by an MLP block. */
interface Layer {
  attention: Attention;
  mlp: MLP;
}
/** All learnable weight matrices, grouped by role. */
export interface Weights {
  tokenEmbedding: Matrix; // vocabSize x nEmbd
  positionEmbedding: Matrix; // blockSize x nEmbd
  output: Matrix; // vocabSize x nEmbd — final projection to logits
  layers: Layer[];
}
/** The trained (or untrained) model: config + weights + flattened parameter list. */
export interface Model {
  config: GPTConfig;
  weights: Weights;
  /** Every Value from every matrix in `weights`, flattened for the optimizer. */
  params: Value[];
}
// --- Model Creation ---
/** Build an nout x nin matrix of Values drawn from N(0, std). */
function matrix(nout: number, nin: number, std = 0.08): Matrix {
  const rows: Matrix = [];
  for (let r = 0; r < nout; r++) {
    const row: Value[] = [];
    for (let c = 0; c < nin; c++) {
      row.push(new Value(gauss(0, std)));
    }
    rows.push(row);
  }
  return rows;
}
/** Create a new model with randomly initialized weights. */
export function createModel(config: GPTConfig): Model {
const { nEmbd, nLayer, blockSize, vocabSize } = config;
const weights: Weights = {
tokenEmbedding: matrix(vocabSize, nEmbd),
positionEmbedding: matrix(blockSize, nEmbd),
output: matrix(vocabSize, nEmbd),
layers: Array.from({ length: nLayer }, () => ({
attention: {
query: matrix(nEmbd, nEmbd),
key: matrix(nEmbd, nEmbd),
value: matrix(nEmbd, nEmbd),
output: matrix(nEmbd, nEmbd),
},
mlp: {
hidden: matrix(4 * nEmbd, nEmbd),
output: matrix(nEmbd, 4 * nEmbd),
},
})),
};
const allMatrices: Matrix[] = [
weights.tokenEmbedding,
weights.positionEmbedding,
weights.output,
...weights.layers.flatMap((layer) => [
layer.attention.query, layer.attention.key,
layer.attention.value, layer.attention.output,
layer.mlp.hidden, layer.mlp.output,
]),
];
const params = allMatrices.flatMap((mat) => mat.flatMap((row) => row));
return { config, weights, params };
}
// --- Save / Load ---
/** Save a trained model to a JSON file (config + parameter values). */
export function saveModel(model: Model, path: string): void {
const data = {
config: model.config,
weights: model.params.map((p) => p.data),
};
writeFileSync(path, JSON.stringify(data));
}
/** Load a model from a JSON file. Recreates the structure, then fills in the learned weights. */
export function loadModel(path: string): Model {
const data = JSON.parse(readFileSync(path, "utf-8"));
const model = createModel(data.config);
for (let i = 0; i < model.params.length; i++) {
model.params[i].data = data.weights[i];
}
return model;
}
// --- KV Cache ---
/** Create fresh key/value caches for a new sequence. Must be called per-sequence. */
/** Create fresh key/value caches for a new sequence. Must be called per-sequence. */
export function createKVCache(model: Model): {
  keys: Value[][][];
  values: Value[][][];
} {
  // One empty per-position list for each layer; gpt() appends to these.
  const keys: Value[][][] = [];
  const values: Value[][][] = [];
  for (let li = 0; li < model.config.nLayer; li++) {
    keys.push([]);
    values.push([]);
  }
  return { keys, values };
}
// --- Forward Pass ---
/**
 * Run one step of the GPT: given a token at a position, return logits
 * over the vocabulary for the next token.
 *
 * The keys/values caches are mutated (appended to) on each call —
 * this is the KV cache that avoids recomputing attention for past positions.
 * Use a fresh cache from createKVCache() for every new sequence.
 *
 * @param model - Model config and weights.
 * @param tokenId - Vocabulary id of the current input token.
 * @param posId - 0-based position of the token (must be < config.blockSize).
 * @param keys - Per-layer cache of key vectors, one entry per past position.
 * @param values - Per-layer cache of value vectors, one entry per past position.
 * @returns Raw, unnormalized scores (logits) over the vocabulary.
 */
export function gpt(
  model: Model,
  tokenId: number,
  posId: number,
  keys: Value[][][],
  values: Value[][][],
): Value[] {
  const { nLayer, nHead, headDim } = model.config;
  const { weights } = model;
  // Step 1: Embedding lookup
  // Combine "what word is this?" with "where does it appear?" into a single
  // vector. This is the hidden state that flows through the rest of the network.
  const tokenVector: Value[] = weights.tokenEmbedding[tokenId];
  const positionVector: Value[] = weights.positionEmbedding[posId];
  let hidden: Value[] = tokenVector.map((t, i) => t.add(positionVector[i]));
  // Normalize before the first layer to keep values at a stable scale
  hidden = rmsnorm(hidden);
  // Step 2: Transformer layers
  // Each layer has two blocks: attention (gather context from other tokens)
  // followed by MLP (process the gathered information). Both use residual
  // connections so the input is added back to the output of each block.
  for (let li = 0; li < nLayer; li++) {
    const layer = weights.layers[li];
    // --- Attention block: look at previous tokens to gather context ---
    // Save the hidden state so we can add it back after the block (residual)
    const beforeAttention: Value[] = hidden;
    hidden = rmsnorm(hidden);
    // Project the hidden state into query, key, and value vectors.
    // These are structurally identical projections — training teaches them
    // to play different roles: query asks "what am I looking for?",
    // key advertises "what do I contain?", value carries "what to retrieve".
    const query: Value[] = linear(hidden, layer.attention.query);
    const key: Value[] = linear(hidden, layer.attention.key);
    const value: Value[] = linear(hidden, layer.attention.value);
    // Cache the key and value so future tokens can attend to this position
    keys[li].push(key);
    values[li].push(value);
    // Each head independently attends to a different slice of the vectors,
    // allowing the model to track multiple relationships at once.
    // NOTE(review): the slicing assumes nHead * headDim === nEmbd, since the
    // concatenated head outputs feed an nEmbd-wide projection — confirm in config.
    const attentionOutput: Value[] = [];
    for (let h = 0; h < nHead; h++) {
      const headStart = h * headDim;
      const headQuery = query.slice(headStart, headStart + headDim);
      const headKeys = keys[li].map((ki) => ki.slice(headStart, headStart + headDim));
      const headValues = values[li].map((vi) => vi.slice(headStart, headStart + headDim));
      // Scaled dot-product attention: score = (query · key) / √headDim
      const attnLogits = headKeys.map((cachedKey) =>
        vsum(headQuery.map((q, j) => q.mul(cachedKey[j]))).div(Math.sqrt(headDim))
      );
      // Softmax converts scores into weights that sum to 1.
      // No causal mask is needed: the cache only ever holds past positions.
      const attnWeights = softmax(attnLogits);
      // Weighted sum of value vectors — high-scoring positions contribute more
      for (let j = 0; j < headDim; j++) {
        attentionOutput.push(vsum(attnWeights.map((w, t) => w.mul(headValues[t][j]))));
      }
    }
    // Project concatenated head outputs back to the hidden dimension
    hidden = linear(attentionOutput, layer.attention.output);
    // Residual connection: add back what we had before attention
    hidden = hidden.map((h, i) => h.add(beforeAttention[i]));
    // --- MLP block: process each token's representation independently ---
    const beforeMLP: Value[] = hidden;
    hidden = rmsnorm(hidden);
    hidden = linear(hidden, layer.mlp.hidden); // expand to 4x wider
    hidden = hidden.map((h) => h.relu()); // nonlinearity
    hidden = linear(hidden, layer.mlp.output); // compress back
    // Residual connection: add back what we had before the MLP
    hidden = hidden.map((h, i) => h.add(beforeMLP[i]));
  }
  // Step 3: Output projection
  // Project the final hidden state to vocabulary size — one score per word.
  // These raw scores (logits) will be passed through softmax later to get
  // a probability distribution over the next token.
  return linear(hidden, weights.output);
}
rng.ts
/**
* Seeded Random Number Generator
*
* JavaScript has no built-in seeded RNG, so we roll our own:
* - mulberry32 for uniform random numbers
* - Box-Muller transform for Gaussian (normal) distribution
* - Fisher-Yates for array shuffling
* - Weighted random choice for sampling from probability distributions
*/
// Internal 32-bit PRNG state, advanced on every call to random().
let state = 0;

/** Reset the generator to a fixed starting point for reproducible runs. */
export function seed(s: number): void {
  state = s | 0;
}

/** Mulberry32: a fast, seedable 32-bit PRNG. Returns a uniform float in [0, 1). */
export function random(): number {
  state = (state + 0x6d2b79f5) | 0;
  let mixed = Math.imul(state ^ (state >>> 15), state | 1);
  mixed = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
  return ((mixed ^ (mixed >>> 14)) >>> 0) / 2 ** 32;
}
/** Box-Muller transform: convert two uniform samples into a Gaussian sample. */
export function gauss(mean: number, std: number): number {
  // Reject u1 === 0 so Math.log(u1) stays finite
  let u1: number;
  do {
    u1 = random();
  } while (u1 === 0);
  const u2 = random();
  const radius = Math.sqrt(-2 * Math.log(u1));
  const angle = 2 * Math.PI * u2;
  return mean + std * radius * Math.cos(angle);
}
/** Fisher-Yates shuffle: randomly reorder an array in-place. */
export function shuffle<T>(arr: T[]): void {
  // Walk backwards, swapping each slot with a uniformly chosen earlier one
  for (let i = arr.length - 1; i > 0; i--) {
    const j = Math.floor(random() * (i + 1));
    const tmp = arr[i];
    arr[i] = arr[j];
    arr[j] = tmp;
  }
}
/** Sample an index from a weighted distribution. Used for token sampling during inference. */
export function weightedChoice(weights: number[]): number {
  let total = 0;
  for (const w of weights) total += w;
  // Walk the cumulative distribution until the random threshold is crossed
  let remaining = random() * total;
  for (let i = 0; i < weights.length; i++) {
    remaining -= weights[i];
    if (remaining <= 0) return i;
  }
  // Floating-point slack can leave a tiny remainder; fall back to the last index
  return weights.length - 1;
}
nn.ts
/**
* Neural Network Primitives
*
* The building blocks that the GPT architecture assembles:
* - linear: matrix-vector multiply (the fundamental neural net operation)
* - softmax: convert raw scores into a probability distribution
* - rmsnorm: normalize activations to stabilize training
*
* These mirror PyTorch's torch.nn.functional — general-purpose operations
* used by the model, training loop, and inference.
*/
import { Value, vsum } from "./autograd.js";
export type Matrix = Value[][];
/** y = Wx: multiply a weight matrix by an input vector. */
export function linear(input: Value[], weights: Matrix): Value[] {
  const output: Value[] = [];
  for (const row of weights) {
    // Dot product of one weight row with the input vector
    const terms = row.map((w, col) => w.mul(input[col]));
    output.push(vsum(terms));
  }
  return output;
}
/** Convert raw logits to probabilities. Subtracts max for numerical stability. */
export function softmax(logits: Value[]): Value[] {
const maxVal = Math.max(...logits.map((v) => v.data));
const exps = logits.map((v) => v.sub(maxVal).exp());
const total = vsum(exps);
return exps.map((e) => e.div(total));
}
/** Root Mean Square normalization: scale activations to unit variance. */
export function rmsnorm(input: Value[]): Value[] {
const ms = vsum(input.map((xi) => xi.mul(xi))).div(input.length);
const scale = ms.add(1e-5).pow(-0.5);
return input.map((xi) => xi.mul(scale));
}
autograd.ts
/**
* Autograd Engine
*
* A scalar-valued automatic differentiation engine. Each Value node records:
* - its forward-pass result (data)
* - its gradient w.r.t. the loss (grad), filled in by backward()
* - its children in the computation graph and the local derivatives
*
* The backward() method applies the chain rule via reverse-mode autodiff:
* topologically sort the graph, then propagate gradients from output to inputs.
*
* "If I nudge this parameter slightly, how does the loss change?"
*/
export class Value {
  // Forward-pass result of this node.
  data: number;
  // Gradient of the final output (loss) w.r.t. this node; filled in by backward().
  grad: number;
  // Input nodes this value was computed from.
  children: Value[];
  // Local derivative of this node w.r.t. each child (same order as children).
  localGrads: number[];
  constructor(data: number, children: Value[] = [], localGrads: number[] = []) {
    this.data = data;
    this.grad = 0;
    this.children = children;
    this.localGrads = localGrads;
  }
  /** this + other; local derivative is 1 for both operands. */
  add(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return new Value(this.data + o.data, [this, o], [1, 1]);
  }
  /** this * other; d/da = b, d/db = a. */
  mul(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return new Value(this.data * o.data, [this, o], [o.data, this.data]);
  }
  /** this ** n for a constant n; d/dx = n * x^(n-1). */
  pow(n: number): Value {
    return new Value(this.data ** n, [this], [n * this.data ** (n - 1)]);
  }
  /** Natural log; d/dx = 1/x. Caller must ensure data > 0. */
  log(): Value {
    return new Value(Math.log(this.data), [this], [1 / this.data]);
  }
  /** e^x; the derivative is the output itself. */
  exp(): Value {
    return new Value(Math.exp(this.data), [this], [Math.exp(this.data)]);
  }
  /** max(0, x); gradient passes through only where x > 0. */
  relu(): Value {
    return new Value(Math.max(0, this.data), [this], [this.data > 0 ? 1 : 0]);
  }
  neg(): Value {
    return this.mul(-1);
  }
  sub(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return this.add(o.neg());
  }
  div(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return this.mul(o.pow(-1));
  }
  /**
   * Reverse-mode autodiff: topologically sort the graph below this node,
   * then propagate gradients from output to inputs via the chain rule.
   *
   * Uses an explicit stack instead of recursion — the previous recursive
   * DFS overflowed the call stack on deep graphs (long chains of ops, as
   * produced by long sequences), since graph depth equals recursion depth.
   */
  backward(): void {
    const topo: Value[] = [];
    const visited = new Set<Value>();
    // Iterative post-order DFS. Each frame is [node, nextChildIndex].
    const stack: [Value, number][] = [[this, 0]];
    while (stack.length > 0) {
      const frame = stack[stack.length - 1];
      const node = frame[0];
      if (frame[1] === 0 && visited.has(node)) {
        // Already fully processed via another parent — skip.
        stack.pop();
        continue;
      }
      visited.add(node);
      if (frame[1] < node.children.length) {
        const child = node.children[frame[1]];
        frame[1]++;
        if (!visited.has(child)) stack.push([child, 0]);
      } else {
        // All children emitted; emit the node itself (post-order).
        stack.pop();
        topo.push(node);
      }
    }
    // Seed the output gradient, then apply the chain rule in reverse order.
    this.grad = 1;
    for (let i = topo.length - 1; i >= 0; i--) {
      const v = topo[i];
      for (let j = 0; j < v.children.length; j++) {
        v.children[j].grad += v.localGrads[j] * v.grad;
      }
    }
  }
}
/** Sum a list of Values through the computation graph. */
export function vsum(values: Value[]): Value {
  let acc = new Value(0);
  for (const v of values) {
    acc = acc.add(v);
  }
  return acc;
}
tokenizer.ts
/**
* Tokenizer
*
* Translates strings to sequences of integers ("tokens") and back.
* Builds a word-level vocabulary from the training corpus, with a BOS
* (Beginning of Sequence) delimiter appended on each side.
*/
export interface Tokenizer {
  /** Total number of token ids, including BOS. */
  vocabSize: number;
  /** The Beginning-of-Sequence token id (also used as the end delimiter). */
  BOS: number;
  /** Convert a sentence into token ids, wrapped in BOS on both sides. */
  encode(sentence: string): number[];
  /** Convert token ids back into a space-joined sentence. */
  decode(tokens: number[]): string;
}
/**
 * Word-level tokenizer. Discovers the word vocabulary from the corpus.
 *
 * @param sentences - Training corpus; vocabulary is every distinct
 *   space-separated word across all sentences.
 * @returns A Tokenizer whose encode() throws on out-of-vocabulary words.
 */
export function createWordTokenizer(sentences: string[]): Tokenizer {
  // Unique words, sorted so token ids are deterministic across runs
  const words = [...new Set(sentences.flatMap((d) => d.split(" ")))].sort();
  // O(1) word -> id lookup instead of O(vocab) indexOf per word
  const wordToId = new Map(words.map((w, i) => [w, i] as const));
  const BOS = words.length; // BOS takes the id just past the last word
  const vocabSize = words.length + 1;
  return {
    vocabSize,
    BOS,
    encode(sentence: string): number[] {
      const ids = sentence.split(" ").map((w) => {
        const id = wordToId.get(w);
        // Fail loudly on unknown words: the previous indexOf-based lookup
        // silently produced -1, which corrupts embedding lookups downstream.
        if (id === undefined) throw new Error(`Unknown word: "${w}"`);
        return id;
      });
      return [BOS, ...ids, BOS];
    },
    decode(tokens: number[]): string {
      return tokens.map((t) => words[t]).join(" ");
    },
  };
}