Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

Complete Code

Here is the complete source code for the neural network primitives, along with all files from previous chapters.

nn.ts

/**
 * Neural Network Primitives
 *
 * The building blocks that the GPT architecture assembles:
 * - linear: matrix-vector multiply (the fundamental neural net operation)
 * - softmax: convert raw scores into a probability distribution
 * - rmsnorm: normalize activations to stabilize training
 *
 * These mirror PyTorch's torch.nn.functional — general-purpose operations
 * used by the model, training loop, and inference.
 */

import { Value, vsum } from "./autograd.js";

/** A 2-D weight matrix: an array of rows, each a vector of scalar autograd Values. */
export type Matrix = Value[][];

/**
 * y = Wx: apply a weight matrix to an input vector.
 * Each output element is the dot product of one weight row with the input.
 */
export function linear(input: Value[], weights: Matrix): Value[] {
  const outputs: Value[] = [];
  for (const row of weights) {
    const terms = row.map((w, i) => w.mul(input[i]));
    outputs.push(vsum(terms));
  }
  return outputs;
}

/**
 * Turn raw logits into a probability distribution (non-negative, sums to 1).
 * The maximum logit is subtracted first so exp() cannot overflow — this
 * shift cancels out in the normalization and leaves the result unchanged.
 */
export function softmax(logits: Value[]): Value[] {
  let peak = -Infinity;
  for (const logit of logits) {
    if (logit.data > peak) peak = logit.data;
  }
  const unnormalized = logits.map((logit) => logit.sub(peak).exp());
  const denom = vsum(unnormalized);
  return unnormalized.map((u) => u.div(denom));
}

/**
 * Root Mean Square normalization: divide each activation by the vector's
 * RMS (with a small epsilon for numerical safety), giving the output
 * unit root-mean-square magnitude.
 */
export function rmsnorm(input: Value[]): Value[] {
  const squares = input.map((xi) => xi.mul(xi));
  const meanSquare = vsum(squares).div(input.length);
  // (meanSquare + eps)^(-1/2) == 1 / rms
  const invRms = meanSquare.add(1e-5).pow(-0.5);
  return input.map((xi) => xi.mul(invRms));
}

autograd.ts

/**
 * Autograd Engine
 *
 * A scalar-valued automatic differentiation engine. Each Value node records:
 * - its forward-pass result (data)
 * - its gradient w.r.t. the loss (grad), filled in by backward()
 * - its children in the computation graph and the local derivatives
 *
 * The backward() method applies the chain rule via reverse-mode autodiff:
 * topologically sort the graph, then propagate gradients from output to inputs.
 *
 * "If I nudge this parameter slightly, how does the loss change?"
 */

/**
 * A scalar node in the computation graph.
 *
 * Holds the forward-pass result (data), the gradient of the final output
 * with respect to this node (grad), and the bookkeeping backward() needs
 * to apply the chain rule: the input nodes (children) and the local
 * derivatives of this node with respect to each of them (localGrads).
 */
export class Value {
  data: number; // forward-pass result
  grad: number; // d(output)/d(this), accumulated by backward()
  children: Value[]; // graph inputs this node was computed from
  localGrads: number[]; // localGrads[i] = d(this.data)/d(children[i].data)

  constructor(data: number, children: Value[] = [], localGrads: number[] = []) {
    this.data = data;
    this.grad = 0;
    this.children = children;
    this.localGrads = localGrads;
  }

  /** this + other. Local derivative is 1 for both operands. */
  add(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return new Value(this.data + o.data, [this, o], [1, 1]);
  }

  /** this * other. d/dthis = other, d/dother = this. */
  mul(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return new Value(this.data * o.data, [this, o], [o.data, this.data]);
  }

  /** this ** n for a constant exponent n. d/dthis = n * this^(n-1). */
  pow(n: number): Value {
    return new Value(this.data ** n, [this], [n * this.data ** (n - 1)]);
  }

  /** Natural log. d/dthis = 1/this. Caller must ensure this.data > 0. */
  log(): Value {
    return new Value(Math.log(this.data), [this], [1 / this.data]);
  }

  /** e^this. The derivative equals the value, so compute Math.exp once. */
  exp(): Value {
    const e = Math.exp(this.data);
    return new Value(e, [this], [e]);
  }

  /** max(0, this). Uses subgradient 0 at exactly 0. */
  relu(): Value {
    return new Value(Math.max(0, this.data), [this], [this.data > 0 ? 1 : 0]);
  }

  /** -this, expressed via mul so it participates in the graph. */
  neg(): Value {
    return this.mul(-1);
  }

  /** this - other, expressed as this + (-other). */
  sub(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return this.add(o.neg());
  }

  /** this / other, expressed as this * other^(-1). */
  div(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return this.mul(o.pow(-1));
  }

  /**
   * Reverse-mode autodiff: seed this.grad = 1, then propagate gradients
   * from output to inputs in reverse topological order.
   *
   * The topological sort uses an explicit stack instead of recursion:
   * deep graphs — e.g. a loss built by summing thousands of terms one
   * add() at a time — would overflow the call stack with a recursive DFS.
   */
  backward(): void {
    // Iterative post-order DFS. Each stack frame is [node, nextChildIndex].
    const topo: Value[] = [];
    const visited = new Set<Value>([this]);
    const stack: Array<[Value, number]> = [[this, 0]];
    while (stack.length > 0) {
      const frame = stack[stack.length - 1];
      const node = frame[0];
      if (frame[1] < node.children.length) {
        const child = node.children[frame[1]];
        frame[1] += 1;
        if (!visited.has(child)) {
          visited.add(child);
          stack.push([child, 0]);
        }
      } else {
        stack.pop();
        topo.push(node); // all children emitted first -> post-order
      }
    }

    // Chain rule, output to inputs. += accumulates contributions when a
    // node feeds multiple downstream uses.
    this.grad = 1;
    for (let i = topo.length - 1; i >= 0; i--) {
      const v = topo[i];
      for (let j = 0; j < v.children.length; j++) {
        v.children[j].grad += v.localGrads[j] * v.grad;
      }
    }
  }
}

/** Sum a list of Values through the computation graph. */
export function vsum(values: Value[]): Value {
  let total = new Value(0);
  for (const v of values) {
    total = total.add(v);
  }
  return total;
}

tokenizer.ts

/**
 * Tokenizer
 *
 * Translates strings to sequences of integers ("tokens") and back.
 * Builds a word-level vocabulary from the training corpus, with a BOS
 * (Beginning of Sequence) delimiter appended on each side.
 */

/** Maps between text and integer token-id sequences. */
export interface Tokenizer {
  // Number of distinct token ids (word vocabulary plus the BOS token).
  vocabSize: number;
  // Token id of the Beginning-of-Sequence delimiter.
  BOS: number;
  /** Convert a sentence into token ids, wrapped in BOS on both ends. */
  encode(sentence: string): number[];
  /** Convert token ids back into a space-joined string. */
  decode(tokens: number[]): string;
}

/**
 * Word-level tokenizer. Discovers the word vocabulary from the corpus.
 *
 * A word -> id Map is built once so encode() runs in O(words) rather than
 * the O(words * vocabSize) of a per-word Array.indexOf scan. Unknown words
 * still encode to -1, matching indexOf's sentinel for backward compatibility.
 */
export function createWordTokenizer(sentences: string[]): Tokenizer {
  // Sorted, de-duplicated word list; a word's index in this array is its id.
  const words = [...new Set(sentences.flatMap((d) => d.split(" ")))].sort();
  const wordToId = new Map(words.map((w, i) => [w, i] as const));
  const BOS = words.length; // BOS sits one past the last real word id
  const vocabSize = words.length + 1;

  return {
    vocabSize,
    BOS,
    encode(sentence: string): number[] {
      // Unknown words map to -1 (same sentinel indexOf produced).
      return [BOS, ...sentence.split(" ").map((w) => wordToId.get(w) ?? -1), BOS];
    },
    decode(tokens: number[]): string {
      // Out-of-range ids (e.g. BOS) index past the array and join as "".
      return tokens.map((t) => words[t]).join(" ");
    },
  };
}