Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

Complete Code

Here is the complete source code for the smoke test entry points, along with all files from previous chapters.

phrases-train.ts

/**
 * MicroGPT — Train on Phrases
 *
 * Trains a word-level GPT on grade 1 English sentences and saves the model.
 * Run inference afterwards with: npx tsx phrases-generate.ts
 *
 *   npx tsx phrases-train.ts
 */

import { readFileSync } from "node:fs";
import { seed, shuffle } from "./rng.js";
import { createWordTokenizer } from "./tokenizer.js";
import { createModel, saveModel } from "./model.js";
import { train } from "./train.js";

// 1. Seed the RNG for reproducibility
seed(42);

// 2. Load dataset: 30K grade 1 sentences
const rawLines = readFileSync("data/grade1_sentences.txt", "utf-8").split("\n");
const sentences = rawLines
  .map((line) => line.trim().toLowerCase())
  .filter((line) => line.length > 0);
shuffle(sentences);
console.log(`num sentences: ${sentences.length}`);

// 3. Build word-level tokenizer from the corpus
const tokenizer = createWordTokenizer(sentences);
console.log(`vocab size: ${tokenizer.vocabSize} (${tokenizer.vocabSize - 1} words + BOS)`);

// 4. Create the model — bigger than the names model to handle the larger vocabulary
const freshModel = createModel({
  nLayer: 2,
  nEmbd: 32,
  blockSize: 16,
  nHead: 4,
  headDim: 8,
  vocabSize: tokenizer.vocabSize,
});
console.log(`num params: ${freshModel.params.length}`);

// 5. Train the model
const trainConfig = {
  numSteps: 5000,
  learningRate: 0.01,
  beta1: 0.85,
  beta2: 0.99,
  epsAdam: 1e-8,
};
const model = train(freshModel, trainConfig, sentences, tokenizer);

// 6. Save the trained model to disk
saveModel(model, "phrases-model.json");
console.log("\nmodel saved to phrases-model.json");

phrases-generate.ts

/**
 * MicroGPT — Generate Phrases
 *
 * Loads a trained model and generates new sentences.
 * Train first with: npx tsx phrases-train.ts
 *
 *   npx tsx phrases-generate.ts
 *   npx tsx phrases-generate.ts 50                  # generate 50 sentences
 *   npx tsx phrases-generate.ts 20 --temp=0.3       # low temperature
 *   npx tsx phrases-generate.ts 20 --top-k=10       # top-k filtering
 *   npx tsx phrases-generate.ts 20 --top-p=0.9      # nucleus sampling
 *   npx tsx phrases-generate.ts 20 --temp=0.7 --top-p=0.9 --top-k=50
 *   npx tsx phrases-generate.ts 20 --model=phrases-fine-tuned-model.json
 */

import { readFileSync } from "node:fs";
import { seed } from "./rng.js";
import { createWordTokenizer } from "./tokenizer.js";
import { loadModel } from "./model.js";
import { generate, type GenerateOptions } from "./generate.js";

// Parse CLI arguments: positional sample count plus --key=value flags
const args = process.argv.slice(2);
const positional = args.filter((a) => !a.startsWith("--"));
const flags: Record<string, string | undefined> = Object.fromEntries(
  args
    .filter((a) => a.startsWith("--"))
    .map((a) => {
      // Split on the FIRST "=" only, so values may themselves contain "="
      // (e.g. --model=runs/step=5000/model.json). A plain split("=") would
      // silently truncate such values.
      const body = a.slice(2);
      const eq = body.indexOf("=");
      return eq === -1 ? [body, undefined] : [body.slice(0, eq), body.slice(eq + 1)];
    })
);

const numSamples = parseInt(positional[0] ?? "20", 10);
// Resolve every option up front (Required<…>) so no non-null assertions are
// needed when the options are read back below.
const options: Required<GenerateOptions> = {
  temperature: parseFloat(flags["temp"] ?? "0.8"),
  topK: parseInt(flags["top-k"] ?? "0", 10),
  topP: parseFloat(flags["top-p"] ?? "1.0"),
};

// 1. Seed the RNG
seed(Date.now());

// 2. Rebuild the tokenizer from the same corpus (needed for encode/decode)
const corpus = readFileSync("data/grade1_sentences.txt", "utf-8")
  .split("\n")
  .filter((l) => l.trim())
  .map((l) => l.trim().toLowerCase());
const tokenizer = createWordTokenizer(corpus);

// 3. Load the trained model
const modelPath = flags["model"] ?? "phrases-model.json";
const model = loadModel(modelPath);
console.log(`loaded ${modelPath}: ${model.params.length} params, vocab ${model.config.vocabSize}`);

// 4. Generate
const parts = [`temperature=${options.temperature}`];
if (options.topK > 0) parts.push(`top-k=${options.topK}`);
if (options.topP < 1.0) parts.push(`top-p=${options.topP}`);
console.log(`generating ${numSamples} sentences (${parts.join(", ")}):\n`);

const sentences = generate(model, tokenizer, numSamples, options);
sentences.forEach((s, i) =>
  console.log(`  ${String(i + 1).padStart(2)}. ${s}`)
);

generate.ts

/**
 * Inference / Text Generation
 *
 * Starting from the BOS token, the model predicts one token at a time:
 * 1. Forward the current token through the model to get logits
 * 2. Apply temperature scaling (lower = more conservative, higher = more creative)
 * 3. Apply top-k filtering (keep only the k most likely tokens)
 * 4. Apply top-p / nucleus filtering (keep smallest set summing to p)
 * 5. Convert to probabilities via softmax
 * 6. Randomly sample the next token from that distribution
 * 7. Stop when BOS is produced (end of sequence) or max length is reached
 */

import { type Model, gpt, createKVCache } from "./model.js";
import { weightedChoice } from "./rng.js";
import type { Tokenizer } from "./tokenizer.js";

/** Sampling knobs for generate(). All optional; defaults are applied there. */
export interface GenerateOptions {
  /** Softmax temperature: lower = more conservative, higher = more creative. Default 0.8. */
  temperature?: number;
  /** Keep only the k most likely tokens; 0 disables the filter. Default 0. */
  topK?: number;
  /** Nucleus sampling: keep the smallest token set whose probabilities sum to p; 1.0 disables. Default 1.0. */
  topP?: number;
}

/**
 * Generate new text samples from a trained model.
 *
 * Starting from BOS, repeatedly: forward one token, apply temperature /
 * top-k / top-p filtering to the logits, softmax, and sample. A sequence
 * ends when BOS is produced again or blockSize is reached.
 *
 * @param model      trained model (weights + config)
 * @param tokenizer  used for BOS id and decoding token ids back to text
 * @param numSamples number of independent sequences to generate
 * @param options    sampling knobs; see GenerateOptions for defaults
 * @returns decoded sentences, one per sample
 */
export function generate(
  model: Model,
  tokenizer: Tokenizer,
  numSamples: number,
  options: GenerateOptions = {},
): string[] {
  const { temperature = 0.8, topK = 0, topP = 1.0 } = options;

  // Softmax over plain numbers (not Value nodes) — inference needs no
  // gradients. Filtered-out entries (-Infinity) get probability 0.
  const toProbs = (scores: number[]): number[] => {
    const maxS = Math.max(...scores.filter((s) => s !== -Infinity));
    const exps = scores.map((s) => (s === -Infinity ? 0 : Math.exp(s - maxS)));
    const total = exps.reduce((a, b) => a + b, 0);
    return exps.map((e) => e / total);
  };

  const samples: string[] = [];

  for (let i = 0; i < numSamples; i++) {
    const { keys, values } = createKVCache(model);
    let tokenId = tokenizer.BOS;
    const tokens: number[] = [];

    for (let posId = 0; posId < model.config.blockSize; posId++) {
      const logits = gpt(model, tokenId, posId, keys, values);

      // Temperature scaling
      let scores = logits.map((l) => l.data / temperature);

      // Top-k: keep only the k highest scores
      if (topK > 0 && topK < scores.length) {
        const sorted = [...scores].sort((a, b) => b - a);
        const cutoff = sorted[topK - 1];
        scores = scores.map((s) => (s >= cutoff ? s : -Infinity));
      }

      // Top-p (nucleus): keep the smallest set whose probabilities sum to p.
      // Only finite scores enter the sort: when top-k has already masked
      // entries to -Infinity, a comparator computing (-Inf) - (-Inf) returns
      // NaN, and Array.prototype.sort's order is unspecified for NaN
      // comparators — which could scramble even the finite entries.
      if (topP < 1.0) {
        const probs = toProbs(scores);
        const candidates = scores
          .map((_, idx) => idx)
          .filter((idx) => scores[idx] !== -Infinity)
          .sort((a, b) => scores[b] - scores[a]);
        let cumSum = 0;
        let exceeded = false;
        for (const idx of candidates) {
          if (exceeded) {
            scores[idx] = -Infinity;
          }
          cumSum += probs[idx];
          if (cumSum > topP) {
            exceeded = true;
          }
        }
      }

      // Softmax the filtered scores and sample the next token
      tokenId = weightedChoice(toProbs(scores));
      if (tokenId === tokenizer.BOS) break; // BOS doubles as end-of-sequence
      tokens.push(tokenId);
    }

    samples.push(tokenizer.decode(tokens));
  }

  return samples;
}

train.ts

/**
 * Training Loop
 *
 * Each step:
 * 1. Pick a sentence and tokenize it
 * 2. Forward each token through the model, predicting the next token
 * 3. Compute cross-entropy loss (how surprised the model is at the correct answer)
 * 4. Backward pass: compute gradients via the chain rule
 * 5. Adam optimizer: update parameters to reduce loss
 *
 * Loss starts at -log(1/vocabSize) (random guessing) and decreases as
 * the model learns to predict the next word.
 */

import { Value, vsum } from "./autograd.js";
import { softmax } from "./nn.js";
import { type Model, gpt, createKVCache } from "./model.js";
import type { Tokenizer } from "./tokenizer.js";

/** Hyperparameters for train(). */
export interface TrainConfig {
  /** Total number of optimization steps (one sentence per step). */
  numSteps: number;
  /** Initial Adam learning rate; decays linearly to 0 over numSteps. */
  learningRate: number;
  /** Adam decay rate for the first-moment (mean of gradients) estimate. */
  beta1: number;
  /** Adam decay rate for the second-moment (mean of squared gradients) estimate. */
  beta2: number;
  /** Small constant added to the Adam denominator for numerical stability. */
  epsAdam: number;
}

/**
 * Train the model on the dataset and return the trained model.
 *
 * Per step: tokenize one sentence, forward every position to get a
 * next-token cross-entropy loss, backpropagate, and apply one Adam update
 * with bias correction and linear learning-rate decay.
 */
export function train(
  model: Model,
  trainConfig: TrainConfig,
  sentences: string[],
  tokenizer: Tokenizer,
): Model {
  const { numSteps, learningRate, beta1, beta2, epsAdam } = trainConfig;
  const { params } = model;

  // Adam first/second moment estimates, one slot per parameter
  const firstMoment = new Float64Array(params.length);
  const secondMoment = new Float64Array(params.length);

  for (let step = 0; step < numSteps; step++) {
    // Cycle through the dataset, one sentence per step (BOS on both sides)
    const tokens = tokenizer.encode(sentences[step % sentences.length]);
    const seqLen = Math.min(model.config.blockSize, tokens.length - 1);

    // Forward pass: build the computation graph from tokens to loss
    const { keys, values } = createKVCache(model);
    const losses: Value[] = [];
    for (let posId = 0; posId < seqLen; posId++) {
      const logits = gpt(model, tokens[posId], posId, keys, values);
      const probs = softmax(logits);
      // Cross-entropy: -log(probability assigned to the correct next token)
      losses.push(probs[tokens[posId + 1]].log().neg());
    }

    // Average cross-entropy loss over the sequence
    const loss = vsum(losses).div(seqLen);

    // Backward pass fills in .grad on every Value node as a side effect;
    // the Adam loop below reads those same param objects.
    loss.backward();

    // Adam update with linear learning-rate decay and bias correction
    const lrT = learningRate * (1 - step / numSteps);
    const biasCorrection1 = 1 - beta1 ** (step + 1);
    const biasCorrection2 = 1 - beta2 ** (step + 1);
    for (let i = 0; i < params.length; i++) {
      const g = params[i].grad;
      firstMoment[i] = beta1 * firstMoment[i] + (1 - beta1) * g;
      secondMoment[i] = beta2 * secondMoment[i] + (1 - beta2) * g ** 2;
      const mHat = firstMoment[i] / biasCorrection1;
      const vHat = secondMoment[i] / biasCorrection2;
      params[i].data -= lrT * mHat / (Math.sqrt(vHat) + epsAdam);
      params[i].grad = 0; // reset for the next step's backward pass
    }

    process.stdout.write(
      `\rstep ${String(step + 1).padStart(4)} / ${numSteps} | loss ${loss.data.toFixed(4)}`
    );
  }

  return model;
}

model.ts

/**
 * GPT Model
 *
 * The transformer architecture: a function that maps input tokens to a
 * probability distribution over what comes next.
 *
 * Follows GPT-2 with minor simplifications:
 * - RMSNorm instead of LayerNorm
 * - No biases
 * - ReLU instead of GeLU
 *
 * Config: 32 embedding dims, 4 attention heads, 2 layers, 16 max context
 * → 63,296 parameters total
 */

import { readFileSync, writeFileSync } from "node:fs";
import { Value, vsum } from "./autograd.js";
import { type Matrix, linear, softmax, rmsnorm } from "./nn.js";
import { gauss } from "./rng.js";

// --- Configuration ---

/** Architecture hyperparameters; fixed at creation and persisted with the model. */
export interface GPTConfig {
  /** Number of transformer layers. */
  nLayer: number;
  /** Hidden-state width (embedding dimension). */
  nEmbd: number;
  /** Maximum sequence length (context window). */
  blockSize: number;
  /** Number of attention heads per layer. */
  nHead: number;
  /** Width of each head's slice; nHead * headDim is expected to equal nEmbd. */
  headDim: number;
  /** Number of distinct token ids, including BOS. */
  vocabSize: number;
}

/** Per-layer attention projection matrices (no biases). */
interface Attention {
  query: Matrix;
  key: Matrix;
  value: Matrix;
  output: Matrix;
}

/** Per-layer feed-forward matrices: expand to 4x width, then compress back. */
interface MLP {
  hidden: Matrix;
  output: Matrix;
}

/** One transformer layer: an attention block followed by an MLP block. */
interface Layer {
  attention: Attention;
  mlp: MLP;
}

/** All weight matrices of the model, in their structural arrangement. */
export interface Weights {
  tokenEmbedding: Matrix;
  positionEmbedding: Matrix;
  output: Matrix;
  layers: Layer[];
}

/** The trained (or untrained) model: config + weights + flattened parameter list. */
export interface Model {
  config: GPTConfig;
  weights: Weights;
  // Aliases every Value in `weights`, flattened in a fixed order; the Adam
  // loop and saveModel/loadModel all depend on this ordering being stable.
  params: Value[];
}

// --- Model Creation ---

/** Build an nout × nin matrix of Value parameters, each drawn from N(0, std). */
function matrix(nout: number, nin: number, std = 0.08): Matrix {
  const rows: Matrix = [];
  for (let r = 0; r < nout; r++) {
    const row: Value[] = [];
    for (let c = 0; c < nin; c++) {
      row.push(new Value(gauss(0, std)));
    }
    rows.push(row);
  }
  return rows;
}

/** Create a new model with randomly initialized weights. */
export function createModel(config: GPTConfig): Model {
  const { nEmbd, nLayer, blockSize, vocabSize } = config;

  const weights: Weights = {
    tokenEmbedding: matrix(vocabSize, nEmbd),
    positionEmbedding: matrix(blockSize, nEmbd),
    output: matrix(vocabSize, nEmbd),
    layers: Array.from({ length: nLayer }, () => ({
      attention: {
        query: matrix(nEmbd, nEmbd),
        key: matrix(nEmbd, nEmbd),
        value: matrix(nEmbd, nEmbd),
        output: matrix(nEmbd, nEmbd),
      },
      mlp: {
        hidden: matrix(4 * nEmbd, nEmbd),
        output: matrix(nEmbd, 4 * nEmbd),
      },
    })),
  };

  const allMatrices: Matrix[] = [
    weights.tokenEmbedding,
    weights.positionEmbedding,
    weights.output,
    ...weights.layers.flatMap((layer) => [
      layer.attention.query, layer.attention.key,
      layer.attention.value, layer.attention.output,
      layer.mlp.hidden, layer.mlp.output,
    ]),
  ];
  const params = allMatrices.flatMap((mat) => mat.flatMap((row) => row));

  return { config, weights, params };
}

// --- Save / Load ---

/** Save a trained model to a JSON file (config + parameter values). */
export function saveModel(model: Model, path: string): void {
  // Only the scalar .data of each parameter is persisted; the structure is
  // rebuilt from the config on load, relying on the fixed params ordering.
  const payload = {
    config: model.config,
    weights: model.params.map((p) => p.data),
  };
  writeFileSync(path, JSON.stringify(payload));
}

/**
 * Load a model from a JSON file. Recreates the structure from the saved
 * config, then fills in the learned weights.
 *
 * @throws Error if the file's weight count does not match the recreated
 *         model — without this check a stale or corrupt file would silently
 *         assign `undefined` to some params (NaN on the next forward pass)
 *         or leave them randomly initialized.
 */
export function loadModel(path: string): Model {
  const data = JSON.parse(readFileSync(path, "utf-8"));
  const model = createModel(data.config);
  if (!Array.isArray(data.weights) || data.weights.length !== model.params.length) {
    throw new Error(
      `model file ${path} has ${data.weights?.length ?? 0} weights, expected ${model.params.length}`,
    );
  }
  for (let i = 0; i < model.params.length; i++) {
    model.params[i].data = data.weights[i];
  }
  return model;
}

// --- KV Cache ---

/** Create fresh key/value caches for a new sequence. Must be called per-sequence. */
export function createKVCache(model: Model): {
  keys: Value[][][];
  values: Value[][][];
} {
  // One empty list per layer for each cache; gpt() appends one entry per
  // position as the sequence is processed.
  const emptyPerLayer = (): Value[][][] =>
    Array.from({ length: model.config.nLayer }, () => []);
  return { keys: emptyPerLayer(), values: emptyPerLayer() };
}

// --- Forward Pass ---

/**
 * Run one step of the GPT: given a token at a position, return logits
 * over the vocabulary for the next token.
 *
 * The keys/values caches are mutated (appended to) on each call —
 * this is the KV cache that avoids recomputing attention for past positions.
 *
 * @param model   config + weights to run the forward pass with
 * @param tokenId vocabulary index of the current input token
 * @param posId   0-based position of the token in the sequence; callers
 *                (train/generate) pass consecutive positions 0, 1, 2, …
 *                so each cache entry lines up with its position
 * @param keys    per-layer key cache from createKVCache — mutated in place
 * @param values  per-layer value cache from createKVCache — mutated in place
 * @returns raw scores (logits), one per vocabulary entry, for the next token
 */
export function gpt(
  model: Model,
  tokenId: number,
  posId: number,
  keys: Value[][][],
  values: Value[][][],
): Value[] {
  const { nLayer, nHead, headDim } = model.config;
  const { weights } = model;

  // Step 1: Embedding lookup
  // Combine "what word is this?" with "where does it appear?" into a single
  // vector. This is the hidden state that flows through the rest of the network.
  const tokenVector: Value[] = weights.tokenEmbedding[tokenId];
  const positionVector: Value[] = weights.positionEmbedding[posId];
  let hidden: Value[] = tokenVector.map((t, i) => t.add(positionVector[i]));

  // Normalize before the first layer to keep values at a stable scale
  hidden = rmsnorm(hidden);

  // Step 2: Transformer layers
  // Each layer has two blocks: attention (gather context from other tokens)
  // followed by MLP (process the gathered information). Both use residual
  // connections so the input is added back to the output of each block.
  for (let li = 0; li < nLayer; li++) {
    const layer = weights.layers[li];

    // --- Attention block: look at previous tokens to gather context ---

    // Save the hidden state so we can add it back after the block (residual)
    const beforeAttention: Value[] = hidden;
    hidden = rmsnorm(hidden);

    // Project the hidden state into query, key, and value vectors.
    // These are structurally identical projections — training teaches them
    // to play different roles: query asks "what am I looking for?",
    // key advertises "what do I contain?", value carries "what to retrieve".
    const query: Value[] = linear(hidden, layer.attention.query);
    const key: Value[] = linear(hidden, layer.attention.key);
    const value: Value[] = linear(hidden, layer.attention.value);

    // Cache the key and value so future tokens can attend to this position
    keys[li].push(key);
    values[li].push(value);

    // Each head independently attends to a different slice of the vectors,
    // allowing the model to track multiple relationships at once
    const attentionOutput: Value[] = [];
    for (let h = 0; h < nHead; h++) {
      const headStart = h * headDim;
      const headQuery = query.slice(headStart, headStart + headDim);
      const headKeys = keys[li].map((ki) => ki.slice(headStart, headStart + headDim));
      const headValues = values[li].map((vi) => vi.slice(headStart, headStart + headDim));

      // Scaled dot-product attention: score = (query · key) / √headDim.
      // Causality is implicit: the cache only contains positions up to posId,
      // so there is nothing "in the future" to attend to.
      const attnLogits = headKeys.map((cachedKey) =>
        vsum(headQuery.map((q, j) => q.mul(cachedKey[j]))).div(Math.sqrt(headDim))
      );

      // Softmax converts scores into weights that sum to 1
      const attnWeights = softmax(attnLogits);

      // Weighted sum of value vectors — high-scoring positions contribute more
      for (let j = 0; j < headDim; j++) {
        attentionOutput.push(vsum(attnWeights.map((w, t) => w.mul(headValues[t][j]))));
      }
    }

    // Project concatenated head outputs back to the hidden dimension
    hidden = linear(attentionOutput, layer.attention.output);
    // Residual connection: add back what we had before attention
    hidden = hidden.map((h, i) => h.add(beforeAttention[i]));

    // --- MLP block: process each token's representation independently ---

    const beforeMLP: Value[] = hidden;
    hidden = rmsnorm(hidden);
    hidden = linear(hidden, layer.mlp.hidden);   // expand to 4x wider
    hidden = hidden.map((h) => h.relu());         // nonlinearity
    hidden = linear(hidden, layer.mlp.output);    // compress back
    // Residual connection: add back what we had before the MLP
    hidden = hidden.map((h, i) => h.add(beforeMLP[i]));
  }

  // Step 3: Output projection
  // Project the final hidden state to vocabulary size — one score per word.
  // These raw scores (logits) will be passed through softmax later to get
  // a probability distribution over the next token.
  return linear(hidden, weights.output);
}

rng.ts

/**
 * Seeded Random Number Generator
 *
 * JavaScript has no built-in seeded RNG, so we roll our own:
 * - mulberry32 for uniform random numbers
 * - Box-Muller transform for Gaussian (normal) distribution
 * - Fisher-Yates for array shuffling
 * - Weighted random choice for sampling from probability distributions
 */

// Internal PRNG state; reset via seed()
let state = 0;

/** Reset the generator so subsequent draws are reproducible. */
export function seed(s: number): void {
  state = s | 0;
}

/** Mulberry32: a fast, seedable 32-bit PRNG. Returns a uniform float in [0, 1). */
export function random(): number {
  state = (state + 0x6d2b79f5) | 0;
  let x = state;
  x = Math.imul(x ^ (x >>> 15), x | 1);
  x ^= x + Math.imul(x ^ (x >>> 7), x | 61);
  return ((x ^ (x >>> 14)) >>> 0) / 4294967296;
}

/** Box-Muller transform: convert two uniform samples into a Gaussian sample. */
export function gauss(mean: number, std: number): number {
  // log(0) would be -Infinity, so redraw until the first sample is nonzero
  let u1 = random();
  while (u1 === 0) u1 = random();
  const u2 = random();
  return mean + std * Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
}

/** Fisher-Yates shuffle: randomly reorder an array in-place. */
export function shuffle<T>(arr: T[]): void {
  for (let i = arr.length - 1; i > 0; i--) {
    const j = Math.floor(random() * (i + 1));
    const tmp = arr[i];
    arr[i] = arr[j];
    arr[j] = tmp;
  }
}

/** Sample an index from a weighted distribution. Used for token sampling during inference. */
export function weightedChoice(weights: number[]): number {
  const total = weights.reduce((acc, w) => acc + w, 0);
  let remaining = random() * total;
  for (let i = 0; i < weights.length; i++) {
    remaining -= weights[i];
    if (remaining <= 0) return i;
  }
  // Floating-point residue can leave `remaining` slightly positive
  return weights.length - 1;
}

nn.ts

/**
 * Neural Network Primitives
 *
 * The building blocks that the GPT architecture assembles:
 * - linear: matrix-vector multiply (the fundamental neural net operation)
 * - softmax: convert raw scores into a probability distribution
 * - rmsnorm: normalize activations to stabilize training
 *
 * These mirror PyTorch's torch.nn.functional — general-purpose operations
 * used by the model, training loop, and inference.
 */

import { Value, vsum } from "./autograd.js";

/** A 2-D array of autograd scalars: one row per output dimension. */
export type Matrix = Value[][];

/** y = Wx: each output element is the dot product of a weight row with the input. */
export function linear(input: Value[], weights: Matrix): Value[] {
  const out: Value[] = [];
  for (const row of weights) {
    out.push(vsum(row.map((w, i) => w.mul(input[i]))));
  }
  return out;
}

/** Convert raw logits to probabilities. Subtracts max for numerical stability. */
export function softmax(logits: Value[]): Value[] {
  // Plain-number max is fine here: a constant shift doesn't change gradients
  let maxVal = -Infinity;
  for (const v of logits) {
    if (v.data > maxVal) maxVal = v.data;
  }
  const exps = logits.map((v) => v.sub(maxVal).exp());
  const total = vsum(exps);
  return exps.map((e) => e.div(total));
}

/** Root Mean Square normalization: scale activations to unit variance. */
export function rmsnorm(input: Value[]): Value[] {
  // mean of squares, with a small epsilon so the inverse sqrt never divides by 0
  const meanSquare = vsum(input.map((xi) => xi.mul(xi))).div(input.length);
  const invRms = meanSquare.add(1e-5).pow(-0.5);
  return input.map((xi) => xi.mul(invRms));
}

autograd.ts

/**
 * Autograd Engine
 *
 * A scalar-valued automatic differentiation engine. Each Value node records:
 * - its forward-pass result (data)
 * - its gradient w.r.t. the loss (grad), filled in by backward()
 * - its children in the computation graph and the local derivatives
 *
 * The backward() method applies the chain rule via reverse-mode autodiff:
 * topologically sort the graph, then propagate gradients from output to inputs.
 *
 * "If I nudge this parameter slightly, how does the loss change?"
 */

/**
 * A scalar node in the computation graph. Records its forward result, its
 * gradient w.r.t. the loss, and the local derivatives needed for backprop.
 */
export class Value {
  data: number;          // forward-pass result
  grad: number;          // d(loss)/d(this), filled in by backward()
  children: Value[];     // graph inputs this node was computed from
  localGrads: number[];  // d(this)/d(child), one per child

  constructor(data: number, children: Value[] = [], localGrads: number[] = []) {
    this.data = data;
    this.grad = 0;
    this.children = children;
    this.localGrads = localGrads;
  }

  add(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return new Value(this.data + o.data, [this, o], [1, 1]);
  }

  mul(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return new Value(this.data * o.data, [this, o], [o.data, this.data]);
  }

  /** this ** n, where n is a plain constant (not part of the graph). */
  pow(n: number): Value {
    return new Value(this.data ** n, [this], [n * this.data ** (n - 1)]);
  }

  log(): Value {
    return new Value(Math.log(this.data), [this], [1 / this.data]);
  }

  exp(): Value {
    return new Value(Math.exp(this.data), [this], [Math.exp(this.data)]);
  }

  /** max(0, this); gradient is 0 for non-positive inputs. */
  relu(): Value {
    return new Value(Math.max(0, this.data), [this], [this.data > 0 ? 1 : 0]);
  }

  neg(): Value {
    return this.mul(-1);
  }

  sub(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return this.add(o.neg());
  }

  div(other: Value | number): Value {
    const o = typeof other === "number" ? new Value(other) : other;
    return this.mul(o.pow(-1));
  }

  /**
   * Reverse-mode autodiff: topologically sort the graph below this node,
   * then propagate gradients from output to inputs via the chain rule.
   *
   * The post-order traversal uses an explicit stack rather than recursion:
   * training builds graphs with many thousands of chained nodes, and the
   * recursive formulation overflows the call stack on deep graphs.
   */
  backward(): void {
    const topo: Value[] = [];
    const visited = new Set<Value>();
    // Iterative post-order DFS; each frame tracks how many children are done
    const stack: { node: Value; childIndex: number }[] = [{ node: this, childIndex: 0 }];
    visited.add(this);
    while (stack.length > 0) {
      const frame = stack[stack.length - 1];
      if (frame.childIndex < frame.node.children.length) {
        const child = frame.node.children[frame.childIndex++];
        if (!visited.has(child)) {
          visited.add(child);
          stack.push({ node: child, childIndex: 0 });
        }
      } else {
        // All children emitted — this node is safe to place after them
        topo.push(frame.node);
        stack.pop();
      }
    }
    this.grad = 1;
    for (const v of topo.reverse()) {
      for (let i = 0; i < v.children.length; i++) {
        v.children[i].grad += v.localGrads[i] * v.grad;
      }
    }
  }
}

/** Sum a list of Values through the computation graph. */
export function vsum(values: Value[]): Value {
  // Fold left from a fresh zero node so the sum participates in autograd
  let total = new Value(0);
  for (const v of values) {
    total = total.add(v);
  }
  return total;
}

tokenizer.ts

/**
 * Tokenizer
 *
 * Translates strings to sequences of integers ("tokens") and back.
 * Builds a word-level vocabulary from the training corpus, with a BOS
 * (Beginning of Sequence) delimiter appended on each side.
 */

/** Translates strings to sequences of token ids and back. */
export interface Tokenizer {
  /** Number of distinct token ids, including BOS. */
  vocabSize: number;
  /** Beginning-of-sequence delimiter id (appended on both sides by encode). */
  BOS: number;
  /** Map a sentence to word ids, wrapped in BOS on both sides; unknown words become -1. */
  encode(sentence: string): number[];
  /** Map token ids back to words, joined with spaces. */
  decode(tokens: number[]): string;
}

/** Word-level tokenizer. Discovers the word vocabulary from the corpus. */
export function createWordTokenizer(sentences: string[]): Tokenizer {
  // Sorted unique words; each word's index in this array is its token id
  const words = [...new Set(sentences.flatMap((d) => d.split(" ")))].sort();
  // O(1) lookup table — indexOf made every encoded word an O(vocab) scan,
  // turning corpus encoding accidentally quadratic
  const wordToId = new Map(words.map((w, i) => [w, i] as const));
  const BOS = words.length;
  const vocabSize = words.length + 1;

  return {
    vocabSize,
    BOS,
    encode(sentence: string): number[] {
      // `?? -1` preserves indexOf's -1 result for out-of-vocabulary words
      return [BOS, ...sentence.split(" ").map((w) => wordToId.get(w) ?? -1), BOS];
    },
    decode(tokens: number[]): string {
      return tokens.map((t) => words[t]).join(" ");
    },
  };
}