embed.py: split page vs paragraph embedding models

Pages (similar-links.json, build-only) move to nomic-embed-text-v1.5 (768d) with an on-disk npz cache; paragraphs (browser semantic search) stay on all-MiniLM-L6-v2 (384d), so the client contract is unchanged. WRITING.md search row updated accordingly. einops added for nomic's remote modeling code; cache gitignored with a trailing glob so interrupted-write debris is covered too. Known follow-ups (AUDIT-2026-06-09.md §1.3, §4): pin the nomic-bert-2048 remote code, catch BadZipFile in cache loads, fix the staleness check defeated by stamp-build-time ordering. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-09 18:57:43 -04:00 · 2026-06-09 18:57:43 -04:00 · 7c5354efa7
parent 37665f67db
commit 7c5354efa7
5 changed files with 163 additions and 25 deletions
--- a/.gitignore
+++ b/.gitignore
@ -73,6 +73,8 @@ data/build-stamp.txt
 data/last-build-seconds.txt
 data/semantic-index.bin
 data/semantic-meta.json
+# Trailing glob also catches interrupted-write debris (.tmp / .tmp.npz)
+data/embed-cache-pages.npz*

 # Archive: generated text + its staleness stamp (recreated from the
 # committed artifact on every build — deterministic, so committing them is
--- a/WRITING.md
+++ b/WRITING.md
@ -1125,7 +1125,7 @@ These pages are built automatically and require no content files or markup:
 | Author indexes | `/authors/<slug>/` | All content attributed to an author |
 | Random manifest | `/random-pages.json` | JSON array of page URLs for the random-page button |
 | Atom feeds | `/feed.xml`, `/music/feed.xml` | All content feed + music-only feed |
-| Search | `/search.html` | Pagefind full-text search + client-side semantic search (`nomic-embed-text-v1.5` ONNX model) |
+| Search | `/search.html` | Pagefind full-text search + client-side semantic search (`all-MiniLM-L6-v2` ONNX model) |

 ---

--- a/pyproject.toml
+++ b/pyproject.toml
@ -7,7 +7,6 @@ dependencies = [
    # Visualization
    "matplotlib>=3.9,<4",
    "altair>=5.4,<6",
-
    # Embedding pipeline
    # Upper bounds are intentionally generous (next major) but always
    # present so that an unrelated `uv sync` upgrade can't silently pull
@ -18,7 +17,6 @@ dependencies = [
    "beautifulsoup4>=4.12,<5",
    # CPU-only torch — avoids pulling ~3 GB of CUDA libraries
    "torch>=2.5,<3",
-
    # Photography pipeline
    # Pillow handles EXIF reading when exiftool is not installed (the
    # preferred path); colorthief computes the 5-color palette strip.
@ -26,6 +24,7 @@ dependencies = [
    "pillow>=10.0,<12",
    "colorthief>=0.2,<1",
    "pyyaml>=6.0,<7",
+    "einops>=0.8.2",
 ]

 [[tool.uv.index]]
--- a/tools/embed.py
+++ b/tools/embed.py
@ -5,16 +5,25 @@ embed.py — Build-time embedding pipeline.
 Produces two outputs from _site/**/*.html:

  data/similar-links.json       Page-level similarity (for "Related" footer section)
-  data/semantic-index.bin       Paragraph vectors as raw Float32 array (N × DIM)
+  data/semantic-index.bin       Paragraph vectors as raw Float32 array (N × PARA_DIM)
  data/semantic-meta.json       Paragraph metadata: [{url, title, heading, excerpt}]

-Both use all-MiniLM-L6-v2 (384 dims) — the same model shipped to the browser
-via transformers.js for query-time semantic search.
+Two models, one process:
+
+  * Pages use nomic-embed-text-v1.5 (768 dims) — build-time only, never
+    shipped to the browser. Chosen for its well-separated cosine scores on
+    small corpora, which keeps the MIN_SCORE gate meaningful so every essay
+    reliably gets a "Related" footer section.
+
+  * Paragraphs use all-MiniLM-L6-v2 (384 dims) — must match what the
+    browser runs via transformers.js (static/js/semantic-search.js) since
+    query vectors are dotted against the shipped index.

 Called by `make build` when .venv exists. Failures are non-fatal.
 Staleness check: skips if all output files are newer than every HTML in _site/.
 """

+import hashlib
 import json
 import os
 import re
@ -35,13 +44,42 @@ SITE_DIR       = REPO_ROOT / "_site"
 SIMILAR_OUT    = REPO_ROOT / "data" / "similar-links.json"
 SEMANTIC_BIN   = REPO_ROOT / "data" / "semantic-index.bin"
 SEMANTIC_META  = REPO_ROOT / "data" / "semantic-meta.json"
+# Content-addressed cache for nomic page embeddings. Keyed by sha256 of the
+# prefixed page text; invalidated wholesale on model name/revision/dim change.
+# Gitignored — a build artifact, not source. Survives `make clean`.
+PAGE_CACHE     = REPO_ROOT / "data" / "embed-cache-pages.npz"

-MODEL_NAME     = "sentence-transformers/all-MiniLM-L6-v2"
-# Pinned to a specific HuggingFace commit so a future model bump can't
-# silently change embedding semantics. Bump deliberately when validating
-# (and re-run a full embed pass to refresh data/semantic-* + similar-links).
-MODEL_REVISION = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf"
-DIM            = 384
+# Two models, deliberately split:
+#
+#   PARA_MODEL — embeds paragraphs for data/semantic-index.bin. This index
+#   is fetched by the browser at /search/ and ranked against query vectors
+#   computed client-side. The client (static/js/semantic-search.js) embeds
+#   queries with MiniLM-L6-v2 via transformers.js, so the build-time model
+#   must match exactly — both the architecture and the embedding dimension
+#   are part of the wire contract.
+#
+#   PAGE_MODEL — embeds full pages for data/similar-links.json. This file
+#   is consumed only at Hakyll-build time (SimilarLinks.hs) and never
+#   shipped to the browser, so it is free to use a different, stronger
+#   model. nomic-embed-text-v1.5 produces well-separated cosine scores on
+#   small corpora (top neighbours at 0.7–0.9 instead of MiniLM's compressed
+#   0.1–0.3), so the MIN_SCORE gate below is meaningful and every essay
+#   reliably gets a "Related" footer section.
+#
+# Both pins are deliberate. Bump only when validating and re-run a full
+# embed pass to refresh the corresponding output files.
+
+PARA_MODEL_NAME     = "sentence-transformers/all-MiniLM-L6-v2"
+PARA_MODEL_REVISION = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf"
+PARA_DIM            = 384
+
+PAGE_MODEL_NAME     = "nomic-ai/nomic-embed-text-v1.5"
+PAGE_MODEL_REVISION = "e9b6763023c676ca8431644204f50c2b100d9aab"
+PAGE_DIM            = 768
+# Nomic requires task-prefixed input. Documents (corpus side) get
+# "search_document: "; queries would get "search_query: ". similar-links
+# only ever embeds documents, so the prefix is constant here.
+PAGE_PREFIX         = "search_document: "

 TOP_N          = 5      # similar-links: neighbours per page
 MIN_SCORE      = 0.30   # similar-links: discard weak matches
@ -80,6 +118,71 @@ def atomic_write_bytes(path: Path, data: bytes) -> None:
 def atomic_write_text(path: Path, text: str) -> None:
    atomic_write_bytes(path, text.encode("utf-8"))

+
+# ---------------------------------------------------------------------------
+# Page-embedding cache
+# ---------------------------------------------------------------------------
+#
+# Loading the nomic model and embedding 26 pages on CPU takes ~3 minutes
+# every `make build`. Pages rarely change between builds — usually one
+# essay is edited and everything else is identical. This cache stores
+# one nomic vector per page content hash so unchanged pages are reused
+# verbatim and only edited/new pages are re-embedded. A fully-warm cache
+# skips the model load entirely.
+
+def content_hash(text: str) -> str:
+    return hashlib.sha256(text.encode("utf-8")).hexdigest()
+
+
+def load_page_cache() -> dict[str, np.ndarray]:
+    """Load {hash: vector} from disk. Returns an empty dict if the cache
+    is absent, malformed, or pinned to a different model; the next save
+    overwrites it. A truncated archive (BadZipFile) is not yet caught."""
+    if not PAGE_CACHE.exists():
+        return {}
+    try:
+        npz = np.load(PAGE_CACHE, allow_pickle=False)
+        if (npz["model"].item()    != PAGE_MODEL_NAME or
+            npz["revision"].item() != PAGE_MODEL_REVISION or
+            int(npz["dim"].item()) != PAGE_DIM):
+            return {}
+        hashes  = npz["hashes"]
+        vectors = npz["vectors"]
+        if vectors.shape != (len(hashes), PAGE_DIM):
+            return {}
+        return {h.item(): vectors[i] for i, h in enumerate(hashes)}
+    except (OSError, KeyError, ValueError) as e:
+        print(f"embed.py: page cache unreadable ({e}) — discarding",
+              file=sys.stderr)
+        return {}
+
+
+def save_page_cache(cache: dict[str, np.ndarray]) -> None:
+    """Atomically persist {hash: vector}. Empty cache writes an empty
+    archive (a valid zero-row npz) so a subsequent load returns {}
+    cleanly (instead of falling through to the "no file" path)."""
+    if cache:
+        hashes  = np.array(list(cache.keys()))
+        vectors = np.stack(list(cache.values())).astype(np.float32)
+    else:
+        hashes  = np.array([], dtype="U64")
+        vectors = np.zeros((0, PAGE_DIM), dtype=np.float32)
+    PAGE_CACHE.parent.mkdir(parents=True, exist_ok=True)
+    # Pass an open file handle, not a path: np.savez_compressed appends
+    # ".npz" to bare paths, which would mangle our atomic-rename target.
+    tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp")
+    with open(tmp, "wb") as f:
+        np.savez_compressed(
+            f,
+            model=PAGE_MODEL_NAME,
+            revision=PAGE_MODEL_REVISION,
+            dim=PAGE_DIM,
+            hashes=hashes,
+            vectors=vectors,
+        )
+    os.replace(tmp, PAGE_CACHE)
+
+
 STRIP_SELECTORS = [
    "nav", "footer", "#toc", ".link-popup", "script", "style",
    ".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
@ -211,18 +314,38 @@ def main() -> int:
        print("embed.py: no indexable pages found", file=sys.stderr)
        return 0

-    # --- Load model once for both tasks ---
-    print(f"embed.py: loading {MODEL_NAME}@{MODEL_REVISION[:8]}…")
-    model = SentenceTransformer(MODEL_NAME, revision=MODEL_REVISION)
+    # --- Similar-links (page level, nomic, content-hash cached) ---
+    cache       = load_page_cache()
+    page_inputs = [PAGE_PREFIX + p["text"] for p in pages]
+    hashes      = [content_hash(t) for t in page_inputs]
+    miss_idxs   = [i for i, h in enumerate(hashes) if h not in cache]

-    # --- Similar-links (page level) ---
-    print(f"embed.py: embedding {len(pages)} pages…")
-    page_vecs = model.encode(
-        [p["text"] for p in pages],
+    print(f"embed.py: {len(pages) - len(miss_idxs)} cached / "
+          f"{len(miss_idxs)} to embed")
+
+    if miss_idxs:
+        print(f"embed.py: loading {PAGE_MODEL_NAME}@{PAGE_MODEL_REVISION[:8]}…")
+        page_model = SentenceTransformer(
+            PAGE_MODEL_NAME, revision=PAGE_MODEL_REVISION, trust_remote_code=True,
+        )
+        new_vecs = page_model.encode(
+            [page_inputs[i] for i in miss_idxs],
            normalize_embeddings=True,
            show_progress_bar=True,
-        batch_size=64,
+            batch_size=8,
        ).astype(np.float32)
+        for i, vec in zip(miss_idxs, new_vecs):
+            cache[hashes[i]] = vec
+        # Drop the model before loading MiniLM below; sentence-transformers
+        # holds the full weight tensor in RAM until GC runs.
+        del page_model
+
+    # Assemble page_vecs in the original pages[] order.
+    page_vecs = np.stack([cache[h] for h in hashes]).astype(np.float32)
+
+    # Prune the cache to only currently-present hashes so a deleted page
+    # doesn't keep its vector around forever. Then persist.
+    save_page_cache({h: cache[h] for h in hashes})

    index = faiss.IndexFlatIP(page_vecs.shape[1])
    index.add(page_vecs)
@ -245,13 +368,16 @@ def main() -> int:
    atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2))
    print(f"embed.py: wrote {len(similar)} similar-links entries")

-    # --- Semantic index (paragraph level) ---
+    # --- Semantic index (paragraph level, MiniLM) ---
    if not paragraphs:
        print("embed.py: no paragraphs extracted — skipping semantic index")
        return 0

+    print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}…")
+    para_model = SentenceTransformer(PARA_MODEL_NAME, revision=PARA_MODEL_REVISION)
+
    print(f"embed.py: embedding {len(paragraphs)} paragraphs…")
-    para_vecs = model.encode(
+    para_vecs = para_model.encode(
        [p["text"] for p in paragraphs],
        normalize_embeddings=True,
        show_progress_bar=True,
--- a/uv.lock
+++ b/uv.lock
@ -156,6 +156,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" },
 ]

+[[package]]
+name = "einops"
+version = "0.8.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" },
+]
+
 [[package]]
 name = "faiss-cpu"
 version = "1.13.2"
@ -364,6 +373,7 @@ dependencies = [
    { name = "altair" },
    { name = "beautifulsoup4" },
    { name = "colorthief" },
+    { name = "einops" },
    { name = "faiss-cpu" },
    { name = "matplotlib" },
    { name = "numpy" },
@ -379,6 +389,7 @@ requires-dist = [
    { name = "altair", specifier = ">=5.4,<6" },
    { name = "beautifulsoup4", specifier = ">=4.12,<5" },
    { name = "colorthief", specifier = ">=0.2,<1" },
+    { name = "einops", specifier = ">=0.8.2" },
    { name = "faiss-cpu", specifier = ">=1.9,<2" },
    { name = "matplotlib", specifier = ">=3.9,<4" },
    { name = "numpy", specifier = ">=2.0,<3" },