From 7c5354efa7d05c12454996444eac2a8edaf46cd5 Mon Sep 17 00:00:00 2001 From: Levi Neuwirth Date: Tue, 9 Jun 2026 18:57:43 -0400 Subject: [PATCH] embed.py: split page vs paragraph embedding models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pages (similar-links.json, build-only) move to nomic-embed-text-v1.5 (768d) with an on-disk npz cache; paragraphs (browser semantic search) stay on all-MiniLM-L6-v2 (384d), so the client contract is unchanged. WRITING.md search row updated accordingly. einops added for nomic's remote modeling code; cache gitignored with a trailing glob so interrupted-write debris is covered too. Known follow-ups (AUDIT-2026-06-09.md §1.3, §4): pin the nomic-bert-2048 remote code, catch BadZipFile in cache loads, fix the staleness check defeated by stamp-build-time ordering. Co-Authored-By: Claude Fable 5 --- .gitignore | 2 + WRITING.md | 2 +- pyproject.toml | 3 +- tools/embed.py | 170 ++++++++++++++++++++++++++++++++++++++++++------- uv.lock | 11 ++++ 5 files changed, 163 insertions(+), 25 deletions(-) diff --git a/.gitignore b/.gitignore index e7ca3f8..2bcd68c 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,8 @@ data/build-stamp.txt data/last-build-seconds.txt data/semantic-index.bin data/semantic-meta.json +# Trailing glob also catches interrupted-write debris (.tmp / .tmp.npz) +data/embed-cache-pages.npz* # Archive: generated text + its staleness stamp (recreated from the # committed artifact on every build — deterministic, so committing them is diff --git a/WRITING.md b/WRITING.md index 0d6fcbb..4e8c37f 100644 --- a/WRITING.md +++ b/WRITING.md @@ -1125,7 +1125,7 @@ These pages are built automatically and require no content files or markup: | Author indexes | `/authors//` | All content attributed to an author | | Random manifest | `/random-pages.json` | JSON array of page URLs for the random-page button | | Atom feeds | `/feed.xml`, `/music/feed.xml` | All content feed + music-only feed | -| Search | `/search.html` | Pagefind full-text search + client-side semantic search (`nomic-embed-text-v1.5` ONNX model) | +| Search | `/search.html` | Pagefind full-text search + client-side semantic search (`all-MiniLM-L6-v2` ONNX model) | --- diff --git a/pyproject.toml b/pyproject.toml index 5332d99..b82d0d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,6 @@ dependencies = [ # Visualization "matplotlib>=3.9,<4", "altair>=5.4,<6", - # Embedding pipeline # Upper bounds are intentionally generous (next major) but always # present so that an unrelated `uv sync` upgrade can't silently pull @@ -18,7 +17,6 @@ dependencies = [ "beautifulsoup4>=4.12,<5", # CPU-only torch — avoids pulling ~3 GB of CUDA libraries "torch>=2.5,<3", - # Photography pipeline # Pillow handles EXIF reading when exiftool is not installed (the # preferred path); colorthief computes the 5-color palette strip. @@ -26,6 +24,7 @@ dependencies = [ "pillow>=10.0,<12", "colorthief>=0.2,<1", "pyyaml>=6.0,<7", + "einops>=0.8.2", ] [[tool.uv.index]] diff --git a/tools/embed.py b/tools/embed.py index e201f66..67f57c8 100644 --- a/tools/embed.py +++ b/tools/embed.py @@ -5,16 +5,25 @@ embed.py — Build-time embedding pipeline. Produces two outputs from _site/**/*.html: data/similar-links.json Page-level similarity (for "Related" footer section) - data/semantic-index.bin Paragraph vectors as raw Float32 array (N × DIM) + data/semantic-index.bin Paragraph vectors as raw Float32 array (N × PARA_DIM) data/semantic-meta.json Paragraph metadata: [{url, title, heading, excerpt}] -Both use all-MiniLM-L6-v2 (384 dims) — the same model shipped to the browser -via transformers.js for query-time semantic search. +Two models, one process: + + * Pages use nomic-embed-text-v1.5 (768 dims) — build-time only, never + shipped to the browser. Chosen for its well-separated cosine scores on + small corpora, which keeps the MIN_SCORE gate meaningful so every essay + reliably gets a "Related" footer section. + + * Paragraphs use all-MiniLM-L6-v2 (384 dims) — must match what the + browser runs via transformers.js (static/js/semantic-search.js) since + query vectors are dotted against the shipped index. Called by `make build` when .venv exists. Failures are non-fatal. Staleness check: skips if all output files are newer than every HTML in _site/. """ +import hashlib import json import os import re @@ -35,13 +44,42 @@ SITE_DIR = REPO_ROOT / "_site" SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json" SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin" SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json" +# Content-addressed cache for nomic page embeddings. Keyed by sha256 of the +# prefixed page text; invalidated wholesale on model name/revision/dim change. +# Gitignored — a build artifact, not source. Survives `make clean`. +PAGE_CACHE = REPO_ROOT / "data" / "embed-cache-pages.npz" -MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" -# Pinned to a specific HuggingFace commit so a future model bump can't -# silently change embedding semantics. Bump deliberately when validating -# (and re-run a full embed pass to refresh data/semantic-* + similar-links). -MODEL_REVISION = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf" -DIM = 384 +# Two models, deliberately split: +# +# PARA_MODEL — embeds paragraphs for data/semantic-index.bin. This index +# is fetched by the browser at /search/ and ranked against query vectors +# computed client-side. The client (static/js/semantic-search.js) embeds +# queries with MiniLM-L6-v2 via transformers.js, so the build-time model +# must match exactly — both the architecture and the embedding dimension +# are part of the wire contract. +# +# PAGE_MODEL — embeds full pages for data/similar-links.json. This file +# is consumed only at Hakyll-build time (SimilarLinks.hs) and never +# shipped to the browser, so it is free to use a different, stronger +# model. nomic-embed-text-v1.5 produces well-separated cosine scores on +# small corpora (top neighbours at 0.7–0.9 instead of MiniLM's compressed +# 0.1–0.3), so the MIN_SCORE gate below is meaningful and every essay +# reliably gets a "Related" footer section. +# +# Both pins are deliberate. Bump only when validating and re-run a full +# embed pass to refresh the corresponding output files. + +PARA_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" +PARA_MODEL_REVISION = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf" +PARA_DIM = 384 + +PAGE_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5" +PAGE_MODEL_REVISION = "e9b6763023c676ca8431644204f50c2b100d9aab" +PAGE_DIM = 768 +# Nomic requires task-prefixed input. Documents (corpus side) get +# "search_document: "; queries would get "search_query: ". similar-links +# only ever embeds documents, so the prefix is constant here. +PAGE_PREFIX = "search_document: " TOP_N = 5 # similar-links: neighbours per page MIN_SCORE = 0.30 # similar-links: discard weak matches @@ -80,6 +118,71 @@ def atomic_write_bytes(path: Path, data: bytes) -> None: def atomic_write_text(path: Path, text: str) -> None: atomic_write_bytes(path, text.encode("utf-8")) + +# --------------------------------------------------------------------------- +# Page-embedding cache +# --------------------------------------------------------------------------- +# +# Loading the nomic model and embedding 26 pages on CPU takes ~3 minutes +# every `make build`. Pages rarely change between builds — usually one +# essay is edited and everything else is identical. This cache stores +# one nomic vector per page content hash so unchanged pages are reused +# verbatim and only edited/new pages are re-embedded. A fully-warm cache +# skips the model load entirely. + +def content_hash(text: str) -> str: + return hashlib.sha256(text.encode("utf-8")).hexdigest() + + +def load_page_cache() -> dict[str, np.ndarray]: + """Load {hash: vector} from disk. Returns an empty dict if the cache + is absent, unreadable, or pinned to a different model — in those + cases save_page_cache() will overwrite the stale file on next save.""" + if not PAGE_CACHE.exists(): + return {} + try: + npz = np.load(PAGE_CACHE, allow_pickle=False) + if (npz["model"].item() != PAGE_MODEL_NAME or + npz["revision"].item() != PAGE_MODEL_REVISION or + int(npz["dim"].item()) != PAGE_DIM): + return {} + hashes = npz["hashes"] + vectors = npz["vectors"] + if vectors.shape != (len(hashes), PAGE_DIM): + return {} + return {h.item(): vectors[i] for i, h in enumerate(hashes)} + except (OSError, KeyError, ValueError) as e: + print(f"embed.py: page cache unreadable ({e}) — discarding", + file=sys.stderr) + return {} + + +def save_page_cache(cache: dict[str, np.ndarray]) -> None: + """Atomically persist {hash: vector}. Empty cache writes an empty + file so a subsequent load returns {} cleanly (instead of falling + through to the "no file" path).""" + if cache: + hashes = np.array(list(cache.keys())) + vectors = np.stack(list(cache.values())).astype(np.float32) + else: + hashes = np.array([], dtype="U64") + vectors = np.zeros((0, PAGE_DIM), dtype=np.float32) + PAGE_CACHE.parent.mkdir(parents=True, exist_ok=True) + # Pass an open file handle, not a path: np.savez_compressed appends + # ".npz" to bare paths, which would mangle our atomic-rename target. + tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp") + with open(tmp, "wb") as f: + np.savez_compressed( + f, + model=PAGE_MODEL_NAME, + revision=PAGE_MODEL_REVISION, + dim=PAGE_DIM, + hashes=hashes, + vectors=vectors, + ) + os.replace(tmp, PAGE_CACHE) + + STRIP_SELECTORS = [ "nav", "footer", "#toc", ".link-popup", "script", "style", ".page-meta-footer", ".metadata", "[data-pagefind-ignore]", @@ -211,18 +314,38 @@ def main() -> int: print("embed.py: no indexable pages found", file=sys.stderr) return 0 - # --- Load model once for both tasks --- - print(f"embed.py: loading {MODEL_NAME}@{MODEL_REVISION[:8]}…") - model = SentenceTransformer(MODEL_NAME, revision=MODEL_REVISION) + # --- Similar-links (page level, nomic, content-hash cached) --- + cache = load_page_cache() + page_inputs = [PAGE_PREFIX + p["text"] for p in pages] + hashes = [content_hash(t) for t in page_inputs] + miss_idxs = [i for i, h in enumerate(hashes) if h not in cache] - # --- Similar-links (page level) --- - print(f"embed.py: embedding {len(pages)} pages…") - page_vecs = model.encode( - [p["text"] for p in pages], - normalize_embeddings=True, - show_progress_bar=True, - batch_size=64, - ).astype(np.float32) + print(f"embed.py: {len(pages) - len(miss_idxs)} cached / " + f"{len(miss_idxs)} to embed") + + if miss_idxs: + print(f"embed.py: loading {PAGE_MODEL_NAME}@{PAGE_MODEL_REVISION[:8]}…") + page_model = SentenceTransformer( + PAGE_MODEL_NAME, revision=PAGE_MODEL_REVISION, trust_remote_code=True, + ) + new_vecs = page_model.encode( + [page_inputs[i] for i in miss_idxs], + normalize_embeddings=True, + show_progress_bar=True, + batch_size=8, + ).astype(np.float32) + for i, vec in zip(miss_idxs, new_vecs): + cache[hashes[i]] = vec + # Drop the model before loading MiniLM below; sentence-transformers + # holds the full weight tensor in RAM until GC runs. + del page_model + + # Assemble page_vecs in the original pages[] order. + page_vecs = np.stack([cache[h] for h in hashes]).astype(np.float32) + + # Prune the cache to only currently-present hashes so a deleted page + # doesn't keep its vector around forever. Then persist. + save_page_cache({h: cache[h] for h in hashes}) index = faiss.IndexFlatIP(page_vecs.shape[1]) index.add(page_vecs) @@ -245,13 +368,16 @@ def main() -> int: atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2)) print(f"embed.py: wrote {len(similar)} similar-links entries") - # --- Semantic index (paragraph level) --- + # --- Semantic index (paragraph level, MiniLM) --- if not paragraphs: print("embed.py: no paragraphs extracted — skipping semantic index") return 0 + print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}…") + para_model = SentenceTransformer(PARA_MODEL_NAME, revision=PARA_MODEL_REVISION) + print(f"embed.py: embedding {len(paragraphs)} paragraphs…") - para_vecs = model.encode( + para_vecs = para_model.encode( [p["text"] for p in paragraphs], normalize_embeddings=True, show_progress_bar=True, diff --git a/uv.lock b/uv.lock index 679f303..dad9b9e 100644 --- a/uv.lock +++ b/uv.lock @@ -156,6 +156,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, ] +[[package]] +name = "einops" +version = "0.8.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" }, +] + [[package]] name = "faiss-cpu" version = "1.13.2" @@ -364,6 +373,7 @@ dependencies = [ { name = "altair" }, { name = "beautifulsoup4" }, { name = "colorthief" }, + { name = "einops" }, { name = "faiss-cpu" }, { name = "matplotlib" }, { name = "numpy" }, @@ -379,6 +389,7 @@ requires-dist = [ { name = "altair", specifier = ">=5.4,<6" }, { name = "beautifulsoup4", specifier = ">=4.12,<5" }, { name = "colorthief", specifier = ">=0.2,<1" }, + { name = "einops", specifier = ">=0.8.2" }, { name = "faiss-cpu", specifier = ">=1.9,<2" }, { name = "matplotlib", specifier = ">=3.9,<4" }, { name = "numpy", specifier = ">=2.0,<3" },