embed.py: split page vs paragraph embedding models

Pages (similar-links.json, build-only) move to nomic-embed-text-v1.5
(768d) with an on-disk npz cache; paragraphs (browser semantic search)
stay on all-MiniLM-L6-v2 (384d), so the client contract is unchanged.
WRITING.md search row updated accordingly. einops added for nomic's
remote modeling code; cache gitignored with a trailing glob so
interrupted-write debris is covered too.

Known follow-ups (AUDIT-2026-06-09.md §1.3, §4): pin the
nomic-bert-2048 remote code, catch BadZipFile in cache loads, fix the
staleness check defeated by stamp-build-time ordering.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Levi Neuwirth 2026-06-09 18:57:43 -04:00
parent 37665f67db
commit 7c5354efa7
5 changed files with 163 additions and 25 deletions

2
.gitignore vendored
View File

@ -73,6 +73,8 @@ data/build-stamp.txt
data/last-build-seconds.txt
data/semantic-index.bin
data/semantic-meta.json
# Trailing glob also catches interrupted-write debris (.tmp / .tmp.npz)
data/embed-cache-pages.npz*
# Archive: generated text + its staleness stamp (recreated from the
# committed artifact on every build — deterministic, so committing them is

View File

@ -1125,7 +1125,7 @@ These pages are built automatically and require no content files or markup:
| Author indexes | `/authors/<slug>/` | All content attributed to an author |
| Random manifest | `/random-pages.json` | JSON array of page URLs for the random-page button |
| Atom feeds | `/feed.xml`, `/music/feed.xml` | All content feed + music-only feed |
| Search | `/search.html` | Pagefind full-text search + client-side semantic search (`nomic-embed-text-v1.5` ONNX model) |
| Search | `/search.html` | Pagefind full-text search + client-side semantic search (`all-MiniLM-L6-v2` ONNX model) |
---

View File

@ -7,7 +7,6 @@ dependencies = [
# Visualization
"matplotlib>=3.9,<4",
"altair>=5.4,<6",
# Embedding pipeline
# Upper bounds are intentionally generous (next major) but always
# present so that an unrelated `uv sync` upgrade can't silently pull
@ -18,7 +17,6 @@ dependencies = [
"beautifulsoup4>=4.12,<5",
# CPU-only torch — avoids pulling ~3 GB of CUDA libraries
"torch>=2.5,<3",
# Photography pipeline
# Pillow handles EXIF reading when exiftool is not installed (the
# preferred path); colorthief computes the 5-color palette strip.
@ -26,6 +24,7 @@ dependencies = [
"pillow>=10.0,<12",
"colorthief>=0.2,<1",
"pyyaml>=6.0,<7",
"einops>=0.8.2",
]
[[tool.uv.index]]

View File

@ -5,16 +5,25 @@ embed.py — Build-time embedding pipeline.
Produces two outputs from _site/**/*.html:
data/similar-links.json Page-level similarity (for "Related" footer section)
data/semantic-index.bin Paragraph vectors as raw Float32 array (N × DIM)
data/semantic-index.bin Paragraph vectors as raw Float32 array (N × PARA_DIM)
data/semantic-meta.json Paragraph metadata: [{url, title, heading, excerpt}]
Both use all-MiniLM-L6-v2 (384 dims) — the same model shipped to the browser
via transformers.js for query-time semantic search.
Two models, one process:
* Pages use nomic-embed-text-v1.5 (768 dims) — build-time only, never
shipped to the browser. Chosen for its well-separated cosine scores on
small corpora, which keeps the MIN_SCORE gate meaningful so every essay
reliably gets a "Related" footer section.
* Paragraphs use all-MiniLM-L6-v2 (384 dims) — must match what the
browser runs via transformers.js (static/js/semantic-search.js) since
query vectors are dotted against the shipped index.
Called by `make build` when .venv exists. Failures are non-fatal.
Staleness check: skips if all output files are newer than every HTML in _site/.
"""
import hashlib
import json
import os
import re
@ -35,13 +44,42 @@ SITE_DIR = REPO_ROOT / "_site"
SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json"
SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin"
SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json"
# Content-addressed cache for nomic page embeddings. Keyed by sha256 of the
# prefixed page text; invalidated wholesale on model name/revision/dim change.
# Gitignored — a build artifact, not source. Survives `make clean`.
PAGE_CACHE = REPO_ROOT / "data" / "embed-cache-pages.npz"
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Pinned to a specific HuggingFace commit so a future model bump can't
# silently change embedding semantics. Bump deliberately when validating
# (and re-run a full embed pass to refresh data/semantic-* + similar-links).
MODEL_REVISION = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf"
DIM = 384
# Two models, deliberately split:
#
# PARA_MODEL — embeds paragraphs for data/semantic-index.bin. This index
# is fetched by the browser at /search/ and ranked against query vectors
# computed client-side. The client (static/js/semantic-search.js) embeds
# queries with MiniLM-L6-v2 via transformers.js, so the build-time model
# must match exactly — both the architecture and the embedding dimension
# are part of the wire contract.
#
# PAGE_MODEL — embeds full pages for data/similar-links.json. This file
# is consumed only at Hakyll-build time (SimilarLinks.hs) and never
# shipped to the browser, so it is free to use a different, stronger
# model. nomic-embed-text-v1.5 produces well-separated cosine scores on
# small corpora (top neighbours at 0.7–0.9 instead of MiniLM's compressed
# 0.1–0.3), so the MIN_SCORE gate below is meaningful and every essay
# reliably gets a "Related" footer section.
#
# Both pins are deliberate. Bump only when validating and re-run a full
# embed pass to refresh the corresponding output files.
PARA_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
PARA_MODEL_REVISION = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf"
PARA_DIM = 384
PAGE_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
PAGE_MODEL_REVISION = "e9b6763023c676ca8431644204f50c2b100d9aab"
PAGE_DIM = 768
# Nomic requires task-prefixed input. Documents (corpus side) get
# "search_document: "; queries would get "search_query: ". similar-links
# only ever embeds documents, so the prefix is constant here.
PAGE_PREFIX = "search_document: "
TOP_N = 5 # similar-links: neighbours per page
MIN_SCORE = 0.30 # similar-links: discard weak matches
@ -80,6 +118,71 @@ def atomic_write_bytes(path: Path, data: bytes) -> None:
def atomic_write_text(path: Path, text: str) -> None:
atomic_write_bytes(path, text.encode("utf-8"))
# ---------------------------------------------------------------------------
# Page-embedding cache
# ---------------------------------------------------------------------------
#
# Loading the nomic model and embedding 26 pages on CPU takes ~3 minutes
# every `make build`. Pages rarely change between builds — usually one
# essay is edited and everything else is identical. This cache stores
# one nomic vector per page content hash so unchanged pages are reused
# verbatim and only edited/new pages are re-embedded. A fully-warm cache
# skips the model load entirely.
def content_hash(text: str) -> str:
    """Return the hex-encoded SHA-256 digest of *text* (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
def load_page_cache() -> dict[str, np.ndarray]:
    """Load the on-disk page-embedding cache as ``{content_hash: vector}``.

    Returns an empty dict when the cache file is absent, unreadable or
    corrupt, pinned to a different model name / revision / dimension, or
    internally inconsistent (the vector matrix shape does not match the
    hash list). In those cases save_page_cache() will overwrite the stale
    file on the next save.
    """
    # Function-scope imports: only needed to name the exception types a
    # damaged archive can raise.
    import zipfile
    import zlib

    if not PAGE_CACHE.exists():
        return {}
    try:
        # Use the NpzFile as a context manager so the underlying zip
        # handle is closed as soon as the arrays have been read out.
        with np.load(PAGE_CACHE, allow_pickle=False) as npz:
            if (npz["model"].item() != PAGE_MODEL_NAME
                    or npz["revision"].item() != PAGE_MODEL_REVISION
                    or int(npz["dim"].item()) != PAGE_DIM):
                return {}
            hashes = npz["hashes"]
            vectors = npz["vectors"]
        if vectors.shape != (len(hashes), PAGE_DIM):
            return {}
        return {h.item(): vectors[i] for i, h in enumerate(hashes)}
    except (OSError, KeyError, ValueError, EOFError,
            zipfile.BadZipFile, zlib.error) as e:
        # BadZipFile: truncated or garbled archive. EOFError: zero-byte
        # file. zlib.error: damaged compressed member. None of these
        # derive from OSError/ValueError, so they are listed explicitly —
        # a bad cache must be discarded, not abort the build.
        print(f"embed.py: page cache unreadable ({e}) — discarding",
              file=sys.stderr)
        return {}
def save_page_cache(cache: dict[str, np.ndarray]) -> None:
    """Atomically persist ``{content_hash: vector}`` to PAGE_CACHE.

    The archive stores the model name, revision and dimension alongside
    the hashes and a float32 vector matrix. An empty cache still writes a
    valid archive with zero rows, so a subsequent load parses it and
    returns {} instead of taking the "no file" path.

    If writing or renaming fails, the partially written temp file is
    removed and the original exception is re-raised.
    """
    if cache:
        hashes = np.array(list(cache))
        vectors = np.stack(list(cache.values())).astype(np.float32)
    else:
        hashes = np.array([], dtype="U64")
        vectors = np.zeros((0, PAGE_DIM), dtype=np.float32)

    PAGE_CACHE.parent.mkdir(parents=True, exist_ok=True)
    # Pass an open file handle, not a path: np.savez_compressed appends
    # ".npz" to bare paths, which would mangle our atomic-rename target.
    tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp")
    try:
        with open(tmp, "wb") as f:
            np.savez_compressed(
                f,
                model=PAGE_MODEL_NAME,
                revision=PAGE_MODEL_REVISION,
                dim=PAGE_DIM,
                hashes=hashes,
                vectors=vectors,
            )
        os.replace(tmp, PAGE_CACHE)
    except BaseException:
        # Covers Ctrl-C as well as I/O errors: don't leave a half-written
        # temp file behind. Cleanup is best-effort so it never masks the
        # original failure.
        try:
            tmp.unlink(missing_ok=True)
        except OSError:
            pass
        raise
STRIP_SELECTORS = [
"nav", "footer", "#toc", ".link-popup", "script", "style",
".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
@ -211,18 +314,38 @@ def main() -> int:
print("embed.py: no indexable pages found", file=sys.stderr)
return 0
# --- Load model once for both tasks ---
print(f"embed.py: loading {MODEL_NAME}@{MODEL_REVISION[:8]}")
model = SentenceTransformer(MODEL_NAME, revision=MODEL_REVISION)
# --- Similar-links (page level, nomic, content-hash cached) ---
cache = load_page_cache()
page_inputs = [PAGE_PREFIX + p["text"] for p in pages]
hashes = [content_hash(t) for t in page_inputs]
miss_idxs = [i for i, h in enumerate(hashes) if h not in cache]
# --- Similar-links (page level) ---
print(f"embed.py: embedding {len(pages)} pages…")
page_vecs = model.encode(
[p["text"] for p in pages],
normalize_embeddings=True,
show_progress_bar=True,
batch_size=64,
).astype(np.float32)
print(f"embed.py: {len(pages) - len(miss_idxs)} cached / "
f"{len(miss_idxs)} to embed")
if miss_idxs:
print(f"embed.py: loading {PAGE_MODEL_NAME}@{PAGE_MODEL_REVISION[:8]}")
page_model = SentenceTransformer(
PAGE_MODEL_NAME, revision=PAGE_MODEL_REVISION, trust_remote_code=True,
)
new_vecs = page_model.encode(
[page_inputs[i] for i in miss_idxs],
normalize_embeddings=True,
show_progress_bar=True,
batch_size=8,
).astype(np.float32)
for i, vec in zip(miss_idxs, new_vecs):
cache[hashes[i]] = vec
# Drop the model before loading MiniLM below; sentence-transformers
# holds the full weight tensor in RAM until GC runs.
del page_model
# Assemble page_vecs in the original pages[] order.
page_vecs = np.stack([cache[h] for h in hashes]).astype(np.float32)
# Prune the cache to only currently-present hashes so a deleted page
# doesn't keep its vector around forever. Then persist.
save_page_cache({h: cache[h] for h in hashes})
index = faiss.IndexFlatIP(page_vecs.shape[1])
index.add(page_vecs)
@ -245,13 +368,16 @@ def main() -> int:
atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2))
print(f"embed.py: wrote {len(similar)} similar-links entries")
# --- Semantic index (paragraph level) ---
# --- Semantic index (paragraph level, MiniLM) ---
if not paragraphs:
print("embed.py: no paragraphs extracted — skipping semantic index")
return 0
print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}")
para_model = SentenceTransformer(PARA_MODEL_NAME, revision=PARA_MODEL_REVISION)
print(f"embed.py: embedding {len(paragraphs)} paragraphs…")
para_vecs = model.encode(
para_vecs = para_model.encode(
[p["text"] for p in paragraphs],
normalize_embeddings=True,
show_progress_bar=True,

11
uv.lock
View File

@ -156,6 +156,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" },
]
[[package]]
name = "einops"
version = "0.8.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" },
]
[[package]]
name = "faiss-cpu"
version = "1.13.2"
@ -364,6 +373,7 @@ dependencies = [
{ name = "altair" },
{ name = "beautifulsoup4" },
{ name = "colorthief" },
{ name = "einops" },
{ name = "faiss-cpu" },
{ name = "matplotlib" },
{ name = "numpy" },
@ -379,6 +389,7 @@ requires-dist = [
{ name = "altair", specifier = ">=5.4,<6" },
{ name = "beautifulsoup4", specifier = ">=4.12,<5" },
{ name = "colorthief", specifier = ">=0.2,<1" },
{ name = "einops", specifier = ">=0.8.2" },
{ name = "faiss-cpu", specifier = ">=1.9,<2" },
{ name = "matplotlib", specifier = ">=3.9,<4" },
{ name = "numpy", specifier = ">=2.0,<3" },