embed.py: split page vs paragraph embedding models
Pages (similar-links.json, build-only) move to nomic-embed-text-v1.5 (768d) with an on-disk npz cache; paragraphs (browser semantic search) stay on all-MiniLM-L6-v2 (384d), so the client contract is unchanged. WRITING.md search row updated accordingly. einops added for nomic's remote modeling code; cache gitignored with a trailing glob so interrupted-write debris is covered too. Known follow-ups (AUDIT-2026-06-09.md §1.3, §4): pin the nomic-bert-2048 remote code, catch BadZipFile in cache loads, fix the staleness check defeated by stamp-build-time ordering. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
37665f67db
commit
7c5354efa7
|
|
@ -73,6 +73,8 @@ data/build-stamp.txt
|
|||
data/last-build-seconds.txt
|
||||
data/semantic-index.bin
|
||||
data/semantic-meta.json
|
||||
# Trailing glob also catches interrupted-write debris (.tmp / .tmp.npz)
|
||||
data/embed-cache-pages.npz*
|
||||
|
||||
# Archive: generated text + its staleness stamp (recreated from the
|
||||
# committed artifact on every build — deterministic, so committing them is
|
||||
|
|
|
|||
|
|
@ -1125,7 +1125,7 @@ These pages are built automatically and require no content files or markup:
|
|||
| Author indexes | `/authors/<slug>/` | All content attributed to an author |
|
||||
| Random manifest | `/random-pages.json` | JSON array of page URLs for the random-page button |
|
||||
| Atom feeds | `/feed.xml`, `/music/feed.xml` | All content feed + music-only feed |
|
||||
| Search | `/search.html` | Pagefind full-text search + client-side semantic search (`nomic-embed-text-v1.5` ONNX model) |
|
||||
| Search | `/search.html` | Pagefind full-text search + client-side semantic search (`all-MiniLM-L6-v2` ONNX model) |
|
||||
|
||||
---
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ dependencies = [
|
|||
# Visualization
|
||||
"matplotlib>=3.9,<4",
|
||||
"altair>=5.4,<6",
|
||||
|
||||
# Embedding pipeline
|
||||
# Upper bounds are intentionally generous (next major) but always
|
||||
# present so that an unrelated `uv sync` upgrade can't silently pull
|
||||
|
|
@ -18,7 +17,6 @@ dependencies = [
|
|||
"beautifulsoup4>=4.12,<5",
|
||||
# CPU-only torch — avoids pulling ~3 GB of CUDA libraries
|
||||
"torch>=2.5,<3",
|
||||
|
||||
# Photography pipeline
|
||||
# Pillow handles EXIF reading when exiftool is not installed (the
|
||||
# preferred path); colorthief computes the 5-color palette strip.
|
||||
|
|
@ -26,6 +24,7 @@ dependencies = [
|
|||
"pillow>=10.0,<12",
|
||||
"colorthief>=0.2,<1",
|
||||
"pyyaml>=6.0,<7",
|
||||
"einops>=0.8.2",
|
||||
]
|
||||
|
||||
[[tool.uv.index]]
|
||||
|
|
|
|||
164
tools/embed.py
164
tools/embed.py
|
|
@ -5,16 +5,25 @@ embed.py — Build-time embedding pipeline.
|
|||
Produces two outputs from _site/**/*.html:
|
||||
|
||||
data/similar-links.json Page-level similarity (for "Related" footer section)
|
||||
data/semantic-index.bin Paragraph vectors as raw Float32 array (N × DIM)
|
||||
data/semantic-index.bin Paragraph vectors as raw Float32 array (N × PARA_DIM)
|
||||
data/semantic-meta.json Paragraph metadata: [{url, title, heading, excerpt}]
|
||||
|
||||
Both use all-MiniLM-L6-v2 (384 dims) — the same model shipped to the browser
|
||||
via transformers.js for query-time semantic search.
|
||||
Two models, one process:
|
||||
|
||||
* Pages use nomic-embed-text-v1.5 (768 dims) — build-time only, never
|
||||
shipped to the browser. Chosen for its well-separated cosine scores on
|
||||
small corpora, which keeps the MIN_SCORE gate meaningful so every essay
|
||||
reliably gets a "Related" footer section.
|
||||
|
||||
* Paragraphs use all-MiniLM-L6-v2 (384 dims) — must match what the
|
||||
browser runs via transformers.js (static/js/semantic-search.js) since
|
||||
query vectors are dotted against the shipped index.
|
||||
|
||||
Called by `make build` when .venv exists. Failures are non-fatal.
|
||||
Staleness check: skips if all output files are newer than every HTML in _site/.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
|
@ -35,13 +44,42 @@ SITE_DIR = REPO_ROOT / "_site"
|
|||
SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json"
|
||||
SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin"
|
||||
SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json"
|
||||
# Content-addressed cache for nomic page embeddings. Keyed by sha256 of the
|
||||
# prefixed page text; invalidated wholesale on model name/revision/dim change.
|
||||
# Gitignored — a build artifact, not source. Survives `make clean`.
|
||||
PAGE_CACHE = REPO_ROOT / "data" / "embed-cache-pages.npz"
|
||||
|
||||
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
||||
# Pinned to a specific HuggingFace commit so a future model bump can't
|
||||
# silently change embedding semantics. Bump deliberately when validating
|
||||
# (and re-run a full embed pass to refresh data/semantic-* + similar-links).
|
||||
MODEL_REVISION = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf"
|
||||
DIM = 384
|
||||
# Two models, deliberately split:
|
||||
#
|
||||
# PARA_MODEL — embeds paragraphs for data/semantic-index.bin. This index
|
||||
# is fetched by the browser at /search/ and ranked against query vectors
|
||||
# computed client-side. The client (static/js/semantic-search.js) embeds
|
||||
# queries with MiniLM-L6-v2 via transformers.js, so the build-time model
|
||||
# must match exactly — both the architecture and the embedding dimension
|
||||
# are part of the wire contract.
|
||||
#
|
||||
# PAGE_MODEL — embeds full pages for data/similar-links.json. This file
|
||||
# is consumed only at Hakyll-build time (SimilarLinks.hs) and never
|
||||
# shipped to the browser, so it is free to use a different, stronger
|
||||
# model. nomic-embed-text-v1.5 produces well-separated cosine scores on
|
||||
# small corpora (top neighbours at 0.7–0.9 instead of MiniLM's compressed
|
||||
# 0.1–0.3), so the MIN_SCORE gate below is meaningful and every essay
|
||||
# reliably gets a "Related" footer section.
|
||||
#
|
||||
# Both pins are deliberate. Bump only when validating and re-run a full
|
||||
# embed pass to refresh the corresponding output files.
|
||||
|
||||
PARA_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
||||
PARA_MODEL_REVISION = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf"
|
||||
PARA_DIM = 384
|
||||
|
||||
PAGE_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
|
||||
PAGE_MODEL_REVISION = "e9b6763023c676ca8431644204f50c2b100d9aab"
|
||||
PAGE_DIM = 768
|
||||
# Nomic requires task-prefixed input. Documents (corpus side) get
|
||||
# "search_document: "; queries would get "search_query: ". similar-links
|
||||
# only ever embeds documents, so the prefix is constant here.
|
||||
PAGE_PREFIX = "search_document: "
|
||||
|
||||
TOP_N = 5 # similar-links: neighbours per page
|
||||
MIN_SCORE = 0.30 # similar-links: discard weak matches
|
||||
|
|
@ -80,6 +118,71 @@ def atomic_write_bytes(path: Path, data: bytes) -> None:
|
|||
def atomic_write_text(path: Path, text: str) -> None:
|
||||
atomic_write_bytes(path, text.encode("utf-8"))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Page-embedding cache
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# Loading the nomic model and embedding 26 pages on CPU takes ~3 minutes
|
||||
# every `make build`. Pages rarely change between builds — usually one
|
||||
# essay is edited and everything else is identical. This cache stores
|
||||
# one nomic vector per page content hash so unchanged pages are reused
|
||||
# verbatim and only edited/new pages are re-embedded. A fully-warm cache
|
||||
# skips the model load entirely.
|
||||
|
||||
def content_hash(text: str) -> str:
    """Return the hex SHA-256 digest of *text* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
def load_page_cache() -> dict[str, np.ndarray]:
    """Load the page-embedding cache from disk as ``{content_hash: vector}``.

    Returns an empty dict when the cache file is absent, cannot be read or
    parsed (including a corrupt/truncated archive or a zero-byte file), was
    written for a different model name / revision / dimension, or holds a
    ``vectors`` array whose shape is not ``(len(hashes), PAGE_DIM)``. In
    those cases save_page_cache() will overwrite the stale file on next save.
    """
    # Function-scope import: zipfile is needed only to name BadZipFile below.
    import zipfile

    if not PAGE_CACHE.exists():
        return {}
    try:
        # np.load returns an NpzFile that keeps the archive open until it is
        # closed; the context manager releases the handle on every exit path.
        with np.load(PAGE_CACHE, allow_pickle=False) as npz:
            if (npz["model"].item() != PAGE_MODEL_NAME or
                    npz["revision"].item() != PAGE_MODEL_REVISION or
                    int(npz["dim"].item()) != PAGE_DIM):
                return {}
            hashes = npz["hashes"]
            vectors = npz["vectors"]
            if vectors.shape != (len(hashes), PAGE_DIM):
                return {}
            return {h.item(): vectors[i] for i, h in enumerate(hashes)}
    except (OSError, KeyError, ValueError, EOFError,
            zipfile.BadZipFile) as e:
        # BadZipFile (damaged archive) and EOFError (empty file) are not
        # OSError subclasses; without them a bad cache would abort the build
        # instead of being discarded and rebuilt.
        print(f"embed.py: page cache unreadable ({e}) — discarding",
              file=sys.stderr)
        return {}
|
||||
|
||||
|
||||
def save_page_cache(cache: dict[str, np.ndarray]) -> None:
    """Atomically persist ``{content_hash: vector}`` to PAGE_CACHE.

    The archive stores the model name, revision and dimension alongside the
    ``hashes`` and ``vectors`` arrays. An empty cache still writes a valid
    archive with zero rows, so a subsequent load returns {} cleanly (instead
    of falling through to the "no file" path).

    If writing fails, the temporary file is removed (best effort) and the
    original exception propagates; the existing cache file is left untouched.
    """
    if cache:
        hashes = np.array(list(cache.keys()))
        vectors = np.stack(list(cache.values())).astype(np.float32)
    else:
        hashes = np.array([], dtype="U64")
        vectors = np.zeros((0, PAGE_DIM), dtype=np.float32)
    PAGE_CACHE.parent.mkdir(parents=True, exist_ok=True)
    # Pass an open file handle, not a path: np.savez_compressed appends
    # ".npz" to bare paths, which would mangle our atomic-rename target.
    tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp")
    try:
        with open(tmp, "wb") as f:
            np.savez_compressed(
                f,
                model=PAGE_MODEL_NAME,
                revision=PAGE_MODEL_REVISION,
                dim=PAGE_DIM,
                hashes=hashes,
                vectors=vectors,
            )
        os.replace(tmp, PAGE_CACHE)
    except BaseException:
        # Don't leave a half-written temp file behind; never let the cleanup
        # itself mask the original error.
        try:
            tmp.unlink(missing_ok=True)
        except OSError:
            pass
        raise
|
||||
|
||||
|
||||
STRIP_SELECTORS = [
|
||||
"nav", "footer", "#toc", ".link-popup", "script", "style",
|
||||
".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
|
||||
|
|
@ -211,18 +314,38 @@ def main() -> int:
|
|||
print("embed.py: no indexable pages found", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
# --- Load model once for both tasks ---
|
||||
print(f"embed.py: loading {MODEL_NAME}@{MODEL_REVISION[:8]}…")
|
||||
model = SentenceTransformer(MODEL_NAME, revision=MODEL_REVISION)
|
||||
# --- Similar-links (page level, nomic, content-hash cached) ---
|
||||
cache = load_page_cache()
|
||||
page_inputs = [PAGE_PREFIX + p["text"] for p in pages]
|
||||
hashes = [content_hash(t) for t in page_inputs]
|
||||
miss_idxs = [i for i, h in enumerate(hashes) if h not in cache]
|
||||
|
||||
# --- Similar-links (page level) ---
|
||||
print(f"embed.py: embedding {len(pages)} pages…")
|
||||
page_vecs = model.encode(
|
||||
[p["text"] for p in pages],
|
||||
print(f"embed.py: {len(pages) - len(miss_idxs)} cached / "
|
||||
f"{len(miss_idxs)} to embed")
|
||||
|
||||
if miss_idxs:
|
||||
print(f"embed.py: loading {PAGE_MODEL_NAME}@{PAGE_MODEL_REVISION[:8]}…")
|
||||
page_model = SentenceTransformer(
|
||||
PAGE_MODEL_NAME, revision=PAGE_MODEL_REVISION, trust_remote_code=True,
|
||||
)
|
||||
new_vecs = page_model.encode(
|
||||
[page_inputs[i] for i in miss_idxs],
|
||||
normalize_embeddings=True,
|
||||
show_progress_bar=True,
|
||||
batch_size=64,
|
||||
batch_size=8,
|
||||
).astype(np.float32)
|
||||
for i, vec in zip(miss_idxs, new_vecs):
|
||||
cache[hashes[i]] = vec
|
||||
# Drop the model before loading MiniLM below; sentence-transformers
|
||||
# holds the full weight tensor in RAM until GC runs.
|
||||
del page_model
|
||||
|
||||
# Assemble page_vecs in the original pages[] order.
|
||||
page_vecs = np.stack([cache[h] for h in hashes]).astype(np.float32)
|
||||
|
||||
# Prune the cache to only currently-present hashes so a deleted page
|
||||
# doesn't keep its vector around forever. Then persist.
|
||||
save_page_cache({h: cache[h] for h in hashes})
|
||||
|
||||
index = faiss.IndexFlatIP(page_vecs.shape[1])
|
||||
index.add(page_vecs)
|
||||
|
|
@ -245,13 +368,16 @@ def main() -> int:
|
|||
atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2))
|
||||
print(f"embed.py: wrote {len(similar)} similar-links entries")
|
||||
|
||||
# --- Semantic index (paragraph level) ---
|
||||
# --- Semantic index (paragraph level, MiniLM) ---
|
||||
if not paragraphs:
|
||||
print("embed.py: no paragraphs extracted — skipping semantic index")
|
||||
return 0
|
||||
|
||||
print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}…")
|
||||
para_model = SentenceTransformer(PARA_MODEL_NAME, revision=PARA_MODEL_REVISION)
|
||||
|
||||
print(f"embed.py: embedding {len(paragraphs)} paragraphs…")
|
||||
para_vecs = model.encode(
|
||||
para_vecs = para_model.encode(
|
||||
[p["text"] for p in paragraphs],
|
||||
normalize_embeddings=True,
|
||||
show_progress_bar=True,
|
||||
|
|
|
|||
11
uv.lock
11
uv.lock
|
|
@ -156,6 +156,15 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "einops"
|
||||
version = "0.8.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "faiss-cpu"
|
||||
version = "1.13.2"
|
||||
|
|
@ -364,6 +373,7 @@ dependencies = [
|
|||
{ name = "altair" },
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "colorthief" },
|
||||
{ name = "einops" },
|
||||
{ name = "faiss-cpu" },
|
||||
{ name = "matplotlib" },
|
||||
{ name = "numpy" },
|
||||
|
|
@ -379,6 +389,7 @@ requires-dist = [
|
|||
{ name = "altair", specifier = ">=5.4,<6" },
|
||||
{ name = "beautifulsoup4", specifier = ">=4.12,<5" },
|
||||
{ name = "colorthief", specifier = ">=0.2,<1" },
|
||||
{ name = "einops", specifier = ">=0.8.2" },
|
||||
{ name = "faiss-cpu", specifier = ">=1.9,<2" },
|
||||
{ name = "matplotlib", specifier = ">=3.9,<4" },
|
||||
{ name = "numpy", specifier = ">=2.0,<3" },
|
||||
|
|
|
|||
Loading…
Reference in New Issue