embed.py: hash-cache the paragraph pass; drop the dead mtime skip

The 'skip if outputs newer than every HTML' check could never fire: stamp-build-time.py rewrites every page's footer AFTER embed.py runs, so the comparison was always false and the full MiniLM paragraph pass (and model load) ran on every build (AUDIT §4.3).

Replaced with the same content-hash cache the page pass already had — generalized into load_vec_cache/save_vec_cache, keyed by sha256 of the input text, invalidated on model/revision/dim change. A no-change rerun now does no model loads: measured 97s cold -> 4.8s warm.

Also strips section.footnotes from extraction: the new no-JS fallback duplicates each sidenote's text at document end, which would double footnotes in search results and skew page similarity.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 10:51:01 -04:00 · 2026-06-10 10:51:01 -04:00 · 945086421a
parent b2951c0c2c
commit 945086421a
2 changed files with 69 additions and 51 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -76,8 +76,9 @@ data/build-stamp.txt
 data/last-build-seconds.txt
 data/semantic-index.bin
 data/semantic-meta.json
-# Trailing glob also catches interrupted-write debris (.tmp / .tmp.npz)
+# Both embed caches (pages + paragraphs); the trailing glob also
-data/embed-cache-pages.npz*
+# catches interrupted-write debris (.tmp / .tmp.npz)
+data/embed-cache-*
 # Archive: generated text + its staleness stamp (recreated from the
 # committed artifact on every build — deterministic, so committing them is
--- a/tools/embed.py
+++ b/tools/embed.py
@@ -20,7 +20,13 @@ Two models, one process:
    query vectors are dotted against the shipped index.
 Called by `make build` when .venv exists. Failures are non-fatal.
-Staleness check: skips if all output files are newer than every HTML in _site/.
+
+Staleness: both passes are content-hash cached (data/embed-cache-*.npz),
+so an unchanged site re-embeds nothing and loads no model — only the
+HTML extraction pass runs. There is deliberately no mtime-based skip:
+stamp-build-time.py rewrites every page's footer after this script runs,
+so "are outputs newer than the HTML" is always false and a check based
+on it can never fire.
 """
 import hashlib
@@ -45,10 +51,11 @@ SITE_DIR       = REPO_ROOT / "_site"
 SIMILAR_OUT    = REPO_ROOT / "data" / "similar-links.json"
 SEMANTIC_BIN   = REPO_ROOT / "data" / "semantic-index.bin"
 SEMANTIC_META  = REPO_ROOT / "data" / "semantic-meta.json"
-# Content-addressed cache for nomic page embeddings. Keyed by sha256 of the
+# Content-addressed caches, one per pass. Keyed by sha256 of the (prefixed)
-# prefixed page text; invalidated wholesale on model name/revision/dim change.
+# input text; invalidated wholesale on model name/revision/dim change.
-# Gitignored — a build artifact, not source. Survives `make clean`.
+# Gitignored — build artifacts, not source. Survive `make clean`.
 PAGE_CACHE     = REPO_ROOT / "data" / "embed-cache-pages.npz"
+PARA_CACHE     = REPO_ROOT / "data" / "embed-cache-paragraphs.npz"
 # Two models, deliberately split:
 #
@@ -140,31 +147,33 @@ def content_hash(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8")).hexdigest()
-def load_page_cache() -> dict[str, np.ndarray]:
+def load_vec_cache(path: Path, model: str, revision: str,
+                   dim: int) -> dict[str, np.ndarray]:
    """Load {hash: vector} from disk. Returns an empty dict if the cache
    is absent, unreadable, or pinned to a different model — in those
-    cases save_page_cache() will overwrite the stale file on next save."""
+    cases save_vec_cache() will overwrite the stale file on next save."""
-    if not PAGE_CACHE.exists():
+    if not path.exists():
        return {}
    try:
-        npz = np.load(PAGE_CACHE, allow_pickle=False)
+        npz = np.load(path, allow_pickle=False)
-        if (npz["model"].item()    != PAGE_MODEL_NAME or
+        if (npz["model"].item()    != model or
-            npz["revision"].item() != PAGE_MODEL_REVISION or
+            npz["revision"].item() != revision or
-            int(npz["dim"].item()) != PAGE_DIM):
+            int(npz["dim"].item()) != dim):
            return {}
        hashes  = npz["hashes"]
        vectors = npz["vectors"]
-        if vectors.shape != (len(hashes), PAGE_DIM):
+        if vectors.shape != (len(hashes), dim):
            return {}
        return {h.item(): vectors[i] for i, h in enumerate(hashes)}
    except (OSError, KeyError, ValueError, EOFError,
            zipfile.BadZipFile) as e:
-        print(f"embed.py: page cache unreadable ({e}) — discarding",
+        print(f"embed.py: cache {path.name} unreadable ({e}) — discarding",
              file=sys.stderr)
        return {}
-def save_page_cache(cache: dict[str, np.ndarray]) -> None:
+def save_vec_cache(path: Path, model: str, revision: str, dim: int,
+                   cache: dict[str, np.ndarray]) -> None:
    """Atomically persist {hash: vector}. Empty cache writes an empty
    file so a subsequent load returns {} cleanly (instead of falling
    through to the "no file" path)."""
@@ -173,22 +182,22 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None:
        vectors = np.stack(list(cache.values())).astype(np.float32)
    else:
        hashes  = np.array([], dtype="U64")
-        vectors = np.zeros((0, PAGE_DIM), dtype=np.float32)
+        vectors = np.zeros((0, dim), dtype=np.float32)
-    PAGE_CACHE.parent.mkdir(parents=True, exist_ok=True)
+    path.parent.mkdir(parents=True, exist_ok=True)
    # Pass an open file handle, not a path: np.savez_compressed appends
    # ".npz" to bare paths, which would mangle our atomic-rename target.
-    tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp")
+    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        with open(tmp, "wb") as f:
            np.savez_compressed(
                f,
-                model=PAGE_MODEL_NAME,
+                model=model,
-                revision=PAGE_MODEL_REVISION,
+                revision=revision,
-                dim=PAGE_DIM,
+                dim=dim,
                hashes=hashes,
                vectors=vectors,
            )
-        os.replace(tmp, PAGE_CACHE)
+        os.replace(tmp, path)
    except BaseException:
        tmp.unlink(missing_ok=True)
        raise
@@ -197,19 +206,12 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None:
 STRIP_SELECTORS = [
    "nav", "footer", "#toc", ".link-popup", "script", "style",
    ".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
+    # The no-JS footnotes fallback duplicates each sidenote's text
+    # verbatim at the document end — indexing it would double every
+    # footnote in search results and skew page similarity.
+    "section.footnotes",
 ]
-# ---------------------------------------------------------------------------
-# Staleness check
-# ---------------------------------------------------------------------------
-def needs_update() -> bool:
-    outputs = [SIMILAR_OUT, SEMANTIC_BIN, SEMANTIC_META]
-    if not all(p.exists() for p in outputs):
-        return True
-    oldest = min(p.stat().st_mtime for p in outputs)
-    return any(html.stat().st_mtime > oldest for html in SITE_DIR.rglob("*.html"))
 # ---------------------------------------------------------------------------
 # HTML parsing helpers
 # ---------------------------------------------------------------------------
@@ -305,10 +307,6 @@ def main() -> int:
        print("embed.py: _site/ not found — skipping", file=sys.stderr)
        return 0
-    if not needs_update():
-        print("embed.py: all outputs up to date — skipping")
-        return 0
    # --- Extract pages + paragraphs in one pass ---
    print("embed.py: extracting pages…")
    pages = []
@@ -326,12 +324,13 @@ def main() -> int:
        return 0
    # --- Similar-links (page level, nomic, content-hash cached) ---
-    cache       = load_page_cache()
+    cache       = load_vec_cache(PAGE_CACHE, PAGE_MODEL_NAME,
+                                 PAGE_MODEL_REVISION, PAGE_DIM)
    page_inputs = [PAGE_PREFIX + p["text"] for p in pages]
    hashes      = [content_hash(t) for t in page_inputs]
    miss_idxs   = [i for i, h in enumerate(hashes) if h not in cache]
-    print(f"embed.py: {len(pages) - len(miss_idxs)} cached / "
+    print(f"embed.py: pages: {len(pages) - len(miss_idxs)} cached / "
          f"{len(miss_idxs)} to embed")
    if miss_idxs:
@@ -360,7 +359,8 @@ def main() -> int:
    # Prune the cache to only currently-present hashes so a deleted page
    # doesn't keep its vector around forever. Then persist.
-    save_page_cache({h: cache[h] for h in hashes})
+    save_vec_cache(PAGE_CACHE, PAGE_MODEL_NAME, PAGE_MODEL_REVISION,
+                   PAGE_DIM, {h: cache[h] for h in hashes})
    index = faiss.IndexFlatIP(page_vecs.shape[1])
    index.add(page_vecs)
@@ -383,21 +383,38 @@ def main() -> int:
    atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2))
    print(f"embed.py: wrote {len(similar)} similar-links entries")
-    # --- Semantic index (paragraph level, MiniLM) ---
+    # --- Semantic index (paragraph level, MiniLM, content-hash cached) ---
    if not paragraphs:
        print("embed.py: no paragraphs extracted — skipping semantic index")
        return 0
-    print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}…")
+    pcache      = load_vec_cache(PARA_CACHE, PARA_MODEL_NAME,
-    para_model = SentenceTransformer(PARA_MODEL_NAME, revision=PARA_MODEL_REVISION)
+                                 PARA_MODEL_REVISION, PARA_DIM)
+    para_inputs = [p["text"] for p in paragraphs]
+    para_hashes = [content_hash(t) for t in para_inputs]
+    para_miss   = [i for i, h in enumerate(para_hashes) if h not in pcache]
-    print(f"embed.py: embedding {len(paragraphs)} paragraphs…")
+    print(f"embed.py: paragraphs: {len(paragraphs) - len(para_miss)} cached / "
-    para_vecs = para_model.encode(
+          f"{len(para_miss)} to embed")
-        [p["text"] for p in paragraphs],
+
+    if para_miss:
+        print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}…")
+        para_model = SentenceTransformer(PARA_MODEL_NAME,
+                                         revision=PARA_MODEL_REVISION)
+        new_para_vecs = para_model.encode(
+            [para_inputs[i] for i in para_miss],
            normalize_embeddings=True,
            show_progress_bar=True,
            batch_size=64,
        ).astype(np.float32)
+        for i, vec in zip(para_miss, new_para_vecs):
+            pcache[para_hashes[i]] = vec
+        del para_model
+    # Assemble in original paragraph order; prune + persist the cache.
+    para_vecs = np.stack([pcache[h] for h in para_hashes]).astype(np.float32)
+    save_vec_cache(PARA_CACHE, PARA_MODEL_NAME, PARA_MODEL_REVISION,
+                   PARA_DIM, {h: pcache[h] for h in para_hashes})
    atomic_write_bytes(SEMANTIC_BIN, para_vecs.tobytes())