embed.py: hash-cache the paragraph pass; drop the dead mtime skip

The 'skip if outputs newer than every HTML' check could never fire:
stamp-build-time.py rewrites every page's footer AFTER embed.py runs,
so the comparison was always false and the full MiniLM paragraph pass
(and model load) ran on every build (AUDIT §4.3). Replaced with the
same content-hash cache the page pass already had — generalized
load/save_vec_cache, keyed by sha256 of the input text, invalidated on
model/revision/dim change. A no-change rerun now does no model loads:
measured 97s cold -> 4.8s warm.

Also strips section.footnotes from extraction: the new no-JS fallback
duplicates each sidenote's text at document end, which would double
footnotes in search results and skew page similarity.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Levi Neuwirth 2026-06-10 10:51:01 -04:00
parent b2951c0c2c
commit 945086421a
2 changed files with 69 additions and 51 deletions

5
.gitignore vendored
View File

@ -76,8 +76,9 @@ data/build-stamp.txt
data/last-build-seconds.txt data/last-build-seconds.txt
data/semantic-index.bin data/semantic-index.bin
data/semantic-meta.json data/semantic-meta.json
# Trailing glob also catches interrupted-write debris (.tmp / .tmp.npz) # Both embed caches (pages + paragraphs); the trailing glob also
data/embed-cache-pages.npz* # catches interrupted-write debris (.tmp / .tmp.npz)
data/embed-cache-*
# Archive: generated text + its staleness stamp (recreated from the # Archive: generated text + its staleness stamp (recreated from the
# committed artifact on every build — deterministic, so committing them is # committed artifact on every build — deterministic, so committing them is

View File

@ -20,7 +20,13 @@ Two models, one process:
query vectors are dotted against the shipped index. query vectors are dotted against the shipped index.
Called by `make build` when .venv exists. Failures are non-fatal. Called by `make build` when .venv exists. Failures are non-fatal.
Staleness check: skips if all output files are newer than every HTML in _site/.
Staleness: both passes are content-hash cached (data/embed-cache-*.npz),
so an unchanged site re-embeds nothing and loads no model — only the
HTML extraction pass runs. There is deliberately no mtime-based skip:
stamp-build-time.py rewrites every page's footer after this script runs,
so "are outputs newer than the HTML" is always false and a check based
on it can never fire.
""" """
import hashlib import hashlib
@ -45,10 +51,11 @@ SITE_DIR = REPO_ROOT / "_site"
SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json" SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json"
SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin" SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin"
SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json" SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json"
# Content-addressed cache for nomic page embeddings. Keyed by sha256 of the # Content-addressed caches, one per pass. Keyed by sha256 of the (prefixed)
# prefixed page text; invalidated wholesale on model name/revision/dim change. # input text; invalidated wholesale on model name/revision/dim change.
# Gitignored — a build artifact, not source. Survives `make clean`. # Gitignored — build artifacts, not source. Survive `make clean`.
PAGE_CACHE = REPO_ROOT / "data" / "embed-cache-pages.npz" PAGE_CACHE = REPO_ROOT / "data" / "embed-cache-pages.npz"
PARA_CACHE = REPO_ROOT / "data" / "embed-cache-paragraphs.npz"
# Two models, deliberately split: # Two models, deliberately split:
# #
@ -140,31 +147,33 @@ def content_hash(text: str) -> str:
return hashlib.sha256(text.encode("utf-8")).hexdigest() return hashlib.sha256(text.encode("utf-8")).hexdigest()
def load_page_cache() -> dict[str, np.ndarray]: def load_vec_cache(path: Path, model: str, revision: str,
dim: int) -> dict[str, np.ndarray]:
"""Load {hash: vector} from disk. Returns an empty dict if the cache """Load {hash: vector} from disk. Returns an empty dict if the cache
is absent, unreadable, or pinned to a different model — in those is absent, unreadable, or pinned to a different model — in those
cases save_page_cache() will overwrite the stale file on next save.""" cases save_vec_cache() will overwrite the stale file on next save."""
if not PAGE_CACHE.exists(): if not path.exists():
return {} return {}
try: try:
npz = np.load(PAGE_CACHE, allow_pickle=False) npz = np.load(path, allow_pickle=False)
if (npz["model"].item() != PAGE_MODEL_NAME or if (npz["model"].item() != model or
npz["revision"].item() != PAGE_MODEL_REVISION or npz["revision"].item() != revision or
int(npz["dim"].item()) != PAGE_DIM): int(npz["dim"].item()) != dim):
return {} return {}
hashes = npz["hashes"] hashes = npz["hashes"]
vectors = npz["vectors"] vectors = npz["vectors"]
if vectors.shape != (len(hashes), PAGE_DIM): if vectors.shape != (len(hashes), dim):
return {} return {}
return {h.item(): vectors[i] for i, h in enumerate(hashes)} return {h.item(): vectors[i] for i, h in enumerate(hashes)}
except (OSError, KeyError, ValueError, EOFError, except (OSError, KeyError, ValueError, EOFError,
zipfile.BadZipFile) as e: zipfile.BadZipFile) as e:
print(f"embed.py: page cache unreadable ({e}) — discarding", print(f"embed.py: cache {path.name} unreadable ({e}) — discarding",
file=sys.stderr) file=sys.stderr)
return {} return {}
def save_page_cache(cache: dict[str, np.ndarray]) -> None: def save_vec_cache(path: Path, model: str, revision: str, dim: int,
cache: dict[str, np.ndarray]) -> None:
"""Atomically persist {hash: vector}. Empty cache writes an empty """Atomically persist {hash: vector}. Empty cache writes an empty
file so a subsequent load returns {} cleanly (instead of falling file so a subsequent load returns {} cleanly (instead of falling
through to the "no file" path).""" through to the "no file" path)."""
@ -173,22 +182,22 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None:
vectors = np.stack(list(cache.values())).astype(np.float32) vectors = np.stack(list(cache.values())).astype(np.float32)
else: else:
hashes = np.array([], dtype="U64") hashes = np.array([], dtype="U64")
vectors = np.zeros((0, PAGE_DIM), dtype=np.float32) vectors = np.zeros((0, dim), dtype=np.float32)
PAGE_CACHE.parent.mkdir(parents=True, exist_ok=True) path.parent.mkdir(parents=True, exist_ok=True)
# Pass an open file handle, not a path: np.savez_compressed appends # Pass an open file handle, not a path: np.savez_compressed appends
# ".npz" to bare paths, which would mangle our atomic-rename target. # ".npz" to bare paths, which would mangle our atomic-rename target.
tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp") tmp = path.with_suffix(path.suffix + ".tmp")
try: try:
with open(tmp, "wb") as f: with open(tmp, "wb") as f:
np.savez_compressed( np.savez_compressed(
f, f,
model=PAGE_MODEL_NAME, model=model,
revision=PAGE_MODEL_REVISION, revision=revision,
dim=PAGE_DIM, dim=dim,
hashes=hashes, hashes=hashes,
vectors=vectors, vectors=vectors,
) )
os.replace(tmp, PAGE_CACHE) os.replace(tmp, path)
except BaseException: except BaseException:
tmp.unlink(missing_ok=True) tmp.unlink(missing_ok=True)
raise raise
@ -197,19 +206,12 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None:
STRIP_SELECTORS = [ STRIP_SELECTORS = [
"nav", "footer", "#toc", ".link-popup", "script", "style", "nav", "footer", "#toc", ".link-popup", "script", "style",
".page-meta-footer", ".metadata", "[data-pagefind-ignore]", ".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
# The no-JS footnotes fallback duplicates each sidenote's text
# verbatim at the document end — indexing it would double every
# footnote in search results and skew page similarity.
"section.footnotes",
] ]
# ---------------------------------------------------------------------------
# Staleness check
# ---------------------------------------------------------------------------
def needs_update() -> bool:
outputs = [SIMILAR_OUT, SEMANTIC_BIN, SEMANTIC_META]
if not all(p.exists() for p in outputs):
return True
oldest = min(p.stat().st_mtime for p in outputs)
return any(html.stat().st_mtime > oldest for html in SITE_DIR.rglob("*.html"))
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# HTML parsing helpers # HTML parsing helpers
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -305,10 +307,6 @@ def main() -> int:
print("embed.py: _site/ not found — skipping", file=sys.stderr) print("embed.py: _site/ not found — skipping", file=sys.stderr)
return 0 return 0
if not needs_update():
print("embed.py: all outputs up to date — skipping")
return 0
# --- Extract pages + paragraphs in one pass --- # --- Extract pages + paragraphs in one pass ---
print("embed.py: extracting pages…") print("embed.py: extracting pages…")
pages = [] pages = []
@ -326,12 +324,13 @@ def main() -> int:
return 0 return 0
# --- Similar-links (page level, nomic, content-hash cached) --- # --- Similar-links (page level, nomic, content-hash cached) ---
cache = load_page_cache() cache = load_vec_cache(PAGE_CACHE, PAGE_MODEL_NAME,
PAGE_MODEL_REVISION, PAGE_DIM)
page_inputs = [PAGE_PREFIX + p["text"] for p in pages] page_inputs = [PAGE_PREFIX + p["text"] for p in pages]
hashes = [content_hash(t) for t in page_inputs] hashes = [content_hash(t) for t in page_inputs]
miss_idxs = [i for i, h in enumerate(hashes) if h not in cache] miss_idxs = [i for i, h in enumerate(hashes) if h not in cache]
print(f"embed.py: {len(pages) - len(miss_idxs)} cached / " print(f"embed.py: pages: {len(pages) - len(miss_idxs)} cached / "
f"{len(miss_idxs)} to embed") f"{len(miss_idxs)} to embed")
if miss_idxs: if miss_idxs:
@ -360,7 +359,8 @@ def main() -> int:
# Prune the cache to only currently-present hashes so a deleted page # Prune the cache to only currently-present hashes so a deleted page
# doesn't keep its vector around forever. Then persist. # doesn't keep its vector around forever. Then persist.
save_page_cache({h: cache[h] for h in hashes}) save_vec_cache(PAGE_CACHE, PAGE_MODEL_NAME, PAGE_MODEL_REVISION,
PAGE_DIM, {h: cache[h] for h in hashes})
index = faiss.IndexFlatIP(page_vecs.shape[1]) index = faiss.IndexFlatIP(page_vecs.shape[1])
index.add(page_vecs) index.add(page_vecs)
@ -383,21 +383,38 @@ def main() -> int:
atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2)) atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2))
print(f"embed.py: wrote {len(similar)} similar-links entries") print(f"embed.py: wrote {len(similar)} similar-links entries")
# --- Semantic index (paragraph level, MiniLM) --- # --- Semantic index (paragraph level, MiniLM, content-hash cached) ---
if not paragraphs: if not paragraphs:
print("embed.py: no paragraphs extracted — skipping semantic index") print("embed.py: no paragraphs extracted — skipping semantic index")
return 0 return 0
print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}") pcache = load_vec_cache(PARA_CACHE, PARA_MODEL_NAME,
para_model = SentenceTransformer(PARA_MODEL_NAME, revision=PARA_MODEL_REVISION) PARA_MODEL_REVISION, PARA_DIM)
para_inputs = [p["text"] for p in paragraphs]
para_hashes = [content_hash(t) for t in para_inputs]
para_miss = [i for i, h in enumerate(para_hashes) if h not in pcache]
print(f"embed.py: embedding {len(paragraphs)} paragraphs…") print(f"embed.py: paragraphs: {len(paragraphs) - len(para_miss)} cached / "
para_vecs = para_model.encode( f"{len(para_miss)} to embed")
[p["text"] for p in paragraphs],
if para_miss:
print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}")
para_model = SentenceTransformer(PARA_MODEL_NAME,
revision=PARA_MODEL_REVISION)
new_para_vecs = para_model.encode(
[para_inputs[i] for i in para_miss],
normalize_embeddings=True, normalize_embeddings=True,
show_progress_bar=True, show_progress_bar=True,
batch_size=64, batch_size=64,
).astype(np.float32) ).astype(np.float32)
for i, vec in zip(para_miss, new_para_vecs):
pcache[para_hashes[i]] = vec
del para_model
# Assemble in original paragraph order; prune + persist the cache.
para_vecs = np.stack([pcache[h] for h in para_hashes]).astype(np.float32)
save_vec_cache(PARA_CACHE, PARA_MODEL_NAME, PARA_MODEL_REVISION,
PARA_DIM, {h: pcache[h] for h in para_hashes})
atomic_write_bytes(SEMANTIC_BIN, para_vecs.tobytes()) atomic_write_bytes(SEMANTIC_BIN, para_vecs.tobytes())