diff --git a/.gitignore b/.gitignore index fa857e8..cf1f7c2 100644 --- a/.gitignore +++ b/.gitignore @@ -76,8 +76,9 @@ data/build-stamp.txt data/last-build-seconds.txt data/semantic-index.bin data/semantic-meta.json -# Trailing glob also catches interrupted-write debris (.tmp / .tmp.npz) -data/embed-cache-pages.npz* +# Both embed caches (pages + paragraphs); the trailing glob also +# catches interrupted-write debris (.tmp / .tmp.npz) +data/embed-cache-* # Archive: generated text + its staleness stamp (recreated from the # committed artifact on every build — deterministic, so committing them is diff --git a/tools/embed.py b/tools/embed.py index ab57788..d368f17 100644 --- a/tools/embed.py +++ b/tools/embed.py @@ -20,7 +20,13 @@ Two models, one process: query vectors are dotted against the shipped index. Called by `make build` when .venv exists. Failures are non-fatal. -Staleness check: skips if all output files are newer than every HTML in _site/. + +Staleness: both passes are content-hash cached (data/embed-cache-*.npz), +so an unchanged site re-embeds nothing and loads no model — only the +HTML extraction pass runs. There is deliberately no mtime-based skip: +stamp-build-time.py rewrites every page's footer after this script runs, +so "are outputs newer than the HTML" is always false and a check based +on it can never fire. """ import hashlib @@ -45,10 +51,11 @@ SITE_DIR = REPO_ROOT / "_site" SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json" SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin" SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json" -# Content-addressed cache for nomic page embeddings. Keyed by sha256 of the -# prefixed page text; invalidated wholesale on model name/revision/dim change. -# Gitignored — a build artifact, not source. Survives `make clean`. +# Content-addressed caches, one per pass. Keyed by sha256 of the (prefixed) +# input text; invalidated wholesale on model name/revision/dim change. +# Gitignored — build artifacts, not source. Survive `make clean`. PAGE_CACHE = REPO_ROOT / "data" / "embed-cache-pages.npz" +PARA_CACHE = REPO_ROOT / "data" / "embed-cache-paragraphs.npz" # Two models, deliberately split: # @@ -140,31 +147,33 @@ def content_hash(text: str) -> str: return hashlib.sha256(text.encode("utf-8")).hexdigest() -def load_page_cache() -> dict[str, np.ndarray]: +def load_vec_cache(path: Path, model: str, revision: str, + dim: int) -> dict[str, np.ndarray]: """Load {hash: vector} from disk. Returns an empty dict if the cache is absent, unreadable, or pinned to a different model — in those - cases save_page_cache() will overwrite the stale file on next save.""" - if not PAGE_CACHE.exists(): + cases save_vec_cache() will overwrite the stale file on next save.""" + if not path.exists(): return {} try: - npz = np.load(PAGE_CACHE, allow_pickle=False) - if (npz["model"].item() != PAGE_MODEL_NAME or - npz["revision"].item() != PAGE_MODEL_REVISION or - int(npz["dim"].item()) != PAGE_DIM): + npz = np.load(path, allow_pickle=False) + if (npz["model"].item() != model or + npz["revision"].item() != revision or + int(npz["dim"].item()) != dim): return {} hashes = npz["hashes"] vectors = npz["vectors"] - if vectors.shape != (len(hashes), PAGE_DIM): + if vectors.shape != (len(hashes), dim): return {} return {h.item(): vectors[i] for i, h in enumerate(hashes)} except (OSError, KeyError, ValueError, EOFError, zipfile.BadZipFile) as e: - print(f"embed.py: page cache unreadable ({e}) — discarding", + print(f"embed.py: cache {path.name} unreadable ({e}) — discarding", file=sys.stderr) return {} -def save_page_cache(cache: dict[str, np.ndarray]) -> None: +def save_vec_cache(path: Path, model: str, revision: str, dim: int, + cache: dict[str, np.ndarray]) -> None: """Atomically persist {hash: vector}. Empty cache writes an empty file so a subsequent load returns {} cleanly (instead of falling through to the "no file" path).""" @@ -173,22 +182,22 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None: vectors = np.stack(list(cache.values())).astype(np.float32) else: hashes = np.array([], dtype="U64") - vectors = np.zeros((0, PAGE_DIM), dtype=np.float32) - PAGE_CACHE.parent.mkdir(parents=True, exist_ok=True) + vectors = np.zeros((0, dim), dtype=np.float32) + path.parent.mkdir(parents=True, exist_ok=True) # Pass an open file handle, not a path: np.savez_compressed appends # ".npz" to bare paths, which would mangle our atomic-rename target. - tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp") + tmp = path.with_suffix(path.suffix + ".tmp") try: with open(tmp, "wb") as f: np.savez_compressed( f, - model=PAGE_MODEL_NAME, - revision=PAGE_MODEL_REVISION, - dim=PAGE_DIM, + model=model, + revision=revision, + dim=dim, hashes=hashes, vectors=vectors, ) - os.replace(tmp, PAGE_CACHE) + os.replace(tmp, path) except BaseException: tmp.unlink(missing_ok=True) raise @@ -197,19 +206,12 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None: STRIP_SELECTORS = [ "nav", "footer", "#toc", ".link-popup", "script", "style", ".page-meta-footer", ".metadata", "[data-pagefind-ignore]", + # The no-JS footnotes fallback duplicates each sidenote's text + # verbatim at the document end — indexing it would double every + # footnote in search results and skew page similarity. + "section.footnotes", ] -# --------------------------------------------------------------------------- -# Staleness check -# --------------------------------------------------------------------------- - -def needs_update() -> bool: - outputs = [SIMILAR_OUT, SEMANTIC_BIN, SEMANTIC_META] - if not all(p.exists() for p in outputs): - return True - oldest = min(p.stat().st_mtime for p in outputs) - return any(html.stat().st_mtime > oldest for html in SITE_DIR.rglob("*.html")) - # --------------------------------------------------------------------------- # HTML parsing helpers # --------------------------------------------------------------------------- @@ -305,10 +307,6 @@ def main() -> int: print("embed.py: _site/ not found — skipping", file=sys.stderr) return 0 - if not needs_update(): - print("embed.py: all outputs up to date — skipping") - return 0 - # --- Extract pages + paragraphs in one pass --- print("embed.py: extracting pages…") pages = [] @@ -326,12 +324,13 @@ def main() -> int: return 0 # --- Similar-links (page level, nomic, content-hash cached) --- - cache = load_page_cache() + cache = load_vec_cache(PAGE_CACHE, PAGE_MODEL_NAME, + PAGE_MODEL_REVISION, PAGE_DIM) page_inputs = [PAGE_PREFIX + p["text"] for p in pages] hashes = [content_hash(t) for t in page_inputs] miss_idxs = [i for i, h in enumerate(hashes) if h not in cache] - print(f"embed.py: {len(pages) - len(miss_idxs)} cached / " + print(f"embed.py: pages: {len(pages) - len(miss_idxs)} cached / " f"{len(miss_idxs)} to embed") if miss_idxs: @@ -360,7 +359,8 @@ def main() -> int: # Prune the cache to only currently-present hashes so a deleted page # doesn't keep its vector around forever. Then persist. - save_page_cache({h: cache[h] for h in hashes}) + save_vec_cache(PAGE_CACHE, PAGE_MODEL_NAME, PAGE_MODEL_REVISION, + PAGE_DIM, {h: cache[h] for h in hashes}) index = faiss.IndexFlatIP(page_vecs.shape[1]) index.add(page_vecs) @@ -383,21 +383,38 @@ def main() -> int: atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2)) print(f"embed.py: wrote {len(similar)} similar-links entries") - # --- Semantic index (paragraph level, MiniLM) --- + # --- Semantic index (paragraph level, MiniLM, content-hash cached) --- if not paragraphs: print("embed.py: no paragraphs extracted — skipping semantic index") return 0 - print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}…") - para_model = SentenceTransformer(PARA_MODEL_NAME, revision=PARA_MODEL_REVISION) + pcache = load_vec_cache(PARA_CACHE, PARA_MODEL_NAME, + PARA_MODEL_REVISION, PARA_DIM) + para_inputs = [p["text"] for p in paragraphs] + para_hashes = [content_hash(t) for t in para_inputs] + para_miss = [i for i, h in enumerate(para_hashes) if h not in pcache] - print(f"embed.py: embedding {len(paragraphs)} paragraphs…") - para_vecs = para_model.encode( - [p["text"] for p in paragraphs], - normalize_embeddings=True, - show_progress_bar=True, - batch_size=64, - ).astype(np.float32) + print(f"embed.py: paragraphs: {len(paragraphs) - len(para_miss)} cached / " + f"{len(para_miss)} to embed") + + if para_miss: + print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}…") + para_model = SentenceTransformer(PARA_MODEL_NAME, + revision=PARA_MODEL_REVISION) + new_para_vecs = para_model.encode( + [para_inputs[i] for i in para_miss], + normalize_embeddings=True, + show_progress_bar=True, + batch_size=64, + ).astype(np.float32) + for i, vec in zip(para_miss, new_para_vecs): + pcache[para_hashes[i]] = vec + del para_model + + # Assemble in original paragraph order; prune + persist the cache. + para_vecs = np.stack([pcache[h] for h in para_hashes]).astype(np.float32) + save_vec_cache(PARA_CACHE, PARA_MODEL_NAME, PARA_MODEL_REVISION, + PARA_DIM, {h: pcache[h] for h in para_hashes}) atomic_write_bytes(SEMANTIC_BIN, para_vecs.tobytes())