embed.py: hash-cache the paragraph pass; drop the dead mtime skip
The 'skip if outputs newer than every HTML' check could never fire: stamp-build-time.py rewrites every page's footer AFTER embed.py runs, so the outputs were never newer than the HTML, the check always reported "stale", and the full MiniLM paragraph pass (and model load) ran on every build (AUDIT §4.3).

Replaced with the same content-hash cache the page pass already had — load_page_cache/save_page_cache generalized into load_vec_cache/save_vec_cache, keyed by sha256 of the input text, invalidated on model/revision/dim change. A no-change rerun now loads no models: measured 97s cold -> 4.8s warm.

Also strips section.footnotes from extraction: the new no-JS fallback duplicates each sidenote's text at the document end, which would double footnotes in search results and skew page similarity.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
b2951c0c2c
commit
945086421a
|
|
@ -76,8 +76,9 @@ data/build-stamp.txt
|
||||||
data/last-build-seconds.txt
|
data/last-build-seconds.txt
|
||||||
data/semantic-index.bin
|
data/semantic-index.bin
|
||||||
data/semantic-meta.json
|
data/semantic-meta.json
|
||||||
# Trailing glob also catches interrupted-write debris (.tmp / .tmp.npz)
|
# Both embed caches (pages + paragraphs); the trailing glob also
|
||||||
data/embed-cache-pages.npz*
|
# catches interrupted-write debris (.tmp / .tmp.npz)
|
||||||
|
data/embed-cache-*
|
||||||
|
|
||||||
# Archive: generated text + its staleness stamp (recreated from the
|
# Archive: generated text + its staleness stamp (recreated from the
|
||||||
# committed artifact on every build — deterministic, so committing them is
|
# committed artifact on every build — deterministic, so committing them is
|
||||||
|
|
|
||||||
115
tools/embed.py
115
tools/embed.py
|
|
@ -20,7 +20,13 @@ Two models, one process:
|
||||||
query vectors are dotted against the shipped index.
|
query vectors are dotted against the shipped index.
|
||||||
|
|
||||||
Called by `make build` when .venv exists. Failures are non-fatal.
|
Called by `make build` when .venv exists. Failures are non-fatal.
|
||||||
Staleness check: skips if all output files are newer than every HTML in _site/.
|
|
||||||
|
Staleness: both passes are content-hash cached (data/embed-cache-*.npz),
|
||||||
|
so an unchanged site re-embeds nothing and loads no model — only the
|
||||||
|
HTML extraction pass runs. There is deliberately no mtime-based skip:
|
||||||
|
stamp-build-time.py rewrites every page's footer after this script runs,
|
||||||
|
so "are outputs newer than the HTML" is always false and a check based
|
||||||
|
on it can never fire.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
|
|
@ -45,10 +51,11 @@ SITE_DIR = REPO_ROOT / "_site"
|
||||||
SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json"
|
SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json"
|
||||||
SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin"
|
SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin"
|
||||||
SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json"
|
SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json"
|
||||||
# Content-addressed cache for nomic page embeddings. Keyed by sha256 of the
|
# Content-addressed caches, one per pass. Keyed by sha256 of the (prefixed)
|
||||||
# prefixed page text; invalidated wholesale on model name/revision/dim change.
|
# input text; invalidated wholesale on model name/revision/dim change.
|
||||||
# Gitignored — a build artifact, not source. Survives `make clean`.
|
# Gitignored — build artifacts, not source. Survive `make clean`.
|
||||||
PAGE_CACHE = REPO_ROOT / "data" / "embed-cache-pages.npz"
|
PAGE_CACHE = REPO_ROOT / "data" / "embed-cache-pages.npz"
|
||||||
|
PARA_CACHE = REPO_ROOT / "data" / "embed-cache-paragraphs.npz"
|
||||||
|
|
||||||
# Two models, deliberately split:
|
# Two models, deliberately split:
|
||||||
#
|
#
|
||||||
|
|
@ -140,31 +147,33 @@ def content_hash(text: str) -> str:
|
||||||
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def load_page_cache() -> dict[str, np.ndarray]:
|
def load_vec_cache(path: Path, model: str, revision: str,
|
||||||
|
dim: int) -> dict[str, np.ndarray]:
|
||||||
"""Load {hash: vector} from disk. Returns an empty dict if the cache
|
"""Load {hash: vector} from disk. Returns an empty dict if the cache
|
||||||
is absent, unreadable, or pinned to a different model — in those
|
is absent, unreadable, or pinned to a different model — in those
|
||||||
cases save_page_cache() will overwrite the stale file on next save."""
|
cases save_vec_cache() will overwrite the stale file on next save."""
|
||||||
if not PAGE_CACHE.exists():
|
if not path.exists():
|
||||||
return {}
|
return {}
|
||||||
try:
|
try:
|
||||||
npz = np.load(PAGE_CACHE, allow_pickle=False)
|
npz = np.load(path, allow_pickle=False)
|
||||||
if (npz["model"].item() != PAGE_MODEL_NAME or
|
if (npz["model"].item() != model or
|
||||||
npz["revision"].item() != PAGE_MODEL_REVISION or
|
npz["revision"].item() != revision or
|
||||||
int(npz["dim"].item()) != PAGE_DIM):
|
int(npz["dim"].item()) != dim):
|
||||||
return {}
|
return {}
|
||||||
hashes = npz["hashes"]
|
hashes = npz["hashes"]
|
||||||
vectors = npz["vectors"]
|
vectors = npz["vectors"]
|
||||||
if vectors.shape != (len(hashes), PAGE_DIM):
|
if vectors.shape != (len(hashes), dim):
|
||||||
return {}
|
return {}
|
||||||
return {h.item(): vectors[i] for i, h in enumerate(hashes)}
|
return {h.item(): vectors[i] for i, h in enumerate(hashes)}
|
||||||
except (OSError, KeyError, ValueError, EOFError,
|
except (OSError, KeyError, ValueError, EOFError,
|
||||||
zipfile.BadZipFile) as e:
|
zipfile.BadZipFile) as e:
|
||||||
print(f"embed.py: page cache unreadable ({e}) — discarding",
|
print(f"embed.py: cache {path.name} unreadable ({e}) — discarding",
|
||||||
file=sys.stderr)
|
file=sys.stderr)
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
def save_page_cache(cache: dict[str, np.ndarray]) -> None:
|
def save_vec_cache(path: Path, model: str, revision: str, dim: int,
|
||||||
|
cache: dict[str, np.ndarray]) -> None:
|
||||||
"""Atomically persist {hash: vector}. Empty cache writes an empty
|
"""Atomically persist {hash: vector}. Empty cache writes an empty
|
||||||
file so a subsequent load returns {} cleanly (instead of falling
|
file so a subsequent load returns {} cleanly (instead of falling
|
||||||
through to the "no file" path)."""
|
through to the "no file" path)."""
|
||||||
|
|
@ -173,22 +182,22 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None:
|
||||||
vectors = np.stack(list(cache.values())).astype(np.float32)
|
vectors = np.stack(list(cache.values())).astype(np.float32)
|
||||||
else:
|
else:
|
||||||
hashes = np.array([], dtype="U64")
|
hashes = np.array([], dtype="U64")
|
||||||
vectors = np.zeros((0, PAGE_DIM), dtype=np.float32)
|
vectors = np.zeros((0, dim), dtype=np.float32)
|
||||||
PAGE_CACHE.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
# Pass an open file handle, not a path: np.savez_compressed appends
|
# Pass an open file handle, not a path: np.savez_compressed appends
|
||||||
# ".npz" to bare paths, which would mangle our atomic-rename target.
|
# ".npz" to bare paths, which would mangle our atomic-rename target.
|
||||||
tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp")
|
tmp = path.with_suffix(path.suffix + ".tmp")
|
||||||
try:
|
try:
|
||||||
with open(tmp, "wb") as f:
|
with open(tmp, "wb") as f:
|
||||||
np.savez_compressed(
|
np.savez_compressed(
|
||||||
f,
|
f,
|
||||||
model=PAGE_MODEL_NAME,
|
model=model,
|
||||||
revision=PAGE_MODEL_REVISION,
|
revision=revision,
|
||||||
dim=PAGE_DIM,
|
dim=dim,
|
||||||
hashes=hashes,
|
hashes=hashes,
|
||||||
vectors=vectors,
|
vectors=vectors,
|
||||||
)
|
)
|
||||||
os.replace(tmp, PAGE_CACHE)
|
os.replace(tmp, path)
|
||||||
except BaseException:
|
except BaseException:
|
||||||
tmp.unlink(missing_ok=True)
|
tmp.unlink(missing_ok=True)
|
||||||
raise
|
raise
|
||||||
|
|
@ -197,19 +206,12 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None:
|
||||||
STRIP_SELECTORS = [
|
STRIP_SELECTORS = [
|
||||||
"nav", "footer", "#toc", ".link-popup", "script", "style",
|
"nav", "footer", "#toc", ".link-popup", "script", "style",
|
||||||
".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
|
".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
|
||||||
|
# The no-JS footnotes fallback duplicates each sidenote's text
|
||||||
|
# verbatim at the document end — indexing it would double every
|
||||||
|
# footnote in search results and skew page similarity.
|
||||||
|
"section.footnotes",
|
||||||
]
|
]
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Staleness check
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def needs_update() -> bool:
|
|
||||||
outputs = [SIMILAR_OUT, SEMANTIC_BIN, SEMANTIC_META]
|
|
||||||
if not all(p.exists() for p in outputs):
|
|
||||||
return True
|
|
||||||
oldest = min(p.stat().st_mtime for p in outputs)
|
|
||||||
return any(html.stat().st_mtime > oldest for html in SITE_DIR.rglob("*.html"))
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# HTML parsing helpers
|
# HTML parsing helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -305,10 +307,6 @@ def main() -> int:
|
||||||
print("embed.py: _site/ not found — skipping", file=sys.stderr)
|
print("embed.py: _site/ not found — skipping", file=sys.stderr)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if not needs_update():
|
|
||||||
print("embed.py: all outputs up to date — skipping")
|
|
||||||
return 0
|
|
||||||
|
|
||||||
# --- Extract pages + paragraphs in one pass ---
|
# --- Extract pages + paragraphs in one pass ---
|
||||||
print("embed.py: extracting pages…")
|
print("embed.py: extracting pages…")
|
||||||
pages = []
|
pages = []
|
||||||
|
|
@ -326,12 +324,13 @@ def main() -> int:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# --- Similar-links (page level, nomic, content-hash cached) ---
|
# --- Similar-links (page level, nomic, content-hash cached) ---
|
||||||
cache = load_page_cache()
|
cache = load_vec_cache(PAGE_CACHE, PAGE_MODEL_NAME,
|
||||||
|
PAGE_MODEL_REVISION, PAGE_DIM)
|
||||||
page_inputs = [PAGE_PREFIX + p["text"] for p in pages]
|
page_inputs = [PAGE_PREFIX + p["text"] for p in pages]
|
||||||
hashes = [content_hash(t) for t in page_inputs]
|
hashes = [content_hash(t) for t in page_inputs]
|
||||||
miss_idxs = [i for i, h in enumerate(hashes) if h not in cache]
|
miss_idxs = [i for i, h in enumerate(hashes) if h not in cache]
|
||||||
|
|
||||||
print(f"embed.py: {len(pages) - len(miss_idxs)} cached / "
|
print(f"embed.py: pages: {len(pages) - len(miss_idxs)} cached / "
|
||||||
f"{len(miss_idxs)} to embed")
|
f"{len(miss_idxs)} to embed")
|
||||||
|
|
||||||
if miss_idxs:
|
if miss_idxs:
|
||||||
|
|
@ -360,7 +359,8 @@ def main() -> int:
|
||||||
|
|
||||||
# Prune the cache to only currently-present hashes so a deleted page
|
# Prune the cache to only currently-present hashes so a deleted page
|
||||||
# doesn't keep its vector around forever. Then persist.
|
# doesn't keep its vector around forever. Then persist.
|
||||||
save_page_cache({h: cache[h] for h in hashes})
|
save_vec_cache(PAGE_CACHE, PAGE_MODEL_NAME, PAGE_MODEL_REVISION,
|
||||||
|
PAGE_DIM, {h: cache[h] for h in hashes})
|
||||||
|
|
||||||
index = faiss.IndexFlatIP(page_vecs.shape[1])
|
index = faiss.IndexFlatIP(page_vecs.shape[1])
|
||||||
index.add(page_vecs)
|
index.add(page_vecs)
|
||||||
|
|
@ -383,21 +383,38 @@ def main() -> int:
|
||||||
atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2))
|
atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2))
|
||||||
print(f"embed.py: wrote {len(similar)} similar-links entries")
|
print(f"embed.py: wrote {len(similar)} similar-links entries")
|
||||||
|
|
||||||
# --- Semantic index (paragraph level, MiniLM) ---
|
# --- Semantic index (paragraph level, MiniLM, content-hash cached) ---
|
||||||
if not paragraphs:
|
if not paragraphs:
|
||||||
print("embed.py: no paragraphs extracted — skipping semantic index")
|
print("embed.py: no paragraphs extracted — skipping semantic index")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}…")
|
pcache = load_vec_cache(PARA_CACHE, PARA_MODEL_NAME,
|
||||||
para_model = SentenceTransformer(PARA_MODEL_NAME, revision=PARA_MODEL_REVISION)
|
PARA_MODEL_REVISION, PARA_DIM)
|
||||||
|
para_inputs = [p["text"] for p in paragraphs]
|
||||||
|
para_hashes = [content_hash(t) for t in para_inputs]
|
||||||
|
para_miss = [i for i, h in enumerate(para_hashes) if h not in pcache]
|
||||||
|
|
||||||
print(f"embed.py: embedding {len(paragraphs)} paragraphs…")
|
print(f"embed.py: paragraphs: {len(paragraphs) - len(para_miss)} cached / "
|
||||||
para_vecs = para_model.encode(
|
f"{len(para_miss)} to embed")
|
||||||
[p["text"] for p in paragraphs],
|
|
||||||
normalize_embeddings=True,
|
if para_miss:
|
||||||
show_progress_bar=True,
|
print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}…")
|
||||||
batch_size=64,
|
para_model = SentenceTransformer(PARA_MODEL_NAME,
|
||||||
).astype(np.float32)
|
revision=PARA_MODEL_REVISION)
|
||||||
|
new_para_vecs = para_model.encode(
|
||||||
|
[para_inputs[i] for i in para_miss],
|
||||||
|
normalize_embeddings=True,
|
||||||
|
show_progress_bar=True,
|
||||||
|
batch_size=64,
|
||||||
|
).astype(np.float32)
|
||||||
|
for i, vec in zip(para_miss, new_para_vecs):
|
||||||
|
pcache[para_hashes[i]] = vec
|
||||||
|
del para_model
|
||||||
|
|
||||||
|
# Assemble in original paragraph order; prune + persist the cache.
|
||||||
|
para_vecs = np.stack([pcache[h] for h in para_hashes]).astype(np.float32)
|
||||||
|
save_vec_cache(PARA_CACHE, PARA_MODEL_NAME, PARA_MODEL_REVISION,
|
||||||
|
PARA_DIM, {h: pcache[h] for h in para_hashes})
|
||||||
|
|
||||||
atomic_write_bytes(SEMANTIC_BIN, para_vecs.tobytes())
|
atomic_write_bytes(SEMANTIC_BIN, para_vecs.tobytes())
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue