433 lines
17 KiB
Python
433 lines
17 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
embed.py — Build-time embedding pipeline.
|
||
|
||
Produces two outputs (three files) from _site/**/*.html:
|
||
|
||
data/similar-links.json Page-level similarity (for "Related" footer section)
|
||
data/semantic-index.bin Paragraph vectors as raw Float32 array (N × PARA_DIM)
|
||
data/semantic-meta.json Paragraph metadata: [{url, title, heading, excerpt}]
|
||
|
||
Two models, one process:
|
||
|
||
* Pages use nomic-embed-text-v1.5 (768 dims) — build-time only, never
|
||
shipped to the browser. Chosen for its well-separated cosine scores on
|
||
small corpora, which keeps the MIN_SCORE gate meaningful so every essay
|
||
reliably gets a "Related" footer section.
|
||
|
||
* Paragraphs use all-MiniLM-L6-v2 (384 dims) — must match what the
|
||
browser runs via transformers.js (static/js/semantic-search.js) since
|
||
query vectors are dotted against the shipped index.
|
||
|
||
Called by `make build` when .venv exists. Failures are non-fatal.
|
||
|
||
Staleness: both passes are content-hash cached (data/embed-cache-*.npz),
|
||
so an unchanged site re-embeds nothing and loads no model — only the
|
||
HTML extraction pass runs. There is deliberately no mtime-based skip:
|
||
stamp-build-time.py rewrites every page's footer after this script runs,
|
||
so "are outputs newer than the HTML" is always false and a check based
|
||
on it can never fire.
|
||
"""
|
||
|
||
import hashlib
|
||
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
import zipfile
|
||
from pathlib import Path
|
||
|
||
import faiss
|
||
import numpy as np
|
||
from bs4 import BeautifulSoup
|
||
from sentence_transformers import SentenceTransformer
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Configuration
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Repository layout: this script sits one directory below the repo root.
REPO_ROOT = Path(__file__).parent.parent
_DATA_DIR = REPO_ROOT / "data"

# Input: the rendered site.
SITE_DIR = REPO_ROOT.joinpath("_site")

# Outputs.
SIMILAR_OUT = _DATA_DIR / "similar-links.json"
SEMANTIC_BIN = _DATA_DIR / "semantic-index.bin"
SEMANTIC_META = _DATA_DIR / "semantic-meta.json"

# One content-addressed cache per embedding pass. Entries are keyed by the
# sha256 of the (prefixed) input text, and the whole cache is discarded when
# the model name, revision, or dimension changes. These are gitignored build
# artifacts rather than source, and they survive `make clean`.
PAGE_CACHE = _DATA_DIR / "embed-cache-pages.npz"
PARA_CACHE = _DATA_DIR / "embed-cache-paragraphs.npz"
|
||
|
||
# The pipeline deliberately uses two different models.
#
# Paragraph model (PARA_*): produces data/semantic-index.bin, which the
#   browser downloads at /search/ and scores against query vectors it
#   computes itself. The client (static/js/semantic-search.js) embeds
#   queries with MiniLM-L6-v2 through transformers.js, so the build-time
#   model has to be the same one — architecture and embedding dimension
#   are both part of the wire contract.
#
# Page model (PAGE_*): produces data/similar-links.json, which is read only
#   at Hakyll-build time (SimilarLinks.hs) and never sent to the browser,
#   so a different, stronger model can be used. nomic-embed-text-v1.5
#   gives well-separated cosine scores on small corpora (top neighbours at
#   0.7–0.9 rather than MiniLM's compressed 0.1–0.3); that keeps the
#   MIN_SCORE gate meaningful and lets every essay reliably get a
#   "Related" footer section.
#
# Both revisions are pinned on purpose. Bump them only when validating, and
# re-run a full embed pass afterwards to refresh the affected output files.

PARA_MODEL_NAME: str = "sentence-transformers/all-MiniLM-L6-v2"
PARA_MODEL_REVISION: str = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf"
PARA_DIM: int = 384

PAGE_MODEL_NAME: str = "nomic-ai/nomic-embed-text-v1.5"
PAGE_MODEL_REVISION: str = "e9b6763023c676ca8431644204f50c2b100d9aab"
# The weights repo declares its modeling code through auto_map in a
# SEPARATE repo (nomic-ai/nomic-bert-2048) that `revision=` does NOT pin.
# Without this second pin, trust_remote_code would execute whatever sits at
# that repo's head at build time.
PAGE_MODEL_CODE_REVISION: str = "7710840340a098cfb869c4f65e87cf2b1b70caca"
PAGE_DIM: int = 768
# Nomic expects a task prefix on every input: "search_document: " for the
# corpus side, "search_query: " for queries. similar-links only ever embeds
# documents, so a single constant prefix is enough here.
PAGE_PREFIX: str = "search_document: "
|
||
|
||
# similar-links: number of neighbours kept per page.
TOP_N = 5
# similar-links: matches scoring below this are discarded as too weak.
MIN_SCORE = 0.30
# semantic index: paragraphs shorter than this many characters are skipped.
MIN_PARA_CHARS = 80
# semantic index: paragraph text is truncated to this length before embedding.
MAX_PARA_CHARS = 1000
|
||
|
||
# Individual URLs left out of the corpus. /archive/ is the archive index: a
# list page that would dominate every entry's "Related" set. The individual
# /archive/<slug>/ pages are still indexed.
EXCLUDE_URLS = {
    "/search/",
    "/build/",
    "/404.html",
    "/feed.xml",
    "/music/feed.xml",
    "/archive/",
}

# Entire subtrees left out of the corpus. /source/ mirrors the repository
# code — source files rather than content. If indexed, they pollute every
# page's "Related" set and semantic search (for example a template file
# showing up as a neighbour under its unrendered "$title$" placeholder).
EXCLUDE_PREFIXES = (
    "/source/",
)

# Pages whose <body> carries this attribute are portal/landing pages. They
# aggregate excerpts from many entries and would otherwise dominate every
# page's "Related" set with high but uninformative scores. default.html sets
# the attribute whenever one of the `list-page`, `portal`, or `home` template
# flags is true, so `constField "portal" "true"` on a Hakyll rule (or
# `portal: true` in a content file's frontmatter) is enough to exclude a page.
PORTAL_BODY_ATTR = "data-portal"
|
||
|
||
|
||
def atomic_write_bytes(path: Path, data: bytes) -> None:
    """Write *data* to *path* atomically.

    The bytes go to a sibling ``<name>.tmp`` file first, which is then moved
    over *path* with ``os.replace``, so an interrupt mid-write cannot leave
    a truncated file that the next build/serve loads. Missing parent
    directories are created.

    If the write or the rename fails, the temporary file is removed before
    the exception propagates, so a failed build does not leave stale
    ``.tmp`` files next to the outputs (same policy as save_vec_cache()).

    Raises:
        OSError: if the directory cannot be created, or the file cannot be
            written or renamed.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        tmp.write_bytes(data)
        os.replace(tmp, path)
    except BaseException:
        # Covers KeyboardInterrupt too; after a successful replace the tmp
        # path no longer exists, hence missing_ok.
        tmp.unlink(missing_ok=True)
        raise
|
||
|
||
|
||
def atomic_write_text(path: Path, text: str) -> None:
    """Encode *text* as UTF-8 and write it via atomic_write_bytes()."""
    payload = text.encode("utf-8")
    atomic_write_bytes(path, payload)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Embedding caches (shared by the page and paragraph passes)
|
||
# ---------------------------------------------------------------------------
|
||
#
|
||
# Loading the nomic model and embedding 26 pages on CPU takes ~3 minutes
|
||
# every `make build`. Pages rarely change between builds — usually one
|
||
# essay is edited and everything else is identical. This cache stores
|
||
# one nomic vector per page content hash so unchanged pages are reused
|
||
# verbatim and only edited/new pages are re-embedded. A fully-warm cache
|
||
# skips the model load entirely.
|
||
|
||
def content_hash(text: str) -> str:
    """Return the SHA-256 of *text* (UTF-8 encoded) as a hex string."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
|
||
|
||
|
||
def load_vec_cache(path: Path, model: str, revision: str,
                   dim: int) -> dict[str, np.ndarray]:
    """Load a ``{content hash: vector}`` mapping from an ``.npz`` cache.

    Args:
        path: cache file written by save_vec_cache().
        model: model name the cache must have been built with.
        revision: model revision the cache must have been built with.
        dim: embedding dimension the cache must have been built with.

    Returns:
        The cached vectors keyed by hash. An empty dict is returned when the
        file is absent, unreadable, pinned to a different
        model/revision/dim, or internally inconsistent — in those cases
        save_vec_cache() will overwrite the stale file on the next save.
    """
    if not path.exists():
        return {}
    try:
        # np.load returns a lazy NpzFile that keeps the archive open. Close
        # it as soon as the arrays are read, so the handle is not still open
        # when save_vec_cache() later replaces this very file.
        with np.load(path, allow_pickle=False) as npz:
            if (npz["model"].item() != model
                    or npz["revision"].item() != revision
                    or int(npz["dim"].item()) != dim):
                return {}
            # Indexing materialises each array fully in memory, so both
            # remain valid after the archive is closed.
            hashes = npz["hashes"]
            vectors = npz["vectors"]
        if vectors.shape != (len(hashes), dim):
            return {}
        return {h.item(): vectors[i] for i, h in enumerate(hashes)}
    except (OSError, KeyError, ValueError, EOFError,
            zipfile.BadZipFile) as e:
        print(f"embed.py: cache {path.name} unreadable ({e}) — discarding",
              file=sys.stderr)
        return {}
|
||
|
||
|
||
def save_vec_cache(path: Path, model: str, revision: str, dim: int,
                   cache: dict[str, np.ndarray]) -> None:
    """Atomically write ``{hash: vector}`` plus its model pin to *path*.

    The archive stores ``model``, ``revision``, ``dim``, the hash strings,
    and the vectors stacked as float32. An empty *cache* still produces a
    valid archive (zero hashes, a ``(0, dim)`` matrix), so the next load
    returns {} cleanly instead of taking the "no file" path.

    On any failure the temporary file is removed and the exception is
    re-raised.
    """
    if not cache:
        keys = np.array([], dtype="U64")
        matrix = np.zeros((0, dim), dtype=np.float32)
    else:
        keys = np.array(list(cache))
        matrix = np.stack(list(cache.values())).astype(np.float32)

    payload = {
        "model": model,
        "revision": revision,
        "dim": dim,
        "hashes": keys,
        "vectors": matrix,
    }

    staging = path.with_suffix(path.suffix + ".tmp")
    path.parent.mkdir(parents=True, exist_ok=True)
    try:
        # Hand numpy an open handle rather than a path: given a bare path,
        # np.savez_compressed appends ".npz", which would mangle the name
        # we are about to rename.
        with open(staging, "wb") as handle:
            np.savez_compressed(handle, **payload)
        os.replace(staging, path)
    except BaseException:
        staging.unlink(missing_ok=True)
        raise
|
||
|
||
|
||
# CSS selectors for elements removed from a page before its text is
# extracted (see _clean_soup).
STRIP_SELECTORS = [
    "nav",
    "footer",
    "#toc",
    ".link-popup",
    "script",
    "style",
    ".page-meta-footer",
    ".metadata",
    "[data-pagefind-ignore]",
    # The no-JS footnotes fallback repeats each sidenote's text verbatim at
    # the end of the document. Indexing it would double every footnote in
    # search results and skew page similarity.
    "section.footnotes",
]
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# HTML parsing helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _url_from_path(html_path: Path) -> str:
    """Map a file under SITE_DIR to its site-relative URL.

    ``index.html`` files map to their directory URL (``/`` for the site
    root, ``/dir/`` otherwise); any other file maps to ``/`` followed by
    its path relative to SITE_DIR.

    Path components are always joined with forward slashes, so the result
    is a valid URL path on every OS and still matches EXCLUDE_URLS /
    EXCLUDE_PREFIXES, which are written with ``/``.

    Raises:
        ValueError: if *html_path* is not located under SITE_DIR (raised by
            ``Path.relative_to``).
    """
    rel = html_path.relative_to(SITE_DIR)
    if rel.name != "index.html":
        return "/" + rel.as_posix()
    parent = rel.parent.as_posix()
    if parent in (".", ""):
        return "/"
    return "/" + parent + "/"
|
||
|
||
def _clean_soup(soup: BeautifulSoup) -> None:
    """Remove, in place, every element matched by STRIP_SELECTORS."""
    for selector in STRIP_SELECTORS:
        # Selectors are applied one at a time and in order, so an element
        # already removed along with an earlier match is not selected again.
        matches = soup.select(selector)
        for node in matches:
            node.decompose()
|
||
|
||
def _title(soup: BeautifulSoup, url: str) -> str:
    """Pick a display title for a page.

    The text of the first <h1> is used when there is one. Otherwise the
    <title> text (or *url* if there is no <title>) is cut at the first
    whitespace-delimited dash separator and the leading part is returned.
    """
    heading = soup.find("h1")
    if heading:
        return heading.get_text(" ", strip=True)

    title_tag = soup.find("title")
    if title_tag:
        candidate = title_tag.get_text(" ", strip=True)
    else:
        candidate = url

    # Keep only what precedes the first " — " / " – " / " - " separator.
    leading, *_ = re.split(r"\s+[—–-]\s+", candidate)
    return leading.strip()
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Page-level extraction (for similar-links)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def extract_page(html_path: Path) -> dict | None:
    """Extract the indexable text of one rendered page.

    Args:
        html_path: an HTML file under SITE_DIR.

    Returns:
        ``{"url", "title", "text"}`` for an indexable page, or None when the
        page is skipped: its URL is in EXCLUDE_URLS or under
        EXCLUDE_PREFIXES, its <body> carries PORTAL_BODY_ATTR, it has no
        ``#markdownBody`` element, or its cleaned text is shorter than 100
        characters.
    """
    # Decide URL-based exclusion first: it needs only the path, so excluded
    # files (e.g. the whole /source/ subtree) are never read or parsed.
    url = _url_from_path(html_path)
    if url in EXCLUDE_URLS or url.startswith(EXCLUDE_PREFIXES):
        return None

    raw = html_path.read_text(encoding="utf-8", errors="replace")
    soup = BeautifulSoup(raw, "html.parser")

    body_tag = soup.body
    if body_tag is not None and body_tag.has_attr(PORTAL_BODY_ATTR):
        return None
    body = soup.select_one("#markdownBody")
    if body is None:
        return None

    # The title is read before _clean_soup() removes elements from the tree.
    title = _title(soup, url)
    _clean_soup(soup)

    text = re.sub(r"\s+", " ", body.get_text(" ", strip=True)).strip()
    if len(text) < 100:
        return None

    return {"url": url, "title": title, "text": text}
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Paragraph-level extraction (for semantic search)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _nested_in_block(el, root, block_tags: tuple[str, ...]) -> bool:
    """Return True if *el* lies inside another *block_tags* element below *root*."""
    for ancestor in el.parents:
        if ancestor is root:
            return False
        if ancestor.name in block_tags:
            return True
    return False


def extract_paragraphs(html_path: Path, url: str, title: str) -> list[dict]:
    """Extract paragraph-level records from one page for the semantic index.

    Walks the ``#markdownBody`` element in document order. h1–h4 elements
    update the current section heading (initially *title*); ``p``, ``li``
    and ``blockquote`` elements with at least MIN_PARA_CHARS characters of
    text each become one record.

    A ``p``/``li``/``blockquote`` nested inside another such element (e.g.
    ``<blockquote><p>…</p></blockquote>`` or ``<li><p>…</p></li>``) is
    skipped: its text is already part of the enclosing element's record,
    and emitting both would index the same passage twice.

    Args:
        html_path: the HTML file to read.
        url: URL stored on every record.
        title: page title stored on every record; also the initial heading.

    Returns:
        A list of dicts with keys ``url``, ``title``, ``heading``,
        ``excerpt`` (first 200 characters, with "…" appended when
        truncated) and ``text`` (first MAX_PARA_CHARS characters). Empty
        when the page has no ``#markdownBody``.
    """
    raw = html_path.read_text(encoding="utf-8", errors="replace")
    soup = BeautifulSoup(raw, "html.parser")
    body = soup.select_one("#markdownBody")
    if body is None:
        return []

    _clean_soup(soup)

    heading_tags = ("h1", "h2", "h3", "h4")
    block_tags = ("p", "li", "blockquote")

    paras = []
    heading = title  # track current section heading

    for el in body.find_all([*heading_tags, *block_tags]):
        if el.name in heading_tags:
            heading = el.get_text(" ", strip=True)
            continue
        # find_all yields containers before their descendants, so the
        # enclosing block has already been considered by this point.
        if _nested_in_block(el, body, block_tags):
            continue
        text = re.sub(r"\s+", " ", el.get_text(" ", strip=True)).strip()
        if len(text) < MIN_PARA_CHARS:
            continue
        paras.append({
            "url": url,
            "title": title,
            "heading": heading,
            "excerpt": text[:200] + ("…" if len(text) > 200 else ""),
            "text": text[:MAX_PARA_CHARS],
        })

    return paras
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _embed_with_cache(label: str, inputs: list[str], *, cache_path: Path,
                      model_name: str, revision: str, cache_revision: str,
                      dim: int, load_model, batch_size: int) -> np.ndarray:
    """Return one float32 vector per input, embedding only cache misses.

    Args:
        label: noun used in progress messages ("pages" / "paragraphs").
        inputs: texts to embed; must be non-empty (np.stack rejects an
            empty list).
        cache_path: ``.npz`` cache file for this pass.
        model_name: model identifier, stored in the cache and logged.
        revision: model revision, logged (first 8 characters) on load.
        cache_revision: revision string stored in the cache; a change
            discards the whole cache.
        dim: embedding dimension stored in the cache.
        load_model: zero-argument callable returning an object with a
            sentence-transformers style ``encode``; called only when at
            least one input is missing from the cache.
        batch_size: batch size passed to ``encode``.

    Returns:
        An array with one row per input, in input order. The cache on disk
        is rewritten to hold exactly the hashes of *inputs*, so vectors of
        deleted content do not linger.
    """
    cache = load_vec_cache(cache_path, model_name, cache_revision, dim)
    hashes = [content_hash(t) for t in inputs]
    misses = [i for i, h in enumerate(hashes) if h not in cache]

    print(f"embed.py: {label}: {len(inputs) - len(misses)} cached / "
          f"{len(misses)} to embed")

    if misses:
        print(f"embed.py: loading {model_name}@{revision[:8]}…")
        model = load_model()
        fresh = model.encode(
            [inputs[i] for i in misses],
            normalize_embeddings=True,
            show_progress_bar=True,
            batch_size=batch_size,
        ).astype(np.float32)
        for i, vec in zip(misses, fresh):
            cache[hashes[i]] = vec
        # Drop the reference before the next pass loads another model.
        del model

    vectors = np.stack([cache[h] for h in hashes]).astype(np.float32)
    save_vec_cache(cache_path, model_name, cache_revision, dim,
                   {h: cache[h] for h in hashes})
    return vectors


def _similar_links(pages: list[dict], page_vecs: np.ndarray) -> dict[str, list]:
    """Map each page URL to up to TOP_N neighbours scoring at least MIN_SCORE.

    Neighbours are ranked by inner product of the page vectors. Pages with
    no qualifying neighbour are left out of the result.
    """
    index = faiss.IndexFlatIP(page_vecs.shape[1])
    index.add(page_vecs)
    # Ask for one extra hit because a page normally matches itself.
    scores_all, indices_all = index.search(page_vecs, TOP_N + 1)

    similar: dict[str, list] = {}
    for i, page in enumerate(pages):
        neighbours = []
        for raw_j, raw_score in zip(indices_all[i], scores_all[i]):
            j, score = int(raw_j), float(raw_score)
            # faiss pads with index -1 when fewer than k vectors exist;
            # skip that explicitly rather than relying on its sentinel score.
            if j < 0 or j == i or score < MIN_SCORE:
                continue
            neighbours.append({"url": pages[j]["url"],
                               "title": pages[j]["title"],
                               "score": round(score, 4)})
            if len(neighbours) == TOP_N:
                break
        if neighbours:
            similar[page["url"]] = neighbours
    return similar


def main() -> int:
    """Run the embedding pipeline over SITE_DIR and write all outputs.

    Steps: extract pages and paragraphs from every ``*.html`` file, embed
    pages (cached) and write SIMILAR_OUT, then embed paragraphs (cached)
    and write SEMANTIC_BIN and SEMANTIC_META.

    Returns:
        0 in every handled case, including when SITE_DIR is missing or
        nothing indexable is found; the outputs are left untouched then.
    """
    if not SITE_DIR.exists():
        print("embed.py: _site/ not found — skipping", file=sys.stderr)
        return 0

    # --- Extract pages + paragraphs in one pass ---
    print("embed.py: extracting pages…")
    pages = []
    paragraphs = []
    for html in sorted(SITE_DIR.rglob("*.html")):
        page = extract_page(html)
        if page is None:
            continue
        pages.append(page)
        paragraphs.extend(extract_paragraphs(html, page["url"], page["title"]))

    if not pages:
        print("embed.py: no indexable pages found", file=sys.stderr)
        return 0

    # --- Similar-links (page level, nomic, content-hash cached) ---
    page_vecs = _embed_with_cache(
        "pages",
        [PAGE_PREFIX + p["text"] for p in pages],
        cache_path=PAGE_CACHE,
        model_name=PAGE_MODEL_NAME,
        revision=PAGE_MODEL_REVISION,
        # The remote modeling code also shapes the vectors, so its pin is
        # part of the cache key: bumping either revision invalidates the
        # cache.
        cache_revision=f"{PAGE_MODEL_REVISION}+{PAGE_MODEL_CODE_REVISION}",
        dim=PAGE_DIM,
        load_model=lambda: SentenceTransformer(
            PAGE_MODEL_NAME, revision=PAGE_MODEL_REVISION,
            trust_remote_code=True,
            # code_revision pins the auto_map modeling repo; it must reach
            # both AutoConfig and AutoModel.from_pretrained.
            model_kwargs={"code_revision": PAGE_MODEL_CODE_REVISION},
            config_kwargs={"code_revision": PAGE_MODEL_CODE_REVISION},
        ),
        batch_size=8,
    )

    similar = _similar_links(pages, page_vecs)
    atomic_write_text(SIMILAR_OUT,
                      json.dumps(similar, ensure_ascii=False, indent=2))
    print(f"embed.py: wrote {len(similar)} similar-links entries")

    # --- Semantic index (paragraph level, MiniLM, content-hash cached) ---
    if not paragraphs:
        print("embed.py: no paragraphs extracted — skipping semantic index")
        return 0

    para_vecs = _embed_with_cache(
        "paragraphs",
        [p["text"] for p in paragraphs],
        cache_path=PARA_CACHE,
        model_name=PARA_MODEL_NAME,
        revision=PARA_MODEL_REVISION,
        cache_revision=PARA_MODEL_REVISION,
        dim=PARA_DIM,
        load_model=lambda: SentenceTransformer(PARA_MODEL_NAME,
                                               revision=PARA_MODEL_REVISION),
        batch_size=64,
    )

    atomic_write_bytes(SEMANTIC_BIN, para_vecs.tobytes())

    # Row i of the binary index corresponds to entry i of the metadata.
    meta = [{"url": p["url"], "title": p["title"],
             "heading": p["heading"], "excerpt": p["excerpt"]}
            for p in paragraphs]
    atomic_write_text(SEMANTIC_META, json.dumps(meta, ensure_ascii=False))

    print(f"embed.py: wrote {len(paragraphs)} paragraphs to semantic index "
          f"({SEMANTIC_BIN.stat().st_size // 1024} KB)")
    return 0
|
||
|
||
|
||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|