#!/usr/bin/env python3
"""
embed.py — Build-time embedding pipeline.

Produces two outputs (three files) from _site/**/*.html:
  data/similar-links.json   Page-level similarity (for "Related" footer section)
  data/semantic-index.bin   Paragraph vectors as raw Float32 array (N × PARA_DIM)
  data/semantic-meta.json   Paragraph metadata: [{url, title, heading, excerpt}]

Two models, one process:
  * Pages use nomic-embed-text-v1.5 (768 dims) — build-time only, never
    shipped to the browser. Chosen for its well-separated cosine scores on
    small corpora, which keeps the MIN_SCORE gate meaningful so every essay
    reliably gets a "Related" footer section.
  * Paragraphs use all-MiniLM-L6-v2 (384 dims) — must match what the browser
    runs via transformers.js (static/js/semantic-search.js) since query
    vectors are dotted against the shipped index.

Called by `make build` when .venv exists. Failures are non-fatal.
Staleness check: skips if all output files are newer than every HTML in _site/.
"""

import hashlib
import json
import os
import re
import sys
import zipfile
from pathlib import Path

import faiss
import numpy as np
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

REPO_ROOT = Path(__file__).parent.parent
SITE_DIR = REPO_ROOT / "_site"
SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json"
SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin"
SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json"

# Content-addressed cache for nomic page embeddings. Keyed by sha256 of the
# prefixed page text; invalidated wholesale on model name/revision/dim change.
# Gitignored — a build artifact, not source. Survives `make clean`.
PAGE_CACHE = REPO_ROOT / "data" / "embed-cache-pages.npz"

# Two models, deliberately split:
#
#   PARA_MODEL — embeds paragraphs for data/semantic-index.bin. This index
#     is fetched by the browser at /search/ and ranked against query vectors
#     computed client-side. The client (static/js/semantic-search.js) embeds
#     queries with MiniLM-L6-v2 via transformers.js, so the build-time model
#     must match exactly — both the architecture and the embedding dimension
#     are part of the wire contract.
#
#   PAGE_MODEL — embeds full pages for data/similar-links.json. This file
#     is consumed only at Hakyll-build time (SimilarLinks.hs) and never
#     shipped to the browser, so it is free to use a different, stronger
#     model. nomic-embed-text-v1.5 produces well-separated cosine scores on
#     small corpora (top neighbours at 0.7–0.9 instead of MiniLM's compressed
#     0.1–0.3), so the MIN_SCORE gate below is meaningful and every essay
#     reliably gets a "Related" footer section.
#
# Both pins are deliberate. Bump only when validating and re-run a full
# embed pass to refresh the corresponding output files.
PARA_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
PARA_MODEL_REVISION = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf"
PARA_DIM = 384

PAGE_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
PAGE_MODEL_REVISION = "e9b6763023c676ca8431644204f50c2b100d9aab"
PAGE_DIM = 768

# Nomic requires task-prefixed input. Documents (corpus side) get
# "search_document: "; queries would get "search_query: ". similar-links
# only ever embeds documents, so the prefix is constant here.
PAGE_PREFIX = "search_document: "

TOP_N = 5               # similar-links: neighbours per page
MIN_SCORE = 0.30        # similar-links: discard weak matches
MIN_PARA_CHARS = 80     # semantic: skip very short paragraphs
MAX_PARA_CHARS = 1000   # semantic: truncate before embedding

# /archive/ is the archive index — a list page that would dominate every
# entry's "Related" set. Matching is exact, so the individual pages beneath
# /archive/ stay in.
EXCLUDE_URLS = {"/search/", "/build/", "/404.html", "/feed.xml",
                "/music/feed.xml", "/archive/"}

# Whole subtrees kept out of the corpus. /source/ is the repository code
# mirror — source files, not content; left in, they pollute every page's
# "Related" set and semantic search (e.g. a template file surfacing as a
# neighbour, titled with its unrendered "$title$" placeholder).
EXCLUDE_PREFIXES = ("/source/",)

# Pages whose <body> element carries this attribute are portal/landing
# pages — they aggregate excerpts from many entries and would otherwise
# dominate every page's "Related" set with high but uninformative scores.
# default.html sets the attribute when any of the `list-page`, `portal`, or
# `home` template flags is true, so adding `constField "portal" "true"` to a
# Hakyll rule (or `portal: true` to a content file's frontmatter) is enough
# to exclude it.
PORTAL_BODY_ATTR = "data-portal"


def atomic_write_bytes(path: Path, data: bytes) -> None:
    """Write to path.tmp then os.replace, so an interrupt mid-write cannot
    leave a truncated file that the next build/serve loads."""
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".tmp")
    tmp.write_bytes(data)
    os.replace(tmp, path)


def atomic_write_text(path: Path, text: str) -> None:
    """UTF-8 encode *text* and write it to *path* via atomic_write_bytes."""
    atomic_write_bytes(path, text.encode("utf-8"))


# ---------------------------------------------------------------------------
# Page-embedding cache
# ---------------------------------------------------------------------------
#
# Loading the nomic model and embedding 26 pages on CPU takes ~3 minutes
# every `make build`. Pages rarely change between builds — usually one
# essay is edited and everything else is identical. This cache stores
# one nomic vector per page content hash so unchanged pages are reused
# verbatim and only edited/new pages are re-embedded. A fully-warm cache
# skips the model load entirely.

def content_hash(text: str) -> str:
    """Return the hex sha256 digest of *text* encoded as UTF-8."""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def load_page_cache() -> dict[str, np.ndarray]:
    """Load {hash: vector} from disk.

    Returns an empty dict if the cache is absent, unreadable, or pinned to a
    different model — in those cases save_page_cache() will overwrite the
    stale file on next save.
    """
    if not PAGE_CACHE.exists():
        return {}
    try:
        # The context manager closes the underlying zip handle; the arrays
        # pulled out below are already fully read into memory.
        with np.load(PAGE_CACHE, allow_pickle=False) as npz:
            if (npz["model"].item() != PAGE_MODEL_NAME
                    or npz["revision"].item() != PAGE_MODEL_REVISION
                    or int(npz["dim"].item()) != PAGE_DIM):
                return {}
            hashes = npz["hashes"]
            vectors = npz["vectors"]
    except (OSError, EOFError, KeyError, ValueError,
            zipfile.BadZipFile) as e:
        # EOFError: np.load on an empty file. BadZipFile: truncated or
        # corrupt archive. Both mean "unusable cache", not "abort the build".
        print(f"embed.py: page cache unreadable ({e}) — discarding",
              file=sys.stderr)
        return {}
    if vectors.shape != (len(hashes), PAGE_DIM):
        return {}
    return {h.item(): vectors[i] for i, h in enumerate(hashes)}


def save_page_cache(cache: dict[str, np.ndarray]) -> None:
    """Atomically persist {hash: vector}.

    Empty cache writes an empty file so a subsequent load returns {} cleanly
    (instead of falling through to the "no file" path).
    """
    if cache:
        hashes = np.array(list(cache.keys()))
        vectors = np.stack(list(cache.values())).astype(np.float32)
    else:
        hashes = np.array([], dtype="U64")
        vectors = np.zeros((0, PAGE_DIM), dtype=np.float32)
    PAGE_CACHE.parent.mkdir(parents=True, exist_ok=True)
    # Pass an open file handle, not a path: np.savez_compressed appends
    # ".npz" to bare paths, which would mangle our atomic-rename target.
    tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp")
    with open(tmp, "wb") as f:
        np.savez_compressed(
            f,
            model=PAGE_MODEL_NAME,
            revision=PAGE_MODEL_REVISION,
            dim=PAGE_DIM,
            hashes=hashes,
            vectors=vectors,
        )
    os.replace(tmp, PAGE_CACHE)


# CSS selectors for elements removed from the parsed document before any
# text is extracted (see _clean_soup).
STRIP_SELECTORS = [
    "nav", "footer", "#toc", ".link-popup", "script", "style",
    ".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
]


# ---------------------------------------------------------------------------
# Staleness check
# ---------------------------------------------------------------------------

def needs_update() -> bool:
    """Return True if any output file is missing, or if any HTML file under
    SITE_DIR has a modification time newer than the oldest output file."""
    outputs = [SIMILAR_OUT, SEMANTIC_BIN, SEMANTIC_META]
    if not all(p.exists() for p in outputs):
        return True
    oldest = min(p.stat().st_mtime for p in outputs)
    return any(html.stat().st_mtime > oldest
               for html in SITE_DIR.rglob("*.html"))


# ---------------------------------------------------------------------------
# HTML parsing helpers
# ---------------------------------------------------------------------------

def _url_from_path(html_path: Path) -> str:
    """Map a file under SITE_DIR to its site-relative URL.

    `<dir>/index.html` becomes `/<dir>/` (and the top-level index.html
    becomes `/`); any other file becomes `/<relative path>`.
    """
    rel = html_path.relative_to(SITE_DIR)
    # as_posix() keeps forward slashes in the URL regardless of the host
    # platform's path separator.
    if rel.name == "index.html":
        parent = rel.parent.as_posix()
        if parent in (".", ""):
            return "/"
        return "/" + parent + "/"
    return "/" + rel.as_posix()


def _clean_soup(soup: BeautifulSoup) -> None:
    """Remove, in place, every element matching a STRIP_SELECTORS entry."""
    for sel in STRIP_SELECTORS:
        for el in soup.select(sel):
            el.decompose()


def _title(soup: BeautifulSoup, url: str) -> str:
    """Return the page title: the first <h1> text if there is one, otherwise
    the <title> text (or *url* when <title> is absent) cut at the first
    whitespace-delimited dash separator."""
    h1 = soup.find("h1")
    if h1:
        return h1.get_text(" ", strip=True)
    tag = soup.find("title")
    raw = tag.get_text(" ", strip=True) if tag else url
    return re.split(r"\s+[—–-]\s+", raw)[0].strip()


# ---------------------------------------------------------------------------
# Page-level extraction (for similar-links)
# ---------------------------------------------------------------------------

def extract_page(html_path: Path) -> dict | None:
    """Parse one HTML file into {"url", "title", "text"} for page embedding.

    Returns None when the page should not be indexed: its URL is in
    EXCLUDE_URLS or under an EXCLUDE_PREFIXES subtree, its <body> carries
    PORTAL_BODY_ATTR, it has no #markdownBody element, or its cleaned text
    is shorter than 100 characters.
    """
    raw = html_path.read_text(encoding="utf-8", errors="replace")
    soup = BeautifulSoup(raw, "html.parser")
    url = _url_from_path(html_path)

    if url in EXCLUDE_URLS or url.startswith(EXCLUDE_PREFIXES):
        return None

    body_tag = soup.body
    if body_tag is not None and body_tag.has_attr(PORTAL_BODY_ATTR):
        return None

    body = soup.select_one("#markdownBody")
    if body is None:
        return None

    # Read the title before _clean_soup strips elements from the document.
    title = _title(soup, url)
    _clean_soup(soup)

    text = re.sub(r"\s+", " ", body.get_text(" ", strip=True)).strip()
    if len(text) < 100:
        return None

    return {"url": url, "title": title, "text": text}


# ---------------------------------------------------------------------------
# Paragraph-level extraction (for semantic search)
# ---------------------------------------------------------------------------

def extract_paragraphs(html_path: Path, url: str, title: str) -> list[dict]:
    """Parse one HTML file into paragraph records for the semantic index.

    Each record has "url", "title", "heading" (text of the most recent
    h1–h4 seen, starting from *title*), "excerpt" (first 200 characters,
    with an ellipsis if truncated) and "text" (first MAX_PARA_CHARS
    characters, the string that gets embedded). Elements whose text is
    shorter than MIN_PARA_CHARS are skipped. Returns [] when the page has
    no #markdownBody element.
    """
    raw = html_path.read_text(encoding="utf-8", errors="replace")
    soup = BeautifulSoup(raw, "html.parser")
    body = soup.select_one("#markdownBody")
    if body is None:
        return []

    _clean_soup(soup)

    paras = []
    heading = title  # track current section heading

    # NOTE(review): find_all is recursive, so a <p> nested inside a matched
    # <li>/<blockquote> is emitted in addition to its container — confirm
    # this duplication is intended.
    for el in body.find_all(["h1", "h2", "h3", "h4", "p", "li", "blockquote"]):
        if el.name in ("h1", "h2", "h3", "h4"):
            heading = el.get_text(" ", strip=True)
            continue
        text = re.sub(r"\s+", " ", el.get_text(" ", strip=True)).strip()
        if len(text) < MIN_PARA_CHARS:
            continue
        paras.append({
            "url": url,
            "title": title,
            "heading": heading,
            "excerpt": text[:200] + ("…" if len(text) > 200 else ""),
            "text": text[:MAX_PARA_CHARS],
        })

    return paras


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> int:
    """Run the embedding pipeline and return a process exit code.

    Returns 0 when the work was done or legitimately skipped (no _site/,
    outputs up to date, nothing indexable), and 1 when the paragraph model
    yields vectors whose dimension differs from PARA_DIM, in which case the
    semantic index is left untouched.
    """
    if not SITE_DIR.exists():
        print("embed.py: _site/ not found — skipping", file=sys.stderr)
        return 0

    if not needs_update():
        print("embed.py: all outputs up to date — skipping")
        return 0

    # --- Extract pages + paragraphs in one pass ---
    print("embed.py: extracting pages…")
    pages = []
    paragraphs = []
    for html in sorted(SITE_DIR.rglob("*.html")):
        page = extract_page(html)
        if page is None:
            continue
        pages.append(page)
        paragraphs.extend(extract_paragraphs(html, page["url"], page["title"]))

    if not pages:
        print("embed.py: no indexable pages found", file=sys.stderr)
        return 0

    # --- Similar-links (page level, nomic, content-hash cached) ---
    cache = load_page_cache()
    page_inputs = [PAGE_PREFIX + p["text"] for p in pages]
    hashes = [content_hash(t) for t in page_inputs]
    miss_idxs = [i for i, h in enumerate(hashes) if h not in cache]
    print(f"embed.py: {len(pages) - len(miss_idxs)} cached / "
          f"{len(miss_idxs)} to embed")

    if miss_idxs:
        print(f"embed.py: loading {PAGE_MODEL_NAME}@{PAGE_MODEL_REVISION[:8]}…")
        page_model = SentenceTransformer(
            PAGE_MODEL_NAME,
            revision=PAGE_MODEL_REVISION,
            trust_remote_code=True,
        )
        new_vecs = page_model.encode(
            [page_inputs[i] for i in miss_idxs],
            normalize_embeddings=True,
            show_progress_bar=True,
            batch_size=8,
        ).astype(np.float32)
        for i, vec in zip(miss_idxs, new_vecs):
            cache[hashes[i]] = vec
        # Drop the model before loading MiniLM below; sentence-transformers
        # holds the full weight tensor in RAM until GC runs.
        del page_model

    # Assemble page_vecs in the original pages[] order.
    page_vecs = np.stack([cache[h] for h in hashes]).astype(np.float32)

    # Prune the cache to only currently-present hashes so a deleted page
    # doesn't keep its vector around forever. Then persist.
    save_page_cache({h: cache[h] for h in hashes})

    # Vectors are L2-normalised, so inner product is cosine similarity.
    index = faiss.IndexFlatIP(page_vecs.shape[1])
    index.add(page_vecs)
    # Ask for one extra hit because each page matches itself.
    scores_all, indices_all = index.search(page_vecs, TOP_N + 1)

    similar: dict[str, list] = {}
    for i, page in enumerate(pages):
        neighbours = []
        for j, score in zip(indices_all[i].tolist(), scores_all[i].tolist()):
            # faiss pads the result with label -1 when the corpus holds
            # fewer than TOP_N + 1 vectors; never index pages[] with it.
            if j < 0 or j == i or score < MIN_SCORE:
                continue
            neighbours.append({"url": pages[j]["url"],
                               "title": pages[j]["title"],
                               "score": round(score, 4)})
            if len(neighbours) == TOP_N:
                break
        if neighbours:
            similar[page["url"]] = neighbours

    atomic_write_text(SIMILAR_OUT,
                      json.dumps(similar, ensure_ascii=False, indent=2))
    print(f"embed.py: wrote {len(similar)} similar-links entries")

    # --- Semantic index (paragraph level, MiniLM) ---
    if not paragraphs:
        print("embed.py: no paragraphs extracted — skipping semantic index")
        return 0

    print(f"embed.py: loading {PARA_MODEL_NAME}@{PARA_MODEL_REVISION[:8]}…")
    para_model = SentenceTransformer(PARA_MODEL_NAME,
                                     revision=PARA_MODEL_REVISION)

    print(f"embed.py: embedding {len(paragraphs)} paragraphs…")
    para_vecs = para_model.encode(
        [p["text"] for p in paragraphs],
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=64,
    ).astype(np.float32)

    # The browser reads the .bin as N × PARA_DIM floats; a different width
    # would silently misalign every row, so refuse to ship it.
    if para_vecs.ndim != 2 or para_vecs.shape[1] != PARA_DIM:
        print(f"embed.py: {PARA_MODEL_NAME} produced vectors of shape "
              f"{para_vecs.shape}, expected (N, {PARA_DIM}) — "
              "not writing semantic index", file=sys.stderr)
        return 1

    atomic_write_bytes(SEMANTIC_BIN, para_vecs.tobytes())

    meta = [{"url": p["url"], "title": p["title"],
             "heading": p["heading"], "excerpt": p["excerpt"]}
            for p in paragraphs]
    atomic_write_text(SEMANTIC_META, json.dumps(meta, ensure_ascii=False))

    print(f"embed.py: wrote {len(paragraphs)} paragraphs to semantic index "
          f"({SEMANTIC_BIN.stat().st_size // 1024} KB)")
    return 0


if __name__ == "__main__":
    sys.exit(main())