#!/usr/bin/env python3 """ embed.py — Build-time embedding pipeline. Produces two outputs from _site/**/*.html: data/similar-links.json Page-level similarity (for "Related" footer section) data/semantic-index.bin Paragraph vectors as raw Float32 array (N × DIM) data/semantic-meta.json Paragraph metadata: [{url, title, heading, excerpt}] Both use all-MiniLM-L6-v2 (384 dims) — the same model shipped to the browser via transformers.js for query-time semantic search. Called by `make build` when .venv exists. Failures are non-fatal. Staleness check: skips if all output files are newer than every HTML in _site/. """ import json import re import sys from pathlib import Path import faiss import numpy as np from bs4 import BeautifulSoup from sentence_transformers import SentenceTransformer # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- REPO_ROOT = Path(__file__).parent.parent SITE_DIR = REPO_ROOT / "_site" SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json" SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin" SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json" MODEL_NAME = "all-MiniLM-L6-v2" DIM = 384 TOP_N = 5 # similar-links: neighbours per page MIN_SCORE = 0.30 # similar-links: discard weak matches MIN_PARA_CHARS = 80 # semantic: skip very short paragraphs MAX_PARA_CHARS = 1000 # semantic: truncate before embedding # Pages that should not appear in similar-links suggestions or the semantic # index. Search/build/stats are meta-pages with no prose; feeds are XML. # Photography listing surfaces (map/contact-sheet/by-year) are index pages, # not content. URLs must match exactly what `_url_from_path` produces (i.e., # directory-style URLs end with `/`, file-style URLs include the extension). 
EXCLUDE_URLS = {
    "/search.html",
    "/build/",
    "/stats/",
    "/library.html",
    "/new.html",
    "/feed.xml",
    "/music/feed.xml",
    "/photography/feed.xml",
    "/photography/map/",
    "/photography/contact-sheet/",
}

# CSS selectors whose matching elements are removed before text extraction
# (navigation chrome, footers, scripts, metadata blocks).
STRIP_SELECTORS = [
    "nav",
    "footer",
    "#toc",
    ".link-popup",
    "script",
    "style",
    ".page-meta-footer",
    ".metadata",
    "[data-pagefind-ignore]",
]

# ---------------------------------------------------------------------------
# Staleness check
# ---------------------------------------------------------------------------


def needs_update() -> bool:
    """Return True when the embedding outputs must be rebuilt.

    A rebuild is needed when any output file is missing, or when any HTML
    file under _site/ is newer than the oldest output (comparing against
    the oldest output is the conservative choice: all outputs must be at
    least as new as every input).
    """
    outputs = [SIMILAR_OUT, SEMANTIC_BIN, SEMANTIC_META]
    if not all(p.exists() for p in outputs):
        return True
    oldest = min(p.stat().st_mtime for p in outputs)
    return any(html.stat().st_mtime > oldest for html in SITE_DIR.rglob("*.html"))


# ---------------------------------------------------------------------------
# HTML parsing helpers
# ---------------------------------------------------------------------------


def _url_from_path(html_path: Path) -> str:
    """Map a file under _site/ to its site-absolute URL.

    ``index.html`` files collapse to directory-style URLs ending in ``/``
    (the site root becomes ``/``); every other file keeps its name and
    extension.
    """
    rel = html_path.relative_to(SITE_DIR)
    if rel.name == "index.html":
        # as_posix() guarantees forward slashes; str(Path) would produce
        # backslash-separated "URLs" on Windows.
        parent = rel.parent.as_posix()
        if parent in (".", ""):
            return "/"
        return "/" + parent + "/"
    return "/" + rel.as_posix()


def _clean_soup(soup: BeautifulSoup) -> None:
    """Destructively remove all STRIP_SELECTORS matches from *soup* in place."""
    for sel in STRIP_SELECTORS:
        for el in soup.select(sel):
            el.decompose()


def _title(soup: BeautifulSoup, url: str) -> str:
    """Best-effort page title.

    Prefers the first ``<h1>``; falls back to ``<title>`` with any
    " — suffix" (em/en dash or hyphen separator) stripped; falls back to
    the URL itself when neither tag exists.
    """
    h1 = soup.find("h1")
    if h1:
        return h1.get_text(" ", strip=True)
    tag = soup.find("title")
    raw = tag.get_text(" ", strip=True) if tag else url
    return re.split(r"\s+[—–-]\s+", raw)[0].strip()


# ---------------------------------------------------------------------------
# Extraction
# ---------------------------------------------------------------------------
#
# A single pass over each HTML file produces both:
#
#   * a page-level record (concatenated body text, for similar-links)
#   * a list of paragraph-level records (for the semantic index)
#
# Both surfaces want the same soup; an earlier version of this script
# parsed each file twice.
# The combined pass keeps BeautifulSoup work to one allocation per file.


def extract_one(html_path: Path) -> tuple[dict | None, list[dict]]:
    """Parse one HTML file and return (page-record-or-None, paragraph-list).

    Returns ``(None, [])`` when the URL is excluded or when the file has no
    ``#markdownBody`` (so it isn't a content page). When the body text is
    shorter than 100 characters the page record is ``None`` but any
    qualifying paragraphs are still returned; the caller decides whether to
    keep them (``main`` currently drops paragraphs of page-less files).
    """
    url = _url_from_path(html_path)
    if url in EXCLUDE_URLS:
        return None, []

    raw = html_path.read_text(encoding="utf-8", errors="replace")
    soup = BeautifulSoup(raw, "html.parser")
    body = soup.select_one("#markdownBody")
    if body is None:
        return None, []

    title = _title(soup, url)

    # _clean_soup mutates the tree, so it must run AFTER we've captured
    # the title (selectors like h1 may live inside #markdownBody on some
    # layouts) and BEFORE we read body text for both surfaces.
    _clean_soup(soup)

    # Page-level record.
    text = re.sub(r"\s+", " ", body.get_text(" ", strip=True)).strip()
    page = None if len(text) < 100 else {
        "url": url,
        "title": title,
        "text": text,
    }

    # Paragraph-level records — re-traverse the same (now-cleaned) body.
    # Headings are not embedded themselves; they label the paragraphs
    # that follow them.
    paras: list[dict] = []
    heading = title
    for el in body.find_all(["h1", "h2", "h3", "h4", "p", "li", "blockquote"]):
        if el.name in ("h1", "h2", "h3", "h4"):
            heading = el.get_text(" ", strip=True)
            continue
        para_text = re.sub(r"\s+", " ", el.get_text(" ", strip=True)).strip()
        if len(para_text) < MIN_PARA_CHARS:
            continue
        paras.append({
            "url": url,
            "title": title,
            "heading": heading,
            "excerpt": para_text[:200] + ("…" if len(para_text) > 200 else ""),
            "text": para_text[:MAX_PARA_CHARS],
        })

    return page, paras


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def main() -> int:
    """Build both embedding outputs from _site/.

    Always returns 0: this script is best-effort and must never fail the
    surrounding `make build`.
    """
    if not SITE_DIR.exists():
        print("embed.py: _site/ not found — skipping", file=sys.stderr)
        return 0
    if not needs_update():
        print("embed.py: all outputs up to date — skipping")
        return 0

    # --- Extract pages + paragraphs in a single soup-per-file pass ---
    print("embed.py: extracting pages…")
    pages = []
    paragraphs = []
    for html in sorted(SITE_DIR.rglob("*.html")):
        page, paras = extract_one(html)
        if page is None:
            # Also drops paragraphs of too-short pages — only pages that
            # make the similar-links index contribute to the semantic index.
            continue
        pages.append(page)
        paragraphs.extend(paras)
    if not pages:
        print("embed.py: no indexable pages found", file=sys.stderr)
        return 0

    # --- Load model once for both tasks ---
    print(f"embed.py: loading {MODEL_NAME}…")
    model = SentenceTransformer(MODEL_NAME)

    # --- Similar-links (page level) ---
    print(f"embed.py: embedding {len(pages)} pages…")
    page_vecs = model.encode(
        [p["text"] for p in pages],
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=64,
    ).astype(np.float32)

    # Vectors are L2-normalised, so inner product == cosine similarity.
    index = faiss.IndexFlatIP(page_vecs.shape[1])
    index.add(page_vecs)
    # TOP_N + 1 because each page's top hit is expected to be itself.
    scores_all, indices_all = index.search(page_vecs, TOP_N + 1)

    similar: dict[str, list] = {}
    for i, page in enumerate(pages):
        neighbours = []
        for rank in range(TOP_N + 1):
            j, score = int(indices_all[i, rank]), float(scores_all[i, rank])
            # Skip self-matches and weak hits. When fewer than TOP_N + 1
            # vectors exist, faiss pads results with index -1 and a large
            # negative score, which the threshold also filters out.
            if j == i or score < MIN_SCORE:
                continue
            neighbours.append({
                "url": pages[j]["url"],
                "title": pages[j]["title"],
                "score": round(score, 4),
            })
            if len(neighbours) == TOP_N:
                break
        if neighbours:
            similar[page["url"]] = neighbours

    SIMILAR_OUT.parent.mkdir(parents=True, exist_ok=True)
    SIMILAR_OUT.write_text(json.dumps(similar, ensure_ascii=False, indent=2))
    print(f"embed.py: wrote {len(similar)} similar-links entries")

    # --- Semantic index (paragraph level) ---
    if not paragraphs:
        print("embed.py: no paragraphs extracted — skipping semantic index")
        return 0

    print(f"embed.py: embedding {len(paragraphs)} paragraphs…")
    para_vecs = model.encode(
        [p["text"] for p in paragraphs],
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=64,
    ).astype(np.float32)

    # Raw Float32 bytes, row-major (N × DIM), native byte order — the
    # browser reads this straight into a Float32Array.
    # mkdir here too, rather than relying on the similar-links write above
    # having created data/ — keeps the two outputs independent.
    SEMANTIC_BIN.parent.mkdir(parents=True, exist_ok=True)
    SEMANTIC_BIN.write_bytes(para_vecs.tobytes())
    meta = [
        {"url": p["url"], "title": p["title"],
         "heading": p["heading"], "excerpt": p["excerpt"]}
        for p in paragraphs
    ]
    SEMANTIC_META.write_text(json.dumps(meta, ensure_ascii=False))
    print(f"embed.py: wrote {len(paragraphs)} paragraphs to semantic index "
          f"({SEMANTIC_BIN.stat().st_size // 1024} KB)")
    return 0


if __name__ == "__main__":
    sys.exit(main())