#!/usr/bin/env python3
"""
embed.py — Build-time similar-links generator.

Reads _site/**/*.html, embeds each page with nomic-embed-text-v1.5, builds a
FAISS IndexFlatIP, and writes data/similar-links.json:

    { "/path/to/page/": [{"url": "...", "title": "...", "score": 0.87}, ...] }

Called by `make build` when .venv exists. Failures are non-fatal (make prints
a warning and continues). Run `uv sync` first to provision the environment.

Staleness check: skips re-embedding if data/similar-links.json is newer than
every HTML file in _site/ — so content-only rebuilds that don't touch HTML
won't re-embed.
"""

import json
import os
import re
import sys
from pathlib import Path

import faiss
import numpy as np
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

REPO_ROOT = Path(__file__).parent.parent
SITE_DIR = REPO_ROOT / "_site"
OUT_FILE = REPO_ROOT / "data" / "similar-links.json"

MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
TOP_N = 5
MIN_SCORE = 0.30  # cosine similarity threshold; discard weak matches

# Pages to exclude from both indexing and results (exact URL paths)
EXCLUDE_URLS = {"/search/", "/build/", "/404.html", "/feed.xml", "/music/feed.xml"}


# ---------------------------------------------------------------------------
# Staleness check
# ---------------------------------------------------------------------------

def needs_update() -> bool:
    """Return True if similar-links.json is missing or older than any _site HTML."""
    if not OUT_FILE.exists():
        return True
    json_mtime = OUT_FILE.stat().st_mtime
    # Any HTML file newer than the JSON output means the embeddings are stale.
    return any(h.stat().st_mtime > json_mtime for h in SITE_DIR.rglob("*.html"))


# ---------------------------------------------------------------------------
# HTML → text extraction
# ---------------------------------------------------------------------------

def extract(html_path: Path) -> dict | None:
    """
    Parse an HTML file and extract:
      - url:   root-relative URL path (e.g. "/essays/my-essay/")
      - title: page title
      - text:  plain text of the page body (nav/footer/TOC stripped)

    Returns None for pages that should not be indexed.
    """
    raw = html_path.read_text(encoding="utf-8", errors="replace")
    soup = BeautifulSoup(raw, "html.parser")

    # Derive root-relative URL from file path. as_posix() guarantees forward
    # slashes even if this ever runs on Windows.
    rel = html_path.relative_to(SITE_DIR)
    if rel.name == "index.html":
        parent = rel.parent.as_posix()
        # BUGFIX: the root index.html has rel.parent == ".", so naive string
        # concatenation produced "/./" (which .replace("//", "/") could not
        # repair) instead of the intended "/".
        url = "/" if parent == "." else f"/{parent}/"
    else:
        url = "/" + rel.as_posix()

    if url in EXCLUDE_URLS:
        return None

    # Only index actual content pages — skip index/tag/feed/author pages
    # that have no prose body.
    body = soup.select_one("#markdownBody")
    if body is None:
        return None

    # Title: prefer <h1>, fall back to <title> (strip " — Site Name" suffix)
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(" ", strip=True)
    else:
        title_tag = soup.find("title")
        raw_title = title_tag.get_text(" ", strip=True) if title_tag else url
        title = re.split(r"\s+[—–-]\s+", raw_title)[0].strip()

    # Remove elements that aren't content. NOTE: `body` is a node inside
    # `soup`, so decomposing here also strips matches from `body` below.
    for sel in ["nav", "footer", "#toc", ".link-popup", "script", "style",
                ".page-meta-footer", ".metadata", "[data-pagefind-ignore]"]:
        for el in soup.select(sel):
            el.decompose()

    text = body.get_text(" ", strip=True)
    # Collapse runs of whitespace
    text = re.sub(r"\s+", " ", text).strip()
    if len(text) < 100:  # too short to embed meaningfully
        return None

    # Feed title + text to the model so title is part of the representation
    return {"url": url, "title": title, "text": f"search_document: {title}\n\n{text}"}


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> int:
    if not SITE_DIR.exists():
        print("embed.py: _site/ not found — skipping", file=sys.stderr)
        return 0
    if not needs_update():
        print("embed.py: similar-links.json is up to date — skipping")
        return 0

    print("embed.py: extracting pages…")
    pages = []
    for html in sorted(SITE_DIR.rglob("*.html")):
        page = extract(html)
        if page:
            pages.append(page)

    if not pages:
        print("embed.py: no indexable pages found", file=sys.stderr)
        return 0

    print(f"embed.py: embedding {len(pages)} pages with {MODEL_NAME}…")
    model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)
    texts = [p["text"] for p in pages]
    # nomic requires a task prefix; we used "search_document:" above for the
    # corpus. For queries we'd use "search_query:" — but here both corpus and
    # query are the same documents, so we use "search_document:" throughout.
    embeddings = model.encode(
        texts,
        normalize_embeddings=True,  # unit vectors → inner product == cosine
        show_progress_bar=True,
        batch_size=32,
    )
    embeddings = np.array(embeddings, dtype=np.float32)

    print("embed.py: building FAISS index…")
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # exact inner product; fine for < 10k pages
    index.add(embeddings)

    print("embed.py: querying nearest neighbours…")
    # Query all at once: returns (n_pages, TOP_N+1) — +1 because self is #1
    scores_all, indices_all = index.search(embeddings, TOP_N + 1)

    result: dict[str, list] = {}
    for i, page in enumerate(pages):
        neighbours = []
        for rank in range(TOP_N + 1):
            j = int(indices_all[i, rank])
            score = float(scores_all[i, rank])
            if j < 0:
                continue  # FAISS pads with -1 when k > number of vectors
            if j == i:
                continue  # skip self
            if score < MIN_SCORE:
                continue  # skip weak matches
            neighbours.append({
                "url": pages[j]["url"],
                "title": pages[j]["title"],
                "score": round(score, 4),
            })
            if len(neighbours) == TOP_N:
                break
        if neighbours:
            result[page["url"]] = neighbours

    OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    OUT_FILE.write_text(json.dumps(result, ensure_ascii=False, indent=2))
    print(f"embed.py: wrote {len(result)} entries to {OUT_FILE.relative_to(REPO_ROOT)}")
    return 0


if __name__ == "__main__":
    sys.exit(main())