#!/usr/bin/env python3
"""
embed.py — Build-time similar-links generator.

Reads _site/**/*.html, embeds each page with nomic-embed-text-v1.5,
builds a FAISS IndexFlatIP, and writes data/similar-links.json:

    { "/path/to/page/": [{"url": "...", "title": "...", "score": 0.87}, ...] }

Called by `make build` when .venv exists. Failures are non-fatal (make prints
a warning and continues). Run `uv sync` first to provision the environment.

Staleness check: skips re-embedding if data/similar-links.json is newer than
every HTML file in _site/ — so content-only rebuilds that don't touch HTML
won't re-embed.
"""

import json
import os  # NOTE(review): appears unused in this file — confirm before removing
import re
import sys
from pathlib import Path

import faiss
import numpy as np
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

REPO_ROOT = Path(__file__).parent.parent
SITE_DIR = REPO_ROOT / "_site"                         # static-site build output
OUT_FILE = REPO_ROOT / "data" / "similar-links.json"   # generated artifact
MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
TOP_N = 5          # max neighbours recorded per page
MIN_SCORE = 0.30  # cosine similarity threshold; discard weak matches
# Pages to exclude from both indexing and results (exact URL paths)
EXCLUDE_URLS = {"/search/", "/build/", "/404.html", "/feed.xml", "/music/feed.xml"}

# ---------------------------------------------------------------------------
# Staleness check
# ---------------------------------------------------------------------------

def needs_update() -> bool:
    """Decide whether the similarity index must be rebuilt.

    True when data/similar-links.json does not exist yet, or when at least
    one HTML file under _site/ was modified more recently than the JSON.
    """
    if not OUT_FILE.exists():
        return True
    stamp = OUT_FILE.stat().st_mtime
    return any(page.stat().st_mtime > stamp for page in SITE_DIR.rglob("*.html"))


# ---------------------------------------------------------------------------
# HTML → text extraction
# ---------------------------------------------------------------------------

def extract(html_path: Path) -> dict | None:
    """
    Parse an HTML file and extract:
      - url: root-relative URL path (e.g. "/essays/my-essay/")
      - title: page <h1> text, falling back to <title>
      - text: plain text of the page body (nav/footer/TOC stripped),
        prefixed with the nomic "search_document:" task marker
    Returns None for pages that should not be indexed (excluded URLs,
    pages without a #markdownBody, or bodies too short to embed).
    """
    raw = html_path.read_text(encoding="utf-8", errors="replace")
    soup = BeautifulSoup(raw, "html.parser")

    # Derive root-relative URL from the file path. as_posix() keeps forward
    # slashes even on Windows. The root index.html must map to "/": the old
    # `"/" + str(rel.parent) + "/"` produced "/./" there, because
    # Path("index.html").parent stringifies to ".", and the subsequent
    # replace("//", "/") never matched it.
    rel = html_path.relative_to(SITE_DIR)
    if rel.name == "index.html":
        parent = rel.parent.as_posix()
        url = "/" if parent == "." else f"/{parent}/"
    else:
        url = "/" + rel.as_posix()

    if url in EXCLUDE_URLS:
        return None

    # Only index actual content pages — skip index/tag/feed/author pages
    # that have no prose body.
    body = soup.select_one("#markdownBody")
    if body is None:
        return None

    # Title: prefer <h1>, fall back to <title> (strip " — Site Name" suffix)
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(" ", strip=True)
    else:
        title_tag = soup.find("title")
        raw_title = title_tag.get_text(" ", strip=True) if title_tag else url
        title = re.split(r"\s+[—–-]\s+", raw_title)[0].strip()

    # Remove elements that aren't content. `body` is a live reference into
    # the same tree, so decomposing through `soup` also strips matching
    # elements from `body` before get_text() below.
    for sel in ["nav", "footer", "#toc", ".link-popup", "script", "style",
                ".page-meta-footer", ".metadata", "[data-pagefind-ignore]"]:
        for el in soup.select(sel):
            el.decompose()

    text = body.get_text(" ", strip=True)
    # Collapse runs of whitespace
    text = re.sub(r"\s+", " ", text).strip()

    if len(text) < 100:  # too short to embed meaningfully
        return None

    # Feed title + text to the model so title is part of the representation.
    # "search_document:" is the nomic-embed task prefix for corpus documents.
    return {"url": url, "title": title, "text": f"search_document: {title}\n\n{text}"}


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> int:
    """Build the similar-links index and write it to OUT_FILE.

    Returns 0 in every handled case (missing _site/, up-to-date output,
    nothing to index, success) so the surrounding Makefile treats this
    step as best-effort.
    """
    if not SITE_DIR.exists():
        print("embed.py: _site/ not found — skipping", file=sys.stderr)
        return 0

    if not needs_update():
        print("embed.py: similar-links.json is up to date — skipping")
        return 0

    print("embed.py: extracting pages…")
    pages = []
    for html in sorted(SITE_DIR.rglob("*.html")):
        page = extract(html)
        if page:
            pages.append(page)

    if not pages:
        print("embed.py: no indexable pages found", file=sys.stderr)
        return 0

    print(f"embed.py: embedding {len(pages)} pages with {MODEL_NAME}…")
    model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)

    texts = [p["text"] for p in pages]
    # nomic requires a task prefix; we used "search_document:" above for the
    # corpus. For queries we'd use "search_query:" — but here both corpus and
    # query are the same documents, so we use "search_document:" throughout.
    embeddings = model.encode(
        texts,
        normalize_embeddings=True,  # unit vectors → inner product == cosine
        show_progress_bar=True,
        batch_size=32,
    )
    # FAISS requires a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(np.asarray(embeddings, dtype=np.float32))

    print("embed.py: building FAISS index…")
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # exact inner product; fine for < 10k pages
    index.add(embeddings)

    print("embed.py: querying nearest neighbours…")
    # Query all at once. Ask for TOP_N + 1 because each page's best match is
    # normally itself. Clamp k to the corpus size: FAISS pads results with
    # index -1 when k exceeds the number of indexed vectors, and the old code
    # only survived that by luck (the sentinel score fell below MIN_SCORE).
    k = min(TOP_N + 1, len(pages))
    scores_all, indices_all = index.search(embeddings, k)

    result: dict[str, list] = {}
    for i, page in enumerate(pages):
        neighbours = []
        for rank in range(k):
            j = int(indices_all[i, rank])
            score = float(scores_all[i, rank])
            if j < 0 or j == i:
                continue  # skip FAISS padding and self-matches
            if score < MIN_SCORE:
                continue  # skip weak matches
            neighbours.append({
                "url": pages[j]["url"],
                "title": pages[j]["title"],
                "score": round(score, 4),
            })
            if len(neighbours) == TOP_N:
                break
        if neighbours:
            result[page["url"]] = neighbours

    OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    OUT_FILE.write_text(json.dumps(result, ensure_ascii=False, indent=2))
    print(f"embed.py: wrote {len(result)} entries to {OUT_FILE.relative_to(REPO_ROOT)}")
    return 0

if __name__ == "__main__":
    # Propagate main()'s status to the shell (0 in every handled case).
    sys.exit(main())