ozymandias/tools/embed.py

#!/usr/bin/env python3
"""
embed.py — Build-time embedding pipeline.
Produces three output files from _site/**/*.html:
    data/similar-links.json   Page-level similarity (for the "Related" footer section)
    data/semantic-index.bin   Paragraph vectors as a raw Float32 array (N × DIM)
    data/semantic-meta.json   Paragraph metadata: [{url, title, heading, excerpt}]
Both embedding surfaces use all-MiniLM-L6-v2 (384 dims) — the same model shipped
to the browser via transformers.js for query-time semantic search.
Called by `make build` when .venv exists. Failures are non-fatal.
Staleness check: skips if all output files are newer than every HTML in _site/.
"""
import json
import re
import sys
from pathlib import Path
import faiss
import numpy as np
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
REPO_ROOT = Path(__file__).parent.parent
SITE_DIR = REPO_ROOT / "_site"
SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json"
SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin"
SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json"
MODEL_NAME = "all-MiniLM-L6-v2"
DIM = 384
TOP_N = 5 # similar-links: neighbours per page
MIN_SCORE = 0.30 # similar-links: discard weak matches
MIN_PARA_CHARS = 80 # semantic: skip very short paragraphs
MAX_PARA_CHARS = 1000 # semantic: truncate before embedding
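# Consumer-side sketch (illustrative, not executed by this pipeline): because
# semantic-index.bin is just the raw float32 buffer, it can be reloaded with
#     vecs = np.fromfile(SEMANTIC_BIN, dtype=np.float32).reshape(-1, DIM)
# and row i of `vecs` pairs with entry i of semantic-meta.json.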
# Pages that should not appear in similar-links suggestions or the semantic
# index. Search/build/stats are meta-pages with no prose; feeds are XML.
# Photography listing surfaces (map/contact-sheet/by-year) are index pages,
# not content. URLs must match exactly what `_url_from_path` produces (i.e.,
# directory-style URLs end with `/`, file-style URLs include the extension).
EXCLUDE_URLS = {
"/search.html",
"/build/",
"/stats/",
"/library.html",
"/new.html",
"/feed.xml",
"/music/feed.xml",
"/photography/feed.xml",
"/photography/map/",
"/photography/contact-sheet/",
}
STRIP_SELECTORS = [
"nav", "footer", "#toc", ".link-popup", "script", "style",
".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
]
# ---------------------------------------------------------------------------
# Staleness check
# ---------------------------------------------------------------------------
def needs_update() -> bool:
    outputs = [SIMILAR_OUT, SEMANTIC_BIN, SEMANTIC_META]
    if not all(p.exists() for p in outputs):
        return True
    oldest = min(p.stat().st_mtime for p in outputs)
    return any(html.stat().st_mtime > oldest for html in SITE_DIR.rglob("*.html"))
# ---------------------------------------------------------------------------
# HTML parsing helpers
# ---------------------------------------------------------------------------
def _url_from_path(html_path: Path) -> str:
    rel = html_path.relative_to(SITE_DIR)
    if rel.name == "index.html":
        parent = str(rel.parent)
        if parent in (".", ""):
            return "/"
        return "/" + parent + "/"
    return "/" + str(rel)
def _clean_soup(soup: BeautifulSoup) -> None:
    for sel in STRIP_SELECTORS:
        for el in soup.select(sel):
            el.decompose()
def _title(soup: BeautifulSoup, url: str) -> str:
h1 = soup.find("h1")
if h1:
return h1.get_text(" ", strip=True)
tag = soup.find("title")
raw = tag.get_text(" ", strip=True) if tag else url
return re.split(r"\s+[—–-]\s+", raw)[0].strip()
# ---------------------------------------------------------------------------
# Extraction
# ---------------------------------------------------------------------------
#
# A single pass over each HTML file produces both:
#
# * a page-level record (concatenated body text, for similar-links)
# * a list of paragraph-level records (for the semantic index)
#
# Both surfaces want the same soup; an earlier version of this script
# parsed each file twice. The combined pass keeps BeautifulSoup work to
# one allocation per file.
def extract_one(html_path: Path) -> tuple[dict | None, list[dict]]:
"""Parse one HTML file and return (page-record-or-None, paragraph-list).
Returns ``(None, [])`` when the URL is excluded, when the file has no
``#markdownBody`` (so it isn't a content page), or when the body text
is too short to be meaningful.
"""
url = _url_from_path(html_path)
if url in EXCLUDE_URLS:
return None, []
raw = html_path.read_text(encoding="utf-8", errors="replace")
soup = BeautifulSoup(raw, "html.parser")
body = soup.select_one("#markdownBody")
if body is None:
return None, []
title = _title(soup, url)
# _clean_soup mutates the tree, so it must run AFTER we've captured
# the title (selectors like h1 may live inside #markdownBody on some
# layouts) and BEFORE we read body text for both surfaces.
_clean_soup(soup)
# Page-level record.
text = re.sub(r"\s+", " ", body.get_text(" ", strip=True)).strip()
page = None if len(text) < 100 else {
"url": url, "title": title, "text": text,
}
# Paragraph-level records — re-traverse the same (now-cleaned) body.
paras: list[dict] = []
heading = title
for el in body.find_all(["h1", "h2", "h3", "h4", "p", "li", "blockquote"]):
if el.name in ("h1", "h2", "h3", "h4"):
heading = el.get_text(" ", strip=True)
continue
para_text = re.sub(r"\s+", " ", el.get_text(" ", strip=True)).strip()
if len(para_text) < MIN_PARA_CHARS:
continue
paras.append({
"url": url,
"title": title,
"heading": heading,
"excerpt": para_text[:200] + ("" if len(para_text) > 200 else ""),
"text": para_text[:MAX_PARA_CHARS],
})
return page, paras
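# Illustrative return shape for a typical content page (values are made up,
# not taken from a real build):
#   page  == {"url": "/example/", "title": "Example", "text": "…full body text…"}
#   paras == [{"url": "/example/", "title": "Example", "heading": "Some heading",
#              "excerpt": "first 200 chars…", "text": "up to 1000 chars…"}, ...]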
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    if not SITE_DIR.exists():
        print("embed.py: _site/ not found — skipping", file=sys.stderr)
        return 0
    if not needs_update():
        print("embed.py: all outputs up to date — skipping")
        return 0
    # --- Extract pages + paragraphs in a single soup-per-file pass ---
    print("embed.py: extracting pages…")
    pages = []
    paragraphs = []
    for html in sorted(SITE_DIR.rglob("*.html")):
        page, paras = extract_one(html)
        if page is None:
            continue
        pages.append(page)
        paragraphs.extend(paras)
    if not pages:
        print("embed.py: no indexable pages found", file=sys.stderr)
        return 0
    # --- Load model once for both tasks ---
    print(f"embed.py: loading {MODEL_NAME}")
    model = SentenceTransformer(MODEL_NAME)
    # --- Similar-links (page level) ---
    print(f"embed.py: embedding {len(pages)} pages…")
    page_vecs = model.encode(
        [p["text"] for p in pages],
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=64,
    ).astype(np.float32)
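    # page_vecs are L2-normalised (normalize_embeddings=True), so the
    # inner-product index below ranks neighbours by cosine similarity.
    # Each page is its own best match (score 1.0), hence TOP_N + 1 results
    # and the j == i skip in the loop.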
    index = faiss.IndexFlatIP(page_vecs.shape[1])
    index.add(page_vecs)
    scores_all, indices_all = index.search(page_vecs, TOP_N + 1)
    similar: dict[str, list] = {}
    for i, page in enumerate(pages):
        neighbours = []
        for rank in range(TOP_N + 1):
            j, score = int(indices_all[i, rank]), float(scores_all[i, rank])
            if j == i or score < MIN_SCORE:
                continue
            neighbours.append({"url": pages[j]["url"], "title": pages[j]["title"],
                               "score": round(score, 4)})
            if len(neighbours) == TOP_N:
                break
        if neighbours:
            similar[page["url"]] = neighbours
    SIMILAR_OUT.parent.mkdir(parents=True, exist_ok=True)
    SIMILAR_OUT.write_text(json.dumps(similar, ensure_ascii=False, indent=2))
    print(f"embed.py: wrote {len(similar)} similar-links entries")
    # --- Semantic index (paragraph level) ---
    if not paragraphs:
        print("embed.py: no paragraphs extracted — skipping semantic index")
        return 0
    print(f"embed.py: embedding {len(paragraphs)} paragraphs…")
    para_vecs = model.encode(
        [p["text"] for p in paragraphs],
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=64,
    ).astype(np.float32)
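    # para_vecs has shape (len(paragraphs), DIM); tobytes() serialises it in
    # row-major (C) order, so row i of the .bin lines up with meta[i] below.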
    SEMANTIC_BIN.write_bytes(para_vecs.tobytes())
    meta = [{"url": p["url"], "title": p["title"],
             "heading": p["heading"], "excerpt": p["excerpt"]}
            for p in paragraphs]
    SEMANTIC_META.write_text(json.dumps(meta, ensure_ascii=False))
    print(f"embed.py: wrote {len(paragraphs)} paragraphs to semantic index "
          f"({SEMANTIC_BIN.stat().st_size // 1024} KB)")
    return 0
if __name__ == "__main__":
    sys.exit(main())