ozymandias/tools/embed.py

#!/usr/bin/env python3
"""
embed.py — Build-time embedding pipeline.
Produces three output files from _site/**/*.html:
    data/similar-links.json   Page-level similarity (for the "Related" footer section)
    data/semantic-index.bin   Paragraph vectors as a raw Float32 array (N × DIM)
    data/semantic-meta.json   Paragraph metadata: [{url, title, heading, excerpt}]
Both embedding surfaces use all-MiniLM-L6-v2 (384 dims) — the same model shipped
to the browser via transformers.js for query-time semantic search.
Called by `make build` when .venv exists. Failures are non-fatal.
Staleness check: skips if all output files are newer than every HTML in _site/.
"""
import json
import re
import sys
from pathlib import Path
import faiss
import numpy as np
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
REPO_ROOT = Path(__file__).parent.parent
SITE_DIR = REPO_ROOT / "_site"
SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json"
SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin"
SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json"
MODEL_NAME = "all-MiniLM-L6-v2"
DIM = 384
TOP_N = 5 # similar-links: neighbours per page
MIN_SCORE = 0.30 # similar-links: discard weak matches
MIN_PARA_CHARS = 80 # semantic: skip very short paragraphs
MAX_PARA_CHARS = 1000 # semantic: truncate before embedding
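# Consumer-side sketch (illustrative, not executed by this pipeline): because
# semantic-index.bin is just the raw float32 buffer, it can be reloaded with
#     vecs = np.fromfile(SEMANTIC_BIN, dtype=np.float32).reshape(-1, DIM)
# and row i of `vecs` pairs with entry i of semantic-meta.json.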
# Pages that should not appear in similar-links suggestions or the semantic
# index. Search/build/stats are meta-pages with no prose; feeds are XML.
# Photography listing surfaces (map/contact-sheet/by-year) are index pages,
# not content. URLs must match exactly what `_url_from_path` produces (i.e.,
# directory-style URLs end with `/`, file-style URLs include the extension).
EXCLUDE_URLS = {
"/search.html",
"/build/",
"/stats/",
"/library.html",
"/new.html",
"/feed.xml",
"/music/feed.xml",
"/photography/feed.xml",
"/photography/map/",
"/photography/contact-sheet/",
}
STRIP_SELECTORS = [
"nav", "footer", "#toc", ".link-popup", "script", "style",
".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
]
# ---------------------------------------------------------------------------
# Staleness check
# ---------------------------------------------------------------------------
def needs_update() -> bool:
    outputs = [SIMILAR_OUT, SEMANTIC_BIN, SEMANTIC_META]
    if not all(p.exists() for p in outputs):
        return True
    oldest = min(p.stat().st_mtime for p in outputs)
    return any(html.stat().st_mtime > oldest for html in SITE_DIR.rglob("*.html"))
# ---------------------------------------------------------------------------
# HTML parsing helpers
# ---------------------------------------------------------------------------
def _url_from_path(html_path: Path) -> str:
    rel = html_path.relative_to(SITE_DIR)
    if rel.name == "index.html":
        parent = str(rel.parent)
        if parent in (".", ""):
            return "/"
        return "/" + parent + "/"
    return "/" + str(rel)
def _clean_soup(soup: BeautifulSoup) -> None:
    for sel in STRIP_SELECTORS:
        for el in soup.select(sel):
            el.decompose()
def _title(soup: BeautifulSoup, url: str) -> str:
h1 = soup.find("h1")
if h1:
return h1.get_text(" ", strip=True)
tag = soup.find("title")
raw = tag.get_text(" ", strip=True) if tag else url
return re.split(r"\s+[—–-]\s+", raw)[0].strip()
# ---------------------------------------------------------------------------
# Extraction
# ---------------------------------------------------------------------------
#
# A single pass over each HTML file produces both:
#
# * a page-level record (concatenated body text, for similar-links)
# * a list of paragraph-level records (for the semantic index)
#
# Both surfaces want the same soup; an earlier version of this script
# parsed each file twice. The combined pass keeps BeautifulSoup work to
# one allocation per file.
def extract_one(html_path: Path) -> tuple[dict | None, list[dict]]:
"""Parse one HTML file and return (page-record-or-None, paragraph-list).
Returns ``(None, [])`` when the URL is excluded, when the file has no
``#markdownBody`` (so it isn't a content page), or when the body text
is too short to be meaningful.
"""
url = _url_from_path(html_path)
if url in EXCLUDE_URLS:
return None, []
raw = html_path.read_text(encoding="utf-8", errors="replace")
soup = BeautifulSoup(raw, "html.parser")
body = soup.select_one("#markdownBody")
if body is None:
return None, []
title = _title(soup, url)
# _clean_soup mutates the tree, so it must run AFTER we've captured
# the title (selectors like h1 may live inside #markdownBody on some
# layouts) and BEFORE we read body text for both surfaces.
_clean_soup(soup)
# Page-level record.
text = re.sub(r"\s+", " ", body.get_text(" ", strip=True)).strip()
page = None if len(text) < 100 else {
"url": url, "title": title, "text": text,
}
# Paragraph-level records — re-traverse the same (now-cleaned) body.
paras: list[dict] = []
heading = title
for el in body.find_all(["h1", "h2", "h3", "h4", "p", "li", "blockquote"]):
if el.name in ("h1", "h2", "h3", "h4"):
heading = el.get_text(" ", strip=True)
continue
para_text = re.sub(r"\s+", " ", el.get_text(" ", strip=True)).strip()
if len(para_text) < MIN_PARA_CHARS:
continue
paras.append({
"url": url,
"title": title,
"heading": heading,
"excerpt": para_text[:200] + ("" if len(para_text) > 200 else ""),
"text": para_text[:MAX_PARA_CHARS],
})
return page, paras
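# Illustrative return shape for a typical content page (values are made up,
# not taken from a real build):
#   page  == {"url": "/example/", "title": "Example", "text": "…full body text…"}
#   paras == [{"url": "/example/", "title": "Example", "heading": "Some heading",
#              "excerpt": "first 200 chars…", "text": "up to 1000 chars…"}, ...]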
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    if not SITE_DIR.exists():
        print("embed.py: _site/ not found — skipping", file=sys.stderr)
        return 0
    if not needs_update():
        print("embed.py: all outputs up to date — skipping")
        return 0
    # --- Extract pages + paragraphs in a single soup-per-file pass ---
    print("embed.py: extracting pages…")
    pages = []
    paragraphs = []
    for html in sorted(SITE_DIR.rglob("*.html")):
        page, paras = extract_one(html)
        if page is None:
            continue
        pages.append(page)
        paragraphs.extend(paras)
    if not pages:
        print("embed.py: no indexable pages found", file=sys.stderr)
        return 0
    # --- Load model once for both tasks ---
    print(f"embed.py: loading {MODEL_NAME}")
    model = SentenceTransformer(MODEL_NAME)
    # --- Similar-links (page level) ---
    print(f"embed.py: embedding {len(pages)} pages…")
    page_vecs = model.encode(
        [p["text"] for p in pages],
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=64,
    ).astype(np.float32)
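    # page_vecs are L2-normalised (normalize_embeddings=True), so the
    # inner-product index below ranks neighbours by cosine similarity.
    # Each page is its own best match (score 1.0), hence TOP_N + 1 results
    # and the j == i skip in the loop.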
    index = faiss.IndexFlatIP(page_vecs.shape[1])
    index.add(page_vecs)
    scores_all, indices_all = index.search(page_vecs, TOP_N + 1)
    similar: dict[str, list] = {}
    for i, page in enumerate(pages):
        neighbours = []
        for rank in range(TOP_N + 1):
            j, score = int(indices_all[i, rank]), float(scores_all[i, rank])
            if j == i or score < MIN_SCORE:
                continue
            neighbours.append({"url": pages[j]["url"], "title": pages[j]["title"],
                               "score": round(score, 4)})
            if len(neighbours) == TOP_N:
                break
        if neighbours:
            similar[page["url"]] = neighbours
    SIMILAR_OUT.parent.mkdir(parents=True, exist_ok=True)
    SIMILAR_OUT.write_text(json.dumps(similar, ensure_ascii=False, indent=2))
    print(f"embed.py: wrote {len(similar)} similar-links entries")
    # --- Semantic index (paragraph level) ---
    if not paragraphs:
        print("embed.py: no paragraphs extracted — skipping semantic index")
        return 0
    print(f"embed.py: embedding {len(paragraphs)} paragraphs…")
    para_vecs = model.encode(
        [p["text"] for p in paragraphs],
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=64,
    ).astype(np.float32)
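    # para_vecs has shape (len(paragraphs), DIM); tobytes() serialises it in
    # row-major (C) order, so row i of the .bin lines up with meta[i] below.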
    SEMANTIC_BIN.write_bytes(para_vecs.tobytes())
    meta = [{"url": p["url"], "title": p["title"],
             "heading": p["heading"], "excerpt": p["excerpt"]}
            for p in paragraphs]
    SEMANTIC_META.write_text(json.dumps(meta, ensure_ascii=False))
    print(f"embed.py: wrote {len(paragraphs)} paragraphs to semantic index "
          f"({SEMANTIC_BIN.stat().st_size // 1024} KB)")
    return 0
if __name__ == "__main__":
    sys.exit(main())