levineuwirth.org/tools/embed.py

189 lines
6.6 KiB
Python

#!/usr/bin/env python3
"""
embed.py — Build-time similar-links generator.
Reads _site/**/*.html, embeds each page with nomic-embed-text-v1.5,
builds a FAISS IndexFlatIP, and writes data/similar-links.json:
{ "/path/to/page/": [{"url": "...", "title": "...", "score": 0.87}, ...] }
Called by `make build` when .venv exists. Failures are non-fatal (make prints
a warning and continues). Run `uv sync` first to provision the environment.
Staleness check: skips re-embedding if data/similar-links.json is newer than
every HTML file in _site/ — so content-only rebuilds that don't touch HTML
won't re-embed.
"""
import json
import os
import re
import sys
from pathlib import Path
import faiss
import numpy as np
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
REPO_ROOT = Path(__file__).parent.parent  # tools/embed.py → repo root
SITE_DIR = REPO_ROOT / "_site"  # static-site build output scanned for *.html
OUT_FILE = REPO_ROOT / "data" / "similar-links.json"  # generated artifact; presumably read at render time — verify against consumer
MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"  # Hugging Face model id passed to SentenceTransformer
TOP_N = 5  # max neighbours emitted per page
MIN_SCORE = 0.30 # cosine similarity threshold; discard weak matches
# Pages to exclude from both indexing and results (exact URL paths)
EXCLUDE_URLS = {"/search/", "/build/", "/404.html", "/feed.xml", "/music/feed.xml"}
# ---------------------------------------------------------------------------
# Staleness check
# ---------------------------------------------------------------------------
def needs_update() -> bool:
    """Return True if similar-links.json is missing or older than any _site HTML.

    Used as a staleness gate so content-only rebuilds that don't touch HTML
    skip the (expensive) re-embedding step.
    """
    if not OUT_FILE.exists():
        return True
    json_mtime = OUT_FILE.stat().st_mtime
    # any() short-circuits on the first HTML file newer than the JSON output,
    # matching the original early-return loop.
    return any(
        html.stat().st_mtime > json_mtime
        for html in SITE_DIR.rglob("*.html")
    )
# ---------------------------------------------------------------------------
# HTML → text extraction
# ---------------------------------------------------------------------------
def extract(html_path: Path) -> dict | None:
    """
    Parse an HTML file and extract:
      - url:   root-relative URL path (e.g. "/essays/my-essay/")
      - title: page <h1> text, falling back to <title> with site-name suffix stripped
      - text:  plain text of the page body (nav/footer/TOC stripped), prefixed
               with the nomic "search_document:" task marker
    Returns None for pages that should not be indexed (excluded URLs, pages
    without a #markdownBody, or bodies too short to embed meaningfully).
    """
    raw = html_path.read_text(encoding="utf-8", errors="replace")
    soup = BeautifulSoup(raw, "html.parser")
    # Derive root-relative URL from the file path. Use as_posix() so URLs get
    # forward slashes even on Windows.
    rel = html_path.relative_to(SITE_DIR)
    if rel.name == "index.html":
        parent = rel.parent.as_posix()
        # BUG FIX: the root index.html has parent ".", which the old
        # '"/" + str(parent) + "/"' + replace("//", "/") code turned into
        # "/./" (no "//" to collapse) instead of "/". Handle it explicitly.
        url = "/" if parent == "." else f"/{parent}/"
    else:
        url = "/" + rel.as_posix()
    if url in EXCLUDE_URLS:
        return None
    # Only index actual content pages — skip index/tag/feed/author pages
    # that have no prose body.
    body = soup.select_one("#markdownBody")
    if body is None:
        return None
    # Title: prefer <h1>, fall back to <title> (strip " — Site Name" suffix)
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(" ", strip=True)
    else:
        title_tag = soup.find("title")
        raw_title = title_tag.get_text(" ", strip=True) if title_tag else url
        title = re.split(r"\s+[—–-]\s+", raw_title)[0].strip()
    # Remove elements that aren't content. Decomposing via `soup` also prunes
    # matches inside `body`, since `body` is a live view into the same tree.
    for sel in ["nav", "footer", "#toc", ".link-popup", "script", "style",
                ".page-meta-footer", ".metadata", "[data-pagefind-ignore]"]:
        for el in soup.select(sel):
            el.decompose()
    text = body.get_text(" ", strip=True)
    # Collapse runs of whitespace
    text = re.sub(r"\s+", " ", text).strip()
    if len(text) < 100:  # too short to embed meaningfully
        return None
    # Feed title + text to the model so title is part of the representation
    return {"url": url, "title": title, "text": f"search_document: {title}\n\n{text}"}
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    """Extract, embed, and index all _site pages; write data/similar-links.json.

    Returns 0 in every path (including skips and empty corpora) so that the
    calling Makefile treats this step as non-fatal.
    """
    if not SITE_DIR.exists():
        print("embed.py: _site/ not found — skipping", file=sys.stderr)
        return 0
    if not needs_update():
        print("embed.py: similar-links.json is up to date — skipping")
        return 0
    print("embed.py: extracting pages…")
    pages = []
    for html in sorted(SITE_DIR.rglob("*.html")):
        page = extract(html)
        if page:
            pages.append(page)
    if not pages:
        print("embed.py: no indexable pages found", file=sys.stderr)
        return 0
    print(f"embed.py: embedding {len(pages)} pages with {MODEL_NAME}")
    model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)
    texts = [p["text"] for p in pages]
    # nomic requires a task prefix; we used "search_document:" above for the
    # corpus. For queries we'd use "search_query:" — but here both corpus and
    # query are the same documents, so we use "search_document:" throughout.
    embeddings = model.encode(
        texts,
        normalize_embeddings=True,  # unit vectors → inner product == cosine
        show_progress_bar=True,
        batch_size=32,
    )
    embeddings = np.array(embeddings, dtype=np.float32)
    print("embed.py: building FAISS index…")
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # exact inner product; fine for < 10k pages
    index.add(embeddings)
    print("embed.py: querying nearest neighbours…")
    # Query all at once. We ask for TOP_N+1 because each page's best match is
    # itself; clamp to the corpus size so tiny sites don't over-ask.
    k = min(TOP_N + 1, len(pages))
    scores_all, indices_all = index.search(embeddings, k)
    result: dict[str, list] = {}
    for i, page in enumerate(pages):
        neighbours = []
        for rank in range(k):
            j = int(indices_all[i, rank])
            if j < 0:
                # FAISS pads missing results with index -1 when fewer than k
                # vectors exist; without this guard pages[-1] would silently
                # alias the last page.
                break
            if j == i:
                continue  # skip self
            score = float(scores_all[i, rank])
            if score < MIN_SCORE:
                continue  # skip weak matches
            neighbours.append({
                "url": pages[j]["url"],
                "title": pages[j]["title"],
                "score": round(score, 4),
            })
            if len(neighbours) == TOP_N:
                break
        if neighbours:
            result[page["url"]] = neighbours
    OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    OUT_FILE.write_text(json.dumps(result, ensure_ascii=False, indent=2))
    print(f"embed.py: wrote {len(result)} entries to {OUT_FILE.relative_to(REPO_ROOT)}")
    return 0
if __name__ == "__main__":
    # Exit with main()'s return code (0 on every path shown — failures are
    # handled as warnings upstream by make).
    sys.exit(main())