From eb7fef30df8b824bd52f8e2ece85fe1a2f02cbc2 Mon Sep 17 00:00:00 2001
From: Levi Neuwirth <ln@levineuwirth.org>
Date: Thu, 7 May 2026 15:08:14 -0400
Subject: [PATCH] Pin Hugging Face model revisions for downloader and embed
 pipeline

- Add tools/model-checksums.sha256 with sha256 hashes for the five
  Xenova/all-MiniLM-L6-v2 files served from static/models/.
  download-model.sh was already plumbed to verify against this file
  when present; the file itself was missing, so downloads were
  unverified. Now every fetch checks against committed hashes and
  fails closed on mismatch.
- Pin embed.py's SentenceTransformer load to a specific HF commit
  (c9745ed1d9f207416be6d2e6f8de32d1f16199bf of
  sentence-transformers/all-MiniLM-L6-v2). A future model bump can no
  longer silently change embedding semantics across builds. Bump the
  pin deliberately, after validating the new revision, and then re-run
  a full embed pass to refresh the semantic + similar-links data.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tools/embed.py               | 10 +++++++---
 tools/model-checksums.sha256 |  5 +++++
 2 files changed, 12 insertions(+), 3 deletions(-)
 create mode 100644 tools/model-checksums.sha256

diff --git a/tools/embed.py b/tools/embed.py
index 103b814..9034ee7 100644
--- a/tools/embed.py
+++ b/tools/embed.py
@@ -36,7 +36,11 @@ SIMILAR_OUT    = REPO_ROOT / "data" / "similar-links.json"
 SEMANTIC_BIN   = REPO_ROOT / "data" / "semantic-index.bin"
 SEMANTIC_META  = REPO_ROOT / "data" / "semantic-meta.json"
 
-MODEL_NAME     = "all-MiniLM-L6-v2"
+MODEL_NAME     = "sentence-transformers/all-MiniLM-L6-v2"
+# Pinned to a specific Hugging Face commit so a future model bump can't
+# silently change embedding semantics. Bump only after validating the new
+# revision, then re-run a full embed pass (data/semantic-* + similar-links).
+MODEL_REVISION = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf"
 DIM            = 384
 
 TOP_N          = 5      # similar-links: neighbours per page
@@ -199,8 +203,8 @@ def main() -> int:
         return 0
 
     # --- Load model once for both tasks ---
-    print(f"embed.py: loading {MODEL_NAME}…")
-    model = SentenceTransformer(MODEL_NAME)
+    print(f"embed.py: loading {MODEL_NAME}@{MODEL_REVISION[:8]}…")
+    model = SentenceTransformer(MODEL_NAME, revision=MODEL_REVISION)
 
     # --- Similar-links (page level) ---
     print(f"embed.py: embedding {len(pages)} pages…")
diff --git a/tools/model-checksums.sha256 b/tools/model-checksums.sha256
new file mode 100644
index 0000000..a32472b
--- /dev/null
+++ b/tools/model-checksums.sha256
@@ -0,0 +1,5 @@
+7135149f7cffa1a573466c6e4d8423ed73b62fd2332c575bf738a0d033f70df7  config.json
+da0e79933b9ed51798a3ae27893d3c5fa4a201126cef75586296df9b4d2c62a0  tokenizer.json
+9261e7d79b44c8195c1cada2b453e55b00aeb81e907a6664974b4d7776172ab3  tokenizer_config.json
+b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3  special_tokens_map.json
+afdb6f1a0e45b715d0bb9b11772f032c399babd23bfc31fed1c170afc848bdb1  onnx/model_quantized.onnx