diff --git a/tools/embed.py b/tools/embed.py index 103b814..9034ee7 100644 --- a/tools/embed.py +++ b/tools/embed.py @@ -36,7 +36,11 @@ SIMILAR_OUT = REPO_ROOT / "data" / "similar-links.json" SEMANTIC_BIN = REPO_ROOT / "data" / "semantic-index.bin" SEMANTIC_META = REPO_ROOT / "data" / "semantic-meta.json" -MODEL_NAME = "all-MiniLM-L6-v2" +MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" +# Pinned to a specific Hugging Face commit so a future model bump can't +# silently change embedding semantics. Bump it deliberately, validate, and +# re-run a full embed pass to refresh data/semantic-* + similar-links.json. +MODEL_REVISION = "c9745ed1d9f207416be6d2e6f8de32d1f16199bf" DIM = 384 TOP_N = 5 # similar-links: neighbours per page @@ -199,8 +203,8 @@ def main() -> int: return 0 # --- Load model once for both tasks --- - print(f"embed.py: loading {MODEL_NAME}…") - model = SentenceTransformer(MODEL_NAME) + print(f"embed.py: loading {MODEL_NAME}@{MODEL_REVISION[:8]}…") + model = SentenceTransformer(MODEL_NAME, revision=MODEL_REVISION) # --- Similar-links (page level) --- print(f"embed.py: embedding {len(pages)} pages…") diff --git a/tools/model-checksums.sha256 b/tools/model-checksums.sha256 new file mode 100644 index 0000000..a32472b --- /dev/null +++ b/tools/model-checksums.sha256 @@ -0,0 +1,5 @@ +7135149f7cffa1a573466c6e4d8423ed73b62fd2332c575bf738a0d033f70df7 config.json +da0e79933b9ed51798a3ae27893d3c5fa4a201126cef75586296df9b4d2c62a0 tokenizer.json +9261e7d79b44c8195c1cada2b453e55b00aeb81e907a6664974b4d7776172ab3 tokenizer_config.json +b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3 special_tokens_map.json +afdb6f1a0e45b715d0bb9b11772f032c399babd23bfc31fed1c170afc848bdb1 onnx/model_quantized.onnx