diff --git a/Makefile b/Makefile index 174f586..2863eb5 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,12 @@ build: # so a stray secret dropped under content/ is NOT auto-staged. To # intentionally commit a normally-ignored file, use `git add -f` # manually before running `make build`. + # + # The commit and its guard are pathspec-limited to content/ so that + # anything the user had previously staged for other reasons is left + # staged, not silently swept into the auto-commit. @git add content/ - @git diff --cached --quiet || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ) [skip ci]" + @git diff --cached --quiet -- content/ || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ) [skip ci]" -- content/ @mkdir -p data @date +%s > data/build-start.txt @./tools/convert-images.sh diff --git a/nginx/security-headers.conf b/nginx/security-headers.conf index e1c3b5b..9c75790 100644 --- a/nginx/security-headers.conf +++ b/nginx/security-headers.conf @@ -42,8 +42,20 @@ add_header Permissions-Policy # report stream has been clean for a week. # # External origins justified inline: -# cdn.jsdelivr.net KaTeX CSS + JS, Vega / Vega-Lite / Vega-Embed +# cdn.jsdelivr.net KaTeX CSS + JS + webfonts (the KaTeX CSS +# references its fonts relatively, so they +# resolve to the CDN -> font-src), Vega / +# Vega-Lite / Vega-Embed, transformers.js +# (whose onnxruntime fetches its .wasm from +# the CDN via fetch() -> connect-src) # *.basemaps.cartocdn.com Leaflet basemap tiles (photography map only) +# connect-src API hosts link-popup providers fetched directly via +# CORS (the list popups.js documents in its +# header, plus git.levineuwirth.org for the +# Forgejo provider). The CORS-broken trio +# (arxiv, archive.org, pubmed) goes through +# the same-origin /proxy/ instead — see +# nginx/popup-proxy.conf. # # Why 'unsafe-inline' on style: # - photography.html emits for @@ -53,18 +65,14 @@ add_header Permissions-Policy # Why 'unsafe-eval' on script: # - vega-embed compiles Vega-Lite specs at runtime via new Function(). # Removing this would require pre-compiling specs at build time. +# - it also covers WebAssembly.instantiate for onnxruntime-web +# (semantic search). +# +# The value MUST stay on one physical line: nginx has no line +# continuation inside quoted strings — a trailing backslash would embed +# literal backslash + LF bytes in the header value, which is illegal in +# HTTP/2 and gets whole responses rejected by strict clients. # # To collect violation reports, set up a `report-uri` endpoint and add # `report-uri /csp-report;` (and/or `report-to ;`) below. -add_header Content-Security-Policy-Report-Only - "default-src 'self'; \ - script-src 'self' 'unsafe-eval' https://cdn.jsdelivr.net; \ - style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net; \ - img-src 'self' data: https://*.basemaps.cartocdn.com; \ - font-src 'self' data:; \ - connect-src 'self'; \ - frame-ancestors 'none'; \ - base-uri 'self'; \ - form-action 'self'; \ - object-src 'none'; \ - upgrade-insecure-requests" always; +add_header Content-Security-Policy-Report-Only "default-src 'self'; script-src 'self' 'unsafe-eval' https://cdn.jsdelivr.net; style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net; img-src 'self' data: https://*.basemaps.cartocdn.com; font-src 'self' data: https://cdn.jsdelivr.net; connect-src 'self' https://cdn.jsdelivr.net https://*.wikipedia.org https://api.crossref.org https://api.github.com https://openlibrary.org https://api.biorxiv.org https://www.youtube.com https://git.levineuwirth.org; frame-ancestors 'none'; base-uri 'self'; form-action 'self'; object-src 'none'; upgrade-insecure-requests" always; diff --git a/tools/embed.py b/tools/embed.py index 67f57c8..ab57788 100644 --- a/tools/embed.py +++ b/tools/embed.py @@ -28,6 +28,7 @@ import json import os import re import sys +import zipfile from pathlib import Path import faiss @@ -75,6 +76,11 @@ PARA_DIM = 384 PAGE_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5" PAGE_MODEL_REVISION = "e9b6763023c676ca8431644204f50c2b100d9aab" +# The weights repo above declares its modeling code via auto_map in a +# SEPARATE repo (nomic-ai/nomic-bert-2048), which `revision=` does NOT +# pin — without this second pin, trust_remote_code executes whatever is +# at that repo's head at build time. +PAGE_MODEL_CODE_REVISION = "7710840340a098cfb869c4f65e87cf2b1b70caca" PAGE_DIM = 768 # Nomic requires task-prefixed input. Documents (corpus side) get # "search_document: "; queries would get "search_query: ". similar-links @@ -151,7 +157,8 @@ def load_page_cache() -> dict[str, np.ndarray]: if vectors.shape != (len(hashes), PAGE_DIM): return {} return {h.item(): vectors[i] for i, h in enumerate(hashes)} - except (OSError, KeyError, ValueError) as e: + except (OSError, KeyError, ValueError, EOFError, + zipfile.BadZipFile) as e: print(f"embed.py: page cache unreadable ({e}) — discarding", file=sys.stderr) return {} @@ -171,16 +178,20 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None: # Pass an open file handle, not a path: np.savez_compressed appends # ".npz" to bare paths, which would mangle our atomic-rename target. tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp") - with open(tmp, "wb") as f: - np.savez_compressed( - f, - model=PAGE_MODEL_NAME, - revision=PAGE_MODEL_REVISION, - dim=PAGE_DIM, - hashes=hashes, - vectors=vectors, - ) - os.replace(tmp, PAGE_CACHE) + try: + with open(tmp, "wb") as f: + np.savez_compressed( + f, + model=PAGE_MODEL_NAME, + revision=PAGE_MODEL_REVISION, + dim=PAGE_DIM, + hashes=hashes, + vectors=vectors, + ) + os.replace(tmp, PAGE_CACHE) + except BaseException: + tmp.unlink(missing_ok=True) + raise STRIP_SELECTORS = [ @@ -327,6 +338,10 @@ def main() -> int: print(f"embed.py: loading {PAGE_MODEL_NAME}@{PAGE_MODEL_REVISION[:8]}…") page_model = SentenceTransformer( PAGE_MODEL_NAME, revision=PAGE_MODEL_REVISION, trust_remote_code=True, + # code_revision pins the auto_map modeling repo; it must reach + # both AutoConfig and AutoModel.from_pretrained. + model_kwargs={"code_revision": PAGE_MODEL_CODE_REVISION}, + config_kwargs={"code_revision": PAGE_MODEL_CODE_REVISION}, ) new_vecs = page_model.encode( [page_inputs[i] for i in miss_idxs],