Fix audit tooling/infra findings

- embed.py: pin nomic's auto_map modeling repo via code_revision —
  revision= alone left nomic-bert-2048 unpinned under
  trust_remote_code (AUDIT §1.3; verified loadable with
  HF_HUB_OFFLINE=1). Catch BadZipFile/EOFError when loading the page
  cache so a half-written npz is discarded, not fatal (§4.2), and
  unlink the tmp file on a failed save (§4.1)
- nginx: collapse the CSP to one physical line — nginx has no line
  continuation in quoted strings, so the old value embedded literal
  backslash+LF bytes, illegal in HTTP/2 (§8.1). Add the externals the
  site actually uses: KaTeX webfonts + onnxruntime wasm via jsdelivr,
  and the popup provider APIs popups.js documents (§8.2)
- Makefile: pathspec-limit the auto-commit to content/ so pre-staged
  unrelated work is no longer swept into auto: commits (§8.3)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Levi Neuwirth 2026-06-10 09:21:47 -04:00
parent c64f3d63c0
commit f11495ff9a
3 changed files with 52 additions and 25 deletions

View File

@ -21,8 +21,12 @@ build:
# so a stray secret dropped under content/ is NOT auto-staged. To
# intentionally commit a normally-ignored file, use `git add -f`
# manually before running `make build`.
#
# The commit and its guard are pathspec-limited to content/ so that
# anything the user had previously staged for other reasons is left
# staged, not silently swept into the auto-commit.
@git add content/
@git diff --cached --quiet || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ) [skip ci]"
@git diff --cached --quiet -- content/ || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ) [skip ci]" -- content/
@mkdir -p data
@date +%s > data/build-start.txt
@./tools/convert-images.sh

View File

@ -42,8 +42,20 @@ add_header Permissions-Policy
# report stream has been clean for a week.
#
# External origins justified inline:
# cdn.jsdelivr.net KaTeX CSS + JS, Vega / Vega-Lite / Vega-Embed
# cdn.jsdelivr.net KaTeX CSS + JS + webfonts (the KaTeX CSS
# references its fonts relatively, so they
# resolve to the CDN -> font-src), Vega /
# Vega-Lite / Vega-Embed, transformers.js
# (whose onnxruntime fetches its .wasm from
# the CDN via fetch() -> connect-src)
# *.basemaps.cartocdn.com Leaflet basemap tiles (photography map only)
# connect-src API hosts link-popup providers fetched directly via
# CORS (the list popups.js documents in its
# header, plus git.levineuwirth.org for the
# Forgejo provider). The CORS-broken trio
# (arxiv, archive.org, pubmed) goes through
# the same-origin /proxy/ instead — see
# nginx/popup-proxy.conf.
#
# Why 'unsafe-inline' on style:
# - photography.html emits <span style="background:$swatch$"> for
@ -53,18 +65,14 @@ add_header Permissions-Policy
# Why 'unsafe-eval' on script:
# - vega-embed compiles Vega-Lite specs at runtime via new Function().
# Removing this would require pre-compiling specs at build time.
# - 'unsafe-eval' also permits WebAssembly.instantiate, which
# onnxruntime-web (semantic search) needs.
#
# The value MUST stay on one physical line: nginx has no line
# continuation inside quoted strings — a trailing backslash would embed
# literal backslash + LF bytes in the header value, which is illegal in
# HTTP/2 and gets whole responses rejected by strict clients.
#
# To collect violation reports, set up a `report-uri` endpoint and add
# `report-uri /csp-report;` (and/or `report-to <group>;`) inside the
# quoted policy value below.
add_header Content-Security-Policy-Report-Only
"default-src 'self'; \
script-src 'self' 'unsafe-eval' https://cdn.jsdelivr.net; \
style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net; \
img-src 'self' data: https://*.basemaps.cartocdn.com; \
font-src 'self' data:; \
connect-src 'self'; \
frame-ancestors 'none'; \
base-uri 'self'; \
form-action 'self'; \
object-src 'none'; \
upgrade-insecure-requests" always;
add_header Content-Security-Policy-Report-Only "default-src 'self'; script-src 'self' 'unsafe-eval' https://cdn.jsdelivr.net; style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net; img-src 'self' data: https://*.basemaps.cartocdn.com; font-src 'self' data: https://cdn.jsdelivr.net; connect-src 'self' https://cdn.jsdelivr.net https://*.wikipedia.org https://api.crossref.org https://api.github.com https://openlibrary.org https://api.biorxiv.org https://www.youtube.com https://git.levineuwirth.org; frame-ancestors 'none'; base-uri 'self'; form-action 'self'; object-src 'none'; upgrade-insecure-requests" always;

View File

@ -28,6 +28,7 @@ import json
import os
import re
import sys
import zipfile
from pathlib import Path
import faiss
@ -75,6 +76,11 @@ PARA_DIM = 384
PAGE_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
PAGE_MODEL_REVISION = "e9b6763023c676ca8431644204f50c2b100d9aab"
# The weights repo above declares its modeling code via auto_map in a
# SEPARATE repo (nomic-ai/nomic-bert-2048), which `revision=` does NOT
# pin — without this second pin, trust_remote_code executes whatever is
# at that repo's head at build time.
PAGE_MODEL_CODE_REVISION = "7710840340a098cfb869c4f65e87cf2b1b70caca"
PAGE_DIM = 768
# Nomic requires task-prefixed input. Documents (corpus side) get
# "search_document: "; queries would get "search_query: ". similar-links
@ -151,7 +157,8 @@ def load_page_cache() -> dict[str, np.ndarray]:
if vectors.shape != (len(hashes), PAGE_DIM):
return {}
return {h.item(): vectors[i] for i, h in enumerate(hashes)}
except (OSError, KeyError, ValueError) as e:
except (OSError, KeyError, ValueError, EOFError,
zipfile.BadZipFile) as e:
print(f"embed.py: page cache unreadable ({e}) — discarding",
file=sys.stderr)
return {}
@ -171,6 +178,7 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None:
# Pass an open file handle, not a path: np.savez_compressed appends
# ".npz" to bare paths, which would mangle our atomic-rename target.
tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp")
try:
with open(tmp, "wb") as f:
np.savez_compressed(
f,
@ -181,6 +189,9 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None:
vectors=vectors,
)
os.replace(tmp, PAGE_CACHE)
except BaseException:
tmp.unlink(missing_ok=True)
raise
STRIP_SELECTORS = [
@ -327,6 +338,10 @@ def main() -> int:
print(f"embed.py: loading {PAGE_MODEL_NAME}@{PAGE_MODEL_REVISION[:8]}")
page_model = SentenceTransformer(
PAGE_MODEL_NAME, revision=PAGE_MODEL_REVISION, trust_remote_code=True,
# code_revision pins the auto_map modeling repo; it must reach
# both AutoConfig and AutoModel.from_pretrained.
model_kwargs={"code_revision": PAGE_MODEL_CODE_REVISION},
config_kwargs={"code_revision": PAGE_MODEL_CODE_REVISION},
)
new_vecs = page_model.encode(
[page_inputs[i] for i in miss_idxs],