Fix audit tooling/infra findings
- embed.py: pin nomic's auto_map modeling repo via code_revision —
  revision= alone left nomic-bert-2048 unpinned under trust_remote_code
  (AUDIT §1.3; verified loadable with HF_HUB_OFFLINE=1). Catch
  BadZipFile/EOFError when loading the page cache so a half-written npz
  is discarded, not fatal (§4.2), and unlink the tmp file on a failed
  save (§4.1).
- nginx: collapse the CSP to one physical line — nginx has no line
  continuation in quoted strings, so the old value embedded literal
  backslash+LF bytes, illegal in HTTP/2 (§8.1). Add the externals the
  site actually uses: KaTeX webfonts + onnxruntime wasm via jsdelivr,
  and the popup provider APIs popups.js documents (§8.2).
- Makefile: pathspec-limit the auto-commit to content/ so pre-staged
  unrelated work is no longer swept into auto: commits (§8.3).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
c64f3d63c0
commit
f11495ff9a
6
Makefile
6
Makefile
|
|
@@ -21,8 +21,12 @@ build:
|
|||
# so a stray secret dropped under content/ is NOT auto-staged. To
|
||||
# intentionally commit a normally-ignored file, use `git add -f`
|
||||
# manually before running `make build`.
|
||||
#
|
||||
# The commit and its guard are pathspec-limited to content/ so that
|
||||
# anything the user had previously staged for other reasons is left
|
||||
# staged, not silently swept into the auto-commit.
|
||||
@git add content/
|
||||
@git diff --cached --quiet || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ) [skip ci]"
|
||||
@git diff --cached --quiet -- content/ || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ) [skip ci]" -- content/
|
||||
@mkdir -p data
|
||||
@date +%s > data/build-start.txt
|
||||
@./tools/convert-images.sh
|
||||
|
|
|
|||
|
|
@@ -42,8 +42,20 @@ add_header Permissions-Policy
|
|||
# report stream has been clean for a week.
|
||||
#
|
||||
# External origins justified inline:
|
||||
# cdn.jsdelivr.net KaTeX CSS + JS, Vega / Vega-Lite / Vega-Embed
|
||||
# cdn.jsdelivr.net KaTeX CSS + JS + webfonts (the KaTeX CSS
|
||||
# references its fonts relatively, so they
|
||||
# resolve to the CDN -> font-src), Vega /
|
||||
# Vega-Lite / Vega-Embed, transformers.js
|
||||
# (whose onnxruntime fetches its .wasm from
|
||||
# the CDN via fetch() -> connect-src)
|
||||
# *.basemaps.cartocdn.com Leaflet basemap tiles (photography map only)
|
||||
# connect-src API hosts link-popup providers fetched directly via
|
||||
# CORS (the list popups.js documents in its
|
||||
# header, plus git.levineuwirth.org for the
|
||||
# Forgejo provider). The CORS-broken trio
|
||||
# (arxiv, archive.org, pubmed) goes through
|
||||
# the same-origin /proxy/ instead — see
|
||||
# nginx/popup-proxy.conf.
|
||||
#
|
||||
# Why 'unsafe-inline' on style:
|
||||
# - photography.html emits <span style="background:$swatch$"> for
|
||||
|
|
@@ -53,18 +65,14 @@ add_header Permissions-Policy
|
|||
# Why 'unsafe-eval' on script:
|
||||
# - vega-embed compiles Vega-Lite specs at runtime via new Function().
|
||||
# Removing this would require pre-compiling specs at build time.
|
||||
# - it also covers WebAssembly.instantiate for onnxruntime-web
|
||||
# (semantic search).
|
||||
#
|
||||
# The value MUST stay on one physical line: nginx has no line
|
||||
# continuation inside quoted strings — a trailing backslash would embed
|
||||
# literal backslash + LF bytes in the header value, which is illegal in
|
||||
# HTTP/2 and gets whole responses rejected by strict clients.
|
||||
#
|
||||
# To collect violation reports, set up a `report-uri` endpoint and add
|
||||
# `report-uri /csp-report;` (and/or `report-to <group>;`) below.
|
||||
add_header Content-Security-Policy-Report-Only
|
||||
"default-src 'self'; \
|
||||
script-src 'self' 'unsafe-eval' https://cdn.jsdelivr.net; \
|
||||
style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net; \
|
||||
img-src 'self' data: https://*.basemaps.cartocdn.com; \
|
||||
font-src 'self' data:; \
|
||||
connect-src 'self'; \
|
||||
frame-ancestors 'none'; \
|
||||
base-uri 'self'; \
|
||||
form-action 'self'; \
|
||||
object-src 'none'; \
|
||||
upgrade-insecure-requests" always;
|
||||
add_header Content-Security-Policy-Report-Only "default-src 'self'; script-src 'self' 'unsafe-eval' https://cdn.jsdelivr.net; style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net; img-src 'self' data: https://*.basemaps.cartocdn.com; font-src 'self' data: https://cdn.jsdelivr.net; connect-src 'self' https://cdn.jsdelivr.net https://*.wikipedia.org https://api.crossref.org https://api.github.com https://openlibrary.org https://api.biorxiv.org https://www.youtube.com https://git.levineuwirth.org; frame-ancestors 'none'; base-uri 'self'; form-action 'self'; object-src 'none'; upgrade-insecure-requests" always;
|
||||
|
|
|
|||
|
|
@@ -28,6 +28,7 @@ import json
|
|||
import os
|
||||
import re
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import faiss
|
||||
|
|
@@ -75,6 +76,11 @@ PARA_DIM = 384
|
|||
|
||||
PAGE_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
|
||||
PAGE_MODEL_REVISION = "e9b6763023c676ca8431644204f50c2b100d9aab"
|
||||
# The weights repo above declares its modeling code via auto_map in a
|
||||
# SEPARATE repo (nomic-ai/nomic-bert-2048), which `revision=` does NOT
|
||||
# pin — without this second pin, trust_remote_code executes whatever is
|
||||
# at that repo's head at build time.
|
||||
PAGE_MODEL_CODE_REVISION = "7710840340a098cfb869c4f65e87cf2b1b70caca"
|
||||
PAGE_DIM = 768
|
||||
# Nomic requires task-prefixed input. Documents (corpus side) get
|
||||
# "search_document: "; queries would get "search_query: ". similar-links
|
||||
|
|
@@ -151,7 +157,8 @@ def load_page_cache() -> dict[str, np.ndarray]:
|
|||
if vectors.shape != (len(hashes), PAGE_DIM):
|
||||
return {}
|
||||
return {h.item(): vectors[i] for i, h in enumerate(hashes)}
|
||||
except (OSError, KeyError, ValueError) as e:
|
||||
except (OSError, KeyError, ValueError, EOFError,
|
||||
zipfile.BadZipFile) as e:
|
||||
print(f"embed.py: page cache unreadable ({e}) — discarding",
|
||||
file=sys.stderr)
|
||||
return {}
|
||||
|
|
@@ -171,16 +178,20 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None:
|
|||
# Pass an open file handle, not a path: np.savez_compressed appends
|
||||
# ".npz" to bare paths, which would mangle our atomic-rename target.
|
||||
tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp")
|
||||
with open(tmp, "wb") as f:
|
||||
np.savez_compressed(
|
||||
f,
|
||||
model=PAGE_MODEL_NAME,
|
||||
revision=PAGE_MODEL_REVISION,
|
||||
dim=PAGE_DIM,
|
||||
hashes=hashes,
|
||||
vectors=vectors,
|
||||
)
|
||||
os.replace(tmp, PAGE_CACHE)
|
||||
try:
|
||||
with open(tmp, "wb") as f:
|
||||
np.savez_compressed(
|
||||
f,
|
||||
model=PAGE_MODEL_NAME,
|
||||
revision=PAGE_MODEL_REVISION,
|
||||
dim=PAGE_DIM,
|
||||
hashes=hashes,
|
||||
vectors=vectors,
|
||||
)
|
||||
os.replace(tmp, PAGE_CACHE)
|
||||
except BaseException:
|
||||
tmp.unlink(missing_ok=True)
|
||||
raise
|
||||
|
||||
|
||||
STRIP_SELECTORS = [
|
||||
|
|
@@ -327,6 +338,10 @@ def main() -> int:
|
|||
print(f"embed.py: loading {PAGE_MODEL_NAME}@{PAGE_MODEL_REVISION[:8]}…")
|
||||
page_model = SentenceTransformer(
|
||||
PAGE_MODEL_NAME, revision=PAGE_MODEL_REVISION, trust_remote_code=True,
|
||||
# code_revision pins the auto_map modeling repo; it must reach
|
||||
# both AutoConfig and AutoModel.from_pretrained.
|
||||
model_kwargs={"code_revision": PAGE_MODEL_CODE_REVISION},
|
||||
config_kwargs={"code_revision": PAGE_MODEL_CODE_REVISION},
|
||||
)
|
||||
new_vecs = page_model.encode(
|
||||
[page_inputs[i] for i in miss_idxs],
|
||||
|
|
|
|||
Loading…
Reference in New Issue