Fix audit tooling/infra findings
- embed.py: pin nomic's auto_map modeling repo via code_revision — revision= alone left nomic-bert-2048 unpinned under trust_remote_code (AUDIT §1.3; verified loadable with HF_HUB_OFFLINE=1). Catch BadZipFile/EOFError when loading the page cache so a half-written npz is discarded, not fatal (§4.2), and unlink the tmp file on a failed save (§4.1)
- nginx: collapse the CSP to one physical line — nginx has no line continuation in quoted strings, so the old value embedded literal backslash+LF bytes, illegal in HTTP/2 (§8.1). Add the externals the site actually uses: KaTeX webfonts + onnxruntime wasm via jsdelivr, and the popup provider APIs popups.js documents (§8.2)
- Makefile: pathspec-limit the auto-commit to content/ so pre-staged unrelated work is no longer swept into auto: commits (§8.3)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
c64f3d63c0
commit
f11495ff9a
6
Makefile
6
Makefile
|
|
@ -21,8 +21,12 @@ build:
|
||||||
# so a stray secret dropped under content/ is NOT auto-staged. To
|
# so a stray secret dropped under content/ is NOT auto-staged. To
|
||||||
# intentionally commit a normally-ignored file, use `git add -f`
|
# intentionally commit a normally-ignored file, use `git add -f`
|
||||||
# manually before running `make build`.
|
# manually before running `make build`.
|
||||||
|
#
|
||||||
|
# The commit and its guard are pathspec-limited to content/ so that
|
||||||
|
# anything the user had previously staged for other reasons is left
|
||||||
|
# staged, not silently swept into the auto-commit.
|
||||||
@git add content/
|
@git add content/
|
||||||
@git diff --cached --quiet || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ) [skip ci]"
|
@git diff --cached --quiet -- content/ || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ) [skip ci]" -- content/
|
||||||
@mkdir -p data
|
@mkdir -p data
|
||||||
@date +%s > data/build-start.txt
|
@date +%s > data/build-start.txt
|
||||||
@./tools/convert-images.sh
|
@./tools/convert-images.sh
|
||||||
|
|
|
||||||
|
|
@ -42,8 +42,20 @@ add_header Permissions-Policy
|
||||||
# report stream has been clean for a week.
|
# report stream has been clean for a week.
|
||||||
#
|
#
|
||||||
# External origins justified inline:
|
# External origins justified inline:
|
||||||
# cdn.jsdelivr.net KaTeX CSS + JS, Vega / Vega-Lite / Vega-Embed
|
# cdn.jsdelivr.net KaTeX CSS + JS + webfonts (the KaTeX CSS
|
||||||
|
# references its fonts relatively, so they
|
||||||
|
# resolve to the CDN -> font-src), Vega /
|
||||||
|
# Vega-Lite / Vega-Embed, transformers.js
|
||||||
|
# (whose onnxruntime fetches its .wasm from
|
||||||
|
# the CDN via fetch() -> connect-src)
|
||||||
# *.basemaps.cartocdn.com Leaflet basemap tiles (photography map only)
|
# *.basemaps.cartocdn.com Leaflet basemap tiles (photography map only)
|
||||||
|
# connect-src API hosts link-popup providers fetched directly via
|
||||||
|
# CORS (the list popups.js documents in its
|
||||||
|
# header, plus git.levineuwirth.org for the
|
||||||
|
# Forgejo provider). The CORS-broken trio
|
||||||
|
# (arxiv, archive.org, pubmed) goes through
|
||||||
|
# the same-origin /proxy/ instead — see
|
||||||
|
# nginx/popup-proxy.conf.
|
||||||
#
|
#
|
||||||
# Why 'unsafe-inline' on style:
|
# Why 'unsafe-inline' on style:
|
||||||
# - photography.html emits <span style="background:$swatch$"> for
|
# - photography.html emits <span style="background:$swatch$"> for
|
||||||
|
|
@ -53,18 +65,14 @@ add_header Permissions-Policy
|
||||||
# Why 'unsafe-eval' on script:
|
# Why 'unsafe-eval' on script:
|
||||||
# - vega-embed compiles Vega-Lite specs at runtime via new Function().
|
# - vega-embed compiles Vega-Lite specs at runtime via new Function().
|
||||||
# Removing this would require pre-compiling specs at build time.
|
# Removing this would require pre-compiling specs at build time.
|
||||||
|
# - it also covers WebAssembly.instantiate for onnxruntime-web
|
||||||
|
# (semantic search).
|
||||||
|
#
|
||||||
|
# The value MUST stay on one physical line: nginx has no line
|
||||||
|
# continuation inside quoted strings — a trailing backslash would embed
|
||||||
|
# literal backslash + LF bytes in the header value, which is illegal in
|
||||||
|
# HTTP/2 and gets whole responses rejected by strict clients.
|
||||||
#
|
#
|
||||||
# To collect violation reports, set up a `report-uri` endpoint and add
|
# To collect violation reports, set up a `report-uri` endpoint and add
|
||||||
# `report-uri /csp-report;` (and/or `report-to <group>;`) below.
|
# `report-uri /csp-report;` (and/or `report-to <group>;`) below.
|
||||||
add_header Content-Security-Policy-Report-Only
|
add_header Content-Security-Policy-Report-Only "default-src 'self'; script-src 'self' 'unsafe-eval' https://cdn.jsdelivr.net; style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net; img-src 'self' data: https://*.basemaps.cartocdn.com; font-src 'self' data: https://cdn.jsdelivr.net; connect-src 'self' https://cdn.jsdelivr.net https://*.wikipedia.org https://api.crossref.org https://api.github.com https://openlibrary.org https://api.biorxiv.org https://www.youtube.com https://git.levineuwirth.org; frame-ancestors 'none'; base-uri 'self'; form-action 'self'; object-src 'none'; upgrade-insecure-requests" always;
|
||||||
"default-src 'self'; \
|
|
||||||
script-src 'self' 'unsafe-eval' https://cdn.jsdelivr.net; \
|
|
||||||
style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net; \
|
|
||||||
img-src 'self' data: https://*.basemaps.cartocdn.com; \
|
|
||||||
font-src 'self' data:; \
|
|
||||||
connect-src 'self'; \
|
|
||||||
frame-ancestors 'none'; \
|
|
||||||
base-uri 'self'; \
|
|
||||||
form-action 'self'; \
|
|
||||||
object-src 'none'; \
|
|
||||||
upgrade-insecure-requests" always;
|
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,7 @@ import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import zipfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import faiss
|
import faiss
|
||||||
|
|
@ -75,6 +76,11 @@ PARA_DIM = 384
|
||||||
|
|
||||||
PAGE_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
|
PAGE_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
|
||||||
PAGE_MODEL_REVISION = "e9b6763023c676ca8431644204f50c2b100d9aab"
|
PAGE_MODEL_REVISION = "e9b6763023c676ca8431644204f50c2b100d9aab"
|
||||||
|
# The weights repo above declares its modeling code via auto_map in a
|
||||||
|
# SEPARATE repo (nomic-ai/nomic-bert-2048), which `revision=` does NOT
|
||||||
|
# pin — without this second pin, trust_remote_code executes whatever is
|
||||||
|
# at that repo's head at build time.
|
||||||
|
PAGE_MODEL_CODE_REVISION = "7710840340a098cfb869c4f65e87cf2b1b70caca"
|
||||||
PAGE_DIM = 768
|
PAGE_DIM = 768
|
||||||
# Nomic requires task-prefixed input. Documents (corpus side) get
|
# Nomic requires task-prefixed input. Documents (corpus side) get
|
||||||
# "search_document: "; queries would get "search_query: ". similar-links
|
# "search_document: "; queries would get "search_query: ". similar-links
|
||||||
|
|
@ -151,7 +157,8 @@ def load_page_cache() -> dict[str, np.ndarray]:
|
||||||
if vectors.shape != (len(hashes), PAGE_DIM):
|
if vectors.shape != (len(hashes), PAGE_DIM):
|
||||||
return {}
|
return {}
|
||||||
return {h.item(): vectors[i] for i, h in enumerate(hashes)}
|
return {h.item(): vectors[i] for i, h in enumerate(hashes)}
|
||||||
except (OSError, KeyError, ValueError) as e:
|
except (OSError, KeyError, ValueError, EOFError,
|
||||||
|
zipfile.BadZipFile) as e:
|
||||||
print(f"embed.py: page cache unreadable ({e}) — discarding",
|
print(f"embed.py: page cache unreadable ({e}) — discarding",
|
||||||
file=sys.stderr)
|
file=sys.stderr)
|
||||||
return {}
|
return {}
|
||||||
|
|
@ -171,6 +178,7 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None:
|
||||||
# Pass an open file handle, not a path: np.savez_compressed appends
|
# Pass an open file handle, not a path: np.savez_compressed appends
|
||||||
# ".npz" to bare paths, which would mangle our atomic-rename target.
|
# ".npz" to bare paths, which would mangle our atomic-rename target.
|
||||||
tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp")
|
tmp = PAGE_CACHE.with_suffix(PAGE_CACHE.suffix + ".tmp")
|
||||||
|
try:
|
||||||
with open(tmp, "wb") as f:
|
with open(tmp, "wb") as f:
|
||||||
np.savez_compressed(
|
np.savez_compressed(
|
||||||
f,
|
f,
|
||||||
|
|
@ -181,6 +189,9 @@ def save_page_cache(cache: dict[str, np.ndarray]) -> None:
|
||||||
vectors=vectors,
|
vectors=vectors,
|
||||||
)
|
)
|
||||||
os.replace(tmp, PAGE_CACHE)
|
os.replace(tmp, PAGE_CACHE)
|
||||||
|
except BaseException:
|
||||||
|
tmp.unlink(missing_ok=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
STRIP_SELECTORS = [
|
STRIP_SELECTORS = [
|
||||||
|
|
@ -327,6 +338,10 @@ def main() -> int:
|
||||||
print(f"embed.py: loading {PAGE_MODEL_NAME}@{PAGE_MODEL_REVISION[:8]}…")
|
print(f"embed.py: loading {PAGE_MODEL_NAME}@{PAGE_MODEL_REVISION[:8]}…")
|
||||||
page_model = SentenceTransformer(
|
page_model = SentenceTransformer(
|
||||||
PAGE_MODEL_NAME, revision=PAGE_MODEL_REVISION, trust_remote_code=True,
|
PAGE_MODEL_NAME, revision=PAGE_MODEL_REVISION, trust_remote_code=True,
|
||||||
|
# code_revision pins the auto_map modeling repo; it must reach
|
||||||
|
# both AutoConfig and AutoModel.from_pretrained.
|
||||||
|
model_kwargs={"code_revision": PAGE_MODEL_CODE_REVISION},
|
||||||
|
config_kwargs={"code_revision": PAGE_MODEL_CODE_REVISION},
|
||||||
)
|
)
|
||||||
new_vecs = page_model.encode(
|
new_vecs = page_model.encode(
|
||||||
[page_inputs[i] for i in miss_idxs],
|
[page_inputs[i] for i in miss_idxs],
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue