Last audit stragglers: scaffolder, refreeze safety, atomic-write polish

- add-popup-source.sh: slug validated against ^[a-z0-9-]+$ before nginx
  interpolation; UPSTREAM_HOST derived unconditionally so the CSP
  reminder fires in the no-proxy case — which is exactly when the host
  must be added to connect-src (AUDIT §4.8)
- refreeze.sh: backs up the freeze and restores it on a failed resolve
  instead of leaving the repo with no freeze file (§4.9)
- einops gets the policy-mandated upper bound and a comment naming its
  consumer (nomic's remote modeling code) (§1.5)
- Makefile: pdftoppm failures warn instead of vanishing in the while
  pipeline; .NOTPARALLEL guards deploy's clean->build->sign ordering
  against -j invocations (§8.4)
- Atomic writers (embed, archive, the three sidecar extractors):
  PID-unique temp names so concurrent runs can't interleave, cleanup on
  failure everywhere, fsync where the artifact is not trivially
  regenerable (§4.10)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Levi Neuwirth 2026-06-10 11:43:14 -04:00
parent 23bc2d0dc1
commit 5d344f940e
10 changed files with 113 additions and 36 deletions

View File

@ -1,5 +1,10 @@
.PHONY: build deploy sign download-model download-pdfjs download-leaflet compress-assets convert-images pdf-thumbs pdfs watch clean dev audit-marks archive-gc archive-wayback archive-check .PHONY: build deploy sign download-model download-pdfjs download-leaflet compress-assets convert-images pdf-thumbs pdfs watch clean dev audit-marks archive-gc archive-wayback archive-check
# deploy's prerequisite order (clean -> build -> sign) is only correct
# serially; under `make -j` they could interleave. This build has no
# intra-target parallelism worth preserving, so disable it outright.
.NOTPARALLEL:
# Source .env for deploy / GitHub config if it exists. # Source .env for deploy / GitHub config if it exists.
# .env format: KEY=value (one per line, no `export` prefix, no quotes needed). # .env format: KEY=value (one per line, no `export` prefix, no quotes needed).
# Only the variables explicitly listed below are exported to recipe # Only the variables explicitly listed below are exported to recipe
@ -114,12 +119,16 @@ convert-images:
# Thumbnails are written as static/papers/foo.thumb.png alongside each PDF. # Thumbnails are written as static/papers/foo.thumb.png alongside each PDF.
# Skipped silently when pdftoppm is not installed or static/papers/ is empty. # Skipped silently when pdftoppm is not installed or static/papers/ is empty.
pdf-thumbs: pdf-thumbs:
# A failing pdftoppm must at least warn: the `find | while` pipeline's
# exit status is the last iteration's, so without the `||` a corrupt
# PDF would silently ship without a thumbnail.
@if command -v pdftoppm >/dev/null 2>&1; then \ @if command -v pdftoppm >/dev/null 2>&1; then \
find static/papers -name '*.pdf' 2>/dev/null | while read pdf; do \ find static/papers -name '*.pdf' 2>/dev/null | while read pdf; do \
thumb="$${pdf%.pdf}.thumb"; \ thumb="$${pdf%.pdf}.thumb"; \
if [ ! -f "$${thumb}.png" ] || [ "$$pdf" -nt "$${thumb}.png" ]; then \ if [ ! -f "$${thumb}.png" ] || [ "$$pdf" -nt "$${thumb}.png" ]; then \
echo " pdf-thumb $$pdf"; \ echo " pdf-thumb $$pdf"; \
pdftoppm -r 100 -f 1 -l 1 -png -singlefile "$$pdf" "$$thumb"; \ pdftoppm -r 100 -f 1 -l 1 -png -singlefile "$$pdf" "$$thumb" \
|| echo "Warning: pdf-thumb failed for $$pdf (page ships without a thumbnail)" >&2; \
fi; \ fi; \
done; \ done; \
else \ else \

View File

@ -24,7 +24,10 @@ dependencies = [
"pillow>=10.0,<12", "pillow>=10.0,<12",
"colorthief>=0.2,<1", "colorthief>=0.2,<1",
"pyyaml>=6.0,<7", "pyyaml>=6.0,<7",
"einops>=0.8.2", # Not imported by this repo: required at runtime by nomic-embed's
# remote modeling code (nomic-bert-2048, loaded by embed.py's page
# pass under trust_remote_code with a pinned code_revision).
"einops>=0.8.2,<1",
] ]
[[tool.uv.index]] [[tool.uv.index]]

View File

@ -49,6 +49,10 @@ EOF
bold "── new popup provider ──" bold "── new popup provider ──"
NAME=$(prompt "slug (lowercase, used as class + data-popup-source key, e.g. 'zenodo'):") NAME=$(prompt "slug (lowercase, used as class + data-popup-source key, e.g. 'zenodo'):")
[[ -z "$NAME" ]] && { warn "slug required"; exit 1; } [[ -z "$NAME" ]] && { warn "slug required"; exit 1; }
# The slug is interpolated into nginx directives (location /proxy/$NAME/,
# set \$upstream_$NAME) — validate like import-photo.sh does so a space,
# ';', or '{' can't produce a config that fails to load.
[[ "$NAME" =~ ^[a-z0-9-]+$ ]] || { warn "slug must match ^[a-z0-9-]+\$"; exit 1; }
LABEL=$(prompt "display label (e.g. 'Zenodo'):") LABEL=$(prompt "display label (e.g. 'Zenodo'):")
[[ -z "$LABEL" ]] && LABEL="$NAME" [[ -z "$LABEL" ]] && LABEL="$NAME"
@ -107,14 +111,16 @@ fi
# ── proxy prefix + upstream host derivation ────────────────────────── # ── proxy prefix + upstream host derivation ──────────────────────────
# UPSTREAM_HOST is derived unconditionally: the no-proxy (direct CORS
# fetch) case is exactly when the host must be added to connect-src, so
# the checklist's CSP reminder below needs it populated either way.
UPSTREAM_HOST=$(printf '%s' "$API_URL" | awk -F/ '{print $3}')
if [[ "$NEEDS_PROXY" -eq 1 ]]; then if [[ "$NEEDS_PROXY" -eq 1 ]]; then
UPSTREAM_HOST=$(printf '%s' "$API_URL" | awk -F/ '{print $3}')
UPSTREAM_PATH=$(printf '%s' "$API_URL" | awk -F/ 'BEGIN{OFS="/"} {$1=""; $2=""; $3=""; print}' | sed 's|^///||') UPSTREAM_PATH=$(printf '%s' "$API_URL" | awk -F/ 'BEGIN{OFS="/"} {$1=""; $2=""; $3=""; print}' | sed 's|^///||')
PROXY_PATH="/proxy/$NAME/" PROXY_PATH="/proxy/$NAME/"
PROXY_API_URL="$PROXY_PATH${UPSTREAM_PATH%%\?*}" PROXY_API_URL="$PROXY_PATH${UPSTREAM_PATH%%\?*}"
[[ "$API_URL" == *"?"* ]] && PROXY_API_URL="$PROXY_API_URL?${API_URL#*\?}" [[ "$API_URL" == *"?"* ]] && PROXY_API_URL="$PROXY_API_URL?${API_URL#*\?}"
else else
UPSTREAM_HOST=""
PROXY_API_URL="$API_URL" PROXY_API_URL="$API_URL"
fi fi
@ -205,8 +211,9 @@ cat <<EOF
EOF EOF
if [[ "$NEEDS_PROXY" -eq 0 && -n "$UPSTREAM_HOST" ]]; then if [[ "$NEEDS_PROXY" -eq 0 && -n "$UPSTREAM_HOST" ]]; then
echo " 5. In static/js/popups.js top-comment: add $UPSTREAM_HOST to the" echo " 5. Add https://$UPSTREAM_HOST to connect-src in"
echo " connect-src CSP list." echo " nginx/security-headers.conf (direct CORS fetches are blocked"
echo " by CSP otherwise), and mirror it in the popups.js top-comment."
fi fi
echo echo

View File

@ -105,14 +105,18 @@ def err(msg: str) -> None:
def atomic_write_text(path: Path, text: str) -> None: def atomic_write_text(path: Path, text: str) -> None:
"""Write to path.tmp then os.replace. PROVENANCE.json and the """Write to a PID-unique temp then os.replace. PROVENANCE.json and
generated index/state files are integrity records; an interrupt the generated index/state files are integrity records; an interrupt
mid-write must never leave a truncated file that the next run mid-write must never leave a truncated file that the next run parses
parses (or mistakes for corruption).""" (or mistakes for corruption); fsync makes the contents durable before the rename and the
PID suffix keeps concurrent runs from sharing a temp file."""
path.parent.mkdir(parents=True, exist_ok=True) path.parent.mkdir(parents=True, exist_ok=True)
tmp = path.with_suffix(path.suffix + ".tmp") tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
try: try:
tmp.write_text(text, encoding="utf-8") with tmp.open("w", encoding="utf-8") as f:
f.write(text)
f.flush()
os.fsync(f.fileno())
os.replace(tmp, path) os.replace(tmp, path)
except BaseException: except BaseException:
tmp.unlink(missing_ok=True) tmp.unlink(missing_ok=True)

View File

@ -120,12 +120,21 @@ PORTAL_BODY_ATTR = "data-portal"
def atomic_write_bytes(path: Path, data: bytes) -> None: def atomic_write_bytes(path: Path, data: bytes) -> None:
"""Write to path.tmp then os.replace, so an interrupt mid-write """Write to a PID-unique temp then os.replace: an interrupt mid-write
cannot leave a truncated file that the next build/serve loads.""" cannot leave a truncated file at the final path, fsync makes the
contents durable before the rename, and the PID suffix keeps two
concurrent runs from interleaving writes into one temp file."""
path.parent.mkdir(parents=True, exist_ok=True) path.parent.mkdir(parents=True, exist_ok=True)
tmp = path.with_suffix(path.suffix + ".tmp") tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
tmp.write_bytes(data) try:
with tmp.open("wb") as f:
f.write(data)
f.flush()
os.fsync(f.fileno())
os.replace(tmp, path) os.replace(tmp, path)
except BaseException:
tmp.unlink(missing_ok=True)
raise
def atomic_write_text(path: Path, text: str) -> None: def atomic_write_text(path: Path, text: str) -> None:
@ -186,7 +195,9 @@ def save_vec_cache(path: Path, model: str, revision: str, dim: int,
path.parent.mkdir(parents=True, exist_ok=True) path.parent.mkdir(parents=True, exist_ok=True)
# Pass an open file handle, not a path: np.savez_compressed appends # Pass an open file handle, not a path: np.savez_compressed appends
# ".npz" to bare paths, which would mangle our atomic-rename target. # ".npz" to bare paths, which would mangle our atomic-rename target.
tmp = path.with_suffix(path.suffix + ".tmp") # PID-unique temp so concurrent runs can't interleave; fsync so the
# data is on disk before the rename.
tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
try: try:
with open(tmp, "wb") as f: with open(tmp, "wb") as f:
np.savez_compressed( np.savez_compressed(
@ -197,6 +208,8 @@ def save_vec_cache(path: Path, model: str, revision: str, dim: int,
hashes=hashes, hashes=hashes,
vectors=vectors, vectors=vectors,
) )
f.flush()
os.fsync(f.fileno())
os.replace(tmp, path) os.replace(tmp, path)
except BaseException: except BaseException:
tmp.unlink(missing_ok=True) tmp.unlink(missing_ok=True)

View File

@ -31,6 +31,7 @@ images are logged and the rest of the walk continues.
from __future__ import annotations from __future__ import annotations
import os
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@ -62,13 +63,20 @@ def _is_stale(image: Path, sidecar: Path) -> bool:
def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None: def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None:
tmp = path.with_suffix(path.suffix + ".tmp") # PID-unique temp (concurrent runs can't share it), removed on
# failure. No fsync: sidecars are regenerated from the photo on the
# next build, so a lost rename costs one re-extraction, not data.
tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
try:
with tmp.open("w", encoding="utf-8") as f: with tmp.open("w", encoding="utf-8") as f:
# Preserve a stable key order (width before height) so a manual # Preserve a stable key order (width before height) so a manual
# diff stays easy to read across regenerations. # diff stays easy to read across regenerations.
ordered = {k: data[k] for k in ("width", "height") if k in data} ordered = {k: data[k] for k in ("width", "height") if k in data}
yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True) yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True)
tmp.replace(path) tmp.replace(path)
except BaseException:
tmp.unlink(missing_ok=True)
raise
def _read_dimensions(image: Path) -> dict[str, int]: def _read_dimensions(image: Path) -> dict[str, int]:

View File

@ -381,12 +381,19 @@ def _is_stale(image: Path, sidecar: Path) -> bool:
def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None: def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None:
tmp = path.with_suffix(path.suffix + ".tmp") # PID-unique temp (concurrent runs can't share it), removed on
# failure. No fsync: sidecars are regenerated from the photo on the
# next build, so a lost rename costs one re-extraction, not data.
tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
try:
with tmp.open("w", encoding="utf-8") as f: with tmp.open("w", encoding="utf-8") as f:
# Preserve the SIDECAR_KEYS order so a manual diff is easy to read. # Preserve the SIDECAR_KEYS order so a manual diff is easy to read.
ordered = {k: data[k] for k in SIDECAR_KEYS if k in data} ordered = {k: data[k] for k in SIDECAR_KEYS if k in data}
yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True) yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True)
tmp.replace(path) tmp.replace(path)
except BaseException:
tmp.unlink(missing_ok=True)
raise
def _read_one(image: Path) -> dict[str, Any]: def _read_one(image: Path) -> dict[str, Any]:

View File

@ -23,6 +23,7 @@ a palette extraction error.
from __future__ import annotations from __future__ import annotations
import os
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@ -62,10 +63,17 @@ def _is_stale(image: Path, sidecar: Path) -> bool:
def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None: def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None:
tmp = path.with_suffix(path.suffix + ".tmp") # PID-unique temp (concurrent runs can't share it), removed on
# failure. No fsync: sidecars are regenerated from the photo on the
# next build, so a lost rename costs one re-extraction, not data.
tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
try:
with tmp.open("w", encoding="utf-8") as f: with tmp.open("w", encoding="utf-8") as f:
yaml.safe_dump(data, f, sort_keys=False, allow_unicode=True) yaml.safe_dump(data, f, sort_keys=False, allow_unicode=True)
tmp.replace(path) tmp.replace(path)
except BaseException:
tmp.unlink(missing_ok=True)
raise
def _extract_palette(image: Path) -> list[str]: def _extract_palette(image: Path) -> list[str]:

View File

@ -8,11 +8,29 @@ FREEZE="$REPO_ROOT/cabal.project.freeze"
cd "$REPO_ROOT" cd "$REPO_ROOT"
# Back up the current freeze and restore it if resolution fails or is
# interrupted, so neither an unsolvable index nor a Ctrl-C during the
# solve leaves the repo with no freeze file at all (recoverable via git,
# but the script shouldn't depend on that).
BACKUP=""
if [ -f "$FREEZE" ]; then
    BACKUP="$(mktemp "$FREEZE.bak.XXXXXX")"
    cp "$FREEZE" "$BACKUP"
fi

#######################################
# Put the backed-up freeze file back in place.
# Globals:   BACKUP (read), FREEZE (read)
# Outputs:   one status line on stderr when a restore actually happens
# Returns:   0 when there is nothing to restore, else mv's status
# Idempotent: once the backup has been moved back (or removed after a
# successful freeze) further calls do nothing, so it is safe to run from
# more than one trap.
#######################################
restore_on_failure() {
    if [ -n "$BACKUP" ] && [ -f "$BACKUP" ]; then
        echo "==> Refreeze failed — restoring previous freeze file." >&2
        # -f: never stop to prompt from inside a trap handler.
        mv -f "$BACKUP" "$FREEZE"
    fi
}
trap restore_on_failure ERR
# ERR does not fire when the script is killed by a signal, so an
# interrupted resolve would otherwise leave no freeze file and a stray
# backup. Restore, then exit with the conventional 128+signal status.
# These two traps stay installed for the rest of the script; that is
# harmless because the restore is a no-op once the backup is gone.
trap 'restore_on_failure; exit 130' INT
trap 'restore_on_failure; exit 143' TERM
echo "==> Removing stale freeze file..." echo "==> Removing stale freeze file..."
rm -f "$FREEZE" rm -f "$FREEZE"
echo "==> Resolving dependencies and writing new freeze file..." echo "==> Resolving dependencies and writing new freeze file..."
cabal freeze cabal freeze
trap - ERR
[ -n "$BACKUP" ] && rm -f "$BACKUP"
echo "==> Verifying build..." echo "==> Verifying build..."
cabal build cabal build

View File

@ -389,7 +389,7 @@ requires-dist = [
{ name = "altair", specifier = ">=5.4,<6" }, { name = "altair", specifier = ">=5.4,<6" },
{ name = "beautifulsoup4", specifier = ">=4.12,<5" }, { name = "beautifulsoup4", specifier = ">=4.12,<5" },
{ name = "colorthief", specifier = ">=0.2,<1" }, { name = "colorthief", specifier = ">=0.2,<1" },
{ name = "einops", specifier = ">=0.8.2" }, { name = "einops", specifier = ">=0.8.2,<1" },
{ name = "faiss-cpu", specifier = ">=1.9,<2" }, { name = "faiss-cpu", specifier = ">=1.9,<2" },
{ name = "matplotlib", specifier = ">=3.9,<4" }, { name = "matplotlib", specifier = ">=3.9,<4" },
{ name = "numpy", specifier = ">=2.0,<3" }, { name = "numpy", specifier = ">=2.0,<3" },