Last audit stragglers: scaffolder, refreeze safety, atomic-write polish
- add-popup-source.sh: slug validated against ^[a-z0-9-]+$ before nginx interpolation; UPSTREAM_HOST derived unconditionally so the CSP reminder fires in the no-proxy case — which is exactly when the host must be added to connect-src (AUDIT §4.8)
- refreeze.sh: backs up the freeze and restores it on a failed resolve instead of leaving the repo with no freeze file (§4.9)
- einops gets the policy-mandated upper bound and a comment naming its consumer (nomic's remote modeling code) (§1.5)
- Makefile: pdftoppm failures warn instead of vanishing in the while pipeline; .NOTPARALLEL guards deploy's clean->build->sign ordering against -j invocations (§8.4)
- Atomic writers (embed, archive, the three sidecar extractors): PID-unique temp names so concurrent runs can't interleave, cleanup on failure everywhere, fsync where the artifact is not trivially regenerable (§4.10)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
23bc2d0dc1
commit
5d344f940e
11
Makefile
11
Makefile
|
|
@ -1,5 +1,10 @@
|
||||||
.PHONY: build deploy sign download-model download-pdfjs download-leaflet compress-assets convert-images pdf-thumbs pdfs watch clean dev audit-marks archive-gc archive-wayback archive-check
|
.PHONY: build deploy sign download-model download-pdfjs download-leaflet compress-assets convert-images pdf-thumbs pdfs watch clean dev audit-marks archive-gc archive-wayback archive-check
|
||||||
|
|
||||||
|
# deploy's prerequisite order (clean -> build -> sign) is only correct
|
||||||
|
# serially; under `make -j` they could interleave. This build has no
|
||||||
|
# intra-target parallelism worth preserving, so disable it outright.
|
||||||
|
.NOTPARALLEL:
|
||||||
|
|
||||||
# Source .env for deploy / GitHub config if it exists.
|
# Source .env for deploy / GitHub config if it exists.
|
||||||
# .env format: KEY=value (one per line, no `export` prefix, no quotes needed).
|
# .env format: KEY=value (one per line, no `export` prefix, no quotes needed).
|
||||||
# Only the variables explicitly listed below are exported to recipe
|
# Only the variables explicitly listed below are exported to recipe
|
||||||
|
|
@ -114,12 +119,16 @@ convert-images:
|
||||||
# Thumbnails are written as static/papers/foo.thumb.png alongside each PDF.
|
# Thumbnails are written as static/papers/foo.thumb.png alongside each PDF.
|
||||||
# Skipped silently when pdftoppm is not installed or static/papers/ is empty.
|
# Skipped silently when pdftoppm is not installed or static/papers/ is empty.
|
||||||
pdf-thumbs:
|
pdf-thumbs:
|
||||||
|
# A failing pdftoppm must at least warn: the `find | while` pipeline's
|
||||||
|
# exit status is the last iteration's, so without the `||` a corrupt
|
||||||
|
# PDF would silently ship without a thumbnail.
|
||||||
@if command -v pdftoppm >/dev/null 2>&1; then \
|
@if command -v pdftoppm >/dev/null 2>&1; then \
|
||||||
find static/papers -name '*.pdf' 2>/dev/null | while read pdf; do \
|
find static/papers -name '*.pdf' 2>/dev/null | while read pdf; do \
|
||||||
thumb="$${pdf%.pdf}.thumb"; \
|
thumb="$${pdf%.pdf}.thumb"; \
|
||||||
if [ ! -f "$${thumb}.png" ] || [ "$$pdf" -nt "$${thumb}.png" ]; then \
|
if [ ! -f "$${thumb}.png" ] || [ "$$pdf" -nt "$${thumb}.png" ]; then \
|
||||||
echo " pdf-thumb $$pdf"; \
|
echo " pdf-thumb $$pdf"; \
|
||||||
pdftoppm -r 100 -f 1 -l 1 -png -singlefile "$$pdf" "$$thumb"; \
|
pdftoppm -r 100 -f 1 -l 1 -png -singlefile "$$pdf" "$$thumb" \
|
||||||
|
|| echo "Warning: pdf-thumb failed for $$pdf (page ships without a thumbnail)" >&2; \
|
||||||
fi; \
|
fi; \
|
||||||
done; \
|
done; \
|
||||||
else \
|
else \
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,10 @@ dependencies = [
|
||||||
"pillow>=10.0,<12",
|
"pillow>=10.0,<12",
|
||||||
"colorthief>=0.2,<1",
|
"colorthief>=0.2,<1",
|
||||||
"pyyaml>=6.0,<7",
|
"pyyaml>=6.0,<7",
|
||||||
"einops>=0.8.2",
|
# Not imported by this repo: required at runtime by nomic-embed's
|
||||||
|
# remote modeling code (nomic-bert-2048, loaded by embed.py's page
|
||||||
|
# pass under trust_remote_code with a pinned code_revision).
|
||||||
|
"einops>=0.8.2,<1",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[tool.uv.index]]
|
[[tool.uv.index]]
|
||||||
|
|
|
||||||
|
|
@ -49,6 +49,10 @@ EOF
|
||||||
bold "── new popup provider ──"
|
bold "── new popup provider ──"
|
||||||
NAME=$(prompt "slug (lowercase, used as class + data-popup-source key, e.g. 'zenodo'):")
|
NAME=$(prompt "slug (lowercase, used as class + data-popup-source key, e.g. 'zenodo'):")
|
||||||
[[ -z "$NAME" ]] && { warn "slug required"; exit 1; }
|
[[ -z "$NAME" ]] && { warn "slug required"; exit 1; }
|
||||||
|
# The slug is interpolated into nginx directives (location /proxy/$NAME/,
|
||||||
|
# set \$upstream_$NAME) — validate like import-photo.sh does so a space,
|
||||||
|
# ';', or '{' can't produce a config that fails to load.
|
||||||
|
[[ "$NAME" =~ ^[a-z0-9-]+$ ]] || { warn "slug must match ^[a-z0-9-]+\$"; exit 1; }
|
||||||
|
|
||||||
LABEL=$(prompt "display label (e.g. 'Zenodo'):")
|
LABEL=$(prompt "display label (e.g. 'Zenodo'):")
|
||||||
[[ -z "$LABEL" ]] && LABEL="$NAME"
|
[[ -z "$LABEL" ]] && LABEL="$NAME"
|
||||||
|
|
@ -107,14 +111,16 @@ fi
|
||||||
|
|
||||||
# ── proxy prefix + upstream host derivation ──────────────────────────
|
# ── proxy prefix + upstream host derivation ──────────────────────────
|
||||||
|
|
||||||
|
# UPSTREAM_HOST is derived unconditionally: the no-proxy (direct CORS
|
||||||
|
# fetch) case is exactly when the host must be added to connect-src, so
|
||||||
|
# the checklist's CSP reminder below needs it populated either way.
|
||||||
|
UPSTREAM_HOST=$(printf '%s' "$API_URL" | awk -F/ '{print $3}')
|
||||||
if [[ "$NEEDS_PROXY" -eq 1 ]]; then
|
if [[ "$NEEDS_PROXY" -eq 1 ]]; then
|
||||||
UPSTREAM_HOST=$(printf '%s' "$API_URL" | awk -F/ '{print $3}')
|
|
||||||
UPSTREAM_PATH=$(printf '%s' "$API_URL" | awk -F/ 'BEGIN{OFS="/"} {$1=""; $2=""; $3=""; print}' | sed 's|^///||')
|
UPSTREAM_PATH=$(printf '%s' "$API_URL" | awk -F/ 'BEGIN{OFS="/"} {$1=""; $2=""; $3=""; print}' | sed 's|^///||')
|
||||||
PROXY_PATH="/proxy/$NAME/"
|
PROXY_PATH="/proxy/$NAME/"
|
||||||
PROXY_API_URL="$PROXY_PATH${UPSTREAM_PATH%%\?*}"
|
PROXY_API_URL="$PROXY_PATH${UPSTREAM_PATH%%\?*}"
|
||||||
[[ "$API_URL" == *"?"* ]] && PROXY_API_URL="$PROXY_API_URL?${API_URL#*\?}"
|
[[ "$API_URL" == *"?"* ]] && PROXY_API_URL="$PROXY_API_URL?${API_URL#*\?}"
|
||||||
else
|
else
|
||||||
UPSTREAM_HOST=""
|
|
||||||
PROXY_API_URL="$API_URL"
|
PROXY_API_URL="$API_URL"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
@ -205,8 +211,9 @@ cat <<EOF
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
if [[ "$NEEDS_PROXY" -eq 0 && -n "$UPSTREAM_HOST" ]]; then
|
if [[ "$NEEDS_PROXY" -eq 0 && -n "$UPSTREAM_HOST" ]]; then
|
||||||
echo " 5. In static/js/popups.js top-comment: add $UPSTREAM_HOST to the"
|
echo " 5. Add https://$UPSTREAM_HOST to connect-src in"
|
||||||
echo " connect-src CSP list."
|
echo " nginx/security-headers.conf (direct CORS fetches are blocked"
|
||||||
|
echo " by CSP otherwise), and mirror it in the popups.js top-comment."
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo
|
echo
|
||||||
|
|
|
||||||
|
|
@ -105,14 +105,18 @@ def err(msg: str) -> None:
|
||||||
|
|
||||||
|
|
||||||
def atomic_write_text(path: Path, text: str) -> None:
|
def atomic_write_text(path: Path, text: str) -> None:
|
||||||
"""Write to path.tmp then os.replace. PROVENANCE.json and the
|
"""Write to a PID-unique temp then os.replace. PROVENANCE.json and
|
||||||
generated index/state files are integrity records — an interrupt
|
the generated index/state files are integrity records — an interrupt
|
||||||
mid-write must never leave a truncated file that the next run
|
mid-write must never leave a truncated file that the next run parses
|
||||||
parses (or mistakes for corruption)."""
|
(or mistakes for corruption); fsync puts the data on disk before the rename and the
|
||||||
|
PID suffix keeps concurrent runs from sharing a temp file."""
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
tmp = path.with_suffix(path.suffix + ".tmp")
|
tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
|
||||||
try:
|
try:
|
||||||
tmp.write_text(text, encoding="utf-8")
|
with tmp.open("w", encoding="utf-8") as f:
|
||||||
|
f.write(text)
|
||||||
|
f.flush()
|
||||||
|
os.fsync(f.fileno())
|
||||||
os.replace(tmp, path)
|
os.replace(tmp, path)
|
||||||
except BaseException:
|
except BaseException:
|
||||||
tmp.unlink(missing_ok=True)
|
tmp.unlink(missing_ok=True)
|
||||||
|
|
|
||||||
|
|
@ -120,12 +120,21 @@ PORTAL_BODY_ATTR = "data-portal"
|
||||||
|
|
||||||
|
|
||||||
def atomic_write_bytes(path: Path, data: bytes) -> None:
|
def atomic_write_bytes(path: Path, data: bytes) -> None:
|
||||||
"""Write to path.tmp then os.replace, so an interrupt mid-write
|
"""Write to a PID-unique temp then os.replace: an interrupt mid-write
|
||||||
cannot leave a truncated file that the next build/serve loads."""
|
cannot leave a truncated file at the final path, fsync makes the data behind the
|
||||||
|
rename durable across power loss, and the PID suffix keeps two
|
||||||
|
concurrent runs from interleaving writes into one temp file."""
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
tmp = path.with_suffix(path.suffix + ".tmp")
|
tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
|
||||||
tmp.write_bytes(data)
|
try:
|
||||||
|
with tmp.open("wb") as f:
|
||||||
|
f.write(data)
|
||||||
|
f.flush()
|
||||||
|
os.fsync(f.fileno())
|
||||||
os.replace(tmp, path)
|
os.replace(tmp, path)
|
||||||
|
except BaseException:
|
||||||
|
tmp.unlink(missing_ok=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
def atomic_write_text(path: Path, text: str) -> None:
|
def atomic_write_text(path: Path, text: str) -> None:
|
||||||
|
|
@ -186,7 +195,9 @@ def save_vec_cache(path: Path, model: str, revision: str, dim: int,
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
# Pass an open file handle, not a path: np.savez_compressed appends
|
# Pass an open file handle, not a path: np.savez_compressed appends
|
||||||
# ".npz" to bare paths, which would mangle our atomic-rename target.
|
# ".npz" to bare paths, which would mangle our atomic-rename target.
|
||||||
tmp = path.with_suffix(path.suffix + ".tmp")
|
# PID-unique temp so concurrent runs can't interleave; fsync so the
|
||||||
|
# renamed file's contents are durable.
|
||||||
|
tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
|
||||||
try:
|
try:
|
||||||
with open(tmp, "wb") as f:
|
with open(tmp, "wb") as f:
|
||||||
np.savez_compressed(
|
np.savez_compressed(
|
||||||
|
|
@ -197,6 +208,8 @@ def save_vec_cache(path: Path, model: str, revision: str, dim: int,
|
||||||
hashes=hashes,
|
hashes=hashes,
|
||||||
vectors=vectors,
|
vectors=vectors,
|
||||||
)
|
)
|
||||||
|
f.flush()
|
||||||
|
os.fsync(f.fileno())
|
||||||
os.replace(tmp, path)
|
os.replace(tmp, path)
|
||||||
except BaseException:
|
except BaseException:
|
||||||
tmp.unlink(missing_ok=True)
|
tmp.unlink(missing_ok=True)
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,7 @@ images are logged and the rest of the walk continues.
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
@ -62,13 +63,20 @@ def _is_stale(image: Path, sidecar: Path) -> bool:
|
||||||
|
|
||||||
|
|
||||||
def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None:
|
def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None:
|
||||||
tmp = path.with_suffix(path.suffix + ".tmp")
|
# PID-unique temp (concurrent runs can't share it), removed on
|
||||||
|
# failure. No fsync: sidecars are regenerated from the photo on the
|
||||||
|
# next build, so a lost rename costs one re-extraction, not data.
|
||||||
|
tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
|
||||||
|
try:
|
||||||
with tmp.open("w", encoding="utf-8") as f:
|
with tmp.open("w", encoding="utf-8") as f:
|
||||||
# Preserve a stable key order (width before height) so a manual
|
# Preserve a stable key order (width before height) so a manual
|
||||||
# diff stays easy to read across regenerations.
|
# diff stays easy to read across regenerations.
|
||||||
ordered = {k: data[k] for k in ("width", "height") if k in data}
|
ordered = {k: data[k] for k in ("width", "height") if k in data}
|
||||||
yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True)
|
yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True)
|
||||||
tmp.replace(path)
|
tmp.replace(path)
|
||||||
|
except BaseException:
|
||||||
|
tmp.unlink(missing_ok=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
def _read_dimensions(image: Path) -> dict[str, int]:
|
def _read_dimensions(image: Path) -> dict[str, int]:
|
||||||
|
|
|
||||||
|
|
@ -381,12 +381,19 @@ def _is_stale(image: Path, sidecar: Path) -> bool:
|
||||||
|
|
||||||
|
|
||||||
def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None:
|
def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None:
|
||||||
tmp = path.with_suffix(path.suffix + ".tmp")
|
# PID-unique temp (concurrent runs can't share it), removed on
|
||||||
|
# failure. No fsync: sidecars are regenerated from the photo on the
|
||||||
|
# next build, so a lost rename costs one re-extraction, not data.
|
||||||
|
tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
|
||||||
|
try:
|
||||||
with tmp.open("w", encoding="utf-8") as f:
|
with tmp.open("w", encoding="utf-8") as f:
|
||||||
# Preserve the SIDECAR_KEYS order so a manual diff is easy to read.
|
# Preserve the SIDECAR_KEYS order so a manual diff is easy to read.
|
||||||
ordered = {k: data[k] for k in SIDECAR_KEYS if k in data}
|
ordered = {k: data[k] for k in SIDECAR_KEYS if k in data}
|
||||||
yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True)
|
yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True)
|
||||||
tmp.replace(path)
|
tmp.replace(path)
|
||||||
|
except BaseException:
|
||||||
|
tmp.unlink(missing_ok=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
def _read_one(image: Path) -> dict[str, Any]:
|
def _read_one(image: Path) -> dict[str, Any]:
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ a palette extraction error.
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
@ -62,10 +63,17 @@ def _is_stale(image: Path, sidecar: Path) -> bool:
|
||||||
|
|
||||||
|
|
||||||
def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None:
|
def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None:
|
||||||
tmp = path.with_suffix(path.suffix + ".tmp")
|
# PID-unique temp (concurrent runs can't share it), removed on
|
||||||
|
# failure. No fsync: sidecars are regenerated from the photo on the
|
||||||
|
# next build, so a lost rename costs one re-extraction, not data.
|
||||||
|
tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
|
||||||
|
try:
|
||||||
with tmp.open("w", encoding="utf-8") as f:
|
with tmp.open("w", encoding="utf-8") as f:
|
||||||
yaml.safe_dump(data, f, sort_keys=False, allow_unicode=True)
|
yaml.safe_dump(data, f, sort_keys=False, allow_unicode=True)
|
||||||
tmp.replace(path)
|
tmp.replace(path)
|
||||||
|
except BaseException:
|
||||||
|
tmp.unlink(missing_ok=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
def _extract_palette(image: Path) -> list[str]:
|
def _extract_palette(image: Path) -> list[str]:
|
||||||
|
|
|
||||||
|
|
@ -8,11 +8,29 @@ FREEZE="$REPO_ROOT/cabal.project.freeze"
|
||||||
|
|
||||||
cd "$REPO_ROOT"
|
cd "$REPO_ROOT"
|
||||||
|
|
||||||
|
# Back up the current freeze and restore it if resolution fails, so an
|
||||||
|
# unsolvable index never leaves the repo with no freeze file at all
|
||||||
|
# (recoverable via git, but the script shouldn't depend on that).
|
||||||
|
BACKUP=""
|
||||||
|
if [ -f "$FREEZE" ]; then
|
||||||
|
BACKUP="$(mktemp "$FREEZE.bak.XXXXXX")"
|
||||||
|
cp "$FREEZE" "$BACKUP"
|
||||||
|
fi
|
||||||
|
restore_on_failure() {
|
||||||
|
if [ -n "$BACKUP" ]; then
|
||||||
|
echo "==> Refreeze failed — restoring previous freeze file." >&2
|
||||||
|
mv "$BACKUP" "$FREEZE"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
trap restore_on_failure ERR
|
||||||
|
|
||||||
echo "==> Removing stale freeze file..."
|
echo "==> Removing stale freeze file..."
|
||||||
rm -f "$FREEZE"
|
rm -f "$FREEZE"
|
||||||
|
|
||||||
echo "==> Resolving dependencies and writing new freeze file..."
|
echo "==> Resolving dependencies and writing new freeze file..."
|
||||||
cabal freeze
|
cabal freeze
|
||||||
|
trap - ERR
|
||||||
|
[ -n "$BACKUP" ] && rm -f "$BACKUP"
|
||||||
|
|
||||||
echo "==> Verifying build..."
|
echo "==> Verifying build..."
|
||||||
cabal build
|
cabal build
|
||||||
|
|
|
||||||
2
uv.lock
2
uv.lock
|
|
@ -389,7 +389,7 @@ requires-dist = [
|
||||||
{ name = "altair", specifier = ">=5.4,<6" },
|
{ name = "altair", specifier = ">=5.4,<6" },
|
||||||
{ name = "beautifulsoup4", specifier = ">=4.12,<5" },
|
{ name = "beautifulsoup4", specifier = ">=4.12,<5" },
|
||||||
{ name = "colorthief", specifier = ">=0.2,<1" },
|
{ name = "colorthief", specifier = ">=0.2,<1" },
|
||||||
{ name = "einops", specifier = ">=0.8.2" },
|
{ name = "einops", specifier = ">=0.8.2,<1" },
|
||||||
{ name = "faiss-cpu", specifier = ">=1.9,<2" },
|
{ name = "faiss-cpu", specifier = ">=1.9,<2" },
|
||||||
{ name = "matplotlib", specifier = ">=3.9,<4" },
|
{ name = "matplotlib", specifier = ">=3.9,<4" },
|
||||||
{ name = "numpy", specifier = ">=2.0,<3" },
|
{ name = "numpy", specifier = ">=2.0,<3" },
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue