diff --git a/Makefile b/Makefile index 2863eb5..6283308 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,10 @@ .PHONY: build deploy sign download-model download-pdfjs download-leaflet compress-assets convert-images pdf-thumbs pdfs watch clean dev audit-marks archive-gc archive-wayback archive-check +# deploy's prerequisite order (clean -> build -> sign) is only correct +# serially; under `make -j` they could interleave. This build has no +# intra-target parallelism worth preserving, so disable it outright. +.NOTPARALLEL: + # Source .env for deploy / GitHub config if it exists. # .env format: KEY=value (one per line, no `export` prefix, no quotes needed). # Only the variables explicitly listed below are exported to recipe @@ -114,12 +119,16 @@ convert-images: # Thumbnails are written as static/papers/foo.thumb.png alongside each PDF. # Skipped silently when pdftoppm is not installed or static/papers/ is empty. pdf-thumbs: + # A failing pdftoppm must at least warn: the `find | while` pipeline's + # exit status is the last iteration's, so without the `||` a corrupt + # PDF would silently ship without a thumbnail. @if command -v pdftoppm >/dev/null 2>&1; then \ find static/papers -name '*.pdf' 2>/dev/null | while read pdf; do \ thumb="$${pdf%.pdf}.thumb"; \ if [ ! 
-f "$${thumb}.png" ] || [ "$$pdf" -nt "$${thumb}.png" ]; then \ echo " pdf-thumb $$pdf"; \ - pdftoppm -r 100 -f 1 -l 1 -png -singlefile "$$pdf" "$$thumb"; \ + pdftoppm -r 100 -f 1 -l 1 -png -singlefile "$$pdf" "$$thumb" \ + || echo "Warning: pdf-thumb failed for $$pdf (page ships without a thumbnail)" >&2; \ fi; \ done; \ else \ diff --git a/pyproject.toml b/pyproject.toml index b82d0d0..8a0b5eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,10 @@ dependencies = [ "pillow>=10.0,<12", "colorthief>=0.2,<1", "pyyaml>=6.0,<7", - "einops>=0.8.2", + # Not imported by this repo: required at runtime by nomic-embed's + # remote modeling code (nomic-bert-2048, loaded by embed.py's page + # pass under trust_remote_code with a pinned code_revision). + "einops>=0.8.2,<1", ] [[tool.uv.index]] diff --git a/tools/add-popup-source.sh b/tools/add-popup-source.sh index 36fdcf5..01dd9ab 100755 --- a/tools/add-popup-source.sh +++ b/tools/add-popup-source.sh @@ -49,6 +49,10 @@ EOF bold "── new popup provider ──" NAME=$(prompt "slug (lowercase, used as class + data-popup-source key, e.g. 'zenodo'):") [[ -z "$NAME" ]] && { warn "slug required"; exit 1; } +# The slug is interpolated into nginx directives (location /proxy/$NAME/, +# set \$upstream_$NAME) — validate like import-photo.sh does so a space, +# ';', or '{' can't produce a config that fails to load. +[[ "$NAME" =~ ^[a-z0-9-]+$ ]] || { warn "slug must match ^[a-z0-9-]+\$"; exit 1; } LABEL=$(prompt "display label (e.g. 'Zenodo'):") [[ -z "$LABEL" ]] && LABEL="$NAME" @@ -107,14 +111,16 @@ fi # ── proxy prefix + upstream host derivation ────────────────────────── +# UPSTREAM_HOST is derived unconditionally: the no-proxy (direct CORS +# fetch) case is exactly when the host must be added to connect-src, so +# the checklist's CSP reminder below needs it populated either way. 
+UPSTREAM_HOST=$(printf '%s' "$API_URL" | awk -F/ '{print $3}') if [[ "$NEEDS_PROXY" -eq 1 ]]; then - UPSTREAM_HOST=$(printf '%s' "$API_URL" | awk -F/ '{print $3}') UPSTREAM_PATH=$(printf '%s' "$API_URL" | awk -F/ 'BEGIN{OFS="/"} {$1=""; $2=""; $3=""; print}' | sed 's|^///||') PROXY_PATH="/proxy/$NAME/" PROXY_API_URL="$PROXY_PATH${UPSTREAM_PATH%%\?*}" [[ "$API_URL" == *"?"* ]] && PROXY_API_URL="$PROXY_API_URL?${API_URL#*\?}" else - UPSTREAM_HOST="" PROXY_API_URL="$API_URL" fi @@ -205,8 +211,9 @@ cat < None: def atomic_write_text(path: Path, text: str) -> None: - """Write to path.tmp then os.replace. PROVENANCE.json and the - generated index/state files are integrity records — an interrupt - mid-write must never leave a truncated file that the next run - parses (or mistakes for corruption).""" + """Write to a PID-unique temp then os.replace. PROVENANCE.json and + the generated index/state files are integrity records — an interrupt + mid-write must never leave a truncated file that the next run parses + (or mistakes for corruption); fsync flushes the data before the rename and the + PID suffix keeps concurrent runs from sharing a temp file.""" path.parent.mkdir(parents=True, exist_ok=True) - tmp = path.with_suffix(path.suffix + ".tmp") + tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}") try: - tmp.write_text(text, encoding="utf-8") + with tmp.open("w", encoding="utf-8") as f: + f.write(text) + f.flush() + os.fsync(f.fileno()) os.replace(tmp, path) except BaseException: tmp.unlink(missing_ok=True) diff --git a/tools/embed.py b/tools/embed.py index d368f17..be16f57 100644 --- a/tools/embed.py +++ b/tools/embed.py @@ -120,12 +120,21 @@ PORTAL_BODY_ATTR = "data-portal" def atomic_write_bytes(path: Path, data: bytes) -> None: - """Write to path.tmp then os.replace, so an interrupt mid-write - cannot leave a truncated file that the next build/serve loads.""" + """Write to a PID-unique temp then os.replace: an interrupt mid-write + cannot leave a truncated file at the final 
path, fsync flushes the + data to disk before the rename, and the PID suffix keeps two + concurrent runs from interleaving writes into one temp file.""" path.parent.mkdir(parents=True, exist_ok=True) - tmp = path.with_suffix(path.suffix + ".tmp") - tmp.write_bytes(data) - os.replace(tmp, path) + tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}") + try: + with tmp.open("wb") as f: + f.write(data) + f.flush() + os.fsync(f.fileno()) + os.replace(tmp, path) + except BaseException: + tmp.unlink(missing_ok=True) + raise def atomic_write_text(path: Path, text: str) -> None: @@ -186,7 +195,9 @@ def save_vec_cache(path: Path, model: str, revision: str, dim: int, path.parent.mkdir(parents=True, exist_ok=True) # Pass an open file handle, not a path: np.savez_compressed appends # ".npz" to bare paths, which would mangle our atomic-rename target. - tmp = path.with_suffix(path.suffix + ".tmp") + # PID-unique temp so concurrent runs can't interleave; fsync so the + # data is on disk before the rename. + tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}") try: with open(tmp, "wb") as f: np.savez_compressed( @@ -197,6 +208,8 @@ def save_vec_cache(path: Path, model: str, revision: str, dim: int, hashes=hashes, vectors=vectors, ) + f.flush() + os.fsync(f.fileno()) os.replace(tmp, path) except BaseException: tmp.unlink(missing_ok=True) diff --git a/tools/extract-dimensions.py b/tools/extract-dimensions.py index 04017a4..ea9b51f 100755 --- a/tools/extract-dimensions.py +++ b/tools/extract-dimensions.py @@ -31,6 +31,7 @@ images are logged and the rest of the walk continues. 
from __future__ import annotations +import os import sys from pathlib import Path from typing import Any @@ -62,13 +63,20 @@ def _is_stale(image: Path, sidecar: Path) -> bool: def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None: - tmp = path.with_suffix(path.suffix + ".tmp") - with tmp.open("w", encoding="utf-8") as f: - # Preserve a stable key order (width before height) so a manual - # diff stays easy to read across regenerations. - ordered = {k: data[k] for k in ("width", "height") if k in data} - yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True) - tmp.replace(path) + # PID-unique temp (concurrent runs can't share it), removed on + # failure. No fsync: sidecars are regenerated from the photo on the + # next build, so a lost rename costs one re-extraction, not data. + tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}") + try: + with tmp.open("w", encoding="utf-8") as f: + # Preserve a stable key order (width before height) so a manual + # diff stays easy to read across regenerations. + ordered = {k: data[k] for k in ("width", "height") if k in data} + yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True) + tmp.replace(path) + except BaseException: + tmp.unlink(missing_ok=True) + raise def _read_dimensions(image: Path) -> dict[str, int]: diff --git a/tools/extract-exif.py b/tools/extract-exif.py index 799ec04..bc70d64 100755 --- a/tools/extract-exif.py +++ b/tools/extract-exif.py @@ -381,12 +381,19 @@ def _is_stale(image: Path, sidecar: Path) -> bool: def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None: - tmp = path.with_suffix(path.suffix + ".tmp") - with tmp.open("w", encoding="utf-8") as f: - # Preserve the SIDECAR_KEYS order so a manual diff is easy to read. - ordered = {k: data[k] for k in SIDECAR_KEYS if k in data} - yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True) - tmp.replace(path) + # PID-unique temp (concurrent runs can't share it), removed on + # failure. 
No fsync: sidecars are regenerated from the photo on the + # next build, so a lost rename costs one re-extraction, not data. + tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}") + try: + with tmp.open("w", encoding="utf-8") as f: + # Preserve the SIDECAR_KEYS order so a manual diff is easy to read. + ordered = {k: data[k] for k in SIDECAR_KEYS if k in data} + yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True) + tmp.replace(path) + except BaseException: + tmp.unlink(missing_ok=True) + raise def _read_one(image: Path) -> dict[str, Any]: diff --git a/tools/extract-palette.py b/tools/extract-palette.py index 601824a..d50cd13 100755 --- a/tools/extract-palette.py +++ b/tools/extract-palette.py @@ -23,6 +23,7 @@ a palette extraction error. from __future__ import annotations +import os import sys from pathlib import Path from typing import Any @@ -62,10 +63,17 @@ def _is_stale(image: Path, sidecar: Path) -> bool: def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None: - tmp = path.with_suffix(path.suffix + ".tmp") - with tmp.open("w", encoding="utf-8") as f: - yaml.safe_dump(data, f, sort_keys=False, allow_unicode=True) - tmp.replace(path) + # PID-unique temp (concurrent runs can't share it), removed on + # failure. No fsync: sidecars are regenerated from the photo on the + # next build, so a lost rename costs one re-extraction, not data. 
+ tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}") + try: + with tmp.open("w", encoding="utf-8") as f: + yaml.safe_dump(data, f, sort_keys=False, allow_unicode=True) + tmp.replace(path) + except BaseException: + tmp.unlink(missing_ok=True) + raise def _extract_palette(image: Path) -> list[str]: diff --git a/tools/refreeze.sh b/tools/refreeze.sh index 6429c8d..fd8f6c6 100755 --- a/tools/refreeze.sh +++ b/tools/refreeze.sh @@ -8,11 +8,29 @@ FREEZE="$REPO_ROOT/cabal.project.freeze" cd "$REPO_ROOT" +# Back up the current freeze and restore it if resolution fails, so an +# unsolvable index never leaves the repo with no freeze file at all +# (recoverable via git, but the script shouldn't depend on that). +BACKUP="" +if [ -f "$FREEZE" ]; then + BACKUP="$(mktemp "$FREEZE.bak.XXXXXX")" + cp "$FREEZE" "$BACKUP" +fi +restore_on_failure() { + if [ -n "$BACKUP" ]; then + echo "==> Refreeze failed — restoring previous freeze file." >&2 + mv "$BACKUP" "$FREEZE" + fi +} +trap restore_on_failure ERR + echo "==> Removing stale freeze file..." rm -f "$FREEZE" echo "==> Resolving dependencies and writing new freeze file..." cabal freeze +trap - ERR +[ -n "$BACKUP" ] && rm -f "$BACKUP" echo "==> Verifying build..." cabal build diff --git a/uv.lock b/uv.lock index dad9b9e..2a76ddc 100644 --- a/uv.lock +++ b/uv.lock @@ -389,7 +389,7 @@ requires-dist = [ { name = "altair", specifier = ">=5.4,<6" }, { name = "beautifulsoup4", specifier = ">=4.12,<5" }, { name = "colorthief", specifier = ">=0.2,<1" }, - { name = "einops", specifier = ">=0.8.2" }, + { name = "einops", specifier = ">=0.8.2,<1" }, { name = "faiss-cpu", specifier = ">=1.9,<2" }, { name = "matplotlib", specifier = ">=3.9,<4" }, { name = "numpy", specifier = ">=2.0,<3" },