Tooling robustness: atomic writes, verified downloads

- archive.py: PROVENANCE.json / archive-index.json / archive-state.json
  now written atomically (tmp + os.replace) — a truncated integrity
  record is the one thing this tool must never produce (AUDIT §4.4);
  manifest entries validated as mappings up front (§4.7); refresh
  rejects provenance with a missing/empty artifact key instead of
  crashing on IsADirectoryError (§4.7); wayback save URL quotes
  unsafe characters (§4.7)
- download-leaflet.sh: existing files are re-verified before being
  skipped, and downloads land in a .part temp moved into place only
  after checksum verification — a failed verification can no longer
  leave a bad file that the next run silently accepts (§4.5)
- download-model.sh, convert-images.sh: same temp-then-move pattern so
  interrupted downloads/conversions never persist at final paths (§4.6)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Levi Neuwirth 2026-06-10 09:43:25 -04:00
parent c68d03af31
commit c17c203747
4 changed files with 84 additions and 42 deletions

View File

@ -104,6 +104,26 @@ def err(msg: str) -> None:
print(f"[archive] ERROR: {msg}", file=sys.stderr)
def atomic_write_text(path: Path, text: str) -> None:
    """Atomically write *text* to *path* as UTF-8.

    The text is written to a sibling ``<name>.tmp`` file, flushed and
    fsynced, and only then moved over *path* with ``os.replace``.
    PROVENANCE.json and the generated index/state files are integrity
    records: an interrupt mid-write must never leave a truncated file
    that the next run parses (or mistakes for corruption).

    Missing parent directories are created first. On any failure,
    including KeyboardInterrupt, the temp file is removed and the
    exception is re-raised unchanged.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        with tmp.open("w", encoding="utf-8") as fh:
            fh.write(text)
            # Force the bytes to disk before the rename. Without this a
            # crash or power loss shortly after os.replace can leave the
            # new name pointing at data that was never written out,
            # i.e. exactly the truncated record this helper exists to
            # prevent.
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp, path)
    except BaseException:
        # BaseException on purpose: Ctrl-C mid-write must not leave a
        # stray .tmp file behind either.
        tmp.unlink(missing_ok=True)
        raise
def atomic_write_json(path: Path, obj) -> None:
    """Serialize *obj* as JSON and write it to *path* atomically.

    The output is indented by two spaces, keeps non-ASCII characters
    unescaped, and ends with a single trailing newline. The write
    itself is delegated to ``atomic_write_text``.
    """
    payload = json.dumps(obj, indent=2, ensure_ascii=False)
    atomic_write_text(path, payload + "\n")
# ---------------------------------------------------------------------------
# Manifest / removed.yaml
# ---------------------------------------------------------------------------
@ -119,6 +139,15 @@ def load_yaml_list(path: Path) -> list[dict]:
if not isinstance(data, list):
err(f"{path.name}: expected a YAML list, got {type(data).__name__}")
sys.exit(1)
# Validate items too: a stray scalar line (`- https://example.com`
# instead of `- url: ...`) would otherwise surface much later as an
# AttributeError deep inside fetch/wayback/check.
for i, item in enumerate(data):
if not isinstance(item, dict):
err(f"{path.name}: entry {i + 1} is not a mapping "
f"(got {type(item).__name__}: {item!r}); "
f"each entry must be `- url: ...`")
sys.exit(1)
return data
@ -715,10 +744,7 @@ def cmd_fetch() -> int:
"snapshot-quality": quality,
"wayback": None,
}
prov_path.write_text(
json.dumps(prov, indent=2, ensure_ascii=False) + "\n",
encoding="utf-8",
)
atomic_write_json(prov_path, prov)
log(f"{slug}: archived [{atype}, {quality}] ({prov['bytes']} bytes)")
# --- contribute to the Hakyll index -------------------------------
@ -730,11 +756,7 @@ def cmd_fetch() -> int:
}
# archive-index.json is always rewritten to mirror the manifest exactly.
INDEX_OUT.parent.mkdir(parents=True, exist_ok=True)
INDEX_OUT.write_text(
json.dumps(index, indent=2, ensure_ascii=False) + "\n",
encoding="utf-8",
)
atomic_write_json(INDEX_OUT, index)
log(f"wrote {INDEX_OUT.relative_to(REPO_ROOT)} ({len(index)} entries)")
if skipped:
@ -785,14 +807,18 @@ def cmd_refresh(argv: list[str]) -> int:
try:
prev = json.loads(prov_path.read_text(encoding="utf-8"))
prev_sha = prev.get("sha256")
prev_artifact = slug_dir / prev.get("artifact", "")
prev_art_name = prev.get("artifact") or ""
prev_artifact = slug_dir / prev_art_name
except Exception as exc: # noqa: BLE001
err(f"refresh: cannot parse prior provenance for {slug}: {exc}")
return 2
# The prior snapshot must be committed and clean — otherwise
# `previous-sha256` would point at bytes git can no longer give
# back, breaking the auditable replacement contract.
if not prev_sha or not prev_artifact.exists():
# back, breaking the auditable replacement contract. The empty-
# artifact guard matters: without it prev_artifact would be the
# slug directory itself, which exists() accepts and sha256_of
# then crashes on with IsADirectoryError.
if not prev_sha or not prev_art_name or not prev_artifact.is_file():
err(f"refresh: prior snapshot for {slug} is incomplete; restore "
f"its artifact and provenance before replacing it.")
return 2
@ -850,11 +876,7 @@ def cmd_refresh(argv: list[str]) -> int:
if art_name and (slug_dir / art_name).exists():
if prev_sha:
new_prov["previous-sha256"] = prev_sha
prov_path.write_text(
json.dumps(new_prov, indent=2,
ensure_ascii=False) + "\n",
encoding="utf-8",
)
atomic_write_json(prov_path, new_prov)
log(f"refresh: recorded previous-sha256 "
f"{prev_sha[:12]}")
succeeded = True
@ -893,8 +915,12 @@ def wayback_save(url: str) -> None:
"""Trigger a fresh Wayback capture via Save Page Now. Best-effort: any
outcome is tolerated; the resulting URL is read back via the
availability API (which also surfaces a pre-existing capture)."""
req = urllib.request.Request("https://web.archive.org/save/" + url,
headers={"User-Agent": USER_AGENT})
# Quote only what can't appear raw in a request line (spaces,
# control chars); URL structure (:/?&=#) passes through so Save
# Page Now sees the original URL shape.
req = urllib.request.Request(
"https://web.archive.org/save/" + quote(url, safe=":/?&=#"),
headers={"User-Agent": USER_AGENT})
try:
with urllib.request.urlopen(req, timeout=WAYBACK_TIMEOUT):
pass
@ -951,10 +977,7 @@ def cmd_wayback() -> int:
capture = wayback_lookup(url)
if capture:
prov["wayback"] = capture
prov_path.write_text(
json.dumps(prov, indent=2, ensure_ascii=False) + "\n",
encoding="utf-8",
)
atomic_write_json(prov_path, prov)
log(f"{slug}: wayback -> {capture}")
backfilled += 1
else:
@ -1073,11 +1096,7 @@ def cmd_check() -> int:
note = f" -> {new_url}" if new_url else ""
log(f"check: {url} [{rec['status']}]{note}")
STATE_OUT.parent.mkdir(parents=True, exist_ok=True)
STATE_OUT.write_text(
json.dumps(state, indent=2, ensure_ascii=False) + "\n",
encoding="utf-8",
)
atomic_write_json(STATE_OUT, state)
log(f"check: {tally['live']} live, {tally['moved']} moved, "
f"{tally['error']} error, {tally['rotted']} rotted "
f"-> {STATE_OUT.relative_to(REPO_ROOT)}")

View File

@ -32,7 +32,11 @@ while IFS= read -r -d '' img; do
skipped=$((skipped + 1))
else
echo " webp ${img#"$REPO_ROOT/"}"
cwebp -quiet -q 85 "$img" -o "$webp"
# Write to a temp name then move: an interrupted cwebp would
# otherwise leave a truncated .webp that is newer than its
# source, which the staleness gate above then skips forever.
cwebp -quiet -q 85 "$img" -o "$webp.part"
mv "$webp.part" "$webp"
converted=$((converted + 1))
fi
done < <(find "$REPO_ROOT/static" "$REPO_ROOT/content" \

View File

@ -7,8 +7,9 @@
# the site, no third-party request at view time.
#
# Run once before deploying. The vendored copy is gitignored
# (~150 KB total); re-running is safe — the script skips when the
# files already exist.
# (~150 KB total); re-running is safe — files that already exist AND
# match their pinned checksum are skipped; anything missing or
# mismatched is re-fetched.
#
# To bump the pinned versions, set LEAFLET_VERSION / MARKERCLUSTER_VERSION,
# re-run, then update tools/leaflet-checksums.sha256 with the new hashes.
@ -39,13 +40,6 @@ files_to_fetch=(
"$UNPKG_MC|MarkerCluster.Default.css|leaflet.markercluster-${MARKERCLUSTER_VERSION}-MarkerCluster.Default.css"
)
# Skip the whole step if the canonical entry-point already exists.
# Force a re-fetch by removing the directory.
if [ -f "$LEAFLET_DIR/leaflet.js" ] && [ -f "$LEAFLET_DIR/leaflet.markercluster.js" ]; then
echo "leaflet: already vendored at $LEAFLET_DIR (skipping)"
exit 0
fi
mkdir -p "$LEAFLET_DIR/images"
verify_or_warn() {
@ -71,15 +65,35 @@ verify_or_warn() {
fi
}
# Per-file skip: existing files are skipped only after re-verifying
# their checksum, so a partial or tampered file from an interrupted
# earlier run can never be silently accepted. Downloads land in a
# .part temp and are only moved into place after verification — a
# failed verification leaves nothing at the final path.
for entry in "${files_to_fetch[@]}"; do
IFS='|' read -r url_base local_path pin_key <<<"$entry"
src_name="${local_path##*/}"
target="$LEAFLET_DIR/$local_path"
mkdir -p "$(dirname "$target")"
if [ -f "$target" ]; then
if verify_or_warn "$target" "$pin_key"; then
echo "leaflet: $local_path present and verified (skipping)"
continue
fi
echo "leaflet: $local_path failed verification — re-fetching" >&2
rm -f "$target"
fi
echo "leaflet: fetching $local_path ($pin_key)"
curl -fsSL --progress-bar "$url_base/$src_name" -o "$target"
verify_or_warn "$target" "$pin_key"
tmp="$target.part"
curl -fsSL --progress-bar "$url_base/$src_name" -o "$tmp"
if ! verify_or_warn "$tmp" "$pin_key"; then
rm -f "$tmp"
echo "leaflet: refusing to vendor unverified $local_path" >&2
exit 1
fi
mv "$tmp" "$target"
done
echo "leaflet: vendored to $LEAFLET_DIR"

View File

@ -68,8 +68,13 @@ fetch() {
return
fi
echo " fetch $src"
curl -fsSL --progress-bar "$BASE_URL/$src" -o "$dst"
verify_sha "$src" "$dst"
# Download to a temp name and move into place only after
# verification: an interrupted curl must never leave a partial
# file at the final path, where the present-file skip (or, for an
# unpinned file, nothing at all) would accept it forever.
curl -fsSL --progress-bar "$BASE_URL/$src" -o "$dst.part"
verify_sha "$src" "$dst.part"
mv "$dst.part" "$dst"
}
if [ ! -f "$CHECKSUMS" ]; then