diff --git a/tools/archive.py b/tools/archive.py index 2aacb0e..b16a1e3 100644 --- a/tools/archive.py +++ b/tools/archive.py @@ -104,6 +104,26 @@ def err(msg: str) -> None: print(f"[archive] ERROR: {msg}", file=sys.stderr) +def atomic_write_text(path: Path, text: str) -> None: + """Write to path.tmp then os.replace. PROVENANCE.json and the + generated index/state files are integrity records — an interrupt + mid-write must never leave a truncated file that the next run + parses (or mistakes for corruption).""" + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + try: + tmp.write_text(text, encoding="utf-8") + os.replace(tmp, path) + except BaseException: + tmp.unlink(missing_ok=True) + raise + + +def atomic_write_json(path: Path, obj) -> None: + atomic_write_text( + path, json.dumps(obj, indent=2, ensure_ascii=False) + "\n") + + # --------------------------------------------------------------------------- # Manifest / removed.yaml # --------------------------------------------------------------------------- @@ -119,6 +139,15 @@ def load_yaml_list(path: Path) -> list[dict]: if not isinstance(data, list): err(f"{path.name}: expected a YAML list, got {type(data).__name__}") sys.exit(1) + # Validate items too: a stray scalar line (`- https://example.com` + # instead of `- url: ...`) would otherwise surface much later as an + # AttributeError deep inside fetch/wayback/check. 
+ for i, item in enumerate(data): + if not isinstance(item, dict): + err(f"{path.name}: entry {i + 1} is not a mapping " + f"(got {type(item).__name__}: {item!r}); " + f"each entry must be `- url: ...`") + sys.exit(1) return data @@ -715,10 +744,7 @@ def cmd_fetch() -> int: "snapshot-quality": quality, "wayback": None, } - prov_path.write_text( - json.dumps(prov, indent=2, ensure_ascii=False) + "\n", - encoding="utf-8", - ) + atomic_write_json(prov_path, prov) log(f"{slug}: archived [{atype}, {quality}] ({prov['bytes']} bytes)") # --- contribute to the Hakyll index ------------------------------- @@ -730,11 +756,7 @@ def cmd_fetch() -> int: } # archive-index.json is always rewritten to mirror the manifest exactly. - INDEX_OUT.parent.mkdir(parents=True, exist_ok=True) - INDEX_OUT.write_text( - json.dumps(index, indent=2, ensure_ascii=False) + "\n", - encoding="utf-8", - ) + atomic_write_json(INDEX_OUT, index) log(f"wrote {INDEX_OUT.relative_to(REPO_ROOT)} ({len(index)} entries)") if skipped: @@ -785,14 +807,18 @@ def cmd_refresh(argv: list[str]) -> int: try: prev = json.loads(prov_path.read_text(encoding="utf-8")) prev_sha = prev.get("sha256") - prev_artifact = slug_dir / prev.get("artifact", "") + prev_art_name = prev.get("artifact") or "" + prev_artifact = slug_dir / prev_art_name except Exception as exc: # noqa: BLE001 err(f"refresh: cannot parse prior provenance for {slug}: {exc}") return 2 # The prior snapshot must be committed and clean — otherwise # `previous-sha256` would point at bytes git can no longer give - # back, breaking the auditable replacement contract. - if not prev_sha or not prev_artifact.exists(): + # back, breaking the auditable replacement contract. The empty- + # artifact guard matters: without it prev_artifact would be the + # slug directory itself, which exists() accepts and sha256_of + # then crashes on with IsADirectoryError. 
+ if not prev_sha or not prev_art_name or not prev_artifact.is_file(): err(f"refresh: prior snapshot for {slug} is incomplete; restore " f"its artifact and provenance before replacing it.") return 2 @@ -850,11 +876,7 @@ def cmd_refresh(argv: list[str]) -> int: if art_name and (slug_dir / art_name).exists(): if prev_sha: new_prov["previous-sha256"] = prev_sha - prov_path.write_text( - json.dumps(new_prov, indent=2, - ensure_ascii=False) + "\n", - encoding="utf-8", - ) + atomic_write_json(prov_path, new_prov) log(f"refresh: recorded previous-sha256 " f"{prev_sha[:12]}…") succeeded = True @@ -893,8 +915,12 @@ def wayback_save(url: str) -> None: """Trigger a fresh Wayback capture via Save Page Now. Best-effort: any outcome is tolerated — the resulting URL is read back via the availability API (which also surfaces a pre-existing capture).""" - req = urllib.request.Request("https://web.archive.org/save/" + url, - headers={"User-Agent": USER_AGENT}) + # Quote only what can't appear raw in a request line (spaces, + # control chars, non-ASCII); RFC 3986 reserved characters and + # existing %-escapes pass through so the saved URL is unchanged. 
+ quoted = quote(url, safe="%:/?#[]@!$&'()*+,;=") + req = urllib.request.Request("https://web.archive.org/save/" + quoted, + headers={"User-Agent": USER_AGENT}) try: with urllib.request.urlopen(req, timeout=WAYBACK_TIMEOUT): pass @@ -951,10 +977,7 @@ def cmd_wayback() -> int: capture = wayback_lookup(url) if capture: prov["wayback"] = capture - prov_path.write_text( - json.dumps(prov, indent=2, ensure_ascii=False) + "\n", - encoding="utf-8", - ) + atomic_write_json(prov_path, prov) log(f"{slug}: wayback -> {capture}") backfilled += 1 else: @@ -1073,11 +1096,7 @@ def cmd_check() -> int: note = f" -> {new_url}" if new_url else "" log(f"check: {url} [{rec['status']}]{note}") - STATE_OUT.parent.mkdir(parents=True, exist_ok=True) - STATE_OUT.write_text( - json.dumps(state, indent=2, ensure_ascii=False) + "\n", - encoding="utf-8", - ) + atomic_write_json(STATE_OUT, state) log(f"check: {tally['live']} live, {tally['moved']} moved, " f"{tally['error']} error, {tally['rotted']} rotted " f"-> {STATE_OUT.relative_to(REPO_ROOT)}") diff --git a/tools/convert-images.sh b/tools/convert-images.sh index aa25c0b..742ed6a 100755 --- a/tools/convert-images.sh +++ b/tools/convert-images.sh @@ -32,7 +32,11 @@ while IFS= read -r -d '' img; do skipped=$((skipped + 1)) else echo " webp ${img#"$REPO_ROOT/"}" - cwebp -quiet -q 85 "$img" -o "$webp" + # Write to a temp name then move: an interrupted cwebp would + # otherwise leave a truncated .webp that is newer than its + # source, which the staleness gate above then skips forever. + cwebp -quiet -q 85 "$img" -o "$webp.part" + mv "$webp.part" "$webp" converted=$((converted + 1)) fi done < <(find "$REPO_ROOT/static" "$REPO_ROOT/content" \ diff --git a/tools/download-leaflet.sh b/tools/download-leaflet.sh index 4be1dba..b6bfe50 100755 --- a/tools/download-leaflet.sh +++ b/tools/download-leaflet.sh @@ -7,8 +7,9 @@ # the site, no third-party request at view time. # # Run once before deploying. 
The vendored copy is gitignored -# (~150 KB total); re-running is safe — the script skips when the -# files already exist. +# (~150 KB total); re-running is safe — files that already exist AND +# match their pinned checksum are skipped; anything missing or +# mismatched is re-fetched. # # To bump the pinned versions, set LEAFLET_VERSION / MARKERCLUSTER_VERSION, # re-run, then update tools/leaflet-checksums.sha256 with the new hashes. @@ -39,13 +40,6 @@ files_to_fetch=( "$UNPKG_MC|MarkerCluster.Default.css|leaflet.markercluster-${MARKERCLUSTER_VERSION}-MarkerCluster.Default.css" ) -# Skip the whole step if the canonical entry-point already exists. -# Force a re-fetch by removing the directory. -if [ -f "$LEAFLET_DIR/leaflet.js" ] && [ -f "$LEAFLET_DIR/leaflet.markercluster.js" ]; then - echo "leaflet: already vendored at $LEAFLET_DIR (skipping)" - exit 0 -fi - mkdir -p "$LEAFLET_DIR/images" verify_or_warn() { @@ -71,15 +65,35 @@ verify_or_warn() { fi } +# Per-file skip: existing files are skipped only after re-verifying +# their checksum, so a partial or tampered file from an interrupted +# earlier run can never be silently accepted. Downloads land in a +# .part temp and are only moved into place after verification — a +# failed verification leaves nothing at the final path. for entry in "${files_to_fetch[@]}"; do IFS='|' read -r url_base local_path pin_key <<<"$entry" src_name="${local_path##*/}" target="$LEAFLET_DIR/$local_path" mkdir -p "$(dirname "$target")" + if [ -f "$target" ]; then + if verify_or_warn "$target" "$pin_key"; then + echo "leaflet: $local_path present and verified (skipping)" + continue + fi + echo "leaflet: $local_path failed verification — re-fetching" >&2 + rm -f "$target" + fi + echo "leaflet: fetching $local_path ($pin_key)" - curl -fsSL --progress-bar "$url_base/$src_name" -o "$target" - verify_or_warn "$target" "$pin_key" + tmp="$target.part" + curl -fsSL --progress-bar "$url_base/$src_name" -o "$tmp" + if ! 
verify_or_warn "$tmp" "$pin_key"; then + rm -f "$tmp" + echo "leaflet: refusing to vendor unverified $local_path" >&2 + exit 1 + fi + mv "$tmp" "$target" done echo "leaflet: vendored to $LEAFLET_DIR" diff --git a/tools/download-model.sh b/tools/download-model.sh index eaed8e8..d798a7a 100755 --- a/tools/download-model.sh +++ b/tools/download-model.sh @@ -68,8 +68,13 @@ fetch() { return fi echo " fetch $src" - curl -fsSL --progress-bar "$BASE_URL/$src" -o "$dst" - verify_sha "$src" "$dst" + # Download to a temp name and move into place only after + # verification: an interrupted curl must never leave a partial + # file at the final path, where the present-file skip (or, for an + # unpinned file, nothing at all) would accept it forever. + curl -fsSL --progress-bar "$BASE_URL/$src" -o "$dst.part" + verify_sha "$src" "$dst.part" + mv "$dst.part" "$dst" } if [ ! -f "$CHECKSUMS" ]; then