Tooling robustness: atomic writes, verified downloads
- archive.py: PROVENANCE.json / archive-index.json / archive-state.json now written atomically (tmp + os.replace) — a truncated integrity record is the one thing this tool must never produce (AUDIT §4.4); manifest entries validated as mappings up front (§4.7); refresh rejects provenance with a missing/empty artifact key instead of crashing on IsADirectoryError (§4.7); wayback save URL quotes unsafe characters (§4.7)
- download-leaflet.sh: existing files are re-verified before being skipped, and downloads land in a .part temp moved into place only after checksum verification — a failed verification can no longer leave a bad file that the next run silently accepts (§4.5)
- download-model.sh, convert-images.sh: same temp-then-move pattern so interrupted downloads/conversions never persist at final paths (§4.6)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
c68d03af31
commit
c17c203747
|
|
@ -104,6 +104,26 @@ def err(msg: str) -> None:
|
|||
print(f"[archive] ERROR: {msg}", file=sys.stderr)
|
||||
|
||||
|
||||
def atomic_write_text(path: Path, text: str) -> None:
    """Atomically replace ``path`` with ``text`` encoded as UTF-8.

    The text is written to a sibling ``<name>.tmp`` file and then moved
    over ``path`` with ``os.replace``. PROVENANCE.json and the generated
    index/state files are integrity records — an interrupt mid-write
    must never leave a truncated file that the next run parses (or
    mistakes for corruption).

    Args:
        path: Final destination. Missing parent directories are created.
        text: Complete file contents.

    Raises:
        Whatever the underlying write/replace raises (e.g. ``OSError``).
        On any failure, including ``KeyboardInterrupt``, the temp file
        is removed and the exception is re-raised; an existing ``path``
        is left untouched.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        with open(tmp, "w", encoding="utf-8") as fh:
            fh.write(text)
            # Push the bytes to disk *before* the rename. Without this,
            # a crash or power loss shortly after os.replace can leave
            # the final name pointing at an empty or truncated file on
            # filesystems that commit the rename ahead of the data.
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp, path)
    except BaseException:
        # BaseException on purpose: Ctrl-C / SystemExit must also clean
        # up the temp file before propagating.
        tmp.unlink(missing_ok=True)
        raise
|
||||
|
||||
|
||||
def atomic_write_json(path: Path, obj) -> None:
    """Serialise ``obj`` as JSON and hand it to ``atomic_write_text``.

    The output uses a two-space indent, keeps non-ASCII characters
    unescaped, and ends with a single trailing newline.
    """
    payload = json.dumps(obj, ensure_ascii=False, indent=2)
    atomic_write_text(path, f"{payload}\n")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Manifest / removed.yaml
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -119,6 +139,15 @@ def load_yaml_list(path: Path) -> list[dict]:
|
|||
if not isinstance(data, list):
|
||||
err(f"{path.name}: expected a YAML list, got {type(data).__name__}")
|
||||
sys.exit(1)
|
||||
# Validate items too: a stray scalar line (`- https://example.com`
|
||||
# instead of `- url: ...`) would otherwise surface much later as an
|
||||
# AttributeError deep inside fetch/wayback/check.
|
||||
for i, item in enumerate(data):
|
||||
if not isinstance(item, dict):
|
||||
err(f"{path.name}: entry {i + 1} is not a mapping "
|
||||
f"(got {type(item).__name__}: {item!r}); "
|
||||
f"each entry must be `- url: ...`")
|
||||
sys.exit(1)
|
||||
return data
|
||||
|
||||
|
||||
|
|
@ -715,10 +744,7 @@ def cmd_fetch() -> int:
|
|||
"snapshot-quality": quality,
|
||||
"wayback": None,
|
||||
}
|
||||
prov_path.write_text(
|
||||
json.dumps(prov, indent=2, ensure_ascii=False) + "\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
atomic_write_json(prov_path, prov)
|
||||
log(f"{slug}: archived [{atype}, {quality}] ({prov['bytes']} bytes)")
|
||||
|
||||
# --- contribute to the Hakyll index -------------------------------
|
||||
|
|
@ -730,11 +756,7 @@ def cmd_fetch() -> int:
|
|||
}
|
||||
|
||||
# archive-index.json is always rewritten to mirror the manifest exactly.
|
||||
INDEX_OUT.parent.mkdir(parents=True, exist_ok=True)
|
||||
INDEX_OUT.write_text(
|
||||
json.dumps(index, indent=2, ensure_ascii=False) + "\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
atomic_write_json(INDEX_OUT, index)
|
||||
log(f"wrote {INDEX_OUT.relative_to(REPO_ROOT)} ({len(index)} entries)")
|
||||
|
||||
if skipped:
|
||||
|
|
@ -785,14 +807,18 @@ def cmd_refresh(argv: list[str]) -> int:
|
|||
try:
|
||||
prev = json.loads(prov_path.read_text(encoding="utf-8"))
|
||||
prev_sha = prev.get("sha256")
|
||||
prev_artifact = slug_dir / prev.get("artifact", "")
|
||||
prev_art_name = prev.get("artifact") or ""
|
||||
prev_artifact = slug_dir / prev_art_name
|
||||
except Exception as exc: # noqa: BLE001
|
||||
err(f"refresh: cannot parse prior provenance for {slug}: {exc}")
|
||||
return 2
|
||||
# The prior snapshot must be committed and clean — otherwise
|
||||
# `previous-sha256` would point at bytes git can no longer give
|
||||
# back, breaking the auditable replacement contract.
|
||||
if not prev_sha or not prev_artifact.exists():
|
||||
# back, breaking the auditable replacement contract. The empty-
|
||||
# artifact guard matters: without it prev_artifact would be the
|
||||
# slug directory itself, which exists() accepts and sha256_of
|
||||
# then crashes on with IsADirectoryError.
|
||||
if not prev_sha or not prev_art_name or not prev_artifact.is_file():
|
||||
err(f"refresh: prior snapshot for {slug} is incomplete; restore "
|
||||
f"its artifact and provenance before replacing it.")
|
||||
return 2
|
||||
|
|
@ -850,11 +876,7 @@ def cmd_refresh(argv: list[str]) -> int:
|
|||
if art_name and (slug_dir / art_name).exists():
|
||||
if prev_sha:
|
||||
new_prov["previous-sha256"] = prev_sha
|
||||
prov_path.write_text(
|
||||
json.dumps(new_prov, indent=2,
|
||||
ensure_ascii=False) + "\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
atomic_write_json(prov_path, new_prov)
|
||||
log(f"refresh: recorded previous-sha256 "
|
||||
f"{prev_sha[:12]}…")
|
||||
succeeded = True
|
||||
|
|
@ -893,7 +915,11 @@ def wayback_save(url: str) -> None:
|
|||
"""Trigger a fresh Wayback capture via Save Page Now. Best-effort: any
|
||||
outcome is tolerated — the resulting URL is read back via the
|
||||
availability API (which also surfaces a pre-existing capture)."""
|
||||
req = urllib.request.Request("https://web.archive.org/save/" + url,
|
||||
# Quote only what can't appear raw in a request line (spaces,
|
||||
# control chars); URL structure (:/?&=#) passes through so Save
|
||||
# Page Now sees the original URL shape.
|
||||
req = urllib.request.Request(
|
||||
"https://web.archive.org/save/" + quote(url, safe=":/?&=#"),
|
||||
headers={"User-Agent": USER_AGENT})
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=WAYBACK_TIMEOUT):
|
||||
|
|
@ -951,10 +977,7 @@ def cmd_wayback() -> int:
|
|||
capture = wayback_lookup(url)
|
||||
if capture:
|
||||
prov["wayback"] = capture
|
||||
prov_path.write_text(
|
||||
json.dumps(prov, indent=2, ensure_ascii=False) + "\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
atomic_write_json(prov_path, prov)
|
||||
log(f"{slug}: wayback -> {capture}")
|
||||
backfilled += 1
|
||||
else:
|
||||
|
|
@ -1073,11 +1096,7 @@ def cmd_check() -> int:
|
|||
note = f" -> {new_url}" if new_url else ""
|
||||
log(f"check: {url} [{rec['status']}]{note}")
|
||||
|
||||
STATE_OUT.parent.mkdir(parents=True, exist_ok=True)
|
||||
STATE_OUT.write_text(
|
||||
json.dumps(state, indent=2, ensure_ascii=False) + "\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
atomic_write_json(STATE_OUT, state)
|
||||
log(f"check: {tally['live']} live, {tally['moved']} moved, "
|
||||
f"{tally['error']} error, {tally['rotted']} rotted "
|
||||
f"-> {STATE_OUT.relative_to(REPO_ROOT)}")
|
||||
|
|
|
|||
|
|
@ -32,7 +32,11 @@ while IFS= read -r -d '' img; do
|
|||
skipped=$((skipped + 1))
|
||||
else
|
||||
echo " webp ${img#"$REPO_ROOT/"}"
|
||||
cwebp -quiet -q 85 "$img" -o "$webp"
|
||||
# Write to a temp name then move: an interrupted cwebp would
|
||||
# otherwise leave a truncated .webp that is newer than its
|
||||
# source, which the staleness gate above then skips forever.
|
||||
cwebp -quiet -q 85 "$img" -o "$webp.part"
|
||||
mv "$webp.part" "$webp"
|
||||
converted=$((converted + 1))
|
||||
fi
|
||||
done < <(find "$REPO_ROOT/static" "$REPO_ROOT/content" \
|
||||
|
|
|
|||
|
|
@ -7,8 +7,9 @@
|
|||
# the site, no third-party request at view time.
|
||||
#
|
||||
# Run once before deploying. The vendored copy is gitignored
|
||||
# (~150 KB total); re-running is safe — the script skips when the
|
||||
# files already exist.
|
||||
# (~150 KB total); re-running is safe — files that already exist AND
|
||||
# match their pinned checksum are skipped; anything missing or
|
||||
# mismatched is re-fetched.
|
||||
#
|
||||
# To bump the pinned versions, set LEAFLET_VERSION / MARKERCLUSTER_VERSION,
|
||||
# re-run, then update tools/leaflet-checksums.sha256 with the new hashes.
|
||||
|
|
@ -39,13 +40,6 @@ files_to_fetch=(
|
|||
"$UNPKG_MC|MarkerCluster.Default.css|leaflet.markercluster-${MARKERCLUSTER_VERSION}-MarkerCluster.Default.css"
|
||||
)
|
||||
|
||||
# Skip the whole step if the canonical entry-point already exists.
|
||||
# Force a re-fetch by removing the directory.
|
||||
if [ -f "$LEAFLET_DIR/leaflet.js" ] && [ -f "$LEAFLET_DIR/leaflet.markercluster.js" ]; then
|
||||
echo "leaflet: already vendored at $LEAFLET_DIR (skipping)"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "$LEAFLET_DIR/images"
|
||||
|
||||
verify_or_warn() {
|
||||
|
|
@ -71,15 +65,35 @@ verify_or_warn() {
|
|||
fi
|
||||
}
|
||||
|
||||
# Per-file skip: existing files are skipped only after re-verifying
|
||||
# their checksum, so a partial or tampered file from an interrupted
|
||||
# earlier run can never be silently accepted. Downloads land in a
|
||||
# .part temp and are only moved into place after verification — a
|
||||
# failed verification leaves nothing at the final path.
|
||||
for entry in "${files_to_fetch[@]}"; do
|
||||
IFS='|' read -r url_base local_path pin_key <<<"$entry"
|
||||
src_name="${local_path##*/}"
|
||||
target="$LEAFLET_DIR/$local_path"
|
||||
mkdir -p "$(dirname "$target")"
|
||||
|
||||
if [ -f "$target" ]; then
|
||||
if verify_or_warn "$target" "$pin_key"; then
|
||||
echo "leaflet: $local_path present and verified (skipping)"
|
||||
continue
|
||||
fi
|
||||
echo "leaflet: $local_path failed verification — re-fetching" >&2
|
||||
rm -f "$target"
|
||||
fi
|
||||
|
||||
echo "leaflet: fetching $local_path ($pin_key)"
|
||||
curl -fsSL --progress-bar "$url_base/$src_name" -o "$target"
|
||||
verify_or_warn "$target" "$pin_key"
|
||||
tmp="$target.part"
|
||||
curl -fsSL --progress-bar "$url_base/$src_name" -o "$tmp"
|
||||
if ! verify_or_warn "$tmp" "$pin_key"; then
|
||||
rm -f "$tmp"
|
||||
echo "leaflet: refusing to vendor unverified $local_path" >&2
|
||||
exit 1
|
||||
fi
|
||||
mv "$tmp" "$target"
|
||||
done
|
||||
|
||||
echo "leaflet: vendored to $LEAFLET_DIR"
|
||||
|
|
|
|||
|
|
@ -68,8 +68,13 @@ fetch() {
|
|||
return
|
||||
fi
|
||||
echo " fetch $src"
|
||||
curl -fsSL --progress-bar "$BASE_URL/$src" -o "$dst"
|
||||
verify_sha "$src" "$dst"
|
||||
# Download to a temp name and move into place only after
|
||||
# verification: an interrupted curl must never leave a partial
|
||||
# file at the final path, where the present-file skip (or, for an
|
||||
# unpinned file, nothing at all) would accept it forever.
|
||||
curl -fsSL --progress-bar "$BASE_URL/$src" -o "$dst.part"
|
||||
verify_sha "$src" "$dst.part"
|
||||
mv "$dst.part" "$dst"
|
||||
}
|
||||
|
||||
if [ ! -f "$CHECKSUMS" ]; then
|
||||
|
|
|
|||
Loading…
Reference in New Issue