#!/usr/bin/env python3 """ archive.py — Build-time link-archiving tool for levineuwirth.org. Reads archive/manifest.yaml, fetches any manifest URL that has no local artifact yet, stores it under archive//, extracts readable text, writes the per-entry archive//PROVENANCE.json, and (re)writes the Hakyll input data/archive-index.json. Two artifact types: * pdf — downloaded directly, stored as document.pdf, text via pdftotext. * html — snapshotted with `monolith` into a single self-contained snapshot.html (JavaScript stripped, assets inlined as data URIs), a restrictive Content-Security-Policy injected, text extracted with BeautifulSoup. Subcommands: fetch download missing artifacts, (re)generate sidecars + index refresh deliberately re-snapshot a single entry, recording the prior SHA in the new PROVENANCE.json's `previous-sha256` wayback submit archived URLs to the Wayback Machine as a second, independent copy; backfill the capture URL into PROVENANCE.json check HEAD/GET-probe every manifest URL for link rot, updating data/archive-state.json with asymmetric hysteresis gc delete archive// directories listed in archive/removed.yaml Failure policy: * Integrity errors — a committed artifact whose SHA-256 no longer matches PROVENANCE.json, or a slug whose manifest URL has changed — print loudly and exit non-zero, halting `make build`. * Transient errors — a network failure, an over-cap download, a missing `monolith` binary, a manifest entry missing its `url:` — print a warning, skip that entry, and exit zero so the build proceeds (the entry is retried on the next build). See ARCHIVE.md for the full design. Gated on .venv by the Makefile (same convention as embed.py). Non-stdlib dependencies: PyYAML and beautifulsoup4, both already in pyproject.toml. External tools: `pdftotext` (poppler) for PDF text, and the `monolith` binary — vendored at tools/bin/monolith, see tools/monolith-version.txt. """ from __future__ import annotations import datetime import hashlib import json import os import re import shutil import subprocess import sys import urllib.error import urllib.request from pathlib import Path from urllib.parse import parse_qsl, quote, urlencode, urlparse, urlunparse import yaml # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- REPO_ROOT = Path(__file__).resolve().parent.parent ARCHIVE_DIR = REPO_ROOT / "archive" MANIFEST = ARCHIVE_DIR / "manifest.yaml" REMOVED = ARCHIVE_DIR / "removed.yaml" INDEX_OUT = REPO_ROOT / "data" / "archive-index.json" STATE_OUT = REPO_ROOT / "data" / "archive-state.json" ROT_FAILS = 3 # consecutive failed scans before `rotted` is considered ROT_DAYS = 14 # ... and the streak must also span at least this many days SIZE_CAP = 25 * 1024 * 1024 # 25 MB per-artifact cap TIMEOUT = 60 # seconds, per network request WAYBACK_TIMEOUT = 120 # seconds — Save Page Now is slow USER_AGENT = ("levineuwirth.org/archive " "(ln@levineuwirth.org; removal requests honored)") # Per-type on-disk names. The artifact is committed; the .txt is generated # (gitignored) and regenerated whenever the artifact's SHA-256 changes. ARTIFACT = {"pdf": "document.pdf", "html": "snapshot.html"} TEXTFILE = {"pdf": "document.txt", "html": "snapshot.txt"} # Injected into every HTML snapshot's . Permits exactly what a # faithful monolith capture needs — inlined images/fonts as data URIs and # inline styles (as