diff --git a/.gitignore b/.gitignore
index 651615a..e7ca3f8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,10 +69,20 @@ data/similar-links.json
 data/backlinks.json
 data/build-stats.json
 data/build-start.txt
+data/build-stamp.txt
 data/last-build-seconds.txt
 data/semantic-index.bin
 data/semantic-meta.json
 
+# Archive: generated text + staleness stamp (rebuilt deterministically from
+# the committed artifact, so committing them is churn), plus the tool-written
+# index and link-rot state. archive/**/PROVENANCE.json is deliberately NOT
+# ignored — it is the committed, immutable record of each archival event.
+archive/**/*.txt
+archive/**/*.txt.sha256
+data/archive-index.json
+data/archive-state.json
+
 # IGNORE.txt is for the local build and need not be synced.
 IGNORE.txt
 
diff --git a/ARCHIVE.md b/ARCHIVE.md
new file mode 100644
index 0000000..0c1d546
--- /dev/null
+++ b/ARCHIVE.md
@@ -0,0 +1,1535 @@
+# Archive
+
+Design and implementation plan for the link-archiving system of levineuwirth.org.
+This is the source of truth for how external references are preserved, hosted,
+displayed, and indexed. It sits alongside `WRITING.md`, `PHOTOGRAPHY.md`,
+`HOMEPAGE.md`, and `MARKS.md` as authoritative spec.
+
+## Status
+
+**Reviewed and ratified 2026-05-21, with revisions.** The original draft was
+reviewed against the live site over three rounds; the decisions below
+incorporate every round of deltas and are now locked.
+
+**Phase 1 complete (2026-05-22).** PDF entries: `archive/manifest.yaml`,
+`tools/archive.py` (`fetch` + `gc`), `build/Archive.hs`, the four templates,
+and the Makefile / `head.html` / `.gitignore` wiring are built and verified —
+`/archive/` and `/archive/nist-fips-203/` render.
+
+**Phase 2 complete (2026-05-22).** HTML snapshots: the pinned `monolith`
+binary is vendored at `tools/bin/monolith`, `archive.py fetch` snapshots HTML
+pages (CSP injected, text extracted, quality classified), and `archive.html`
+renders them in a sandboxed iframe — `/archive/djb-aes-speed/` renders. The
+cross-browser CSP check and the per-snapshot review remain author-gated by
+design.
+
+**Archive pages styled (2026-05-22).** `static/css/archive.css` gives the
+index and entry pages a framed treatment (banner callout, provenance panel,
+artifact viewer); the PDF embed was changed to the raw `document.pdf`
+(browser-native viewer), symmetric with HTML snapshots — see the Display — PDF decision.
+
+**Phase 3 complete (2026-05-22).** Link annotation + Wayback: `Filters/Archive.hs`
+appends an archive affordance to body links whose target is archived;
+`archive.py wayback` (+ `make archive-wayback`) backfills Wayback captures;
+`visibility: private` keeps an entry's artifact in-repo but undeployed.
+Bibliography annotation is documented as a `Citations.hs` follow-up.
+
+**Phase 4 complete (2026-05-22).** Backlinks + similar-pages: `Backlinks.hs`
+keeps archived external links and canonicalises them to their `/archive/<slug>/`
+page, so an archived work lists every essay that cites it under "Referenced by"
+(grouped by the fragment each citation targets); `archive.html` also carries a
+"Related" block from the `embed.py` similarity corpus, which now indexes archive
+pages and excludes the `/archive/` index.
+
+**Phase 5 complete (2026-05-22).** Link-rot detection: `tools/archive.py check`
+(+ `make archive-check`) HEAD/GET-probes every manifest URL and updates the
+gitignored `data/archive-state.json` under asymmetric hysteresis (`rotted`
+needs 3 fails over ≥14 days; a single success recovers immediately).
+`Filters.Archive` flips a body link to the archive when its target is `rotted`;
+each archive page surfaces its link status (provenance row, header note,
+Pagefind `status` filter tag); `/archive/` flags rotted entries; `/build/`
+gains a "Link archive" telemetry section. The search-UI `status` filter wiring
+in `search-filters.js` is deliberately partial — see the Phase 5 Met note.
+
+**All five phases done.** Refinements next; see the Phase 5 Met note for the
+documented deferrals (search-UI status filter; bibliography annotation from
+Phase 3; pull-from-Wayback at fetch time).
+
+**Refinements (2026-05-22).** A code-review pass found and fixed several
+correctness and posture issues across the system:
+
+- **Missing committed artifact no longer re-fetches silently.** `cmd_fetch`
+  used to skip its SHA guard when the artifact was absent and then download
+  fresh bytes whose hash differed from the recorded `sha256` — replacing the
+  recorded snapshot without surfacing it. The guard now also halts when
+  `PROVENANCE.json` is present but the artifact is missing, requiring the
+  author to restore the committed bytes before rebuilding.
+- **`archive/removed.yaml` is now enforced in `fetch` and `check`.** It was
+  only read by `gc`. A removed URL re-added to the manifest now halts
+  `cmd_fetch` loudly; `cmd_check` skips removed URLs so the link-rot
+  scanner does not keep probing a deliberate takedown.
+- **SHA verification closed the `.venv`-bypass hole.** The original
+  decision relied solely on `archive.py fetch` re-hashing, but that step is
+  `.venv`-gated — a contributor or deploy host without `.venv`, or a direct
+  `cabal run site -- build`, would publish a tampered artifact unchecked.
+  `build/Archive.hs` now also re-hashes via `sha256sum` from
+  `loadArchiveEntries` and halts the build on a mismatch, so the guarantee
+  holds independent of the Python step.
+- **Raw artifacts are no longer publicly indexable.** Pass 1 added a
+  `robots.txt` `Disallow: /archive/`, which pass 2 then reverted (see
+  below — it was counter-productive). Pass 1's other change — injecting
+  `<meta name=robots content="noindex, noarchive">` into every new HTML
+  snapshot alongside the archive CSP — remains in place; the
+  deploy-side header for raw PDFs landed in pass 2 as `nginx/archive.conf`.
+- **The documented `archive.py refresh {slug}` subcommand is implemented.**
+  It clears the slug's directory, re-fetches via `cmd_fetch`, and records
+  the prior `sha256` as `previous-sha256` in the new `PROVENANCE.json`. The
+  URL-changed error message in `cmd_fetch` now points at it instead of
+  asking the author to delete the directory by hand.
+- **`url_aliases` widened** to the design's full equivalent-URL set:
+  tracking-parameter stripping (`utm_*`, `fbclid`, `gclid`, `mc_*`, `ref`,
+  `igshid`, `_hsenc`, `_hsmi`, `mkt_tok`) and arXiv abs / pdf / versioned /
+  `.pdf` form expansion. Phase 1 had deliberately kept these as a Phase 4
+  deferral, but Phase 4 missed the follow-through.
+- **`X-Robots-Tag: noarchive` is now honoured on both HEAD and GET.** Some
+  servers omit the header on HEAD but emit it on GET; HTML capture now
+  aborts if either response carries the directive.
+
+Three smaller items remain documented and deferred:
+
+- **Archive tags joining the site-wide tag indexes.** `manifest.yaml`'s
+  `tags:` is authored but `Tags.hs`/`Patterns.tagIndexable` does not yet
+  ingest archive entries — it needs a Tags.hs-side integration with its
+  own design pass (archive pages aren't `match`ed Hakyll items in the
+  normal way).
+- **`archive.py suggest`** (bibliography discovery — diff `.bib` URLs
+  against the manifest) is documented but not implemented.
+- **The controlled-host end-to-end link-rot test** (reserve
+  `archive-test.levineuwirth.org`, run it through a 14-day-spanning fail
+  streak, watch the flip happen) is inherently a multi-week real-world
+  verification the author runs; the hysteresis logic is unit-tested
+  deterministically and the rendering side is verified by a hand-crafted
+  `rotted` state file.
+
+**Refinements pass 2 (2026-05-23).** A second code-review pass surfaced
+correctness gaps the first pass missed:
+
+- **`refresh` is now atomic.** It used to delete the slug directory and
+  then call `cmd_fetch`; a failed re-fetch left the entry with no
+  snapshot at all, while `refresh` returned 0 (because `cmd_fetch`
+  reports per-entry skips, not a process failure). The slug directory is
+  now *renamed* to a `.refresh-backup` sibling; success removes the
+  backup, any failure restores it. Verified by hiding the `monolith`
+  binary and confirming the prior snapshot survives intact.
+- **Invalid `visibility` values fail closed.** The `ManifestEntry` parser
+  used to accept any string and only treat the exact `"private"` as
+  private — a typo like `privte` would publish a work the author intended
+  to keep offline. The parser now rejects any value other than `public`
+  or `private`, and `readManifest` halts the build on any parse error of
+  a present file (instead of warning + returning an empty list — that
+  silent-skip was for `file absent`, not `file present but corrupt`).
+- **Lookup-side URL normalisation.** Alias generation alone cannot cover
+  unbounded forms (arXiv versions, arbitrary tracking-parameter
+  combinations). `ArchiveIndex` now normalises both index keys and
+  lookup inputs through the same `normalizeUrl` (drop fragment, strip
+  tracking, fold http→https, arXiv-canonicalise, trim trailing slash).
+  Verified: `https://cr.yp.to/aes-speed.html`,
+  `https://cr.yp.to/aes-speed.html?utm_source=mail`, and
+  `http://cr.yp.to/aes-speed.html/` all match the same archived entry.
+- **Raw-artifact indexing posture corrected.** The Phase-5 `robots.txt`
+  `Disallow: /archive/` was counter-productive: a URL blocked by
+  robots.txt can still appear in results when externally linked, and the
+  Disallow also prevents compliant crawlers from reading the wrapper
+  pages' `<meta name=robots>`. The Disallow is reverted; a new
+  `nginx/archive.conf` snippet emits `X-Robots-Tag: noindex, noarchive`
+  for the whole `/archive/` tree, which crawlers honour for any resource
+  (HTML and PDF alike). The deploy vhost should `include
+  snippets/archive.conf`.
+- **`cmd_wayback` skips `removed.yaml`.** The eviction procedure says
+  record in `removed.yaml` *before* dropping the manifest line; `fetch`
+  and `check` now honour that ordering, but `wayback` did not. A removed
+  entry whose manifest line was still in place could be submitted to a
+  third-party archive after a takedown was recorded.
+- **The shipped HTML snapshot was refreshed in the working tree** so it
+  carries the noarchive meta the Phase-5 inject promises. `archive.py
+  refresh djb-aes-speed` re-fetched cr.yp.to, applied
+  `inject_archive_metas`, and recorded the prior SHA as `previous-sha256`.
+  `archive/djb-aes-speed/{snapshot.html, PROVENANCE.json}` now reflect the
+  new bytes; matching SHA is verified by `Archive.hs`. *Caveat surfaced
+  in pass 3 (below): the prior snapshot was not committed at the moment
+  of this refresh, so its bytes are no longer recoverable via `git log
+  -S`. A pass-3 fix to `refresh` now refuses to replace an uncommitted
+  prior, but the historical gap remains — this entry's `previous-sha256`
+  records a hash whose bytes this working tree cannot reproduce.*
+- **The URL-changed error in `cmd_fetch`** now points at
+  `archive.py refresh {slug}` instead of asking the author to delete the
+  directory by hand.
+
+Tag integration remains the one deferred refinement (it needs a Tags.hs
+design pass).
+
+**Refinements pass 3 (2026-05-23).** A third audit surfaced gaps the pass-2
+fixes didn't fully close:
+
+- **`refresh` refuses to replace an uncommitted prior snapshot.** Pass 2
+  preserved a prior snapshot through *failed* re-fetches, but a *successful*
+  one happily discarded uncommitted bytes — `previous-sha256` then pointed
+  at a hash no `git log -S` could recover. Pass 3 shells out to `git
+  ls-files` + `git diff --quiet HEAD` and refuses the refresh unless both
+  the prior PROVENANCE.json and its artifact are tracked and clean.
+- **`refresh` is atomic across *every* exit path.** Pass 2 handled the
+  ordinary `cmd_fetch returns 0 but the artifact wasn't produced` case but
+  not fatal `sys.exit`s (e.g. a `removed.yaml` conflict halting `cmd_fetch`
+  mid-refresh) nor mid-refresh exceptions, and it never rolled back the
+  `data/archive-index.json` rewrite. The work is now wrapped in
+  `try/finally` that restores both the slug directory and the index on any
+  exit path — normal failure, `SystemExit`, `KeyboardInterrupt`, or
+  exception.
+- **Removal enforcement now uses the same equivalence as link matching.**
+  Pass 2 introduced `normalizeUrl` for incoming citations but compared
+  removals as literal URL strings, so a tracking-laden manifest URL could
+  bypass a takedown. Python gains `normalize_url` mirroring the Haskell
+  helper, and `fetch` / `check` / `wayback` compare normalised forms.
+  `cmd_fetch` additionally rejects two manifest entries whose canonical
+  forms collide — that would otherwise route both under one slug.
+- **`fetch_html` honours `X-Robots-Tag: noarchive` on the captured GET too.**
+  Pass 1 added HEAD + ranged-GET probes, but a server can emit the header
+  only on the full document response. The Python tool now downloads that
+  response itself, checks its header and body directives, then passes those
+  exact bytes to `monolith --base-url ... -` so the saved snapshot is not
+  obtained through a second unobservable document request.
+- **`nginx/archive.conf` is wired into the deploy template** and
+  re-`include`s `security-headers.conf` inside its `location` block.
+  `nginx/vhost.conf.example` now includes `archive.conf`; the snippet
+  itself re-emits the baseline headers because nginx's `add_header` chain
+  is inherited from a parent only when the current context declares *no*
+  `add_header` directives — without the re-include, /archive/ would lose
+  HSTS, CSP, etc.
+- **Contract doc cleanups.** The Phase-5 paragraph claiming `robots.txt`
+  disallows `/archive/` is reworded to acknowledge the pass-2 reversal;
+  the Phase-1 checkbox claiming `Archive.hs` does not re-hash is updated
+  to point at `verifyArtifactSha`; the pass-2 note about the refreshed
+  djb snapshot now carries the caveat that its prior bytes were
+  uncommitted and are therefore unrecoverable.
+
+The historical `previous-sha256` value in
+`archive/djb-aes-speed/PROVENANCE.json` is left in place: it is a truthful
+record that *a* prior snapshot existed and what its hash was. It just is not
+recoverable from git in this working tree — the pass-3 `refresh` precondition
+exists so that property is never broken again.
+
+**Refinements pass 4 (2026-05-23).** A fourth audit completed the
+failure-closed paths:
+
+- **Direct Hakyll builds now enforce removals and missing-artifact failures.**
+  `Archive.hs` reads `removed.yaml`, rejects normalized manifest conflicts
+  and duplicate archive targets, and aborts if provenance exists without its
+  artifact. `ArchiveIndex.hs` filters the generated index through the live
+  manifest minus normalized removals, so a stale ignored index cannot retain
+  archive affordances after a takedown when `archive.py` was skipped.
+- **`refresh` verifies the prior bytes before replacing them.** A prior
+  snapshot must now be present, tracked, clean, and match its recorded
+  SHA-256 before its hash can be written into `previous-sha256`.
+- **Failed refresh restores an originally-absent index state.** If
+  `data/archive-index.json` did not exist before a failed refresh, any index
+  created by the attempted fetch is deleted during rollback.
+
+The genuinely-open questions that remain are collected at the end — the list is
+short.
+
+---
+
+## Motivation
+
+The site cites external work — papers, articles, blog posts, documentation.
+Three things go wrong with a plain hyperlink over time:
+
+1. **Link rot.** The target moves, paywalls, or vanishes. A 2019 essay's
+   citations decay silently; nobody notices until a reader clicks.
+2. **Content drift.** The target stays up but changes. The sentence you quoted
+   is no longer the sentence at that URL.
+3. **Opacity to the site's own machinery.** An external link is invisible to
+   `Backlinks.hs` (`isPageLink` drops every `http(s)://` URL) and to
+   `embed.py` (it indexes only `_site/**/*.html`). The site knows nothing about
+   the things it most often points at. A paper cited by six essays has no page,
+   no backlinks list, no place in any "Related" set.
+
+The archive fixes all three by keeping a **local, hosted, immutable snapshot**
+of each referenced work, giving it a stable URL on this domain, and making that
+URL a first-class citizen of the existing backlinks and similar-pages systems.
+
+This is deliberately *not* a general web crawler. It archives a curated set:
+the things this site references. The author adds a URL to a manifest; the build
+does the rest.
+
+### Relationship to existing pieces
+
+| Existing piece | What it does | Why the archive is different |
+|----------------|--------------|------------------------------|
+| `static/papers/` | Hosts Levi's **own** typeset PDFs (`preprint:`, `{{pdf:}}`) | The archive holds **third-party** works. Distinct directory, distinct purpose. Never conflate the two. |
+| nginx `popup-proxy.conf` | Caches **metadata** (title/abstract) from arXiv / archive.org / PubMed for hover previews | Caches structured metadata, not documents. A preview accelerator, not preservation. |
+| `Backlinks.hs` | Inverts **internal** links into a "who links here" map | Indexes site content only; external URLs are dropped. The archive makes referenced works internal enough to index. |
+| `embed.py` / `SimilarLinks.hs` | Semantic "Related" block from `_site/**/*.html` embeddings | Only sees site pages. Archived works become site pages, so they enter the embedding corpus for free. |
+
+---
+
+## Goals
+
+- **Preservation.** Every referenced work the author chooses to archive has a
+  byte-for-byte local snapshot that survives the original going dark.
+- **Stable hosting.** Each snapshot is reachable at a permanent
+  `/archive/{slug}/` URL on levineuwirth.org, rendered in site chrome.
+- **Hyperlink-able.** Archive URLs are ordinary internal links: usable in
+  prose, wikilinks, citations, and `further-reading`.
+- **Indexed.** Archived works appear in the **backlinks** ("Referenced by") and
+  **similar-pages** ("Related") systems exactly as native content does — and,
+  where the source structure allows, granularly by section.
+- **Curated, low-friction.** Adding an archive is one line in one manifest.
+  Everything else — fetch, text extraction, page generation, indexing — is
+  automatic and build-time.
+- **Static-friendly.** Every archive page renders at build time; JS is layered
+  on, never required. Matches the rest of the site's contract.
+- **Honest.** Archive pages never impersonate the original. They are framed as
+  archived copies, link prominently to the source, are kept out of search
+  engines, and carry a real, advertised removal channel on every page.
+- **Safe by default.** No build step ever deletes or overwrites a committed
+  artifact; destruction and replacement are always explicit, opt-in acts.
+
+---
+
+## Decisions (locked)
+
+| Topic | Decision | Rationale |
+|-------|----------|-----------|
+| Trigger | Curated manifest, not auto-crawl | Archives what the site *references*, not the web. Legally and operationally sane. |
+| Authored input | One hand-edited file: `archive/manifest.yaml` | One line per archived link. Mirrors `data/commonplace.yaml`'s authoring model. |
+| Bibliography seeding | **Rejected** as auto-seeding. `make archive-suggest` prints a "cited but not archived" diff; the author copies lines by hand. | Keeps the manifest the *identity* of the archive, not a cache of the `.bib` files. |
+| Per-entry provenance | `archive/{slug}/PROVENANCE.json`, committed — immutable for the current snapshot | An immutability claim that isn't in version control isn't immutable. |
+| Mutable state | `data/archive-state.json`, gitignored — link-rot status only | Strict split: immutable facts committed, volatile status disposable. |
+| Hakyll input | `data/archive-index.json` — `url` + aliases → slug, written by the tool | Minimal stable shape for the Haskell side; treated like `data/annotations.json`. |
+| Missing-index behaviour | `Backlinks.hs` and `Filters/Archive.hs` silently no-op when `archive-index.json` is absent | Preserves the established `.venv`-gated silent-skip convention. The archive degrades to invisible, never to an error. |
+| `fetch` idempotence | `fetch` is keyed on `(slug, url)` together; a slug whose recorded URL has changed is refused, not overwritten. `fetch` always rewrites `archive-index.json` to mirror the manifest. | A committed artifact is replaced only by an explicit `refresh`, never as a `fetch` side effect. |
+| Artifact storage | `archive/{slug}/` at repo root, **committed to git** | A preservation guarantee that depends on an un-versioned store is weaker. Repo stays reproducible. |
+| Per-artifact size cap | 25 MB; `archive.py fetch` warns and skips above it; `git add -f` to override deliberately | A 200 MB scan must never land in an auto-commit silently. |
+| Storage migration | If `archive/` exceeds ~5 GB or doubles year-over-year, evaluate a separate archive repo / object store. **Never git LFS.** | LFS breaks `git clone → make build` reproducibility — a regression for a preservation system. |
+| HTML snapshots | `monolith -j` → one self-contained HTML file; the pinned `monolith` binary is committed at `tools/bin/monolith` | Single static binary, no headless browser. Strips JS. Committing it (vs downloading) removes a network dependency and keeps the build reproducible from a bare clone. |
+| PDF snapshots | Direct download via `requests` | Papers are usually clean PDF URLs (arXiv etc.). |
+| Display — PDF | The raw `document.pdf` in an `<iframe>` — the browser's native PDF viewer renders it | A hyperlinked archive should display the document exactly as it is. Symmetric with the HTML snapshot (both embed the raw artifact); no PDF.js wrapper. `static/pdfjs/` stays vendored for the site's own `{{pdf:}}` embeds. |
+| Display — HTML | Snapshot in a sandboxed `<iframe>` (`referrerpolicy="no-referrer"`, no `allow-scripts`) + CSP `<meta>` baked into the snapshot + extracted text in the wrapper | Sandbox isolates markup; CSP is defense-in-depth; no-referrer stops leaking the reading path; extracted text feeds indexing. |
+| Snapshot quality | Recorded per entry (`ok` / `degraded` / `js-required`); degraded snapshots flagged on `/archive/` and `/build/` | `monolith` fails quietly on lazy-loaded images and SPAs; silent degradation is the enemy. |
+| Index thumbnails | **Dropped for v1.** `/archive/` is a text list. | At v1 scale a text list is faster to scan and to build than a thumbnail grid; revisit past ~50 entries (it is a deferred capability, not a rejected one). |
+| Second archive | Submit every URL to the Wayback Machine — **non-blocking**; record the URL when it returns, backfill via `make archive-wayback` | Belt-and-suspenders, never on the critical path of a build. |
+| URL scheme | `/archive/{slug}/` | Permanent, human-readable, internal. |
+| URL matching | `archive-index.json` carries each entry's equivalent-URL aliases; **only tracking parameters** are stripped, other query parameters preserved; backlinks match any alias | Without it, "Referenced by" silently under-counts; blanket query stripping would over-match. |
+| Homepage portal | No | Infrastructure, not a content section. Reachable from `/archive/`, `/colophon`, footer. |
+| Search engines | `noindex` on every archive page | Preserving, not republishing or competing with originals. |
+| `robots.txt` | Not gated: a curated single-shot fetch of an already-cited URL is not crawling. But honour `X-Robots-Tag: noarchive` and `<meta name="robots" content="noarchive">`; skip anything behind authentication. | Matches Save-Page-Now / reference-manager norms. The load-bearing ethic is the removal channel, not `robots.txt`. |
+| Removal channel | A request to `ln@levineuwirth.org` is honoured; advertised on `/archive/`, on **every archive page**, and in the fetcher's User-Agent string | This is the real ethical commitment `robots.txt` only proxies for. |
+| Pagefind | Archived full text is indexed, tagged by `type: archive` and by link-rot `status` | Searching everything you've cited is a feature; the tags let results be filtered or excluded. |
+| Visibility levels | `public` (default) / `private` | `private` keeps the artifact in-repo but undeployed, for content not safe to redistribute. |
+| Paywalled originals | A manual `paywalled: true` manifest flag — **not** an automated scanner state. Soft paywalls return `200` and cannot be reliably detected. | Drives a banner note only, never a link flip. |
+| Eviction | Opt-in `make archive-gc`, **never part of `make build`**. Procedure: record in `removed.yaml` *first*, then drop the manifest line, then GC. GC deletes only slugs listed in `removed.yaml`. | A rename, branch-switch, or typo'd manifest edit must not silently eat committed artifacts. |
+| Snapshot mutability | Immutable for the current snapshot; `archive.py refresh` deliberately replaces it | A stable citation target must not move under readers — except by an explicit act. |
+| Rot hysteresis | Asymmetric: `rotted` requires 3 consecutive failed scans over ≥ 14 days; one failure is `error`. Recovery is immediate — a single success → `live`. | A transient failure must not flip a live citation; a recovered original should be reached eagerly, so un-rotting needs no delay. |
+| SHA verification | Both `archive.py fetch` *and* `build/Archive.hs` re-hash every committed artifact against `PROVENANCE.json` and halt non-zero on a mismatch. `archive.py` runs first in `make build`; `Archive.hs` shells out to `sha256sum` from `loadArchiveEntries`, so the integrity guarantee holds even when `archive.py` did not run (no `.venv`, a direct `cabal run site -- build`, or a deploy host that bypasses `make build`). | The original "Python tool is the sufficient enforcement point" assumption was unsafe: the Python step is `.venv`-gated, and a contributor or deploy without it could publish a tampered artifact unchecked. Two enforcement points cost a `sha256sum` call per entry and close the hole. |
+
+---
+
+## Content model & directory structure
+
+```
+archive/
+├── manifest.yaml                       # AUTHORED — the curated list of links
+├── removed.yaml                        # AUTHORED — record of evicted entries
+├── arxiv-2403-12345/
+│   ├── document.pdf                    # the snapshot (committed)
+│   ├── PROVENANCE.json                 # immutable archival facts (committed)
+│   ├── document.txt                    # extracted text (gitignored, regenerated)
+│   └── document.txt.sha256             # artifact SHA the .txt was built from (gitignored)
+├── gwern-net-scaling-hypothesis/
+│   ├── snapshot.html                   # self-contained monolith snapshot (committed)
+│   ├── PROVENANCE.json                 # immutable archival facts (committed)
+│   ├── snapshot.txt                    # extracted readable text (gitignored)
+│   └── snapshot.txt.sha256             # artifact SHA the .txt was built from (gitignored)
+└── ...
+```
+
+- `archive/` is a top-level directory, sibling to `content/`, `static/`, and
+  `data/` — **not** under `content/`. Files in `content/` are author-written
+  Markdown processed by Pandoc; `archive/` holds raw third-party artifacts plus
+  the manifest and provenance.
+- One directory per entry, keyed by **slug**.
+- Committed: the artifact (`document.pdf` / `snapshot.html`) — the preservation
+  payload — and `PROVENANCE.json` — the immutable record of the archival event.
+- Gitignored: the regenerable extracted text (`*.txt`) and its staleness stamp
+  (`*.txt.sha256`) — deterministic from the committed artifact, so committing
+  them is pure churn. This mirrors the photography sidecar and `*.webp`
+  companion rules already in `.gitignore`.
+- `make build`'s auto-commit stages `content/` **only**. Changes under
+  `archive/` (new artifacts, `PROVENANCE.json`, manifest edits) are committed
+  **deliberately by the author**. This is a feature, not a gap: it is the
+  eyeball-before-commit checkpoint where a degraded snapshot gets caught.
+
+### Authored input — `archive/manifest.yaml`
+
+The **only** file the author edits for normal operation. Adding an archive =
+adding one list item. Minimum is a bare `url:`; everything else is optional or
+auto-derived.
+
+```yaml
+# archive/manifest.yaml — curated list of works to preserve.
+# Edited by hand. Tools never write to this file.
+# Per-artifact cap: 25 MB. Above that, archive.py warns and skips the fetch;
+# commit an oversize artifact deliberately with `git add -f`.
+# To evict an entry, see archive/removed.yaml — record there FIRST, then
+# delete the line here, then run `make archive-gc`.
+
+- url: "https://arxiv.org/abs/2403.12345"
+  # slug:  auto-derived → arxiv-2403-12345  (override only to disambiguate)
+  # title: auto-derived from the artifact / popup-proxy metadata
+  # type:  auto-detected (pdf | html)
+  tags: [research/ml]              # optional — same slash-hierarchy as content
+  note: >                          # optional — why this is referenced
+    Cited in the scaling-laws essay; section 4 is the load-bearing part.
+
+- url: "https://www.gwern.net/Scaling-hypothesis"
+  type: html                       # optional override when detection is wrong
+  visibility: public               # public (default) | private
+
+- url: "https://example.com/paywalled-report"
+  paywalled: true                  # author-set; the original sits behind a paywall
+  visibility: private              # archived for the author; artifact not deployed
+```
+
+| Field | Required | Notes |
+|-------|----------|-------|
+| `url` | yes | The original URL. The identity of the entry. |
+| `slug` | no | Override the auto-derived slug. Must be unique. |
+| `title` | no | Override the auto-derived title. |
+| `type` | no | `pdf` \| `html`. Auto-detected from `Content-Type` / extension. |
+| `tags` | no | Slash-hierarchy tags (`Tags.hs`). Intended to place the work on tag indexes; the `Tags.hs` integration is still deferred (see Status). |
+| `note` | no | Author's reason for archiving; shown on the archive page. |
+| `visibility` | no | `public` (default) or `private`. |
+| `paywalled` | no | Author-set flag: the original is gated. Declared, not inferred — no reliable automated detection exists. Drives a banner note only. |
+| `source-date` | no | Publication date of the original, if known. |
+
+### Per-entry provenance — `archive/{slug}/PROVENANCE.json`
+
+Committed alongside the artifact. Written by `tools/archive.py fetch` and then
+stable for the lifetime of that snapshot — `wayback` is the one field backfilled
+later (by `make archive-wayback`).
+
+**"Immutable" means immutable for the *current* snapshot, not forever.**
+`archive.py refresh` deliberately re-snapshots an entry and **replaces** both
+the artifact and its `PROVENANCE.json` (new `sha256`, new `archived` date),
+moving the old `sha256` into `previous-sha256`. A refresh is a conscious act;
+absent one, the file does not change.
+
+`PROVENANCE.json` holds the facts that make the archival claim verifiable:
+`tools/archive.py fetch` re-hashes every present artifact against the recorded
+`sha256` on every run — *before* the Hakyll build — and **exits non-zero on a
+mismatch, halting `make build`**. `build/Archive.hs` repeats the check on its own
+(the Haskell toolchain carries no SHA-256 library, so `loadArchiveEntries` shells
+out to `sha256sum`) and halts on a mismatch even when `archive.py` did not run;
+it also aborts the build when provenance exists without its artifact.
+
+```json
+{
+  "url": "https://arxiv.org/abs/2403.12345",
+  "slug": "arxiv-2403-12345",
+  "title": "Scaling Laws for Neural Language Models",
+  "type": "pdf",
+  "artifact": "document.pdf",
+  "sha256": "9f86d0818884...",
+  "previous-sha256": null,
+  "bytes": 2317004,
+  "archived": "2026-05-21",
+  "source-date": "2024-03-15",
+  "snapshot-quality": "ok",
+  "wayback": "https://web.archive.org/web/20260521.../https://arxiv.org/abs/2403.12345"
+}
+```
+
+`previous-sha256` is `null` on first fetch and set by `refresh` to the
+immediately-prior snapshot's hash, so the last prior snapshot is reachable
+(via `git log -S`) without deeper archaeology. `PROVENANCE.json` lives **with
+the artifact**, not in a rolling global file, so the immutable claim is
+genuinely immutable in git history.
+
+### Mutable state — `data/archive-state.json`
+
+Written **only** by `tools/archive.py check`. Holds the volatile link-rot
+status, keyed by URL. Gitignored (`data/` generated files already are); a fresh
+clone simply rebuilds it on the next scan. Until a scan has run, every entry
+renders as the safe default (`live`, no link flip).
+
+```json
+{
+  "https://arxiv.org/abs/2403.12345": {
+    "status": "live",
+    "checked": "2026-05-21",
+    "consecutive-failures": 0,
+    "status-since": "2026-05-21"
+  }
+}
+```
+
+`status` ∈ `live` / `moved` / `rotted` / `error` — set by the scanner.
+(`paywalled` is *not* here: it is a manual manifest flag, not a scanner state.)
+`consecutive-failures` + `status-since` implement the rot hysteresis (Phase 5).
+
+### Hakyll input — `data/archive-index.json`
+
+A small map written by `tools/archive.py fetch`, consumed inside the Hakyll
+build by `Backlinks.hs` and the link-annotation filter. **`fetch` always
+rewrites this file to mirror the current manifest exactly** — whether or not any
+network I/O occurred — so an entry un-listed from the manifest (even without a
+GC) immediately stops being treated as archived, and `Backlinks.hs` never keeps
+writing backlinks toward a slug whose page no longer exists. The index is cheap
+to recompute (manifest + provenance, no network) and must never lag the
+manifest. Kept separate from `archive-state.json` so the Haskell side loads a
+minimal, stable shape; treated exactly like the existing `data/annotations.json`
+build input.
+
+```json
+{
+  "https://arxiv.org/abs/2403.12345": {
+    "slug": "arxiv-2403-12345",
+    "type": "pdf",
+    "title": "Scaling Laws for Neural Language Models",
+    "aliases": [
+      "http://arxiv.org/abs/2403.12345",
+      "https://arxiv.org/abs/2403.12345v1",
+      "https://arxiv.org/abs/2403.12345v2",
+      "https://arxiv.org/pdf/2403.12345",
+      "https://arxiv.org/pdf/2403.12345.pdf"
+    ]
+  }
+}
+```
+
+`aliases` is the equivalent-URL set (see URL matching, under Backlinks). The
+Haskell side flattens it into an `alias → entry` lookup on load.
+
+**When `archive-index.json` is absent** — `.venv` not set up, or `archive.py`
+has never run — it is treated as empty: `Backlinks.hs` and `Filters/Archive.hs`
+silently no-op, and the build succeeds unchanged. This is the same
+`.venv`-gated silent-skip convention used by `embed.py` and the photography
+extractors. (This exact phrasing recurs below; it is the canonical statement of
+the property.)
+
+### Eviction & removal
+
+Removing an archived work is a first-class, supported operation — a takedown
+request, an author request, a legal concern, or a quality cull will arrive, and
+probably before the system is mature. The cardinal rule: **no build step ever
+deletes a committed artifact.** Deletion is opt-in and explicit.
+
+Procedure (documented in the `manifest.yaml` header comment), in order:
+
+1. **Record the removal in `archive/removed.yaml` first** — before touching the
+   manifest:
+
+   ```yaml
+   - url: "https://example.com/withdrawn-article"
+     slug: example-com-withdrawn-article
+     removed: 2026-06-01
+     reason: takedown        # takedown | author-request | legal | quality
+     note: "DMCA from X; see archived email."
+   ```
+
+   | Field | Required | Notes |
+   |-------|----------|-------|
+   | `url` | yes | The original URL (matches the manifest URL at time of removal) |
+   | `slug` | yes | The slug whose `archive/{slug}/` directory `make archive-gc` is authorized to delete |
+   | `removed` | yes | ISO date of removal |
+   | `reason` | yes | Closed enum: `takedown` \| `author-request` \| `legal` \| `quality` |
+   | `note` | no | Free-text context |
+
+2. Delete the entry's line from `manifest.yaml`.
+3. Run `make archive-gc` (opt-in; **never** invoked by `make build`). It deletes
+   only `archive/{slug}/` directories whose slug is recorded in `removed.yaml`.
+   A directory orphaned by a rename, a branch switch, or a typo'd manifest edit
+   — i.e. *not* in `removed.yaml` — is **never deleted**; it is reported to
+   stderr with its slug and a one-line hint, and `gc` exits non-zero while any
+   orphan is present (`--ignore-orphans` suppresses the non-zero exit once the
+   author has consciously reviewed them). The author commits the deletion.
+
+An orphaned `archive/{slug}/` directory (manifest line gone, not yet GC'd) is
+inert in the meantime: `Archive.hs` generates pages and routes artifacts only
+for current `manifest.yaml` entries, so an orphan produces no page and is not
+deployed.
+
+`removed.yaml` is **not** a hostile-tracking list. It exists so that (a)
+`make archive-gc` knows exactly what is safe to delete, (b) re-adding a removed
+URL to the manifest is surfaced loudly at build time, (c) the link-rot scanner
+skips removed entries instead of probing them forever, and (d) `make
+archive-suggest` never re-suggests a deliberately-removed work. A removed URL
+still cited from a site page falls back to the original-only link: no archive
+affordance, no backlink canonicalization.
+
+---
+
+## Routing & generated pages
+
+| URL | Source | Notes |
+|-----|--------|-------|
+| `/archive/` | Generated from `manifest.yaml` | Index of all archived works; text list, filter by type, tag, status |
+| `/archive/{slug}/` | Generated per manifest entry | The archive page — wrapper chrome + embedded snapshot |
+| `/archive/{slug}/document.pdf` | `archive/{slug}/document.pdf` | Raw artifact, copied through unchanged |
+| `/archive/{slug}/snapshot.html` | `archive/{slug}/snapshot.html` | Raw HTML snapshot, copied through unchanged |
+| `/archive/{tag}/` | Existing `Tags.hs` | Archive entries with tags join the normal tag indexes |
+
+`PROVENANCE.json` is build input, not a routed page — it is consumed by
+`Archive.hs`, not served (the archive page surfaces the relevant fields).
+
+Slugs are auto-derived as `{domain-stem}-{path-slug}` (e.g. `arxiv-2403-12345`,
+`gwern-net-scaling-hypothesis`), truncated, with a short hash appended on
+collision. A `slug:` field in the manifest overrides the derived value.
+
+`/archive/` is **not** a homepage portal — it is infrastructure. It is reachable
+from `/colophon` (where the site explains its own machinery), from the footer's
+infrastructure links, and optionally as a shelf on `/library.html`. The
+`/archive/` page also carries the removal-request notice.
+
+---
+
+## The archive page
+
+`/archive/{slug}/` is a **wrapper**: site chrome around a preserved artifact.
+Top to bottom:
+
+1. **Archive banner.** An unmissable strip: "Archived copy — snapshot taken
+   2026-05-21. View the original ↗". The original URL is the most prominent
+   link on the page. The page never pretends to be the source.
+2. **Metadata block.** Title, original URL, archive date, source publication
+   date, content hash (short form), file size, snapshot quality, the author's
+   `note`, the Wayback Machine link, and current link-rot `status`.
+3. **The artifact.**
+   - **PDF** — the raw `document.pdf` embedded in an `<iframe>`, rendered by
+     the browser's native PDF viewer. Deliberately *not* the site's PDF.js
+     viewer: a hyperlinked archive should display the document as it is.
+   - **HTML** — the `monolith` snapshot loaded in a sandboxed `<iframe>`:
+     `sandbox` without `allow-scripts` (JS already stripped at fetch time) and
+     `referrerpolicy="no-referrer"` (so a click inside the snapshot does not
+     leak `levineuwirth.org/archive/...` — and which essay the reader came
+     from — to the original site). The snapshot file itself carries a
+     restrictive `Content-Security-Policy` `<meta>` tag, injected at fetch time,
+     as defense-in-depth (see Fetch pipeline).
+4. **Full text.** The extracted readable text (`document.txt` / `snapshot.txt`)
+   rendered into the DOM — collapsed in a `<details>` for PDFs, inline for HTML.
+   This block is the load-bearing one for indexing: `embed.py` and Pagefind see
+   text, not an opaque iframe. It also gives readers a fast, styled, dark-mode
+   reading path that does not depend on the original's markup.
+5. **Referenced by.** The backlinks list — every site page that cites this work.
+   (See Backlinks integration.)
+6. **Related.** The similar-pages list — semantically near content, site pages
+   and other archives alike. (See Similar-pages integration.)
+
+A removal-request line — the `partials/archive-removal-notice.html` partial,
+carrying `ln@levineuwirth.org` — is included on **every** archive page and on
+`/archive/`. It is its own partial, included directly by `archive.html` and
+`archive-index.html`; the site-wide `page-footer.html` is *not* touched.
+
+The page carries `<meta name="robots" content="noindex">`. The `head.html`
+partial currently has no robots hook; adding a `noindex` context flag is part
+of Phase 1.
+
+---
+
+## Fetch & snapshot pipeline
+
+`tools/archive.py` — a Python tool, gated on `.venv`, silent-skip when absent,
+matching the established `embed.py` / `extract-exif.py` pattern. Subcommands:
+
+- `archive.py fetch` — for every manifest URL without an artifact: download it,
+  detect the type, store it, extract text, write `PROVENANCE.json`. Always
+  rewrites `archive-index.json` to mirror the manifest (see below). Records
+  `wayback: null` (filled in later). Incremental — only URLs without an
+  artifact incur network I/O.
+- `archive.py wayback` — submit URLs whose `PROVENANCE.json` has `wayback: null`
+  to the Wayback Machine; backfill the returned URL. (`make archive-wayback`)
+- `archive.py check` — the link-rot scan. (`make archive-check`, Phase 5)
+- `archive.py suggest` — scan `data/*.bib` for `url` and `doi` fields; a
+  DOI-only entry is resolved to its `https://doi.org/{doi}` form. Prints a diff
+  of works cited but not yet in `manifest.yaml`, **excluding any URL already in
+  `archive/removed.yaml`** — a deliberately-removed work is never re-suggested.
+  (`make archive-suggest`)
+- `archive.py gc` — delete `archive/{slug}/` directories whose slug is recorded
+  in `removed.yaml`. Orphan directories (not in `manifest.yaml`, not in
+  `removed.yaml`) are never deleted: each is reported to stderr with its slug
+  and a one-line hint, and `gc` exits non-zero while any orphan is present
+  (`--ignore-orphans` to override). (`make archive-gc`)
+- `archive.py refresh {slug}` — deliberately re-snapshot one entry, replacing
+  both the artifact and its `PROVENANCE.json`; the prior `sha256` is written to
+  `previous-sha256` and printed.
+
+### `fetch` is keyed on `(slug, url)` together
+
+If a slug's directory already exists and its `PROVENANCE.json` records a
+*different* URL than the manifest now gives — the author edited a URL but kept
+the slug — `fetch` **refuses to overwrite** the committed artifact. It prints
+`URL changed for {slug}: run 'archive.py refresh {slug}' to re-snapshot` and
+leaves the entry untouched. Overwriting a committed artifact is always an
+explicit act (`refresh`), never a side effect of `fetch` — the same principle
+as GC requiring `removed.yaml`.
+
+Regardless of whether any artifact was fetched, `fetch` finishes by rewriting
+`data/archive-index.json` from the current manifest + provenance, so the index
+can never lag a manifest edit.
+
+### PDF
+
+Direct download via `requests`, with a per-request timeout and the size cap
+(25 MB; warn + skip above). User-Agent:
+`levineuwirth.org/archive (ln@levineuwirth.org; removal requests honored)`.
+Stored as `document.pdf`; text extracted with `pdftotext`.
+
+### HTML
+
+`monolith -j {url}` produces a single self-contained HTML file: CSS, images,
+and fonts inlined as data URIs, JavaScript stripped (`-j`).
+
+`monolith` is a single static Rust binary — no headless browser. Unlike Leaflet
+and PDF.js (servable assets fetched at build time and gitignored), `monolith` is
+a build-time **executable**: the pinned linux-x86_64 binary is **committed** at
+`tools/bin/monolith`, with its version and sha256 recorded in
+`tools/monolith-version.txt`. Committing it removes a network dependency from
+`make build` and keeps the archive pipeline reproducible from a bare clone.
+(If the build host ever changes architecture, re-vendor the matching binary.)
+
+After capture, `archive.py` injects a CSP `<meta>` into the snapshot's `<head>`:
+
+```html
+<meta http-equiv="Content-Security-Policy"
+      content="default-src 'none'; img-src data:;
+               style-src 'unsafe-inline'; style-src-elem 'unsafe-inline';
+               style-src-attr 'unsafe-inline'; font-src data:;
+               script-src 'none'; object-src 'none'; frame-src 'none'">
+```
+
+`monolith` inlines images and fonts as data URIs, and inlines styles both as
+`<style>` elements *and* as inline `style=""` attributes — so `style-src-elem`
+and `style-src-attr` are spelled out alongside `style-src` to cover both in
+browsers that honour the granular directives. `script-src 'none'` /
+`object-src 'none'` / `frame-src 'none'` are explicit because `monolith` inlines
+SVGs as `data:` images, and an SVG can carry a `<script>` block — the iframe
+sandbox already blocks execution, but a belt-and-suspenders claim should not
+rely on the sandbox alone. This CSP permits everything a correct snapshot needs
+and blocks every network fetch and script a broken or malicious snapshot might
+attempt. Correct rendering under this CSP is verified cross-browser as a
+Phase 2 exit criterion. (An nginx `location ^~ /archive/` block may add the
+header at the HTTP level too; the baked-in `<meta>` is what makes `make dev`'s
+plain server safe.)
+
+**`monolith` failure modes** — capture is not always faithful, and fails
+*quietly*. Known cases: lazy-loaded images using `data-src` (common on Substack,
+Medium, modern blogs) are not resolved — the snapshot looks complete but is
+missing images; soft-paywalled pages (Medium, NYT) often serve full article
+HTML to the fetch and gate it with a client-side overlay, so `-j` yields a
+snapshot that *looks* like unauthorized access (it is not — the server sent it
+— but the optics are poor); `<picture>`/`srcset` sources are inconsistently
+inlined. `archive.py` therefore classifies each capture and records
+`snapshot-quality` ∈ `ok` / `degraded` / `js-required` in `PROVENANCE.json`;
+degraded captures are flagged on `/archive/` and `/build/`. The author reviews
+the rendered snapshot before committing `archive/` (Phase 2 exit criterion). A
+headless-browser fallback for `js-required` pages is deferred — see Open
+questions.
+
+### Wayback Machine — non-blocking
+
+Wayback submission is **never on the critical path of a build.** `archive.py
+fetch` records `wayback: null` and moves on. `make archive-wayback` runs
+separately, POSTs the outstanding URLs to `https://web.archive.org/save/`
+(retrying transient 5xx, tolerating rate limits and hangs), and backfills the
+returned timestamped URL into each `PROVENANCE.json`. This second, independent
+copy means a rotted entry whose local artifact is somehow lost still has a
+fallback. If the original is *already* dead at first fetch, `archive.py fetch`
+pulls the most recent existing Wayback capture instead.
+
+### Politeness & safety
+
+The manifest is author-controlled, so SSRF is not a real threat, but the tool
+still: sets per-request timeouts, enforces the 25 MB cap, rate-limits to one
+request per host at a time, and identifies itself honestly. Beyond that:
+
+- **Honour `X-Robots-Tag: noarchive`** — and the equivalent
+  `<meta name="robots" content="noarchive">` in an HTML response body (cheap to
+  check: it is in the head of the document just fetched). If either is present,
+  the fetch is abandoned and the manifest entry flagged. This is the directive
+  that actually governs *archiving* (as opposed to crawling); respecting it
+  costs nothing and makes the posture defensible.
+- **Skip authenticated content.** `archive.py` never sends cookies or
+  credentials. If a URL needs authentication, it is not archived; at most it is
+  a manual `visibility: private` artifact.
+- **Fetches are not gated on `robots.txt`.** A curated, single-shot, attributed,
+  `noindex`'d fetch of a URL the site already cites is not crawling — it is the same
+  operation a reader's browser performs on click. This matches Save-Page-Now
+  and reference-manager norms. The load-bearing ethical commitment is the
+  removal channel, advertised on `/archive/`, on every archive page, and inside
+  the User-Agent string.
+
+---
+
+## Text extraction & indexing
+
+The "Full text" block is what makes an archived work *indexable* rather than an
+opaque blob. Extraction:
+
+- **PDF** → `pdftotext` (from `poppler`, already a build dependency for the
+  `pdf-thumbs` Makefile target). Stored as `document.txt`.
+- **HTML** → readable text pulled from the `monolith` snapshot with
+  `BeautifulSoup` (already a dependency of `embed.py`). Headings are preserved.
+  Stored as `snapshot.txt`.
+
+Both `.txt` files are gitignored. `archive.py fetch` regenerates a `.txt`
+whenever the artifact's current SHA-256 differs from the value stamped in the
+adjacent `*.txt.sha256` sidecar (also gitignored), then re-stamps it. This
+catches every way the committed artifact and the local — gitignored, not
+`git pull`-ed — text could drift apart: a `refresh`, a `pdftotext` upgrade, a
+truncated file. The indexed text is thus always in sync with the embedded
+artifact.
+
+Once the archive page renders this text into `_site/archive/{slug}/index.html`:
+
+- **`embed.py`** walks `_site/**/*.html` *after* the Hakyll build. Archive pages
+  are ordinary HTML files in that tree, so they are embedded with **no logic
+  change to `embed.py`** (only the `EXCLUDE_URLS` config nudge described under
+  Similar-pages integration) — they automatically join both the page-level
+  similarity corpus (`similar-links.json`) and the paragraph-level semantic index (`semantic-index.bin` / `semantic-meta.json`).
+- **Pagefind** likewise indexes them automatically. Two filter tags on the
+  archive template — `type: archive` and the link-rot `status` — let
+  `search-filters.js` separate archive hits from native content and let a reader
+  see (or exclude) `rotted`-citation archive pages.
+
+The one requirement this imposes: the archived text **must** be in the rendered
+DOM, not only inside the PDF / sandboxed-snapshot iframe. `embed.py`'s
+`BeautifulSoup` pass and Pagefind both see DOM text only. Hence the "Full text"
+block (item 4 of the archive page) is non-optional.
+
+---
+
+## Backlinks integration — "Referenced by"
+
+The goal: an archived paper's page shows every site page that cites it.
+
+Today `Backlinks.hs` runs in two passes (see its module header). Pass 1
+(`version "links"`) extracts links per content file; `isPageLink` **drops every
+external URL**. Pass 2 inverts `target → [sources]`. The archive needs two
+surgical changes, both driven by `data/archive-index.json`:
+
+1. **Pass 1 — keep archived externals.** `isPageLink` is widened: an external
+   URL is *kept* if it matches an entry in `archive-index.json`. Non-archived
+   externals are still dropped, exactly as now.
+2. **Pass 2 — canonicalize to the archive URL.** When inverting, an archived
+   external URL is rewritten to its `/archive/{slug}/` key.
+
+`backlinksField` then works unchanged: the archive page looks up its own route
+and finds its citing pages. The archive template labels the section
+**"Referenced by"** rather than "Backlinks" — semantically truer for a
+third-party work — but the underlying field is the same.
+
+This is purely additive: the *visible* link in the essay still points at the
+original URL (reader expectation is preserved); only the backlink *relationship*
+is recorded against the archive page. Archive pages do not need to be added to
+`Patterns.allContent` — they only *receive* backlinks, and that needs a route,
+not a `version "links"` pass.
+
+**When `archive-index.json` is absent** — `.venv` not set up, or `archive.py`
+has never run — it is treated as empty: `Backlinks.hs` and `Filters/Archive.hs`
+silently no-op, and the build succeeds unchanged. For `Backlinks.hs` that means
+every external URL is dropped exactly as today, with no canonicalization and no
+error. This is a hard requirement, not a nicety: it preserves the established
+`.venv`-gated silent-skip convention so a contributor without the Python
+environment still gets a clean build.
+
+### URL matching — the alias problem
+
+A cited URL in the wild has many equivalent forms: `http://` vs `https://`,
+trailing slash or not, `?utm_source=…` query junk, arXiv `abs` ↔ `pdf` ↔
+versioned (`/abs/2403.12345`, `/abs/2403.12345v2`, `/pdf/2403.12345.pdf`). If
+the index is keyed only by the manifest's canonical URL, a citation to any
+variant misses, and **"Referenced by" silently under-counts** — a failure that
+breaks nothing visibly and is miserable to debug.
+
+So `archive.py` computes the equivalent-URL set per entry and stores it as
+`aliases` in `archive-index.json`. The normalization is deliberately narrow:
+
+- **Tracking parameters are stripped** — `utm_*`, `fbclid`, `gclid`, `mc_*`,
+  `ref`, `igshid`, `_hsenc`, `_hsmi`, `mkt_tok`.
+- **All other query parameters are preserved.** A `?v=…`, a `?id=…`, a Wayback
+  timestamp is load-bearing; blanket query stripping would alias
+  `…/article?id=42` to every other article on the host.
+- `http`/`https` are folded, trailing slashes normalized, and known arXiv
+  families (`abs` / `pdf` / versioned) expanded.
+
+`Backlinks.hs` matches an incoming link against any alias before keying it to
+the archive URL.
+
+### Granular backlinks (Phase 4 refinement)
+
+If a citation targets a fragment — `…/abs/2403.12345#section-4`, or a PDF page
+`…/document.pdf#page=7` — the fragment is preserved through pass 2 instead of
+being stripped by `normaliseUrl`. The archive page can then group "Referenced
+by" entries by which section/page they cite: *"Section 4 — referenced by [Essay
+A], [Essay B]."* This is the "indexed granularly, by section" behaviour, on the
+backlinks side.
+
+---
+
+## Similar-pages integration — "Related"
+
+This side is almost free. `embed.py` produces `data/similar-links.json` (page
+similarity) from every file in `_site/`. Once archive pages render with their
+full text (above), they are in the corpus:
+
+- An **essay's** "Related" block can surface an archived paper.
+- An **archive page's** "Related" block surfaces neighbouring archives and the
+  site content nearest to it.
+
+`SimilarLinks.hs` needs no change — `/archive/{slug}/` is just another URL key,
+and `similarLinksField` resolves it like any page. Two small `embed.py` config
+nudges: add `/archive/` to `EXCLUDE_URLS` (the index is a list page and would
+otherwise dominate neighbours), and let individual archive pages through.
+
+**Cost — a Phase 4 risk with a concrete trigger.** `embed.py` has a coarse
+whole-run staleness skip but no per-document incrementality: when it *does* run,
+it re-embeds the entire corpus. A serious archive (hundreds of entries, several
+MB of extracted text each for long papers) materially extends every run that
+executes. Phase 4 measures this and applies a fixed trigger: **once the archive
+passes 50 entries, or `embed.py`'s runtime exceeds 60 seconds, add to
+`embed.py` a per-document embedding cache** keyed by content hash. Below both
+thresholds, the full-corpus re-embed is left alone — caching any earlier would
+be premature optimization.
+
+### Granular similar-pages (deferred)
+
+`embed.py` *already* builds a **paragraph-level** index
+(`semantic-index.bin` + `semantic-meta.json`, keyed `{url, title, heading,
+excerpt}`). An archived HTML snapshot's preserved headings mean its sections get
+distinct paragraph vectors automatically — the data for section-granular
+"Related" exists the moment archive text is in the DOM. What does *not* yet
+exist is a UI that consumes it per-section, for *any* content type. A
+per-section "Related" block is deferred site-wide; the archive system *feeds*
+the granular index regardless. For PDFs, section structure is unreliable
+(`pdftotext` flattens it); per-*page* chunking is the realistic granularity —
+see Open questions.
+
+---
+
+## Link annotation in content
+
+When the author writes a link to a URL that is archived, the build appends a
+small archive affordance — a superscript "[A]" / "archived" marker next to the
+link — pointing at `/archive/{slug}/`. No per-link markup; entirely automatic.
+
+Implementation: a Pandoc filter, `Filters/Archive.hs`, registered in
+`Filters.hs`. For every `Link` whose URL matches `archive-index.json` (alias
+set included), it appends the affordance inline.
+
+**Filter ordering — pinned, then verified.** Per `/colophon`, the site's AST
+chain is `markdown → pandoc → citations → wikilinks → preprocessing → sidenotes
+→ smallcaps/dropcaps → links → images → math`. `Filters/Archive.hs` is pinned
+**immediately after `smallcaps/dropcaps` and immediately before `links`** — not
+merely "somewhere before `links`". The reason the narrower window matters:
+`smallcaps/dropcaps` rewrites the *text content* of nodes, so if `Archive.hs`
+decorated first, the `[A]` affordance could be swept into a smallcaps run or
+mistaken for an opening character by dropcap logic. Running it after
+`smallcaps/dropcaps` appends the affordance to already-styled text that nothing
+downstream re-touches; running it before `links` lets the link-decoration pass
+(and any future popup hooks) act on the already-annotated tree. This chain is
+transcribed from a published page; **Phase 3 confirms it against `Filters.hs`'s
+actual registration order** before the position is pinned in code — a doc and
+the implementation can drift.
+
+**Confirmed (2026-05-22).** `Filters.hs`'s `applyAll` applies, innermost
+first: `Images → SourceRefs → Code → Math → Dropcaps → Smallcaps → Links →
+Typography → Sidenotes → Aftermatter`. The `/colophon` narrative is a loose
+paraphrase — `Images` and `Math` run early, `Sidenotes` runs late — but
+`Smallcaps` and `Links` *are* adjacent, so `Filters.Archive` is pinned between
+them, exactly as specified above. (`/colophon` is prose, not authoritative for
+filter order, and was left unchanged.)
+
+**When `archive-index.json` is absent** — `.venv` not set up, or `archive.py`
+has never run — it is treated as empty: `Backlinks.hs` and `Filters/Archive.hs`
+silently no-op, and the build succeeds unchanged. For `Filters/Archive.hs` that
+means every `Link` passes through un-annotated, no error raised.
+
+**Bibliography — confirmed (2026-05-22): a separate context field.**
+`Citations.hs` runs `applyCitations` *before* the `applyAll` filter chain; it
+partitions the citeproc `refs` Div out of the document AST
+(`extractBibliography`) and renders it to an HTML string via `writeHtml5String`
+for the template's `$bibliography$` field. The body filter chain — and so
+`Filters.Archive` — never sees the bibliography. Prose links get affordances;
+bibliography reference links do not.
+
+Contrary to the draft's fear, this does **not** put the broken popup layer on
+the critical path. `Citations.hs` already performs AST surgery on each bibliography
+entry (`enhanceEntry` — it wraps `file:` PDF links and appends keyword strips),
+so the realistic annotation hook is `enhanceEntry`, reusing `Filters.Archive`'s
+index lookup — no popup dependency. That is **deferred to a Phase 3 follow-up**:
+it first needs a check that `chicago-notes.csl` renders a cited work's
+`url`/`doi` as a `Link` node (a CSL style that omits URLs would leave nothing
+to match). Phase 3 ships prose-link annotation; bibliography annotation is
+documented as in-scope and hookable via `enhanceEntry`, pending that check. A
+future popup rewrite may *also* consult `archive-index.json`, but the archive
+system depends on neither the current nor a future popup implementation.
+
+---
+
+## Link-rot detection & maintenance (Phase 5)
+
+`tools/archive.py check` issues a `HEAD` (falling back to a ranged `GET`) to
+every original URL in the manifest and updates `data/archive-state.json`.
+
+**Hysteresis is asymmetric.** Rotting is slow; recovery is fast.
+
+- *Rotting.* A failed probe increments `consecutive-failures` and sets
+  `status: error`. Only after **3 consecutive failed scans spanning ≥ 14 days**
+  does the status become `rotted`. A single transient failure — a Cloudflare
+  challenge, a temporary 5xx, a DNS hiccup — therefore never flips a live
+  citation.
+- *Recovery.* A **single** successful probe resets `consecutive-failures` to 0
+  and returns the status straight to `live`, from `error` or `rotted` alike.
+  There is no cost to un-rotting eagerly — if the original is reachable again,
+  the reader should go there — so recovery needs no hysteresis.
+
+| `status` | Meaning | Rendering effect |
+|----------|---------|------------------|
+| `live` | Original reachable, unchanged | Normal: link to original, archive as backup |
+| `moved` | 3xx to a new location | Banner notes the move; new URL recorded |
+| `rotted` | Failed the hysteresis threshold (3 fails / ≥14 days) | Build flips the *primary* link to the archive copy; original shown struck-through as "(dead link)" |
+| `error` | Transient / inconclusive — below the hysteresis threshold | No rendering change; retried next scan |
+
+`paywalled` is deliberately **absent** from this table: a soft paywall returns
+`200`, so an automated `HEAD`/`GET` cannot reliably detect it. Paywall status is
+the manual `paywalled: true` manifest flag instead, and it drives only a banner
+note — never a link flip.
+
+The flip on `rotted` is the actual link-rot *cure*: a reader of a 2019 essay
+clicks through to a working local snapshot instead of a 404, with no manual
+intervention — and only after the rot is confirmed, not guessed.
+
+`check` is a slow network job, not something every `make build` should pay for.
+It runs on its own cadence — a periodic local `make archive-check`, or a
+scheduled remote agent. It is decoupled from the main build: the build consumes
+whatever `archive-state.json` exists.
+
+---
+
+## Build-pipeline integration
+
+New steps slot into the `Makefile` `build` target, gated on `.venv` (silent
+skip), consistent with `embed.py` and the photography extractors:
+
+```
+make build:
+  git auto-commit content/                       (existing — archive/ NOT swept in)
+  tools/convert-images.sh                         (existing)
+  pdf-thumbs                                      (existing)
+  download-pdfjs.sh / download-leaflet.sh         (existing)
+  → tools/archive.py fetch                        (NEW — fetch missing artifacts,
+                                                          extract text, write
+                                                          PROVENANCE.json +
+                                                          archive-index.json)
+  extract-exif / palette / dimensions             (existing)
+  cabal run site -- build                         (existing — now also routes archive/)
+  pagefind --site _site                           (existing — now also indexes archive pages)
+  tools/embed.py                                  (existing — now also embeds archive pages)
+  stamp-build-time.py / compress-assets.sh        (existing)
+```
+
+`tools/archive.py fetch` runs **before** `cabal run site -- build` so the
+artifacts, `PROVENANCE.json` files, and `archive-index.json` all exist when
+Hakyll routes the `archive/` tree and when `Backlinks.hs` loads the index.
+`fetch` is incremental — a normal build with no new manifest entries does no
+network I/O — but it still rewrites `archive-index.json` every run. Wayback
+submission is **not** in this path. The `monolith` binary is committed
+(`tools/bin/monolith`), so there is no download step.
+
+**`make build` never deletes anything under `archive/`.** Artifact removal is
+exclusively the job of the opt-in `make archive-gc` (see Eviction).
+
+Standalone targets, none a dependency of `build`:
+
+- `make archive-check` — link-rot scan.
+- `make archive-wayback` — backfill outstanding Wayback captures.
+- `make archive-suggest` — print the "cited but not archived" diff against
+  `data/*.bib` (DOI-only entries resolved; `removed.yaml` entries excluded).
+- `make archive-gc` — delete `archive/{slug}/` directories whose slug is
+  recorded in `removed.yaml`; report (never delete) orphans that are not.
+
+---
+
+## Build module structure
+
+New Haskell module:
+
+- **`build/Archive.hs`** — patterns, routing rules, and contexts for the
+  archive. Generates `/archive/` and every `/archive/{slug}/` page from
+  `archive/manifest.yaml` + `PROVENANCE.json` + `data/archive-state.json`;
+  routes the raw artifacts through unchanged. Pages and routed artifacts come
+  only from current `manifest.yaml` entries, so an orphaned `archive/{slug}/`
+  directory is inert (no page, not deployed). Integrity (SHA-256) verification
+  is `tools/archive.py`'s job — it runs first and halts the build on a
+  mismatch; `Archive.hs` trusts a present (provenance, artifact) pair and skips
+  any entry lacking either. Separated from `Site.hs` for the same reason
+  `Catalog.hs`, `Authors.hs`, and `Photography.hs` are — scoped concerns,
+  isolated reasoning.
+
+New Pandoc filter:
+
+- **`build/Filters/Archive.hs`** — the link-annotation filter; registered in
+  `Filters.hs` immediately after `smallcaps/dropcaps`, before the `links` pass.
+  No-op when `archive-index.json` is absent.
+
+Edits to existing modules:
+
+- **`build/Patterns.hs`** — add `archivePattern` (artifact files) and
+  `archiveManifest`. Add archive entries to `tagIndexable` so tagged archives
+  reach the tag indexes. (Deliberately *not* added to `allContent`: archive
+  pages receive backlinks but are not crawled for outbound links in v1.)
+- **`build/Backlinks.hs`** — load `data/archive-index.json` (silent no-op if
+  absent); widen `isPageLink` to keep archived externals; match incoming links
+  against the alias set; canonicalize them to `/archive/{slug}/` in pass 2.
+- **`build/Site.hs`** — wire the archive rules from `Archive.hs`; add the
+  `/archive/` link to the footer / `colophon` routing.
+- **`build/Stats.hs`** — contribute archive metrics to the `/build/` telemetry
+  page: count; total bytes; median artifact age; counts by `snapshot-quality`,
+  `status`, and `visibility`; `paywalled` count; and any orphan slugs
+  (directories not in `manifest.yaml` and not in `removed.yaml` — they should
+  not exist, so surface them where drift is visible).
+- **`templates/partials/head.html`** — add a `noindex` context hook and a
+  `$if(archive)$` link to `static/css/archive.css` (the archive pages'
+  stylesheet — banner, provenance panel, artifact viewer, index list;
+  scoped under `#markdownBody` to clear the prose rules in `typography.css`).
+
+---
+
+## Templates
+
+New files under `templates/`:
+
+| File | Role |
+|------|------|
+| `archive-index.html` | `/archive/` — the full text list, type/tag/status filters; includes `archive-removal-notice` |
+| `archive.html` | `/archive/{slug}/` — banner, metadata, embedded artifact, full text, Referenced-by, Related; includes `archive-removal-notice` |
+
+New partials:
+
+| File | Role |
+|------|------|
+| `partials/archive-banner.html` | The "archived copy / view original" strip — reused by `archive.html` and any inline archive embed |
+| `partials/archive-card.html` | Archive-entry card (text-only; no thumbnail in v1) for the index and for `/library.html` |
+| `partials/archive-removal-notice.html` | The removal-request line (`ln@levineuwirth.org`); included directly by `archive.html` and `archive-index.html` |
+
+Existing partials reused unchanged: `nav.html`, `head.html` (with the new
+`noindex` flag), `footer.html`, `page-footer.html`. The removal notice is a
+*new* partial precisely so `page-footer.html` stays untouched.
+
+---
+
+## Storage, repo size & `.gitignore`
+
+Committed: the artifacts (`document.pdf`, `snapshot.html`), `PROVENANCE.json`,
+`manifest.yaml`, `removed.yaml`, and the pinned `monolith` binary
+(`tools/bin/monolith`). Gitignored: everything regenerable.
+
+Append to `.gitignore`:
+
+```
+# Archive: generated text + its staleness stamp (recreated from the committed
+# artifact on every build — deterministic, so committing them is churn).
+archive/**/*.txt
+archive/**/*.txt.sha256
+
+# Archive: generated state (written by tools/archive.py).
+# NOTE: archive/**/PROVENANCE.json is deliberately NOT ignored — it is the
+# committed, immutable record of each archival event.
+data/archive-state.json
+data/archive-index.json
+```
+
+**Repo-size policy.** Archived artifacts are immutable once taken, so they add
+no *history* bloat — but the working tree grows. v1 commits them: a preservation
+guarantee that depends on an un-versioned side store is a weaker guarantee, and
+`git clone` → `make build` must reproduce the whole site.
+
+- **Per-artifact cap: 25 MB.** `archive.py fetch` warns and skips above it; a
+  deliberately-oversize artifact is committed with `git add -f`. This stops a
+  200 MB scan from being swept silently into a commit.
+- **Migration tripwire.** If `archive/` exceeds **~5 GB**, or **doubles
+  year-over-year**, evaluate moving the artifact store out of the main repo —
+  to a separate `archive` repository or a content-addressed store the VPS
+  rsyncs independently. `tools/archive.py` reads the store root from a single
+  config value, so the move is a config change, not a redesign.
+- **Never git LFS.** LFS smudges the property that makes this system worth
+  having: with LFS, `git clone` no longer yields the artifacts unless the LFS
+  server is up and authenticated. For a system whose value proposition is "this
+  survives," that is a regression. If migration is needed, the destination is a
+  separate repo or object store — not LFS in this one.
+
+---
+
+## Legal, ethical & SEO posture
+
+Archiving third-party content touches copyright. The design's guardrails:
+
+- **`noindex` on every archive page.** The archive preserves; it does not
+  republish to search engines or compete with originals for ranking.
+- **The original is the hero.** Every archive page links prominently to the
+  source and is explicitly framed as a dated archived copy.
+- **A real removal channel, everywhere.** A request to `ln@levineuwirth.org`
+  gets the entry removed (see Eviction). The channel is advertised on
+  `/archive/`, on **every individual archive page**, and inside the fetcher's
+  User-Agent string. This is the load-bearing ethical commitment; `robots.txt`
+  is only a proxy for it.
+- **`noarchive` honoured.** Both `X-Robots-Tag: noarchive` (HTTP header) and
+  `<meta name="robots" content="noarchive">` (HTML body) abort a fetch.
+- **Authenticated content skipped.** The fetcher sends no credentials. Anything
+  behind a login is not archived.
+- **`visibility: private`** keeps a snapshot in-repo for the author's own
+  reference without deploying the artifact to `_site/` — the appropriate
+  setting for licensed material the author may read but should not redistribute.
+  The archive *page* still exists (metadata + "held offline"), so link-rot
+  tracking and the Wayback link survive.
+- **Curated, not crawled.** The archive only ever contains works this site
+  deliberately references — a fundamentally different posture from a scraper.
+- **Attribution preserved.** Author, source title, source date, and original
+  URL are surfaced on every archive page.
+
+This is a personal-scale citation archive, consistent with long-standing
+practice on research-oriented personal sites. It is not a content platform.
+
+---
+
+## Phased implementation
+
+Each phase has explicit exit criteria. Do not start a phase until the previous
+one passes.
+
+### Phase 1 — Skeleton, PDF only
+
+Bootstrap entry: **NIST FIPS 203 (ML-KEM)**, PDF at
+`https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.203.pdf` — a stable, auth-free
+PDF already cited in `data/simd-paper.bib`, so the test entry keeps its value
+after Phase 1 ships.
+
+- [x] Define `archive/manifest.yaml` and `archive/removed.yaml` schemas; create
+      `manifest.yaml` with the bootstrap entry
+- [x] `tools/archive.py fetch` — PDF download, size cap, `pdftotext`,
+      `.txt.sha256` staleness stamp, write per-entry `PROVENANCE.json`; always
+      rewrite `archive-index.json`; refuse a `(slug, url)` mismatch, and
+      re-hash every committed artifact (non-zero exit on a SHA mismatch)
+- [x] `build/Archive.hs` — routing for `/archive/`, `/archive/{slug}/`, and the
+      raw `document.pdf`; orphaned directories produce no page (a pass-1
+      refinement subsequently added a Haskell-side SHA-256 re-hash via
+      `sha256sum`, so the integrity guarantee holds even when `archive.py`
+      did not run first — direct `cabal` invocations, deploy hosts without
+      `.venv`, etc.)
+- [x] `templates/archive.html`, `templates/archive-index.html`,
+      `partials/archive-banner.html`, `partials/archive-removal-notice.html`
+- [x] PDF artifact embedded on the page (Phase 2 changed this to a raw,
+      browser-native `<iframe>` embed — see the Display — PDF decision)
+- [x] Extracted text rendered into the page DOM (collapsed `<details>`)
+- [x] `noindex` hook in `head.html`; set on archive pages
+- [x] **Eviction works** end-to-end — `make archive-gc`, `removed.yaml` gating,
+      orphan reporting (see Eviction & removal)
+- [x] Wire `tools/archive.py fetch` into the Makefile, `.venv`-gated
+- [x] `.gitignore` additions (`PROVENANCE.json` explicitly *not* ignored)
+
+**Exit criteria:** the FIPS 203 PDF renders at `/archive/{slug}/` with banner,
+metadata, working PDF.js embed, visible extracted text, and a removal-request
+notice; `/archive/` lists it; both carry `noindex`. The eviction procedure
+(record in `removed.yaml` → drop the manifest line → `make archive-gc`) removes
+the artifact; a manifest line deleted *without* a `removed.yaml` entry leaves
+the artifact intact and emits a warning. **Running `make build` ten times in
+succession with no manifest edits produces no changes under `archive/`** — no
+deletions, no `PROVENANCE.json` rewrites, no artifact replacements.
+
+**Met (2026-05-22).** FIPS 203 fetched (1.25 MB, 3601 lines of extracted
+text); `/archive/nist-fips-203/` renders with banner, metadata, PDF.js iframe,
+in-DOM full text, and removal notice; `/archive/` lists it; both carry
+`noindex`. `gc` was verified on both paths — an orphan directory is reported
+and left intact (exit 1); a `removed.yaml`-listed directory is deleted while
+the manifest entry is untouched. `archive/` is byte-identical across repeated
+fetch + build cycles. The PDF.js iframe is correctly wired; rendering the
+viewer needs `static/pdfjs/`, which `make build` vendors via `download-pdfjs.sh`.
+
+### Phase 2 — HTML snapshots
+
+Bootstrap entry: **`https://cr.yp.to/aes-speed.html`** (`slug: djb-aes-speed`)
+— Bernstein's cache-timing-attacks page, cited in `data/simd-paper.bib`. A
+stable, JavaScript-free static page, so its snapshot is reproducible and
+classifies cleanly as `ok`; like FIPS 203 it keeps its value after the phase
+ships.
+
+- [x] Commit the pinned `monolith` binary at `tools/bin/monolith`; record
+      version + sha256 in `tools/monolith-version.txt`
+- [x] `tools/archive.py fetch` — HTML branch: `monolith --no-js`, CSP `<meta>`
+      injection (`style-src` + `-elem` + `-attr`, `script-src`/`object-src`/
+      `frame-src 'none'`), text extraction via `BeautifulSoup`, type detection
+- [x] `snapshot-quality` classification (`ok` / `degraded` / `js-required`)
+      written to `PROVENANCE.json`; degraded captures flagged on `/archive/`
+- [x] Sandboxed `<iframe>` rendering (`referrerpolicy="no-referrer"`, no
+      `allow-scripts`) in `archive.html`
+
+**Exit criteria:** an HTML URL snapshots to a self-contained file with a CSP
+`<meta>`, renders in a sandboxed no-referrer iframe with the original's styling
+isolated, and shows extracted readable text in site chrome; the sandboxed
+snapshot renders correctly under the CSP in **both Firefox and a Chromium-based
+browser**; capture quality is classified and a `degraded` snapshot is visibly
+flagged; the author has reviewed the rendered snapshot before committing it.
+
+**Met (2026-05-22).** `monolith` 2.10.1 (`monolith-gnu-linux-x86_64`) is
+vendored at `tools/bin/monolith` with its version + sha256 in
+`tools/monolith-version.txt`; `archive.py fetch` locates it via `$MONOLITH_BIN`
+→ `tools/bin/monolith` → `$PATH`, and warns-and-skips (build continues) when it
+is absent. `cr.yp.to/aes-speed.html` snapshots to a 26 KB self-contained
+`snapshot.html` with the archive CSP `<meta>` as the first `<head>` child;
+`/archive/djb-aes-speed/` renders it in a `sandbox`ed, `no-referrer` iframe with
+291 lines of extracted prose shown inline as `<p>` paragraphs; `snapshot-quality`
+classifies `ok`, and a (synthetically forced) `degraded` entry shows the warning
+note on the page and a flag on `/archive/`. `fetch` is idempotent — `archive/`
+is byte-identical across re-runs. The committed artifact is `snapshot.html`;
+`snapshot.txt` + `.sha256` are gitignored (the existing `archive/**/*.txt`
+globs already cover them).
+
+**Author-gated, by design (exit-criteria wording).** Two criteria are not
+machine-checkable here and remain the author's: (1) the cross-browser CSP
+render in Firefox *and* a Chromium browser; (2) the per-snapshot review before
+committing `archive/`. The vendored `monolith` binary and the FIPS 203 / djb
+artifacts are staged but **not committed** — committing `archive/` and
+`tools/bin/monolith` is the deliberate author act the design specifies.
+
+One real-world note from the bootstrap: `cr.yp.to` ships
+`<meta name="robots" content="none">`. Per spec `none` ≡ `noindex, nofollow` —
+it is *not* `noarchive`, so the snapshot proceeded correctly; only an explicit
+`noarchive` (header or meta) aborts a fetch.
+
+### Phase 3 — Link annotation & Wayback
+
+- [x] **Confirm `Filters.hs`'s actual filter registration order** matches the
+      AST chain documented on `/colophon` before pinning the filter's position
+- [x] **Confirm** whether the bibliography is rendered into the document AST or
+      a separate context field — this decides whether bibliography annotation
+      is in scope here or gated on the popup rewrite (see Link annotation)
+- [x] `build/Filters/Archive.hs` — annotate body links to archived URLs;
+      register in `Filters.hs` after `smallcaps/dropcaps`, before `links`;
+      no-op when `archive-index.json` is absent
+- [x] `archive.py wayback` + `make archive-wayback` — non-blocking submission,
+      backfill `wayback` into `PROVENANCE.json`
+- [x] `visibility: private` handling (artifact not routed to `_site/`)
+
+**Exit criteria:** a prose link to an archived URL gets an automatic archive
+affordance; a build without `.venv` (no `archive-index.json`) still succeeds
+with links un-annotated; every entry has a recorded Wayback URL after `make
+archive-wayback`; a `private` entry's page renders without deploying its
+artifact; the bibliography-annotation path is documented as either in-scope or
+popup-gated.
+
+**Met (2026-05-22).** `build/Filters/Archive.hs` walks body `Link` nodes and,
+for any URL in `data/archive-index.json` (canonical + alias set, fragment- and
+trailing-slash-tolerant), appends a superscript `archive-affordance` link to
+`/archive/<slug>/` — emitted as `RawInline` HTML so the downstream `Links`
+pass leaves it alone. It is registered in `Filters.applyAll` between
+`Smallcaps` and `Links`; the index loads once via an `unsafePerformIO` CAF and
+an absent/empty index makes the filter the identity (verified: a prose link to
+the archived `cr.yp.to/aes-speed.html` gains the affordance, a non-archived
+link does not). `archive.py wayback` (+ `make archive-wayback`) submits each
+entry lacking a `wayback` capture to the Wayback Machine and backfills
+`PROVENANCE.json`; it always exits 0 and is never on a build's critical path.
+`visibility: private` is a `manifest.yaml` field: a private entry's artifact is
+never routed to `_site/` (artifacts are routed by an explicit public-only list,
+which also stops an orphan directory's artifact deploying), and its page
+renders provenance + a "held offline" panel with no embed and no extracted text
+(verified: a private `_site/archive/<slug>/` contains only `index.html`).
+
+Two items are deliberately scoped out of this pass, both documented above:
+**bibliography annotation** (the bibliography is a separate `$bibliography$`
+field; the hook is `Citations.hs`'s `enhanceEntry`, pending a CSL-URL check —
+not popup-gated) and **pull-from-Wayback when the original is dead at fetch
+time** (it belongs with Phase 5 link-rot detection, where a dead URL is the
+central case and a Wayback-sourced artifact's provenance can be handled
+properly). The live `make archive-wayback` run is author-initiated — it submits
+public captures to a third-party service.
+
+### Phase 4 — Backlinks & similar-pages indexing
+
+- [x] `Backlinks.hs` — load `archive-index.json` (silent no-op if absent);
+      widen `isPageLink`; match the alias set; canonicalize archived externals
+      to `/archive/{slug}/` in pass 2
+- [x] "Referenced by" section on `archive.html`
+- [x] `embed.py` — add `/archive/` to `EXCLUDE_URLS`; verify archive pages join
+      `similar-links.json` and the paragraph index
+- [x] **Measure `embed.py` runtime** against a populated archive; add a
+      per-document embedding cache (keyed by content hash) once the archive
+      passes 50 entries or `embed.py` exceeds 60 s
+- [x] "Related" section on `archive.html`
+- [x] Fragment-preserving backlinks → grouped "Referenced by" by section/page
+
+**Exit criteria:** an archive page lists the essays that cite it under
+"Referenced by", including citations that used an alias URL form; essays surface
+relevant archived works under "Related"; a fragment-targeted citation appears
+grouped under its section; `embed.py` runtime with the archive populated is
+measured and either under the thresholds or the cache is in place.
+
+**Met (2026-05-22).** A shared `build/ArchiveIndex.hs` loads
+`data/archive-index.json` once (the `unsafePerformIO` CAF formerly private to
+`Filters.Archive`); `Backlinks.hs` and `Filters.Archive` both consume it.
+`Backlinks.isPageLink` keeps an archived external URL regardless of scheme or
+extension; pass 2 (`targetKey`) canonicalises it to the archived work's
+`/archive/<slug>/` page key — computed as the same string fed through
+`normaliseUrl` that `backlinksField` uses for the page's own route, so the two
+always agree. `archiveEntryCtx` gains `referencedByField` and
+`similarLinksField`; `archive.html` renders `$if(referenced-by)$` /
+`$if(similar-links)$` sections. `referencedByField` reuses the backlinks lookup
+but groups sources by the fragment each citation targets — a `#page=12`
+citation renders under a "Page 12" subheading, a bare citation in a flat list
+above. `embed.py` excludes the `/archive/` index from the corpus (individual
+entry pages stay in) and is measured at **~12 s** for the whole site (43 → 25
+pages, 802 paragraphs) — far under the 60 s threshold and the 50-entry trigger,
+so the per-document embedding cache is correctly *not* built (premature at this
+scale; revisit at the threshold).
+
+Verified end-to-end with a temporary citation in `content/about.md`: the
+FIPS 203 page listed it under "Referenced by" with a flat entry *and* a grouped
+"Page 12" entry; both archive pages surfaced the SIMD/PQC essay and each other
+under "Related"; the `/archive/` index was absent from `similar-links.json`.
+
+One pre-existing `embed.py` issue was surfaced and fixed: the `/source/`
+repository code mirror was in the similarity corpus — a template file was
+surfacing as a neighbour, titled with its unrendered `$title$` placeholder. An
+`EXCLUDE_PREFIXES` rule now keeps `/source/` out, which also dropped 18 junk
+pages from the site-wide corpus (43 → 25).
+
+### Phase 5 — Link-rot detection & maintenance
+
+**Prerequisite — resolved 2026-05-22.** `/build/` had been serving a stale
+cached page: its build-varying telemetry is gathered in `unsafeCompiler`, which
+Hakyll does not dependency-track, so the page recompiled only when tracked
+*content* changed. Fixed — `build/Main.hs` writes a per-build
+`data/build-stamp.txt` that `Stats.hs` loads as a dependency, forcing `/build/`
+and `/stats/` to recompile every build. The archive-metrics exit criterion
+below is now measurable.
+
+- [x] `tools/archive.py check` + `make archive-check` — HEAD/GET scan
+- [x] Asymmetric hysteresis: `rotted` requires 3 consecutive failed scans over
+      ≥ 14 days; a single success → `live`; `consecutive-failures` +
+      `status-since` tracked in `archive-state.json`
+- [x] Dead-link rendering: flip primary link to the archive on `rotted`
+- [x] Pagefind `status` filter tag on archive pages (`search-filters.js` UI control deferred)
+- [x] Archive metrics on `/build/` telemetry (`Stats.hs`)
+- [x] `/archive/` index shows per-entry health
+
+Test endpoint: reserve a controlled host — e.g. `archive-test.levineuwirth.org`,
+a sub-host the author owns — that can be toggled to return 404 on demand, so the
+rot-detection test flips without depending on a third party's uptime.
+
+**Exit criteria:** the controlled test URL is detected as `rotted` only after
+the hysteresis threshold is met, and the citing essay's link then flips to the
+archived copy; a single transient failure does *not* flip it; restoring the URL
+returns it to `live` on the next successful scan; the `/build/` page reports
+archive coverage and health; search results can be filtered by archive `status`.
+
+**Met (2026-05-22).** `tools/archive.py check` HEAD/GET-probes every manifest
+URL (HEAD first, ranged GET on 403/405/501) and updates the gitignored
+`data/archive-state.json`, which mirrors the manifest exactly (state for
+dropped URLs is discarded). The asymmetric hysteresis in `next_state` is
+unit-verified against synthetic scenarios — fail/fail/fail across 20 days flips
+to `rotted`; three fast fails within 2 days stay at `error`; a single `ok` from
+any non-live status recovers immediately to `live`. `ArchiveIndex.hs` exposes
+the parsed status to consumers as `archiveStatusForSlug`. `Filters.Archive`
+flips a `rotted` body link's href to `/archive/<slug>/` (adding an
+`archive-rotted` class and a solid "archived" affordance marker) — verified
+end-to-end with a hand-crafted `rotted` state file: a content link to the
+djb URL was rewritten to the archive page; reverting the state restored the
+original link. `archive.html` carries `data-pagefind-filter="type:archive,
+status:$status$"`, a "Link status" row in the provenance panel, and a
+status-note callout in the header for non-live states. The `/archive/` index
+flags rotted entries with a solid "link rotted" chip. `Stats.hs` `/build/`
+gains a "Link archive" section (count, total size, median age, by-status /
+by-quality / by-visibility breakdowns, paywalled count, orphan directories) —
+verified showing the test state's `error 1  ·  rotted 1` mix.
+
+**Rendering staleness — by design.** Rot status is consumed at build time via
+`unsafePerformIO` CAFs; archive entry pages and content pages don't have a
+Hakyll dependency edge to `archive-state.json` (that would only fix half the
+problem — the archive pages — while leaving content-link flips stale, since
+`Filters.Archive` runs during content compilation and can't cheaply force
+every content page to depend on the state). So after `make archive-check`,
+an *incremental* build can leave both surfaces uniformly stale until a clean
+build refreshes everything. `make deploy` always does `make clean`, which
+makes the deployed site consistent. The `/build/` page is the one
+always-fresh surface: it recompiles every build via the existing build-stamp
+dependency, so its archive metrics always reflect the current scan.
+
+**Test endpoint deferred.** Spinning up `archive-test.levineuwirth.org` and
+running it through a 14-day-spanning fail streak is a multi-week real-world
+verification the author runs (or a CI cron); the hysteresis logic itself is
+unit-tested deterministically in `next_state`, and the rendering side is
+verified by the hand-crafted `rotted` state file.
+
+**Search-UI filter (`search-filters.js`) — partial.** The data-side is in
+place: every archive page carries `data-pagefind-filter="type:archive,
+status:$status$"`, so Pagefind's filter index now distinguishes archive hits
+by rot status and (when `pagefind-ui` is configured to show filters) lists
+them as a filterable facet. The remaining work — wiring a custom UI control
+into `search-filters.js` — is a deliberate refinement, not done in Phase 5:
+its existing `status` filter is reserved for *epistemic* status (working
+model / drafting / etc.) sourced from `data/epistemic-meta.json`, so adding an
+archive `status` dimension needs a distinct name to avoid the collision, plus
+new filter-panel buttons. Search UX is best iterated with the live page in
+front of the author.
+
+---
+
+## Open / deferred questions
+
+Non-blocking, and now a short list — the draft's larger set was resolved into
+Decisions during review.
+
+- **JS-heavy / SPA pages.** `monolith` cannot execute JavaScript;
+  `js-required` captures are degraded. A headless-browser fallback (SingleFile,
+  Chromium capture) would handle them but adds a heavyweight dependency. Defer
+  until a real entry needs it.
+- **First-viewport thumbnails.** Dropped for v1 — `/archive/` is a text list. A
+  visual grid does not earn its keep at small N; revisit past ~50 entries.
+- **PDF section-granularity.** `pdftotext` flattens structure. Per-*page*
+  chunking (`#page=N` anchors, per-page text) is the realistic granularity for
+  PDF backlinks and semantic indexing. Defer.
+- **Per-section "Related" UI.** The paragraph-level semantic index already
+  receives archive text; a UI surfacing section-level "Related" does not exist
+  for *any* content type yet. Out of scope here; a site-wide feature.
+- **Snapshot versioning.** A v1 snapshot is immutable once taken; `refresh`
+  replaces it in place but records `previous-sha256`. If a referenced work is
+  meaningfully revised, should a new dated snapshot be kept *alongside* the old
+  (`document-2027-01-01.pdf`) with a version switcher? `previous-sha256` is the
+  seed — extend it to a list and the switcher reads it. Defer until needed.
+- **Intra-archive link rewriting.** When archived page A links to a URL that is
+  *also* archived, A's snapshot could be rewritten to point at the local copy
+  of B — keeping the reader inside the preserved set. Gwern-style; defer.
+- **Media beyond PDF/HTML.** EPUB, plain images, video. Out of scope for v1;
+  `type` is an open enum so it can extend.
+
+---
+
+## References
+
+- `WRITING.md` — authoring conventions; the link-annotation feature will be
+  documented there once Phase 3 lands
+- `PHOTOGRAPHY.md` — the closest precedent: authored-input/generated-sidecar
+  split, phased build, `.venv`-gated tools, vendored binaries
+- `build/Backlinks.hs` — two-pass backlinks; `isPageLink` is the integration
+  point
+- `build/SimilarLinks.hs` — "Related" block; consumes `embed.py` output
+- `tools/embed.py` — embedding pipeline; archive pages join its corpus for free
+- `build/Patterns.hs` — canonical content patterns
+- `build/Tags.hs` — slash-hierarchy tags (reused for archive tags)
+- `tools/download-leaflet.sh`, `tools/download-pdfjs.sh` — the sha256-pinning
+  convention; `monolith` is committed directly rather than downloaded (a
+  build-time executable, not a servable asset)
+- `nginx/popup-proxy.conf` — the metadata proxy; related but distinct (caches
+  previews, does not preserve documents)
+```
+</content>
diff --git a/Makefile b/Makefile
index 05f6912..cb4ed68 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: build deploy sign download-model download-pdfjs download-leaflet compress-assets convert-images pdf-thumbs pdfs watch clean dev
+.PHONY: build deploy sign download-model download-pdfjs download-leaflet compress-assets convert-images pdf-thumbs pdfs watch clean dev archive-gc archive-wayback archive-check
 
 # Source .env for deploy / GitHub config if it exists.
 # .env format: KEY=value (one per line, no `export` prefix, no quotes needed).
@@ -43,6 +43,16 @@ build:
 	else \
 	  echo "Photography sidecars skipped: run 'uv sync' to enable EXIF + palette + dimension extraction (build continues with frontmatter only)"; \
 	fi
+	# Archive pipeline (Phase 1): fetch any manifest URL without a local
+	# artifact, extract text, write archive/<slug>/PROVENANCE.json and
+	# data/archive-index.json. Gated on .venv, same as embed.py. A SHA or
+	# slug-URL integrity error exits non-zero and halts the build; a
+	# transient network failure is non-fatal (the entry retries next build).
+	@if [ -d .venv ]; then \
+	  uv run python tools/archive.py fetch; \
+	else \
+	  echo "Archive fetch skipped: run 'uv sync' to enable link archiving (build continues)"; \
+	fi
 	cabal run site -- build
 	pagefind --site _site
 	@if [ -d .venv ]; then \
@@ -153,6 +163,38 @@ watch:
 clean:
 	cabal run site -- clean
 
+# Evict archived works: delete archive/<slug>/ directories whose slug is
+# recorded in archive/removed.yaml. Opt-in — NEVER run by `make build`.
+# Orphan directories (not in manifest.yaml, not in removed.yaml) are
+# reported, never deleted. See ARCHIVE.md - Eviction & removal.
+archive-gc:
+	@if [ -d .venv ]; then \
+	  uv run python tools/archive.py gc; \
+	else \
+	  python3 tools/archive.py gc; \
+	fi
+
+# Submit archived URLs to the Wayback Machine and backfill the capture URL
+# into each PROVENANCE.json. A slow network job — opt-in, never run by
+# `make build`. Always exits 0; an entry without a capture retries next run.
+archive-wayback:
+	@if [ -d .venv ]; then \
+	  uv run python tools/archive.py wayback; \
+	else \
+	  python3 tools/archive.py wayback; \
+	fi
+
+# Probe every archived URL for link rot, updating data/archive-state.json.
+# A slow network job — opt-in, never run by `make build`. Asymmetric
+# hysteresis: `rotted` needs 3 consecutive failures over >=14 days; a
+# single success recovers immediately. The next build consumes the state.
+archive-check:
+	@if [ -d .venv ]; then \
+	  uv run python tools/archive.py check; \
+	else \
+	  python3 tools/archive.py check; \
+	fi
+
 # Dev build includes any in-progress drafts under content/drafts/essays/.
 # SITE_ENV=dev is read by build/Site.hs; drafts are otherwise invisible to
 # every build (make build / make deploy / cabal run site -- build directly).
diff --git a/archive/djb-aes-speed/PROVENANCE.json b/archive/djb-aes-speed/PROVENANCE.json
new file mode 100644
index 0000000..5811428
--- /dev/null
+++ b/archive/djb-aes-speed/PROVENANCE.json
@@ -0,0 +1,14 @@
+{
+  "url": "https://cr.yp.to/aes-speed.html",
+  "slug": "djb-aes-speed",
+  "title": "Cache-timing attacks on AES (cr.yp.to)",
+  "type": "html",
+  "artifact": "snapshot.html",
+  "sha256": "8da2d5aedeccf9f602e1680631aa77308683803c0cc9b04caad52c7a70c60832",
+  "previous-sha256": "0a50bf6d64b2ec08771d83be5ef47721ecbfc431e3512ff55978e76f452dbd3f",
+  "bytes": 26186,
+  "archived": "2026-05-23",
+  "source-date": null,
+  "snapshot-quality": "ok",
+  "wayback": null
+}
diff --git a/archive/djb-aes-speed/snapshot.html b/archive/djb-aes-speed/snapshot.html
new file mode 100644
index 0000000..04d2d3b
--- /dev/null
+++ b/archive/djb-aes-speed/snapshot.html
@@ -0,0 +1,470 @@
+<!-- Saved from https://cr.yp.to/aes-speed.html at 2026-05-23T13:04:33Z using monolith v2.10.1 -->
+<html><head><meta content="default-src 'none'; img-src data:; style-src 'unsafe-inline'; style-src-elem 'unsafe-inline'; style-src-attr 'unsafe-inline'; font-src data:; script-src 'none'; object-src 'none'; frame-src 'none'" http-equiv="Content-Security-Policy"/><meta content="noindex, noarchive" name="robots"/><link href="data:text/html;base64,PGh0bWw+PGJvZHk+ZmlsZSBkb2VzIG5vdCBleGlzdDwvYm9keT48L2h0bWw+DQo=" rel="icon"/></head><body>
+<title>AES speed</title>
+<meta content="aes" name="keywords"/>
+<a href="https://cr.yp.to/djb.html">D. J. Bernstein</a>
+<br/><a href="https://cr.yp.to/hash.html">Hash functions and ciphers</a>
+<h1>AES speed</h1>
+<b>Update:</b>
+Peter Schwabe and I now have a paper on this topic:
+<ul>
+<li>
+<a name="aesspeed-paper">[aesspeed]</a>
+15pp.
+<a href="https://cr.yp.to/aes-speed/aesspeed-20080926.pdf">(PDF)</a>
+D. J. Bernstein, Peter Schwabe.
+New AES software speed records.
+Document ID: b90c51d2f7eef86b78068511135a231f.
+URL: https://cr.yp.to/papers.html#aesspeed.
+Date: 2008.09.26.
+Supersedes:
+<a href="https://cr.yp.to/aes-speed/aesspeed-20080908.pdf">(PDF)</a>
+2008.09.08.
+</li></ul>
+The software is now available as part of the
+<a href="https://cr.yp.to/streamciphers/timings.html#toolkit-estreambench">estreambench</a>
+toolkit.
+We have placed the software into the public domain;
+feel free to integrate it into your own AES applications!
+<p>
+Information below this line has not yet been updated.
+</p><hr/>
+This document describes various speedups in AES software.
+This document assumes that
+the software is going to be used in an application
+where timing information is <i>not</i> exposed to attackers.
+<p>
+The reader is expected to already know the standard structure of AES software:
+</p><ul>
+<li>each of the 16 state bytes is used as an index for a table lookup producing a 32-bit word;
+</li><li>16 xors combine these 16 words and 4 expanded key words into 4 new state words;
+</li><li>those 4 words are viewed as the starting 16 bytes for the next round.
+</li></ul>
+See Section 5.2.1 of "AES Proposal: Rijndael" by Daemen and Rijmen.
+<h2>Endianness</h2>
+On a little-endian CPU,
+extracting the first byte of a 32-bit word
+is an &amp;0xff arithmetic instruction;
+on a big-endian CPU,
+extracting the first byte of a 32-bit word
+is a &gt;&gt;24 arithmetic instruction.
+Similar comments apply to the other bytes.
+<p>
+One can write AES software
+that uses arithmetic instructions as if the CPU were little-endian.
+If the CPU is actually big-endian,
+the software swaps the bytes of the AES key, input, and output (at run time).
+The software also swaps the bytes of the table (at compile time),
+for example by expressing the table as a sequence of 32-bit integers.
+</p><p>
+<b>Matched endianness.</b>
+One can easily eliminate the byte-swapping time for the AES key, input, and output:
+simply use the appropriate arithmetic instructions
+for the endianness of the CPU.
+In this case the table must not be swapped.
+</p><h2>Table structure</h2>
+All else being equal, smaller AES tables are faster:
+they take less time to load into cache and are more likely to stay in cache.
+Beware that most benchmarking tools preload caches and thus can't see this speedup.
+<p>
+Daemen and Rijmen suggest "4 KBytes of tables."
+There are 4 tables.
+Each table has 256 words occupying 1024 bytes.
+The loads are spread evenly across the tables.
+</p><p>
+<b>Rotated lookups.</b>
+Daemen and Rijmen suggest an alternative "with a total table size of 1KByte"
+but with extra arithmetic.
+The point is that the tables are rotations of each other:
+for example,
+the first word of the first table is (0xc6,0x63,0x63,0xa5),
+the first word of the second table is (0xa5,0xc6,0x63,0x63),
+the first word of the third table is (0x63,0xa5,0xc6,0x63),
+and the first word of the fourth table is (0x63,0x63,0xa5,0xc6).
+One can store the first table,
+and simulate a lookup in another table at the cost of an extra rotation.
+</p><p>
+<b>Unaligned loads.</b>
+One can instead use a single 2KB table having 256 8-byte entries
+such as (0x00,0x63,0xa5,0xc6,0x63,0x63,0xa5,0xc6).
+There are many reasonable choices of pattern here;
+what's important is that the pattern includes the desired
+(0xc6,0x63,0x63,0xa5) and (0xa5,0xc6,0x63,0x63) and so on as substrings.
+On the Pentium, the PowerPC, et al.,
+one can load 4-byte words from memory addresses that aren't divisible by 4,
+and there's no penalty when the word doesn't cross an 8-byte boundary.
+</p><h2>Masked loads</h2>
+16 of the 160 table lookups in 10-round AES are masked.
+The 40 table lookups in 10-round AES key expansion are also masked.
+The masks are 0x000000ff, 0x0000ff00, 0x00ff0000, and 0xff000000, each used equally often.
+<p>
+The simplest way to compute a mask is with an arithmetic instruction: for example, &amp;0xff00.
+</p><p>
+<b>Byte loads.</b>
+One can eliminate 25% of the masks,
+namely the bottom-byte masks,
+by combining them with load instructions.
+All popular CPUs have single-byte-load instructions.
+</p><p>
+<b>Two-byte loads.</b>
+One can eliminate another 25% of the masks
+on CPUs with two-byte-load instructions.
+This constrains the table pattern:
+it's important to have (0x00,0x63) on little-endian CPUs,
+and (0x63,0x00) on big-endian CPUs.
+</p><p>
+<b>Masked tables.</b>
+One can eliminate all of the masks by precomputing masked tables, using extra table space.
+The simplest table structure uses a total of 8KB.
+Two tables, one with entries such as (0x00,0x63,0xa5,0xc6,0x63,0x63,0xa5,0xc6)
+and another with entries such as (0x00,0x00,0x00,0x00,0x63,0x00,0x00,0x00),
+use a total of 4KB.
+In my experience,
+the cost of larger tables outweighs the benefit of eliminating a few masks.
+</p><h2>Key expansion</h2>
+A 4-word (128-bit) key is expanded in 40 steps.
+Each step produces a new word, totalling 44 words in the expanded key.
+A step has a byte extraction (see below), a masked load, and two xors.
+The total work is 40 byte extractions, 40 masked loads, and 80 xors.
+For comparison, the subsequent work to encrypt a block involves
+160 byte extractions, 160 loads (of which 16 are masked), and 160 xors.
+<p>
+Daemen and Rijmen say (Section 4.3.2)
+that key expansion involves "almost no computational overhead."
+Obviously key expansion is less expensive than encrypting a block.
+On the other hand, the cost of key expansion is still quite noticeable.
+</p><p>
+<b>Expanded keys.</b>
+A typical AES implementation precomputes and stores an expanded key.
+The 40 byte extractions, 40 masked loads, and 80 xors aren't repeated for every block;
+they are done only once, along with 44 stores.
+Each block then involves 44 extra loads for the expanded key.
+Some stores and loads can be eliminated
+if many blocks are handled at once
+and some extra registers are available.
+</p><p>
+Long-term storage of an expanded key can slow down applications that handle many keys:
+the expanded keys take more time to load into cache
+than the original keys and are less likely to stay in cache.
+</p><p>
+<b>Partially expanded keys.</b>
+An alternative is to precompute and store a partially expanded key,
+only 14 words instead of 44 words.
+The partially expanded key consists of words
+0, 1, 2, 3, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40 from the expanded key.
+Loading the partially expanded key, and converting it into the fully expanded key,
+takes only 14 loads and 30 xors.
+</p><p>
+One can interpolate between partial expansion and full expansion,
+using various amounts of storage per key and achieving various balances between load and xor.
+</p><h2>Index extraction</h2>
+The 16 xor operations in an AES round
+produce 4 words in 4 integer registers.
+The 16 bytes of these words are then extracted and used as indices for the next round.
+<p>
+The simplest way to extract 4 bytes is using 6 instructions,
+namely 3 shifts and 3 bottom-byte extractions:
+&amp;255;
+(&gt;&gt;8)&amp;255;
+(&gt;&gt;16)&amp;255;
+&gt;&gt;24.
+</p><p>
+Using a byte as an index then requires multiplying the byte by a constant
+that depends on the table structure.
+Let's assume the 2KB tables described above; then the constant is 8.
+The multiplications use 4 shifts:
+&lt;&lt;3;
+&lt;&lt;3;
+&lt;&lt;3;
+&lt;&lt;3.
+</p><p>
+<b>Scaled-index loads.</b>
+Many CPUs can multiply an index register by 8 for free as part of a load.
+</p><p>
+<b>Scaled-index extractions.</b>
+What about CPUs that can't multiply an index register by 8 for free?
+Two of the multiplications can nevertheless be eliminated,
+because they can be combined with shifts.
+The overall extract-and-scale sequence has 8 instructions:
+(&lt;&lt;3)&amp;2040;
+(&gt;&gt;5)&amp;2040;
+(&gt;&gt;13)&amp;2040;
+(&gt;&gt;21)&amp;2040.
+The PowerPC has a combined rotate-and-mask instruction,
+making this sequence take only 4 instructions.
+</p><p>
+<b>Scaled tables.</b>
+One can rotate table entries by 3 bits,
+reducing the above 8 instructions to 7 instructions.
+</p><p>
+<b>Second-byte instructions.</b>
+The x86 architecture (Pentium, Athlon, etc.)
+includes a combined (&gt;&gt;8)&amp;255 instruction.
+This means that extracting 4 bytes takes only 5 instructions:
+&amp;255;
+(&gt;&gt;8)&amp;255;
+&gt;&gt;16;
+&amp;255;
+&gt;&gt;8.
+Alternate 5-instruction sequence:
+&amp;255;
+(&gt;&gt;8)&amp;255;
+&gt;&gt;16;
+&amp;255;
+(&gt;&gt;8)&amp;255.
+</p><p>
+Of course, the ultimate measure of performance is a cycle count, not an instruction count.
+Matsui states that the (&gt;&gt;8)&amp;255; instruction is "a bit expensive"
+on the Pentium 4 Prescott (f33, f34, f41);
+presumably this means that the instruction takes more cycles than, e.g., a mere &amp;255.
+But all of the measurements I've seen indicate the opposite.
+I'm not sure what I'm missing here.
+</p><p>
+<b>32-bit shifts on 64-bit architectures.</b>
+The amd64 architecture (P4E, Athlon 64, Core 2, etc.) can right-shift a 64-bit register,
+but Matsui comments that this operation is extremely slow on the P4E.
+It's much better to use the amd64's x86-compatible right-shift instruction;
+this instruction sets the top 32 bits of its 64-bit input to 0 before shifting.
+</p><p>
+<b>Byte extraction via loads.</b>
+A completely different way to extract 4 bytes is with 1 store and 4 loads.
+One can mix this with the previous approaches
+to achieve various balances between load and arithmetic.
+</p><p>
+Consider, for example, the UltraSPARC,
+which has 2 integer units and 1 load/store unit.
+A traditional sequence of
+14 partially-expanded-key loads (see below), 30 key-expansion xors,
+160 scaled-index extractions, 160 table-lookup loads, 160 xors, 16 masks,
+4 input loads, and 4 output stores
+occupies a total of 526 integer instructions (at least 263 cycles)
+and 182 loads (at least 182 cycles).
+Using loads for some byte extractions,
+replacing 36 scaled-index extractions with 9 stores and 36 loads,
+means a total of 454 integer instructions (at least 227 cycles)
+and 227 loads/stores (at least 227 cycles).
+</p><h2>Unrolling</h2>
+A typical 9-iteration AES loop
+involves 9 increments of a loop index, 9 comparisons, and 9 branches,
+one of which is mispredicted on most CPUs.
+The loop index also consumes a register,
+forcing an extra 9 stores and 9 loads on CPUs that don't have registers to spare.
+<p>
+<b>Full unrolling.</b>
+One can eliminate all of these costs by fully unrolling the loop.
+Beware, however, that full unrolling costs a few kilobytes of code-cache space.
+</p><p>
+<b>Partial unrolling.</b>
+CPUs are more likely to correctly predict a 4-iteration loop than a 9-iteration loop.
+</p><h2>Instruction scheduling</h2>
+The 16 table lookups in an AES round are independent
+and can be scheduled in many different ways.
+One can, for example,
+perform all the table lookups for the first input from bottom byte to top
+(outputs 0, 3, 2, 1),
+then perform all the table lookups for the second input from bottom byte to top
+(outputs 1, 0, 3, 2),
+then perform all the table lookups for the third input from bottom byte to top
+(outputs 2, 1, 0, 3),
+then perform all the table lookups for the fourth input from bottom byte to top
+(outputs 3, 2, 1, 0).
+One can, as another example,
+first perform all the table lookups for the first output in order of the inputs,
+then perform all the table lookups for the second output in order of the inputs,
+etc.
+<p>
+<b>Maximum parallelism.</b>
+The overall depth of the AES round is
+one byte extraction plus one table lookup plus two xors:
+a mythical CPU offering extensive parallelism
+could perform all sixteen byte extractions in parallel,
+then all sixteen table lookups in parallel,
+then eight xors in parallel,
+then four xors in parallel.
+Note that each output is obtained by xor'ing two parallel xor's,
+rather than by three serial xor's.
+</p><p>
+<b>Deferring loads.</b>
+The amd64 architecture poses several challenges to AES instruction scheduling.
+First,
+most integer instructions require the output register to be one of the input registers.
+Second,
+typical amd64 CPUs handle a load and xor most efficiently as a unified load-xor,
+but a unified load-xor gives no opportunity to switch registers.
+Third,
+only 4 registers (eax, ebx, ecx, edx) allow second-byte instructions.
+</p><p>
+Matsui concludes that, on amd64 (and x86),
+keeping each round's inputs y0, y1, y2, y3 and outputs z0, z1, z2, z3 in eax, ebx, ecx, edx,
+to allow second-byte instructions,
+is "impossible without saving/restoring."
+But that's incorrect.
+No extra copies are required.
+A careful instruction sequence
+uses the minimal conceivable number of instructions:
+20 for byte extraction,
+16 for table lookups,
+and 4 for handling the expanded key.
+The idea is to extract all the bytes from an input,
+freeing the input's register for an output,
+before doing any table lookups involving that output:
+</p><ul>
+<li>Extract the 4 bytes from y0.
+At this point y1, y2, y3, and the 4 bytes are live.
+</li><li>Feed 1 byte into z0.
+At this point y1, y2, y3, z0, and 3 more bytes are live.
+</li><li>Extract the 4 bytes from y1, immediately feeding 1 into z0.
+At this point y2, y3, z0, and 6 more bytes are live.
+</li><li>Feed 2 bytes into z1.
+At this point y2, y3, z0, z1, and 4 more bytes are live.
+</li><li>Extract the 4 bytes from y2, immediately feeding 2 into z0 and z1.
+At this point y3, z0, z1, and 6 more bytes are live.
+</li><li>Feed 3 bytes into z2.
+At this point y3, z0, z1, z2, and 3 more bytes are live.
+</li><li>Extract the 4 bytes from y3, immediately feeding 3 into z0, z1, and z2.
+At this point z0, z1, z2, and 4 more bytes are live.
+</li><li>Feed 4 bytes into z3.
+At this point z0, z1, z2, and z3 are live.
+</li><li>Handle 4 words of the expanded key.
+</li></ul>
+The maximum number of live registers here is 9,
+fitting easily into the amd64 instruction set.
+<p>
+<b>Squeezing inputs and outputs into 7 32-bit registers.</b>
+The x86 architecture poses an additional challenge to AES instruction scheduling:
+there are only 7 general-purpose integer registers.
+</p><p>
+It's still possible to handle a round with 0 stores, 4 expanded-key loads,
+and 16 loads for table lookups.
+The shortest instruction sequence that I know has a total of 46 instructions,
+6 more than what would be possible with extra registers;
+1 of the 46 instructions can be eliminated if the key expansion is changed.
+</p><p>
+The idea of this instruction sequence
+is to rotate y0 by 16 bits,
+use the bottom two bytes of both y0 and y2,
+and then merge the remaining four bytes of y0 and y2 into a single register
+(for example, shifting y0 down 16 bits, masking y1, and adding the results),
+freeing a register at the cost of 3 extra instructions (the rotate, the mask, and the add);
+splitting 3 load-xor instructions into 3 loads and 3 xors
+then easily puts all outputs into suitable registers.
+The rotation can be eliminated if the expanded-key word that corresponds to y0
+is rotated by 16 bits.
+</p><h2>Speed reports</h2>
+Speed reports vary in whether they use CTR, CBC, etc.,
+and in the exact rules for measuring speeds.
+The "eSTREAM" cycles/byte counts are
+for counter-mode AES measured by the eSTREAM benchmarking toolkit;
+future implementors are encouraged to support the eSTREAM interface for direct comparability.
+<table border="">
+<tbody><tr><th>Architecture</th><th>CPU</th><th>eSTREAM cycles/byte</th><th>Ad-hoc cycles/byte</th><th>Software</th></tr>
+<tr><td>amd64</td><td>Intel Core 2 Duo (6f6)?</td><td></td><td>9.2</td><td>Matsui/Nakajima (CHES 2007)</td></tr>
+<tr><td>amd64</td><td>AMD Athlon 64 (15,75,2)?</td><td></td><td>10.625 (170/block)</td><td>Matsui (FSE 2006)</td></tr>
+<tr><td>amd64</td><td>AMD Athlon 64 (15,75,2)?</td><td></td><td>12.4375 (199/block)</td><td>Lipmaa</td></tr>
+<tr><td>amd64</td><td>Intel Core 2 Duo (6f6); katana</td><td>12.56</td><td></td><td>hongjun/v1/1</td></tr>
+<tr><td>amd64</td><td>Intel Core 2 Quad Q6600 (6fb); latour</td><td>12.57</td><td></td><td>hongjun/v1/1</td></tr>
+<tr><td>amd64</td><td>AMD Athlon 64 (15,75,2)?</td><td></td><td>13.125 (210/block)</td><td>Osvik</td></tr>
+<tr><td>amd64</td><td>AMD Athlon 64 X2 (15,75,2); mace</td><td>13.32</td><td></td><td>hongjun/v1/1</td></tr>
+<tr><td>amd64</td><td>AMD Opteron 240 (f58); nmisles8amd64</td><td>13.45</td><td></td><td>bernstein/amd64-1/1</td></tr>
+<tr><td>x86</td><td>Intel Pentium III (68a)?</td><td></td><td>14 (224/block)</td><td>Osvik</td></tr>
+<tr><td>x86</td><td>AMD Athlon (622)?</td><td></td><td>14.0625 (225/block)</td><td>Osvik</td></tr>
+<tr><td>x86</td><td>Intel Pentium III (68a)?</td><td></td><td>14.125 (226/block)</td><td>Lipmaa</td></tr>
+<tr><td>x86</td><td>Intel Pentium 4 (f12)?</td><td></td><td>15 (240/block)</td><td>Osvik</td></tr>
+<tr><td>x86</td><td>Intel Pentium 4 (f12)?</td><td></td><td>15.875 (254/block)</td><td>Lipmaa</td></tr>
+<tr><td>x86</td><td>Intel Pentium M (695); whisper</td><td>15.96</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>amd64</td><td>Intel Pentium 4 (f64)?</td><td></td><td>16 (256/block)</td><td>Matsui (FSE 2006)</td></tr>
+<tr><td>x86</td><td>Intel Pentium III (68a)?</td><td></td><td>16.25 (260/block)</td><td>Gladman</td></tr>
+<tr><td>amd64</td><td>Intel Pentium D (f64); nmi0161</td><td>16.74</td><td></td><td>bernstein/amd64-2/1</td></tr>
+<tr><td>amd64</td><td>Intel Pentium D (f64); svlin001</td><td>16.75</td><td></td><td>bernstein/amd64-2/1</td></tr>
+<tr><td>amd64</td><td>Intel Xeon (f41); nmi0056</td><td>16.75</td><td></td><td>bernstein/amd64-2/1</td></tr>
+<tr><td>amd64</td><td>Intel Xeon (f4a); nmi0090</td><td>16.77</td><td></td><td>bernstein/amd64-2/1</td></tr>
+<tr><td>sparc</td><td>Sun UltraSPARC III</td><td></td><td>16.875 (270/block)</td><td>Lipmaa</td></tr>
+<tr><td>amd64</td><td>Intel Xeon (f41); nmi0057</td><td>16.89</td><td></td><td>bernstein/amd64-2/1</td></tr>
+<tr><td>amd64</td><td>Intel Pentium D (f64); speed</td><td>16.90</td><td></td><td>bernstein/amd64-2/1</td></tr>
+<tr><td>amd64</td><td>Intel Pentium D (f64); nmi0104</td><td>16.90</td><td></td><td>bernstein/amd64-2/1</td></tr>
+<tr><td>amd64</td><td>Intel Pentium D (f64); nmi0241</td><td>16.93</td><td></td><td>bernstein/amd64-2/1</td></tr>
+<tr><td>ppc64</td><td>IBM POWER5; nmi0154</td><td>16.93</td><td></td><td>bernstein/big-1/1</td></tr>
+<tr><td>x86</td><td>Intel Pentium 4 (f24); nmi0086</td><td>16.96</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Pentium 4 (f12); fireball</td><td>16.98</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Pentium 4 (f24); nmitest4</td><td>17.01</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>ppc64</td><td>IBM PowerPC G5 970; nmi0048</td><td>17.17</td><td></td><td>bernstein/big-1/1</td></tr>
+<tr><td>x86</td><td>Intel Pentium 2 (652); boris</td><td>17.33</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Pentium 3 (68a)</td><td>17.49</td><td></td><td>Bernstein aes-128/x86-mmx-1</td></tr>
+<tr><td>x86</td><td>Intel Pentium 3 (672); orpheus</td><td>17.55</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Pentium M (6d8)</td><td>17.57</td><td></td><td>Wu v0/1</td></tr>
+<tr><td>x86</td><td>Intel Pentium 4 (f33)?</td><td></td><td>17.75 (284/block)</td><td>Matsui/Fukuda (FSE 2005)</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f29); nmibuild40</td><td>17.79</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f27); nmi0059</td><td>17.79</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f25); nmibuild16</td><td>17.79</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f25); nmi0013</td><td>17.79</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f29); nmi0059</td><td>17.80</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f29); nmibuild17</td><td>17.81</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f25); nmibuild15</td><td>17.82</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f25); nmibuild26</td><td>17.83</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f25); nmibuild21</td><td>17.83</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f25); nmi0036</td><td>17.84</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f25); nmibuild22</td><td>17.84</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>AMD Athlon (622); thoth</td><td>18.38</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>ppc32</td><td>IBM POWER4; nmibuild14</td><td>18.55</td><td></td><td>bernstein/little-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f41); nmi0079</td><td>18.88</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f41); nmi0062</td><td>18.89</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>amd64</td><td>Intel Core 2 Duo (6f6)</td><td></td><td>18.9</td><td>OpenSSL 0.9.8e</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f41); nmi0061</td><td>18.91</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Pentium 4 (f41); svlin002</td><td>18.94</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f41); nmi0076</td><td>18.96</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f4a); nmi0102</td><td>18.97</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f41); nmi0060</td><td>18.97</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Xeon (f41); nmi0063</td><td>18.95</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>Intel Pentium 3 (68a)</td><td>19.06</td><td></td><td>Wu v1/1</td></tr>
+<tr><td>ppc32</td><td>Motorola PowerPC G4 7410; gggg</td><td>19.11</td><td></td><td>bernstein/big-1/1</td></tr>
+<tr><td>amd64</td><td>Intel Core 2 Duo (6f6)</td><td></td><td>19.5</td><td>OpenSSL 0.9.8a</td></tr>
+<tr><td>x86</td><td>AMD Athlon (622)?</td><td></td><td>19.9375 (319/block)</td><td>Lipmaa</td></tr>
+<tr><td>x86</td><td>Intel Pentium 1 (52c)</td><td></td><td>20 (320/block)</td><td>Lipmaa</td></tr>
+<tr><td>sparc</td><td>Sun UltraSPARC III</td><td>20.75</td><td></td><td>Bernstein big-1/1</td></tr>
+<tr><td>amd64</td><td>AMD Athlon 64 (15,75,2)</td><td></td><td>20.9</td><td>OpenSSL 0.9.8e</td></tr>
+<tr><td>ppc32</td><td>Motorola PowerPC G4 7400; nmi0042</td><td>20.92</td><td></td><td>bernstein/big-1/1</td></tr>
+<tr><td>x86</td><td>Intel Pentium M (6d8)</td><td></td><td>21</td><td>OpenSSL 0.9.8a</td></tr>
+<tr><td>x86</td><td>Intel Pentium D (f47); shell</td><td>21.58</td><td></td><td>bernstein/x86-mmx-1/1</td></tr>
+<tr><td>x86</td><td>AMD Athlon (622)</td><td></td><td>22</td><td>OpenSSL 0.9.8a</td></tr>
+<tr><td>x86</td><td>Intel Pentium 4 (f29)</td><td></td><td>22</td><td>OpenSSL 0.9.8b</td></tr>
+<tr><td>amd64</td><td>AMD Athlon 64 (15,75,2)?</td><td></td><td>23.5</td><td>OpenSSL 0.9.7e</td></tr>
+<tr><td>x86</td><td>Intel Pentium 4 (f41)</td><td></td><td>23.5</td><td>OpenSSL 0.9.8a</td></tr>
+<tr><td>x86</td><td>Intel Pentium 3 (672); orpheus</td><td></td><td>23.62</td><td>OpenSSL 0.9.8e</td></tr>
+<tr><td>ppc32</td><td>Motorola PowerPC G4 7410</td><td></td><td>24.0625 (385/block)</td><td>Ahrens</td></tr>
+<tr><td>x86</td><td>Intel Pentium 4 (f12)</td><td></td><td>24.4</td><td>OpenSSL 0.9.8a</td></tr>
+<tr><td>sparc</td><td>Sun UltraSPARC III</td><td></td><td>25</td><td>OpenSSL</td></tr>
+<tr><td>ppc32</td><td>Motorola PowerPC G4 7410</td><td></td><td>25.0625 (401/block)</td><td>Ahrens</td></tr>
+<tr><td>x86</td><td>Intel Core Duo; nmi0068</td><td>25.74</td><td></td><td>gladman/1</td></tr>
+<tr><td>amd64</td><td>Intel Pentium D (f64); speed</td><td></td><td>27.33</td><td>OpenSSL 0.9.8e</td></tr>
+<tr><td>ppc32</td><td>Motorola PowerPC G4 7410; gggg</td><td></td><td>29.32</td><td>OpenSSL 0.9.8c</td></tr>
+<tr><td>sparcv9</td><td>Sun UltraSPARC III; nmi0051</td><td>29.45</td><td></td><td>bernstein/big-1/1</td></tr>
+<tr><td>sparcv9</td><td>Sun UltraSPARC III; nmisolaris10</td><td>29.46</td><td></td><td>bernstein/big-1/1</td></tr>
+<tr><td>ppc64</td><td>IBM Cell PPE; nmips3</td><td>35.20</td><td></td><td>bernstein/big-1/1</td></tr>
+<tr><td>amd64</td><td>Intel Pentium 4 (f64)</td><td></td><td>37</td><td>OpenSSL 0.9.7f</td></tr>
+<tr><td>x86</td><td>Intel Pentium 4 (f29)</td><td></td><td>39</td><td>OpenSSL 0.9.7e</td></tr>
+<tr><td>sparc</td><td>Sun UltraSPARC III</td><td></td><td>46.875 (750/block)</td><td>Bassham</td></tr>
+<tr><td>x86</td><td>Intel Pentium 1 (52c); cruncher</td><td>38.20</td><td></td><td>hongjun/v1/1</td></tr>
+</tbody></table>
+<p>
+Regarding amd64 Intel Pentium 4,
+Matsui writes: 
+"The number of memory reads
+for one block encryption of AES
+is 4 (for plaintext loads)
++ 11 x 4 (for subkey loads)
++ 16 x 10 (for table lookups)
+= 208,
+which means that Pentium 4 takes at least 208 cycles/block for one block encryption."
+But this lower bound ignores the possibility of loading partially expanded keys,
+saving as many as 30 loads,
+and using 64-bit loads for keys and plaintext,
+saving 9 more loads.
+</p><p>
+Regarding amd64 AMD Athlon 64,
+Matsui writes:
+"Considering an instruction latency of Athlon 64, the theoretical limit of AES
+performance on this processor seems around 16 cycles/round = 160 cycles/block.
+Our result is hence reaching closely this limit."
+
+
+</p></body></html>
diff --git a/archive/manifest.yaml b/archive/manifest.yaml
new file mode 100644
index 0000000..781b853
--- /dev/null
+++ b/archive/manifest.yaml
@@ -0,0 +1,28 @@
+# archive/manifest.yaml — curated list of works to preserve.
+# Edited by hand. Tools never write to this file. See ARCHIVE.md.
+#
+# Per-artifact cap: 25 MB. Above that, archive.py warns and skips the fetch;
+# commit an oversize artifact deliberately with `git add -f`.
+#
+# To evict an entry, see archive/removed.yaml — record there FIRST, then
+# delete the line here, then run `make archive-gc`.
+
+- url: "https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.203.pdf"
+  slug: nist-fips-203
+  title: "FIPS 203 — Module-Lattice-Based Key-Encapsulation Mechanism Standard"
+  type: pdf
+  tags: [research]
+  note: >
+    The ML-KEM standard. Cited in the SIMD / post-quantum systems work;
+    archived so the citation survives any future reorganization of the
+    NIST publications site.
+
+- url: "https://cr.yp.to/aes-speed.html"
+  slug: djb-aes-speed
+  title: "AES speed (cr.yp.to)"
+  # type: html — auto-detected from the .html extension; no override needed.
+  tags: [research]
+  note: >
+    Bernstein's survey of AES software speedups, cited in the SIMD work. The
+    Phase 2 bootstrap entry: a stable, JavaScript-free static page, so its
+    monolith snapshot is reproducible and classifies cleanly as `ok`.
diff --git a/archive/nist-fips-203/PROVENANCE.json b/archive/nist-fips-203/PROVENANCE.json
new file mode 100644
index 0000000..61474ec
--- /dev/null
+++ b/archive/nist-fips-203/PROVENANCE.json
@@ -0,0 +1,14 @@
+{
+  "url": "https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.203.pdf",
+  "slug": "nist-fips-203",
+  "title": "FIPS 203 — Module-Lattice-Based Key-Encapsulation Mechanism Standard",
+  "type": "pdf",
+  "artifact": "document.pdf",
+  "sha256": "fe1f12f32a7e44ec9fdebbf400cda843a40b506dee676725234dc6f7923b6cac",
+  "previous-sha256": null,
+  "bytes": 1252341,
+  "archived": "2026-05-22",
+  "source-date": null,
+  "snapshot-quality": "ok",
+  "wayback": "http://web.archive.org/web/20260515100505/https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.203.pdf"
+}
diff --git a/archive/nist-fips-203/document.pdf b/archive/nist-fips-203/document.pdf
new file mode 100644
index 0000000..a97b548
Binary files /dev/null and b/archive/nist-fips-203/document.pdf differ
diff --git a/archive/removed.yaml b/archive/removed.yaml
new file mode 100644
index 0000000..0501e08
--- /dev/null
+++ b/archive/removed.yaml
@@ -0,0 +1,19 @@
+# archive/removed.yaml — record of evicted archive entries.
+#
+# Append an entry here BEFORE deleting its line from manifest.yaml, then
+# run `make archive-gc`. The GC deletes only archive/<slug>/ directories
+# whose slug is recorded here; an orphaned directory absent from this file
+# is reported, never deleted. See ARCHIVE.md § Eviction & removal.
+#
+# Schema (all fields but `note` required):
+#   url:      original URL at time of removal
+#   slug:     the archive/<slug>/ directory archive-gc may delete
+#   removed:  ISO date of removal
+#   reason:   takedown | author-request | legal | quality
+#   note:     optional free-text context
+#
+# This is not a hostile-tracking list — it exists so GC knows what is safe
+# to delete, re-adding a removed URL is surfaced loudly, and the link-rot
+# scanner and `archive-suggest` skip removed works.
+
+[]
diff --git a/build/Archive.hs b/build/Archive.hs
new file mode 100644
index 0000000..6940f54
--- /dev/null
+++ b/build/Archive.hs
@@ -0,0 +1,579 @@
+{-# LANGUAGE GHC2021 #-}
+{-# LANGUAGE OverloadedStrings #-}
+-- | Archive section — the link-archiving system. Phases 1-2: PDF and HTML.
+--
+--   Authored input:        archive/manifest.yaml (one line per archived link)
+--   Generated, committed:  archive/<slug>/{document.pdf | snapshot.html}
+--                          + PROVENANCE.json
+--   Generated, gitignored: archive/<slug>/{document,snapshot}.txt
+--                          + data/archive-index.json
+--
+--   @tools/archive.py fetch@ runs before the Hakyll build: it downloads
+--   PDFs / snapshots HTML pages with @monolith@, extracts text, and writes
+--   each PROVENANCE.json. This module then routes the artifacts and renders
+--   one @/archive/<slug>/@ page per entry plus the @/archive/@ index.
+--
+--   An entry that has not been fetched (no PROVENANCE.json) is skipped — it
+--   produces no page; a PROVENANCE.json whose artifact file is missing halts
+--   the build. An orphaned @archive/<slug>/@ directory with no manifest line
+--   is inert (no page, not deployed). Artifact-integrity (SHA-256) verification
+--   runs on both sides: @archive.py fetch@ re-hashes before the Hakyll
+--   build, and 'verifyArtifactSha' (below) re-hashes again in
+--   'loadArchiveEntries' — so the guarantee holds even when @archive.py@
+--   does not run first (no @.venv@, a direct @cabal run site -- build@,
+--   or a deploy host without the Python toolchain).
+--
+--   See @ARCHIVE.md@ at the repo root for the full design and phase plan.
+module Archive (archiveRules, archiveBuildStats) where
+
+import           Control.Exception      (SomeException, catch)
+import           Control.Monad          (filterM, forM, when)
+import           Data.Function          (on)
+import           Data.List              (groupBy, intercalate, sort, sortBy)
+import qualified Data.Map.Strict        as Map
+import           Data.Maybe             (catMaybes, fromMaybe)
+import           Data.Ord               (Down (..), comparing)
+import qualified Data.Set               as Set
+import qualified Data.Text              as T
+import           Data.Time              (Day, diffDays, fromGregorian, fromGregorianValid,
+                                         getCurrentTime, utctDay)
+import qualified Data.Aeson             as A
+import           Data.Aeson             ((.:), (.:?))
+import qualified Data.Yaml              as Y
+import           System.Directory       (doesDirectoryExist, doesFileExist,
+                                         listDirectory)
+import           System.Exit            (exitFailure)
+import           System.IO              (hPutStrLn, readFile', stderr)
+import           System.Process         (readProcess)
+import           Text.Read              (readMaybe)
+import           Hakyll
+import           Contexts               (siteCtx)
+import           Backlinks              (referencedByField)
+import           SimilarLinks           (similarLinksField)
+import           ArchiveIndex           (ArchiveStatus (..), statusName,
+                                         archiveStatusForSlug, normalizeUrl)
+
+-- ---------------------------------------------------------------------------
+-- Data model
+-- ---------------------------------------------------------------------------
+
+-- | One authored entry in @archive/manifest.yaml@ — only the fields this
+--   module consumes. @title:@, @type:@ and @tags:@ are read by
+--   @tools/archive.py@ (title and type fold into PROVENANCE.json; tags are
+--   Phase 4) and need no Haskell-side binding.
+data ManifestEntry = ManifestEntry
+    { meUrl        :: String          -- ^ @url:@ — required; join key to provenance
+    , meNote       :: Maybe String    -- ^ @note:@ — optional
+    , mePaywalled  :: Bool            -- ^ @paywalled:@ — 'False' when omitted
+    , meVisibility :: String          -- ^ "public" (default) | "private"
+    }
+
+instance A.FromJSON ManifestEntry where
+    parseJSON = A.withObject "ManifestEntry" $ \obj -> do
+        entryUrl  <- obj .:  "url"
+        entryNote <- obj .:? "note"
+        isPaid    <- fromMaybe False    <$> obj .:? "paywalled"
+        vis       <- fromMaybe "public" <$> obj .:? "visibility"
+        -- Fail closed on this publication/privacy field: accepting an
+        -- unknown value (say, a typo'd "privte") as public would publish
+        -- an artifact the author meant to keep offline.
+        if vis == "public" || vis == "private"
+            then return (ManifestEntry entryUrl entryNote isPaid vis)
+            else fail $ "manifest entry " ++ entryUrl
+                ++ ": visibility must be \"public\" or \"private\", got "
+                ++ show vis
+
+-- | One @archive/removed.yaml@ record; only its @url@ is consumed here.
+newtype RemovedEntry = RemovedEntry { reUrl :: String }
+
+instance A.FromJSON RemovedEntry where
+    parseJSON = A.withObject "RemovedEntry" (fmap RemovedEntry . (.: "url"))
+
+-- | One generated @archive/<slug>/PROVENANCE.json@ — the immutable
+--   record of an archival event, written by @tools/archive.py@.
+data Provenance = Provenance
+    { pvUrl      :: String          -- ^ original URL; join key to the manifest
+    , pvSlug     :: String          -- ^ slug used for the @/archive/<slug>/@ route
+    , pvTitle    :: String          -- ^ rendered as the page title
+    , pvType     :: String          -- ^ "pdf" | "html"
+    , pvArtifact :: String          -- ^ "document.pdf" | "snapshot.html"
+    , pvSha256   :: String          -- ^ digest checked by 'verifyArtifactSha'
+    , pvBytes    :: Integer         -- ^ byte count, rendered via 'formatBytes'
+    , pvArchived :: String          -- ^ snapshot date; read as @YYYY-MM-DD@
+    , pvQuality  :: String          -- ^ "ok" | "degraded" | "js-required"
+    , pvWayback  :: Maybe String    -- ^ @wayback@ URL, when recorded
+    }
+
+instance A.FromJSON Provenance where
+    parseJSON = A.withObject "Provenance" $ \o -> Provenance
+        <$> o .:  "url"       -- positional: order must match the record above
+        <*> o .:  "slug"
+        <*> o .:  "title"
+        <*> o .:  "type"
+        <*> o .:  "artifact"
+        <*> o .:  "sha256"
+        <*> o .:  "bytes"
+        <*> o .:  "archived"
+        <*> (fromMaybe "ok" <$> o .:? "snapshot-quality")   -- absent => "ok"
+        <*> o .:? "wayback"                                 -- optional key
+
+-- | A renderable archive entry: the authored manifest line joined with
+--   its generated provenance and extracted full text. @aeTextId@ is the
+--   on-disk path of the extracted-text sidecar when it exists (it is
+--   gitignored, so a no-@.venv@ build may lack it).
+data ArchiveEntry = ArchiveEntry
+    { aeManifest :: ManifestEntry     -- ^ the authored manifest line
+    , aeProv     :: Provenance        -- ^ the matching PROVENANCE.json
+    , aeFulltext :: String            -- ^ sidecar text; "" when absent/unreadable
+    , aeTextId   :: Maybe FilePath    -- ^ sidecar path, if the file exists
+    , aeStatus   :: ArchiveStatus     -- ^ link-rot status of the original
+    }
+
+-- | Name of the extracted-text sidecar that accompanies an artifact type.
+textFileFor :: Provenance -> String
+textFileFor pv = case pvType pv of
+    "html" -> "snapshot.txt"
+    _      -> "document.txt"
+
+-- | Whether the entry is @visibility: private@. Such an entry stays in-repo
+--   as a local preservation copy; its artifact is never routed to @_site/@
+--   and its extracted text is never rendered into the page.
+isPrivate :: ArchiveEntry -> Bool
+isPrivate ae = meVisibility (aeManifest ae) == "private"
+
+-- ---------------------------------------------------------------------------
+-- Rule-generation-time IO (runs inside 'preprocess')
+-- ---------------------------------------------------------------------------
+
+manifestPath, removedPath :: FilePath
+manifestPath = "archive/manifest.yaml"   -- authored list of archived links
+removedPath = "archive/removed.yaml"     -- record of evicted entries
+
+-- | Load @archive/manifest.yaml@.
+--
+--   * File absent: the result is @[]@ — the archive degrades to invisible,
+--     matching the @.venv@-gated silent-skip convention.
+--   * File present but unparseable: the build halts. Degrading to
+--     invisible here would swallow real, publication-relevant errors such
+--     as a typo'd @visibility@ value or a malformed entry.
+readManifest :: IO [ManifestEntry]
+readManifest = do
+    present <- doesFileExist manifestPath
+    if present
+        then Y.decodeFileEither manifestPath >>= either abort return
+        else return []
+  where
+    -- Report the parse failure on stderr, then stop the build.
+    abort err = do
+        hPutStrLn stderr $
+            "[archive] FATAL: manifest.yaml: " ++ show err
+        exitFailure
+
+-- | Normalised URLs recorded in @archive/removed.yaml@; empty when the file
+--   is absent. A parse error on a present file halts the build.
+readRemovedUrls :: IO (Set.Set T.Text)
+readRemovedUrls = do
+    present <- doesFileExist removedPath
+    if present
+        then Y.decodeFileEither removedPath >>= either abort (return . toSet)
+        else return Set.empty
+  where
+    toSet :: [RemovedEntry] -> Set.Set T.Text
+    toSet = Set.fromList . map (normalizeUrl . T.pack . reUrl)
+    abort err = do
+        hPutStrLn stderr ("[archive] FATAL: removed.yaml: " ++ show err)
+        exitFailure
+
+-- | Halt the build when a manifest URL is also recorded in removed.yaml,
+--   or when two manifest URLs normalise to the same archive target.
+validateManifestEntries :: [ManifestEntry] -> Set.Set T.Text -> IO ()
+validateManifestEntries manifest removed = walk Map.empty (map meUrl manifest)
+  where
+    -- @seen@ maps each normalised URL to the first raw URL that produced
+    -- it, so a collision can name both offenders.
+    walk _ [] = return ()
+    walk seen (url : urls)
+        | key `Set.member` removed = abort $
+            "[archive] FATAL: manifest URL " ++ show url
+            ++ " is also recorded in removed.yaml; refusing to publish "
+            ++ "a deliberately removed work."
+        | Just prior <- Map.lookup key seen = abort $
+            "[archive] FATAL: manifest URLs " ++ show prior ++ " and "
+            ++ show url ++ " normalise to the same archive target."
+        | otherwise = walk (Map.insert key url seen) urls
+      where
+        key = normalizeUrl (T.pack url)
+    abort msg = hPutStrLn stderr msg >> exitFailure
+
+-- | Scan @archive/<slug>/PROVENANCE.json@ into a @url -> (slug, Provenance)@
+--   map. The directory name is the slug; the join key is the URL. The
+--   recorded @slug@ must equal the directory name: artifacts are hashed
+--   under the directory but routed by the recorded slug, so a mismatch
+--   could publish a file other than the one verified — the build halts.
+readProvenances :: IO (Map.Map String (String, Provenance))
+readProvenances = do
+    exists <- doesDirectoryExist "archive"
+    names  <- if exists then listDirectory "archive" else return []
+    fmap (Map.fromList . catMaybes) $ forM names $ \name -> do
+        let provPath = "archive/" ++ name ++ "/PROVENANCE.json"
+        isFile  <- doesFileExist provPath
+        decoded <- if isFile then Just <$> A.eitherDecodeFileStrict' provPath
+                             else return Nothing
+        case decoded of
+            Nothing        -> return Nothing   -- no provenance: not an entry
+            Just (Left e)  -> fatal provPath (show e)
+            Just (Right p)
+                | pvSlug p == name -> return (Just (pvUrl p, (name, p)))
+                | otherwise        -> fatal provPath $ "slug " ++ show (pvSlug p)
+                                          ++ " does not match its directory name"
+  where
+    fatal p m = hPutStrLn stderr ("[archive] FATAL: " ++ p ++ ": " ++ m) >> exitFailure
+
+-- | Contents of a file, or "" when reading fails for any reason (for
+--   example, the text sidecar is absent).
+readFileSafe :: FilePath -> IO String
+readFileSafe path = readFile' path `catch` \(_ :: SomeException) -> return ""
+
+-- | Check a committed artifact against the SHA-256 recorded for it.
+--   On mismatch the build halts with a clear message, so the integrity
+--   guarantee holds even when @tools/archive.py@ has not run first
+--   (e.g. no @.venv@, or a direct @cabal run site -- build@), and a
+--   tampered or corrupted artifact can never be deployed.
+--
+--   The digest comes from @sha256sum@ (GNU coreutils — the toolchain the
+--   rest of the build assumes); if the program is missing or exits
+--   non-zero, the resulting exception halts the build as well.
+verifyArtifactSha :: String -> FilePath -> String -> IO ()
+verifyArtifactSha slug path expected = do
+    -- Keep the output up to the first space: that prefix is the hex digest.
+    actual <- takeWhile (/= ' ') <$> readProcess "sha256sum" [path] ""
+    when (actual /= expected) $ do
+        hPutStrLn stderr $ concat
+            [ "[archive] FATAL: ", slug, ": ", path
+            , " SHA-256 mismatch (recorded ", expected
+            , ", found ", actual
+            , "). The committed artifact is corrupt or was replaced; "
+            , "halting build." ]
+        exitFailure
+
+-- | Join the authored manifest with generated provenance. A manifest
+--   entry with no matching provenance is dropped, so it produces no page;
+--   provenance whose artifact is missing or fails its hash halts the build.
+loadArchiveEntries :: IO [ArchiveEntry]
+loadArchiveEntries = do
+    manifest  <- readManifest
+    removed   <- readRemovedUrls
+    validateManifestEntries manifest removed
+    provByUrl <- readProvenances
+    fmap catMaybes $ forM manifest $ \me ->
+        case Map.lookup (meUrl me) provByUrl of   -- exact, un-normalised URL match
+            Nothing         -> return Nothing     -- not fetched yet: no page
+            Just (slug, pv) -> do
+                let dir     = "archive/" ++ slug
+                    txtPath = dir ++ "/" ++ textFileFor pv
+                let artPath = dir ++ "/" ++ pvArtifact pv
+                artifactThere <- doesFileExist artPath
+                if not artifactThere
+                    then do
+                        hPutStrLn stderr $
+                            "[archive] FATAL: " ++ slug ++ ": " ++ artPath
+                            ++ " is missing although PROVENANCE.json exists; "
+                            ++ "restore the committed artifact before building."
+                        exitFailure
+                    else do
+                        verifyArtifactSha slug artPath (pvSha256 pv)
+                        txtThere <- doesFileExist txtPath   -- sidecar is optional
+                        txt <- if txtThere then readFileSafe txtPath
+                                           else return ""
+                        return $ Just ArchiveEntry
+                            { aeManifest = me
+                            , aeProv     = pv
+                            , aeFulltext = txt
+                            , aeTextId   = if txtThere then Just txtPath
+                                                       else Nothing
+                            , aeStatus   = archiveStatusForSlug slug
+                            }
+
+-- ---------------------------------------------------------------------------
+-- Rules
+-- ---------------------------------------------------------------------------
+
+-- | All archive rules. Called once from 'Site.rules'.
+archiveRules :: Rules ()
+archiveRules = do
+    entries <- preprocess loadArchiveEntries
+
+    -- Raw artifacts: the PDF / HTML snapshot of each *public* entry, copied
+    -- verbatim to its own path (/archive/<slug>/...). The pattern is an
+    -- explicit identifier list, not a glob, so neither a
+    -- `visibility: private` entry's artifact nor an orphan directory's
+    -- artifact (no manifest line) is ever deployed.
+    match (fromList (map artifactId (filter (not . isPrivate) entries))) $ do
+        route   idRoute
+        compile copyFileCompiler
+
+    -- Dependency-only resources — provenance, extracted text, and the
+    -- manifest are compiled but given no route, so generated pages can
+    -- `load` them and recompile when they change.
+    mapM_ (\pat -> match pat (compile getResourceBody))
+        [ "archive/*/PROVENANCE.json"
+        , "archive/*/document.txt"
+        , "archive/*/snapshot.txt"
+        , "archive/manifest.yaml" ]
+
+    mapM_ archiveEntryRule entries
+    archiveIndexRule entries
+  where
+    artifactId e = fromFilePath
+        ("archive/" ++ pvSlug (aeProv e) ++ "/" ++ pvArtifact (aeProv e))
+
+-- | One @/archive/<slug>/@ page.
+archiveEntryRule :: ArchiveEntry -> Rules ()
+archiveEntryRule ae = create [pageId] $ do
+    route idRoute
+    compile $ do
+        -- Dependency edges: the page recompiles when its provenance or
+        -- the manifest changes.
+        mapM_ dependOn [provId, fromFilePath manifestPath]
+        -- The extracted-text sidecar is gitignored and may be absent
+        -- (no .venv / fetch never ran), so it becomes a dependency only
+        -- when present — the build never fails for a missing generated
+        -- file. 'aeTextId' is a 'Maybe', so this runs zero or one time.
+        mapM_ (dependOn . fromFilePath) (aeTextId ae)
+        makeItem ""
+            >>= loadAndApplyTemplate "templates/archive.html"  ctx
+            >>= loadAndApplyTemplate "templates/default.html"  ctx
+            >>= relativizeUrls
+  where
+    slug   = pvSlug (aeProv ae)
+    pageId = fromFilePath ("archive/" ++ slug ++ "/index.html")
+    provId = fromFilePath ("archive/" ++ slug ++ "/PROVENANCE.json")
+    ctx    = archiveEntryCtx ae
+
+    -- Load an item purely to register it as a dependency; the loaded
+    -- body is discarded by the callers above.
+    dependOn :: Identifier -> Compiler (Item String)
+    dependOn = load
+
+-- | The @/archive/@ index — every archived work, newest snapshot first.
+archiveIndexRule :: [ArchiveEntry] -> Rules ()
+archiveIndexRule entries = create ["archive/index.html"] $ do
+    route idRoute
+    compile $ do
+        -- Dependency edges: recompile when any provenance appears or
+        -- changes, or when the manifest changes.
+        _ <- loadAll "archive/*/PROVENANCE.json" :: Compiler [Item String]
+        _ <- load (fromFilePath manifestPath)    :: Compiler (Item String)
+        makeItem ""
+            >>= loadAndApplyTemplate "templates/archive-index.html" indexCtx
+            >>= loadAndApplyTemplate "templates/default.html"       indexCtx
+            >>= relativizeUrls
+  where
+    -- Each entry is wrapped in an 'Item' so 'listField' can iterate it.
+    newestFirst = sortBy (comparing (Down . pvArchived . aeProv)) entries
+    toItem e    = Item (fromFilePath ("archive/" ++ pvSlug (aeProv e))) e
+    hasEntries  = if null entries then mempty else constField "has-entries" "true"
+    indexCtx    = listField "entries" entryListCtx (return (map toItem newestFirst))
+               <> constField "title"   "Archive"
+               <> constField "archive" "true"
+               <> constField "noindex" "true"
+               <> hasEntries
+               <> siteCtx
+
+-- ---------------------------------------------------------------------------
+-- Contexts
+-- ---------------------------------------------------------------------------
+
+-- | Per-entry template context for the @/archive/<slug>/@ page.
+archiveEntryCtx :: ArchiveEntry -> Context String
+archiveEntryCtx ae = mconcat
+    [ constField "title"            (pvTitle pv)
+    , constField "archive"          "true"
+    , constField "noindex"          "true"
+    , constField "original-url"     (meUrl me)
+    , constField "archived"         (pvArchived pv)
+    , constField "archive-type"     (pvType pv)
+    , constField "sha-short"        (take 12 (pvSha256 pv))   -- first 12 characters
+    , constField "size"             (formatBytes (pvBytes pv))
+    , constField "snapshot-quality" (pvQuality pv)
+    , constField "status"           (statusName (aeStatus ae))
+    , qualityFlag
+    , maybeField "status-note" (statusNote (aeStatus ae))
+    , maybeField "note"      (meNote me)
+    , maybeField "wayback"   (pvWayback pv)
+    , maybeField "paywalled" (if mePaywalled me then Just "true" else Nothing)
+    , visibilityFields
+    -- "Referenced by" (the pages that cite this work) and "Related"
+    -- (semantically near content). Both resolve by this page's route, so
+    -- they need no archive-specific wiring; each is a $if(...)$-guarded
+    -- section in archive.html.
+    , referencedByField
+    , similarLinksField
+    , siteCtx
+    ]
+  where
+    me     = aeManifest ae
+    pv     = aeProv ae
+    slug   = pvSlug pv
+    artUrl = "/archive/" ++ slug ++ "/" ++ pvArtifact pv
+    -- A non-'ok' snapshot raises a visible flag on the page.
+    qualityFlag
+        | pvQuality pv == "ok" = mempty
+        | otherwise            = constField "degraded" "true"
+    -- A private entry keeps a local preservation copy but publishes none
+    -- of it: no embed, no extracted text — only the provenance metadata
+    -- and a 'held offline' note. A public entry embeds the artifact raw
+    -- (the browser renders the PDF natively, the snapshot loads directly;
+    -- no PDF.js wrapper) and renders its extracted text into the page.
+    -- The is-pdf / is-html flag drives only the iframe sandbox: a
+    -- third-party HTML snapshot is sandboxed, our own committed PDF is not.
+    visibilityFields
+        | isPrivate ae = constField "private" "true"
+        | otherwise    = typeField
+                      <> constField "artifact-url"  artUrl
+                      <> constField "artifact-name" (pvArtifact pv)
+                      <> fulltextField (pvType pv) (aeFulltext ae)
+    typeField   -- exactly one of is-html / is-pdf; anything not "html" is pdf
+        | pvType pv == "html" = constField "is-html" "true"
+        | otherwise           = constField "is-pdf"  "true"
+
+-- | Puts the extracted full text into the page DOM so embed.py and
+--   Pagefind index real text rather than an opaque iframe. HTML text is
+--   block-separated prose and becomes escaped @<p>@ paragraphs; any other
+--   type (PDF) keeps its pdftotext layout inside a @<pre>@. No field is
+--   provided for empty / whitespace-only text, so @$if(fulltext)$@ is false.
+fulltextField :: String -> String -> Context String
+fulltextField ftype txt
+    | all (`elem` (" \n\t\r" :: String)) txt = mempty
+    | otherwise = constField "fulltext" rendered
+  where
+    rendered = case ftype of
+        "html" -> htmlParagraphs txt
+        _      -> "<pre class=\"archive-fulltext\">"
+                  ++ escapeHtml txt ++ "</pre>"
+
+-- | Turn block-separated text — paragraphs delimited by blank lines, as
+--   @archive.py@'s HTML extractor writes it — into escaped @<p>@
+--   elements. Whitespace inside a paragraph, line breaks included, is
+--   collapsed to single spaces.
+htmlParagraphs :: String -> String
+htmlParagraphs txt = concat
+    [ "<p>" ++ escapeHtml (unwords (concatMap words block)) ++ "</p>\n"
+    | block <- groupBy ((==) `on` isBlankLine) (lines txt)
+    , not (all isBlankLine block) ]
+  where
+    -- A line counts as blank when it holds only spaces, tabs or CRs.
+    isBlankLine = all (`elem` (" \t\r" :: String))
+
+-- | Context for one row of the @/archive/@ index list.
+entryListCtx :: Context ArchiveEntry
+entryListCtx = mconcat
+    [ text "entry-title"    (pvTitle    . aeProv)
+    , text "entry-archived" (pvArchived . aeProv)
+    , text "entry-type"     (pvType     . aeProv)
+    , text "entry-quality"  (pvQuality  . aeProv)
+    , boolField "entry-degraded" ((/= "ok") . pvQuality . aeProv . itemBody)
+    , boolField "entry-private"  (isPrivate . itemBody)
+    , text "entry-status"   (statusName . aeStatus)
+    , boolField "entry-rotted"   ((== Rotted) . aeStatus . itemBody)
+    , text "entry-url"      (\e -> "/archive/" ++ pvSlug (aeProv e) ++ "/") ]
+  where
+    text key get = field key (return . get . itemBody)
+
+-- | A 'constField' for a present value; 'mempty' for 'Nothing', so the
+--   template's @$if(...)$@ guard on that key is false.
+maybeField :: String -> Maybe String -> Context String
+maybeField k = foldMap (constField k)
+
+-- | Reader-facing note for a 'Rotted' or 'Moved' original; else 'Nothing'.
+statusNote :: ArchiveStatus -> Maybe String
+statusNote status = case status of
+    Rotted -> Just "The original is no longer reachable. This archived \
+                   \copy is now the live link."
+    Moved  -> Just "The original page has moved since this snapshot was \
+                   \taken; the link above may redirect."
+    _      -> Nothing
+
+-- ---------------------------------------------------------------------------
+-- Formatting
+-- ---------------------------------------------------------------------------
+
+-- | Byte count as B / KB / MB, truncated to one decimal (cf. build/Stats.hs).
+formatBytes :: Integer -> String
+formatBytes b
+    | b < 1024        = show b ++ " B"
+    | b < 1024 * 1024 = tenths 1024 ++ " KB"
+    | otherwise       = tenths (1024 * 1024) ++ " MB"
+  where
+    tenths unit = let (w, f) = (b * 10 `div` unit) `divMod` 10 in show w ++ "." ++ show f
+
+-- ---------------------------------------------------------------------------
+-- /build/ telemetry
+-- ---------------------------------------------------------------------------
+
+-- | Archive metrics for the @/build/@ telemetry page — count, total size,
+--   median artifact age, breakdowns by link-rot status / snapshot quality
+--   / visibility, the paywalled count, and any orphan directories.
+--   Rendered by @Stats.hs@; an empty archive yields just the count.
+archiveBuildStats :: IO [(String, String)]
+archiveBuildStats = do
+    entries <- loadArchiveEntries
+    today   <- utctDay <$> getCurrentTime
+    orphans <- findOrphanDirs entries
+    -- Entries whose 'pvArchived' does not parse contribute no age.
+    let provs   = map aeProv entries
+        ages    = [ fromInteger (diffDays today day)
+                  | Just day <- map (parseIsoDay . pvArchived) provs ]
+        paid    = length (filter (mePaywalled . aeManifest) entries)
+        details = [ ("Total size",    formatBytes (sum (map pvBytes provs)))
+                  , ("Median age",    medianAge ages)
+                  , ("By status",     tallyOf (map (statusName . aeStatus) entries))
+                  , ("By quality",    tallyOf (map pvQuality provs))
+                  , ("By visibility", tallyOf (map (meVisibility . aeManifest) entries)) ]
+    -- The per-entry breakdown rows are emitted only for a non-empty archive;
+    -- the paywalled and orphan rows only when they have something to report.
+    return $ [ ("Entries", show (length entries)) ]
+        ++ (if null entries then [] else details)
+        ++ [ ("Paywalled", show paid) | paid > 0 ]
+        ++ [ ("Orphan directories", unwords orphans) | not (null orphans) ]
+
+
+-- | Directory names under @archive/@ that hold a @PROVENANCE.json@ but are
+--   not a live manifest entry — drift the @/build/@ page should surface.
+--   The result is sorted; a missing @archive/@ directory yields @[]@.
+findOrphanDirs :: [ArchiveEntry] -> IO [String]
+findOrphanDirs entries = do
+    exists <- doesDirectoryExist "archive"
+    names  <- if exists then listDirectory "archive" else return []
+    filterM isOrphan (sort names)
+  where
+    -- Slugs of the entries that did load, i.e. the live manifest lines.
+    liveSlugs = Set.fromList (map (pvSlug . aeProv) entries)
+    -- An orphan has a provenance file yet is not among the live slugs;
+    -- directory entries without a PROVENANCE.json are ignored entirely.
+    isOrphan name = do
+        hasProv <- doesFileExist ("archive/" ++ name ++ "/PROVENANCE.json")
+        return (hasProv && name `Set.notMember` liveSlugs)
+
+-- | Count each distinct string and render @"a 2  \183  b 1"@, keys ascending.
+tallyOf :: [String] -> String
+tallyOf xs = intercalate "  \183  " (map render (Map.toList counts))
+  where counts = Map.fromListWith (+) (zip xs (repeat (1 :: Int)))
+        render (k, c) = k ++ " " ++ show c
+
+-- | Median age rendered as @"N days"@ (the upper middle element for an
+--   even count); an em dash when there are no ages.
+medianAge :: [Int] -> String
+medianAge []   = "\8212"
+medianAge ages = show mid ++ (if mid == 1 then " day" else " days")
+  where mid = sort ages !! (length ages `div` 2)
+
+-- | Parse a real @YYYY-MM-DD@ date; 'Nothing' if malformed or impossible (no clipping).
+parseIsoDay :: String -> Maybe Day
+parseIsoDay s = case mapM readMaybe (splitOnDash s) of
+    Just [y, m, d] -> fromGregorianValid y (fromInteger m) (fromInteger d)
+    _              -> Nothing
+  where
+    splitOnDash str = case break (== '-') str of
+        (a, '-' : rest) -> a : splitOnDash rest
+        (a, _)          -> [a]
diff --git a/build/ArchiveIndex.hs b/build/ArchiveIndex.hs
new file mode 100644
index 0000000..a797f05
--- /dev/null
+++ b/build/ArchiveIndex.hs
@@ -0,0 +1,255 @@
+{-# LANGUAGE GHC2021 #-}
+{-# LANGUAGE OverloadedStrings #-}
+-- | ArchiveIndex — shared read-only access to the archive's two JSON
+--   sidecars: @data/archive-index.json@ (the @url\/alias -> slug@ map
+--   written by @archive.py fetch@) and @data/archive-state.json@ (the
+--   per-URL link-rot status written by @archive.py check@).
+--
+--   Consumers:
+--
+--     * @Filters.Archive@ — appends the archive affordance to body links
+--       whose target is archived, and flips a @rotted@ link to the local
+--       copy.
+--     * @Backlinks@ — keeps archived external links through pass 1 and
+--       canonicalises them to their @/archive/<slug>/@ page in pass 2.
+--     * @Archive@ — surfaces each entry's rot status on its page, the
+--       @/archive/@ index, and the @/build/@ telemetry.
+--
+--   Both files are loaded once per build via @unsafePerformIO@ CAFs. An
+--   absent or malformed file degrades safely: an empty index makes the
+--   link consumers no-op; an absent state file makes every entry @Live@
+--   (the safe default — no link flip). @archive.py check@ is decoupled
+--   from @make build@; a build consumes whatever state file exists.
+module ArchiveIndex
+    ( ArchiveStatus (..)
+    , statusName
+    , archiveSlugFor
+    , archiveStatusForSlug
+    , archiveIndexIsEmpty
+    , normalizeUrl
+    ) where
+
+import           Data.Map.Strict        (Map)
+import qualified Data.Map.Strict        as Map
+import           Data.Maybe             (fromMaybe)
+import           Data.Set               (Set)
+import qualified Data.Set               as Set
+import           Data.Text              (Text)
+import qualified Data.Text              as T
+import qualified Data.Aeson             as A
+import           Data.Aeson             ((.!=), (.:), (.:?))
+import qualified Data.Yaml              as Y
+import           System.Directory       (doesFileExist)
+import           System.IO.Unsafe       (unsafePerformIO)
+
+-- ---------------------------------------------------------------------------
+-- Link-rot status
+-- ---------------------------------------------------------------------------
+
+-- | Link-rot status of an archived work's original URL, as recorded by
+--   @archive.py check@. 'Live' doubles as the fallback: 'parseStatus' and
+--   'archiveStatusForSlug' return it for any unrecognised or unknown entry.
+data ArchiveStatus = Live | Moved | Rotted | Error
+    deriving (Eq, Show)
+
+-- | Lower-case wire name (@archive-state.json@, @status:@ Pagefind tag).
+statusName :: ArchiveStatus -> String
+statusName status = case status of
+    Live   -> "live"
+    Moved  -> "moved"
+    Rotted -> "rotted"
+    Error  -> "error"
+
+-- | Read a wire name back into a status (inverse of 'statusName'). Any
+--   name outside the known table, @"live"@ included, yields 'Live'.
+parseStatus :: Text -> ArchiveStatus
+parseStatus name = fromMaybe Live (lookup name known)
+  where known = [("moved", Moved), ("rotted", Rotted), ("error", Error)]
+
+-- ---------------------------------------------------------------------------
+-- JSON shapes
+-- ---------------------------------------------------------------------------
+
+-- | One @archive-index.json@ value; fields other than these are ignored.
+data IdxEntry = IdxEntry
+    { ieSlug    :: String    -- required @slug@
+    , ieAliases :: [Text] }  -- optional @aliases@, defaulting to @[]@
+
+instance A.FromJSON IdxEntry where
+    parseJSON = A.withObject "IdxEntry" $ \obj -> do
+        slug    <- obj .:  "slug"
+        aliases <- obj .:? "aliases" .!= []
+        return (IdxEntry slug aliases)
+
+-- | One @archive-state.json@ value; a missing @status@ reads as @"live"@.
+newtype StateEntry = StateEntry { seStatus :: ArchiveStatus }
+
+instance A.FromJSON StateEntry where
+    parseJSON = A.withObject "StateEntry" $ \obj ->
+        fmap (StateEntry . parseStatus) (obj .:? "status" .!= "live")
+
+-- | One manifest/removal list item, reduced to its required @url@ field.
+newtype UrlEntry = UrlEntry { ueUrl :: Text }
+
+instance A.FromJSON UrlEntry where
+    parseJSON = A.withObject "UrlEntry" (fmap UrlEntry . (.: "url"))
+
+-- ---------------------------------------------------------------------------
+-- Loaded-once CAFs
+-- ---------------------------------------------------------------------------
+
+indexPath, statePath, manifestPath, removedPath :: FilePath
+indexPath = "data/archive-index.json"    -- JSON, read by 'rawIndex'
+statePath = "data/archive-state.json"    -- JSON, read by 'rawState'
+manifestPath = "archive/manifest.yaml"   -- YAML, read by 'activeUrls'
+removedPath = "archive/removed.yaml"     -- YAML, subtracted in 'activeUrls'
+
+-- | The normalised @url@ of every item in a YAML list file. A missing file
+--   is the empty set; an unparseable one aborts with a FATAL 'userError'.
+readUrlSet :: FilePath -> IO (Set Text)
+readUrlSet path = do
+    present <- doesFileExist path
+    if not present then return Set.empty else
+        Y.decodeFileEither path >>= either bail (return . collect)
+  where
+    collect :: [UrlEntry] -> Set Text
+    collect = Set.fromList . map (normalizeUrl . ueUrl)
+    bail err = ioError . userError $
+        "[archive] FATAL: " ++ path ++ ": " ++ show err
+
+-- | Normalised manifest URLs minus the normalised removal list: the only
+--   URLs 'rawIndex' keeps. Computed from the authored YAML at build time,
+--   so a direct Hakyll build honours it even when archive.py did not run.
+{-# NOINLINE activeUrls #-}
+activeUrls :: Set Text
+activeUrls = unsafePerformIO $
+    -- The manifest is read first, then the removal list; either read may
+    -- abort the build (see 'readUrlSet').
+    Set.difference <$> readUrlSet manifestPath <*> readUrlSet removedPath
+
+-- | Decode a JSON file; 'Left' if absent (aeson would throw) or malformed.
+decodeIfPresent :: A.FromJSON a => FilePath -> IO (Either String a)
+decodeIfPresent path = doesFileExist path >>= \present ->
+    if present then A.eitherDecodeFileStrict' path else return (Left "absent")
+
+-- | @canonical-url -> entry@ for 'activeUrls' only; absent/malformed -> empty.
+{-# NOINLINE rawIndex #-}
+rawIndex :: Map Text IdxEntry
+rawIndex = unsafePerformIO $
+    Map.filterWithKey (\canon _ -> normalizeUrl canon `Set.member` activeUrls)
+        . either (const Map.empty) id <$> decodeIfPresent indexPath
+
+-- | @url -> status@. Absent/malformed file -> empty (every entry 'Live').
+{-# NOINLINE rawState #-}
+rawState :: Map Text ArchiveStatus
+rawState = unsafePerformIO $
+    either (const Map.empty) (Map.map seStatus) <$> decodeIfPresent statePath
+
+-- | @normalised-url -> slug@. Every 'rawIndex' entry contributes its
+--   canonical key plus each alias, all passed through 'normalizeUrl'.
+--   'archiveSlugFor' normalises its argument the same way, so URL forms
+--   that normalise alike resolve to one slug. Should two keys collide
+--   after normalisation, 'Map.fromList' keeps the later pair.
+{-# NOINLINE flatIndex #-}
+flatIndex :: Map Text String
+flatIndex = Map.fromList (concatMap keysOf (Map.toList rawIndex))
+  where
+    -- One pair per URL form of the entry: canonical first, then aliases.
+    keysOf (canon, entry) =
+        [ (normalizeUrl url, ieSlug entry) | url <- canon : ieAliases entry ]
+
+-- | @slug -> status@. Each 'rawIndex' entry's canonical URL is looked up
+--   verbatim in 'rawState'; a URL with no state record counts as 'Live'.
+{-# NOINLINE slugStatus #-}
+slugStatus :: Map String ArchiveStatus
+slugStatus = Map.fromList (map statusOf (Map.toList rawIndex))
+  where
+    statusOf (canon, entry) =
+        (ieSlug entry, fromMaybe Live (Map.lookup canon rawState))
+
+-- ---------------------------------------------------------------------------
+-- Public lookups
+-- ---------------------------------------------------------------------------
+
+-- | True when 'rawIndex' holds no entries — the link consumers then no-op.
+archiveIndexIsEmpty :: Bool
+archiveIndexIsEmpty = Map.null rawIndex
+
+-- | The archive slug for an outbound URL, or 'Nothing' when it is not
+--   indexed. The argument is normalised exactly as the 'flatIndex' keys
+--   were, so any two URLs with the same 'normalizeUrl' image resolve
+--   alike.
+archiveSlugFor :: Text -> Maybe String
+archiveSlugFor = (`Map.lookup` flatIndex) . normalizeUrl
+
+-- | The link-rot status recorded for a slug; 'Live' when 'slugStatus'
+--   holds no entry for it.
+archiveStatusForSlug :: String -> ArchiveStatus
+archiveStatusForSlug slug = fromMaybe Live (Map.lookup slug slugStatus)
+
+-- ---------------------------------------------------------------------------
+-- URL normalisation (matching, not display)
+-- ---------------------------------------------------------------------------
+
+-- | Tracking-only query parameters: their presence or absence is
+--   semantically irrelevant, so 'stripTracking' drops them before a URL
+--   is matched. Sync with @TRACKING_PARAMS@ in @tools/archive.py@.
+trackingParams :: [Text]
+trackingParams =
+    [ "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content"
+    , "fbclid", "gclid", "mc_eid", "mc_cid", "ref", "igshid"
+    , "_hsenc", "_hsmi", "mkt_tok"
+    ]
+
+-- | Drop each query parameter named in 'trackingParams', keeping the rest
+--   in order; when none survive the @?@ goes too. No @?@ -> unchanged.
+stripTracking :: Text -> Text
+stripTracking url
+    | T.null query   = url
+    | null survivors = base
+    | otherwise      = T.concat [base, "?", T.intercalate "&" survivors]
+  where
+    (base, query) = T.breakOn "?" url
+    survivors     = filter (not . tracked) (T.splitOn "&" (T.drop 1 query))
+    tracked param = T.takeWhile (/= '=') param `elem` trackingParams
+
+-- | The canonical form of an arXiv URL: @https://arxiv.org/abs/<id>@ with
+--   a trailing @.pdf@ and then a trailing @v<digits>@ removed. Applies to
+--   @abs/@ and @pdf/@ paths on either scheme; anything else passes through.
+arxivCanonical :: Text -> Text
+arxivCanonical url =
+    fromMaybe url (under "https://arxiv.org/" `orElse` under "http://arxiv.org/")
+  where
+    under root = T.stripPrefix root url >>= arxivKey
+    orElse (Just k) _        = Just k
+    orElse Nothing  fallback = fallback
+
+    -- @abs/<id>@ or @pdf/<id>@ -> the canonical key; 'Nothing' otherwise.
+    arxivKey rest
+        | kind `elem` ["abs", "pdf"], Just (_, ident) <- T.uncons slashId =
+            Just ("https://arxiv.org/abs/" <> stripVer (stripPdfSuf ident))
+        | otherwise = Nothing
+      where (kind, slashId) = T.breakOn "/" rest
+
+    stripPdfSuf t = fromMaybe t (T.stripSuffix ".pdf" t)
+
+    -- Drop a final @v<digits>@; a text with no such suffix is unchanged.
+    stripVer t
+        | T.null before || T.null ver || not (T.all isAsciiDigit ver) = t
+        | otherwise = T.dropEnd 1 before
+      where (before, ver) = T.breakOnEnd "v" t
+    isAsciiDigit c = c >= '0' && c <= '9'
+
+-- | Normalise a URL for matching (never for display). In order: cut at the
+--   first @#@, strip tracking parameters, rewrite a leading @http://@ to
+--   @https://@, arXiv-canonicalise, then trim trailing slashes. The
+--   'flatIndex' keys and the 'archiveSlugFor' inputs both pass through
+--   here, so the two sides always compare in the same form.
+normalizeUrl :: Text -> Text
+normalizeUrl =
+    T.dropWhileEnd (== '/')
+        . arxivCanonical
+        . forceHttps
+        . stripTracking
+        . T.takeWhile (/= '#')
+  where forceHttps u = maybe u ("https://" <>) (T.stripPrefix "http://" u)
diff --git a/build/Backlinks.hs b/build/Backlinks.hs
index f7e9821..205355d 100644
--- a/build/Backlinks.hs
+++ b/build/Backlinks.hs
@@ -25,9 +25,11 @@
 module Backlinks
     ( backlinkRules
     , backlinksField
+    , referencedByField
     ) where
 
-import           Data.List                  (nubBy, sortBy)
+import           Data.List                  (nubBy, partition, sortBy,
+                                             stripPrefix)
 import           Data.Ord                   (comparing)
 import           Data.Maybe                 (fromMaybe)
 import qualified Data.Map.Strict            as Map
@@ -50,6 +52,7 @@ import           Hakyll
 import           Compilers                  (readerOpts, writerOpts)
 import           Filters                    (preprocessSource)
 import qualified Patterns                   as P
+import           ArchiveIndex               (archiveSlugFor)
 
 -- ---------------------------------------------------------------------------
 -- Link-with-context entry (intermediate, saved by the "links" pass)
@@ -85,6 +88,7 @@ data BacklinkSource = BacklinkSource
     , blAbstract  :: String
     , blSentence  :: String   -- raw HTML of the sentence containing the link
     , blParagraph :: String   -- raw HTML of the full paragraph (hover popup)
+    , blFragment  :: String   -- archived-target fragment (no '#'), else ""
     } deriving (Show, Eq, Ord)
 
 instance Aeson.ToJSON BacklinkSource where
@@ -94,16 +98,18 @@ instance Aeson.ToJSON BacklinkSource where
         , "abstract"  .= blAbstract bl
         , "sentence"  .= blSentence bl
         , "paragraph" .= blParagraph bl
+        , "fragment"  .= blFragment bl
         ]
 
 instance Aeson.FromJSON BacklinkSource where
     parseJSON = Aeson.withObject "BacklinkSource" $ \o ->
         BacklinkSource
-            <$> o Aeson..: "url"
-            <*> o Aeson..: "title"
-            <*> o Aeson..: "abstract"
-            <*> o Aeson..: "sentence"
-            <*> o Aeson..: "paragraph"
+            <$> o Aeson..:  "url"
+            <*> o Aeson..:  "title"
+            <*> o Aeson..:  "abstract"
+            <*> o Aeson..:  "sentence"
+            <*> o Aeson..:  "paragraph"
+            <*> o Aeson..:? "fragment" Aeson..!= ""
 
 -- ---------------------------------------------------------------------------
 -- Writer options for context rendering
@@ -125,15 +131,22 @@ contextWriterOpts = writerOpts
 -- | URL filter: skip external links, pseudo-schemes, anchor-only fragments,
 -- and static-asset paths.
 isPageLink :: T.Text -> Bool
-isPageLink u =
-    not (T.isPrefixOf "http://"  u) &&
-    not (T.isPrefixOf "https://" u) &&
-    not (T.isPrefixOf "#"        u) &&
-    not (T.isPrefixOf "mailto:"  u) &&
-    not (T.isPrefixOf "tel:"     u) &&
-    not (T.null u) &&
-    not (hasStaticExt u)
+isPageLink u
+    -- An archived external URL is kept regardless of scheme or extension:
+    -- pass 2 inverts it to its /archive/<slug>/ page.
+    | isArchived = True
+    | otherwise  =
+        not (T.isPrefixOf "http://"  u) &&
+        not (T.isPrefixOf "https://" u) &&
+        not (T.isPrefixOf "#"        u) &&
+        not (T.isPrefixOf "mailto:"  u) &&
+        not (T.isPrefixOf "tel:"     u) &&
+        not (T.null u) &&
+        not (hasStaticExt u)
   where
+    isArchived = case archiveSlugFor u of
+                     Just _  -> True
+                     Nothing -> False
     staticExts = [".pdf",".svg",".png",".jpg",".jpeg",".webp",
                   ".mp3",".mp4",".woff2",".woff",".ttf",".ico",
                   ".json",".asc",".xml",".gz",".zip"]
@@ -289,6 +302,28 @@ percentDecode = T.unpack . TE.decodeUtf8With lenientDecode . pack . go
     pack = BS.pack
     lenientDecode = TE.lenientDecode
 
+-- ---------------------------------------------------------------------------
+-- Archive-aware target keying
+-- ---------------------------------------------------------------------------
+
+-- | The @data/backlinks.json@ key an outbound URL inverts to. An archived
+-- URL is replaced by the route of its archive page,
+-- @/archive/<slug>/index.html@; any other URL is used as given. Either
+-- way the string then goes through 'normaliseUrl' — intended to match the
+-- key 'backlinksFieldWith' derives for the archive page itself.
+targetKey :: T.Text -> T.Text
+targetKey u = T.pack (normaliseUrl route)
+  where route = maybe (T.unpack u) page (archiveSlugFor u)
+        page slug = "/archive/" ++ slug ++ "/index.html"
+
+-- | The fragment (without @#@) of an archived URL — everything after the
+-- first @#@ — for granular grouping of "Referenced by". Empty when the
+-- URL is not archived or carries no fragment.
+archiveFragment :: T.Text -> String
+archiveFragment u
+    | Nothing <- archiveSlugFor u = ""
+    | otherwise = T.unpack (T.drop 1 (snd (T.break (== '#') u)))
+
 -- ---------------------------------------------------------------------------
 -- Content patterns (must match the rules in Site.hs — sourced from
 -- Patterns.allContent so additions to the canonical list automatically
@@ -337,10 +372,11 @@ toSourcePairs item = do
                     :: Maybe [LinkEntry] of
                 Nothing      -> return []
                 Just entries ->
-                    return [ ( T.pack (normaliseUrl (T.unpack (leUrl e)))
+                    return [ ( targetKey (leUrl e)
                              , BacklinkSource srcUrl title abstract
                                               (leSentence  e)
                                               (leParagraph e)
+                                              (archiveFragment (leUrl e))
                              )
                            | e <- entries ]
 
@@ -352,7 +388,20 @@ toSourcePairs item = do
 -- to the current page, each with its paragraph context.
 -- Returns @noResult@ (so @$if(backlinks)$@ is false) when there are none.
 backlinksField :: Context String
-backlinksField = field "backlinks" $ \item -> do
+backlinksField = backlinksFieldWith renderBacklinks "backlinks"
+
+-- | "Referenced by" for archive pages. Same lookup as 'backlinksField',
+-- but the sources are grouped by the fragment each citation targets, so an
+-- archived work's page can show which section/page each citing essay points
+-- at (granular backlinks).
+referencedByField :: Context String
+referencedByField = backlinksFieldWith renderReferencedBy "referenced-by"
+
+-- | Shared machinery for 'backlinksField' and 'referencedByField': look the
+-- page up in @data/backlinks.json@ by its normalised route, then hand the
+-- sorted sources to the given renderer.
+backlinksFieldWith :: ([BacklinkSource] -> String) -> String -> Context String
+backlinksFieldWith renderSources name = field name $ \item -> do
     blItem <- load (fromFilePath "data/backlinks.json") :: Compiler (Item String)
     case Aeson.decodeStrict (TE.encodeUtf8 (T.pack (itemBody blItem)))
             :: Maybe (Map T.Text [BacklinkSource]) of
@@ -367,7 +416,7 @@ backlinksField = field "backlinks" $ \item -> do
                         sorted  = sortBy (comparing blTitle) sources
                     in  if null sorted
                         then fail "no backlinks"
-                        else return (renderBacklinks sorted)
+                        else return (renderSources sorted)
 
 -- ---------------------------------------------------------------------------
 -- HTML rendering
@@ -384,25 +433,59 @@ backlinksField = field "backlinks" $ \item -> do
 renderBacklinks :: [BacklinkSource] -> String
 renderBacklinks sources =
     "<ul class=\"backlinks-list\">\n"
-    ++ concatMap renderOne sources
+    ++ concatMap renderBacklinkItem sources
     ++ "</ul>"
-  where
-    renderOne bl =
-        "<li class=\"backlink-item\">"
-        ++ "<a class=\"backlink-source\" href=\""
-        ++ escapeHtml (blUrl bl) ++ "\">"
-        ++ escapeHtml (blTitle bl) ++ "</a>"
-        ++ ( if null (blSentence bl) then ""
-             else "<blockquote class=\"backlink-quote\">"
-                  ++ blSentence bl
-                  ++ paragraphAffordance bl
-                  ++ "</blockquote>" )
-        ++ "</li>\n"
 
-    paragraphAffordance bl
-        | null (blParagraph bl)                     = ""
-        | blParagraph bl == blSentence bl           = ""
-        | otherwise                                  =
+-- | "Referenced by", grouped by the fragment each citation targets.
+-- Sources with an empty fragment come first as one plain list; each
+-- distinct fragment then gets a subheading and its own list, in ascending
+-- fragment order with the incoming source order kept inside a group.
+-- With no fragments at all, only the plain list is emitted.
+renderReferencedBy :: [BacklinkSource] -> String
+renderReferencedBy sources = bulletList plain ++ concatMap section groups
+  where
+    (plain, anchored) = partition (null . blFragment) sources
+    groups = Map.toList . Map.fromListWith (flip (++)) $
+                 [ (blFragment src, [src]) | src <- anchored ]
+    bulletList []    = ""
+    bulletList items = concat
+        [ "<ul class=\"backlinks-list\">\n"
+        , concatMap renderBacklinkItem items, "</ul>\n" ]
+    section (frag, items) = concat
+        [ "<div class=\"referenced-by-group\">"
+        , "<h3 class=\"referenced-by-fragment\">"
+        , escapeHtml (fragmentLabel frag), "</h3>"
+        , bulletList items, "</div>\n" ]
+
+-- | Human label for a cited fragment. A fragment beginning @page=@ reads
+-- "Page " followed by the remainder; any other fragment is shown verbatim
+-- behind a section mark and a space.
+fragmentLabel :: String -> String
+fragmentLabel frag = maybe sectionMarked ("Page " ++) (stripPrefix "page=" frag)
+  where
+    sectionMarked = "\x00A7 " ++ frag
+
+-- | One backlink @<li>@: the source title as a link, the sentence of
+-- context as a blockquote, and a hover affordance revealing the full
+-- paragraph. 'blSentence' / 'blParagraph' are already HTML fragments from
+-- the Pandoc writer, so they are emitted unescaped.
+renderBacklinkItem :: BacklinkSource -> String
+renderBacklinkItem bl =
+    "<li class=\"backlink-item\">"
+    ++ "<a class=\"backlink-source\" href=\""
+    ++ escapeHtml (blUrl bl) ++ "\">"
+    ++ escapeHtml (blTitle bl) ++ "</a>"
+    ++ ( if null (blSentence bl) then ""
+         else "<blockquote class=\"backlink-quote\">"
+              ++ blSentence bl
+              ++ paragraphAffordance
+              ++ "</blockquote>" )
+    ++ "</li>\n"
+  where
+    paragraphAffordance
+        | null (blParagraph bl)            = ""
+        | blParagraph bl == blSentence bl  = ""
+        | otherwise                        =
             "<span class=\"backlink-full\">"
             ++ "<button type=\"button\" class=\"backlink-full-trigger\""
             ++ " aria-label=\"Show full paragraph\" tabindex=\"0\">\x00B6</button>"
diff --git a/build/Filters.hs b/build/Filters.hs
index b6fbd71..e439f0b 100644
--- a/build/Filters.hs
+++ b/build/Filters.hs
@@ -13,6 +13,7 @@ import qualified Filters.Typography as Typography
 import qualified Filters.Links      as Links
 import qualified Filters.SourceRefs as SourceRefs
 import qualified Filters.Smallcaps  as Smallcaps
+import qualified Filters.Archive    as Archive
 import qualified Filters.Dropcaps   as Dropcaps
 import qualified Filters.Math       as Math
 import qualified Filters.Wikilinks     as Wikilinks
@@ -40,6 +41,7 @@ applyAll srcDir doc = do
         . Sidenotes.apply
         . Typography.apply
         . Links.apply
+        . Archive.apply
         . Smallcaps.apply
         . Dropcaps.apply
         . Math.apply
diff --git a/build/Filters/Archive.hs b/build/Filters/Archive.hs
new file mode 100644
index 0000000..ba12597
--- /dev/null
+++ b/build/Filters/Archive.hs
@@ -0,0 +1,82 @@
+{-# LANGUAGE GHC2021 #-}
+{-# LANGUAGE OverloadedStrings #-}
+-- | Filters.Archive — annotate (and, for dead links, redirect) body links
+--   to archived works.
+--
+--   For every @Link@ whose URL matches an entry in @data/archive-index.json@
+--   (the equivalent-URL alias set included):
+--
+--     * a 'live', 'moved' or (inconclusive) 'error' target keeps its
+--       original link and gains a small superscript affordance pointing at
+--       the local @/archive/<slug>/@ page — purely additive;
+--
+--     * a 'rotted' target (confirmed dead by @archive.py check@'s
+--       hysteresis) has its primary link flipped to the archived copy, so
+--       a reader of an old essay reaches a working snapshot instead of a
+--       404. An "archived" marker replaces the affordance.
+--
+--   Registered in 'Filters.applyAll' immediately after @Smallcaps@ and
+--   before @Links@: it must see the smallcaps-rewritten text, and it emits
+--   the affordance/marker as @RawInline@ so the downstream @Links@ pass
+--   never re-classifies it.
+--
+--   No-op when @data/archive-index.json@ is absent. When no rot scan has
+--   run, every entry is 'Live' — no link is ever flipped.
+module Filters.Archive (apply) where
+
+import qualified Data.Text              as T
+import           Text.Pandoc.Definition
+import           Text.Pandoc.Walk       (walk)
+import           ArchiveIndex           (ArchiveStatus (..), archiveIndexIsEmpty,
+                                         archiveSlugFor, archiveStatusForSlug)
+
+-- | Annotate the links of every top-level block via 'annotateBlock' (which
+--   skips headings). The document is returned as-is when the index is empty.
+apply :: Pandoc -> Pandoc
+apply doc@(Pandoc meta blocks) =
+    if archiveIndexIsEmpty then doc
+    else Pandoc meta (annotateBlock <$> blocks)
+
+annotateBlock :: Block -> Block
+annotateBlock h@Header{} = h                       -- a heading block: untouched
+annotateBlock b          = walk annotateInlines b  -- else: every inline list inside
+
+-- | Expand each archived @Link@ in place: a 'Rotted' one becomes its flipped
+--   form plus the "rotted" marker; any other keeps the link and gains the
+--   plain marker. Unarchived links and all other inlines pass through.
+annotateInlines :: [Inline] -> [Inline]
+annotateInlines inlines = inlines >>= expand
+  where
+    expand link@(Link attr body (url, _))
+        | Just slug <- archiveSlugFor url = case archiveStatusForSlug slug of
+            Rotted -> [flipped slug attr body, marker slug "rotted" deadTitle]
+            _      -> [link, marker slug "" liveTitle]
+    expand other = [other]
+
+    -- Marker tooltips; raw HTML, hence the literal @&mdash;@ entity.
+    deadTitle = "The original is a dead link &mdash; opens the local archived copy"
+    liveTitle = "Archived &mdash; local preservation copy"
+
+-- | A 'Rotted' link retargeted at its @/archive/<slug>/@ page. Link text
+--   and attributes are kept; the class @archive-rotted@ is prepended.
+flipped :: String -> Attr -> [Inline] -> Inline
+flipped slug (ident, classes, kvs) text = Link attr' text (target, title)
+  where attr'  = (ident, "archive-rotted" : classes, kvs)
+        target = T.pack ("/archive/" ++ slug ++ "/")
+        title  = "Original link is dead \8212 opens the local archived copy"
+
+-- | The superscript link to @/archive/<slug>/@ placed after an archived
+--   link. An empty modifier gives the label "A"; a non-empty one gives
+--   "archived" plus the class @archive-affordance--<modifier>@. Emitted
+--   as raw HTML (so the @Links@ filter, which classifies @Link@ nodes,
+--   skips it): slug, modifier and title are inserted unescaped.
+marker :: String -> String -> T.Text -> Inline
+marker slug modifier title = RawInline "html" html
+  where
+    html = "<sup class=\"archive-affordance" <> extraClass <> "\">"
+        <> "<a href=\"/archive/" <> T.pack slug <> "/\" title=\"" <> title
+        <> "\">" <> label <> "</a></sup>"
+    (extraClass, label)
+        | null modifier = ("", "A")
+        | otherwise     =
+            (" archive-affordance--" <> T.pack modifier, "archived")
diff --git a/build/Main.hs b/build/Main.hs
index 9e4d304..555f1ab 100644
--- a/build/Main.hs
+++ b/build/Main.hs
@@ -1,7 +1,23 @@
 module Main where
 
-import Hakyll (hakyll)
-import Site (rules)
+import Data.Time.Clock.POSIX (getPOSIXTime)
+import System.Directory      (createDirectoryIfMissing)
+import Hakyll                (hakyll)
+import Site                  (rules)
+
+-- | Write the current POSIX time to @data/build-stamp.txt@, creating
+-- @data/@ first when missing. 'main' calls this ahead of 'hakyll', so the
+-- file exists — with fresh contents — by the time Hakyll scans its
+-- provider directory. The telemetry pages (@/build/@, @/stats/@) @load@
+-- it as a dependency and are thereby recompiled on every build rather
+-- than served from cache. See build/Stats.hs and build/Site.hs.
+writeBuildStamp :: IO ()
+writeBuildStamp =
+    createDirectoryIfMissing True "data"
+        >> getPOSIXTime
+        >>= writeFile "data/build-stamp.txt" . (++ "\n") . show
 
 main :: IO ()
-main = hakyll rules
+main = do
+    writeBuildStamp
+    hakyll rules
diff --git a/build/Site.hs b/build/Site.hs
index 68fc154..33e5217 100644
--- a/build/Site.hs
+++ b/build/Site.hs
@@ -19,6 +19,7 @@ import qualified Data.Aeson as Aeson
 import qualified Data.ByteString.Lazy.Char8 as LBS
 import qualified Data.Map.Strict as Map
 import Hakyll
+import Archive    (archiveRules)
 import Authors    (buildAllAuthors, applyAuthorRules)
 import Backlinks  (backlinkRules)
 import BibExtras  (BibExtra (..), emptyBibExtra, firstAuthorSurname, parseBibExtras)
@@ -265,6 +266,13 @@ rules = do
     -- /current.html. Re-compiles current.html when the YAML changes.
     match "data/now.yaml" $ compile getResourceBody
 
+    -- Per-build stamp — written by Main.main before Hakyll starts, so it
+    -- always exists and always differs from the previous run. Matched
+    -- (not routed) purely so the telemetry pages can `load` it as a
+    -- dependency and thus recompile every build instead of serving a
+    -- stale cached copy. See build/Stats.hs.
+    match "data/build-stamp.txt" $ compile getResourceBody
+
     -- ---------------------------------------------------------------------------
     -- Homepage
     -- ---------------------------------------------------------------------------
@@ -529,6 +537,13 @@ rules = do
     -- ---------------------------------------------------------------------------
     photographyRules
 
+    -- ---------------------------------------------------------------------------
+    -- Archive — link-archiving system: per-entry /archive/<slug>/ pages and
+    -- the /archive/ index, driven by archive/manifest.yaml + PROVENANCE.json.
+    -- See build/Archive.hs and ARCHIVE.md for the design.
+    -- ---------------------------------------------------------------------------
+    archiveRules
+
     -- ---------------------------------------------------------------------------
     -- Blog index (paginated)
     -- ---------------------------------------------------------------------------
@@ -926,6 +941,13 @@ rules = do
     create ["robots.txt"] $ do
         route idRoute
         compile $ makeItem $ unlines
+            -- /archive/ is *deliberately not* disallowed. Crawlers must be
+            -- able to reach the wrapper pages (and snapshot.html) to see
+            -- their <meta name=robots content="noindex, noarchive">; a
+            -- robots.txt Disallow would block that and a URL blocked only
+            -- by robots.txt can still appear in results when linked. The
+            -- raw PDFs cannot carry meta — they need an `X-Robots-Tag`
+            -- HTTP header from the deploy webserver (see nginx/archive.conf).
             [ "User-agent: *"
             , "Allow: /"
             , ""
diff --git a/build/Stats.hs b/build/Stats.hs
index df9ed28..10fc2fb 100644
--- a/build/Stats.hs
+++ b/build/Stats.hs
@@ -37,6 +37,7 @@ import qualified Text.Blaze.Html5.Attributes as A
 import           Text.Blaze.Html.Renderer.String (renderHtml)
 import qualified Text.Blaze.Internal         as BI
 import Hakyll
+import Archive                    (archiveBuildStats)
 import Contexts                   (siteCtx, authorLinksField)
 import qualified Patterns         as P
 import Utils                      (readingTime)
@@ -707,6 +708,14 @@ renderBuild ts dur =
         , ("Last build duration", txt dur)
         ]
 
+-- | The "Link archive" section of the @/build/@ page: each
+-- @(label, value)@ metric has its value rendered with 'txt' and the rows
+-- are handed to 'dl'. The rows themselves come from
+-- 'Archive.archiveBuildStats'; this only lays them out.
+renderArchive :: [(String, String)] -> H.Html
+renderArchive =
+    section "archive" "Link archive" . dl . map (\(key, val) -> (key, txt val))
+
 -- ---------------------------------------------------------------------------
 -- Static TOC (matches the nine h2 sections above)
 -- ---------------------------------------------------------------------------
@@ -726,6 +735,7 @@ pageTOC = H.ol $ mapM_ item sections
         , ("links",        "Links")
         , ("epistemic",    "Epistemic coverage")
         , ("output",       "Output")
+        , ("archive",      "Link archive")
         , ("repository",   "Repository")
         , ("build",        "Build")
         ]
@@ -743,6 +753,16 @@ statsRules tags = do
   create ["build/index.html"] $ do
         route idRoute
         compile $ do
+            -- ----------------------------------------------------------------
+            -- Per-build stamp dependency: data/build-stamp.txt is rewritten
+            -- by Main.main on every invocation, so loading it here forces
+            -- Hakyll to recompile this page each build. Without it the page
+            -- is served from cache whenever no tracked content changed, and
+            -- every unsafeCompiler-sourced figure below (timestamp, output
+            -- stats, git, LOC) goes stale. The value itself is unused.
+            -- ----------------------------------------------------------------
+            _ <- load (fromFilePath "data/build-stamp.txt") :: Compiler (Item String)
+
             -- ----------------------------------------------------------------
             -- Load all content items
             -- ----------------------------------------------------------------
@@ -846,6 +866,11 @@ statsRules tags = do
             (hf, hl, cf, cl, jf, jl) <- unsafeCompiler getLocStats
             (commits, firstDate)      <- unsafeCompiler getGitStats
 
+            -- ----------------------------------------------------------------
+            -- Link-archive coverage + link-rot health
+            -- ----------------------------------------------------------------
+            archiveMetrics <- unsafeCompiler archiveBuildStats
+
             -- ----------------------------------------------------------------
             -- Build timestamp + last build duration
             -- ----------------------------------------------------------------
@@ -869,6 +894,7 @@ statsRules tags = do
                     renderLinks mostLinkedInfo orphanCount (length allPIs)
                     renderEpistemic epTotal withStatus withConf withImp withEv
                     renderOutput outputGrouped totalFiles totalSize
+                    renderArchive archiveMetrics
                     renderRepository hf hl cf cl jf jl commits firstDate
                     renderBuild buildTimestamp lastBuildDur
                 contentString = renderHtml htmlContent
@@ -897,6 +923,11 @@ statsRules tags = do
   create ["stats/index.html"] $ do
         route idRoute
         compile $ do
+            -- Per-build stamp dependency — forces a recompile every build
+            -- so the heatmap's "today" and all corpus figures stay current.
+            -- See the /build/ rule above for the full rationale.
+            _ <- load (fromFilePath "data/build-stamp.txt") :: Compiler (Item String)
+
             essays  <- loadAll (P.essayPattern             .&&. hasNoVersion)
             posts   <- loadAll ("content/blog/*.md"        .&&. hasNoVersion)
             poems   <- loadAll ("content/poetry/*.md"      .&&. hasNoVersion)
diff --git a/levineuwirth.cabal b/levineuwirth.cabal
index cbba1f4..240db82 100644
--- a/levineuwirth.cabal
+++ b/levineuwirth.cabal
@@ -13,6 +13,8 @@ executable site
     hs-source-dirs:   build
     other-modules:
         Site
+        Archive
+        ArchiveIndex
         Authors
         Catalog
         Commonplace
@@ -36,6 +38,7 @@ executable site
         Filters.Sidenotes
         Filters.Dropcaps
         Filters.Smallcaps
+        Filters.Archive
         Filters.Wikilinks
         Filters.Transclusion
         Filters.EmbedPdf
diff --git a/nginx/archive.conf b/nginx/archive.conf
new file mode 100644
index 0000000..6b59bcc
--- /dev/null
+++ b/nginx/archive.conf
@@ -0,0 +1,45 @@
+# archive.conf — `X-Robots-Tag: noindex, noarchive` for the link archive.
+#
+# Place at /etc/nginx/snippets/archive.conf and `include` it inside the
+# levineuwirth.org server { } block, *after* security-headers.conf:
+#
+#   server {
+#       server_name levineuwirth.org;
+#       root /var/www/levineuwirth.org;
+#       ...
+#       include snippets/security-headers.conf;
+#       include snippets/static-assets.conf;
+#       include snippets/popup-proxy.conf;
+#       include snippets/archive.conf;
+#   }
+#
+# Why a location header rather than robots.txt: a URL blocked by
+# robots.txt can still appear in results when externally linked, and the
+# noindex directive must be reachable. Wrapper pages carry the meta in
+# HTML, and the HTML snapshots have the same meta injected at fetch
+# time. But raw PDFs cannot carry meta directives — and a robots.txt
+# Disallow on /archive/ would prevent crawlers from reading the wrapper
+# meta in the first place. The header form is the right control for the
+# whole tree: crawlers honour it for any resource, HTML or PDF.
+#
+# `^~` makes this prefix-match take priority over any regex location
+# that might match the same path.
+
+location ^~ /archive/ {
+    # nginx's add_header chain is inherited from a parent context ONLY
+    # when the current context declares no add_header directives — see
+    # nginx.org/en/docs/http/ngx_http_headers_module.html. Adding any
+    # header inside this location would silently drop the baseline
+    # security headers within the /archive/ subtree, so we re-include
+    # security-headers.conf to keep HSTS, CSP, X-Frame-Options, etc.
+    # intact for archive pages and raw artifacts.
+    include snippets/security-headers.conf;
+
+    # `always` so the header is emitted even on 4xx/5xx responses (the
+    # default add_header only sets on 2xx/3xx; without `always` a 404
+    # under /archive/ could be indexed).
+    add_header X-Robots-Tag "noindex, noarchive" always;
+
+    # Hand off to the same static-file fallback as the rest of the site.
+    try_files $uri $uri/index.html $uri.html =404;
+}
diff --git a/nginx/vhost.conf.example b/nginx/vhost.conf.example
index 7ef1a4a..475709c 100644
--- a/nginx/vhost.conf.example
+++ b/nginx/vhost.conf.example
@@ -42,6 +42,12 @@ server {
     include snippets/security-headers.conf;
     include snippets/static-assets.conf;
     include snippets/popup-proxy.conf;
+    # archive.conf is listed after security-headers.conf by convention;
+    # nginx does not depend on the order. It declares its own add_header
+    # inside `location ^~ /archive/`, which (per the nginx add_header
+    # inheritance rules) would otherwise drop the baseline headers within
+    # that subtree, so the snippet re-includes security-headers.conf there.
+    include snippets/archive.conf;
 
     # Static-site fallback. Pretty URLs first (foo/index.html, foo.html),
     # then 404.
diff --git a/static/css/archive.css b/static/css/archive.css
new file mode 100644
index 0000000..895f350
--- /dev/null
+++ b/static/css/archive.css
@@ -0,0 +1,463 @@
+/* archive.css — the link archive: /archive/ and /archive/<slug>/.
+ *
+ * Gated in head.html via $if(archive)$ (build/Archive.hs sets the flag on
+ * the index and every entry page). The archive pages are structured
+ * surfaces rather than prose, but they render inside #markdownBody — so
+ * every rule here is scoped under #markdownBody to clear the id-specificity
+ * prose rules in typography.css (heading scales, figure framing, paragraph
+ * indent) that would otherwise win over a bare class.
+ *
+ * Treatment: "framed / structured" — the archival chrome (banner,
+ * provenance panel, the embedded artifact viewer) is given visible borders
+ * so a reader is never in doubt that this is a preservation copy, not the
+ * original. All colour comes from tokens, so dark mode follows for free;
+ * the embedded artifact itself is shown raw and is deliberately not themed.
+ */
+
+/* Structured pages, not essays — no first-line indent on any paragraph. */
+#markdownBody :is(.archive-banner-text, .archive-degraded, .archive-note,
+                  .archive-private, .archive-status-note, .archive-index-intro,
+                  .archive-removal, .archive-empty),
+#markdownBody .archive-fulltext-wrap > p {
+    text-indent: 0;
+}
+
+/* ============================================================
+   ENTRY HEADER + ARCHIVAL BANNER
+   The banner is a bordered callout, stacked: a small-caps label,
+   one plain-language line, and the original link given real
+   weight — the original is the hero, never the archived copy.
+   ============================================================ */
+
+#markdownBody .archive-header {
+    margin-bottom: 0.5rem;
+}
+
+#markdownBody .archive-header .page-title {
+    margin-bottom: 0;
+}
+
+#markdownBody .archive-banner {
+    margin-top: 1.4rem;
+    padding: 0.9rem 1.1rem;
+    display: flex;
+    flex-direction: column;
+    gap: 0.3rem;
+    border: 1px solid var(--border-muted);
+    border-radius: 2px;
+    background: var(--bg-subtle);
+}
+
+#markdownBody .archive-banner-label {
+    margin: 0;
+    font-family: var(--font-sans);
+    font-size: 0.7rem;
+    font-variant: all-small-caps;
+    font-feature-settings: "smcp" 1;
+    letter-spacing: 0.13em;
+    color: var(--text-muted);
+}
+
+#markdownBody .archive-banner-text {
+    margin: 0;
+    font-family: var(--font-serif);
+    font-size: 0.95rem;
+    line-height: 1.5;
+    color: var(--text);
+}
+
+#markdownBody .archive-banner-original {
+    align-self: flex-start;
+    font-family: var(--font-sans);
+    font-size: 0.85rem;
+    font-weight: 600;
+}
+
+/* Degraded / js-required snapshots: a dashed-border note. Restrained —
+   the monochrome palette has no alarm colour and wants none. */
+#markdownBody .archive-degraded {
+    margin: 1rem 0 0;
+    padding: 0.7rem 1rem;
+    border: 1px dashed var(--border-muted);
+    border-radius: 2px;
+    font-family: var(--font-serif);
+    font-size: 0.9rem;
+    line-height: 1.55;
+    color: var(--text-muted);
+}
+
+#markdownBody .archive-degraded-label {
+    margin-right: 0.4rem;
+    font-family: var(--font-sans);
+    font-size: 0.7rem;
+    font-variant: all-small-caps;
+    font-feature-settings: "smcp" 1;
+    letter-spacing: 0.1em;
+    color: var(--text);
+}
+
+/* Private entry: the artifact is held offline, not published — a calm
+   informational panel in place of the artifact viewer. */
+#markdownBody .archive-private {
+    margin: 1.8rem 0;
+    padding: 1rem 1.2rem;
+    border: 1px solid var(--border);
+    border-radius: 2px;
+    background: var(--bg-subtle);
+    font-family: var(--font-serif);
+    font-size: 0.95rem;
+    line-height: 1.6;
+    color: var(--text-muted);
+}
+
+/* Link-rot status — a header note for non-live states (archive.py check),
+   and the status word in the provenance panel. The palette is monochrome,
+   so a `rotted` entry is marked by weight and a heavier left rule, never
+   colour. */
+#markdownBody .archive-status-note {
+    margin: 1rem 0 0;
+    padding: 0.7rem 1rem;
+    border: 1px solid var(--border-muted);
+    border-left-width: 3px;
+    border-radius: 2px;
+    font-family: var(--font-serif);
+    font-size: 0.92rem;
+    line-height: 1.55;
+    color: var(--text);
+}
+
+#markdownBody .archive-status-note--rotted {
+    border-left-color: var(--text);
+}
+
+#markdownBody .archive-status-note--moved {
+    color: var(--text-muted);
+}
+
+#markdownBody .archive-status {
+    font-variant: all-small-caps;
+    font-feature-settings: "smcp" 1;
+    letter-spacing: 0.04em;
+}
+
+#markdownBody .archive-status--live {
+    color: var(--text-muted);
+}
+
+#markdownBody .archive-status--rotted {
+    font-weight: 600;
+}
+
+/* ============================================================
+   PROVENANCE PANEL
+   A bordered box with a small-caps label; the metadata is a
+   two-column key/value grid — labels auto-sized, values take
+   the rest, long URLs and hashes wrap rather than overflow.
+   ============================================================ */
+
+#markdownBody .archive-provenance {
+    margin: 1.8rem 0;
+    padding: 1rem 1.2rem 1.1rem;
+    border: 1px solid var(--border);
+    border-radius: 2px;
+}
+
+#markdownBody .archive-panel-title {
+    margin: 0 0 0.7rem;
+    font-family: var(--font-sans);
+    font-size: 0.72rem;
+    font-weight: 600;
+    font-variant: all-small-caps;
+    font-feature-settings: "smcp" 1;
+    letter-spacing: 0.12em;
+    color: var(--text-faint);
+}
+
+#markdownBody .archive-meta {
+    margin: 0;
+    display: grid;
+    grid-template-columns: max-content 1fr;
+    gap: 0.34rem 1.1rem;
+}
+
+#markdownBody .archive-meta dt {
+    font-family: var(--font-sans);
+    font-size: 0.78rem;
+    font-variant: all-small-caps;
+    font-feature-settings: "smcp" 1;
+    letter-spacing: 0.05em;
+    color: var(--text-faint);
+}
+
+#markdownBody .archive-meta dd {
+    margin: 0;
+    font-family: var(--font-serif);
+    font-size: 0.92rem;
+    color: var(--text);
+    overflow-wrap: anywhere;
+}
+
+#markdownBody .archive-meta dd code {
+    font-family: var(--font-mono);
+    font-size: 0.82rem;
+}
+
+/* The author's reason-for-archiving note, set in the page measure. */
+#markdownBody .archive-note {
+    margin: 1.6rem 0;
+    font-family: var(--font-serif);
+    font-size: 0.97rem;
+    font-style: italic;
+    line-height: 1.6;
+    color: var(--text-muted);
+}
+
+/* ============================================================
+   ARTIFACT VIEWER
+   A <div> (not a <figure> — that carries prose framing) with a
+   mono caption bar that names the raw artifact and links to it,
+   and the artifact embedded raw beneath: the PDF renders in the
+   browser's native viewer, the HTML snapshot loads sandboxed.
+   ============================================================ */
+
+#markdownBody .archive-viewer {
+    margin: 1.8rem 0;
+    border: 1px solid var(--border-muted);
+    border-radius: 2px;
+    overflow: hidden;
+    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.03);
+}
+
+#markdownBody .archive-viewer-bar {
+    display: flex;
+    align-items: baseline;
+    justify-content: space-between;
+    gap: 1rem;
+    padding: 0.45rem 0.75rem;
+    border-bottom: 1px solid var(--border-muted);
+    background: var(--bg-subtle);
+}
+
+#markdownBody .archive-viewer-name {
+    font-family: var(--font-mono);
+    font-size: 0.78rem;
+    color: var(--text-muted);
+}
+
+#markdownBody .archive-viewer-open {
+    font-family: var(--font-sans);
+    font-size: 0.76rem;
+    white-space: nowrap;
+}
+
+#markdownBody .archive-frame {
+    display: block;
+    width: 100%;
+    height: 80vh;
+    border: 0;
+    background: var(--bg);
+}
+
+/* ============================================================
+   EXTRACTED FULL TEXT
+   Always in the DOM, for embed.py / Pagefind. PDF text is
+   collapsed in a <details> and keeps its pdftotext layout in a
+   scrollable mono block; HTML text shows as serif paragraphs.
+   ============================================================ */
+
+#markdownBody .archive-fulltext-wrap {
+    margin: 1.8rem 0 0;
+}
+
+#markdownBody .archive-fulltext-title,
+#markdownBody .archive-section-title {
+    margin: 0 0 0.6rem;
+    padding-bottom: 0.4rem;
+    border-bottom: 1px solid var(--border);
+    font-family: var(--font-sans);
+    font-size: 0.78rem;
+    font-weight: 600;
+    font-variant: all-small-caps;
+    font-feature-settings: "smcp" 1;
+    letter-spacing: 0.1em;
+    color: var(--text-muted);
+}
+
+#markdownBody summary.archive-fulltext-title {
+    cursor: pointer;
+}
+
+#markdownBody .archive-fulltext-wrap > p {
+    margin: 0 0 0.85rem;
+    font-family: var(--font-serif);
+    font-size: 0.95rem;
+    line-height: 1.6;
+    color: var(--text);
+}
+
+/* The pdftotext block: scroll-capped so it never dominates the page. */
+#markdownBody .archive-fulltext {
+    margin: 0.8rem 0 0;
+    padding: 0.9rem 1rem;
+    max-height: 60vh;
+    overflow: auto;
+    border: 1px solid var(--border);
+    border-radius: 2px;
+    background: var(--bg-subtle);
+    font-family: var(--font-mono);
+    font-size: 0.8rem;
+    line-height: 1.5;
+    color: var(--text-muted);
+    white-space: pre-wrap;
+    overflow-wrap: anywhere;
+}
+
+/* ============================================================
+   REFERENCED BY / RELATED
+   The site-wide .backlinks-list / .similar-links-list styles
+   (components.css) carry the lists themselves; these rules add
+   only the section framing and the granular fragment groups.
+   ============================================================ */
+
+#markdownBody .archive-backlinks,
+#markdownBody .archive-related {
+    margin: 1.8rem 0 0;
+}
+
+#markdownBody .referenced-by-group {
+    margin-top: 0.9rem;
+}
+
+#markdownBody .referenced-by-fragment {
+    margin: 0 0 0.3rem;
+    font-family: var(--font-sans);
+    font-size: 0.72rem;
+    font-weight: 600;
+    font-variant: all-small-caps;
+    font-feature-settings: "smcp" 1;
+    letter-spacing: 0.08em;
+    color: var(--text-faint);
+}
+
+/* ============================================================
+   REMOVAL NOTICE
+   A quiet italic footer line, set off by a top rule — present
+   on every archive page and on the index.
+   ============================================================ */
+
+#markdownBody .archive-removal {
+    margin: 2.4rem 0 0;
+    padding-top: 1rem;
+    border-top: 1px solid var(--border);
+    font-family: var(--font-serif);
+    font-size: 0.85rem;
+    font-style: italic;
+    line-height: 1.55;
+    color: var(--text-faint);
+}
+
+/* ============================================================
+   INDEX PAGE — /archive/
+   A text list in the catalog idiom: one hairline between rows,
+   the title in serif, type + date + any quality flag in quiet
+   sans pushed to the row's end.
+   ============================================================ */
+
+#markdownBody .archive-index-header {
+    margin-bottom: 1.8rem;
+}
+
+#markdownBody .archive-index-intro {
+    margin: 0.6rem 0 0;
+    font-family: var(--font-serif);
+    font-size: 1rem;
+    line-height: 1.6;
+    color: var(--text-muted);
+}
+
+#markdownBody .archive-list {
+    margin: 0;
+    padding: 0;
+    list-style: none;
+}
+
+#markdownBody .archive-list-item {
+    display: flex;
+    align-items: baseline;
+    justify-content: space-between;
+    gap: 0.4rem 1rem;
+    flex-wrap: wrap;
+    padding: 0.7rem 0;
+    border-bottom: 1px solid var(--border);
+}
+
+#markdownBody .archive-list-item:last-child {
+    border-bottom: none;
+}
+
+#markdownBody .archive-list-link {
+    font-family: var(--font-serif);
+    font-size: 1.05rem;
+    color: var(--text);
+    text-decoration: none;
+}
+
+#markdownBody .archive-list-link:hover {
+    text-decoration: underline;
+    text-underline-offset: 2px;
+}
+
+#markdownBody .archive-list-meta {
+    font-family: var(--font-sans);
+    font-size: 0.78rem;
+    color: var(--text-faint);
+    white-space: nowrap;
+}
+
+/* Non-'ok' capture flag — a dashed chip, echoing the entry-page note. */
+#markdownBody .archive-quality-flag {
+    padding: 0.05em 0.4em;
+    border: 1px dashed var(--border-muted);
+    border-radius: 2px;
+    font-variant: all-small-caps;
+    font-feature-settings: "smcp" 1;
+    letter-spacing: 0.04em;
+    color: var(--text-muted);
+}
+
+/* A rotted entry is the one health state worth a solid, inked flag. */
+#markdownBody .archive-quality-flag--rotted {
+    border-style: solid;
+    border-color: var(--text);
+    color: var(--text);
+}
+
+#markdownBody .archive-empty {
+    font-family: var(--font-serif);
+    font-style: italic;
+    color: var(--text-muted);
+}
+
+/* ============================================================
+   MOBILE
+   Collapse the provenance grid to stacked rows; trim the frame.
+   ============================================================ */
+
+@media (max-width: 540px) {
+    #markdownBody .archive-meta {
+        grid-template-columns: 1fr;
+        gap: 0;
+    }
+
+    #markdownBody .archive-meta dt {
+        margin-top: 0.55rem;
+    }
+
+    #markdownBody .archive-meta dt:first-of-type {
+        margin-top: 0;
+    }
+
+    #markdownBody .archive-frame {
+        height: 70vh;
+    }
+}
diff --git a/static/css/components.css b/static/css/components.css
index b89ae30..fa6dda1 100644
--- a/static/css/components.css
+++ b/static/css/components.css
@@ -1849,3 +1849,50 @@ pre:hover .copy-btn,
         min-height: 300px;
     }
 }
+
+/* ── Archive affordance ─────────────────────────────────────────────────────
+   The superscript "A" appended after a body link whose target is preserved
+   in the local archive (build/Filters/Archive.hs). Loaded site-wide because
+   the marker appears in essay/prose content, not on archive pages. */
+
+.archive-affordance {
+    font-size: 0.7em;
+    margin-left: 0.15em;
+    line-height: 0;
+}
+
+.archive-affordance a {
+    font-family: var(--font-sans);
+    font-weight: 600;
+    text-decoration: none;
+    color: var(--text-faint);
+    border: 1px solid var(--border-muted);
+    border-radius: 2px;
+    padding: 0 0.25em;
+}
+
+.archive-affordance a:hover {
+    color: var(--text);
+    border-color: var(--text-muted);
+    background: var(--bg-subtle);
+}
+
+/* Dead-link flip — a body link whose archived target is `rotted` has its
+   href redirected to the local copy (build/Filters/Archive.hs). A dotted
+   underline marks the link as redirected; its marker becomes a solid chip
+   reading "archived" rather than the quiet bordered "A". */
+.archive-rotted {
+    text-decoration-style: dotted;
+}
+
+.archive-affordance--rotted a {
+    color: var(--bg);
+    background: var(--text-muted);
+    border-color: var(--text-muted);
+}
+
+.archive-affordance--rotted a:hover {
+    color: var(--bg);
+    background: var(--text);
+    border-color: var(--text);
+}
diff --git a/templates/archive-index.html b/templates/archive-index.html
new file mode 100644
index 0000000..0e6aba8
--- /dev/null
+++ b/templates/archive-index.html
@@ -0,0 +1,23 @@
+<div id="content">
+    <main id="markdownBody" data-pagefind-body>
+        <header class="archive-index-header">
+            <h1 class="page-title">$title$</h1>
+            <p class="archive-index-intro">Local snapshots of works referenced across the site, preserved against link rot. Each is an archived copy; the original is linked prominently from its page.</p>
+        </header>
+
+        $if(has-entries)$
+        <ul class="archive-list">
+            $for(entries)$
+            <li class="archive-list-item">
+                <a class="archive-list-link" href="$entry-url$">$entry-title$</a>
+                <span class="archive-list-meta">$entry-type$ &middot; archived $entry-archived$$if(entry-degraded)$ &middot; <span class="archive-quality-flag">$entry-quality$ capture</span>$endif$$if(entry-private)$ &middot; <span class="archive-quality-flag">private</span>$endif$$if(entry-rotted)$ &middot; <span class="archive-quality-flag archive-quality-flag--rotted">link rotted</span>$endif$</span>
+            </li>
+            $endfor$
+        </ul>
+        $else$
+        <p class="archive-empty">Nothing archived yet.</p>
+        $endif$
+
+        $partial("templates/partials/archive-removal-notice.html")$
+    </main>
+</div>
diff --git a/templates/archive.html b/templates/archive.html
new file mode 100644
index 0000000..d2e1fd5
--- /dev/null
+++ b/templates/archive.html
@@ -0,0 +1,109 @@
+<div id="content">
+    <main id="markdownBody" data-pagefind-body data-pagefind-filter="type:archive, status:$status$">
+        <article class="archive-entry">
+            <header class="archive-header">
+                <h1 class="page-title">$title$</h1>
+                $partial("templates/partials/archive-banner.html")$
+                $if(status-note)$
+                <p class="archive-status-note archive-status-note--$status$" role="note">
+                    $status-note$
+                </p>
+                $endif$
+                $if(degraded)$
+                <p class="archive-degraded" role="note">
+                    <span class="archive-degraded-label">Capture: $snapshot-quality$</span>
+                    Some of the original's content (images, scripted elements)
+                    may be missing or incomplete in this snapshot. The original
+                    is linked above.
+                </p>
+                $endif$
+            </header>
+
+            <section class="archive-provenance" aria-label="Provenance">
+                <h2 class="archive-panel-title">Provenance</h2>
+                <dl class="archive-meta">
+                    <dt>Original</dt>
+                    <dd><a href="$original-url$" rel="noopener noreferrer" target="_blank">$original-url$</a></dd>
+                    <dt>Link status</dt>
+                    <dd class="archive-status archive-status--$status$">$status$</dd>
+                    <dt>Archived</dt>
+                    <dd>$archived$</dd>
+                    <dt>Type</dt>
+                    <dd>$archive-type$</dd>
+                    <dt>Snapshot quality</dt>
+                    <dd>$snapshot-quality$</dd>
+                    <dt>Size</dt>
+                    <dd>$size$</dd>
+                    <dt>SHA-256</dt>
+                    <dd><code>$sha-short$&hellip;</code></dd>
+                    $if(wayback)$
+                    <dt>Wayback</dt>
+                    <dd><a href="$wayback$" rel="noopener noreferrer" target="_blank">web.archive.org copy</a></dd>
+                    $endif$
+                    $if(paywalled)$
+                    <dt>Access</dt>
+                    <dd>The original sits behind a paywall.</dd>
+                    $endif$
+                    $if(private)$
+                    <dt>Visibility</dt>
+                    <dd>private &mdash; held offline</dd>
+                    $endif$
+                </dl>
+            </section>
+
+            $if(note)$<p class="archive-note">$note$</p>$endif$
+
+            $if(private)$
+            <p class="archive-private" role="note">
+                This work is archived <strong>privately</strong>: a local
+                preservation copy is kept against link rot, but the artifact
+                is not published here. Use the original link above to read it.
+            </p>
+            $else$
+            <div class="archive-viewer">
+                <div class="archive-viewer-bar">
+                    <span class="archive-viewer-name">$artifact-name$</span>
+                    <a class="archive-viewer-open" href="$artifact-url$" target="_blank" rel="noopener noreferrer">Open raw&nbsp;&#8599;</a>
+                </div>
+                $if(is-pdf)$
+                <iframe class="archive-frame" src="$artifact-url$" title="$title$ &mdash; archived document" loading="lazy"></iframe>
+                $endif$
+                $if(is-html)$
+                <iframe class="archive-frame" src="$artifact-url$" title="$title$ &mdash; archived snapshot" sandbox referrerpolicy="no-referrer" loading="lazy"></iframe>
+                $endif$
+            </div>
+            $endif$
+
+            $if(fulltext)$
+            $if(is-pdf)$
+            <details class="archive-fulltext-wrap">
+                <summary class="archive-fulltext-title">Full text (extracted)</summary>
+                $fulltext$
+            </details>
+            $endif$
+            $if(is-html)$
+            <section class="archive-fulltext-wrap">
+                <h2 class="archive-fulltext-title">Readable text (extracted)</h2>
+                $fulltext$
+            </section>
+            $endif$
+            $endif$
+
+            $if(referenced-by)$
+            <section class="archive-backlinks">
+                <h2 class="archive-section-title">Referenced by</h2>
+                $referenced-by$
+            </section>
+            $endif$
+
+            $if(similar-links)$
+            <section class="archive-related">
+                <h2 class="archive-section-title">Related</h2>
+                $similar-links$
+            </section>
+            $endif$
+
+            $partial("templates/partials/archive-removal-notice.html")$
+        </article>
+    </main>
+</div>
diff --git a/templates/partials/archive-banner.html b/templates/partials/archive-banner.html
new file mode 100644
index 0000000..d08b3ec
--- /dev/null
+++ b/templates/partials/archive-banner.html
@@ -0,0 +1,5 @@
+<div class="archive-banner" role="note">
+    <p class="archive-banner-label">Archived copy</p>
+    <p class="archive-banner-text">A local preservation snapshot taken $archived$ &mdash; this page is not the original.</p>
+    <a class="archive-banner-original" href="$original-url$" rel="noopener noreferrer" target="_blank">View the original&nbsp;&#8599;</a>
+</div>
diff --git a/templates/partials/archive-removal-notice.html b/templates/partials/archive-removal-notice.html
new file mode 100644
index 0000000..c34a0f5
--- /dev/null
+++ b/templates/partials/archive-removal-notice.html
@@ -0,0 +1,5 @@
+<p class="archive-removal">
+    This is an archived copy, preserved so that a work cited across the site
+    survives the original going dark. To request removal, email
+    <a href="mailto:ln@levineuwirth.org">ln@levineuwirth.org</a>.
+</p>
diff --git a/templates/partials/head.html b/templates/partials/head.html
index c9eb017..beec3a0 100644
--- a/templates/partials/head.html
+++ b/templates/partials/head.html
@@ -2,6 +2,7 @@
 <meta name="viewport" content="width=device-width, initial-scale=1">
 $if(home)$<title>Levi Neuwirth</title>$else$$if(title)$<title>$title$ — Levi Neuwirth</title>$else$<title>Levi Neuwirth</title>$endif$$endif$
 $if(description)$<meta name="description" content="$description$">$endif$
+$if(noindex)$<meta name="robots" content="noindex">$endif$
 <link rel="canonical" href="$site-url$$url$">
 <link rel="alternate" type="application/atom+xml" title="Levi Neuwirth" href="/feed.xml">
 <link rel="alternate" type="application/atom+xml" title="Levi Neuwirth — music" href="/music/feed.xml">
@@ -49,6 +50,7 @@ $if(build)$<link rel="stylesheet" href="/css/build.css">$endif$
 $if(reading)$<link rel="stylesheet" href="/css/reading.css">$endif$
 $if(composition)$<link rel="stylesheet" href="/css/score-reader.css">$endif$
 $if(photography)$<link rel="stylesheet" href="/css/photography.css">$endif$
+$if(archive)$<link rel="stylesheet" href="/css/archive.css">$endif$
 $if(photography-map)$<link rel="stylesheet" href="/leaflet/leaflet.css">$endif$
 $if(photography-map)$<link rel="stylesheet" href="/leaflet/MarkerCluster.css">$endif$
 $if(photography-map)$<link rel="stylesheet" href="/leaflet/MarkerCluster.Default.css">$endif$
diff --git a/tools/archive.py b/tools/archive.py
new file mode 100644
index 0000000..2aacb0e
--- /dev/null
+++ b/tools/archive.py
@@ -0,0 +1,1151 @@
+#!/usr/bin/env python3
+"""
+archive.py — Build-time link-archiving tool for levineuwirth.org.
+
+Reads archive/manifest.yaml, fetches any manifest URL that has no local
+artifact yet, stores it under archive/<slug>/, extracts readable text,
+writes the per-entry archive/<slug>/PROVENANCE.json, and (re)writes the
+Hakyll input data/archive-index.json.
+
+Two artifact types:
+  * pdf  — downloaded directly, stored as document.pdf, text via pdftotext.
+  * html — snapshotted with `monolith` into a single self-contained
+           snapshot.html (JavaScript stripped, assets inlined as data
+           URIs), a restrictive Content-Security-Policy <meta> injected,
+           text extracted with BeautifulSoup.
+
+Subcommands:
+  fetch    download missing artifacts, (re)generate sidecars + index
+  refresh  deliberately re-snapshot a single entry, recording the prior
+           SHA in the new PROVENANCE.json's `previous-sha256`
+  wayback  submit archived URLs to the Wayback Machine as a second,
+           independent copy; backfill the capture URL into PROVENANCE.json
+  check    HEAD/GET-probe every manifest URL for link rot, updating
+           data/archive-state.json with asymmetric hysteresis
+  gc       delete archive/<slug>/ directories listed in archive/removed.yaml
+
+Failure policy:
+  * Integrity errors — a committed artifact whose SHA-256 no longer
+    matches PROVENANCE.json, or a slug whose manifest URL has changed —
+    print loudly and exit non-zero, halting `make build`.
+  * Transient errors — a network failure, an over-cap download, a missing
+    `monolith` binary, a manifest entry missing its `url:` — print a
+    warning, skip that entry, and exit zero so the build proceeds (the
+    entry is retried on the next build).
+
+See ARCHIVE.md for the full design.
+
+Gated on .venv by the Makefile (same convention as embed.py). Non-stdlib
+dependencies: PyYAML and beautifulsoup4, both already in pyproject.toml.
+External tools: `pdftotext` (poppler) for PDF text, and the `monolith`
+binary — vendored at tools/bin/monolith, see tools/monolith-version.txt.
+"""
+
+from __future__ import annotations
+
+import datetime
+import hashlib
+import json
+import os
+import re
+import shutil
+import subprocess
+import sys
+import urllib.error
+import urllib.request
+from pathlib import Path
+from urllib.parse import parse_qsl, quote, urlencode, urlparse, urlunparse
+
+import yaml
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+
+REPO_ROOT     = Path(__file__).resolve().parent.parent
+ARCHIVE_DIR   = REPO_ROOT / "archive"
+MANIFEST      = ARCHIVE_DIR / "manifest.yaml"
+REMOVED       = ARCHIVE_DIR / "removed.yaml"
+INDEX_OUT     = REPO_ROOT / "data" / "archive-index.json"
+STATE_OUT     = REPO_ROOT / "data" / "archive-state.json"
+
+ROT_FAILS     = 3       # consecutive failed scans before `rotted` is considered
+ROT_DAYS      = 14      # ... and the streak must also span at least this many days
+
+SIZE_CAP      = 25 * 1024 * 1024          # 25 MB per-artifact cap
+TIMEOUT       = 60                        # seconds, per network request
+WAYBACK_TIMEOUT = 120                     # seconds — Save Page Now is slow
+USER_AGENT    = ("levineuwirth.org/archive "
+                 "(ln@levineuwirth.org; removal requests honored)")
+
+# Per-type on-disk names. The artifact is committed; the .txt is generated
+# (gitignored) and regenerated whenever the artifact's SHA-256 changes.
+ARTIFACT = {"pdf": "document.pdf", "html": "snapshot.html"}
+TEXTFILE = {"pdf": "document.txt", "html": "snapshot.txt"}
+
+# Injected into every HTML snapshot's <head>. Permits exactly what a
+# faithful monolith capture needs — inlined images/fonts as data URIs and
+# inline styles (as <style> elements and as style="" attributes) — and
+# blocks every network fetch and every script a broken or hostile snapshot
+# might attempt. Defense-in-depth behind the iframe sandbox; see ARCHIVE.md.
+ARCHIVE_CSP = (
+    "default-src 'none'; img-src data:; "
+    "style-src 'unsafe-inline'; style-src-elem 'unsafe-inline'; "
+    "style-src-attr 'unsafe-inline'; font-src data:; "
+    "script-src 'none'; object-src 'none'; frame-src 'none'"
+)
+
+
+def log(msg: str) -> None:
+    print(f"[archive] {msg}")
+
+
+def err(msg: str) -> None:
+    print(f"[archive] ERROR: {msg}", file=sys.stderr)
+
+
+# ---------------------------------------------------------------------------
+# Manifest / removed.yaml
+# ---------------------------------------------------------------------------
+
+def load_yaml_list(path: Path) -> list[dict]:
+    """Load a YAML file expected to hold a list of mappings. An empty or
+    absent file yields an empty list."""
+    if not path.exists():
+        return []
+    data = yaml.safe_load(path.read_text(encoding="utf-8"))
+    if data is None:
+        return []
+    if not isinstance(data, list):
+        err(f"{path.name}: expected a YAML list, got {type(data).__name__}")
+        sys.exit(1)
+    return data
+
+
+def derive_slug(url: str) -> str:
+    """Auto-derive a slug as {domain-label}-{path-tail}, slugified and
+    truncated. A manifest `slug:` override is preferred over this."""
+    p = urlparse(url)
+    host = p.netloc.lower().removeprefix("www.")
+    labels = host.split(".")
+    domain = labels[-2] if len(labels) >= 2 else (host or "url")
+    tail = (p.path.rstrip("/").split("/") or [""])[-1] or "index"
+    slug = re.sub(r"[^a-z0-9]+", "-", f"{domain}-{tail}".lower()).strip("-")
+    slug = slug[:64].strip("-")
+    return slug or hashlib.sha1(url.encode()).hexdigest()[:12]
+
+
+def entry_slug(entry: dict) -> str:
+    slug = entry.get("slug")
+    return slug if slug else derive_slug(entry["url"])
+
+
+# ---------------------------------------------------------------------------
+# Hashing / type detection
+# ---------------------------------------------------------------------------
+
+def sha256_of(path: Path) -> str:
+    h = hashlib.sha256()
+    with path.open("rb") as fh:
+        for chunk in iter(lambda: fh.read(1 << 16), b""):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+def probe_headers(url: str) -> dict[str, str]:
+    """Best-effort HEAD request. Returns the response headers as a
+    lowercased-key dict, or {} on any failure (some servers reject HEAD)."""
+    req = urllib.request.Request(url, method="HEAD",
+                                 headers={"User-Agent": USER_AGENT})
+    try:
+        with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
+            return {k.lower(): v for k, v in resp.headers.items()}
+    except Exception:                                  # noqa: BLE001
+        return {}
+
+
+def probe_headers_get(url: str) -> dict[str, str]:
+    """Best-effort ranged GET, returning lowercased-key response headers
+    or {} on any failure. Used alongside `probe_headers` so an
+    `X-Robots-Tag: noarchive` that appears only on GET (some servers omit
+    it on HEAD) is still honoured."""
+    req = urllib.request.Request(
+        url, method="GET",
+        headers={"User-Agent": USER_AGENT, "Range": "bytes=0-0"})
+    try:
+        with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
+            return {k.lower(): v for k, v in resp.headers.items()}
+    except Exception:                                  # noqa: BLE001
+        return {}
+
+
+def detect_type(url: str, override) -> str | None:
+    """Resolve an entry's artifact type. A manifest `type:` wins; then the
+    URL extension; then a Content-Type probe; HTML is the final default
+    (most non-PDF cited URLs are pages). Returns None on a bad override."""
+    if override:
+        o = str(override).strip().lower()
+        if o in ARTIFACT:
+            return o
+        err(f"{url}: manifest type: {override!r} not recognised "
+            f"(expected pdf | html)")
+        return None
+    path = urlparse(url).path.lower()
+    if path.endswith(".pdf"):
+        return "pdf"
+    if path.endswith((".html", ".htm")):
+        return "html"
+    ct = (probe_headers(url).get("content-type") or "").lower()
+    if "pdf" in ct:
+        return "pdf"
+    return "html"
+
+
+# ---------------------------------------------------------------------------
+# PDF fetch + text extraction
+# ---------------------------------------------------------------------------
+
+def fetch_pdf(url: str, dest: Path) -> bool:
+    """Download `url` to `dest`, enforcing the size cap. Returns True on
+    success. A partial / over-cap download leaves no file behind."""
+    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    tmp = dest.with_suffix(dest.suffix + ".part")
+    try:
+        with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
+            # X-Robots-Tag: noarchive — honour the archiving-specific
+            # directive even though robots.txt itself is not gated.
+            robots = (resp.headers.get("X-Robots-Tag") or "").lower()
+            if "noarchive" in robots:
+                err(f"{url}: response carries X-Robots-Tag: noarchive — skipped")
+                return False
+            total = 0
+            with tmp.open("wb") as fh:
+                for chunk in iter(lambda: resp.read(1 << 16), b""):
+                    total += len(chunk)
+                    if total > SIZE_CAP:
+                        fh.close()
+                        tmp.unlink(missing_ok=True)
+                        err(f"{url}: exceeds {SIZE_CAP // (1024*1024)} MB cap "
+                            f"— skipped (commit deliberately with `git add -f`)")
+                        return False
+                    fh.write(chunk)
+        tmp.replace(dest)
+        return True
+    except Exception as exc:                       # noqa: BLE001 — report any failure
+        tmp.unlink(missing_ok=True)
+        err(f"{url}: fetch failed — {exc}")
+        return False
+
+
+def extract_text_pdf(pdf: Path, txt: Path) -> None:
+    """Extract plain text from `pdf` into `txt` via pdftotext. On any
+    failure an empty file is written so downstream steps still find it."""
+    try:
+        subprocess.run(["pdftotext", "-q", str(pdf), str(txt)], check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError) as exc:
+        err(f"{pdf.name}: pdftotext failed ({exc}); writing empty text sidecar")
+        txt.write_text("", encoding="utf-8")
+
+
+# ---------------------------------------------------------------------------
+# HTML snapshot (monolith) + CSP + text extraction + quality classification
+# ---------------------------------------------------------------------------
+
+def find_monolith() -> str | None:
+    """Locate the monolith binary: $MONOLITH_BIN, then the vendored
+    tools/bin/monolith, then $PATH. None if unavailable."""
+    env = os.environ.get("MONOLITH_BIN")
+    if env and Path(env).is_file():
+        return env
+    vendored = REPO_ROOT / "tools" / "bin" / "monolith"
+    if vendored.is_file():
+        return str(vendored)
+    return shutil.which("monolith")
+
+
+def body_noarchive(path: Path) -> bool:
+    """True if the snapshot declares <meta name=robots ... noarchive> —
+    the in-document equivalent of the X-Robots-Tag header."""
+    from bs4 import BeautifulSoup
+    soup = BeautifulSoup(path.read_text(encoding="utf-8", errors="replace"),
+                         "html.parser")
+    for m in soup.find_all("meta"):
+        if (m.get("name") or "").lower() in ("robots", "googlebot"):
+            if "noarchive" in (m.get("content") or "").lower():
+                return True
+    return False
+
+
+def inject_archive_metas(path: Path) -> None:
+    """Insert the archive CSP and a robots `noindex, noarchive` <meta> as
+    the first <head> children, dropping any CSP or robots <meta> the
+    original shipped: two intersecting CSPs could block resources a
+    faithful snapshot legitimately needs, and we own the indexing posture
+    for the served snapshot regardless of what the original said."""
+    from bs4 import BeautifulSoup
+    soup = BeautifulSoup(path.read_text(encoding="utf-8", errors="replace"),
+                         "html.parser")
+    head = soup.head
+    if head is None:
+        head = soup.new_tag("head")
+        (soup.html if soup.html is not None else soup).insert(0, head)
+    for m in list(head.find_all("meta")):
+        if (m.get("http-equiv") or "").lower() == "content-security-policy":
+            m.decompose()
+        elif (m.get("name") or "").lower() == "robots":
+            m.decompose()
+    # Inserted in reverse so the final head order is CSP first, robots
+    # second (deterministic, easy to grep).
+    robots = soup.new_tag("meta")
+    robots["name"] = "robots"
+    robots["content"] = "noindex, noarchive"
+    head.insert(0, robots)
+    csp = soup.new_tag("meta")
+    csp["http-equiv"] = "Content-Security-Policy"
+    csp["content"] = ARCHIVE_CSP
+    head.insert(0, csp)
+    path.write_text(str(soup), encoding="utf-8")
+
+
+def fetch_html(url: str, dest: Path) -> bool:
+    """Snapshot an HTML page with monolith into a single self-contained
+    file at `dest`, then inject the archive CSP. Returns True on success;
+    every failure path is non-fatal (warn + skip)."""
+    # Honour directives returned by preliminary probes before performing
+    # the document fetch. The full document response is inspected below
+    # and is also the exact body passed to monolith; do not let monolith
+    # perform a second unobservable fetch of the primary document.
+    if any("noarchive" in (h.get("x-robots-tag") or "").lower()
+           for h in (probe_headers(url),
+                     probe_headers_get(url))):
+        err(f"{url}: response carries X-Robots-Tag: noarchive — skipped")
+        return False
+
+    mono = find_monolith()
+    if mono is None:
+        err(f"{url}: monolith not found — vendor the binary at "
+            f"tools/bin/monolith (see tools/monolith-version.txt) or set "
+            f"$MONOLITH_BIN; HTML snapshot skipped")
+        return False
+
+    source = dest.with_suffix(dest.suffix + ".source.part")
+    tmp = dest.with_suffix(dest.suffix + ".part")
+    effective_url = url
+    try:
+        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+        with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
+            robots = (resp.headers.get("X-Robots-Tag") or "").lower()
+            if "noarchive" in robots:
+                err(f"{url}: response carries X-Robots-Tag: noarchive — skipped")
+                return False
+            effective_url = resp.geturl()
+            total = 0
+            with source.open("wb") as fh:
+                for chunk in iter(lambda: resp.read(1 << 16), b""):
+                    total += len(chunk)
+                    if total > SIZE_CAP:
+                        fh.close()
+                        source.unlink(missing_ok=True)
+                        err(f"{url}: source HTML exceeds "
+                            f"{SIZE_CAP // (1024*1024)} MB cap — skipped")
+                        return False
+                    fh.write(chunk)
+    except Exception as exc:                           # noqa: BLE001
+        source.unlink(missing_ok=True)
+        err(f"{url}: fetch failed — {exc}")
+        return False
+
+    if body_noarchive(source):
+        source.unlink(missing_ok=True)
+        err(f"{url}: response declares <meta name=robots> noarchive — skipped")
+        return False
+
+    cmd = [mono, "--no-js", "--ignore-errors", "--quiet",
+           "--timeout", str(TIMEOUT), "--user-agent", USER_AGENT,
+           "--base-url", effective_url, "--output", str(tmp), "-"]
+    try:
+        proc = subprocess.run(cmd, input=source.read_bytes(),
+                              capture_output=True, timeout=TIMEOUT * 6)
+    except subprocess.TimeoutExpired:
+        source.unlink(missing_ok=True)
+        tmp.unlink(missing_ok=True)
+        err(f"{url}: monolith timed out — skipped")
+        return False
+    except Exception as exc:                           # noqa: BLE001
+        source.unlink(missing_ok=True)
+        tmp.unlink(missing_ok=True)
+        err(f"{url}: monolith failed to run — {exc}")
+        return False
+    finally:
+        source.unlink(missing_ok=True)
+
+    if proc.returncode != 0:
+        tmp.unlink(missing_ok=True)
+        output = proc.stderr or proc.stdout or b""
+        tail = output.decode("utf-8", errors="replace").strip().splitlines()
+        err(f"{url}: monolith exited {proc.returncode} "
+            f"({tail[-1] if tail else 'no output'}) — skipped")
+        return False
+    if not tmp.exists() or tmp.stat().st_size == 0:
+        tmp.unlink(missing_ok=True)
+        err(f"{url}: monolith produced no output — skipped")
+        return False
+    if tmp.stat().st_size > SIZE_CAP:
+        size_mb = tmp.stat().st_size // (1024 * 1024)
+        tmp.unlink(missing_ok=True)
+        err(f"{url}: snapshot is {size_mb} MB, over the "
+            f"{SIZE_CAP // (1024*1024)} MB cap — skipped "
+            f"(commit deliberately with `git add -f`)")
+        return False
+    inject_archive_metas(tmp)
+    tmp.replace(dest)
+    return True
+
+
+def extract_text_html(snapshot: Path, txt: Path) -> None:
+    """Extract readable, block-separated text from an HTML snapshot. Block
+    boundaries become blank lines so the archive page can render the text
+    as paragraphs. On any failure an empty file is written."""
+    try:
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(snapshot.read_text(encoding="utf-8",
+                                                errors="replace"),
+                             "html.parser")
+        for tag in soup(["script", "style", "noscript", "template", "head"]):
+            tag.decompose()
+        blocks = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote",
+                  "pre", "tr", "figcaption", "section", "article", "div",
+                  "header", "footer", "ul", "ol", "dl", "dd", "dt", "table",
+                  "br", "hr"]
+        # Append a NUL after every block element, then split the flattened
+        # text on it: each chunk is the text between two block boundaries,
+        # i.e. one paragraph. NUL never occurs in real HTML text content.
+        sentinel = "\x00"
+        for tag in soup.find_all(blocks):
+            tag.append(sentinel)
+        body = soup.body or soup
+        paras = []
+        for chunk in body.get_text(" ").split(sentinel):
+            words = chunk.split()
+            if words:
+                paras.append(" ".join(words))
+        txt.write_text("\n\n".join(paras) + "\n", encoding="utf-8")
+    except Exception as exc:                           # noqa: BLE001
+        err(f"{snapshot.name}: HTML text extraction failed ({exc}); "
+            f"writing empty text sidecar")
+        txt.write_text("", encoding="utf-8")
+
+
+def classify_snapshot(path: Path) -> str:
+    """Heuristic capture-quality grade: 'ok' / 'degraded' / 'js-required'.
+    A near-empty snapshot is a JS app shell `--no-js` hollowed out; an
+    <img> whose src is still remote (or only lazy-load attrs) is one
+    monolith failed to inline. The author reviews the rendered snapshot
+    before committing regardless — this only drives an automated flag."""
+    try:
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(path.read_text(encoding="utf-8", errors="replace"),
+                             "html.parser")
+        for tag in soup(["script", "style", "noscript", "template"]):
+            tag.decompose()
+        body = soup.body or soup
+        if len(body.get_text(" ", strip=True)) < 200:
+            return "js-required"
+        remote = 0
+        for img in body.find_all("img"):
+            src = (img.get("src") or "").strip()
+            if src.startswith(("http://", "https://")):
+                remote += 1
+            elif not src and (img.get("data-src") or img.get("data-lazy-src")
+                              or img.get("srcset")):
+                remote += 1
+        return "degraded" if remote else "ok"
+    except Exception:                                  # noqa: BLE001
+        return "degraded"
+
+
+# ---------------------------------------------------------------------------
+# Equivalent-URL aliases
+# ---------------------------------------------------------------------------
+
+# Query parameters whose presence/absence is semantically irrelevant — a
+# citation written with `?utm_source=…` should match the canonical form.
+# Non-tracking parameters (`?v=`, `?id=`, Wayback timestamps) are
+# load-bearing and must be preserved.
+TRACKING_PARAMS = frozenset({
+    "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
+    "fbclid", "gclid", "mc_eid", "mc_cid", "ref", "igshid",
+    "_hsenc", "_hsmi", "mkt_tok",
+})
+
+# Matches `http(s)://arxiv.org/(abs|pdf)/<id>[v<n>][.pdf]` — the forms one
+# paper has in the wild. Old-style ids (`hep-th/9901001`) do not match.
+_ARXIV_RE = re.compile(
+    r"(https?://arxiv\.org/)(abs|pdf)/([\w.]+?)(v\d+)?(\.pdf)?$"
+)
+
+
+def strip_tracking(url: str) -> str:
+    """Drop tracking query parameters (no trailing `?` if none remain).
+    Others round-trip through `parse_qsl`/`urlencode`: `%20` -> `+`."""
+    p = urlparse(url)
+    if not p.query:
+        return url
+    kept = [(k, v) for k, v in parse_qsl(p.query, keep_blank_values=True)
+            if k not in TRACKING_PARAMS]
+    return urlunparse(p._replace(query=urlencode(kept)))
+
+
+def arxiv_aliases(url: str) -> set[str]:
+    """For an arXiv URL, the set of equivalent forms: abs ↔ pdf, with and
+    without version, with and without trailing `.pdf`. Empty for any URL
+    that isn't arXiv."""
+    m = _ARXIV_RE.match(url)
+    if not m:
+        return set()
+    scheme_host, _kind, paper_id, version, _ext = m.groups()
+    out: set[str] = set()
+    for kind in ("abs", "pdf"):
+        for ver in ("", version or ""):
+            tails = (".pdf", "") if kind == "pdf" else ("",)
+            for tail in tails:
+                out.add(f"{scheme_host}{kind}/{paper_id}{ver}{tail}")
+    return out
+
+
+def url_aliases(url: str) -> list[str]:
+    """The equivalent-URL set: tracking parameters stripped, http/https
+    folded, trailing slashes tolerated, arXiv abs/pdf/versioned forms
+    expanded. The canonical URL itself is omitted (it is the index key)."""
+    out: set[str] = {url, strip_tracking(url)}
+    for u in list(out):
+        if u.startswith("https://"):
+            out.add("http://" + u[len("https://"):])
+        elif u.startswith("http://"):
+            out.add("https://" + u[len("http://"):])
+    for u in list(out):
+        out.add(u.rstrip("/"))
+    for u in list(out):
+        out.update(arxiv_aliases(u))
+    out.discard(url)
+    return sorted(out)
+
+
+def arxiv_canonical(url: str) -> str:
+    """The canonical form of an arXiv URL: `https://arxiv.org/abs/<id>`
+    with no version and no `.pdf`. Non-arXiv passes through. Mirrors the
+    Haskell-side `arxivCanonical` in `build/ArchiveIndex.hs`."""
+    m = _ARXIV_RE.match(url)
+    if not m:
+        return url
+    _scheme_host, _kind, paper_id, _ver, _ext = m.groups()
+    return f"https://arxiv.org/abs/{paper_id}"
+
+
+def normalize_url(url: str) -> str:
+    """The canonical form for *matching* — drop fragment, strip tracking,
+    fold http→https, arXiv-canonicalise, trim trailing slashes. Mirrors
+    `normalizeUrl` in `build/ArchiveIndex.hs` so removal enforcement and
+    duplicate detection use the same equivalence the link-annotation
+    filter uses; keep the two in sync."""
+    no_frag = url.split("#", 1)[0]
+    clean = strip_tracking(no_frag)
+    if clean.startswith("http://"):
+        clean = "https://" + clean[len("http://"):]
+    canonical = arxiv_canonical(clean)
+    return canonical.rstrip("/")
+
+
+def _is_tracked_and_clean(*paths: Path) -> bool:
+    """True if every path is tracked by git AND has no uncommitted
+    changes — i.e. its committed bytes are recoverable via `git log -S`
+    once a refresh replaces it. False on any git error (uninitialised
+    repo, missing git binary, dirty/untracked file)."""
+    str_paths = [str(p) for p in paths]
+    try:
+        for p in str_paths:
+            rc = subprocess.run(
+                ["git", "ls-files", "--error-unmatch", "--", p],
+                cwd=str(REPO_ROOT),
+                capture_output=True,
+            ).returncode
+            if rc != 0:
+                return False
+        rc = subprocess.run(
+            ["git", "diff", "--quiet", "HEAD", "--", *str_paths],
+            cwd=str(REPO_ROOT),
+            capture_output=True,
+        ).returncode
+        return rc == 0
+    except FileNotFoundError:
+        return False
+
+
+# ---------------------------------------------------------------------------
+# fetch subcommand
+# ---------------------------------------------------------------------------
+
+def cmd_fetch() -> int:
+    manifest = load_yaml_list(MANIFEST)
+    # Removed URLs are compared in normalised form so a tracking-laden
+    # variant cannot bypass a takedown the author already recorded.
+    removed_norms = {normalize_url(r["url"])
+                     for r in load_yaml_list(REMOVED) if r.get("url")}
+
+    # Pre-scan validation: reject canonical-form duplicates *before* any
+    # fetch I/O, so a first colliding entry never gets partially processed
+    # while a second's duplicate check halts.
+    seen: dict[str, str] = {}
+    for entry in manifest:
+        url = entry.get("url")
+        if not url:
+            continue
+        norm = normalize_url(url)
+        if norm in seen:
+            err(f"manifest: {url!r} and {seen[norm]!r} normalise to the "
+                f"same canonical form ({norm!r}). Drop one or distinguish "
+                f"them; the link archive cannot route both under one slug.")
+            sys.exit(1)
+        seen[norm] = url
+
+    index: dict[str, dict] = {}
+    skipped = 0
+
+    for entry in manifest:
+        url = entry.get("url")
+        if not url:
+            err("manifest entry without a `url:` — skipped")
+            skipped += 1
+            continue
+
+        norm = normalize_url(url)
+
+        # A manifest URL whose canonical form matches a removed entry is a
+        # deliberate takedown; never silently re-archive it. The author
+        # either removes the line from removed.yaml ("I want it back") or
+        # from the manifest.
+        if norm in removed_norms:
+            err(f"manifest URL {url!r} (canonical {norm!r}) is recorded in "
+                f"archive/removed.yaml as a deliberate takedown. To re-archive "
+                f"it, remove the corresponding line from removed.yaml first.")
+            sys.exit(1)
+
+        slug = entry_slug(entry)
+        slug_dir = ARCHIVE_DIR / slug
+        prov_path = slug_dir / "PROVENANCE.json"
+
+        # --- resolve the artifact type ------------------------------------
+        # An archived entry's type is fixed in PROVENANCE.json; a new entry
+        # is detected from the manifest / URL / Content-Type.
+        prov = None
+        if prov_path.exists():
+            prov = json.loads(prov_path.read_text(encoding="utf-8"))
+            if prov.get("url") != url:
+                err(f"{slug}: manifest URL changed "
+                    f"({prov.get('url')!r} -> {url!r}). A committed artifact "
+                    f"is never silently re-fetched; to deliberately "
+                    f"re-snapshot, run `archive.py refresh {slug}`.")
+                sys.exit(1)
+            atype = prov.get("type", "pdf")
+        else:
+            atype = detect_type(url, entry.get("type"))
+            if atype is None:
+                skipped += 1
+                continue
+
+        art       = slug_dir / ARTIFACT[atype]
+        txt       = slug_dir / TEXTFILE[atype]
+        txt_stamp = slug_dir / (TEXTFILE[atype] + ".sha256")
+
+        # --- integrity guard (fatal): a committed artifact must verify,
+        #     and a lost artifact must not be silently re-fetched. -------
+        if prov is not None:
+            if art.exists():
+                live = sha256_of(art)
+                if live != prov.get("sha256"):
+                    err(f"{slug}: {art.name} SHA-256 mismatch "
+                        f"(recorded {prov.get('sha256')}, found {live}) "
+                        f"— the committed artifact is corrupt or was replaced")
+                    sys.exit(1)
+            else:
+                err(f"{slug}: PROVENANCE.json is committed but {art.name} "
+                    f"is missing. The committed artifact has been lost; "
+                    f"restore it from git before rebuilding. A refresh "
+                    f"requires a present, verified prior snapshot.")
+                sys.exit(1)
+
+        # --- fetch the artifact if it is not already present --------------
+        if not art.exists():
+            slug_dir.mkdir(parents=True, exist_ok=True)
+            log(f"fetching {url}  [{atype}]")
+            ok = fetch_pdf(url, art) if atype == "pdf" else fetch_html(url, art)
+            if not ok:
+                skipped += 1
+                continue
+        else:
+            log(f"{slug}: artifact present, skipping fetch")
+
+        digest = sha256_of(art)
+
+        # --- regenerate text when the artifact changed (or .txt absent) ---
+        stale = (not txt.exists()
+                 or not txt_stamp.exists()
+                 or txt_stamp.read_text(encoding="utf-8").strip() != digest)
+        if stale:
+            if atype == "pdf":
+                extract_text_pdf(art, txt)
+            else:
+                extract_text_html(art, txt)
+            txt_stamp.write_text(digest + "\n", encoding="utf-8")
+
+        # --- write PROVENANCE.json (once; stable thereafter) --------------
+        if prov is None:
+            quality = "ok" if atype == "pdf" else classify_snapshot(art)
+            prov = {
+                "url": url,
+                "slug": slug,
+                "title": entry.get("title") or slug,
+                "type": atype,
+                "artifact": ARTIFACT[atype],
+                "sha256": digest,
+                "previous-sha256": None,
+                "bytes": art.stat().st_size,
+                "archived": datetime.date.today().isoformat(),
+                "source-date": entry.get("source-date"),
+                "snapshot-quality": quality,
+                "wayback": None,
+            }
+            prov_path.write_text(
+                json.dumps(prov, indent=2, ensure_ascii=False) + "\n",
+                encoding="utf-8",
+            )
+            log(f"{slug}: archived [{atype}, {quality}] ({prov['bytes']} bytes)")
+
+        # --- contribute to the Hakyll index -------------------------------
+        index[url] = {
+            "slug": slug,
+            "type": prov.get("type", atype),
+            "title": prov.get("title", slug),
+            "aliases": url_aliases(url),
+        }
+
+    # archive-index.json is always rewritten to mirror the manifest exactly.
+    INDEX_OUT.parent.mkdir(parents=True, exist_ok=True)
+    INDEX_OUT.write_text(
+        json.dumps(index, indent=2, ensure_ascii=False) + "\n",
+        encoding="utf-8",
+    )
+    log(f"wrote {INDEX_OUT.relative_to(REPO_ROOT)} ({len(index)} entries)")
+
+    if skipped:
+        err(f"{skipped} entr{'y' if skipped == 1 else 'ies'} skipped "
+            f"(network / cap / missing url) — retried next build")
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# refresh subcommand — deliberate re-snapshot of one entry
+# ---------------------------------------------------------------------------
+
+def cmd_refresh(argv: list[str]) -> int:
+    """Deliberately re-snapshot a single entry.
+
+    `argv` holds the subcommand's own arguments; only `argv[0]`, the slug
+    of a manifest entry, is read. Returns 0 once the new snapshot is in
+    place, 1 when the re-snapshot failed and the prior state was rolled
+    back, and 2 for a usage error or a refused precondition (the archive
+    is left as it was).
+
+    Two invariants:
+
+      * The prior snapshot is *recoverable* — refresh refuses to replace
+        an artifact whose committed bytes git does not have, so the
+        recorded @previous-sha256@ always points at something
+        retrievable via @git log -S@. Commit the current snapshot first.
+
+      * The replacement is *atomic across every exit path* — slug dir and
+        @data/archive-index.json@ are both staged aside; any failure
+        (transient fetch error, fatal @cmd_fetch@ exit, exception,
+        interruption) restores both. We never end up with no snapshot
+        and never leave the index pointing at a discarded state.
+
+    The only way an @archive.py@ invocation replaces a committed artifact
+    — @cmd_fetch@ itself refuses to."""
+    if not argv:
+        err("refresh: pass a slug "
+            "(e.g. `archive.py refresh nist-fips-203`)")
+        return 2
+    slug = argv[0]
+
+    manifest = load_yaml_list(MANIFEST)
+    entry = next((e for e in manifest
+                  if e.get("url") and entry_slug(e) == slug), None)
+    if entry is None:
+        err(f"refresh: {slug!r} is not in archive/manifest.yaml")
+        return 2
+
+    slug_dir = ARCHIVE_DIR / slug
+    prov_path = slug_dir / "PROVENANCE.json"
+    prev_sha: str | None = None
+    if prov_path.exists():
+        try:
+            prev = json.loads(prov_path.read_text(encoding="utf-8"))
+            prev_sha = prev.get("sha256")
+            prev_artifact = slug_dir / prev.get("artifact", "")
+        except Exception as exc:                       # noqa: BLE001
+            err(f"refresh: cannot parse prior provenance for {slug}: {exc}")
+            return 2
+        # The prior snapshot must be committed and clean — otherwise
+        # `previous-sha256` would point at bytes git can no longer give
+        # back, breaking the auditable replacement contract.
+        #
+        # is_file(), not exists(): a provenance with no `artifact` key
+        # resolves to the slug directory itself, which exists but cannot
+        # be hashed — that is an incomplete snapshot, not a crash.
+        if not prev_sha or not prev_artifact.is_file():
+            err(f"refresh: prior snapshot for {slug} is incomplete; restore "
+                f"its artifact and provenance before replacing it.")
+            return 2
+        live_sha = sha256_of(prev_artifact)
+        if live_sha != prev_sha:
+            err(f"refresh: prior snapshot for {slug} fails SHA-256 "
+                f"verification (recorded {prev_sha}, found {live_sha}); "
+                f"refusing to replace unverifiable bytes.")
+            return 2
+        if not _is_tracked_and_clean(prov_path, prev_artifact):
+            err(f"refresh: the prior snapshot for {slug} "
+                f"(archive/{slug}/{{PROVENANCE.json, "
+                f"{prev_artifact.name}}}) has uncommitted changes or is "
+                f"not tracked in git. Commit the current snapshot first "
+                f"— otherwise its bytes cannot be recovered via "
+                f"`git log -S` once replaced.")
+            return 2
+
+    # Stage the old snapshot AND the current archive-index.json aside —
+    # cmd_fetch rewrites the index unconditionally, so a failed refresh
+    # must roll both back.
+    backup: Path | None = None
+    if slug_dir.exists():
+        backup = slug_dir.with_name(slug + ".refresh-backup")
+        if backup.exists():
+            err(f"refresh: recovery directory {backup.name} already exists; "
+                f"resolve it before starting another refresh.")
+            return 2
+        slug_dir.rename(backup)
+        log(f"refresh: staged old archive/{slug}/ aside as {backup.name}")
+
+    index_existed = INDEX_OUT.exists()
+    index_backup: Path | None = None
+    if index_existed:
+        index_backup = INDEX_OUT.with_suffix(".json.refresh-backup")
+        if index_backup.exists():
+            # Undo the directory staging above before bailing out.
+            if backup is not None:
+                backup.rename(slug_dir)
+            err(f"refresh: recovery file {index_backup.name} already exists; "
+                f"resolve it before starting another refresh.")
+            return 2
+        shutil.copy2(INDEX_OUT, index_backup)
+
+    succeeded = False
+    try:
+        # With the slug directory moved away, the ordinary fetch pass
+        # sees this entry as not yet archived and snapshots it afresh.
+        rc = cmd_fetch()
+
+        # Success requires a new PROVENANCE.json *and* its declared
+        # artifact on disk. `cmd_fetch` returns 0 even when individual
+        # entries skip, so the return code alone is not enough.
+        if rc == 0 and prov_path.exists():
+            try:
+                new_prov = json.loads(prov_path.read_text(encoding="utf-8"))
+                art_name = new_prov.get("artifact", "")
+                if art_name and (slug_dir / art_name).is_file():
+                    if prev_sha:
+                        new_prov["previous-sha256"] = prev_sha
+                        prov_path.write_text(
+                            json.dumps(new_prov, indent=2,
+                                       ensure_ascii=False) + "\n",
+                            encoding="utf-8",
+                        )
+                        log(f"refresh: recorded previous-sha256 "
+                            f"{prev_sha[:12]}…")
+                    succeeded = True
+            except Exception:                              # noqa: BLE001
+                succeeded = False
+    finally:
+        # Runs on every exit path — normal return, exception, SystemExit
+        # from cmd_fetch, KeyboardInterrupt. We always end with either a
+        # complete new snapshot or the prior one restored, never neither.
+        if succeeded:
+            if backup is not None:
+                shutil.rmtree(backup)
+            if index_backup is not None:
+                index_backup.unlink()
+            log(f"refresh: {slug} re-snapshotted")
+        else:
+            # Discard whatever partial snapshot the fetch left behind,
+            # then put the staged directory and index back.
+            if slug_dir.exists():
+                shutil.rmtree(slug_dir)
+            if backup is not None:
+                backup.rename(slug_dir)
+            if index_backup is not None:
+                shutil.move(str(index_backup), str(INDEX_OUT))
+            elif not index_existed:
+                INDEX_OUT.unlink(missing_ok=True)
+            err(f"refresh: re-snapshot of {slug} failed; the prior "
+                f"snapshot has been restored.")
+
+    return 0 if succeeded else 1
+
+
+# ---------------------------------------------------------------------------
+# wayback subcommand
+# ---------------------------------------------------------------------------
+
+def wayback_save(url: str) -> None:
+    """Ask the Wayback Machine's Save Page Now endpoint to capture `url`.
+
+    Fire-and-forget: the response is opened and closed without being
+    read, and a request that errors or times out is only logged. The
+    capture URL is read back separately via the availability API (which
+    also surfaces a pre-existing capture)."""
+    save_endpoint = "https://web.archive.org/save/" + url
+    request = urllib.request.Request(
+        save_endpoint,
+        headers={"User-Agent": USER_AGENT},
+    )
+    try:
+        # Making the request is the whole point; nothing in the reply is
+        # needed, so release the connection straight away.
+        urllib.request.urlopen(request, timeout=WAYBACK_TIMEOUT).close()
+    except Exception as failure:                       # noqa: BLE001
+        log(f"wayback: save request for {url} did not complete ({failure})")
+
+
+def wayback_lookup(url: str) -> str | None:
+    """Return the Wayback Machine capture URL that the availability API
+    reports as the `closest` snapshot of `url`, or None.
+
+    None covers every non-answer: the API is unreachable, the body is not
+    valid JSON, the JSON does not have the expected object shape, or no
+    capture is marked available. A failed or undecodable request is
+    reported via `err`; an unexpected shape yields None rather than an
+    exception, so one odd response cannot abort a whole backfill run."""
+    api = ("https://archive.org/wayback/available?url="
+           + quote(url, safe=""))
+    req = urllib.request.Request(api, headers={"User-Agent": USER_AGENT})
+    try:
+        with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
+            data = json.loads(resp.read().decode("utf-8"))
+    except Exception as exc:                           # noqa: BLE001
+        err(f"wayback: availability lookup failed for {url} ({exc})")
+        return None
+    # The body is third-party JSON: check each level is an object before
+    # calling .get() on it (a bare list or null would otherwise raise).
+    snapshots = data.get("archived_snapshots") if isinstance(data, dict) else None
+    snap = snapshots.get("closest") if isinstance(snapshots, dict) else None
+    if isinstance(snap, dict) and snap.get("available") and snap.get("url"):
+        return snap["url"]
+    return None
+
+
+def cmd_wayback() -> int:
+    """Submit every archived URL whose PROVENANCE.json has no `wayback`
+    capture yet to the Wayback Machine, then backfill the returned capture
+    URL. Never on the critical path of a build — a separate target. Always
+    exits 0: a capture that does not come through is simply retried next
+    run. URLs recorded in removed.yaml are skipped — a deliberate takedown
+    must not be re-published to a third-party archive even if its manifest
+    line is still present during the documented eviction sequence.
+
+    Entries that have not been fetched yet (no PROVENANCE.json) are passed
+    over silently. An entry whose PROVENANCE.json cannot be read or is not
+    a JSON object is reported and skipped, so one damaged file does not
+    stop the remaining entries from being processed; it is counted
+    neither as backfilled nor as pending.
+    """
+    manifest = load_yaml_list(MANIFEST)
+    removed_norms = {normalize_url(r["url"])
+                     for r in load_yaml_list(REMOVED) if r.get("url")}
+    backfilled = pending = 0
+
+    for entry in manifest:
+        url = entry.get("url")
+        if not url or normalize_url(url) in removed_norms:
+            continue
+        slug = entry_slug(entry)
+        prov_path = ARCHIVE_DIR / slug / "PROVENANCE.json"
+        if not prov_path.exists():
+            continue                       # not fetched yet — run `fetch` first
+        try:
+            prov = json.loads(prov_path.read_text(encoding="utf-8"))
+            if not isinstance(prov, dict):
+                raise ValueError("top-level JSON value is not an object")
+        except (OSError, ValueError) as exc:
+            # JSONDecodeError and UnicodeDecodeError are both ValueErrors.
+            err(f"wayback: cannot read provenance for {slug}: {exc}")
+            continue
+        if prov.get("wayback"):
+            continue                       # already has a capture recorded
+
+        log(f"wayback: submitting {url}")
+        wayback_save(url)
+        # The save request is best-effort; the availability API is the
+        # source of truth for what capture (new or old) actually exists.
+        capture = wayback_lookup(url)
+        if capture:
+            prov["wayback"] = capture
+            prov_path.write_text(
+                json.dumps(prov, indent=2, ensure_ascii=False) + "\n",
+                encoding="utf-8",
+            )
+            log(f"{slug}: wayback -> {capture}")
+            backfilled += 1
+        else:
+            log(f"{slug}: no Wayback capture available yet — retried next run")
+            pending += 1
+
+    log(f"wayback: {backfilled} backfilled, {pending} pending")
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# check subcommand — link-rot detection
+# ---------------------------------------------------------------------------
+
+def moved_meaningfully(orig: str, final: str) -> bool:
+    """Decide whether a request that started at `orig` and landed at
+    `final` was really relocated.
+
+    Each URL is reduced to a comparison key — fragment dropped, a leading
+    `http://` rewritten to `https://`, trailing slashes stripped — and the
+    result is True only when the two keys still differ."""
+    def comparison_key(link: str) -> str:
+        bare, _, _ = link.partition("#")
+        if bare.startswith("http://"):
+            bare = "https" + bare[len("http"):]
+        return bare.rstrip("/")
+
+    return comparison_key(orig) != comparison_key(final)
+
+
+def probe_url(url: str) -> tuple[str, str | None]:
+    """Check whether `url` still answers.
+
+    Returns @(result, new_url)@ with result one of 'ok' | 'moved' |
+    'fail'; `new_url` is the final address after redirects and is only set
+    for 'moved'. A HEAD request is tried first. If the server answers it
+    with 403/405/501, or the request fails for a non-HTTP reason, a GET
+    asking for a single byte (`Range: bytes=0-0`) is tried instead. Any
+    other HTTP error, or any failure of the GET, is a 'fail'."""
+    failed = ("fail", None)
+    attempts = (
+        ("HEAD", {"User-Agent": USER_AGENT}),
+        ("GET", {"User-Agent": USER_AGENT, "Range": "bytes=0-0"}),
+    )
+    for verb, request_headers in attempts:
+        # Only the first (HEAD) attempt may fall through to another try.
+        can_fall_back = verb == "HEAD"
+        request = urllib.request.Request(url, method=verb,
+                                         headers=request_headers)
+        try:
+            with urllib.request.urlopen(request, timeout=TIMEOUT) as reply:
+                landed_at = reply.geturl()
+                if not moved_meaningfully(url, landed_at):
+                    return ("ok", None)
+                return ("moved", landed_at)
+        except urllib.error.HTTPError as http_error:
+            head_refused = http_error.code in (403, 405, 501)
+            if not (can_fall_back and head_refused):
+                return failed                  # a definite 4xx/5xx
+        except Exception:                      # noqa: BLE001 — network failure
+            if not can_fall_back:
+                return failed
+    return failed
+
+
+def next_state(prev: dict, result: str, new_url: str | None,
+               today: datetime.date) -> dict:
+    """Fold a probe result into an entry's state with asymmetric
+    hysteresis.
+
+    Recovery is immediate: one 'ok' returns straight to 'live'. Rotting is
+    slow: 'rotted' needs ROT_FAILS consecutive failures spanning at least
+    ROT_DAYS days; below that the status is the inconclusive 'error'.
+
+    `prev` is the entry's record from the previous run ({} for a URL seen
+    for the first time); `result` is 'ok' | 'moved' | anything else, which
+    is treated as a failure; `new_url` is stored as `new-url` on a 'moved'
+    record when it is non-empty. The returned dict always carries
+    `status`, `checked`, `consecutive-failures` and `status-since`.
+
+    `prev` originates from a JSON file on disk, so it is read defensively:
+    a record that is not a dict, a failure count that is not a
+    non-negative int, or a streak start that is not an ISO date falls back
+    to a fresh value instead of raising. Well-formed records are handled
+    exactly as before."""
+    if not isinstance(prev, dict):
+        prev = {}
+    iso         = today.isoformat()
+    prev_status = prev.get("status", "live")
+    prev_since  = prev.get("status-since", iso)
+
+    if result == "ok":
+        return {"status": "live", "checked": iso,
+                "consecutive-failures": 0,
+                "status-since": prev_since if prev_status == "live" else iso}
+
+    if result == "moved":
+        rec = {"status": "moved", "checked": iso,
+               "consecutive-failures": 0,
+               "status-since": prev_since if prev_status == "moved" else iso}
+        if new_url:
+            rec["new-url"] = new_url
+        return rec
+
+    # result == "fail" — increment the streak; 'status-since' marks its start.
+    prev_cf = prev.get("consecutive-failures", 0)
+    # bool is an int subclass; reject it along with every other non-count.
+    if isinstance(prev_cf, bool) or not isinstance(prev_cf, int) or prev_cf < 0:
+        prev_cf = 0
+    cf = prev_cf + 1
+    streak_since = prev_since if prev_status in ("error", "rotted") else iso
+    try:
+        streak_start = datetime.date.fromisoformat(streak_since)
+    except (TypeError, ValueError):
+        # Unusable start date: restart the clock today. That can only
+        # delay 'rotted', which is the safe direction.
+        streak_since, streak_start = iso, today
+    span = (today - streak_start).days
+    status = "rotted" if (cf >= ROT_FAILS and span >= ROT_DAYS) else "error"
+    return {"status": status, "checked": iso,
+            "consecutive-failures": cf, "status-since": streak_since}
+
+
+def cmd_check() -> int:
+    """Probe every manifest URL and rewrite data/archive-state.json. The
+    new state mirrors the manifest exactly (entries for dropped URLs are
+    discarded). A slow network job — never on a build's critical path;
+    always exits 0, since a probe failure is the signal, not an error.
+    URLs listed in removed.yaml are skipped — the link-rot scanner should
+    not keep probing a deliberately-removed work.
+
+    The previous state file only seeds the hysteresis. If it is missing,
+    unparseable, or not a JSON object, every URL starts from an empty
+    record; a single record that is not an object is likewise replaced by
+    an empty one."""
+    manifest = load_yaml_list(MANIFEST)
+    removed_norms = {normalize_url(r["url"])
+                     for r in load_yaml_list(REMOVED) if r.get("url")}
+    old: dict = {}
+    if STATE_OUT.exists():
+        try:
+            loaded = json.loads(STATE_OUT.read_text(encoding="utf-8"))
+        except Exception:                                  # noqa: BLE001
+            loaded = None
+        # Valid JSON of the wrong shape (a list, null, …) is as unusable
+        # as a parse failure — `.get` below needs a mapping.
+        if isinstance(loaded, dict):
+            old = loaded
+
+    today = datetime.date.today()
+    state: dict[str, dict] = {}
+    tally = {"live": 0, "moved": 0, "error": 0, "rotted": 0}
+
+    for entry in manifest:
+        url = entry.get("url")
+        if not url or normalize_url(url) in removed_norms:
+            continue
+        result, new_url = probe_url(url)
+        prev = old.get(url)
+        if not isinstance(prev, dict):
+            prev = {}
+        rec = next_state(prev, result, new_url, today)
+        state[url] = rec
+        tally[rec["status"]] = tally.get(rec["status"], 0) + 1
+        note = f" -> {new_url}" if new_url else ""
+        log(f"check: {url}  [{rec['status']}]{note}")
+
+    STATE_OUT.parent.mkdir(parents=True, exist_ok=True)
+    STATE_OUT.write_text(
+        json.dumps(state, indent=2, ensure_ascii=False) + "\n",
+        encoding="utf-8",
+    )
+    log(f"check: {tally['live']} live, {tally['moved']} moved, "
+        f"{tally['error']} error, {tally['rotted']} rotted "
+        f"-> {STATE_OUT.relative_to(REPO_ROOT)}")
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# gc subcommand
+# ---------------------------------------------------------------------------
+
+def cmd_gc(ignore_orphans: bool) -> int:
+    """Garbage-collect snapshot directories under archive/.
+
+    A directory whose name is a slug listed in removed.yaml is deleted
+    (this is checked first, so it applies even if the slug is also still
+    in the manifest). A directory that matches neither removed.yaml nor a
+    manifest entry is an orphan: it is reported and left intact. Plain
+    files directly under archive/ are ignored.
+
+    Returns 1 when orphans were found and `ignore_orphans` is false;
+    otherwise 0, including when there is no archive/ directory at all."""
+    manifest_entries = load_yaml_list(MANIFEST)
+    removed_entries = load_yaml_list(REMOVED)
+
+    keep = {entry_slug(item) for item in manifest_entries if item.get("url")}
+    evict = {item["slug"] for item in removed_entries if item.get("slug")}
+
+    if not ARCHIVE_DIR.exists():
+        log("no archive/ directory — nothing to GC")
+        return 0
+
+    stray: list[str] = []
+    removed_count = 0
+    for folder in sorted(ARCHIVE_DIR.iterdir()):
+        if not folder.is_dir():
+            continue
+        name = folder.name
+        if name in evict:
+            shutil.rmtree(folder)
+            log(f"gc: removed archive/{name}/ (in removed.yaml)")
+            removed_count += 1
+            continue
+        if name not in keep:
+            stray.append(name)
+
+    # Orphans are reported only after the deletion pass has finished.
+    for name in stray:
+        err(f"gc: archive/{name}/ is not in manifest.yaml and not in "
+            f"removed.yaml — left intact. If you meant to evict it, add it "
+            f"to removed.yaml first; if it is stale (a branch switch, a "
+            f"rename), delete the directory by hand.")
+
+    log(f"gc: {removed_count} "
+        f"director{'y' if removed_count == 1 else 'ies'} removed")
+    if not stray or ignore_orphans:
+        return 0
+    err(f"gc: {len(stray)} orphan(s) present — "
+        f"resolve them or re-run with --ignore-orphans")
+    return 1
+
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+def main(argv: list[str]) -> int:
+    """Run the subcommand named by `argv[0]` and return its exit status.
+
+    With no arguments the subcommand defaults to `fetch`. `refresh` is
+    handed the remaining arguments; `gc` only looks for an
+    `--ignore-orphans` flag among them. An unrecognised name is reported
+    and yields exit status 2."""
+    if argv:
+        name, rest = argv[0], argv[1:]
+    else:
+        name, rest = "fetch", []
+
+    handlers = {
+        "fetch": cmd_fetch,
+        "refresh": lambda: cmd_refresh(rest),
+        "wayback": cmd_wayback,
+        "check": cmd_check,
+        "gc": lambda: cmd_gc(ignore_orphans="--ignore-orphans" in rest),
+    }
+    handler = handlers.get(name)
+    if handler is not None:
+        return handler()
+
+    err(f"unknown subcommand {name!r} "
+        f"(expected: fetch | refresh | wayback | check | gc)")
+    return 2
+
+
+# Script entry point: pass the command-line arguments (without the program
+# name) to main() and use its return value as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/tools/bin/monolith b/tools/bin/monolith
new file mode 100755
index 0000000..1d90df7
Binary files /dev/null and b/tools/bin/monolith differ
diff --git a/tools/embed.py b/tools/embed.py
index 9034ee7..e201f66 100644
--- a/tools/embed.py
+++ b/tools/embed.py
@@ -48,7 +48,16 @@ MIN_SCORE      = 0.30   # similar-links: discard weak matches
 MIN_PARA_CHARS = 80     # semantic: skip very short paragraphs
 MAX_PARA_CHARS = 1000   # semantic: truncate before embedding
 
-EXCLUDE_URLS = {"/search/", "/build/", "/404.html", "/feed.xml", "/music/feed.xml"}
+# /archive/ is the archive index — a list page that would dominate every
+# entry's "Related" set; the individual /archive/<slug>/ pages stay in.
+EXCLUDE_URLS = {"/search/", "/build/", "/404.html", "/feed.xml",
+                "/music/feed.xml", "/archive/"}
+
+# Whole subtrees kept out of the corpus. /source/ is the repository code
+# mirror — source files, not content; left in, they pollute every page's
+# "Related" set and semantic search (e.g. a template file surfacing as a
+# neighbour, titled with its unrendered "$title$" placeholder).
+EXCLUDE_PREFIXES = ("/source/",)
 
 # Pages whose <body data-portal> are portal/landing pages — they aggregate
 # excerpts from many entries and would otherwise dominate every page's
@@ -122,7 +131,7 @@ def extract_page(html_path: Path) -> dict | None:
     soup = BeautifulSoup(raw, "html.parser")
     url  = _url_from_path(html_path)
 
-    if url in EXCLUDE_URLS:
+    if url in EXCLUDE_URLS or url.startswith(EXCLUDE_PREFIXES):
         return None
     body_tag = soup.body
     if body_tag is not None and body_tag.has_attr(PORTAL_BODY_ATTR):
diff --git a/tools/monolith-version.txt b/tools/monolith-version.txt
new file mode 100644
index 0000000..b21264c
--- /dev/null
+++ b/tools/monolith-version.txt
@@ -0,0 +1,17 @@
+# Pinned monolith binary — the HTML-snapshot tool for the link archive.
+#
+# Unlike PDF.js / Leaflet (servable assets downloaded at build time and
+# gitignored), monolith is a build-time *executable*: the binary itself is
+# committed at tools/bin/monolith so `git clone` -> `make build` needs no
+# network fetch and stays reproducible from a bare clone. See ARCHIVE.md.
+#
+# To re-vendor (version bump, or a build host on a different architecture):
+#   1. Download the matching asset from
+#      https://github.com/Y2Z/monolith/releases
+#   2. Place it at tools/bin/monolith and `chmod +x`.
+#   3. Update the three values below; verify `tools/bin/monolith --version`.
+#   4. Commit the binary and this file together.
+
+version  = 2.10.1
+asset    = monolith-gnu-linux-x86_64
+sha256   = 663ca914b078e91d5a854b4a07e913c613bbbcfe8fb11a24da1a6ab23c9205df