levineuwirth.org/build/ArchiveIndex.hs

264 lines
10 KiB
Haskell

{-# LANGUAGE GHC2021 #-}
{-# LANGUAGE OverloadedStrings #-}
-- | ArchiveIndex — shared read-only access to the archive's two JSON
-- sidecars: @data/archive-index.json@ (the @url\/alias -> slug@ map
-- written by @archive.py fetch@) and @data/archive-state.json@ (the
-- per-URL link-rot status written by @archive.py check@).
--
-- Consumers:
--
-- * @Filters.Archive@ — appends the archive affordance to body links
-- whose target is archived, and flips a @rotted@ link to the local
-- copy.
-- * @Backlinks@ — keeps archived external links through pass 1 and
-- canonicalises them to their @/archive/<slug>/@ page in pass 2.
-- * @Archive@ — surfaces each entry's rot status on its page, the
-- @/archive/@ index, and the @/build/@ telemetry.
--
-- Both files are loaded once per build via @unsafePerformIO@ CAFs. An
-- absent or malformed file degrades safely: an empty index makes the
-- link consumers no-op; an absent state file makes every entry @Live@
-- (the safe default — no link flip). @archive.py check@ is decoupled
-- from @make build@; a build consumes whatever state file exists.
module ArchiveIndex
( ArchiveStatus (..)
, statusName
, archiveSlugFor
, archiveStatusForSlug
, archiveIndexIsEmpty
, normalizeUrl
) where
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Maybe (fromMaybe)
import Data.Set (Set)
import qualified Data.Set as Set
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Aeson as A
import Data.Aeson ((.!=), (.:), (.:?))
import qualified Data.Yaml as Y
import System.Directory (doesFileExist)
import System.IO.Unsafe (unsafePerformIO)
-- ---------------------------------------------------------------------------
-- Link-rot status
-- ---------------------------------------------------------------------------
-- | The link-rot status of an archived work's original URL, as set by
-- @archive.py check@. 'Live' is the safe default for an unscanned or
-- unknown entry.
data ArchiveStatus = Live | Moved | Rotted | Error
deriving (Eq, Show)
-- | The lower-case wire name, matching @archive-state.json@ and the
-- @status:@ Pagefind filter tag.
statusName :: ArchiveStatus -> String
statusName Live = "live"
statusName Moved = "moved"
statusName Rotted = "rotted"
statusName Error = "error"
parseStatus :: Text -> ArchiveStatus
parseStatus "moved" = Moved
parseStatus "rotted" = Rotted
parseStatus "error" = Error
parseStatus _ = Live
-- ---------------------------------------------------------------------------
-- JSON shapes
-- ---------------------------------------------------------------------------
-- | One @archive-index.json@ entry. Only @slug@ and @aliases@ are used.
data IdxEntry = IdxEntry
{ ieSlug :: String
, ieAliases :: [Text]
}
instance A.FromJSON IdxEntry where
parseJSON = A.withObject "IdxEntry" $ \o -> IdxEntry
<$> o .: "slug"
<*> (o .:? "aliases" .!= [])
-- | One @archive-state.json@ entry — only the @status@ is consumed here.
newtype StateEntry = StateEntry { seStatus :: ArchiveStatus }
instance A.FromJSON StateEntry where
parseJSON = A.withObject "StateEntry" $ \o ->
StateEntry . parseStatus <$> (o .:? "status" .!= "live")
newtype UrlEntry = UrlEntry { ueUrl :: Text }
instance A.FromJSON UrlEntry where
parseJSON = A.withObject "UrlEntry" $ \o ->
UrlEntry <$> o .: "url"
-- ---------------------------------------------------------------------------
-- Loaded-once CAFs
-- ---------------------------------------------------------------------------
indexPath, statePath, manifestPath, removedPath :: FilePath
indexPath = "data/archive-index.json"
statePath = "data/archive-state.json"
manifestPath = "archive/manifest.yaml"
removedPath = "archive/removed.yaml"
readUrlSet :: FilePath -> IO (Set Text)
readUrlSet path = do
exists <- doesFileExist path
if not exists
then return Set.empty
else do
decoded <- Y.decodeFileEither path
case decoded of
Right entries -> return . Set.fromList $
map (normalizeUrl . ueUrl) (entries :: [UrlEntry])
Left e -> ioError . userError $
"[archive] FATAL: " ++ path ++ ": " ++ show e
-- | Canonical URLs still permitted to participate in link annotation.
-- Filtering the generated index at build time makes a direct Hakyll build
-- respect authored manifest/removal state even when archive.py did not run.
{-# NOINLINE activeUrls #-}
activeUrls :: Set Text
activeUrls = unsafePerformIO $ do
manifest <- readUrlSet manifestPath
removed <- readUrlSet removedPath
return (manifest `Set.difference` removed)
-- | @canonical-url -> entry@. Absent/malformed file -> empty; entries no
-- longer permitted by the authored manifest/removal state are removed.
{-# NOINLINE rawIndex #-}
rawIndex :: Map Text IdxEntry
rawIndex = unsafePerformIO $ do
exists <- doesFileExist indexPath
if not exists
then return Map.empty
else do
decoded <- A.eitherDecodeFileStrict' indexPath
let parsed = either (const Map.empty) id decoded
return $ Map.filterWithKey
(\canon _ -> normalizeUrl canon `Set.member` activeUrls)
parsed
-- | @url -> status@. Absent/malformed file -> empty (every entry 'Live').
{-# NOINLINE rawState #-}
rawState :: Map Text ArchiveStatus
rawState = unsafePerformIO $ do
exists <- doesFileExist statePath
if not exists
then return Map.empty
else do
decoded <- A.eitherDecodeFileStrict' statePath
return $ either (const Map.empty) (Map.map seStatus) decoded
-- | @normalised-url -> slug@: the canonical key and every alias from
-- @archive-index.json@, each fed through 'normalizeUrl'. Both keys and
-- lookups are normalised, so a citation form the alias set cannot
-- enumerate (e.g. an unbounded arXiv version, or any tracking-laden
-- variant of a clean manifest URL) still resolves.
{-# NOINLINE flatIndex #-}
flatIndex :: Map Text String
flatIndex = Map.fromList
[ (normalizeUrl key, ieSlug e)
| (canon, e) <- Map.toList rawIndex
, key <- canon : ieAliases e
]
-- | @slug -> status@: each entry's status, looked up by its canonical URL
-- in the state file (the two files share the manifest URL as key).
{-# NOINLINE slugStatus #-}
slugStatus :: Map String ArchiveStatus
slugStatus = Map.fromList
[ (ieSlug e, Map.findWithDefault Live canon rawState)
| (canon, e) <- Map.toList rawIndex
]
-- ---------------------------------------------------------------------------
-- Public lookups
-- ---------------------------------------------------------------------------
-- | True when no archive index is available — the link consumers no-op.
archiveIndexIsEmpty :: Bool
archiveIndexIsEmpty = Map.null rawIndex
-- | The archive slug for an outbound URL, or 'Nothing'. Both the index
-- keys and the input go through 'normalizeUrl', so a citation form that
-- the alias set cannot enumerate — an unbounded arXiv version, or any
-- tracking-laden variant of a clean manifest URL — still resolves.
archiveSlugFor :: Text -> Maybe String
archiveSlugFor url = Map.lookup (normalizeUrl url) flatIndex
-- | The link-rot status of an archived entry, by slug. 'Live' for an
-- unknown slug or when no scan has run.
archiveStatusForSlug :: String -> ArchiveStatus
archiveStatusForSlug slug = Map.findWithDefault Live slug slugStatus
-- ---------------------------------------------------------------------------
-- URL normalisation (matching, not display)
-- ---------------------------------------------------------------------------
-- | Tracking-only query parameters: their presence or absence is
-- semantically irrelevant; the lookup strips them before matching.
-- Sync with @TRACKING_PARAMS@ in @tools/archive.py@.
trackingParams :: [Text]
trackingParams =
[ "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content"
, "fbclid", "gclid", "mc_eid", "mc_cid", "ref", "igshid"
, "_hsenc", "_hsmi", "mkt_tok"
]
-- | Remove tracking-only query parameters; preserve every other parameter
-- in its original order.
stripTracking :: Text -> Text
stripTracking url = case T.breakOn "?" url of
(_, "") -> url
(path, q) ->
let kept = filter notTracking (T.splitOn "&" (T.drop 1 q))
in if null kept then path
else path <> "?" <> T.intercalate "&" kept
where
notTracking p = T.takeWhile (/= '=') p `notElem` trackingParams
-- | The canonical form of an arXiv URL: @https://arxiv.org/abs/<id>@ with
-- no version suffix and no @.pdf@. Maps every member of the
-- abs/pdf/versioned/@.pdf@ family to the same key. Non-arXiv passes through.
arxivCanonical :: Text -> Text
arxivCanonical url
| Just rest <- T.stripPrefix "https://arxiv.org/" url
, Just key <- arxivKey rest = key
| Just rest <- T.stripPrefix "http://arxiv.org/" url
, Just key <- arxivKey rest = key
| otherwise = url
where
arxivKey rest = case T.breakOn "/" rest of
(kind, slashId)
| kind `elem` ["abs", "pdf"], not (T.null slashId) ->
Just $ "https://arxiv.org/abs/"
<> stripVer (stripPdfSuf (T.tail slashId))
_ -> Nothing
stripPdfSuf t = fromMaybe t (T.stripSuffix ".pdf" t)
stripVer t = case T.breakOnEnd "v" t of
(before, ver)
| not (T.null before)
, not (T.null ver)
, T.all isAsciiDigit ver
-> T.dropEnd 1 before
_ -> t
isAsciiDigit c = c >= '0' && c <= '9'
-- | The full normalisation: drop fragment, strip tracking, fold
-- @http://@→@https://@, arXiv-canonicalise, trim a trailing slash. Both
-- 'flatIndex' keys and 'archiveSlugFor' inputs go through this so the
-- index never misses a citation form the design promises to match.
normalizeUrl :: Text -> Text
normalizeUrl url =
let noFrag = T.takeWhile (/= '#') url
clean = stripTracking noFrag
https = case T.stripPrefix "http://" clean of
Just rest -> "https://" <> rest
Nothing -> clean
arxiv = arxivCanonical https
in T.dropWhileEnd (== '/') arxiv