256 lines
10 KiB
Haskell
256 lines
10 KiB
Haskell
{-# LANGUAGE GHC2021 #-}
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
-- | ArchiveIndex — shared read-only access to the archive's two JSON
|
|
-- sidecars: @data/archive-index.json@ (the @url\/alias -> slug@ map
|
|
-- written by @archive.py fetch@) and @data/archive-state.json@ (the
|
|
-- per-URL link-rot status written by @archive.py check@).
|
|
--
|
|
-- Consumers:
|
|
--
|
|
-- * @Filters.Archive@ — appends the archive affordance to body links
|
|
-- whose target is archived, and flips a @rotted@ link to the local
|
|
-- copy.
|
|
-- * @Backlinks@ — keeps archived external links through pass 1 and
|
|
-- canonicalises them to their @/archive/<slug>/@ page in pass 2.
|
|
-- * @Archive@ — surfaces each entry's rot status on its page, the
|
|
-- @/archive/@ index, and the @/build/@ telemetry.
|
|
--
|
|
-- Both files are loaded once per build via @unsafePerformIO@ CAFs. An
|
|
-- absent or malformed file degrades safely: an empty index makes the
|
|
-- link consumers no-op; an absent state file makes every entry @Live@
|
|
-- (the safe default — no link flip). @archive.py check@ is decoupled
|
|
-- from @make build@; a build consumes whatever state file exists.
|
|
module ArchiveIndex
|
|
( ArchiveStatus (..)
|
|
, statusName
|
|
, archiveSlugFor
|
|
, archiveStatusForSlug
|
|
, archiveIndexIsEmpty
|
|
, normalizeUrl
|
|
) where
|
|
|
|
import Data.Map.Strict (Map)
|
|
import qualified Data.Map.Strict as Map
|
|
import Data.Maybe (fromMaybe)
|
|
import Data.Set (Set)
|
|
import qualified Data.Set as Set
|
|
import Data.Text (Text)
|
|
import qualified Data.Text as T
|
|
import qualified Data.Aeson as A
|
|
import Data.Aeson ((.!=), (.:), (.:?))
|
|
import qualified Data.Yaml as Y
|
|
import System.Directory (doesFileExist)
|
|
import System.IO.Unsafe (unsafePerformIO)
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Link-rot status
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | The link-rot status of an archived work's original URL, as set by
|
|
-- @archive.py check@. 'Live' is the safe default for an unscanned or
|
|
-- unknown entry.
|
|
data ArchiveStatus = Live | Moved | Rotted | Error
|
|
deriving (Eq, Show)
|
|
|
|
-- | The lower-case wire name, matching @archive-state.json@ and the
|
|
-- @status:@ Pagefind filter tag.
|
|
statusName :: ArchiveStatus -> String
|
|
statusName Live = "live"
|
|
statusName Moved = "moved"
|
|
statusName Rotted = "rotted"
|
|
statusName Error = "error"
|
|
|
|
parseStatus :: Text -> ArchiveStatus
|
|
parseStatus "moved" = Moved
|
|
parseStatus "rotted" = Rotted
|
|
parseStatus "error" = Error
|
|
parseStatus _ = Live
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- JSON shapes
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | One @archive-index.json@ entry. Only @slug@ and @aliases@ are used.
|
|
data IdxEntry = IdxEntry
|
|
{ ieSlug :: String
|
|
, ieAliases :: [Text]
|
|
}
|
|
|
|
instance A.FromJSON IdxEntry where
|
|
parseJSON = A.withObject "IdxEntry" $ \o -> IdxEntry
|
|
<$> o .: "slug"
|
|
<*> (o .:? "aliases" .!= [])
|
|
|
|
-- | One @archive-state.json@ entry — only the @status@ is consumed here.
|
|
newtype StateEntry = StateEntry { seStatus :: ArchiveStatus }
|
|
|
|
instance A.FromJSON StateEntry where
|
|
parseJSON = A.withObject "StateEntry" $ \o ->
|
|
StateEntry . parseStatus <$> (o .:? "status" .!= "live")
|
|
|
|
newtype UrlEntry = UrlEntry { ueUrl :: Text }
|
|
|
|
instance A.FromJSON UrlEntry where
|
|
parseJSON = A.withObject "UrlEntry" $ \o ->
|
|
UrlEntry <$> o .: "url"
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Loaded-once CAFs
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
indexPath, statePath, manifestPath, removedPath :: FilePath
|
|
indexPath = "data/archive-index.json"
|
|
statePath = "data/archive-state.json"
|
|
manifestPath = "archive/manifest.yaml"
|
|
removedPath = "archive/removed.yaml"
|
|
|
|
readUrlSet :: FilePath -> IO (Set Text)
|
|
readUrlSet path = do
|
|
exists <- doesFileExist path
|
|
if not exists
|
|
then return Set.empty
|
|
else do
|
|
decoded <- Y.decodeFileEither path
|
|
case decoded of
|
|
Right entries -> return . Set.fromList $
|
|
map (normalizeUrl . ueUrl) (entries :: [UrlEntry])
|
|
Left e -> ioError . userError $
|
|
"[archive] FATAL: " ++ path ++ ": " ++ show e
|
|
|
|
-- | Canonical URLs still permitted to participate in link annotation.
|
|
-- Filtering the generated index at build time makes a direct Hakyll build
|
|
-- respect authored manifest/removal state even when archive.py did not run.
|
|
{-# NOINLINE activeUrls #-}
|
|
activeUrls :: Set Text
|
|
activeUrls = unsafePerformIO $ do
|
|
manifest <- readUrlSet manifestPath
|
|
removed <- readUrlSet removedPath
|
|
return (manifest `Set.difference` removed)
|
|
|
|
-- | @canonical-url -> entry@. Absent/malformed file -> empty; entries no
|
|
-- longer permitted by the authored manifest/removal state are removed.
|
|
{-# NOINLINE rawIndex #-}
|
|
rawIndex :: Map Text IdxEntry
|
|
rawIndex = unsafePerformIO $ do
|
|
decoded <- A.eitherDecodeFileStrict' indexPath
|
|
let parsed = either (const Map.empty) id decoded
|
|
return $ Map.filterWithKey
|
|
(\canon _ -> normalizeUrl canon `Set.member` activeUrls)
|
|
parsed
|
|
|
|
-- | @url -> status@. Absent/malformed file -> empty (every entry 'Live').
|
|
{-# NOINLINE rawState #-}
|
|
rawState :: Map Text ArchiveStatus
|
|
rawState = unsafePerformIO $ do
|
|
decoded <- A.eitherDecodeFileStrict' statePath
|
|
return $ either (const Map.empty) (Map.map seStatus) decoded
|
|
|
|
-- | @normalised-url -> slug@: the canonical key and every alias from
|
|
-- @archive-index.json@, each fed through 'normalizeUrl'. Both keys and
|
|
-- lookups are normalised, so a citation form the alias set cannot
|
|
-- enumerate (e.g. an unbounded arXiv version, or any tracking-laden
|
|
-- variant of a clean manifest URL) still resolves.
|
|
{-# NOINLINE flatIndex #-}
|
|
flatIndex :: Map Text String
|
|
flatIndex = Map.fromList
|
|
[ (normalizeUrl key, ieSlug e)
|
|
| (canon, e) <- Map.toList rawIndex
|
|
, key <- canon : ieAliases e
|
|
]
|
|
|
|
-- | @slug -> status@: each entry's status, looked up by its canonical URL
|
|
-- in the state file (the two files share the manifest URL as key).
|
|
{-# NOINLINE slugStatus #-}
|
|
slugStatus :: Map String ArchiveStatus
|
|
slugStatus = Map.fromList
|
|
[ (ieSlug e, Map.findWithDefault Live canon rawState)
|
|
| (canon, e) <- Map.toList rawIndex
|
|
]
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Public lookups
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | True when no archive index is available — the link consumers no-op.
|
|
archiveIndexIsEmpty :: Bool
|
|
archiveIndexIsEmpty = Map.null rawIndex
|
|
|
|
-- | The archive slug for an outbound URL, or 'Nothing'. Both the index
|
|
-- keys and the input go through 'normalizeUrl', so a citation form that
|
|
-- the alias set cannot enumerate — an unbounded arXiv version, or any
|
|
-- tracking-laden variant of a clean manifest URL — still resolves.
|
|
archiveSlugFor :: Text -> Maybe String
|
|
archiveSlugFor url = Map.lookup (normalizeUrl url) flatIndex
|
|
|
|
-- | The link-rot status of an archived entry, by slug. 'Live' for an
|
|
-- unknown slug or when no scan has run.
|
|
archiveStatusForSlug :: String -> ArchiveStatus
|
|
archiveStatusForSlug slug = Map.findWithDefault Live slug slugStatus
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- URL normalisation (matching, not display)
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Tracking-only query parameters: their presence or absence is
|
|
-- semantically irrelevant; the lookup strips them before matching.
|
|
-- Sync with @TRACKING_PARAMS@ in @tools/archive.py@.
|
|
trackingParams :: [Text]
|
|
trackingParams =
|
|
[ "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content"
|
|
, "fbclid", "gclid", "mc_eid", "mc_cid", "ref", "igshid"
|
|
, "_hsenc", "_hsmi", "mkt_tok"
|
|
]
|
|
|
|
-- | Remove tracking-only query parameters; preserve every other parameter
|
|
-- in its original order.
|
|
stripTracking :: Text -> Text
|
|
stripTracking url = case T.breakOn "?" url of
|
|
(_, "") -> url
|
|
(path, q) ->
|
|
let kept = filter notTracking (T.splitOn "&" (T.drop 1 q))
|
|
in if null kept then path
|
|
else path <> "?" <> T.intercalate "&" kept
|
|
where
|
|
notTracking p = T.takeWhile (/= '=') p `notElem` trackingParams
|
|
|
|
-- | The canonical form of an arXiv URL: @https://arxiv.org/abs/<id>@ with
|
|
-- no version suffix and no @.pdf@. Maps every member of the
|
|
-- abs/pdf/versioned/@.pdf@ family to the same key. Non-arXiv passes through.
|
|
arxivCanonical :: Text -> Text
|
|
arxivCanonical url
|
|
| Just rest <- T.stripPrefix "https://arxiv.org/" url
|
|
, Just key <- arxivKey rest = key
|
|
| Just rest <- T.stripPrefix "http://arxiv.org/" url
|
|
, Just key <- arxivKey rest = key
|
|
| otherwise = url
|
|
where
|
|
arxivKey rest = case T.breakOn "/" rest of
|
|
(kind, slashId)
|
|
| kind `elem` ["abs", "pdf"], not (T.null slashId) ->
|
|
Just $ "https://arxiv.org/abs/"
|
|
<> stripVer (stripPdfSuf (T.tail slashId))
|
|
_ -> Nothing
|
|
stripPdfSuf t = fromMaybe t (T.stripSuffix ".pdf" t)
|
|
stripVer t = case T.breakOnEnd "v" t of
|
|
(before, ver)
|
|
| not (T.null before)
|
|
, not (T.null ver)
|
|
, T.all isAsciiDigit ver
|
|
-> T.dropEnd 1 before
|
|
_ -> t
|
|
isAsciiDigit c = c >= '0' && c <= '9'
|
|
|
|
-- | The full normalisation: drop fragment, strip tracking, fold
|
|
-- @http://@→@https://@, arXiv-canonicalise, trim a trailing slash. Both
|
|
-- 'flatIndex' keys and 'archiveSlugFor' inputs go through this so the
|
|
-- index never misses a citation form the design promises to match.
|
|
normalizeUrl :: Text -> Text
|
|
normalizeUrl url =
|
|
let noFrag = T.takeWhile (/= '#') url
|
|
clean = stripTracking noFrag
|
|
https = case T.stripPrefix "http://" clean of
|
|
Just rest -> "https://" <> rest
|
|
Nothing -> clean
|
|
arxiv = arxivCanonical https
|
|
in T.dropWhileEnd (== '/') arxiv
|