{-# LANGUAGE GHC2021 #-} {-# LANGUAGE OverloadedStrings #-} -- | Backlinks with context: build-time computation of which pages link to -- each page, including the sentence that contains each link. -- -- Architecture (dependency-correct, no circular deps): -- -- 1. Each content file is compiled under @version "links"@: a lightweight -- pass that parses the source, walks the AST block-by-block, splits -- each paragraph into sentences, and for every internal link records -- the URL *and* the HTML of the sentence that contains it. The result -- is serialised as a JSON array of @{url, context}@ objects. -- -- 2. A @create ["data/backlinks.json"]@ rule loads all "links" items, -- inverts the map, and serialises -- @target → [{url, title, abstract, context}]@ as JSON. -- -- 3. @backlinksField@ loads that JSON at page render time and injects -- an HTML list showing each source's title and a quoted sentence of -- context. The @load@ call establishes a proper Hakyll dependency so -- pages recompile when backlinks change. -- -- Dependency order (no cycles): -- content "links" versions → data/backlinks.json → content default versions module Backlinks ( backlinkRules , backlinksField ) where import Data.List (nubBy, sortBy) import Data.Ord (comparing) import Data.Maybe (fromMaybe) import qualified Data.Map.Strict as Map import Data.Map.Strict (Map) import qualified Data.ByteString as BS import qualified Data.Text as T import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as TLE import qualified Data.Text.Encoding as TE import qualified Data.Text.Encoding.Error as TE import qualified Data.Aeson as Aeson import Data.Aeson ((.=)) import Text.Pandoc.Class (runPure) import Text.Pandoc.Writers (writeHtml5String) import Text.Pandoc.Definition (Block (..), Inline (..), Pandoc (..), nullMeta) import Text.Pandoc.Options (WriterOptions (..), HTMLMathMethod (..)) import Text.Pandoc.Walk (query) import Hakyll import Compilers (readerOpts, writerOpts) import Filters (preprocessSource) import qualified Patterns as P -- --------------------------------------------------------------------------- -- Link-with-context entry (intermediate, saved by the "links" pass) -- --------------------------------------------------------------------------- data LinkEntry = LinkEntry { leUrl :: T.Text -- internal URL (as found in the AST) , leSentence :: String -- HTML of the sentence containing the link , leParagraph :: String -- HTML of the full surrounding paragraph } deriving (Show, Eq) instance Aeson.ToJSON LinkEntry where toJSON e = Aeson.object [ "url" .= leUrl e , "sentence" .= leSentence e , "paragraph" .= leParagraph e ] instance Aeson.FromJSON LinkEntry where parseJSON = Aeson.withObject "LinkEntry" $ \o -> LinkEntry <$> o Aeson..: "url" <*> o Aeson..: "sentence" <*> o Aeson..: "paragraph" -- --------------------------------------------------------------------------- -- Backlink source record (stored in data/backlinks.json) -- --------------------------------------------------------------------------- data BacklinkSource = BacklinkSource { blUrl :: String , blTitle :: String , blAbstract :: String , blSentence :: String -- raw HTML of the sentence containing the link , blParagraph :: String -- raw HTML of the full paragraph (hover popup) } deriving (Show, Eq, Ord) instance Aeson.ToJSON BacklinkSource where toJSON bl = Aeson.object [ "url" .= blUrl bl , "title" .= blTitle bl , "abstract" .= blAbstract bl , "sentence" .= blSentence bl , "paragraph" .= blParagraph bl ] instance Aeson.FromJSON BacklinkSource where parseJSON = Aeson.withObject "BacklinkSource" $ \o -> BacklinkSource <$> o Aeson..: "url" <*> o Aeson..: "title" <*> o Aeson..: "abstract" <*> o Aeson..: "sentence" <*> o Aeson..: "paragraph" -- --------------------------------------------------------------------------- -- Writer options for context rendering -- --------------------------------------------------------------------------- -- | Minimal writer options for rendering paragraph context: no template -- (fragment only), plain math fallback (context excerpts are previews, not -- full renders, and KaTeX CSS may not be loaded on all target pages). contextWriterOpts :: WriterOptions contextWriterOpts = writerOpts { writerTemplate = Nothing , writerHTMLMathMethod = PlainMath } -- --------------------------------------------------------------------------- -- Context extraction -- --------------------------------------------------------------------------- -- | URL filter: skip external links, pseudo-schemes, anchor-only fragments, -- and static-asset paths. isPageLink :: T.Text -> Bool isPageLink u = not (T.isPrefixOf "http://" u) && not (T.isPrefixOf "https://" u) && not (T.isPrefixOf "#" u) && not (T.isPrefixOf "mailto:" u) && not (T.isPrefixOf "tel:" u) && not (T.null u) && not (hasStaticExt u) where staticExts = [".pdf",".svg",".png",".jpg",".jpeg",".webp", ".mp3",".mp4",".woff2",".woff",".ttf",".ico", ".json",".asc",".xml",".gz",".zip"] hasStaticExt x = any (`T.isSuffixOf` T.toLower x) staticExts -- | Render a list of inlines to an HTML fragment string. -- Uses Plain (not Para) to avoid a wrapping
— callers add their own. renderInlines :: [Inline] -> String renderInlines inlines = case runPure (writeHtml5String contextWriterOpts doc) of Left _ -> "" Right txt -> T.unpack txt where doc = Pandoc nullMeta [Plain inlines] -- | Split a list of inlines into sentences by terminator punctuation. -- -- A @Str@ whose last non-closing-punctuation character is @.@, @!@, or @?@ -- ends a sentence when followed by @Space@, @SoftBreak@, @LineBreak@, or -- end-of-list. Closing quote/bracket characters after the terminator -- (e.g. @right-double-quote@, @)@, @]@) are tolerated. -- -- The splitter is deliberately simple: abbreviations like "e.g." or "Dr." -- will cause occasional over-splitting. That is acceptable for backlink -- previews, where a slightly short context is preferable to the complexity -- of abbreviation detection. splitSentences :: [Inline] -> [[Inline]] splitSentences = go [] where go acc [] = if null acc then [] else [reverse acc] go acc (tok : rest) | isTerminator tok && leadingBreak rest = reverse (tok : acc) : go [] (dropLeadingBreak rest) | otherwise = go (tok : acc) rest isTerminator :: Inline -> Bool isTerminator (Str s) = endsWithTerminator s isTerminator _ = False endsWithTerminator :: T.Text -> Bool endsWithTerminator t = case T.unsnoc (T.dropWhileEnd isClosingPunct t) of Just (_, c) -> c == '.' || c == '!' || c == '?' Nothing -> False isClosingPunct :: Char -> Bool isClosingPunct c = c `elem` (")]\"'\x201D\x2019" :: String) leadingBreak :: [Inline] -> Bool leadingBreak [] = True leadingBreak (Space : _) = True leadingBreak (SoftBreak : _) = True leadingBreak (LineBreak : _) = True leadingBreak _ = False dropLeadingBreak :: [Inline] -> [Inline] dropLeadingBreak (Space : xs) = xs dropLeadingBreak (SoftBreak : xs) = xs dropLeadingBreak (LineBreak : xs) = xs dropLeadingBreak xs = xs -- | Extract @LinkEntry@ records from a Pandoc document. -- For every internal link in a paragraph, emit an entry carrying the HTML -- of the sentence containing the link (default display) and the HTML of -- the full paragraph (hover/popup context). -- Recurses into Div, BlockQuote, BulletList, and OrderedList. extractLinksWithContext :: Pandoc -> [LinkEntry] extractLinksWithContext (Pandoc _ blocks) = concatMap go blocks where go :: Block -> [LinkEntry] go (Para inlines) = paraEntries inlines go (BlockQuote bs) = concatMap go bs go (Div _ bs) = concatMap go bs go (BulletList items) = concatMap (concatMap go) items go (OrderedList _ items) = concatMap (concatMap go) items go _ = [] paraEntries :: [Inline] -> [LinkEntry] paraEntries inlines = let paraHtml = renderInlines inlines sentences = splitSentences inlines in concatMap (sentenceEntries paraHtml) sentences sentenceEntries :: String -> [Inline] -> [LinkEntry] sentenceEntries paraHtml sentence = let urls = filter isPageLink (query getUrl sentence) in if null urls then [] else let sentHtml = renderInlines sentence in map (\u -> LinkEntry u sentHtml paraHtml) urls getUrl :: Inline -> [T.Text] getUrl (Link _ _ (url, _)) = [url] getUrl _ = [] -- --------------------------------------------------------------------------- -- Lightweight links compiler -- --------------------------------------------------------------------------- -- | Compile a source file lightly: parse the Markdown (wikilinks preprocessed), -- extract internal links with their paragraph context, and serialise as JSON. linksCompiler :: Compiler (Item String) linksCompiler = do body <- getResourceBody let src = itemBody body let body' = itemSetBody (preprocessSource src) body pandocItem <- readPandocWith readerOpts body' let entries = nubBy sameEntry (extractLinksWithContext (itemBody pandocItem)) makeItem . TL.unpack . TLE.decodeUtf8 . Aeson.encode $ entries where sameEntry a b = leUrl a == leUrl b && leSentence a == leSentence b && leParagraph a == leParagraph b -- --------------------------------------------------------------------------- -- URL normalisation -- --------------------------------------------------------------------------- -- | Normalise an internal URL as a map key: strip query string, fragment, -- and trailing @.html@; ensure a leading slash; percent-decode the path -- so that @\/essays\/caf%C3%A9@ and @\/essays\/café@ collide on the same -- key. normaliseUrl :: String -> String normaliseUrl url = let t = T.pack url t1 = fst (T.breakOn "?" (fst (T.breakOn "#" t))) t2 = if T.isPrefixOf "/" t1 then t1 else "/" `T.append` t1 t3 = fromMaybe t2 (T.stripSuffix ".html" t2) in percentDecode (T.unpack t3) -- | Decode percent-escapes (@%XX@) into raw bytes, then re-interpret the -- resulting bytestring as UTF-8. Invalid escapes are passed through -- verbatim so this is safe to call on already-decoded input. percentDecode :: String -> String percentDecode = T.unpack . TE.decodeUtf8With lenientDecode . pack . go where go [] = [] go ('%':a:b:rest) | Just hi <- hexDigit a , Just lo <- hexDigit b = fromIntegral (hi * 16 + lo) : go rest go (c:rest) = fromIntegral (fromEnum c) : go rest hexDigit c | c >= '0' && c <= '9' = Just (fromEnum c - fromEnum '0') | c >= 'a' && c <= 'f' = Just (fromEnum c - fromEnum 'a' + 10) | c >= 'A' && c <= 'F' = Just (fromEnum c - fromEnum 'A' + 10) | otherwise = Nothing pack = BS.pack lenientDecode = TE.lenientDecode -- --------------------------------------------------------------------------- -- Content patterns (must match the rules in Site.hs — sourced from -- Patterns.allContent so additions to the canonical list automatically -- propagate to backlinks). -- --------------------------------------------------------------------------- allContent :: Pattern allContent = P.allContent -- --------------------------------------------------------------------------- -- Hakyll rules -- --------------------------------------------------------------------------- -- | Register the @version "links"@ rules for all content and the -- @create ["data/backlinks.json"]@ rule. Call this from 'Site.rules'. backlinkRules :: Rules () backlinkRules = do -- Pass 1: extract links + context from each content file. match allContent $ version "links" $ compile linksCompiler -- Pass 2: invert the map and write the backlinks JSON. create ["data/backlinks.json"] $ do route idRoute compile $ do items <- loadAll (allContent .&&. hasVersion "links") :: Compiler [Item String] pairs <- concat <$> mapM toSourcePairs items makeItem . TL.unpack . TLE.decodeUtf8 . Aeson.encode $ Map.fromListWith (++) [(k, [v]) | (k, v) <- pairs] -- | For one "links" item, produce @(normalised-target-url, BacklinkSource)@ -- pairs — one per internal link found in the source file. toSourcePairs :: Item String -> Compiler [(T.Text, BacklinkSource)] toSourcePairs item = do let ident0 = setVersion Nothing (itemIdentifier item) mRoute <- getRoute ident0 meta <- getMetadata ident0 let srcUrl = maybe "" (\r -> "/" ++ r) mRoute let title = fromMaybe "(untitled)" (lookupString "title" meta) let abstract = fromMaybe "" (lookupString "abstract" meta) case mRoute of Nothing -> return [] Just _ -> case Aeson.decodeStrict (TE.encodeUtf8 (T.pack (itemBody item))) :: Maybe [LinkEntry] of Nothing -> return [] Just entries -> return [ ( T.pack (normaliseUrl (T.unpack (leUrl e))) , BacklinkSource srcUrl title abstract (leSentence e) (leParagraph e) ) | e <- entries ] -- --------------------------------------------------------------------------- -- Context field -- --------------------------------------------------------------------------- -- | Context field @$backlinks$@ that injects an HTML list of pages that link -- to the current page, each with its paragraph context. -- Returns @noResult@ (so @$if(backlinks)$@ is false) when there are none. backlinksField :: Context String backlinksField = field "backlinks" $ \item -> do blItem <- load (fromFilePath "data/backlinks.json") :: Compiler (Item String) case Aeson.decodeStrict (TE.encodeUtf8 (T.pack (itemBody blItem))) :: Maybe (Map T.Text [BacklinkSource]) of Nothing -> fail "backlinks: could not parse data/backlinks.json" Just blMap -> do mRoute <- getRoute (itemIdentifier item) case mRoute of Nothing -> fail "backlinks: item has no route" Just r -> let key = T.pack (normaliseUrl ("/" ++ r)) sources = fromMaybe [] (Map.lookup key blMap) sorted = sortBy (comparing blTitle) sources in if null sorted then fail "no backlinks" else return (renderBacklinks sorted) -- --------------------------------------------------------------------------- -- HTML rendering -- --------------------------------------------------------------------------- -- | Render backlink sources as an HTML list. Each item shows: -- * the source title as a link (serif body face), -- * a
of the sentence containing the link (default context), -- * a small hoverable "¶" affordance that reveals the full paragraph in -- a CSS-driven popup when hovered or keyboard-focused. -- -- 'blSentence' and 'blParagraph' are already HTML fragments produced by -- the Pandoc writer, so they are emitted unescaped. renderBacklinks :: [BacklinkSource] -> String renderBacklinks sources = "\n" ++ concatMap renderOne sources ++ "
" where renderOne bl = "" ++ "" ++ escapeHtml (blTitle bl) ++ "" ++ ( if null (blSentence bl) then "" else " \n" paragraphAffordance bl | null (blParagraph bl) = "" | blParagraph bl == blSentence bl = "" | otherwise = "" ++ "" ++ "" ++ blParagraph bl ++ "" ++ """ ++ blSentence bl ++ paragraphAffordance bl ++ "" ) ++ "