413 lines
17 KiB
Haskell
413 lines
17 KiB
Haskell
{-# LANGUAGE GHC2021 #-}
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
-- | Backlinks with context: build-time computation of which pages link to
|
|
-- each page, including the sentence that contains each link.
|
|
--
|
|
-- Architecture (dependency-correct, no circular deps):
|
|
--
|
|
-- 1. Each content file is compiled under @version "links"@: a lightweight
|
|
-- pass that parses the source, walks the AST block-by-block, splits
|
|
-- each paragraph into sentences, and for every internal link records
|
|
-- the URL *and* the HTML of the sentence that contains it. The result
|
|
-- is serialised as a JSON array of @{url, context}@ objects.
|
|
--
|
|
-- 2. A @create ["data/backlinks.json"]@ rule loads all "links" items,
|
|
-- inverts the map, and serialises
|
|
-- @target → [{url, title, abstract, context}]@ as JSON.
|
|
--
|
|
-- 3. @backlinksField@ loads that JSON at page render time and injects
|
|
-- an HTML list showing each source's title and a quoted sentence of
|
|
-- context. The @load@ call establishes a proper Hakyll dependency so
|
|
-- pages recompile when backlinks change.
|
|
--
|
|
-- Dependency order (no cycles):
|
|
-- content "links" versions → data/backlinks.json → content default versions
|
|
module Backlinks
|
|
( backlinkRules
|
|
, backlinksField
|
|
) where
|
|
|
|
import Data.List (nubBy, sortBy)
|
|
import Data.Ord (comparing)
|
|
import Data.Maybe (fromMaybe)
|
|
import qualified Data.Map.Strict as Map
|
|
import Data.Map.Strict (Map)
|
|
import qualified Data.ByteString as BS
|
|
import qualified Data.Text as T
|
|
import qualified Data.Text.Lazy as TL
|
|
import qualified Data.Text.Lazy.Encoding as TLE
|
|
import qualified Data.Text.Encoding as TE
|
|
import qualified Data.Text.Encoding.Error as TE
|
|
import qualified Data.Aeson as Aeson
|
|
import Data.Aeson ((.=))
|
|
import Text.Pandoc.Class (runPure)
|
|
import Text.Pandoc.Writers (writeHtml5String)
|
|
import Text.Pandoc.Definition (Block (..), Inline (..), Pandoc (..),
|
|
nullMeta)
|
|
import Text.Pandoc.Options (WriterOptions (..), HTMLMathMethod (..))
|
|
import Text.Pandoc.Walk (query)
|
|
import Hakyll
|
|
import Compilers (readerOpts, writerOpts)
|
|
import Filters (preprocessSource)
|
|
import qualified Patterns as P
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Link-with-context entry (intermediate, saved by the "links" pass)
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | One internal link discovered by the @version "links"@ pass, together
-- with the HTML context it appeared in. Serialised to JSON so the
-- @data/backlinks.json@ rule can consume it in a later pass.
data LinkEntry = LinkEntry
  { leUrl :: T.Text -- ^ internal URL (as found in the AST)
  , leSentence :: String -- ^ HTML of the sentence containing the link
  , leParagraph :: String -- ^ HTML of the full surrounding paragraph
  } deriving (Show, Eq)
|
|
|
|
-- | Serialise as a @{url, sentence, paragraph}@ JSON object.
instance Aeson.ToJSON LinkEntry where
  toJSON (LinkEntry url sentence paragraph) =
    Aeson.object
      [ "url" .= url
      , "sentence" .= sentence
      , "paragraph" .= paragraph
      ]
|
|
|
|
-- | Parse a @{url, sentence, paragraph}@ JSON object back into a 'LinkEntry'.
instance Aeson.FromJSON LinkEntry where
  parseJSON = Aeson.withObject "LinkEntry" $ \o -> do
    url <- o Aeson..: "url"
    sentence <- o Aeson..: "sentence"
    paragraph <- o Aeson..: "paragraph"
    pure (LinkEntry url sentence paragraph)
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Backlink source record (stored in data/backlinks.json)
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | One page that links to a given target, as stored under the target's
-- key in @data/backlinks.json@. The sentence and paragraph fields are
-- HTML fragments produced by the Pandoc writer during the "links" pass.
data BacklinkSource = BacklinkSource
  { blUrl :: String -- ^ routed URL of the linking page (leading slash)
  , blTitle :: String -- ^ title metadata of the linking page
  , blAbstract :: String -- ^ abstract metadata (empty string when absent)
  , blSentence :: String -- ^ raw HTML of the sentence containing the link
  , blParagraph :: String -- ^ raw HTML of the full paragraph (hover popup)
  } deriving (Show, Eq, Ord)
|
|
|
|
-- | Serialise as a @{url, title, abstract, sentence, paragraph}@ object.
instance Aeson.ToJSON BacklinkSource where
  toJSON (BacklinkSource url title abstract sentence paragraph) =
    Aeson.object
      [ "url" .= url
      , "title" .= title
      , "abstract" .= abstract
      , "sentence" .= sentence
      , "paragraph" .= paragraph
      ]
|
|
|
|
-- | Parse a @{url, title, abstract, sentence, paragraph}@ object.
instance Aeson.FromJSON BacklinkSource where
  parseJSON = Aeson.withObject "BacklinkSource" $ \o -> do
    url <- o Aeson..: "url"
    title <- o Aeson..: "title"
    abstract <- o Aeson..: "abstract"
    sentence <- o Aeson..: "sentence"
    paragraph <- o Aeson..: "paragraph"
    pure (BacklinkSource url title abstract sentence paragraph)
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Writer options for context rendering
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Minimal writer options for rendering paragraph context: no template
-- (fragment only), plain math fallback (context excerpts are previews, not
-- full renders, and KaTeX CSS may not be loaded on all target pages).
contextWriterOpts :: WriterOptions
contextWriterOpts = writerOpts
  { writerTemplate = Nothing -- bare HTML fragment, no page wrapper
  , writerHTMLMathMethod = PlainMath -- math as plain text, no KaTeX markup
  }
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Context extraction
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | URL filter: keep only links that can resolve to a page on this site.
-- Rejects external links (@http://@, @https://@, and protocol-relative
-- @//host/...@), pseudo-schemes (@mailto:@, @tel:@), anchor-only
-- fragments, empty URLs, and static-asset paths (matched by file
-- extension, case-insensitively).
isPageLink :: T.Text -> Bool
isPageLink u =
  not (T.null u)
    && not (any (`T.isPrefixOf` u) externalPrefixes)
    && not (hasStaticExt u)
  where
    -- Protocol-relative "//host/path" URLs are external too; the earlier
    -- prefix list (http/https only) let them through.
    externalPrefixes =
      map T.pack ["http://", "https://", "//", "#", "mailto:", "tel:"]
    staticExts = map T.pack
      [".pdf",".svg",".png",".jpg",".jpeg",".webp",
       ".mp3",".mp4",".woff2",".woff",".ttf",".ico",
       ".json",".asc",".xml",".gz",".zip"]
    hasStaticExt x = any (`T.isSuffixOf` T.toLower x) staticExts
|
|
|
|
-- | Render a list of inlines to an HTML fragment string.
-- Uses Plain (not Para) so no wrapping <p> is emitted — callers add
-- their own. Any writer failure yields the empty string.
renderInlines :: [Inline] -> String
renderInlines inlines =
  either (const "") T.unpack
    (runPure (writeHtml5String contextWriterOpts fragment))
  where
    fragment = Pandoc nullMeta [Plain inlines]
|
|
|
|
-- | Split a list of inlines into sentences by terminator punctuation.
--
-- A @Str@ whose last non-closing-punctuation character is @.@, @!@, or @?@
-- ends a sentence when followed by @Space@, @SoftBreak@, @LineBreak@, or
-- end-of-list. Closing quote/bracket characters after the terminator
-- (e.g. @right-double-quote@, @)@, @]@) are tolerated.
--
-- The splitter is deliberately simple: abbreviations like "e.g." or "Dr."
-- will cause occasional over-splitting. That is acceptable for backlink
-- previews, where a slightly short context is preferable to the complexity
-- of abbreviation detection.
splitSentences :: [Inline] -> [[Inline]]
splitSentences = collect []
  where
    -- 'pending' holds the current sentence in reverse order.
    collect pending [] =
      case pending of
        [] -> []
        _ -> [reverse pending]
    collect pending (i : is)
      | isTerminator i && leadingBreak is =
          reverse (i : pending) : collect [] (dropLeadingBreak is)
      | otherwise =
          collect (i : pending) is
|
|
|
|
-- | Only a 'Str' can end a sentence; check its trailing punctuation.
isTerminator :: Inline -> Bool
isTerminator inline =
  case inline of
    Str s -> endsWithTerminator s
    _ -> False
|
|
|
|
-- | Does the text end in @.@, @!@, or @?@ once trailing closing
-- punctuation (quotes, brackets) has been stripped?
endsWithTerminator :: T.Text -> Bool
endsWithTerminator t =
  maybe False ((`elem` (".!?" :: String)) . snd)
    (T.unsnoc (T.dropWhileEnd isClosingPunct t))
|
|
|
|
-- | Closing quote/bracket characters tolerated after a terminator:
-- @)@, @]@, straight quotes, and curly right-double/right-single quotes.
isClosingPunct :: Char -> Bool
isClosingPunct c =
  c `elem` [')', ']', '"', '\'', '\x201D', '\x2019']
|
|
|
|
-- | Does the inline stream start with a break (or end altogether)?
leadingBreak :: [Inline] -> Bool
leadingBreak [] = True
leadingBreak (x : _) = x == Space || x == SoftBreak || x == LineBreak
|
|
|
|
-- | Drop a single leading break inline, if present.
dropLeadingBreak :: [Inline] -> [Inline]
dropLeadingBreak xs =
  case xs of
    (x : rest) | x == Space || x == SoftBreak || x == LineBreak -> rest
    _ -> xs
|
|
|
|
-- | Extract 'LinkEntry' records from a Pandoc document.
-- For every internal link in a paragraph, emit an entry carrying the HTML
-- of the sentence containing the link (default display) and the HTML of
-- the full paragraph (hover/popup context).
-- Recurses into Div, BlockQuote, BulletList, and OrderedList.
extractLinksWithContext :: Pandoc -> [LinkEntry]
extractLinksWithContext (Pandoc _ blocks) = blocks >>= walkBlock
  where
    walkBlock :: Block -> [LinkEntry]
    walkBlock b =
      case b of
        Para inlines -> fromPara inlines
        BlockQuote bs -> bs >>= walkBlock
        Div _ bs -> bs >>= walkBlock
        BulletList items -> concat items >>= walkBlock
        OrderedList _ items -> concat items >>= walkBlock
        _ -> []

    -- Render the paragraph once; each sentence shares that context.
    fromPara :: [Inline] -> [LinkEntry]
    fromPara inlines =
      let paraHtml = renderInlines inlines
      in splitSentences inlines >>= fromSentence paraHtml

    -- Only render the sentence when it actually contains internal links.
    fromSentence :: String -> [Inline] -> [LinkEntry]
    fromSentence paraHtml sentence =
      case filter isPageLink (query linkUrl sentence) of
        [] -> []
        urls ->
          let sentHtml = renderInlines sentence
          in [LinkEntry u sentHtml paraHtml | u <- urls]

    linkUrl :: Inline -> [T.Text]
    linkUrl (Link _ _ (url, _)) = [url]
    linkUrl _ = []
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Lightweight links compiler
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Compile a source file lightly: parse the Markdown (wikilinks
-- preprocessed), extract internal links with their paragraph context,
-- de-duplicate, and serialise the result as JSON.
linksCompiler :: Compiler (Item String)
linksCompiler = do
  raw <- getResourceBody
  let preprocessed = itemSetBody (preprocessSource (itemBody raw)) raw
  parsed <- readPandocWith readerOpts preprocessed
  let entries =
        nubBy identicalEntry (extractLinksWithContext (itemBody parsed))
  makeItem (TL.unpack (TLE.decodeUtf8 (Aeson.encode entries)))
  where
    -- Full structural equality; duplicates arise when the same URL occurs
    -- twice in one sentence.
    identicalEntry a b =
      leUrl a == leUrl b
        && leSentence a == leSentence b
        && leParagraph a == leParagraph b
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- URL normalisation
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Normalise an internal URL as a map key: strip query string, fragment,
-- and trailing @.html@; ensure a leading slash; percent-decode the path
-- so that @\/essays\/caf%C3%A9@ and @\/essays\/café@ collide on the same
-- key.
normaliseUrl :: String -> String
normaliseUrl url = percentDecode (T.unpack cleaned)
  where
    -- Cut at the first '#' or '?' (whichever comes first).
    path = T.takeWhile (\c -> c /= '#' && c /= '?') (T.pack url)
    rooted
      | T.isPrefixOf (T.pack "/") path = path
      | otherwise = T.cons '/' path
    cleaned = fromMaybe rooted (T.stripSuffix (T.pack ".html") rooted)
|
|
|
|
-- | Decode percent-escapes (@%XX@) into raw bytes, then re-interpret the
-- resulting byte string as UTF-8. Invalid escapes are passed through
-- verbatim. Literal (non-escape) characters are emitted as their UTF-8
-- byte sequences, so already-decoded input — including non-ASCII text —
-- round-trips unchanged.
percentDecode :: String -> String
percentDecode =
  T.unpack . TE.decodeUtf8With TE.lenientDecode . BS.pack . go
  where
    go [] = []
    go ('%':a:b:rest)
      | Just hi <- hexDigit a
      , Just lo <- hexDigit b
      = fromIntegral (hi * 16 + lo) : go rest
    -- Bug fix: the previous version emitted @fromIntegral (fromEnum c)@,
    -- truncating any Char above U+00FF and turning Latin-1 range chars
    -- (e.g. 'é' = 0xE9) into invalid lone UTF-8 bytes that the lenient
    -- decoder replaced with U+FFFD. Emit the char's real UTF-8 bytes.
    go (c:rest) = BS.unpack (TE.encodeUtf8 (T.singleton c)) ++ go rest

    -- One hex digit to its value; Nothing for non-hex characters, which
    -- makes the guards above fall through to verbatim pass-through.
    hexDigit c
      | c >= '0' && c <= '9' = Just (fromEnum c - fromEnum '0')
      | c >= 'a' && c <= 'f' = Just (fromEnum c - fromEnum 'a' + 10)
      | c >= 'A' && c <= 'F' = Just (fromEnum c - fromEnum 'A' + 10)
      | otherwise = Nothing
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Content patterns (must match the rules in Site.hs — sourced from
|
|
-- Patterns.allContent so additions to the canonical list automatically
|
|
-- propagate to backlinks).
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | The canonical content pattern, aliased locally. Both this module and
-- Site.hs read 'P.allContent', so additions to the canonical list
-- automatically propagate to the backlink rules.
allContent :: Pattern
allContent = P.allContent
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Hakyll rules
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Register the @version "links"@ rules for all content and the
-- @create ["data/backlinks.json"]@ rule. Call this from 'Site.rules'.
backlinkRules :: Rules ()
backlinkRules = do
  -- Pass 1: extract links + context from each content file.
  match allContent (version "links" (compile linksCompiler))

  -- Pass 2: invert the map and write the backlinks JSON.
  create ["data/backlinks.json"] $ do
    route idRoute
    compile $ do
      linkItems <- loadAll (allContent .&&. hasVersion "links")
                     :: Compiler [Item String]
      pairs <- concat <$> mapM toSourcePairs linkItems
      let inverted = Map.fromListWith (++) [(k, [v]) | (k, v) <- pairs]
      makeItem (TL.unpack (TLE.decodeUtf8 (Aeson.encode inverted)))
|
|
|
|
-- | For one "links" item, produce @(normalised-target-url, BacklinkSource)@
-- pairs — one per internal link found in the source file. Unrouted items
-- and unparsable bodies yield no pairs.
toSourcePairs :: Item String -> Compiler [(T.Text, BacklinkSource)]
toSourcePairs item = do
  let ident = setVersion Nothing (itemIdentifier item)
  mRoute <- getRoute ident
  meta <- getMetadata ident
  case mRoute of
    Nothing -> return []
    Just r -> do
      let srcUrl = "/" ++ r
          title = fromMaybe "(untitled)" (lookupString "title" meta)
          abstract = fromMaybe "" (lookupString "abstract" meta)
          decoded = Aeson.decodeStrict (TE.encodeUtf8 (T.pack (itemBody item)))
                      :: Maybe [LinkEntry]
      case decoded of
        Nothing -> return []
        Just entries -> return
          [ ( T.pack (normaliseUrl (T.unpack (leUrl e)))
            , BacklinkSource srcUrl title abstract
                (leSentence e) (leParagraph e)
            )
          | e <- entries
          ]
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Context field
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Context field @$backlinks$@ that injects an HTML list of pages that link
|
|
-- to the current page, each with its paragraph context.
|
|
-- Returns @noResult@ (so @$if(backlinks)$@ is false) when there are none.
|
|
backlinksField :: Context String
|
|
backlinksField = field "backlinks" $ \item -> do
|
|
blItem <- load (fromFilePath "data/backlinks.json") :: Compiler (Item String)
|
|
case Aeson.decodeStrict (TE.encodeUtf8 (T.pack (itemBody blItem)))
|
|
:: Maybe (Map T.Text [BacklinkSource]) of
|
|
Nothing -> fail "backlinks: could not parse data/backlinks.json"
|
|
Just blMap -> do
|
|
mRoute <- getRoute (itemIdentifier item)
|
|
case mRoute of
|
|
Nothing -> fail "backlinks: item has no route"
|
|
Just r ->
|
|
let key = T.pack (normaliseUrl ("/" ++ r))
|
|
sources = fromMaybe [] (Map.lookup key blMap)
|
|
sorted = sortBy (comparing blTitle) sources
|
|
in if null sorted
|
|
then fail "no backlinks"
|
|
else return (renderBacklinks sorted)
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- HTML rendering
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Render backlink sources as an HTML list. Each item shows:
-- * the source title as a link (serif body face),
-- * a <blockquote> of the sentence containing the link (default context),
-- * a small hoverable "¶" affordance that reveals the full paragraph in
--   a CSS-driven popup when hovered or keyboard-focused.
--
-- 'blSentence' and 'blParagraph' are already HTML fragments produced by
-- the Pandoc writer, so they are emitted unescaped; the title and URL
-- are escaped here.
renderBacklinks :: [BacklinkSource] -> String
renderBacklinks sources = concat
  [ "<ul class=\"backlinks-list\">\n"
  , concatMap listItem sources
  , "</ul>"
  ]
  where
    listItem bl = concat
      [ "<li class=\"backlink-item\">"
      , "<a class=\"backlink-source\" href=\""
      , escapeHtml (blUrl bl)
      , "\">"
      , escapeHtml (blTitle bl)
      , "</a>"
      , quoteBlock bl
      , "</li>\n"
      ]

    -- Omit the blockquote entirely when there is no sentence context.
    quoteBlock bl
      | null (blSentence bl) = ""
      | otherwise = concat
          [ "<blockquote class=\"backlink-quote\">"
          , blSentence bl
          , paragraphAffordance bl
          , "</blockquote>"
          ]

    -- No affordance when the paragraph adds nothing over the sentence.
    paragraphAffordance bl
      | null (blParagraph bl) = ""
      | blParagraph bl == blSentence bl = ""
      | otherwise = concat
          [ "<span class=\"backlink-full\">"
          , "<button type=\"button\" class=\"backlink-full-trigger\""
          , " aria-label=\"Show full paragraph\" tabindex=\"0\">\x00B6</button>"
          , "<span class=\"backlink-full-popup\" role=\"tooltip\">"
          , blParagraph bl
          , "</span>"
          , "</span>"
          ]
|