302 lines
13 KiB
Haskell
302 lines
13 KiB
Haskell
{-# LANGUAGE GHC2021 #-}
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
-- | Backlinks with context: build-time computation of which pages link to
|
|
-- each page, including the paragraph that contains each link.
|
|
--
|
|
-- Architecture (dependency-correct, no circular deps):
|
|
--
|
|
-- 1. Each content file is compiled under @version "links"@: a lightweight
|
|
-- pass that parses the source, walks the AST block-by-block, and for
|
|
-- every internal link records the URL *and* the HTML of its surrounding
|
|
-- paragraph. The result is serialised as a JSON array of
|
|
-- @{url, context}@ objects.
|
|
--
|
|
-- 2. A @create ["data/backlinks.json"]@ rule loads all "links" items,
|
|
-- inverts the map, and serialises
|
|
-- @target → [{url, title, abstract, context}]@ as JSON.
|
|
--
|
|
-- 3. @backlinksField@ loads that JSON at page render time and injects
|
|
-- an HTML list showing each source's title and context paragraph.
|
|
-- The @load@ call establishes a proper Hakyll dependency so pages
|
|
-- recompile when backlinks change.
|
|
--
|
|
-- Dependency order (no cycles):
|
|
-- content "links" versions → data/backlinks.json → content default versions
|
|
module Backlinks
|
|
( backlinkRules
|
|
, backlinksField
|
|
) where
|
|
|
|
import Data.List (nubBy, sortBy)
|
|
import Data.Ord (comparing)
|
|
import Data.Maybe (fromMaybe)
|
|
import qualified Data.Map.Strict as Map
|
|
import Data.Map.Strict (Map)
|
|
import qualified Data.Text as T
|
|
import qualified Data.Text.Lazy as TL
|
|
import qualified Data.Text.Lazy.Encoding as TLE
|
|
import qualified Data.Text.Encoding as TE
|
|
import qualified Data.Aeson as Aeson
|
|
import Data.Aeson ((.=))
|
|
import Text.Pandoc.Class (runPure)
|
|
import Text.Pandoc.Writers (writeHtml5String)
|
|
import Text.Pandoc.Definition (Block (..), Inline (..), Pandoc (..),
|
|
nullMeta)
|
|
import Text.Pandoc.Options (WriterOptions (..), HTMLMathMethod (..))
|
|
import Text.Pandoc.Walk (query)
|
|
import Hakyll
|
|
import Compilers (readerOpts, writerOpts)
|
|
import Filters (preprocessSource)
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Link-with-context entry (intermediate, saved by the "links" pass)
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
data LinkEntry = LinkEntry
|
|
{ leUrl :: T.Text -- internal URL (as found in the AST)
|
|
, leContext :: String -- HTML of the surrounding paragraph
|
|
} deriving (Show, Eq)
|
|
|
|
instance Aeson.ToJSON LinkEntry where
|
|
toJSON e = Aeson.object ["url" .= leUrl e, "context" .= leContext e]
|
|
|
|
instance Aeson.FromJSON LinkEntry where
|
|
parseJSON = Aeson.withObject "LinkEntry" $ \o ->
|
|
LinkEntry <$> o Aeson..: "url" <*> o Aeson..: "context"
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Backlink source record (stored in data/backlinks.json)
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
data BacklinkSource = BacklinkSource
|
|
{ blUrl :: String
|
|
, blTitle :: String
|
|
, blAbstract :: String
|
|
, blContext :: String -- raw HTML of the paragraph containing the link
|
|
} deriving (Show, Eq, Ord)
|
|
|
|
instance Aeson.ToJSON BacklinkSource where
|
|
toJSON bl = Aeson.object
|
|
[ "url" .= blUrl bl
|
|
, "title" .= blTitle bl
|
|
, "abstract" .= blAbstract bl
|
|
, "context" .= blContext bl
|
|
]
|
|
|
|
instance Aeson.FromJSON BacklinkSource where
|
|
parseJSON = Aeson.withObject "BacklinkSource" $ \o ->
|
|
BacklinkSource
|
|
<$> o Aeson..: "url"
|
|
<*> o Aeson..: "title"
|
|
<*> o Aeson..: "abstract"
|
|
<*> o Aeson..: "context"
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Writer options for context rendering
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Minimal writer options for rendering paragraph context: no template
|
|
-- (fragment only), plain math fallback (context excerpts are previews, not
|
|
-- full renders, and KaTeX CSS may not be loaded on all target pages).
|
|
contextWriterOpts :: WriterOptions
|
|
contextWriterOpts = writerOpts
|
|
{ writerTemplate = Nothing
|
|
, writerHTMLMathMethod = PlainMath
|
|
}
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Context extraction
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | URL filter: skip external links, pseudo-schemes, anchor-only fragments,
|
|
-- and static-asset paths.
|
|
isPageLink :: T.Text -> Bool
|
|
isPageLink u =
|
|
not (T.isPrefixOf "http://" u) &&
|
|
not (T.isPrefixOf "https://" u) &&
|
|
not (T.isPrefixOf "#" u) &&
|
|
not (T.isPrefixOf "mailto:" u) &&
|
|
not (T.isPrefixOf "tel:" u) &&
|
|
not (T.null u) &&
|
|
not (hasStaticExt u)
|
|
where
|
|
staticExts = [".pdf",".svg",".png",".jpg",".jpeg",".webp",
|
|
".mp3",".mp4",".woff2",".woff",".ttf",".ico",
|
|
".json",".asc",".xml",".gz",".zip"]
|
|
hasStaticExt x = any (`T.isSuffixOf` T.toLower x) staticExts
|
|
|
|
-- | Render a list of inlines to an HTML fragment string.
|
|
-- Uses Plain (not Para) to avoid a wrapping <p> — callers add their own.
|
|
renderInlines :: [Inline] -> String
|
|
renderInlines inlines =
|
|
case runPure (writeHtml5String contextWriterOpts doc) of
|
|
Left _ -> ""
|
|
Right txt -> T.unpack txt
|
|
where
|
|
doc = Pandoc nullMeta [Plain inlines]
|
|
|
|
-- | Extract @(internal-url, context-html)@ pairs from a Pandoc document.
|
|
-- Context is the HTML of the immediate surrounding paragraph.
|
|
-- Recurses into Div, BlockQuote, BulletList, and OrderedList.
|
|
extractLinksWithContext :: Pandoc -> [LinkEntry]
|
|
extractLinksWithContext (Pandoc _ blocks) = concatMap go blocks
|
|
where
|
|
go :: Block -> [LinkEntry]
|
|
go (Para inlines) = paraEntries inlines
|
|
go (BlockQuote bs) = concatMap go bs
|
|
go (Div _ bs) = concatMap go bs
|
|
go (BulletList items) = concatMap (concatMap go) items
|
|
go (OrderedList _ items) = concatMap (concatMap go) items
|
|
go _ = []
|
|
|
|
-- For a Para block: find all internal links it contains, and for each
|
|
-- return a LinkEntry with the paragraph's HTML as context.
|
|
paraEntries :: [Inline] -> [LinkEntry]
|
|
paraEntries inlines =
|
|
let urls = filter isPageLink (query getUrl inlines)
|
|
in if null urls then []
|
|
else
|
|
let ctx = renderInlines inlines
|
|
in map (\u -> LinkEntry u ctx) urls
|
|
|
|
getUrl :: Inline -> [T.Text]
|
|
getUrl (Link _ _ (url, _)) = [url]
|
|
getUrl _ = []
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Lightweight links compiler
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Compile a source file lightly: parse the Markdown (wikilinks preprocessed),
|
|
-- extract internal links with their paragraph context, and serialise as JSON.
|
|
linksCompiler :: Compiler (Item String)
|
|
linksCompiler = do
|
|
body <- getResourceBody
|
|
let src = itemBody body
|
|
let body' = itemSetBody (preprocessSource src) body
|
|
pandocItem <- readPandocWith readerOpts body'
|
|
let entries = nubBy (\a b -> leUrl a == leUrl b && leContext a == leContext b)
|
|
(extractLinksWithContext (itemBody pandocItem))
|
|
makeItem . TL.unpack . TLE.decodeUtf8 . Aeson.encode $ entries
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- URL normalisation
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Normalise an internal URL as a map key: strip query string, fragment,
|
|
-- and trailing @.html@; ensure a leading slash.
|
|
normaliseUrl :: String -> String
|
|
normaliseUrl url =
|
|
let t = T.pack url
|
|
t1 = fst (T.breakOn "?" (fst (T.breakOn "#" t)))
|
|
t2 = if T.isPrefixOf "/" t1 then t1 else "/" `T.append` t1
|
|
t3 = fromMaybe t2 (T.stripSuffix ".html" t2)
|
|
in T.unpack t3
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Content patterns (must match the rules in Site.hs)
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
allContent :: Pattern
|
|
allContent =
|
|
"content/essays/*.md"
|
|
.||. "content/blog/*.md"
|
|
.||. "content/poetry/*.md"
|
|
.||. "content/fiction/*.md"
|
|
.||. "content/music/*/index.md"
|
|
.||. "content/*.md"
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Hakyll rules
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Register the @version "links"@ rules for all content and the
|
|
-- @create ["data/backlinks.json"]@ rule. Call this from 'Site.rules'.
|
|
backlinkRules :: Rules ()
|
|
backlinkRules = do
|
|
-- Pass 1: extract links + context from each content file.
|
|
match allContent $ version "links" $
|
|
compile linksCompiler
|
|
|
|
-- Pass 2: invert the map and write the backlinks JSON.
|
|
create ["data/backlinks.json"] $ do
|
|
route idRoute
|
|
compile $ do
|
|
items <- loadAll (allContent .&&. hasVersion "links")
|
|
:: Compiler [Item String]
|
|
pairs <- concat <$> mapM toSourcePairs items
|
|
makeItem . TL.unpack . TLE.decodeUtf8 . Aeson.encode
|
|
$ Map.fromListWith (++) [(k, [v]) | (k, v) <- pairs]
|
|
|
|
-- | For one "links" item, produce @(normalised-target-url, BacklinkSource)@
|
|
-- pairs — one per internal link found in the source file.
|
|
toSourcePairs :: Item String -> Compiler [(T.Text, BacklinkSource)]
|
|
toSourcePairs item = do
|
|
let ident0 = setVersion Nothing (itemIdentifier item)
|
|
mRoute <- getRoute ident0
|
|
meta <- getMetadata ident0
|
|
let srcUrl = maybe "" (\r -> "/" ++ r) mRoute
|
|
let title = fromMaybe "(untitled)" (lookupString "title" meta)
|
|
let abstract = fromMaybe "" (lookupString "abstract" meta)
|
|
case mRoute of
|
|
Nothing -> return []
|
|
Just _ ->
|
|
case Aeson.decodeStrict (TE.encodeUtf8 (T.pack (itemBody item)))
|
|
:: Maybe [LinkEntry] of
|
|
Nothing -> return []
|
|
Just entries ->
|
|
return [ ( T.pack (normaliseUrl (T.unpack (leUrl e)))
|
|
, BacklinkSource srcUrl title abstract (leContext e)
|
|
)
|
|
| e <- entries ]
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Context field
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Context field @$backlinks$@ that injects an HTML list of pages that link
|
|
-- to the current page, each with its paragraph context.
|
|
-- Returns @noResult@ (so @$if(backlinks)$@ is false) when there are none.
|
|
backlinksField :: Context String
|
|
backlinksField = field "backlinks" $ \item -> do
|
|
blItem <- load (fromFilePath "data/backlinks.json") :: Compiler (Item String)
|
|
case Aeson.decodeStrict (TE.encodeUtf8 (T.pack (itemBody blItem)))
|
|
:: Maybe (Map T.Text [BacklinkSource]) of
|
|
Nothing -> fail "backlinks: could not parse data/backlinks.json"
|
|
Just blMap -> do
|
|
mRoute <- getRoute (itemIdentifier item)
|
|
case mRoute of
|
|
Nothing -> fail "backlinks: item has no route"
|
|
Just r ->
|
|
let key = T.pack (normaliseUrl ("/" ++ r))
|
|
sources = fromMaybe [] (Map.lookup key blMap)
|
|
sorted = sortBy (comparing blTitle) sources
|
|
in if null sorted
|
|
then fail "no backlinks"
|
|
else return (renderBacklinks sorted)
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- HTML rendering
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Render backlink sources as an HTML list.
|
|
-- Each item shows the source title as a link (always visible) and a
|
|
-- <details> element containing the context paragraph (collapsed by default).
|
|
-- @blContext@ is already HTML produced by the Pandoc writer — not escaped.
|
|
renderBacklinks :: [BacklinkSource] -> String
|
|
renderBacklinks sources =
|
|
"<ul class=\"backlinks-list\">\n"
|
|
++ concatMap renderOne sources
|
|
++ "</ul>"
|
|
where
|
|
renderOne bl =
|
|
"<li class=\"backlink-item\">"
|
|
++ "<a class=\"backlink-source\" href=\"" ++ escapeHtml (blUrl bl) ++ "\">"
|
|
++ escapeHtml (blTitle bl) ++ "</a>"
|
|
++ ( if null (blContext bl) then ""
|
|
else "<details class=\"backlink-details\">"
|
|
++ "<summary class=\"backlink-summary\">context</summary>"
|
|
++ "<div class=\"backlink-context\">" ++ blContext bl ++ "</div>"
|
|
++ "</details>" )
|
|
++ "</li>\n"
|