levineuwirth.org/build/Citations.hs

218 lines
9.6 KiB
Haskell

{-# LANGUAGE GHC2021 #-}
{-# LANGUAGE OverloadedStrings #-}
-- | Citation processing pipeline.
--
-- Steps:
-- 1. Skip if the document contains no Cite nodes and frKeys is empty.
-- 2. Inject default bibliography / CSL metadata if absent.
-- 3. Inject nocite entries for further-reading keys.
-- 4. Run Pandoc's citeproc to resolve references and generate bibliography.
-- 5. Walk the AST and replace Cite nodes with numbered superscripts.
-- 6. Extract the citeproc bibliography div from the body, reorder by
-- first-appearance, split into cited / further-reading sections,
-- and render to an HTML string for the template's $bibliography$ field.
--
-- Returns (Pandoc without refs div, bibliography HTML).
-- The bibliography HTML is empty when there are no citations.
--
-- NOTE: processCitations with in-text CSL leaves Cite nodes as Cite nodes
-- in the AST — it only populates their inline content and creates the refs
-- div. The HTML writer later wraps them in <span class="citation">. We must
-- therefore match Cite nodes (not Span nodes) in our transform pass.
--
-- NOTE: Hakyll strips YAML frontmatter before passing to readPandocWith, so
-- the Pandoc Meta is empty. further-reading keys are passed explicitly by the
-- caller (read from Hakyll's own metadata via lookupStringList).
--
-- NOTE: Does not import Contexts to avoid cycles.
module Citations (applyCitations) where
import Data.List (intercalate, nub, partition, sortBy)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Maybe (fromMaybe, mapMaybe)
import Data.Ord (comparing)
import Data.Text (Text)
import qualified Data.Text as T
import Text.Pandoc
import Text.Pandoc.Citeproc (processCitations)
import Text.Pandoc.Walk
-- ---------------------------------------------------------------------------
-- Public API
-- ---------------------------------------------------------------------------
-- | Process citations in a Pandoc document.
-- @frKeys@: further-reading citation keys (read from Hakyll metadata by
-- the caller, since Hakyll strips YAML frontmatter before parsing).
-- Returns @(body, citedHtml, furtherHtml)@ where @body@ has Cite nodes
-- replaced with numbered superscripts and no bibliography div,
-- @citedHtml@ is the inline-cited references HTML, and @furtherHtml@ is
-- the further-reading-only references HTML (each empty when absent).
applyCitations :: [Text] -> Pandoc -> IO (Pandoc, Text, Text)
applyCitations frKeys doc
| not (hasCitations frKeys doc) = return (doc, "", "")
| otherwise = do
let doc1 = injectMeta frKeys doc
processed <- runIOorExplode $ processCitations doc1
let (body, citedHtml, furtherHtml) = transformAndExtract frKeys processed
return (body, citedHtml, furtherHtml)
-- ---------------------------------------------------------------------------
-- Detection
-- ---------------------------------------------------------------------------
-- | True if the document has inline [@key] cites or a further-reading list.
hasCitations :: [Text] -> Pandoc -> Bool
hasCitations frKeys doc =
not (null (query collectCites doc))
|| not (null frKeys)
where
collectCites (Cite {}) = [()]
collectCites _ = []
-- ---------------------------------------------------------------------------
-- Metadata injection
-- ---------------------------------------------------------------------------
-- | Inject default bibliography / CSL paths and nocite for further-reading.
injectMeta :: [Text] -> Pandoc -> Pandoc
injectMeta frKeys (Pandoc meta blocks) =
let meta1 = if null frKeys then meta
else insertMeta "nocite" (nociteVal frKeys) meta
meta2 = case lookupMeta "bibliography" meta1 of
Nothing -> insertMeta "bibliography"
(MetaString "data/bibliography.bib") meta1
Just _ -> meta1
meta3 = case lookupMeta "csl" meta2 of
Nothing -> insertMeta "csl"
(MetaString "data/chicago-notes.csl") meta2
Just _ -> meta2
in Pandoc meta3 blocks
where
-- Each key becomes its own Cite node (matching what pandoc parses from
-- nocite: "@key1 @key2" in YAML frontmatter).
nociteVal keys = MetaInlines (intercalate [Space] (map mkCiteNode keys))
mkCiteNode k = [Cite [Citation k [] [] AuthorInText 1 0] [Str ("@" <> k)]]
-- | Insert a key/value pair into Pandoc Meta.
insertMeta :: Text -> MetaValue -> Meta -> Meta
insertMeta k v (Meta m) = Meta (Map.insert k v m)
-- ---------------------------------------------------------------------------
-- Transform pass
-- ---------------------------------------------------------------------------
-- | Number citation Cite nodes and extract the bibliography div.
transformAndExtract :: [Text] -> Pandoc -> (Pandoc, Text, Text)
transformAndExtract frKeys doc@(Pandoc meta _) =
let citeOrder = collectCiteOrder doc -- keys, first-appearance order
keyNums = Map.fromList (zip citeOrder [1 :: Int ..])
-- Replace Cite nodes with numbered superscript markers
doc' = walk (transformInline keyNums) doc
-- Pull bibliography div out of body and render to HTML
(bodyBlocks, citedHtml, furtherHtml) = extractBibliography citeOrder frKeys
(pandocBlocks doc')
in (Pandoc meta bodyBlocks, citedHtml, furtherHtml)
where
pandocBlocks (Pandoc _ bs) = bs
-- | Collect citation keys in order of first appearance (body only).
-- NOTE: after processCitations, Cite nodes remain as Cite in the AST;
-- they are not converted to Span nodes with in-text CSL.
-- We query only blocks (not metadata) so that nocite Cite nodes injected
-- into the 'nocite' meta field are not mistakenly treated as inline citations.
collectCiteOrder :: Pandoc -> [Text]
collectCiteOrder (Pandoc _ blocks) = nub (query extractKeys blocks)
where
extractKeys (Cite citations _) = map citationId citations
extractKeys _ = []
-- | Replace a Cite node with a numbered superscript marker.
transformInline :: Map Text Int -> Inline -> Inline
transformInline keyNums (Cite citations _) =
let keys = map citationId citations
nums = mapMaybe (`Map.lookup` keyNums) keys
in if null nums
then Str ""
else RawInline "html" (markerHtml (head keys) (head nums) nums)
transformInline _ x = x
markerHtml :: Text -> Int -> [Int] -> Text
markerHtml firstKey firstNum nums =
let label = "[" <> T.intercalate "," (map tshow nums) <> "]"
in "<sup class=\"cite-marker\" id=\"cite-back-" <> tshow firstNum <> "\">"
<> "<a href=\"#ref-" <> firstKey <> "\" class=\"cite-link\">"
<> label <> "</a></sup>"
where tshow = T.pack . show
-- ---------------------------------------------------------------------------
-- Bibliography extraction + rendering
-- ---------------------------------------------------------------------------
-- | Separate the @refs@ div from body blocks and render it to HTML.
-- Returns @(bodyBlocks, citedHtml, furtherHtml)@.
extractBibliography :: [Text] -> [Text] -> [Block] -> ([Block], Text, Text)
extractBibliography citeOrder frKeys blocks =
let (bodyBlocks, refDivs) = partition (not . isRefsDiv) blocks
(citedHtml, furtherHtml) = case refDivs of
[] -> ("", "")
(d:_) -> renderBibDiv citeOrder frKeys d
in (bodyBlocks, citedHtml, furtherHtml)
where
isRefsDiv (Div ("refs", _, _) _) = True
isRefsDiv _ = False
-- | Render the citeproc @refs@ Div into two HTML strings:
-- @(citedHtml, furtherHtml)@ — each is empty when there are no entries
-- in that section. Headings are rendered in the template, not here.
renderBibDiv :: [Text] -> [Text] -> Block -> (Text, Text)
renderBibDiv citeOrder _frKeys (Div _ children) =
let keyIndex = Map.fromList (zip citeOrder [0 :: Int ..])
(citedEntries, furtherEntries) =
partition (isCited keyIndex) children
sorted = sortBy (comparing (entryOrder keyIndex)) citedEntries
numbered = zipWith addNumber [1..] sorted
citedHtml = renderEntries "csl-bib-body cite-refs" numbered
furtherHtml
| null furtherEntries = ""
| otherwise = renderEntries "csl-bib-body further-reading-refs" furtherEntries
in (citedHtml, furtherHtml)
renderBibDiv _ _ _ = ("", "")
isCited :: Map Text Int -> Block -> Bool
isCited keyIndex (Div (rid, _, _) _) = Map.member (stripRefPrefix rid) keyIndex
isCited _ _ = False
entryOrder :: Map Text Int -> Block -> Int
entryOrder keyIndex (Div (rid, _, _) _) =
fromMaybe maxBound $ Map.lookup (stripRefPrefix rid) keyIndex
entryOrder _ _ = maxBound
-- | Prepend [N] marker to a bibliography entry block.
addNumber :: Int -> Block -> Block
addNumber n (Div attrs content) =
Div attrs
( Plain [ RawInline "html"
("<span class=\"ref-num\">[" <> T.pack (show n) <> "]</span>") ]
: content )
addNumber _ b = b
-- | Strip the @ref-@ prefix that citeproc adds to div IDs.
stripRefPrefix :: Text -> Text
stripRefPrefix t = fromMaybe t (T.stripPrefix "ref-" t)
-- | Render a list of blocks as an HTML string (used for bibliography sections).
renderEntries :: Text -> [Block] -> Text
renderEntries cls entries =
case runPure (writeHtml5String wOpts (Pandoc nullMeta entries)) of
Left _ -> ""
Right html -> "<div class=\"" <> cls <> "\">\n" <> html <> "</div>\n"
where
wOpts = def { writerWrapText = WrapNone }