levineuwirth.org/build/Citations.hs

383 lines
17 KiB
Haskell

{-# LANGUAGE GHC2021 #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
-- | Citation processing pipeline.
--
-- Steps:
-- 1. Skip if the document contains no Cite nodes and frKeys is empty.
-- 2. Inject default bibliography / CSL metadata if absent.
-- 3. Inject nocite entries for further-reading keys.
-- 4. Run Pandoc's citeproc to resolve references and generate bibliography.
-- 5. Walk the AST and replace Cite nodes with numbered superscripts.
-- 6. Extract the citeproc bibliography div from the body, reorder by
-- first-appearance, split into cited / further-reading sections,
-- and render to an HTML string for the template's $bibliography$ field.
--
-- Returns (Pandoc without refs div, bibliography HTML).
-- The bibliography HTML is empty when there are no citations.
--
-- NOTE: processCitations with in-text CSL leaves Cite nodes as Cite nodes
-- in the AST — it only populates their inline content and creates the refs
-- div. The HTML writer later wraps them in <span class="citation">. We must
-- therefore match Cite nodes (not Span nodes) in our transform pass.
--
-- NOTE: Hakyll strips YAML frontmatter before passing to readPandocWith, so
-- the Pandoc Meta is empty. further-reading keys are passed explicitly by the
-- caller (read from Hakyll's own metadata via lookupStringList).
--
-- NOTE: Does not import Contexts to avoid cycles.
module Citations
( applyCitations
-- * For synthetic bibliography pages (Phase 6b)
, renderBibliographyHtml
) where
import Data.List (intercalate, intersperse, nub, partition, sortBy)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Maybe (fromMaybe, mapMaybe)
import Data.Ord (comparing)
import Data.Text (Text)
import qualified Data.Text as T
import Text.Pandoc
import Text.Pandoc.Citeproc (processCitations)
import Text.Pandoc.Walk
import BibExtras (BibExtra (..), emptyBibExtra, parseBibExtras)
-- ---------------------------------------------------------------------------
-- Public API
-- ---------------------------------------------------------------------------
-- | Process citations in a Pandoc document.
-- @frKeys@: further-reading citation keys (read from Hakyll metadata by
-- the caller, since Hakyll strips YAML frontmatter before parsing).
-- Returns @(body, citedHtml, furtherHtml)@ where @body@ has Cite nodes
-- replaced with numbered superscripts and no bibliography div,
-- @citedHtml@ is the inline-cited references HTML, and @furtherHtml@ is
-- the further-reading-only references HTML (each empty when absent).
applyCitations :: [Text] -> Text -> Pandoc -> IO (Pandoc, Text, Text)
applyCitations frKeys bibPath doc
| not (hasCitations frKeys doc) = return (doc, "", "")
| otherwise = do
-- Read custom fields (@file:@, @keywords:@) from the .bib file
-- in parallel with citeproc. These don't affect citation
-- resolution — they enhance the rendered bibliography entries.
extras <- parseBibExtras (T.unpack bibPath)
let doc1 = injectMeta frKeys bibPath doc
processed <- runIOorExplode $ processCitations doc1
let (body, citedHtml, furtherHtml) = transformAndExtract extras frKeys processed
return (body, citedHtml, furtherHtml)
-- | Render a standalone bibliography section from a list of citekeys and
-- a set of @.bib@ file paths. Used by the synthetic @\/bibliography\/@
-- pages (Phase 6b) to produce CSL-formatted entries outside of any
-- essay's citation context.
--
-- Given citekeys are passed to citeproc via a synthesized @nocite@
-- metadata entry on an otherwise empty document; citeproc emits a
-- @refs@ Div whose children are the rendered entries. We then reorder
-- the children to match the caller-supplied @keys@ list (citeproc's
-- own ordering is overridden so callers control sort), enhance each
-- entry with the Phase 6a PDF-link and keyword-strip hooks, and
-- render to HTML wrapped in @\<div class="csl-bib-body"\>@.
--
-- @extras@ is the combined 'BibExtra' map for the same @.bib@ files;
-- passed in so that 'enhanceEntry' can consult @file:@ and
-- @keywords:@ without each entry re-parsing the files.
renderBibliographyHtml :: [FilePath] -- ^ .bib paths
-> Map String BibExtra -- ^ enhancement map
-> [String] -- ^ citekeys, in desired order
-> IO Text
renderBibliographyHtml _ _ [] = return ""
renderBibliographyHtml bibPaths extras keys = do
let doc = synthesizeNociteDoc bibPaths keys
processed <- runIOorExplode $ processCitations doc
let refsDivs = concatMap unwrapRefs (pandocBlocks processed)
ordered = reorderByKeys keys refsDivs
enhanced = map (enhanceEntry extras) ordered
return (renderEntries "csl-bib-body" enhanced)
where
pandocBlocks (Pandoc _ bs) = bs
unwrapRefs (Div ("refs", _, _) children) = children
unwrapRefs _ = []
-- | Build a Pandoc doc whose only citation-relevant content is a
-- @nocite@ metadata entry listing every supplied citekey. Runs
-- through 'processCitations' to emit a fully-formatted @refs@ Div
-- containing every entry.
synthesizeNociteDoc :: [FilePath] -> [String] -> Pandoc
synthesizeNociteDoc bibPaths keys =
let meta = Meta $ Map.fromList
[ ("bibliography", bibPathMeta bibPaths)
, ("csl", MetaString "data/chicago-notes.csl")
, ("nocite", nociteVal (map T.pack keys))
]
in Pandoc meta []
where
bibPathMeta [p] = MetaString (T.pack p)
bibPathMeta ps = MetaList (map (MetaString . T.pack) ps)
nociteVal ks = MetaInlines (intercalate [Space] (map mkCite ks))
mkCite k = [Cite [Citation k [] [] AuthorInText 1 0] [Str ("@" <> k)]]
-- | Reorder a list of @csl-entry@ Divs to match a requested key order.
-- Divs not in the key list (shouldn't happen in practice, but safe
-- by construction) drop to the end in their original order.
reorderByKeys :: [String] -> [Block] -> [Block]
reorderByKeys keys divs =
let divMap = Map.fromList [ (T.unpack (stripRefPrefix d), blk)
| blk@(Div (d, _, _) _) <- divs ]
found = mapMaybe (`Map.lookup` divMap) keys
leftovers = filter (\blk -> case blk of
Div (d, _, _) _ ->
T.unpack (stripRefPrefix d) `notElem` keys
_ -> True) divs
in found ++ leftovers
-- ---------------------------------------------------------------------------
-- Detection
-- ---------------------------------------------------------------------------
-- | True if the document has inline [@key] cites or a further-reading list.
hasCitations :: [Text] -> Pandoc -> Bool
hasCitations frKeys doc =
not (null (query collectCites doc))
|| not (null frKeys)
where
collectCites (Cite {}) = [()]
collectCites _ = []
-- ---------------------------------------------------------------------------
-- Metadata injection
-- ---------------------------------------------------------------------------
-- | Inject default bibliography / CSL paths and nocite for further-reading.
injectMeta :: [Text] -> Text -> Pandoc -> Pandoc
injectMeta frKeys bibPath (Pandoc meta blocks) =
let meta1 = if null frKeys then meta
else insertMeta "nocite" (nociteVal frKeys) meta
meta2 = case lookupMeta "bibliography" meta1 of
Nothing -> insertMeta "bibliography"
(MetaString bibPath) meta1
Just _ -> meta1
meta3 = case lookupMeta "csl" meta2 of
Nothing -> insertMeta "csl"
(MetaString "data/chicago-notes.csl") meta2
Just _ -> meta2
in Pandoc meta3 blocks
where
-- Each key becomes its own Cite node (matching what pandoc parses from
-- nocite: "@key1 @key2" in YAML frontmatter).
nociteVal keys = MetaInlines (intercalate [Space] (map mkCiteNode keys))
mkCiteNode k = [Cite [Citation k [] [] AuthorInText 1 0] [Str ("@" <> k)]]
-- | Insert a key/value pair into Pandoc Meta.
insertMeta :: Text -> MetaValue -> Meta -> Meta
insertMeta k v (Meta m) = Meta (Map.insert k v m)
-- ---------------------------------------------------------------------------
-- Transform pass
-- ---------------------------------------------------------------------------
-- | Number citation Cite nodes and extract the bibliography div.
transformAndExtract :: Map String BibExtra -> [Text] -> Pandoc -> (Pandoc, Text, Text)
transformAndExtract extras frKeys doc@(Pandoc meta _) =
let citeOrder = collectCiteOrder doc -- keys, first-appearance order
keyNums = Map.fromList (zip citeOrder [1 :: Int ..])
-- Replace Cite nodes with numbered superscript markers
doc' = walk (transformInline keyNums) doc
-- Pull bibliography div out of body and render to HTML
(bodyBlocks, citedHtml, furtherHtml) = extractBibliography extras citeOrder frKeys
(pandocBlocks doc')
in (Pandoc meta bodyBlocks, citedHtml, furtherHtml)
where
pandocBlocks (Pandoc _ bs) = bs
-- | Collect citation keys in order of first appearance (body only).
-- NOTE: after processCitations, Cite nodes remain as Cite in the AST;
-- they are not converted to Span nodes with in-text CSL.
-- We query only blocks (not metadata) so that nocite Cite nodes injected
-- into the 'nocite' meta field are not mistakenly treated as inline citations.
collectCiteOrder :: Pandoc -> [Text]
collectCiteOrder (Pandoc _ blocks) = nub (query extractKeys blocks)
where
extractKeys (Cite citations _) = map citationId citations
extractKeys _ = []
-- | Replace a Cite node with a numbered superscript marker.
transformInline :: Map Text Int -> Inline -> Inline
transformInline keyNums (Cite citations _) =
let keys = map citationId citations
nums = mapMaybe (`Map.lookup` keyNums) keys
in case (keys, nums) of
-- Both lists are guaranteed non-empty by the @null nums@ check
-- below, but pattern-match to keep this total instead of
-- relying on @head@.
(firstKey : _, firstNum : _) ->
RawInline "html" (markerHtml keys firstKey firstNum nums)
_ ->
Str ""
transformInline _ x = x
markerHtml :: [Text] -> Text -> Int -> [Int] -> Text
markerHtml keys firstKey firstNum nums =
let label = "[" <> T.intercalate "," (map tshow nums) <> "]"
allIds = T.intercalate " " (map ("ref-" <>) keys)
in "<sup class=\"cite-marker\" id=\"cite-back-" <> tshow firstNum <> "\">"
<> "<a href=\"#ref-" <> firstKey <> "\" class=\"cite-link\""
<> " data-cite-keys=\"" <> allIds <> "\">"
<> label <> "</a></sup>"
where tshow = T.pack . show
-- ---------------------------------------------------------------------------
-- Bibliography extraction + rendering
-- ---------------------------------------------------------------------------
-- | Separate the @refs@ div from body blocks and render it to HTML.
-- Returns @(bodyBlocks, citedHtml, furtherHtml)@.
extractBibliography :: Map String BibExtra -> [Text] -> [Text] -> [Block]
-> ([Block], Text, Text)
extractBibliography extras citeOrder frKeys blocks =
let (bodyBlocks, refDivs) = partition (not . isRefsDiv) blocks
(citedHtml, furtherHtml) = case refDivs of
[] -> ("", "")
(d:_) -> renderBibDiv extras citeOrder frKeys d
in (bodyBlocks, citedHtml, furtherHtml)
where
isRefsDiv (Div ("refs", _, _) _) = True
isRefsDiv _ = False
-- | Render the citeproc @refs@ Div into two HTML strings:
-- @(citedHtml, furtherHtml)@ — each is empty when there are no entries
-- in that section. Headings are rendered in the template, not here.
--
-- Entry bodies are enhanced before numbering: title-wrapped as a
-- @.pdf-link[data-pdf-src]@ when the .bib @file:@ field is set (so
-- popups.js's PDF hover preview fires), and a trailing
-- @\<div class="bib-keywords"\>@ appended when @keywords:@ is set.
renderBibDiv :: Map String BibExtra -> [Text] -> [Text] -> Block -> (Text, Text)
renderBibDiv extras citeOrder _frKeys (Div _ children) =
let enhanced = map (enhanceEntry extras) children
keyIndex = Map.fromList (zip citeOrder [0 :: Int ..])
(citedEntries, furtherEntries) =
partition (isCited keyIndex) enhanced
sorted = sortBy (comparing (entryOrder keyIndex)) citedEntries
numbered = zipWith addNumber [1..] sorted
citedHtml = renderEntries "csl-bib-body cite-refs" numbered
furtherHtml
| null furtherEntries = ""
| otherwise = renderEntries "csl-bib-body further-reading-refs" furtherEntries
in (citedHtml, furtherHtml)
renderBibDiv _ _ _ _ = ("", "")
-- ---------------------------------------------------------------------------
-- Bib entry enhancement (Phase 6a)
-- ---------------------------------------------------------------------------
-- | Augment a single @csl-entry@ Div with the custom fields we parsed
-- from the .bib file. Other Blocks pass through unchanged.
enhanceEntry :: Map String BibExtra -> Block -> Block
enhanceEntry extras b@(Div attrs@(divId, _, _) blocks) =
let key = T.unpack (stripRefPrefix divId)
extra = fromMaybe emptyBibExtra (Map.lookup key extras)
withLink = case bibFile extra of
Nothing -> blocks
Just fp -> map (wrapFirstTitleBlock (T.pack fp)) blocks
withKw = withLink ++ keywordsBlocks (bibKeywords extra)
in case (bibFile extra, bibKeywords extra) of
(Nothing, []) -> b
_ -> Div attrs withKw
enhanceEntry _ b = b
-- | In one block of an entry, wrap the first title-bearing inline
-- with a @.pdf-link@ anchor. Pandoc's CSL-formatted references
-- render the title as either a @Quoted@ (article titles in
-- Chicago-notes: "Paper Title") or an @Emph@ (book titles:
-- /Book Title/), and those are the first such inline in each
-- entry. We wrap at the block level and fall back to passing the
-- block through if no matching inline appears.
wrapFirstTitleBlock :: Text -> Block -> Block
wrapFirstTitleBlock href = \case
Para ils -> Para (wrapFirstTitle href ils)
Plain ils -> Plain (wrapFirstTitle href ils)
other -> other
-- | Left-to-right scan: wrap the first title-bearing inline in a link
-- pointing at the PDF. Pandoc's CSL renderer emits article titles as
-- @Span@ nodes (whose rendered HTML wraps quotation marks around the
-- title text) and book titles as @Emph@; @Quoted@ appears in some
-- other CSL styles. First match of any of these is treated as the
-- title; subsequent ones pass through — journal names are also
-- @Emph@ on @\@article@ entries but come after the @Span@ title, so
-- the article case picks the right target.
wrapFirstTitle :: Text -> [Inline] -> [Inline]
wrapFirstTitle href inls = reverse . fst $ foldl step ([], False) inls
where
step (acc, True) inl = (inl:acc, True)
step (acc, False) inl = case inl of
Span _ _ -> (asPdfLink href [inl] : acc, True)
Quoted _ _ -> (asPdfLink href [inl] : acc, True)
Emph _ -> (asPdfLink href [inl] : acc, True)
_ -> (inl:acc, False)
-- | Build the @.pdf-link[data-pdf-src]@ anchor that popups.js binds to.
-- See @static/js/popups.js:112@ for the matching selector.
asPdfLink :: Text -> [Inline] -> Inline
asPdfLink href content =
Link ("", ["pdf-link"], [("data-pdf-src", href)])
content
(href, "")
-- | Trailing keyword strip, linking each keyword to the future
-- @/bibliography/\<keyword\>/@ page. Returns @[]@ when the keyword
-- list is empty so the entry gets no extra block at all.
keywordsBlocks :: [String] -> [Block]
keywordsBlocks [] = []
keywordsBlocks ks =
[ Div ("", ["bib-keywords"], [])
[Plain (intersperse (Str ", ") (map keywordLink ks))]
]
where
keywordLink k =
Link ("", ["bib-keyword"], [])
[Str (T.pack k)]
(T.pack ("/bibliography/" ++ k ++ "/"), "")
isCited :: Map Text Int -> Block -> Bool
isCited keyIndex (Div (rid, _, _) _) = Map.member (stripRefPrefix rid) keyIndex
isCited _ _ = False
entryOrder :: Map Text Int -> Block -> Int
entryOrder keyIndex (Div (rid, _, _) _) =
fromMaybe maxBound $ Map.lookup (stripRefPrefix rid) keyIndex
entryOrder _ _ = maxBound
-- | Prepend [N] marker to a bibliography entry block.
addNumber :: Int -> Block -> Block
addNumber n (Div attrs@(divId, _, _) content) =
Div attrs
( Plain [ RawInline "html"
("<a class=\"ref-num\" href=\"#" <> divId <> "\">[" <> T.pack (show n) <> "]</a>") ]
: content )
addNumber _ b = b
-- | Strip the @ref-@ prefix that citeproc adds to div IDs.
stripRefPrefix :: Text -> Text
stripRefPrefix t = fromMaybe t (T.stripPrefix "ref-" t)
-- | Render a list of blocks as an HTML string (used for bibliography sections).
renderEntries :: Text -> [Block] -> Text
renderEntries cls entries =
case runPure (writeHtml5String wOpts (Pandoc nullMeta entries)) of
Left _ -> ""
Right html -> "<div class=\"" <> cls <> "\">\n" <> html <> "</div>\n"
where
wOpts = def { writerWrapText = WrapNone }