levineuwirth.org/build/BibExtras.hs

174 lines
6.7 KiB
Haskell

{-# LANGUAGE GHC2021 #-}
-- | Parser for custom fields on BibLaTeX entries that citeproc doesn't
-- surface on its own: @file:@ (path to a hosted PDF) and @keywords:@
-- (comma-separated list, shared vocabulary with essay-frontmatter
-- @keywords:@ for bibliography-page cross-linking). Also captures
-- @author:@ and @year:@ used for bibliography-page sorting.
--
-- Character-based scanner with brace-balance tracking, so fields
-- whose values span multiple lines parse correctly — e.g.:
--
-- @
-- \@inproceedings{kyber2018,
-- author = {Bos, Joppe W. and Ducas, Léo and ...
-- and Stehlé, Damien},
-- title = {{CRYSTALS -- Kyber}},
-- year = {2018}
-- }
-- @
--
-- Field values enclosed in @{...}@ (balanced) or @"..."@ are both
-- recognized. Unknown fields are ignored.
module BibExtras
( BibExtra (..)
, emptyBibExtra
, parseBibExtras
, firstAuthorSurname
) where
import Data.Char (isAlphaNum, isSpace, toLower)
import Data.List (dropWhileEnd)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
-- | Custom fields we extract per citekey. Fields absent from the
-- entry normalize to @Nothing@ / @[]@.
data BibExtra = BibExtra
{ bibFile :: Maybe FilePath -- ^ @file:@ — URL path to a hosted PDF.
, bibKeywords :: [String] -- ^ @keywords:@ — comma-split, trimmed.
, bibAuthor :: Maybe String -- ^ @author:@ — raw value, sort key only.
, bibYear :: Maybe String -- ^ @year:@ — raw value, sort key only.
} deriving (Show)
-- | Neutral default for a citekey with no custom fields.
emptyBibExtra :: BibExtra
emptyBibExtra = BibExtra Nothing [] Nothing Nothing
-- | First-author surname for alphabetic sort. Conservative extraction:
-- take everything up to the first comma of the first author entry.
-- BibLaTeX author format separates authors with " and ", so
-- "Nietzsche, Friedrich and Holub, Robert C." → "Nietzsche".
-- Corporate authors like "{National Institute of ...}" strip the
-- outer braces (the parser drops them) and sort by the full name.
-- Entries without an author sort under the empty string.
firstAuthorSurname :: BibExtra -> String
firstAuthorSurname extra = case bibAuthor extra of
Just s -> trim (takeWhile (/= ',') (stripOuterBraces s))
Nothing -> ""
where
stripOuterBraces ('{':rest) = dropWhileEnd (== '}') rest
stripOuterBraces s = s
-- | Parse a @.bib@ file; returns a map @citekey -> 'BibExtra'@.
parseBibExtras :: FilePath -> IO (Map String BibExtra)
parseBibExtras path = Map.fromList . parseBib <$> readFile path
-- ---------------------------------------------------------------------------
-- Character-based scanner
-- ---------------------------------------------------------------------------
-- | Enumerate all entries in a .bib file as (citekey, extra) pairs.
parseBib :: String -> [(String, BibExtra)]
parseBib input = go (dropTo '@' input)
where
-- Advance past any non-entry prefix to the first '@'.
dropTo c = dropWhile (/= c)
go [] = []
go ('@':rest) =
let -- Entry type, then '{', then citekey, then ',', then fields, then '}'.
r1 = dropWhile isAlphaNum rest -- skip type name
r2 = dropWhile isSpace r1
in case r2 of
'{':r3 ->
let (citekey, r4) = span (\c -> c /= ',' && not (isSpace c)) r3
r5 = dropWhile (\c -> c /= ',' && c /= '}') r4
in case r5 of
',':r6 ->
let (flds, r7) = parseFields r6
in (trim citekey, toExtra flds) : go (dropTo '@' r7)
-- Fieldless entries: walk past and carry on.
'}':r6 -> (trim citekey, emptyBibExtra) : go (dropTo '@' r6)
_ -> []
_ -> go (dropTo '@' r2)
go (_:rest) = go (dropTo '@' rest)
-- | Parse fields until the closing '}' of the enclosing entry.
-- Accepts @name = {value}@, @name = "value"@, or trailing commas.
parseFields :: String -> ([(String, String)], String)
parseFields = go
where
go s =
let s' = dropWhile isSkippable s
in case s' of
[] -> ([], [])
'}':rest -> ([], rest)
_ -> case parseField s' of
Nothing -> ([], s') -- malformed; stop collecting
Just (nv, rest) ->
let (more, rest') = go rest
in (nv : more, rest')
isSkippable c = isSpace c || c == ','
-- | Parse a single @name = value@ field.
parseField :: String -> Maybe ((String, String), String)
parseField s =
let (name, r1) = span (\c -> isAlphaNum c || c == '_') (dropWhile isSpace s)
r2 = dropWhile isSpace r1
in case r2 of
'=':r3 -> do
let r4 = dropWhile isSpace r3
(value, r5) <- readFieldValue r4
return ((map toLower (trim name), value), r5)
_ -> Nothing
-- | Read a field's value, honoring nested braces and quoted forms.
readFieldValue :: String -> Maybe (String, String)
readFieldValue ('{':rest) = Just (readBraces 1 "" rest)
readFieldValue ('"':rest) = Just (readQuote "" rest)
readFieldValue _ = Nothing
-- | Read characters up to the matching @}@ that closes the outermost
-- @{@; preserves interior @{@ / @}@ pairs as part of the value.
readBraces :: Int -> String -> String -> (String, String)
readBraces 0 acc r = (reverse acc, r)
readBraces _ acc [] = (reverse acc, [])
readBraces 1 acc ('}':r) = (reverse acc, r) -- outer close
readBraces n acc ('{':r) = readBraces (n + 1) ('{' : acc) r
readBraces n acc ('}':r) = readBraces (n - 1) ('}' : acc) r
readBraces n acc (c:r) = readBraces n (c : acc) r
-- | Read characters up to the closing @"@.
readQuote :: String -> String -> (String, String)
readQuote acc ('"':r) = (reverse acc, r)
readQuote acc [] = (reverse acc, [])
readQuote acc (c:r) = readQuote (c : acc) r
-- | Build a 'BibExtra' from the parsed fields list.
toExtra :: [(String, String)] -> BibExtra
toExtra flds = BibExtra
{ bibFile = lookup "file" flds
, bibKeywords = case lookup "keywords" flds of
Nothing -> []
Just s -> filter (not . null) (map trim (splitOn ',' s))
, bibAuthor = lookup "author" flds
, bibYear = lookup "year" flds
}
-- ---------------------------------------------------------------------------
-- Utilities
-- ---------------------------------------------------------------------------
trim :: String -> String
trim = dropWhile isSpace . dropWhileEnd isSpace
splitOn :: Eq a => a -> [a] -> [[a]]
splitOn c xs = case break (== c) xs of
(before, []) -> [before]
(before, _ : rest) -> before : splitOn c rest