174 lines
6.7 KiB
Haskell
174 lines
6.7 KiB
Haskell
{-# LANGUAGE GHC2021 #-}
|
|
-- | Parser for custom fields on BibLaTeX entries that citeproc doesn't
|
|
-- surface on its own: @file:@ (path to a hosted PDF) and @keywords:@
|
|
-- (comma-separated list, shared vocabulary with essay-frontmatter
|
|
-- @keywords:@ for bibliography-page cross-linking). Also captures
|
|
-- @author:@ and @year:@ used for bibliography-page sorting.
|
|
--
|
|
-- Character-based scanner with brace-balance tracking, so fields
|
|
-- whose values span multiple lines parse correctly — e.g.:
|
|
--
|
|
-- @
|
|
-- \@inproceedings{kyber2018,
|
|
-- author = {Bos, Joppe W. and Ducas, Léo and ...
|
|
-- and Stehlé, Damien},
|
|
-- title = {{CRYSTALS -- Kyber}},
|
|
-- year = {2018}
|
|
-- }
|
|
-- @
|
|
--
|
|
-- Field values enclosed in @{...}@ (balanced) or @"..."@ are both
|
|
-- recognized. Unknown fields are ignored.
|
|
module BibExtras
|
|
( BibExtra (..)
|
|
, emptyBibExtra
|
|
, parseBibExtras
|
|
, firstAuthorSurname
|
|
) where
|
|
|
|
import Data.Char (isAlphaNum, isSpace, toLower)
|
|
import Data.List (dropWhileEnd)
|
|
import Data.Map.Strict (Map)
|
|
import qualified Data.Map.Strict as Map
|
|
|
|
|
|
-- | Custom fields we extract per citekey. Fields absent from the
|
|
-- entry normalize to @Nothing@ / @[]@.
|
|
data BibExtra = BibExtra
|
|
{ bibFile :: Maybe FilePath -- ^ @file:@ — URL path to a hosted PDF.
|
|
, bibKeywords :: [String] -- ^ @keywords:@ — comma-split, trimmed.
|
|
, bibAuthor :: Maybe String -- ^ @author:@ — raw value, sort key only.
|
|
, bibYear :: Maybe String -- ^ @year:@ — raw value, sort key only.
|
|
} deriving (Show)
|
|
|
|
-- | Neutral default for a citekey with no custom fields.
|
|
emptyBibExtra :: BibExtra
|
|
emptyBibExtra = BibExtra Nothing [] Nothing Nothing
|
|
|
|
-- | First-author surname for alphabetic sort. Conservative extraction:
|
|
-- take everything up to the first comma of the first author entry.
|
|
-- BibLaTeX author format separates authors with " and ", so
|
|
-- "Nietzsche, Friedrich and Holub, Robert C." → "Nietzsche".
|
|
-- Corporate authors like "{National Institute of ...}" strip the
|
|
-- outer braces (the parser drops them) and sort by the full name.
|
|
-- Entries without an author sort under the empty string.
|
|
firstAuthorSurname :: BibExtra -> String
|
|
firstAuthorSurname extra = case bibAuthor extra of
|
|
Just s -> trim (takeWhile (/= ',') (stripOuterBraces s))
|
|
Nothing -> ""
|
|
where
|
|
stripOuterBraces ('{':rest) = dropWhileEnd (== '}') rest
|
|
stripOuterBraces s = s
|
|
|
|
|
|
-- | Parse a @.bib@ file; returns a map @citekey -> 'BibExtra'@.
|
|
parseBibExtras :: FilePath -> IO (Map String BibExtra)
|
|
parseBibExtras path = Map.fromList . parseBib <$> readFile path
|
|
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Character-based scanner
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | Enumerate all entries in a .bib file as (citekey, extra) pairs.
|
|
parseBib :: String -> [(String, BibExtra)]
|
|
parseBib input = go (dropTo '@' input)
|
|
where
|
|
-- Advance past any non-entry prefix to the first '@'.
|
|
dropTo c = dropWhile (/= c)
|
|
|
|
go [] = []
|
|
go ('@':rest) =
|
|
let -- Entry type, then '{', then citekey, then ',', then fields, then '}'.
|
|
r1 = dropWhile isAlphaNum rest -- skip type name
|
|
r2 = dropWhile isSpace r1
|
|
in case r2 of
|
|
'{':r3 ->
|
|
let (citekey, r4) = span (\c -> c /= ',' && not (isSpace c)) r3
|
|
r5 = dropWhile (\c -> c /= ',' && c /= '}') r4
|
|
in case r5 of
|
|
',':r6 ->
|
|
let (flds, r7) = parseFields r6
|
|
in (trim citekey, toExtra flds) : go (dropTo '@' r7)
|
|
-- Fieldless entries: walk past and carry on.
|
|
'}':r6 -> (trim citekey, emptyBibExtra) : go (dropTo '@' r6)
|
|
_ -> []
|
|
_ -> go (dropTo '@' r2)
|
|
go (_:rest) = go (dropTo '@' rest)
|
|
|
|
-- | Parse fields until the closing '}' of the enclosing entry.
|
|
-- Accepts @name = {value}@, @name = "value"@, or trailing commas.
|
|
parseFields :: String -> ([(String, String)], String)
|
|
parseFields = go
|
|
where
|
|
go s =
|
|
let s' = dropWhile isSkippable s
|
|
in case s' of
|
|
[] -> ([], [])
|
|
'}':rest -> ([], rest)
|
|
_ -> case parseField s' of
|
|
Nothing -> ([], s') -- malformed; stop collecting
|
|
Just (nv, rest) ->
|
|
let (more, rest') = go rest
|
|
in (nv : more, rest')
|
|
|
|
isSkippable c = isSpace c || c == ','
|
|
|
|
-- | Parse a single @name = value@ field.
|
|
parseField :: String -> Maybe ((String, String), String)
|
|
parseField s =
|
|
let (name, r1) = span (\c -> isAlphaNum c || c == '_') (dropWhile isSpace s)
|
|
r2 = dropWhile isSpace r1
|
|
in case r2 of
|
|
'=':r3 -> do
|
|
let r4 = dropWhile isSpace r3
|
|
(value, r5) <- readFieldValue r4
|
|
return ((map toLower (trim name), value), r5)
|
|
_ -> Nothing
|
|
|
|
-- | Read a field's value, honoring nested braces and quoted forms.
|
|
readFieldValue :: String -> Maybe (String, String)
|
|
readFieldValue ('{':rest) = Just (readBraces 1 "" rest)
|
|
readFieldValue ('"':rest) = Just (readQuote "" rest)
|
|
readFieldValue _ = Nothing
|
|
|
|
-- | Read characters up to the matching @}@ that closes the outermost
|
|
-- @{@; preserves interior @{@ / @}@ pairs as part of the value.
|
|
readBraces :: Int -> String -> String -> (String, String)
|
|
readBraces 0 acc r = (reverse acc, r)
|
|
readBraces _ acc [] = (reverse acc, [])
|
|
readBraces 1 acc ('}':r) = (reverse acc, r) -- outer close
|
|
readBraces n acc ('{':r) = readBraces (n + 1) ('{' : acc) r
|
|
readBraces n acc ('}':r) = readBraces (n - 1) ('}' : acc) r
|
|
readBraces n acc (c:r) = readBraces n (c : acc) r
|
|
|
|
-- | Read characters up to the closing @"@.
|
|
readQuote :: String -> String -> (String, String)
|
|
readQuote acc ('"':r) = (reverse acc, r)
|
|
readQuote acc [] = (reverse acc, [])
|
|
readQuote acc (c:r) = readQuote (c : acc) r
|
|
|
|
-- | Build a 'BibExtra' from the parsed fields list.
|
|
toExtra :: [(String, String)] -> BibExtra
|
|
toExtra flds = BibExtra
|
|
{ bibFile = lookup "file" flds
|
|
, bibKeywords = case lookup "keywords" flds of
|
|
Nothing -> []
|
|
Just s -> filter (not . null) (map trim (splitOn ',' s))
|
|
, bibAuthor = lookup "author" flds
|
|
, bibYear = lookup "year" flds
|
|
}
|
|
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Utilities
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
trim :: String -> String
|
|
trim = dropWhile isSpace . dropWhileEnd isSpace
|
|
|
|
splitOn :: Eq a => a -> [a] -> [[a]]
|
|
splitOn c xs = case break (== c) xs of
|
|
(before, []) -> [before]
|
|
(before, _ : rest) -> before : splitOn c rest
|