ozymandias/build/Filters/Images.hs

395 lines
17 KiB
Haskell

{-# LANGUAGE GHC2021 #-}
{-# LANGUAGE OverloadedStrings #-}
-- | Image filter: lazy loading, lightbox markers, WebP <picture>
-- wrappers, and CLS-preventing width/height attrs.
--
-- For local raster images (JPG, JPEG, PNG, GIF) whose @.webp@ companion
-- exists on disk at build time, emits a @<picture>@ element with a WebP
-- @<source>@ and the original format as the @<img>@ fallback. When the
-- webp companion is absent (cwebp not installed, @convert-images.sh@ not
-- yet run, or a single file missed), the filter emits a plain @<img>@ so
-- the image still renders. This matters because browsers do NOT fall back
-- from a 404'd @<source>@ inside @<picture>@ to the nested @<img>@ — the
-- source is selected up front and a broken one leaves the area blank.
--
-- @tools/convert-images.sh@ produces the companion .webp files at build
-- time. When cwebp is not installed the script is a no-op, and this
-- filter degrades gracefully to plain @<img>@.
--
-- SVG files and external URLs are passed through with only lazy loading
-- (and lightbox markers for standalone images).
--
-- Width / height attrs are looked up from @{image}.dims.yaml@ sidecars
-- produced by @tools/extract-dimensions.py@ at build time, on the same
-- path-resolution rules as the WebP companion check (absolute paths
-- under @static/@, relative under the source-file directory). When a
-- sidecar is missing the filter emits an attr-free <img> rather than
-- guessing — partial dimensions are worse than no dimensions, since
-- the browser would then size the image wrong on first paint.
module Filters.Images (apply) where
import Data.Char (toLower)
import Data.Default (def)
import Data.List (isPrefixOf)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Aeson.KeyMap as KM
import qualified Data.Scientific as Sci
import qualified Data.Yaml as Y
import Text.Pandoc.Definition
import qualified Text.Pandoc as Pandoc
import Text.Pandoc.Walk (walkM)
import System.Directory (doesFileExist)
import System.FilePath (replaceExtension, takeExtension, (</>))
import qualified Utils as U
-- | Apply image attribute injection and WebP wrapping to the entire document.
--
-- @srcDir@ is the directory of the source Markdown file, used to resolve
-- relative image paths when probing for the corresponding @.webp@
-- companion file. Absolute paths (leading @/@) are resolved against
-- @static/@ instead, matching the layout @convert-images.sh@ writes to.
--
-- Two-pass walk:
--
-- 1. Block-level pass (@transformBlock@) intercepts standalone
-- figures so we can synthesize the entire @<figure>@ ourselves
-- when WebP wrapping kicks in. Without this pass, replacing the
-- inner @Image@ with a @RawInline@ would break Pandoc's
-- alt-vs-caption comparison and we'd lose the
-- @aria-hidden="true"@ hint on identical-text figcaptions.
-- 2. Inline-level pass (@transformInline@) handles every remaining
-- @Image@ — inline-in-prose, inside @Link@s, etc. Pandoc's writer
-- still applies its accessibility heuristics for figures we
-- didn't synthesize (notably the no-WebP case).
apply :: FilePath -> Pandoc -> IO Pandoc
apply srcDir doc = do
doc' <- walkM (transformBlock srcDir) doc
walkM (transformInline srcDir) doc'
-- ---------------------------------------------------------------------------
-- Core transformations
-- ---------------------------------------------------------------------------
-- | Block-level pass. Currently only acts on the simple-figure shape
-- that Pandoc's Markdown reader produces for @![alt](src)@ standalone:
--
-- @Figure attr caption [Plain [Image imgAttr alt target]]@
--
-- When the image has a WebP companion on disk, we replace the whole
-- Figure with a @RawBlock@ containing the equivalent HTML — but with
-- the @<picture>@ wrapper inside and a manually-emitted
-- @aria-hidden="true"@ on the figcaption when alt text equals the
-- caption text. Anything more exotic (multi-image figures, mixed
-- block content inside the figure, no-WebP images) is left to
-- Pandoc's default emission, which is already correct for those
-- cases.
transformBlock :: FilePath -> Block -> IO Block
transformBlock srcDir b@(Figure figAttr caption [Plain [Image imgAttr alt target]]) = do
let src = T.unpack (fst target)
if not (isLocalRaster src)
then pure b
else do
hasWebp <- doesFileExist (webpPhysicalPath srcDir (fst target))
if not hasWebp
then pure b -- Pandoc handles aria-hidden naturally on the no-WebP path.
else synthesizeFigure srcDir figAttr caption imgAttr alt target
transformBlock _ b = pure b
-- | Build a @<figure>@ block from an Image and its surrounding
-- metadata. Used only on the WebP branch; the no-WebP branch leaves
-- Pandoc to emit the figure naturally.
--
-- Aria-hiding rule: when the caption's plain-text content equals the
-- alt text and both are non-empty, mark the @<figcaption>@ with
-- @aria-hidden="true"@. Screen readers then announce the alt
-- (via the @<img>@) and skip the figcaption (which would just
-- duplicate it). Non-matching captions render as visible content.
--
-- Caption inline rendering goes through Pandoc's HTML writer, so
-- formatting (italic, links, code, etc.) is preserved.
synthesizeFigure :: FilePath -> Attr -> Caption -> Attr -> [Inline] -> Target -> IO Block
synthesizeFigure srcDir figAttr caption imgAttr alt target = do
dims <- readDims srcDir (fst target)
let pictureHtml = renderPicture imgAttr alt target True dims
capInlines = captionInlines caption
capText = stringify capInlines
altText = stringify alt
useAriaHide = capText == altText && not (T.null altText)
pure $ RawBlock (Format "html") $
renderFigure figAttr pictureHtml (renderFigcaption capInlines useAriaHide)
transformInline :: FilePath -> Inline -> IO Inline
transformInline srcDir (Link lAttr ils lTarget) = do
-- Recurse into link contents; images inside a link get no lightbox marker.
ils' <- mapM (wrapLinkedImg srcDir) ils
pure (Link lAttr ils' lTarget)
transformInline srcDir (Image attr alt target) =
renderImg srcDir attr alt target True
transformInline _ x = pure x
wrapLinkedImg :: FilePath -> Inline -> IO Inline
wrapLinkedImg srcDir (Image iAttr alt iTarget) =
renderImg srcDir iAttr alt iTarget False
wrapLinkedImg _ x = pure x
-- | Dispatch on image type:
-- * Local raster with webp companion on disk → @<picture>@ with WebP @<source>@
-- * Local raster without companion → plain @<img>@ (graceful degradation)
-- * Everything else (SVG, URL) → plain @<img>@ with loading/lightbox attrs
--
-- In all three branches, when a @{image}.dims.yaml@ sidecar is
-- present, @width@ and @height@ attrs are emitted on the rendered
-- @<img>@. The sidecar lookup is skipped for non-local sources
-- (HTTP URLs, data URIs) since there's no local file to measure.
renderImg :: FilePath -> Attr -> [Inline] -> Target -> Bool -> IO Inline
renderImg srcDir attr alt target@(src, _) lightbox = do
let s = T.unpack src
isRaster = isLocalRaster s
local = not (isUrl s)
dims <- if local then readDims srcDir src else pure Nothing
if isRaster
then do
hasWebp <- doesFileExist (webpPhysicalPath srcDir src)
if hasWebp
then pure $ RawInline (Format "html")
(renderPicture attr alt target lightbox dims)
else pure $ Image (commonAttrs dims) alt target
else
pure $ Image (commonAttrs dims) alt target
where
commonAttrs dims =
withDims dims
$ addAttr "decoding" "async"
$ addLightbox lightbox
$ addAttr "loading" "lazy" attr
addLightbox True a = addAttr "data-lightbox" "true" a
addLightbox False a = a
withDims Nothing a = a
withDims (Just (w, h)) a =
addAttr "width" (T.pack (show w))
(addAttr "height" (T.pack (show h)) a)
-- | Physical on-disk path of the @.webp@ companion for a Markdown image src.
--
-- Absolute paths (@/images/foo.jpg@) resolve under @static/@ because that
-- is where Hakyll's static-asset rule writes them from. Relative paths
-- resolve against the source file's directory, where Pandoc already
-- expects co-located assets to live.
webpPhysicalPath :: FilePath -> Text -> FilePath
webpPhysicalPath srcDir src =
let s = T.unpack src
physical = if "/" `isPrefixOf` s
then "static" ++ s
else srcDir </> s
in replaceExtension physical ".webp"
-- | Physical on-disk path of the @.dims.yaml@ sidecar for a Markdown
-- image src. Same path-resolution rules as 'webpPhysicalPath'; the
-- sidecar lives next to the original image with the literal
-- extension @.dims.yaml@ appended.
dimsPhysicalPath :: FilePath -> Text -> FilePath
dimsPhysicalPath srcDir src =
let s = T.unpack src
physical = if "/" `isPrefixOf` s
then "static" ++ s
else srcDir </> s
in physical ++ ".dims.yaml"
-- | Read the @{image}.dims.yaml@ sidecar and return @(width, height)@
-- when present and parseable. Returns 'Nothing' on absent file,
-- parse error, missing keys, or non-integer values — all of which
-- cause the filter to emit no width/height attrs (rather than a
-- guess that would size the image wrong on first paint).
readDims :: FilePath -> Text -> IO (Maybe (Int, Int))
readDims srcDir src = do
let path = dimsPhysicalPath srcDir src
exists <- doesFileExist path
if not exists
then pure Nothing
else do
decoded <- Y.decodeFileEither path
pure $ case decoded of
Right (Y.Object obj) -> do
w <- intValue =<< KM.lookup "width" obj
h <- intValue =<< KM.lookup "height" obj
Just (w, h)
_ -> Nothing
where
intValue :: Y.Value -> Maybe Int
intValue (Y.Number n) = Sci.toBoundedInteger n
intValue _ = Nothing
-- ---------------------------------------------------------------------------
-- <picture> rendering
-- ---------------------------------------------------------------------------
-- | Emit a @<picture>@ element with a WebP @<source>@ and an @<img>@ fallback.
renderPicture :: Attr -> [Inline] -> Target -> Bool -> Maybe (Int, Int) -> Text
renderPicture (ident, classes, kvs) alt (src, title) lightbox dims =
T.concat
[ "<picture>"
, "<source srcset=\"", T.pack webpSrc, "\" type=\"image/webp\">"
, "<img"
, attrId ident
, attrClasses classes
, " src=\"", esc src, "\""
, attrAlt alt
, attrTitle title
, dimsAttrs dims
, " loading=\"lazy\""
, " decoding=\"async\""
, if lightbox then " data-lightbox=\"true\"" else ""
, renderKvs passedKvs
, ">"
, "</picture>"
]
where
webpSrc = replaceExtension (T.unpack src) ".webp"
-- Strip attrs we handle explicitly above (id/class/alt/title) and the
-- attrs we always emit ourselves (loading, decoding, data-lightbox,
-- width, height), so they don't appear twice on the <img>.
passedKvs = filter
(\(k, _) -> k `notElem`
[ "loading", "decoding", "data-lightbox"
, "id", "class", "alt", "title", "src"
, "width", "height"
])
kvs
dimsAttrs Nothing = ""
dimsAttrs (Just (w, h)) =
" width=\"" <> T.pack (show w)
<> "\" height=\"" <> T.pack (show h) <> "\""
-- ---------------------------------------------------------------------------
-- <figure> synthesis (Block walk, WebP path only)
-- ---------------------------------------------------------------------------
-- | Build a @<figure>@ HTML element wrapping pre-rendered inner
-- content (typically a @<picture>@) and a pre-rendered figcaption.
-- Preserves any id / classes / kvs from the surrounding Pandoc
-- 'Figure' attr.
renderFigure :: Attr -> Text -> Text -> Text
renderFigure (figId, figClasses, figKvs) inner figcaption =
T.concat
[ "<figure"
, attrId figId
, attrClasses figClasses
, renderKvs figKvs
, ">\n"
, inner
, "\n"
, figcaption
, "\n</figure>"
]
-- | Build a @<figcaption>@ element. When @ariaHidden@ is true, emits
-- @aria-hidden="true"@ — used when the caption text exactly
-- duplicates the image alt (so screen readers don't announce the
-- same content twice). Caption inlines render through Pandoc's HTML
-- writer to preserve formatting.
renderFigcaption :: [Inline] -> Bool -> Text
renderFigcaption ils ariaHidden =
let body = renderInlinesToHtml ils
attrs = if ariaHidden then " aria-hidden=\"true\"" else ""
in "<figcaption" <> attrs <> ">" <> body <> "</figcaption>"
-- | Pandoc 'Caption' has a long form (@[Block]@) and an optional short
-- form (@Maybe ShortCaption@). We use the long form, flattening any
-- @Plain@ / @Para@ blocks into a single inline list. Multi-block
-- captions (rare) collapse to the inlines of their text-bearing
-- blocks; non-text blocks (like nested lists) are dropped, since
-- they don't make sense in a figcaption anyway.
captionInlines :: Caption -> [Inline]
captionInlines (Caption _ blocks) = concatMap go blocks
where
go (Plain ils) = ils
go (Para ils) = ils
go _ = []
-- | Render Pandoc 'Inline' nodes to HTML using Pandoc's own writer.
-- Wrapping the inlines in a @Plain@ block (rather than @Para@)
-- avoids the surrounding @<p>@ tag the writer would otherwise emit.
-- On writer failure (extremely unlikely for inline-only input),
-- falls back to the plain-text 'stringify' rendering — a worse but
-- still safe figcaption.
renderInlinesToHtml :: [Inline] -> Text
renderInlinesToHtml ils =
case Pandoc.runPure (Pandoc.writeHtml5String def doc) of
Right t -> T.strip t
Left _ -> stringify ils
where
doc = Pandoc mempty [Plain ils]
attrId :: Text -> Text
attrId t = if T.null t then "" else " id=\"" <> esc t <> "\""
attrClasses :: [Text] -> Text
attrClasses [] = ""
attrClasses cs = " class=\"" <> T.intercalate " " (map esc cs) <> "\""
attrAlt :: [Inline] -> Text
attrAlt ils = let t = stringify ils
in if T.null t then "" else " alt=\"" <> esc t <> "\""
attrTitle :: Text -> Text
attrTitle t = if T.null t then "" else " title=\"" <> esc t <> "\""
renderKvs :: [(Text, Text)] -> Text
renderKvs = T.concat . map (\(k, v) -> " " <> k <> "=\"" <> esc v <> "\"")
-- ---------------------------------------------------------------------------
-- Helpers
-- ---------------------------------------------------------------------------
-- | True for local (non-URL) images with a raster format we can convert.
isLocalRaster :: FilePath -> Bool
isLocalRaster src = not (isUrl src) && lowerExt src `elem` [".jpg", ".jpeg", ".png", ".gif"]
isUrl :: String -> Bool
isUrl s = any (`isPrefixOf` s) ["http://", "https://", "//", "data:"]
-- | Extension of a path, lowercased (e.g. ".JPG" → ".jpg").
-- Returns the empty string for paths with no extension.
lowerExt :: FilePath -> String
lowerExt = map toLower . takeExtension
-- | Prepend a key=value pair if not already present.
addAttr :: Text -> Text -> Attr -> Attr
addAttr k v (i, cs, kvs)
| any ((== k) . fst) kvs = (i, cs, kvs)
| otherwise = (i, cs, (k, v) : kvs)
-- | Plain-text content of a list of inlines (for alt text).
stringify :: [Inline] -> Text
stringify = T.concat . map go
where
go (Str t) = t
go Space = " "
go SoftBreak = " "
go LineBreak = " "
go (Emph ils) = stringify ils
go (Strong ils) = stringify ils
go (Strikeout ils) = stringify ils
go (Superscript ils) = stringify ils
go (Subscript ils) = stringify ils
go (SmallCaps ils) = stringify ils
go (Underline ils) = stringify ils
go (Quoted _ ils) = stringify ils
go (Cite _ ils) = stringify ils
go (Code _ t) = t
go (Math _ t) = t
go (RawInline _ _) = ""
go (Link _ ils _) = stringify ils
go (Image _ ils _) = stringify ils
go (Span _ ils) = stringify ils
go (Note _) = ""
-- | HTML-escape a text value for use in attribute values.
-- Defers to the canonical 'Utils.escapeHtmlText'.
esc :: Text -> Text
esc = U.escapeHtmlText