Filters: fence/code-span awareness, host matching, nested-header skip

- SourceRefs trigger whitelist aligned to the /source/ serving
  whitelist (drops content/, yaml-source/, broad static//tools//data
  prefixes; adds .bib); existsCached no longer memoizes non-existence,
  so files created under make watch are picked up (§2.5, §2.16)
- fill/stroke hex replacement is boundary-aware: #000080 and 8-digit
  RGBA forms can no longer be corrupted into currentColor80 (§2.12)
- Wikilinks/Transclusion/EmbedPdf skip fenced code blocks (shared
  CommonMark fence tracker), and wikilinks additionally skip inline
  code spans — the syntax-documentation essay now renders its own
  examples literally while live wikilinks still convert (verified both
  ways in output) (§2.13)
- domainIcon matches the extracted host by label suffix instead of
  substring-of-URL; extractHost also strips userinfo (§2.14)
- webpSrc escaped in srcset; internal PDF links no longer double-
  classified; Smallcaps/Archive header-skip now holds at every nesting
  depth via protect/restore walks (§2.17)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Levi Neuwirth 2026-06-10 11:13:08 -04:00
parent c8eeaaa9bc
commit f254ce866e
10 changed files with 362 additions and 97 deletions

View File

@ -30,22 +30,45 @@ import Text.Pandoc.Walk (walk)
import ArchiveIndex (ArchiveStatus (..), archiveIndexIsEmpty, import ArchiveIndex (ArchiveStatus (..), archiveIndexIsEmpty,
archiveSlugFor, archiveStatusForSlug) archiveSlugFor, archiveStatusForSlug)
-- | Annotate body links. Headings are left alone — an affordance there -- | Annotate body links. Links inside headings are left alone at
-- would be noise. Identity when the index is empty. -- /every/ nesting depth — an affordance there would be noise, and a
-- top-level pattern match would miss a @Header@ inside a @Div@ or
-- @BlockQuote@. Header links are tagged with a sentinel class before
-- the annotation walk and stripped of it afterwards, so the sentinel
-- can never leak into the writer. Identity when the index is empty.
apply :: Pandoc -> Pandoc apply :: Pandoc -> Pandoc
apply doc@(Pandoc meta blocks) apply doc
| archiveIndexIsEmpty = doc | archiveIndexIsEmpty = doc
| otherwise = Pandoc meta (map annotateBlock blocks) | otherwise =
walk unprotectLink . walk annotateInlines . walk protectHeader $ doc
annotateBlock :: Block -> Block -- | Sentinel class marking a link the annotation walk must skip. It
annotateBlock h@Header{} = h -- only exists between the protect and unprotect walks inside 'apply'.
annotateBlock b = walk annotateInlines b skipClass :: T.Text
skipClass = "archive-header-skip"
protectHeader :: Block -> Block
protectHeader (Header lvl attr ils) = Header lvl attr (walk protect ils)
where
protect (Link (ident, cls, kvs) text target) =
Link (ident, skipClass : cls, kvs) text target
protect x = x
protectHeader b = b
unprotectLink :: Inline -> Inline
unprotectLink (Link (ident, cls, kvs) text target)
| skipClass `elem` cls =
Link (ident, filter (/= skipClass) cls, kvs) text target
unprotectLink x = x
-- | For each archived @Link@: flip it if the target is 'Rotted', else -- | For each archived @Link@: flip it if the target is 'Rotted', else
-- append the affordance. Non-archived links pass through untouched. -- append the affordance. Non-archived links — and links protected by
-- 'protectHeader' — pass through untouched.
annotateInlines :: [Inline] -> [Inline] annotateInlines :: [Inline] -> [Inline]
annotateInlines = concatMap expand annotateInlines = concatMap expand
where where
expand l@(Link (_, cls, _) _ _)
| skipClass `elem` cls = [l]
expand l@(Link attr text (url, _)) = expand l@(Link attr text (url, _)) =
case archiveSlugFor url of case archiveSlugFor url of
Nothing -> [l] Nothing -> [l]

View File

@ -12,15 +12,23 @@
-- --
-- The file path must be root-relative (begins with @/@). -- The file path must be root-relative (begins with @/@).
-- PDF.js is expected to be vendored at @/pdfjs/web/viewer.html@. -- PDF.js is expected to be vendored at @/pdfjs/web/viewer.html@.
--
-- Code protection (honest scope): lines inside /fenced/ code blocks
-- are passed through untouched ('Filters.Wikilinks.mapOutsideFences'),
-- so fenced examples can show @{{pdf:…}}@ literally. Indented code
-- blocks and inline code spans are NOT recognised — a full-line
-- directive inside either is still rewritten.
module Filters.EmbedPdf (preprocess) where module Filters.EmbedPdf (preprocess) where
import Data.Char (isDigit) import Data.Char (isDigit)
import Data.List (isPrefixOf, isSuffixOf) import Data.List (isPrefixOf, isSuffixOf)
import Filters.Wikilinks (mapOutsideFences)
import qualified Utils as U import qualified Utils as U
-- | Apply PDF-embed substitution to the raw Markdown source string. -- | Apply PDF-embed substitution to the raw Markdown source string,
-- skipping lines inside fenced code blocks.
preprocess :: String -> String preprocess :: String -> String
preprocess = unlines . map processLine . lines preprocess = mapOutsideFences processLine
processLine :: String -> String processLine :: String -> String
processLine line = processLine line =

View File

@ -231,7 +231,7 @@ renderPicture :: Attr -> [Inline] -> Target -> Bool -> Maybe (Int, Int) -> Text
renderPicture (ident, classes, kvs) alt (src, title) lightbox dims = renderPicture (ident, classes, kvs) alt (src, title) lightbox dims =
T.concat T.concat
[ "<picture>" [ "<picture>"
, "<source srcset=\"", T.pack webpSrc, "\" type=\"image/webp\">" , "<source srcset=\"", esc (T.pack webpSrc), "\" type=\"image/webp\">"
, "<img" , "<img"
, attrId ident , attrId ident
, attrClasses classes , attrClasses classes

View File

@ -16,8 +16,11 @@ import Text.Pandoc.Definition
import Text.Pandoc.Walk (walk) import Text.Pandoc.Walk (walk)
-- | Apply link classification to the entire document. -- | Apply link classification to the entire document.
-- Two passes: PDF links first (rewrites href to viewer URL), then external -- Two passes: PDF links first (rewrites href to the viewer URL and tags
-- link classification (operates on http/https, so no overlap). -- the anchor @pdf-link@), then general classification. The second pass
-- explicitly skips anchors the PDF pass already claimed — the viewer URL
-- is root-relative, so without that guard it would also be classified as
-- an internal page link and get double chrome.
apply :: Pandoc -> Pandoc apply :: Pandoc -> Pandoc
apply = walk classifyLink . walk classifyPdfLink apply = walk classifyLink . walk classifyPdfLink
@ -49,6 +52,11 @@ classifyLink l@(Link (_, classes, _) _ _)
-- brand icon stamp, and have their own popup provider. Leave them -- brand icon stamp, and have their own popup provider. Leave them
-- entirely alone. -- entirely alone.
| "source-ref" `elem` classes = l | "source-ref" `elem` classes = l
-- PDF links were already rewritten to the (root-relative) viewer URL
-- and given their own chrome by 'classifyPdfLink' in the preceding
-- pass; without this guard they would be double-classified as
-- internal page links.
| "pdf-link" `elem` classes = l
classifyLink (Link (ident, classes, kvs) ils (url, title)) classifyLink (Link (ident, classes, kvs) ils (url, title))
| isExternal url = | isExternal url =
let icon = domainIcon url let icon = domainIcon url
@ -100,8 +108,9 @@ isExternal url =
where where
siteHost = "levineuwirth.org" siteHost = "levineuwirth.org"
-- | Extract the lowercased hostname from an absolute http(s) URL. -- | Extract the lowercased hostname from an absolute http(s) URL,
-- Returns 'Nothing' for non-http(s) URLs (relative paths, mailto:, etc.). -- stripping any userinfo (@user:pass\@@) and port. Returns 'Nothing'
-- for non-http(s) URLs (relative paths, mailto:, etc.).
extractHost :: Text -> Maybe Text extractHost :: Text -> Maybe Text
extractHost url extractHost url
| Just rest <- T.stripPrefix "https://" url = Just (hostOf rest) | Just rest <- T.stripPrefix "https://" url = Just (hostOf rest)
@ -109,45 +118,60 @@ extractHost url
| otherwise = Nothing | otherwise = Nothing
where where
hostOf rest = hostOf rest =
let withPort = T.takeWhile (\c -> c /= '/' && c /= '?' && c /= '#') rest let authority = T.takeWhile (\c -> c /= '/' && c /= '?' && c /= '#') rest
host = T.takeWhile (/= ':') withPort -- 'T.breakOnEnd' yields the segment after the last @\@@, or
-- the whole authority when there is no userinfo.
(_, hostPort) = T.breakOnEnd "@" authority
host = T.takeWhile (/= ':') hostPort
in T.toLower host in T.toLower host
-- | Icon name for the link, matching a file in /images/link-icons/<name>.svg. -- | Icon name for the link, matching a file in /images/link-icons/<name>.svg.
--
-- Matches on the URL's host only, never on the full URL — a path like
-- @https://example.org/why-x.com-failed@ must not get the Twitter
-- icon. URLs with no extractable host get the generic icon.
domainIcon :: Text -> Text domainIcon :: Text -> Text
domainIcon url domainIcon url = maybe "external" iconForHost (extractHost url)
iconForHost :: Text -> Text
iconForHost host
-- Scholarly / reference -- Scholarly / reference
| "wikipedia.org" `T.isInfixOf` url = "wikipedia" | m "wikipedia.org" = "wikipedia"
| "arxiv.org" `T.isInfixOf` url = "arxiv" | m "arxiv.org" = "arxiv"
| "doi.org" `T.isInfixOf` url = "doi" | m "doi.org" = "doi"
| "worldcat.org" `T.isInfixOf` url = "worldcat" | m "worldcat.org" = "worldcat"
| "orcid.org" `T.isInfixOf` url = "orcid" | m "orcid.org" = "orcid"
| "archive.org" `T.isInfixOf` url = "internet-archive" | m "archive.org" = "internet-archive"
-- Code / software -- Code / software
| "github.com" `T.isInfixOf` url = "github" | m "github.com" = "github"
| "git.levineuwirth.org" `T.isInfixOf` url = "forgejo" | m "git.levineuwirth.org" = "forgejo"
| "tensorflow.org" `T.isInfixOf` url = "tensorflow" | m "tensorflow.org" = "tensorflow"
-- AI companies (consumer products share a brand icon with the lab) -- AI companies (consumer products share a brand icon with the lab)
| "anthropic.com" `T.isInfixOf` url = "anthropic" | m "anthropic.com" = "anthropic"
| "claude.ai" `T.isInfixOf` url = "anthropic" | m "claude.ai" = "anthropic"
| "openai.com" `T.isInfixOf` url = "openai" | m "openai.com" = "openai"
| "chatgpt.com" `T.isInfixOf` url = "openai" | m "chatgpt.com" = "openai"
-- Social / media -- Social / media
| "twitter.com" `T.isInfixOf` url = "twitter" | m "twitter.com" = "twitter"
| "x.com" `T.isInfixOf` url = "twitter" | m "x.com" = "twitter"
| "reddit.com" `T.isInfixOf` url = "reddit" | m "reddit.com" = "reddit"
| "youtube.com" `T.isInfixOf` url = "youtube" | m "youtube.com" = "youtube"
| "youtu.be" `T.isInfixOf` url = "youtube" | m "youtu.be" = "youtube"
| "tiktok.com" `T.isInfixOf` url = "tiktok" | m "tiktok.com" = "tiktok"
| "substack.com" `T.isInfixOf` url = "substack" | m "substack.com" = "substack"
| "news.ycombinator.com" `T.isInfixOf` url = "hacker-news" | m "news.ycombinator.com" = "hacker-news"
| "lesswrong.com" `T.isInfixOf` url = "lesswrong" | m "lesswrong.com" = "lesswrong"
-- News -- News
| "nytimes.com" `T.isInfixOf` url = "new-york-times" | m "nytimes.com" = "new-york-times"
-- Institutions -- Institutions
| "nasa.gov" `T.isInfixOf` url = "nasa" | m "nasa.gov" = "nasa"
| "apple.com" `T.isInfixOf` url = "apple" | m "apple.com" = "apple"
| otherwise = "external" | otherwise = "external"
where
-- Label-suffix match: the host is the domain itself or a subdomain
-- of it. Never fires on a lookalike label (@notx.com@) or on text
-- in the path or query.
m d = host == d || ("." <> d) `T.isSuffixOf` host
-- | Percent-encode characters that would break a @?file=@ query-string value. -- | Percent-encode characters that would break a @?file=@ query-string value.
-- Slashes are intentionally left unencoded so root-relative paths remain -- Slashes are intentionally left unencoded so root-relative paths remain

View File

@ -15,6 +15,7 @@
module Filters.Score (inlineScores) where module Filters.Score (inlineScores) where
import Control.Exception (IOException, try) import Control.Exception (IOException, try)
import Data.Char (isHexDigit)
import Data.Maybe (listToMaybe) import Data.Maybe (listToMaybe)
import qualified Data.Text as T import qualified Data.Text as T
import qualified Data.Text.IO as TIO import qualified Data.Text.IO as TIO
@ -86,25 +87,48 @@ findImagePath blocks = listToMaybe
-- | Replace hardcoded black fill/stroke values with @currentColor@ so the -- | Replace hardcoded black fill/stroke values with @currentColor@ so the
-- SVG inherits the CSS @color@ property in both light and dark modes. -- SVG inherits the CSS @color@ property in both light and dark modes.
-- --
-- 6-digit hex patterns are at the bottom of the composition chain -- Quoted attribute forms (@fill="#000"@) are self-delimiting — the
-- (applied first) so they are replaced before the 3-digit shorthand, -- closing quote bounds the match — so plain 'T.replace' is safe for
-- preventing partial matches (e.g. @#000@ matching the prefix of @#000000@). -- them. Unquoted style-property forms (@fill:#000@) are not: naive
-- replacement would also fire on the prefix of a longer hex colour
-- (@fill:#000080@ → @fill:currentColor80@, invalid CSS). Those go
-- through 'replaceHexColor', which rewrites a match only when it is
-- not followed by another hex digit; the boundary check also makes
-- the 3-digit/6-digit application order irrelevant.
processColors :: T.Text -> T.Text processColors :: T.Text -> T.Text
processColors processColors
-- 3-digit hex and keyword patterns (applied after 6-digit replacements) -- 3-digit hex and keyword patterns
= T.replace "fill=\"#000\"" "fill=\"currentColor\"" = T.replace "fill=\"#000\"" "fill=\"currentColor\""
. T.replace "fill=\"black\"" "fill=\"currentColor\"" . T.replace "fill=\"black\"" "fill=\"currentColor\""
. T.replace "stroke=\"#000\"" "stroke=\"currentColor\"" . T.replace "stroke=\"#000\"" "stroke=\"currentColor\""
. T.replace "stroke=\"black\"" "stroke=\"currentColor\"" . T.replace "stroke=\"black\"" "stroke=\"currentColor\""
. T.replace "fill:#000" "fill:currentColor" . replaceHexColor "fill:#000" "fill:currentColor"
. T.replace "fill:black" "fill:currentColor" . T.replace "fill:black" "fill:currentColor"
. T.replace "stroke:#000" "stroke:currentColor" . replaceHexColor "stroke:#000" "stroke:currentColor"
. T.replace "stroke:black" "stroke:currentColor" . T.replace "stroke:black" "stroke:currentColor"
-- 6-digit hex patterns (applied first — bottom of the chain) -- 6-digit hex patterns (applied first — bottom of the chain)
. T.replace "fill=\"#000000\"" "fill=\"currentColor\"" . T.replace "fill=\"#000000\"" "fill=\"currentColor\""
. T.replace "stroke=\"#000000\"" "stroke=\"currentColor\"" . T.replace "stroke=\"#000000\"" "stroke=\"currentColor\""
. T.replace "fill:#000000" "fill:currentColor" . replaceHexColor "fill:#000000" "fill:currentColor"
. T.replace "stroke:#000000" "stroke:currentColor" . replaceHexColor "stroke:#000000" "stroke:currentColor"
-- | 'T.replace' restricted to hex-boundary-terminated matches: an
-- occurrence of @needle@ is rewritten only when the character after
-- it is not another hex digit, so @fill:#000@ never fires inside the
-- longer colours @fill:#0008@, @fill:#000080@, or @fill:#00000080@.
replaceHexColor :: T.Text -> T.Text -> T.Text -> T.Text
replaceHexColor needle replacement = go
where
go t =
let (pre, rest) = T.breakOn needle t
in if T.null rest
then pre
else
let after = T.drop (T.length needle) rest
in case T.uncons after of
Just (c, _) | isHexDigit c ->
pre <> needle <> go after
_ -> pre <> replacement <> go after
buildHtml :: Maybe T.Text -> Maybe T.Text -> T.Text -> T.Text buildHtml :: Maybe T.Text -> Maybe T.Text -> T.Text -> T.Text
buildHtml mName mCaption svgContent = T.concat buildHtml mName mCaption svgContent = T.concat

View File

@ -14,7 +14,8 @@
-- extra filter logic is needed for that case. -- extra filter logic is needed for that case.
-- --
-- The filter is /not/ applied inside headings (where Fira Sans uppercase -- The filter is /not/ applied inside headings (where Fira Sans uppercase
-- text looks intentional) or inside @Code@/@RawInline@ inlines. -- text looks intentional, at any nesting depth — including headings
-- inside divs and block quotes) or inside @Code@/@RawInline@ inlines.
module Filters.Smallcaps (apply) where module Filters.Smallcaps (apply) where
import Data.Char (isUpper, isAlpha) import Data.Char (isUpper, isAlpha)
@ -25,13 +26,31 @@ import Text.Pandoc.Walk (walk)
import qualified Utils as U import qualified Utils as U
-- | Apply smallcaps detection to paragraph-level content. -- | Apply smallcaps detection to paragraph-level content.
-- Skips heading blocks to avoid false positives. -- Heading blocks are skipped at /every/ nesting level (a top-level
-- pattern match would miss a @Header@ inside a @Div@ or
-- @BlockQuote@): each header's @Str@ content is swapped for a
-- sentinel 'RawInline' before the wrapping walk and restored
-- afterwards, so 'wrapCaps' can never see it, wherever the header
-- sits in the block tree.
apply :: Pandoc -> Pandoc apply :: Pandoc -> Pandoc
apply (Pandoc meta blocks) = Pandoc meta (map applyBlock blocks) apply = walk restoreStr . walk wrapCaps . walk protectHeader
applyBlock :: Block -> Block -- | Sentinel format marking a @Str@ that must not be wrapped. It only
applyBlock b@(Header {}) = b -- leave headings untouched -- exists between the protect and restore walks inside 'apply' and
applyBlock b = walk wrapCaps b -- can never leak into the writer.
skipFmt :: Format
skipFmt = Format "smallcaps-skip"
protectHeader :: Block -> Block
protectHeader (Header lvl attr ils) = Header lvl attr (walk protectStr ils)
where
protectStr (Str t) = RawInline skipFmt t
protectStr x = x
protectHeader b = b
restoreStr :: Inline -> Inline
restoreStr (RawInline fmt t) | fmt == skipFmt = Str t
restoreStr x = x
-- | Wrap an all-caps Str token in an abbr element, preserving any trailing -- | Wrap an all-caps Str token in an abbr element, preserving any trailing
-- punctuation (comma, period, colon, semicolon, closing paren/bracket) -- punctuation (comma, period, colon, semicolon, closing paren/bracket)

View File

@ -19,12 +19,15 @@
-- source-preview rule in 'Site.rules') and renders a -- source-preview rule in 'Site.rules') and renders a
-- syntax-highlighted snippet via Prism. -- syntax-highlighted snippet via Prism.
-- --
-- Conservative-by-design: the trigger only fires on paths under a -- Conservative-by-design: the trigger only fires on paths the
-- short whitelist of top-level directories, or a small set of named -- @/source/@ serving rule actually publishes ('isServedPath', a
-- root files. This keeps the parser cheap and avoids false positives -- mirror of @sourcePreviewable@ in 'Site.rules'), or a small set of
-- on words that happen to contain a slash and a dot. -- named root files. This keeps the parser cheap, avoids false
-- positives on words that happen to contain a slash and a dot, and
-- guarantees every wrapped path has a fetchable @/source/…@ copy.
module Filters.SourceRefs (apply, isSourcePath, forgejoSourceUrl) where module Filters.SourceRefs (apply, isSourcePath, forgejoSourceUrl) where
import Control.Monad (when)
import Data.IORef (IORef, atomicModifyIORef', newIORef, readIORef) import Data.IORef (IORef, atomicModifyIORef', newIORef, readIORef)
import qualified Data.Map.Strict as Map import qualified Data.Map.Strict as Map
import Data.Text (Text) import Data.Text (Text)
@ -94,16 +97,17 @@ classifyExistingLink x = pure x
-- Heuristic -- Heuristic
-- --------------------------------------------------------------------------- -- ---------------------------------------------------------------------------
-- | True when the text looks like a repo-relative path under one of -- | True when the text looks like a repo-relative path that the
-- the whitelisted directories (or is a whitelisted root file), ends -- @/source/@ serving rule actually publishes (or is a whitelisted
-- in a known source extension, and contains only safe path -- root file), ends in a known source extension, and contains only
-- characters. Conservative by design — the goal is no false -- safe path characters. Conservative by design — the goal is no
-- positives on prose that incidentally contains a slash and a dot. -- false positives on prose that incidentally contains a slash and a
-- dot, and no wrapped path whose popup fetch would 404.
isSourcePath :: Text -> Bool isSourcePath :: Text -> Bool
isSourcePath t = and isSourcePath t = and
[ not (T.null t) [ not (T.null t)
, T.all safeChar t , T.all safeChar t
, (hasKnownPrefix t && hasKnownExt t) || isKnownRootFile t , (isServedPath t && hasKnownExt t) || isKnownRootFile t
] ]
where where
safeChar c = safeChar c =
@ -112,11 +116,26 @@ isSourcePath t = and
|| ('0' <= c && c <= '9') || ('0' <= c && c <= '9')
|| c == '/' || c == '.' || c == '_' || c == '-' || c == '+' || c == '/' || c == '.' || c == '_' || c == '-' || c == '+'
hasKnownPrefix :: Text -> Bool -- | Mirror of the @sourcePreviewable@ whitelist in 'Site.rules' (the
hasKnownPrefix t = any (`T.isPrefixOf` t) -- rule that copies files to @/source/<path>@) — the two must stay
[ "build/", "static/", "templates/", "tools/" -- aligned so every link this filter emits has a corresponding
, "nginx/", "data/", "content/", "yaml-source/" -- @/source/…@ target for the popup to fetch. Directories Site.hs
-- does not serve (e.g. @content/@) are deliberately absent here:
-- wrapping them would emit popups that are guaranteed to 404.
isServedPath :: Text -> Bool
isServedPath t = or
[ "build/" `T.isPrefixOf` t && hasExt ".hs"
, "static/js/" `T.isPrefixOf` t
, "static/css/" `T.isPrefixOf` t
, "templates/" `T.isPrefixOf` t
, "tools/" `T.isPrefixOf` t && (hasExt ".sh" || hasExt ".py")
, "nginx/" `T.isPrefixOf` t && hasExt ".conf"
, "data/" `T.isPrefixOf` t
&& not ("/" `T.isInfixOf` T.drop 5 t) -- top-level data files only
&& (hasExt ".json" || hasExt ".yaml" || hasExt ".md" || hasExt ".bib")
] ]
where
hasExt e = e `T.isSuffixOf` T.toLower t
hasKnownExt :: Text -> Bool hasKnownExt :: Text -> Bool
hasKnownExt t = hasKnownExt t =
@ -125,7 +144,7 @@ hasKnownExt t =
[ ".hs", ".js", ".mjs", ".css", ".html" [ ".hs", ".js", ".mjs", ".css", ".html"
, ".py", ".cabal", ".md", ".yaml", ".yml" , ".py", ".cabal", ".md", ".yaml", ".yml"
, ".toml", ".sh", ".bash", ".svg", ".conf" , ".toml", ".sh", ".bash", ".svg", ".conf"
, ".json", ".ini", ".tex" , ".json", ".ini", ".tex", ".bib"
] ]
isKnownRootFile :: Text -> Bool isKnownRootFile :: Text -> Bool
@ -142,14 +161,19 @@ isKnownRootFile t = t `elem`
-- File existence cache -- File existence cache
-- --------------------------------------------------------------------------- -- ---------------------------------------------------------------------------
-- | Process-wide memo of @doesFileExist@ results, keyed by the same -- | Process-wide memo of /positive/ @doesFileExist@ results, keyed by
-- path the popup will fetch. Hakyll runs this filter once per -- the same path the popup will fetch. Hakyll runs this filter once
-- compiled page and the same source-file references recur across -- per compiled page and the same source-file references recur across
-- many pages (e.g. @build\/Filters\/Links.hs@ in the Links page, -- many pages (e.g. @build\/Filters\/Links.hs@ in the Links page,
-- the Colophon, several essays); the cache turns N stats into one -- the Colophon, several essays); the cache turns N stats into one
-- per distinct path. The build process's working directory is the -- per distinct path. Only existence is memoized: a missing file is
-- project root, so the path can be passed straight to -- re-stat'ed on every miss, so a source file created during a
-- 'doesFileExist' without prefixing. -- long-lived @make watch@ session is picked up on the next rebuild
-- instead of staying "absent" for the process lifetime. (A file
-- /deleted/ mid-watch stays cached as present until restart — the
-- benign direction: the popup fetch 404s and simply never appears.)
-- The build process's working directory is the project root, so the
-- path can be passed straight to 'doesFileExist' without prefixing.
{-# NOINLINE existsCacheRef #-} {-# NOINLINE existsCacheRef #-}
existsCacheRef :: IORef (Map.Map Text Bool) existsCacheRef :: IORef (Map.Map Text Bool)
existsCacheRef = unsafePerformIO (newIORef Map.empty) existsCacheRef = unsafePerformIO (newIORef Map.empty)
@ -161,6 +185,7 @@ existsCached path = do
Just b -> pure b Just b -> pure b
Nothing -> do Nothing -> do
b <- doesFileExist (T.unpack path) b <- doesFileExist (T.unpack path)
when b $
atomicModifyIORef' existsCacheRef (\m -> (Map.insert path b m, ())) atomicModifyIORef' existsCacheRef (\m -> (Map.insert path b m, ()))
pure b pure b

View File

@ -5,7 +5,13 @@
-- HTML placeholders that transclude.js resolves at runtime. -- HTML placeholders that transclude.js resolves at runtime.
-- --
-- A directive must be the sole content of a line (after trimming) to be -- A directive must be the sole content of a line (after trimming) to be
-- replaced — this prevents accidental substitution inside prose or code. -- replaced — this prevents accidental substitution inside prose.
--
-- Code protection (honest scope): lines inside /fenced/ code blocks
-- are passed through untouched ('Filters.Wikilinks.mapOutsideFences'),
-- so fenced examples can show @{{slug}}@ literally. Indented code
-- blocks and inline code spans are NOT recognised — a full-line
-- directive inside either is still rewritten.
-- --
-- Examples: -- Examples:
-- {{my-essay}} → full-page transclusion of /my-essay.html -- {{my-essay}} → full-page transclusion of /my-essay.html
@ -14,11 +20,13 @@
module Filters.Transclusion (preprocess) where module Filters.Transclusion (preprocess) where
import Data.List (isSuffixOf, isPrefixOf, stripPrefix) import Data.List (isSuffixOf, isPrefixOf, stripPrefix)
import Filters.Wikilinks (mapOutsideFences)
import qualified Utils as U import qualified Utils as U
-- | Apply transclusion substitution to the raw Markdown source string. -- | Apply transclusion substitution to the raw Markdown source string,
-- skipping lines inside fenced code blocks.
preprocess :: String -> String preprocess :: String -> String
preprocess = unlines . map processLine . lines preprocess = mapOutsideFences processLine
processLine :: String -> String processLine :: String -> String
processLine line = processLine line =

View File

@ -37,6 +37,7 @@
module Filters.Viz (inlineViz) where module Filters.Viz (inlineViz) where
import Control.Exception (IOException, catch) import Control.Exception (IOException, catch)
import Data.Char (isHexDigit)
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import qualified Data.Text as T import qualified Data.Text as T
import System.Directory (doesFileExist) import System.Directory (doesFileExist)
@ -117,20 +118,47 @@ runScript baseDir attrs =
-- | Replace hardcoded black fill/stroke values with @currentColor@ so the -- | Replace hardcoded black fill/stroke values with @currentColor@ so the
-- embedded SVG inherits the CSS text colour in both light and dark modes. -- embedded SVG inherits the CSS text colour in both light and dark modes.
--
-- Quoted attribute forms (@fill="#000"@) are self-delimiting — the
-- closing quote bounds the match — so plain 'T.replace' is safe for
-- them. Unquoted style-property forms (@fill:#000@) are not: naive
-- replacement would also fire on the prefix of a longer hex colour
-- (@fill:#000080@ → @fill:currentColor80@, invalid CSS). Those go
-- through 'replaceHexColor', which rewrites a match only when it is
-- not followed by another hex digit.
processColors :: T.Text -> T.Text processColors :: T.Text -> T.Text
processColors processColors
= T.replace "fill=\"#000\"" "fill=\"currentColor\"" = T.replace "fill=\"#000\"" "fill=\"currentColor\""
. T.replace "fill=\"black\"" "fill=\"currentColor\"" . T.replace "fill=\"black\"" "fill=\"currentColor\""
. T.replace "stroke=\"#000\"" "stroke=\"currentColor\"" . T.replace "stroke=\"#000\"" "stroke=\"currentColor\""
. T.replace "stroke=\"black\"" "stroke=\"currentColor\"" . T.replace "stroke=\"black\"" "stroke=\"currentColor\""
. T.replace "fill:#000" "fill:currentColor" . replaceHexColor "fill:#000" "fill:currentColor"
. T.replace "fill:black" "fill:currentColor" . T.replace "fill:black" "fill:currentColor"
. T.replace "stroke:#000" "stroke:currentColor" . replaceHexColor "stroke:#000" "stroke:currentColor"
. T.replace "stroke:black" "stroke:currentColor" . T.replace "stroke:black" "stroke:currentColor"
. T.replace "fill=\"#000000\"" "fill=\"currentColor\"" . T.replace "fill=\"#000000\"" "fill=\"currentColor\""
. T.replace "stroke=\"#000000\"" "stroke=\"currentColor\"" . T.replace "stroke=\"#000000\"" "stroke=\"currentColor\""
. T.replace "fill:#000000" "fill:currentColor" . replaceHexColor "fill:#000000" "fill:currentColor"
. T.replace "stroke:#000000" "stroke:currentColor" . replaceHexColor "stroke:#000000" "stroke:currentColor"
-- | 'T.replace' restricted to hex-boundary-terminated matches: an
-- occurrence of @needle@ is rewritten only when the character after
-- it is not another hex digit, so @fill:#000@ never fires inside the
-- longer colours @fill:#0008@, @fill:#000080@, or @fill:#00000080@.
-- (Mirrors 'Filters.Score.replaceHexColor'.)
replaceHexColor :: T.Text -> T.Text -> T.Text -> T.Text
replaceHexColor needle replacement = go
where
go t =
let (pre, rest) = T.breakOn needle t
in if T.null rest
then pre
else
let after = T.drop (T.length needle) rest
in case T.uncons after of
Just (c, _) | isHexDigit c ->
pre <> needle <> go after
_ -> pre <> replacement <> go after
-- --------------------------------------------------------------------------- -- ---------------------------------------------------------------------------
-- JSON safety for <script> embedding -- JSON safety for <script> embedding

View File

@ -12,23 +12,129 @@
-- replaced with hyphens, non-alphanumeric characters stripped, and -- replaced with hyphens, non-alphanumeric characters stripped, and
-- a @.html@ suffix appended so the link resolves identically under -- a @.html@ suffix appended so the link resolves identically under
-- the dev server, file:// previews, and nginx in production. -- the dev server, file:// previews, and nginx in production.
module Filters.Wikilinks (preprocess) where --
-- Code protection (honest scope): lines inside /fenced/ code blocks
-- are passed through untouched (see 'mapOutsideFences'), and within a
-- line, inline code spans (backtick runs, CommonMark equal-length
-- matching) are skipped — so both fenced and @`inline`@ examples can
-- show @[[…]]@ literally. Indented code blocks and code spans that
-- cross a line break are NOT recognised; a wikilink inside those is
-- still rewritten.
module Filters.Wikilinks (preprocess, mapOutsideFences) where
import Data.Char (isAlphaNum, toLower, isSpace) import Data.Char (isAlphaNum, toLower, isSpace)
import Data.List (intercalate) import Data.List (intercalate)
import qualified Utils as U import qualified Utils as U
-- | Scan the raw Markdown source for @[[…]]@ wikilinks and replace them -- | Scan the raw Markdown source for @[[…]]@ wikilinks and replace them
-- with standard Markdown link syntax. -- with standard Markdown link syntax. Processing is line-by-line and
-- skips fenced code blocks; a wikilink therefore cannot span a line
-- break (which was never a sensible authoring form).
preprocess :: String -> String preprocess :: String -> String
preprocess [] = [] preprocess = mapOutsideFences replaceWikilinks
preprocess ('[':'[':rest) =
replaceWikilinks :: String -> String
replaceWikilinks = go
where
go [] = []
-- Inline code span: a backtick run opens a span closed by a run of
-- exactly the same length (CommonMark). Its body passes through
-- verbatim so documentation can quote @`[[…]]`@ literally. An
-- unclosed run is literal text — and then a following @[[…]]@ is
-- genuinely a wikilink, matching how Pandoc will read the line.
go s@('`':_) =
let (run, afterRun) = span (== '`') s
in case codeSpan (length run) afterRun of
Just (body, after) -> run ++ body ++ run ++ go after
Nothing -> run ++ go afterRun
go ('[':'[':rest) =
case break (== ']') rest of case break (== ']') rest of
(inner, ']':']':after) (inner, ']':']':after)
| not (null inner) -> | not (null inner) ->
toMarkdownLink inner ++ preprocess after toMarkdownLink inner ++ go after
_ -> '[' : '[' : preprocess rest _ -> '[' : '[' : go rest
preprocess (c:rest) = c : preprocess rest go (c:rest) = c : go rest
-- @codeSpan n s@: the span body and the remainder after a closing
-- run of exactly @n@ backticks; 'Nothing' when no closer exists on
-- this line.
codeSpan :: Int -> String -> Maybe (String, String)
codeSpan n = loop
where
loop [] = Nothing
loop s@('`':_) =
let (run, rest) = span (== '`') s
in if length run == n
then Just ("", rest)
else prepend run <$> loop rest
loop (c:cs) = prepend [c] <$> loop cs
prepend pre (body, after) = (pre ++ body, after)
-- ---------------------------------------------------------------------------
-- Fence-aware line mapping (shared by all source-level preprocessors)
-- ---------------------------------------------------------------------------
-- | Apply a line transformation to every line that is not part of a
-- fenced code block. Shared by the three source-level preprocessors
-- (wikilinks here, 'Filters.Transclusion', 'Filters.EmbedPdf') so
-- their directive syntax can be quoted literally inside fenced code.
--
-- Fence tracking follows CommonMark: an opener is at most three
-- spaces of indentation followed by a run of at least three backticks
-- or tildes (longer runs allowed); for backtick fences the info
-- string may not contain a backtick. The closer uses the same fence
-- character, a run at least as long as the opener, and nothing but
-- whitespace after it. An unclosed fence extends to the end of the
-- document. Fence delimiter lines themselves pass through untouched.
--
-- Honest scope: only /fenced/ code blocks are protected. Indented
-- code blocks and inline code spans are not recognised here — a
-- directive inside either is still rewritten.
mapOutsideFences :: (String -> String) -> String -> String
mapOutsideFences f = unlines . go Nothing . lines
where
go _ [] = []
go Nothing (l:ls) =
case openingFence l of
Just fence -> l : go (Just fence) ls
Nothing -> f l : go Nothing ls
go st@(Just fence) (l:ls)
| closesFence fence l = l : go Nothing ls
| otherwise = l : go st ls
-- | The fence character and run length of a CommonMark fence opener,
-- or 'Nothing' when the line does not open a fence.
openingFence :: String -> Maybe (Char, Int)
openingFence l = do
rest <- stripFenceIndent l
case rest of
(c:_) | c == '`' || c == '~' ->
let run = takeWhile (== c) rest
n = length run
info = drop n rest
in if n >= 3 && (c == '~' || '`' `notElem` info)
then Just (c, n)
else Nothing
_ -> Nothing
-- | True when the line closes the fence opened by @(c, n)@: the same
-- fence character, a run at least as long as the opener, and only
-- whitespace after it.
closesFence :: (Char, Int) -> String -> Bool
closesFence (c, n) l =
case stripFenceIndent l of
Nothing -> False
Just rest ->
let run = takeWhile (== c) rest
in length run >= n && all isSpace (drop (length run) rest)
-- | Strip up to three leading spaces (the indentation CommonMark allows
-- on a fence line); 'Nothing' for four or more, which would be an
-- indented code block rather than a fence.
stripFenceIndent :: String -> Maybe String
stripFenceIndent l =
let (indent, rest) = span (== ' ') l
in if length indent <= 3 then Just rest else Nothing
-- | Convert the inner content of @[[…]]@ to a Markdown link. -- | Convert the inner content of @[[…]]@ to a Markdown link.
-- --