Filters: fence/code-span awareness, host matching, nested-header skip

- SourceRefs trigger whitelist aligned to the /source/ serving
  whitelist (drops content/, yaml-source/, broad static//tools//data
  prefixes; adds .bib); existsCached no longer memoizes non-existence,
  so files created under make watch are picked up (§2.5, §2.16)
- fill/stroke hex replacement is boundary-aware: #000080 and 8-digit
  RGBA forms can no longer be corrupted into currentColor80 (§2.12)
- Wikilinks/Transclusion/EmbedPdf skip fenced code blocks (shared
  CommonMark fence tracker), and wikilinks additionally skip inline
  code spans — the syntax-documentation essay now renders its own
  examples literally while live wikilinks still convert (verified both
  ways in output) (§2.13)
- domainIcon matches the extracted host by label suffix instead of
  substring-of-URL; extractHost also strips userinfo (§2.14)
- webpSrc escaped in srcset; internal PDF links no longer double-
  classified; Smallcaps/Archive header-skip now holds at every nesting
  depth via protect/restore walks (§2.17)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Levi Neuwirth 2026-06-10 11:13:08 -04:00
parent c8eeaaa9bc
commit f254ce866e
10 changed files with 362 additions and 97 deletions

View File

@ -30,22 +30,45 @@ import Text.Pandoc.Walk (walk)
import ArchiveIndex (ArchiveStatus (..), archiveIndexIsEmpty,
archiveSlugFor, archiveStatusForSlug)
-- | Annotate body links. Headings are left alone — an affordance there
-- would be noise. Identity when the index is empty.
-- | Annotate body links. Links inside headings are left alone at
-- /every/ nesting depth — an affordance there would be noise, and a
-- top-level pattern match would miss a @Header@ inside a @Div@ or
-- @BlockQuote@. Header links are tagged with a sentinel class before
-- the annotation walk and stripped of it afterwards, so the sentinel
-- can never leak into the writer. Identity when the index is empty.
apply :: Pandoc -> Pandoc
apply doc@(Pandoc meta blocks)
apply doc
| archiveIndexIsEmpty = doc
| otherwise = Pandoc meta (map annotateBlock blocks)
| otherwise =
walk unprotectLink . walk annotateInlines . walk protectHeader $ doc
annotateBlock :: Block -> Block
annotateBlock h@Header{} = h
annotateBlock b = walk annotateInlines b
-- | Sentinel class marking a link the annotation walk must skip. It
-- only exists between the protect and unprotect walks inside 'apply'.
skipClass :: T.Text
skipClass = "archive-header-skip"
protectHeader :: Block -> Block
protectHeader (Header lvl attr ils) = Header lvl attr (walk protect ils)
where
protect (Link (ident, cls, kvs) text target) =
Link (ident, skipClass : cls, kvs) text target
protect x = x
protectHeader b = b
unprotectLink :: Inline -> Inline
unprotectLink (Link (ident, cls, kvs) text target)
| skipClass `elem` cls =
Link (ident, filter (/= skipClass) cls, kvs) text target
unprotectLink x = x
-- | For each archived @Link@: flip it if the target is 'Rotted', else
-- append the affordance. Non-archived links pass through untouched.
-- append the affordance. Non-archived links — and links protected by
-- 'protectHeader' — pass through untouched.
annotateInlines :: [Inline] -> [Inline]
annotateInlines = concatMap expand
where
expand l@(Link (_, cls, _) _ _)
| skipClass `elem` cls = [l]
expand l@(Link attr text (url, _)) =
case archiveSlugFor url of
Nothing -> [l]

View File

@ -12,15 +12,23 @@
--
-- The file path must be root-relative (begins with @/@).
-- PDF.js is expected to be vendored at @/pdfjs/web/viewer.html@.
--
-- Code protection (honest scope): lines inside /fenced/ code blocks
-- are passed through untouched ('Filters.Wikilinks.mapOutsideFences'),
-- so fenced examples can show @{{pdf:…}}@ literally. Indented code
-- blocks and inline code spans are NOT recognised — a full-line
-- directive inside either is still rewritten.
module Filters.EmbedPdf (preprocess) where
import Data.Char (isDigit)
import Data.List (isPrefixOf, isSuffixOf)
import Filters.Wikilinks (mapOutsideFences)
import qualified Utils as U
-- | Apply PDF-embed substitution to the raw Markdown source string.
-- | Apply PDF-embed substitution to the raw Markdown source string,
-- skipping lines inside fenced code blocks.
preprocess :: String -> String
preprocess = unlines . map processLine . lines
preprocess = mapOutsideFences processLine
processLine :: String -> String
processLine line =

View File

@ -231,7 +231,7 @@ renderPicture :: Attr -> [Inline] -> Target -> Bool -> Maybe (Int, Int) -> Text
renderPicture (ident, classes, kvs) alt (src, title) lightbox dims =
T.concat
[ "<picture>"
, "<source srcset=\"", T.pack webpSrc, "\" type=\"image/webp\">"
, "<source srcset=\"", esc (T.pack webpSrc), "\" type=\"image/webp\">"
, "<img"
, attrId ident
, attrClasses classes

View File

@ -16,8 +16,11 @@ import Text.Pandoc.Definition
import Text.Pandoc.Walk (walk)
-- | Apply link classification to the entire document.
-- Two passes: PDF links first (rewrites href to viewer URL), then external
-- link classification (operates on http/https, so no overlap).
-- Two passes: PDF links first (rewrites href to the viewer URL and tags
-- the anchor @pdf-link@), then general classification. The second pass
-- explicitly skips anchors the PDF pass already claimed — the viewer URL
-- is root-relative, so without that guard it would also be classified as
-- an internal page link and get double chrome.
apply :: Pandoc -> Pandoc
apply = walk classifyLink . walk classifyPdfLink
@ -49,6 +52,11 @@ classifyLink l@(Link (_, classes, _) _ _)
-- brand icon stamp, and have their own popup provider. Leave them
-- entirely alone.
| "source-ref" `elem` classes = l
-- PDF links were already rewritten to the (root-relative) viewer URL
-- and given their own chrome by 'classifyPdfLink' in the preceding
-- pass; without this guard they would be double-classified as
-- internal page links.
| "pdf-link" `elem` classes = l
classifyLink (Link (ident, classes, kvs) ils (url, title))
| isExternal url =
let icon = domainIcon url
@ -100,8 +108,9 @@ isExternal url =
where
siteHost = "levineuwirth.org"
-- | Extract the lowercased hostname from an absolute http(s) URL.
-- Returns 'Nothing' for non-http(s) URLs (relative paths, mailto:, etc.).
-- | Extract the lowercased hostname from an absolute http(s) URL,
-- stripping any userinfo (@user:pass\@@) and port. Returns 'Nothing'
-- for non-http(s) URLs (relative paths, mailto:, etc.).
extractHost :: Text -> Maybe Text
extractHost url
| Just rest <- T.stripPrefix "https://" url = Just (hostOf rest)
@ -109,45 +118,60 @@ extractHost url
| otherwise = Nothing
where
hostOf rest =
let withPort = T.takeWhile (\c -> c /= '/' && c /= '?' && c /= '#') rest
host = T.takeWhile (/= ':') withPort
let authority = T.takeWhile (\c -> c /= '/' && c /= '?' && c /= '#') rest
-- 'T.breakOnEnd' yields the segment after the last @\@@, or
-- the whole authority when there is no userinfo.
(_, hostPort) = T.breakOnEnd "@" authority
host = T.takeWhile (/= ':') hostPort
in T.toLower host
-- | Icon name for the link, matching a file in /images/link-icons/<name>.svg.
--
-- Matches on the URL's host only, never on the full URL — a path like
-- @https://example.org/why-x.com-failed@ must not get the Twitter
-- icon. URLs with no extractable host get the generic icon.
domainIcon :: Text -> Text
domainIcon url
domainIcon url = maybe "external" iconForHost (extractHost url)
iconForHost :: Text -> Text
iconForHost host
-- Scholarly / reference
| "wikipedia.org" `T.isInfixOf` url = "wikipedia"
| "arxiv.org" `T.isInfixOf` url = "arxiv"
| "doi.org" `T.isInfixOf` url = "doi"
| "worldcat.org" `T.isInfixOf` url = "worldcat"
| "orcid.org" `T.isInfixOf` url = "orcid"
| "archive.org" `T.isInfixOf` url = "internet-archive"
| m "wikipedia.org" = "wikipedia"
| m "arxiv.org" = "arxiv"
| m "doi.org" = "doi"
| m "worldcat.org" = "worldcat"
| m "orcid.org" = "orcid"
| m "archive.org" = "internet-archive"
-- Code / software
| "github.com" `T.isInfixOf` url = "github"
| "git.levineuwirth.org" `T.isInfixOf` url = "forgejo"
| "tensorflow.org" `T.isInfixOf` url = "tensorflow"
| m "github.com" = "github"
| m "git.levineuwirth.org" = "forgejo"
| m "tensorflow.org" = "tensorflow"
-- AI companies (consumer products share a brand icon with the lab)
| "anthropic.com" `T.isInfixOf` url = "anthropic"
| "claude.ai" `T.isInfixOf` url = "anthropic"
| "openai.com" `T.isInfixOf` url = "openai"
| "chatgpt.com" `T.isInfixOf` url = "openai"
| m "anthropic.com" = "anthropic"
| m "claude.ai" = "anthropic"
| m "openai.com" = "openai"
| m "chatgpt.com" = "openai"
-- Social / media
| "twitter.com" `T.isInfixOf` url = "twitter"
| "x.com" `T.isInfixOf` url = "twitter"
| "reddit.com" `T.isInfixOf` url = "reddit"
| "youtube.com" `T.isInfixOf` url = "youtube"
| "youtu.be" `T.isInfixOf` url = "youtube"
| "tiktok.com" `T.isInfixOf` url = "tiktok"
| "substack.com" `T.isInfixOf` url = "substack"
| "news.ycombinator.com" `T.isInfixOf` url = "hacker-news"
| "lesswrong.com" `T.isInfixOf` url = "lesswrong"
| m "twitter.com" = "twitter"
| m "x.com" = "twitter"
| m "reddit.com" = "reddit"
| m "youtube.com" = "youtube"
| m "youtu.be" = "youtube"
| m "tiktok.com" = "tiktok"
| m "substack.com" = "substack"
| m "news.ycombinator.com" = "hacker-news"
| m "lesswrong.com" = "lesswrong"
-- News
| "nytimes.com" `T.isInfixOf` url = "new-york-times"
| m "nytimes.com" = "new-york-times"
-- Institutions
| "nasa.gov" `T.isInfixOf` url = "nasa"
| "apple.com" `T.isInfixOf` url = "apple"
| otherwise = "external"
| m "nasa.gov" = "nasa"
| m "apple.com" = "apple"
| otherwise = "external"
where
-- Label-suffix match: the host is the domain itself or a subdomain
-- of it. Never fires on a lookalike label (@notx.com@) or on text
-- in the path or query.
m d = host == d || ("." <> d) `T.isSuffixOf` host
-- | Percent-encode characters that would break a @?file=@ query-string value.
-- Slashes are intentionally left unencoded so root-relative paths remain

View File

@ -15,6 +15,7 @@
module Filters.Score (inlineScores) where
import Control.Exception (IOException, try)
import Data.Char (isHexDigit)
import Data.Maybe (listToMaybe)
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
@ -86,25 +87,48 @@ findImagePath blocks = listToMaybe
-- | Replace hardcoded black fill/stroke values with @currentColor@ so the
-- SVG inherits the CSS @color@ property in both light and dark modes.
--
-- 6-digit hex patterns are at the bottom of the composition chain
-- (applied first) so they are replaced before the 3-digit shorthand,
-- preventing partial matches (e.g. @#000@ matching the prefix of @#000000@).
-- Quoted attribute forms (@fill="#000"@) are self-delimiting — the
-- closing quote bounds the match — so plain 'T.replace' is safe for
-- them. Unquoted style-property forms (@fill:#000@) are not: naive
-- replacement would also fire on the prefix of a longer hex colour
-- (@fill:#000080@ → @fill:currentColor80@, invalid CSS). Those go
-- through 'replaceHexColor', which rewrites a match only when it is
-- not followed by another hex digit; the boundary check also makes
-- the 3-digit/6-digit application order irrelevant.
processColors :: T.Text -> T.Text
processColors
-- 3-digit hex and keyword patterns (applied after 6-digit replacements)
-- 3-digit hex and keyword patterns
= T.replace "fill=\"#000\"" "fill=\"currentColor\""
. T.replace "fill=\"black\"" "fill=\"currentColor\""
. T.replace "stroke=\"#000\"" "stroke=\"currentColor\""
. T.replace "stroke=\"black\"" "stroke=\"currentColor\""
. T.replace "fill:#000" "fill:currentColor"
. replaceHexColor "fill:#000" "fill:currentColor"
. T.replace "fill:black" "fill:currentColor"
. T.replace "stroke:#000" "stroke:currentColor"
. replaceHexColor "stroke:#000" "stroke:currentColor"
. T.replace "stroke:black" "stroke:currentColor"
-- 6-digit hex patterns (applied first — bottom of the chain)
. T.replace "fill=\"#000000\"" "fill=\"currentColor\""
. T.replace "stroke=\"#000000\"" "stroke=\"currentColor\""
. T.replace "fill:#000000" "fill:currentColor"
. T.replace "stroke:#000000" "stroke:currentColor"
. replaceHexColor "fill:#000000" "fill:currentColor"
. replaceHexColor "stroke:#000000" "stroke:currentColor"
-- | 'T.replace' restricted to hex-boundary-terminated matches: an
-- occurrence of @needle@ is rewritten only when the character after
-- it is not another hex digit, so @fill:#000@ never fires inside the
-- longer colours @fill:#0008@, @fill:#000080@, or @fill:#00000080@.
replaceHexColor :: T.Text -> T.Text -> T.Text -> T.Text
replaceHexColor needle replacement = go
where
go t =
let (pre, rest) = T.breakOn needle t
in if T.null rest
then pre
else
let after = T.drop (T.length needle) rest
in case T.uncons after of
Just (c, _) | isHexDigit c ->
pre <> needle <> go after
_ -> pre <> replacement <> go after
buildHtml :: Maybe T.Text -> Maybe T.Text -> T.Text -> T.Text
buildHtml mName mCaption svgContent = T.concat

View File

@ -14,7 +14,8 @@
-- extra filter logic is needed for that case.
--
-- The filter is /not/ applied inside headings (where Fira Sans uppercase
-- text looks intentional) or inside @Code@/@RawInline@ inlines.
-- text looks intentional, at any nesting depth — including headings
-- inside divs and block quotes) or inside @Code@/@RawInline@ inlines.
module Filters.Smallcaps (apply) where
import Data.Char (isUpper, isAlpha)
@ -25,13 +26,31 @@ import Text.Pandoc.Walk (walk)
import qualified Utils as U
-- | Apply smallcaps detection to paragraph-level content.
-- Skips heading blocks to avoid false positives.
-- Heading blocks are skipped at /every/ nesting level (a top-level
-- pattern match would miss a @Header@ inside a @Div@ or
-- @BlockQuote@): each header's @Str@ content is swapped for a
-- sentinel 'RawInline' before the wrapping walk and restored
-- afterwards, so 'wrapCaps' can never see it, wherever the header
-- sits in the block tree.
apply :: Pandoc -> Pandoc
apply (Pandoc meta blocks) = Pandoc meta (map applyBlock blocks)
apply = walk restoreStr . walk wrapCaps . walk protectHeader
applyBlock :: Block -> Block
applyBlock b@(Header {}) = b -- leave headings untouched
applyBlock b = walk wrapCaps b
-- | Sentinel format marking a @Str@ that must not be wrapped. It only
-- exists between the protect and restore walks inside 'apply' and
-- can never leak into the writer.
skipFmt :: Format
skipFmt = Format "smallcaps-skip"
protectHeader :: Block -> Block
protectHeader (Header lvl attr ils) = Header lvl attr (walk protectStr ils)
where
protectStr (Str t) = RawInline skipFmt t
protectStr x = x
protectHeader b = b
restoreStr :: Inline -> Inline
restoreStr (RawInline fmt t) | fmt == skipFmt = Str t
restoreStr x = x
-- | Wrap an all-caps Str token in an abbr element, preserving any trailing
-- punctuation (comma, period, colon, semicolon, closing paren/bracket)

View File

@ -19,12 +19,15 @@
-- source-preview rule in 'Site.rules') and renders a
-- syntax-highlighted snippet via Prism.
--
-- Conservative-by-design: the trigger only fires on paths under a
-- short whitelist of top-level directories, or a small set of named
-- root files. This keeps the parser cheap and avoids false positives
-- on words that happen to contain a slash and a dot.
-- Conservative-by-design: the trigger only fires on paths the
-- @/source/@ serving rule actually publishes ('isServedPath', a
-- mirror of @sourcePreviewable@ in 'Site.rules'), or a small set of
-- named root files. This keeps the parser cheap, avoids false
-- positives on words that happen to contain a slash and a dot, and
-- guarantees every wrapped path has a fetchable @/source/…@ copy.
module Filters.SourceRefs (apply, isSourcePath, forgejoSourceUrl) where
import Control.Monad (when)
import Data.IORef (IORef, atomicModifyIORef', newIORef, readIORef)
import qualified Data.Map.Strict as Map
import Data.Text (Text)
@ -94,16 +97,17 @@ classifyExistingLink x = pure x
-- Heuristic
-- ---------------------------------------------------------------------------
-- | True when the text looks like a repo-relative path under one of
-- the whitelisted directories (or is a whitelisted root file), ends
-- in a known source extension, and contains only safe path
-- characters. Conservative by design — the goal is no false
-- positives on prose that incidentally contains a slash and a dot.
-- | True when the text looks like a repo-relative path that the
-- @/source/@ serving rule actually publishes (or is a whitelisted
-- root file), ends in a known source extension, and contains only
-- safe path characters. Conservative by design — the goal is no
-- false positives on prose that incidentally contains a slash and a
-- dot, and no wrapped path whose popup fetch would 404.
isSourcePath :: Text -> Bool
isSourcePath t = and
[ not (T.null t)
, T.all safeChar t
, (hasKnownPrefix t && hasKnownExt t) || isKnownRootFile t
, (isServedPath t && hasKnownExt t) || isKnownRootFile t
]
where
safeChar c =
@ -112,11 +116,26 @@ isSourcePath t = and
|| ('0' <= c && c <= '9')
|| c == '/' || c == '.' || c == '_' || c == '-' || c == '+'
hasKnownPrefix :: Text -> Bool
hasKnownPrefix t = any (`T.isPrefixOf` t)
[ "build/", "static/", "templates/", "tools/"
, "nginx/", "data/", "content/", "yaml-source/"
-- | Mirror of the @sourcePreviewable@ whitelist in 'Site.rules' (the
-- rule that copies files to @/source/<path>@) — the two must stay
-- aligned so every link this filter emits has a corresponding
-- @/source/…@ target for the popup to fetch. Directories Site.hs
-- does not serve (e.g. @content/@) are deliberately absent here:
-- wrapping them would emit popups that are guaranteed to 404.
isServedPath :: Text -> Bool
isServedPath t = or
[ "build/" `T.isPrefixOf` t && hasExt ".hs"
, "static/js/" `T.isPrefixOf` t
, "static/css/" `T.isPrefixOf` t
, "templates/" `T.isPrefixOf` t
, "tools/" `T.isPrefixOf` t && (hasExt ".sh" || hasExt ".py")
, "nginx/" `T.isPrefixOf` t && hasExt ".conf"
, "data/" `T.isPrefixOf` t
&& not ("/" `T.isInfixOf` T.drop 5 t) -- top-level data files only
&& (hasExt ".json" || hasExt ".yaml" || hasExt ".md" || hasExt ".bib")
]
where
hasExt e = e `T.isSuffixOf` T.toLower t
hasKnownExt :: Text -> Bool
hasKnownExt t =
@ -125,7 +144,7 @@ hasKnownExt t =
[ ".hs", ".js", ".mjs", ".css", ".html"
, ".py", ".cabal", ".md", ".yaml", ".yml"
, ".toml", ".sh", ".bash", ".svg", ".conf"
, ".json", ".ini", ".tex"
, ".json", ".ini", ".tex", ".bib"
]
isKnownRootFile :: Text -> Bool
@ -142,14 +161,19 @@ isKnownRootFile t = t `elem`
-- File existence cache
-- ---------------------------------------------------------------------------
-- | Process-wide memo of @doesFileExist@ results, keyed by the same
-- path the popup will fetch. Hakyll runs this filter once per
-- compiled page and the same source-file references recur across
-- | Process-wide memo of /positive/ @doesFileExist@ results, keyed by
-- the same path the popup will fetch. Hakyll runs this filter once
-- per compiled page and the same source-file references recur across
-- many pages (e.g. @build\/Filters\/Links.hs@ in the Links page,
-- the Colophon, several essays); the cache turns N stats into one
-- per distinct path. The build process's working directory is the
-- project root, so the path can be passed straight to
-- 'doesFileExist' without prefixing.
-- per distinct path. Only existence is memoized: a missing file is
-- re-stat'ed on every miss, so a source file created during a
-- long-lived @make watch@ session is picked up on the next rebuild
-- instead of staying "absent" for the process lifetime. (A file
-- /deleted/ mid-watch stays cached as present until restart — the
-- benign direction: the popup fetch 404s and simply never appears.)
-- The build process's working directory is the project root, so the
-- path can be passed straight to 'doesFileExist' without prefixing.
{-# NOINLINE existsCacheRef #-}
existsCacheRef :: IORef (Map.Map Text Bool)
existsCacheRef = unsafePerformIO (newIORef Map.empty)
@ -161,7 +185,8 @@ existsCached path = do
Just b -> pure b
Nothing -> do
b <- doesFileExist (T.unpack path)
atomicModifyIORef' existsCacheRef (\m -> (Map.insert path b m, ()))
when b $
atomicModifyIORef' existsCacheRef (\m -> (Map.insert path b m, ()))
pure b
-- ---------------------------------------------------------------------------

View File

@ -5,7 +5,13 @@
-- HTML placeholders that transclude.js resolves at runtime.
--
-- A directive must be the sole content of a line (after trimming) to be
-- replaced — this prevents accidental substitution inside prose or code.
-- replaced — this prevents accidental substitution inside prose.
--
-- Code protection (honest scope): lines inside /fenced/ code blocks
-- are passed through untouched ('Filters.Wikilinks.mapOutsideFences'),
-- so fenced examples can show @{{slug}}@ literally. Indented code
-- blocks and inline code spans are NOT recognised — a full-line
-- directive inside either is still rewritten.
--
-- Examples:
-- {{my-essay}} → full-page transclusion of /my-essay.html
@ -14,11 +20,13 @@
module Filters.Transclusion (preprocess) where
import Data.List (isSuffixOf, isPrefixOf, stripPrefix)
import Filters.Wikilinks (mapOutsideFences)
import qualified Utils as U
-- | Apply transclusion substitution to the raw Markdown source string.
-- | Apply transclusion substitution to the raw Markdown source string,
-- skipping lines inside fenced code blocks.
preprocess :: String -> String
preprocess = unlines . map processLine . lines
preprocess = mapOutsideFences processLine
processLine :: String -> String
processLine line =

View File

@ -37,6 +37,7 @@
module Filters.Viz (inlineViz) where
import Control.Exception (IOException, catch)
import Data.Char (isHexDigit)
import Data.Maybe (fromMaybe)
import qualified Data.Text as T
import System.Directory (doesFileExist)
@ -117,20 +118,47 @@ runScript baseDir attrs =
-- | Replace hardcoded black fill/stroke values with @currentColor@ so the
-- embedded SVG inherits the CSS text colour in both light and dark modes.
--
-- Quoted attribute forms (@fill="#000"@) are self-delimiting — the
-- closing quote bounds the match — so plain 'T.replace' is safe for
-- them. Unquoted style-property forms (@fill:#000@) are not: naive
-- replacement would also fire on the prefix of a longer hex colour
-- (@fill:#000080@ → @fill:currentColor80@, invalid CSS). Those go
-- through 'replaceHexColor', which rewrites a match only when it is
-- not followed by another hex digit.
processColors :: T.Text -> T.Text
processColors
= T.replace "fill=\"#000\"" "fill=\"currentColor\""
. T.replace "fill=\"black\"" "fill=\"currentColor\""
. T.replace "stroke=\"#000\"" "stroke=\"currentColor\""
. T.replace "stroke=\"black\"" "stroke=\"currentColor\""
. T.replace "fill:#000" "fill:currentColor"
. replaceHexColor "fill:#000" "fill:currentColor"
. T.replace "fill:black" "fill:currentColor"
. T.replace "stroke:#000" "stroke:currentColor"
. replaceHexColor "stroke:#000" "stroke:currentColor"
. T.replace "stroke:black" "stroke:currentColor"
. T.replace "fill=\"#000000\"" "fill=\"currentColor\""
. T.replace "stroke=\"#000000\"" "stroke=\"currentColor\""
. T.replace "fill:#000000" "fill:currentColor"
. T.replace "stroke:#000000" "stroke:currentColor"
. replaceHexColor "fill:#000000" "fill:currentColor"
. replaceHexColor "stroke:#000000" "stroke:currentColor"
-- | 'T.replace' restricted to hex-boundary-terminated matches: an
-- occurrence of @needle@ is rewritten only when the character after
-- it is not another hex digit, so @fill:#000@ never fires inside the
-- longer colours @fill:#0008@, @fill:#000080@, or @fill:#00000080@.
-- (Mirrors 'Filters.Score.replaceHexColor'.)
replaceHexColor :: T.Text -> T.Text -> T.Text -> T.Text
replaceHexColor needle replacement = go
where
go t =
let (pre, rest) = T.breakOn needle t
in if T.null rest
then pre
else
let after = T.drop (T.length needle) rest
in case T.uncons after of
Just (c, _) | isHexDigit c ->
pre <> needle <> go after
_ -> pre <> replacement <> go after
-- ---------------------------------------------------------------------------
-- JSON safety for <script> embedding

View File

@ -12,23 +12,129 @@
-- replaced with hyphens, non-alphanumeric characters stripped, and
-- a @.html@ suffix appended so the link resolves identically under
-- the dev server, file:// previews, and nginx in production.
module Filters.Wikilinks (preprocess) where
--
-- Code protection (honest scope): lines inside /fenced/ code blocks
-- are passed through untouched (see 'mapOutsideFences'), and within a
-- line, inline code spans (backtick runs, CommonMark equal-length
-- matching) are skipped — so both fenced and @`inline`@ examples can
-- show @[[…]]@ literally. Indented code blocks and code spans that
-- cross a line break are NOT recognised; a wikilink inside those is
-- still rewritten.
module Filters.Wikilinks (preprocess, mapOutsideFences) where
import Data.Char (isAlphaNum, toLower, isSpace)
import Data.List (intercalate)
import qualified Utils as U
-- | Scan the raw Markdown source for @[[…]]@ wikilinks and replace them
-- with standard Markdown link syntax.
-- with standard Markdown link syntax. Processing is line-by-line and
-- skips fenced code blocks; a wikilink therefore cannot span a line
-- break (which was never a sensible authoring form).
preprocess :: String -> String
preprocess [] = []
preprocess ('[':'[':rest) =
case break (== ']') rest of
(inner, ']':']':after)
| not (null inner) ->
toMarkdownLink inner ++ preprocess after
_ -> '[' : '[' : preprocess rest
preprocess (c:rest) = c : preprocess rest
preprocess = mapOutsideFences replaceWikilinks
replaceWikilinks :: String -> String
replaceWikilinks = go
where
go [] = []
-- Inline code span: a backtick run opens a span closed by a run of
-- exactly the same length (CommonMark). Its body passes through
-- verbatim so documentation can quote @`[[…]]`@ literally. An
-- unclosed run is literal text — and then a following @[[…]]@ is
-- genuinely a wikilink, matching how Pandoc will read the line.
go s@('`':_) =
let (run, afterRun) = span (== '`') s
in case codeSpan (length run) afterRun of
Just (body, after) -> run ++ body ++ run ++ go after
Nothing -> run ++ go afterRun
go ('[':'[':rest) =
case break (== ']') rest of
(inner, ']':']':after)
| not (null inner) ->
toMarkdownLink inner ++ go after
_ -> '[' : '[' : go rest
go (c:rest) = c : go rest
-- @codeSpan n s@: the span body and the remainder after a closing
-- run of exactly @n@ backticks; 'Nothing' when no closer exists on
-- this line.
codeSpan :: Int -> String -> Maybe (String, String)
codeSpan n = loop
where
loop [] = Nothing
loop s@('`':_) =
let (run, rest) = span (== '`') s
in if length run == n
then Just ("", rest)
else prepend run <$> loop rest
loop (c:cs) = prepend [c] <$> loop cs
prepend pre (body, after) = (pre ++ body, after)
-- ---------------------------------------------------------------------------
-- Fence-aware line mapping (shared by all source-level preprocessors)
-- ---------------------------------------------------------------------------
-- | Apply a line transformation to every line that is not part of a
-- fenced code block. Shared by the three source-level preprocessors
-- (wikilinks here, 'Filters.Transclusion', 'Filters.EmbedPdf') so
-- their directive syntax can be quoted literally inside fenced code.
--
-- Fence tracking follows CommonMark: an opener is at most three
-- spaces of indentation followed by a run of at least three backticks
-- or tildes (longer runs allowed); for backtick fences the info
-- string may not contain a backtick. The closer uses the same fence
-- character, a run at least as long as the opener, and nothing but
-- whitespace after it. An unclosed fence extends to the end of the
-- document. Fence delimiter lines themselves pass through untouched.
--
-- Honest scope: only /fenced/ code blocks are protected. Indented
-- code blocks and inline code spans are not recognised here — a
-- directive inside either is still rewritten.
mapOutsideFences :: (String -> String) -> String -> String
mapOutsideFences f = unlines . go Nothing . lines
where
go _ [] = []
go Nothing (l:ls) =
case openingFence l of
Just fence -> l : go (Just fence) ls
Nothing -> f l : go Nothing ls
go st@(Just fence) (l:ls)
| closesFence fence l = l : go Nothing ls
| otherwise = l : go st ls
-- | The fence character and run length of a CommonMark fence opener,
-- or 'Nothing' when the line does not open a fence.
openingFence :: String -> Maybe (Char, Int)
openingFence l = do
rest <- stripFenceIndent l
case rest of
(c:_) | c == '`' || c == '~' ->
let run = takeWhile (== c) rest
n = length run
info = drop n rest
in if n >= 3 && (c == '~' || '`' `notElem` info)
then Just (c, n)
else Nothing
_ -> Nothing
-- | True when the line closes the fence opened by @(c, n)@: the same
-- fence character, a run at least as long as the opener, and only
-- whitespace after it.
closesFence :: (Char, Int) -> String -> Bool
closesFence (c, n) l =
case stripFenceIndent l of
Nothing -> False
Just rest ->
let run = takeWhile (== c) rest
in length run >= n && all isSpace (drop (length run) rest)
-- | Strip up to three leading spaces (the indentation CommonMark allows
-- on a fence line); 'Nothing' for four or more, which would be an
-- indented code block rather than a fence.
stripFenceIndent :: String -> Maybe String
stripFenceIndent l =
let (indent, rest) = span (== ' ') l
in if length indent <= 3 then Just rest else Nothing
-- | Convert the inner content of @[[…]]@ to a Markdown link.
--