{-# LANGUAGE GHC2021 #-}
{-# LANGUAGE OverloadedStrings #-}
-- | Build telemetry page (/build/): corpus statistics, word-length
-- distribution, tag frequencies, link analysis, epistemic coverage,
-- output metrics, repository overview, and build timing.
-- Rendered as a full essay (3-column layout, TOC, metadata block).
module Stats (statsRules) where
import Control.Exception (IOException, catch)
import Control.Monad (forM)
import Data.Char (isSpace, toLower)
import Data.List (find, isPrefixOf, isSuffixOf, sort, sortBy)
import qualified Data.Map.Strict as Map
import Data.Maybe (catMaybes, fromMaybe, isJust, listToMaybe)
import Data.Ord (comparing, Down (..))
import qualified Data.Set as Set
import Data.String (fromString)
import Data.Time (getCurrentTime, formatTime, defaultTimeLocale,
Day, parseTimeM, utctDay, addDays, diffDays)
import Data.Time.Calendar (toGregorian, dayOfWeek)
import System.Directory (doesDirectoryExist, getFileSize, listDirectory,
pathIsSymbolicLink)
import System.Exit (ExitCode (..))
import System.FilePath (takeExtension, (>))
import System.Process (readProcessWithExitCode)
import Text.Read (readMaybe)
import qualified Data.Aeson as Aeson
import qualified Data.Aeson.Key as AK
import qualified Data.Aeson.KeyMap as KM
import qualified Data.Vector as V
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import qualified Data.Text.Encoding as TE
import qualified Text.Blaze.Html5 as H
import qualified Text.Blaze.Html5.Attributes as A
import Text.Blaze.Html.Renderer.String (renderHtml)
import qualified Text.Blaze.Internal as BI
import Hakyll
import Contexts (siteCtx, authorLinksField)
import qualified Patterns as P
import Utils (readingTime)
-- ---------------------------------------------------------------------------
-- Types
-- ---------------------------------------------------------------------------
data TypeRow = TypeRow
{ trLabel :: String
, trCount :: Int
, trWords :: Int
}
data PageInfo = PageInfo
{ piTitle :: String
, piUrl :: String
, piWC :: Int
}
-- ---------------------------------------------------------------------------
-- Hakyll helpers
-- ---------------------------------------------------------------------------
loadWC :: Item String -> Compiler Int
loadWC item = do
snap <- loadSnapshot (itemIdentifier item) "word-count"
return $ fromMaybe 0 (readMaybe (itemBody snap))
loadPI :: Item String -> Compiler (Maybe PageInfo)
loadPI item = do
meta <- getMetadata (itemIdentifier item)
mRoute <- getRoute (itemIdentifier item)
wc <- loadWC item
return $ fmap (\r -> PageInfo
{ piTitle = fromMaybe "(untitled)" (lookupString "title" meta)
, piUrl = "/" ++ r
, piWC = wc
}) mRoute
-- ---------------------------------------------------------------------------
-- Formatting helpers
-- ---------------------------------------------------------------------------
commaInt :: Int -> String
commaInt n
| n < 1000 = show n
| otherwise = commaInt (n `div` 1000) ++ "," ++ pad3 (n `mod` 1000)
where
pad3 x
| x < 10 = "00" ++ show x
| x < 100 = "0" ++ show x
| otherwise = show x
formatBytes :: Integer -> String
formatBytes b
| b < 1024 = show b ++ " B"
| b < 1024*1024 = showD (b * 10 `div` 1024) ++ " KB"
| otherwise = showD (b * 10 `div` (1024*1024)) ++ " MB"
where showD n = show (n `div` 10) ++ "." ++ show (n `mod` 10)
rtStr :: Int -> String
rtStr totalWords
| mins < 60 = show mins ++ " min"
| otherwise = show (mins `div` 60) ++ "h " ++ show (mins `mod` 60) ++ "m"
where mins = totalWords `div` 200
pctStr :: Int -> Int -> String
pctStr _ 0 = "—"
pctStr n total = show (n * 100 `div` total) ++ "%"
-- | Strip HTML tags for plain-text word counting.
--
-- Handles:
-- * Tag bodies, including @>@ inside double-quoted attribute values
-- (so @\ b\"\>@ doesn't slice the surrounding text).
-- * HTML comments @\".
dropComment ('-':'-':'>':rs) = rs
dropComment (_:rs) = dropComment rs
dropComment [] = []
-- Drop everything up to and including "]]>".
dropCdata (']':']':'>':rs) = rs
dropCdata (_:rs) = dropCdata rs
dropCdata [] = []
-- Drop a tag body, respecting double-quoted attribute values.
dropTag ('"':rs) = dropTag (skipQuoted rs)
dropTag ('\'':rs) = dropTag (skipApos rs)
dropTag ('>':rs) = rs
dropTag (_:rs) = dropTag rs
dropTag [] = []
skipQuoted ('"':rs) = rs
skipQuoted (_:rs) = skipQuoted rs
skipQuoted [] = []
skipApos ('\'':rs) = rs
skipApos (_:rs) = skipApos rs
skipApos [] = []
-- | Normalise a page URL for backlink map lookup (strip trailing .html).
normUrl :: String -> String
normUrl u
| ".html" `isSuffixOf` u = take (length u - 5) u
| otherwise = u
pad2 :: (Show a, Integral a) => a -> String
pad2 n = if n < 10 then "0" ++ show n else show n
-- | Median of a non-empty list; returns 0 for empty. Uses 'drop' +
-- pattern match instead of @(!!)@ so the function is total in its
-- own implementation, not just by external invariant.
median :: [Int] -> Int
median [] = 0
median xs =
case drop (length xs `div` 2) (sort xs) of
(m : _) -> m
[] -> 0 -- unreachable: length xs >= 1 above
-- ---------------------------------------------------------------------------
-- Date helpers (for /stats/ page)
-- ---------------------------------------------------------------------------
parseDay :: String -> Maybe Day
parseDay = parseTimeM True defaultTimeLocale "%Y-%m-%d"
-- | First Monday on or before 'day' (start of its ISO week).
weekStart :: Day -> Day
weekStart day = addDays (fromIntegral (negate (fromEnum (dayOfWeek day)))) day
-- | Intensity class for the heatmap (hm0 … hm4).
heatClass :: Int -> String
heatClass 0 = "hm0"
heatClass n | n < 500 = "hm1"
heatClass n | n < 2000 = "hm2"
heatClass n | n < 5000 = "hm3"
heatClass _ = "hm4"
shortMonth :: Int -> String
shortMonth m = case m of
1 -> "Jan"; 2 -> "Feb"; 3 -> "Mar"; 4 -> "Apr"
5 -> "May"; 6 -> "Jun"; 7 -> "Jul"; 8 -> "Aug"
9 -> "Sep"; 10 -> "Oct"; 11 -> "Nov"; 12 -> "Dec"
_ -> ""
-- ---------------------------------------------------------------------------
-- URL sanitization and core HTML combinators
-- ---------------------------------------------------------------------------
-- | Defense-in-depth URL allowlist: reject anything that isn't an internal
-- path, a fragment, or an explicit safe scheme. Case-insensitive and
-- whitespace-tolerant to block @JavaScript:@, @\tjavascript:@, @data:@, etc.
-- @http://@ is intentionally excluded to avoid mixed-content warnings.
--
-- Protocol-relative URLs (@//evil.com@) are rejected because the leading
-- slash would otherwise admit them through the @\"\/\"@ prefix check.
isSafeUrl :: String -> Bool
isSafeUrl u =
let norm = map toLower (dropWhile isSpace u)
in not ("//" `isPrefixOf` norm)
&& any (`isPrefixOf` norm) ["/", "https://", "mailto:", "#"]
safeHref :: String -> H.AttributeValue
safeHref u
| isSafeUrl u = H.stringValue u
| otherwise = H.stringValue "#"
-- | Shorthand for 'H.toHtml' over a 'String'.
txt :: String -> H.Html
txt = H.toHtml
-- | Anchor element with escaped title text and URL sanitized via 'safeHref'.
-- Use for trusted plain-text labels such as tag slugs.
link :: String -> String -> H.Html
link url title = H.a H.! A.href (safeHref url) $ H.toHtml title
-- | Anchor for a content page, where the title comes from frontmatter and
-- may contain author-authored inline HTML (e.g. @Book Title@).
-- The URL is still sanitized via 'safeHref'; the title is emitted
-- pre-escaped, matching site convention that metadata titles are
-- author-controlled trusted HTML.
pageLink :: String -> String -> H.Html
pageLink url title = H.a H.! A.href (safeHref url) $ H.preEscapedToHtml title
-- | Typed section header followed by its body content.
section :: String -> String -> H.Html -> H.Html
section id_ title body = do
H.h2 H.! A.id (H.stringValue id_) $ H.toHtml title
body
-- | Build-telemetry table with header row, body rows, and an optional total
-- row. Cell contents are pre-rendered 'H.Html' so callers may embed links or
-- emphasis inside cells without risking double-escaping.
table :: [String] -> [[H.Html]] -> Maybe [H.Html] -> H.Html
table headers rows mFoot =
H.table H.! A.class_ "build-table" $ do
H.thead $ H.tr $ mapM_ (H.th . H.toHtml) headers
H.tbody $ mapM_ renderRow rows
maybe (return ()) renderFoot mFoot
where
renderRow cells = H.tr $ mapM_ H.td cells
renderFoot cells = H.tfoot $
H.tr H.! A.class_ "build-total" $ mapM_ H.td cells
-- | Two-column metadata block: each pair becomes a @