-- Source: levineuwirth.org/build/Stats.hs (533 lines, 23 KiB, Haskell)

{-# LANGUAGE GHC2021 #-}
{-# LANGUAGE OverloadedStrings #-}
-- | Build telemetry page (/build/): corpus statistics, word-length
-- distribution, tag frequencies, link analysis, epistemic coverage,
-- output metrics, repository overview, and build timing.
-- Rendered as a full essay (3-column layout, TOC, metadata block).
module Stats (statsRules) where
import Control.Exception (IOException, catch)
import Control.Monad (forM)
import Data.List (find, isSuffixOf, sortBy)
import qualified Data.Map.Strict as Map
import Data.Maybe (catMaybes, fromMaybe, isJust, listToMaybe)
import Data.Ord (comparing, Down (..))
import qualified Data.Set as Set
import Data.Time (getCurrentTime, formatTime, defaultTimeLocale)
import System.Directory (doesDirectoryExist, getFileSize, listDirectory)
import System.Exit (ExitCode (..))
import System.FilePath (takeExtension, (</>))
import System.Process (readProcessWithExitCode)
import Text.Read (readMaybe)
import qualified Data.Aeson as Aeson
import qualified Data.Aeson.Key as AK
import qualified Data.Aeson.KeyMap as KM
import qualified Data.Vector as V
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import Hakyll
import Authors (authorLinksField)
import Contexts (siteCtx)
import Utils (readingTime)
-- ---------------------------------------------------------------------------
-- Types
-- ---------------------------------------------------------------------------
-- | One row of the "Content" table: a content type's label, item count,
-- and aggregate word count.
data TypeRow = TypeRow
  { trLabel :: String -- ^ display name of the content type (e.g. "Essays")
  , trCount :: Int    -- ^ number of items of this type
  , trWords :: Int    -- ^ total words across all items of this type
  }
-- | Per-page summary used by the "Pages" and "Links" sections.
data PageInfo = PageInfo
  { piTitle :: String -- ^ page title from metadata ("(untitled)" when absent)
  , piUrl   :: String -- ^ site-absolute URL built from the Hakyll route
  , piWC    :: Int    -- ^ word count read from the "word-count" snapshot
  }
-- ---------------------------------------------------------------------------
-- Hakyll helpers
-- ---------------------------------------------------------------------------
-- | Read an item's word count from its "word-count" snapshot.
-- Falls back to 0 when the snapshot body does not parse as an Int.
loadWC :: Item String -> Compiler Int
loadWC item =
  fromMaybe 0 . readMaybe . itemBody
    <$> loadSnapshot (itemIdentifier item) "word-count"
-- | Build a 'PageInfo' for an item: title from metadata, URL from the
-- route, word count from the snapshot. Yields Nothing for unrouted items.
loadPI :: Item String -> Compiler (Maybe PageInfo)
loadPI item = do
  let ident = itemIdentifier item
  meta   <- getMetadata ident
  mRoute <- getRoute ident
  wc     <- loadWC item
  let info r = PageInfo
        { piTitle = fromMaybe "(untitled)" (lookupString "title" meta)
        , piUrl   = '/' : r
        , piWC    = wc
        }
  return (info <$> mRoute)
-- ---------------------------------------------------------------------------
-- Formatting helpers
-- ---------------------------------------------------------------------------
-- | Render a non-negative Int with comma thousands separators
-- (e.g. 1234567 -> "1,234,567"). Values below 1000 (including any
-- negatives) are shown verbatim via 'show'.
commaInt :: Int -> String
commaInt n
  | n < 1000  = show n
  | otherwise =
      let (thousands, remainder) = n `divMod` 1000
      in commaInt thousands ++ "," ++ pad remainder
  where
    -- Zero-pad the low three digits of each group.
    pad x =
      let s = show x
      in replicate (3 - length s) '0' ++ s
-- | Human-readable byte size with one decimal place: "512 B",
-- "1.5 KB", "2.0 MB". Truncates (integer division), never rounds up.
formatBytes :: Integer -> String
formatBytes b
  | b < kb    = show b ++ " B"
  | b < mb    = scaled kb ++ " KB"
  | otherwise = scaled mb ++ " MB"
  where
    kb = 1024
    mb = kb * kb
    scaled unit =
      let tenths = (b * 10) `div` unit
          (whole, frac) = tenths `divMod` 10
      in show whole ++ "." ++ show frac
-- | Reading time at 200 words/minute, as "N min" or "Hh Mm".
rtStr :: Int -> String
rtStr totalWords
  | minutes < 60 = show minutes ++ " min"
  | otherwise =
      let (h, m) = minutes `divMod` 60
      in show h ++ "h " ++ show m ++ "m"
  where
    minutes = totalWords `div` 200
-- | Integer percentage "N%"; empty string when the denominator is 0.
pctStr :: Int -> Int -> String
pctStr n total
  | total == 0 = ""
  | otherwise  = show ((n * 100) `div` total) ++ "%"
-- | Strip HTML tags for plain-text word counting.
--
-- Each tag is replaced by a single space rather than deleted outright:
-- dropping tags entirely would fuse words that are adjacent across a tag
-- boundary (e.g. "foo<br>bar" -> "foobar"), undercounting words.
-- An unterminated '<' swallows the rest of the input, same as before.
stripHtmlTags :: String -> String
stripHtmlTags [] = []
stripHtmlTags ('<':rest) = ' ' : stripHtmlTags (drop 1 (dropWhile (/= '>') rest))
stripHtmlTags (c:rest) = c : stripHtmlTags rest
-- | Normalise a page URL for backlink map lookup (strip trailing .html).
normUrl :: String -> String
normUrl u =
  if suffix `isSuffixOf` u
    then take (length u - length suffix) u
    else u
  where
    suffix = ".html"
-- ---------------------------------------------------------------------------
-- IO: output directory walk
-- ---------------------------------------------------------------------------
-- | Recursively list every file under @dir@ with its size in bytes.
-- Unreadable directories yield no entries; unreadable files report size 0.
walkDir :: FilePath -> IO [(FilePath, Integer)]
walkDir dir = do
  names <- listDirectory dir `catch` (\(_ :: IOException) -> return [])
  concat <$> mapM visit names
  where
    visit name = do
      let full = dir </> name
      isSubdir <- doesDirectoryExist full
      if isSubdir
        then walkDir full
        else do
          bytes <- getFileSize full `catch` (\(_ :: IOException) -> return 0)
          return [(full, bytes)]
-- | Bucket a file path by extension for the output-size table:
-- known extensions pass through, image formats collapse to "image",
-- everything else becomes "other".
displayExt :: FilePath -> String
displayExt path
  | ext `elem` passthrough = ext
  | ext `elem` imageExts   = "image"
  | otherwise              = "other"
  where
    ext = takeExtension path
    passthrough =
      [".html", ".css", ".js", ".woff2", ".svg", ".mp3", ".pdf", ".json", ".xml", ".ico"]
    imageExts = [".png", ".jpg", ".jpeg", ".webp"]
-- | Walk the generated @_site@ tree and group files by 'displayExt',
-- returning (per-type (count, bytes), total file count, total bytes).
getOutputStats :: IO (Map.Map String (Int, Integer), Int, Integer)
getOutputStats = do
  files <- walkDir "_site"
  let combine (ca, sa) (cb, sb) = (ca + cb, sa + sb)
      tally acc (path, sz) =
        Map.insertWith combine (displayExt path) (1, sz) acc
      grouped = foldr (flip tally) Map.empty files
  return (grouped, length files, sum (map snd files))
-- ---------------------------------------------------------------------------
-- IO: lines of code
-- ---------------------------------------------------------------------------
-- | Count files and total source lines with extension @ext@ directly in
-- @dir@ (non-recursive), skipping entries matching @skipPred@.
-- Returns (file count, line count). A missing directory yields (0, 0).
countLinesDir :: FilePath -> String -> (FilePath -> Bool) -> IO (Int, Int)
countLinesDir dir ext skipPred = do
  entries <- listDirectory dir `catch` (\(_ :: IOException) -> return [])
  let files = filter (\e -> takeExtension e == ext && not (skipPred e)) entries
  ls <- fmap sum $ forM files $ \e ->
    -- Force the line count inside the catch: with lazy readFile the old
    -- code's handler only guarded the open, so an IOException raised while
    -- the contents were consumed later could escape uncaught.
    (do content <- readFile (dir </> e)
        let n = length (lines content)
        n `seq` return n)
      `catch` (\(_ :: IOException) -> return 0)
  return (length files, ls)
-- | Count source files and lines for each language in the repo:
-- (hsFiles, hsLines, cssFiles, cssLines, jsFiles, jsLines).
-- Minified JavaScript (*.min.js) is excluded from the JS totals.
getLocStats :: IO (Int, Int, Int, Int, Int, Int)
getLocStats = do
  haskell <- countLinesDir "build" ".hs" keepAll
  css     <- countLinesDir "static/css" ".css" keepAll
  js      <- countLinesDir "static/js" ".js" (".min.js" `isSuffixOf`)
  let (hf, hl) = haskell
      (cf, cl) = css
      (jf, jl) = js
  return (hf, hl, cf, cl, jf, jl)
  where
    keepAll = const False
-- ---------------------------------------------------------------------------
-- IO: git stats
-- ---------------------------------------------------------------------------
-- | Run @git@ with the given arguments and return its stdout,
-- or the empty string when the command fails.
gitRun :: [String] -> IO String
gitRun args = do
  (code, out, _err) <- readProcessWithExitCode "git" args ""
  case code of
    ExitSuccess -> return out
    _           -> return ""
-- | Total commit count and first-commit date (YYYY-MM-DD) of the repo.
-- Falls back to 0 commits and an em-dash when git output is unusable.
getGitStats :: IO (Int, String)
getGitStats = do
  countOut <- gitRun ["rev-list", "--count", "HEAD"]
  firstOut <- gitRun ["log", "--format=%ad", "--date=short", "--reverse"]
  let trimmed = filter (/= '\n') countOut
      commits = fromMaybe 0 (readMaybe trimmed :: Maybe Int)
      firstDate = case lines firstOut of
        (d : _) -> d
        []      -> "\x2014"
  return (commits, firstDate)
-- ---------------------------------------------------------------------------
-- HTML rendering: section helpers
-- ---------------------------------------------------------------------------
-- | Wrap a body in an anchored @\<h2\>@ heading.
section :: String -> String -> String -> String
section id_ title body =
  "<h2 id=\"" ++ id_ ++ "\">" ++ title ++ "</h2>\n" ++ body
-- | Render an HTML table with a header row, body rows, and an optional
-- totals footer row (styled via the "build-total" class).
table :: [String] -> [[String]] -> Maybe [String] -> String
table headers rows mFoot =
  "<table class=\"build-table\">"
    ++ "<thead><tr>" ++ foldMap th headers ++ "</tr></thead>"
    ++ "<tbody>" ++ foldMap bodyRow rows ++ "</tbody>"
    ++ maybe "" footRow mFoot
    ++ "</table>"
  where
    th h = "<th>" ++ h ++ "</th>"
    td c = "<td>" ++ c ++ "</td>"
    bodyRow cells = "<tr>" ++ foldMap td cells ++ "</tr>"
    footRow cells =
      "<tfoot><tr class=\"build-total\">" ++ foldMap td cells ++ "</tr></tfoot>"
-- | Render (term, definition) pairs as an HTML definition list.
dl :: [(String, String)] -> String
dl pairs =
  "<dl class=\"build-meta\">" ++ foldMap entry pairs ++ "</dl>"
  where
    entry (term, def) = "<dt>" ++ term ++ "</dt><dd>" ++ def ++ "</dd>"
-- | Render a plain anchor tag. NOTE(review): neither argument is
-- HTML-escaped — callers appear to pass site-controlled titles/URLs,
-- but confirm none can contain '"' or '<'.
link :: String -> String -> String
link url title = concat ["<a href=\"", url, "\">", title, "</a>"]
-- ---------------------------------------------------------------------------
-- HTML rendering: sections
-- ---------------------------------------------------------------------------
-- | "Content" section: one table row per content type, with a totals
-- footer summing counts and words plus the overall reading time.
renderContent :: [TypeRow] -> String
renderContent rows = section "content" "Content" body
  where
    body = table
      ["Type", "Count", "Words", "Reading time"]
      (map toCells rows)
      (Just ["Total", commaInt totalCount, commaInt totalWords, rtStr totalWords])
    totalCount = sum (map trCount rows)
    totalWords = sum (map trWords rows)
    toCells r =
      [ trLabel r
      , commaInt (trCount r)
      , commaInt (trWords r)
      , rtStr (trWords r)
      ]
-- | "Pages" section: page totals, average length, oldest/newest content
-- (when known), and the three longest/shortest non-empty pages.
renderPages :: [PageInfo] -> Maybe (String,String,String) -> Maybe (String,String,String) -> String
renderPages allPIs mOldest mNewest = section "pages" "Pages" body
  where
    body = concat
      [ dl (summary ++ dateRow "Oldest content" mOldest
                    ++ dateRow "Newest content" mNewest)
      , "<p><strong>Longest</strong></p>"
      , pageList (take 3 (sortBy (comparing (Down . piWC)) nonEmptyPages))
      , "<p><strong>Shortest</strong></p>"
      , pageList (take 3 (sortBy (comparing piWC) nonEmptyPages))
      ]
    summary =
      [ ("Total pages", commaInt (length allPIs))
      , ("Average length", commaInt avgWC ++ " words")
      ]
    dateRow label = maybe [] (\(d, t, u) -> [(label, d ++ " \x2014 " ++ link u t)])
    -- Pages with a zero word count are excluded from average and rankings.
    nonEmptyPages = filter ((> 0) . piWC) allPIs
    avgWC
      | null nonEmptyPages = 0
      | otherwise = sum (map piWC nonEmptyPages) `div` length nonEmptyPages
    pageList ps =
      "<ol class=\"build-page-list\">" ++ concatMap entry ps ++ "</ol>"
    entry p =
      "<li>" ++ link (piUrl p) (piTitle p)
        ++ " \x2014 " ++ commaInt (piWC p) ++ " words</li>"
-- | "Word-length distribution" section: a five-bucket horizontal bar
-- chart of page word counts, bars scaled relative to the fullest bucket.
renderDistribution :: [Int] -> String
renderDistribution wcs =
  section "distribution" "Word-length distribution" $
    "<div class=\"build-bars\">" ++ concatMap bar buckets ++ "</div>"
  where
    bucketOf w
      | w < 500   = 0
      | w < 1000  = 1
      | w < 2000  = 2
      | w < 5000  = 3
      | otherwise = 4
    labels = ["&lt; 500", "500 \x2013 1k", "1k \x2013 2k", "2k \x2013 5k", "\x2265 5k"]
    -- Seed every bucket with 0 so all five rows render even when empty.
    counts = foldr (\w acc -> Map.insertWith (+) (bucketOf w) 1 acc)
               (Map.fromList [(i, 0) | i <- [0 .. 4]]) wcs
    -- Map.elems is ascending by key, so it lines up with labels 0..4.
    buckets = zip labels (Map.elems counts)
    -- max 1 avoids division by zero when there are no pages at all.
    maxCount = max 1 (maximum (map snd buckets))
    bar (lbl, n) = concat
      [ "<div class=\"build-bar-row\">"
      , "<span class=\"build-bar-label\">", lbl, "</span>"
      , "<span class=\"build-bar-wrap\"><span class=\"build-bar\" style=\"width:"
      , show ((n * 100) `div` maxCount), "%\"></span></span>"
      , "<span class=\"build-bar-count\">", show n, "</span>"
      , "</div>"
      ]
-- | "Tags" section: unique-tag count plus a frequency table where each
-- tag links to its listing page at /<tag>/.
renderTagsSection :: [(String, Int)] -> Int -> String
renderTagsSection topTags uniqueCount =
  section "tags" "Tags" (summary ++ freqTable)
  where
    summary = dl [("Unique tags", commaInt uniqueCount)]
    freqTable = table ["Tag", "Items"] [tagCells t n | (t, n) <- topTags] Nothing
    tagCells t n = [link ("/" ++ t ++ "/") t, show n]
-- | "Links" section: most-linked page (url, inbound count, title) when
-- known, and the orphan-page count as a fraction and percentage of all
-- pages.
renderLinks :: Maybe (String, Int, String) -> Int -> Int -> String
renderLinks mMostLinked orphanCount total =
  section "links" "Links" (dl (mostLinkedRow ++ [orphanRow]))
  where
    mostLinkedRow = case mMostLinked of
      Nothing -> [("Most-linked page", "\x2014")]
      Just (u, n, t) ->
        [("Most-linked page", link u t ++ " (" ++ show n ++ " inbound links)")]
    orphanRow =
      ( "Orphan pages"
      , commaInt orphanCount ++ " of " ++ commaInt total
          ++ " (" ++ pctStr orphanCount total ++ ")"
      )
-- | "Epistemic coverage" section: how many of @total@ items set each of
-- the status/confidence/importance/evidence metadata fields.
renderEpistemic :: Int -> Int -> Int -> Int -> Int -> String
renderEpistemic total ws wc wi we =
  section "epistemic" "Epistemic coverage" $
    table ["Field", "Set", "Coverage"] (zipWith row fields counts) Nothing
  where
    fields = ["Status", "Confidence", "Importance", "Evidence"]
    counts = [ws, wc, wi, we]
    row label n = [label, show n ++ " / " ++ show total, pctStr n total]
-- | "Output" section: per-type file counts and sizes for the generated
-- site, largest type first, with grand totals in the footer.
renderOutput :: Map.Map String (Int, Integer) -> Int -> Integer -> String
renderOutput grouped totalFiles totalSize = section "output" "Output" tbl
  where
    tbl = table
      ["Type", "Files", "Size"]
      (map cells bySizeDesc)
      (Just ["Total", commaInt totalFiles, formatBytes totalSize])
    -- Sort by total byte size, descending.
    bySizeDesc = sortBy (comparing (Down . snd . snd)) (Map.toList grouped)
    cells (ext, (n, sz)) = [ext, commaInt n, formatBytes sz]
-- | "Repository" section: lines-of-code per language plus git history
-- summary (commit count and first-commit date).
renderRepository :: Int -> Int -> Int -> Int -> Int -> Int -> Int -> String -> String
renderRepository hf hl cf cl jf jl commits firstDate =
  section "repository" "Repository" $ dl
    [ ("Haskell", locLine hl hf "")
    , ("CSS", locLine cl cf "")
    , ("JavaScript", locLine jl jf " (excl. minified)")
    , ("Total git commits", commaInt commits)
    , ("Repository started", firstDate)
    ]
  where
    locLine nLines nFiles suffix =
      commaInt nLines ++ " lines across " ++ show nFiles ++ " files" ++ suffix
-- | "Build" section: generation timestamp and previous build duration.
renderBuild :: String -> String -> String
renderBuild ts dur = section "build" "Build" details
  where
    details = dl [("Generated", ts), ("Last build duration", dur)]
-- ---------------------------------------------------------------------------
-- Static TOC (matches the nine h2 sections above)
-- ---------------------------------------------------------------------------
-- | Static table of contents matching the nine h2 sections rendered
-- above; each entry carries a data-target attribute for scroll tracking.
pageTOC :: String
pageTOC = "<ol>\n" ++ foldMap entry sections ++ "</ol>\n"
  where
    entry (anchor, title) = concat
      [ "<li><a href=\"#", anchor, "\" data-target=\"", anchor, "\">"
      , title, "</a></li>\n"
      ]
    sections =
      [ ("content", "Content")
      , ("pages", "Pages")
      , ("distribution", "Word-length distribution")
      , ("tags", "Tags")
      , ("links", "Links")
      , ("epistemic", "Epistemic coverage")
      , ("output", "Output")
      , ("repository", "Repository")
      , ("build", "Build")
      ]
-- ---------------------------------------------------------------------------
-- Rules
-- ---------------------------------------------------------------------------
-- | Compile the /build/ telemetry page. Gathers corpus snapshots, tag
-- frequencies, backlink data, epistemic-metadata coverage, output-tree
-- sizes, lines of code, and git history, then renders them through the
-- standard essay templates.
statsRules :: Tags -> Rules ()
statsRules tags =
  create ["build/index.html"] $ do
    route idRoute
    compile $ do
      -- ----------------------------------------------------------------
      -- Load all content items
      -- ----------------------------------------------------------------
      essays <- loadAll ("content/essays/*.md" .&&. hasNoVersion)
      posts <- loadAll ("content/blog/*.md" .&&. hasNoVersion)
      poems <- loadAll ("content/poetry/*.md" .&&. hasNoVersion)
      fiction <- loadAll ("content/fiction/*.md" .&&. hasNoVersion)
      comps <- loadAll ("content/music/*/index.md" .&&. hasNoVersion)
      -- ----------------------------------------------------------------
      -- Word counts (read from each item's "word-count" snapshot;
      -- presumably produced by the content compilers — see 'loadWC')
      -- ----------------------------------------------------------------
      essayWCs <- mapM loadWC essays
      postWCs <- mapM loadWC posts
      poemWCs <- mapM loadWC poems
      fictionWCs <- mapM loadWC fiction
      compWCs <- mapM loadWC comps
      let allWCs = essayWCs ++ postWCs ++ poemWCs ++ fictionWCs ++ compWCs
          rows =
            [ TypeRow "Essays" (length essays) (sum essayWCs)
            , TypeRow "Blog posts" (length posts) (sum postWCs)
            , TypeRow "Poems" (length poems) (sum poemWCs)
            , TypeRow "Fiction" (length fiction) (sum fictionWCs)
            , TypeRow "Compositions" (length comps) (sum compWCs)
            ]
      -- ----------------------------------------------------------------
      -- Per-page info (title + URL + word count); unrouted items are
      -- dropped by catMaybes
      -- ----------------------------------------------------------------
      allItems <- return (essays ++ posts ++ poems ++ fiction ++ comps)
      allPIs <- catMaybes <$> mapM loadPI allItems
      -- ----------------------------------------------------------------
      -- Dates (essays + posts only); items without a "date" field are
      -- excluded, and ISO-style dates sort correctly as strings
      -- ----------------------------------------------------------------
      let getDateMeta item = do
            meta <- getMetadata (itemIdentifier item)
            mRoute <- getRoute (itemIdentifier item)
            let d = fromMaybe "" (lookupString "date" meta)
                t = fromMaybe "(untitled)" (lookupString "title" meta)
                u = maybe "#" (\r -> "/" ++ r) mRoute
            return (d, t, u)
      essayDates <- mapM getDateMeta essays
      postDates <- mapM getDateMeta posts
      let allDates = filter (\(d,_,_) -> not (null d)) (essayDates ++ postDates)
          sortedDates = sortBy (comparing (\(d,_,_) -> d)) allDates
          oldestDate = listToMaybe sortedDates
          newestDate = listToMaybe (reverse sortedDates)
      -- ----------------------------------------------------------------
      -- Tags: frequency per tag; top 15 shown in the table
      -- ----------------------------------------------------------------
      let tagFreqs = map (\(t, ids) -> (t, length ids)) (tagsMap tags)
          topTags = take 15 (sortBy (comparing (Down . snd)) tagFreqs)
          uniqueTags = length tagFreqs
      -- ----------------------------------------------------------------
      -- Backlinks: most-linked page + orphan count. The JSON is an
      -- object whose values are arrays of inbound links; keys appear to
      -- be extension-less page URLs (they are matched via normUrl) —
      -- NOTE(review): confirm against the backlinks.json producer.
      -- ----------------------------------------------------------------
      blItem <- load (fromFilePath "data/backlinks.json") :: Compiler (Item String)
      let rawBL = itemBody blItem
          mBLVal = Aeson.decodeStrict (TE.encodeUtf8 (T.pack rawBL)) :: Maybe Aeson.Value
          blPairs = case mBLVal of
            Just (Aeson.Object km) ->
              [ (T.unpack (AK.toText k),
                 case v of Aeson.Array arr -> V.length arr; _ -> 0)
              | (k, v) <- KM.toList km ]
            _ -> []
          blSet = Set.fromList (map fst blPairs)
          -- A page is an orphan when it never appears as a backlink key.
          orphanCount = length
            [ p | p <- allPIs
            , not (Set.member (normUrl (piUrl p)) blSet) ]
          mostLinked = listToMaybe (sortBy (comparing (Down . snd)) blPairs)
          -- Resolve the most-linked URL back to a page title when a
          -- matching PageInfo exists; otherwise show the raw URL.
          mostLinkedInfo = mostLinked >>= \(url, ct) ->
            let mTitle = piTitle <$> find (\p -> normUrl (piUrl p) == url) allPIs
            in Just (url, ct, fromMaybe url mTitle)
      -- ----------------------------------------------------------------
      -- Epistemic coverage (essays + posts): count items whose metadata
      -- sets each of the four fields
      -- ----------------------------------------------------------------
      essayMetas <- mapM (getMetadata . itemIdentifier) essays
      postMetas <- mapM (getMetadata . itemIdentifier) posts
      let epMetas = essayMetas ++ postMetas
          epTotal = length epMetas
          ep f = length (filter (isJust . f) epMetas)
          withStatus = ep (lookupString "status")
          withConf = ep (lookupString "confidence")
          withImp = ep (lookupString "importance")
          withEv = ep (lookupString "evidence")
      -- ----------------------------------------------------------------
      -- Output directory stats (unsafeCompiler: plain IO re-run on every
      -- rebuild of this page)
      -- ----------------------------------------------------------------
      (outputGrouped, totalFiles, totalSize) <-
        unsafeCompiler getOutputStats
      -- ----------------------------------------------------------------
      -- Lines of code + git stats
      -- ----------------------------------------------------------------
      (hf, hl, cf, cl, jf, jl) <- unsafeCompiler getLocStats
      (commits, firstDate) <- unsafeCompiler getGitStats
      -- ----------------------------------------------------------------
      -- Build timestamp + last build duration (em-dash when the file
      -- recording the previous build's duration is absent)
      -- ----------------------------------------------------------------
      buildTimestamp <- unsafeCompiler $
        formatTime defaultTimeLocale "%Y-%m-%d %H:%M UTC" <$> getCurrentTime
      lastBuildDur <- unsafeCompiler $
        (readFile "data/last-build-seconds.txt" >>= \s ->
           let secs = fromMaybe 0 (readMaybe (filter (/= '\n') s) :: Maybe Int)
           in return (show secs ++ "s"))
        `catch` (\(_ :: IOException) -> return "\x2014")
      -- ----------------------------------------------------------------
      -- Assemble page: concatenate the nine sections, derive this page's
      -- own word count / reading time from its stripped text, then run
      -- it through the essay + default templates
      -- ----------------------------------------------------------------
      let content = concat
            [ renderContent rows
            , renderPages allPIs oldestDate newestDate
            , renderDistribution allWCs
            , renderTagsSection topTags uniqueTags
            , renderLinks mostLinkedInfo orphanCount (length allPIs)
            , renderEpistemic epTotal withStatus withConf withImp withEv
            , renderOutput outputGrouped totalFiles totalSize
            , renderRepository hf hl cf cl jf jl commits firstDate
            , renderBuild buildTimestamp lastBuildDur
            ]
          plainText = stripHtmlTags content
          wc = length (words plainText)
          rt = readingTime plainText
          ctx = constField "toc" pageTOC
            <> constField "word-count" (show wc)
            <> constField "reading-time" (show rt)
            <> constField "title" "Build Telemetry"
            <> constField "abstract" "Per-build corpus statistics, tag distribution, \
                 \link analysis, epistemic coverage, output metrics, \
                 \repository overview, and build timing."
            <> constField "build" "true"
            <> authorLinksField
            <> siteCtx
      makeItem content
        >>= loadAndApplyTemplate "templates/essay.html" ctx
        >>= loadAndApplyTemplate "templates/default.html" ctx
        >>= relativizeUrls