States/Context/Embeddings fixes

This commit is contained in:
Levi Neuwirth 2026-04-26 11:22:57 -04:00
parent 6d2f9d12ae
commit 6585573dae
13 changed files with 207 additions and 45 deletions

View File

@ -1,9 +1,15 @@
# Copy this file to .env and fill in the values. # Copy this file to .env and fill in the values, then run:
# .env is gitignored — never commit it. # chmod 600 .env
# so other local users cannot read your VPS path / token. .env is
# gitignored — never commit it. The auto-snapshot in `make build`
# uses an explicit pathspec under content/ to keep stray .env files
# out of the snapshot, but **/.env is also in .gitignore as a backstop.
# #
# `make deploy` rsyncs the built _site/ to the VPS, then pushes the # `make deploy` pushes to GitHub first, then rsyncs the built _site/
# repository to GitHub. The Makefile aborts with a clear error if any # to the VPS. The Makefile aborts with a clear error if any of
# of VPS_USER / VPS_HOST / VPS_PATH is unset. # VPS_USER / VPS_HOST / VPS_PATH is unset, if VPS_PATH points at an
# obviously dangerous parent directory, or if _site/index.html does
# not exist (a sign of a broken build).
# --- VPS deployment target ------------------------------------------------- # --- VPS deployment target -------------------------------------------------
# SSH user on the deployment VPS. # SSH user on the deployment VPS.
@ -15,8 +21,10 @@ VPS_PATH=
# --- GitHub mirror push ---------------------------------------------------- # --- GitHub mirror push ----------------------------------------------------
# A GitHub fine-grained personal access token with Contents: read+write # A GitHub fine-grained personal access token with Contents: read+write
# on the levineuwirth.org repository. # on the levineuwirth.org repository. Currently optional — `make deploy`
# Generate at: https://github.com/settings/tokens # uses your local git credential helper for `git push`, so this is only
# needed if you wire token-based push into a credential helper yourself.
# Generate at: https://github.com/settings/personal-access-tokens/new
GITHUB_TOKEN= GITHUB_TOKEN=
# The GitHub repository in owner/repo format. # The GitHub repository in owner/repo format.

4
.gitignore vendored
View File

@ -3,6 +3,10 @@ _site/
_cache/ _cache/
.DS_Store .DS_Store
.env .env
# Defense-in-depth: catch any stray .env / .env.* anywhere in the tree
# (the auto-snapshot in the Makefile stages content/ on every build).
**/.env
**/.env.*
# Editor backup/swap files # Editor backup/swap files
*~ *~

View File

@ -1,9 +1,12 @@
.PHONY: build deploy sign download-model download-pdfjs compress-assets convert-images pdf-thumbs pdfs watch clean dev .PHONY: build deploy sign download-model download-pdfjs compress-assets convert-images pdf-thumbs pdfs watch clean dev
# Source .env for GITHUB_TOKEN and GITHUB_REPO if it exists. # Source .env for deploy / GitHub config if it exists.
# .env format: KEY=value (one per line, no `export` prefix, no quotes needed). # .env format: KEY=value (one per line, no `export` prefix, no quotes needed).
# Only the variables explicitly listed below are exported to recipe
# subprocesses — bare `export` would leak every .env key (including any
# future GITHUB_TOKEN) into every child process.
-include .env -include .env
export export VPS_USER VPS_HOST VPS_PATH GITHUB_REPO
build: build:
# Auto-snapshot any uncommitted content/ changes BEFORE the build # Auto-snapshot any uncommitted content/ changes BEFORE the build
@ -12,8 +15,20 @@ build:
# the history — that's intentional. The next successful build # the history — that's intentional. The next successful build
# either reuses it (no new content/ changes) or appends another # either reuses it (no new content/ changes) or appends another
# snapshot on top, so failures don't disappear from the log. # snapshot on top, so failures don't disappear from the log.
@git add content/ #
@git diff --cached --quiet || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ)" # Pathspec is explicit (not `git add content/`) so a stray .env,
# credential file, or other non-content artifact dropped under
# content/ is NOT auto-staged. The :(glob) magic prefix makes `**`
# match across path components (git default fnmatch does not).
# Add new extensions here if a new asset type is introduced.
@git add ':(glob)content/**/*.md' ':(glob)content/**/*.html' ':(glob)content/**/*.bib' \
':(glob)content/**/*.png' ':(glob)content/**/*.jpg' ':(glob)content/**/*.jpeg' \
':(glob)content/**/*.svg' ':(glob)content/**/*.gif' ':(glob)content/**/*.pdf' \
':(glob)content/**/*.mp3' ':(glob)content/**/*.ogg' ':(glob)content/**/*.flac' \
':(glob)content/**/*.yaml' ':(glob)content/**/*.yml' ':(glob)content/**/*.json' \
':(glob)content/**/*.css' ':(glob)content/**/*.tex'
@git diff --cached --quiet || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ) [skip ci]"
@mkdir -p data
@date +%s > data/build-start.txt @date +%s > data/build-start.txt
@./tools/convert-images.sh @./tools/convert-images.sh
@$(MAKE) -s pdf-thumbs @$(MAKE) -s pdf-thumbs
@ -29,7 +44,8 @@ build:
> IGNORE.txt > IGNORE.txt
@BUILD_END=$$(date +%s); \ @BUILD_END=$$(date +%s); \
BUILD_START=$$(cat data/build-start.txt); \ BUILD_START=$$(cat data/build-start.txt); \
echo $$((BUILD_END - BUILD_START)) > data/last-build-seconds.txt echo $$((BUILD_END - BUILD_START)) > data/last-build-seconds.txt.tmp && \
mv data/last-build-seconds.txt.tmp data/last-build-seconds.txt
sign: sign:
@./tools/sign-site.sh @./tools/sign-site.sh
@ -99,9 +115,19 @@ deploy: clean build sign
@test -n "$(VPS_USER)" || (echo "deploy: VPS_USER not set in .env" >&2; exit 1) @test -n "$(VPS_USER)" || (echo "deploy: VPS_USER not set in .env" >&2; exit 1)
@test -n "$(VPS_HOST)" || (echo "deploy: VPS_HOST not set in .env" >&2; exit 1) @test -n "$(VPS_HOST)" || (echo "deploy: VPS_HOST not set in .env" >&2; exit 1)
@test -n "$(VPS_PATH)" || (echo "deploy: VPS_PATH not set in .env" >&2; exit 1) @test -n "$(VPS_PATH)" || (echo "deploy: VPS_PATH not set in .env" >&2; exit 1)
@command -v notify-send >/dev/null 2>&1 && notify-send "make deploy" "Ready to rsync — waiting for SSH auth" || true # Refuse to deploy a manifestly broken build. _site/index.html must
rsync -avz --delete _site/ $(VPS_USER)@$(VPS_HOST):$(VPS_PATH)/ # exist and be non-empty before we run rsync --delete on the VPS.
@test -s _site/index.html || { echo "deploy: _site/index.html is missing or empty — refusing to rsync" >&2; exit 1; }
# Defense-in-depth: refuse rsync --delete to obviously dangerous
# parents in case VPS_PATH was typo'd (e.g. trailing-slash mistake).
@case "$(VPS_PATH)" in /|/srv|/srv/http|/var|/var/www|/home|/root|"") echo "deploy: VPS_PATH=$(VPS_PATH) looks unsafe — refusing" >&2; exit 1 ;; esac
@command -v notify-send >/dev/null 2>&1 && notify-send "make deploy" "Ready to push & rsync — waiting for auth" || true
# Push first: a successful push is cheap to roll back, while a
# half-completed rsync is harder to recover from. If the push
# fails (auth, branch protection, network), abort before touching
# the VPS so the public source repo and the live site stay in sync.
git push -u origin main git push -u origin main
rsync -avz --delete _site/ $(VPS_USER)@$(VPS_HOST):$(VPS_PATH)/
watch: export SITE_ENV = dev watch: export SITE_ENV = dev
watch: watch:
@ -117,4 +143,4 @@ dev: export SITE_ENV = dev
dev: dev:
cabal run site -- clean cabal run site -- clean
cabal run site -- build cabal run site -- build
python3 -m http.server 8000 --directory _site python3 -m http.server 8000 --bind 127.0.0.1 --directory _site

View File

@ -10,6 +10,7 @@ module Contexts
, compositionCtx , compositionCtx
, contentKindField , contentKindField
, abstractField , abstractField
, descriptionField
, tagLinksField , tagLinksField
, tagLinksFieldExcludingScope , tagLinksFieldExcludingScope
, tagLinksFieldExcludingTopSegment , tagLinksFieldExcludingTopSegment
@ -34,7 +35,7 @@ import Data.Time.Format (formatTime, defaultTimeLocale, parseTimeM)
import System.FilePath (takeDirectory, takeFileName) import System.FilePath (takeDirectory, takeFileName)
import Text.Read (readMaybe) import Text.Read (readMaybe)
import qualified Data.Text as T import qualified Data.Text as T
import Text.Pandoc (runPure, readMarkdown, writeHtml5String, Pandoc(..), Block(..), Inline(..)) import Text.Pandoc (runPure, readMarkdown, writeHtml5String, writePlain, Pandoc(..), Block(..), Inline(..))
import Text.Pandoc.Options (WriterOptions(..), HTMLMathMethod(..)) import Text.Pandoc.Options (WriterOptions(..), HTMLMathMethod(..))
import Hakyll hiding (trim) import Hakyll hiding (trim)
import Backlinks (backlinksField) import Backlinks (backlinksField)
@ -348,6 +349,44 @@ abstractField = field "abstract" $ \item -> do
isPara (Para _) = True isPara (Para _) = True
isPara _ = False isPara _ = False
-- ---------------------------------------------------------------------------
-- Description field
-- ---------------------------------------------------------------------------
-- | Renders the @abstract@ frontmatter key as plain text suitable for use in
-- @<meta name="description">@, @og:description@, and @twitter:description@.
-- Strips Pandoc markup, collapses internal whitespace, truncates to ~200
-- chars, and HTML-escapes attribute-special characters. Fails (via @fail@,
-- which the template engine treats as "no value") when no @abstract@ is
-- present, so @$if(description)$@ falls through to its else branch.
descriptionField :: Context String
descriptionField = field "description" $ \item -> do
meta <- getMetadata (itemIdentifier item)
case lookupString "abstract" meta of
Nothing -> fail "no abstract"
Just src -> do
let pandocResult = runPure $ do
doc <- readMarkdown defaultHakyllReaderOptions (T.pack src)
writePlain defaultHakyllWriterOptions doc
case pandocResult of
Left err -> fail $ "Pandoc error rendering description: " ++ show err
Right txt ->
let collapsed = T.unwords (T.words txt)
capped = if T.length collapsed > 200
then T.take 197 collapsed <> T.pack "\x2026"
else collapsed
in return (attrEscape (T.unpack capped))
-- | HTML-escape characters that would break out of an attribute value.
attrEscape :: String -> String
attrEscape = concatMap esc
where
esc '&' = "&amp;"
esc '<' = "&lt;"
esc '>' = "&gt;"
esc '"' = "&quot;"
esc '\'' = "&#39;"
esc c = [c]
-- --------------------------------------------------------------------------- -- ---------------------------------------------------------------------------
-- Summary field -- Summary field
-- --------------------------------------------------------------------------- -- ---------------------------------------------------------------------------
@ -377,6 +416,7 @@ siteCtx =
<> buildTimeField <> buildTimeField
<> pageScriptsField <> pageScriptsField
<> abstractField <> abstractField
<> descriptionField
<> summaryField <> summaryField
<> dingbatField <> dingbatField
<> defaultContext <> defaultContext

View File

@ -619,7 +619,10 @@ renderDistribution wcs =
] ]
counts = foldr (\w acc -> Map.insertWith (+) (bucketOf w) (1 :: Int) acc) counts = foldr (\w acc -> Map.insertWith (+) (bucketOf w) (1 :: Int) acc)
(Map.fromList [(i, 0 :: Int) | i <- [0 .. 4]]) wcs (Map.fromList [(i, 0 :: Int) | i <- [0 .. 4]]) wcs
buckets = [(labels !! i, fromMaybe 0 (Map.lookup i counts)) | i <- [0 .. 4]] -- Pair labels with bucket indices via @zip@ rather than @(!!)@ to keep
-- the function total even if the bucket count and @labels@ list ever
-- drift out of sync (matching the discipline used in 'median').
buckets = [(lbl, fromMaybe 0 (Map.lookup i counts)) | (i, lbl) <- zip [0 :: Int ..] labels]
maxCount = max 1 (maximum (map snd buckets)) maxCount = max 1 (maximum (map snd buckets))
bar (lbl, n) = bar (lbl, n) =
let pct = n * 100 `div` maxCount let pct = n * 100 `div` maxCount

View File

@ -17,6 +17,6 @@ evidence: 2
scope: broad scope: broad
novelty: idiosyncratic novelty: idiosyncratic
practicality: high practicality: high
confidence history: confidence-history:
- 65 - 65
--- ---

View File

@ -8,7 +8,7 @@ tags:
- nonfiction/philosophy - nonfiction/philosophy
authors: authors:
- "Levi Neuwirth | /me.html" - "Levi Neuwirth | /me.html"
revised: history:
- date: "2026-04-17" - date: "2026-04-17"
note: "expanded section on Shestov's divergence from Nietzsche" note: "expanded section on Shestov's divergence from Nietzsche"
- date: "2025-12-03" - date: "2025-12-03"

View File

@ -1,3 +1,8 @@
---
title: Library
library: true
---
::: {lang="es"} ::: {lang="es"}
> *El universo (que otros llaman la Biblioteca) se compone de un número indefinido, y tal vez infinito, de galerías hexagonales, con vastos pozos de ventilación en el medio, cercados por barandas bajísimas.* > *El universo (que otros llaman la Biblioteca) se compone de un número indefinido, y tal vez infinito, de galerías hexagonales, con vastos pozos de ventilación en el medio, cercados por barandas bajísimas.*
> >

View File

@ -294,20 +294,32 @@
} }
/* 1. Citations synchronous DOM lookup; supports multi-citation groups /* 1. Citations synchronous DOM lookup; supports multi-citation groups
via data-cite-keys (space-separated list of ref-* IDs). */ via data-cite-keys (space-separated list of ref-* IDs).
Returns a DocumentFragment of cloned bibliography entries instead
of stringifying innerHTML, so a malicious or malformed cite target
cannot smuggle markup through the popup's innerHTML setter. */
function citationContent(target) { function citationContent(target) {
return new Promise(function (resolve) { return new Promise(function (resolve) {
var keysAttr = target.getAttribute('data-cite-keys'); var keysAttr = target.getAttribute('data-cite-keys');
var ids = keysAttr var ids = keysAttr
? keysAttr.trim().split(/\s+/) ? keysAttr.trim().split(/\s+/)
: [(target.getAttribute('href') || '').slice(1)]; : [(target.getAttribute('href') || '').slice(1)];
var parts = ids.map(function (id) { var entries = ids
var entry = document.getElementById(id); .map(function (id) { return document.getElementById(id); })
return entry ? '<div class="popup-citation-entry">' + entry.innerHTML + '</div>' : null; .filter(Boolean);
}).filter(Boolean); if (!entries.length) { resolve(null); return; }
resolve(parts.length
? '<div class="popup-citation">' + parts.join('') + '</div>' var wrapper = document.createElement('div');
: null); wrapper.className = 'popup-citation';
entries.forEach(function (entry) {
var item = document.createElement('div');
item.className = 'popup-citation-entry';
Array.prototype.forEach.call(entry.childNodes, function (n) {
item.appendChild(n.cloneNode(true));
});
wrapper.appendChild(item);
});
resolve(wrapper);
}); });
} }

View File

@ -4,7 +4,7 @@
<a href="/">levineuwirth.org</a> <a href="/">levineuwirth.org</a>
</div> </div>
<div class="footer-center"> <div class="footer-center">
<span class="footer-license">CC BY-SA-NC 4.0 · <a href="https://git.levineuwirth.org/neuwirth/levineuwirth.org">MIT</a> · <a href="/memento-mori.html" class="footer-mm">MM</a></span> <span class="footer-license"><a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" rel="license">CC&nbsp;BY-NC-SA&nbsp;4.0</a> · <a href="https://git.levineuwirth.org/neuwirth/levineuwirth.org">MIT</a> · <a href="/memento-mori.html" class="footer-mm">MM</a></span>
</div> </div>
<div class="footer-right"> <div class="footer-right">
<a href="/build/" class="footer-build-link" aria-label="Build telemetry">build</a> $build-time$ <a href="/build/" class="footer-build-link" aria-label="Build telemetry">build</a> $build-time$

View File

@ -1,6 +1,21 @@
<meta charset="UTF-8"> <meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1">
$if(home)$<title>Levi Neuwirth</title>$else$<title>$title$ — Levi Neuwirth</title>$endif$ $if(home)$<title>Levi Neuwirth</title>$else$$if(title)$<title>$title$ — Levi Neuwirth</title>$else$<title>Levi Neuwirth</title>$endif$$endif$
$if(description)$<meta name="description" content="$description$">$endif$
<link rel="canonical" href="$site-url$$url$">
<link rel="alternate" type="application/atom+xml" title="Levi Neuwirth" href="/feed.xml">
<link rel="alternate" type="application/atom+xml" title="Levi Neuwirth — music" href="/music/feed.xml">
<!-- OpenGraph / Twitter (link-preview unfurling) -->
<meta property="og:site_name" content="Levi Neuwirth">
$if(home)$<meta property="og:title" content="Levi Neuwirth">$else$$if(title)$<meta property="og:title" content="$title$">$endif$$endif$
$if(description)$<meta property="og:description" content="$description$">$endif$
<meta property="og:url" content="$site-url$$url$">
$if(date)$<meta property="og:type" content="article">$else$<meta property="og:type" content="website">$endif$
<meta property="og:image" content="$site-url$/web-app-manifest-512x512.png">
<meta name="twitter:card" content="summary">
$if(description)$<meta name="twitter:description" content="$description$">$endif$
<link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96"> <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
<link rel="icon" type="image/svg+xml" href="/favicon.svg"> <link rel="icon" type="image/svg+xml" href="/favicon.svg">
<link rel="shortcut icon" href="/favicon.ico"> <link rel="shortcut icon" href="/favicon.ico">

View File

@ -16,6 +16,7 @@ Staleness check: skips if all output files are newer than every HTML in _site/.
""" """
import json import json
import os
import re import re
import sys import sys
from pathlib import Path from pathlib import Path
@ -45,6 +46,19 @@ MAX_PARA_CHARS = 1000 # semantic: truncate before embedding
EXCLUDE_URLS = {"/search/", "/build/", "/404.html", "/feed.xml", "/music/feed.xml"} EXCLUDE_URLS = {"/search/", "/build/", "/404.html", "/feed.xml", "/music/feed.xml"}
def atomic_write_bytes(path: Path, data: bytes) -> None:
"""Write to path.tmp then os.replace, so an interrupt mid-write
cannot leave a truncated file that the next build/serve loads."""
path.parent.mkdir(parents=True, exist_ok=True)
tmp = path.with_suffix(path.suffix + ".tmp")
tmp.write_bytes(data)
os.replace(tmp, path)
def atomic_write_text(path: Path, text: str) -> None:
atomic_write_bytes(path, text.encode("utf-8"))
STRIP_SELECTORS = [ STRIP_SELECTORS = [
"nav", "footer", "#toc", ".link-popup", "script", "style", "nav", "footer", "#toc", ".link-popup", "script", "style",
".page-meta-footer", ".metadata", "[data-pagefind-ignore]", ".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
@ -204,8 +218,7 @@ def main() -> int:
if neighbours: if neighbours:
similar[page["url"]] = neighbours similar[page["url"]] = neighbours
SIMILAR_OUT.parent.mkdir(parents=True, exist_ok=True) atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2))
SIMILAR_OUT.write_text(json.dumps(similar, ensure_ascii=False, indent=2))
print(f"embed.py: wrote {len(similar)} similar-links entries") print(f"embed.py: wrote {len(similar)} similar-links entries")
# --- Semantic index (paragraph level) --- # --- Semantic index (paragraph level) ---
@ -221,12 +234,12 @@ def main() -> int:
batch_size=64, batch_size=64,
).astype(np.float32) ).astype(np.float32)
SEMANTIC_BIN.write_bytes(para_vecs.tobytes()) atomic_write_bytes(SEMANTIC_BIN, para_vecs.tobytes())
meta = [{"url": p["url"], "title": p["title"], meta = [{"url": p["url"], "title": p["title"],
"heading": p["heading"], "excerpt": p["excerpt"]} "heading": p["heading"], "excerpt": p["excerpt"]}
for p in paragraphs] for p in paragraphs]
SEMANTIC_META.write_text(json.dumps(meta, ensure_ascii=False)) atomic_write_text(SEMANTIC_META, json.dumps(meta, ensure_ascii=False))
print(f"embed.py: wrote {len(paragraphs)} paragraphs to semantic index " print(f"embed.py: wrote {len(paragraphs)} paragraphs to semantic index "
f"({SEMANTIC_BIN.stat().st_size // 1024} KB)") f"({SEMANTIC_BIN.stat().st_size // 1024} KB)")

View File

@ -40,15 +40,51 @@ if ! GNUPGHOME="$GNUPGHOME" gpg \
fi fi
echo "sign-site: pre-flight OK — signing $SITE_DIR..." >&2 echo "sign-site: pre-flight OK — signing $SITE_DIR..." >&2
find "$SITE_DIR" -name "*.html" -print0 | xargs -0 -I {} -P $(nproc) \ # Sign sequentially through a single gpg-agent: parallel signing causes
gpg --homedir "$GNUPGHOME" \ # pinentry/IPC races where individual signs fail silently while xargs
--batch \ # still exits 0. Atomic write via .tmp + mv avoids leaving a truncated
--yes \ # .sig if the script is interrupted mid-write.
--detach-sign \ sign_one() {
--armor \ local html="$1"
--local-user "$SIGNING_KEY" \ local sig="${html}.sig"
--output "{}.sig" \ local tmp="${sig}.tmp"
"{}" if ! gpg --homedir "$GNUPGHOME" \
--batch \
--yes \
--detach-sign \
--armor \
--local-user "$SIGNING_KEY" \
--output "$tmp" \
"$html"; then
rm -f "$tmp"
echo "sign-site: FAILED to sign $html" >&2
return 1
fi
mv -f "$tmp" "$sig"
}
count=0
while IFS= read -r -d '' html; do
sign_one "$html"
count=$((count + 1))
done < <(find "$SITE_DIR" -name "*.html" -print0)
# Post-sign manifest verification: every .html must have a non-empty
# matching .sig. This is a backstop for any per-file failure that slips
# through the loop above — in particular, a stale .sig left behind by a
# prior run could otherwise make a file that failed to sign on this run
# look "successful".
missing=0
while IFS= read -r -d '' html; do
if [ ! -s "${html}.sig" ]; then
echo "sign-site: missing/empty signature for $html" >&2
missing=$((missing + 1))
fi
done < <(find "$SITE_DIR" -name "*.html" -print0)
if [ "$missing" -ne 0 ]; then
echo "sign-site: $missing HTML files lack signatures — aborting" >&2
exit 1
fi
count=$(find "$SITE_DIR" -name "*.html" -printf '.' | wc -c)
echo "Signed $count HTML files in $SITE_DIR." echo "Signed $count HTML files in $SITE_DIR."