States/Context/Embeddings fixes

This commit is contained in:
Levi Neuwirth 2026-04-26 11:22:57 -04:00
parent 6d2f9d12ae
commit 6585573dae
13 changed files with 207 additions and 45 deletions

View File

@ -1,9 +1,15 @@
# Copy this file to .env and fill in the values.
# .env is gitignored — never commit it.
# Copy this file to .env and fill in the values, then run:
# chmod 600 .env
# so other local users cannot read your VPS path / token. .env is
# gitignored — never commit it. The auto-snapshot in `make build`
# uses an explicit pathspec under content/ to keep stray .env files
# out of the snapshot, but **/.env is also in .gitignore as a backstop.
#
# `make deploy` rsyncs the built _site/ to the VPS, then pushes the
# repository to GitHub. The Makefile aborts with a clear error if any
# of VPS_USER / VPS_HOST / VPS_PATH is unset.
# `make deploy` pushes to GitHub first, then rsyncs the built _site/
# to the VPS. The Makefile aborts with a clear error if any of
# VPS_USER / VPS_HOST / VPS_PATH is unset, if VPS_PATH points at an
# obviously dangerous parent directory, or if _site/index.html does
# not exist (a sign of a broken build).
# --- VPS deployment target -------------------------------------------------
# SSH user on the deployment VPS.
@ -15,8 +21,10 @@ VPS_PATH=
# --- GitHub mirror push ----------------------------------------------------
# A GitHub fine-grained personal access token with Contents: read+write
# on the levineuwirth.org repository.
# Generate at: https://github.com/settings/tokens
# on the levineuwirth.org repository. Currently optional — `make deploy`
# uses your local git credential helper for `git push`, so this is only
# needed if you wire token-based push into a credential helper yourself.
# Generate at: https://github.com/settings/personal-access-tokens/new
GITHUB_TOKEN=
# The GitHub repository in owner/repo format.

4
.gitignore vendored
View File

@ -3,6 +3,10 @@ _site/
_cache/
.DS_Store
.env
# Defense-in-depth: catch any stray .env / .env.* anywhere in the tree
# (the auto-snapshot in the Makefile stages content/ on every build).
**/.env
**/.env.*
# Editor backup/swap files
*~

View File

@ -1,9 +1,12 @@
.PHONY: build deploy sign download-model download-pdfjs compress-assets convert-images pdf-thumbs pdfs watch clean dev
# Source .env for GITHUB_TOKEN and GITHUB_REPO if it exists.
# Source .env for deploy / GitHub config if it exists.
# .env format: KEY=value (one per line, no `export` prefix, no quotes needed).
# Only the variables explicitly listed below are exported to recipe
# subprocesses — bare `export` would leak every .env key (including any
# future GITHUB_TOKEN) into every child process.
-include .env
export
export VPS_USER VPS_HOST VPS_PATH GITHUB_REPO
build:
# Auto-snapshot any uncommitted content/ changes BEFORE the build
@ -12,8 +15,20 @@ build:
# the history — that's intentional. The next successful build
# either reuses it (no new content/ changes) or appends another
# snapshot on top, so failures don't disappear from the log.
@git add content/
@git diff --cached --quiet || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ)"
#
# Pathspec is explicit (not `git add content/`) so a stray .env,
# credential file, or other non-content artifact dropped under
# content/ is NOT auto-staged. The :(glob) magic prefix makes `**`
# match across path components (git default fnmatch does not).
# Add new extensions here if a new asset type is introduced.
@git add ':(glob)content/**/*.md' ':(glob)content/**/*.html' ':(glob)content/**/*.bib' \
':(glob)content/**/*.png' ':(glob)content/**/*.jpg' ':(glob)content/**/*.jpeg' \
':(glob)content/**/*.svg' ':(glob)content/**/*.gif' ':(glob)content/**/*.pdf' \
':(glob)content/**/*.mp3' ':(glob)content/**/*.ogg' ':(glob)content/**/*.flac' \
':(glob)content/**/*.yaml' ':(glob)content/**/*.yml' ':(glob)content/**/*.json' \
':(glob)content/**/*.css' ':(glob)content/**/*.tex'
@git diff --cached --quiet || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ) [skip ci]"
@mkdir -p data
@date +%s > data/build-start.txt
@./tools/convert-images.sh
@$(MAKE) -s pdf-thumbs
@ -29,7 +44,8 @@ build:
> IGNORE.txt
@BUILD_END=$$(date +%s); \
BUILD_START=$$(cat data/build-start.txt); \
echo $$((BUILD_END - BUILD_START)) > data/last-build-seconds.txt
echo $$((BUILD_END - BUILD_START)) > data/last-build-seconds.txt.tmp && \
mv data/last-build-seconds.txt.tmp data/last-build-seconds.txt
sign:
@./tools/sign-site.sh
@ -99,9 +115,19 @@ deploy: clean build sign
@test -n "$(VPS_USER)" || (echo "deploy: VPS_USER not set in .env" >&2; exit 1)
@test -n "$(VPS_HOST)" || (echo "deploy: VPS_HOST not set in .env" >&2; exit 1)
@test -n "$(VPS_PATH)" || (echo "deploy: VPS_PATH not set in .env" >&2; exit 1)
@command -v notify-send >/dev/null 2>&1 && notify-send "make deploy" "Ready to rsync — waiting for SSH auth" || true
rsync -avz --delete _site/ $(VPS_USER)@$(VPS_HOST):$(VPS_PATH)/
# Refuse to deploy a manifestly broken build. _site/index.html must
# exist and be non-empty before we run rsync --delete on the VPS.
@test -s _site/index.html || { echo "deploy: _site/index.html is missing or empty — refusing to rsync" >&2; exit 1; }
# Defense-in-depth: refuse rsync --delete to obviously dangerous
# parents in case VPS_PATH was typo'd (e.g. trailing-slash mistake).
@case "$(VPS_PATH)" in /|/srv|/srv/http|/var|/var/www|/home|/root|"") echo "deploy: VPS_PATH=$(VPS_PATH) looks unsafe — refusing" >&2; exit 1 ;; esac
@command -v notify-send >/dev/null 2>&1 && notify-send "make deploy" "Ready to push & rsync — waiting for auth" || true
# Push first: a successful push is cheap to roll back, while a
# half-completed rsync is harder to recover from. If the push
# fails (auth, branch protection, network), abort before touching
# the VPS so the public source repo and the live site stay in sync.
git push -u origin main
rsync -avz --delete _site/ $(VPS_USER)@$(VPS_HOST):$(VPS_PATH)/
watch: export SITE_ENV = dev
watch:
@ -117,4 +143,4 @@ dev: export SITE_ENV = dev
dev:
cabal run site -- clean
cabal run site -- build
python3 -m http.server 8000 --directory _site
python3 -m http.server 8000 --bind 127.0.0.1 --directory _site

View File

@ -10,6 +10,7 @@ module Contexts
, compositionCtx
, contentKindField
, abstractField
, descriptionField
, tagLinksField
, tagLinksFieldExcludingScope
, tagLinksFieldExcludingTopSegment
@ -34,7 +35,7 @@ import Data.Time.Format (formatTime, defaultTimeLocale, parseTimeM)
import System.FilePath (takeDirectory, takeFileName)
import Text.Read (readMaybe)
import qualified Data.Text as T
import Text.Pandoc (runPure, readMarkdown, writeHtml5String, Pandoc(..), Block(..), Inline(..))
import Text.Pandoc (runPure, readMarkdown, writeHtml5String, writePlain, Pandoc(..), Block(..), Inline(..))
import Text.Pandoc.Options (WriterOptions(..), HTMLMathMethod(..))
import Hakyll hiding (trim)
import Backlinks (backlinksField)
@ -348,6 +349,44 @@ abstractField = field "abstract" $ \item -> do
isPara (Para _) = True
isPara _ = False
-- ---------------------------------------------------------------------------
-- Description field
-- ---------------------------------------------------------------------------
-- | Renders the @abstract@ frontmatter key as plain text suitable for use in
-- @<meta name="description">@, @og:description@, and @twitter:description@.
-- Strips Pandoc markup, collapses internal whitespace, truncates to ~200
-- chars, and HTML-escapes attribute-special characters. Fails the field
-- when no @abstract@ is present (so @$if(description)$@ short-circuits).
descriptionField :: Context String
descriptionField = field "description" $ \item -> do
  meta <- getMetadata (itemIdentifier item)
  case lookupString "abstract" meta of
    -- No abstract: a failing field is treated as absent, so templates
    -- guarded by $if(description)$ render nothing.
    Nothing -> fail "no abstract"
    Just src -> do
      -- Round-trip through Pandoc: parse the abstract as Markdown, then
      -- re-render as plain text so inline markup (emphasis, links, math)
      -- cannot leak into the <meta> tag content.
      let pandocResult = runPure $ do
            doc <- readMarkdown defaultHakyllReaderOptions (T.pack src)
            writePlain defaultHakyllWriterOptions doc
      case pandocResult of
        Left err -> fail $ "Pandoc error rendering description: " ++ show err
        Right txt ->
          -- Collapse all internal whitespace runs (including newlines from
          -- writePlain) to single spaces, cap at ~200 chars (197 chars plus
          -- a U+2026 ellipsis), then escape for use inside a double-quoted
          -- HTML attribute value.
          let collapsed = T.unwords (T.words txt)
              capped = if T.length collapsed > 200
                       then T.take 197 collapsed <> T.pack "\x2026"
                       else collapsed
          in return (attrEscape (T.unpack capped))
-- | Escape the five characters that could break out of a quoted HTML
-- attribute value (@&@, @<@, @>@, @"@, @'@), leaving all others intact.
attrEscape :: String -> String
attrEscape = concatMap entity
  where
    entity :: Char -> String
    entity c = case c of
      '&'  -> "&amp;"
      '<'  -> "&lt;"
      '>'  -> "&gt;"
      '"'  -> "&quot;"
      '\'' -> "&#39;"
      _    -> [c]
-- ---------------------------------------------------------------------------
-- Summary field
-- ---------------------------------------------------------------------------
@ -377,6 +416,7 @@ siteCtx =
<> buildTimeField
<> pageScriptsField
<> abstractField
<> descriptionField
<> summaryField
<> dingbatField
<> defaultContext

View File

@ -619,7 +619,10 @@ renderDistribution wcs =
]
counts = foldr (\w acc -> Map.insertWith (+) (bucketOf w) (1 :: Int) acc)
(Map.fromList [(i, 0 :: Int) | i <- [0 .. 4]]) wcs
buckets = [(labels !! i, fromMaybe 0 (Map.lookup i counts)) | i <- [0 .. 4]]
-- Pair labels with bucket indices via @zip@ rather than @(!!)@ to keep
-- the function total even if the bucket count and @labels@ list ever
-- drift out of sync (matching the discipline used in 'median').
buckets = [(lbl, fromMaybe 0 (Map.lookup i counts)) | (i, lbl) <- zip [0 :: Int ..] labels]
maxCount = max 1 (maximum (map snd buckets))
bar (lbl, n) =
let pct = n * 100 `div` maxCount

View File

@ -17,6 +17,6 @@ evidence: 2
scope: broad
novelty: idiosyncratic
practicality: high
confidence history:
confidence-history:
- 65
---

View File

@ -8,7 +8,7 @@ tags:
- nonfiction/philosophy
authors:
- "Levi Neuwirth | /me.html"
revised:
history:
- date: "2026-04-17"
note: "expanded section on Shestov's divergence from Nietzsche"
- date: "2025-12-03"

View File

@ -1,3 +1,8 @@
---
title: Library
library: true
---
::: {lang="es"}
> *El universo (que otros llaman la Biblioteca) se compone de un número indefinido, y tal vez infinito, de galerías hexagonales, con vastos pozos de ventilación en el medio, cercados por barandas bajísimas.*
>

View File

@ -294,20 +294,32 @@
}
/* 1. Citations synchronous DOM lookup; supports multi-citation groups
via data-cite-keys (space-separated list of ref-* IDs). */
via data-cite-keys (space-separated list of ref-* IDs).
Returns a DocumentFragment of cloned bibliography entries instead
of stringifying innerHTML, so a malicious or malformed cite target
cannot smuggle markup through the popup's innerHTML setter. */
function citationContent(target) {
return new Promise(function (resolve) {
var keysAttr = target.getAttribute('data-cite-keys');
var ids = keysAttr
? keysAttr.trim().split(/\s+/)
: [(target.getAttribute('href') || '').slice(1)];
var parts = ids.map(function (id) {
var entry = document.getElementById(id);
return entry ? '<div class="popup-citation-entry">' + entry.innerHTML + '</div>' : null;
}).filter(Boolean);
resolve(parts.length
? '<div class="popup-citation">' + parts.join('') + '</div>'
: null);
var entries = ids
.map(function (id) { return document.getElementById(id); })
.filter(Boolean);
if (!entries.length) { resolve(null); return; }
var wrapper = document.createElement('div');
wrapper.className = 'popup-citation';
entries.forEach(function (entry) {
var item = document.createElement('div');
item.className = 'popup-citation-entry';
Array.prototype.forEach.call(entry.childNodes, function (n) {
item.appendChild(n.cloneNode(true));
});
wrapper.appendChild(item);
});
resolve(wrapper);
});
}

View File

@ -4,7 +4,7 @@
<a href="/">levineuwirth.org</a>
</div>
<div class="footer-center">
<span class="footer-license">CC BY-SA-NC 4.0 · <a href="https://git.levineuwirth.org/neuwirth/levineuwirth.org">MIT</a> · <a href="/memento-mori.html" class="footer-mm">MM</a></span>
<span class="footer-license"><a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" rel="license">CC&nbsp;BY-NC-SA&nbsp;4.0</a> · <a href="https://git.levineuwirth.org/neuwirth/levineuwirth.org">MIT</a> · <a href="/memento-mori.html" class="footer-mm">MM</a></span>
</div>
<div class="footer-right">
<a href="/build/" class="footer-build-link" aria-label="Build telemetry">build</a> $build-time$

View File

@ -1,6 +1,21 @@
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
$if(home)$<title>Levi Neuwirth</title>$else$<title>$title$ — Levi Neuwirth</title>$endif$
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
$if(home)$<title>Levi Neuwirth</title>$else$$if(title)$<title>$title$ — Levi Neuwirth</title>$else$<title>Levi Neuwirth</title>$endif$$endif$
$if(description)$<meta name="description" content="$description$">$endif$
<link rel="canonical" href="$site-url$$url$">
<link rel="alternate" type="application/atom+xml" title="Levi Neuwirth" href="/feed.xml">
<link rel="alternate" type="application/atom+xml" title="Levi Neuwirth — music" href="/music/feed.xml">
<!-- OpenGraph / Twitter (link-preview unfurling) -->
<meta property="og:site_name" content="Levi Neuwirth">
$if(home)$<meta property="og:title" content="Levi Neuwirth">$else$$if(title)$<meta property="og:title" content="$title$">$endif$$endif$
$if(description)$<meta property="og:description" content="$description$">$endif$
<meta property="og:url" content="$site-url$$url$">
$if(date)$<meta property="og:type" content="article">$else$<meta property="og:type" content="website">$endif$
<meta property="og:image" content="$site-url$/web-app-manifest-512x512.png">
<meta name="twitter:card" content="summary">
$if(description)$<meta name="twitter:description" content="$description$">$endif$
<link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
<link rel="icon" type="image/svg+xml" href="/favicon.svg">
<link rel="shortcut icon" href="/favicon.ico">

View File

@ -16,6 +16,7 @@ Staleness check: skips if all output files are newer than every HTML in _site/.
"""
import json
import os
import re
import sys
from pathlib import Path
@ -45,6 +46,19 @@ MAX_PARA_CHARS = 1000 # semantic: truncate before embedding
EXCLUDE_URLS = {"/search/", "/build/", "/404.html", "/feed.xml", "/music/feed.xml"}
def atomic_write_bytes(path: Path, data: bytes) -> None:
    """Write ``data`` to ``path`` atomically via a same-directory temp file.

    Writes to ``<name>.tmp`` then ``os.replace``s it over ``path``, so an
    interrupt mid-write cannot leave a truncated file that the next
    build/serve loads. Parent directories are created as needed.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # with_name (not with_suffix) so suffix-less paths and dotfiles get a
    # predictable "<name>.tmp" sibling without with_suffix's ValueError
    # edge cases; same directory keeps os.replace on one filesystem,
    # which is what makes the rename atomic.
    tmp = path.with_name(path.name + ".tmp")
    with open(tmp, "wb") as fh:
        fh.write(data)
        fh.flush()
        # fsync before the rename: the tmp-then-replace dance already
        # survives a killed process, but without fsync a power loss can
        # still publish an empty/partial file under the final name.
        os.fsync(fh.fileno())
    os.replace(tmp, path)
def atomic_write_text(path: Path, text: str) -> None:
    """UTF-8 convenience wrapper around :func:`atomic_write_bytes`."""
    atomic_write_bytes(path, text.encode("utf-8"))
STRIP_SELECTORS = [
"nav", "footer", "#toc", ".link-popup", "script", "style",
".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
@ -204,8 +218,7 @@ def main() -> int:
if neighbours:
similar[page["url"]] = neighbours
SIMILAR_OUT.parent.mkdir(parents=True, exist_ok=True)
SIMILAR_OUT.write_text(json.dumps(similar, ensure_ascii=False, indent=2))
atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2))
print(f"embed.py: wrote {len(similar)} similar-links entries")
# --- Semantic index (paragraph level) ---
@ -221,12 +234,12 @@ def main() -> int:
batch_size=64,
).astype(np.float32)
SEMANTIC_BIN.write_bytes(para_vecs.tobytes())
atomic_write_bytes(SEMANTIC_BIN, para_vecs.tobytes())
meta = [{"url": p["url"], "title": p["title"],
"heading": p["heading"], "excerpt": p["excerpt"]}
for p in paragraphs]
SEMANTIC_META.write_text(json.dumps(meta, ensure_ascii=False))
atomic_write_text(SEMANTIC_META, json.dumps(meta, ensure_ascii=False))
print(f"embed.py: wrote {len(paragraphs)} paragraphs to semantic index "
f"({SEMANTIC_BIN.stat().st_size // 1024} KB)")

View File

@ -40,15 +40,51 @@ if ! GNUPGHOME="$GNUPGHOME" gpg \
fi
echo "sign-site: pre-flight OK — signing $SITE_DIR..." >&2
find "$SITE_DIR" -name "*.html" -print0 | xargs -0 -I {} -P $(nproc) \
gpg --homedir "$GNUPGHOME" \
--batch \
--yes \
--detach-sign \
--armor \
--local-user "$SIGNING_KEY" \
--output "{}.sig" \
"{}"
# Sign sequentially through the single gpg-agent: signing in parallel
# (the old xargs -P path) triggers pinentry/IPC races where individual
# signs fail silently while xargs still exits 0. Each signature is
# written to "<file>.sig.tmp" and renamed into place so an interrupt
# never leaves a truncated .sig; on gpg failure the temp file is
# removed and the function returns non-zero.
sign_one() {
    local html="$1" sig tmp
    sig="${html}.sig"
    tmp="${sig}.tmp"
    gpg --homedir "$GNUPGHOME" \
        --batch \
        --yes \
        --detach-sign \
        --armor \
        --local-user "$SIGNING_KEY" \
        --output "$tmp" \
        "$html" || {
        rm -f "$tmp"
        echo "sign-site: FAILED to sign $html" >&2
        return 1
    }
    mv -f "$tmp" "$sig"
}
count=0
while IFS= read -r -d '' html; do
sign_one "$html"
count=$((count + 1))
done < <(find "$SITE_DIR" -name "*.html" -print0)
# Post-sign manifest verification: every .html must have a non-empty
# matching .sig. This catches any per-file failure that slipped through
# (set -e bails on first failure inside the loop, but a manual --output
# write to a directory containing a stale .sig from a prior run could
# look "successful" otherwise).
missing=0
while IFS= read -r -d '' html; do
if [ ! -s "${html}.sig" ]; then
echo "sign-site: missing/empty signature for $html" >&2
missing=$((missing + 1))
fi
done < <(find "$SITE_DIR" -name "*.html" -print0)
if [ "$missing" -ne 0 ]; then
echo "sign-site: $missing HTML files lack signatures — aborting" >&2
exit 1
fi
count=$(find "$SITE_DIR" -name "*.html" -printf '.' | wc -c)
echo "Signed $count HTML files in $SITE_DIR."