States/Context/Embeddings fixes
This commit is contained in:
parent
6d2f9d12ae
commit
6585573dae
22
.env.example
22
.env.example
|
|
@ -1,9 +1,15 @@
|
|||
# Copy this file to .env and fill in the values.
|
||||
# .env is gitignored — never commit it.
|
||||
# Copy this file to .env and fill in the values, then run:
|
||||
# chmod 600 .env
|
||||
# so other local users cannot read your VPS path / token. .env is
|
||||
# gitignored — never commit it. The auto-snapshot in `make build`
|
||||
# uses an explicit pathspec under content/ to keep stray .env files
|
||||
# out of the snapshot, but **/.env is also in .gitignore as a backstop.
|
||||
#
|
||||
# `make deploy` rsyncs the built _site/ to the VPS, then pushes the
|
||||
# repository to GitHub. The Makefile aborts with a clear error if any
|
||||
# of VPS_USER / VPS_HOST / VPS_PATH is unset.
|
||||
# `make deploy` pushes to GitHub first, then rsyncs the built _site/
|
||||
# to the VPS. The Makefile aborts with a clear error if any of
|
||||
# VPS_USER / VPS_HOST / VPS_PATH is unset, if VPS_PATH points at an
|
||||
# obviously dangerous parent directory, or if _site/index.html does
|
||||
# not exist (a sign of a broken build).
|
||||
|
||||
# --- VPS deployment target -------------------------------------------------
|
||||
# SSH user on the deployment VPS.
|
||||
|
|
@ -15,8 +21,10 @@ VPS_PATH=
|
|||
|
||||
# --- GitHub mirror push ----------------------------------------------------
|
||||
# A GitHub fine-grained personal access token with Contents: read+write
|
||||
# on the levineuwirth.org repository.
|
||||
# Generate at: https://github.com/settings/tokens
|
||||
# on the levineuwirth.org repository. Currently optional — `make deploy`
|
||||
# uses your local git credential helper for `git push`, so this is only
|
||||
# needed if you wire token-based push into a credential helper yourself.
|
||||
# Generate at: https://github.com/settings/personal-access-tokens/new
|
||||
GITHUB_TOKEN=
|
||||
|
||||
# The GitHub repository in owner/repo format.
|
||||
|
|
|
|||
|
|
@ -3,6 +3,10 @@ _site/
|
|||
_cache/
|
||||
.DS_Store
|
||||
.env
|
||||
# Defense-in-depth: catch any stray .env / .env.* anywhere in the tree
|
||||
# (the auto-snapshot in the Makefile stages content/ on every build).
|
||||
**/.env
|
||||
**/.env.*
|
||||
|
||||
# Editor backup/swap files
|
||||
*~
|
||||
|
|
|
|||
42
Makefile
42
Makefile
|
|
@ -1,9 +1,12 @@
|
|||
.PHONY: build deploy sign download-model download-pdfjs compress-assets convert-images pdf-thumbs pdfs watch clean dev
|
||||
|
||||
# Source .env for GITHUB_TOKEN and GITHUB_REPO if it exists.
|
||||
# Source .env for deploy / GitHub config if it exists.
|
||||
# .env format: KEY=value (one per line, no `export` prefix, no quotes needed).
|
||||
# Only the variables explicitly listed below are exported to recipe
|
||||
# subprocesses — bare `export` would leak every .env key (including any
|
||||
# future GITHUB_TOKEN) into every child process.
|
||||
-include .env
|
||||
export
|
||||
export VPS_USER VPS_HOST VPS_PATH GITHUB_REPO
|
||||
|
||||
build:
|
||||
# Auto-snapshot any uncommitted content/ changes BEFORE the build
|
||||
|
|
@ -12,8 +15,20 @@ build:
|
|||
# the history — that's intentional. The next successful build
|
||||
# either reuses it (no new content/ changes) or appends another
|
||||
# snapshot on top, so failures don't disappear from the log.
|
||||
@git add content/
|
||||
@git diff --cached --quiet || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
||||
#
|
||||
# Pathspec is explicit (not `git add content/`) so a stray .env,
|
||||
# credential file, or other non-content artifact dropped under
|
||||
# content/ is NOT auto-staged. The :(glob) magic prefix makes `**`
|
||||
# match across path components (git default fnmatch does not).
|
||||
# Add new extensions here if a new asset type is introduced.
|
||||
@git add ':(glob)content/**/*.md' ':(glob)content/**/*.html' ':(glob)content/**/*.bib' \
|
||||
':(glob)content/**/*.png' ':(glob)content/**/*.jpg' ':(glob)content/**/*.jpeg' \
|
||||
':(glob)content/**/*.svg' ':(glob)content/**/*.gif' ':(glob)content/**/*.pdf' \
|
||||
':(glob)content/**/*.mp3' ':(glob)content/**/*.ogg' ':(glob)content/**/*.flac' \
|
||||
':(glob)content/**/*.yaml' ':(glob)content/**/*.yml' ':(glob)content/**/*.json' \
|
||||
':(glob)content/**/*.css' ':(glob)content/**/*.tex'
|
||||
@git diff --cached --quiet || git commit -m "auto: $$(date -u +%Y-%m-%dT%H:%M:%SZ) [skip ci]"
|
||||
@mkdir -p data
|
||||
@date +%s > data/build-start.txt
|
||||
@./tools/convert-images.sh
|
||||
@$(MAKE) -s pdf-thumbs
|
||||
|
|
@ -29,7 +44,8 @@ build:
|
|||
> IGNORE.txt
|
||||
@BUILD_END=$$(date +%s); \
|
||||
BUILD_START=$$(cat data/build-start.txt); \
|
||||
echo $$((BUILD_END - BUILD_START)) > data/last-build-seconds.txt
|
||||
echo $$((BUILD_END - BUILD_START)) > data/last-build-seconds.txt.tmp && \
|
||||
mv data/last-build-seconds.txt.tmp data/last-build-seconds.txt
|
||||
|
||||
sign:
|
||||
@./tools/sign-site.sh
|
||||
|
|
@ -99,9 +115,19 @@ deploy: clean build sign
|
|||
@test -n "$(VPS_USER)" || (echo "deploy: VPS_USER not set in .env" >&2; exit 1)
|
||||
@test -n "$(VPS_HOST)" || (echo "deploy: VPS_HOST not set in .env" >&2; exit 1)
|
||||
@test -n "$(VPS_PATH)" || (echo "deploy: VPS_PATH not set in .env" >&2; exit 1)
|
||||
@command -v notify-send >/dev/null 2>&1 && notify-send "make deploy" "Ready to rsync — waiting for SSH auth" || true
|
||||
rsync -avz --delete _site/ $(VPS_USER)@$(VPS_HOST):$(VPS_PATH)/
|
||||
# Refuse to deploy a manifestly broken build. _site/index.html must
|
||||
# exist and be non-empty before we run rsync --delete on the VPS.
|
||||
@test -s _site/index.html || { echo "deploy: _site/index.html is missing or empty — refusing to rsync" >&2; exit 1; }
|
||||
# Defense-in-depth: refuse rsync --delete to obviously dangerous
# parents in case VPS_PATH was typo'd. Trailing slashes are stripped
# before matching so that e.g. /var/www/ or /srv// is refused just like
# /var/www or /srv (a bare `case` on the raw value only matches the
# exact spellings listed). The error message prints the value as
# written in .env so the typo is easy to spot.
	@p="$(VPS_PATH)"; \
	while [ "$$p" != "/" ] && [ "$${p%/}" != "$$p" ]; do p="$${p%/}"; done; \
	case "$$p" in /|/srv|/srv/http|/var|/var/www|/home|/root|"") echo "deploy: VPS_PATH=$(VPS_PATH) looks unsafe — refusing" >&2; exit 1 ;; esac
|
||||
@command -v notify-send >/dev/null 2>&1 && notify-send "make deploy" "Ready to push & rsync — waiting for auth" || true
|
||||
# Push first: a successful push is cheap to roll back, while a
|
||||
# half-completed rsync is harder to recover from. If the push
|
||||
# fails (auth, branch protection, network), abort before touching
|
||||
# the VPS so the public source repo and the live site stay in sync.
|
||||
git push -u origin main
|
||||
rsync -avz --delete _site/ $(VPS_USER)@$(VPS_HOST):$(VPS_PATH)/
|
||||
|
||||
watch: export SITE_ENV = dev
|
||||
watch:
|
||||
|
|
@ -117,4 +143,4 @@ dev: export SITE_ENV = dev
|
|||
dev:
|
||||
cabal run site -- clean
|
||||
cabal run site -- build
|
||||
python3 -m http.server 8000 --directory _site
|
||||
python3 -m http.server 8000 --bind 127.0.0.1 --directory _site
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ module Contexts
|
|||
, compositionCtx
|
||||
, contentKindField
|
||||
, abstractField
|
||||
, descriptionField
|
||||
, tagLinksField
|
||||
, tagLinksFieldExcludingScope
|
||||
, tagLinksFieldExcludingTopSegment
|
||||
|
|
@ -34,7 +35,7 @@ import Data.Time.Format (formatTime, defaultTimeLocale, parseTimeM)
|
|||
import System.FilePath (takeDirectory, takeFileName)
|
||||
import Text.Read (readMaybe)
|
||||
import qualified Data.Text as T
|
||||
import Text.Pandoc (runPure, readMarkdown, writeHtml5String, Pandoc(..), Block(..), Inline(..))
|
||||
import Text.Pandoc (runPure, readMarkdown, writeHtml5String, writePlain, Pandoc(..), Block(..), Inline(..))
|
||||
import Text.Pandoc.Options (WriterOptions(..), HTMLMathMethod(..))
|
||||
import Hakyll hiding (trim)
|
||||
import Backlinks (backlinksField)
|
||||
|
|
@ -348,6 +349,44 @@ abstractField = field "abstract" $ \item -> do
|
|||
isPara (Para _) = True
|
||||
isPara _ = False
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- Description field
|
||||
-- ---------------------------------------------------------------------------
|
||||
|
||||
-- | Renders the @abstract@ frontmatter key as plain text suitable for use in
-- @<meta name="description">@, @og:description@, and @twitter:description@.
--
-- The abstract is parsed as Markdown and written back with Pandoc's plain
-- writer (stripping markup), internal whitespace is collapsed to single
-- spaces, anything longer than 200 characters is cut to 197 characters plus
-- an ellipsis, and the result is passed through 'attrEscape'.
--
-- Signals 'noResult' when no @abstract@ is present, or when it renders to
-- nothing, so @$if(description)$@ short-circuits instead of emitting an
-- empty @content=""@ attribute. A Pandoc failure is still reported with
-- 'fail', because that is a genuine error rather than an absent field.
descriptionField :: Context String
descriptionField = field "description" $ \item -> do
    meta <- getMetadata (itemIdentifier item)
    case lookupString "abstract" meta of
        Nothing  -> noResult "no abstract"
        Just src -> do
            let pandocResult = runPure $ do
                    doc <- readMarkdown defaultHakyllReaderOptions (T.pack src)
                    writePlain defaultHakyllWriterOptions doc
            case pandocResult of
                Left err  -> fail $ "Pandoc error rendering description: " ++ show err
                Right txt ->
                    let -- words/unwords also removes the hard line wraps the
                        -- plain writer may insert.
                        collapsed = T.unwords (T.words txt)
                        capped    = if T.length collapsed > 200
                                        then T.take 197 collapsed <> T.pack "\x2026"
                                        else collapsed
                    in if T.null capped
                           then noResult "empty abstract"
                           else return (attrEscape (T.unpack capped))
|
||||
|
||||
-- | HTML-escape characters that would break out of an attribute value.
--
-- Replaces @&@, @<@, @>@, @"@ and @'@ with the entity references
-- @&amp;@, @&lt;@, @&gt;@, @&quot;@ and @&#39;@; every other character is
-- passed through unchanged. Escaping is done per input character, so the
-- ampersands introduced by the entities themselves are never re-escaped.
attrEscape :: String -> String
attrEscape = concatMap esc
  where
    esc '&'  = "&amp;"
    esc '<'  = "&lt;"
    esc '>'  = "&gt;"
    esc '"'  = "&quot;"
    esc '\'' = "&#39;"
    esc c    = [c]
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- Summary field
|
||||
-- ---------------------------------------------------------------------------
|
||||
|
|
@ -377,6 +416,7 @@ siteCtx =
|
|||
<> buildTimeField
|
||||
<> pageScriptsField
|
||||
<> abstractField
|
||||
<> descriptionField
|
||||
<> summaryField
|
||||
<> dingbatField
|
||||
<> defaultContext
|
||||
|
|
|
|||
|
|
@ -619,7 +619,10 @@ renderDistribution wcs =
|
|||
]
|
||||
counts = foldr (\w acc -> Map.insertWith (+) (bucketOf w) (1 :: Int) acc)
|
||||
(Map.fromList [(i, 0 :: Int) | i <- [0 .. 4]]) wcs
|
||||
buckets = [(labels !! i, fromMaybe 0 (Map.lookup i counts)) | i <- [0 .. 4]]
|
||||
-- Pair labels with bucket indices via @zip@ rather than @(!!)@ to keep
|
||||
-- the function total even if the bucket count and @labels@ list ever
|
||||
-- drift out of sync (matching the discipline used in 'median').
|
||||
buckets = [(lbl, fromMaybe 0 (Map.lookup i counts)) | (i, lbl) <- zip [0 :: Int ..] labels]
|
||||
maxCount = max 1 (maximum (map snd buckets))
|
||||
bar (lbl, n) =
|
||||
let pct = n * 100 `div` maxCount
|
||||
|
|
|
|||
|
|
@ -17,6 +17,6 @@ evidence: 2
|
|||
scope: broad
|
||||
novelty: idiosyncratic
|
||||
practicality: high
|
||||
confidence history:
|
||||
confidence-history:
|
||||
- 65
|
||||
---
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ tags:
|
|||
- nonfiction/philosophy
|
||||
authors:
|
||||
- "Levi Neuwirth | /me.html"
|
||||
revised:
|
||||
history:
|
||||
- date: "2026-04-17"
|
||||
note: "expanded section on Shestov's divergence from Nietzsche"
|
||||
- date: "2025-12-03"
|
||||
|
|
|
|||
|
|
@ -1,3 +1,8 @@
|
|||
---
|
||||
title: Library
|
||||
library: true
|
||||
---
|
||||
|
||||
::: {lang="es"}
|
||||
> *El universo (que otros llaman la Biblioteca) se compone de un número indefinido, y tal vez infinito, de galerías hexagonales, con vastos pozos de ventilación en el medio, cercados por barandas bajísimas.*
|
||||
>
|
||||
|
|
|
|||
|
|
@ -294,20 +294,32 @@
|
|||
}
|
||||
|
||||
/* 1. Citations — synchronous DOM lookup; supports multi-citation groups
      via data-cite-keys (space-separated list of ref-* IDs). Without that
      attribute the target's href fragment (minus the leading '#') is used
      as the single ID.

      Resolves with a <div class="popup-citation"> element holding one
      <div class="popup-citation-entry"> per bibliography entry found, each
      filled with deep clones of that entry's child nodes. Resolves with
      null when none of the IDs exist in the document. Entries are cloned
      as nodes rather than concatenated as innerHTML strings, so nothing
      from the cite target is re-parsed as markup on the way to the popup. */
function citationContent(target) {
    return new Promise(function (resolve) {
        var keysAttr = target.getAttribute('data-cite-keys');
        var ids = keysAttr
            ? keysAttr.trim().split(/\s+/)
            : [(target.getAttribute('href') || '').slice(1)];

        // Unknown IDs (including the empty string) yield null and are dropped.
        var entries = ids
            .map(function (id) { return document.getElementById(id); })
            .filter(Boolean);
        if (!entries.length) { resolve(null); return; }

        var wrapper = document.createElement('div');
        wrapper.className = 'popup-citation';
        entries.forEach(function (entry) {
            var item = document.createElement('div');
            item.className = 'popup-citation-entry';
            // Deep-clone so the original bibliography stays in place.
            Array.prototype.forEach.call(entry.childNodes, function (n) {
                item.appendChild(n.cloneNode(true));
            });
            wrapper.appendChild(item);
        });
        resolve(wrapper);
    });
}
|
||||
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
<a href="/">levineuwirth.org</a>
|
||||
</div>
|
||||
<div class="footer-center">
|
||||
<span class="footer-license">CC BY-SA-NC 4.0 · <a href="https://git.levineuwirth.org/neuwirth/levineuwirth.org">MIT</a> · <a href="/memento-mori.html" class="footer-mm">MM</a></span>
|
||||
<span class="footer-license"><a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" rel="license">CC BY-NC-SA 4.0</a> · <a href="https://git.levineuwirth.org/neuwirth/levineuwirth.org">MIT</a> · <a href="/memento-mori.html" class="footer-mm">MM</a></span>
|
||||
</div>
|
||||
<div class="footer-right">
|
||||
<a href="/build/" class="footer-build-link" aria-label="Build telemetry">build</a> $build-time$
|
||||
|
|
|
|||
|
|
@ -1,6 +1,21 @@
|
|||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
$if(home)$<title>Levi Neuwirth</title>$else$<title>$title$ — Levi Neuwirth</title>$endif$
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
$if(home)$<title>Levi Neuwirth</title>$else$$if(title)$<title>$title$ — Levi Neuwirth</title>$else$<title>Levi Neuwirth</title>$endif$$endif$
|
||||
$if(description)$<meta name="description" content="$description$">$endif$
|
||||
<link rel="canonical" href="$site-url$$url$">
|
||||
<link rel="alternate" type="application/atom+xml" title="Levi Neuwirth" href="/feed.xml">
|
||||
<link rel="alternate" type="application/atom+xml" title="Levi Neuwirth — music" href="/music/feed.xml">
|
||||
|
||||
<!-- OpenGraph / Twitter (link-preview unfurling) -->
|
||||
<meta property="og:site_name" content="Levi Neuwirth">
|
||||
$if(home)$<meta property="og:title" content="Levi Neuwirth">$else$$if(title)$<meta property="og:title" content="$title$">$endif$$endif$
|
||||
$if(description)$<meta property="og:description" content="$description$">$endif$
|
||||
<meta property="og:url" content="$site-url$$url$">
|
||||
$if(date)$<meta property="og:type" content="article">$else$<meta property="og:type" content="website">$endif$
|
||||
<meta property="og:image" content="$site-url$/web-app-manifest-512x512.png">
|
||||
<meta name="twitter:card" content="summary">
|
||||
$if(description)$<meta name="twitter:description" content="$description$">$endif$
|
||||
|
||||
<link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
|
||||
<link rel="icon" type="image/svg+xml" href="/favicon.svg">
|
||||
<link rel="shortcut icon" href="/favicon.ico">
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ Staleness check: skips if all output files are newer than every HTML in _site/.
|
|||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
|
@ -45,6 +46,19 @@ MAX_PARA_CHARS = 1000 # semantic: truncate before embedding
|
|||
|
||||
EXCLUDE_URLS = {"/search/", "/build/", "/404.html", "/feed.xml", "/music/feed.xml"}
|
||||
|
||||
|
||||
def atomic_write_bytes(path: Path, data: bytes) -> None:
    """Write ``data`` to ``path`` via a sibling temp file and ``os.replace``.

    The bytes are first written to ``<name>.tmp`` next to the target and then
    renamed over it, so an interrupt mid-write cannot leave a truncated file
    that the next build/serve loads. Missing parent directories are created.

    If the write or the rename fails (including on KeyboardInterrupt), the
    temp file is removed before the exception is re-raised, so failed runs
    do not accumulate stray ``*.tmp`` files beside the outputs.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_name(path.name + ".tmp")
    try:
        tmp.write_bytes(data)
        os.replace(tmp, path)
    except BaseException:
        # Best-effort cleanup; the temp file may not exist if the failure
        # happened before it was created.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
|
||||
|
||||
|
||||
def atomic_write_text(path: Path, text: str) -> None:
    """Encode ``text`` as UTF-8 and write it to ``path`` through
    ``atomic_write_bytes``."""
    encoded = text.encode("utf-8")
    atomic_write_bytes(path, encoded)
|
||||
|
||||
STRIP_SELECTORS = [
|
||||
"nav", "footer", "#toc", ".link-popup", "script", "style",
|
||||
".page-meta-footer", ".metadata", "[data-pagefind-ignore]",
|
||||
|
|
@ -204,8 +218,7 @@ def main() -> int:
|
|||
if neighbours:
|
||||
similar[page["url"]] = neighbours
|
||||
|
||||
SIMILAR_OUT.parent.mkdir(parents=True, exist_ok=True)
|
||||
SIMILAR_OUT.write_text(json.dumps(similar, ensure_ascii=False, indent=2))
|
||||
atomic_write_text(SIMILAR_OUT, json.dumps(similar, ensure_ascii=False, indent=2))
|
||||
print(f"embed.py: wrote {len(similar)} similar-links entries")
|
||||
|
||||
# --- Semantic index (paragraph level) ---
|
||||
|
|
@ -221,12 +234,12 @@ def main() -> int:
|
|||
batch_size=64,
|
||||
).astype(np.float32)
|
||||
|
||||
SEMANTIC_BIN.write_bytes(para_vecs.tobytes())
|
||||
atomic_write_bytes(SEMANTIC_BIN, para_vecs.tobytes())
|
||||
|
||||
meta = [{"url": p["url"], "title": p["title"],
|
||||
"heading": p["heading"], "excerpt": p["excerpt"]}
|
||||
for p in paragraphs]
|
||||
SEMANTIC_META.write_text(json.dumps(meta, ensure_ascii=False))
|
||||
atomic_write_text(SEMANTIC_META, json.dumps(meta, ensure_ascii=False))
|
||||
|
||||
print(f"embed.py: wrote {len(paragraphs)} paragraphs to semantic index "
|
||||
f"({SEMANTIC_BIN.stat().st_size // 1024} KB)")
|
||||
|
|
|
|||
|
|
@ -40,15 +40,51 @@ if ! GNUPGHOME="$GNUPGHOME" gpg \
|
|||
fi
|
||||
echo "sign-site: pre-flight OK — signing $SITE_DIR..." >&2
|
||||
|
||||
find "$SITE_DIR" -name "*.html" -print0 | xargs -0 -I {} -P $(nproc) \
|
||||
gpg --homedir "$GNUPGHOME" \
|
||||
# Sign sequentially through a single gpg-agent: parallel signing causes
|
||||
# pinentry/IPC races where individual signs fail silently while xargs
|
||||
# still exits 0. Atomic write via .tmp + mv avoids leaving a truncated
|
||||
# .sig if the script is interrupted mid-write.
|
||||
sign_one() {
    # Detach-sign a single HTML file.
    #   $1  path to the HTML file
    # The armored signature is written to "<file>.sig.tmp" and renamed to
    # "<file>.sig" only after gpg succeeds, so an interrupted or failed run
    # never leaves a truncated .sig in place. On failure the temp file is
    # removed, a message goes to stderr, and the function returns 1.
    # Reads GNUPGHOME and SIGNING_KEY from the enclosing script.
    local html="$1"
    local sig="${html}.sig"
    local tmp="${sig}.tmp"
    if ! gpg --homedir "$GNUPGHOME" \
            --batch \
            --yes \
            --detach-sign \
            --armor \
            --local-user "$SIGNING_KEY" \
            --output "$tmp" \
            "$html"; then
        rm -f "$tmp"
        echo "sign-site: FAILED to sign $html" >&2
        return 1
    fi
    mv -f "$tmp" "$sig"
}
|
||||
|
||||
count=0
|
||||
while IFS= read -r -d '' html; do
|
||||
sign_one "$html"
|
||||
count=$((count + 1))
|
||||
done < <(find "$SITE_DIR" -name "*.html" -print0)
|
||||
|
||||
# Post-sign manifest verification: every .html must have a non-empty
|
||||
# matching .sig. This catches any per-file failure that slipped through
|
||||
# (set -e bails on first failure inside the loop, but a manual --output
|
||||
# write to a directory containing a stale .sig from a prior run could
|
||||
# look "successful" otherwise).
|
||||
missing=0
|
||||
while IFS= read -r -d '' html; do
|
||||
if [ ! -s "${html}.sig" ]; then
|
||||
echo "sign-site: missing/empty signature for $html" >&2
|
||||
missing=$((missing + 1))
|
||||
fi
|
||||
done < <(find "$SITE_DIR" -name "*.html" -print0)
|
||||
|
||||
if [ "$missing" -ne 0 ]; then
|
||||
echo "sign-site: $missing HTML files lack signatures — aborting" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
count=$(find "$SITE_DIR" -name "*.html" -printf '.' | wc -c)
|
||||
echo "Signed $count HTML files in $SITE_DIR."
|
||||
|
|
|
|||
Loading…
Reference in New Issue