PDF compression

This commit is contained in:
Levi Neuwirth 2026-04-22 12:40:22 -04:00
parent 3a95a05284
commit 6d2f9d12ae
4 changed files with 176 additions and 1 deletions

View File

@ -1,4 +1,4 @@
-.PHONY: build deploy sign download-model download-pdfjs convert-images pdf-thumbs pdfs watch clean dev
+.PHONY: build deploy sign download-model download-pdfjs compress-assets convert-images pdf-thumbs pdfs watch clean dev
# Source .env for GITHUB_TOKEN and GITHUB_REPO if it exists.
# .env format: KEY=value (one per line, no `export` prefix, no quotes needed).
@ -25,6 +25,7 @@ build:
else \
echo "Embedding skipped: run 'uv sync' to enable similar-links (build continues)"; \
fi
@./tools/compress-assets.sh _site
> IGNORE.txt
@BUILD_END=$$(date +%s); \
BUILD_START=$$(cat data/build-start.txt); \
@ -44,6 +45,12 @@ download-model:
download-pdfjs:
@./tools/download-pdfjs.sh
# Generate .gz and .br sidecars for compressible text assets in _site/.
# Runs automatically as part of `build`. Pairs with `gzip_static` /
# `brotli_static` in the nginx vhost (see nginx/static-assets.conf).
compress-assets:
@./tools/compress-assets.sh _site
# Convert JPEG/PNG images to WebP companions (also runs automatically in build).
# Requires cwebp: pacman -S libwebp / apt install webp
convert-images:

78
nginx/static-assets.conf Normal file
View File

@ -0,0 +1,78 @@
# static-assets.conf — Compression + long-lived cache headers for static assets.
#
# Place at /etc/nginx/snippets/static-assets.conf and `include` it inside
# the server { } block of the levineuwirth.org vhost:
#
# server {
# server_name levineuwirth.org;
# root /var/www/levineuwirth.org;
# ...
# include snippets/static-assets.conf;
# include snippets/popup-proxy.conf;
# }
#
# Pairs with tools/compress-assets.sh, which pre-generates .gz and .br
# sidecars at build time so nginx never pays the compression cost at
# request time.
# ── On-the-fly gzip (fallback) ───────────────────────────────────────
# Covers dynamically generated responses and any file for which a .gz
# sidecar was not produced (e.g. files smaller than compress-assets.sh's
# MIN_SIZE threshold, or extensions not on its allow-list).
gzip on;
# Emit "Vary: Accept-Encoding" so shared caches store one copy per encoding.
gzip_vary on;
# Level 6 is the usual speed/ratio sweet spot for request-time compression;
# the expensive -9 / brotli-11 work happens offline in compress-assets.sh.
gzip_comp_level 6;
gzip_min_length 256;
# Also compress responses to proxied requests (any request with a Via header).
gzip_proxied any;
# NOTE: text/html is deliberately absent — the gzip module always compresses
# it, and listing it here triggers nginx's "duplicate MIME type" warning.
gzip_types text/plain
text/css
text/xml
application/javascript
text/javascript
application/json
application/xml
application/xml+rss
application/wasm
image/svg+xml;
# ── Pre-compressed sidecars ──────────────────────────────────────────
# Serve <file>.gz / <file>.br when the client advertises a matching
# Accept-Encoding. Zero request-time CPU; maximum compression ratio
# because the sidecars were produced with gzip -9 / brotli -Z.
#
# NOTE(review): gzip_static/brotli_static pick a sidecar purely by its
# existence on disk (no mtime comparison against the source), so the build
# step is responsible for keeping sidecars in sync — confirm the deploy
# never ships a stale .gz/.br next to a newer source file.
gzip_static on;
# brotli_static requires the ngx_brotli module:
# Arch: pacman -S nginx-mod-brotli (or build nginx-mainline with the module)
# Debian/Ubuntu: apt install libnginx-mod-brotli
# If the module is absent, comment out the two brotli lines below; gzip_static
# will still cover every modern browser. Chromium/Firefox/Safari all accept gzip.
brotli_static on;
brotli off; # we ship pre-compressed sidecars only, no on-the-fly brotli
# ── Cache headers ────────────────────────────────────────────────────
# PDF.js viewer is version-pinned in tools/download-pdfjs.sh — bumping
# the pin is a deploy, so `immutable` is safe and makes repeat visits
# instantaneous. Same reasoning applies to fingerprinted fonts and the
# locally vendored ML model files.
#
# The `^~` modifier makes these prefix matches final: once one matches,
# nginx skips regex locations entirely, so the per-extension block below
# can never override the immutable policy for these three trees.
location ^~ /pdfjs/ {
add_header Cache-Control "public, max-age=31536000, immutable" always;
access_log off;
}
location ^~ /fonts/ {
add_header Cache-Control "public, max-age=31536000, immutable" always;
access_log off;
}
location ^~ /models/ {
add_header Cache-Control "public, max-age=31536000, immutable" always;
access_log off;
}
# Per-extension caching for assets that live alongside HTML. CSS and JS
# in this repo are not fingerprinted, so a 1-day cache with must-revalidate
# keeps them responsive to deploys without forcing a fetch on every page.
#
# NOTE(review): an add_header inside a location suppresses add_header
# directives inherited from the enclosing server block — verify the vhost
# does not rely on server-level add_header (e.g. security headers) applying
# to these locations.
location ~* \.(?:css|js|mjs|woff2?|svg|webp|png|jpg|jpeg|ico)$ {
add_header Cache-Control "public, max-age=86400, must-revalidate" always;
access_log off;
}

80
tools/compress-assets.sh Executable file
View File

@ -0,0 +1,80 @@
#!/usr/bin/env bash
# compress-assets.sh — Generate .gz (and .br, if brotli is installed) sidecars
# for compressible text assets in _site/.
#
# Pairs with nginx `gzip_static on` / `brotli_static on`: nginx serves the
# pre-compressed file when the client advertises a matching Accept-Encoding,
# so each build pays the compression cost once (at brotli -q 11) instead of
# the server paying it on every request.
#
# Only files >= MIN_SIZE bytes are compressed — below that, the compression
# framing overhead can exceed the savings. Sidecars are regenerated only
# when the source is newer than the existing sidecar, so re-runs are cheap.
#
# Usage:
# ./tools/compress-assets.sh # compress _site/
# ./tools/compress-assets.sh path/to/dir # compress a specific directory
set -euo pipefail

# Target directory (first CLI arg) and minimum file size worth compressing.
SITE_DIR="${1:-_site}"
MIN_SIZE="${MIN_SIZE:-1024}" # bytes

# Bail out early with a clear diagnostic rather than letting find fail later.
[[ -d "$SITE_DIR" ]] || {
  echo "compress-assets: directory '$SITE_DIR' not found" >&2
  exit 1
}

# Detect the optional brotli encoder; without it we still produce gzip sidecars.
if command -v brotli >/dev/null 2>&1; then
  have_brotli=1
else
  have_brotli=0
  echo "compress-assets: brotli not found — generating gzip only" >&2
  echo " (install: pacman -S brotli / apt install brotli)" >&2
fi

# Export both knobs so the bash subshells spawned by xargs can see them.
export MIN_SIZE have_brotli
# Compress a single file into .gz (and, when available, .br) sidecars placed
# next to it. Reads MIN_SIZE and have_brotli from the environment — both are
# exported above precisely so this function works inside the bash subshells
# that xargs spawns.
# Returns non-zero if a compressor fails, so xargs (and the caller's set -e)
# can surface the failure instead of silently shipping a broken sidecar.
compress_one() {
  local src="$1"
  local size
  # GNU stat (-c) first; BSD/macOS stat (-f) as the fallback.
  size=$(stat -c '%s' "$src" 2>/dev/null || stat -f '%z' "$src")
  if [ "$size" -lt "$MIN_SIZE" ]; then
    # Below the threshold, compression framing overhead beats the savings.
    # Also drop any sidecars left from a previous run when the file was
    # larger: gzip_static serves a sidecar by existence alone, so a stale
    # one would keep serving outdated content forever.
    rm -f -- "$src.gz" "$src.br"
    return 0
  fi
  # gzip sidecar — -9 max ratio, -n strips filename/mtime for reproducible output.
  if [ ! -f "$src.gz" ] || [ "$src" -nt "$src.gz" ]; then
    if gzip -9 -n -c "$src" > "$src.gz.tmp"; then
      mv -- "$src.gz.tmp" "$src.gz"
    else
      rm -f -- "$src.gz.tmp"   # never leave half-written junk in the site tree
      return 1
    fi
  fi
  # brotli sidecar — -Z is the max quality (level 11); slow but cached.
  if [ "$have_brotli" = "1" ]; then
    if [ ! -f "$src.br" ] || [ "$src" -nt "$src.br" ]; then
      if brotli -Z -f -o "$src.br.tmp" "$src"; then
        mv -- "$src.br.tmp" "$src.br"
      else
        rm -f -- "$src.br.tmp"
        return 1
      fi
    fi
  fi
}
# Make the worker function visible to the bash subshells xargs spawns.
export -f compress_one
# Extensions worth compressing. Images (png/jpg/webp) and PDFs are already
# compressed; fonts (woff2) are zstd/brotli internally — don't re-wrap.
#
# xargs hands each worker a batch of 32 filenames instead of the previous
# `-I {}` form, which forked one bash per file and so negated most of the
# parallelism -P buys. `|| exit 1` makes a batch fail fast on the first
# compressor error; xargs then exits 123 and set -e aborts the build.
find "$SITE_DIR" -type f \( \
  -name '*.html' -o \
  -name '*.css' -o \
  -name '*.js' -o \
  -name '*.mjs' -o \
  -name '*.json' -o \
  -name '*.svg' -o \
  -name '*.xml' -o \
  -name '*.txt' -o \
  -name '*.wasm' \
  \) \
  -not -name '*.gz' \
  -not -name '*.br' \
  -print0 \
  | xargs -0 -P "$(nproc 2>/dev/null || echo 4)" -n 32 \
      bash -c 'for f in "$@"; do compress_one "$f" || exit 1; done' _
echo "compress-assets: sidecars written under $SITE_DIR/"

View File

@ -60,5 +60,15 @@ mkdir -p "$PDFJS_DIR"
echo "pdfjs: extracting to $PDFJS_DIR"
unzip -q -o "$tmpdir/$ARCHIVE" -d "$PDFJS_DIR"
# Strip artifacts that are never needed by site users. Saves ~11 MB on
# disk and in rsync; none are referenced by viewer.html at runtime.
# *.map sourcemaps (devtools-only)
# web/debugger.mjs, debugger.css PDF.js developer panel
# web/compressed.tracemonkey-*.pdf demo PDF shipped as the viewer's default
echo "pdfjs: stripping unused artifacts"
# -type f guards -delete against ever matching a directory.
find "$PDFJS_DIR" -type f -name '*.map' -delete
# -f keeps these no-ops if a future PDF.js release drops or renames the files.
rm -f "$PDFJS_DIR/web/debugger.mjs" "$PDFJS_DIR/web/debugger.css"
# Glob is outside the quotes on purpose so the wildcard expands.
rm -f "$PDFJS_DIR"/web/compressed.tracemonkey-*.pdf
echo "pdfjs: done. static/pdfjs/web/viewer.html is ready."
echo " Run 'make build' to include it in _site/."