levineuwirth.org/nginx/popup-proxy.conf

193 lines
8.9 KiB
Plaintext

# popup-proxy.conf — same-origin reverse proxy for popups.js providers
# whose upstream APIs do not send CORS headers (arXiv, NCBI/PubMed,
# Internet Archive), plus the arXiv HTML renditions used for lead
# figures. The proxied metadata is immutable or changes only rarely, so
# the cache TTLs are generous (7–30 days for successful responses) and a
# manual `proxy_cache_purge` is unnecessary.
#
# Place this file at /etc/nginx/snippets/popup-proxy.conf and `include`
# it inside the server { } block of the levineuwirth.org vhost. The
# `proxy_cache_path` directive must live in the http { } context — put
# it in nginx.conf or the relevant conf.d/ file.
#
# http {
# proxy_cache_path /var/cache/nginx/popup-proxy
# levels=1:2 keys_zone=popup_proxy:16m
# max_size=512m inactive=60d use_temp_path=off;
# ...
# }
#
# server {
# server_name levineuwirth.org;
# ...
# include snippets/popup-proxy.conf;
# }
# All locations use `^~` prefix matching: without it, the regex
# location in static-assets.conf (per-extension cache headers) outranks
# a plain prefix match and captures any proxied URL ending in an image
# extension — e.g. an arXiv figure .png — serving a local 404 instead
# of proxying. `^~` short-circuits regex evaluation for this subtree.
# DNS for the proxied upstreams. Every proxy_pass in this file names its
# upstream through a variable, and nginx resolves such names at request
# time through this resolver (a literal hostname would be resolved once,
# at startup). Request-time lookup lets nginx start even when an upstream
# is unreachable and keeps working when an upstream's IP changes.
# Lookups are IPv4-only, answers are cached for 300s, and a lookup is
# abandoned after 5s.
resolver_timeout 5s;
resolver 1.1.1.1 8.8.8.8 valid=300s ipv6=off;
# ── arXiv ────────────────────────────────────────────────────────────
# Atom feed of paper metadata. Abstracts never change after publication
# (revisions get distinct IDs like 2604.06217v2), so 30d is safe.
location ^~ /proxy/arxiv/ {
    # The upstream host is held in a variable so that DNS is resolved at
    # request time (see the `resolver` directive in this file).
    set $upstream_arxiv export.arxiv.org;

    # With a VARIABLE upstream, a URI part on proxy_pass is passed to
    # the upstream literally — "proxy_pass https://$up/;" sends every
    # request to the upstream's homepage instead of prefix-stripping.
    # Strip the prefix explicitly; `break` keeps args intact.
    rewrite ^/proxy/arxiv/(.*)$ /$1 break;
    proxy_pass https://$upstream_arxiv;
    proxy_ssl_server_name on;

    # Request headers sent upstream. Identify ourselves with a contact
    # address, and present the upstream's own name as Host.
    proxy_set_header Host $upstream_arxiv;
    proxy_set_header User-Agent "levineuwirth.org popup-proxy (ln@levineuwirth.org)";
    # nginx forwards every other client request header unchanged, so
    # without these two lines a visitor's cookies / credentials for THIS
    # site would be handed to a third party. An empty value suppresses
    # the header entirely.
    proxy_set_header Cookie "";
    proxy_set_header Authorization "";

    # Keep the security baseline: the add_header directives below
    # would otherwise drop it for /proxy/ responses (same pattern
    # as archive.conf). The upstream's own security headers are hidden
    # first — browsers honor only the FIRST Strict-Transport-Security
    # header (RFC 6797 §8.1), so an upstream's short max-age passing
    # through ahead of ours would downgrade the domain's cached HSTS
    # policy on every popup fetch.
    proxy_hide_header Strict-Transport-Security;
    proxy_hide_header Content-Security-Policy;
    proxy_hide_header X-Frame-Options;
    # Same reasoning for cookies: a Set-Cookie relayed from the upstream
    # would be stored by the browser against our origin.
    proxy_hide_header Set-Cookie;
    include snippets/security-headers.conf;

    # Cache: successful responses for 30 days, everything else for
    # 5 minutes. Stale entries are served while the upstream is failing
    # or while a refresh is in flight; the lock collapses concurrent
    # misses for the same key into one upstream request.
    proxy_cache popup_proxy;
    proxy_cache_valid 200 30d;
    proxy_cache_valid any 5m;
    proxy_cache_use_stale error timeout updating http_500 http_502 http_503 http_504;
    proxy_cache_lock on;

    # Expose HIT/MISS/etc. for debugging.
    add_header X-Cache-Status $upstream_cache_status always;
    # Belt-and-suspenders: even though same-origin doesn't need CORS, a
    # future migration of popups.js to a worker or different origin would.
    # NOTE(review): the value echoes this vhost's own origin, so a caller
    # on a different origin would still need to be allowed explicitly.
    add_header Access-Control-Allow-Origin "$scheme://$host" always;
}
# ── arXiv HTML renditions (lead figures) ─────────────────────────────
# popups.js's arXiv enrich step fetches the LaTeXML HTML page to find
# the paper's lead figure, then loads the figure image itself — both
# through this location. Upstream is arxiv.org proper, NOT
# export.arxiv.org: the export host serves the Atom API fine but
# rate-limits the /html/ asset tree (429s on figures). Pages can be
# large (hundreds of KB), which is exactly why they're cached here.
# A 404 is the common no-HTML-rendition case (pre-2024 papers,
# unconvertible sources) — cached briefly so hovers don't hammer it.
location ^~ /proxy/arxiv-html/ {
    # The upstream host is held in a variable so that DNS is resolved at
    # request time (see the `resolver` directive in this file).
    set $upstream_arxiv_site arxiv.org;

    # Map /proxy/arxiv-html/<rest> to /html/<rest> on the upstream. The
    # prefix is rewritten explicitly because proxy_pass carries a
    # variable upstream with no URI part; `break` keeps args intact.
    rewrite ^/proxy/arxiv-html/(.*)$ /html/$1 break;
    proxy_pass https://$upstream_arxiv_site;
    proxy_ssl_server_name on;

    # Request headers sent upstream. Identify ourselves with a contact
    # address, and present the upstream's own name as Host.
    proxy_set_header Host $upstream_arxiv_site;
    proxy_set_header User-Agent "levineuwirth.org popup-proxy (ln@levineuwirth.org)";
    # nginx forwards every other client request header unchanged, so
    # without these two lines a visitor's cookies / credentials for THIS
    # site would be handed to a third party. An empty value suppresses
    # the header entirely.
    proxy_set_header Cookie "";
    proxy_set_header Authorization "";

    # Keep the security baseline: the add_header directives below
    # would otherwise drop it for /proxy/ responses (same pattern
    # as archive.conf). The upstream's own security headers are hidden
    # first — browsers honor only the FIRST Strict-Transport-Security
    # header (RFC 6797 §8.1), so an upstream's short max-age passing
    # through ahead of ours would downgrade the domain's cached HSTS
    # policy on every popup fetch.
    proxy_hide_header Strict-Transport-Security;
    proxy_hide_header Content-Security-Policy;
    proxy_hide_header X-Frame-Options;
    # Same reasoning for cookies: a Set-Cookie relayed from the upstream
    # would be stored by the browser against our origin.
    proxy_hide_header Set-Cookie;
    include snippets/security-headers.conf;

    # Cache: successful responses for 30 days, 404s (no HTML rendition
    # available) for 1 day, everything else for 5 minutes. Stale entries
    # are served while the upstream is failing or while a refresh is in
    # flight; the lock collapses concurrent misses for the same key into
    # one upstream request.
    proxy_cache popup_proxy;
    proxy_cache_valid 200 30d;
    proxy_cache_valid 404 1d;
    proxy_cache_valid any 5m;
    proxy_cache_use_stale error timeout updating http_500 http_502 http_503 http_504;
    proxy_cache_lock on;

    # Expose HIT/MISS/etc. for debugging, and echo this vhost's own
    # origin as the allowed CORS origin.
    add_header X-Cache-Status $upstream_cache_status always;
    add_header Access-Control-Allow-Origin "$scheme://$host" always;
}
# ── Internet Archive ─────────────────────────────────────────────────
# Item metadata JSON. Item descriptions are author-edited and could
# change, but rarely; 7d strikes a reasonable balance.
location ^~ /proxy/archive/ {
    # The upstream host is held in a variable so that DNS is resolved at
    # request time (see the `resolver` directive in this file).
    set $upstream_archive archive.org;

    # Prefix-strip explicitly: with a variable upstream, a URI part on
    # proxy_pass would be sent to the upstream literally instead of
    # replacing the matched prefix. `break` keeps args intact.
    rewrite ^/proxy/archive/(.*)$ /$1 break;
    proxy_pass https://$upstream_archive;
    proxy_ssl_server_name on;

    # Request headers sent upstream. Identify ourselves with a contact
    # address, and present the upstream's own name as Host.
    proxy_set_header Host $upstream_archive;
    proxy_set_header User-Agent "levineuwirth.org popup-proxy (ln@levineuwirth.org)";
    # nginx forwards every other client request header unchanged, so
    # without these two lines a visitor's cookies / credentials for THIS
    # site would be handed to a third party. An empty value suppresses
    # the header entirely.
    proxy_set_header Cookie "";
    proxy_set_header Authorization "";

    # Keep the security baseline: the add_header directives below
    # would otherwise drop it for /proxy/ responses (same pattern
    # as archive.conf). The upstream's own security headers are hidden
    # first — browsers honor only the FIRST Strict-Transport-Security
    # header (RFC 6797 §8.1), so an upstream's short max-age passing
    # through ahead of ours would downgrade the domain's cached HSTS
    # policy on every popup fetch.
    proxy_hide_header Strict-Transport-Security;
    proxy_hide_header Content-Security-Policy;
    proxy_hide_header X-Frame-Options;
    # Same reasoning for cookies: a Set-Cookie relayed from the upstream
    # would be stored by the browser against our origin.
    proxy_hide_header Set-Cookie;
    include snippets/security-headers.conf;

    # Cache: successful responses for 7 days (item descriptions can be
    # edited), everything else for 5 minutes. Stale entries are served
    # while the upstream is failing or while a refresh is in flight; the
    # lock collapses concurrent misses for the same key into one
    # upstream request.
    proxy_cache popup_proxy;
    proxy_cache_valid 200 7d;
    proxy_cache_valid any 5m;
    proxy_cache_use_stale error timeout updating http_500 http_502 http_503 http_504;
    proxy_cache_lock on;

    # Expose HIT/MISS/etc. for debugging, and echo this vhost's own
    # origin as the allowed CORS origin.
    add_header X-Cache-Status $upstream_cache_status always;
    add_header Access-Control-Allow-Origin "$scheme://$host" always;
}
# ── PubMed (NCBI E-utilities) ────────────────────────────────────────
# Article summaries. NCBI requests a tool=/email= identifier on every
# request (https://www.ncbi.nlm.nih.gov/books/NBK25497/); we inject
# them server-side so popups.js stays focused on rendering.
location ^~ /proxy/pubmed/ {
    # The upstream host is held in a variable so that DNS is resolved at
    # request time (see the `resolver` directive in this file).
    set $upstream_pubmed eutils.ncbi.nlm.nih.gov;

    # Prefix-strip explicitly: with a variable upstream, a URI part on
    # proxy_pass would be sent to the upstream literally instead of
    # replacing the matched prefix.
    #
    # The replacement also injects the tool=/email= identifiers NCBI asks
    # for. Because the replacement carries its own query string, nginx
    # appends the client's original arguments after it, so the upstream
    # sees "?tool=…&email=…&<original args>". The client-visible URL is
    # unchanged, so the default cache key (which is built from the
    # original request URI) is unaffected — assuming proxy_cache_key is
    # left at its default.
    # NOTE(review): the tool name is derived from the User-Agent string
    # below; adjust if a different identifier is registered with NCBI.
    rewrite ^/proxy/pubmed/(.*)$ /$1?tool=levineuwirth-popup-proxy&email=ln@levineuwirth.org break;
    proxy_pass https://$upstream_pubmed;
    proxy_ssl_server_name on;

    # Request headers sent upstream. Identify ourselves with a contact
    # address, and present the upstream's own name as Host.
    proxy_set_header Host $upstream_pubmed;
    proxy_set_header User-Agent "levineuwirth.org popup-proxy (ln@levineuwirth.org)";
    # nginx forwards every other client request header unchanged, so
    # without these two lines a visitor's cookies / credentials for THIS
    # site would be handed to a third party. An empty value suppresses
    # the header entirely.
    proxy_set_header Cookie "";
    proxy_set_header Authorization "";

    # NCBI etiquette: stay at or below 3 req/s without an API key.
    # limit_req is evaluated before the cache is consulted, so it counts
    # every request to this location — cache hits included — and excess
    # requests beyond the burst are rejected rather than queued
    # (`nodelay`).
    # NOTE(review): the companion zone suggested at the end of this file
    # is keyed per client address, so this bounds each visitor, not the
    # aggregate rate NCBI sees from this server — confirm that is the
    # intent.
    limit_req zone=pubmed burst=3 nodelay;

    # Keep the security baseline: the add_header directives below
    # would otherwise drop it for /proxy/ responses (same pattern
    # as archive.conf). The upstream's own security headers are hidden
    # first — browsers honor only the FIRST Strict-Transport-Security
    # header (RFC 6797 §8.1), so an upstream's short max-age passing
    # through ahead of ours would downgrade the domain's cached HSTS
    # policy on every popup fetch.
    proxy_hide_header Strict-Transport-Security;
    proxy_hide_header Content-Security-Policy;
    proxy_hide_header X-Frame-Options;
    # Same reasoning for cookies: a Set-Cookie relayed from the upstream
    # would be stored by the browser against our origin.
    proxy_hide_header Set-Cookie;
    include snippets/security-headers.conf;

    # Cache: successful responses for 30 days, everything else for
    # 5 minutes. Stale entries are served while the upstream is failing
    # or while a refresh is in flight; the lock collapses concurrent
    # misses for the same key into one upstream request.
    proxy_cache popup_proxy;
    proxy_cache_valid 200 30d;
    proxy_cache_valid any 5m;
    proxy_cache_use_stale error timeout updating http_500 http_502 http_503 http_504;
    proxy_cache_lock on;

    # Expose HIT/MISS/etc. for debugging, and echo this vhost's own
    # origin as the allowed CORS origin.
    add_header X-Cache-Status $upstream_cache_status always;
    add_header Access-Control-Allow-Origin "$scheme://$host" always;
}
# Companion directive for the limit_req above. Place in http { } context:
#
# http {
# limit_req_zone $binary_remote_addr zone=pubmed:1m rate=3r/s;
# ...
# }