445 lines
14 KiB
Python
Executable File
445 lines
14 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
extract-exif.py — Build-time EXIF sidecar generator for photography.
|
|
|
|
Walks content/photography/**/*.{jpg,jpeg,png} and writes a
|
|
{photo}.exif.yaml sidecar alongside each image. The Hakyll context in
|
|
build/Contexts.hs reads these sidecars and merges their fields into
|
|
the photographyCtx so authors don't have to hand-write camera / lens /
|
|
exposure / captured-date in frontmatter for digital photos.
|
|
|
|
Frontmatter always wins. The sidecar is a strict fallback — present
|
|
to populate fields the author chose not to write. Film scans typically
|
|
have no EXIF; the sidecar is still written but with an empty body, and
|
|
the author hand-writes the relevant fields in frontmatter.
|
|
|
|
Strategy:
|
|
|
|
1. Prefer exiftool when available — Perl-based, ships in distro
|
|
repos, handles every camera vendor's tag dialect (incl. RAW).
|
|
2. Fall back to Pillow's EXIF reader — pure Python, narrower
|
|
coverage, but always available via the project's .venv.
|
|
|
|
Staleness check: skips an image whose sidecar mtime >= image mtime.
|
|
This means re-running the tool is idempotent and cheap.
|
|
|
|
GPS coordinates are written to the sidecar at full precision; the
|
|
geo-precision rounding (`exact | km | city | hidden`) is applied in
|
|
Hakyll at the consuming end, against each photo's frontmatter
|
|
`geo-precision:` value. The sidecar is the source of truth; the
|
|
consumer is the privacy gate.
|
|
|
|
Called by `make build` when .venv exists. Failures on individual
|
|
images are logged and the rest of the walk continues.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
from fractions import Fraction
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import yaml
|
|
|
|
# Parent of the directory that contains this script.
REPO_ROOT = Path(__file__).parent.parent
# Root of the tree that is walked for photographs.
CONTENT_DIR = REPO_ROOT.joinpath("content", "photography")

# File suffixes treated as images; compared against the lower-cased suffix.
IMAGE_EXTS = {".png", ".jpeg", ".jpg"}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Field normalisation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Keys that may appear in a sidecar, in the order they are written out.
# They are deliberately lowercase-with-hyphens, matching the Hakyll
# photographyCtx convention on the consuming side.
SIDECAR_KEYS = [
    # Equipment.
    "camera", "lens",
    # Exposure: the composed summary string, then its components.
    "exposure", "shutter", "aperture", "iso", "focal-length",
    # Capture date and location.
    "captured", "geo",
    # Pixel dimensions of the delivered (resized, EXIF-stripped) JPEG.
    # Threaded through to the Hakyll photographyCtx and emitted as
    # width / height attrs on every <img> tag, which prevents cumulative
    # layout shift while photos load.
    "width", "height",
]
|
|
|
|
|
|
def _format_shutter(speed: float) -> str:
    """Render an exposure time given in seconds as a display string.

    Exposures of one second or longer are written as seconds ("2s").
    Shorter exposures are written as a reciprocal ("1/125") when the
    rounded reciprocal is a faithful representation; otherwise they are
    written as decimal seconds ("0.8s", "0.4s"). Without that check a
    0.8 s exposure would come out as "1/1" and 0.4 s as "1/2".

    Returns "" for zero, negative, or NaN input.
    """
    # `not speed > 0` (rather than `speed <= 0`) also rejects NaN, which
    # would otherwise reach round() and raise.
    if not speed > 0:
        return ""
    if speed >= 1.0:
        return f"{speed:g}s"
    denom = round(1.0 / speed)
    # Accept the reciprocal only if 1/denom is within 5% of the real
    # exposure; a denominator below 2 can never describe a sub-second time.
    if denom < 2 or abs(speed * denom - 1.0) > 0.05:
        return f"{speed:g}s"
    return f"1/{denom}"
|
|
|
|
|
|
def _format_aperture(value: float) -> str:
    """Render an f-number as "f/8" or "f/2.8"; "" for non-positive input."""
    if value <= 0:
        return ""
    nearest = round(value)
    # Values within 0.05 of a whole number are shown without a decimal;
    # everything else gets exactly one decimal place.
    is_whole = abs(value - nearest) < 0.05
    return f"f/{int(nearest)}" if is_whole else f"f/{value:.1f}"
|
|
|
|
|
|
def _format_focal(value: float) -> str:
    """Render a focal length as whole millimetres ("50mm"); "" if <= 0."""
    return "" if value <= 0 else f"{int(round(value))}mm"
|
|
|
|
|
|
def _build_exposure_string(
|
|
shutter: str | None,
|
|
aperture: str | None,
|
|
iso: int | None,
|
|
) -> str | None:
|
|
"""Compose "1/125 f/8 ISO 400" from individual fields when present."""
|
|
parts: list[str] = []
|
|
if shutter:
|
|
parts.append(shutter)
|
|
if aperture:
|
|
parts.append(aperture)
|
|
if iso:
|
|
parts.append(f"ISO {iso}")
|
|
return " ".join(parts) if parts else None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# exiftool path
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _exiftool_available() -> bool:
    """Report whether an `exiftool` executable can be located on PATH."""
    located = shutil.which("exiftool")
    return located is not None
|
|
|
|
|
|
def _read_exif_via_exiftool(image: Path) -> dict[str, Any]:
    """Invoke exiftool on *image* and return a dict of normalised sidecar keys.

    exiftool's `-json` output is a list of objects; only the first entry
    is used. Missing tags are skipped silently. An empty dict is returned
    when exiftool exits non-zero or its output is not the expected JSON
    shape (a non-empty list whose first element is an object).

    Because of `-n`, exposure, aperture, GPS and dimensions arrive as JSON
    numbers. exiftool also emits textual values that merely look numeric
    (a camera model of "6300", say) as JSON numbers, so the text fields
    are coerced to `str` before being stripped.
    """
    from datetime import date  # local import: only used to validate dates

    def _text(*candidates: Any) -> str:
        """Return the first non-blank candidate as a stripped string, else ""."""
        for candidate in candidates:
            if not candidate or isinstance(candidate, bool):
                continue
            text = str(candidate).strip()
            if text:
                return text
        return ""

    def _iso_date(stamp: Any) -> str | None:
        """Turn "YYYY:MM:DD HH:MM:SS" into "YYYY-MM-DD"; None if not a real date."""
        if not isinstance(stamp, str) or not stamp:
            return None
        date_part = stamp.split(" ", 1)[0].replace(":", "-")
        if len(date_part) != 10:
            return None
        try:
            # Rejects placeholders such as "0000-00-00" that some cameras
            # write when their clock was never set.
            date.fromisoformat(date_part)
        except ValueError:
            return None
        return date_part

    command = [
        "exiftool",
        "-json",
        "-Make",
        "-Model",
        "-LensModel",
        "-LensSpec",
        "-LensInfo",
        "-ExposureTime",
        "-FNumber",
        "-ISO",
        "-FocalLength",
        "-FocalLengthIn35mmFormat",
        "-DateTimeOriginal",
        "-CreateDate",
        "-GPSLatitude",
        "-GPSLongitude",
        "-GPSLatitudeRef",
        "-GPSLongitudeRef",
        "-ImageWidth",
        "-ImageHeight",
        "-n",  # numeric output for shutter/aperture/GPS/dimensions
        str(image),
    ]
    result = subprocess.run(command, capture_output=True, text=True, check=False)
    if result.returncode != 0:
        return {}
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        return {}
    if not isinstance(data, list) or not data or not isinstance(data[0], dict):
        return {}
    raw = data[0]

    out: dict[str, Any] = {}

    # Avoid "Canon Canon EOS R5": prepend the make only when the model
    # does not already start with it.
    make = _text(raw.get("Make"))
    model = _text(raw.get("Model"))
    if make and model and not model.lower().startswith(make.lower()):
        out["camera"] = f"{make} {model}"
    elif model:
        out["camera"] = model
    elif make:
        out["camera"] = make

    lens = _text(raw.get("LensModel"), raw.get("LensSpec"), raw.get("LensInfo"))
    if lens:
        out["lens"] = lens

    shutter_secs = raw.get("ExposureTime")
    if isinstance(shutter_secs, (int, float)) and shutter_secs > 0:
        out["shutter"] = _format_shutter(float(shutter_secs))

    aperture = raw.get("FNumber")
    if isinstance(aperture, (int, float)) and aperture > 0:
        out["aperture"] = _format_aperture(float(aperture))

    iso = raw.get("ISO")
    if isinstance(iso, int) and iso > 0:
        out["iso"] = iso

    focal = raw.get("FocalLength")
    if isinstance(focal, (int, float)) and focal > 0:
        out["focal-length"] = _format_focal(float(focal))

    # Prefer DateTimeOriginal; fall back to CreateDate when the former is
    # missing or not a valid calendar date.
    for stamp in (raw.get("DateTimeOriginal"), raw.get("CreateDate")):
        captured = _iso_date(stamp)
        if captured:
            out["captured"] = captured
            break

    lat = raw.get("GPSLatitude")
    lon = raw.get("GPSLongitude")
    if isinstance(lat, (int, float)) and isinstance(lon, (int, float)):
        # exiftool with -n returns signed decimals already.
        out["geo"] = [round(float(lat), 6), round(float(lon), 6)]

    width = raw.get("ImageWidth")
    height = raw.get("ImageHeight")
    if isinstance(width, int) and width > 0:
        out["width"] = width
    if isinstance(height, int) and height > 0:
        out["height"] = height

    exposure = _build_exposure_string(
        out.get("shutter"), out.get("aperture"), out.get("iso")
    )
    if exposure:
        out["exposure"] = exposure

    return out
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pillow fallback path
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _pillow_rational(value: Any) -> float | None:
|
|
"""Pillow can return EXIF rationals as IFDRational, tuples, or floats."""
|
|
if value is None:
|
|
return None
|
|
try:
|
|
if isinstance(value, tuple) and len(value) == 2:
|
|
num, den = value
|
|
return float(num) / float(den) if den else None
|
|
return float(Fraction(value).limit_denominator())
|
|
except (TypeError, ValueError, ZeroDivisionError):
|
|
try:
|
|
return float(value)
|
|
except (TypeError, ValueError):
|
|
return None
|
|
|
|
|
|
def _gps_to_decimal(coord: Any, ref: Any) -> float | None:
|
|
"""Pillow GPS coords come as ((deg_n, deg_d), (min_n, min_d), (sec_n, sec_d))."""
|
|
if not coord:
|
|
return None
|
|
try:
|
|
deg = float(coord[0])
|
|
minutes = float(coord[1])
|
|
seconds = float(coord[2])
|
|
decimal = deg + minutes / 60.0 + seconds / 3600.0
|
|
if isinstance(ref, str) and ref in ("S", "W"):
|
|
decimal = -decimal
|
|
return decimal
|
|
except (TypeError, ValueError, IndexError):
|
|
return None
|
|
|
|
|
|
def _read_exif_via_pillow(image: Path) -> dict[str, Any]:
    """Read pixel dimensions and EXIF fields from *image* using Pillow.

    Width and height are taken from the decoded image header and are
    returned even when the file carries no EXIF. If opening the image or
    reading its EXIF raises, whatever was collected up to that point is
    returned instead of propagating the error.

    Returns a dict using the sidecar key names ("camera", "lens",
    "shutter", "aperture", "iso", "focal-length", "captured", "geo",
    "width", "height", "exposure"); absent or invalid fields are omitted.
    """
    import math  # local import: `math` is not a module-level import here
    from datetime import date  # local import: only used to validate dates

    from PIL import Image, ExifTags

    out: dict[str, Any] = {}

    # Pixel dimensions are extracted unconditionally (separate from
    # EXIF) — every readable raster file has them, even synthetic
    # placeholders or photos that have had their EXIF stripped.
    try:
        with Image.open(image) as img:
            width, height = img.size
            if isinstance(width, int) and width > 0:
                out["width"] = width
            if isinstance(height, int) and height > 0:
                out["height"] = height
            exif = img._getexif() or {}
    except Exception:  # noqa: BLE001 — corrupt EXIF should not abort the walk
        return out

    if not exif:
        return out

    # ExifTags maps tag id -> name; invert so fields can be looked up by name.
    tag_name = {v: k for k, v in ExifTags.TAGS.items()}
    gps_name = {v: k for k, v in ExifTags.GPSTAGS.items()}

    def _g(name: str) -> Any:
        return exif.get(tag_name.get(name, -1))

    def _text(value: Any) -> str:
        """Return *value* as a string without surrounding blanks or NULs."""
        if isinstance(value, bytes):
            value = value.decode("utf-8", "replace")
        if not isinstance(value, str):
            return ""
        return value.strip("\x00 \t\r\n")

    def _iso_date(stamp: Any) -> str | None:
        """Turn "YYYY:MM:DD HH:MM:SS" into "YYYY-MM-DD"; None if not a real date."""
        if not isinstance(stamp, str) or not stamp:
            return None
        date_part = stamp.split(" ", 1)[0].replace(":", "-")
        if len(date_part) != 10:
            return None
        try:
            # Rejects placeholders such as "0000-00-00".
            date.fromisoformat(date_part)
        except ValueError:
            return None
        return date_part

    # Prepend the make only when the model does not already start with it.
    make = _text(_g("Make"))
    model = _text(_g("Model"))
    if make and model and not model.lower().startswith(make.lower()):
        out["camera"] = f"{make} {model}"
    elif model:
        out["camera"] = model
    elif make:
        out["camera"] = make

    lens = _text(_g("LensModel")) or _text(_g("LensMake"))
    if lens:
        out["lens"] = lens

    shutter_secs = _pillow_rational(_g("ExposureTime"))
    if shutter_secs and shutter_secs > 0:
        out["shutter"] = _format_shutter(shutter_secs)

    aperture = _pillow_rational(_g("FNumber"))
    if aperture and aperture > 0:
        out["aperture"] = _format_aperture(aperture)

    # The ISO tag may hold a single int or a tuple of ints; use the first
    # entry of a tuple and apply the same positivity check to both forms.
    iso_raw = _g("ISOSpeedRatings") or _g("PhotographicSensitivity")
    if isinstance(iso_raw, tuple) and iso_raw:
        iso_raw = iso_raw[0]
    if isinstance(iso_raw, int) and iso_raw > 0:
        out["iso"] = iso_raw

    focal = _pillow_rational(_g("FocalLength"))
    if focal and focal > 0:
        out["focal-length"] = _format_focal(focal)

    # Prefer DateTimeOriginal; fall back to DateTime when the former is
    # missing or not a valid calendar date.
    for stamp in (_g("DateTimeOriginal"), _g("DateTime")):
        captured = _iso_date(stamp)
        if captured:
            out["captured"] = captured
            break

    gps_idx = tag_name.get("GPSInfo", -1)
    gps_info = exif.get(gps_idx) or {}
    if isinstance(gps_info, dict) and gps_info:
        # Pillow exposes GPSInfo by integer-keyed dict; remap.
        named = {gps_name.get(k, str(k)): v for k, v in gps_info.items()}
        lat = _gps_to_decimal(named.get("GPSLatitude"), named.get("GPSLatitudeRef"))
        lon = _gps_to_decimal(named.get("GPSLongitude"), named.get("GPSLongitudeRef"))
        # Zeroed GPS rationals (0/0) convert to NaN; skip those.
        if (
            lat is not None
            and lon is not None
            and math.isfinite(lat)
            and math.isfinite(lon)
        ):
            out["geo"] = [round(lat, 6), round(lon, 6)]

    exposure = _build_exposure_string(
        out.get("shutter"), out.get("aperture"), out.get("iso")
    )
    if exposure:
        out["exposure"] = exposure

    return out
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Walk + write
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _sidecar_path(image: Path) -> Path:
    """Return the sidecar path: the image's full file name plus ".exif.yaml"."""
    sidecar_name = image.name + ".exif.yaml"
    return image.with_name(sidecar_name)
|
|
|
|
|
|
def _is_stale(image: Path, sidecar: Path) -> bool:
    """Return True when *sidecar* is missing or older than *image*.

    Equal modification times count as fresh.
    """
    if sidecar.exists():
        image_mtime = image.stat().st_mtime
        sidecar_mtime = sidecar.stat().st_mtime
        return sidecar_mtime < image_mtime
    return True
|
|
|
|
|
|
def _atomic_write_yaml(path: Path, data: dict[str, Any]) -> None:
    """Write the SIDECAR_KEYS subset of *data* to *path* as YAML.

    The YAML is first written to a sibling "<path>.tmp" file, which is
    then moved over *path* with `Path.replace`, so a reader never sees a
    half-written sidecar. Keys of *data* not listed in SIDECAR_KEYS are
    dropped. If dumping or renaming fails, the temp file is removed and
    the original exception is re-raised.
    """
    # Preserve the SIDECAR_KEYS order so a manual diff is easy to read.
    ordered = {k: data[k] for k in SIDECAR_KEYS if k in data}
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            yaml.safe_dump(ordered, f, sort_keys=False, allow_unicode=True)
        tmp.replace(path)
    except BaseException:
        # Don't leave a stray temp file next to the image; a cleanup
        # failure must not mask the original error.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
|
|
|
|
|
|
def _read_one(image: Path) -> dict[str, Any]:
    """Extract sidecar fields for one image.

    exiftool is tried first when it is installed; if it is unavailable or
    yields an empty result, the Pillow reader is used instead.
    """
    extracted = _read_exif_via_exiftool(image) if _exiftool_available() else {}
    return extracted or _read_exif_via_pillow(image)
|
|
|
|
|
|
def main() -> int:
    """Walk CONTENT_DIR and (re)write an EXIF sidecar for every stale image.

    Progress and per-image errors go to stderr. A failure while checking,
    reading, or writing one image is logged and counted, and the walk
    continues with the next image.

    Returns 0 in all cases, including when CONTENT_DIR does not exist or
    when individual images failed.
    """
    if not CONTENT_DIR.exists():
        print(f"extract-exif: {CONTENT_DIR} does not exist — skipping.", file=sys.stderr)
        return 0

    using_exiftool = _exiftool_available()
    print(
        "extract-exif: source ="
        f" {'exiftool' if using_exiftool else 'Pillow (exiftool not installed)'}",
        file=sys.stderr,
    )

    written = 0
    skipped = 0
    failed = 0

    for image in sorted(CONTENT_DIR.rglob("*")):
        if image.suffix.lower() not in IMAGE_EXTS:
            continue
        # Skip hidden files. (A ".tmp" name can never get here: its
        # suffix is not in IMAGE_EXTS.)
        if image.name.startswith("."):
            continue
        # Directories or broken symlinks that happen to carry an image
        # suffix are not photographs.
        if not image.is_file():
            continue

        sidecar = _sidecar_path(image)
        try:
            if not _is_stale(image, sidecar):
                skipped += 1
                continue
            data = _read_one(image)
            # Always write a sidecar — even if it's empty — so the consumer
            # doesn't need to branch on existence. An empty sidecar is the
            # explicit signal that "we tried; nothing to extract" (typical
            # for film scans).
            _atomic_write_yaml(sidecar, data)
        except Exception as e:  # noqa: BLE001 — keep walking
            print(f"extract-exif: {image}: {e}", file=sys.stderr)
            failed += 1
            continue
        written += 1

    print(
        f"extract-exif: {written} written, {skipped} skipped, {failed} failed",
        file=sys.stderr,
    )
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|