From d29f4f1b78c47c712babccb691198fb8cd418ea7 Mon Sep 17 00:00:00 2001
From: Levi Neuwirth <ln@levineuwirth.org>
Date: Tue, 14 Apr 2026 20:32:29 -0400
Subject: [PATCH] benchmarking and profiling scaffold

---
 CHANGELOG.md                                | 121 +++-
 docs/api/benchmark.md                       |   3 +
 docs/api/segment.md                         |   3 +
 docs/architecture.md                        |  65 +-
 mkdocs.yml                                  |   2 +
 pyproject.toml                              |   6 +
 src/neuropose/analyzer/__init__.py          |  24 +
 src/neuropose/analyzer/segment.py           | 590 ++++++++++++++++++
 src/neuropose/benchmark.py                  | 320 ++++++++++
 src/neuropose/cli.py                        | 647 +++++++++++++++++++-
 src/neuropose/estimator.py                  | 133 +++-
 src/neuropose/io.py                         | 401 +++++++++++-
 tests/integration/test_joint_names_drift.py |  60 ++
 tests/unit/test_analyzer_segment.py         | 431 +++++++++++++
 tests/unit/test_benchmark.py                | 314 ++++++++++
 tests/unit/test_cli.py                      | 406 +++++++++++-
 tests/unit/test_estimator.py                | 115 +++-
 tests/unit/test_io.py                       | 246 ++++++++
 uv.lock                                     |  18 +
 19 files changed, 3861 insertions(+), 44 deletions(-)
 create mode 100644 docs/api/benchmark.md
 create mode 100644 docs/api/segment.md
 create mode 100644 src/neuropose/analyzer/segment.py
 create mode 100644 src/neuropose/benchmark.py
 create mode 100644 tests/integration/test_joint_names_drift.py
 create mode 100644 tests/unit/test_analyzer_segment.py
 create mode 100644 tests/unit/test_benchmark.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d8f230a..d642888 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,8 @@ be split into per-release sections once tagging begins.
   and `pyproject.toml` with full project metadata, classifiers, and
   URL pointers. The runtime TensorFlow dependency is pinned to
   `tensorflow>=2.16,<3.0` — see *Changed* below for the rationale.
+  `psutil>=5.9` is a runtime dependency used by the estimator's
+  always-on `PerformanceMetrics` collection to sample peak RSS.
 - `[project.optional-dependencies].analysis` extra for fastdtw, scipy,
   scikit-learn, and sktime — install via `pip install neuropose[analysis]`.
 - `[project.optional-dependencies].metal` extra pulling
@@ -82,23 +84,48 @@ be split into per-release sections once tagging begins.
 - **`neuropose.io`** — validated prediction schemas:
   `FramePrediction` (frozen), `VideoMetadata` (frame count, fps,
   width, height), `VideoPredictions` (metadata envelope + frames
-  mapping), `JobResults`, `JobStatus` enum, `JobStatusEntry` (with a
-  structured `error` field), and `StatusFile`. Load and save helpers
-  with an atomic tmp-file-then-rename pattern for every state file.
-  `load_status` is deliberately crash-resilient: missing, corrupt,
-  or non-mapping JSON returns an empty `StatusFile` rather than
-  raising.
+  mapping + optional `segmentations` field), `JobResults`,
+  `JobStatus` enum, `JobStatusEntry` (with a structured `error`
+  field), and `StatusFile`. Performance schema: frozen
+  `PerformanceMetrics` carrying per-call timings
+  (`model_load_seconds`, `total_seconds`, `per_frame_latencies_ms`),
+  `peak_rss_mb`, `active_device`, `tensorflow_metal_active`, and
+  `tensorflow_version`; `BenchmarkResult` pairing a discarded
+  `warmup_pass` with `measured_passes` and a `BenchmarkAggregate`
+  (mean / p50 / p95 / p99 per-frame latency, mean throughput, max
+  peak RSS); optional `CpuComparisonResult` nested inside
+  `BenchmarkResult` for `--compare-cpu` runs, carrying both
+  device aggregates, the throughput speedup, and the
+  maximum-element-wise `poses3d` divergence in millimetres.
+  Segmentation schema: frozen `Segment` windows (`start`, `end`,
+  `peak`), `SegmentationConfig` (with a `method` version literal,
+  e.g. `valley_to_valley_v1`), a discriminated `ExtractorSpec`
+  union over `JointAxisExtractor`, `JointPairDistanceExtractor`,
+  `JointSpeedExtractor`, and `JointAngleExtractor`, and
+  `Segmentation` pairing a config with its segments so on-disk
+  results are self-describing. Load and save helpers with an atomic
+  tmp-file-then-rename pattern for every state file.
+  `load_benchmark_result` / `save_benchmark_result` follow the same
+  atomic pattern. `load_status` is deliberately crash-resilient:
+  missing, corrupt, or non-mapping JSON returns an empty
+  `StatusFile` rather than raising. Legacy predictions files
+  without the `segmentations` field deserialize cleanly to an
+  empty mapping.
 - **`neuropose.estimator`** — `Estimator` class that streams frames
   directly from OpenCV into the model, with no intermediate write-to-
   disk-then-read-back-as-PNG round trip. Returns a typed
   `ProcessVideoResult` containing a validated `VideoPredictions`
-  object; does not touch the filesystem. Constructor accepts an
-  injected model for testability; `load_model()` delegates to
-  `neuropose._model.load_metrabs_model()`. Typed exception hierarchy:
-  `EstimatorError`, `ModelNotLoadedError`, `VideoDecodeError`.
-  Optional per-frame `progress` callback for long videos. Frame
-  identifier convention is `frame_000000` (six-digit zero-pad, no
-  extension — no file is implied).
+  object and an always-populated `PerformanceMetrics` bundle (per-
+  frame latency in ms, total wall clock, peak RSS via `psutil`,
+  active TF device string, `tensorflow-metal` detection, TF
+  version, and model load time when the caller went through
+  `load_model()`). Does not touch the filesystem. Constructor
+  accepts an injected model for testability; `load_model()`
+  delegates to `neuropose._model.load_metrabs_model()`. Typed
+  exception hierarchy: `EstimatorError`, `ModelNotLoadedError`,
+  `VideoDecodeError`. Optional per-frame `progress` callback for
+  long videos. Frame identifier convention is `frame_000000`
+  (six-digit zero-pad, no extension — no file is implied).
 - **`neuropose.visualize`** — `visualize_predictions()` for per-frame
   2D + 3D overlay rendering. `matplotlib.use("Agg")` is called inside
   the function rather than at module import, so `import neuropose.visualize`
@@ -127,6 +154,21 @@ be split into per-release sections once tagging begins.
   download was truncated). Post-load interface check for
   `detect_poses`, `per_skeleton_joint_names`, and
   `per_skeleton_joint_edges`.
+- **`neuropose.benchmark`** — multi-pass inference benchmarking for
+  a single video. `run_benchmark()` runs `process_video` N times
+  (default 5), always discards the first pass as warmup (graph
+  compilation, file-system cache warmup), and aggregates the
+  remaining `PerformanceMetrics` into a `BenchmarkAggregate` with
+  mean / p50 / p95 / p99 per-frame latency, mean throughput, and
+  max peak RSS. `capture_reference=True` additionally preserves the
+  last measured pass's `VideoPredictions` in memory so the
+  `--compare-cpu` CLI flow can diff the `poses3d` arrays between a
+  GPU and CPU run. `compute_poses3d_divergence()` computes the
+  maximum element-wise absolute difference (in millimetres) between
+  two prediction sets, skipping frames with mismatched detection
+  counts and surfacing the `frame_count_compared` so callers can
+  tell if the number is trustworthy. `format_benchmark_report()`
+  renders a human-readable summary for CLI stdout.
 - **`neuropose.analyzer`** — post-processing subpackage with lazy
   imports for the heavy dependencies:
   - `analyzer.dtw` — three DTW entry points (`dtw_all`,
@@ -139,16 +181,53 @@ be split into per-release sections once tagging begins.
     degenerate vectors), `extract_feature_statistics`
     (`FeatureStatistics` frozen dataclass), and a `find_peaks` thin
     wrapper around `scipy.signal.find_peaks`.
+  - `analyzer.segment` — repetition segmentation for trials in
+    which a subject performs the same movement several times. A
+    three-layer API: `segment_by_peaks` (pure 1D
+    valley-to-valley peak detection on a generic signal),
+    `segment_predictions` (top-level entry point taking a
+    `VideoPredictions` plus an `ExtractorSpec`, converting
+    time-based parameters to frame counts via `metadata.fps`), and
+    `slice_predictions` (split a `VideoPredictions` into one per
+    detected repetition with re-keyed frame names and a rewritten
+    `frame_count`). Ships four extractor factories —
+    `joint_axis`, `joint_pair_distance`, `joint_speed`, and
+    `joint_angle` — plus a `JOINT_NAMES` constant for the
+    berkeley_mhad_43 skeleton with a `joint_index(name)` lookup,
+    so post-processing callers can resolve `"rwri"` → integer
+    without loading the MeTRAbs SavedModel. A matching integration
+    test (`tests/integration/test_joint_names_drift.py`, marked
+    `slow`) loads the real model and asserts the constant still
+    matches, so any upstream skeleton drift fails CI.
 - **`neuropose.cli`** — Typer-based command-line interface with
-  three subcommands: `watch` (run the daemon), `process <video>`
-  (run the estimator on a single video), and `analyze <results>`
-  (stub). Global options `--config/-c`, `--verbose/-v`, `--quiet/-q`,
+  five subcommands: `watch` (run the daemon), `process <video>`
+  (run the estimator on a single video), `segment <results>`
+  (post-hoc repetition segmentation — loads a JobResults or a
+  single VideoPredictions, runs
+  `neuropose.analyzer.segment.segment_predictions` with the chosen
+  extractor and thresholds, and atomically writes the file back
+  with the new segmentation attached under `--name`),
+  `benchmark <video>` (multi-pass inference benchmark — runs
+  `--repeats N` passes with a discarded first pass and
+  `--warmup-frames M` excluded from the head of each measured
+  pass, reports aggregates to stdout, and optionally writes a
+  structured `BenchmarkResult` to `--output`. Supports
+  `--compare-cpu` which spawns a `--force-cpu` subprocess, diffs
+  the resulting `poses3d` arrays, and reports throughput speedup
+  and max divergence in mm — the missing Apple Silicon numerical
+  verification answer from `RESEARCH.md`), and
+  `analyze <results>` (stub). The `segment` subcommand accepts
+  joint specifiers as either berkeley_mhad_43 names (`lwri`,
+  `rwri`, …) or integer indices, and refuses to overwrite an
+  existing segmentation of the same name without `--force`.
+  Global options `--config/-c`, `--verbose/-v`, `--quiet/-q`,
   `--version`. Structured error handling turns expected exceptions
-  (`FileNotFoundError` on config, `ValidationError`, `AlreadyRunningError`,
-  `NotImplementedError`, `KeyboardInterrupt`) into clear stderr
-  messages and distinct exit codes (`EXIT_OK=0`, `EXIT_USAGE=2`,
-  `EXIT_PENDING=3`, `EXIT_INTERRUPTED=130`). The CLI entry point is
-  wired in `[project.scripts]` as `neuropose = "neuropose.cli:run"`.
+  (`FileNotFoundError` on config, `ValidationError`,
+  `AlreadyRunningError`, `NotImplementedError`,
+  `KeyboardInterrupt`) into clear stderr messages and distinct
+  exit codes (`EXIT_OK=0`, `EXIT_USAGE=2`, `EXIT_PENDING=3`,
+  `EXIT_INTERRUPTED=130`). The CLI entry point is wired in
+  `[project.scripts]` as `neuropose = "neuropose.cli:run"`.
 
 #### Documentation
 
diff --git a/docs/api/benchmark.md b/docs/api/benchmark.md
new file mode 100644
index 0000000..42a5605
--- /dev/null
+++ b/docs/api/benchmark.md
@@ -0,0 +1,3 @@
+# `neuropose.benchmark`
+
+::: neuropose.benchmark
diff --git a/docs/api/segment.md b/docs/api/segment.md
new file mode 100644
index 0000000..d56dfb0
--- /dev/null
+++ b/docs/api/segment.md
@@ -0,0 +1,3 @@
+# `neuropose.analyzer.segment`
+
+::: neuropose.analyzer.segment
diff --git a/docs/architecture.md b/docs/architecture.md
index c153bd3..aac4e78 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -30,7 +30,8 @@ them are defined by validated pydantic schemas in
 ### estimator
 
 **Role:** pure inference library. Given a video path and a MeTRAbs
-model, produces a validated `VideoPredictions` object.
+model, produces a validated `VideoPredictions` object plus a
+`PerformanceMetrics` bundle.
 
 **Does NOT handle:** job directories, status files, polling, locking,
 signal handling, visualization, or anywhere-to-save decisions. It is a
@@ -39,11 +40,34 @@ library, not a daemon.
 The estimator streams frames directly from OpenCV into the model — no
 intermediate write-to-disk-then-read-back-as-PNG round trip like the
 previous prototype had. `process_video()` returns a typed
-`ProcessVideoResult` containing the predictions and does not touch the
-filesystem unless the caller explicitly asks it to save the result.
+`ProcessVideoResult` containing the predictions and an
+always-populated `PerformanceMetrics` (per-frame latency, peak RSS,
+total wall clock, active TF device, TF version, `tensorflow-metal`
+detection, and model load time when the caller went through
+`load_model()`). It does not touch the filesystem unless the caller
+explicitly asks it to save the result.
 
 See [`neuropose.estimator`](api/estimator.md) for the API reference.
 
+### benchmark
+
+**Role:** multi-pass inference benchmarking layered on top of the
+estimator. `run_benchmark()` calls `process_video` N times, discards
+the first pass as warmup, and aggregates the remaining
+`PerformanceMetrics` into a `BenchmarkAggregate` with distributional
+statistics (mean / p50 / p95 / p99 per-frame latency, mean
+throughput, max peak RSS).
+
+The benchmark is exposed via the `neuropose benchmark <video>` CLI
+subcommand. Its `--compare-cpu` flag spawns a subprocess with GPU
+visibility hidden (via `tf.config.set_visible_devices([], "GPU")`
+before any TF op) so a Metal-backed Apple Silicon run can be diffed
+against a CPU baseline — both the throughput speedup and the maximum
+element-wise `poses3d` divergence in millimetres are surfaced in the
+output. This is the "is `tensorflow-metal` producing correct
+numerics?" check that `RESEARCH.md`'s TensorFlow-version-compatibility
+section leaves open for v0.1.
+
 ### interfacer
 
 **Role:** job-lifecycle daemon. Watches `input_dir` for new job
@@ -74,16 +98,37 @@ Key guarantees:
 
 See [`neuropose.interfacer`](api/interfacer.md) for the API reference.
 
-### analyzer (pending commit 10)
+### analyzer
 
 **Role:** post-processing. Takes a `results.json` and produces analysis
-output (DTW comparisons, joint-angle features, classification). Each
-piece is a pure function of the predictions, so the module is a set of
-testable utilities rather than a daemon.
+output (DTW comparisons, joint-angle features, repetition segmentation,
+classification). Each piece is a pure function of the predictions, so
+the module is a set of testable utilities rather than a daemon.
 
-Pending the commit-10 rewrite. The previous prototype's `analyzer.py`
-was non-functional (it had imports that could not resolve and
-infinite-recursion bugs) and is not being ported forward.
+Three submodules ship today:
+
+- `analyzer.features` — `predictions_to_numpy`, normalization,
+  padding, joint angles, summary statistics, and a thin
+  `scipy.signal.find_peaks` wrapper.
+- `analyzer.dtw` — three DTW entry points (`dtw_all`, `dtw_per_joint`,
+  `dtw_relation`) over `fastdtw`, with a frozen `DTWResult` dataclass.
+  See `RESEARCH.md` for the ongoing methodology discussion.
+- `analyzer.segment` — **repetition segmentation**. Given a
+  `VideoPredictions` of a trial in which the subject performs the
+  same movement several times (e.g. lifting a cup repeatedly), the
+  module detects the individual repetitions as
+  `[start, peak, end)` windows via valley-to-valley peak detection
+  on a clinically chosen 1D signal. The signal is one of four
+  extractor variants (`joint_axis`, `joint_pair_distance`,
+  `joint_speed`, `joint_angle`), and the produced `Segmentation`
+  carries its own `SegmentationConfig` so the on-disk
+  representation is self-describing. Segmentation is exposed both
+  as a Python API and as the `neuropose segment` CLI subcommand,
+  which runs post-hoc against an existing `results.json` — the
+  daemon stays a pure inference daemon.
+
+Classification wrappers on top of `sktime` are deliberately not
+shipped yet; see `RESEARCH.md` for the plan.
 
 ## Data flow
 
diff --git a/mkdocs.yml b/mkdocs.yml
index 71966cd..076453e 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -94,6 +94,8 @@ nav:
       - neuropose.estimator: api/estimator.md
       - neuropose.interfacer: api/interfacer.md
       - neuropose.io: api/io.md
+      - neuropose.benchmark: api/benchmark.md
+      - neuropose.analyzer.segment: api/segment.md
       - neuropose.visualize: api/visualize.md
   - Development: development.md
   - Deployment: deployment.md
diff --git a/pyproject.toml b/pyproject.toml
index b38142f..8134f23 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,6 +55,12 @@ dependencies = [
   "numpy>=1.26",
   "opencv-python-headless>=4.9",
   "matplotlib>=3.8",
+  # psutil powers the peak-RSS and process-memory readings captured by
+  # PerformanceMetrics in neuropose.io. It is a cross-platform pure-Python
+  # wrapper around /proc (Linux), mach (macOS), and Win32 APIs with a
+  # small C extension; brought in at runtime because metrics are always
+  # collected, not an optional feature.
+  "psutil>=5.9",
   "tensorflow>=2.16,<3.0",
   "tensorflow-hub>=0.16",
 ]
diff --git a/src/neuropose/analyzer/__init__.py b/src/neuropose/analyzer/__init__.py
index 4c97557..ec7e811 100644
--- a/src/neuropose/analyzer/__init__.py
+++ b/src/neuropose/analyzer/__init__.py
@@ -42,8 +42,23 @@ from neuropose.analyzer.features import (
     pad_sequences,
     predictions_to_numpy,
 )
+from neuropose.analyzer.segment import (
+    JOINT_INDEX,
+    JOINT_NAMES,
+    extract_signal,
+    joint_angle,
+    joint_axis,
+    joint_index,
+    joint_pair_distance,
+    joint_speed,
+    segment_by_peaks,
+    segment_predictions,
+    slice_predictions,
+)
 
 __all__ = [
+    "JOINT_INDEX",
+    "JOINT_NAMES",
     "DTWResult",
     "FeatureStatistics",
     "dtw_all",
@@ -51,8 +66,17 @@ __all__ = [
     "dtw_relation",
     "extract_feature_statistics",
     "extract_joint_angles",
+    "extract_signal",
     "find_peaks",
+    "joint_angle",
+    "joint_axis",
+    "joint_index",
+    "joint_pair_distance",
+    "joint_speed",
     "normalize_pose_sequence",
     "pad_sequences",
     "predictions_to_numpy",
+    "segment_by_peaks",
+    "segment_predictions",
+    "slice_predictions",
 ]
diff --git a/src/neuropose/analyzer/segment.py b/src/neuropose/analyzer/segment.py
new file mode 100644
index 0000000..232a725
--- /dev/null
+++ b/src/neuropose/analyzer/segment.py
@@ -0,0 +1,590 @@
+"""Repetition segmentation for pose sequences.
+
+Given a :class:`~neuropose.io.VideoPredictions` of a trial in which the
+subject performs the same movement several times (e.g. lifting a cup
+repeatedly), this module detects the individual repetitions and returns
+them as a :class:`~neuropose.io.Segmentation` that can be persisted back
+into ``results.json``.
+
+The detection is a two-step pipeline:
+
+1. **Extract a 1D segmentation signal** from the per-frame pose array,
+   using one of the :class:`~neuropose.io.ExtractorSpec` variants
+   (:class:`~neuropose.io.JointAxisExtractor`,
+   :class:`~neuropose.io.JointPairDistanceExtractor`,
+   :class:`~neuropose.io.JointSpeedExtractor`, or
+   :class:`~neuropose.io.JointAngleExtractor`).
+2. **Walk peaks to their neighbouring valleys** to form one ``[start,
+   peak, end)`` window per repetition. The "valley-to-valley" method
+   assumes the subject comes to rest between repetitions — the recording
+   protocol for NeuroPose's clinical use cases makes this assumption
+   reasonable; see :mod:`neuropose.io.SegmentationConfig.method` for the
+   version stamp that travels with persisted segmentations.
+
+Three layers of API are provided, in increasing order of convenience:
+
+- :func:`segment_by_peaks` — pure 1D signal segmentation, no pose or
+  metadata awareness. All parameters are in sample counts.
+- :func:`segment_predictions` — the top-level entry point. Takes a
+  :class:`~neuropose.io.VideoPredictions` plus an
+  :class:`~neuropose.io.ExtractorSpec`, converts time-based parameters
+  to frame counts using ``metadata.fps``, and returns a full
+  :class:`~neuropose.io.Segmentation` ready to attach to the predictions.
+- :func:`slice_predictions` — split a :class:`~neuropose.io.VideoPredictions`
+  into one per-repetition :class:`~neuropose.io.VideoPredictions`,
+  useful when downstream code wants per-rep objects rather than windows
+  over the original.
+
+The four :class:`~neuropose.io.ExtractorSpec` variants have convenience
+factories (:func:`joint_axis`, :func:`joint_pair_distance`,
+:func:`joint_speed`, :func:`joint_angle`) re-exported at the subpackage
+root so that Python callers never have to import the pydantic classes
+directly::
+
+    from neuropose.analyzer import (
+        segment_predictions,
+        joint_pair_distance,
+        JOINT_INDEX,
+    )
+
+    seg = segment_predictions(
+        predictions,
+        joint_pair_distance(JOINT_INDEX["lwri"], JOINT_INDEX["rwri"]),
+        min_distance_seconds=0.5,
+        min_prominence=50.0,
+    )
+
+Dependency note
+---------------
+This module requires :mod:`scipy` for peak detection, which is part of
+the ``analysis`` optional extra. The import is lazy so that
+``import neuropose.analyzer.segment`` succeeds even when scipy is not
+installed; a clear :class:`ImportError` surfaces at the first call to
+:func:`segment_by_peaks` or :func:`segment_predictions`.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import numpy as np
+
+from neuropose.analyzer.features import predictions_to_numpy
+from neuropose.io import (
+    ExtractorSpec,
+    JointAngleExtractor,
+    JointAxisExtractor,
+    JointPairDistanceExtractor,
+    JointSpeedExtractor,
+    Segment,
+    Segmentation,
+    SegmentationConfig,
+    VideoMetadata,
+    VideoPredictions,
+)
+
+# ---------------------------------------------------------------------------
+# berkeley_mhad_43 joint names
+# ---------------------------------------------------------------------------
+#
+# The MeTRAbs SavedModel we pin in ``neuropose._model`` exposes a 43-joint
+# skeleton named ``berkeley_mhad_43``. The names below are captured
+# verbatim from that model and committed as a constant so that
+# post-processing code (CLI flags, analyzer helpers) can translate
+# human-readable joint names into indices without having to load a
+# multi-gigabyte TensorFlow model just to resolve ``"rwri"``.
+#
+# An integration test (:mod:`tests.integration.test_joint_names_match_model`)
+# asserts that this tuple still matches what the loaded model reports, so
+# any upstream change in the MeTRAbs skeleton will fail CI the next time
+# the slow tests run. When that happens, the expected fix is:
+#
+#   1. Update this tuple in the same commit that bumps the model pin in
+#      ``neuropose._model``, and
+#   2. Cross-check any CLI or docs that embed hardcoded joint names.
+
+JOINT_NAMES: tuple[str, ...] = (
+    "head",
+    "lhead",
+    "rhead",
+    "rback",
+    "backl",
+    "backt",
+    "lback",
+    "lside",
+    "bell",
+    "chest",
+    "rside",
+    "lsho1",
+    "lsho2",
+    "larm",
+    "lelb",
+    "lwri",
+    "lhan1",
+    "lhan2",
+    "lhan3",
+    "rsho1",
+    "rsho2",
+    "rarm",
+    "relb",
+    "rwri",
+    "rhan1",
+    "rhan2",
+    "rhan3",
+    "lhipb",
+    "lhipf",
+    "lhipl",
+    "lleg",
+    "lkne",
+    "lank",
+    "lhee",
+    "lfoo",
+    "rhipb",
+    "rhipf",
+    "rhipl",
+    "rleg",
+    "rkne",
+    "rank",
+    "rhee",
+    "rfoo",
+)
+
+JOINT_INDEX: dict[str, int] = {name: idx for idx, name in enumerate(JOINT_NAMES)}
+
+
+def joint_index(name: str) -> int:
+    """Return the integer index of ``name`` in the berkeley_mhad_43 skeleton.
+
+    Parameters
+    ----------
+    name
+        Joint name as it appears in the MeTRAbs SavedModel
+        ``per_skeleton_joint_names["berkeley_mhad_43"]`` tensor — for
+        example ``"rwri"`` for the right wrist or ``"lkne"`` for the
+        left knee.
+
+    Returns
+    -------
+    int
+        The 0-based index into a ``(frames, 43, 3)`` pose sequence.
+
+    Raises
+    ------
+    KeyError
+        If ``name`` is not one of the 43 known joint names. The error
+        message lists all valid names to make recovery obvious.
+    """
+    try:
+        return JOINT_INDEX[name]
+    except KeyError:
+        raise KeyError(
+            f"unknown joint name: {name!r}. Known names: {sorted(JOINT_INDEX)}"
+        ) from None
+
+
+# ---------------------------------------------------------------------------
+# Extractor factories
+# ---------------------------------------------------------------------------
+#
+# Thin constructors for the four ExtractorSpec variants. Callers *could*
+# instantiate the pydantic models directly, but the factories read a bit
+# better at call sites and match the ergonomic pattern set by the rest
+# of the analyzer subpackage (e.g. ``joint_pair_distance(j1, j2)`` reads
+# like a function call rather than a schema construction).
+
+
+def joint_axis(joint: int, axis: int, *, invert: bool = False) -> JointAxisExtractor:
+    """Construct a :class:`~neuropose.io.JointAxisExtractor`.
+
+    Parameters
+    ----------
+    joint
+        Joint index into the ``(frames, J, 3)`` pose sequence.
+    axis
+        Spatial axis: ``0`` for x, ``1`` for y, ``2`` for z.
+    invert
+        If ``True``, negate the signal so valleys become peaks. Useful
+        when the movement of interest is a *decrease* in the selected
+        coordinate (e.g. a joint that dips during the repetition).
+    """
+    return JointAxisExtractor(joint=joint, axis=axis, invert=invert)
+
+
+def joint_pair_distance(j1: int, j2: int) -> JointPairDistanceExtractor:
+    """Construct a :class:`~neuropose.io.JointPairDistanceExtractor`.
+
+    The two joints must be distinct; pydantic validation enforces this.
+    """
+    return JointPairDistanceExtractor(joints=(j1, j2))
+
+
+def joint_speed(joint: int) -> JointSpeedExtractor:
+    """Construct a :class:`~neuropose.io.JointSpeedExtractor`."""
+    return JointSpeedExtractor(joint=joint)
+
+
+def joint_angle(a: int, b: int, c: int) -> JointAngleExtractor:
+    """Construct a :class:`~neuropose.io.JointAngleExtractor`.
+
+    The angle is computed at joint ``b`` between the vectors ``(a - b)``
+    and ``(c - b)``.
+    """
+    return JointAngleExtractor(triplet=(a, b, c))
+
+
+# ---------------------------------------------------------------------------
+# Signal extraction
+# ---------------------------------------------------------------------------
+
+
+def extract_signal(sequence: np.ndarray, spec: ExtractorSpec) -> np.ndarray:
+    """Reduce a pose sequence to a 1D segmentation signal per the ``spec``.
+
+    Dispatches on the discriminator ``kind`` of ``spec``. All variants
+    produce a 1D array whose length equals ``sequence.shape[0]``, so the
+    segmentation engine can treat the output uniformly regardless of
+    which extractor produced it.
+
+    Parameters
+    ----------
+    sequence
+        Array of shape ``(frames, joints, 3)`` — the output of
+        :func:`neuropose.analyzer.features.predictions_to_numpy`.
+    spec
+        One of the :class:`~neuropose.io.ExtractorSpec` variants.
+
+    Returns
+    -------
+    numpy.ndarray
+        A 1D ``float`` array of length ``frames``.
+
+    Raises
+    ------
+    ValueError
+        If ``sequence`` is not a ``(frames, joints, 3)`` array, if any
+        joint index in ``spec`` is out of range, or if
+        ``JointSpeedExtractor`` is applied to a single-frame sequence.
+    """
+    if sequence.ndim != 3 or sequence.shape[-1] != 3:
+        raise ValueError(f"expected (frames, joints, 3); got shape {sequence.shape}")
+    num_frames, num_joints, _ = sequence.shape
+
+    def _check_joint(idx: int) -> None:
+        if not (0 <= idx < num_joints):
+            raise ValueError(f"joint index {idx} out of range [0, {num_joints})")
+
+    if isinstance(spec, JointAxisExtractor):
+        _check_joint(spec.joint)
+        signal = sequence[:, spec.joint, spec.axis].astype(float, copy=True)
+        if spec.invert:
+            signal = -signal
+        return signal
+
+    if isinstance(spec, JointPairDistanceExtractor):
+        j1, j2 = spec.joints
+        _check_joint(j1)
+        _check_joint(j2)
+        delta = sequence[:, j1, :] - sequence[:, j2, :]
+        return np.linalg.norm(delta, axis=1).astype(float, copy=False)
+
+    if isinstance(spec, JointSpeedExtractor):
+        _check_joint(spec.joint)
+        if num_frames < 2:
+            raise ValueError("joint_speed requires at least two frames")
+        diffs = np.diff(sequence[:, spec.joint, :], axis=0)
+        mags = np.linalg.norm(diffs, axis=1)
+        # Pad the first frame with 0 so the signal length matches the
+        # input frame count; segmentation indices then line up with the
+        # original frame indices without an off-by-one.
+        return np.concatenate([[0.0], mags]).astype(float, copy=False)
+
+    if isinstance(spec, JointAngleExtractor):
+        # Deferred import to avoid a circular dependency — features.py
+        # and segment.py both live under analyzer/, and extract_joint_angles
+        # is defined in features.py.
+        from neuropose.analyzer.features import extract_joint_angles
+
+        a, b, c = spec.triplet
+        _check_joint(a)
+        _check_joint(b)
+        _check_joint(c)
+        return extract_joint_angles(sequence, [(a, b, c)])[:, 0].astype(float, copy=False)
+
+    raise TypeError(f"unknown extractor kind: {type(spec).__name__}")
+
+
+# ---------------------------------------------------------------------------
+# Layer 1: pure 1D segmentation
+# ---------------------------------------------------------------------------
+
+
+def segment_by_peaks(
+    signal: np.ndarray,
+    *,
+    min_distance: int | None = None,
+    min_prominence: float | None = None,
+    min_height: float | None = None,
+    pad: int = 0,
+) -> list[Segment]:
+    """Segment a 1D signal into one window per detected peak.
+
+    Implements the ``valley_to_valley_v1`` method: each peak found by
+    :func:`scipy.signal.find_peaks` is walked outward to the nearest
+    valley on each side (a local minimum of the signal), and the
+    resulting ``[start, end)`` window is reported as one
+    :class:`~neuropose.io.Segment`. If the signal has no valley before
+    the first peak the segment starts at frame 0; likewise a trailing
+    peak without a following valley extends to the end of the signal.
+
+    Parameters
+    ----------
+    signal
+        1D numpy array of segmentation feature values (e.g. wrist
+        height across frames).
+    min_distance, min_prominence, min_height
+        Forwarded as ``distance``, ``prominence`` and ``height`` to
+        :func:`scipy.signal.find_peaks`. Use these to reject noise; the
+        defaults are permissive.
+    pad
+        Number of samples to extend each resulting segment on both
+        sides. Segments are clamped to ``[0, len(signal)]``; adjacent
+        padded segments may therefore overlap.
+
+    Returns
+    -------
+    list[Segment]
+        One :class:`~neuropose.io.Segment` per detected repetition, in
+        ascending order of peak index. Empty if no peaks were found.
+
+    Raises
+    ------
+    ValueError
+        If ``signal`` is not 1D or if ``pad`` is negative.
+    ImportError
+        If :mod:`scipy` is not installed. The error message points at
+        the ``analysis`` optional extra.
+    """
+    if signal.ndim != 1:
+        raise ValueError(f"expected 1D array; got shape {signal.shape}")
+    if pad < 0:
+        raise ValueError(f"pad must be non-negative; got {pad}")
+
+    try:
+        from scipy.signal import find_peaks as _sp_find_peaks
+    except ImportError as exc:
+        raise ImportError(
+            "neuropose.analyzer.segment.segment_by_peaks requires scipy. "
+            "Install it with: pip install neuropose[analysis]"
+        ) from exc
+
+    peak_kwargs: dict[str, float | int] = {}
+    # scipy requires ``distance >= 1``; treat ``None`` or 0 as "no constraint".
+    if min_distance is not None and min_distance >= 1:
+        peak_kwargs["distance"] = min_distance
+    if min_prominence is not None:
+        peak_kwargs["prominence"] = min_prominence
+    if min_height is not None:
+        peak_kwargs["height"] = min_height
+
+    peaks, _ = _sp_find_peaks(signal, **peak_kwargs)
+    if len(peaks) == 0:
+        return []
+
+    # Valleys = peaks of the negated signal. No extra filters — we want
+    # every local minimum as a candidate boundary, and we'll pick the
+    # nearest one on each side of each qualifying peak.
+    valleys, _ = _sp_find_peaks(-signal)
+
+    n = int(signal.shape[0])
+    segments: list[Segment] = []
+    for peak in peaks:
+        peak_idx = int(peak)
+        left = valleys[valleys < peak_idx]
+        right = valleys[valleys > peak_idx]
+        start = int(left.max()) if left.size > 0 else 0
+        # ``end`` is exclusive; include the trailing valley frame so
+        # the segment captures the return-to-rest the clinician cares
+        # about.
+        end = int(right.min()) + 1 if right.size > 0 else n
+
+        start = max(0, start - pad)
+        end = min(n, end + pad)
+        segments.append(Segment(start=start, end=end, peak=peak_idx))
+
+    return segments
+
+
+# ---------------------------------------------------------------------------
+# Layer 2: pose-aware convenience
+# ---------------------------------------------------------------------------
+
+
+def segment_predictions(
+    predictions: VideoPredictions,
+    extractor: ExtractorSpec,
+    *,
+    person_index: int = 0,
+    min_distance_seconds: float | None = None,
+    min_prominence: float | None = None,
+    min_height: float | None = None,
+    pad_seconds: float = 0.0,
+) -> Segmentation:
+    """Segment a :class:`~neuropose.io.VideoPredictions` into repetitions.
+
+    This is the top-level entry point for post-hoc segmentation. It:
+
+    1. Reduces ``predictions`` to a ``(frames, joints, 3)`` numpy array
+       via :func:`~neuropose.analyzer.features.predictions_to_numpy`.
+    2. Extracts a 1D segmentation signal using ``extractor``.
+    3. Converts the time-based parameters (``min_distance_seconds``,
+       ``pad_seconds``) to frame counts using
+       ``predictions.metadata.fps``.
+    4. Delegates to :func:`segment_by_peaks`.
+    5. Wraps the result in a :class:`~neuropose.io.Segmentation` whose
+       :class:`~neuropose.io.SegmentationConfig` carries the original
+       (time-based) parameters and the extractor spec, so the output
+       is self-describing when persisted.
+
+    Parameters
+    ----------
+    predictions
+        Per-video predictions to segment. The ``metadata.fps`` field
+        is used to convert seconds to frames; callers with a video of
+        unknown frame rate should fall back to :func:`segment_by_peaks`
+        directly.
+    extractor
+        Serializable extractor spec — typically produced by one of the
+        convenience factories (:func:`joint_axis`,
+        :func:`joint_pair_distance`, :func:`joint_speed`,
+        :func:`joint_angle`).
+    person_index
+        Which detected person to extract from each frame. Defaults to
+        0 (the first detected person), matching the single-subject
+        clinical case.
+    min_distance_seconds, min_prominence, min_height, pad_seconds
+        Forwarded to :func:`segment_by_peaks` after the time-based
+        parameters are converted to sample counts via ``metadata.fps``.
+
+    Returns
+    -------
+    Segmentation
+        A :class:`~neuropose.io.Segmentation` pairing the segments with
+        the exact :class:`~neuropose.io.SegmentationConfig` that
+        produced them. Ready to attach under a name to
+        ``VideoPredictions.segmentations``.
+
+    Raises
+    ------
+    ValueError
+        If ``predictions`` has zero frames, if a joint index in the
+        extractor is out of range, or if ``predictions.metadata.fps``
+        is non-positive and time-based parameters were supplied.
+    ImportError
+        If :mod:`scipy` is not installed.
+    """
+    sequence = predictions_to_numpy(predictions, person_index=person_index)
+    signal = extract_signal(sequence, extractor)
+
+    fps = predictions.metadata.fps
+    needs_fps = (
+        min_distance_seconds is not None and min_distance_seconds > 0.0
+    ) or pad_seconds > 0.0
+    if needs_fps and fps <= 0.0:
+        raise ValueError(
+            "cannot convert seconds to frames: metadata.fps is "
+            f"{fps}. Pass min_distance_seconds=None and pad_seconds=0.0, "
+            "or call segment_by_peaks directly with sample-count units."
+        )
+
+    if min_distance_seconds is None or min_distance_seconds <= 0.0:
+        min_distance: int | None = None
+    else:
+        min_distance = max(1, round(min_distance_seconds * fps))
+
+    pad = round(pad_seconds * fps) if pad_seconds > 0.0 else 0
+
+    segments = segment_by_peaks(
+        signal,
+        min_distance=min_distance,
+        min_prominence=min_prominence,
+        min_height=min_height,
+        pad=pad,
+    )
+
+    config = SegmentationConfig(
+        extractor=extractor,
+        person_index=person_index,
+        min_distance_seconds=min_distance_seconds,
+        min_prominence=min_prominence,
+        min_height=min_height,
+        pad_seconds=pad_seconds,
+    )
+    return Segmentation(config=config, segments=segments)
+
+
+# ---------------------------------------------------------------------------
+# Slicing: one VideoPredictions per segment
+# ---------------------------------------------------------------------------
+
+
+def slice_predictions(
+    predictions: VideoPredictions,
+    segments: Sequence[Segment],
+) -> list[VideoPredictions]:
+    """Split a :class:`~neuropose.io.VideoPredictions` into one per segment.
+
+    Each output :class:`~neuropose.io.VideoPredictions` contains only
+    the frames in ``[segment.start, segment.end)`` of the input,
+    re-keyed starting at ``frame_000000`` and with
+    :class:`~neuropose.io.VideoMetadata.frame_count` rewritten to match.
+    Other metadata fields (``fps``, ``width``, ``height``) are copied
+    verbatim. The input is not modified.
+
+    The per-slice ``segmentations`` field is intentionally reset to
+    empty; a segment is meaningful only against its parent video's
+    timeline, and copying the parent's segmentations into a sliced view
+    would be confusing rather than useful.
+
+    Parameters
+    ----------
+    predictions
+        Source predictions.
+    segments
+        Segments to slice out. Indices are interpreted into the input's
+        ``frame_names()`` ordering.
+
+    Returns
+    -------
+    list[VideoPredictions]
+        One :class:`~neuropose.io.VideoPredictions` per segment, in the
+        order of the input list.
+
+    Raises
+    ------
+    ValueError
+        If any segment's ``end`` exceeds the input frame count.
+    """
+    frame_names = predictions.frame_names()
+    total_frames = len(frame_names)
+    results: list[VideoPredictions] = []
+    for idx, seg in enumerate(segments):
+        if seg.end > total_frames:
+            raise ValueError(f"segment {idx} end={seg.end} exceeds frame count {total_frames}")
+        sliced_frames = {
+            f"frame_{i:06d}": predictions[src_name]
+            for i, src_name in enumerate(frame_names[seg.start : seg.end])
+        }
+        new_metadata = VideoMetadata(
+            frame_count=seg.end - seg.start,
+            fps=predictions.metadata.fps,
+            width=predictions.metadata.width,
+            height=predictions.metadata.height,
+        )
+        results.append(
+            VideoPredictions(
+                metadata=new_metadata,
+                frames=sliced_frames,
+                segmentations={},
+            )
+        )
+    return results
diff --git a/src/neuropose/benchmark.py b/src/neuropose/benchmark.py
new file mode 100644
index 0000000..95ca7ff
--- /dev/null
+++ b/src/neuropose/benchmark.py
@@ -0,0 +1,320 @@
+"""Multi-pass inference benchmarking for :mod:`neuropose.estimator`.
+
+The :mod:`neuropose.estimator` module already instruments every call
+and attaches a :class:`~neuropose.io.PerformanceMetrics` to its
+:class:`~neuropose.estimator.ProcessVideoResult`, so real-world runs
+always carry their own timing. This module layers on top of that to
+answer the harder question — *what can this machine actually do
+steady-state?* — by running the same video multiple times, throwing
+out the first pass entirely (graph compilation / file-system cache
+warmup), and aggregating the rest into distributional statistics.
+
+The design is intentionally thin: a single :func:`run_benchmark`
+entry point, a small divergence helper for the ``--compare-cpu``
+flow, and a pretty-printer used by the CLI. The heavy lifting —
+schemas, aggregation, serialisation — lives in :mod:`neuropose.io`.
+
+See :class:`neuropose.io.BenchmarkResult` for the result shape and
+:mod:`neuropose.cli.benchmark` for the command-line surface.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+
+from neuropose.estimator import Estimator
+from neuropose.io import (
+    BenchmarkAggregate,
+    BenchmarkResult,
+    PerformanceMetrics,
+    VideoPredictions,
+)
+
+
+@dataclass(frozen=True)
+class BenchmarkRunOutcome:
+    """Return type of :func:`run_benchmark`.
+
+    Separate from :class:`neuropose.io.BenchmarkResult` because the
+    reference :class:`~neuropose.io.VideoPredictions` captured from the
+    last measured pass is **not** part of the serialisable benchmark
+    record — it is kept in memory only to feed the ``--compare-cpu``
+    divergence computation, and would otherwise bloat every JSON output
+    with redundant pose data.
+    """
+
+    result: BenchmarkResult
+    reference_predictions: VideoPredictions | None
+
+
+def run_benchmark(
+    estimator: Estimator,
+    video_path: Path,
+    *,
+    repeats: int = 5,
+    warmup_frames: int = 3,
+    capture_reference: bool = False,
+) -> BenchmarkRunOutcome:
+    """Run ``repeats`` passes of ``process_video`` and aggregate timings.
+
+    Parameters
+    ----------
+    estimator
+        A fully-initialised :class:`~neuropose.estimator.Estimator`
+        with its model loaded (or injected). The benchmark calls
+        ``estimator.process_video(video_path)`` in a loop.
+    video_path
+        Path to an input video. Must be readable by OpenCV; any
+        :class:`~neuropose.estimator.VideoDecodeError` from the first
+        pass propagates out (no point retrying a broken file).
+    repeats
+        Total number of passes to run. Must be at least 2 — the first
+        pass is always discarded, and the aggregate needs at least one
+        measured pass to be meaningful.
+    warmup_frames
+        Number of frames to exclude from the head of *each* measured
+        pass when computing the aggregate. Graph compilation and XLA
+        kernel JIT usually bite only the first 1-3 frames of a pass,
+        so the default of 3 is a safe floor for Apple Silicon and
+        CUDA alike.
+    capture_reference
+        When ``True``, the :class:`VideoPredictions` from the last
+        measured pass is preserved on the outcome. Used by the
+        ``--compare-cpu`` flow to diff poses across devices; callers
+        that only want timings should leave this ``False``.
+
+    Returns
+    -------
+    BenchmarkRunOutcome
+        The serialisable :class:`~neuropose.io.BenchmarkResult` plus
+        (optionally) the reference predictions.
+
+    Raises
+    ------
+    ValueError
+        If ``repeats`` is less than 2 (no measured passes after
+        discarding the warmup pass) or ``warmup_frames`` is negative.
+    """
+    if repeats < 2:
+        raise ValueError(f"repeats must be >= 2; got {repeats}")
+    if warmup_frames < 0:
+        raise ValueError(f"warmup_frames must be >= 0; got {warmup_frames}")
+
+    passes: list[PerformanceMetrics] = []
+    reference_predictions: VideoPredictions | None = None
+    for i in range(repeats):
+        result = estimator.process_video(video_path)
+        passes.append(result.metrics)
+        # Only the *last* measured pass needs to be captured for
+        # divergence comparison. Earlier passes would just be
+        # overwritten, so we avoid holding their frame dicts in memory.
+        if capture_reference and i == repeats - 1:
+            reference_predictions = result.predictions
+
+    aggregate = _aggregate_passes(passes[1:], warmup_frames=warmup_frames)
+    benchmark_result = BenchmarkResult(
+        video_name=video_path.name,
+        repeats=repeats,
+        warmup_frames=warmup_frames,
+        warmup_pass=passes[0],
+        measured_passes=passes[1:],
+        aggregate=aggregate,
+    )
+    return BenchmarkRunOutcome(
+        result=benchmark_result,
+        reference_predictions=reference_predictions,
+    )
+
+
+def compute_poses3d_divergence(
+    reference: VideoPredictions,
+    other: VideoPredictions,
+) -> tuple[float, int]:
+    """Return the maximum absolute ``poses3d`` divergence in mm.
+
+    Walks both prediction sets frame-by-frame and compares each
+    ``(person, joint, axis)`` entry. Frames are matched by name (the
+    six-digit ``frame_000000`` identifier), which assumes both runs
+    processed the same source video and decoded the same number of
+    frames — the benchmark always does, since it invokes
+    ``process_video`` on the same file with the same OpenCV build.
+
+    Parameters
+    ----------
+    reference
+        Predictions from the primary device pass (typically GPU).
+    other
+        Predictions from the secondary device pass (typically CPU via
+        a ``--force-cpu`` subprocess).
+
+    Returns
+    -------
+    tuple[float, int]
+        ``(max_divergence_mm, frame_count_compared)``. The integer is
+        the number of frames that actually contributed to the diff —
+        frames with mismatched detection counts are skipped rather
+        than raising, so the caller can tell "is the number trustworthy?"
+        from the count alone.
+
+    Raises
+    ------
+    ValueError
+        If the two prediction sets report different frame counts (the
+        upstream benchmark should never produce that, so it is a real
+        bug when it happens).
+    """
+    ref_names = reference.frame_names()
+    other_names = other.frame_names()
+    if len(ref_names) != len(other_names):
+        raise ValueError(
+            f"frame count mismatch: reference has {len(ref_names)}, other has {len(other_names)}"
+        )
+
+    max_diff = 0.0
+    compared = 0
+    for ref_name, other_name in zip(ref_names, other_names, strict=True):
+        ref_frame = reference[ref_name]
+        other_frame = other[other_name]
+        if len(ref_frame.poses3d) != len(other_frame.poses3d):
+            # Detection-count mismatches mean the two devices disagreed
+            # about *how many people* were in the frame, not about joint
+            # positions. Skip these frames for the divergence metric
+            # but do not silently mask them — the comparison result
+            # surfaces frame_count_compared so the caller can tell.
+            continue
+        if not ref_frame.poses3d:
+            # Both zero detections → nothing to compare, nothing to skip.
+            compared += 1
+            continue
+        ref_arr = np.asarray(ref_frame.poses3d, dtype=float)
+        other_arr = np.asarray(other_frame.poses3d, dtype=float)
+        if ref_arr.shape != other_arr.shape:
+            continue
+        diff = float(np.abs(ref_arr - other_arr).max())
+        if diff > max_diff:
+            max_diff = diff
+        compared += 1
+    return max_diff, compared
+
+
+def format_benchmark_report(result: BenchmarkResult) -> str:
+    """Return a human-readable report for ``result``.
+
+    Renders a multi-line summary suitable for stdout. The CLI uses this
+    for the operator-facing output; the JSON on disk is the machine-
+    readable form.
+    """
+    agg = result.aggregate
+    lines = [
+        f"Benchmark: {result.video_name}",
+        f"  device:       {agg.active_device}"
+        + (" (tensorflow-metal)" if agg.tensorflow_metal_active else ""),
+        f"  tf version:   {agg.tensorflow_version}",
+        f"  model load:   {_format_model_load(result.warmup_pass.model_load_seconds)}",
+        f"  repeats:      {result.repeats} ({agg.repeats_measured} measured, 1 discarded)",
+        f"  warmup:       {result.warmup_frames} frame(s) excluded per pass",
+        "",
+        "Per-frame latency (ms, across measured passes after warmup):",
+        f"  mean:    {agg.mean_frame_latency_ms:8.2f}",
+        f"  p50:     {agg.p50_frame_latency_ms:8.2f}",
+        f"  p95:     {agg.p95_frame_latency_ms:8.2f}",
+        f"  p99:     {agg.p99_frame_latency_ms:8.2f}",
+        f"  stddev:  {agg.stddev_frame_latency_ms:8.2f}",
+        "",
+        f"Throughput:   {agg.mean_throughput_fps:.2f} fps (mean across measured passes)",
+        f"Peak RSS:     {agg.peak_rss_mb_max:.0f} MB",
+    ]
+    if result.cpu_comparison is not None:
+        cmp = result.cpu_comparison
+        lines.extend(
+            [
+                "",
+                "CPU comparison:",
+                f"  primary throughput: {cmp.primary_aggregate.mean_throughput_fps:.2f} fps",
+                f"  cpu throughput:     {cmp.cpu_aggregate.mean_throughput_fps:.2f} fps",
+                f"  speedup:            {cmp.speedup:.2f}x",
+                f"  max poses3d divergence: {cmp.max_poses3d_divergence_mm:.4f} mm "
+                f"({cmp.frame_count_compared} frames compared)",
+            ]
+        )
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Private helpers
+# ---------------------------------------------------------------------------
+
+
+def _aggregate_passes(
+    measured: list[PerformanceMetrics],
+    *,
+    warmup_frames: int,
+) -> BenchmarkAggregate:
+    """Compute a :class:`BenchmarkAggregate` from a list of measured passes."""
+    if not measured:
+        # Caller guarantees repeats >= 2 so measured is non-empty, but
+        # validate here anyway: an empty list would make np.percentile
+        # fail with a confusing error downstream.
+        raise ValueError("cannot aggregate zero measured passes")
+
+    latencies: list[float] = []
+    throughputs: list[float] = []
+    peak_rss = 0.0
+    for p in measured:
+        # Slice off the warmup head of each pass. If warmup_frames
+        # exceeds the pass length, the slice is empty and contributes
+        # nothing, which is the correct behaviour for a pathologically
+        # short video.
+        kept = p.per_frame_latencies_ms[warmup_frames:]
+        latencies.extend(kept)
+        frame_count = len(p.per_frame_latencies_ms)
+        if frame_count > 0 and p.total_seconds > 0:
+            throughputs.append(frame_count / p.total_seconds)
+        if p.peak_rss_mb > peak_rss:
+            peak_rss = p.peak_rss_mb
+
+    if not latencies:
+        # Could happen if warmup_frames >= frames_per_pass for every
+        # measured pass. Aggregate with all-zero timing rather than
+        # exploding; the CLI surfaces the zero-throughput number
+        # clearly enough that the operator will notice.
+        return BenchmarkAggregate(
+            repeats_measured=len(measured),
+            warmup_frames_per_pass=warmup_frames,
+            mean_frame_latency_ms=0.0,
+            p50_frame_latency_ms=0.0,
+            p95_frame_latency_ms=0.0,
+            p99_frame_latency_ms=0.0,
+            stddev_frame_latency_ms=0.0,
+            mean_throughput_fps=float(np.mean(throughputs)) if throughputs else 0.0,
+            peak_rss_mb_max=peak_rss,
+            active_device=measured[0].active_device,
+            tensorflow_metal_active=measured[0].tensorflow_metal_active,
+            tensorflow_version=measured[0].tensorflow_version,
+        )
+
+    latencies_arr = np.asarray(latencies, dtype=float)
+    return BenchmarkAggregate(
+        repeats_measured=len(measured),
+        warmup_frames_per_pass=warmup_frames,
+        mean_frame_latency_ms=float(latencies_arr.mean()),
+        p50_frame_latency_ms=float(np.percentile(latencies_arr, 50)),
+        p95_frame_latency_ms=float(np.percentile(latencies_arr, 95)),
+        p99_frame_latency_ms=float(np.percentile(latencies_arr, 99)),
+        stddev_frame_latency_ms=float(latencies_arr.std()),
+        mean_throughput_fps=float(np.mean(throughputs)) if throughputs else 0.0,
+        peak_rss_mb_max=peak_rss,
+        active_device=measured[0].active_device,
+        tensorflow_metal_active=measured[0].tensorflow_metal_active,
+        tensorflow_version=measured[0].tensorflow_version,
+    )
+
+
+def _format_model_load(seconds: float | None) -> str:
+    """Format the ``model_load_seconds`` field for the text report."""
+    if seconds is None:
+        return "injected (not loaded by benchmark)"
+    return f"{seconds:.2f} s"
diff --git a/src/neuropose/cli.py b/src/neuropose/cli.py
index 6ff6fab..dd87186 100644
--- a/src/neuropose/cli.py
+++ b/src/neuropose/cli.py
@@ -1,11 +1,20 @@
 """NeuroPose command-line interface.
 
-Three subcommands:
+Five subcommands:
 
 - ``neuropose watch`` — run the :class:`~neuropose.interfacer.Interfacer`
   daemon against the configured input directory.
 - ``neuropose process <video>`` — run the estimator on a single video and
   write the predictions JSON to disk.
+- ``neuropose segment <results>`` — post-hoc repetition segmentation of
+  an existing predictions file. Attaches a named
+  :class:`~neuropose.io.Segmentation` to every video it contains and
+  writes the file back atomically.
+- ``neuropose benchmark <video>`` — multi-pass inference benchmark for
+  a single video, with optional ``--compare-cpu`` for Apple-Silicon
+  vs CPU numerical-divergence checks. Prints a human report to stdout
+  and (optionally) writes a structured :class:`~neuropose.io.BenchmarkResult`
+  JSON to ``--output``.
 - ``neuropose analyze <results>`` — stubbed placeholder pending the
   analyzer rewrite in commit 10.
 
@@ -31,7 +40,9 @@ exception classes we expect from the layers below.
 
 from __future__ import annotations
 
+import json
 import logging
+from enum import StrEnum
 from pathlib import Path
 from typing import Annotated
 
@@ -42,7 +53,19 @@ from neuropose import __version__
 from neuropose.config import Settings
 from neuropose.estimator import Estimator
 from neuropose.interfacer import AlreadyRunningError, Interfacer
-from neuropose.io import save_video_predictions
+from neuropose.io import (
+    BenchmarkResult,
+    CpuComparisonResult,
+    ExtractorSpec,
+    JobResults,
+    VideoPredictions,
+    load_benchmark_result,
+    load_job_results,
+    load_video_predictions,
+    save_benchmark_result,
+    save_job_results,
+    save_video_predictions,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -233,6 +256,626 @@ def process(
     typer.echo(f"wrote {out_path} ({result.frame_count} frames)")
 
 
+# ---------------------------------------------------------------------------
+# segment
+# ---------------------------------------------------------------------------
+
+
+class _ExtractorKind(StrEnum):
+    """CLI-facing extractor-kind enum for ``neuropose segment --extractor``."""
+
+    JOINT_AXIS = "joint_axis"
+    JOINT_PAIR_DISTANCE = "joint_pair_distance"
+    JOINT_SPEED = "joint_speed"
+    JOINT_ANGLE = "joint_angle"
+
+
+def _resolve_joint(token: str) -> int:
+    """Resolve a joint specifier to an integer index.
+
+    Accepts either a string joint name from the berkeley_mhad_43 skeleton
+    (e.g. ``"rwri"``) or a plain integer as a string. Raises
+    ``typer.BadParameter`` with a helpful message on failure so CLI users
+    get a short error rather than a pydantic traceback.
+    """
+    # Deferred import so the CLI module itself stays free of the analyzer
+    # heavy imports when the user is only running watch/process.
+    from neuropose.analyzer.segment import JOINT_INDEX
+
+    stripped = token.strip()
+    if not stripped:
+        raise typer.BadParameter("joint specifier is empty")
+    if stripped in JOINT_INDEX:
+        return JOINT_INDEX[stripped]
+    try:
+        return int(stripped)
+    except ValueError as exc:
+        raise typer.BadParameter(
+            f"unknown joint {stripped!r}; expected an integer index or one of "
+            f"the berkeley_mhad_43 names (e.g. 'rwri', 'lkne')"
+        ) from exc
+
+
+def _parse_joint_list(raw: str, *, expected: int) -> tuple[int, ...]:
+    """Split a comma-separated joint list into ``expected`` integer indices."""
+    tokens = [t for t in raw.split(",") if t.strip()]
+    if len(tokens) != expected:
+        raise typer.BadParameter(
+            f"expected {expected} comma-separated joint specifiers; got {len(tokens)}: {raw!r}"
+        )
+    return tuple(_resolve_joint(t) for t in tokens)
+
+
+def _build_extractor_spec(
+    kind: _ExtractorKind,
+    *,
+    joint: str | None,
+    joints: str | None,
+    triplet: str | None,
+    axis: int | None,
+    invert: bool,
+) -> ExtractorSpec:
+    """Translate CLI flags into a serializable :class:`ExtractorSpec`.
+
+    Each extractor kind consumes a different subset of the flags; unused
+    flags are ignored for ergonomics (so users can leave defaults in shell
+    aliases) but missing required flags raise ``typer.BadParameter`` with
+    the specific missing name so the error is obvious.
+    """
+    from neuropose.analyzer.segment import (
+        joint_angle,
+        joint_axis,
+        joint_pair_distance,
+        joint_speed,
+    )
+
+    if kind is _ExtractorKind.JOINT_AXIS:
+        if joint is None:
+            raise typer.BadParameter("--joint is required for joint_axis")
+        if axis is None:
+            raise typer.BadParameter("--axis is required for joint_axis")
+        if not (0 <= axis <= 2):
+            raise typer.BadParameter(f"--axis must be 0, 1, or 2; got {axis}")
+        return joint_axis(_resolve_joint(joint), axis, invert=invert)
+
+    if kind is _ExtractorKind.JOINT_PAIR_DISTANCE:
+        if joints is None:
+            raise typer.BadParameter(
+                "--joints is required for joint_pair_distance (e.g. --joints lwri,rwri)"
+            )
+        j1, j2 = _parse_joint_list(joints, expected=2)
+        return joint_pair_distance(j1, j2)
+
+    if kind is _ExtractorKind.JOINT_SPEED:
+        if joint is None:
+            raise typer.BadParameter("--joint is required for joint_speed")
+        return joint_speed(_resolve_joint(joint))
+
+    if kind is _ExtractorKind.JOINT_ANGLE:
+        if triplet is None:
+            raise typer.BadParameter(
+                "--triplet is required for joint_angle (e.g. --triplet larm,lelb,lwri)"
+            )
+        a, b, c = _parse_joint_list(triplet, expected=3)
+        return joint_angle(a, b, c)
+
+    raise typer.BadParameter(f"unknown extractor kind: {kind}")
+
+
+def _load_predictions_or_results(
+    path: Path,
+) -> tuple[JobResults | VideoPredictions, bool]:
+    """Load a results file, auto-detecting JobResults vs VideoPredictions.
+
+    Returns the loaded object and a boolean ``is_job_results`` flag so
+    the caller knows which save helper to use when writing back. The
+    distinction is made by trying :class:`VideoPredictions` first (the
+    more restrictive shape) and falling back to :class:`JobResults`.
+    """
+    with path.open("r", encoding="utf-8") as f:
+        raw = json.load(f)
+    # A JobResults is a mapping of video-name → VideoPredictions; a bare
+    # VideoPredictions always has a top-level "metadata" key. This is
+    # cheap enough to dispatch on without a full validation pass.
+    if isinstance(raw, dict) and "metadata" in raw and "frames" in raw:
+        return load_video_predictions(path), False
+    return load_job_results(path), True
+
+
+@app.command()
+def segment(
+    ctx: typer.Context,
+    results: Annotated[
+        Path,
+        typer.Argument(
+            exists=True,
+            file_okay=True,
+            dir_okay=False,
+            readable=True,
+            help="Path to a results.json (JobResults) or predictions.json "
+            "(single VideoPredictions).",
+        ),
+    ],
+    name: Annotated[
+        str,
+        typer.Option(
+            "--name",
+            help="Key under which the segmentation will be stored in "
+            "VideoPredictions.segmentations (e.g. 'cup_lift').",
+        ),
+    ],
+    extractor: Annotated[
+        _ExtractorKind,
+        typer.Option(
+            "--extractor",
+            help="Which segmentation signal to extract from each frame.",
+        ),
+    ],
+    joint: Annotated[
+        str | None,
+        typer.Option(
+            "--joint",
+            help="Joint specifier for joint_axis and joint_speed extractors. "
+            "Accepts a berkeley_mhad_43 name ('rwri', 'lkne') or an integer index.",
+        ),
+    ] = None,
+    joints: Annotated[
+        str | None,
+        typer.Option(
+            "--joints",
+            help="Comma-separated pair for joint_pair_distance (e.g. 'lwri,rwri').",
+        ),
+    ] = None,
+    triplet: Annotated[
+        str | None,
+        typer.Option(
+            "--triplet",
+            help="Comma-separated (a,b,c) for joint_angle; angle is at b.",
+        ),
+    ] = None,
+    axis: Annotated[
+        int | None,
+        typer.Option(
+            "--axis",
+            help="Spatial axis for joint_axis: 0=x, 1=y, 2=z.",
+        ),
+    ] = None,
+    invert: Annotated[
+        bool,
+        typer.Option(
+            "--invert",
+            help="Negate the signal (joint_axis only); useful when the "
+            "movement of interest is a decrease in the selected coordinate.",
+        ),
+    ] = False,
+    person_index: Annotated[
+        int,
+        typer.Option("--person-index", min=0, help="Which detected person to use."),
+    ] = 0,
+    min_distance_seconds: Annotated[
+        float | None,
+        typer.Option(
+            "--min-distance-seconds",
+            min=0.0,
+            help="Minimum time between successive repetition peaks.",
+        ),
+    ] = None,
+    min_prominence: Annotated[
+        float | None,
+        typer.Option(
+            "--min-prominence",
+            help="Minimum scipy peak prominence on the raw signal.",
+        ),
+    ] = None,
+    min_height: Annotated[
+        float | None,
+        typer.Option(
+            "--min-height",
+            help="Minimum signal value to qualify as a peak.",
+        ),
+    ] = None,
+    pad_seconds: Annotated[
+        float,
+        typer.Option(
+            "--pad-seconds",
+            min=0.0,
+            help="Amount of time to extend each segment on both sides.",
+        ),
+    ] = 0.0,
+    output: Annotated[
+        Path | None,
+        typer.Option(
+            "--output",
+            "-o",
+            help="Where to write the segmented file. Defaults to overwriting the input atomically.",
+            dir_okay=False,
+            writable=True,
+        ),
+    ] = None,
+    force: Annotated[
+        bool,
+        typer.Option(
+            "--force",
+            help="Overwrite an existing segmentation with the same --name. "
+            "Without --force, a collision is a usage error.",
+        ),
+    ] = False,
+) -> None:
+    """Run post-hoc repetition segmentation over an existing predictions file.
+
+    Loads the file (auto-detecting JobResults vs single VideoPredictions),
+    runs :func:`neuropose.analyzer.segment.segment_predictions` on each
+    video with the chosen extractor and thresholds, attaches the result
+    under ``--name`` in every video's ``segmentations`` mapping, and
+    writes the file back to ``--output`` (or the input path by default).
+
+    The command mutates only the ``segmentations`` field; inference
+    output — per-frame poses and video metadata — round-trips unchanged.
+    """
+    del ctx  # settings not needed; this command is pure post-processing
+    # Deferred import keeps the CLI module's top-level imports free of
+    # scipy so watch/process do not pay the import cost on startup.
+    from neuropose.analyzer.segment import segment_predictions
+
+    try:
+        spec = _build_extractor_spec(
+            extractor,
+            joint=joint,
+            joints=joints,
+            triplet=triplet,
+            axis=axis,
+            invert=invert,
+        )
+    except typer.BadParameter as exc:
+        typer.echo(f"error: {exc}", err=True)
+        raise typer.Exit(code=EXIT_USAGE) from exc
+
+    try:
+        loaded, is_job_results = _load_predictions_or_results(results)
+    except (ValidationError, json.JSONDecodeError) as exc:
+        typer.echo(f"error: could not load {results}: {exc}", err=True)
+        raise typer.Exit(code=EXIT_USAGE) from exc
+
+    # Normalise to an iterable of (video_name, VideoPredictions) so the
+    # per-video segmentation loop is the same for both input shapes.
+    if is_job_results:
+        assert isinstance(loaded, JobResults)
+        videos: dict[str, VideoPredictions] = dict(loaded.root)
+    else:
+        assert isinstance(loaded, VideoPredictions)
+        videos = {results.name: loaded}
+
+    updated: dict[str, VideoPredictions] = {}
+    total_segments = 0
+    for video_name, preds in videos.items():
+        if name in preds.segmentations and not force:
+            typer.echo(
+                f"error: video {video_name!r} already has a segmentation named "
+                f"{name!r}; pass --force to overwrite.",
+                err=True,
+            )
+            raise typer.Exit(code=EXIT_USAGE)
+        try:
+            result = segment_predictions(
+                preds,
+                spec,
+                person_index=person_index,
+                min_distance_seconds=min_distance_seconds,
+                min_prominence=min_prominence,
+                min_height=min_height,
+                pad_seconds=pad_seconds,
+            )
+        except (ValueError, ImportError) as exc:
+            typer.echo(f"error: segmentation failed for {video_name}: {exc}", err=True)
+            raise typer.Exit(code=EXIT_USAGE) from exc
+
+        new_segmentations = dict(preds.segmentations)
+        new_segmentations[name] = result
+        updated[video_name] = preds.model_copy(update={"segmentations": new_segmentations})
+        total_segments += len(result.segments)
+        typer.echo(f"{video_name}: {len(result.segments)} segment(s) under name={name!r}")
+
+    out_path = output if output is not None else results
+    if is_job_results:
+        save_job_results(out_path, JobResults(root=updated))
+    else:
+        # Single-VideoPredictions input: the dict has exactly one entry.
+        (only,) = updated.values()
+        save_video_predictions(out_path, only)
+
+    typer.echo(f"wrote {out_path} ({len(updated)} video(s), {total_segments} total segment(s))")
+
+
+# ---------------------------------------------------------------------------
+# benchmark
+# ---------------------------------------------------------------------------
+
+
+def _force_cpu_only() -> None:
+    """Hide all GPU devices from TensorFlow before any TF op initialises.
+
+    Called from the ``--force-cpu`` benchmark path before the estimator
+    loads the model. ``tf.config.set_visible_devices`` must be invoked
+    before the runtime has touched a GPU device, which in practice
+    means before any other TF API call — so this function imports TF
+    itself and then immediately pins visibility to an empty GPU list.
+
+    The import is deferred inside the function so that non-benchmark
+    CLI paths (``watch``, ``process``) do not pay the ~2 s TensorFlow
+    import cost unless they are actually doing inference.
+    """
+    try:
+        import tensorflow as tf
+    except ImportError as exc:
+        raise typer.BadParameter(
+            "--force-cpu requires TensorFlow to be installed, but import failed"
+        ) from exc
+    # Silence is fine here: a stale visibility call on a runtime that
+    # already initialised GPU raises RuntimeError, but the benchmark CLI
+    # calls this as its first TF op so that should never happen.
+    tf.config.set_visible_devices([], "GPU")
+
+
+def _run_compare_cpu_subprocess(
+    video: Path,
+    *,
+    repeats: int,
+    warmup_frames: int,
+) -> tuple[BenchmarkResult, VideoPredictions]:
+    """Spawn a ``--force-cpu`` child to get a CPU baseline + predictions.
+
+    The parent benchmark runs on whatever device the platform exposes
+    by default (Apple Silicon → Metal GPU when the extra is installed;
+    Linux → CPU by default). For the ``--compare-cpu`` comparison we
+    need a second run that is *guaranteed* to hit CPU, and we need the
+    resulting ``poses3d`` to diff against the parent's output.
+
+    The cleanest way to guarantee device isolation is to run the CPU
+    pass in a subprocess with GPU visibility hidden before any TF
+    import. In-process device switching via
+    ``tf.device("/CPU:0")`` is not reliable against SavedModels whose
+    ConcreteFunctions may carry baked-in device placements, and
+    calling ``set_visible_devices`` after the GPU has already been
+    touched is a hard error from TF's runtime.
+
+    The subprocess is invoked as ``neuropose benchmark <video> --force-cpu
+    --repeats N --warmup-frames M --output <tmp> --predictions-output
+    <tmp>`` so the child writes both the benchmark result and the last
+    measured pass's predictions to disk. The parent reads them, deletes
+    the tempfiles, and returns.
+    """
+    # Deferred imports: subprocess and tempfile are pure-Python and
+    # cheap, but keeping them inside the function keeps the module's
+    # top-level import surface small.
+    import subprocess
+    import sys
+    import tempfile
+
+    with tempfile.TemporaryDirectory(prefix="neuropose_cpu_bench_") as td:
+        td_path = Path(td)
+        result_path = td_path / "result.json"
+        predictions_path = td_path / "predictions.json"
+
+        cmd = [
+            sys.executable,
+            "-m",
+            "neuropose.cli",
+            "benchmark",
+            str(video),
+            "--repeats",
+            str(repeats),
+            "--warmup-frames",
+            str(warmup_frames),
+            "--output",
+            str(result_path),
+            "--predictions-output",
+            str(predictions_path),
+            "--force-cpu",
+        ]
+        logger.info("spawning cpu-baseline subprocess: %s", " ".join(cmd))
+        proc = subprocess.run(cmd, capture_output=True, text=True, check=False)
+        if proc.returncode != 0:
+            raise RuntimeError(
+                f"cpu-baseline subprocess failed (exit {proc.returncode}): {proc.stderr.strip()}"
+            )
+        return (
+            load_benchmark_result(result_path),
+            load_video_predictions(predictions_path),
+        )
+
+
+@app.command()
+def benchmark(
+    ctx: typer.Context,
+    video: Annotated[
+        Path,
+        typer.Argument(
+            exists=True,
+            file_okay=True,
+            dir_okay=False,
+            readable=True,
+            help="Path to an input video file to benchmark.",
+        ),
+    ],
+    repeats: Annotated[
+        int,
+        typer.Option(
+            "--repeats",
+            min=2,
+            help="Total passes to run; the first is always discarded as warmup.",
+        ),
+    ] = 5,
+    warmup_frames: Annotated[
+        int,
+        typer.Option(
+            "--warmup-frames",
+            min=0,
+            help="Frames to discard from the head of each measured pass.",
+        ),
+    ] = 3,
+    compare_cpu: Annotated[
+        bool,
+        typer.Option(
+            "--compare-cpu",
+            help=(
+                "After the primary benchmark, spawn a subprocess with "
+                "--force-cpu to produce a CPU baseline on the same video, "
+                "then report throughput speedup and max poses3d divergence "
+                "in millimetres. Intended for Apple Silicon numerics "
+                "verification."
+            ),
+        ),
+    ] = False,
+    force_cpu: Annotated[
+        bool,
+        typer.Option(
+            "--force-cpu",
+            help=(
+                "Hide all GPU devices from TensorFlow before loading the "
+                "model, so inference is guaranteed to run on CPU. Used "
+                "internally by --compare-cpu's subprocess; rarely needed "
+                "as a user-facing flag."
+            ),
+        ),
+    ] = False,
+    output: Annotated[
+        Path | None,
+        typer.Option(
+            "--output",
+            "-o",
+            help=(
+                "Where to write the structured BenchmarkResult JSON. "
+                "When omitted, only the human-readable report prints to "
+                "stdout."
+            ),
+            dir_okay=False,
+            writable=True,
+        ),
+    ] = None,
+    predictions_output: Annotated[
+        Path | None,
+        typer.Option(
+            "--predictions-output",
+            help=(
+                "Where to write the last measured pass's VideoPredictions "
+                "JSON. Used internally by --compare-cpu to supply the "
+                "divergence computation with the CPU pass's poses3d; "
+                "leave unset for normal benchmark runs."
+            ),
+            dir_okay=False,
+            writable=True,
+        ),
+    ] = None,
+) -> None:
+    """Run a multi-pass inference benchmark for a single video.
+
+    The first pass is always discarded (graph compilation / filesystem
+    caches), and subsequent passes contribute to a
+    :class:`~neuropose.io.BenchmarkAggregate` with mean / p50 / p95 / p99
+    per-frame latencies, throughput, peak RSS, active device, and the
+    exact TensorFlow version that ran the measurement. With
+    ``--compare-cpu``, a second run is spawned as a subprocess with
+    GPU visibility hidden; the resulting CPU baseline is diffed against
+    the primary run's ``poses3d`` array to produce a maximum absolute
+    divergence in millimetres, which is the answer to the "is
+    tensorflow-metal producing correct numerics?" question that
+    RESEARCH.md's TensorFlow-version-compatibility section leaves open.
+    """
+    settings: Settings = ctx.obj
+    # Deferred imports: the benchmark module and its dependencies
+    # (numpy, time) are needed for this subcommand only.
+    from neuropose.benchmark import (
+        compute_poses3d_divergence,
+        format_benchmark_report,
+        run_benchmark,
+    )
+
+    if force_cpu and compare_cpu:
+        typer.echo(
+            "error: --force-cpu and --compare-cpu are mutually exclusive; "
+            "--force-cpu is an implementation detail of --compare-cpu's "
+            "subprocess and should not be combined with it.",
+            err=True,
+        )
+        raise typer.Exit(code=EXIT_USAGE)
+
+    if force_cpu:
+        _force_cpu_only()
+
+    estimator = Estimator(
+        device=settings.device,
+        default_fov_degrees=settings.default_fov_degrees,
+    )
+    try:
+        estimator.load_model(cache_dir=settings.model_cache_dir)
+    except NotImplementedError as exc:
+        typer.echo(f"error: {exc}", err=True)
+        raise typer.Exit(code=EXIT_PENDING) from exc
+
+    outcome = run_benchmark(
+        estimator,
+        video,
+        repeats=repeats,
+        warmup_frames=warmup_frames,
+        capture_reference=compare_cpu or predictions_output is not None,
+    )
+    result = outcome.result
+
+    # CPU comparison path: spawn child, diff poses3d, rewrap result
+    # with a cpu_comparison field populated.
+    if compare_cpu:
+        if outcome.reference_predictions is None:
+            # Should be impossible because we asked for it above; guard
+            # anyway so a future refactor cannot silently drop it.
+            typer.echo(
+                "error: --compare-cpu requires reference predictions, "
+                "but none were captured. This is a bug.",
+                err=True,
+            )
+            raise typer.Exit(code=EXIT_USAGE)
+
+        try:
+            cpu_result, cpu_predictions = _run_compare_cpu_subprocess(
+                video,
+                repeats=repeats,
+                warmup_frames=warmup_frames,
+            )
+        except RuntimeError as exc:
+            typer.echo(f"error: {exc}", err=True)
+            raise typer.Exit(code=EXIT_USAGE) from exc
+
+        max_diff, compared = compute_poses3d_divergence(
+            outcome.reference_predictions, cpu_predictions
+        )
+        primary_throughput = result.aggregate.mean_throughput_fps
+        cpu_throughput = cpu_result.aggregate.mean_throughput_fps
+        speedup = primary_throughput / cpu_throughput if cpu_throughput > 0 else 0.0
+        comparison = CpuComparisonResult(
+            primary_aggregate=result.aggregate,
+            cpu_aggregate=cpu_result.aggregate,
+            speedup=speedup,
+            max_poses3d_divergence_mm=max_diff,
+            frame_count_compared=compared,
+        )
+        result = result.model_copy(update={"cpu_comparison": comparison})
+
+    typer.echo(format_benchmark_report(result))
+
+    if output is not None:
+        save_benchmark_result(output, result)
+        typer.echo(f"\nwrote {output}")
+
+    if predictions_output is not None:
+        if outcome.reference_predictions is None:
+            typer.echo(
+                "error: --predictions-output was given but no reference "
+                "predictions were captured. This is a bug.",
+                err=True,
+            )
+            raise typer.Exit(code=EXIT_USAGE)
+        save_video_predictions(predictions_output, outcome.reference_predictions)
+
+
 # ---------------------------------------------------------------------------
 # analyze (stub)
 # ---------------------------------------------------------------------------
diff --git a/src/neuropose/estimator.py b/src/neuropose/estimator.py
index 804544e..4461ac0 100644
--- a/src/neuropose/estimator.py
+++ b/src/neuropose/estimator.py
@@ -34,15 +34,22 @@ model is present raises :class:`ModelNotLoadedError`.
 from __future__ import annotations
 
 import logging
+import time
 from collections.abc import Callable
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 
 import cv2
+import psutil
 
 from neuropose._model import load_metrabs_model
-from neuropose.io import FramePrediction, VideoMetadata, VideoPredictions
+from neuropose.io import (
+    FramePrediction,
+    PerformanceMetrics,
+    VideoMetadata,
+    VideoPredictions,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -78,9 +85,23 @@ class ProcessVideoResult:
     predictions
         The validated :class:`VideoPredictions` object, containing both the
         per-frame predictions and the ``VideoMetadata`` envelope.
+    metrics
+        Timing and resource-usage metrics for the call. Always populated
+        so real-world runs carry their own measurements without the
+        caller having to opt in. ``metrics.model_load_seconds`` is
+        ``None`` when the model was injected rather than loaded via
+        :meth:`Estimator.load_model`.
     """
 
     predictions: VideoPredictions
+    metrics: PerformanceMetrics = field(
+        default_factory=lambda: PerformanceMetrics(
+            total_seconds=0.0,
+            peak_rss_mb=0.0,
+            active_device="/CPU:0",
+            tensorflow_version="unknown",
+        )
+    )
 
     @property
     def frame_count(self) -> int:
@@ -132,6 +153,11 @@ class Estimator:
         self.skeleton = skeleton
         self.default_fov_degrees = default_fov_degrees
         self._model: Any | None = model
+        # ``None`` when the model was injected via the constructor (tests,
+        # shared-model callers) — we never fake a zero load time. Set on
+        # successful ``load_model`` below so the next ``process_video`` can
+        # pass the real number through into ``PerformanceMetrics``.
+        self._model_load_seconds: float | None = None
 
     # -- model lifecycle ----------------------------------------------------
 
@@ -169,8 +195,10 @@ class Estimator:
             logger.debug("Model already loaded; skipping reload.")
             return
         logger.info("Loading MeTRAbs model (cache_dir=%s)", cache_dir)
+        start = time.perf_counter()
         self._model = load_metrabs_model(cache_dir=cache_dir)
-        logger.info("MeTRAbs model loaded.")
+        self._model_load_seconds = time.perf_counter() - start
+        logger.info("MeTRAbs model loaded in %.2f s", self._model_load_seconds)
 
     # -- inference ----------------------------------------------------------
 
@@ -223,6 +251,14 @@ class Estimator:
         if not cap.isOpened():
             raise VideoDecodeError(f"OpenCV could not open video: {video_path}")
 
+        # Start metrics collection *after* the file-not-found and
+        # decode-error paths so total_seconds reflects "work the estimator
+        # actually did" rather than setup failures the caller handles.
+        process = psutil.Process()
+        peak_rss_bytes = process.memory_info().rss
+        per_frame_latencies_ms: list[float] = []
+        overall_start = time.perf_counter()
+
         try:
             fps = float(cap.get(cv2.CAP_PROP_FPS))
             width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
@@ -246,9 +282,19 @@ class Estimator:
                     break
                 # MeTRAbs was trained on RGB images; OpenCV gives us BGR.
                 rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
+                # Time only the model call — cv2 decode and colour
+                # conversion are not what the benchmark is asking about.
+                frame_start = time.perf_counter()
                 prediction = self._infer_frame(model, rgb_frame, fov)
+                per_frame_latencies_ms.append((time.perf_counter() - frame_start) * 1000.0)
                 frames[f"frame_{frame_index:06d}"] = prediction
                 frame_index += 1
+                # Sample RSS once per frame. Cheap (a single proc/mach
+                # syscall) and produces a monotonic peak without needing
+                # a background thread.
+                rss = process.memory_info().rss
+                if rss > peak_rss_bytes:
+                    peak_rss_bytes = rss
                 if progress is not None:
                     progress(frame_index, total_hint)
 
@@ -264,8 +310,28 @@ class Estimator:
         if frame_index == 0:
             logger.warning("Video %s contained no decodable frames.", video_path)
 
+        total_seconds = time.perf_counter() - overall_start
+        device_info = _detect_active_device()
+        metrics = PerformanceMetrics(
+            model_load_seconds=self._model_load_seconds,
+            total_seconds=total_seconds,
+            per_frame_latencies_ms=per_frame_latencies_ms,
+            peak_rss_mb=peak_rss_bytes / (1024.0 * 1024.0),
+            active_device=device_info.device,
+            tensorflow_metal_active=device_info.metal_active,
+            tensorflow_version=device_info.tf_version,
+        )
+        logger.info(
+            "Processed %d frames in %.2f s (%.1f fps, peak RSS %.0f MB, device %s)",
+            frame_index,
+            total_seconds,
+            frame_index / total_seconds if total_seconds > 0 else 0.0,
+            metrics.peak_rss_mb,
+            metrics.active_device,
+        )
+
         predictions = VideoPredictions(metadata=metadata, frames=frames)
-        return ProcessVideoResult(predictions=predictions)
+        return ProcessVideoResult(predictions=predictions, metrics=metrics)
 
     # -- internals ----------------------------------------------------------
 
@@ -306,3 +372,62 @@ def _to_nested_list(value: Any) -> Any:
     if hasattr(value, "tolist"):
         return value.tolist()
     return list(value)
+
+
+@dataclass(frozen=True)
+class _ActiveDeviceInfo:
+    """Small bundle of device info gathered for ``PerformanceMetrics``."""
+
+    device: str
+    metal_active: bool
+    tf_version: str
+
+
+def _detect_active_device() -> _ActiveDeviceInfo:
+    """Report which TF device the current process would use for inference.
+
+    Returns a synthetic "unknown" bundle if TensorFlow is not importable —
+    unit tests that inject a fake model do not have a real TF install at
+    the metrics layer, and we do not want to fail the call for that.
+
+    On Apple Silicon with the ``[metal]`` extra installed, TensorFlow
+    exposes a ``GPU`` device contributed by the ``tensorflow-metal``
+    PluggableDevice. The distinction between that and a real CUDA GPU
+    matters for :mod:`neuropose.benchmark`'s ``--compare-cpu`` flow, so
+    we surface it via ``metal_active``.
+    """
+    try:
+        import tensorflow as tf
+    except ImportError:
+        return _ActiveDeviceInfo(
+            device="unknown",
+            metal_active=False,
+            tf_version="unknown",
+        )
+
+    tf_version = getattr(tf, "__version__", "unknown")
+    try:
+        gpu_devices = tf.config.list_physical_devices("GPU")
+    except Exception:  # runtime-specific: never hard-fail metrics
+        gpu_devices = []
+
+    device = "/GPU:0" if gpu_devices else "/CPU:0"
+
+    metal_active = False
+    if gpu_devices:
+        try:
+            from importlib.metadata import PackageNotFoundError, version
+
+            try:
+                version("tensorflow-metal")
+                metal_active = True
+            except PackageNotFoundError:
+                metal_active = False
+        except Exception:
+            metal_active = False
+
+    return _ActiveDeviceInfo(
+        device=device,
+        metal_active=metal_active,
+        tf_version=tf_version,
+    )
diff --git a/src/neuropose/io.py b/src/neuropose/io.py
index 3324bc0..5b5ba03 100644
--- a/src/neuropose/io.py
+++ b/src/neuropose/io.py
@@ -19,9 +19,9 @@ from collections.abc import Iterator
 from datetime import datetime
 from enum import StrEnum
 from pathlib import Path
-from typing import Any
+from typing import Annotated, Any, Literal
 
-from pydantic import BaseModel, ConfigDict, Field, RootModel
+from pydantic import BaseModel, ConfigDict, Field, RootModel, model_validator
 
 
 class JobStatus(StrEnum):
@@ -76,18 +76,402 @@ class VideoMetadata(BaseModel):
     height: int = Field(ge=0, description="Source video frame height in pixels.")
 
 
+class PerformanceMetrics(BaseModel):
+    """Timing and resource-usage metrics collected during inference.
+
+    Populated by :meth:`neuropose.estimator.Estimator.process_video` on
+    every call so real-world runs always carry their own timing without
+    the caller having to opt in. The values describe the estimator's
+    behaviour on one specific pass over a single video and are *not*
+    aggregates — callers interested in distributional statistics
+    (p50/p95/p99) should pass the same video through
+    :func:`neuropose.benchmark.run_benchmark`, which runs multiple
+    passes and aggregates the resulting :class:`PerformanceMetrics`
+    instances.
+
+    Fields intentionally capture "what machine ran this":
+    ``active_device`` reports the TensorFlow device string the model
+    ran on (``/GPU:0`` on Apple Silicon with ``tensorflow-metal``,
+    ``/CPU:0`` otherwise), and ``tensorflow_version`` captures the
+    exact TF release so downstream readers can tell whether a
+    numerical discrepancy between two runs is a stack-version
+    difference rather than a real measurement.
+
+    Frozen so that the metrics captured at return time cannot be
+    edited in-place downstream.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    model_load_seconds: float | None = Field(
+        default=None,
+        ge=0.0,
+        description=(
+            "Wall-clock time to load the MeTRAbs model, in seconds. "
+            "``None`` when the model was injected via ``Estimator(model=...)`` "
+            "rather than loaded through ``load_model()``, so the value is "
+            "never confused with a zero-cost cache hit."
+        ),
+    )
+    total_seconds: float = Field(
+        ge=0.0,
+        description="Wall-clock time of ``process_video`` end-to-end.",
+    )
+    per_frame_latencies_ms: list[float] = Field(
+        default_factory=list,
+        description=(
+            "Per-frame inference latencies in milliseconds, one entry per "
+            "decoded frame in insertion order. Excludes video-decode time "
+            "and captures only the ``detect_poses`` call, so callers "
+            "aggregating over this list are measuring model throughput."
+        ),
+    )
+    peak_rss_mb: float = Field(
+        ge=0.0,
+        description=(
+            "Maximum resident-set size observed during the call, in "
+            "megabytes. Sampled once after each frame via ``psutil``."
+        ),
+    )
+    active_device: str = Field(
+        description=(
+            "TensorFlow device string the inference ran on, e.g. "
+            "``/CPU:0`` or ``/GPU:0``. Derived from "
+            "``tf.config.list_physical_devices('GPU')`` — when a GPU "
+            "device is visible the string is ``/GPU:0``, otherwise "
+            "``/CPU:0``."
+        ),
+    )
+    tensorflow_metal_active: bool = Field(
+        default=False,
+        description=(
+            "``True`` if the ``tensorflow-metal`` PluggableDevice is "
+            "installed and contributed the visible GPU device, i.e. "
+            "MeTRAbs inference is running through Apple's Metal "
+            "Performance Shaders backend on Apple Silicon. ``False`` "
+            "otherwise, including on Linux CUDA builds."
+        ),
+    )
+    tensorflow_version: str = Field(
+        description="Value of ``tensorflow.__version__`` at the time of the call.",
+    )
+
+
+class BenchmarkAggregate(BaseModel):
+    """Distributional statistics aggregated across benchmark passes.
+
+    Computed by :mod:`neuropose.benchmark` over the measured
+    :class:`PerformanceMetrics` instances of a multi-pass benchmark run
+    — the very first pass is always discarded as warmup (graph
+    compilation, file-system caches, etc.), and the first
+    ``warmup_frames_per_pass`` frames of each remaining pass are also
+    excluded so graph-init cost inside a pass does not contaminate the
+    steady-state percentiles.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    repeats_measured: int = Field(
+        ge=0,
+        description="Number of passes that contributed to the aggregate.",
+    )
+    warmup_frames_per_pass: int = Field(
+        ge=0,
+        description="Frames discarded from the head of each measured pass.",
+    )
+    mean_frame_latency_ms: float = Field(ge=0.0)
+    p50_frame_latency_ms: float = Field(ge=0.0)
+    p95_frame_latency_ms: float = Field(ge=0.0)
+    p99_frame_latency_ms: float = Field(ge=0.0)
+    stddev_frame_latency_ms: float = Field(ge=0.0)
+    mean_throughput_fps: float = Field(ge=0.0)
+    peak_rss_mb_max: float = Field(
+        ge=0.0,
+        description="Maximum peak RSS observed across all measured passes.",
+    )
+    active_device: str
+    tensorflow_metal_active: bool = False
+    tensorflow_version: str
+
+
+class CpuComparisonResult(BaseModel):
+    """Result of a ``--compare-cpu`` benchmark run.
+
+    The parent benchmark process runs on the platform's default device
+    (GPU if visible), and a subprocess is spawned with ``--force-cpu``
+    to run the exact same video on CPU. Both runs' aggregates are
+    preserved here alongside the maximum element-wise divergence in the
+    resulting ``poses3d`` arrays — that number is the "is Metal
+    numerically OK?" answer that ``RESEARCH.md`` §TensorFlow version
+    compatibility is asking.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    primary_aggregate: BenchmarkAggregate = Field(
+        description="Aggregate from the parent (default-device) run.",
+    )
+    cpu_aggregate: BenchmarkAggregate = Field(
+        description="Aggregate from the ``--force-cpu`` subprocess run.",
+    )
+    speedup: float = Field(
+        description=(
+            "``primary_aggregate.mean_throughput_fps / "
+            "cpu_aggregate.mean_throughput_fps``. Values greater than 1 "
+            "mean the primary device is faster than CPU; values less "
+            "than 1 mean CPU won (possible on tiny videos where device "
+            "initialisation dominates)."
+        ),
+    )
+    max_poses3d_divergence_mm: float = Field(
+        ge=0.0,
+        description=(
+            "Maximum element-wise absolute difference (in millimetres) "
+            "between the primary-device and CPU-device ``poses3d`` "
+            "arrays, taken over every frame, detection, joint, and "
+            "axis. A small number (~1e-3 mm) is the expected floor; "
+            "anything above ~1e-2 mm warrants investigation before "
+            "trusting the primary device for clinical measurement."
+        ),
+    )
+    frame_count_compared: int = Field(
+        ge=0,
+        description="Number of frames that entered the divergence computation.",
+    )
+
+
+class BenchmarkResult(BaseModel):
+    """Top-level result of :func:`neuropose.benchmark.run_benchmark`.
+
+    Carries the raw per-pass metrics, the aggregated summary, and (if
+    ``--compare-cpu`` was requested) a :class:`CpuComparisonResult`.
+    Serialised to JSON for downstream regression tracking; pretty-printed
+    to stdout by the :mod:`neuropose.cli.benchmark` subcommand.
+
+    The ``video_name`` field intentionally stores only the file's
+    basename, not the full path, to match ``VideoMetadata``'s stance on
+    not persisting potentially subject-identifying filesystem paths.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    video_name: str = Field(
+        description="Basename of the benchmarked video (no directory components).",
+    )
+    repeats: int = Field(
+        ge=1,
+        description="Total passes executed — includes the discarded warmup pass.",
+    )
+    warmup_frames: int = Field(
+        ge=0,
+        description=(
+            "Frames excluded from the head of each measured pass when "
+            "computing the aggregate statistics."
+        ),
+    )
+    warmup_pass: PerformanceMetrics = Field(
+        description=(
+            "First pass, discarded from the aggregate. Preserved here so "
+            "readers can see the graph-compilation cost explicitly."
+        ),
+    )
+    measured_passes: list[PerformanceMetrics] = Field(
+        description="Passes 2..N, i.e. those that contributed to the aggregate.",
+    )
+    aggregate: BenchmarkAggregate
+    cpu_comparison: CpuComparisonResult | None = None
+
+
+class JointAxisExtractor(BaseModel):
+    """Segmentation signal extracted from one axis of one joint.
+
+    The simplest extractor: picks a single spatial coordinate of a single
+    joint as the segmentation signal. Use ``invert=True`` when the motion
+    of interest is a decrease (e.g. a joint that dips down during the
+    repetition) so that peak detection can work on a maximum rather than
+    a minimum.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    kind: Literal["joint_axis"] = "joint_axis"
+    joint: int = Field(ge=0, description="Joint index into the (J,) skeleton.")
+    axis: int = Field(ge=0, le=2, description="Spatial axis: 0=x, 1=y, 2=z.")
+    invert: bool = Field(default=False, description="Negate the signal so valleys become peaks.")
+
+
+class JointPairDistanceExtractor(BaseModel):
+    """Signal = Euclidean distance between two joints per frame.
+
+    Translation- and (partially) rotation-invariant. Good fit for tasks
+    where the target motion is a distance change between two body parts
+    (e.g. wrist-to-shoulder for a reach).
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    kind: Literal["joint_pair_distance"] = "joint_pair_distance"
+    joints: tuple[int, int] = Field(description="Ordered pair of joint indices.")
+
+    @model_validator(mode="after")
+    def _joints_distinct(self) -> JointPairDistanceExtractor:
+        if self.joints[0] == self.joints[1]:
+            raise ValueError("joint_pair_distance requires two distinct joints")
+        if self.joints[0] < 0 or self.joints[1] < 0:
+            raise ValueError("joint indices must be non-negative")
+        return self
+
+
+class JointSpeedExtractor(BaseModel):
+    """Signal = frame-to-frame displacement magnitude of one joint.
+
+    The first frame is padded with a 0 so the signal has the same length
+    as the input pose sequence. Useful when a repetition is characterised
+    by "the joint moves fast, then rests, then moves fast again" rather
+    than by any particular coordinate value.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    kind: Literal["joint_speed"] = "joint_speed"
+    joint: int = Field(ge=0)
+
+
+class JointAngleExtractor(BaseModel):
+    """Signal = angle at joint ``b`` formed by the triplet ``(a, b, c)``.
+
+    Computed in radians in ``[0, pi]`` using
+    :func:`neuropose.analyzer.features.extract_joint_angles`. This is the
+    most translation- and rotation-invariant of the built-in extractors
+    and the natural choice for clinically meaningful metrics like knee
+    or elbow flexion.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    kind: Literal["joint_angle"] = "joint_angle"
+    triplet: tuple[int, int, int] = Field(
+        description="``(a, b, c)`` joint indices; angle is computed at ``b``."
+    )
+
+    @model_validator(mode="after")
+    def _triplet_non_negative(self) -> JointAngleExtractor:
+        if any(idx < 0 for idx in self.triplet):
+            raise ValueError("joint indices must be non-negative")
+        return self
+
+
+ExtractorSpec = Annotated[
+    JointAxisExtractor | JointPairDistanceExtractor | JointSpeedExtractor | JointAngleExtractor,
+    Field(discriminator="kind"),
+]
+
+
+class SegmentationConfig(BaseModel):
+    """Parameters that define a single segmentation pass.
+
+    The full config is serialized alongside the segments it produced so
+    that a reader of ``results.json`` can tell — a year later, without
+    access to the code that wrote it — exactly which joint was tracked
+    and which thresholds were applied. The ``method`` field is a version
+    stamp: if the segmentation algorithm changes, add a new literal here
+    and dispatch on it in the engine rather than silently reinterpreting
+    old files.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    extractor: ExtractorSpec
+    person_index: int = Field(
+        default=0,
+        ge=0,
+        description="Which detected person to extract from each frame.",
+    )
+    min_distance_seconds: float | None = Field(
+        default=None,
+        ge=0.0,
+        description="Minimum time between successive repetition peaks.",
+    )
+    min_prominence: float | None = Field(
+        default=None,
+        description="Minimum scipy-style peak prominence on the raw signal.",
+    )
+    min_height: float | None = Field(
+        default=None,
+        description="Minimum signal value for a point to be considered a peak.",
+    )
+    pad_seconds: float = Field(
+        default=0.0,
+        ge=0.0,
+        description="Amount of time to extend each segment on both sides.",
+    )
+    method: Literal["valley_to_valley_v1"] = Field(
+        default="valley_to_valley_v1",
+        description="Name and version of the segmentation algorithm used.",
+    )
+
+
+class Segment(BaseModel):
+    """A single repetition window inside a pose sequence.
+
+    Frame indices are integers into the ``VideoPredictions.frames``
+    insertion order. ``start`` is inclusive, ``end`` is exclusive (Python
+    slice convention), and ``peak`` is the apex of the repetition inside
+    ``[start, end)``.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    start: int = Field(ge=0, description="Inclusive start frame index.")
+    end: int = Field(gt=0, description="Exclusive end frame index.")
+    peak: int = Field(ge=0, description="Frame index of the repetition's apex.")
+
+    @model_validator(mode="after")
+    def _check_ordering(self) -> Segment:
+        if self.end <= self.start:
+            raise ValueError(f"end ({self.end}) must be > start ({self.start})")
+        if not (self.start <= self.peak < self.end):
+            raise ValueError(
+                f"peak ({self.peak}) must be in [start, end) = [{self.start}, {self.end})"
+            )
+        return self
+
+
+class Segmentation(BaseModel):
+    """A labelled segmentation of a single video.
+
+    Pairs the :class:`SegmentationConfig` that produced the segments with
+    the segments themselves, so the serialized form is self-describing.
+    Multiple named :class:`Segmentation` objects can coexist on a single
+    :class:`VideoPredictions` via its ``segmentations`` field — typically
+    one per clinical task or per operator-chosen strategy.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    config: SegmentationConfig
+    segments: list[Segment]
+
+
 class VideoPredictions(BaseModel):
     """Per-frame predictions for a single video, paired with video metadata.
 
     The ``frames`` mapping is keyed by frame identifier (``frame_<index>`` by
     convention, zero-padded to 6 digits). The identifier is a stable string,
     not a filesystem path — no PNG file is implied.
+
+    The optional ``segmentations`` mapping carries one or more post-hoc
+    :class:`Segmentation` objects keyed by operator-chosen name (e.g.
+    ``"cup_lift"``). Downstream analysis code that expects rep-level
+    windows looks here. The field defaults to empty, so inference output
+    written before segmentation round-trips through this schema unchanged.
     """
 
     model_config = ConfigDict(extra="forbid", frozen=True)
 
     metadata: VideoMetadata
     frames: dict[str, FramePrediction]
+    segmentations: dict[str, Segmentation] = Field(default_factory=dict)
 
     def frame_names(self) -> list[str]:
         """Return frame identifiers in insertion order."""
@@ -195,6 +579,19 @@ def save_job_results(path: Path, results: JobResults) -> None:
     _write_json_atomic(path, results.model_dump(mode="json"))
 
 
+def load_benchmark_result(path: Path) -> BenchmarkResult:
+    """Load and validate a benchmark-result JSON file."""
+    with path.open("r", encoding="utf-8") as f:
+        data: Any = json.load(f)
+    return BenchmarkResult.model_validate(data)
+
+
+def save_benchmark_result(path: Path, result: BenchmarkResult) -> None:
+    """Serialize a :class:`BenchmarkResult` to a JSON file atomically."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    _write_json_atomic(path, result.model_dump(mode="json"))
+
+
 def load_status(path: Path) -> StatusFile:
     """Load the persistent job status file.
 
diff --git a/tests/integration/test_joint_names_drift.py b/tests/integration/test_joint_names_drift.py
new file mode 100644
index 0000000..30df5ba
--- /dev/null
+++ b/tests/integration/test_joint_names_drift.py
@@ -0,0 +1,60 @@
+"""Drift test for the hardcoded berkeley_mhad_43 joint-name constant.
+
+:mod:`neuropose.analyzer.segment` ships ``JOINT_NAMES`` as a frozen
+tuple of 43 strings so that post-processing callers can resolve
+``"rwri"`` → index without loading a multi-gigabyte TensorFlow model.
+That is only safe while the hardcoded tuple actually matches what the
+pinned MeTRAbs SavedModel reports.
+
+This test loads the real model via :func:`neuropose._model.load_metrabs_model`
+and asserts that ``JOINT_NAMES`` is byte-identical to
+``model.per_skeleton_joint_names["berkeley_mhad_43"].numpy().astype(str)``.
+If MeTRAbs ever ships a new skeleton under the same name — or if we bump
+the model pin to one whose ``berkeley_mhad_43`` skeleton is spelled
+differently — this test is the drift detector.
+
+Like every test under ``tests/integration/`` the file is marked
+``@pytest.mark.slow`` and only runs under ``pytest --runslow``.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from neuropose._model import load_metrabs_model
+from neuropose.analyzer.segment import JOINT_NAMES
+
+pytestmark = pytest.mark.slow
+
+
+@pytest.fixture(scope="module")
+def metrabs_model_cache_dir(tmp_path_factory: pytest.TempPathFactory) -> Path:
+    """Module-scoped cache dir so the model downloads at most once per run.
+
+    Function scope would re-download on every test; session scope would
+    collide with the estimator smoke-test cache. Module scope is the
+    right middle ground for a file that only needs the model loaded
+    once.
+    """
+    return tmp_path_factory.mktemp("neuropose_joint_names_model_cache")
+
+
+def test_joint_names_match_pinned_model(metrabs_model_cache_dir: Path) -> None:
+    """Hardcoded ``JOINT_NAMES`` must match the loaded MeTRAbs skeleton.
+
+    If this fails, the expected fix is:
+
+    1. Update :data:`neuropose.analyzer.segment.JOINT_NAMES` in the same
+       commit that bumps the model pin in :mod:`neuropose._model`.
+    2. Cross-check any CLI or docs that embed hardcoded joint names.
+    """
+    model = load_metrabs_model(cache_dir=metrabs_model_cache_dir)
+    tensor = model.per_skeleton_joint_names["berkeley_mhad_43"]
+    model_names = tuple(tensor.numpy().astype(str).tolist())
+    assert model_names == JOINT_NAMES, (
+        "JOINT_NAMES drift detected — the hardcoded tuple in "
+        "neuropose.analyzer.segment no longer matches the MeTRAbs model. "
+        f"Model reports: {model_names}"
+    )
diff --git a/tests/unit/test_analyzer_segment.py b/tests/unit/test_analyzer_segment.py
new file mode 100644
index 0000000..76078f5
--- /dev/null
+++ b/tests/unit/test_analyzer_segment.py
@@ -0,0 +1,431 @@
+"""Tests for :mod:`neuropose.analyzer.segment`.
+
+Three layers of coverage:
+
+- **Layer 1** (:func:`segment_by_peaks`) against synthetic 1D signals
+  with known peaks and valleys.
+- **Layer 2** (:func:`segment_predictions`) against synthetic
+  :class:`VideoPredictions` fixtures exercising every extractor variant.
+- **Slicing** (:func:`slice_predictions`) — per-rep round-trip and the
+  metadata rewrite.
+
+The extractor factories and the discriminated :class:`ExtractorSpec`
+union are covered incidentally through Layer 2 (which is the only
+layer that cares about the spec shape) plus a handful of targeted
+schema tests for the validators (distinct joint indices etc.).
+"""
+
+from __future__ import annotations
+
+import itertools
+import math
+
+import numpy as np
+import pytest
+
+from neuropose.analyzer.segment import (
+    JOINT_INDEX,
+    JOINT_NAMES,
+    extract_signal,
+    joint_angle,
+    joint_axis,
+    joint_index,
+    joint_pair_distance,
+    joint_speed,
+    segment_by_peaks,
+    segment_predictions,
+    slice_predictions,
+)
+from neuropose.io import (
+    JointAngleExtractor,
+    JointAxisExtractor,
+    JointPairDistanceExtractor,
+    JointSpeedExtractor,
+    Segment,
+    Segmentation,
+    VideoPredictions,
+)
+
+NUM_JOINTS = 43
+
+
+def _triple_hump_signal(num_frames: int = 300) -> np.ndarray:
+    """Three non-negative sine humps separated by clear zero-valleys."""
+    t = np.linspace(0.0, 6.0 * math.pi, num_frames)
+    return np.maximum(0.0, np.sin(t)) ** 2
+
+
+def _make_predictions(
+    signal: np.ndarray,
+    joint: int,
+    *,
+    axis: int = 1,
+    fps: float = 30.0,
+) -> VideoPredictions:
+    """Build a VideoPredictions whose ``joint``'s ``axis`` follows ``signal``."""
+    frames = {}
+    for i, value in enumerate(signal):
+        poses = [[[0.0, 0.0, 0.0] for _ in range(NUM_JOINTS)]]
+        poses[0][joint][axis] = float(value)
+        frames[f"frame_{i:06d}"] = {
+            "boxes": [[0.0, 0.0, 1.0, 1.0, 0.9]],
+            "poses3d": poses,
+            "poses2d": [[[0.0, 0.0]] * NUM_JOINTS],
+        }
+    return VideoPredictions.model_validate(
+        {
+            "metadata": {
+                "frame_count": len(signal),
+                "fps": fps,
+                "width": 640,
+                "height": 480,
+            },
+            "frames": frames,
+        }
+    )
+
+
+# ---------------------------------------------------------------------------
+# JOINT_NAMES / JOINT_INDEX / joint_index()
+# ---------------------------------------------------------------------------
+
+
+class TestJointNames:
+    def test_tuple_length_is_43(self) -> None:
+        assert len(JOINT_NAMES) == 43
+
+    def test_index_matches_position(self) -> None:
+        for idx, name in enumerate(JOINT_NAMES):
+            assert JOINT_INDEX[name] == idx
+
+    def test_joint_index_by_name(self) -> None:
+        assert joint_index("lwri") == JOINT_NAMES.index("lwri")
+        assert joint_index("rwri") == JOINT_NAMES.index("rwri")
+
+    def test_joint_index_unknown_name(self) -> None:
+        with pytest.raises(KeyError, match="unknown joint name"):
+            joint_index("elbow")  # deliberately wrong spelling
+
+
+# ---------------------------------------------------------------------------
+# Factory shortcuts
+# ---------------------------------------------------------------------------
+
+
+class TestFactories:
+    def test_joint_axis_factory(self) -> None:
+        spec = joint_axis(JOINT_INDEX["lwri"], 1, invert=True)
+        assert isinstance(spec, JointAxisExtractor)
+        assert spec.kind == "joint_axis"
+        assert spec.joint == JOINT_INDEX["lwri"]
+        assert spec.axis == 1
+        assert spec.invert is True
+
+    def test_joint_pair_distance_factory(self) -> None:
+        spec = joint_pair_distance(JOINT_INDEX["lwri"], JOINT_INDEX["rwri"])
+        assert isinstance(spec, JointPairDistanceExtractor)
+        assert spec.joints == (JOINT_INDEX["lwri"], JOINT_INDEX["rwri"])
+
+    def test_joint_pair_distance_rejects_same_joint(self) -> None:
+        from pydantic import ValidationError
+
+        with pytest.raises(ValidationError, match="distinct"):
+            joint_pair_distance(5, 5)
+
+    def test_joint_speed_factory(self) -> None:
+        spec = joint_speed(JOINT_INDEX["rwri"])
+        assert isinstance(spec, JointSpeedExtractor)
+        assert spec.joint == JOINT_INDEX["rwri"]
+
+    def test_joint_angle_factory(self) -> None:
+        spec = joint_angle(
+            JOINT_INDEX["larm"],
+            JOINT_INDEX["lelb"],
+            JOINT_INDEX["lwri"],
+        )
+        assert isinstance(spec, JointAngleExtractor)
+        assert spec.triplet == (
+            JOINT_INDEX["larm"],
+            JOINT_INDEX["lelb"],
+            JOINT_INDEX["lwri"],
+        )
+
+
+# ---------------------------------------------------------------------------
+# extract_signal: one test per extractor variant
+# ---------------------------------------------------------------------------
+
+
+class TestExtractSignal:
+    def test_joint_axis_selects_axis(self) -> None:
+        seq = np.zeros((4, NUM_JOINTS, 3))
+        seq[:, 10, 1] = [1.0, 2.0, 3.0, 4.0]
+        signal = extract_signal(seq, joint_axis(10, 1))
+        np.testing.assert_array_equal(signal, [1.0, 2.0, 3.0, 4.0])
+
+    def test_joint_axis_invert(self) -> None:
+        seq = np.zeros((3, NUM_JOINTS, 3))
+        seq[:, 0, 0] = [1.0, 2.0, 3.0]
+        signal = extract_signal(seq, joint_axis(0, 0, invert=True))
+        np.testing.assert_array_equal(signal, [-1.0, -2.0, -3.0])
+
+    def test_joint_pair_distance(self) -> None:
+        seq = np.zeros((3, NUM_JOINTS, 3))
+        seq[:, 0, 0] = [0.0, 0.0, 0.0]
+        seq[:, 1, 0] = [3.0, 6.0, 9.0]  # distances 3, 6, 9 along x
+        signal = extract_signal(seq, joint_pair_distance(0, 1))
+        np.testing.assert_allclose(signal, [3.0, 6.0, 9.0])
+
+    def test_joint_speed_pads_first_frame_with_zero(self) -> None:
+        seq = np.zeros((4, NUM_JOINTS, 3))
+        seq[:, 5, 0] = [0.0, 1.0, 3.0, 6.0]  # speeds: 1, 2, 3
+        signal = extract_signal(seq, joint_speed(5))
+        np.testing.assert_allclose(signal, [0.0, 1.0, 2.0, 3.0])
+
+    def test_joint_speed_single_frame_rejected(self) -> None:
+        seq = np.zeros((1, NUM_JOINTS, 3))
+        with pytest.raises(ValueError, match="at least two frames"):
+            extract_signal(seq, joint_speed(0))
+
+    def test_joint_angle_straight(self) -> None:
+        seq = np.zeros((2, NUM_JOINTS, 3))
+        # Straight line: a=(-1,0,0), b=(0,0,0), c=(1,0,0). Angle = pi.
+        seq[:, 0, 0] = [-1.0, -1.0]
+        seq[:, 1, 0] = [0.0, 0.0]
+        seq[:, 2, 0] = [1.0, 1.0]
+        signal = extract_signal(seq, joint_angle(0, 1, 2))
+        np.testing.assert_allclose(signal, [math.pi, math.pi])
+
+    def test_joint_angle_right(self) -> None:
+        seq = np.zeros((1, NUM_JOINTS, 3))
+        seq[0, 0] = [1.0, 0.0, 0.0]
+        seq[0, 1] = [0.0, 0.0, 0.0]
+        seq[0, 2] = [0.0, 1.0, 0.0]
+        signal = extract_signal(seq, joint_angle(0, 1, 2))
+        np.testing.assert_allclose(signal, [math.pi / 2])
+
+    def test_out_of_range_joint_index(self) -> None:
+        seq = np.zeros((3, NUM_JOINTS, 3))
+        with pytest.raises(ValueError, match="out of range"):
+            extract_signal(seq, joint_axis(999, 0))
+
+    def test_bad_sequence_shape(self) -> None:
+        with pytest.raises(ValueError, match="frames, joints, 3"):
+            extract_signal(np.zeros((5, 10)), joint_axis(0, 0))
+
+
+# ---------------------------------------------------------------------------
+# Layer 1: segment_by_peaks
+# ---------------------------------------------------------------------------
+
+
+class TestSegmentByPeaks:
+    def test_three_humps_three_segments(self) -> None:
+        signal = _triple_hump_signal()
+        segs = segment_by_peaks(signal, min_prominence=0.1)
+        assert len(segs) == 3
+        # Segments should not overlap (in this synthetic case). Adjacent
+        # segments are allowed to share the valley frame on their boundary,
+        # so we use a ``>=`` comparison with one frame of slack.
+        for prev, curr in itertools.pairwise(segs):
+            assert curr.start >= prev.end - 1
+
+    def test_first_segment_starts_at_zero_without_leading_valley(self) -> None:
+        signal = _triple_hump_signal()
+        segs = segment_by_peaks(signal, min_prominence=0.1)
+        assert segs[0].start == 0
+
+    def test_last_segment_ends_at_signal_length(self) -> None:
+        signal = _triple_hump_signal()
+        segs = segment_by_peaks(signal, min_prominence=0.1)
+        assert segs[-1].end == len(signal)
+
+    def test_peaks_lie_inside_segment(self) -> None:
+        signal = _triple_hump_signal()
+        segs = segment_by_peaks(signal, min_prominence=0.1)
+        for seg in segs:
+            assert seg.start <= seg.peak < seg.end
+
+    def test_no_peaks_returns_empty(self) -> None:
+        flat = np.zeros(50)
+        segs = segment_by_peaks(flat)
+        assert segs == []
+
+    def test_min_distance_suppresses_close_peaks(self) -> None:
+        # A signal with two very close peaks should give only one segment
+        # when min_distance is large.
+        signal = np.zeros(100)
+        signal[20] = 1.0
+        signal[25] = 1.0
+        signal[70] = 1.0
+        segs = segment_by_peaks(signal, min_distance=30)
+        # Exact count depends on scipy's tie-breaking; main assertion is
+        # "fewer than if no distance constraint".
+        segs_unconstrained = segment_by_peaks(signal)
+        assert len(segs) < len(segs_unconstrained)
+
+    def test_pad_extends_segment(self) -> None:
+        signal = _triple_hump_signal()
+        base = segment_by_peaks(signal, min_prominence=0.1)
+        padded = segment_by_peaks(signal, min_prominence=0.1, pad=5)
+        assert len(base) == len(padded)
+        for b, p in zip(base, padded, strict=True):
+            assert p.start <= b.start
+            assert p.end >= b.end
+
+    def test_pad_is_clamped_to_bounds(self) -> None:
+        signal = _triple_hump_signal()
+        padded = segment_by_peaks(signal, min_prominence=0.1, pad=10_000)
+        for seg in padded:
+            assert seg.start >= 0
+            assert seg.end <= len(signal)
+
+    def test_negative_pad_rejected(self) -> None:
+        with pytest.raises(ValueError, match="pad"):
+            segment_by_peaks(np.zeros(10), pad=-1)
+
+    def test_rejects_non_1d(self) -> None:
+        with pytest.raises(ValueError, match="1D"):
+            segment_by_peaks(np.zeros((5, 5)))
+
+
+# ---------------------------------------------------------------------------
+# Layer 2: segment_predictions
+# ---------------------------------------------------------------------------
+
+
+class TestSegmentPredictions:
+    def test_returns_segmentation_with_config(self) -> None:
+        signal = _triple_hump_signal() * 1000.0  # mm scale
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"])
+        result = segment_predictions(
+            preds,
+            joint_axis(JOINT_INDEX["lwri"], 1),
+            min_prominence=50.0,
+        )
+        assert isinstance(result, Segmentation)
+        assert len(result.segments) == 3
+        assert result.config.extractor.kind == "joint_axis"
+        assert result.config.min_prominence == 50.0
+        assert result.config.method == "valley_to_valley_v1"
+
+    def test_min_distance_seconds_converts_via_fps(self) -> None:
+        # 300 frames at 30 fps = 10 seconds; humps are ~3.3 s apart.
+        signal = _triple_hump_signal() * 1000.0
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"], fps=30.0)
+        # A 5-second minimum distance should collapse the three humps
+        # into at most two segments.
+        result = segment_predictions(
+            preds,
+            joint_axis(JOINT_INDEX["lwri"], 1),
+            min_prominence=50.0,
+            min_distance_seconds=5.0,
+        )
+        assert len(result.segments) <= 2
+
+    def test_pad_seconds_extends_segments(self) -> None:
+        signal = _triple_hump_signal() * 1000.0
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"], fps=30.0)
+        plain = segment_predictions(preds, joint_axis(JOINT_INDEX["lwri"], 1), min_prominence=50.0)
+        padded = segment_predictions(
+            preds,
+            joint_axis(JOINT_INDEX["lwri"], 1),
+            min_prominence=50.0,
+            pad_seconds=0.2,  # ~6 frames at 30 fps
+        )
+        # At least one segment should have moved outward.
+        assert any(
+            p.start < b.start or p.end > b.end
+            for p, b in zip(padded.segments, plain.segments, strict=True)
+        )
+
+    def test_requires_fps_when_time_params_used(self) -> None:
+        signal = _triple_hump_signal() * 1000.0
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"], fps=0.0)
+        with pytest.raises(ValueError, match="fps"):
+            segment_predictions(
+                preds,
+                joint_axis(JOINT_INDEX["lwri"], 1),
+                min_prominence=50.0,
+                min_distance_seconds=1.0,
+            )
+
+    def test_no_fps_is_fine_without_time_params(self) -> None:
+        signal = _triple_hump_signal() * 1000.0
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"], fps=0.0)
+        # Without any time-based parameters we never need to multiply by
+        # fps, so fps=0 is tolerated.
+        result = segment_predictions(
+            preds,
+            joint_axis(JOINT_INDEX["lwri"], 1),
+            min_prominence=50.0,
+        )
+        assert len(result.segments) == 3
+
+    def test_config_roundtrips_through_json(self) -> None:
+        signal = _triple_hump_signal() * 1000.0
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"])
+        result = segment_predictions(
+            preds,
+            joint_pair_distance(JOINT_INDEX["lwri"], JOINT_INDEX["rwri"]),
+            min_prominence=10.0,
+        )
+        serialized = result.model_dump(mode="json")
+        rehydrated = Segmentation.model_validate(serialized)
+        assert rehydrated == result
+
+
+# ---------------------------------------------------------------------------
+# slice_predictions
+# ---------------------------------------------------------------------------
+
+
+class TestSlicePredictions:
+    def test_one_output_per_segment(self) -> None:
+        signal = _triple_hump_signal() * 1000.0
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"])
+        result = segment_predictions(preds, joint_axis(JOINT_INDEX["lwri"], 1), min_prominence=50.0)
+        slices = slice_predictions(preds, result.segments)
+        assert len(slices) == len(result.segments)
+
+    def test_metadata_frame_count_matches_segment_length(self) -> None:
+        signal = _triple_hump_signal() * 1000.0
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"])
+        segments = [Segment(start=10, end=30, peak=20), Segment(start=50, end=90, peak=75)]
+        slices = slice_predictions(preds, segments)
+        assert slices[0].metadata.frame_count == 20
+        assert slices[1].metadata.frame_count == 40
+
+    def test_frames_are_rekeyed_from_zero(self) -> None:
+        signal = _triple_hump_signal() * 1000.0
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"])
+        segments = [Segment(start=100, end=110, peak=105)]
+        sliced = slice_predictions(preds, segments)[0]
+        assert sliced.frame_names()[0] == "frame_000000"
+        assert sliced.frame_names()[-1] == "frame_000009"
+
+    def test_sliced_segmentations_field_is_empty(self) -> None:
+        # Parent has a segmentation attached; sliced copies intentionally
+        # drop it because segment indices are only meaningful in the
+        # parent's timeline.
+        signal = _triple_hump_signal() * 1000.0
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"])
+        result = segment_predictions(preds, joint_axis(JOINT_INDEX["lwri"], 1), min_prominence=50.0)
+        parent = preds.model_copy(update={"segmentations": {"cup_lift": result}})
+        slices = slice_predictions(parent, result.segments)
+        assert all(s.segmentations == {} for s in slices)
+
+    def test_out_of_bounds_segment_rejected(self) -> None:
+        signal = _triple_hump_signal(num_frames=50) * 1000.0
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"])
+        bad = [Segment(start=0, end=100, peak=25)]  # end > 50
+        with pytest.raises(ValueError, match="exceeds frame count"):
+            slice_predictions(preds, bad)
+
+    def test_slice_preserves_pose_values(self) -> None:
+        signal = _triple_hump_signal() * 1000.0
+        preds = _make_predictions(signal, joint=JOINT_INDEX["lwri"])
+        segments = [Segment(start=50, end=55, peak=52)]
+        sliced = slice_predictions(preds, segments)[0]
+        # frame_000000 of the slice must equal frame_000050 of the source
+        assert sliced["frame_000000"].poses3d == preds["frame_000050"].poses3d
diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py
new file mode 100644
index 0000000..aeb4fe8
--- /dev/null
+++ b/tests/unit/test_benchmark.py
@@ -0,0 +1,314 @@
+"""Tests for :mod:`neuropose.benchmark`.
+
+Coverage:
+
+- :func:`run_benchmark` against a fake-model :class:`Estimator` —
+  repeats, warmup-pass discarding, capture-reference, edge cases.
+- Aggregate statistics (mean / p50 / p95 / p99 / throughput / peak RSS)
+  on synthetic metrics.
+- :func:`compute_poses3d_divergence` on matched and mismatched
+  prediction pairs.
+- :func:`format_benchmark_report` rendering.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from neuropose.benchmark import (
+    compute_poses3d_divergence,
+    format_benchmark_report,
+    run_benchmark,
+)
+from neuropose.estimator import Estimator
+from neuropose.io import (
+    BenchmarkResult,
+    PerformanceMetrics,
+    VideoPredictions,
+)
+
+
+class TestRunBenchmark:
+    def test_returns_benchmark_result(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
+        assert isinstance(outcome.result, BenchmarkResult)
+        assert outcome.result.video_name == synthetic_video.name
+
+    def test_repeats_reflected_in_result(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        outcome = run_benchmark(estimator, synthetic_video, repeats=4, warmup_frames=0)
+        assert outcome.result.repeats == 4
+        # One pass is always discarded as warmup; the rest are measured.
+        assert len(outcome.result.measured_passes) == 3
+        assert outcome.result.aggregate.repeats_measured == 3
+
+    def test_first_pass_is_the_warmup(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
+        assert outcome.result.warmup_pass is not None
+        # warmup_pass is a separate object from the measured passes.
+        assert outcome.result.warmup_pass not in outcome.result.measured_passes
+
+    def test_capture_reference_off_by_default(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
+        assert outcome.reference_predictions is None
+
+    def test_capture_reference_returns_last_measured_pass(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        outcome = run_benchmark(
+            estimator,
+            synthetic_video,
+            repeats=3,
+            warmup_frames=0,
+            capture_reference=True,
+        )
+        assert isinstance(outcome.reference_predictions, VideoPredictions)
+        assert len(outcome.reference_predictions) == 5
+
+    def test_rejects_repeats_below_two(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        with pytest.raises(ValueError, match="repeats"):
+            run_benchmark(estimator, synthetic_video, repeats=1, warmup_frames=0)
+
+    def test_rejects_negative_warmup_frames(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        with pytest.raises(ValueError, match="warmup_frames"):
+            run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=-1)
+
+    def test_warmup_frames_filters_aggregate_head(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        # synthetic_video has 5 frames; warmup_frames=4 leaves only
+        # the last frame of each measured pass to contribute.
+        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=4)
+        # The stats should still compute, not error.
+        assert outcome.result.aggregate.mean_frame_latency_ms >= 0.0
+
+    def test_warmup_frames_exceeding_pass_length_gives_zero_latencies(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        # 5-frame video, warmup_frames=10 → every frame discarded.
+        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=10)
+        # Latency aggregate drops to zero; throughput is still
+        # computed from total wall clock.
+        assert outcome.result.aggregate.mean_frame_latency_ms == 0.0
+        assert outcome.result.aggregate.p95_frame_latency_ms == 0.0
+
+
+class TestBenchmarkAggregate:
+    def test_percentiles_ordered_correctly(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        outcome = run_benchmark(estimator, synthetic_video, repeats=4, warmup_frames=0)
+        agg = outcome.result.aggregate
+        assert agg.p50_frame_latency_ms <= agg.p95_frame_latency_ms
+        assert agg.p95_frame_latency_ms <= agg.p99_frame_latency_ms
+
+    def test_throughput_non_negative(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
+        assert outcome.result.aggregate.mean_throughput_fps > 0.0
+
+    def test_peak_rss_matches_max_across_passes(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
+        individual = [p.peak_rss_mb for p in outcome.result.measured_passes]
+        assert outcome.result.aggregate.peak_rss_mb_max == max(individual)
+
+
+class TestComputePoses3dDivergence:
+    def _make_predictions(self, poses3d_values: list[float]) -> VideoPredictions:
+        """Build a 3-frame VideoPredictions whose single joint has the given y-values."""
+        frames: dict[str, dict] = {}
+        for i, value in enumerate(poses3d_values):
+            frames[f"frame_{i:06d}"] = {
+                "boxes": [[0.0, 0.0, 1.0, 1.0, 0.9]],
+                "poses3d": [[[0.0, float(value), 0.0]]],
+                "poses2d": [[[0.0, 0.0]]],
+            }
+        return VideoPredictions.model_validate(
+            {
+                "metadata": {
+                    "frame_count": len(poses3d_values),
+                    "fps": 30.0,
+                    "width": 64,
+                    "height": 64,
+                },
+                "frames": frames,
+            }
+        )
+
+    def test_identical_predictions_zero_divergence(self) -> None:
+        a = self._make_predictions([1.0, 2.0, 3.0])
+        b = self._make_predictions([1.0, 2.0, 3.0])
+        max_diff, compared = compute_poses3d_divergence(a, b)
+        assert max_diff == 0.0
+        assert compared == 3
+
+    def test_divergence_reports_max_abs_diff(self) -> None:
+        a = self._make_predictions([1.0, 2.0, 3.0])
+        b = self._make_predictions([1.0, 2.1, 2.5])  # diffs 0, 0.1, 0.5
+        max_diff, compared = compute_poses3d_divergence(a, b)
+        assert max_diff == pytest.approx(0.5)
+        assert compared == 3
+
+    def test_mismatched_frame_count_raises(self) -> None:
+        a = self._make_predictions([1.0, 2.0])
+        b = self._make_predictions([1.0, 2.0, 3.0])
+        with pytest.raises(ValueError, match="frame count"):
+            compute_poses3d_divergence(a, b)
+
+    def test_mismatched_detection_count_skipped_not_raised(self) -> None:
+        a = self._make_predictions([1.0, 2.0])
+        # Frame 1 has zero detections instead of one.
+        b_payload = {
+            "metadata": {
+                "frame_count": 2,
+                "fps": 30.0,
+                "width": 64,
+                "height": 64,
+            },
+            "frames": {
+                "frame_000000": {
+                    "boxes": [[0, 0, 1, 1, 0.9]],
+                    "poses3d": [[[0.0, 1.0, 0.0]]],
+                    "poses2d": [[[0.0, 0.0]]],
+                },
+                "frame_000001": {
+                    "boxes": [],
+                    "poses3d": [],
+                    "poses2d": [],
+                },
+            },
+        }
+        b = VideoPredictions.model_validate(b_payload)
+        max_diff, compared = compute_poses3d_divergence(a, b)
+        # First frame contributes 0 divergence; second frame is skipped.
+        assert max_diff == 0.0
+        assert compared == 1
+
+
+class TestFormatBenchmarkReport:
+    def _fake_metrics(self, latencies_ms: list[float]) -> PerformanceMetrics:
+        return PerformanceMetrics(
+            model_load_seconds=12.3,
+            total_seconds=sum(latencies_ms) / 1000.0 + 0.01,
+            per_frame_latencies_ms=latencies_ms,
+            peak_rss_mb=512.0,
+            active_device="/CPU:0",
+            tensorflow_metal_active=False,
+            tensorflow_version="2.21.0",
+        )
+
+    def test_report_mentions_key_fields(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
+        text = format_benchmark_report(outcome.result)
+        assert "Benchmark:" in text
+        assert "device:" in text
+        assert "mean:" in text
+        assert "p50:" in text
+        assert "p95:" in text
+        assert "p99:" in text
+        assert "Throughput:" in text
+        assert "Peak RSS:" in text
+
+    def test_report_mentions_metal_when_active(self) -> None:
+        from neuropose.io import BenchmarkAggregate, BenchmarkResult
+
+        metrics = PerformanceMetrics(
+            model_load_seconds=None,
+            total_seconds=1.0,
+            per_frame_latencies_ms=[10.0, 11.0],
+            peak_rss_mb=100.0,
+            active_device="/GPU:0",
+            tensorflow_metal_active=True,
+            tensorflow_version="2.21.0",
+        )
+        agg = BenchmarkAggregate(
+            repeats_measured=1,
+            warmup_frames_per_pass=0,
+            mean_frame_latency_ms=10.5,
+            p50_frame_latency_ms=10.5,
+            p95_frame_latency_ms=11.0,
+            p99_frame_latency_ms=11.0,
+            stddev_frame_latency_ms=0.5,
+            mean_throughput_fps=95.0,
+            peak_rss_mb_max=100.0,
+            active_device="/GPU:0",
+            tensorflow_metal_active=True,
+            tensorflow_version="2.21.0",
+        )
+        result = BenchmarkResult(
+            video_name="test.mp4",
+            repeats=2,
+            warmup_frames=0,
+            warmup_pass=metrics,
+            measured_passes=[metrics],
+            aggregate=agg,
+        )
+        text = format_benchmark_report(result)
+        assert "tensorflow-metal" in text
+
+    def test_report_model_load_injected_label(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
+        text = format_benchmark_report(outcome.result)
+        assert "injected" in text  # fake model was not loaded via load_model
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
index a2ccce6..324de4b 100644
--- a/tests/unit/test_cli.py
+++ b/tests/unit/test_cli.py
@@ -14,8 +14,10 @@ robust.
 
 from __future__ import annotations
 
+import math
 from pathlib import Path
 
+import numpy as np
 import pytest
 import yaml
 from typer.testing import CliRunner
@@ -28,6 +30,15 @@ from neuropose.cli import (
     EXIT_USAGE,
     app,
 )
+from neuropose.io import (
+    JobResults,
+    VideoPredictions,
+    load_benchmark_result,
+    load_job_results,
+    load_video_predictions,
+    save_job_results,
+    save_video_predictions,
+)
 
 
 @pytest.fixture
@@ -57,9 +68,7 @@ def stub_metrabs_loader(monkeypatch: pytest.MonkeyPatch) -> None:
 
     def fake_loader(cache_dir: Path | None = None) -> object:
         del cache_dir
-        raise NotImplementedError(
-            "pending commit 11: MeTRAbs loader stubbed for unit testing"
-        )
+        raise NotImplementedError("pending commit 11: MeTRAbs loader stubbed for unit testing")
 
     monkeypatch.setattr("neuropose.estimator.load_metrabs_model", fake_loader)
 
@@ -83,6 +92,8 @@ class TestTopLevelOptions:
         assert result.exit_code in (EXIT_OK, EXIT_USAGE)
         assert "watch" in result.output
         assert "process" in result.output
+        assert "segment" in result.output
+        assert "benchmark" in result.output
         assert "analyze" in result.output
 
     def test_help_flag(self, runner: CliRunner) -> None:
@@ -91,7 +102,7 @@ class TestTopLevelOptions:
         assert "NeuroPose" in result.output
 
     def test_subcommand_help(self, runner: CliRunner) -> None:
-        for subcommand in ("watch", "process", "analyze"):
+        for subcommand in ("watch", "process", "segment", "benchmark", "analyze"):
             result = runner.invoke(app, [subcommand, "--help"])
             assert result.exit_code == EXIT_OK, f"{subcommand} --help failed"
 
@@ -204,6 +215,393 @@ class TestProcess:
         assert "commit 11" in result.output
 
 
+# ---------------------------------------------------------------------------
+# segment
+# ---------------------------------------------------------------------------
+
+
+_NUM_JOINTS = 43
+
+
+def _triple_hump_predictions(joint_name: str = "lwri") -> VideoPredictions:
+    """Build a 300-frame synthetic VideoPredictions with three clear humps."""
+    from neuropose.analyzer.segment import JOINT_INDEX
+
+    joint = JOINT_INDEX[joint_name]
+    t = np.linspace(0.0, 6.0 * math.pi, 300)
+    signal = np.maximum(0.0, np.sin(t)) ** 2 * 1000.0
+
+    frames = {}
+    for i, value in enumerate(signal):
+        poses = [[[0.0, 0.0, 0.0] for _ in range(_NUM_JOINTS)]]
+        poses[0][joint][1] = float(value)
+        frames[f"frame_{i:06d}"] = {
+            "boxes": [[0.0, 0.0, 1.0, 1.0, 0.9]],
+            "poses3d": poses,
+            "poses2d": [[[0.0, 0.0]] * _NUM_JOINTS],
+        }
+    return VideoPredictions.model_validate(
+        {
+            "metadata": {
+                "frame_count": 300,
+                "fps": 30.0,
+                "width": 640,
+                "height": 480,
+            },
+            "frames": frames,
+        }
+    )
+
+
+class TestSegmentSubcommand:
+    def test_segment_job_results_in_place(self, runner: CliRunner, tmp_path: Path) -> None:
+        path = tmp_path / "results.json"
+        save_job_results(
+            path,
+            JobResults(root={"trial_01.mp4": _triple_hump_predictions()}),
+        )
+
+        result = runner.invoke(
+            app,
+            [
+                "segment",
+                str(path),
+                "--name",
+                "cup_lift",
+                "--extractor",
+                "joint_axis",
+                "--joint",
+                "lwri",
+                "--axis",
+                "1",
+                "--min-prominence",
+                "50",
+            ],
+        )
+
+        assert result.exit_code == EXIT_OK, result.output
+        loaded = load_job_results(path)
+        vp = loaded["trial_01.mp4"]
+        assert "cup_lift" in vp.segmentations
+        assert len(vp.segmentations["cup_lift"].segments) == 3
+
+    def test_segment_single_predictions_file(self, runner: CliRunner, tmp_path: Path) -> None:
+        path = tmp_path / "video_predictions.json"
+        save_video_predictions(path, _triple_hump_predictions())
+
+        result = runner.invoke(
+            app,
+            [
+                "segment",
+                str(path),
+                "--name",
+                "cup_lift",
+                "--extractor",
+                "joint_pair_distance",
+                "--joints",
+                "lwri,rwri",
+                "--min-prominence",
+                "50",
+            ],
+        )
+        # With both wrists at origin except for lwri's y-coordinate, the
+        # pair distance is exactly the lwri.y signal, so the three humps
+        # are detected the same way as the joint_axis case.
+        assert result.exit_code == EXIT_OK, result.output
+        loaded = load_video_predictions(path)
+        assert "cup_lift" in loaded.segmentations
+        assert len(loaded.segmentations["cup_lift"].segments) == 3
+
+    def test_segment_writes_to_output_option(self, runner: CliRunner, tmp_path: Path) -> None:
+        src = tmp_path / "src.json"
+        dst = tmp_path / "dst.json"
+        save_job_results(src, JobResults(root={"trial_01.mp4": _triple_hump_predictions()}))
+
+        result = runner.invoke(
+            app,
+            [
+                "segment",
+                str(src),
+                "--name",
+                "cup_lift",
+                "--extractor",
+                "joint_axis",
+                "--joint",
+                "15",  # integer form
+                "--axis",
+                "1",
+                "--min-prominence",
+                "50",
+                "--output",
+                str(dst),
+            ],
+        )
+
+        assert result.exit_code == EXIT_OK, result.output
+        assert dst.exists()
+        # Source must be left untouched when --output is given.
+        src_loaded = load_job_results(src)
+        assert src_loaded["trial_01.mp4"].segmentations == {}
+        dst_loaded = load_job_results(dst)
+        assert "cup_lift" in dst_loaded["trial_01.mp4"].segmentations
+
+    def test_segment_rejects_collision_without_force(
+        self, runner: CliRunner, tmp_path: Path
+    ) -> None:
+        path = tmp_path / "results.json"
+        save_job_results(path, JobResults(root={"trial_01.mp4": _triple_hump_predictions()}))
+        base_args = [
+            "segment",
+            str(path),
+            "--name",
+            "cup_lift",
+            "--extractor",
+            "joint_axis",
+            "--joint",
+            "lwri",
+            "--axis",
+            "1",
+            "--min-prominence",
+            "50",
+        ]
+
+        first = runner.invoke(app, base_args)
+        assert first.exit_code == EXIT_OK, first.output
+
+        collision = runner.invoke(app, base_args)
+        assert collision.exit_code == EXIT_USAGE
+        assert "force" in collision.output.lower()
+
+    def test_segment_force_overwrites(self, runner: CliRunner, tmp_path: Path) -> None:
+        path = tmp_path / "results.json"
+        save_job_results(path, JobResults(root={"trial_01.mp4": _triple_hump_predictions()}))
+        args = [
+            "segment",
+            str(path),
+            "--name",
+            "cup_lift",
+            "--extractor",
+            "joint_axis",
+            "--joint",
+            "lwri",
+            "--axis",
+            "1",
+            "--min-prominence",
+            "50",
+        ]
+        runner.invoke(app, args)
+        result = runner.invoke(app, [*args, "--force"])
+        assert result.exit_code == EXIT_OK, result.output
+
+    def test_segment_unknown_joint_name_is_usage_error(
+        self, runner: CliRunner, tmp_path: Path
+    ) -> None:
+        path = tmp_path / "results.json"
+        save_job_results(path, JobResults(root={"trial_01.mp4": _triple_hump_predictions()}))
+
+        result = runner.invoke(
+            app,
+            [
+                "segment",
+                str(path),
+                "--name",
+                "cup_lift",
+                "--extractor",
+                "joint_axis",
+                "--joint",
+                "elbow",  # deliberately not a valid berkeley_mhad_43 name
+                "--axis",
+                "1",
+            ],
+        )
+
+        assert result.exit_code == EXIT_USAGE
+        assert "unknown joint" in result.output.lower()
+
+    def test_segment_missing_required_flag_is_usage_error(
+        self, runner: CliRunner, tmp_path: Path
+    ) -> None:
+        path = tmp_path / "results.json"
+        save_job_results(path, JobResults(root={"trial_01.mp4": _triple_hump_predictions()}))
+
+        # joint_axis without --joint
+        result = runner.invoke(
+            app,
+            [
+                "segment",
+                str(path),
+                "--name",
+                "cup_lift",
+                "--extractor",
+                "joint_axis",
+                "--axis",
+                "1",
+            ],
+        )
+        assert result.exit_code == EXIT_USAGE
+
+    def test_segment_unreadable_file_is_usage_error(
+        self, runner: CliRunner, tmp_path: Path
+    ) -> None:
+        path = tmp_path / "garbage.json"
+        path.write_text("{ this is not valid json")
+        result = runner.invoke(
+            app,
+            [
+                "segment",
+                str(path),
+                "--name",
+                "cup_lift",
+                "--extractor",
+                "joint_axis",
+                "--joint",
+                "lwri",
+                "--axis",
+                "1",
+            ],
+        )
+        assert result.exit_code == EXIT_USAGE
+
+    def test_segment_preserves_pose_values(self, runner: CliRunner, tmp_path: Path) -> None:
+        """Segmentation only touches ``segmentations``; pose data is untouched."""
+        path = tmp_path / "results.json"
+        original = _triple_hump_predictions()
+        save_job_results(path, JobResults(root={"trial_01.mp4": original}))
+
+        result = runner.invoke(
+            app,
+            [
+                "segment",
+                str(path),
+                "--name",
+                "cup_lift",
+                "--extractor",
+                "joint_axis",
+                "--joint",
+                "lwri",
+                "--axis",
+                "1",
+                "--min-prominence",
+                "50",
+            ],
+        )
+        assert result.exit_code == EXIT_OK, result.output
+
+        loaded = load_job_results(path)
+        lv = loaded["trial_01.mp4"]
+        assert lv.frame_names() == original.frame_names()
+        for name in original.frame_names():
+            assert lv[name].poses3d == original[name].poses3d
+
+
+# ---------------------------------------------------------------------------
+# benchmark
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def stub_estimator_with_metrics(monkeypatch: pytest.MonkeyPatch):
+    """Monkeypatch the benchmark's Estimator path to use a fake model.
+
+    The CLI's ``benchmark`` subcommand instantiates an :class:`Estimator`
+    and calls ``load_model``. We replace ``load_metrabs_model`` with a
+    fake that returns a deterministic stand-in so the CLI's
+    ``load_model`` succeeds without downloading or touching TF. The
+    estimator's own metrics-collection path still runs, so the CLI
+    exercise is end-to-end except for the real model.
+    """
+    import numpy as np
+
+    class RecordingFake:
+        def detect_poses(self, image, **kwargs):
+            del image, kwargs
+            return {
+                "boxes": np.array([[0.0, 0.0, 1.0, 1.0, 0.9]]),
+                "poses3d": np.array([[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]]),
+                "poses2d": np.array([[[0.0, 0.0], [1.0, 1.0]]]),
+            }
+
+    def fake_loader(cache_dir: Path | None = None) -> object:
+        del cache_dir
+        return RecordingFake()
+
+    monkeypatch.setattr("neuropose.estimator.load_metrabs_model", fake_loader)
+
+
+class TestBenchmarkSubcommand:
+    def test_benchmark_smoke(
+        self,
+        runner: CliRunner,
+        synthetic_video: Path,
+        tmp_path: Path,
+        stub_estimator_with_metrics,
+    ) -> None:
+        del stub_estimator_with_metrics
+        output = tmp_path / "bench.json"
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                str(synthetic_video),
+                "--repeats",
+                "3",
+                "--warmup-frames",
+                "0",
+                "--output",
+                str(output),
+            ],
+        )
+        assert result.exit_code == EXIT_OK, result.output
+        # Human-readable report must hit stdout.
+        assert "Benchmark:" in result.output
+        assert "Throughput:" in result.output
+        # JSON output file must exist and validate against the schema.
+        assert output.exists()
+        loaded = load_benchmark_result(output)
+        assert loaded.video_name == synthetic_video.name
+        assert loaded.repeats == 3
+        assert len(loaded.measured_passes) == 2
+        assert loaded.cpu_comparison is None
+
+    def test_benchmark_rejects_repeats_below_two(
+        self,
+        runner: CliRunner,
+        synthetic_video: Path,
+        stub_estimator_with_metrics,
+    ) -> None:
+        del stub_estimator_with_metrics
+        result = runner.invoke(app, ["benchmark", str(synthetic_video), "--repeats", "1"])
+        # Typer's min=2 validation catches this before our code runs.
+        assert result.exit_code != EXIT_OK
+
+    def test_benchmark_missing_video_is_usage_error(
+        self,
+        runner: CliRunner,
+        tmp_path: Path,
+    ) -> None:
+        result = runner.invoke(app, ["benchmark", str(tmp_path / "nope.mp4")])
+        assert result.exit_code == EXIT_USAGE
+
+    def test_benchmark_force_cpu_and_compare_cpu_are_mutually_exclusive(
+        self,
+        runner: CliRunner,
+        synthetic_video: Path,
+        stub_estimator_with_metrics,
+    ) -> None:
+        del stub_estimator_with_metrics
+        result = runner.invoke(
+            app,
+            [
+                "benchmark",
+                str(synthetic_video),
+                "--compare-cpu",
+                "--force-cpu",
+            ],
+        )
+        assert result.exit_code == EXIT_USAGE
+        assert "mutually exclusive" in result.output
+
+
 # ---------------------------------------------------------------------------
 # analyze
 # ---------------------------------------------------------------------------
diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py
index 40c7e4e..e3c8a04 100644
--- a/tests/unit/test_estimator.py
+++ b/tests/unit/test_estimator.py
@@ -19,7 +19,7 @@ from neuropose.estimator import (
     ProcessVideoResult,
     VideoDecodeError,
 )
-from neuropose.io import FramePrediction, VideoPredictions
+from neuropose.io import FramePrediction, PerformanceMetrics, VideoPredictions
 
 
 class TestConstruction:
@@ -199,6 +199,119 @@ class TestProcessVideo:
         assert len(fov_seen) == 5
 
 
+class TestPerformanceMetrics:
+    def test_metrics_attached_to_result(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        result = estimator.process_video(synthetic_video)
+        assert isinstance(result.metrics, PerformanceMetrics)
+
+    def test_per_frame_latencies_length_matches_frames(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        result = estimator.process_video(synthetic_video)
+        assert len(result.metrics.per_frame_latencies_ms) == result.frame_count
+
+    def test_all_latencies_are_non_negative(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        result = estimator.process_video(synthetic_video)
+        assert all(v >= 0.0 for v in result.metrics.per_frame_latencies_ms)
+
+    def test_total_seconds_is_positive(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        result = estimator.process_video(synthetic_video)
+        assert result.metrics.total_seconds > 0.0
+
+    def test_peak_rss_is_positive(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        result = estimator.process_video(synthetic_video)
+        # psutil always reports at least a few MB of RSS for a running
+        # Python process; the exact number varies by platform.
+        assert result.metrics.peak_rss_mb > 0.0
+
+    def test_model_load_seconds_none_when_injected(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        result = estimator.process_video(synthetic_video)
+        assert result.metrics.model_load_seconds is None
+
+    def test_model_load_seconds_set_after_load(
+        self,
+        synthetic_video: Path,
+        monkeypatch: pytest.MonkeyPatch,
+    ) -> None:
+        """``load_model`` should set ``model_load_seconds`` on the next call.
+
+        We stub the loader to return a recording fake model, time how long
+        the estimator's ``load_model`` takes, and verify the number ends
+        up on the metrics object.
+        """
+        import numpy as np
+
+        class Recorder:
+            def detect_poses(self, image, **kwargs):
+                del image, kwargs
+                return {
+                    "boxes": np.array([[0.0, 0.0, 32.0, 32.0, 0.9]]),
+                    "poses3d": np.array([[[0.0, 0.0, 0.0]]]),
+                    "poses2d": np.array([[[0.0, 0.0]]]),
+                }
+
+        def fake_loader(cache_dir: Path | None = None) -> object:
+            del cache_dir
+            return Recorder()
+
+        monkeypatch.setattr("neuropose.estimator.load_metrabs_model", fake_loader)
+        estimator = Estimator()
+        estimator.load_model()
+        result = estimator.process_video(synthetic_video)
+        assert result.metrics.model_load_seconds is not None
+        assert result.metrics.model_load_seconds >= 0.0
+
+    def test_active_device_string_populated(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        result = estimator.process_video(synthetic_video)
+        # The exact string depends on the runner's TF install, but it
+        # must be one of the two canonical forms.
+        assert result.metrics.active_device in {"/CPU:0", "/GPU:0", "unknown"}
+
+    def test_tensorflow_version_populated(
+        self,
+        synthetic_video: Path,
+        fake_metrabs_model,
+    ) -> None:
+        estimator = Estimator(model=fake_metrabs_model)
+        result = estimator.process_video(synthetic_video)
+        # TF is in the dev deps so the version should always be a real
+        # string, not the "unknown" fallback.
+        assert result.metrics.tensorflow_version not in {"", "unknown"}
+
+
 class TestErrors:
     def test_missing_video(
         self,
diff --git a/tests/unit/test_io.py b/tests/unit/test_io.py
index 5c1c8cd..1c270ca 100644
--- a/tests/unit/test_io.py
+++ b/tests/unit/test_io.py
@@ -10,15 +10,28 @@ import pytest
 from pydantic import ValidationError
 
 from neuropose.io import (
+    BenchmarkAggregate,
+    BenchmarkResult,
+    CpuComparisonResult,
     FramePrediction,
     JobResults,
     JobStatus,
+    JointAngleExtractor,
+    JointAxisExtractor,
+    JointPairDistanceExtractor,
+    JointSpeedExtractor,
+    PerformanceMetrics,
+    Segment,
+    Segmentation,
+    SegmentationConfig,
     StatusFile,
     VideoMetadata,
     VideoPredictions,
+    load_benchmark_result,
     load_job_results,
     load_status,
     load_video_predictions,
+    save_benchmark_result,
     save_job_results,
     save_status,
     save_video_predictions,
@@ -188,6 +201,239 @@ class TestJobResults:
         assert len(loaded["video_a.mp4"]) == 2
 
 
+# ---------------------------------------------------------------------------
+# Performance / benchmark schemas
+# ---------------------------------------------------------------------------
+
+
+def _make_metrics(
+    *,
+    total_seconds: float = 1.0,
+    latencies: list[float] | None = None,
+    peak_rss_mb: float = 512.0,
+    active_device: str = "/CPU:0",
+    metal_active: bool = False,
+    model_load_seconds: float | None = None,
+) -> PerformanceMetrics:
+    return PerformanceMetrics(
+        model_load_seconds=model_load_seconds,
+        total_seconds=total_seconds,
+        per_frame_latencies_ms=latencies if latencies is not None else [10.0, 11.0, 9.5],
+        peak_rss_mb=peak_rss_mb,
+        active_device=active_device,
+        tensorflow_metal_active=metal_active,
+        tensorflow_version="2.21.0",
+    )
+
+
+def _make_aggregate() -> BenchmarkAggregate:
+    return BenchmarkAggregate(
+        repeats_measured=4,
+        warmup_frames_per_pass=3,
+        mean_frame_latency_ms=10.0,
+        p50_frame_latency_ms=9.8,
+        p95_frame_latency_ms=12.5,
+        p99_frame_latency_ms=13.0,
+        stddev_frame_latency_ms=0.7,
+        mean_throughput_fps=100.0,
+        peak_rss_mb_max=512.0,
+        active_device="/CPU:0",
+        tensorflow_metal_active=False,
+        tensorflow_version="2.21.0",
+    )
+
+
+class TestPerformanceMetricsModel:
+    def test_roundtrip(self) -> None:
+        m = _make_metrics()
+        rehydrated = PerformanceMetrics.model_validate(m.model_dump(mode="json"))
+        assert rehydrated == m
+
+    def test_rejects_negative_total_seconds(self) -> None:
+        with pytest.raises(ValidationError):
+            PerformanceMetrics(
+                total_seconds=-1.0,
+                peak_rss_mb=0.0,
+                active_device="/CPU:0",
+                tensorflow_version="2.21.0",
+            )
+
+    def test_rejects_negative_peak_rss(self) -> None:
+        with pytest.raises(ValidationError):
+            PerformanceMetrics(
+                total_seconds=1.0,
+                peak_rss_mb=-5.0,
+                active_device="/CPU:0",
+                tensorflow_version="2.21.0",
+            )
+
+    def test_model_load_seconds_optional(self) -> None:
+        m = _make_metrics(model_load_seconds=None)
+        assert m.model_load_seconds is None
+
+    def test_is_frozen(self) -> None:
+        m = _make_metrics()
+        with pytest.raises(ValidationError):
+            m.total_seconds = 2.0
+
+
+class TestBenchmarkResultPersistence:
+    def test_roundtrip_to_disk(self, tmp_path: Path) -> None:
+        result = BenchmarkResult(
+            video_name="trial.mp4",
+            repeats=5,
+            warmup_frames=3,
+            warmup_pass=_make_metrics(total_seconds=20.0),
+            measured_passes=[_make_metrics(total_seconds=1.5) for _ in range(4)],
+            aggregate=_make_aggregate(),
+        )
+        path = tmp_path / "bench.json"
+        save_benchmark_result(path, result)
+        assert path.exists()
+        loaded = load_benchmark_result(path)
+        assert loaded == result
+
+    def test_rejects_repeats_below_one(self) -> None:
+        with pytest.raises(ValidationError):
+            BenchmarkResult(
+                video_name="x.mp4",
+                repeats=0,
+                warmup_frames=0,
+                warmup_pass=_make_metrics(),
+                measured_passes=[],
+                aggregate=_make_aggregate(),
+            )
+
+    def test_cpu_comparison_nested(self, tmp_path: Path) -> None:
+        comparison = CpuComparisonResult(
+            primary_aggregate=_make_aggregate(),
+            cpu_aggregate=_make_aggregate(),
+            speedup=2.5,
+            max_poses3d_divergence_mm=0.002,
+            frame_count_compared=30,
+        )
+        result = BenchmarkResult(
+            video_name="trial.mp4",
+            repeats=5,
+            warmup_frames=3,
+            warmup_pass=_make_metrics(),
+            measured_passes=[_make_metrics() for _ in range(4)],
+            aggregate=_make_aggregate(),
+            cpu_comparison=comparison,
+        )
+        path = tmp_path / "bench_with_cmp.json"
+        save_benchmark_result(path, result)
+        loaded = load_benchmark_result(path)
+        assert loaded.cpu_comparison is not None
+        assert loaded.cpu_comparison.speedup == pytest.approx(2.5)
+        assert loaded.cpu_comparison.max_poses3d_divergence_mm == pytest.approx(0.002)
+
+
+# ---------------------------------------------------------------------------
+# Segmentation schema
+# ---------------------------------------------------------------------------
+
+
+class TestSegmentModel:
+    def test_valid(self) -> None:
+        seg = Segment(start=0, end=30, peak=15)
+        assert seg.start == 0
+        assert seg.peak == 15
+        assert seg.end == 30
+
+    def test_rejects_end_not_greater_than_start(self) -> None:
+        with pytest.raises(ValidationError, match="end"):
+            Segment(start=10, end=10, peak=10)
+
+    def test_peak_must_be_inside_window(self) -> None:
+        with pytest.raises(ValidationError, match="peak"):
+            Segment(start=0, end=30, peak=30)  # peak == end is out of range
+
+    def test_is_frozen(self) -> None:
+        seg = Segment(start=0, end=10, peak=5)
+        with pytest.raises(ValidationError):
+            seg.start = 1
+
+
+class TestExtractorSpecs:
+    def test_joint_pair_distance_rejects_identical_joints(self) -> None:
+        with pytest.raises(ValidationError, match="distinct"):
+            JointPairDistanceExtractor(joints=(7, 7))
+
+    def test_joint_pair_distance_rejects_negative(self) -> None:
+        with pytest.raises(ValidationError, match="non-negative"):
+            JointPairDistanceExtractor(joints=(-1, 5))
+
+    def test_joint_angle_rejects_negative(self) -> None:
+        with pytest.raises(ValidationError, match="non-negative"):
+            JointAngleExtractor(triplet=(0, -1, 2))
+
+    def test_joint_axis_rejects_bad_axis(self) -> None:
+        with pytest.raises(ValidationError):
+            JointAxisExtractor(joint=0, axis=3)
+
+    def test_discriminator_dispatches_to_correct_variant(self) -> None:
+        # Round-trip each extractor variant through a SegmentationConfig
+        # dict to confirm the discriminator selects the right class.
+        for payload, cls in [
+            ({"kind": "joint_axis", "joint": 1, "axis": 2}, JointAxisExtractor),
+            ({"kind": "joint_pair_distance", "joints": [1, 2]}, JointPairDistanceExtractor),
+            ({"kind": "joint_speed", "joint": 3}, JointSpeedExtractor),
+            ({"kind": "joint_angle", "triplet": [1, 2, 3]}, JointAngleExtractor),
+        ]:
+            cfg = SegmentationConfig.model_validate({"extractor": payload})
+            assert isinstance(cfg.extractor, cls)
+
+
+class TestSegmentationPersistence:
+    def test_roundtrip_through_video_predictions(
+        self,
+        tmp_path: Path,
+        video_predictions_payload: dict,
+    ) -> None:
+        cfg = SegmentationConfig(
+            extractor=JointAxisExtractor(joint=15, axis=1),
+            min_prominence=20.0,
+            pad_seconds=0.1,
+        )
+        segmentation = Segmentation(
+            config=cfg,
+            segments=[Segment(start=0, end=1, peak=0), Segment(start=1, end=2, peak=1)],
+        )
+        video_predictions_payload["segmentations"] = {
+            "cup_lift": segmentation.model_dump(mode="json")
+        }
+        vp = VideoPredictions.model_validate(video_predictions_payload)
+        path = tmp_path / "video.json"
+        save_video_predictions(path, vp)
+        loaded = load_video_predictions(path)
+        assert "cup_lift" in loaded.segmentations
+        cup = loaded.segmentations["cup_lift"]
+        assert cup.config.extractor.kind == "joint_axis"
+        assert len(cup.segments) == 2
+        assert cup.config.method == "valley_to_valley_v1"
+
+    def test_default_empty_segmentations_on_new_instance(
+        self, video_predictions_payload: dict
+    ) -> None:
+        vp = VideoPredictions.model_validate(video_predictions_payload)
+        assert vp.segmentations == {}
+
+    def test_legacy_file_without_segmentations_loads_clean(
+        self,
+        tmp_path: Path,
+        video_predictions_payload: dict,
+    ) -> None:
+        # Older predictions files never wrote the segmentations field;
+        # make sure they still validate and deserialize as if they had
+        # an empty mapping.
+        assert "segmentations" not in video_predictions_payload
+        path = tmp_path / "legacy.json"
+        path.write_text(json.dumps(video_predictions_payload))
+        vp = load_video_predictions(path)
+        assert vp.segmentations == {}
+
+
 # ---------------------------------------------------------------------------
 # Status file
 # ---------------------------------------------------------------------------
diff --git a/uv.lock b/uv.lock
index d4ed16f..5850499 100644
--- a/uv.lock
+++ b/uv.lock
@@ -673,6 +673,7 @@ dependencies = [
     { name = "matplotlib" },
     { name = "numpy" },
     { name = "opencv-python-headless" },
+    { name = "psutil" },
     { name = "pydantic" },
     { name = "pydantic-settings" },
     { name = "pyyaml" },
@@ -711,6 +712,7 @@ requires-dist = [
     { name = "matplotlib", specifier = ">=3.8" },
     { name = "numpy", specifier = ">=1.26" },
     { name = "opencv-python-headless", specifier = ">=4.9" },
+    { name = "psutil", specifier = ">=5.9" },
     { name = "pydantic", specifier = ">=2.6" },
     { name = "pydantic-settings", specifier = ">=2.2" },
     { name = "pyyaml", specifier = ">=6.0" },
@@ -949,6 +951,22 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/88/95/608f665226bca68b736b79e457fded9a2a38c4f4379a4a7614303d9db3bc/protobuf-7.34.1-py3-none-any.whl", hash = "sha256:bb3812cd53aefea2b028ef42bd780f5b96407247f20c6ef7c679807e9d188f11", size = 170715, upload-time = "2026-03-20T17:34:45.384Z" },
 ]
 
+[[package]]
+name = "psutil"
+version = "7.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740, upload-time = "2026-01-28T18:14:54.428Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090, upload-time = "2026-01-28T18:15:22.168Z" },
+    { url = "https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859, upload-time = "2026-01-28T18:15:23.795Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560, upload-time = "2026-01-28T18:15:25.976Z" },
+    { url = "https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997, upload-time = "2026-01-28T18:15:27.794Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972, upload-time = "2026-01-28T18:15:29.342Z" },
+    { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266, upload-time = "2026-01-28T18:15:31.597Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737, upload-time = "2026-01-28T18:15:33.849Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" },
+]
+
 [[package]]
 name = "pydantic"
 version = "2.13.0"