# neuropose/tests/unit/test_benchmark.py

"""Tests for :mod:`neuropose.benchmark`.

Coverage:

- :func:`run_benchmark` against a fake-model :class:`Estimator` —
  repeats, warmup-pass discarding, capture-reference, edge cases.
- Aggregate statistics (mean / p50 / p95 / p99 / throughput / peak RSS)
  on synthetic metrics.
- :func:`compute_poses3d_divergence` on matched and mismatched
  prediction pairs.
- :func:`format_benchmark_report` rendering.
"""

from __future__ import annotations

from pathlib import Path

import pytest

from neuropose.benchmark import (
    compute_poses3d_divergence,
    format_benchmark_report,
    run_benchmark,
)
from neuropose.estimator import Estimator
from neuropose.io import (
    BenchmarkResult,
    PerformanceMetrics,
    VideoPredictions,
)


class TestRunBenchmark:
    """End-to-end checks of :func:`run_benchmark` driven by the fake model.

    Each test wraps the ``fake_metrabs_model`` fixture in an
    :class:`Estimator` and benchmarks the ``synthetic_video`` fixture,
    which the assertions below treat as a 5-frame clip.
    """

    def test_returns_benchmark_result(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """The outcome exposes a ``BenchmarkResult`` named after the input file."""
        est = Estimator(model=fake_metrabs_model)
        result = run_benchmark(
            est,
            synthetic_video,
            repeats=3,
            warmup_frames=0,
        ).result
        assert isinstance(result, BenchmarkResult)
        assert result.video_name == synthetic_video.name

    def test_repeats_reflected_in_result(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """``repeats`` is echoed back and all but one pass is reported as measured."""
        requested_repeats = 4
        # One pass is always discarded as warmup; the rest are measured.
        expected_measured = requested_repeats - 1

        est = Estimator(model=fake_metrabs_model)
        result = run_benchmark(
            est,
            synthetic_video,
            repeats=requested_repeats,
            warmup_frames=0,
        ).result

        assert result.repeats == requested_repeats
        assert len(result.measured_passes) == expected_measured
        assert result.aggregate.repeats_measured == expected_measured

    def test_first_pass_is_the_warmup(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """A warmup pass is recorded and is not listed among the measured passes."""
        est = Estimator(model=fake_metrabs_model)
        result = run_benchmark(
            est,
            synthetic_video,
            repeats=3,
            warmup_frames=0,
        ).result
        warmup = result.warmup_pass
        assert warmup is not None
        # The warmup metrics must not also appear in the measured list.
        assert warmup not in result.measured_passes

    def test_capture_reference_off_by_default(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """Without ``capture_reference`` no reference predictions are returned."""
        est = Estimator(model=fake_metrabs_model)
        reference = run_benchmark(
            est,
            synthetic_video,
            repeats=3,
            warmup_frames=0,
        ).reference_predictions
        assert reference is None

    def test_capture_reference_returns_last_measured_pass(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """``capture_reference=True`` yields a ``VideoPredictions`` of length 5.

        Only the type and the frame count are checked here; which pass
        supplied the predictions is not verified by this test.
        """
        est = Estimator(model=fake_metrabs_model)
        reference = run_benchmark(
            est,
            synthetic_video,
            repeats=3,
            warmup_frames=0,
            capture_reference=True,
        ).reference_predictions
        assert isinstance(reference, VideoPredictions)
        assert len(reference) == 5

    def test_rejects_repeats_below_two(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """``repeats=1`` is rejected with a ``ValueError`` mentioning ``repeats``."""
        est = Estimator(model=fake_metrabs_model)
        with pytest.raises(ValueError, match="repeats"):
            run_benchmark(est, synthetic_video, repeats=1, warmup_frames=0)

    def test_rejects_negative_warmup_frames(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """A negative ``warmup_frames`` raises ``ValueError`` naming the argument."""
        est = Estimator(model=fake_metrabs_model)
        with pytest.raises(ValueError, match="warmup_frames"):
            run_benchmark(est, synthetic_video, repeats=3, warmup_frames=-1)

    def test_warmup_frames_filters_aggregate_head(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """Discarding all but one frame per pass still produces usable stats."""
        est = Estimator(model=fake_metrabs_model)
        # synthetic_video has 5 frames; warmup_frames=4 leaves only
        # the last frame of each measured pass to contribute.
        aggregate = run_benchmark(
            est,
            synthetic_video,
            repeats=3,
            warmup_frames=4,
        ).result.aggregate
        # The stats should still compute, not error.
        assert aggregate.mean_frame_latency_ms >= 0.0

    def test_warmup_frames_exceeding_pass_length_gives_zero_latencies(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """With every frame discarded the latency aggregates are exactly zero."""
        est = Estimator(model=fake_metrabs_model)
        # 5-frame video, warmup_frames=10 → every frame discarded.
        aggregate = run_benchmark(
            est,
            synthetic_video,
            repeats=3,
            warmup_frames=10,
        ).result.aggregate
        # Latency aggregate drops to zero; throughput is still
        # computed from total wall clock.
        assert aggregate.mean_frame_latency_ms == 0.0
        assert aggregate.p95_frame_latency_ms == 0.0


class TestBenchmarkAggregate:
    """Sanity checks on the aggregate statistics of a benchmark result."""

    def test_percentiles_ordered_correctly(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """Frame-latency percentiles are monotone: p50 <= p95 <= p99."""
        est = Estimator(model=fake_metrabs_model)
        stats = run_benchmark(
            est,
            synthetic_video,
            repeats=4,
            warmup_frames=0,
        ).result.aggregate
        assert stats.p50_frame_latency_ms <= stats.p95_frame_latency_ms
        assert stats.p95_frame_latency_ms <= stats.p99_frame_latency_ms

    def test_throughput_non_negative(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """Mean throughput is strictly positive for a normal run."""
        est = Estimator(model=fake_metrabs_model)
        stats = run_benchmark(
            est,
            synthetic_video,
            repeats=3,
            warmup_frames=0,
        ).result.aggregate
        assert stats.mean_throughput_fps > 0.0

    def test_peak_rss_matches_max_across_passes(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """The aggregate peak RSS equals the largest per-pass peak RSS."""
        est = Estimator(model=fake_metrabs_model)
        result = run_benchmark(
            est,
            synthetic_video,
            repeats=3,
            warmup_frames=0,
        ).result
        largest_pass_peak = max(measured.peak_rss_mb for measured in result.measured_passes)
        assert result.aggregate.peak_rss_mb_max == largest_pass_peak


class TestComputePoses3dDivergence:
    """Checks for :func:`compute_poses3d_divergence` on hand-built predictions."""

    def _make_predictions(self, poses3d_values: list[float]) -> VideoPredictions:
        """Build a VideoPredictions with one frame per entry of ``poses3d_values``.

        Every frame holds one detection with a single joint at
        ``(0.0, value, 0.0)``, so the supplied value is that joint's
        y-coordinate. Frames are keyed ``frame_000000``, ``frame_000001``, ...
        """
        frames: dict[str, dict] = {
            f"frame_{index:06d}": {
                "boxes": [[0.0, 0.0, 1.0, 1.0, 0.9]],
                "poses3d": [[[0.0, float(y_value), 0.0]]],
                "poses2d": [[[0.0, 0.0]]],
            }
            for index, y_value in enumerate(poses3d_values)
        }
        payload = {
            "metadata": {
                "frame_count": len(poses3d_values),
                "fps": 30.0,
                "width": 64,
                "height": 64,
            },
            "frames": frames,
        }
        return VideoPredictions.model_validate(payload)

    def test_identical_predictions_zero_divergence(self) -> None:
        """Two identical 3-frame predictions diverge by exactly zero."""
        left = self._make_predictions([1.0, 2.0, 3.0])
        right = self._make_predictions([1.0, 2.0, 3.0])
        worst, frames_compared = compute_poses3d_divergence(left, right)
        assert worst == 0.0
        assert frames_compared == 3

    def test_divergence_reports_max_abs_diff(self) -> None:
        """The reported divergence is the largest absolute per-value difference."""
        left = self._make_predictions([1.0, 2.0, 3.0])
        # Per-frame differences against ``left``: 0, 0.1, 0.5.
        right = self._make_predictions([1.0, 2.1, 2.5])
        worst, frames_compared = compute_poses3d_divergence(left, right)
        assert worst == pytest.approx(0.5)
        assert frames_compared == 3

    def test_mismatched_frame_count_raises(self) -> None:
        """Inputs with different frame counts raise ``ValueError``."""
        shorter = self._make_predictions([1.0, 2.0])
        longer = self._make_predictions([1.0, 2.0, 3.0])
        with pytest.raises(ValueError, match="frame count"):
            compute_poses3d_divergence(shorter, longer)

    def test_mismatched_detection_count_skipped_not_raised(self) -> None:
        """A frame whose detection counts differ is skipped instead of raising."""
        left = self._make_predictions([1.0, 2.0])

        matching_frame = {
            "boxes": [[0, 0, 1, 1, 0.9]],
            "poses3d": [[[0.0, 1.0, 0.0]]],
            "poses2d": [[[0.0, 0.0]]],
        }
        # Frame 1 has zero detections instead of one.
        empty_frame = {
            "boxes": [],
            "poses3d": [],
            "poses2d": [],
        }
        right = VideoPredictions.model_validate(
            {
                "metadata": {
                    "frame_count": 2,
                    "fps": 30.0,
                    "width": 64,
                    "height": 64,
                },
                "frames": {
                    "frame_000000": matching_frame,
                    "frame_000001": empty_frame,
                },
            }
        )

        worst, frames_compared = compute_poses3d_divergence(left, right)
        # First frame contributes 0 divergence; second frame is skipped.
        assert worst == 0.0
        assert frames_compared == 1


class TestFormatBenchmarkReport:
    """Rendering checks for :func:`format_benchmark_report`."""

    def _fake_metrics(self, latencies_ms: list[float]) -> PerformanceMetrics:
        """Return a CPU-only ``PerformanceMetrics`` built from ``latencies_ms``.

        ``total_seconds`` is the latency sum converted to seconds plus a
        fixed 0.01 s; all other fields are constants.
        """
        # NOTE(review): no test visible in this class calls this helper —
        # confirm whether it is still needed before removing it.
        return PerformanceMetrics(
            model_load_seconds=12.3,
            total_seconds=sum(latencies_ms) / 1000.0 + 0.01,
            per_frame_latencies_ms=latencies_ms,
            peak_rss_mb=512.0,
            active_device="/CPU:0",
            tensorflow_metal_active=False,
            tensorflow_version="2.21.0",
        )

    def test_report_mentions_key_fields(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """The rendered report contains every headline label."""
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
        text = format_benchmark_report(outcome.result)
        expected_labels = (
            "Benchmark:",
            "device:",
            "mean:",
            "p50:",
            "p95:",
            "p99:",
            "Throughput:",
            "Peak RSS:",
        )
        for label in expected_labels:
            assert label in text

    def test_report_mentions_metal_when_active(self) -> None:
        """A result flagged as tensorflow-metal active says so in the report."""
        # BenchmarkResult is already imported at module level; only the
        # aggregate model has to be pulled in here.
        from neuropose.io import BenchmarkAggregate

        metrics = PerformanceMetrics(
            model_load_seconds=None,
            total_seconds=1.0,
            per_frame_latencies_ms=[10.0, 11.0],
            peak_rss_mb=100.0,
            active_device="/GPU:0",
            tensorflow_metal_active=True,
            tensorflow_version="2.21.0",
        )
        agg = BenchmarkAggregate(
            repeats_measured=1,
            warmup_frames_per_pass=0,
            mean_frame_latency_ms=10.5,
            p50_frame_latency_ms=10.5,
            p95_frame_latency_ms=11.0,
            p99_frame_latency_ms=11.0,
            stddev_frame_latency_ms=0.5,
            mean_throughput_fps=95.0,
            peak_rss_mb_max=100.0,
            active_device="/GPU:0",
            tensorflow_metal_active=True,
            tensorflow_version="2.21.0",
        )
        # The same metrics object stands in for both the warmup pass and
        # the single measured pass.
        result = BenchmarkResult(
            video_name="test.mp4",
            repeats=2,
            warmup_frames=0,
            warmup_pass=metrics,
            measured_passes=[metrics],
            aggregate=agg,
        )
        text = format_benchmark_report(result)
        assert "tensorflow-metal" in text

    def test_report_model_load_injected_label(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """With an injected model the report carries the ``injected`` label."""
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
        text = format_benchmark_report(outcome.result)
        assert "injected" in text  # fake model was not loaded via load_model