# neuropose/tests/unit/test_benchmark.py
#
# 315 lines
# 11 KiB
# Python

"""Tests for :mod:`neuropose.benchmark`.
Coverage:
- :func:`run_benchmark` against a fake-model :class:`Estimator` —
repeats, warmup-pass discarding, capture-reference, edge cases.
- Aggregate statistics (mean / p50 / p95 / p99 / throughput / peak RSS)
on synthetic metrics.
- :func:`compute_poses3d_divergence` on matched and mismatched
prediction pairs.
- :func:`format_benchmark_report` rendering.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from neuropose.benchmark import (
compute_poses3d_divergence,
format_benchmark_report,
run_benchmark,
)
from neuropose.estimator import Estimator
from neuropose.io import (
BenchmarkResult,
PerformanceMetrics,
VideoPredictions,
)
class TestRunBenchmark:
    """End-to-end checks of :func:`run_benchmark` driven by the fake model.

    Every test wraps the ``fake_metrabs_model`` fixture in an
    :class:`Estimator` and benchmarks the ``synthetic_video`` fixture, which
    the comments in this class treat as a 5-frame clip.
    """

    def test_returns_benchmark_result(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """The outcome carries a ``BenchmarkResult`` named after the video file."""
        est = Estimator(model=fake_metrabs_model)
        bench = run_benchmark(est, synthetic_video, repeats=3, warmup_frames=0)

        assert isinstance(bench.result, BenchmarkResult)
        assert bench.result.video_name == synthetic_video.name

    def test_repeats_reflected_in_result(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """``repeats`` is echoed back and one of the passes is not measured."""
        est = Estimator(model=fake_metrabs_model)
        result = run_benchmark(
            est,
            synthetic_video,
            repeats=4,
            warmup_frames=0,
        ).result

        assert result.repeats == 4
        # One pass is always discarded as warmup; the rest are measured.
        assert len(result.measured_passes) == 3
        assert result.aggregate.repeats_measured == 3

    def test_first_pass_is_the_warmup(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """A warmup pass is recorded and kept out of the measured passes."""
        est = Estimator(model=fake_metrabs_model)
        result = run_benchmark(
            est,
            synthetic_video,
            repeats=3,
            warmup_frames=0,
        ).result

        assert result.warmup_pass is not None
        # warmup_pass is a separate object from the measured passes; note
        # that ``not in`` matches by identity or equality, so this also
        # requires that no measured pass compares equal to the warmup pass.
        assert result.warmup_pass not in result.measured_passes

    def test_capture_reference_off_by_default(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """Without ``capture_reference`` no reference predictions are returned."""
        est = Estimator(model=fake_metrabs_model)
        bench = run_benchmark(est, synthetic_video, repeats=3, warmup_frames=0)

        assert bench.reference_predictions is None

    def test_capture_reference_returns_last_measured_pass(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """``capture_reference=True`` yields a 5-entry ``VideoPredictions``."""
        est = Estimator(model=fake_metrabs_model)
        bench = run_benchmark(
            est,
            synthetic_video,
            repeats=3,
            warmup_frames=0,
            capture_reference=True,
        )
        reference = bench.reference_predictions

        assert isinstance(reference, VideoPredictions)
        # 5 is the frame count the other tests here assume for synthetic_video.
        assert len(reference) == 5

    def test_rejects_repeats_below_two(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """``repeats=1`` is rejected with a ``ValueError`` mentioning ``repeats``."""
        est = Estimator(model=fake_metrabs_model)

        with pytest.raises(ValueError, match="repeats"):
            run_benchmark(est, synthetic_video, repeats=1, warmup_frames=0)

    def test_rejects_negative_warmup_frames(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """A negative ``warmup_frames`` is rejected with a ``ValueError``."""
        est = Estimator(model=fake_metrabs_model)

        with pytest.raises(ValueError, match="warmup_frames"):
            run_benchmark(est, synthetic_video, repeats=3, warmup_frames=-1)

    def test_warmup_frames_filters_aggregate_head(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """Aggregation still succeeds when most frames of each pass are dropped."""
        est = Estimator(model=fake_metrabs_model)

        # synthetic_video has 5 frames; warmup_frames=4 leaves only
        # the last frame of each measured pass to contribute.
        bench = run_benchmark(est, synthetic_video, repeats=3, warmup_frames=4)

        # The stats should still compute, not error.
        aggregate = bench.result.aggregate
        assert aggregate.mean_frame_latency_ms >= 0.0

    def test_warmup_frames_exceeding_pass_length_gives_zero_latencies(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """Discarding every frame collapses the latency aggregates to zero."""
        est = Estimator(model=fake_metrabs_model)

        # 5-frame video, warmup_frames=10 → every frame discarded.
        bench = run_benchmark(est, synthetic_video, repeats=3, warmup_frames=10)

        # Latency aggregate drops to zero; throughput is still
        # computed from total wall clock.
        aggregate = bench.result.aggregate
        assert aggregate.mean_frame_latency_ms == 0.0
        assert aggregate.p95_frame_latency_ms == 0.0
class TestBenchmarkAggregate:
    """Sanity checks on the aggregate statistics produced by :func:`run_benchmark`."""

    def test_percentiles_ordered_correctly(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """Latency percentiles are monotonic: p50 <= p95 <= p99."""
        est = Estimator(model=fake_metrabs_model)
        bench = run_benchmark(est, synthetic_video, repeats=4, warmup_frames=0)
        stats = bench.result.aggregate

        assert stats.p50_frame_latency_ms <= stats.p95_frame_latency_ms
        assert stats.p95_frame_latency_ms <= stats.p99_frame_latency_ms

    def test_throughput_non_negative(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """Mean throughput is strictly positive (stricter than the name suggests)."""
        est = Estimator(model=fake_metrabs_model)
        bench = run_benchmark(est, synthetic_video, repeats=3, warmup_frames=0)
        throughput = bench.result.aggregate.mean_throughput_fps

        assert throughput > 0.0

    def test_peak_rss_matches_max_across_passes(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """The aggregate peak RSS equals the largest per-pass peak RSS."""
        est = Estimator(model=fake_metrabs_model)
        result = run_benchmark(
            est,
            synthetic_video,
            repeats=3,
            warmup_frames=0,
        ).result

        # Only the measured passes are consulted, not the warmup pass.
        expected_peak = max(p.peak_rss_mb for p in result.measured_passes)
        assert result.aggregate.peak_rss_mb_max == expected_peak
class TestComputePoses3dDivergence:
    """Checks for :func:`compute_poses3d_divergence` on hand-built predictions."""

    @staticmethod
    def _metadata(frame_count: int) -> dict:
        """Return the fixed video-metadata payload shared by every test here."""
        return {
            "frame_count": frame_count,
            "fps": 30.0,
            "width": 64,
            "height": 64,
        }

    def _make_predictions(self, poses3d_values: list[float]) -> VideoPredictions:
        """Build a ``VideoPredictions`` with one frame per entry of *poses3d_values*.

        Each frame holds a single detection with a single joint; the joint's
        middle (y) coordinate is the corresponding value and the other two
        coordinates are zero. The frame count therefore follows the length
        of *poses3d_values* rather than being fixed.
        """
        frames: dict[str, dict] = {
            f"frame_{index:06d}": {
                "boxes": [[0.0, 0.0, 1.0, 1.0, 0.9]],
                "poses3d": [[[0.0, float(value), 0.0]]],
                "poses2d": [[[0.0, 0.0]]],
            }
            for index, value in enumerate(poses3d_values)
        }
        return VideoPredictions.model_validate(
            {
                "metadata": self._metadata(len(poses3d_values)),
                "frames": frames,
            }
        )

    def test_identical_predictions_zero_divergence(self) -> None:
        """Two equal prediction sets diverge by exactly zero over all frames."""
        a = self._make_predictions([1.0, 2.0, 3.0])
        b = self._make_predictions([1.0, 2.0, 3.0])

        max_diff, compared = compute_poses3d_divergence(a, b)

        assert max_diff == 0.0
        assert compared == 3

    def test_divergence_reports_max_abs_diff(self) -> None:
        """The reported divergence is the largest absolute per-value difference."""
        a = self._make_predictions([1.0, 2.0, 3.0])
        b = self._make_predictions([1.0, 2.1, 2.5])  # diffs 0, 0.1, 0.5

        max_diff, compared = compute_poses3d_divergence(a, b)

        assert max_diff == pytest.approx(0.5)
        assert compared == 3

    def test_mismatched_frame_count_raises(self) -> None:
        """Differing frame counts raise ``ValueError`` mentioning "frame count"."""
        a = self._make_predictions([1.0, 2.0])
        b = self._make_predictions([1.0, 2.0, 3.0])

        with pytest.raises(ValueError, match="frame count"):
            compute_poses3d_divergence(a, b)

    def test_mismatched_detection_count_skipped_not_raised(self) -> None:
        """A frame whose detection counts differ is skipped instead of raising."""
        a = self._make_predictions([1.0, 2.0])
        # Frame 1 has zero detections instead of one.
        b_payload = {
            "metadata": self._metadata(2),
            "frames": {
                "frame_000000": {
                    "boxes": [[0, 0, 1, 1, 0.9]],
                    "poses3d": [[[0.0, 1.0, 0.0]]],
                    "poses2d": [[[0.0, 0.0]]],
                },
                "frame_000001": {
                    "boxes": [],
                    "poses3d": [],
                    "poses2d": [],
                },
            },
        }
        b = VideoPredictions.model_validate(b_payload)

        max_diff, compared = compute_poses3d_divergence(a, b)

        # First frame contributes 0 divergence; second frame is skipped.
        assert max_diff == 0.0
        assert compared == 1
class TestFormatBenchmarkReport:
    """Rendering checks for :func:`format_benchmark_report`."""

    def _fake_metrics(self, latencies_ms: list[float]) -> PerformanceMetrics:
        """Return CPU-only ``PerformanceMetrics`` built around *latencies_ms*.

        ``total_seconds`` is the latency sum converted to seconds plus a
        fixed 0.01 s, so it always exceeds the summed per-frame latencies.
        """
        return PerformanceMetrics(
            model_load_seconds=12.3,
            total_seconds=sum(latencies_ms) / 1000.0 + 0.01,
            per_frame_latencies_ms=latencies_ms,
            peak_rss_mb=512.0,
            active_device="/CPU:0",
            tensorflow_metal_active=False,
            tensorflow_version="2.21.0",
        )

    def test_report_mentions_key_fields(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """Every headline label appears in the report of a real benchmark run."""
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
        text = format_benchmark_report(outcome.result)

        expected_labels = (
            "Benchmark:",
            "device:",
            "mean:",
            "p50:",
            "p95:",
            "p99:",
            "Throughput:",
            "Peak RSS:",
        )
        for label in expected_labels:
            # Name the missing label so a failure is readable at a glance.
            assert label in text, f"report is missing {label!r}"

    def test_report_mentions_metal_when_active(self) -> None:
        """A result flagged ``tensorflow_metal_active`` mentions tensorflow-metal."""
        # BenchmarkResult is already imported at module level; only the
        # aggregate model needs to be pulled in here.
        from neuropose.io import BenchmarkAggregate

        metrics = PerformanceMetrics(
            model_load_seconds=None,
            total_seconds=1.0,
            per_frame_latencies_ms=[10.0, 11.0],
            peak_rss_mb=100.0,
            active_device="/GPU:0",
            tensorflow_metal_active=True,
            tensorflow_version="2.21.0",
        )
        agg = BenchmarkAggregate(
            repeats_measured=1,
            warmup_frames_per_pass=0,
            mean_frame_latency_ms=10.5,
            p50_frame_latency_ms=10.5,
            p95_frame_latency_ms=11.0,
            p99_frame_latency_ms=11.0,
            stddev_frame_latency_ms=0.5,
            mean_throughput_fps=95.0,
            peak_rss_mb_max=100.0,
            active_device="/GPU:0",
            tensorflow_metal_active=True,
            tensorflow_version="2.21.0",
        )
        # The same metrics object serves as both the warmup pass and the
        # single measured pass; only the rendered text is under test.
        result = BenchmarkResult(
            video_name="test.mp4",
            repeats=2,
            warmup_frames=0,
            warmup_pass=metrics,
            measured_passes=[metrics],
            aggregate=agg,
        )

        text = format_benchmark_report(result)

        assert "tensorflow-metal" in text

    def test_report_model_load_injected_label(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        """A model passed straight to ``Estimator`` is reported as "injected"."""
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
        text = format_benchmark_report(outcome.result)

        assert "injected" in text  # fake model was not loaded via load_model