"""Tests for :mod:`neuropose.benchmark`.
|
|
|
|
Coverage:
|
|
|
|
- :func:`run_benchmark` against a fake-model :class:`Estimator` —
|
|
repeats, warmup-pass discarding, capture-reference, edge cases.
|
|
- Aggregate statistics (mean / p50 / p95 / p99 / throughput / peak RSS)
|
|
on synthetic metrics.
|
|
- :func:`compute_poses3d_divergence` on matched and mismatched
|
|
prediction pairs.
|
|
- :func:`format_benchmark_report` rendering.
|
|
"""

from __future__ import annotations

from pathlib import Path

import pytest

from neuropose.benchmark import (
    compute_poses3d_divergence,
    format_benchmark_report,
    run_benchmark,
)
from neuropose.estimator import Estimator
from neuropose.io import (
    BenchmarkResult,
    PerformanceMetrics,
    VideoPredictions,
)


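# The tests below all drive the same entry point. The calls assume a
# signature of roughly:
#
#     outcome = run_benchmark(estimator, video, repeats=3, warmup_frames=0,
#                             capture_reference=False)
#
# where ``outcome.result`` is a :class:`BenchmarkResult` and
# ``outcome.reference_predictions`` is a :class:`VideoPredictions`
# (``None`` unless ``capture_reference=True``).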
class TestRunBenchmark:
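    """Pass accounting, warmup handling, reference capture, and argument
    validation in :func:`run_benchmark`.
    """
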
    def test_returns_benchmark_result(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
        assert isinstance(outcome.result, BenchmarkResult)
        assert outcome.result.video_name == synthetic_video.name

    def test_repeats_reflected_in_result(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=4, warmup_frames=0)
        assert outcome.result.repeats == 4
        # One pass is always discarded as warmup; the rest are measured.
        assert len(outcome.result.measured_passes) == 3
        assert outcome.result.aggregate.repeats_measured == 3

    def test_first_pass_is_the_warmup(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
        assert outcome.result.warmup_pass is not None
        # warmup_pass is a separate object from the measured passes.
        assert outcome.result.warmup_pass not in outcome.result.measured_passes

    def test_capture_reference_off_by_default(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
        assert outcome.reference_predictions is None

    def test_capture_reference_returns_last_measured_pass(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(
            estimator,
            synthetic_video,
            repeats=3,
            warmup_frames=0,
            capture_reference=True,
        )
        assert isinstance(outcome.reference_predictions, VideoPredictions)
        # The synthetic_video fixture is 5 frames long, so the captured
        # reference pass holds 5 frames of predictions.
        assert len(outcome.reference_predictions) == 5

    def test_rejects_repeats_below_two(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        with pytest.raises(ValueError, match="repeats"):
            run_benchmark(estimator, synthetic_video, repeats=1, warmup_frames=0)

    def test_rejects_negative_warmup_frames(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        with pytest.raises(ValueError, match="warmup_frames"):
            run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=-1)

    def test_warmup_frames_filters_aggregate_head(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        # synthetic_video has 5 frames; warmup_frames=4 leaves only
        # the last frame of each measured pass to contribute.
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=4)
        # The stats should still compute, not error.
        assert outcome.result.aggregate.mean_frame_latency_ms >= 0.0

    def test_warmup_frames_exceeding_pass_length_gives_zero_latencies(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        # 5-frame video, warmup_frames=10 → every frame discarded.
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=10)
        # Latency aggregate drops to zero; throughput is still
        # computed from total wall clock.
        assert outcome.result.aggregate.mean_frame_latency_ms == 0.0
        assert outcome.result.aggregate.p95_frame_latency_ms == 0.0


class TestBenchmarkAggregate:
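    """Aggregate statistics computed across the measured passes."""
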
    def test_percentiles_ordered_correctly(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=4, warmup_frames=0)
        agg = outcome.result.aggregate
        assert agg.p50_frame_latency_ms <= agg.p95_frame_latency_ms
        assert agg.p95_frame_latency_ms <= agg.p99_frame_latency_ms

    def test_throughput_positive(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
        # Throughput is derived from wall-clock time, so it must be strictly positive.
        assert outcome.result.aggregate.mean_throughput_fps > 0.0

    def test_peak_rss_matches_max_across_passes(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
        individual = [p.peak_rss_mb for p in outcome.result.measured_passes]
        assert outcome.result.aggregate.peak_rss_mb_max == max(individual)


class TestComputePoses3dDivergence:
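    """Contract of :func:`compute_poses3d_divergence`: zero divergence for
    identical inputs, max absolute joint difference otherwise; mismatched
    frame counts raise, mismatched detection counts are skipped.
    """
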
    def _make_predictions(self, poses3d_values: list[float]) -> VideoPredictions:
        """Build a VideoPredictions with one frame per value; each frame's
        single joint has the given y-value."""
        frames: dict[str, dict] = {}
        for i, value in enumerate(poses3d_values):
            frames[f"frame_{i:06d}"] = {
                "boxes": [[0.0, 0.0, 1.0, 1.0, 0.9]],
                "poses3d": [[[0.0, float(value), 0.0]]],
                "poses2d": [[[0.0, 0.0]]],
            }
        return VideoPredictions.model_validate(
            {
                "metadata": {
                    "frame_count": len(poses3d_values),
                    "fps": 30.0,
                    "width": 64,
                    "height": 64,
                },
                "frames": frames,
            }
        )

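    # Example: _make_predictions([1.0, 2.5]) → two frames, each with one
    # detection whose only joint sits at (0.0, y, 0.0), y = 1.0 then 2.5.
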
    def test_identical_predictions_zero_divergence(self) -> None:
        a = self._make_predictions([1.0, 2.0, 3.0])
        b = self._make_predictions([1.0, 2.0, 3.0])
        max_diff, compared = compute_poses3d_divergence(a, b)
        assert max_diff == 0.0
        assert compared == 3

    def test_divergence_reports_max_abs_diff(self) -> None:
        a = self._make_predictions([1.0, 2.0, 3.0])
        b = self._make_predictions([1.0, 2.1, 2.5])  # diffs 0.0, 0.1, 0.5
        max_diff, compared = compute_poses3d_divergence(a, b)
        assert max_diff == pytest.approx(0.5)
        assert compared == 3

    def test_mismatched_frame_count_raises(self) -> None:
        a = self._make_predictions([1.0, 2.0])
        b = self._make_predictions([1.0, 2.0, 3.0])
        with pytest.raises(ValueError, match="frame count"):
            compute_poses3d_divergence(a, b)

    def test_mismatched_detection_count_skipped_not_raised(self) -> None:
        a = self._make_predictions([1.0, 2.0])
        # Frame 1 has zero detections instead of one.
        b_payload = {
            "metadata": {
                "frame_count": 2,
                "fps": 30.0,
                "width": 64,
                "height": 64,
            },
            "frames": {
                "frame_000000": {
                    "boxes": [[0.0, 0.0, 1.0, 1.0, 0.9]],
                    "poses3d": [[[0.0, 1.0, 0.0]]],
                    "poses2d": [[[0.0, 0.0]]],
                },
                "frame_000001": {
                    "boxes": [],
                    "poses3d": [],
                    "poses2d": [],
                },
            },
        }
        b = VideoPredictions.model_validate(b_payload)
        max_diff, compared = compute_poses3d_divergence(a, b)
        # First frame contributes 0 divergence; second frame is skipped.
        assert max_diff == 0.0
        assert compared == 1


class TestFormatBenchmarkReport:
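    """Rendering of :func:`format_benchmark_report`: key fields, the
    tensorflow-metal note, and the "injected" model-load label.
    """
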
    def _fake_metrics(self, latencies_ms: list[float]) -> PerformanceMetrics:
        return PerformanceMetrics(
            model_load_seconds=12.3,
            total_seconds=sum(latencies_ms) / 1000.0 + 0.01,
            per_frame_latencies_ms=latencies_ms,
            peak_rss_mb=512.0,
            active_device="/CPU:0",
            tensorflow_metal_active=False,
            tensorflow_version="2.21.0",
        )

    def test_report_mentions_key_fields(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
        text = format_benchmark_report(outcome.result)
        assert "Benchmark:" in text
        assert "device:" in text
        assert "mean:" in text
        assert "p50:" in text
        assert "p95:" in text
        assert "p99:" in text
        assert "Throughput:" in text
        assert "Peak RSS:" in text

    def test_report_mentions_metal_when_active(self) -> None:
        # BenchmarkResult is already imported at module level; only
        # BenchmarkAggregate needs a local import here.
        from neuropose.io import BenchmarkAggregate

        metrics = PerformanceMetrics(
            model_load_seconds=None,
            total_seconds=1.0,
            per_frame_latencies_ms=[10.0, 11.0],
            peak_rss_mb=100.0,
            active_device="/GPU:0",
            tensorflow_metal_active=True,
            tensorflow_version="2.21.0",
        )
        agg = BenchmarkAggregate(
            repeats_measured=1,
            warmup_frames_per_pass=0,
            mean_frame_latency_ms=10.5,
            p50_frame_latency_ms=10.5,
            p95_frame_latency_ms=11.0,
            p99_frame_latency_ms=11.0,
            stddev_frame_latency_ms=0.5,
            mean_throughput_fps=95.0,
            peak_rss_mb_max=100.0,
            active_device="/GPU:0",
            tensorflow_metal_active=True,
            tensorflow_version="2.21.0",
        )
        result = BenchmarkResult(
            video_name="test.mp4",
            repeats=2,
            warmup_frames=0,
            warmup_pass=metrics,
            measured_passes=[metrics],
            aggregate=agg,
        )
        text = format_benchmark_report(result)
        assert "tensorflow-metal" in text

    def test_report_model_load_injected_label(
        self,
        synthetic_video: Path,
        fake_metrabs_model,
    ) -> None:
        estimator = Estimator(model=fake_metrabs_model)
        outcome = run_benchmark(estimator, synthetic_video, repeats=3, warmup_frames=0)
        text = format_benchmark_report(outcome.result)
        assert "injected" in text  # fake model was not loaded via load_model