Files
Timmy-time-dashboard/tests/infrastructure/world/test_benchmark.py
Alexander Whitestone 49990e6aec
Some checks failed
Tests / lint (pull_request) Successful in 16s
Tests / test (pull_request) Failing after 13m58s
feat: add agent performance regression benchmark suite
Implement standardised Morrowind benchmark scenarios to detect agent
performance regressions after code changes.

- 5 built-in scenarios: navigation (Seyda Neen→Balmora, Balmora
  intra-city), quest (Fargoth's Ring), combat (Mudcrab), observation
- BenchmarkRunner executes scenarios through the heartbeat loop with
  MockWorldAdapter, tracking cycles, wall time, LLM calls, metabolic cost
- Goal predicates (reached_location, interacted_with) for early success
- BenchmarkMetrics with JSONL persistence and compare_runs() for
  regression detection
- CLI script (scripts/run_benchmarks.py) with tag filtering and
  baseline comparison
- tox -e benchmark environment for CI integration
- 31 unit tests covering scenarios, predicates, metrics, runner, and
  persistence

Fixes #1015

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 19:54:26 -04:00

395 lines
13 KiB
Python

"""Tests for the agent performance regression benchmark suite.
Covers: scenario loading, metrics collection, runner execution,
goal predicates, and result persistence.
"""
from __future__ import annotations
import pytest
from infrastructure.world.benchmark.metrics import (
BenchmarkMetrics,
ScenarioResult,
compare_runs,
load_history,
)
from infrastructure.world.benchmark.runner import BenchmarkRunner
from infrastructure.world.benchmark.scenarios import (
BUILTIN_SCENARIOS,
BenchmarkScenario,
load_scenarios,
)
# ---------------------------------------------------------------------------
# Scenario definitions
# ---------------------------------------------------------------------------
class TestBenchmarkScenario:
    """Checks on the built-in scenario catalogue and on tag-based loading."""

    def test_builtin_scenarios_exist(self) -> None:
        """The catalogue holds at least five scenarios."""
        scenario_count = len(BUILTIN_SCENARIOS)
        assert scenario_count >= 5

    def test_scenario_fields(self) -> None:
        """The first built-in scenario has its core fields populated."""
        first = BUILTIN_SCENARIOS[0]
        assert first.name
        assert first.description
        assert first.start_location
        assert first.max_cycles > 0

    def test_load_all_scenarios(self) -> None:
        """Loading without a filter yields as many scenarios as the catalogue."""
        loaded = load_scenarios()
        assert len(loaded) == len(BUILTIN_SCENARIOS)

    def test_load_scenarios_by_tag(self) -> None:
        """Filtering by "navigation" returns two or more scenarios, all so tagged."""
        navigation = load_scenarios(tags=["navigation"])
        assert len(navigation) >= 2
        for scenario in navigation:
            assert "navigation" in scenario.tags

    def test_load_scenarios_no_match(self) -> None:
        """An unknown tag produces an empty list."""
        unmatched = load_scenarios(tags=["nonexistent_tag"])
        assert unmatched == []

    def test_scenario_is_frozen(self) -> None:
        """Assigning to a scenario attribute raises AttributeError."""
        first = BUILTIN_SCENARIOS[0]
        with pytest.raises(AttributeError):
            first.name = "modified"
# ---------------------------------------------------------------------------
# Goal predicates
# ---------------------------------------------------------------------------
class TestGoalPredicates:
    """Exercises the goal predicates attached to the built-in scenarios.

    Each predicate is invoked as ``predicate(actions, location)`` and is
    expected to return a real ``bool``.
    """

    def test_reached_location_predicate(self) -> None:
        """The location predicate accepts the goal town and rejects the start."""
        walk = BUILTIN_SCENARIOS[0]  # Walk to Balmora
        reached = walk.goal_predicate
        assert reached is not None
        assert reached([], "Balmora") is True
        assert reached([], "Seyda Neen") is False

    def test_reached_location_case_insensitive(self) -> None:
        """Letter case of the location string does not matter."""
        walk = BUILTIN_SCENARIOS[0]
        reached = walk.goal_predicate
        assert reached([], "balmora") is True
        assert reached([], "BALMORA") is True

    def test_interacted_with_predicate(self) -> None:
        """A "speak" action aimed at Fargoth satisfies the predicate."""
        quest = BUILTIN_SCENARIOS[1]  # Fargoth quest
        interacted = quest.goal_predicate
        assert interacted is not None
        spoke_to_fargoth = [{"action": "speak", "target": "Fargoth"}]
        assert interacted(spoke_to_fargoth, "Seyda Neen") is True

    def test_interacted_with_no_match(self) -> None:
        """Speaking to a different target does not satisfy the predicate."""
        quest = BUILTIN_SCENARIOS[1]
        spoke_to_guard = [{"action": "speak", "target": "Guard"}]
        assert quest.goal_predicate(spoke_to_guard, "Seyda Neen") is False

    def test_interacted_with_interact_action(self) -> None:
        """An "interact" action aimed at Fargoth also satisfies the predicate."""
        quest = BUILTIN_SCENARIOS[1]
        touched_fargoth = [{"action": "interact", "target": "Fargoth"}]
        assert quest.goal_predicate(touched_fargoth, "Seyda Neen") is True

    def test_no_predicate_scenario(self) -> None:
        """The first combat-tagged scenario carries no goal predicate."""
        combat_scenarios = [
            candidate for candidate in BUILTIN_SCENARIOS if "combat" in candidate.tags
        ]
        first_combat = combat_scenarios[0]
        assert first_combat.goal_predicate is None
# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
class TestScenarioResult:
    """Checks the defaults of a ScenarioResult built from a name alone."""

    def test_default_values(self) -> None:
        """A fresh result is unsuccessful, has zeroed counters and no error."""
        result = ScenarioResult(scenario_name="test")
        assert result.success is False
        assert result.cycles_used == 0
        assert result.llm_calls == 0
        assert result.metabolic_cost == 0.0
        assert result.error is None
class TestBenchmarkMetrics:
    """Covers BenchmarkMetrics aggregates, JSONL persistence and summary text."""

    def test_empty_metrics(self):
        """With no results every aggregate is zero."""
        m = BenchmarkMetrics()
        assert m.pass_count == 0
        assert m.fail_count == 0
        assert m.success_rate == 0.0
        assert m.total_llm_calls == 0
        assert m.total_metabolic_cost == 0.0

    def test_success_rate(self):
        """Two passes out of three give pass/fail counts of 2/1 and a rate of ~2/3."""
        m = BenchmarkMetrics(
            results=[
                ScenarioResult(scenario_name="a", success=True),
                ScenarioResult(scenario_name="b", success=False),
                ScenarioResult(scenario_name="c", success=True),
            ]
        )
        assert m.pass_count == 2
        assert m.fail_count == 1
        # The rate is a float ratio: compare with a 0.01 absolute tolerance via
        # pytest.approx so a failure reports both values instead of a bare False.
        assert m.success_rate == pytest.approx(2 / 3, abs=0.01)

    def test_totals(self):
        """LLM calls and metabolic cost are summed across results."""
        m = BenchmarkMetrics(
            results=[
                ScenarioResult(scenario_name="a", llm_calls=10, metabolic_cost=30.0),
                ScenarioResult(scenario_name="b", llm_calls=5, metabolic_cost=15.0),
            ]
        )
        assert m.total_llm_calls == 15
        # Summed floats are compared approximately rather than bit-for-bit.
        assert m.total_metabolic_cost == pytest.approx(45.0)

    def test_save_and_load(self, tmp_path):
        """A saved run can be read back through load_history()."""
        path = tmp_path / "bench.jsonl"
        m = BenchmarkMetrics(
            timestamp="2026-01-01T00:00:00",
            commit_sha="abc123",
            total_time_ms=1000,
            results=[
                ScenarioResult(
                    scenario_name="a",
                    success=True,
                    cycles_used=5,
                    max_cycles=10,
                ),
            ],
        )
        m.save(path)
        history = load_history(path)
        assert len(history) == 1
        assert history[0]["commit_sha"] == "abc123"
        assert history[0]["scenarios"][0]["scenario_name"] == "a"

    def test_save_appends(self, tmp_path):
        """Repeated saves accumulate entries; history lists the newest first."""
        path = tmp_path / "bench.jsonl"
        for i in range(3):
            m = BenchmarkMetrics(
                timestamp=f"2026-01-0{i + 1}T00:00:00",
                results=[ScenarioResult(scenario_name=f"s{i}")],
            )
            m.save(path)
        history = load_history(path)
        assert len(history) == 3
        # Most recent first
        assert history[0]["timestamp"] == "2026-01-03T00:00:00"

    def test_summary_output(self):
        """The summary text mentions the scenario name, PASS and the commit SHA."""
        m = BenchmarkMetrics(
            timestamp="2026-01-01T00:00:00",
            commit_sha="abc123",
            total_time_ms=500,
            results=[
                ScenarioResult(
                    scenario_name="Walk Test",
                    success=True,
                    cycles_used=5,
                    max_cycles=10,
                    wall_time_ms=200,
                    llm_calls=15,
                ),
            ],
        )
        summary = m.summary()
        assert "Walk Test" in summary
        assert "PASS" in summary
        assert "abc123" in summary

    def test_load_history_missing_file(self, tmp_path):
        """A path that does not exist yields an empty history."""
        assert load_history(tmp_path / "nope.jsonl") == []

    def test_load_history_corrupt_lines(self, tmp_path):
        """Lines that are not valid JSON are skipped; valid ones are kept."""
        path = tmp_path / "bench.jsonl"
        # Explicit encoding keeps the fixture independent of the platform locale.
        path.write_text(
            '{"valid": true}\nnot json\n{"also": "valid"}\n',
            encoding="utf-8",
        )
        history = load_history(path)
        assert len(history) == 2
# ---------------------------------------------------------------------------
# Comparison
# ---------------------------------------------------------------------------
class TestCompareRuns:
    """Checks the labels compare_runs() puts in its report for current vs. baseline."""

    @staticmethod
    def _walk_run(success, cycles_used):
        """Return a BenchmarkMetrics holding one result for the "walk" scenario."""
        walk_result = ScenarioResult(
            scenario_name="walk",
            success=success,
            cycles_used=cycles_used,
        )
        return BenchmarkMetrics(results=[walk_result])

    def test_regression_detected(self) -> None:
        """A pass that turns into a fail is reported as REGRESSION."""
        before = self._walk_run(success=True, cycles_used=10)
        after = self._walk_run(success=False, cycles_used=10)
        text = compare_runs(after, before)
        assert "REGRESSION" in text

    def test_improvement_detected(self) -> None:
        """A fail that turns into a pass is reported as IMPROVEMENT."""
        before = self._walk_run(success=False, cycles_used=10)
        after = self._walk_run(success=True, cycles_used=10)
        text = compare_runs(after, before)
        assert "IMPROVEMENT" in text

    def test_slower_detected(self) -> None:
        """Going from 10 to 20 cycles while still passing is reported as SLOWER."""
        before = self._walk_run(success=True, cycles_used=10)
        after = self._walk_run(success=True, cycles_used=20)
        text = compare_runs(after, before)
        assert "SLOWER" in text

    def test_new_scenario_noted(self) -> None:
        """A scenario missing from the baseline is reported as NEW."""
        before = BenchmarkMetrics(results=[])
        fresh_result = ScenarioResult(scenario_name="new_one", success=True)
        after = BenchmarkMetrics(results=[fresh_result])
        text = compare_runs(after, before)
        assert "NEW" in text
# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------
class TestBenchmarkRunner:
    """Runs ad-hoc and built-in scenarios through BenchmarkRunner.run()."""

    @pytest.mark.asyncio
    async def test_run_single_scenario(self) -> None:
        """A single scenario produces exactly one populated result."""
        walk = BenchmarkScenario(
            name="Test Walk",
            description="Simple test",
            start_location="A",
            goal_location="A",
            max_cycles=3,
            tags=["test"],
        )
        outcome = await BenchmarkRunner().run([walk])
        assert len(outcome.results) == 1
        result = outcome.results[0]
        assert result.scenario_name == "Test Walk"
        assert result.cycles_used == 3  # no predicate, runs all cycles
        assert result.success is True  # no predicate = success if survived
        assert result.wall_time_ms >= 0
        assert result.llm_calls == 9  # 3 cycles * 3 calls
        assert result.metabolic_cost > 0

    @pytest.mark.asyncio
    async def test_run_with_goal_predicate(self) -> None:
        """An immediately satisfied predicate ends the run after one cycle."""

        def always_true(actions, location):
            return True

        instant_win = BenchmarkScenario(
            name="Instant Win",
            description="Predicate satisfied immediately",
            start_location="A",
            max_cycles=100,
            goal_predicate=always_true,
            tags=["test"],
        )
        outcome = await BenchmarkRunner().run([instant_win])
        result = outcome.results[0]
        assert result.success is True
        assert result.cycles_used == 1  # Stopped at first cycle

    @pytest.mark.asyncio
    async def test_run_with_failing_predicate(self) -> None:
        """A predicate that never holds uses every cycle and fails the scenario."""

        def never_true(actions, location):
            return False

        impossible = BenchmarkScenario(
            name="Impossible",
            description="Predicate never satisfied",
            start_location="A",
            max_cycles=5,
            goal_predicate=never_true,
            tags=["test"],
        )
        outcome = await BenchmarkRunner().run([impossible])
        result = outcome.results[0]
        assert result.success is False
        assert result.cycles_used == 5

    @pytest.mark.asyncio
    async def test_run_multiple_scenarios(self) -> None:
        """Three scenarios in one call give three results plus run-level metadata."""
        batch = [
            BenchmarkScenario(
                name=f"Scenario {index}",
                description=f"Test {index}",
                start_location="A",
                max_cycles=2,
                tags=["test"],
            )
            for index in range(3)
        ]
        outcome = await BenchmarkRunner().run(batch)
        assert len(outcome.results) == 3
        assert outcome.total_time_ms >= 0
        assert outcome.timestamp

    @pytest.mark.asyncio
    async def test_metrics_commit_sha(self) -> None:
        """The commit SHA on the metrics is always a string."""
        sha_probe = BenchmarkScenario(
            name="SHA Test",
            description="Check SHA capture",
            start_location="A",
            max_cycles=1,
            tags=["test"],
        )
        outcome = await BenchmarkRunner().run([sha_probe])
        # SHA may or may not be available in test env; just ensure no crash
        assert isinstance(outcome.commit_sha, str)

    @pytest.mark.asyncio
    async def test_builtin_scenarios_run(self) -> None:
        """Every built-in scenario completes a short run without error."""
        # Rebuild each scenario with a 2-cycle cap and no predicate so this
        # stays a fast smoke test.
        trimmed = [
            BenchmarkScenario(
                name=builtin.name,
                description=builtin.description,
                start_location=builtin.start_location,
                goal_location=builtin.goal_location,
                entities=list(builtin.entities),
                events=list(builtin.events),
                max_cycles=2,  # Override for speed
                goal_predicate=None,  # Skip predicate for smoke test
                tags=list(builtin.tags),
            )
            for builtin in BUILTIN_SCENARIOS
        ]
        outcome = await BenchmarkRunner().run(trimmed)
        assert len(outcome.results) == len(BUILTIN_SCENARIOS)
        # All should succeed (no predicate + survived = pass)
        for result in outcome.results:
            assert result.success is True
            assert result.error is None