Implement standardised Morrowind benchmark scenarios to detect agent
performance regressions after code changes.

- 5 built-in scenarios: navigation (Seyda Neen→Balmora, Balmora intra-city),
  quest (Fargoth's Ring), combat (Mudcrab), observation
- BenchmarkRunner executes scenarios through the heartbeat loop with
  MockWorldAdapter, tracking cycles, wall time, LLM calls, metabolic cost
- Goal predicates (reached_location, interacted_with) for early success
- BenchmarkMetrics with JSONL persistence and compare_runs() for regression
  detection
- CLI script (scripts/run_benchmarks.py) with tag filtering and baseline
  comparison
- tox -e benchmark environment for CI integration
- 31 unit tests covering scenarios, predicates, metrics, runner, and
  persistence

Fixes #1015

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""Tests for the agent performance regression benchmark suite.
|
|
|
|
Covers: scenario loading, metrics collection, runner execution,
|
|
goal predicates, and result persistence.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from infrastructure.world.benchmark.metrics import (
|
|
BenchmarkMetrics,
|
|
ScenarioResult,
|
|
compare_runs,
|
|
load_history,
|
|
)
|
|
from infrastructure.world.benchmark.runner import BenchmarkRunner
|
|
from infrastructure.world.benchmark.scenarios import (
|
|
BUILTIN_SCENARIOS,
|
|
BenchmarkScenario,
|
|
load_scenarios,
|
|
)
|
|
|
|
# ---------------------------------------------------------------------------
# Scenario definitions
# ---------------------------------------------------------------------------
class TestBenchmarkScenario:
    """Shape and loading behaviour of the built-in benchmark scenarios."""

    def test_builtin_scenarios_exist(self):
        # The suite ships with at least the five documented scenarios.
        assert len(BUILTIN_SCENARIOS) >= 5

    def test_scenario_fields(self):
        first = BUILTIN_SCENARIOS[0]
        # Every scenario carries the core descriptive fields.
        assert first.name
        assert first.description
        assert first.start_location
        assert first.max_cycles > 0

    def test_load_all_scenarios(self):
        # With no tag filter, load_scenarios returns every built-in scenario.
        assert len(load_scenarios()) == len(BUILTIN_SCENARIOS)

    def test_load_scenarios_by_tag(self):
        navigation = load_scenarios(tags=["navigation"])
        assert len(navigation) >= 2
        # Filtering must return only scenarios actually carrying the tag.
        assert all("navigation" in sc.tags for sc in navigation)

    def test_load_scenarios_no_match(self):
        # An unknown tag yields an empty list rather than raising.
        assert load_scenarios(tags=["nonexistent_tag"]) == []

    def test_scenario_is_frozen(self):
        scenario = BUILTIN_SCENARIOS[0]
        # Frozen dataclass: assignment raises (FrozenInstanceError is an
        # AttributeError subclass, so this catch also covers plain slots/props).
        with pytest.raises(AttributeError):
            scenario.name = "modified"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Goal predicates
# ---------------------------------------------------------------------------
class TestGoalPredicates:
    """Built-in goal predicates: location arrival and NPC interaction."""

    def test_reached_location_predicate(self):
        walk = BUILTIN_SCENARIOS[0]  # Walk to Balmora
        assert walk.goal_predicate is not None
        assert walk.goal_predicate([], "Balmora") is True
        assert walk.goal_predicate([], "Seyda Neen") is False

    def test_reached_location_case_insensitive(self):
        walk = BUILTIN_SCENARIOS[0]
        # Location matching must ignore case in either direction.
        for spelling in ("balmora", "BALMORA"):
            assert walk.goal_predicate([], spelling) is True

    def test_interacted_with_predicate(self):
        quest = BUILTIN_SCENARIOS[1]  # Fargoth quest
        assert quest.goal_predicate is not None
        history = [{"action": "speak", "target": "Fargoth"}]
        assert quest.goal_predicate(history, "Seyda Neen") is True

    def test_interacted_with_no_match(self):
        quest = BUILTIN_SCENARIOS[1]
        # Talking to someone else must not satisfy the goal.
        history = [{"action": "speak", "target": "Guard"}]
        assert quest.goal_predicate(history, "Seyda Neen") is False

    def test_interacted_with_interact_action(self):
        quest = BUILTIN_SCENARIOS[1]
        # "interact" satisfies the predicate just like "speak".
        history = [{"action": "interact", "target": "Fargoth"}]
        assert quest.goal_predicate(history, "Seyda Neen") is True

    def test_no_predicate_scenario(self):
        combat_scenarios = [sc for sc in BUILTIN_SCENARIOS if "combat" in sc.tags]
        # Combat scenarios run purely on the cycle budget: no goal predicate.
        assert combat_scenarios[0].goal_predicate is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
class TestScenarioResult:
    """Default field values of a freshly constructed ScenarioResult."""

    def test_default_values(self):
        result = ScenarioResult(scenario_name="test")
        # A new result starts pessimistic and empty: not successful, no work
        # recorded, no cost accrued, and no error captured.
        assert result.success is False
        assert result.cycles_used == 0
        assert result.llm_calls == 0
        assert result.metabolic_cost == 0.0
        assert result.error is None
|
|
|
|
|
|
class TestBenchmarkMetrics:
    """Aggregation, persistence, and summary rendering of BenchmarkMetrics."""

    def test_empty_metrics(self):
        empty = BenchmarkMetrics()
        # No results: all counters and rates are zero.
        assert empty.pass_count == 0
        assert empty.fail_count == 0
        assert empty.success_rate == 0.0
        assert empty.total_llm_calls == 0
        assert empty.total_metabolic_cost == 0.0

    def test_success_rate(self):
        outcomes = [("a", True), ("b", False), ("c", True)]
        metrics = BenchmarkMetrics(
            results=[
                ScenarioResult(scenario_name=name, success=ok)
                for name, ok in outcomes
            ]
        )
        assert metrics.pass_count == 2
        assert metrics.fail_count == 1
        # 2 of 3 passed; compare with a tolerance since the rate is a float.
        assert abs(metrics.success_rate - 2 / 3) < 0.01

    def test_totals(self):
        metrics = BenchmarkMetrics(
            results=[
                ScenarioResult(scenario_name="a", llm_calls=10, metabolic_cost=30.0),
                ScenarioResult(scenario_name="b", llm_calls=5, metabolic_cost=15.0),
            ]
        )
        assert metrics.total_llm_calls == 15
        assert metrics.total_metabolic_cost == 45.0

    def test_save_and_load(self, tmp_path):
        target = tmp_path / "bench.jsonl"
        record = ScenarioResult(
            scenario_name="a",
            success=True,
            cycles_used=5,
            max_cycles=10,
        )
        BenchmarkMetrics(
            timestamp="2026-01-01T00:00:00",
            commit_sha="abc123",
            total_time_ms=1000,
            results=[record],
        ).save(target)

        history = load_history(target)
        # The round-trip preserves run metadata and per-scenario entries.
        assert len(history) == 1
        assert history[0]["commit_sha"] == "abc123"
        assert history[0]["scenarios"][0]["scenario_name"] == "a"

    def test_save_appends(self, tmp_path):
        target = tmp_path / "bench.jsonl"
        for day in (1, 2, 3):
            BenchmarkMetrics(
                timestamp=f"2026-01-0{day}T00:00:00",
                results=[ScenarioResult(scenario_name=f"s{day - 1}")],
            ).save(target)

        history = load_history(target)
        assert len(history) == 3
        # load_history orders entries newest-first.
        assert history[0]["timestamp"] == "2026-01-03T00:00:00"

    def test_summary_output(self):
        metrics = BenchmarkMetrics(
            timestamp="2026-01-01T00:00:00",
            commit_sha="abc123",
            total_time_ms=500,
            results=[
                ScenarioResult(
                    scenario_name="Walk Test",
                    success=True,
                    cycles_used=5,
                    max_cycles=10,
                    wall_time_ms=200,
                    llm_calls=15,
                ),
            ],
        )
        text = metrics.summary()
        # The rendered summary names the scenario, its verdict, and the SHA.
        for expected in ("Walk Test", "PASS", "abc123"):
            assert expected in text

    def test_load_history_missing_file(self, tmp_path):
        # A path that was never written yields an empty history, not an error.
        assert load_history(tmp_path / "nope.jsonl") == []

    def test_load_history_corrupt_lines(self, tmp_path):
        target = tmp_path / "bench.jsonl"
        target.write_text('{"valid": true}\nnot json\n{"also": "valid"}\n')
        # Invalid JSON lines are skipped rather than aborting the whole load.
        assert len(load_history(target)) == 2
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Comparison
# ---------------------------------------------------------------------------
class TestCompareRuns:
    """compare_runs flags regressions, improvements, slowdowns, new scenarios."""

    @staticmethod
    def _single_run(*, success, cycles):
        # Build a one-scenario metrics object for the "walk" scenario.
        return BenchmarkMetrics(
            results=[
                ScenarioResult(
                    scenario_name="walk", success=success, cycles_used=cycles
                )
            ]
        )

    def test_regression_detected(self):
        # Pass in baseline, fail now -> regression.
        report = compare_runs(
            self._single_run(success=False, cycles=10),
            self._single_run(success=True, cycles=10),
        )
        assert "REGRESSION" in report

    def test_improvement_detected(self):
        # Fail in baseline, pass now -> improvement.
        report = compare_runs(
            self._single_run(success=True, cycles=10),
            self._single_run(success=False, cycles=10),
        )
        assert "IMPROVEMENT" in report

    def test_slower_detected(self):
        # Still passing but taking twice the cycles -> slower.
        report = compare_runs(
            self._single_run(success=True, cycles=20),
            self._single_run(success=True, cycles=10),
        )
        assert "SLOWER" in report

    def test_new_scenario_noted(self):
        baseline = BenchmarkMetrics(results=[])
        current = BenchmarkMetrics(
            results=[ScenarioResult(scenario_name="new_one", success=True)]
        )
        # A scenario absent from the baseline is reported as new.
        assert "NEW" in compare_runs(current, baseline)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------
class TestBenchmarkRunner:
    """End-to-end execution of scenarios through BenchmarkRunner."""

    @pytest.mark.asyncio
    async def test_run_single_scenario(self):
        """Runner executes a scenario and returns a result."""
        scenario = BenchmarkScenario(
            name="Test Walk",
            description="Simple test",
            start_location="A",
            goal_location="A",
            max_cycles=3,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([scenario])
        assert len(metrics.results) == 1
        outcome = metrics.results[0]
        assert outcome.scenario_name == "Test Walk"
        assert outcome.cycles_used == 3  # no predicate: runs all cycles
        assert outcome.success is True  # no predicate: surviving counts as pass
        assert outcome.wall_time_ms >= 0
        assert outcome.llm_calls == 9  # 3 cycles * 3 calls per cycle
        assert outcome.metabolic_cost > 0

    @pytest.mark.asyncio
    async def test_run_with_goal_predicate(self):
        """Runner stops early when goal predicate is satisfied."""
        scenario = BenchmarkScenario(
            name="Instant Win",
            description="Predicate satisfied immediately",
            start_location="A",
            max_cycles=100,
            goal_predicate=lambda actions, location: True,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([scenario])
        outcome = metrics.results[0]
        assert outcome.success is True
        assert outcome.cycles_used == 1  # satisfied on the very first cycle

    @pytest.mark.asyncio
    async def test_run_with_failing_predicate(self):
        """Scenario fails when predicate never satisfied."""
        scenario = BenchmarkScenario(
            name="Impossible",
            description="Predicate never satisfied",
            start_location="A",
            max_cycles=5,
            goal_predicate=lambda actions, location: False,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([scenario])
        outcome = metrics.results[0]
        assert outcome.success is False
        assert outcome.cycles_used == 5

    @pytest.mark.asyncio
    async def test_run_multiple_scenarios(self):
        """Runner handles multiple scenarios in sequence."""
        batch = [
            BenchmarkScenario(
                name=f"Scenario {idx}",
                description=f"Test {idx}",
                start_location="A",
                max_cycles=2,
                tags=["test"],
            )
            for idx in range(3)
        ]
        metrics = await BenchmarkRunner().run(batch)
        assert len(metrics.results) == 3
        assert metrics.total_time_ms >= 0
        assert metrics.timestamp

    @pytest.mark.asyncio
    async def test_metrics_commit_sha(self):
        """Runner captures git SHA in metrics."""
        scenario = BenchmarkScenario(
            name="SHA Test",
            description="Check SHA capture",
            start_location="A",
            max_cycles=1,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([scenario])
        # SHA may or may not be available in the test environment; only
        # assert the field is a string so the capture path doesn't crash.
        assert isinstance(metrics.commit_sha, str)

    @pytest.mark.asyncio
    async def test_builtin_scenarios_run(self):
        """All built-in scenarios run without crashing."""
        # Clone each built-in with a 2-cycle budget and no predicate so the
        # smoke test stays fast and deterministic.
        fast_clones = [
            BenchmarkScenario(
                name=original.name,
                description=original.description,
                start_location=original.start_location,
                goal_location=original.goal_location,
                entities=list(original.entities),
                events=list(original.events),
                max_cycles=2,
                goal_predicate=None,
                tags=list(original.tags),
            )
            for original in BUILTIN_SCENARIOS
        ]
        metrics = await BenchmarkRunner().run(fast_clones)
        assert len(metrics.results) == len(BUILTIN_SCENARIOS)
        # With no predicate, surviving to the cycle cap counts as success.
        for outcome in metrics.results:
            assert outcome.success is True
            assert outcome.error is None
|