"""Tests for the agent performance regression benchmark suite. Covers: scenario loading, metrics collection, runner execution, goal predicates, and result persistence. """ from __future__ import annotations import pytest from infrastructure.world.benchmark.metrics import ( BenchmarkMetrics, ScenarioResult, compare_runs, load_history, ) from infrastructure.world.benchmark.runner import BenchmarkRunner from infrastructure.world.benchmark.scenarios import ( BUILTIN_SCENARIOS, BenchmarkScenario, load_scenarios, ) # --------------------------------------------------------------------------- # Scenario definitions # --------------------------------------------------------------------------- class TestBenchmarkScenario: def test_builtin_scenarios_exist(self): assert len(BUILTIN_SCENARIOS) >= 5 def test_scenario_fields(self): s = BUILTIN_SCENARIOS[0] assert s.name assert s.description assert s.start_location assert s.max_cycles > 0 def test_load_all_scenarios(self): scenarios = load_scenarios() assert len(scenarios) == len(BUILTIN_SCENARIOS) def test_load_scenarios_by_tag(self): nav = load_scenarios(tags=["navigation"]) assert len(nav) >= 2 for s in nav: assert "navigation" in s.tags def test_load_scenarios_no_match(self): result = load_scenarios(tags=["nonexistent_tag"]) assert result == [] def test_scenario_is_frozen(self): s = BUILTIN_SCENARIOS[0] with pytest.raises(AttributeError): s.name = "modified" # --------------------------------------------------------------------------- # Goal predicates # --------------------------------------------------------------------------- class TestGoalPredicates: def test_reached_location_predicate(self): s = BUILTIN_SCENARIOS[0] # Walk to Balmora assert s.goal_predicate is not None assert s.goal_predicate([], "Balmora") is True assert s.goal_predicate([], "Seyda Neen") is False def test_reached_location_case_insensitive(self): s = BUILTIN_SCENARIOS[0] assert s.goal_predicate([], "balmora") is True assert s.goal_predicate([], "BALMORA") is True def test_interacted_with_predicate(self): s = BUILTIN_SCENARIOS[1] # Fargoth quest assert s.goal_predicate is not None actions = [{"action": "speak", "target": "Fargoth"}] assert s.goal_predicate(actions, "Seyda Neen") is True def test_interacted_with_no_match(self): s = BUILTIN_SCENARIOS[1] actions = [{"action": "speak", "target": "Guard"}] assert s.goal_predicate(actions, "Seyda Neen") is False def test_interacted_with_interact_action(self): s = BUILTIN_SCENARIOS[1] actions = [{"action": "interact", "target": "Fargoth"}] assert s.goal_predicate(actions, "Seyda Neen") is True def test_no_predicate_scenario(self): combat = [s for s in BUILTIN_SCENARIOS if "combat" in s.tags][0] assert combat.goal_predicate is None # --------------------------------------------------------------------------- # Metrics # --------------------------------------------------------------------------- class TestScenarioResult: def test_default_values(self): r = ScenarioResult(scenario_name="test") assert r.success is False assert r.cycles_used == 0 assert r.llm_calls == 0 assert r.metabolic_cost == 0.0 assert r.error is None class TestBenchmarkMetrics: def test_empty_metrics(self): m = BenchmarkMetrics() assert m.pass_count == 0 assert m.fail_count == 0 assert m.success_rate == 0.0 assert m.total_llm_calls == 0 assert m.total_metabolic_cost == 0.0 def test_success_rate(self): m = BenchmarkMetrics( results=[ ScenarioResult(scenario_name="a", success=True), ScenarioResult(scenario_name="b", success=False), ScenarioResult(scenario_name="c", success=True), ] ) assert m.pass_count == 2 assert m.fail_count == 1 assert abs(m.success_rate - 2 / 3) < 0.01 def test_totals(self): m = BenchmarkMetrics( results=[ ScenarioResult(scenario_name="a", llm_calls=10, metabolic_cost=30.0), ScenarioResult(scenario_name="b", llm_calls=5, metabolic_cost=15.0), ] ) assert m.total_llm_calls == 15 assert m.total_metabolic_cost == 45.0 def test_save_and_load(self, tmp_path): path = tmp_path / "bench.jsonl" m = BenchmarkMetrics( timestamp="2026-01-01T00:00:00", commit_sha="abc123", total_time_ms=1000, results=[ ScenarioResult( scenario_name="a", success=True, cycles_used=5, max_cycles=10, ), ], ) m.save(path) history = load_history(path) assert len(history) == 1 assert history[0]["commit_sha"] == "abc123" assert history[0]["scenarios"][0]["scenario_name"] == "a" def test_save_appends(self, tmp_path): path = tmp_path / "bench.jsonl" for i in range(3): m = BenchmarkMetrics( timestamp=f"2026-01-0{i + 1}T00:00:00", results=[ScenarioResult(scenario_name=f"s{i}")], ) m.save(path) history = load_history(path) assert len(history) == 3 # Most recent first assert history[0]["timestamp"] == "2026-01-03T00:00:00" def test_summary_output(self): m = BenchmarkMetrics( timestamp="2026-01-01T00:00:00", commit_sha="abc123", total_time_ms=500, results=[ ScenarioResult( scenario_name="Walk Test", success=True, cycles_used=5, max_cycles=10, wall_time_ms=200, llm_calls=15, ), ], ) summary = m.summary() assert "Walk Test" in summary assert "PASS" in summary assert "abc123" in summary def test_load_history_missing_file(self, tmp_path): assert load_history(tmp_path / "nope.jsonl") == [] def test_load_history_corrupt_lines(self, tmp_path): path = tmp_path / "bench.jsonl" path.write_text('{"valid": true}\nnot json\n{"also": "valid"}\n') history = load_history(path) assert len(history) == 2 # --------------------------------------------------------------------------- # Comparison # --------------------------------------------------------------------------- class TestCompareRuns: def test_regression_detected(self): baseline = BenchmarkMetrics( results=[ ScenarioResult(scenario_name="walk", success=True, cycles_used=10), ] ) current = BenchmarkMetrics( results=[ ScenarioResult(scenario_name="walk", success=False, cycles_used=10), ] ) report = compare_runs(current, baseline) assert "REGRESSION" in report def test_improvement_detected(self): baseline = BenchmarkMetrics( results=[ ScenarioResult(scenario_name="walk", success=False, cycles_used=10), ] ) current = BenchmarkMetrics( results=[ ScenarioResult(scenario_name="walk", success=True, cycles_used=10), ] ) report = compare_runs(current, baseline) assert "IMPROVEMENT" in report def test_slower_detected(self): baseline = BenchmarkMetrics( results=[ ScenarioResult(scenario_name="walk", success=True, cycles_used=10), ] ) current = BenchmarkMetrics( results=[ ScenarioResult(scenario_name="walk", success=True, cycles_used=20), ] ) report = compare_runs(current, baseline) assert "SLOWER" in report def test_new_scenario_noted(self): baseline = BenchmarkMetrics(results=[]) current = BenchmarkMetrics(results=[ScenarioResult(scenario_name="new_one", success=True)]) report = compare_runs(current, baseline) assert "NEW" in report # --------------------------------------------------------------------------- # Runner # --------------------------------------------------------------------------- class TestBenchmarkRunner: @pytest.mark.asyncio async def test_run_single_scenario(self): """Runner executes a scenario and returns a result.""" scenario = BenchmarkScenario( name="Test Walk", description="Simple test", start_location="A", goal_location="A", max_cycles=3, tags=["test"], ) runner = BenchmarkRunner() metrics = await runner.run([scenario]) assert len(metrics.results) == 1 r = metrics.results[0] assert r.scenario_name == "Test Walk" assert r.cycles_used == 3 # no predicate, runs all cycles assert r.success is True # no predicate = success if survived assert r.wall_time_ms >= 0 assert r.llm_calls == 9 # 3 cycles * 3 calls assert r.metabolic_cost > 0 @pytest.mark.asyncio async def test_run_with_goal_predicate(self): """Runner stops early when goal predicate is satisfied.""" def always_true(actions, location): return True scenario = BenchmarkScenario( name="Instant Win", description="Predicate satisfied immediately", start_location="A", max_cycles=100, goal_predicate=always_true, tags=["test"], ) runner = BenchmarkRunner() metrics = await runner.run([scenario]) r = metrics.results[0] assert r.success is True assert r.cycles_used == 1 # Stopped at first cycle @pytest.mark.asyncio async def test_run_with_failing_predicate(self): """Scenario fails when predicate never satisfied.""" def never_true(actions, location): return False scenario = BenchmarkScenario( name="Impossible", description="Predicate never satisfied", start_location="A", max_cycles=5, goal_predicate=never_true, tags=["test"], ) runner = BenchmarkRunner() metrics = await runner.run([scenario]) r = metrics.results[0] assert r.success is False assert r.cycles_used == 5 @pytest.mark.asyncio async def test_run_multiple_scenarios(self): """Runner handles multiple scenarios in sequence.""" scenarios = [ BenchmarkScenario( name=f"Scenario {i}", description=f"Test {i}", start_location="A", max_cycles=2, tags=["test"], ) for i in range(3) ] runner = BenchmarkRunner() metrics = await runner.run(scenarios) assert len(metrics.results) == 3 assert metrics.total_time_ms >= 0 assert metrics.timestamp @pytest.mark.asyncio async def test_metrics_commit_sha(self): """Runner captures git SHA in metrics.""" scenario = BenchmarkScenario( name="SHA Test", description="Check SHA capture", start_location="A", max_cycles=1, tags=["test"], ) runner = BenchmarkRunner() metrics = await runner.run([scenario]) # SHA may or may not be available in test env; just ensure no crash assert isinstance(metrics.commit_sha, str) @pytest.mark.asyncio async def test_builtin_scenarios_run(self): """All built-in scenarios run without crashing.""" # Use just 2 cycles each to keep tests fast scenarios = [ BenchmarkScenario( name=s.name, description=s.description, start_location=s.start_location, goal_location=s.goal_location, entities=list(s.entities), events=list(s.events), max_cycles=2, # Override for speed goal_predicate=None, # Skip predicate for smoke test tags=list(s.tags), ) for s in BUILTIN_SCENARIOS ] runner = BenchmarkRunner() metrics = await runner.run(scenarios) assert len(metrics.results) == len(BUILTIN_SCENARIOS) # All should succeed (no predicate + survived = pass) for r in metrics.results: assert r.success is True assert r.error is None