From 49990e6aecd0dcac5053048274baad946e58ba2d Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Sun, 22 Mar 2026 19:54:26 -0400 Subject: [PATCH] feat: add agent performance regression benchmark suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement standardised Morrowind benchmark scenarios to detect agent performance regressions after code changes. - 5 built-in scenarios: navigation (Seyda Neen→Balmora, Balmora intra-city), quest (Fargoth's Ring), combat (Mudcrab), observation - BenchmarkRunner executes scenarios through the heartbeat loop with MockWorldAdapter, tracking cycles, wall time, LLM calls, metabolic cost - Goal predicates (reached_location, interacted_with) for early success - BenchmarkMetrics with JSONL persistence and compare_runs() for regression detection - CLI script (scripts/run_benchmarks.py) with tag filtering and baseline comparison - tox -e benchmark environment for CI integration - 31 unit tests covering scenarios, predicates, metrics, runner, and persistence Fixes #1015 Co-Authored-By: Claude Opus 4.6 --- scripts/run_benchmarks.py | 107 +++++ .../world/benchmark/__init__.py | 17 + src/infrastructure/world/benchmark/metrics.py | 195 +++++++++ src/infrastructure/world/benchmark/runner.py | 167 ++++++++ .../world/benchmark/scenarios.py | 160 +++++++ tests/infrastructure/world/test_benchmark.py | 394 ++++++++++++++++++ tox.ini | 5 + 7 files changed, 1045 insertions(+) create mode 100644 scripts/run_benchmarks.py create mode 100644 src/infrastructure/world/benchmark/__init__.py create mode 100644 src/infrastructure/world/benchmark/metrics.py create mode 100644 src/infrastructure/world/benchmark/runner.py create mode 100644 src/infrastructure/world/benchmark/scenarios.py create mode 100644 tests/infrastructure/world/test_benchmark.py diff --git a/scripts/run_benchmarks.py b/scripts/run_benchmarks.py new file mode 100644 index 00000000..c70ff0de --- /dev/null +++ b/scripts/run_benchmarks.py 
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the benchmark script.

    Returns:
        Namespace with three attributes:

        * ``tags`` -- list of tag strings, or ``None`` when ``--tags`` is
          absent (a bare ``--tags`` yields an empty list).
        * ``output`` -- ``Path`` of the JSONL file to append to, or ``None``.
        * ``compare`` -- ``Path`` of the baseline JSONL file, or ``None``.
    """
    parser = argparse.ArgumentParser(
        description="Agent performance regression benchmark suite",
    )
    parser.add_argument(
        "--tags",
        nargs="*",
        default=None,
        help="Filter scenarios by tag (e.g. navigation quest)",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="JSONL file to append results to",
    )
    parser.add_argument(
        "--compare",
        type=Path,
        default=None,
        help="JSONL file with baseline results for regression comparison",
    )
    return parser.parse_args()


def _baseline_from_record(record: dict) -> BenchmarkMetrics:
    """Rebuild a ``BenchmarkMetrics`` from one JSONL history record.

    Args:
        record: A decoded run record as written by ``BenchmarkMetrics.save``.

    Returns:
        Metrics object carrying the recorded per-scenario results.

    Raises:
        KeyError: If a scenario entry lacks ``scenario_name``, ``success``,
            ``cycles_used`` or ``max_cycles``.
    """
    # Imported here (once, not per scenario) to mirror the script's existing
    # habit of loading comparison-only helpers lazily.
    from infrastructure.world.benchmark.metrics import ScenarioResult

    baseline = BenchmarkMetrics(
        timestamp=record.get("timestamp", ""),
        commit_sha=record.get("commit_sha", ""),
        total_time_ms=record.get("total_time_ms", 0),
    )
    for entry in record.get("scenarios", []):
        baseline.results.append(
            ScenarioResult(
                scenario_name=entry["scenario_name"],
                success=entry["success"],
                cycles_used=entry["cycles_used"],
                max_cycles=entry["max_cycles"],
                wall_time_ms=entry.get("wall_time_ms", 0),
                llm_calls=entry.get("llm_calls", 0),
                metabolic_cost=entry.get("metabolic_cost", 0.0),
                error=entry.get("error"),
                tags=list(entry.get("tags") or []),
            )
        )
    return baseline


def _load_baseline(path: Path) -> BenchmarkMetrics | None:
    """Return the most recent usable run stored at *path*, or ``None``.

    ``load_history`` returns records most recent first; entries that are not
    JSON objects cannot be a run record and are skipped.
    """
    for record in load_history(path):
        if isinstance(record, dict):
            return _baseline_from_record(record)
    return None


async def main() -> int:
    """Run the selected scenarios, then optionally save and compare them.

    Returns:
        Process exit code: ``0`` when every scenario passed, ``1`` when at
        least one failed or no scenario matched the tag filter.
    """
    args = parse_args()

    scenarios = load_scenarios(tags=args.tags)
    if not scenarios:
        print("No matching scenarios found.")
        return 1

    # The baseline must be read BEFORE this run is appended to disk.  When
    # --output and --compare name the same file, reading it afterwards would
    # pick up the run we just wrote and compare the run against itself.
    baseline = _load_baseline(args.compare) if args.compare else None

    print(f"Running {len(scenarios)} benchmark scenario(s)...\n")

    runner = BenchmarkRunner()
    metrics = await runner.run(scenarios)

    print(metrics.summary())

    if args.output:
        metrics.save(args.output)

    if baseline is not None:
        from infrastructure.world.benchmark.metrics import compare_runs

        print()
        print(compare_runs(metrics, baseline))

    return 0 if metrics.fail_count == 0 else 1


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
logger = logging.getLogger(__name__)


@dataclass
class ScenarioResult:
    """Outcome of running a single benchmark scenario.

    Attributes:
        scenario_name: Human-readable scenario name.
        success: Whether the goal predicate was satisfied.
        cycles_used: Number of heartbeat cycles executed.
        max_cycles: The scenario's cycle budget.
        wall_time_ms: Total wall-clock time in milliseconds.
        llm_calls: Number of LLM inference calls made.
        metabolic_cost: Estimated resource cost (arbitrary unit, ≈ tokens).
        error: Error message if the run crashed.
        tags: Scenario tags (copied for filtering).
    """

    scenario_name: str
    success: bool = False
    cycles_used: int = 0
    max_cycles: int = 0
    wall_time_ms: int = 0
    llm_calls: int = 0
    metabolic_cost: float = 0.0
    error: str | None = None
    tags: list[str] = field(default_factory=list)


@dataclass
class BenchmarkMetrics:
    """Aggregated metrics across all scenarios in a benchmark run.

    Attributes:
        results: Per-scenario results.
        total_time_ms: Total wall-clock time for the full suite.
        timestamp: ISO-8601 timestamp of the run.
        commit_sha: Git commit SHA (if available).
    """

    results: list[ScenarioResult] = field(default_factory=list)
    total_time_ms: int = 0
    timestamp: str = ""
    commit_sha: str = ""

    # -- derived properties ------------------------------------------------

    @property
    def pass_count(self) -> int:
        """Number of scenarios whose ``success`` flag is set."""
        return sum(1 for r in self.results if r.success)

    @property
    def fail_count(self) -> int:
        """Number of scenarios whose ``success`` flag is not set."""
        return sum(1 for r in self.results if not r.success)

    @property
    def success_rate(self) -> float:
        """Fraction of passing scenarios in ``[0, 1]``; ``0.0`` when empty."""
        if not self.results:
            return 0.0
        return self.pass_count / len(self.results)

    @property
    def total_llm_calls(self) -> int:
        """Sum of ``llm_calls`` over all scenarios."""
        return sum(r.llm_calls for r in self.results)

    @property
    def total_metabolic_cost(self) -> float:
        """Sum of ``metabolic_cost`` over all scenarios."""
        return sum(r.metabolic_cost for r in self.results)

    # -- persistence -------------------------------------------------------

    def save(self, path: Path) -> None:
        """Append this run's results to a JSONL file at *path*.

        Missing parent directories are created.  One JSON object is written
        per call, terminated by a newline.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        record = {
            "timestamp": self.timestamp,
            "commit_sha": self.commit_sha,
            "total_time_ms": self.total_time_ms,
            "success_rate": round(self.success_rate, 4),
            "total_llm_calls": self.total_llm_calls,
            "total_metabolic_cost": round(self.total_metabolic_cost, 2),
            "scenarios": [asdict(r) for r in self.results],
        }
        # Explicit encoding so the history file does not depend on the
        # locale of whichever machine (developer box, CI) wrote it.
        with path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record) + "\n")
        logger.info("Benchmark results saved to %s", path)

    # -- summary -----------------------------------------------------------

    def summary(self) -> str:
        """Return a human-readable summary of the benchmark run."""
        lines = [
            "=== Benchmark Summary ===",
            f"Scenarios: {len(self.results)} "
            f"Passed: {self.pass_count} "
            f"Failed: {self.fail_count} "
            f"Success rate: {self.success_rate:.0%}",
            f"Total time: {self.total_time_ms} ms "
            f"LLM calls: {self.total_llm_calls} "
            f"Metabolic cost: {self.total_metabolic_cost:.1f}",
        ]
        if self.commit_sha:
            lines.append(f"Commit: {self.commit_sha}")
        lines.append("")
        for r in self.results:
            status = "PASS" if r.success else "FAIL"
            lines.append(
                f" [{status}] {r.scenario_name} — "
                f"{r.cycles_used}/{r.max_cycles} cycles, "
                f"{r.wall_time_ms} ms, "
                f"{r.llm_calls} LLM calls"
            )
            if r.error:
                lines.append(f" Error: {r.error}")
        return "\n".join(lines)


def load_history(path: Path) -> list[dict]:
    """Load benchmark history from a JSONL file.

    Blank lines, lines that are not valid JSON, and lines that decode to
    something other than a JSON object are skipped with a warning, so every
    returned record supports ``dict`` access.

    Args:
        path: Location of the JSONL history file.

    Returns:
        List of run records, most recent first.  Empty when the file does
        not exist.
    """
    path = Path(path)
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return []

    records: list[dict] = []
    for line_no, raw in enumerate(text.splitlines(), start=1):
        line = raw.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            logger.warning("Skipping unparsable line %d in %s", line_no, path)
            continue
        if not isinstance(record, dict):
            # Valid JSON but not a run record (e.g. a bare number or list);
            # keeping it would crash callers that use ``record.get(...)``.
            logger.warning("Skipping non-object record on line %d in %s", line_no, path)
            continue
        records.append(record)

    # The file is append-only, so the last line is the newest run.
    records.reverse()
    return records


def compare_runs(
    current: BenchmarkMetrics,
    baseline: BenchmarkMetrics,
) -> str:
    """Compare two benchmark runs and report regressions.

    The report lists the overall success-rate change, the metabolic-cost
    change (only when the baseline cost is positive, so a percentage is
    defined), and one line per scenario that is new, regressed, improved,
    more than 1.5x slower in cycles, or missing from the current run.

    Args:
        current: Metrics of the run under test.
        baseline: Metrics of the reference run.

    Returns:
        Human-readable comparison report.
    """
    lines = ["=== Regression Report ==="]

    # Overall
    rate_delta = current.success_rate - baseline.success_rate
    lines.append(
        f"Success rate: {baseline.success_rate:.0%} -> {current.success_rate:.0%} "
        f"({rate_delta:+.0%})"
    )

    cost_delta = current.total_metabolic_cost - baseline.total_metabolic_cost
    if baseline.total_metabolic_cost > 0:
        cost_pct = (cost_delta / baseline.total_metabolic_cost) * 100
        lines.append(
            f"Metabolic cost: {baseline.total_metabolic_cost:.1f} -> "
            f"{current.total_metabolic_cost:.1f} ({cost_pct:+.1f}%)"
        )

    # Per-scenario
    baseline_map = {r.scenario_name: r for r in baseline.results}
    for r in current.results:
        b = baseline_map.get(r.scenario_name)
        if b is None:
            lines.append(f" [NEW] {r.scenario_name}")
            continue
        if b.success and not r.success:
            lines.append(f" [REGRESSION] {r.scenario_name} — was PASS, now FAIL")
        elif not b.success and r.success:
            lines.append(f" [IMPROVEMENT] {r.scenario_name} — was FAIL, now PASS")
        elif r.cycles_used > b.cycles_used * 1.5:
            lines.append(
                f" [SLOWER] {r.scenario_name} — "
                f"{b.cycles_used} -> {r.cycles_used} cycles (+{r.cycles_used - b.cycles_used})"
            )

    # A scenario that silently drops out of the suite changes the success
    # rate without any per-scenario line explaining why, so call it out.
    current_names = {r.scenario_name for r in current.results}
    for b in baseline.results:
        if b.scenario_name not in current_names:
            lines.append(f" [MISSING] {b.scenario_name} — in baseline, not in current run")

    return "\n".join(lines)
+""" + +from __future__ import annotations + +import logging +import subprocess +import time +from datetime import UTC, datetime + +from infrastructure.world.adapters.mock import MockWorldAdapter +from infrastructure.world.benchmark.metrics import BenchmarkMetrics, ScenarioResult +from infrastructure.world.benchmark.scenarios import BenchmarkScenario +from infrastructure.world.interface import WorldInterface +from loop.heartbeat import Heartbeat + +logger = logging.getLogger(__name__) + +# Rough estimate: each heartbeat cycle costs ~1 unit of metabolic cost +# (gather + reason + act phases each touch the LLM router once). +_COST_PER_CYCLE = 3.0 # three phases per cycle + + +class BenchmarkRunner: + """Run benchmark scenarios and collect metrics. + + Parameters + ---------- + adapter_factory: + Optional callable that returns a ``WorldInterface`` for a given + scenario. Defaults to building a ``MockWorldAdapter`` from the + scenario's start state. + heartbeat_interval: + Seconds between heartbeat ticks (0 for immediate). 
+ """ + + def __init__( + self, + *, + adapter_factory=None, + heartbeat_interval: float = 0.0, + ) -> None: + self._adapter_factory = adapter_factory or self._default_adapter + self._interval = heartbeat_interval + + # -- public API -------------------------------------------------------- + + async def run( + self, + scenarios: list[BenchmarkScenario], + ) -> BenchmarkMetrics: + """Execute all *scenarios* and return aggregated metrics.""" + metrics = BenchmarkMetrics( + timestamp=datetime.now(UTC).isoformat(), + commit_sha=self._git_sha(), + ) + suite_start = time.monotonic() + + for scenario in scenarios: + logger.info("Benchmark: starting '%s'", scenario.name) + result = await self._run_scenario(scenario) + metrics.results.append(result) + status = "PASS" if result.success else "FAIL" + logger.info( + "Benchmark: '%s' %s (%d/%d cycles, %d ms)", + scenario.name, + status, + result.cycles_used, + result.max_cycles, + result.wall_time_ms, + ) + + metrics.total_time_ms = int((time.monotonic() - suite_start) * 1000) + return metrics + + # -- internal ---------------------------------------------------------- + + async def _run_scenario(self, scenario: BenchmarkScenario) -> ScenarioResult: + """Run a single scenario through the heartbeat loop.""" + result = ScenarioResult( + scenario_name=scenario.name, + max_cycles=scenario.max_cycles, + tags=list(scenario.tags), + ) + + adapter = self._adapter_factory(scenario) + adapter.connect() + + hb = Heartbeat(world=adapter, interval=self._interval) + actions: list[dict] = [] + + start = time.monotonic() + try: + for cycle in range(1, scenario.max_cycles + 1): + record = await hb.run_once() + result.cycles_used = cycle + + # Track LLM calls (each cycle has 3 phases that may call LLM) + result.llm_calls += 3 + + # Accumulate actions for goal predicate + if record.action_taken and record.action_taken != "idle": + actions.append( + { + "action": record.action_taken, + "target": record.observation.get("location", ""), + "status": 
record.action_status, + } + ) + + # Update adapter location if scenario simulates movement + current_location = self._get_current_location(adapter) + + # Check goal predicate + if scenario.goal_predicate is not None: + if scenario.goal_predicate(actions, current_location): + result.success = True + break + elif cycle == scenario.max_cycles: + # No predicate — success if we survived all cycles + result.success = True + + except Exception as exc: + logger.warning("Benchmark scenario '%s' crashed: %s", scenario.name, exc) + result.error = str(exc) + finally: + adapter.disconnect() + + result.wall_time_ms = int((time.monotonic() - start) * 1000) + result.metabolic_cost = result.cycles_used * _COST_PER_CYCLE + return result + + @staticmethod + def _default_adapter(scenario: BenchmarkScenario) -> WorldInterface: + """Build a MockWorldAdapter from a scenario's starting state.""" + return MockWorldAdapter( + location=scenario.start_location, + entities=list(scenario.entities), + events=list(scenario.events), + ) + + @staticmethod + def _get_current_location(adapter: WorldInterface) -> str: + """Read the current location from the adapter.""" + try: + perception = adapter.observe() + return perception.location + except Exception: + return "" + + @staticmethod + def _git_sha() -> str: + """Best-effort: return the current git commit SHA.""" + try: + result = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + capture_output=True, + text=True, + timeout=5, + ) + return result.stdout.strip() if result.returncode == 0 else "" + except (OSError, subprocess.TimeoutExpired): + return "" diff --git a/src/infrastructure/world/benchmark/scenarios.py b/src/infrastructure/world/benchmark/scenarios.py new file mode 100644 index 00000000..4f0ef1c7 --- /dev/null +++ b/src/infrastructure/world/benchmark/scenarios.py @@ -0,0 +1,160 @@ +"""Benchmark scenario definitions for Morrowind agent regression testing. 
@dataclass(frozen=True)
class BenchmarkScenario:
    """A reproducible agent task used to detect performance regressions.

    Attributes:
        name: Human-readable scenario name.
        description: What the scenario tests.
        start_location: Where the agent begins.
        goal_location: Target location (if navigation scenario).
        entities: NPCs / objects present in the world.
        events: Game events injected each cycle.
        max_cycles: Hard cap on heartbeat cycles before failure.
        goal_predicate: Optional callable ``(actions, location) -> bool``
            evaluated after each cycle to check early success.
        tags: Freeform tags for filtering (e.g. "navigation", "quest").
    """

    name: str
    description: str
    start_location: str
    goal_location: str = ""
    entities: list[str] = field(default_factory=list)
    events: list[str] = field(default_factory=list)
    max_cycles: int = 50
    goal_predicate: Callable[[list[dict], str], bool] | None = None
    tags: list[str] = field(default_factory=list)


# ---------------------------------------------------------------------------
# Goal predicates
# ---------------------------------------------------------------------------


def _reached_location(target: str) -> Callable:
    """Return a predicate that checks whether the agent reached *target*.

    The comparison ignores case.  A missing (``None``) location never
    matches instead of raising.
    """
    wanted = target.lower()

    def predicate(actions: list[dict], current_location: str) -> bool:
        return isinstance(current_location, str) and current_location.lower() == wanted

    return predicate


def _interacted_with(npc: str) -> Callable:
    """Return a predicate that checks for a speak/interact action with *npc*.

    An action counts when its ``"action"`` is ``speak``, ``interact`` or
    ``talk`` and its ``"target"`` equals *npc* ignoring case.  Actions with
    a missing or non-string target are ignored instead of raising.
    """
    wanted = npc.lower()
    verbs = ("speak", "interact", "talk")

    def predicate(actions: list[dict], current_location: str) -> bool:
        for act in actions:
            if act.get("action") not in verbs:
                continue
            target = act.get("target")
            if isinstance(target, str) and target.lower() == wanted:
                return True
        return False

    return predicate


# ---------------------------------------------------------------------------
# Built-in scenarios
# ---------------------------------------------------------------------------

BUILTIN_SCENARIOS: list[BenchmarkScenario] = [
    BenchmarkScenario(
        name="Walk Seyda Neen to Balmora",
        description=(
            "Navigate from the starting village to Balmora via the road. "
            "Tests basic navigation and pathfinding."
        ),
        start_location="Seyda Neen",
        goal_location="Balmora",
        entities=["Silt Strider", "Road Sign", "Mudcrab"],
        events=["player_spawned"],
        max_cycles=30,
        goal_predicate=_reached_location("Balmora"),
        tags=["navigation", "basic"],
    ),
    BenchmarkScenario(
        name="Fargoth's Ring",
        description=(
            "Complete the Fargoth quest: find Fargoth, receive the ring, "
            "and return it. Tests NPC interaction and quest logic."
        ),
        start_location="Seyda Neen",
        goal_location="Seyda Neen",
        entities=["Fargoth", "Arrille", "Guard"],
        events=["quest_available:fargoth_ring"],
        max_cycles=40,
        goal_predicate=_interacted_with("Fargoth"),
        tags=["quest", "npc_interaction"],
    ),
    BenchmarkScenario(
        name="Balmora Guild Navigation",
        description=(
            "Walk from Balmora South Wall Corner Club to the Fighters Guild. "
            "Tests intra-city navigation with multiple NPCs present."
        ),
        start_location="Balmora, South Wall Corner Club",
        goal_location="Balmora, Fighters Guild",
        entities=["Guard", "Merchant", "Caius Cosades"],
        events=["player_entered"],
        max_cycles=20,
        goal_predicate=_reached_location("Balmora, Fighters Guild"),
        tags=["navigation", "city"],
    ),
    BenchmarkScenario(
        name="Combat Encounter — Mudcrab",
        description=(
            "Engage and defeat a single Mudcrab on the road between "
            "Seyda Neen and Balmora. Tests combat action selection."
        ),
        start_location="Bitter Coast Road",
        goal_location="Bitter Coast Road",
        entities=["Mudcrab"],
        events=["hostile_entity_nearby"],
        max_cycles=15,
        goal_predicate=None,  # Success = survived max_cycles without crash
        tags=["combat", "basic"],
    ),
    BenchmarkScenario(
        name="Passive Observation — Balmora Market",
        description=(
            "Observe the Balmora market for 10 cycles without acting. "
            "Tests that the agent can reason without unnecessary actions."
        ),
        start_location="Balmora, Market Square",
        goal_location="",
        entities=["Merchant", "Guard", "Pilgrim", "Trader"],
        events=["market_day"],
        max_cycles=10,
        tags=["observation", "passive"],
    ),
]


def load_scenarios(
    tags: list[str] | str | None = None,
) -> list[BenchmarkScenario]:
    """Return built-in scenarios, optionally filtered by tags.

    Args:
        tags: If provided, only return scenarios whose tags overlap.  A
            single tag may be passed as a plain string.  An empty
            collection matches nothing; ``None`` disables filtering.

    Returns:
        List of matching ``BenchmarkScenario`` instances, in definition
        order.
    """
    if tags is None:
        return list(BUILTIN_SCENARIOS)
    if isinstance(tags, str):
        # set("navigation") would yield single characters and silently
        # match nothing; treat a bare string as one tag.
        tags = [tags]
    tag_set = set(tags)
    return [s for s in BUILTIN_SCENARIOS if tag_set & set(s.tags)]
class TestGoalPredicates:
    """Goal predicates attached to the built-in scenarios."""

    def test_reached_location_predicate(self):
        walk = BUILTIN_SCENARIOS[0]  # Walk to Balmora
        check = walk.goal_predicate
        assert check is not None
        assert check([], "Balmora") is True
        assert check([], "Seyda Neen") is False

    def test_reached_location_case_insensitive(self):
        check = BUILTIN_SCENARIOS[0].goal_predicate
        for spelling in ("balmora", "BALMORA"):
            assert check([], spelling) is True

    def test_interacted_with_predicate(self):
        quest = BUILTIN_SCENARIOS[1]  # Fargoth quest
        assert quest.goal_predicate is not None
        spoke_to_fargoth = [{"action": "speak", "target": "Fargoth"}]
        assert quest.goal_predicate(spoke_to_fargoth, "Seyda Neen") is True

    def test_interacted_with_no_match(self):
        quest = BUILTIN_SCENARIOS[1]
        spoke_to_guard = [{"action": "speak", "target": "Guard"}]
        assert quest.goal_predicate(spoke_to_guard, "Seyda Neen") is False

    def test_interacted_with_interact_action(self):
        quest = BUILTIN_SCENARIOS[1]
        touched_fargoth = [{"action": "interact", "target": "Fargoth"}]
        assert quest.goal_predicate(touched_fargoth, "Seyda Neen") is True

    def test_no_predicate_scenario(self):
        combat_scenarios = [s for s in BUILTIN_SCENARIOS if "combat" in s.tags]
        first_combat = combat_scenarios[0]
        assert first_combat.goal_predicate is None


# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------


class TestScenarioResult:
    """Defaults of a freshly constructed ``ScenarioResult``."""

    def test_default_values(self):
        blank = ScenarioResult(scenario_name="test")
        assert blank.success is False
        assert blank.cycles_used == 0
        assert blank.llm_calls == 0
        assert blank.metabolic_cost == 0.0
        assert blank.error is None


class TestBenchmarkMetrics:
    """Aggregation, persistence and summary of ``BenchmarkMetrics``."""

    def test_empty_metrics(self):
        empty = BenchmarkMetrics()
        assert empty.pass_count == 0
        assert empty.fail_count == 0
        assert empty.success_rate == 0.0
        assert empty.total_llm_calls == 0
        assert empty.total_metabolic_cost == 0.0

    def test_success_rate(self):
        outcomes = [("a", True), ("b", False), ("c", True)]
        mixed = BenchmarkMetrics(
            results=[ScenarioResult(scenario_name=name, success=ok) for name, ok in outcomes]
        )
        assert mixed.pass_count == 2
        assert mixed.fail_count == 1
        assert abs(mixed.success_rate - 2 / 3) < 0.01

    def test_totals(self):
        first = ScenarioResult(scenario_name="a", llm_calls=10, metabolic_cost=30.0)
        second = ScenarioResult(scenario_name="b", llm_calls=5, metabolic_cost=15.0)
        combined = BenchmarkMetrics(results=[first, second])
        assert combined.total_llm_calls == 15
        assert combined.total_metabolic_cost == 45.0

    def test_save_and_load(self, tmp_path):
        target = tmp_path / "bench.jsonl"
        only_result = ScenarioResult(
            scenario_name="a",
            success=True,
            cycles_used=5,
            max_cycles=10,
        )
        run = BenchmarkMetrics(
            timestamp="2026-01-01T00:00:00",
            commit_sha="abc123",
            total_time_ms=1000,
            results=[only_result],
        )
        run.save(target)

        history = load_history(target)
        assert len(history) == 1
        newest = history[0]
        assert newest["commit_sha"] == "abc123"
        assert newest["scenarios"][0]["scenario_name"] == "a"

    def test_save_appends(self, tmp_path):
        target = tmp_path / "bench.jsonl"
        for day in (1, 2, 3):
            BenchmarkMetrics(
                timestamp=f"2026-01-0{day}T00:00:00",
                results=[ScenarioResult(scenario_name=f"s{day - 1}")],
            ).save(target)

        history = load_history(target)
        assert len(history) == 3
        # Most recent first
        assert history[0]["timestamp"] == "2026-01-03T00:00:00"

    def test_summary_output(self):
        walk = ScenarioResult(
            scenario_name="Walk Test",
            success=True,
            cycles_used=5,
            max_cycles=10,
            wall_time_ms=200,
            llm_calls=15,
        )
        run = BenchmarkMetrics(
            timestamp="2026-01-01T00:00:00",
            commit_sha="abc123",
            total_time_ms=500,
            results=[walk],
        )
        text = run.summary()
        for expected in ("Walk Test", "PASS", "abc123"):
            assert expected in text

    def test_load_history_missing_file(self, tmp_path):
        absent = tmp_path / "nope.jsonl"
        assert load_history(absent) == []

    def test_load_history_corrupt_lines(self, tmp_path):
        target = tmp_path / "bench.jsonl"
        target.write_text('{"valid": true}\nnot json\n{"also": "valid"}\n')
        assert len(load_history(target)) == 2


# ---------------------------------------------------------------------------
# Comparison
# ---------------------------------------------------------------------------


class TestCompareRuns:
    """Markers emitted by ``compare_runs`` for each kind of change."""

    @staticmethod
    def _single_walk(success, cycles_used):
        """Build a one-scenario run named ``walk``."""
        walk = ScenarioResult(scenario_name="walk", success=success, cycles_used=cycles_used)
        return BenchmarkMetrics(results=[walk])

    def test_regression_detected(self):
        before = self._single_walk(True, 10)
        after = self._single_walk(False, 10)
        assert "REGRESSION" in compare_runs(after, before)

    def test_improvement_detected(self):
        before = self._single_walk(False, 10)
        after = self._single_walk(True, 10)
        assert "IMPROVEMENT" in compare_runs(after, before)

    def test_slower_detected(self):
        before = self._single_walk(True, 10)
        after = self._single_walk(True, 20)
        assert "SLOWER" in compare_runs(after, before)

    def test_new_scenario_noted(self):
        before = BenchmarkMetrics(results=[])
        after = BenchmarkMetrics(results=[ScenarioResult(scenario_name="new_one", success=True)])
        assert "NEW" in compare_runs(after, before)


# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------


class TestBenchmarkRunner:
    """End-to-end runs of ``BenchmarkRunner`` with its default adapter."""

    @pytest.mark.asyncio
    async def test_run_single_scenario(self):
        """Runner executes a scenario and returns a result."""
        walk = BenchmarkScenario(
            name="Test Walk",
            description="Simple test",
            start_location="A",
            goal_location="A",
            max_cycles=3,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([walk])
        assert len(metrics.results) == 1
        outcome = metrics.results[0]
        assert outcome.scenario_name == "Test Walk"
        assert outcome.cycles_used == 3  # no predicate, runs all cycles
        assert outcome.success is True  # no predicate = success if survived
        assert outcome.wall_time_ms >= 0
        assert outcome.llm_calls == 9  # 3 cycles * 3 calls
        assert outcome.metabolic_cost > 0

    @pytest.mark.asyncio
    async def test_run_with_goal_predicate(self):
        """Runner stops early when goal predicate is satisfied."""
        instant_win = BenchmarkScenario(
            name="Instant Win",
            description="Predicate satisfied immediately",
            start_location="A",
            max_cycles=100,
            goal_predicate=lambda actions, location: True,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([instant_win])
        outcome = metrics.results[0]
        assert outcome.success is True
        assert outcome.cycles_used == 1  # Stopped at first cycle

    @pytest.mark.asyncio
    async def test_run_with_failing_predicate(self):
        """Scenario fails when predicate never satisfied."""
        impossible = BenchmarkScenario(
            name="Impossible",
            description="Predicate never satisfied",
            start_location="A",
            max_cycles=5,
            goal_predicate=lambda actions, location: False,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([impossible])
        outcome = metrics.results[0]
        assert outcome.success is False
        assert outcome.cycles_used == 5

    @pytest.mark.asyncio
    async def test_run_multiple_scenarios(self):
        """Runner handles multiple scenarios in sequence."""
        batch = []
        for index in range(3):
            batch.append(
                BenchmarkScenario(
                    name=f"Scenario {index}",
                    description=f"Test {index}",
                    start_location="A",
                    max_cycles=2,
                    tags=["test"],
                )
            )
        metrics = await BenchmarkRunner().run(batch)
        assert len(metrics.results) == 3
        assert metrics.total_time_ms >= 0
        assert metrics.timestamp

    @pytest.mark.asyncio
    async def test_metrics_commit_sha(self):
        """Runner captures git SHA in metrics."""
        probe = BenchmarkScenario(
            name="SHA Test",
            description="Check SHA capture",
            start_location="A",
            max_cycles=1,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([probe])
        # SHA may or may not be available in test env; just ensure no crash
        assert isinstance(metrics.commit_sha, str)

    @pytest.mark.asyncio
    async def test_builtin_scenarios_run(self):
        """All built-in scenarios run without crashing."""
        # Use just 2 cycles each to keep tests fast
        shortened = []
        for builtin in BUILTIN_SCENARIOS:
            shortened.append(
                BenchmarkScenario(
                    name=builtin.name,
                    description=builtin.description,
                    start_location=builtin.start_location,
                    goal_location=builtin.goal_location,
                    entities=list(builtin.entities),
                    events=list(builtin.events),
                    max_cycles=2,  # Override for speed
                    goal_predicate=None,  # Skip predicate for smoke test
                    tags=list(builtin.tags),
                )
            )
        metrics = await BenchmarkRunner().run(shortened)
        assert len(metrics.results) == len(BUILTIN_SCENARIOS)
        # All should succeed (no predicate + survived = pass)
        for outcome in metrics.results:
            assert outcome.success is True
            assert outcome.error is None