[claude] Add agent performance regression benchmark suite (#1015) (#1053)
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled

This commit was merged in pull request #1053.
This commit is contained in:
2026-03-22 23:55:27 +00:00
parent c0f6ca9fc2
commit 45bde4df58
7 changed files with 1045 additions and 0 deletions

107
scripts/run_benchmarks.py Normal file
View File

@@ -0,0 +1,107 @@
#!/usr/bin/env python3
"""Run the agent performance regression benchmark suite.
Usage::
python scripts/run_benchmarks.py # all scenarios
python scripts/run_benchmarks.py --tags navigation # filter by tag
python scripts/run_benchmarks.py --output results/benchmarks.jsonl
python scripts/run_benchmarks.py --compare results/benchmarks.jsonl
Exit codes:
0 — all scenarios passed
1 — one or more scenarios failed
"""
from __future__ import annotations
import argparse
import asyncio
import sys
from pathlib import Path
# Ensure src/ is on the path when invoked directly
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))
from infrastructure.world.benchmark.metrics import BenchmarkMetrics, load_history
from infrastructure.world.benchmark.runner import BenchmarkRunner
from infrastructure.world.benchmark.scenarios import load_scenarios
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the benchmark suite.

    Returns:
        Namespace with ``tags`` (list[str] | None), ``output`` (Path | None)
        and ``compare`` (Path | None).
    """
    cli = argparse.ArgumentParser(
        description="Agent performance regression benchmark suite",
    )
    # Declarative option table keeps flag definitions in one place.
    option_specs = (
        ("--tags", {
            "nargs": "*",
            "default": None,
            "help": "Filter scenarios by tag (e.g. navigation quest)",
        }),
        ("--output", {
            "type": Path,
            "default": None,
            "help": "JSONL file to append results to",
        }),
        ("--compare", {
            "type": Path,
            "default": None,
            "help": "JSONL file with baseline results for regression comparison",
        }),
    )
    for flag, kwargs in option_specs:
        cli.add_argument(flag, **kwargs)
    return cli.parse_args()
async def main() -> int:
    """Run the benchmark suite and return a process exit code.

    Returns:
        0 when every scenario passed; 1 when any scenario failed or no
        scenario matched the requested tags.
    """
    args = parse_args()
    scenarios = load_scenarios(tags=args.tags)
    if not scenarios:
        print("No matching scenarios found.")
        return 1
    print(f"Running {len(scenarios)} benchmark scenario(s)...\n")
    runner = BenchmarkRunner()
    metrics = await runner.run(scenarios)
    print(metrics.summary())
    if args.output:
        metrics.save(args.output)
    if args.compare:
        history = load_history(args.compare)
        if history:
            # Hoisted out of the per-scenario loop: the previous version
            # re-executed `from ... import ScenarioResult` on every iteration.
            from infrastructure.world.benchmark.metrics import (
                ScenarioResult,
                compare_runs,
            )

            # load_history() returns most-recent-first, so index 0 is the
            # latest recorded run — that is our regression baseline.
            last = history[0]
            baseline = BenchmarkMetrics(
                timestamp=last.get("timestamp", ""),
                commit_sha=last.get("commit_sha", ""),
                total_time_ms=last.get("total_time_ms", 0),
            )
            baseline.results = [
                ScenarioResult(
                    scenario_name=s["scenario_name"],
                    success=s["success"],
                    cycles_used=s["cycles_used"],
                    max_cycles=s["max_cycles"],
                    wall_time_ms=s.get("wall_time_ms", 0),
                    llm_calls=s.get("llm_calls", 0),
                    metabolic_cost=s.get("metabolic_cost", 0.0),
                )
                for s in last.get("scenarios", [])
            ]
            print()
            print(compare_runs(metrics, baseline))
    return 0 if metrics.fail_count == 0 else 1
if __name__ == "__main__":
    # asyncio.run drives the async entry point; its int return value becomes
    # the process exit code (0 = all passed, 1 = failures / no scenarios).
    sys.exit(asyncio.run(main()))

View File

@@ -0,0 +1,17 @@
"""Performance regression suite for Morrowind agent scenarios.
Provides standardised benchmark scenarios, a runner that executes them
through the heartbeat loop with a mock (or live) world adapter, and
metrics collection for CI-integrated regression detection.
"""
from infrastructure.world.benchmark.metrics import BenchmarkMetrics
from infrastructure.world.benchmark.runner import BenchmarkRunner
from infrastructure.world.benchmark.scenarios import BenchmarkScenario, load_scenarios
__all__ = [
"BenchmarkMetrics",
"BenchmarkRunner",
"BenchmarkScenario",
"load_scenarios",
]

View File

@@ -0,0 +1,195 @@
"""Benchmark metrics collection and persistence.
Tracks per-scenario results: cycles used, wall-clock time, success,
LLM call count, and estimated metabolic cost. Results are persisted
as JSONL for trend analysis and CI regression gates.
"""
from __future__ import annotations
import json
import logging
from dataclasses import asdict, dataclass, field
from pathlib import Path
logger = logging.getLogger(__name__)
@dataclass
class ScenarioResult:
    """Record of one benchmark scenario execution.

    Attributes:
        scenario_name: Display name of the scenario that was run.
        success: True when the goal predicate (or survival rule) held.
        cycles_used: Heartbeat cycles actually executed.
        max_cycles: Cycle budget the scenario allowed.
        wall_time_ms: Elapsed wall-clock time, in milliseconds.
        llm_calls: How many LLM inference calls were issued.
        metabolic_cost: Estimated resource spend (arbitrary unit, ≈ tokens).
        error: Exception text when the run crashed, else None.
        tags: Tags copied from the scenario, for filtering.
    """

    scenario_name: str
    success: bool = False
    cycles_used: int = 0
    max_cycles: int = 0
    wall_time_ms: int = 0
    llm_calls: int = 0
    metabolic_cost: float = 0.0
    error: str | None = None
    tags: list[str] = field(default_factory=list)
@dataclass
class BenchmarkMetrics:
    """Aggregated metrics across all scenarios in a benchmark run.

    Attributes:
        results: Per-scenario results.
        total_time_ms: Total wall-clock time for the full suite.
        timestamp: ISO-8601 timestamp of the run.
        commit_sha: Git commit SHA (if available).
    """

    results: list[ScenarioResult] = field(default_factory=list)
    total_time_ms: int = 0
    timestamp: str = ""
    commit_sha: str = ""

    # -- derived properties ------------------------------------------------
    @property
    def pass_count(self) -> int:
        """Number of scenarios whose goal was satisfied."""
        return sum(1 for r in self.results if r.success)

    @property
    def fail_count(self) -> int:
        """Number of scenarios that failed."""
        return sum(1 for r in self.results if not r.success)

    @property
    def success_rate(self) -> float:
        """Fraction of passing scenarios (0.0 for an empty run)."""
        if not self.results:
            return 0.0
        return self.pass_count / len(self.results)

    @property
    def total_llm_calls(self) -> int:
        """Sum of LLM calls across all scenarios."""
        return sum(r.llm_calls for r in self.results)

    @property
    def total_metabolic_cost(self) -> float:
        """Sum of estimated metabolic cost across all scenarios."""
        return sum(r.metabolic_cost for r in self.results)

    # -- persistence -------------------------------------------------------
    def save(self, path: Path) -> None:
        """Append this run's results to a JSONL file at *path*.

        Parent directories are created as needed; each run is one JSON
        object per line so history accumulates across invocations.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        record = {
            "timestamp": self.timestamp,
            "commit_sha": self.commit_sha,
            "total_time_ms": self.total_time_ms,
            "success_rate": round(self.success_rate, 4),
            "total_llm_calls": self.total_llm_calls,
            "total_metabolic_cost": round(self.total_metabolic_cost, 2),
            "scenarios": [asdict(r) for r in self.results],
        }
        # Explicit encoding so the JSONL round-trips across platforms.
        with path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record) + "\n")
        logger.info("Benchmark results saved to %s", path)

    # -- summary -----------------------------------------------------------
    def summary(self) -> str:
        """Return a human-readable summary of the benchmark run."""
        lines = [
            "=== Benchmark Summary ===",
            f"Scenarios: {len(self.results)} "
            f"Passed: {self.pass_count} "
            f"Failed: {self.fail_count} "
            f"Success rate: {self.success_rate:.0%}",
            f"Total time: {self.total_time_ms} ms "
            f"LLM calls: {self.total_llm_calls} "
            f"Metabolic cost: {self.total_metabolic_cost:.1f}",
        ]
        if self.commit_sha:
            lines.append(f"Commit: {self.commit_sha}")
        lines.append("")
        for r in self.results:
            status = "PASS" if r.success else "FAIL"
            lines.append(
                # ": " separator fixes the previous run-together output
                # (e.g. "Walk Test5/10 cycles").
                f" [{status}] {r.scenario_name}: "
                f"{r.cycles_used}/{r.max_cycles} cycles, "
                f"{r.wall_time_ms} ms, "
                f"{r.llm_calls} LLM calls"
            )
            if r.error:
                lines.append(f" Error: {r.error}")
        return "\n".join(lines)
def load_history(path: Path) -> list[dict]:
    """Load benchmark history from a JSONL file.

    Args:
        path: JSONL file produced by :meth:`BenchmarkMetrics.save`.

    Returns:
        List of run records, most recent first. A missing file yields an
        empty list; malformed lines are skipped rather than raising.
    """
    path = Path(path)
    if not path.exists():
        return []
    records: list[dict] = []
    # Explicit encoding so reads match what save() wrote on any platform.
    for line in path.read_text(encoding="utf-8").strip().splitlines():
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            # Tolerate corrupt lines instead of losing the whole history.
            continue
    return list(reversed(records))
def compare_runs(
    current: BenchmarkMetrics,
    baseline: BenchmarkMetrics,
) -> str:
    """Compare two benchmark runs and report regressions.

    Args:
        current: Metrics from the run under test.
        baseline: Metrics from the reference run.

    Returns:
        Human-readable comparison report listing overall deltas plus
        per-scenario NEW / REGRESSION / IMPROVEMENT / SLOWER findings.
    """
    lines = ["=== Regression Report ==="]
    # Overall success-rate delta
    rate_delta = current.success_rate - baseline.success_rate
    lines.append(
        f"Success rate: {baseline.success_rate:.0%} -> {current.success_rate:.0%} "
        f"({rate_delta:+.0%})"
    )
    # Cost delta as a percentage only makes sense with a nonzero baseline.
    cost_delta = current.total_metabolic_cost - baseline.total_metabolic_cost
    if baseline.total_metabolic_cost > 0:
        cost_pct = (cost_delta / baseline.total_metabolic_cost) * 100
        lines.append(
            f"Metabolic cost: {baseline.total_metabolic_cost:.1f} -> "
            f"{current.total_metabolic_cost:.1f} ({cost_pct:+.1f}%)"
        )
    # Per-scenario comparison, keyed by scenario name.
    baseline_map = {r.scenario_name: r for r in baseline.results}
    for r in current.results:
        b = baseline_map.get(r.scenario_name)
        if b is None:
            lines.append(f" [NEW] {r.scenario_name}")
            continue
        if b.success and not r.success:
            lines.append(f" [REGRESSION] {r.scenario_name} — was PASS, now FAIL")
        elif not b.success and r.success:
            lines.append(f" [IMPROVEMENT] {r.scenario_name} — was FAIL, now PASS")
        elif r.cycles_used > b.cycles_used * 1.5:
            # More than 50% extra cycles counts as a slowdown.
            lines.append(
                # ": " separator fixes the previous run-together output
                # (e.g. "[SLOWER] walk10 -> 20 cycles").
                f" [SLOWER] {r.scenario_name}: "
                f"{b.cycles_used} -> {r.cycles_used} cycles (+{r.cycles_used - b.cycles_used})"
            )
    return "\n".join(lines)

View File

@@ -0,0 +1,167 @@
"""Benchmark runner — executes scenarios through the heartbeat loop.
Wires each ``BenchmarkScenario`` into a ``MockWorldAdapter`` (or a
supplied adapter), runs the heartbeat for up to ``max_cycles``, and
collects ``BenchmarkMetrics``.
"""
from __future__ import annotations
import logging
import subprocess
import time
from datetime import UTC, datetime
from infrastructure.world.adapters.mock import MockWorldAdapter
from infrastructure.world.benchmark.metrics import BenchmarkMetrics, ScenarioResult
from infrastructure.world.benchmark.scenarios import BenchmarkScenario
from infrastructure.world.interface import WorldInterface
from loop.heartbeat import Heartbeat
logger = logging.getLogger(__name__)

# Rough estimate of metabolic cost per heartbeat cycle: the gather, reason,
# and act phases each touch the LLM router once, at ~1 unit apiece.
_COST_PER_CYCLE = 3.0  # three phases per cycle
class BenchmarkRunner:
    """Run benchmark scenarios and collect metrics.

    Parameters
    ----------
    adapter_factory:
        Optional callable that returns a ``WorldInterface`` for a given
        scenario. Defaults to building a ``MockWorldAdapter`` from the
        scenario's start state.
    heartbeat_interval:
        Seconds between heartbeat ticks (0 for immediate).
    """

    def __init__(
        self,
        *,
        adapter_factory=None,
        heartbeat_interval: float = 0.0,
    ) -> None:
        self._adapter_factory = adapter_factory or self._default_adapter
        self._interval = heartbeat_interval

    # -- public API --------------------------------------------------------
    async def run(
        self,
        scenarios: list[BenchmarkScenario],
    ) -> BenchmarkMetrics:
        """Execute all *scenarios* sequentially and return aggregated metrics."""
        metrics = BenchmarkMetrics(
            timestamp=datetime.now(UTC).isoformat(),
            commit_sha=self._git_sha(),
        )
        suite_start = time.monotonic()
        for scenario in scenarios:
            logger.info("Benchmark: starting '%s'", scenario.name)
            result = await self._run_scenario(scenario)
            metrics.results.append(result)
            status = "PASS" if result.success else "FAIL"
            logger.info(
                "Benchmark: '%s' %s (%d/%d cycles, %d ms)",
                scenario.name,
                status,
                result.cycles_used,
                result.max_cycles,
                result.wall_time_ms,
            )
        metrics.total_time_ms = int((time.monotonic() - suite_start) * 1000)
        return metrics

    # -- internal ----------------------------------------------------------
    async def _run_scenario(self, scenario: BenchmarkScenario) -> ScenarioResult:
        """Run a single scenario through the heartbeat loop.

        Crashes are captured in ``result.error`` rather than propagated,
        and the adapter is always disconnected.
        """
        result = ScenarioResult(
            scenario_name=scenario.name,
            max_cycles=scenario.max_cycles,
            tags=list(scenario.tags),
        )
        adapter = self._adapter_factory(scenario)
        adapter.connect()
        actions: list[dict] = []
        start = time.monotonic()
        try:
            # Constructed inside the try block so a constructor failure still
            # reaches ``finally`` and disconnects the adapter (the previous
            # version leaked the connection in that case).
            hb = Heartbeat(world=adapter, interval=self._interval)
            for cycle in range(1, scenario.max_cycles + 1):
                record = await hb.run_once()
                result.cycles_used = cycle
                # Track LLM calls (each cycle has 3 phases that may call LLM)
                result.llm_calls += 3
                # Accumulate non-idle actions for the goal predicate.
                if record.action_taken and record.action_taken != "idle":
                    actions.append(
                        {
                            "action": record.action_taken,
                            "target": record.observation.get("location", ""),
                            "status": record.action_status,
                        }
                    )
                # Re-read location in case the scenario simulates movement.
                current_location = self._get_current_location(adapter)
                # Check goal predicate
                if scenario.goal_predicate is not None:
                    if scenario.goal_predicate(actions, current_location):
                        result.success = True
                        break
                elif cycle == scenario.max_cycles:
                    # No predicate — success if we survived all cycles
                    result.success = True
        except Exception as exc:
            logger.warning("Benchmark scenario '%s' crashed: %s", scenario.name, exc)
            result.error = str(exc)
        finally:
            adapter.disconnect()
        result.wall_time_ms = int((time.monotonic() - start) * 1000)
        # Cost scales with cycles actually executed, not the budget.
        result.metabolic_cost = result.cycles_used * _COST_PER_CYCLE
        return result

    @staticmethod
    def _default_adapter(scenario: BenchmarkScenario) -> WorldInterface:
        """Build a MockWorldAdapter from a scenario's starting state."""
        return MockWorldAdapter(
            location=scenario.start_location,
            entities=list(scenario.entities),
            events=list(scenario.events),
        )

    @staticmethod
    def _get_current_location(adapter: WorldInterface) -> str:
        """Read the current location from the adapter.

        Best-effort: returns "" when observation fails so goal predicates
        see an empty location instead of an exception.
        """
        try:
            perception = adapter.observe()
            return perception.location
        except Exception:
            return ""

    @staticmethod
    def _git_sha() -> str:
        """Best-effort: return the current short git commit SHA, or ""."""
        try:
            result = subprocess.run(
                ["git", "rev-parse", "--short", "HEAD"],
                capture_output=True,
                text=True,
                timeout=5,
                check=False,  # non-zero exit handled via returncode below
            )
            return result.stdout.strip() if result.returncode == 0 else ""
        except (OSError, subprocess.TimeoutExpired):
            return ""

View File

@@ -0,0 +1,160 @@
"""Benchmark scenario definitions for Morrowind agent regression testing.
Each scenario specifies a starting location, goal conditions, world state
(entities, events), and maximum cycles allowed. The runner feeds these
into the heartbeat loop and checks completion against the goal predicate.
"""
from __future__ import annotations
from collections.abc import Callable
from dataclasses import dataclass, field
@dataclass(frozen=True)
class BenchmarkScenario:
    """An immutable, reproducible agent task used for regression detection.

    Attributes:
        name: Display name of the scenario.
        description: Short statement of what the scenario exercises.
        start_location: Location the agent occupies at cycle one.
        goal_location: Destination, for navigation-style scenarios.
        entities: NPCs / objects populating the mock world.
        events: Game events injected each cycle.
        max_cycles: Hard budget of heartbeat cycles before the run fails.
        goal_predicate: Optional ``(actions, location) -> bool`` callable
            checked after every cycle for early completion.
        tags: Freeform labels used for filtering (e.g. "navigation").
    """

    name: str
    description: str
    start_location: str
    goal_location: str = ""
    entities: list[str] = field(default_factory=list)
    events: list[str] = field(default_factory=list)
    max_cycles: int = 50
    goal_predicate: Callable | None = None
    tags: list[str] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Goal predicates
# ---------------------------------------------------------------------------
def _reached_location(target: str) -> Callable:
    """Build a goal predicate satisfied once the agent stands at *target*."""

    def _check(actions: list[dict], current_location: str) -> bool:
        # Case-insensitive so adapter capitalisation doesn't matter.
        return target.lower() == current_location.lower()

    return _check
def _interacted_with(npc: str) -> Callable:
    """Build a goal predicate satisfied by a speak/interact/talk with *npc*."""

    def _check(actions: list[dict], current_location: str) -> bool:
        # True once any recorded action is a conversational verb whose
        # target matches the NPC (case-insensitively).
        return any(
            act.get("action") in ("speak", "interact", "talk")
            and act.get("target", "").lower() == npc.lower()
            for act in actions
        )

    return _check
# ---------------------------------------------------------------------------
# Built-in scenarios
# ---------------------------------------------------------------------------
# Registry of the canonical regression scenarios. Order matters only for
# display; tests reference entries by index (0 = navigation, 1 = quest).
BUILTIN_SCENARIOS: list[BenchmarkScenario] = [
    # Basic overland navigation between two towns.
    BenchmarkScenario(
        name="Walk Seyda Neen to Balmora",
        description=(
            "Navigate from the starting village to Balmora via the road. "
            "Tests basic navigation and pathfinding."
        ),
        start_location="Seyda Neen",
        goal_location="Balmora",
        entities=["Silt Strider", "Road Sign", "Mudcrab"],
        events=["player_spawned"],
        max_cycles=30,
        goal_predicate=_reached_location("Balmora"),
        tags=["navigation", "basic"],
    ),
    # NPC interaction / quest logic (success = talking to Fargoth).
    BenchmarkScenario(
        name="Fargoth's Ring",
        description=(
            "Complete the Fargoth quest: find Fargoth, receive the ring, "
            "and return it. Tests NPC interaction and quest logic."
        ),
        start_location="Seyda Neen",
        goal_location="Seyda Neen",
        entities=["Fargoth", "Arrille", "Guard"],
        events=["quest_available:fargoth_ring"],
        max_cycles=40,
        goal_predicate=_interacted_with("Fargoth"),
        tags=["quest", "npc_interaction"],
    ),
    # Intra-city navigation with distracting NPCs present.
    BenchmarkScenario(
        name="Balmora Guild Navigation",
        description=(
            "Walk from Balmora South Wall Corner Club to the Fighters Guild. "
            "Tests intra-city navigation with multiple NPCs present."
        ),
        start_location="Balmora, South Wall Corner Club",
        goal_location="Balmora, Fighters Guild",
        entities=["Guard", "Merchant", "Caius Cosades"],
        events=["player_entered"],
        max_cycles=20,
        goal_predicate=_reached_location("Balmora, Fighters Guild"),
        tags=["navigation", "city"],
    ),
    # Combat smoke test — no predicate, so surviving the budget is a pass.
    BenchmarkScenario(
        name="Combat Encounter — Mudcrab",
        description=(
            "Engage and defeat a single Mudcrab on the road between "
            "Seyda Neen and Balmora. Tests combat action selection."
        ),
        start_location="Bitter Coast Road",
        goal_location="Bitter Coast Road",
        entities=["Mudcrab"],
        events=["hostile_entity_nearby"],
        max_cycles=15,
        goal_predicate=None,  # Success = survived max_cycles without crash
        tags=["combat", "basic"],
    ),
    # Passive scenario: the agent should reason without acting.
    BenchmarkScenario(
        name="Passive Observation — Balmora Market",
        description=(
            "Observe the Balmora market for 10 cycles without acting. "
            "Tests that the agent can reason without unnecessary actions."
        ),
        start_location="Balmora, Market Square",
        goal_location="",
        entities=["Merchant", "Guard", "Pilgrim", "Trader"],
        events=["market_day"],
        max_cycles=10,
        tags=["observation", "passive"],
    ),
]
def load_scenarios(
    tags: list[str] | None = None,
) -> list[BenchmarkScenario]:
    """Return the built-in scenario set, optionally narrowed by *tags*.

    Args:
        tags: When given, keep only scenarios sharing at least one tag.

    Returns:
        A fresh list of matching ``BenchmarkScenario`` instances.
    """
    if tags is None:
        return list(BUILTIN_SCENARIOS)
    wanted = set(tags)
    return [
        scenario
        for scenario in BUILTIN_SCENARIOS
        if wanted.intersection(scenario.tags)
    ]

View File

@@ -0,0 +1,394 @@
"""Tests for the agent performance regression benchmark suite.
Covers: scenario loading, metrics collection, runner execution,
goal predicates, and result persistence.
"""
from __future__ import annotations
import pytest
from infrastructure.world.benchmark.metrics import (
BenchmarkMetrics,
ScenarioResult,
compare_runs,
load_history,
)
from infrastructure.world.benchmark.runner import BenchmarkRunner
from infrastructure.world.benchmark.scenarios import (
BUILTIN_SCENARIOS,
BenchmarkScenario,
load_scenarios,
)
# ---------------------------------------------------------------------------
# Scenario definitions
# ---------------------------------------------------------------------------
class TestBenchmarkScenario:
    """Scenario registry contents and tag filtering."""

    def test_builtin_scenarios_exist(self):
        # The suite ships with at least five canonical scenarios.
        assert len(BUILTIN_SCENARIOS) >= 5

    def test_scenario_fields(self):
        first = BUILTIN_SCENARIOS[0]
        assert first.name
        assert first.description
        assert first.start_location
        assert first.max_cycles > 0

    def test_load_all_scenarios(self):
        assert len(load_scenarios()) == len(BUILTIN_SCENARIOS)

    def test_load_scenarios_by_tag(self):
        tagged = load_scenarios(tags=["navigation"])
        assert len(tagged) >= 2
        assert all("navigation" in s.tags for s in tagged)

    def test_load_scenarios_no_match(self):
        assert load_scenarios(tags=["nonexistent_tag"]) == []

    def test_scenario_is_frozen(self):
        # Frozen dataclass: attribute assignment must raise.
        frozen = BUILTIN_SCENARIOS[0]
        with pytest.raises(AttributeError):
            frozen.name = "modified"
# ---------------------------------------------------------------------------
# Goal predicates
# ---------------------------------------------------------------------------
class TestGoalPredicates:
    """Behaviour of the built-in goal predicates."""

    def test_reached_location_predicate(self):
        walk = BUILTIN_SCENARIOS[0]  # Walk to Balmora
        assert walk.goal_predicate is not None
        assert walk.goal_predicate([], "Balmora") is True
        assert walk.goal_predicate([], "Seyda Neen") is False

    def test_reached_location_case_insensitive(self):
        walk = BUILTIN_SCENARIOS[0]
        for spelling in ("balmora", "BALMORA"):
            assert walk.goal_predicate([], spelling) is True

    def test_interacted_with_predicate(self):
        quest = BUILTIN_SCENARIOS[1]  # Fargoth quest
        assert quest.goal_predicate is not None
        spoke = [{"action": "speak", "target": "Fargoth"}]
        assert quest.goal_predicate(spoke, "Seyda Neen") is True

    def test_interacted_with_no_match(self):
        quest = BUILTIN_SCENARIOS[1]
        wrong_npc = [{"action": "speak", "target": "Guard"}]
        assert quest.goal_predicate(wrong_npc, "Seyda Neen") is False

    def test_interacted_with_interact_action(self):
        quest = BUILTIN_SCENARIOS[1]
        touched = [{"action": "interact", "target": "Fargoth"}]
        assert quest.goal_predicate(touched, "Seyda Neen") is True

    def test_no_predicate_scenario(self):
        # The combat scenario deliberately has no predicate.
        combat = [s for s in BUILTIN_SCENARIOS if "combat" in s.tags][0]
        assert combat.goal_predicate is None
# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
class TestScenarioResult:
    """Default field values of a freshly constructed result."""

    def test_default_values(self):
        fresh = ScenarioResult(scenario_name="test")
        assert fresh.success is False
        assert fresh.cycles_used == 0
        assert fresh.llm_calls == 0
        assert fresh.metabolic_cost == 0.0
        assert fresh.error is None
class TestBenchmarkMetrics:
    """Aggregation, persistence, and summary formatting of BenchmarkMetrics."""

    def test_empty_metrics(self):
        # A run with no scenarios reports zeros across the board.
        m = BenchmarkMetrics()
        assert m.pass_count == 0
        assert m.fail_count == 0
        assert m.success_rate == 0.0
        assert m.total_llm_calls == 0
        assert m.total_metabolic_cost == 0.0

    def test_success_rate(self):
        m = BenchmarkMetrics(
            results=[
                ScenarioResult(scenario_name="a", success=True),
                ScenarioResult(scenario_name="b", success=False),
                ScenarioResult(scenario_name="c", success=True),
            ]
        )
        assert m.pass_count == 2
        assert m.fail_count == 1
        # 2 of 3 passed; compare with tolerance since the rate is a float.
        assert abs(m.success_rate - 2 / 3) < 0.01

    def test_totals(self):
        # Totals are simple sums over per-scenario values.
        m = BenchmarkMetrics(
            results=[
                ScenarioResult(scenario_name="a", llm_calls=10, metabolic_cost=30.0),
                ScenarioResult(scenario_name="b", llm_calls=5, metabolic_cost=15.0),
            ]
        )
        assert m.total_llm_calls == 15
        assert m.total_metabolic_cost == 45.0

    def test_save_and_load(self, tmp_path):
        # Round-trip: save() writes JSONL that load_history() reads back.
        path = tmp_path / "bench.jsonl"
        m = BenchmarkMetrics(
            timestamp="2026-01-01T00:00:00",
            commit_sha="abc123",
            total_time_ms=1000,
            results=[
                ScenarioResult(
                    scenario_name="a",
                    success=True,
                    cycles_used=5,
                    max_cycles=10,
                ),
            ],
        )
        m.save(path)
        history = load_history(path)
        assert len(history) == 1
        assert history[0]["commit_sha"] == "abc123"
        assert history[0]["scenarios"][0]["scenario_name"] == "a"

    def test_save_appends(self, tmp_path):
        # save() appends (JSONL), and load_history() returns newest first.
        path = tmp_path / "bench.jsonl"
        for i in range(3):
            m = BenchmarkMetrics(
                timestamp=f"2026-01-0{i + 1}T00:00:00",
                results=[ScenarioResult(scenario_name=f"s{i}")],
            )
            m.save(path)
        history = load_history(path)
        assert len(history) == 3
        # Most recent first
        assert history[0]["timestamp"] == "2026-01-03T00:00:00"

    def test_summary_output(self):
        # Summary should mention scenario name, status, and commit SHA.
        m = BenchmarkMetrics(
            timestamp="2026-01-01T00:00:00",
            commit_sha="abc123",
            total_time_ms=500,
            results=[
                ScenarioResult(
                    scenario_name="Walk Test",
                    success=True,
                    cycles_used=5,
                    max_cycles=10,
                    wall_time_ms=200,
                    llm_calls=15,
                ),
            ],
        )
        summary = m.summary()
        assert "Walk Test" in summary
        assert "PASS" in summary
        assert "abc123" in summary

    def test_load_history_missing_file(self, tmp_path):
        # A nonexistent file yields an empty history, not an error.
        assert load_history(tmp_path / "nope.jsonl") == []

    def test_load_history_corrupt_lines(self, tmp_path):
        # Malformed JSONL lines are skipped rather than raising.
        path = tmp_path / "bench.jsonl"
        path.write_text('{"valid": true}\nnot json\n{"also": "valid"}\n')
        history = load_history(path)
        assert len(history) == 2
# ---------------------------------------------------------------------------
# Comparison
# ---------------------------------------------------------------------------
class TestCompareRuns:
    """Regression-report generation between two benchmark runs."""

    @staticmethod
    def _single(name, success, cycles):
        # Build a one-scenario metrics object for comparison tests.
        return BenchmarkMetrics(
            results=[
                ScenarioResult(
                    scenario_name=name, success=success, cycles_used=cycles
                ),
            ]
        )

    def test_regression_detected(self):
        report = compare_runs(
            self._single("walk", False, 10),  # current
            self._single("walk", True, 10),   # baseline
        )
        assert "REGRESSION" in report

    def test_improvement_detected(self):
        report = compare_runs(
            self._single("walk", True, 10),   # current
            self._single("walk", False, 10),  # baseline
        )
        assert "IMPROVEMENT" in report

    def test_slower_detected(self):
        report = compare_runs(
            self._single("walk", True, 20),   # current: double the cycles
            self._single("walk", True, 10),   # baseline
        )
        assert "SLOWER" in report

    def test_new_scenario_noted(self):
        empty_baseline = BenchmarkMetrics(results=[])
        current = self._single("new_one", True, 0)
        report = compare_runs(current, empty_baseline)
        assert "NEW" in report
# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------
class TestBenchmarkRunner:
    """End-to-end runner behaviour against the default mock world adapter."""

    @pytest.mark.asyncio
    async def test_run_single_scenario(self):
        """Runner executes a scenario and returns a result."""
        scenario = BenchmarkScenario(
            name="Test Walk",
            description="Simple test",
            start_location="A",
            goal_location="A",
            max_cycles=3,
            tags=["test"],
        )
        runner = BenchmarkRunner()
        metrics = await runner.run([scenario])
        assert len(metrics.results) == 1
        r = metrics.results[0]
        assert r.scenario_name == "Test Walk"
        assert r.cycles_used == 3  # no predicate, runs all cycles
        assert r.success is True  # no predicate = success if survived
        assert r.wall_time_ms >= 0
        assert r.llm_calls == 9  # 3 cycles * 3 calls
        assert r.metabolic_cost > 0

    @pytest.mark.asyncio
    async def test_run_with_goal_predicate(self):
        """Runner stops early when goal predicate is satisfied."""
        def always_true(actions, location):
            return True
        scenario = BenchmarkScenario(
            name="Instant Win",
            description="Predicate satisfied immediately",
            start_location="A",
            max_cycles=100,
            goal_predicate=always_true,
            tags=["test"],
        )
        runner = BenchmarkRunner()
        metrics = await runner.run([scenario])
        r = metrics.results[0]
        assert r.success is True
        assert r.cycles_used == 1  # Stopped at first cycle

    @pytest.mark.asyncio
    async def test_run_with_failing_predicate(self):
        """Scenario fails when predicate never satisfied."""
        def never_true(actions, location):
            return False
        scenario = BenchmarkScenario(
            name="Impossible",
            description="Predicate never satisfied",
            start_location="A",
            max_cycles=5,
            goal_predicate=never_true,
            tags=["test"],
        )
        runner = BenchmarkRunner()
        metrics = await runner.run([scenario])
        r = metrics.results[0]
        # With a predicate present, exhausting the budget is a failure.
        assert r.success is False
        assert r.cycles_used == 5

    @pytest.mark.asyncio
    async def test_run_multiple_scenarios(self):
        """Runner handles multiple scenarios in sequence."""
        scenarios = [
            BenchmarkScenario(
                name=f"Scenario {i}",
                description=f"Test {i}",
                start_location="A",
                max_cycles=2,
                tags=["test"],
            )
            for i in range(3)
        ]
        runner = BenchmarkRunner()
        metrics = await runner.run(scenarios)
        assert len(metrics.results) == 3
        assert metrics.total_time_ms >= 0
        assert metrics.timestamp

    @pytest.mark.asyncio
    async def test_metrics_commit_sha(self):
        """Runner captures git SHA in metrics."""
        scenario = BenchmarkScenario(
            name="SHA Test",
            description="Check SHA capture",
            start_location="A",
            max_cycles=1,
            tags=["test"],
        )
        runner = BenchmarkRunner()
        metrics = await runner.run([scenario])
        # SHA may or may not be available in test env; just ensure no crash
        assert isinstance(metrics.commit_sha, str)

    @pytest.mark.asyncio
    async def test_builtin_scenarios_run(self):
        """All built-in scenarios run without crashing."""
        # Use just 2 cycles each to keep tests fast
        scenarios = [
            BenchmarkScenario(
                name=s.name,
                description=s.description,
                start_location=s.start_location,
                goal_location=s.goal_location,
                entities=list(s.entities),
                events=list(s.events),
                max_cycles=2,  # Override for speed
                goal_predicate=None,  # Skip predicate for smoke test
                tags=list(s.tags),
            )
            for s in BUILTIN_SCENARIOS
        ]
        runner = BenchmarkRunner()
        metrics = await runner.run(scenarios)
        assert len(metrics.results) == len(BUILTIN_SCENARIOS)
        # All should succeed (no predicate + survived = pass)
        for r in metrics.results:
            assert r.success is True
            assert r.error is None

View File

@@ -87,6 +87,11 @@ description = Live LLM tests via Ollama (requires running Ollama)
commands =
    pytest tests/ -q --tb=short -m ollama --timeout=120
[testenv:benchmark]
description = Agent performance regression benchmark suite
commands =
python scripts/run_benchmarks.py {posargs}
# ── CI / Coverage ────────────────────────────────────────────────────────────
[testenv:ci]