feat: add agent performance regression benchmark suite

Implement standardised Morrowind benchmark scenarios to detect agent performance regressions after code changes. - 5 built-in scenarios: navigation (Seyda Neen→Balmora, Balmora intra-city), quest (Fargoth's Ring), combat (Mudcrab), observation - BenchmarkRunner executes scenarios through the heartbeat loop with MockWorldAdapter, tracking cycles, wall time, LLM calls, metabolic cost - Goal predicates (reached_location, interacted_with) for early success - BenchmarkMetrics with JSONL persistence and compare_runs() for regression detection - CLI script (scripts/run_benchmarks.py) with tag filtering and baseline comparison - tox -e benchmark environment for CI integration - 31 unit tests covering scenarios, predicates, metrics, runner, and persistence Fixes #1015 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 19:54:26 -04:00
parent c0f6ca9fc2
commit 49990e6aec
7 changed files with 1045 additions and 0 deletions
--- a/scripts/run_benchmarks.py
+++ b/scripts/run_benchmarks.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+"""Run the agent performance regression benchmark suite.
+
+Usage::
+
+    python scripts/run_benchmarks.py                  # all scenarios
+    python scripts/run_benchmarks.py --tags navigation # filter by tag
+    python scripts/run_benchmarks.py --output results/benchmarks.jsonl
+    python scripts/run_benchmarks.py --compare results/benchmarks.jsonl
+
+Exit codes:
+    0 — all scenarios passed
+    1 — one or more scenarios failed
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import sys
+from pathlib import Path
+
+# Ensure src/ is on the path when invoked directly
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))
+
+from infrastructure.world.benchmark.metrics import BenchmarkMetrics, load_history
+from infrastructure.world.benchmark.runner import BenchmarkRunner
+from infrastructure.world.benchmark.scenarios import load_scenarios
+
+
+def parse_args() -> argparse.Namespace:
+    """Build the command-line parser and parse ``sys.argv``.
+
+    Returns:
+        Namespace with ``tags`` (list of tag strings, or ``None`` when
+        ``--tags`` is absent) plus ``output`` and ``compare`` (each a
+        ``Path``, or ``None`` when the option is absent).
+    """
+    cli = argparse.ArgumentParser(
+        description="Agent performance regression benchmark suite",
+    )
+    cli.add_argument(
+        "--tags",
+        nargs="*",
+        default=None,
+        help="Filter scenarios by tag (e.g. navigation quest)",
+    )
+    # --output and --compare are both optional paths; only the help text differs.
+    path_flags = {
+        "--output": "JSONL file to append results to",
+        "--compare": "JSONL file with baseline results for regression comparison",
+    }
+    for flag, help_text in path_flags.items():
+        cli.add_argument(flag, type=Path, default=None, help=help_text)
+    return cli.parse_args()
+
+
+async def main() -> int:
+    """Run the selected benchmark scenarios and report the outcome.
+
+    Prints a summary of the run, appends the run to ``--output`` when
+    given, and prints a regression report against the newest record in
+    ``--compare`` when given.
+
+    Returns:
+        0 if every scenario passed; 1 if any scenario failed or no
+        scenario matched ``--tags``.
+    """
+    from infrastructure.world.benchmark.metrics import ScenarioResult, compare_runs
+
+    args = parse_args()
+
+    scenarios = load_scenarios(tags=args.tags)
+    if not scenarios:
+        print("No matching scenarios found.")
+        return 1
+
+    # Read the baseline BEFORE saving this run: --output and --compare may
+    # name the same file, and load_history() returns the newest record
+    # first, so loading after save() would compare the run with itself.
+    history = load_history(args.compare) if args.compare else []
+
+    print(f"Running {len(scenarios)} benchmark scenario(s)...\n")
+
+    runner = BenchmarkRunner()
+    metrics = await runner.run(scenarios)
+
+    print(metrics.summary())
+
+    if args.output:
+        metrics.save(args.output)
+
+    if history:
+        # Reconstruct baseline from the last recorded run (newest first).
+        last = history[0]
+        baseline = BenchmarkMetrics(
+            timestamp=last.get("timestamp", ""),
+            commit_sha=last.get("commit_sha", ""),
+            total_time_ms=last.get("total_time_ms", 0),
+        )
+        for s in last.get("scenarios", []):
+            # Missing keys fall back to ScenarioResult's own defaults so a
+            # partially written baseline record cannot crash the report.
+            baseline.results.append(
+                ScenarioResult(
+                    scenario_name=s.get("scenario_name", ""),
+                    success=s.get("success", False),
+                    cycles_used=s.get("cycles_used", 0),
+                    max_cycles=s.get("max_cycles", 0),
+                    wall_time_ms=s.get("wall_time_ms", 0),
+                    llm_calls=s.get("llm_calls", 0),
+                    metabolic_cost=s.get("metabolic_cost", 0.0),
+                )
+            )
+        print()
+        print(compare_runs(metrics, baseline))
+    elif args.compare:
+        print(f"\nNo baseline records found in {args.compare}; skipping comparison.")
+
+    return 0 if metrics.fail_count == 0 else 1
+
+
+if __name__ == "__main__":
+    # Propagate main()'s return value as the process exit status.
+    exit_code = asyncio.run(main())
+    sys.exit(exit_code)
--- a/src/infrastructure/world/benchmark/__init__.py
+++ b/src/infrastructure/world/benchmark/__init__.py
@@ -0,0 +1,17 @@
+"""Performance regression suite for Morrowind agent scenarios.
+
+Provides standardised benchmark scenarios, a runner that executes them
+through the heartbeat loop with a mock (or live) world adapter, and
+metrics collection for CI-integrated regression detection.
+"""
+
+from infrastructure.world.benchmark.metrics import BenchmarkMetrics
+from infrastructure.world.benchmark.runner import BenchmarkRunner
+from infrastructure.world.benchmark.scenarios import BenchmarkScenario, load_scenarios
+
+__all__ = [
+    "BenchmarkMetrics",
+    "BenchmarkRunner",
+    "BenchmarkScenario",
+    "load_scenarios",
+]
--- a/src/infrastructure/world/benchmark/metrics.py
+++ b/src/infrastructure/world/benchmark/metrics.py
@@ -0,0 +1,195 @@
+"""Benchmark metrics collection and persistence.
+
+Tracks per-scenario results: cycles used, wall-clock time, success,
+LLM call count, and estimated metabolic cost.  Results are persisted
+as JSONL for trend analysis and CI regression gates.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ScenarioResult:
+    """Record of one benchmark scenario execution.
+
+    Only ``scenario_name`` is required; every other field defaults to an
+    "empty" value so a result can be created up front and filled in as
+    the run progresses.
+    """
+
+    # Human-readable scenario name.
+    scenario_name: str
+    # True when the scenario was judged successful.
+    success: bool = False
+    # Heartbeat cycles actually executed.
+    cycles_used: int = 0
+    # Cycle budget the scenario was given.
+    max_cycles: int = 0
+    # Wall-clock duration in milliseconds.
+    wall_time_ms: int = 0
+    # Number of LLM inference calls attributed to the run.
+    llm_calls: int = 0
+    # Estimated resource cost (arbitrary unit, roughly tokens).
+    metabolic_cost: float = 0.0
+    # Error message when the run crashed, otherwise None.
+    error: str | None = None
+    # Copy of the scenario's tags, kept for filtering.
+    tags: list[str] = field(default_factory=list)
+
+
+@dataclass
+class BenchmarkMetrics:
+    """Results of one full benchmark run plus run-level metadata.
+
+    Attributes:
+        results:       One ``ScenarioResult`` per executed scenario.
+        total_time_ms: Wall-clock time of the whole suite in milliseconds.
+        timestamp:     ISO-8601 timestamp of the run.
+        commit_sha:    Git commit SHA, or an empty string when unknown.
+    """
+
+    results: list[ScenarioResult] = field(default_factory=list)
+    total_time_ms: int = 0
+    timestamp: str = ""
+    commit_sha: str = ""
+
+    # -- derived properties ------------------------------------------------
+
+    @property
+    def pass_count(self) -> int:
+        """Number of scenarios whose ``success`` flag is set."""
+        return len([r for r in self.results if r.success])
+
+    @property
+    def fail_count(self) -> int:
+        """Number of scenarios whose ``success`` flag is not set."""
+        return len(self.results) - self.pass_count
+
+    @property
+    def success_rate(self) -> float:
+        """Fraction of scenarios that passed; 0.0 when there are none."""
+        total = len(self.results)
+        return self.pass_count / total if total else 0.0
+
+    @property
+    def total_llm_calls(self) -> int:
+        """Sum of ``llm_calls`` over all scenarios."""
+        calls = 0
+        for r in self.results:
+            calls += r.llm_calls
+        return calls
+
+    @property
+    def total_metabolic_cost(self) -> float:
+        """Sum of ``metabolic_cost`` over all scenarios."""
+        cost = 0
+        for r in self.results:
+            cost += r.metabolic_cost
+        return cost
+
+    # -- persistence -------------------------------------------------------
+
+    def save(self, path: Path) -> None:
+        """Append this run as one JSON line to *path*, creating parent dirs."""
+        target = Path(path)
+        target.parent.mkdir(parents=True, exist_ok=True)
+        record = dict(
+            timestamp=self.timestamp,
+            commit_sha=self.commit_sha,
+            total_time_ms=self.total_time_ms,
+            success_rate=round(self.success_rate, 4),
+            total_llm_calls=self.total_llm_calls,
+            total_metabolic_cost=round(self.total_metabolic_cost, 2),
+            scenarios=[asdict(r) for r in self.results],
+        )
+        with target.open("a") as fh:
+            fh.write(json.dumps(record) + "\n")
+        logger.info("Benchmark results saved to %s", target)
+
+    # -- summary -----------------------------------------------------------
+
+    def summary(self) -> str:
+        """Render the run as a multi-line, human-readable report."""
+        out = ["=== Benchmark Summary ==="]
+        out.append(
+            f"Scenarios: {len(self.results)}  "
+            f"Passed: {self.pass_count}  "
+            f"Failed: {self.fail_count}  "
+            f"Success rate: {self.success_rate:.0%}"
+        )
+        out.append(
+            f"Total time: {self.total_time_ms} ms  "
+            f"LLM calls: {self.total_llm_calls}  "
+            f"Metabolic cost: {self.total_metabolic_cost:.1f}"
+        )
+        if self.commit_sha:
+            out.append(f"Commit: {self.commit_sha}")
+        # Blank line separates the header from the per-scenario rows.
+        out.append("")
+        for r in self.results:
+            verdict = "PASS" if r.success else "FAIL"
+            out.append(
+                f"  [{verdict}] {r.scenario_name} — "
+                f"{r.cycles_used}/{r.max_cycles} cycles, "
+                f"{r.wall_time_ms} ms, "
+                f"{r.llm_calls} LLM calls"
+            )
+            if r.error:
+                out.append(f"         Error: {r.error}")
+        return "\n".join(out)
+
+
+def load_history(path: Path) -> list[dict]:
+    """Load benchmark history from a JSONL file.
+
+    Loading is best-effort: blank lines, lines that are not valid JSON,
+    and lines whose JSON value is not an object are skipped, so callers
+    can rely on every returned record being a ``dict``.
+
+    Args:
+        path: Location of the JSONL history file.
+
+    Returns:
+        List of run records, most recent first.  Empty when *path* does
+        not exist or is not a regular file.
+    """
+    path = Path(path)
+    # is_file() also rejects directories, which exists() would let through.
+    if not path.is_file():
+        return []
+    records: list[dict] = []
+    for line in path.read_text(encoding="utf-8").splitlines():
+        if not line.strip():
+            continue
+        try:
+            record = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        # A bare number, string or list is valid JSON but not a run record.
+        if isinstance(record, dict):
+            records.append(record)
+    # The file is append-only, so the newest record is the last line.
+    records.reverse()
+    return records
+
+
+def compare_runs(
+    current: BenchmarkMetrics,
+    baseline: BenchmarkMetrics,
+    *,
+    slowdown_threshold: float = 1.5,
+) -> str:
+    """Compare two benchmark runs and report regressions.
+
+    Scenarios are matched by ``scenario_name``.  Each scenario in
+    *current* is reported as ``[NEW]``, ``[REGRESSION]``,
+    ``[IMPROVEMENT]`` or ``[SLOWER]`` when applicable; unchanged
+    scenarios produce no line.  Scenarios present only in *baseline* are
+    reported as ``[MISSING]`` so a dropped scenario is not overlooked.
+
+    Args:
+        current:            Metrics of the run under test.
+        baseline:           Metrics of the reference run.
+        slowdown_threshold: A scenario is ``[SLOWER]`` when its cycle count
+                            exceeds the baseline count times this factor.
+
+    Returns:
+        Human-readable comparison report.
+    """
+    lines = ["=== Regression Report ==="]
+
+    # Overall
+    rate_delta = current.success_rate - baseline.success_rate
+    lines.append(
+        f"Success rate: {baseline.success_rate:.0%} -> {current.success_rate:.0%} "
+        f"({rate_delta:+.0%})"
+    )
+
+    cost_delta = current.total_metabolic_cost - baseline.total_metabolic_cost
+    # A relative change is undefined for a zero baseline, so the line is omitted.
+    if baseline.total_metabolic_cost > 0:
+        cost_pct = (cost_delta / baseline.total_metabolic_cost) * 100
+        lines.append(
+            f"Metabolic cost: {baseline.total_metabolic_cost:.1f} -> "
+            f"{current.total_metabolic_cost:.1f} ({cost_pct:+.1f}%)"
+        )
+
+    # Per-scenario
+    baseline_map = {r.scenario_name: r for r in baseline.results}
+    for r in current.results:
+        b = baseline_map.get(r.scenario_name)
+        if b is None:
+            lines.append(f"  [NEW] {r.scenario_name}")
+            continue
+        if b.success and not r.success:
+            lines.append(f"  [REGRESSION] {r.scenario_name} — was PASS, now FAIL")
+        elif not b.success and r.success:
+            lines.append(f"  [IMPROVEMENT] {r.scenario_name} — was FAIL, now PASS")
+        elif r.cycles_used > b.cycles_used * slowdown_threshold:
+            lines.append(
+                f"  [SLOWER] {r.scenario_name} — "
+                f"{b.cycles_used} -> {r.cycles_used} cycles (+{r.cycles_used - b.cycles_used})"
+            )
+
+    # Scenarios that disappeared since the baseline.
+    current_names = {r.scenario_name for r in current.results}
+    for b in baseline.results:
+        if b.scenario_name not in current_names:
+            lines.append(f"  [MISSING] {b.scenario_name} — in baseline, not in current run")
+
+    return "\n".join(lines)
--- a/src/infrastructure/world/benchmark/runner.py
+++ b/src/infrastructure/world/benchmark/runner.py
@@ -0,0 +1,167 @@
+"""Benchmark runner — executes scenarios through the heartbeat loop.
+
+Wires each ``BenchmarkScenario`` into a ``MockWorldAdapter`` (or a
+supplied adapter), runs the heartbeat for up to ``max_cycles``, and
+collects ``BenchmarkMetrics``.
+"""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+import time
+from datetime import UTC, datetime
+
+from infrastructure.world.adapters.mock import MockWorldAdapter
+from infrastructure.world.benchmark.metrics import BenchmarkMetrics, ScenarioResult
+from infrastructure.world.benchmark.scenarios import BenchmarkScenario
+from infrastructure.world.interface import WorldInterface
+from loop.heartbeat import Heartbeat
+
+logger = logging.getLogger(__name__)
+
+# Rough estimate: each heartbeat phase costs ~1 unit of metabolic cost
+# (gather + reason + act phases each touch the LLM router once).
+_COST_PER_CYCLE = 3.0  # three phases per cycle
+
+
+class BenchmarkRunner:
+    """Run benchmark scenarios and collect metrics.
+
+    Parameters
+    ----------
+    adapter_factory:
+        Optional callable that returns a ``WorldInterface`` for a given
+        scenario.  Defaults to building a ``MockWorldAdapter`` from the
+        scenario's start state.
+    heartbeat_interval:
+        Seconds between heartbeat ticks (0 for immediate).
+    """
+
+    def __init__(
+        self,
+        *,
+        adapter_factory=None,
+        heartbeat_interval: float = 0.0,
+    ) -> None:
+        self._adapter_factory = adapter_factory or self._default_adapter
+        self._interval = heartbeat_interval
+
+    # -- public API --------------------------------------------------------
+
+    async def run(
+        self,
+        scenarios: list[BenchmarkScenario],
+    ) -> BenchmarkMetrics:
+        """Execute all *scenarios* in order and return aggregated metrics.
+
+        A scenario that crashes is recorded as a failed result with its
+        ``error`` set; it does not stop the remaining scenarios.
+        """
+        metrics = BenchmarkMetrics(
+            timestamp=datetime.now(UTC).isoformat(),
+            commit_sha=self._git_sha(),
+        )
+        suite_start = time.monotonic()
+
+        for scenario in scenarios:
+            logger.info("Benchmark: starting '%s'", scenario.name)
+            result = await self._run_scenario(scenario)
+            metrics.results.append(result)
+            status = "PASS" if result.success else "FAIL"
+            logger.info(
+                "Benchmark: '%s' %s (%d/%d cycles, %d ms)",
+                scenario.name,
+                status,
+                result.cycles_used,
+                result.max_cycles,
+                result.wall_time_ms,
+            )
+
+        metrics.total_time_ms = int((time.monotonic() - suite_start) * 1000)
+        return metrics
+
+    # -- internal ----------------------------------------------------------
+
+    async def _run_scenario(self, scenario: BenchmarkScenario) -> ScenarioResult:
+        """Run a single scenario through the heartbeat loop.
+
+        Any exception raised while building or connecting the adapter,
+        constructing the heartbeat, or running a cycle is caught and
+        stored in ``result.error``; the result is then returned with
+        ``success`` left False.
+        """
+        result = ScenarioResult(
+            scenario_name=scenario.name,
+            max_cycles=scenario.max_cycles,
+            tags=list(scenario.tags),
+        )
+
+        adapter = None
+        # Stays None when setup fails, so wall time covers the cycle loop only.
+        start: float | None = None
+        try:
+            # Setup lives inside the try so a broken adapter fails this
+            # scenario instead of aborting the whole suite.
+            adapter = self._adapter_factory(scenario)
+            adapter.connect()
+
+            hb = Heartbeat(world=adapter, interval=self._interval)
+            actions: list[dict] = []
+
+            start = time.monotonic()
+            for cycle in range(1, scenario.max_cycles + 1):
+                record = await hb.run_once()
+                result.cycles_used = cycle
+
+                # Estimated, not measured: each cycle has 3 phases that may call the LLM.
+                result.llm_calls += 3
+
+                # Accumulate actions for goal predicate
+                if record.action_taken and record.action_taken != "idle":
+                    actions.append(
+                        {
+                            "action": record.action_taken,
+                            # NOTE(review): this stores the observed location, not the
+                            # entity acted upon; predicates that match "target" against
+                            # an NPC name presumably cannot succeed — confirm against
+                            # the heartbeat record schema.
+                            "target": record.observation.get("location", ""),
+                            "status": record.action_status,
+                        }
+                    )
+
+                # Read the current location back from the adapter ("" on failure).
+                current_location = self._get_current_location(adapter)
+
+                # Check goal predicate
+                if scenario.goal_predicate is not None:
+                    if scenario.goal_predicate(actions, current_location):
+                        result.success = True
+                        break
+                elif cycle == scenario.max_cycles:
+                    # No predicate — success if we survived all cycles
+                    result.success = True
+
+        except Exception as exc:
+            logger.warning("Benchmark scenario '%s' crashed: %s", scenario.name, exc)
+            result.error = str(exc)
+        finally:
+            if adapter is not None:
+                try:
+                    adapter.disconnect()
+                except Exception as exc:
+                    # A failing teardown must not discard the collected result.
+                    logger.warning(
+                        "Benchmark scenario '%s' failed to disconnect: %s",
+                        scenario.name,
+                        exc,
+                    )
+
+        if start is not None:
+            result.wall_time_ms = int((time.monotonic() - start) * 1000)
+        result.metabolic_cost = result.cycles_used * _COST_PER_CYCLE
+        return result
+
+    @staticmethod
+    def _default_adapter(scenario: BenchmarkScenario) -> WorldInterface:
+        """Build a MockWorldAdapter from a scenario's starting state.
+
+        Entity and event lists are copied so the adapter cannot mutate
+        the scenario's own lists.
+        """
+        return MockWorldAdapter(
+            location=scenario.start_location,
+            entities=list(scenario.entities),
+            events=list(scenario.events),
+        )
+
+    @staticmethod
+    def _get_current_location(adapter: WorldInterface) -> str:
+        """Read the current location from the adapter.
+
+        Best-effort: returns an empty string when ``observe()`` fails or
+        the perception has no ``location``.
+        """
+        try:
+            perception = adapter.observe()
+            return perception.location
+        except Exception:
+            return ""
+
+    @staticmethod
+    def _git_sha() -> str:
+        """Best-effort: return the current short git commit SHA.
+
+        Returns an empty string when git is unavailable, times out, or
+        exits with a non-zero status.
+        """
+        try:
+            result = subprocess.run(
+                ["git", "rev-parse", "--short", "HEAD"],
+                capture_output=True,
+                text=True,
+                timeout=5,
+            )
+            return result.stdout.strip() if result.returncode == 0 else ""
+        except (OSError, subprocess.TimeoutExpired):
+            return ""
--- a/src/infrastructure/world/benchmark/scenarios.py
+++ b/src/infrastructure/world/benchmark/scenarios.py
@@ -0,0 +1,160 @@
+"""Benchmark scenario definitions for Morrowind agent regression testing.
+
+Each scenario specifies a starting location, goal conditions, world state
+(entities, events), and maximum cycles allowed.  The runner feeds these
+into the heartbeat loop and checks completion against the goal predicate.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from dataclasses import dataclass, field
+
+
+@dataclass(frozen=True)
+class BenchmarkScenario:
+    """Immutable description of one reproducible agent task.
+
+    Only ``name``, ``description`` and ``start_location`` are required.
+    Attribute assignment is blocked by ``frozen=True``.
+    """
+
+    # Human-readable scenario name.
+    name: str
+    # What the scenario is meant to test.
+    description: str
+    # Where the agent begins.
+    start_location: str
+    # Target location for navigation scenarios; empty when not applicable.
+    goal_location: str = ""
+    # NPCs / objects present in the world.
+    entities: list[str] = field(default_factory=list)
+    # Game events supplied as part of the scenario's world state.
+    events: list[str] = field(default_factory=list)
+    # Hard cap on heartbeat cycles.
+    max_cycles: int = 50
+    # Optional callable ``(actions, location) -> bool`` for early success.
+    goal_predicate: Callable | None = None
+    # Freeform tags for filtering (e.g. "navigation", "quest").
+    tags: list[str] = field(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Goal predicates
+# ---------------------------------------------------------------------------
+
+
+def _reached_location(target: str) -> Callable:
+    """Build a goal predicate that is true once the agent is at *target*.
+
+    The comparison ignores case; the ``actions`` argument is unused.
+    """
+
+    def _at_target(actions: list[dict], current_location: str) -> bool:
+        here = current_location.lower()
+        return here == target.lower()
+
+    return _at_target
+
+
+def _interacted_with(npc: str) -> Callable:
+    """Build a goal predicate that is true once *npc* has been addressed.
+
+    An action counts when its ``"action"`` is ``speak``, ``interact`` or
+    ``talk`` and its ``"target"`` equals *npc* ignoring case.  The
+    ``current_location`` argument is unused.
+    """
+
+    def _addressed(actions: list[dict], current_location: str) -> bool:
+        return any(
+            act.get("action") in ("speak", "interact", "talk")
+            and act.get("target", "").lower() == npc.lower()
+            for act in actions
+        )
+
+    return _addressed
+
+
+# ---------------------------------------------------------------------------
+# Built-in scenarios
+# ---------------------------------------------------------------------------
+
+# Catalogue of scenarios returned by load_scenarios().  Order matters: the
+# test-suite indexes into this list (index 0 is the Balmora walk, index 1 is
+# the Fargoth quest), so append new scenarios rather than reordering.
+BUILTIN_SCENARIOS: list[BenchmarkScenario] = [
+    # Navigation: succeeds as soon as the reported location is "Balmora".
+    BenchmarkScenario(
+        name="Walk Seyda Neen to Balmora",
+        description=(
+            "Navigate from the starting village to Balmora via the road. "
+            "Tests basic navigation and pathfinding."
+        ),
+        start_location="Seyda Neen",
+        goal_location="Balmora",
+        entities=["Silt Strider", "Road Sign", "Mudcrab"],
+        events=["player_spawned"],
+        max_cycles=30,
+        goal_predicate=_reached_location("Balmora"),
+        tags=["navigation", "basic"],
+    ),
+    # NOTE(review): the predicate only checks for a speak/interact/talk action
+    # targeting Fargoth; it does not verify that the ring was returned as the
+    # description says.
+    BenchmarkScenario(
+        name="Fargoth's Ring",
+        description=(
+            "Complete the Fargoth quest: find Fargoth, receive the ring, "
+            "and return it.  Tests NPC interaction and quest logic."
+        ),
+        start_location="Seyda Neen",
+        goal_location="Seyda Neen",
+        entities=["Fargoth", "Arrille", "Guard"],
+        events=["quest_available:fargoth_ring"],
+        max_cycles=40,
+        goal_predicate=_interacted_with("Fargoth"),
+        tags=["quest", "npc_interaction"],
+    ),
+    # Intra-city navigation: the predicate needs the full
+    # "Balmora, Fighters Guild" location string (case-insensitive).
+    BenchmarkScenario(
+        name="Balmora Guild Navigation",
+        description=(
+            "Walk from Balmora South Wall Corner Club to the Fighters Guild. "
+            "Tests intra-city navigation with multiple NPCs present."
+        ),
+        start_location="Balmora, South Wall Corner Club",
+        goal_location="Balmora, Fighters Guild",
+        entities=["Guard", "Merchant", "Caius Cosades"],
+        events=["player_entered"],
+        max_cycles=20,
+        goal_predicate=_reached_location("Balmora, Fighters Guild"),
+        tags=["navigation", "city"],
+    ),
+    # No predicate: defeating the Mudcrab is not checked.
+    BenchmarkScenario(
+        name="Combat Encounter — Mudcrab",
+        description=(
+            "Engage and defeat a single Mudcrab on the road between "
+            "Seyda Neen and Balmora.  Tests combat action selection."
+        ),
+        start_location="Bitter Coast Road",
+        goal_location="Bitter Coast Road",
+        entities=["Mudcrab"],
+        events=["hostile_entity_nearby"],
+        max_cycles=15,
+        goal_predicate=None,  # Success = survived max_cycles without crash
+        tags=["combat", "basic"],
+    ),
+    # goal_predicate is left at its default (None).
+    # NOTE(review): nothing here enforces "without acting" — confirm intent.
+    BenchmarkScenario(
+        name="Passive Observation — Balmora Market",
+        description=(
+            "Observe the Balmora market for 10 cycles without acting. "
+            "Tests that the agent can reason without unnecessary actions."
+        ),
+        start_location="Balmora, Market Square",
+        goal_location="",
+        entities=["Merchant", "Guard", "Pilgrim", "Trader"],
+        events=["market_day"],
+        max_cycles=10,
+        tags=["observation", "passive"],
+    ),
+]
+
+
+def load_scenarios(
+    tags: list[str] | None = None,
+) -> list[BenchmarkScenario]:
+    """Return the built-in scenarios, optionally narrowed by tag.
+
+    Args:
+        tags: When given, keep only scenarios sharing at least one tag
+            with this list.  ``None`` selects every scenario; an empty
+            list selects none.
+
+    Returns:
+        A new list of matching ``BenchmarkScenario`` instances.
+    """
+    if tags is None:
+        return BUILTIN_SCENARIOS.copy()
+    wanted = set(tags)
+    selected = []
+    for scenario in BUILTIN_SCENARIOS:
+        # A scenario matches when it shares at least one tag with the request.
+        if wanted.intersection(set(scenario.tags)):
+            selected.append(scenario)
+    return selected
--- a/tests/infrastructure/world/test_benchmark.py
+++ b/tests/infrastructure/world/test_benchmark.py
@@ -0,0 +1,394 @@
+"""Tests for the agent performance regression benchmark suite.
+
+Covers: scenario loading, metrics collection, runner execution,
+goal predicates, and result persistence.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from infrastructure.world.benchmark.metrics import (
+    BenchmarkMetrics,
+    ScenarioResult,
+    compare_runs,
+    load_history,
+)
+from infrastructure.world.benchmark.runner import BenchmarkRunner
+from infrastructure.world.benchmark.scenarios import (
+    BUILTIN_SCENARIOS,
+    BenchmarkScenario,
+    load_scenarios,
+)
+
+# ---------------------------------------------------------------------------
+# Scenario definitions
+# ---------------------------------------------------------------------------
+
+
+class TestBenchmarkScenario:
+    """Checks on the built-in scenario catalogue and tag-based loading."""
+
+    def test_builtin_scenarios_exist(self):
+        count = len(BUILTIN_SCENARIOS)
+        assert count >= 5
+
+    def test_scenario_fields(self):
+        first = BUILTIN_SCENARIOS[0]
+        assert first.name
+        assert first.description
+        assert first.start_location
+        assert first.max_cycles > 0
+
+    def test_load_all_scenarios(self):
+        assert len(load_scenarios()) == len(BUILTIN_SCENARIOS)
+
+    def test_load_scenarios_by_tag(self):
+        matched = load_scenarios(tags=["navigation"])
+        assert len(matched) >= 2
+        for scenario in matched:
+            assert "navigation" in scenario.tags
+
+    def test_load_scenarios_no_match(self):
+        assert load_scenarios(tags=["nonexistent_tag"]) == []
+
+    def test_scenario_is_frozen(self):
+        first = BUILTIN_SCENARIOS[0]
+        # Frozen dataclasses reject attribute assignment.
+        with pytest.raises(AttributeError):
+            first.name = "modified"
+
+
+# ---------------------------------------------------------------------------
+# Goal predicates
+# ---------------------------------------------------------------------------
+
+
+class TestGoalPredicates:
+    """Checks on the goal predicates attached to the built-in scenarios."""
+
+    def test_reached_location_predicate(self):
+        walk = BUILTIN_SCENARIOS[0]  # Walk to Balmora
+        reached = walk.goal_predicate
+        assert reached is not None
+        assert reached([], "Balmora") is True
+        assert reached([], "Seyda Neen") is False
+
+    def test_reached_location_case_insensitive(self):
+        reached = BUILTIN_SCENARIOS[0].goal_predicate
+        for spelling in ("balmora", "BALMORA"):
+            assert reached([], spelling) is True
+
+    def test_interacted_with_predicate(self):
+        quest = BUILTIN_SCENARIOS[1]  # Fargoth quest
+        assert quest.goal_predicate is not None
+        spoke_to_fargoth = [{"action": "speak", "target": "Fargoth"}]
+        assert quest.goal_predicate(spoke_to_fargoth, "Seyda Neen") is True
+
+    def test_interacted_with_no_match(self):
+        quest = BUILTIN_SCENARIOS[1]
+        spoke_to_guard = [{"action": "speak", "target": "Guard"}]
+        assert quest.goal_predicate(spoke_to_guard, "Seyda Neen") is False
+
+    def test_interacted_with_interact_action(self):
+        quest = BUILTIN_SCENARIOS[1]
+        touched_fargoth = [{"action": "interact", "target": "Fargoth"}]
+        assert quest.goal_predicate(touched_fargoth, "Seyda Neen") is True
+
+    def test_no_predicate_scenario(self):
+        combat_scenarios = [s for s in BUILTIN_SCENARIOS if "combat" in s.tags]
+        first_combat = combat_scenarios[0]
+        assert first_combat.goal_predicate is None
+
+
+# ---------------------------------------------------------------------------
+# Metrics
+# ---------------------------------------------------------------------------
+
+
+class TestScenarioResult:
+    """Checks on ``ScenarioResult`` defaults."""
+
+    def test_default_values(self):
+        blank = ScenarioResult(scenario_name="test")
+        assert blank.success is False
+        # Numeric counters all start at zero.
+        assert blank.cycles_used == 0
+        assert blank.llm_calls == 0
+        assert blank.metabolic_cost == 0.0
+        assert blank.error is None
+
+
+class TestBenchmarkMetrics:
+    """Checks on aggregate properties, summary text and JSONL persistence."""
+
+    def test_empty_metrics(self):
+        empty = BenchmarkMetrics()
+        assert empty.pass_count == 0
+        assert empty.fail_count == 0
+        assert empty.success_rate == 0.0
+        assert empty.total_llm_calls == 0
+        assert empty.total_metabolic_cost == 0.0
+
+    def test_success_rate(self):
+        outcomes = [("a", True), ("b", False), ("c", True)]
+        metrics = BenchmarkMetrics(
+            results=[ScenarioResult(scenario_name=name, success=ok) for name, ok in outcomes]
+        )
+        assert metrics.pass_count == 2
+        assert metrics.fail_count == 1
+        assert abs(metrics.success_rate - 2 / 3) < 0.01
+
+    def test_totals(self):
+        first = ScenarioResult(scenario_name="a", llm_calls=10, metabolic_cost=30.0)
+        second = ScenarioResult(scenario_name="b", llm_calls=5, metabolic_cost=15.0)
+        metrics = BenchmarkMetrics(results=[first, second])
+        assert metrics.total_llm_calls == 15
+        assert metrics.total_metabolic_cost == 45.0
+
+    def test_save_and_load(self, tmp_path):
+        target = tmp_path / "bench.jsonl"
+        scenario = ScenarioResult(
+            scenario_name="a",
+            success=True,
+            cycles_used=5,
+            max_cycles=10,
+        )
+        metrics = BenchmarkMetrics(
+            timestamp="2026-01-01T00:00:00",
+            commit_sha="abc123",
+            total_time_ms=1000,
+            results=[scenario],
+        )
+        metrics.save(target)
+
+        history = load_history(target)
+        assert len(history) == 1
+        saved = history[0]
+        assert saved["commit_sha"] == "abc123"
+        assert saved["scenarios"][0]["scenario_name"] == "a"
+
+    def test_save_appends(self, tmp_path):
+        target = tmp_path / "bench.jsonl"
+        for i in range(3):
+            BenchmarkMetrics(
+                timestamp=f"2026-01-0{i + 1}T00:00:00",
+                results=[ScenarioResult(scenario_name=f"s{i}")],
+            ).save(target)
+
+        history = load_history(target)
+        assert len(history) == 3
+        # Most recent first
+        newest = history[0]
+        assert newest["timestamp"] == "2026-01-03T00:00:00"
+
+    def test_summary_output(self):
+        walk = ScenarioResult(
+            scenario_name="Walk Test",
+            success=True,
+            cycles_used=5,
+            max_cycles=10,
+            wall_time_ms=200,
+            llm_calls=15,
+        )
+        metrics = BenchmarkMetrics(
+            timestamp="2026-01-01T00:00:00",
+            commit_sha="abc123",
+            total_time_ms=500,
+            results=[walk],
+        )
+        text = metrics.summary()
+        for fragment in ("Walk Test", "PASS", "abc123"):
+            assert fragment in text
+
+    def test_load_history_missing_file(self, tmp_path):
+        missing = tmp_path / "nope.jsonl"
+        assert load_history(missing) == []
+
+    def test_load_history_corrupt_lines(self, tmp_path):
+        target = tmp_path / "bench.jsonl"
+        # The middle line is not JSON and must be skipped.
+        target.write_text('{"valid": true}\nnot json\n{"also": "valid"}\n')
+        assert len(load_history(target)) == 2
+
+
+# ---------------------------------------------------------------------------
+# Comparison
+# ---------------------------------------------------------------------------
+
+
+class TestCompareRuns:
+    """Checks that ``compare_runs`` labels each kind of change."""
+
+    @staticmethod
+    def _walk_run(success: bool, cycles: int) -> BenchmarkMetrics:
+        """Build a run containing a single scenario named ``walk``."""
+        walk = ScenarioResult(scenario_name="walk", success=success, cycles_used=cycles)
+        return BenchmarkMetrics(results=[walk])
+
+    def test_regression_detected(self):
+        before = self._walk_run(True, 10)
+        after = self._walk_run(False, 10)
+        assert "REGRESSION" in compare_runs(after, before)
+
+    def test_improvement_detected(self):
+        before = self._walk_run(False, 10)
+        after = self._walk_run(True, 10)
+        assert "IMPROVEMENT" in compare_runs(after, before)
+
+    def test_slower_detected(self):
+        before = self._walk_run(True, 10)
+        # Twice the baseline cycle count.
+        after = self._walk_run(True, 20)
+        assert "SLOWER" in compare_runs(after, before)
+
+    def test_new_scenario_noted(self):
+        before = BenchmarkMetrics(results=[])
+        after = BenchmarkMetrics(
+            results=[ScenarioResult(scenario_name="new_one", success=True)],
+        )
+        assert "NEW" in compare_runs(after, before)
+
+
+# ---------------------------------------------------------------------------
+# Runner
+# ---------------------------------------------------------------------------
+
+
+class TestBenchmarkRunner:
+    """Exercises BenchmarkRunner.run() on small hand-built scenarios and on the built-ins."""
+
+    @pytest.mark.asyncio
+    async def test_run_single_scenario(self):
+        """A single predicate-free scenario yields one fully populated result."""
+        walk = BenchmarkScenario(
+            name="Test Walk",
+            description="Simple test",
+            start_location="A",
+            goal_location="A",
+            max_cycles=3,
+            tags=["test"],
+        )
+
+        outcome = await BenchmarkRunner().run([walk])
+
+        assert len(outcome.results) == 1
+        result = outcome.results[0]
+        assert result.scenario_name == "Test Walk"
+        # With no predicate the run is expected to use every cycle ...
+        assert result.cycles_used == 3
+        # ... and to count as a success simply because it survived.
+        assert result.success is True
+        assert result.wall_time_ms >= 0
+        # Expected count: 3 cycles * 3 calls per cycle.
+        assert result.llm_calls == 9
+        assert result.metabolic_cost > 0
+
+    @pytest.mark.asyncio
+    async def test_run_with_goal_predicate(self):
+        """A predicate that is satisfied at once ends the run after the first cycle."""
+
+        def goal_met(actions, location):
+            return True
+
+        instant_win = BenchmarkScenario(
+            name="Instant Win",
+            description="Predicate satisfied immediately",
+            start_location="A",
+            max_cycles=100,
+            goal_predicate=goal_met,
+            tags=["test"],
+        )
+
+        outcome = await BenchmarkRunner().run([instant_win])
+
+        result = outcome.results[0]
+        assert result.success is True
+        # Far below max_cycles: the run stopped at the first cycle.
+        assert result.cycles_used == 1
+
+    @pytest.mark.asyncio
+    async def test_run_with_failing_predicate(self):
+        """A predicate that is never satisfied uses up all cycles and fails the scenario."""
+
+        def goal_never_met(actions, location):
+            return False
+
+        impossible = BenchmarkScenario(
+            name="Impossible",
+            description="Predicate never satisfied",
+            start_location="A",
+            max_cycles=5,
+            goal_predicate=goal_never_met,
+            tags=["test"],
+        )
+
+        outcome = await BenchmarkRunner().run([impossible])
+
+        result = outcome.results[0]
+        assert result.success is False
+        assert result.cycles_used == 5
+
+    @pytest.mark.asyncio
+    async def test_run_multiple_scenarios(self):
+        """Three scenarios passed together yield three results plus run-level metadata."""
+
+        def numbered(index):
+            return BenchmarkScenario(
+                name=f"Scenario {index}",
+                description=f"Test {index}",
+                start_location="A",
+                max_cycles=2,
+                tags=["test"],
+            )
+
+        batch = [numbered(index) for index in range(3)]
+
+        outcome = await BenchmarkRunner().run(batch)
+
+        assert len(outcome.results) == 3
+        assert outcome.total_time_ms >= 0
+        assert outcome.timestamp
+
+    @pytest.mark.asyncio
+    async def test_metrics_commit_sha(self):
+        """The commit_sha field of the metrics is always a string."""
+        sha_probe = BenchmarkScenario(
+            name="SHA Test",
+            description="Check SHA capture",
+            start_location="A",
+            max_cycles=1,
+            tags=["test"],
+        )
+
+        outcome = await BenchmarkRunner().run([sha_probe])
+
+        # A SHA may not be available in the test environment, so only the
+        # type is checked; the point is that the run does not crash.
+        assert isinstance(outcome.commit_sha, str)
+
+    @pytest.mark.asyncio
+    async def test_builtin_scenarios_run(self):
+        """Smoke test: every built-in scenario runs, succeeds and reports no error."""
+
+        def quick_copy(builtin):
+            # Same setup as the built-in, capped at 2 cycles for speed and
+            # with the goal predicate removed so only survival is checked.
+            return BenchmarkScenario(
+                name=builtin.name,
+                description=builtin.description,
+                start_location=builtin.start_location,
+                goal_location=builtin.goal_location,
+                entities=list(builtin.entities),
+                events=list(builtin.events),
+                max_cycles=2,
+                goal_predicate=None,
+                tags=list(builtin.tags),
+            )
+
+        trimmed = [quick_copy(builtin) for builtin in BUILTIN_SCENARIOS]
+
+        outcome = await BenchmarkRunner().run(trimmed)
+
+        assert len(outcome.results) == len(BUILTIN_SCENARIOS)
+        # No predicate + survived = pass, so every result should succeed.
+        for result in outcome.results:
+            assert result.success is True
+            assert result.error is None
--- a/tox.ini
+++ b/tox.ini
@@ -87,6 +87,11 @@ description = Live LLM tests via Ollama (requires running Ollama)
 commands =
    pytest tests/ -q --tb=short -m ollama --timeout=120

+[testenv:benchmark]
+description = Agent performance regression benchmark suite
+commands =
+    python scripts/run_benchmarks.py {posargs}
+
 # ── CI / Coverage ────────────────────────────────────────────────────────────

 [testenv:ci]