feat: add agent performance regression benchmark suite
Implement standardised Morrowind benchmark scenarios to detect agent performance regressions after code changes. - 5 built-in scenarios: navigation (Seyda Neen→Balmora, Balmora intra-city), quest (Fargoth's Ring), combat (Mudcrab), observation - BenchmarkRunner executes scenarios through the heartbeat loop with MockWorldAdapter, tracking cycles, wall time, LLM calls, metabolic cost - Goal predicates (reached_location, interacted_with) for early success - BenchmarkMetrics with JSONL persistence and compare_runs() for regression detection - CLI script (scripts/run_benchmarks.py) with tag filtering and baseline comparison - tox -e benchmark environment for CI integration - 31 unit tests covering scenarios, predicates, metrics, runner, and persistence Fixes #1015 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
107
scripts/run_benchmarks.py
Normal file
107
scripts/run_benchmarks.py
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Run the agent performance regression benchmark suite.
|
||||
|
||||
Usage::
|
||||
|
||||
python scripts/run_benchmarks.py # all scenarios
|
||||
python scripts/run_benchmarks.py --tags navigation # filter by tag
|
||||
python scripts/run_benchmarks.py --output results/benchmarks.jsonl
|
||||
python scripts/run_benchmarks.py --compare results/benchmarks.jsonl
|
||||
|
||||
Exit codes:
|
||||
0 — all scenarios passed
|
||||
1 — one or more scenarios failed
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Ensure src/ is on the path when invoked directly
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))
|
||||
|
||||
from infrastructure.world.benchmark.metrics import BenchmarkMetrics, load_history
|
||||
from infrastructure.world.benchmark.runner import BenchmarkRunner
|
||||
from infrastructure.world.benchmark.scenarios import load_scenarios
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Agent performance regression benchmark suite",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tags",
|
||||
nargs="*",
|
||||
default=None,
|
||||
help="Filter scenarios by tag (e.g. navigation quest)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="JSONL file to append results to",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--compare",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="JSONL file with baseline results for regression comparison",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
async def main() -> int:
|
||||
args = parse_args()
|
||||
|
||||
scenarios = load_scenarios(tags=args.tags)
|
||||
if not scenarios:
|
||||
print("No matching scenarios found.")
|
||||
return 1
|
||||
|
||||
print(f"Running {len(scenarios)} benchmark scenario(s)...\n")
|
||||
|
||||
runner = BenchmarkRunner()
|
||||
metrics = await runner.run(scenarios)
|
||||
|
||||
print(metrics.summary())
|
||||
|
||||
if args.output:
|
||||
metrics.save(args.output)
|
||||
|
||||
if args.compare:
|
||||
history = load_history(args.compare)
|
||||
if history:
|
||||
from infrastructure.world.benchmark.metrics import compare_runs
|
||||
|
||||
# Reconstruct baseline from last recorded run
|
||||
last = history[0]
|
||||
baseline = BenchmarkMetrics(
|
||||
timestamp=last.get("timestamp", ""),
|
||||
commit_sha=last.get("commit_sha", ""),
|
||||
total_time_ms=last.get("total_time_ms", 0),
|
||||
)
|
||||
for s in last.get("scenarios", []):
|
||||
from infrastructure.world.benchmark.metrics import ScenarioResult
|
||||
|
||||
baseline.results.append(
|
||||
ScenarioResult(
|
||||
scenario_name=s["scenario_name"],
|
||||
success=s["success"],
|
||||
cycles_used=s["cycles_used"],
|
||||
max_cycles=s["max_cycles"],
|
||||
wall_time_ms=s.get("wall_time_ms", 0),
|
||||
llm_calls=s.get("llm_calls", 0),
|
||||
metabolic_cost=s.get("metabolic_cost", 0.0),
|
||||
)
|
||||
)
|
||||
print()
|
||||
print(compare_runs(metrics, baseline))
|
||||
|
||||
return 0 if metrics.fail_count == 0 else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(asyncio.run(main()))
|
||||
17
src/infrastructure/world/benchmark/__init__.py
Normal file
17
src/infrastructure/world/benchmark/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
"""Performance regression suite for Morrowind agent scenarios.
|
||||
|
||||
Provides standardised benchmark scenarios, a runner that executes them
|
||||
through the heartbeat loop with a mock (or live) world adapter, and
|
||||
metrics collection for CI-integrated regression detection.
|
||||
"""
|
||||
|
||||
from infrastructure.world.benchmark.metrics import BenchmarkMetrics
|
||||
from infrastructure.world.benchmark.runner import BenchmarkRunner
|
||||
from infrastructure.world.benchmark.scenarios import BenchmarkScenario, load_scenarios
|
||||
|
||||
__all__ = [
|
||||
"BenchmarkMetrics",
|
||||
"BenchmarkRunner",
|
||||
"BenchmarkScenario",
|
||||
"load_scenarios",
|
||||
]
|
||||
195
src/infrastructure/world/benchmark/metrics.py
Normal file
195
src/infrastructure/world/benchmark/metrics.py
Normal file
@@ -0,0 +1,195 @@
|
||||
"""Benchmark metrics collection and persistence.
|
||||
|
||||
Tracks per-scenario results: cycles used, wall-clock time, success,
|
||||
LLM call count, and estimated metabolic cost. Results are persisted
|
||||
as JSONL for trend analysis and CI regression gates.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScenarioResult:
|
||||
"""Outcome of running a single benchmark scenario.
|
||||
|
||||
Attributes:
|
||||
scenario_name: Human-readable scenario name.
|
||||
success: Whether the goal predicate was satisfied.
|
||||
cycles_used: Number of heartbeat cycles executed.
|
||||
max_cycles: The scenario's cycle budget.
|
||||
wall_time_ms: Total wall-clock time in milliseconds.
|
||||
llm_calls: Number of LLM inference calls made.
|
||||
metabolic_cost: Estimated resource cost (arbitrary unit, ≈ tokens).
|
||||
error: Error message if the run crashed.
|
||||
tags: Scenario tags (copied for filtering).
|
||||
"""
|
||||
|
||||
scenario_name: str
|
||||
success: bool = False
|
||||
cycles_used: int = 0
|
||||
max_cycles: int = 0
|
||||
wall_time_ms: int = 0
|
||||
llm_calls: int = 0
|
||||
metabolic_cost: float = 0.0
|
||||
error: str | None = None
|
||||
tags: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkMetrics:
|
||||
"""Aggregated metrics across all scenarios in a benchmark run.
|
||||
|
||||
Attributes:
|
||||
results: Per-scenario results.
|
||||
total_time_ms: Total wall-clock time for the full suite.
|
||||
timestamp: ISO-8601 timestamp of the run.
|
||||
commit_sha: Git commit SHA (if available).
|
||||
"""
|
||||
|
||||
results: list[ScenarioResult] = field(default_factory=list)
|
||||
total_time_ms: int = 0
|
||||
timestamp: str = ""
|
||||
commit_sha: str = ""
|
||||
|
||||
# -- derived properties ------------------------------------------------
|
||||
|
||||
@property
|
||||
def pass_count(self) -> int:
|
||||
return sum(1 for r in self.results if r.success)
|
||||
|
||||
@property
|
||||
def fail_count(self) -> int:
|
||||
return sum(1 for r in self.results if not r.success)
|
||||
|
||||
@property
|
||||
def success_rate(self) -> float:
|
||||
if not self.results:
|
||||
return 0.0
|
||||
return self.pass_count / len(self.results)
|
||||
|
||||
@property
|
||||
def total_llm_calls(self) -> int:
|
||||
return sum(r.llm_calls for r in self.results)
|
||||
|
||||
@property
|
||||
def total_metabolic_cost(self) -> float:
|
||||
return sum(r.metabolic_cost for r in self.results)
|
||||
|
||||
# -- persistence -------------------------------------------------------
|
||||
|
||||
def save(self, path: Path) -> None:
|
||||
"""Append this run's results to a JSONL file at *path*."""
|
||||
path = Path(path)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
record = {
|
||||
"timestamp": self.timestamp,
|
||||
"commit_sha": self.commit_sha,
|
||||
"total_time_ms": self.total_time_ms,
|
||||
"success_rate": round(self.success_rate, 4),
|
||||
"total_llm_calls": self.total_llm_calls,
|
||||
"total_metabolic_cost": round(self.total_metabolic_cost, 2),
|
||||
"scenarios": [asdict(r) for r in self.results],
|
||||
}
|
||||
with path.open("a") as f:
|
||||
f.write(json.dumps(record) + "\n")
|
||||
logger.info("Benchmark results saved to %s", path)
|
||||
|
||||
# -- summary -----------------------------------------------------------
|
||||
|
||||
def summary(self) -> str:
|
||||
"""Return a human-readable summary of the benchmark run."""
|
||||
lines = [
|
||||
"=== Benchmark Summary ===",
|
||||
f"Scenarios: {len(self.results)} "
|
||||
f"Passed: {self.pass_count} "
|
||||
f"Failed: {self.fail_count} "
|
||||
f"Success rate: {self.success_rate:.0%}",
|
||||
f"Total time: {self.total_time_ms} ms "
|
||||
f"LLM calls: {self.total_llm_calls} "
|
||||
f"Metabolic cost: {self.total_metabolic_cost:.1f}",
|
||||
]
|
||||
if self.commit_sha:
|
||||
lines.append(f"Commit: {self.commit_sha}")
|
||||
lines.append("")
|
||||
for r in self.results:
|
||||
status = "PASS" if r.success else "FAIL"
|
||||
lines.append(
|
||||
f" [{status}] {r.scenario_name} — "
|
||||
f"{r.cycles_used}/{r.max_cycles} cycles, "
|
||||
f"{r.wall_time_ms} ms, "
|
||||
f"{r.llm_calls} LLM calls"
|
||||
)
|
||||
if r.error:
|
||||
lines.append(f" Error: {r.error}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def load_history(path: Path) -> list[dict]:
|
||||
"""Load benchmark history from a JSONL file.
|
||||
|
||||
Returns:
|
||||
List of run records, most recent first.
|
||||
"""
|
||||
path = Path(path)
|
||||
if not path.exists():
|
||||
return []
|
||||
records: list[dict] = []
|
||||
for line in path.read_text().strip().splitlines():
|
||||
try:
|
||||
records.append(json.loads(line))
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
return list(reversed(records))
|
||||
|
||||
|
||||
def compare_runs(
|
||||
current: BenchmarkMetrics,
|
||||
baseline: BenchmarkMetrics,
|
||||
) -> str:
|
||||
"""Compare two benchmark runs and report regressions.
|
||||
|
||||
Returns:
|
||||
Human-readable comparison report.
|
||||
"""
|
||||
lines = ["=== Regression Report ==="]
|
||||
|
||||
# Overall
|
||||
rate_delta = current.success_rate - baseline.success_rate
|
||||
lines.append(
|
||||
f"Success rate: {baseline.success_rate:.0%} -> {current.success_rate:.0%} "
|
||||
f"({rate_delta:+.0%})"
|
||||
)
|
||||
|
||||
cost_delta = current.total_metabolic_cost - baseline.total_metabolic_cost
|
||||
if baseline.total_metabolic_cost > 0:
|
||||
cost_pct = (cost_delta / baseline.total_metabolic_cost) * 100
|
||||
lines.append(
|
||||
f"Metabolic cost: {baseline.total_metabolic_cost:.1f} -> "
|
||||
f"{current.total_metabolic_cost:.1f} ({cost_pct:+.1f}%)"
|
||||
)
|
||||
|
||||
# Per-scenario
|
||||
baseline_map = {r.scenario_name: r for r in baseline.results}
|
||||
for r in current.results:
|
||||
b = baseline_map.get(r.scenario_name)
|
||||
if b is None:
|
||||
lines.append(f" [NEW] {r.scenario_name}")
|
||||
continue
|
||||
if b.success and not r.success:
|
||||
lines.append(f" [REGRESSION] {r.scenario_name} — was PASS, now FAIL")
|
||||
elif not b.success and r.success:
|
||||
lines.append(f" [IMPROVEMENT] {r.scenario_name} — was FAIL, now PASS")
|
||||
elif r.cycles_used > b.cycles_used * 1.5:
|
||||
lines.append(
|
||||
f" [SLOWER] {r.scenario_name} — "
|
||||
f"{b.cycles_used} -> {r.cycles_used} cycles (+{r.cycles_used - b.cycles_used})"
|
||||
)
|
||||
|
||||
return "\n".join(lines)
|
||||
167
src/infrastructure/world/benchmark/runner.py
Normal file
167
src/infrastructure/world/benchmark/runner.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""Benchmark runner — executes scenarios through the heartbeat loop.
|
||||
|
||||
Wires each ``BenchmarkScenario`` into a ``MockWorldAdapter`` (or a
|
||||
supplied adapter), runs the heartbeat for up to ``max_cycles``, and
|
||||
collects ``BenchmarkMetrics``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from infrastructure.world.adapters.mock import MockWorldAdapter
|
||||
from infrastructure.world.benchmark.metrics import BenchmarkMetrics, ScenarioResult
|
||||
from infrastructure.world.benchmark.scenarios import BenchmarkScenario
|
||||
from infrastructure.world.interface import WorldInterface
|
||||
from loop.heartbeat import Heartbeat
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Rough estimate: each heartbeat cycle costs ~1 unit of metabolic cost
|
||||
# (gather + reason + act phases each touch the LLM router once).
|
||||
_COST_PER_CYCLE = 3.0 # three phases per cycle
|
||||
|
||||
|
||||
class BenchmarkRunner:
|
||||
"""Run benchmark scenarios and collect metrics.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
adapter_factory:
|
||||
Optional callable that returns a ``WorldInterface`` for a given
|
||||
scenario. Defaults to building a ``MockWorldAdapter`` from the
|
||||
scenario's start state.
|
||||
heartbeat_interval:
|
||||
Seconds between heartbeat ticks (0 for immediate).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
adapter_factory=None,
|
||||
heartbeat_interval: float = 0.0,
|
||||
) -> None:
|
||||
self._adapter_factory = adapter_factory or self._default_adapter
|
||||
self._interval = heartbeat_interval
|
||||
|
||||
# -- public API --------------------------------------------------------
|
||||
|
||||
async def run(
|
||||
self,
|
||||
scenarios: list[BenchmarkScenario],
|
||||
) -> BenchmarkMetrics:
|
||||
"""Execute all *scenarios* and return aggregated metrics."""
|
||||
metrics = BenchmarkMetrics(
|
||||
timestamp=datetime.now(UTC).isoformat(),
|
||||
commit_sha=self._git_sha(),
|
||||
)
|
||||
suite_start = time.monotonic()
|
||||
|
||||
for scenario in scenarios:
|
||||
logger.info("Benchmark: starting '%s'", scenario.name)
|
||||
result = await self._run_scenario(scenario)
|
||||
metrics.results.append(result)
|
||||
status = "PASS" if result.success else "FAIL"
|
||||
logger.info(
|
||||
"Benchmark: '%s' %s (%d/%d cycles, %d ms)",
|
||||
scenario.name,
|
||||
status,
|
||||
result.cycles_used,
|
||||
result.max_cycles,
|
||||
result.wall_time_ms,
|
||||
)
|
||||
|
||||
metrics.total_time_ms = int((time.monotonic() - suite_start) * 1000)
|
||||
return metrics
|
||||
|
||||
# -- internal ----------------------------------------------------------
|
||||
|
||||
async def _run_scenario(self, scenario: BenchmarkScenario) -> ScenarioResult:
|
||||
"""Run a single scenario through the heartbeat loop."""
|
||||
result = ScenarioResult(
|
||||
scenario_name=scenario.name,
|
||||
max_cycles=scenario.max_cycles,
|
||||
tags=list(scenario.tags),
|
||||
)
|
||||
|
||||
adapter = self._adapter_factory(scenario)
|
||||
adapter.connect()
|
||||
|
||||
hb = Heartbeat(world=adapter, interval=self._interval)
|
||||
actions: list[dict] = []
|
||||
|
||||
start = time.monotonic()
|
||||
try:
|
||||
for cycle in range(1, scenario.max_cycles + 1):
|
||||
record = await hb.run_once()
|
||||
result.cycles_used = cycle
|
||||
|
||||
# Track LLM calls (each cycle has 3 phases that may call LLM)
|
||||
result.llm_calls += 3
|
||||
|
||||
# Accumulate actions for goal predicate
|
||||
if record.action_taken and record.action_taken != "idle":
|
||||
actions.append(
|
||||
{
|
||||
"action": record.action_taken,
|
||||
"target": record.observation.get("location", ""),
|
||||
"status": record.action_status,
|
||||
}
|
||||
)
|
||||
|
||||
# Update adapter location if scenario simulates movement
|
||||
current_location = self._get_current_location(adapter)
|
||||
|
||||
# Check goal predicate
|
||||
if scenario.goal_predicate is not None:
|
||||
if scenario.goal_predicate(actions, current_location):
|
||||
result.success = True
|
||||
break
|
||||
elif cycle == scenario.max_cycles:
|
||||
# No predicate — success if we survived all cycles
|
||||
result.success = True
|
||||
|
||||
except Exception as exc:
|
||||
logger.warning("Benchmark scenario '%s' crashed: %s", scenario.name, exc)
|
||||
result.error = str(exc)
|
||||
finally:
|
||||
adapter.disconnect()
|
||||
|
||||
result.wall_time_ms = int((time.monotonic() - start) * 1000)
|
||||
result.metabolic_cost = result.cycles_used * _COST_PER_CYCLE
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def _default_adapter(scenario: BenchmarkScenario) -> WorldInterface:
|
||||
"""Build a MockWorldAdapter from a scenario's starting state."""
|
||||
return MockWorldAdapter(
|
||||
location=scenario.start_location,
|
||||
entities=list(scenario.entities),
|
||||
events=list(scenario.events),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_current_location(adapter: WorldInterface) -> str:
|
||||
"""Read the current location from the adapter."""
|
||||
try:
|
||||
perception = adapter.observe()
|
||||
return perception.location
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def _git_sha() -> str:
|
||||
"""Best-effort: return the current git commit SHA."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["git", "rev-parse", "--short", "HEAD"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
return result.stdout.strip() if result.returncode == 0 else ""
|
||||
except (OSError, subprocess.TimeoutExpired):
|
||||
return ""
|
||||
160
src/infrastructure/world/benchmark/scenarios.py
Normal file
160
src/infrastructure/world/benchmark/scenarios.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Benchmark scenario definitions for Morrowind agent regression testing.
|
||||
|
||||
Each scenario specifies a starting location, goal conditions, world state
|
||||
(entities, events), and maximum cycles allowed. The runner feeds these
|
||||
into the heartbeat loop and checks completion against the goal predicate.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BenchmarkScenario:
|
||||
"""A reproducible agent task used to detect performance regressions.
|
||||
|
||||
Attributes:
|
||||
name: Human-readable scenario name.
|
||||
description: What the scenario tests.
|
||||
start_location: Where the agent begins.
|
||||
goal_location: Target location (if navigation scenario).
|
||||
entities: NPCs / objects present in the world.
|
||||
events: Game events injected each cycle.
|
||||
max_cycles: Hard cap on heartbeat cycles before failure.
|
||||
goal_predicate: Optional callable ``(actions, location) -> bool``
|
||||
evaluated after each cycle to check early success.
|
||||
tags: Freeform tags for filtering (e.g. "navigation", "quest").
|
||||
"""
|
||||
|
||||
name: str
|
||||
description: str
|
||||
start_location: str
|
||||
goal_location: str = ""
|
||||
entities: list[str] = field(default_factory=list)
|
||||
events: list[str] = field(default_factory=list)
|
||||
max_cycles: int = 50
|
||||
goal_predicate: Callable | None = None
|
||||
tags: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Goal predicates
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _reached_location(target: str) -> Callable:
|
||||
"""Return a predicate that checks whether the agent reached *target*."""
|
||||
|
||||
def predicate(actions: list[dict], current_location: str) -> bool:
|
||||
return current_location.lower() == target.lower()
|
||||
|
||||
return predicate
|
||||
|
||||
|
||||
def _interacted_with(npc: str) -> Callable:
|
||||
"""Return a predicate that checks for a speak/interact action with *npc*."""
|
||||
|
||||
def predicate(actions: list[dict], current_location: str) -> bool:
|
||||
for act in actions:
|
||||
if act.get("action") in ("speak", "interact", "talk"):
|
||||
if act.get("target", "").lower() == npc.lower():
|
||||
return True
|
||||
return False
|
||||
|
||||
return predicate
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Built-in scenarios
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
BUILTIN_SCENARIOS: list[BenchmarkScenario] = [
|
||||
BenchmarkScenario(
|
||||
name="Walk Seyda Neen to Balmora",
|
||||
description=(
|
||||
"Navigate from the starting village to Balmora via the road. "
|
||||
"Tests basic navigation and pathfinding."
|
||||
),
|
||||
start_location="Seyda Neen",
|
||||
goal_location="Balmora",
|
||||
entities=["Silt Strider", "Road Sign", "Mudcrab"],
|
||||
events=["player_spawned"],
|
||||
max_cycles=30,
|
||||
goal_predicate=_reached_location("Balmora"),
|
||||
tags=["navigation", "basic"],
|
||||
),
|
||||
BenchmarkScenario(
|
||||
name="Fargoth's Ring",
|
||||
description=(
|
||||
"Complete the Fargoth quest: find Fargoth, receive the ring, "
|
||||
"and return it. Tests NPC interaction and quest logic."
|
||||
),
|
||||
start_location="Seyda Neen",
|
||||
goal_location="Seyda Neen",
|
||||
entities=["Fargoth", "Arrille", "Guard"],
|
||||
events=["quest_available:fargoth_ring"],
|
||||
max_cycles=40,
|
||||
goal_predicate=_interacted_with("Fargoth"),
|
||||
tags=["quest", "npc_interaction"],
|
||||
),
|
||||
BenchmarkScenario(
|
||||
name="Balmora Guild Navigation",
|
||||
description=(
|
||||
"Walk from Balmora South Wall Corner Club to the Fighters Guild. "
|
||||
"Tests intra-city navigation with multiple NPCs present."
|
||||
),
|
||||
start_location="Balmora, South Wall Corner Club",
|
||||
goal_location="Balmora, Fighters Guild",
|
||||
entities=["Guard", "Merchant", "Caius Cosades"],
|
||||
events=["player_entered"],
|
||||
max_cycles=20,
|
||||
goal_predicate=_reached_location("Balmora, Fighters Guild"),
|
||||
tags=["navigation", "city"],
|
||||
),
|
||||
BenchmarkScenario(
|
||||
name="Combat Encounter — Mudcrab",
|
||||
description=(
|
||||
"Engage and defeat a single Mudcrab on the road between "
|
||||
"Seyda Neen and Balmora. Tests combat action selection."
|
||||
),
|
||||
start_location="Bitter Coast Road",
|
||||
goal_location="Bitter Coast Road",
|
||||
entities=["Mudcrab"],
|
||||
events=["hostile_entity_nearby"],
|
||||
max_cycles=15,
|
||||
goal_predicate=None, # Success = survived max_cycles without crash
|
||||
tags=["combat", "basic"],
|
||||
),
|
||||
BenchmarkScenario(
|
||||
name="Passive Observation — Balmora Market",
|
||||
description=(
|
||||
"Observe the Balmora market for 10 cycles without acting. "
|
||||
"Tests that the agent can reason without unnecessary actions."
|
||||
),
|
||||
start_location="Balmora, Market Square",
|
||||
goal_location="",
|
||||
entities=["Merchant", "Guard", "Pilgrim", "Trader"],
|
||||
events=["market_day"],
|
||||
max_cycles=10,
|
||||
tags=["observation", "passive"],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def load_scenarios(
|
||||
tags: list[str] | None = None,
|
||||
) -> list[BenchmarkScenario]:
|
||||
"""Return built-in scenarios, optionally filtered by tags.
|
||||
|
||||
Args:
|
||||
tags: If provided, only return scenarios whose tags overlap.
|
||||
|
||||
Returns:
|
||||
List of matching ``BenchmarkScenario`` instances.
|
||||
"""
|
||||
if tags is None:
|
||||
return list(BUILTIN_SCENARIOS)
|
||||
tag_set = set(tags)
|
||||
return [s for s in BUILTIN_SCENARIOS if tag_set & set(s.tags)]
|
||||
394
tests/infrastructure/world/test_benchmark.py
Normal file
394
tests/infrastructure/world/test_benchmark.py
Normal file
@@ -0,0 +1,394 @@
|
||||
"""Tests for the agent performance regression benchmark suite.
|
||||
|
||||
Covers: scenario loading, metrics collection, runner execution,
|
||||
goal predicates, and result persistence.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from infrastructure.world.benchmark.metrics import (
|
||||
BenchmarkMetrics,
|
||||
ScenarioResult,
|
||||
compare_runs,
|
||||
load_history,
|
||||
)
|
||||
from infrastructure.world.benchmark.runner import BenchmarkRunner
|
||||
from infrastructure.world.benchmark.scenarios import (
|
||||
BUILTIN_SCENARIOS,
|
||||
BenchmarkScenario,
|
||||
load_scenarios,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario definitions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestBenchmarkScenario:
|
||||
def test_builtin_scenarios_exist(self):
|
||||
assert len(BUILTIN_SCENARIOS) >= 5
|
||||
|
||||
def test_scenario_fields(self):
|
||||
s = BUILTIN_SCENARIOS[0]
|
||||
assert s.name
|
||||
assert s.description
|
||||
assert s.start_location
|
||||
assert s.max_cycles > 0
|
||||
|
||||
def test_load_all_scenarios(self):
|
||||
scenarios = load_scenarios()
|
||||
assert len(scenarios) == len(BUILTIN_SCENARIOS)
|
||||
|
||||
def test_load_scenarios_by_tag(self):
|
||||
nav = load_scenarios(tags=["navigation"])
|
||||
assert len(nav) >= 2
|
||||
for s in nav:
|
||||
assert "navigation" in s.tags
|
||||
|
||||
def test_load_scenarios_no_match(self):
|
||||
result = load_scenarios(tags=["nonexistent_tag"])
|
||||
assert result == []
|
||||
|
||||
def test_scenario_is_frozen(self):
|
||||
s = BUILTIN_SCENARIOS[0]
|
||||
with pytest.raises(AttributeError):
|
||||
s.name = "modified"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Goal predicates
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestGoalPredicates:
|
||||
def test_reached_location_predicate(self):
|
||||
s = BUILTIN_SCENARIOS[0] # Walk to Balmora
|
||||
assert s.goal_predicate is not None
|
||||
assert s.goal_predicate([], "Balmora") is True
|
||||
assert s.goal_predicate([], "Seyda Neen") is False
|
||||
|
||||
def test_reached_location_case_insensitive(self):
|
||||
s = BUILTIN_SCENARIOS[0]
|
||||
assert s.goal_predicate([], "balmora") is True
|
||||
assert s.goal_predicate([], "BALMORA") is True
|
||||
|
||||
def test_interacted_with_predicate(self):
|
||||
s = BUILTIN_SCENARIOS[1] # Fargoth quest
|
||||
assert s.goal_predicate is not None
|
||||
actions = [{"action": "speak", "target": "Fargoth"}]
|
||||
assert s.goal_predicate(actions, "Seyda Neen") is True
|
||||
|
||||
def test_interacted_with_no_match(self):
|
||||
s = BUILTIN_SCENARIOS[1]
|
||||
actions = [{"action": "speak", "target": "Guard"}]
|
||||
assert s.goal_predicate(actions, "Seyda Neen") is False
|
||||
|
||||
def test_interacted_with_interact_action(self):
|
||||
s = BUILTIN_SCENARIOS[1]
|
||||
actions = [{"action": "interact", "target": "Fargoth"}]
|
||||
assert s.goal_predicate(actions, "Seyda Neen") is True
|
||||
|
||||
def test_no_predicate_scenario(self):
|
||||
combat = [s for s in BUILTIN_SCENARIOS if "combat" in s.tags][0]
|
||||
assert combat.goal_predicate is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestScenarioResult:
|
||||
def test_default_values(self):
|
||||
r = ScenarioResult(scenario_name="test")
|
||||
assert r.success is False
|
||||
assert r.cycles_used == 0
|
||||
assert r.llm_calls == 0
|
||||
assert r.metabolic_cost == 0.0
|
||||
assert r.error is None
|
||||
|
||||
|
||||
class TestBenchmarkMetrics:
|
||||
def test_empty_metrics(self):
|
||||
m = BenchmarkMetrics()
|
||||
assert m.pass_count == 0
|
||||
assert m.fail_count == 0
|
||||
assert m.success_rate == 0.0
|
||||
assert m.total_llm_calls == 0
|
||||
assert m.total_metabolic_cost == 0.0
|
||||
|
||||
def test_success_rate(self):
|
||||
m = BenchmarkMetrics(
|
||||
results=[
|
||||
ScenarioResult(scenario_name="a", success=True),
|
||||
ScenarioResult(scenario_name="b", success=False),
|
||||
ScenarioResult(scenario_name="c", success=True),
|
||||
]
|
||||
)
|
||||
assert m.pass_count == 2
|
||||
assert m.fail_count == 1
|
||||
assert abs(m.success_rate - 2 / 3) < 0.01
|
||||
|
||||
def test_totals(self):
|
||||
m = BenchmarkMetrics(
|
||||
results=[
|
||||
ScenarioResult(scenario_name="a", llm_calls=10, metabolic_cost=30.0),
|
||||
ScenarioResult(scenario_name="b", llm_calls=5, metabolic_cost=15.0),
|
||||
]
|
||||
)
|
||||
assert m.total_llm_calls == 15
|
||||
assert m.total_metabolic_cost == 45.0
|
||||
|
||||
def test_save_and_load(self, tmp_path):
|
||||
path = tmp_path / "bench.jsonl"
|
||||
m = BenchmarkMetrics(
|
||||
timestamp="2026-01-01T00:00:00",
|
||||
commit_sha="abc123",
|
||||
total_time_ms=1000,
|
||||
results=[
|
||||
ScenarioResult(
|
||||
scenario_name="a",
|
||||
success=True,
|
||||
cycles_used=5,
|
||||
max_cycles=10,
|
||||
),
|
||||
],
|
||||
)
|
||||
m.save(path)
|
||||
|
||||
history = load_history(path)
|
||||
assert len(history) == 1
|
||||
assert history[0]["commit_sha"] == "abc123"
|
||||
assert history[0]["scenarios"][0]["scenario_name"] == "a"
|
||||
|
||||
def test_save_appends(self, tmp_path):
|
||||
path = tmp_path / "bench.jsonl"
|
||||
for i in range(3):
|
||||
m = BenchmarkMetrics(
|
||||
timestamp=f"2026-01-0{i + 1}T00:00:00",
|
||||
results=[ScenarioResult(scenario_name=f"s{i}")],
|
||||
)
|
||||
m.save(path)
|
||||
|
||||
history = load_history(path)
|
||||
assert len(history) == 3
|
||||
# Most recent first
|
||||
assert history[0]["timestamp"] == "2026-01-03T00:00:00"
|
||||
|
||||
def test_summary_output(self):
|
||||
m = BenchmarkMetrics(
|
||||
timestamp="2026-01-01T00:00:00",
|
||||
commit_sha="abc123",
|
||||
total_time_ms=500,
|
||||
results=[
|
||||
ScenarioResult(
|
||||
scenario_name="Walk Test",
|
||||
success=True,
|
||||
cycles_used=5,
|
||||
max_cycles=10,
|
||||
wall_time_ms=200,
|
||||
llm_calls=15,
|
||||
),
|
||||
],
|
||||
)
|
||||
summary = m.summary()
|
||||
assert "Walk Test" in summary
|
||||
assert "PASS" in summary
|
||||
assert "abc123" in summary
|
||||
|
||||
def test_load_history_missing_file(self, tmp_path):
|
||||
assert load_history(tmp_path / "nope.jsonl") == []
|
||||
|
||||
def test_load_history_corrupt_lines(self, tmp_path):
|
||||
path = tmp_path / "bench.jsonl"
|
||||
path.write_text('{"valid": true}\nnot json\n{"also": "valid"}\n')
|
||||
history = load_history(path)
|
||||
assert len(history) == 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Comparison
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCompareRuns:
|
||||
def test_regression_detected(self):
|
||||
baseline = BenchmarkMetrics(
|
||||
results=[
|
||||
ScenarioResult(scenario_name="walk", success=True, cycles_used=10),
|
||||
]
|
||||
)
|
||||
current = BenchmarkMetrics(
|
||||
results=[
|
||||
ScenarioResult(scenario_name="walk", success=False, cycles_used=10),
|
||||
]
|
||||
)
|
||||
report = compare_runs(current, baseline)
|
||||
assert "REGRESSION" in report
|
||||
|
||||
def test_improvement_detected(self):
|
||||
baseline = BenchmarkMetrics(
|
||||
results=[
|
||||
ScenarioResult(scenario_name="walk", success=False, cycles_used=10),
|
||||
]
|
||||
)
|
||||
current = BenchmarkMetrics(
|
||||
results=[
|
||||
ScenarioResult(scenario_name="walk", success=True, cycles_used=10),
|
||||
]
|
||||
)
|
||||
report = compare_runs(current, baseline)
|
||||
assert "IMPROVEMENT" in report
|
||||
|
||||
def test_slower_detected(self):
|
||||
baseline = BenchmarkMetrics(
|
||||
results=[
|
||||
ScenarioResult(scenario_name="walk", success=True, cycles_used=10),
|
||||
]
|
||||
)
|
||||
current = BenchmarkMetrics(
|
||||
results=[
|
||||
ScenarioResult(scenario_name="walk", success=True, cycles_used=20),
|
||||
]
|
||||
)
|
||||
report = compare_runs(current, baseline)
|
||||
assert "SLOWER" in report
|
||||
|
||||
def test_new_scenario_noted(self):
|
||||
baseline = BenchmarkMetrics(results=[])
|
||||
current = BenchmarkMetrics(results=[ScenarioResult(scenario_name="new_one", success=True)])
|
||||
report = compare_runs(current, baseline)
|
||||
assert "NEW" in report
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestBenchmarkRunner:
|
||||
@pytest.mark.asyncio
|
||||
async def test_run_single_scenario(self):
|
||||
"""Runner executes a scenario and returns a result."""
|
||||
scenario = BenchmarkScenario(
|
||||
name="Test Walk",
|
||||
description="Simple test",
|
||||
start_location="A",
|
||||
goal_location="A",
|
||||
max_cycles=3,
|
||||
tags=["test"],
|
||||
)
|
||||
runner = BenchmarkRunner()
|
||||
metrics = await runner.run([scenario])
|
||||
assert len(metrics.results) == 1
|
||||
r = metrics.results[0]
|
||||
assert r.scenario_name == "Test Walk"
|
||||
assert r.cycles_used == 3 # no predicate, runs all cycles
|
||||
assert r.success is True # no predicate = success if survived
|
||||
assert r.wall_time_ms >= 0
|
||||
assert r.llm_calls == 9 # 3 cycles * 3 calls
|
||||
assert r.metabolic_cost > 0
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_run_with_goal_predicate(self):
|
||||
"""Runner stops early when goal predicate is satisfied."""
|
||||
|
||||
def always_true(actions, location):
|
||||
return True
|
||||
|
||||
scenario = BenchmarkScenario(
|
||||
name="Instant Win",
|
||||
description="Predicate satisfied immediately",
|
||||
start_location="A",
|
||||
max_cycles=100,
|
||||
goal_predicate=always_true,
|
||||
tags=["test"],
|
||||
)
|
||||
runner = BenchmarkRunner()
|
||||
metrics = await runner.run([scenario])
|
||||
r = metrics.results[0]
|
||||
assert r.success is True
|
||||
assert r.cycles_used == 1 # Stopped at first cycle
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_run_with_failing_predicate(self):
|
||||
"""Scenario fails when predicate never satisfied."""
|
||||
|
||||
def never_true(actions, location):
|
||||
return False
|
||||
|
||||
scenario = BenchmarkScenario(
|
||||
name="Impossible",
|
||||
description="Predicate never satisfied",
|
||||
start_location="A",
|
||||
max_cycles=5,
|
||||
goal_predicate=never_true,
|
||||
tags=["test"],
|
||||
)
|
||||
runner = BenchmarkRunner()
|
||||
metrics = await runner.run([scenario])
|
||||
r = metrics.results[0]
|
||||
assert r.success is False
|
||||
assert r.cycles_used == 5
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_run_multiple_scenarios(self):
|
||||
"""Runner handles multiple scenarios in sequence."""
|
||||
scenarios = [
|
||||
BenchmarkScenario(
|
||||
name=f"Scenario {i}",
|
||||
description=f"Test {i}",
|
||||
start_location="A",
|
||||
max_cycles=2,
|
||||
tags=["test"],
|
||||
)
|
||||
for i in range(3)
|
||||
]
|
||||
runner = BenchmarkRunner()
|
||||
metrics = await runner.run(scenarios)
|
||||
assert len(metrics.results) == 3
|
||||
assert metrics.total_time_ms >= 0
|
||||
assert metrics.timestamp
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_metrics_commit_sha(self):
|
||||
"""Runner captures git SHA in metrics."""
|
||||
scenario = BenchmarkScenario(
|
||||
name="SHA Test",
|
||||
description="Check SHA capture",
|
||||
start_location="A",
|
||||
max_cycles=1,
|
||||
tags=["test"],
|
||||
)
|
||||
runner = BenchmarkRunner()
|
||||
metrics = await runner.run([scenario])
|
||||
# SHA may or may not be available in test env; just ensure no crash
|
||||
assert isinstance(metrics.commit_sha, str)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_builtin_scenarios_run(self):
|
||||
"""All built-in scenarios run without crashing."""
|
||||
# Use just 2 cycles each to keep tests fast
|
||||
scenarios = [
|
||||
BenchmarkScenario(
|
||||
name=s.name,
|
||||
description=s.description,
|
||||
start_location=s.start_location,
|
||||
goal_location=s.goal_location,
|
||||
entities=list(s.entities),
|
||||
events=list(s.events),
|
||||
max_cycles=2, # Override for speed
|
||||
goal_predicate=None, # Skip predicate for smoke test
|
||||
tags=list(s.tags),
|
||||
)
|
||||
for s in BUILTIN_SCENARIOS
|
||||
]
|
||||
runner = BenchmarkRunner()
|
||||
metrics = await runner.run(scenarios)
|
||||
assert len(metrics.results) == len(BUILTIN_SCENARIOS)
|
||||
# All should succeed (no predicate + survived = pass)
|
||||
for r in metrics.results:
|
||||
assert r.success is True
|
||||
assert r.error is None
|
||||
5
tox.ini
5
tox.ini
@@ -87,6 +87,11 @@ description = Live LLM tests via Ollama (requires running Ollama)
|
||||
commands =
|
||||
pytest tests/ -q --tb=short -m ollama --timeout=120
|
||||
|
||||
[testenv:benchmark]
|
||||
description = Agent performance regression benchmark suite
|
||||
commands =
|
||||
python scripts/run_benchmarks.py {posargs}
|
||||
|
||||
# ── CI / Coverage ────────────────────────────────────────────────────────────
|
||||
|
||||
[testenv:ci]
|
||||
|
||||
Reference in New Issue
Block a user