Timmy-time-dashboard/scripts/run_benchmarks.py
Alexander Whitestone 49990e6aec
Some checks failed
Tests / lint (pull_request) Successful in 16s
Tests / test (pull_request) Failing after 13m58s
feat: add agent performance regression benchmark suite
Implement standardised Morrowind benchmark scenarios to detect agent
performance regressions after code changes.

- 5 built-in scenarios: navigation (Seyda Neen→Balmora, Balmora
  intra-city), quest (Fargoth's Ring), combat (Mudcrab), observation
- BenchmarkRunner executes scenarios through the heartbeat loop with
  MockWorldAdapter, tracking cycles, wall time, LLM calls, metabolic cost
- Goal predicates (reached_location, interacted_with) for early success
- BenchmarkMetrics with JSONL persistence and compare_runs() for
  regression detection
- CLI script (scripts/run_benchmarks.py) with tag filtering and
  baseline comparison
- tox -e benchmark environment for CI integration
- 31 unit tests covering scenarios, predicates, metrics, runner, and
  persistence

Fixes #1015

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 19:54:26 -04:00
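
For context on the pieces the bullets name: a scenario couples a tagged setup with a goal predicate that the runner evaluates each heartbeat cycle, so a run can end before its cycle budget. The sketch below is a plausible shape only, not the actual module: the dataclass fields, the WorldState alias, and the 200-cycle budget are illustrative assumptions; only reached_location, the navigation tag, and the notion of max_cycles come from the commit itself.

from dataclasses import dataclass
from typing import Callable

# Hypothetical stand-in for whatever state MockWorldAdapter exposes.
WorldState = dict

@dataclass
class BenchmarkScenario:
    name: str
    tags: list[str]
    max_cycles: int                     # hard cycle budget enforced by the runner
    goal: Callable[[WorldState], bool]  # early-success predicate

def reached_location(cell: str) -> Callable[[WorldState], bool]:
    """Predicate factory: true once the agent's current cell matches."""
    return lambda state: state.get("cell") == cell

# Illustrative instance modelled on the Seyda Neen→Balmora scenario.
NAVIGATION_SMOKE = BenchmarkScenario(
    name="navigation-seyda-neen-balmora",
    tags=["navigation"],
    max_cycles=200,
    goal=reached_location("Balmora"),
)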

108 lines · 3.2 KiB · Python

#!/usr/bin/env python3
"""Run the agent performance regression benchmark suite.
Usage::
python scripts/run_benchmarks.py # all scenarios
python scripts/run_benchmarks.py --tags navigation # filter by tag
python scripts/run_benchmarks.py --output results/benchmarks.jsonl
python scripts/run_benchmarks.py --compare results/benchmarks.jsonl
Exit codes:
0 — all scenarios passed
1 — one or more scenarios failed
"""
from __future__ import annotations

import argparse
import asyncio
import sys
from pathlib import Path

# Ensure src/ is on the path when invoked directly
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

from infrastructure.world.benchmark.metrics import (
    BenchmarkMetrics,
    ScenarioResult,
    compare_runs,
    load_history,
)
from infrastructure.world.benchmark.runner import BenchmarkRunner
from infrastructure.world.benchmark.scenarios import load_scenarios


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Agent performance regression benchmark suite",
    )
    parser.add_argument(
        "--tags",
        nargs="*",
        default=None,
        help="Filter scenarios by tag (e.g. navigation quest)",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="JSONL file to append results to",
    )
    parser.add_argument(
        "--compare",
        type=Path,
        default=None,
        help="JSONL file with baseline results for regression comparison",
    )
    return parser.parse_args()


async def main() -> int:
    args = parse_args()
    scenarios = load_scenarios(tags=args.tags)
    if not scenarios:
        print("No matching scenarios found.")
        return 1

    print(f"Running {len(scenarios)} benchmark scenario(s)...\n")
    runner = BenchmarkRunner()
    metrics = await runner.run(scenarios)
    print(metrics.summary())

    if args.output:
        metrics.save(args.output)

    if args.compare:
        history = load_history(args.compare)
        if history:
            # Reconstruct the baseline from the most recently recorded run;
            # results are appended to the JSONL file, so the last entry is newest.
            last = history[-1]
            baseline = BenchmarkMetrics(
                timestamp=last.get("timestamp", ""),
                commit_sha=last.get("commit_sha", ""),
                total_time_ms=last.get("total_time_ms", 0),
            )
            for s in last.get("scenarios", []):
                baseline.results.append(
                    ScenarioResult(
                        scenario_name=s["scenario_name"],
                        success=s["success"],
                        cycles_used=s["cycles_used"],
                        max_cycles=s["max_cycles"],
                        wall_time_ms=s.get("wall_time_ms", 0),
                        llm_calls=s.get("llm_calls", 0),
                        metabolic_cost=s.get("metabolic_cost", 0.0),
                    )
                )
            print()
            print(compare_runs(metrics, baseline))

    return 0 if metrics.fail_count == 0 else 1


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
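
The baseline-reconstruction block above implies the shape of each JSONL record that metrics.save() appends and load_history() reads back. A record consistent with the fields the script accesses would look roughly like this (all values invented for illustration):

{"timestamp": "2026-03-22T19:54:26-04:00", "commit_sha": "49990e6aec", "total_time_ms": 838000, "scenarios": [{"scenario_name": "navigation-seyda-neen-balmora", "success": true, "cycles_used": 142, "max_cycles": 200, "wall_time_ms": 61250, "llm_calls": 58, "metabolic_cost": 12.5}]}

Older records that predate a field remain loadable: the script reads the optional per-scenario fields (wall_time_ms, llm_calls, metabolic_cost) with .get() and falls back to zero defaults.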