#!/usr/bin/env python3
"""Run the agent performance regression benchmark suite.

Usage::

    python scripts/run_benchmarks.py                          # all scenarios
    python scripts/run_benchmarks.py --tags navigation        # filter by tag
    python scripts/run_benchmarks.py --output results/benchmarks.jsonl
    python scripts/run_benchmarks.py --compare results/benchmarks.jsonl

``--output`` and ``--compare`` may point at the same file: the baseline is
read before the current run is appended, so the comparison is always made
against a previously recorded run.

Exit codes:
    0 — all scenarios passed
    1 — one or more scenarios failed, or no scenarios matched the filter
"""

from __future__ import annotations

import argparse
import asyncio
import sys
from pathlib import Path
from typing import Any

# Ensure src/ is on the path when invoked directly
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

from infrastructure.world.benchmark.metrics import (  # noqa: E402
    BenchmarkMetrics,
    ScenarioResult,
    compare_runs,
    load_history,
)
from infrastructure.world.benchmark.runner import BenchmarkRunner  # noqa: E402
from infrastructure.world.benchmark.scenarios import load_scenarios  # noqa: E402


def parse_args() -> argparse.Namespace:
    """Parse command-line options for the benchmark run.

    Returns:
        Namespace with ``tags`` (list of str or None), ``output`` (Path or
        None) and ``compare`` (Path or None).
    """
    parser = argparse.ArgumentParser(
        description="Agent performance regression benchmark suite",
    )
    parser.add_argument(
        "--tags",
        nargs="*",
        default=None,
        help="Filter scenarios by tag (e.g. navigation quest)",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="JSONL file to append results to",
    )
    parser.add_argument(
        "--compare",
        type=Path,
        default=None,
        help="JSONL file with baseline results for regression comparison",
    )
    return parser.parse_args()


def _baseline_from_record(record: dict[str, Any]) -> BenchmarkMetrics:
    """Rebuild a ``BenchmarkMetrics`` object from one recorded history entry.

    Run-level fields fall back to empty/zero values when absent. For each
    entry under ``"scenarios"`` the keys ``scenario_name``, ``success``,
    ``cycles_used`` and ``max_cycles`` are required (a missing one raises
    ``KeyError``); the remaining per-scenario fields default to zero.
    """
    baseline = BenchmarkMetrics(
        timestamp=record.get("timestamp", ""),
        commit_sha=record.get("commit_sha", ""),
        total_time_ms=record.get("total_time_ms", 0),
    )
    for entry in record.get("scenarios", []):
        baseline.results.append(
            ScenarioResult(
                scenario_name=entry["scenario_name"],
                success=entry["success"],
                cycles_used=entry["cycles_used"],
                max_cycles=entry["max_cycles"],
                wall_time_ms=entry.get("wall_time_ms", 0),
                llm_calls=entry.get("llm_calls", 0),
                metabolic_cost=entry.get("metabolic_cost", 0.0),
            )
        )
    return baseline


async def main() -> int:
    """Run the selected scenarios, report results, and return the exit code.

    Returns:
        0 when every scenario passed; 1 when at least one failed or when no
        scenario matched the requested tags.
    """
    args = parse_args()

    scenarios = load_scenarios(tags=args.tags)
    if not scenarios:
        print("No matching scenarios found.")
        return 1

    print(f"Running {len(scenarios)} benchmark scenario(s)...\n")

    runner = BenchmarkRunner()
    metrics = await runner.run(scenarios)

    print(metrics.summary())

    # Read the baseline BEFORE saving: when --output and --compare name the
    # same file, saving first would make the freshly appended run its own
    # baseline and hide every regression.
    history = load_history(args.compare) if args.compare else []

    if args.output:
        metrics.save(args.output)

    if history:
        # history[0] is treated as the most recently recorded run
        # (assumes load_history returns newest-first — TODO confirm).
        baseline = _baseline_from_record(history[0])
        print()
        print(compare_runs(metrics, baseline))

    return 0 if metrics.fail_count == 0 else 1


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))