# forked from Rockachopa/Timmy-time-dashboard
#!/usr/bin/env python3
"""Run the agent performance regression benchmark suite.

Usage::

    python scripts/run_benchmarks.py                       # all scenarios
    python scripts/run_benchmarks.py --tags navigation     # filter by tag
    python scripts/run_benchmarks.py --output results/benchmarks.jsonl
    python scripts/run_benchmarks.py --compare results/benchmarks.jsonl

Exit codes:

    0 — all scenarios passed
    1 — one or more scenarios failed
"""
from __future__ import annotations

import argparse
import asyncio
import sys
from pathlib import Path

# Ensure src/ is on the path when invoked directly
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

from infrastructure.world.benchmark.metrics import BenchmarkMetrics, load_history
from infrastructure.world.benchmark.runner import BenchmarkRunner
from infrastructure.world.benchmark.scenarios import load_scenarios


def parse_args() -> argparse.Namespace:
    """Parse command-line options for the benchmark suite.

    Returns:
        Namespace with ``tags`` (list of tag strings or None),
        ``output`` (Path or None) and ``compare`` (Path or None).
    """
    parser = argparse.ArgumentParser(
        description="Agent performance regression benchmark suite",
    )
    parser.add_argument(
        "--tags",
        nargs="*",
        default=None,
        help="Filter scenarios by tag (e.g. navigation quest)",
    )
    # The two remaining options are both optional Path arguments; register
    # them from a small table to keep the declarations in one place.
    path_options = (
        ("--output", "JSONL file to append results to"),
        ("--compare", "JSONL file with baseline results for regression comparison"),
    )
    for flag, help_text in path_options:
        parser.add_argument(flag, type=Path, default=None, help=help_text)
    return parser.parse_args()


async def main() -> int:
    """Run the benchmark suite and report results.

    Loads scenarios (optionally filtered by ``--tags``), runs them through
    :class:`BenchmarkRunner`, prints a summary, optionally appends results
    to a JSONL file (``--output``) and compares against a recorded baseline
    (``--compare``).

    Returns:
        Process exit code: 0 when every scenario passed, 1 otherwise
        (including the case where no scenario matches the tag filter).
    """
    args = parse_args()

    scenarios = load_scenarios(tags=args.tags)
    if not scenarios:
        print("No matching scenarios found.")
        return 1

    print(f"Running {len(scenarios)} benchmark scenario(s)...\n")

    runner = BenchmarkRunner()
    metrics = await runner.run(scenarios)

    print(metrics.summary())

    if args.output:
        metrics.save(args.output)

    if args.compare:
        history = load_history(args.compare)
        if history:
            # Hoisted out of the per-scenario loop below: the original code
            # re-imported ScenarioResult on every iteration.
            from infrastructure.world.benchmark.metrics import (
                ScenarioResult,
                compare_runs,
            )

            # Reconstruct baseline from the first entry returned by
            # load_history.  NOTE(review): the original comment called this
            # the "last recorded run" but indexes history[0] — confirm
            # load_history's ordering (history[0] vs history[-1]).
            last = history[0]
            baseline = BenchmarkMetrics(
                timestamp=last.get("timestamp", ""),
                commit_sha=last.get("commit_sha", ""),
                total_time_ms=last.get("total_time_ms", 0),
            )
            for s in last.get("scenarios", []):
                baseline.results.append(
                    ScenarioResult(
                        scenario_name=s["scenario_name"],
                        success=s["success"],
                        cycles_used=s["cycles_used"],
                        max_cycles=s["max_cycles"],
                        wall_time_ms=s.get("wall_time_ms", 0),
                        llm_calls=s.get("llm_calls", 0),
                        metabolic_cost=s.get("metabolic_cost", 0.0),
                    )
                )
            print()
            print(compare_runs(metrics, baseline))

    return 0 if metrics.fail_count == 0 else 1


# Script entry point: run the async suite and propagate its result
# (0 = all scenarios passed, 1 = failures) as the process exit status.
if __name__ == "__main__":
    sys.exit(asyncio.run(main()))