Timmy-time-dashboard/scripts/run_benchmarks.py
Alexander Whitestone 49990e6aec
Some checks failed
Tests / lint (pull_request) Successful in 16s
Tests / test (pull_request) Failing after 13m58s
feat: add agent performance regression benchmark suite
Implement standardised Morrowind benchmark scenarios to detect agent
performance regressions after code changes.

- 5 built-in scenarios: navigation (Seyda Neen→Balmora, Balmora
  intra-city), quest (Fargoth's Ring), combat (Mudcrab), observation
- BenchmarkRunner executes scenarios through the heartbeat loop with
  MockWorldAdapter, tracking cycles, wall time, LLM calls, metabolic cost
- Goal predicates (reached_location, interacted_with) for early success
- BenchmarkMetrics with JSONL persistence and compare_runs() for
  regression detection
- CLI script (scripts/run_benchmarks.py) with tag filtering and
  baseline comparison
- tox -e benchmark environment for CI integration
- 31 unit tests covering scenarios, predicates, metrics, runner, and
  persistence

Fixes #1015

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 19:54:26 -04:00
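
For context on the pieces the bullets name: a scenario couples a tagged setup with a goal predicate that the runner evaluates each heartbeat cycle, so a run can end before its cycle budget. The sketch below is a plausible shape only, not the actual module: the dataclass fields, the WorldState alias, and the 200-cycle budget are illustrative assumptions; only reached_location, the navigation tag, and the notion of max_cycles come from the commit itself.

from dataclasses import dataclass
from typing import Callable

# Hypothetical stand-in for whatever state MockWorldAdapter exposes.
WorldState = dict

@dataclass
class BenchmarkScenario:
    name: str
    tags: list[str]
    max_cycles: int                     # hard cycle budget enforced by the runner
    goal: Callable[[WorldState], bool]  # early-success predicate

def reached_location(cell: str) -> Callable[[WorldState], bool]:
    """Predicate factory: true once the agent's current cell matches."""
    return lambda state: state.get("cell") == cell

# Illustrative instance modelled on the Seyda Neen→Balmora scenario.
NAVIGATION_SMOKE = BenchmarkScenario(
    name="navigation-seyda-neen-balmora",
    tags=["navigation"],
    max_cycles=200,
    goal=reached_location("Balmora"),
)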

108 lines · 3.2 KiB · Python

#!/usr/bin/env python3
"""Run the agent performance regression benchmark suite.
Usage::
python scripts/run_benchmarks.py # all scenarios
python scripts/run_benchmarks.py --tags navigation # filter by tag
python scripts/run_benchmarks.py --output results/benchmarks.jsonl
python scripts/run_benchmarks.py --compare results/benchmarks.jsonl
Exit codes:
0 — all scenarios passed
1 — one or more scenarios failed
"""
from __future__ import annotations

import argparse
import asyncio
import sys
from pathlib import Path

# Ensure src/ is on the path when invoked directly
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

from infrastructure.world.benchmark.metrics import (
    BenchmarkMetrics,
    ScenarioResult,
    compare_runs,
    load_history,
)
from infrastructure.world.benchmark.runner import BenchmarkRunner
from infrastructure.world.benchmark.scenarios import load_scenarios


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Agent performance regression benchmark suite",
    )
    parser.add_argument(
        "--tags",
        nargs="*",
        default=None,
        help="Filter scenarios by tag (e.g. navigation quest)",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="JSONL file to append results to",
    )
    parser.add_argument(
        "--compare",
        type=Path,
        default=None,
        help="JSONL file with baseline results for regression comparison",
    )
    return parser.parse_args()


async def main() -> int:
    args = parse_args()
    scenarios = load_scenarios(tags=args.tags)
    if not scenarios:
        print("No matching scenarios found.")
        return 1

    print(f"Running {len(scenarios)} benchmark scenario(s)...\n")
    runner = BenchmarkRunner()
    metrics = await runner.run(scenarios)
    print(metrics.summary())

    if args.output:
        metrics.save(args.output)

    if args.compare:
        history = load_history(args.compare)
        if history:
            # Reconstruct the baseline from the most recently recorded run;
            # results are appended to the JSONL file, so the last entry is newest.
            last = history[-1]
            baseline = BenchmarkMetrics(
                timestamp=last.get("timestamp", ""),
                commit_sha=last.get("commit_sha", ""),
                total_time_ms=last.get("total_time_ms", 0),
            )
            for s in last.get("scenarios", []):
                baseline.results.append(
                    ScenarioResult(
                        scenario_name=s["scenario_name"],
                        success=s["success"],
                        cycles_used=s["cycles_used"],
                        max_cycles=s["max_cycles"],
                        wall_time_ms=s.get("wall_time_ms", 0),
                        llm_calls=s.get("llm_calls", 0),
                        metabolic_cost=s.get("metabolic_cost", 0.0),
                    )
                )
            print()
            print(compare_runs(metrics, baseline))

    return 0 if metrics.fail_count == 0 else 1


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
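
The baseline-reconstruction block above implies the shape of each JSONL record that metrics.save() appends and load_history() reads back. A record consistent with the fields the script accesses would look roughly like this (all values invented for illustration):

{"timestamp": "2026-03-22T19:54:26-04:00", "commit_sha": "49990e6aec", "total_time_ms": 838000, "scenarios": [{"scenario_name": "navigation-seyda-neen-balmora", "success": true, "cycles_used": 142, "max_cycles": 200, "wall_time_ms": 61250, "llm_calls": 58, "metabolic_cost": 12.5}]}

Older records that predate a field remain loadable: the script reads the optional per-scenario fields (wall_time_ms, llm_calls, metabolic_cost) with .get() and falls back to zero defaults.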