1
0
This repository has been archived on 2026-03-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Timmy-time-dashboard/timmy-benchmark/run_benchmark.py
Alexander Whitestone 9e08e87312 [claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com>
Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
2026-03-23 19:38:48 +00:00

260 lines
8.5 KiB
Python

#!/usr/bin/env python3
"""Timmy Cognitive Benchmark Harness — Project Bannerlord M0.
Runs a 6-level cognitive benchmark against an Ollama model to assess
readiness for autonomous Bannerlord gameplay.
Usage:
python run_benchmark.py --model qwen2.5:14b --verbose
python run_benchmark.py --model qwen3:14b --levels 0,1,2
python run_benchmark.py --model qwen2.5:14b --output results/my_run.json
"""
import argparse
import dataclasses
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
try:
import ollama
except ImportError:
print("ERROR: 'ollama' package not installed. Run: pip install ollama", file=sys.stderr)
sys.exit(1)
# Add parent dir to path so levels can be imported
sys.path.insert(0, str(Path(__file__).parent))
from levels import level_0_coin_flip
from levels import level_1_tic_tac_toe
from levels import level_2_resource_mgmt
from levels import level_3_battle_tactics
from levels import level_4_trade_route
from levels import level_5_mini_campaign
# Ordered registry of benchmark level modules; the list index is the level
# number accepted by --levels (0 = easiest ... 5 = hardest). Each module is
# expected to expose LEVEL, NAME, DESCRIPTION and run(client, model, verbose=).
ALL_LEVELS = [
    level_0_coin_flip,
    level_1_tic_tac_toe,
    level_2_resource_mgmt,
    level_3_battle_tactics,
    level_4_trade_route,
    level_5_mini_campaign,
]
# Pass criteria for the M1 gate: both gate levels must pass AND stay under
# the latency ceiling (checked against each level's p99 decision latency).
M1_GATE_LEVELS = {0, 1}  # Must pass Level 0 and Level 1
M1_LATENCY_THRESHOLD_MS = 10_000  # < 10s per decision for L0-L1
def _dataclass_to_dict(obj):
    """Recursively convert dataclass instances to dicts for JSON serialization.

    Handles dataclass instances, plus lists/dicts that may contain them at
    any depth; all other values are returned unchanged.
    """
    if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
        # asdict() already recurses into nested dataclasses, lists, dicts and
        # tuples, so there is no need to re-walk its output.
        return dataclasses.asdict(obj)
    if isinstance(obj, list):
        return [_dataclass_to_dict(item) for item in obj]
    if isinstance(obj, dict):
        return {key: _dataclass_to_dict(value) for key, value in obj.items()}
    return obj
def check_model_available(model: str) -> bool:
    """Return True if the model is available in Ollama.

    Matching rules:
      - an exact name match always counts;
      - when the requested name carries no tag (e.g. "qwen2.5"), any installed
        tag of that base model counts (e.g. "qwen2.5:14b"), as does a bare
        installed base name.

    Returns False on any Ollama API error (e.g. daemon not running).
    """
    try:
        listing = ollama.list()
        installed = [m["model"] for m in listing.get("models", [])]
    except Exception:
        # Treat an unreachable or erroring Ollama daemon as "not available".
        return False
    base_model = model.split(":")[0]
    tag_requested = ":" in model
    for name in installed:
        if name == model:
            return True
        # BUG FIX: previously a request for "X:14b" matched any "X:*" install,
        # so a missing tag was reported as available. Only fall back to
        # base-name matching when the caller did not pin a specific tag.
        if not tag_requested and (name == base_model or name.startswith(base_model + ":")):
            return True
    return False
def run_benchmark(
    model: str,
    levels_to_run: list[int] | None = None,
    verbose: bool = False,
    skip_missing: bool = True,
) -> dict:
    """Run the benchmark and return a results dict.

    Args:
        model: Ollama model name, e.g. "qwen2.5:14b".
        levels_to_run: Indices into ALL_LEVELS to execute (default: all).
        verbose: Forwarded to each level module's run() for per-trial output.
        skip_missing: When True, return a {"skipped": True, ...} record if the
            model is not installed; when False, exit the process with status 1.

    Returns:
        Dict with keys "model", "timestamp", "skipped", "levels" (per-level
        results serialized to plain dicts) and "summary" (pass/fail lists plus
        the M1 gate verdict and notes).
    """
    if levels_to_run is None:
        levels_to_run = list(range(len(ALL_LEVELS)))

    print(f"\n{'=' * 60}")
    print(" Timmy Cognitive Benchmark — Project Bannerlord M0")
    print("=" * 60)
    print(f" Model: {model}")
    print(f" Levels: {levels_to_run}")
    print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'=' * 60}\n")

    if not check_model_available(model):
        if skip_missing:
            print(f" WARNING: Model '{model}' not found in Ollama. Skipping.\n")
            return {
                "model": model,
                "skipped": True,
                "reason": f"Model '{model}' not available",
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }
        else:
            print(f" ERROR: Model '{model}' not found in Ollama.", file=sys.stderr)
            sys.exit(1)

    # The ollama module itself acts as the client passed to each level.
    client = ollama
    results = {
        "model": model,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "skipped": False,
        "levels": {},
        "summary": {},
    }
    level_results = {}
    total_start = time.time()

    for level_idx in levels_to_run:
        if level_idx >= len(ALL_LEVELS):
            print(f" WARNING: Level {level_idx} does not exist, skipping.")
            continue
        module = ALL_LEVELS[level_idx]
        print(f"Level {module.LEVEL}: {module.NAME}")
        print(f" {module.DESCRIPTION}")
        try:
            level_result = module.run(client, model, verbose=verbose)
            level_results[level_idx] = level_result
            passed_str = "PASS" if level_result.passed else "FAIL"
            score_pct = f"{level_result.score * 100:.0f}%"
            lat_str = f"p50={level_result.latency_p50_ms:.0f}ms p99={level_result.latency_p99_ms:.0f}ms"
            print(f" Result: {passed_str} | Score: {score_pct} | Latency: {lat_str}")
        except Exception as exc:
            # A crashed level is reported but does not abort the run; it simply
            # has no entry in level_results (and so counts as not passed).
            print(f" ERROR running level {level_idx}: {exc}")
            import traceback
            traceback.print_exc()
        print()

    total_elapsed_s = time.time() - total_start

    # Build summary; the M1 gate requires every gate level to pass AND stay
    # under the p99 latency ceiling.
    m1_gate_passed = True
    m1_gate_notes = []
    for level_idx, lr in level_results.items():
        results["levels"][str(level_idx)] = _dataclass_to_dict(lr)
        if level_idx in M1_GATE_LEVELS:
            if not lr.passed:
                m1_gate_passed = False
                m1_gate_notes.append(f"Level {level_idx} FAILED (score={lr.score:.2f})")
            if lr.latency_p99_ms > M1_LATENCY_THRESHOLD_MS:
                m1_gate_passed = False
                m1_gate_notes.append(
                    f"Level {level_idx} latency too high "
                    f"(p99={lr.latency_p99_ms:.0f}ms > {M1_LATENCY_THRESHOLD_MS}ms)"
                )
    results["summary"] = {
        "total_elapsed_s": round(total_elapsed_s, 1),
        # NOTE(review): this is the *requested* list; invalid/crashed levels
        # still appear here even though they produced no result.
        "levels_run": levels_to_run,
        "levels_passed": [i for i, lr in level_results.items() if lr.passed],
        "levels_failed": [i for i, lr in level_results.items() if not lr.passed],
        "m1_gate_passed": m1_gate_passed,
        "m1_gate_notes": m1_gate_notes,
        "m1_latency_threshold_ms": M1_LATENCY_THRESHOLD_MS,
    }

    # Print scorecard
    print("=" * 60)
    print(f" SCORECARD — {model}")
    print("=" * 60)
    for level_idx in levels_to_run:
        if level_idx not in level_results:
            continue  # invalid index or crashed level: nothing to report
        lr = level_results[level_idx]
        module = ALL_LEVELS[level_idx]
        passed_str = "✓ PASS" if lr.passed else "✗ FAIL"
        gate_str = " [M1 GATE]" if level_idx in M1_GATE_LEVELS else ""
        lat = f"{lr.latency_p50_ms:.0f}ms"
        print(f" L{level_idx}: {passed_str}{gate_str} | {lr.score*100:.0f}% | {lat} | {module.NAME}")
    # BUG FIX: original printed f"{'' * 60}" — an empty string. A visual
    # separator between the per-level rows and the gate verdict was intended.
    print("-" * 60)
    gate_str = "✓ M1 GATE PASSED" if m1_gate_passed else "✗ M1 GATE FAILED"
    print(f" {gate_str}")
    if m1_gate_notes:
        for note in m1_gate_notes:
            print(note)
    print(f" Total time: {total_elapsed_s:.1f}s")
    print(f"{'=' * 60}\n")
    return results
def main():
    """CLI entry point: parse args, run the benchmark, save JSON results.

    Exits with status 1 when the M1 gate fails on a non-skipped run, so CI
    can gate directly on this script's exit code.
    """
    parser = argparse.ArgumentParser(
        description="Timmy Cognitive Benchmark Harness — Project Bannerlord M0"
    )
    parser.add_argument("--model", required=True, help="Ollama model name (e.g. qwen2.5:14b)")
    parser.add_argument("--levels", default=None, help="Comma-separated level indices (default: all)")
    parser.add_argument("--verbose", action="store_true", help="Show per-trial details")
    parser.add_argument(
        "--output", default=None,
        help="Output JSON path (default: results/<model>_<timestamp>.json)"
    )
    # BUG FIX: action="store_true" with default=True made this flag a no-op —
    # it could never be turned off. BooleanOptionalAction keeps --skip-missing
    # working unchanged and adds --no-skip-missing to request a hard error.
    parser.add_argument(
        "--skip-missing", action=argparse.BooleanOptionalAction, default=True,
        help="Skip instead of error if model not available"
    )
    args = parser.parse_args()

    levels_to_run = None
    if args.levels:
        try:
            levels_to_run = [int(x.strip()) for x in args.levels.split(",")]
        except ValueError:
            print(f"ERROR: --levels must be comma-separated integers, got: {args.levels}", file=sys.stderr)
            sys.exit(1)

    results = run_benchmark(
        model=args.model,
        levels_to_run=levels_to_run,
        verbose=args.verbose,
        skip_missing=args.skip_missing,
    )

    # Save results
    if args.output:
        output_path = Path(args.output)
    else:
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        # Make the model name filesystem-safe (tags use ':', some names use '/').
        safe_model = args.model.replace(":", "_").replace("/", "_")
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = results_dir / f"{safe_model}_{ts}.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        # default=str is a deliberate catch-all for any non-JSON-native values
        # (e.g. datetimes) a level may have left in its result payload.
        json.dump(results, f, indent=2, default=str)
    print(f"Results saved to: {output_path}")

    # Exit with non-zero if M1 gate failed
    if not results.get("skipped") and not results.get("summary", {}).get("m1_gate_passed", True):
        sys.exit(1)


if __name__ == "__main__":
    main()