#!/usr/bin/env python3
"""Timmy Cognitive Benchmark Harness — Project Bannerlord M0.

Runs a 6-level cognitive benchmark against an Ollama model to assess
readiness for autonomous Bannerlord gameplay.

Usage:
    python run_benchmark.py --model qwen2.5:14b --verbose
    python run_benchmark.py --model qwen3:14b --levels 0,1,2
    python run_benchmark.py --model qwen2.5:14b --output results/my_run.json
"""

import argparse
import dataclasses
import json
import sys
import time
import traceback
from datetime import datetime, timezone
from pathlib import Path

try:
    import ollama
except ImportError:
    print("ERROR: 'ollama' package not installed. Run: pip install ollama", file=sys.stderr)
    sys.exit(1)

# Add this script's directory to sys.path so the level modules can be imported.
sys.path.insert(0, str(Path(__file__).parent))

from levels import level_0_coin_flip
from levels import level_1_tic_tac_toe
from levels import level_2_resource_mgmt
from levels import level_3_battle_tactics
from levels import level_4_trade_route
from levels import level_5_mini_campaign

ALL_LEVELS = [
    level_0_coin_flip,
    level_1_tic_tac_toe,
    level_2_resource_mgmt,
    level_3_battle_tactics,
    level_4_trade_route,
    level_5_mini_campaign,
]

# Pass criteria for the M1 gate.
M1_GATE_LEVELS = {0, 1}  # Must pass Level 0 and Level 1.
M1_LATENCY_THRESHOLD_MS = 10_000  # < 10 s per decision for L0-L1.


def _dataclass_to_dict(obj):
    """Recursively convert dataclass instances to dicts for JSON serialization."""
    if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
        # asdict() already recurses into nested dataclasses, lists, and dicts.
        return dataclasses.asdict(obj)
    if isinstance(obj, list):
        return [_dataclass_to_dict(i) for i in obj]
    if isinstance(obj, dict):
        return {k: _dataclass_to_dict(v) for k, v in obj.items()}
    return obj


def check_model_available(model: str) -> bool:
    """Return True if the model is available in the local Ollama instance."""
    try:
        models = ollama.list()
        model_names = [m["model"] for m in models.get("models", [])]
        # Also match when the query or the listing omits the tag.
        base_model = model.split(":")[0]
        return any(
            m == model or m.startswith(base_model + ":") or m == base_model
            for m in model_names
        )
    except Exception:
        return False


def run_benchmark(
    model: str,
    levels_to_run: list[int] | None = None,
    verbose: bool = False,
    skip_missing: bool = True,
) -> dict:
    """Run the benchmark and return a results dict."""
    if levels_to_run is None:
        levels_to_run = list(range(len(ALL_LEVELS)))

    print(f"\n{'=' * 60}")
    print(" Timmy Cognitive Benchmark — Project Bannerlord M0")
    print("=" * 60)
    print(f" Model: {model}")
    print(f" Levels: {levels_to_run}")
    print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60 + "\n")

    if not check_model_available(model):
        if skip_missing:
            print(f" WARNING: Model '{model}' not found in Ollama. Skipping.\n")
            return {
                "model": model,
                "skipped": True,
                "reason": f"Model '{model}' not available",
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }
        print(f" ERROR: Model '{model}' not found in Ollama.", file=sys.stderr)
        sys.exit(1)

    # The level modules drive the model through the module-level ollama client.
    client = ollama

    results = {
        "model": model,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "skipped": False,
        "levels": {},
        "summary": {},
    }
    level_results = {}
    total_start = time.time()

    for level_idx in levels_to_run:
        if not 0 <= level_idx < len(ALL_LEVELS):
            print(f" WARNING: Level {level_idx} does not exist, skipping.")
            continue
        module = ALL_LEVELS[level_idx]
        print(f"Level {module.LEVEL}: {module.NAME}")
        print(f" {module.DESCRIPTION}")
        try:
            level_result = module.run(client, model, verbose=verbose)
            level_results[level_idx] = level_result
            passed_str = "PASS" if level_result.passed else "FAIL"
            score_pct = f"{level_result.score * 100:.0f}%"
            lat_str = (
                f"p50={level_result.latency_p50_ms:.0f}ms "
                f"p99={level_result.latency_p99_ms:.0f}ms"
            )
            print(f" Result: {passed_str} | Score: {score_pct} | Latency: {lat_str}")
        except Exception as exc:
            print(f" ERROR running level {level_idx}: {exc}")
            traceback.print_exc()
        print()

    total_elapsed_s = time.time() - total_start

    # Build the summary and evaluate the M1 gate.
    m1_gate_passed = True
    m1_gate_notes = []
    for level_idx, lr in level_results.items():
        results["levels"][str(level_idx)] = _dataclass_to_dict(lr)
        if level_idx in M1_GATE_LEVELS:
            if not lr.passed:
                m1_gate_passed = False
                m1_gate_notes.append(f"Level {level_idx} FAILED (score={lr.score:.2f})")
            if lr.latency_p99_ms > M1_LATENCY_THRESHOLD_MS:
                m1_gate_passed = False
                m1_gate_notes.append(
                    f"Level {level_idx} latency too high "
                    f"(p99={lr.latency_p99_ms:.0f}ms > {M1_LATENCY_THRESHOLD_MS}ms)"
                )
    # A gate level that was requested but produced no result (e.g. its run
    # raised an exception) must not pass the gate by omission.
    for gate_level in sorted(M1_GATE_LEVELS):
        if gate_level in levels_to_run and gate_level not in level_results:
            m1_gate_passed = False
            m1_gate_notes.append(f"Level {gate_level} produced no result")

    results["summary"] = {
        "total_elapsed_s": round(total_elapsed_s, 1),
        "levels_run": levels_to_run,
        "levels_passed": [i for i, lr in level_results.items() if lr.passed],
        "levels_failed": [i for i, lr in level_results.items() if not lr.passed],
        "m1_gate_passed": m1_gate_passed,
        "m1_gate_notes": m1_gate_notes,
        "m1_latency_threshold_ms": M1_LATENCY_THRESHOLD_MS,
    }

    # Print the scorecard.
    print("=" * 60)
    print(f" SCORECARD — {model}")
    print("=" * 60)
    for level_idx in levels_to_run:
        if level_idx not in level_results:
            continue
        lr = level_results[level_idx]
        module = ALL_LEVELS[level_idx]
        passed_str = "✓ PASS" if lr.passed else "✗ FAIL"
        gate_str = " [M1 GATE]" if level_idx in M1_GATE_LEVELS else ""
        lat = f"{lr.latency_p50_ms:.0f}ms"
        print(f" L{level_idx}: {passed_str}{gate_str} | {lr.score * 100:.0f}% | {lat} | {module.NAME}")
    print("─" * 60)
    gate_str = "✓ M1 GATE PASSED" if m1_gate_passed else "✗ M1 GATE FAILED"
    print(f" {gate_str}")
    for note in m1_gate_notes:
        print(f" → {note}")
    print(f" Total time: {total_elapsed_s:.1f}s")
    print("=" * 60 + "\n")

    return results
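
# Level-module contract (illustrative). The harness only assumes what it
# touches above: each module in ALL_LEVELS exposes LEVEL, NAME, and
# DESCRIPTION, plus a run() whose result has .passed, .score,
# .latency_p50_ms, and .latency_p99_ms. A minimal sketch of a conforming
# module follows; all names are hypothetical, and the real levels/ package
# may structure this differently:
#
#   from dataclasses import dataclass
#
#   LEVEL = 0
#   NAME = "Coin Flip"
#   DESCRIPTION = "Call a fair coin; checks basic instruction-following."
#
#   @dataclass
#   class LevelResult:
#       passed: bool
#       score: float            # fraction of trials correct, 0.0-1.0
#       latency_p50_ms: float   # median decision latency
#       latency_p99_ms: float   # tail decision latency
#
#   def run(client, model: str, verbose: bool = False) -> LevelResult:
#       ...  # drive trials through client.chat() / client.generate()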


def main():
    parser = argparse.ArgumentParser(
        description="Timmy Cognitive Benchmark Harness — Project Bannerlord M0"
    )
    parser.add_argument("--model", required=True,
                        help="Ollama model name (e.g. qwen2.5:14b)")
    parser.add_argument("--levels", default=None,
                        help="Comma-separated level indices (default: all)")
    parser.add_argument("--verbose", action="store_true",
                        help="Show per-trial details")
    parser.add_argument(
        "--output", default=None,
        help="Output JSON path (default: results/<model>_<timestamp>.json)",
    )
    # BooleanOptionalAction also generates --no-skip-missing, so the flag can
    # be switched off to error out instead of skipping.
    parser.add_argument(
        "--skip-missing", action=argparse.BooleanOptionalAction, default=True,
        help="Skip instead of erroring if the model is not available",
    )
    args = parser.parse_args()

    levels_to_run = None
    if args.levels:
        try:
            levels_to_run = [int(x.strip()) for x in args.levels.split(",")]
        except ValueError:
            print(f"ERROR: --levels must be comma-separated integers, got: {args.levels}",
                  file=sys.stderr)
            sys.exit(1)

    results = run_benchmark(
        model=args.model,
        levels_to_run=levels_to_run,
        verbose=args.verbose,
        skip_missing=args.skip_missing,
    )

    # Save results.
    if args.output:
        output_path = Path(args.output)
    else:
        results_dir = Path(__file__).parent / "results"
        safe_model = args.model.replace(":", "_").replace("/", "_")
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = results_dir / f"{safe_model}_{ts}.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"Results saved to: {output_path}")

    # Exit non-zero if the M1 gate failed, so CI can gate on this script.
    if not results.get("skipped") and not results.get("summary", {}).get("m1_gate_passed", True):
        sys.exit(1)


if __name__ == "__main__":
    main()
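
# Programmatic use (illustrative): the same entry point can be driven from
# another script or a notebook, assuming this file is importable as
# run_benchmark:
#
#   from run_benchmark import run_benchmark
#   results = run_benchmark("qwen2.5:14b", levels_to_run=[0, 1], verbose=True)
#   print(results["summary"]["m1_gate_passed"])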