1
0
This repository has been archived on 2026-03-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Timmy-time-dashboard/timmy-benchmark/run_benchmark.py
Alexander Whitestone 9e08e87312 [claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com>
Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
2026-03-23 19:38:48 +00:00

260 lines
8.5 KiB
Python

#!/usr/bin/env python3
"""Timmy Cognitive Benchmark Harness — Project Bannerlord M0.
Runs a 6-level cognitive benchmark against an Ollama model to assess
readiness for autonomous Bannerlord gameplay.
Usage:
python run_benchmark.py --model qwen2.5:14b --verbose
python run_benchmark.py --model qwen3:14b --levels 0,1,2
python run_benchmark.py --model qwen2.5:14b --output results/my_run.json
"""
import argparse
import dataclasses
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
try:
import ollama
except ImportError:
print("ERROR: 'ollama' package not installed. Run: pip install ollama", file=sys.stderr)
sys.exit(1)
# Add parent dir to path so levels can be imported
sys.path.insert(0, str(Path(__file__).parent))
from levels import level_0_coin_flip
from levels import level_1_tic_tac_toe
from levels import level_2_resource_mgmt
from levels import level_3_battle_tactics
from levels import level_4_trade_route
from levels import level_5_mini_campaign
# Ordered registry of benchmark level modules; the list index is the level
# number accepted by --levels (0 = easiest ... 5 = hardest). Each module is
# expected to expose LEVEL, NAME, DESCRIPTION and run(client, model, verbose=).
ALL_LEVELS = [
    level_0_coin_flip,
    level_1_tic_tac_toe,
    level_2_resource_mgmt,
    level_3_battle_tactics,
    level_4_trade_route,
    level_5_mini_campaign,
]
# Pass criteria for the M1 gate: both gate levels must pass AND stay under
# the latency ceiling (checked against each level's p99 decision latency).
M1_GATE_LEVELS = {0, 1}  # Must pass Level 0 and Level 1
M1_LATENCY_THRESHOLD_MS = 10_000  # < 10s per decision for L0-L1
def _dataclass_to_dict(obj):
    """Recursively convert dataclass instances to dicts for JSON serialization.

    Handles dataclass instances, plus lists/dicts that may contain them at
    any depth; all other values are returned unchanged.
    """
    if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
        # asdict() already recurses into nested dataclasses, lists, dicts and
        # tuples, so there is no need to re-walk its output.
        return dataclasses.asdict(obj)
    if isinstance(obj, list):
        return [_dataclass_to_dict(item) for item in obj]
    if isinstance(obj, dict):
        return {key: _dataclass_to_dict(value) for key, value in obj.items()}
    return obj
def check_model_available(model: str) -> bool:
    """Return True if the model is available in Ollama.

    Matching rules:
      - an exact name match always counts;
      - when the requested name carries no tag (e.g. "qwen2.5"), any installed
        tag of that base model counts (e.g. "qwen2.5:14b"), as does a bare
        installed base name.

    Returns False on any Ollama API error (e.g. daemon not running).
    """
    try:
        listing = ollama.list()
        installed = [m["model"] for m in listing.get("models", [])]
    except Exception:
        # Treat an unreachable or erroring Ollama daemon as "not available".
        return False
    base_model = model.split(":")[0]
    tag_requested = ":" in model
    for name in installed:
        if name == model:
            return True
        # BUG FIX: previously a request for "X:14b" matched any "X:*" install,
        # so a missing tag was reported as available. Only fall back to
        # base-name matching when the caller did not pin a specific tag.
        if not tag_requested and (name == base_model or name.startswith(base_model + ":")):
            return True
    return False
def run_benchmark(
    model: str,
    levels_to_run: list[int] | None = None,
    verbose: bool = False,
    skip_missing: bool = True,
) -> dict:
    """Run the benchmark and return a results dict.

    Args:
        model: Ollama model name, e.g. "qwen2.5:14b".
        levels_to_run: Indices into ALL_LEVELS to execute (default: all).
        verbose: Forwarded to each level module's run() for per-trial output.
        skip_missing: When True, return a {"skipped": True, ...} record if the
            model is not installed; when False, exit the process with status 1.

    Returns:
        Dict with keys "model", "timestamp", "skipped", "levels" (per-level
        results serialized to plain dicts) and "summary" (pass/fail lists plus
        the M1 gate verdict and notes).
    """
    if levels_to_run is None:
        levels_to_run = list(range(len(ALL_LEVELS)))

    print(f"\n{'=' * 60}")
    print(" Timmy Cognitive Benchmark — Project Bannerlord M0")
    print("=" * 60)
    print(f" Model: {model}")
    print(f" Levels: {levels_to_run}")
    print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'=' * 60}\n")

    if not check_model_available(model):
        if skip_missing:
            print(f" WARNING: Model '{model}' not found in Ollama. Skipping.\n")
            return {
                "model": model,
                "skipped": True,
                "reason": f"Model '{model}' not available",
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }
        else:
            print(f" ERROR: Model '{model}' not found in Ollama.", file=sys.stderr)
            sys.exit(1)

    # The ollama module itself acts as the client passed to each level.
    client = ollama
    results = {
        "model": model,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "skipped": False,
        "levels": {},
        "summary": {},
    }
    level_results = {}
    total_start = time.time()

    for level_idx in levels_to_run:
        if level_idx >= len(ALL_LEVELS):
            print(f" WARNING: Level {level_idx} does not exist, skipping.")
            continue
        module = ALL_LEVELS[level_idx]
        print(f"Level {module.LEVEL}: {module.NAME}")
        print(f" {module.DESCRIPTION}")
        try:
            level_result = module.run(client, model, verbose=verbose)
            level_results[level_idx] = level_result
            passed_str = "PASS" if level_result.passed else "FAIL"
            score_pct = f"{level_result.score * 100:.0f}%"
            lat_str = f"p50={level_result.latency_p50_ms:.0f}ms p99={level_result.latency_p99_ms:.0f}ms"
            print(f" Result: {passed_str} | Score: {score_pct} | Latency: {lat_str}")
        except Exception as exc:
            # A crashed level is reported but does not abort the run; it simply
            # has no entry in level_results (and so counts as not passed).
            print(f" ERROR running level {level_idx}: {exc}")
            import traceback
            traceback.print_exc()
        print()

    total_elapsed_s = time.time() - total_start

    # Build summary; the M1 gate requires every gate level to pass AND stay
    # under the p99 latency ceiling.
    m1_gate_passed = True
    m1_gate_notes = []
    for level_idx, lr in level_results.items():
        results["levels"][str(level_idx)] = _dataclass_to_dict(lr)
        if level_idx in M1_GATE_LEVELS:
            if not lr.passed:
                m1_gate_passed = False
                m1_gate_notes.append(f"Level {level_idx} FAILED (score={lr.score:.2f})")
            if lr.latency_p99_ms > M1_LATENCY_THRESHOLD_MS:
                m1_gate_passed = False
                m1_gate_notes.append(
                    f"Level {level_idx} latency too high "
                    f"(p99={lr.latency_p99_ms:.0f}ms > {M1_LATENCY_THRESHOLD_MS}ms)"
                )
    results["summary"] = {
        "total_elapsed_s": round(total_elapsed_s, 1),
        # NOTE(review): this is the *requested* list; invalid/crashed levels
        # still appear here even though they produced no result.
        "levels_run": levels_to_run,
        "levels_passed": [i for i, lr in level_results.items() if lr.passed],
        "levels_failed": [i for i, lr in level_results.items() if not lr.passed],
        "m1_gate_passed": m1_gate_passed,
        "m1_gate_notes": m1_gate_notes,
        "m1_latency_threshold_ms": M1_LATENCY_THRESHOLD_MS,
    }

    # Print scorecard
    print("=" * 60)
    print(f" SCORECARD — {model}")
    print("=" * 60)
    for level_idx in levels_to_run:
        if level_idx not in level_results:
            continue  # invalid index or crashed level: nothing to report
        lr = level_results[level_idx]
        module = ALL_LEVELS[level_idx]
        passed_str = "✓ PASS" if lr.passed else "✗ FAIL"
        gate_str = " [M1 GATE]" if level_idx in M1_GATE_LEVELS else ""
        lat = f"{lr.latency_p50_ms:.0f}ms"
        print(f" L{level_idx}: {passed_str}{gate_str} | {lr.score*100:.0f}% | {lat} | {module.NAME}")
    # BUG FIX: original printed f"{'' * 60}" — an empty string. A visual
    # separator between the per-level rows and the gate verdict was intended.
    print("-" * 60)
    gate_str = "✓ M1 GATE PASSED" if m1_gate_passed else "✗ M1 GATE FAILED"
    print(f" {gate_str}")
    if m1_gate_notes:
        for note in m1_gate_notes:
            print(note)
    print(f" Total time: {total_elapsed_s:.1f}s")
    print(f"{'=' * 60}\n")
    return results
def main():
    """CLI entry point: parse args, run the benchmark, save JSON results.

    Exits with status 1 when the M1 gate fails on a non-skipped run, so CI
    can gate directly on this script's exit code.
    """
    parser = argparse.ArgumentParser(
        description="Timmy Cognitive Benchmark Harness — Project Bannerlord M0"
    )
    parser.add_argument("--model", required=True, help="Ollama model name (e.g. qwen2.5:14b)")
    parser.add_argument("--levels", default=None, help="Comma-separated level indices (default: all)")
    parser.add_argument("--verbose", action="store_true", help="Show per-trial details")
    parser.add_argument(
        "--output", default=None,
        help="Output JSON path (default: results/<model>_<timestamp>.json)"
    )
    # BUG FIX: action="store_true" with default=True made this flag a no-op —
    # it could never be turned off. BooleanOptionalAction keeps --skip-missing
    # working unchanged and adds --no-skip-missing to request a hard error.
    parser.add_argument(
        "--skip-missing", action=argparse.BooleanOptionalAction, default=True,
        help="Skip instead of error if model not available"
    )
    args = parser.parse_args()

    levels_to_run = None
    if args.levels:
        try:
            levels_to_run = [int(x.strip()) for x in args.levels.split(",")]
        except ValueError:
            print(f"ERROR: --levels must be comma-separated integers, got: {args.levels}", file=sys.stderr)
            sys.exit(1)

    results = run_benchmark(
        model=args.model,
        levels_to_run=levels_to_run,
        verbose=args.verbose,
        skip_missing=args.skip_missing,
    )

    # Save results
    if args.output:
        output_path = Path(args.output)
    else:
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        # Make the model name filesystem-safe (tags use ':', some names use '/').
        safe_model = args.model.replace(":", "_").replace("/", "_")
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = results_dir / f"{safe_model}_{ts}.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        # default=str is a deliberate catch-all for any non-JSON-native values
        # (e.g. datetimes) a level may have left in its result payload.
        json.dump(results, f, indent=2, default=str)
    print(f"Results saved to: {output_path}")

    # Exit with non-zero if M1 gate failed
    if not results.get("skipped") and not results.get("summary", {}).get("m1_gate_passed", True):
        sys.exit(1)


if __name__ == "__main__":
    main()