# Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com>
# Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
#!/usr/bin/env python3
"""Timmy Cognitive Benchmark Harness — Project Bannerlord M0.

Runs a 6-level cognitive benchmark against an Ollama model to assess
readiness for autonomous Bannerlord gameplay.

Usage:
    python run_benchmark.py --model qwen2.5:14b --verbose
    python run_benchmark.py --model qwen3:14b --levels 0,1,2
    python run_benchmark.py --model qwen2.5:14b --output results/my_run.json
"""

import argparse
import dataclasses
import json
import os  # NOTE(review): appears unused in this file — confirm before removing
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

# The 'ollama' client is a hard requirement; fail fast with an actionable
# message rather than a bare ImportError traceback.
try:
    import ollama
except ImportError:
    print("ERROR: 'ollama' package not installed. Run: pip install ollama", file=sys.stderr)
    sys.exit(1)

# Add parent dir to path so levels can be imported
sys.path.insert(0, str(Path(__file__).parent))

from levels import level_0_coin_flip
from levels import level_1_tic_tac_toe
from levels import level_2_resource_mgmt
from levels import level_3_battle_tactics
from levels import level_4_trade_route
from levels import level_5_mini_campaign

# Ordered level modules; list index doubles as the level number used on the
# CLI (--levels) and throughout run_benchmark(). Each module is expected to
# expose LEVEL, NAME, DESCRIPTION, and run(client, model, verbose=...).
ALL_LEVELS = [
    level_0_coin_flip,
    level_1_tic_tac_toe,
    level_2_resource_mgmt,
    level_3_battle_tactics,
    level_4_trade_route,
    level_5_mini_campaign,
]

# Pass criteria for M1 gate
M1_GATE_LEVELS = {0, 1}  # Must pass Level 0 and Level 1
M1_LATENCY_THRESHOLD_MS = 10_000  # < 10s per decision for L0-L1
|
def _dataclass_to_dict(obj):
|
|
"""Recursively convert dataclass instances to dicts for JSON serialization."""
|
|
if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
|
|
return {k: _dataclass_to_dict(v) for k, v in dataclasses.asdict(obj).items()}
|
|
if isinstance(obj, list):
|
|
return [_dataclass_to_dict(i) for i in obj]
|
|
if isinstance(obj, dict):
|
|
return {k: _dataclass_to_dict(v) for k, v in obj.items()}
|
|
return obj
|
|
|
|
|
|
def check_model_available(model: str) -> bool:
    """Return True if *model* is available in the local Ollama registry.

    Matches the exact "name:tag" string, the bare base name, or any tag of
    the same base model. Any API failure (daemon down, incompatible client)
    is treated as "not available".
    """
    try:
        listing = ollama.list()
        installed = [entry["model"] for entry in listing.get("models", [])]
        # Also accept a match on the base name (tag stripped).
        base = model.split(":")[0]
        for name in installed:
            if name in (model, base) or name.startswith(base + ":"):
                return True
        return False
    except Exception:
        # Best-effort probe: never raise out of an availability check.
        return False
|
|
|
|
|
|
def run_benchmark(
    model: str,
    levels_to_run: list[int] | None = None,
    verbose: bool = False,
    skip_missing: bool = True,
) -> dict:
    """Run the benchmark and return a results dict.

    Args:
        model: Ollama model name (e.g. "qwen2.5:14b").
        levels_to_run: Level indices to execute; None means all levels.
        verbose: Forwarded to each level's run() for per-trial output.
        skip_missing: If True, return a {"skipped": True} result when the
            model is not available; if False, exit(1) instead.

    Returns:
        Dict with "model", "timestamp", "skipped", per-level results under
        "levels" (keyed by stringified level index), and a "summary" section
        containing the M1 gate verdict and notes.
    """
    if levels_to_run is None:
        levels_to_run = list(range(len(ALL_LEVELS)))

    print(f"\n{'=' * 60}")
    print(" Timmy Cognitive Benchmark — Project Bannerlord M0")
    print(f"{'=' * 60}")
    print(f" Model: {model}")
    print(f" Levels: {levels_to_run}")
    print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'=' * 60}\n")

    if not check_model_available(model):
        if skip_missing:
            print(f" WARNING: Model '{model}' not found in Ollama. Skipping.\n")
            return {
                "model": model,
                "skipped": True,
                "reason": f"Model '{model}' not available",
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }
        else:
            print(f" ERROR: Model '{model}' not found in Ollama.", file=sys.stderr)
            sys.exit(1)

    client = ollama

    results = {
        "model": model,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "skipped": False,
        "levels": {},
        "summary": {},
    }

    level_results = {}
    # BUG FIX: previously a level that raised was silently dropped from
    # level_results, so a crashing M1-gate level could still "pass" the gate.
    # Track errored levels so they fail the gate and appear in the summary.
    errored_levels: dict[int, str] = {}
    total_start = time.time()

    for level_idx in levels_to_run:
        if level_idx >= len(ALL_LEVELS):
            print(f" WARNING: Level {level_idx} does not exist, skipping.")
            continue

        module = ALL_LEVELS[level_idx]
        print(f"Level {module.LEVEL}: {module.NAME}")
        print(f" {module.DESCRIPTION}")

        try:
            level_result = module.run(client, model, verbose=verbose)
            level_results[level_idx] = level_result

            passed_str = "PASS" if level_result.passed else "FAIL"
            score_pct = f"{level_result.score * 100:.0f}%"
            lat_str = f"p50={level_result.latency_p50_ms:.0f}ms p99={level_result.latency_p99_ms:.0f}ms"
            print(f" Result: {passed_str} | Score: {score_pct} | Latency: {lat_str}")

        except Exception as exc:
            errored_levels[level_idx] = str(exc)
            print(f" ERROR running level {level_idx}: {exc}")
            import traceback
            traceback.print_exc()

        print()

    total_elapsed_s = time.time() - total_start

    # Build summary
    m1_gate_passed = True
    m1_gate_notes = []

    for level_idx, lr in level_results.items():
        results["levels"][str(level_idx)] = _dataclass_to_dict(lr)

        if level_idx in M1_GATE_LEVELS:
            if not lr.passed:
                m1_gate_passed = False
                m1_gate_notes.append(f"Level {level_idx} FAILED (score={lr.score:.2f})")
            if lr.latency_p99_ms > M1_LATENCY_THRESHOLD_MS:
                m1_gate_passed = False
                m1_gate_notes.append(
                    f"Level {level_idx} latency too high "
                    f"(p99={lr.latency_p99_ms:.0f}ms > {M1_LATENCY_THRESHOLD_MS}ms)"
                )

    # A gate level that crashed must fail the gate too.
    for level_idx, err in errored_levels.items():
        if level_idx in M1_GATE_LEVELS:
            m1_gate_passed = False
            m1_gate_notes.append(f"Level {level_idx} ERRORED ({err})")

    results["summary"] = {
        "total_elapsed_s": round(total_elapsed_s, 1),
        "levels_run": levels_to_run,
        "levels_passed": [i for i, lr in level_results.items() if lr.passed],
        "levels_failed": [i for i, lr in level_results.items() if not lr.passed],
        "levels_errored": sorted(errored_levels),
        "m1_gate_passed": m1_gate_passed,
        "m1_gate_notes": m1_gate_notes,
        "m1_latency_threshold_ms": M1_LATENCY_THRESHOLD_MS,
    }

    # Print scorecard
    print(f"{'=' * 60}")
    print(f" SCORECARD — {model}")
    print(f"{'=' * 60}")

    for level_idx in levels_to_run:
        if level_idx not in level_results:
            continue
        lr = level_results[level_idx]
        module = ALL_LEVELS[level_idx]
        passed_str = "✓ PASS" if lr.passed else "✗ FAIL"
        gate_str = " [M1 GATE]" if level_idx in M1_GATE_LEVELS else ""
        lat = f"{lr.latency_p50_ms:.0f}ms"
        print(f" L{level_idx}: {passed_str}{gate_str} | {lr.score*100:.0f}% | {lat} | {module.NAME}")

    print(f"{'─' * 60}")
    gate_str = "✓ M1 GATE PASSED" if m1_gate_passed else "✗ M1 GATE FAILED"
    print(f" {gate_str}")
    if m1_gate_notes:
        for note in m1_gate_notes:
            print(f"   → {note}")
    print(f" Total time: {total_elapsed_s:.1f}s")
    print(f"{'=' * 60}\n")

    return results
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the benchmark, save JSON results.

    Exits non-zero when --levels is malformed, the model is missing (without
    --skip-missing), or the M1 gate fails — so CI can gate on this script.
    """
    parser = argparse.ArgumentParser(
        description="Timmy Cognitive Benchmark Harness — Project Bannerlord M0"
    )
    parser.add_argument("--model", required=True, help="Ollama model name (e.g. qwen2.5:14b)")
    parser.add_argument("--levels", default=None, help="Comma-separated level indices (default: all)")
    parser.add_argument("--verbose", action="store_true", help="Show per-trial details")
    parser.add_argument(
        "--output", default=None,
        help="Output JSON path (default: results/<model>_<timestamp>.json)"
    )
    # BUG FIX: the old action="store_true" with default=True meant the flag
    # was always True and could never be disabled. BooleanOptionalAction
    # keeps --skip-missing working and adds --no-skip-missing to turn it off.
    parser.add_argument(
        "--skip-missing", action=argparse.BooleanOptionalAction, default=True,
        help="Skip instead of error if model not available"
    )
    args = parser.parse_args()

    levels_to_run = None
    if args.levels:
        try:
            levels_to_run = [int(x.strip()) for x in args.levels.split(",")]
        except ValueError:
            print(f"ERROR: --levels must be comma-separated integers, got: {args.levels}", file=sys.stderr)
            sys.exit(1)

    results = run_benchmark(
        model=args.model,
        levels_to_run=levels_to_run,
        verbose=args.verbose,
        skip_missing=args.skip_missing,
    )

    # Save results
    if args.output:
        output_path = Path(args.output)
    else:
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        # Model names can contain ':' and '/', which are unsafe in filenames.
        safe_model = args.model.replace(":", "_").replace("/", "_")
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = results_dir / f"{safe_model}_{ts}.json"

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        # default=str is a deliberate catch-all for non-JSON-native values
        # (e.g. datetimes) that may appear in level results.
        json.dump(results, f, indent=2, default=str)

    print(f"Results saved to: {output_path}")

    # Exit with non-zero if M1 gate failed
    if not results.get("skipped") and not results.get("summary", {}).get("m1_gate_passed", True):
        sys.exit(1)


if __name__ == "__main__":
    main()
|