[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)

Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
2026-03-23 19:38:48 +00:00
parent 6e65b53f3a
commit 9e08e87312
12 changed files with 3068 additions and 0 deletions
--- a/timmy-benchmark/levels/level_1_tic_tac_toe.py
+++ b/timmy-benchmark/levels/level_1_tic_tac_toe.py
@@ -0,0 +1,211 @@
+"""Level 1: Board State Tracking — Tic-Tac-Toe.
+
+Tests whether the model can maintain game state across turns, select
+legal moves, and exhibit basic strategic awareness.
+Maps to: Bannerlord board state / campaign map tracking.
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+# Benchmark level metadata. LEVEL and NAME are the defaults of LevelResult's
+# `level` and `name` fields; DESCRIPTION is not read anywhere in this module.
+LEVEL = 1
+NAME = "Board State Tracking (Tic-Tac-Toe)"
+DESCRIPTION = "Model must track a tic-tac-toe board and make legal, strategic moves."
+
+# System message sent by run() with every scenario. It fixes the 0-8 cell
+# numbering (row-major) and asks for a bare JSON object with "move" and
+# "reason" keys; run() only inspects "move".
+SYSTEM_PROMPT = """You are a strategic AI playing tic-tac-toe. The board is a 3x3 grid.
+Positions are numbered 0-8 left-to-right, top-to-bottom:
+  0|1|2
+  3|4|5
+  6|7|8
+
+You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only.
+Format: {"move": <position 0-8>, "reason": "<brief reason>"}"""
+
+
+# Test positions presented to the model, one chat request each.
+#
+# Keys of every entry:
+#   description - short label, copied into the result and the verbose log
+#   board       - 9 cells in row-major order; "X"/"O" for taken, None for empty
+#   player      - side the model plays (informational; run() does not read it)
+#   prompt      - user message sent to the model (repeats the board as JSON)
+#   check       - callable(move, board) -> truthy when the move is the intended
+#                 answer; run() stores its result as `move_correct`
+#   check_desc  - human-readable expectation, printed in verbose mode when the
+#                 check fails
+#
+# Legality (cell in range and empty) is computed separately by run(); `check`
+# only decides whether the move was the *intended* one.
+SCENARIOS = [
+    {
+        # Any empty cell is accepted: this only probes basic legality.
+        "description": "Empty board — opening move",
+        "board": [None, None, None, None, None, None, None, None, None],
+        "player": "X",
+        "prompt": (
+            'Board state: [null,null,null,null,null,null,null,null,null]. '
+            'You are X. It is your turn. Choose a move. '
+            'Respond: {"move": <0-8>, "reason": "<why>"}'
+        ),
+        "check": lambda move, board: move in range(9) and board[move] is None,
+        "check_desc": "Move must be a valid empty position (0-8)",
+    },
+    {
+        # O holds 0 and 2 on the top row, so cell 1 is the only blocking move.
+        "description": "Block opponent's winning move",
+        "board": ["O", None, "O", None, "X", None, None, None, None],
+        "player": "X",
+        "prompt": (
+            'Board: ["O",null,"O",null,"X",null,null,null,null]. '
+            "O has positions 0 and 2. You are X. "
+            "O will win on next turn unless you block. "
+            'Respond: {"move": <0-8>, "reason": "<why>"}'
+        ),
+        "check": lambda move, board: move == 1,  # Must block at position 1
+        "check_desc": "Must block O's win at position 1",
+    },
+    {
+        # X holds 0 and 2 on the top row, so cell 1 completes the row.
+        "description": "Take winning move",
+        "board": ["X", None, "X", None, "O", None, None, "O", None],
+        "player": "X",
+        "prompt": (
+            'Board: ["X",null,"X",null,"O",null,null,"O",null]. '
+            "You are X. You have positions 0 and 2. "
+            "You can win this turn. "
+            'Respond: {"move": <0-8>, "reason": "<why>"}'
+        ),
+        "check": lambda move, board: move == 1,  # Win at position 1
+        "check_desc": "Must take winning move at position 1",
+    },
+    {
+        # Only the bottom row is free; any of its cells passes the check.
+        "description": "Legal move on partially filled board",
+        "board": ["X", "O", "X", "O", "X", "O", None, None, None],
+        "player": "O",
+        "prompt": (
+            'Board: ["X","O","X","O","X","O",null,null,null]. '
+            "You are O. Choose a legal move (positions 6, 7, or 8 are available). "
+            'Respond: {"move": <0-8>, "reason": "<why>"}'
+        ),
+        "check": lambda move, board: move in [6, 7, 8],
+        "check_desc": "Move must be one of the empty positions: 6, 7, or 8",
+    },
+]
+
+
+@dataclass
+class ScenarioResult:
+    """Outcome of sending one scenario to the model and grading its reply."""
+
+    scenario_index: int  # position of the scenario in SCENARIOS
+    description: str  # the scenario's "description" label
+    prompt: str  # user message that was sent
+    raw_response: str  # model reply as received; "" when the request failed
+    parsed: dict | None  # decoded JSON reply, or None when unavailable
+    valid_json: bool  # True when the reply decoded as JSON
+    move_legal: bool  # True when the move is 0-8 and targets an empty cell
+    move_correct: bool  # result of the scenario's own "check" callable
+    latency_ms: float  # elapsed time of the chat request, in milliseconds
+    error: str = ""  # request or parse failure message; empty when none
+
+
+@dataclass
+class LevelResult:
+    """Aggregate outcome of running every scenario of this level."""
+
+    level: int = LEVEL  # numeric level id (module constant LEVEL)
+    name: str = NAME  # display name (module constant NAME)
+    trials: list[ScenarioResult] = field(default_factory=list)  # one per scenario
+    passed: bool = False  # set by run(): True only when score reaches 1.0
+    score: float = 0.0  # set by run(): share of scenarios with valid JSON and a legal move
+    latency_p50_ms: float = 0.0  # set by run(): middle element of the sorted latencies
+    latency_p99_ms: float = 0.0  # set by run(): largest recorded latency
+
+
+def _clean_response(raw: str) -> str:
+    """Trim a model reply and drop Markdown code-fence lines around it.
+
+    Fence lines (those beginning with three backticks) are removed only when
+    the trimmed reply itself starts with a fence; any other reply is returned
+    with just its surrounding whitespace stripped.
+    """
+    text = raw.strip()
+    if not text.startswith("```"):
+        return text
+    body_lines = filter(lambda line: not line.startswith("```"), text.splitlines())
+    return "\n".join(body_lines).strip()
+
+
+def _coerce_move(value: Any) -> int | None:
+    """Return *value* as an integer board index, or ``None`` if it is not one.
+
+    Accepts JSON integers and strings holding an optionally negative decimal
+    integer (some models emit ``"4"`` instead of ``4``). Range checking is
+    left to the caller.
+    """
+    # bool is a subclass of int; JSON true/false is not a board position.
+    if isinstance(value, bool):
+        return None
+    if isinstance(value, int):
+        return value
+    if isinstance(value, str):
+        text = value.strip()
+        digits = text[1:] if text.startswith("-") else text
+        # isdecimal() admits only characters int() can parse, so inputs such
+        # as "--4" or "²" are rejected here instead of raising ValueError.
+        if digits.isdecimal():
+            return int(text)
+    return None
+
+
+def _grade_response(raw: str, scenario: dict) -> tuple[dict | None, bool, bool, bool, str]:
+    """Parse one model reply and grade it against *scenario*.
+
+    Returns ``(parsed, valid_json, move_legal, move_correct, error)``.
+    ``parsed`` is the decoded JSON object, or ``None`` when the reply is not
+    JSON or its root is not an object.
+    """
+    try:
+        payload = json.loads(_clean_response(raw))
+    except json.JSONDecodeError as exc:
+        return None, False, False, False, f"JSONDecodeError: {exc}"
+
+    # Valid JSON such as `4`, `null` or `[1]` has no "move" key to inspect.
+    if not isinstance(payload, dict):
+        kind = type(payload).__name__
+        return None, True, False, False, f"Expected a JSON object, got {kind}"
+
+    move_legal = False
+    move_correct = False
+    move = _coerce_move(payload.get("move"))
+    if move is not None:
+        board = scenario["board"]
+        move_legal = 0 <= move <= 8 and board[move] is None
+        move_correct = scenario["check"](move, board)
+    return payload, True, move_legal, move_correct, ""
+
+
+def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
+    """Run every scenario in SCENARIOS against *model* and grade the replies.
+
+    Args:
+        client: Object exposing ``chat(model=..., messages=..., options=...)``
+            whose return value supports ``response["message"]["content"]``
+            (looks like the Ollama client API; confirm against the caller).
+        model: Model identifier forwarded to ``client.chat``.
+        verbose: When true, print one status line per scenario.
+
+    Returns:
+        A LevelResult with one ScenarioResult per scenario. ``score`` is the
+        share of scenarios answered with valid JSON and a legal move, and
+        ``passed`` requires all of them. Latency statistics cover only the
+        requests that did not raise.
+    """
+    result = LevelResult()
+    latencies = []
+
+    for i, scenario in enumerate(SCENARIOS):
+        # perf_counter is monotonic, so clock adjustments cannot skew timings.
+        t0 = time.perf_counter()
+        try:
+            response = client.chat(
+                model=model,
+                messages=[
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": scenario["prompt"]},
+                ],
+                options={"temperature": 0.1},
+            )
+            # A missing (None) content is graded as invalid JSON, not a crash.
+            raw = response["message"]["content"] or ""
+        except Exception as exc:
+            # Best-effort benchmark: record the failure and keep going.
+            latency_ms = (time.perf_counter() - t0) * 1000
+            result.trials.append(
+                ScenarioResult(
+                    scenario_index=i,
+                    description=scenario["description"],
+                    prompt=scenario["prompt"],
+                    raw_response="",
+                    parsed=None,
+                    valid_json=False,
+                    move_legal=False,
+                    move_correct=False,
+                    latency_ms=latency_ms,
+                    error=str(exc),
+                )
+            )
+            if verbose:
+                print(f"  Scenario {i}: ERROR — {exc}")
+            continue
+
+        latency_ms = (time.perf_counter() - t0) * 1000
+        latencies.append(latency_ms)
+
+        parsed, valid_json, move_legal, move_correct, error = _grade_response(raw, scenario)
+
+        result.trials.append(
+            ScenarioResult(
+                scenario_index=i,
+                description=scenario["description"],
+                prompt=scenario["prompt"],
+                raw_response=raw,
+                parsed=parsed,
+                valid_json=valid_json,
+                move_legal=move_legal,
+                move_correct=move_correct,
+                latency_ms=latency_ms,
+                error=error,
+            )
+        )
+
+        if verbose:
+            status = "PASS" if (valid_json and move_legal) else "FAIL"
+            correct_str = "CORRECT" if move_correct else "suboptimal"
+            move_val = parsed.get("move", "?") if parsed else "?"
+            print(
+                f"  Scenario {i} [{scenario['description']}]: {status} ({correct_str}) "
+                f"| move={move_val} | {latency_ms:.0f}ms"
+            )
+            if not move_correct and valid_json:
+                print(f"    Expected: {scenario['check_desc']}")
+
+    # Pass criteria: all moves must be valid JSON + legal
+    legal_moves = sum(1 for t in result.trials if t.valid_json and t.move_legal)
+    result.score = legal_moves / len(SCENARIOS)
+    result.passed = result.score >= 1.0
+
+    if latencies:
+        latencies_sorted = sorted(latencies)
+        # With so few samples the "p99" is simply the slowest request, and
+        # the "p50" is the upper-middle element for even-sized samples.
+        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
+        result.latency_p99_ms = latencies_sorted[-1]
+
+    return result