forked from Rockachopa/Timmy-time-dashboard
[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
This commit is contained in:
committed by
rockachopa
parent
6e65b53f3a
commit
9e08e87312
211
timmy-benchmark/levels/level_1_tic_tac_toe.py
Normal file
211
timmy-benchmark/levels/level_1_tic_tac_toe.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""Level 1: Board State Tracking — Tic-Tac-Toe.
|
||||
|
||||
Tests whether the model can maintain game state across turns, select
|
||||
legal moves, and exhibit basic strategic awareness.
|
||||
Maps to: Bannerlord board state / campaign map tracking.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Benchmark level metadata.
LEVEL = 1
NAME = "Board State Tracking (Tic-Tac-Toe)"
DESCRIPTION = "Model must track a tic-tac-toe board and make legal, strategic moves."

# System message sent with every scenario: explains the 0-8 board numbering
# and demands a bare JSON reply of the form {"move": ..., "reason": ...}.
# NOTE(review): any leading whitespace on the board-diagram lines could not be
# recovered from the rendered diff this was taken from — confirm against the
# repository copy.
SYSTEM_PROMPT = (
    "You are a strategic AI playing tic-tac-toe. The board is a 3x3 grid.\n"
    "Positions are numbered 0-8 left-to-right, top-to-bottom:\n"
    "0|1|2\n"
    "3|4|5\n"
    "6|7|8\n"
    "\n"
    "You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only.\n"
    'Format: {"move": <position 0-8>, "reason": "<brief reason>"}'
)
|
||||
|
||||
|
||||
# Fixed test positions. Each entry carries:
#   description - human-readable label used in verbose output
#   board       - nine cells, index 0-8, holding "X", "O" or None (empty)
#   player      - the mark the model is asked to play
#   prompt      - user message sent to the model
#   check       - callable(move, board) -> bool; True when the move is the
#                 expected one for this scenario
#   check_desc  - explanation printed when the check fails
SCENARIOS = [
    {
        "description": "Empty board — opening move",
        "board": [None] * 9,
        "player": "X",
        "prompt": (
            "Board state: [null,null,null,null,null,null,null,null,null]. "
            "You are X. It is your turn. Choose a move. "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        # Any empty in-range cell is acceptable as an opening.
        "check": lambda move, board: move in range(9) and board[move] is None,
        "check_desc": "Move must be a valid empty position (0-8)",
    },
    {
        "description": "Block opponent's winning move",
        "board": [
            "O", None, "O",
            None, "X", None,
            None, None, None,
        ],
        "player": "X",
        "prompt": (
            'Board: ["O",null,"O",null,"X",null,null,null,null]. '
            "O has positions 0 and 2. You are X. "
            "O will win on next turn unless you block. "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        # O threatens the top row; only cell 1 blocks it.
        "check": lambda move, board: move == 1,
        "check_desc": "Must block O's win at position 1",
    },
    {
        "description": "Take winning move",
        "board": [
            "X", None, "X",
            None, "O", None,
            None, "O", None,
        ],
        "player": "X",
        "prompt": (
            'Board: ["X",null,"X",null,"O",null,null,"O",null]. '
            "You are X. You have positions 0 and 2. "
            "You can win this turn. "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        # X completes the top row at cell 1.
        "check": lambda move, board: move == 1,
        "check_desc": "Must take winning move at position 1",
    },
    {
        "description": "Legal move on partially filled board",
        "board": [
            "X", "O", "X",
            "O", "X", "O",
            None, None, None,
        ],
        "player": "O",
        "prompt": (
            'Board: ["X","O","X","O","X","O",null,null,null]. '
            "You are O. Choose a legal move (positions 6, 7, or 8 are available). "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        # Only the bottom row is still empty.
        "check": lambda move, board: move in (6, 7, 8),
        "check_desc": "Move must be one of the empty positions: 6, 7, or 8",
    },
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScenarioResult:
|
||||
scenario_index: int
|
||||
description: str
|
||||
prompt: str
|
||||
raw_response: str
|
||||
parsed: dict | None
|
||||
valid_json: bool
|
||||
move_legal: bool
|
||||
move_correct: bool
|
||||
latency_ms: float
|
||||
error: str = ""
|
||||
|
||||
|
||||
@dataclass
class LevelResult:
    """Aggregate outcome of the level: per-scenario trials plus summary figures."""

    # Identity of the level; defaults come from the module constants.
    level: int = LEVEL
    name: str = NAME

    # One ScenarioResult per attempted scenario (fresh list per instance).
    trials: list[ScenarioResult] = field(default_factory=list)

    # Overall verdict and the score it was derived from.
    passed: bool = False
    score: float = 0.0

    # Latency summary in milliseconds.
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
|
||||
|
||||
|
||||
def _clean_response(raw: str) -> str:
    """Strip surrounding whitespace and Markdown code fences from a model reply.

    When the trimmed reply begins with a ``` fence, every fence line is
    dropped and the remaining lines are re-joined and trimmed again.  Replies
    that do not begin with a fence are only trimmed.

    Args:
        raw: Text returned by the model.

    Returns:
        The reply with outer whitespace and fence lines removed.
    """
    text = raw.strip()
    if text.startswith("```"):
        # Compare against the left-stripped line so an indented closing fence
        # is removed too; otherwise it would be left in and break JSON parsing.
        body_lines = [
            line for line in text.splitlines()
            if not line.lstrip().startswith("```")
        ]
        text = "\n".join(body_lines).strip()
    return text
|
||||
|
||||
|
||||
def _coerce_move(value: Any) -> int | None:
    """Return *value* as an integer move, or ``None`` when it is not usable.

    Integers pass through.  Strings are accepted when, after trimming, they
    consist of decimal digits with at most one leading ``-`` (some models emit
    ``"4"`` instead of ``4``).  Booleans are rejected even though ``bool`` is a
    subclass of ``int``, so a reply of ``true`` is never scored as position 1.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, str):
        text = value.strip()
        digits = text[1:] if text.startswith("-") else text
        # isdecimal() only admits characters int() can parse, so inputs such
        # as "--4" or a superscript digit are rejected instead of raising.
        if digits.isdecimal():
            return int(text)
    return None


def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every scenario in ``SCENARIOS`` against *model* and score the replies.

    For each scenario the system prompt and the scenario prompt are sent via
    ``client.chat``; the reply is cleaned, decoded as JSON, and its ``move``
    is checked for legality on the scenario board and against the scenario's
    own ``check`` callable.

    Args:
        client: Object exposing ``chat(model=..., messages=..., options=...)``
            whose result supports ``response["message"]["content"]``.
        model: Model identifier forwarded to ``client.chat``.
        verbose: When true, print a summary line for each scenario.

    Returns:
        A ``LevelResult`` with one ``ScenarioResult`` per scenario.  ``score``
        is the fraction of scenarios answered with valid JSON and a legal
        move; ``passed`` requires a score of 1.0.  Latency figures cover only
        the calls that completed without raising.
    """
    result = LevelResult()
    latencies: list[float] = []

    for i, scenario in enumerate(SCENARIOS):
        # perf_counter is monotonic, so system clock adjustments cannot
        # distort (or make negative) the measured latency.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # Benchmark boundary: record the failure for this scenario and
            # keep going rather than aborting the whole level.
            latency_ms = (time.perf_counter() - t0) * 1000
            result.trials.append(
                ScenarioResult(
                    scenario_index=i,
                    description=scenario["description"],
                    prompt=scenario["prompt"],
                    raw_response="",
                    parsed=None,
                    valid_json=False,
                    move_legal=False,
                    move_correct=False,
                    latency_ms=latency_ms,
                    error=str(exc),
                )
            )
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue

        latencies.append(latency_ms)

        # A reply without text content would otherwise crash the cleaner.
        if not isinstance(raw, str):
            raw = "" if raw is None else str(raw)

        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        move_legal = False
        move_correct = False
        error = ""

        try:
            decoded = json.loads(cleaned)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        else:
            valid_json = True
            if isinstance(decoded, dict):
                parsed = decoded
                move = _coerce_move(parsed.get("move"))
                if move is not None:
                    board = scenario["board"]
                    move_legal = 0 <= move <= 8 and board[move] is None
                    move_correct = scenario["check"](move, board)
            else:
                # Valid JSON but not an object (e.g. a bare number or list):
                # there is no "move" key to read, so score it as a miss.
                error = f"Expected a JSON object, got {type(decoded).__name__}"

        result.trials.append(
            ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                prompt=scenario["prompt"],
                raw_response=raw,
                parsed=parsed,
                valid_json=valid_json,
                move_legal=move_legal,
                move_correct=move_correct,
                latency_ms=latency_ms,
                error=error,
            )
        )

        if verbose:
            status = "PASS" if (valid_json and move_legal) else "FAIL"
            correct_str = "CORRECT" if move_correct else "suboptimal"
            move_val = parsed.get("move", "?") if parsed else "?"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({correct_str}) "
                f"| move={move_val} | {latency_ms:.0f}ms"
            )
            if not move_correct and valid_json:
                print(f" Expected: {scenario['check_desc']}")

    # Pass criteria: all moves must be valid JSON + legal
    legal_moves = sum(1 for t in result.trials if t.valid_json and t.move_legal)
    result.score = legal_moves / len(SCENARIOS) if SCENARIOS else 0.0
    result.passed = result.score >= 1.0

    if latencies:
        latencies_sorted = sorted(latencies)
        # p50 is the upper-middle sample; p99 is reported as the slowest
        # sample, which is what it amounts to with only a handful of calls.
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        result.latency_p99_ms = latencies_sorted[-1]

    return result
|
||||
Reference in New Issue
Block a user