"""Level 1: Board State Tracking — Tic-Tac-Toe. Tests whether the model can maintain game state across turns, select legal moves, and exhibit basic strategic awareness. Maps to: Bannerlord board state / campaign map tracking. """ import json import time from dataclasses import dataclass, field from typing import Any LEVEL = 1 NAME = "Board State Tracking (Tic-Tac-Toe)" DESCRIPTION = "Model must track a tic-tac-toe board and make legal, strategic moves." SYSTEM_PROMPT = """You are a strategic AI playing tic-tac-toe. The board is a 3x3 grid. Positions are numbered 0-8 left-to-right, top-to-bottom: 0|1|2 3|4|5 6|7|8 You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only. Format: {"move": , "reason": ""}""" SCENARIOS = [ { "description": "Empty board — opening move", "board": [None, None, None, None, None, None, None, None, None], "player": "X", "prompt": ( 'Board state: [null,null,null,null,null,null,null,null,null]. ' 'You are X. It is your turn. Choose a move. ' 'Respond: {"move": <0-8>, "reason": ""}' ), "check": lambda move, board: move in range(9) and board[move] is None, "check_desc": "Move must be a valid empty position (0-8)", }, { "description": "Block opponent's winning move", "board": ["O", None, "O", None, "X", None, None, None, None], "player": "X", "prompt": ( 'Board: ["O",null,"O",null,"X",null,null,null,null]. ' "O has positions 0 and 2. You are X. " "O will win on next turn unless you block. " 'Respond: {"move": <0-8>, "reason": ""}' ), "check": lambda move, board: move == 1, # Must block at position 1 "check_desc": "Must block O's win at position 1", }, { "description": "Take winning move", "board": ["X", None, "X", None, "O", None, None, "O", None], "player": "X", "prompt": ( 'Board: ["X",null,"X",null,"O",null,null,"O",null]. ' "You are X. You have positions 0 and 2. " "You can win this turn. " 'Respond: {"move": <0-8>, "reason": ""}' ), "check": lambda move, board: move == 1, # Win at position 1 "check_desc": "Must take winning move at position 1", }, { "description": "Legal move on partially filled board", "board": ["X", "O", "X", "O", "X", "O", None, None, None], "player": "O", "prompt": ( 'Board: ["X","O","X","O","X","O",null,null,null]. ' "You are O. Choose a legal move (positions 6, 7, or 8 are available). " 'Respond: {"move": <0-8>, "reason": ""}' ), "check": lambda move, board: move in [6, 7, 8], "check_desc": "Move must be one of the empty positions: 6, 7, or 8", }, ] @dataclass class ScenarioResult: scenario_index: int description: str prompt: str raw_response: str parsed: dict | None valid_json: bool move_legal: bool move_correct: bool latency_ms: float error: str = "" @dataclass class LevelResult: level: int = LEVEL name: str = NAME trials: list[ScenarioResult] = field(default_factory=list) passed: bool = False score: float = 0.0 latency_p50_ms: float = 0.0 latency_p99_ms: float = 0.0 def _clean_response(raw: str) -> str: raw = raw.strip() if raw.startswith("```"): lines = raw.splitlines() lines = [l for l in lines if not l.startswith("```")] raw = "\n".join(lines).strip() return raw def run(client: Any, model: str, verbose: bool = False) -> LevelResult: result = LevelResult() latencies = [] for i, scenario in enumerate(SCENARIOS): t0 = time.time() try: response = client.chat( model=model, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": scenario["prompt"]}, ], options={"temperature": 0.1}, ) raw = response["message"]["content"] latency_ms = (time.time() - t0) * 1000 except Exception as exc: latency_ms = (time.time() - t0) * 1000 sr = ScenarioResult( scenario_index=i, description=scenario["description"], prompt=scenario["prompt"], raw_response="", parsed=None, valid_json=False, move_legal=False, move_correct=False, latency_ms=latency_ms, error=str(exc), ) result.trials.append(sr) if verbose: print(f" Scenario {i}: ERROR — {exc}") continue latencies.append(latency_ms) cleaned = _clean_response(raw) parsed = None valid_json = False move_legal = False move_correct = False error = "" try: parsed = json.loads(cleaned) valid_json = True if "move" in parsed: move = parsed["move"] # Coerce string digits to int (some models emit "4" instead of 4) if isinstance(move, str) and move.strip().lstrip("-").isdigit(): move = int(move.strip()) if isinstance(move, int): board = scenario["board"] move_legal = 0 <= move <= 8 and board[move] is None move_correct = scenario["check"](move, board) except json.JSONDecodeError as exc: error = f"JSONDecodeError: {exc}" sr = ScenarioResult( scenario_index=i, description=scenario["description"], prompt=scenario["prompt"], raw_response=raw, parsed=parsed, valid_json=valid_json, move_legal=move_legal, move_correct=move_correct, latency_ms=latency_ms, error=error, ) result.trials.append(sr) if verbose: status = "PASS" if (valid_json and move_legal) else "FAIL" correct_str = "CORRECT" if move_correct else "suboptimal" move_val = parsed.get("move", "?") if parsed else "?" print( f" Scenario {i} [{scenario['description']}]: {status} ({correct_str}) " f"| move={move_val} | {latency_ms:.0f}ms" ) if not move_correct and valid_json: print(f" Expected: {scenario['check_desc']}") # Pass criteria: all moves must be valid JSON + legal legal_moves = sum(1 for t in result.trials if t.valid_json and t.move_legal) result.score = legal_moves / len(SCENARIOS) result.passed = result.score >= 1.0 if latencies: latencies_sorted = sorted(latencies) result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2] result.latency_p99_ms = latencies_sorted[-1] return result