Timmy-time-dashboard/timmy-benchmark/levels/level_1_tic_tac_toe.py

"""Level 1: Board State Tracking — Tic-Tac-Toe.

Tests whether the model can maintain game state across turns, select
legal moves, and exhibit basic strategic awareness.
Maps to: Bannerlord board state / campaign map tracking.
"""

import json
import time
from dataclasses import dataclass, field
from typing import Any

LEVEL = 1
NAME = "Board State Tracking (Tic-Tac-Toe)"
DESCRIPTION = "Model must track a tic-tac-toe board and make legal, strategic moves."

SYSTEM_PROMPT = """You are a strategic AI playing tic-tac-toe. The board is a 3x3 grid.
Positions are numbered 0-8 left-to-right, top-to-bottom:
  0|1|2
  3|4|5
  6|7|8

You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only.
Format: {"move": <position 0-8>, "reason": "<brief reason>"}"""


SCENARIOS = [
    {
        "description": "Empty board — opening move",
        "board": [None, None, None, None, None, None, None, None, None],
        "player": "X",
        "prompt": (
            'Board state: [null,null,null,null,null,null,null,null,null]. '
            'You are X. It is your turn. Choose a move. '
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move in range(9) and board[move] is None,
        "check_desc": "Move must be a valid empty position (0-8)",
    },
    {
        "description": "Block opponent's winning move",
        "board": ["O", None, "O", None, "X", None, None, None, None],
        "player": "X",
        "prompt": (
            'Board: ["O",null,"O",null,"X",null,null,null,null]. '
            "O has positions 0 and 2. You are X. "
            "O will win on next turn unless you block. "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move == 1,  # Must block at position 1
        "check_desc": "Must block O's win at position 1",
    },
    {
        "description": "Take winning move",
        "board": ["X", None, "X", None, "O", None, None, "O", None],
        "player": "X",
        "prompt": (
            'Board: ["X",null,"X",null,"O",null,null,"O",null]. '
            "You are X. You have positions 0 and 2. "
            "You can win this turn. "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move == 1,  # Win at position 1
        "check_desc": "Must take winning move at position 1",
    },
    {
        "description": "Legal move on partially filled board",
        "board": ["X", "O", "X", "O", "X", "O", None, None, None],
        "player": "O",
        "prompt": (
            'Board: ["X","O","X","O","X","O",null,null,null]. '
            "You are O. Choose a legal move (positions 6, 7, or 8 are available). "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move in [6, 7, 8],
        "check_desc": "Move must be one of the empty positions: 6, 7, or 8",
    },
]


@dataclass
class ScenarioResult:
    scenario_index: int
    description: str
    prompt: str
    raw_response: str
    parsed: dict | None
    valid_json: bool
    move_legal: bool
    move_correct: bool
    latency_ms: float
    error: str = ""


@dataclass
class LevelResult:
    level: int = LEVEL
    name: str = NAME
    trials: list[ScenarioResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0


def _clean_response(raw: str) -> str:
    raw = raw.strip()
    if raw.startswith("```"):
        lines = raw.splitlines()
        lines = [l for l in lines if not l.startswith("```")]
        raw = "\n".join(lines).strip()
    return raw


def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    result = LevelResult()
    latencies = []

    for i, scenario in enumerate(SCENARIOS):
        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.time() - t0) * 1000
        except Exception as exc:
            latency_ms = (time.time() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                prompt=scenario["prompt"],
                raw_response="",
                parsed=None,
                valid_json=False,
                move_legal=False,
                move_correct=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f"  Scenario {i}: ERROR — {exc}")
            continue

        latencies.append(latency_ms)

        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        move_legal = False
        move_correct = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True

            if "move" in parsed:
                move = parsed["move"]
                # Coerce string digits to int (some models emit "4" instead of 4)
                if isinstance(move, str) and move.strip().lstrip("-").isdigit():
                    move = int(move.strip())
                if isinstance(move, int):
                    board = scenario["board"]
                    move_legal = 0 <= move <= 8 and board[move] is None
                    move_correct = scenario["check"](move, board)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"

        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            prompt=scenario["prompt"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            move_legal=move_legal,
            move_correct=move_correct,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)

        if verbose:
            status = "PASS" if (valid_json and move_legal) else "FAIL"
            correct_str = "CORRECT" if move_correct else "suboptimal"
            move_val = parsed.get("move", "?") if parsed else "?"
            print(
                f"  Scenario {i} [{scenario['description']}]: {status} ({correct_str}) "
                f"| move={move_val} | {latency_ms:.0f}ms"
            )
            if not move_correct and valid_json:
                print(f"    Expected: {scenario['check_desc']}")

    # Pass criteria: all moves must be valid JSON + legal
    legal_moves = sum(1 for t in result.trials if t.valid_json and t.move_legal)
    result.score = legal_moves / len(SCENARIOS)
    result.passed = result.score >= 1.0

    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        result.latency_p99_ms = latencies_sorted[-1]

    return result