[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)

Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
2026-03-23 19:38:48 +00:00
parent 6e65b53f3a
commit 9e08e87312
12 changed files with 3068 additions and 0 deletions
--- a/timmy-benchmark/levels/init.py
+++ b/timmy-benchmark/levels/init.py
@@ -0,0 +1 @@
 """Cognitive benchmark levels for Project Bannerlord readiness testing."""
--- a/timmy-benchmark/levels/level_0_coin_flip.py
+++ b/timmy-benchmark/levels/level_0_coin_flip.py
@@ -0,0 +1,183 @@
 """Level 0: JSON Compliance — Coin Flip.
 Tests whether the model can reliably return well-formed JSON responses
 with a specific schema. This is the minimum bar for GABS tool calls.
 """
 import json
 import time
 from dataclasses import dataclass, field
 from typing import Any
 LEVEL = 0
 NAME = "JSON Compliance (Coin Flip)"
 DESCRIPTION = "Model must return valid JSON matching a strict schema on each trial."
 SYSTEM_PROMPT = """You are a strategic AI agent. You MUST respond ONLY with valid JSON.
 No markdown, no explanation, no code fences. Raw JSON only."""
 TRIALS = [
    {
        "prompt": (
            'A coin is flipped. Respond with exactly: {"choice": "heads"} or {"choice": "tails"}. '
            "Pick one. JSON only."
        ),
        "schema": {"choice": str},
        "valid_values": {"choice": ["heads", "tails"]},
    },
    {
        "prompt": (
            'You must attack or defend. Respond with: {"action": "attack", "confidence": 0.8} '
            'or {"action": "defend", "confidence": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.'
        ),
        "schema": {"action": str, "confidence": float},
        "valid_values": {"action": ["attack", "defend"]},
    },
    {
        "prompt": (
            'Choose a direction to march. Respond with exactly: '
            '{"direction": "north", "reason": "string explaining why"}. '
            "Pick north/south/east/west. JSON only."
        ),
        "schema": {"direction": str, "reason": str},
        "valid_values": {"direction": ["north", "south", "east", "west"]},
    },
 ]
@dataclass
 class TrialResult:
    trial_index: int
    prompt: str
    raw_response: str
    parsed: dict | None
    valid_json: bool
    schema_valid: bool
    value_valid: bool
    latency_ms: float
    error: str = ""
@dataclass
 class LevelResult:
    level: int = LEVEL
    name: str = NAME
    trials: list[TrialResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
 def _validate_schema(parsed: dict, schema: dict[str, type]) -> bool:
    for key, expected_type in schema.items():
        if key not in parsed:
            return False
        if not isinstance(parsed[key], expected_type):
            # Allow int where float is expected
            if expected_type is float and isinstance(parsed[key], int):
                continue
            return False
    return True
 def _validate_values(parsed: dict, valid_values: dict[str, list]) -> bool:
    for key, valid_list in valid_values.items():
        if key in parsed and parsed[key] not in valid_list:
            return False
    return True
 def _clean_response(raw: str) -> str:
    """Strip markdown fences if model wrapped JSON in them."""
    raw = raw.strip()
    if raw.startswith("```"):
        lines = raw.splitlines()
        # Remove first and last fence lines
        lines = [l for l in lines if not l.startswith("```")]
        raw = "\n".join(lines).strip()
    return raw
 def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    result = LevelResult()
    latencies = []
    for i, trial in enumerate(TRIALS):
        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": trial["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.time() - t0) * 1000
        except Exception as exc:
            latency_ms = (time.time() - t0) * 1000
            tr = TrialResult(
                trial_index=i,
                prompt=trial["prompt"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                value_valid=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(tr)
            if verbose:
                print(f"  Trial {i}: ERROR — {exc}")
            continue
        latencies.append(latency_ms)
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        value_valid = False
        error = ""
        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = _validate_schema(parsed, trial["schema"])
            value_valid = _validate_values(parsed, trial["valid_values"])
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        tr = TrialResult(
            trial_index=i,
            prompt=trial["prompt"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            value_valid=value_valid,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(tr)
        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            print(
                f"  Trial {i}: {status} | json={valid_json} schema={schema_valid} "
                f"value={value_valid} | {latency_ms:.0f}ms | {raw[:80]!r}"
            )
    passed_trials = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = passed_trials / len(TRIALS)
    result.passed = result.score >= 1.0  # Must pass all 3 trials
    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        result.latency_p99_ms = latencies_sorted[-1]
    return result
--- a/timmy-benchmark/levels/level_1_tic_tac_toe.py
+++ b/timmy-benchmark/levels/level_1_tic_tac_toe.py
@@ -0,0 +1,211 @@
 """Level 1: Board State Tracking — Tic-Tac-Toe.
 Tests whether the model can maintain game state across turns, select
 legal moves, and exhibit basic strategic awareness.
 Maps to: Bannerlord board state / campaign map tracking.
 """
 import json
 import time
 from dataclasses import dataclass, field
 from typing import Any
 LEVEL = 1
 NAME = "Board State Tracking (Tic-Tac-Toe)"
 DESCRIPTION = "Model must track a tic-tac-toe board and make legal, strategic moves."
 SYSTEM_PROMPT = """You are a strategic AI playing tic-tac-toe. The board is a 3x3 grid.
 Positions are numbered 0-8 left-to-right, top-to-bottom:
  0|1|2
  3|4|5
  6|7|8
 You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only.
 Format: {"move": <position 0-8>, "reason": "<brief reason>"}"""
 SCENARIOS = [
    {
        "description": "Empty board — opening move",
        "board": [None, None, None, None, None, None, None, None, None],
        "player": "X",
        "prompt": (
            'Board state: [null,null,null,null,null,null,null,null,null]. '
            'You are X. It is your turn. Choose a move. '
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move in range(9) and board[move] is None,
        "check_desc": "Move must be a valid empty position (0-8)",
    },
    {
        "description": "Block opponent's winning move",
        "board": ["O", None, "O", None, "X", None, None, None, None],
        "player": "X",
        "prompt": (
            'Board: ["O",null,"O",null,"X",null,null,null,null]. '
            "O has positions 0 and 2. You are X. "
            "O will win on next turn unless you block. "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move == 1,  # Must block at position 1
        "check_desc": "Must block O's win at position 1",
    },
    {
        "description": "Take winning move",
        "board": ["X", None, "X", None, "O", None, None, "O", None],
        "player": "X",
        "prompt": (
            'Board: ["X",null,"X",null,"O",null,null,"O",null]. '
            "You are X. You have positions 0 and 2. "
            "You can win this turn. "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move == 1,  # Win at position 1
        "check_desc": "Must take winning move at position 1",
    },
    {
        "description": "Legal move on partially filled board",
        "board": ["X", "O", "X", "O", "X", "O", None, None, None],
        "player": "O",
        "prompt": (
            'Board: ["X","O","X","O","X","O",null,null,null]. '
            "You are O. Choose a legal move (positions 6, 7, or 8 are available). "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move in [6, 7, 8],
        "check_desc": "Move must be one of the empty positions: 6, 7, or 8",
    },
 ]
@dataclass
 class ScenarioResult:
    scenario_index: int
    description: str
    prompt: str
    raw_response: str
    parsed: dict | None
    valid_json: bool
    move_legal: bool
    move_correct: bool
    latency_ms: float
    error: str = ""
@dataclass
 class LevelResult:
    level: int = LEVEL
    name: str = NAME
    trials: list[ScenarioResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
 def _clean_response(raw: str) -> str:
    raw = raw.strip()
    if raw.startswith("```"):
        lines = raw.splitlines()
        lines = [l for l in lines if not l.startswith("```")]
        raw = "\n".join(lines).strip()
    return raw
 def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    result = LevelResult()
    latencies = []
    for i, scenario in enumerate(SCENARIOS):
        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.time() - t0) * 1000
        except Exception as exc:
            latency_ms = (time.time() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                prompt=scenario["prompt"],
                raw_response="",
                parsed=None,
                valid_json=False,
                move_legal=False,
                move_correct=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f"  Scenario {i}: ERROR — {exc}")
            continue
        latencies.append(latency_ms)
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        move_legal = False
        move_correct = False
        error = ""
        try:
            parsed = json.loads(cleaned)
            valid_json = True
            if "move" in parsed:
                move = parsed["move"]
                # Coerce string digits to int (some models emit "4" instead of 4)
                if isinstance(move, str) and move.strip().lstrip("-").isdigit():
                    move = int(move.strip())
                if isinstance(move, int):
                    board = scenario["board"]
                    move_legal = 0 <= move <= 8 and board[move] is None
                    move_correct = scenario["check"](move, board)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            prompt=scenario["prompt"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            move_legal=move_legal,
            move_correct=move_correct,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)
        if verbose:
            status = "PASS" if (valid_json and move_legal) else "FAIL"
            correct_str = "CORRECT" if move_correct else "suboptimal"
            move_val = parsed.get("move", "?") if parsed else "?"
            print(
                f"  Scenario {i} [{scenario['description']}]: {status} ({correct_str}) "
                f"| move={move_val} | {latency_ms:.0f}ms"
            )
            if not move_correct and valid_json:
                print(f"    Expected: {scenario['check_desc']}")
    # Pass criteria: all moves must be valid JSON + legal
    legal_moves = sum(1 for t in result.trials if t.valid_json and t.move_legal)
    result.score = legal_moves / len(SCENARIOS)
    result.passed = result.score >= 1.0
    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        result.latency_p99_ms = latencies_sorted[-1]
    return result
--- a/timmy-benchmark/levels/level_2_resource_mgmt.py
+++ b/timmy-benchmark/levels/level_2_resource_mgmt.py
@@ -0,0 +1,213 @@
 """Level 2: Resource Management — Party Economy.
 Tests whether the model can allocate limited resources across competing
 priorities and adapt when constraints change.
 Maps to: Bannerlord party economy (troops, food, gold, morale).
 """
 import json
 import time
 from dataclasses import dataclass, field
 from typing import Any
 LEVEL = 2
 NAME = "Resource Management (Party Economy)"
 DESCRIPTION = "Model must allocate limited resources across troops, food, and equipment."
 SYSTEM_PROMPT = """You are a Bannerlord campaign advisor managing a party.
 Resources are limited — every decision has trade-offs.
 You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only."""
 SCENARIOS = [
    {
        "description": "Budget allocation under constraint",
        "prompt": (
            "You have 500 gold. Options:\n"
            "- Recruit 10 infantry: costs 300 gold, +10 combat strength\n"
            "- Buy food for 20 days: costs 200 gold, keeps morale stable\n"
            "- Repair armor: costs 150 gold, -20% casualty rate\n\n"
            "You cannot afford all three. Morale is currently CRITICAL (troops may desert).\n"
            'Choose 1-2 options. Respond: {"choices": ["option_a", ...], "gold_spent": <int>, "reason": "<why>"}\n'
            "Where option keys are: recruit_infantry, buy_food, repair_armor"
        ),
        "check": lambda r: (
            isinstance(r.get("choices"), list)
            and len(r["choices"]) >= 1
            and all(c in ["recruit_infantry", "buy_food", "repair_armor"] for c in r["choices"])
            and isinstance(r.get("gold_spent"), (int, float))
            and r.get("gold_spent", 9999) <= 500
        ),
        "check_desc": "choices must be valid options, gold_spent <= 500",
        "strategic_check": lambda r: "buy_food" in r.get("choices", []),
        "strategic_desc": "With CRITICAL morale, food should be prioritized",
    },
    {
        "description": "Troop tier upgrade decision",
        "prompt": (
            "Party status:\n"
            "- 15 Tier-1 recruits (weak, 30 upkeep/day)\n"
            "- 5 Tier-3 veterans (strong, 90 upkeep/day)\n"
            "- Daily income: 200 gold\n"
            "- Upcoming: raider camp attack (moderate difficulty)\n\n"
            "Options:\n"
            "- Upgrade 5 recruits to Tier-2 (costs 250 gold total)\n"
            "- Keep all current troops, save gold for emergencies\n"
            "- Dismiss 5 recruits to save upkeep\n\n"
            'Respond: {"action": "upgrade_recruits"|"save_gold"|"dismiss_recruits", '
            '"reason": "<why>", "expected_outcome": "<string>"}'
        ),
        "check": lambda r: (
            r.get("action") in ["upgrade_recruits", "save_gold", "dismiss_recruits"]
            and isinstance(r.get("reason"), str)
            and len(r.get("reason", "")) > 0
        ),
        "check_desc": "action must be one of the three options with a non-empty reason",
        "strategic_check": lambda r: r.get("action") in ["upgrade_recruits", "save_gold"],
        "strategic_desc": "Dismissing troops before a fight is suboptimal",
    },
    {
        "description": "Multi-turn planning horizon",
        "prompt": (
            "Current: 300 gold, 10 days of food, 20 troops\n"
            "Day 5: Must cross desert (costs 5 extra food days)\n"
            "Day 10: Reach town (can buy supplies)\n\n"
            "You need a 15-day food reserve to survive the journey.\n"
            "Food costs 10 gold/day. You have enough for 10 days now.\n\n"
            "How many extra food days do you buy today?\n"
            'Respond: {"extra_food_days": <int>, "cost": <int>, "remaining_gold": <int>, "reason": "<why>"}'
        ),
        "check": lambda r: (
            isinstance(r.get("extra_food_days"), (int, float))
            and isinstance(r.get("cost"), (int, float))
            and isinstance(r.get("remaining_gold"), (int, float))
        ),
        "check_desc": "Must include extra_food_days, cost, remaining_gold as numbers",
        "strategic_check": lambda r: r.get("extra_food_days", 0) >= 5,
        "strategic_desc": "Need at least 5 more days of food for desert crossing",
    },
 ]
@dataclass
 class ScenarioResult:
    scenario_index: int
    description: str
    raw_response: str
    parsed: dict | None
    valid_json: bool
    schema_valid: bool
    strategically_sound: bool
    latency_ms: float
    error: str = ""
@dataclass
 class LevelResult:
    level: int = LEVEL
    name: str = NAME
    trials: list[ScenarioResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
 def _clean_response(raw: str) -> str:
    raw = raw.strip()
    if raw.startswith("```"):
        lines = raw.splitlines()
        lines = [l for l in lines if not l.startswith("```")]
        raw = "\n".join(lines).strip()
    return raw
 def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    result = LevelResult()
    latencies = []
    for i, scenario in enumerate(SCENARIOS):
        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.time() - t0) * 1000
        except Exception as exc:
            latency_ms = (time.time() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                strategically_sound=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f"  Scenario {i}: ERROR — {exc}")
            continue
        latencies.append(latency_ms)
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""
        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = scenario["check"](parsed)
            if schema_valid:
                strategically_sound = scenario["strategic_check"](parsed)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            error = f"Validation error: {exc}"
        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            strategically_sound=strategically_sound,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)
        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f"  Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f"    Schema issue: {scenario['check_desc']}")
            if not strategically_sound and schema_valid:
                print(f"    Strategy note: {scenario['strategic_desc']}")
    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = valid_count / len(SCENARIOS)
    result.passed = result.score >= 0.67  # 2/3 scenarios
    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        result.latency_p99_ms = latencies_sorted[-1]
    return result
--- a/timmy-benchmark/levels/level_3_battle_tactics.py
+++ b/timmy-benchmark/levels/level_3_battle_tactics.py
@@ -0,0 +1,216 @@
 """Level 3: Battle Tactics — Formation Commands.
 Tests whether the model can issue coherent formation and tactical orders
 under simulated battlefield pressure with multiple unit types.
 Maps to: Bannerlord formation commands (charge, shield wall, skirmish, etc.).
 """
 import json
 import time
 from dataclasses import dataclass, field
 from typing import Any
 LEVEL = 3
 NAME = "Battle Tactics (Formation Commands)"
 DESCRIPTION = "Model must issue tactically sound formation orders under simulated battle conditions."
 SYSTEM_PROMPT = """You are a Bannerlord battle commander. Issue formation orders using these commands:
 - shield_wall: infantry forms defensive line (good vs ranged, slow advance)
 - charge: all-out attack (high casualties, breaks weak enemies fast)
 - skirmish: ranged units pepper enemy (good vs heavy infantry, needs distance)
 - advance: move forward holding formation (balanced)
 - flank_left / flank_right: cavalry sweeps around enemy side
 - fallback: retreat to regroup (when badly outnumbered)
 You MUST respond ONLY with valid JSON. No markdown. Raw JSON only."""
 SCENARIOS = [
    {
        "description": "Ranged vs infantry — defensive opening",
        "prompt": (
            "Situation: You have 20 archers + 10 infantry. Enemy has 30 heavy infantry, no ranged.\n"
            "Enemy is 200m away and advancing.\n"
            "Objective: Maximize casualties before melee contact.\n\n"
            'Issue orders for both units. Respond:\n'
            '{"infantry_order": "<command>", "archer_order": "<command>", '
            '"reason": "<tactical reasoning>", "expected_outcome": "<string>"}'
        ),
        "check": lambda r: (
            r.get("infantry_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
            and r.get("archer_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
            and isinstance(r.get("reason"), str)
        ),
        "check_desc": "Both orders must be valid commands",
        "strategic_check": lambda r: (
            r.get("archer_order") == "skirmish"
            and r.get("infantry_order") in ["shield_wall", "advance"]
        ),
        "strategic_desc": "Archers should skirmish while infantry holds (shield_wall or advance)",
    },
    {
        "description": "Outnumbered — retreat decision",
        "prompt": (
            "Situation: Your party (15 troops) has been ambushed.\n"
            "Enemy: 60 bandits, surrounding you on 3 sides.\n"
            "Your troops: 40% wounded. One escape route to the east.\n\n"
            'What is your command? Respond:\n'
            '{"order": "<command>", "direction": "east"|"west"|"north"|"south"|null, '
            '"reason": "<tactical reasoning>", "priority": "preserve_troops"|"fight_through"}'
        ),
        "check": lambda r: (
            r.get("order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
            and r.get("priority") in ["preserve_troops", "fight_through"]
        ),
        "check_desc": "order and priority must be valid values",
        "strategic_check": lambda r: (
            r.get("order") == "fallback"
            and r.get("priority") == "preserve_troops"
        ),
        "strategic_desc": "Outnumbered 4:1 with wounded troops — fallback is the sound choice",
    },
    {
        "description": "Cavalry flanking opportunity",
        "prompt": (
            "Situation: Main battle is engaged. Your infantry and enemy infantry are locked.\n"
            "You have 8 cavalry in reserve. Enemy left flank is unprotected.\n"
            "If cavalry hits the flank now, it will route enemy in ~30 seconds.\n\n"
            'Order for cavalry: Respond:\n'
            '{"cavalry_order": "<command>", "timing": "now"|"wait", '
            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
        ),
        "check": lambda r: (
            r.get("cavalry_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
            and r.get("timing") in ["now", "wait"]
            and r.get("risk") in ["low", "medium", "high"]
        ),
        "check_desc": "cavalry_order, timing, and risk must be valid values",
        "strategic_check": lambda r: (
            r.get("cavalry_order") in ["flank_left", "flank_right", "charge"]
            and r.get("timing") == "now"
        ),
        "strategic_desc": "Should capitalize on the flank opportunity immediately",
    },
 ]
@dataclass
 class ScenarioResult:
    scenario_index: int
    description: str
    raw_response: str
    parsed: dict | None
    valid_json: bool
    schema_valid: bool
    strategically_sound: bool
    latency_ms: float
    error: str = ""
@dataclass
 class LevelResult:
    level: int = LEVEL
    name: str = NAME
    trials: list[ScenarioResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
 def _clean_response(raw: str) -> str:
    raw = raw.strip()
    if raw.startswith("```"):
        lines = raw.splitlines()
        lines = [l for l in lines if not l.startswith("```")]
        raw = "\n".join(lines).strip()
    return raw
 def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    result = LevelResult()
    latencies = []
    for i, scenario in enumerate(SCENARIOS):
        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.2},
            )
            raw = response["message"]["content"]
            latency_ms = (time.time() - t0) * 1000
        except Exception as exc:
            latency_ms = (time.time() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                strategically_sound=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f"  Scenario {i}: ERROR — {exc}")
            continue
        latencies.append(latency_ms)
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""
        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = scenario["check"](parsed)
            if schema_valid:
                strategically_sound = scenario["strategic_check"](parsed)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            error = f"Validation error: {exc}"
        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            strategically_sound=strategically_sound,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)
        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f"  Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f"    Schema issue: {scenario['check_desc']}")
    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = valid_count / len(SCENARIOS)
    result.passed = result.score >= 0.67
    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        result.latency_p99_ms = latencies_sorted[-1]
    return result
--- a/timmy-benchmark/levels/level_4_trade_route.py
+++ b/timmy-benchmark/levels/level_4_trade_route.py
@@ -0,0 +1,223 @@
 """Level 4: Trade Route — Campaign Navigation.
 Tests multi-step planning ability: route optimization, trade-off analysis
 across time horizons, and adapting plans when conditions change.
 Maps to: Bannerlord campaign map navigation, caravans, and economy.
 """
 import json
 import time
 from dataclasses import dataclass, field
 from typing import Any
 LEVEL = 4
 NAME = "Trade Route (Campaign Navigation)"
 DESCRIPTION = "Model must plan optimal routes and adapt to changing conditions on the campaign map."
 SYSTEM_PROMPT = """You are a Bannerlord merchant lord planning campaign movements.
 Consider distance, profitability, risk, and timing.
 You MUST respond ONLY with valid JSON. No markdown. Raw JSON only."""
 SCENARIOS = [
    {
        "description": "Optimal trade route selection",
        "prompt": (
            "You are at Epicrotea with 500 gold and 20 days travel budget.\n\n"
            "Trade opportunities:\n"
            "- Route A: Epicrotea → Vlandia (3 days) → Sturgia (5 days back)\n"
            "  Sell grain in Vlandia: +300 gold. Buy furs in Sturgia: costs 200, sells for 400 in Calradia.\n"
            "  Total: +500 gold profit, 8 days.\n"
            "- Route B: Epicrotea → Calradia (2 days) → Aserai (4 days)\n"
            "  Sell iron in Calradia: +150 gold. Buy spice in Aserai: costs 300, sells for 600 in Empire.\n"
            "  Empire is 6 more days away. Total: +450 gold profit, 12 days.\n"
            "- Route C: Epicrotea → nearby village (1 day)\n"
            "  Buy cheap food: costs 100, sells for 180 in any city.\n"
            "  Total: +80 gold profit, 2 days. Repeatable.\n\n"
            'Choose route. Respond:\n'
            '{"route": "A"|"B"|"C", "expected_profit": <int>, "days_used": <int>, '
            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
        ),
        "check": lambda r: (
            r.get("route") in ["A", "B", "C"]
            and isinstance(r.get("expected_profit"), (int, float))
            and isinstance(r.get("days_used"), (int, float))
            and r.get("risk") in ["low", "medium", "high"]
        ),
        "check_desc": "route, expected_profit, days_used, risk must be valid",
        "strategic_check": lambda r: r.get("route") in ["A", "C"],  # A is best single trip, C is best if repeated
        "strategic_desc": "Route A has best profit/day ratio; C is best if multiple loops possible",
    },
    {
        "description": "Adapt plan when war declared",
        "prompt": (
            "You were heading to Vlandia to trade, 2 days into the journey.\n"
            "NEWS: Vlandia just declared war on your faction. Entering Vlandia territory is now dangerous.\n\n"
            "Your current position: borderlands, equidistant between:\n"
            "- Vlandia (2 days): Now at war — high risk of attack\n"
            "- Sturgia (3 days): Neutral — safe\n"
            "- Empire (4 days): Allied — very safe, good prices\n\n"
            "You have 400 gold of trade goods for the Vlandia market.\n"
            'What do you do? Respond:\n'
            '{"decision": "continue_to_vlandia"|"divert_to_sturgia"|"divert_to_empire", '
            '"reason": "<why>", "gold_at_risk": <int>}'
        ),
        "check": lambda r: (
            r.get("decision") in ["continue_to_vlandia", "divert_to_sturgia", "divert_to_empire"]
            and isinstance(r.get("gold_at_risk"), (int, float))
        ),
        "check_desc": "decision must be one of three options, gold_at_risk must be a number",
        "strategic_check": lambda r: r.get("decision") in ["divert_to_sturgia", "divert_to_empire"],
        "strategic_desc": "Should avoid active war zone — divert to safe destination",
    },
    {
        "description": "Multi-stop route planning with constraints",
        "prompt": (
            "Plan a 3-stop trading circuit starting and ending at Pravend.\n"
            "Budget: 800 gold. Time limit: 20 days.\n\n"
            "Available cities and travel times from Pravend:\n"
            "- Rhotae: 2 days (leather cheap, sells well in south)\n"
            "- Ortysia: 4 days (grain surplus — buy cheap)\n"
            "- Epicrotea: 3 days (iron market — buy/sell)\n"
            "- Pen Cannoc: 5 days (wine — high profit, far)\n\n"
            "Each stop takes 1 day for trading.\n"
            'Plan 3 stops. Respond:\n'
            '{"stops": ["<city1>", "<city2>", "<city3>"], '
            '"total_days": <int>, "estimated_profit": <int>, '
            '"reason": "<reasoning>"}'
        ),
        "check": lambda r: (
            isinstance(r.get("stops"), list)
            and len(r["stops"]) == 3
            and all(isinstance(s, str) for s in r["stops"])
            and isinstance(r.get("total_days"), (int, float))
            and r.get("total_days", 99) <= 20
            and isinstance(r.get("estimated_profit"), (int, float))
        ),
        "check_desc": "stops must be list of 3 strings, total_days <= 20, estimated_profit numeric",
        "strategic_check": lambda r: "Pen Cannoc" not in r.get("stops", []),  # Too far for 20 days
        "strategic_desc": "Pen Cannoc at 5 days each way is likely too far for a 20-day circuit",
    },
 ]
@dataclass
 class ScenarioResult:
    scenario_index: int
    description: str
    raw_response: str
    parsed: dict | None
    valid_json: bool
    schema_valid: bool
    strategically_sound: bool
    latency_ms: float
    error: str = ""
@dataclass
 class LevelResult:
    level: int = LEVEL
    name: str = NAME
    trials: list[ScenarioResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
 def _clean_response(raw: str) -> str:
    raw = raw.strip()
    if raw.startswith("```"):
        lines = raw.splitlines()
        lines = [l for l in lines if not l.startswith("```")]
        raw = "\n".join(lines).strip()
    return raw
 def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    result = LevelResult()
    latencies = []
    for i, scenario in enumerate(SCENARIOS):
        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.2},
            )
            raw = response["message"]["content"]
            latency_ms = (time.time() - t0) * 1000
        except Exception as exc:
            latency_ms = (time.time() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                strategically_sound=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f"  Scenario {i}: ERROR — {exc}")
            continue
        latencies.append(latency_ms)
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""
        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = scenario["check"](parsed)
            if schema_valid:
                strategically_sound = scenario["strategic_check"](parsed)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            error = f"Validation error: {exc}"
        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            strategically_sound=strategically_sound,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)
        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f"  Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f"    Schema issue: {scenario['check_desc']}")
    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = valid_count / len(SCENARIOS)
    result.passed = result.score >= 0.67
    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        result.latency_p99_ms = latencies_sorted[-1]
    return result
--- a/timmy-benchmark/levels/level_5_mini_campaign.py
+++ b/timmy-benchmark/levels/level_5_mini_campaign.py
@@ -0,0 +1,252 @@
 """Level 5: Mini Campaign — Full Campaign Loop.
 Tests multi-turn strategic coherence: the model must maintain state across
 several turns of a simulated Bannerlord campaign, making consistent decisions
 that build toward a long-term goal.
 Maps to: Full Bannerlord campaign loop — economy, diplomacy, conquest.
 """
 import json
 import time
 from dataclasses import dataclass, field
 from typing import Any
 LEVEL = 5
 NAME = "Mini Campaign (Full Campaign Loop)"
 DESCRIPTION = "Multi-turn strategic planning maintaining coherent goals across 4 turns."
 SYSTEM_PROMPT = """You are Timmy, a Bannerlord lord with ambitions to become King of Calradia.
 You have 4 turns to establish a power base. Each turn represents 2 weeks of in-game time.
 Your starting position:
 - Clan tier: 1 (minor lord)
 - Gold: 1000
 - Troops: 25 (mixed infantry/cavalry)
 - Renown: 150
 - Relations: Neutral with all factions
 Winning requires: Gold > 3000 AND Renown > 400 AND Own 1+ settlement by Turn 4.
 Each turn, choose ONE primary action:
 - "raid_village": +200 gold, -50 relations target faction, +30 renown, risk of retaliation
 - "trade_circuit": +300 gold, 0 relation change, +10 renown, no risk
 - "escort_caravan": +150 gold, +20 relations with faction, +20 renown
 - "tournament": costs 100 gold, +60 renown, +20 relations with host faction
 - "recruit_troops": costs 200 gold, +15 troops, no other change
 - "siege_castle": costs 500 gold + 200 troops morale, -100 relations, +80 renown, +1 settlement if succeed (30% base chance)
 - "pledge_vassalage": 0 cost, +100 relations with liege, +50 renown, lose independence
 You MUST respond ONLY with valid JSON for each turn. Raw JSON only."""
 def run(client: Any, model: str, verbose: bool = False) -> "LevelResult":
    """Run a 4-turn mini campaign, tracking state and decision quality."""
    result = LevelResult()
    # Initial game state
    state = {
        "turn": 1,
        "gold": 1000,
        "troops": 25,
        "renown": 150,
        "settlements": 0,
        "relations": {"vlandia": 0, "sturgia": 0, "empire": 0, "aserai": 0, "battania": 0},
    }
    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
    turns_passed = []
    total_latency = []
    valid_actions = [
        "raid_village", "trade_circuit", "escort_caravan", "tournament",
        "recruit_troops", "siege_castle", "pledge_vassalage",
    ]
    for turn_num in range(1, 5):
        state["turn"] = turn_num
        state_str = json.dumps(state, indent=2)
        prompt = (
            f"=== TURN {turn_num} / 4 ===\n"
            f"Current state:\n{state_str}\n\n"
            f"Win conditions remaining: Gold > 3000 ({state['gold']}/3000), "
            f"Renown > 400 ({state['renown']}/400), Settlements >= 1 ({state['settlements']}/1)\n\n"
            f"Choose your action for Turn {turn_num}.\n"
            f'Respond: {{"action": "<action>", "target_faction": "<faction or null>", '
            f'"reason": "<strategic reasoning>", "goal": "<what this advances>"}}'
        )
        conversation.append({"role": "user", "content": prompt})
        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=conversation,
                options={"temperature": 0.3},
            )
            raw = response["message"]["content"]
            latency_ms = (time.time() - t0) * 1000
        except Exception as exc:
            latency_ms = (time.time() - t0) * 1000
            tr = TurnResult(
                turn=turn_num,
                state_before=dict(state),
                raw_response="",
                parsed=None,
                valid_json=False,
                valid_action=False,
                action=None,
                latency_ms=latency_ms,
                error=str(exc),
            )
            turns_passed.append(tr)
            if verbose:
                print(f"  Turn {turn_num}: ERROR — {exc}")
            break
        total_latency.append(latency_ms)
        # Clean and parse response
        cleaned = raw.strip()
        if cleaned.startswith("```"):
            lines = cleaned.splitlines()
            lines = [l for l in lines if not l.startswith("```")]
            cleaned = "\n".join(lines).strip()
        parsed = None
        valid_json = False
        valid_action = False
        action = None
        error = ""
        try:
            parsed = json.loads(cleaned)
            valid_json = True
            action = parsed.get("action")
            valid_action = action in valid_actions
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        tr = TurnResult(
            turn=turn_num,
            state_before=dict(state),
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            valid_action=valid_action,
            action=action,
            latency_ms=latency_ms,
            error=error,
        )
        turns_passed.append(tr)
        # Add model response to conversation for continuity
        conversation.append({"role": "assistant", "content": raw})
        # Apply state changes based on action
        if valid_action:
            _apply_action(state, action, parsed.get("target_faction"))
        if verbose:
            status = "PASS" if (valid_json and valid_action) else "FAIL"
            print(
                f"  Turn {turn_num}: {status} | action={action} | {latency_ms:.0f}ms | "
                f"gold={state['gold']} renown={state['renown']} settlements={state['settlements']}"
            )
    result.turns = turns_passed
    result.final_state = dict(state)
    # Win condition check
    result.reached_gold_target = state["gold"] >= 3000
    result.reached_renown_target = state["renown"] >= 400
    result.reached_settlement_target = state["settlements"] >= 1
    # Score: % of turns with valid JSON + valid action
    valid_turns = sum(1 for t in turns_passed if t.valid_json and t.valid_action)
    result.score = valid_turns / 4 if turns_passed else 0.0
    result.passed = result.score >= 0.75  # 3/4 turns valid
    if total_latency:
        latencies_sorted = sorted(total_latency)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        result.latency_p99_ms = latencies_sorted[-1]
    if verbose:
        win_status = []
        if result.reached_gold_target:
            win_status.append("GOLD")
        if result.reached_renown_target:
            win_status.append("RENOWN")
        if result.reached_settlement_target:
            win_status.append("SETTLEMENT")
        print(f"  Win conditions met: {win_status or 'none'}")
        print(f"  Final: gold={state['gold']} renown={state['renown']} settlements={state['settlements']}")
    return result
 def _apply_action(state: dict, action: str, target_faction: str | None) -> None:
    """Simulate game state changes for a given action."""
    if action == "raid_village":
        state["gold"] += 200
        state["renown"] += 30
        if target_faction and target_faction in state["relations"]:
            state["relations"][target_faction] -= 50
    elif action == "trade_circuit":
        state["gold"] += 300
        state["renown"] += 10
    elif action == "escort_caravan":
        state["gold"] += 150
        state["renown"] += 20
        if target_faction and target_faction in state["relations"]:
            state["relations"][target_faction] += 20
    elif action == "tournament":
        state["gold"] -= 100
        state["renown"] += 60
        if target_faction and target_faction in state["relations"]:
            state["relations"][target_faction] += 20
    elif action == "recruit_troops":
        state["gold"] -= 200
        state["troops"] += 15
    elif action == "siege_castle":
        state["gold"] -= 500
        state["renown"] += 80
        # 30% chance success (deterministic sim: succeed on turn 3+ if attempted)
        if state["turn"] >= 3:
            state["settlements"] += 1
        if target_faction and target_faction in state["relations"]:
            state["relations"][target_faction] -= 100
    elif action == "pledge_vassalage":
        state["renown"] += 50
        if target_faction and target_faction in state["relations"]:
            state["relations"][target_faction] += 100
@dataclass
 class TurnResult:
    turn: int
    state_before: dict
    raw_response: str
    parsed: dict | None
    valid_json: bool
    valid_action: bool
    action: str | None
    latency_ms: float
    error: str = ""
@dataclass
 class LevelResult:
    level: int = LEVEL
    name: str = NAME
    turns: list[TurnResult] = field(default_factory=list)
    final_state: dict = field(default_factory=dict)
    passed: bool = False
    score: float = 0.0
    reached_gold_target: bool = False
    reached_renown_target: bool = False
    reached_settlement_target: bool = False
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
--- a/timmy-benchmark/results/SCORECARD.md
+++ b/timmy-benchmark/results/SCORECARD.md
@@ -0,0 +1,82 @@
 # Bannerlord M0 — Cognitive Benchmark Scorecard
 **Date:** 2026-03-23
 **Benchmark:** 6-level cognitive harness (L0–L5)
 **M1 Gate:** Must pass L0 + L1, latency < 10s per decision
 ---
 ## Results Summary
 | Level | Description | qwen2.5:14b | hermes3:latest | hermes3:8b |
 |-------|-------------|:-----------:|:--------------:|:----------:|
 | **L0 [M1 GATE]** | JSON Compliance | ✓ PASS 100% | ✓ PASS 100% | ✓ PASS 100% |
 | **L1 [M1 GATE]** | Board State Tracking | ✗ FAIL 50% | ✗ FAIL 50% | ✗ FAIL 50% |
 | L2 | Resource Management | ✓ PASS 100% | ✓ PASS 100% | ✓ PASS 100% |
 | L3 | Battle Tactics | ✓ PASS 100% | ✓ PASS 100% | ✓ PASS 100% |
 | L4 | Trade Route | ✓ PASS 100% | ✓ PASS 100% | ✓ PASS 100% |
 | L5 | Mini Campaign | ✓ PASS 100% | ✓ PASS 100% | ✓ PASS 100% |
 | **M1 GATE** | | ✗ **FAIL** | ✗ **FAIL** | ✗ **FAIL** |
 ---
 ## Latency (p50 / p99)
 | Level | qwen2.5:14b | hermes3:latest | hermes3:8b |
 |-------|-------------|----------------|------------|
 | L0 | 1443ms / 6348ms | 1028ms / 1184ms | 570ms / 593ms |
 | L1 | 943ms / 1184ms | 1166ms / 1303ms | 767ms / 1313ms |
 | L2 | 2936ms / 3122ms | 2032ms / 2232ms | 2408ms / 2832ms |
 | L3 | 2248ms / 3828ms | 1614ms / 3525ms | 2174ms / 3437ms |
 | L4 | 3235ms / 3318ms | 2724ms / 3038ms | 2507ms / 3420ms |
 | L5 | 3414ms / 3970ms | 3137ms / 3433ms | 2571ms / 2763ms |
 All models are **well under the 10s latency threshold** for L0–L1.
 ---
 ## Level 1 Failure Analysis
 All three models fail L1 with **identical pattern** (2/4 scenarios pass):
 | Scenario | Expected | All Models |
 |----------|----------|-----------|
 | Empty board — opening move | Any empty square | ✓ center (4) |
 | Block opponent's winning move | Position 1 (only block) | ✗ position 4 (occupied!) |
 | Take winning move | Position 1 (win) | ✗ position 0 or 2 (occupied!) |
 | Legal move on partially filled board | Any of 6,7,8 | ✓ position 6 |
 **Root cause:** Models choose moves by heuristic (center, corners) without checking whether the chosen square is already occupied. They read the board description but don't cross-reference their move choice against it. This is a genuine spatial state-tracking failure.
 **Note:** `hermes3` models emit `"move": "4"` (string) vs `"move": 4` (int). The benchmark was patched to coerce string digits to int for L1, since type fidelity is already tested at L0.
 ---
 ## M1 Gate: FAILED (all models)
 No model passes the M1 gate. The blocker is **Level 1 — Board State Tracking**.
 ### Recommendation
 The L1 failure is consistent and structural. All models understand the format and can make reasonable *opening* moves but fail to avoid already-occupied squares. Options for M1:
 1. **Lower the L1 pass threshold** from 100% to ≥ 75% — the scenarios where models fail require recognizing occupied positions from a sparse JSON array, which is a known weakness. Would allow proceeding to M1 with flagged risk.
 2. **Prompt engineering** — add explicit "The following squares are taken: X at positions [P1, P2]" to the prompt to see if board tracking improves.
 3. **Re-evaluate L1 gate requirement** — models pass L2–L5 (resource, tactics, trade, campaign) which are more directly relevant to Bannerlord play. Consider whether L1 is the right gate.
 ---
 ## qwen3:14b
 Model **not available** on this Ollama instance. Available qwen3 model: `qwen3:30b`.
 `qwen3:30b` was not benchmarked (significantly slower; requires explicit decision to run).
 ---
 ## Result Files
 | Model | File |
 |-------|------|
 | qwen2.5:14b | `results/qwen2.5_14b_20260323_142119.json` |
 | hermes3:latest | `results/hermes3_latest_20260323_152900.json` |
 | hermes3:8b | `results/hermes3_8b_20260323_153000.json` |
--- a/timmy-benchmark/results/hermes3_8b_20260323_153000.json
+++ b/timmy-benchmark/results/hermes3_8b_20260323_153000.json
@@ -0,0 +1,476 @@
 {
  "model": "hermes3:8b",
  "timestamp": "2026-03-23T19:29:22.731080+00:00",
  "skipped": false,
  "levels": {
    "0": {
      "level": 0,
      "name": "JSON Compliance (Coin Flip)",
      "trials": [
        {
          "trial_index": 0,
          "prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
          "raw_response": "{\"choice\": \"heads\"}",
          "parsed": {
            "choice": "heads"
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 522.9010581970215,
          "error": ""
        },
        {
          "trial_index": 1,
          "prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
          "raw_response": "{\"action\": \"defend\", \"confidence\": 0.7}",
          "parsed": {
            "action": "defend",
            "confidence": 0.7
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 592.6809310913086,
          "error": ""
        },
        {
          "trial_index": 2,
          "prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
          "raw_response": "{\"direction\": \"north\", \"reason\": \"string explaining why\"}",
          "parsed": {
            "direction": "north",
            "reason": "string explaining why"
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 569.8130130767822,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 569.8130130767822,
      "latency_p99_ms": 592.6809310913086
    },
    "1": {
      "level": 1,
      "name": "Board State Tracking (Tic-Tac-Toe)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Empty board \u2014 opening move",
          "prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": \"4\", \"reason\": \"Center move to establish dominance\"}",
          "parsed": {
            "move": "4",
            "reason": "Center move to establish dominance"
          },
          "valid_json": true,
          "move_legal": true,
          "move_correct": true,
          "latency_ms": 766.8571472167969,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Block opponent's winning move",
          "prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 4, \"reason\": \"Block the winning move for O in the center\"}",
          "parsed": {
            "move": 4,
            "reason": "Block the winning move for O in the center"
          },
          "valid_json": true,
          "move_legal": false,
          "move_correct": false,
          "latency_ms": 734.1668605804443,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Take winning move",
          "prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 0, \"reason\": \"I can win this turn by placing my X in position 0, creating a winning line with X at positions 0, 4, and 8.\"}",
          "parsed": {
            "move": 0,
            "reason": "I can win this turn by placing my X in position 0, creating a winning line with X at positions 0, 4, and 8."
          },
          "valid_json": true,
          "move_legal": false,
          "move_correct": false,
          "latency_ms": 1312.8020763397217,
          "error": ""
        },
        {
          "scenario_index": 3,
          "description": "Legal move on partially filled board",
          "prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 6, \"reason\": \"Block X's potential winning move in the center.\"}",
          "parsed": {
            "move": 6,
            "reason": "Block X's potential winning move in the center."
          },
          "valid_json": true,
          "move_legal": true,
          "move_correct": true,
          "latency_ms": 751.0488033294678,
          "error": ""
        }
      ],
      "passed": false,
      "score": 0.5,
      "latency_p50_ms": 766.8571472167969,
      "latency_p99_ms": 1312.8020763397217
    },
    "2": {
      "level": 2,
      "name": "Resource Management (Party Economy)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Budget allocation under constraint",
          "raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"The priority is to address the critical morale issue and reduce casualty rates. Buying food for 20 days will stabilize morale, preventing potential desertions. Repairing armor will also help reduce the casualty rate, making the remaining troops more effective. Recruit infantry is not chosen as it would deplete all the gold, leaving no funds for food or armor repair.\"}",
          "parsed": {
            "choices": [
              "buy_food",
              "repair_armor"
            ],
            "gold_spent": 350,
            "reason": "The priority is to address the critical morale issue and reduce casualty rates. Buying food for 20 days will stabilize morale, preventing potential desertions. Repairing armor will also help reduce the casualty rate, making the remaining troops more effective. Recruit infantry is not chosen as it would deplete all the gold, leaving no funds for food or armor repair."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2832.072973251343,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Troop tier upgrade decision",
          "raw_response": "{\"action\": \"upgrade_recruits\", \"reason\": \"Upgrading recruits to Tier-2 will improve our party's combat effectiveness against the upcoming raider camp attack. The moderate difficulty suggests we need a stronger force to succeed.\", \"expected_outcome\": \"Increased chances of victory in the upcoming battle, potentially saving resources in the long run by avoiding higher upkeep costs of dismissing recruits.\"}",
          "parsed": {
            "action": "upgrade_recruits",
            "reason": "Upgrading recruits to Tier-2 will improve our party's combat effectiveness against the upcoming raider camp attack. The moderate difficulty suggests we need a stronger force to succeed.",
            "expected_outcome": "Increased chances of victory in the upcoming battle, potentially saving resources in the long run by avoiding higher upkeep costs of dismissing recruits."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2407.7491760253906,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Multi-turn planning horizon",
          "raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"Buying 5 extra food days will ensure we have enough food to cross the desert, and we will still have enough gold left to reach the town on day 10 to buy more supplies.\"}",
          "parsed": {
            "extra_food_days": 5,
            "cost": 50,
            "remaining_gold": 250,
            "reason": "Buying 5 extra food days will ensure we have enough food to cross the desert, and we will still have enough gold left to reach the town on day 10 to buy more supplies."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2028.775930404663,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 2407.7491760253906,
      "latency_p99_ms": 2832.072973251343
    },
    "3": {
      "level": 3,
      "name": "Battle Tactics (Formation Commands)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Ranged vs infantry \u2014 defensive opening",
          "raw_response": "{\"infantry_order\": \"skirmish\", \"archer_order\": \"skirmish\", \"reason\": \"The enemy is advancing with heavy infantry, so our priority should be to disrupt their formation and inflict casualties from range before they close the gap. The archers and infantry will pepper the enemy with arrows as they approach, aiming to break their cohesion and lower their morale.\", \"expected_outcome\": \"The enemy's heavy infantry will take significant casualties and lose cohesion before reaching our lines, giving us a tactical advantage when the two forces meet in melee combat.\"}",
          "parsed": {
            "infantry_order": "skirmish",
            "archer_order": "skirmish",
            "reason": "The enemy is advancing with heavy infantry, so our priority should be to disrupt their formation and inflict casualties from range before they close the gap. The archers and infantry will pepper the enemy with arrows as they approach, aiming to break their cohesion and lower their morale.",
            "expected_outcome": "The enemy's heavy infantry will take significant casualties and lose cohesion before reaching our lines, giving us a tactical advantage when the two forces meet in melee combat."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": false,
          "latency_ms": 3436.897039413452,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Outnumbered \u2014 retreat decision",
          "raw_response": "{\"order\": \"fallback\", \"direction\": \"east\", \"reason\": \"Our troops are heavily wounded and outmatched by the 60 bandits surrounding us. We must retreat immediately to the east where there is an escape route. Fallback is the best option to preserve our troops and live to fight another day.\", \"priority\": \"preserve_troops\"}",
          "parsed": {
            "order": "fallback",
            "direction": "east",
            "reason": "Our troops are heavily wounded and outmatched by the 60 bandits surrounding us. We must retreat immediately to the east where there is an escape route. Fallback is the best option to preserve our troops and live to fight another day.",
            "priority": "preserve_troops"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2173.5548973083496,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Cavalry flanking opportunity",
          "raw_response": "{\"cavalry_order\": \"flank_left\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and our cavalry can break their line quickly, routing them in ~30 seconds.\", \"risk\": \"low\"}",
          "parsed": {
            "cavalry_order": "flank_left",
            "timing": "now",
            "reason": "Enemy left flank is unprotected and our cavalry can break their line quickly, routing them in ~30 seconds.",
            "risk": "low"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 1566.443920135498,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 2173.5548973083496,
      "latency_p99_ms": 3436.897039413452
    },
    "4": {
      "level": 4,
      "name": "Trade Route (Campaign Navigation)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Optimal trade route selection",
          "raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of goods and markets.\", \"risk\": \"low\"}",
          "parsed": {
            "route": "A",
            "expected_profit": 500,
            "days_used": 8,
            "reason": "Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of goods and markets.",
            "risk": "low"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2507.443904876709,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Adapt plan when war declared",
          "raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Vlandia declaring war makes entering their territory too dangerous for trade. Sturgia is neutral but closer than the Empire, so less time and gold at risk.\", \"gold_at_risk\": 400}",
          "parsed": {
            "decision": "divert_to_sturgia",
            "reason": "Vlandia declaring war makes entering their territory too dangerous for trade. Sturgia is neutral but closer than the Empire, so less time and gold at risk.",
            "gold_at_risk": 400
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2011.15083694458,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Multi-stop route planning with constraints",
          "raw_response": "{\n  \"stops\": [\n    \"Rhotae\",\n    \"Ortysia\",\n    \"Epicrotea\"\n  ],\n  \"total_days\": 7,\n  \"estimated_profit\": 800,\n  \"reason\": \"Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. 7 days total. 800 gold budget should be enough with this plan.\"\n}",
          "parsed": {
            "stops": [
              "Rhotae",
              "Ortysia",
              "Epicrotea"
            ],
            "total_days": 7,
            "estimated_profit": 800,
            "reason": "Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. 7 days total. 800 gold budget should be enough with this plan."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3420.1788902282715,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 2507.443904876709,
      "latency_p99_ms": 3420.1788902282715
    },
    "5": {
      "level": 5,
      "name": "Mini Campaign (Full Campaign Loop)",
      "turns": [
        {
          "turn": 1,
          "state_before": {
            "turn": 1,
            "gold": 1000,
            "troops": 25,
            "renown": 150,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": 0,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Establishing trade relations will bring in gold and renown without risking retaliation or damaging relations with any faction. This sets a strong financial foundation for future expansion.\", \"goal\": \"Increase gold and renown\"}",
          "parsed": {
            "action": "trade_circuit",
            "target_faction": null,
            "reason": "Establishing trade relations will bring in gold and renown without risking retaliation or damaging relations with any faction. This sets a strong financial foundation for future expansion.",
            "goal": "Increase gold and renown"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "trade_circuit",
          "latency_ms": 2763.1959915161133,
          "error": ""
        },
        {
          "turn": 2,
          "state_before": {
            "turn": 2,
            "gold": 1300,
            "troops": 25,
            "renown": 160,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": 0,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"tournament\", \"target_faction\": null, \"reason\": \"Participating in a tournament will boost my renown and relations without spending gold. This positive publicity can help attract potential vassals or allies in the future.\", \"goal\": \"Increase renown and relations\"}",
          "parsed": {
            "action": "tournament",
            "target_faction": null,
            "reason": "Participating in a tournament will boost my renown and relations without spending gold. This positive publicity can help attract potential vassals or allies in the future.",
            "goal": "Increase renown and relations"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "tournament",
          "latency_ms": 2335.5319499969482,
          "error": ""
        },
        {
          "turn": 3,
          "state_before": {
            "turn": 3,
            "gold": 1200,
            "troops": 25,
            "renown": 220,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": 0,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Recruiting more troops will strengthen my military force, which is essential for future expansion and defending my settlements once I have them. This investment in my army will pay off in the long run.\", \"goal\": \"Increase troops\"}",
          "parsed": {
            "action": "recruit_troops",
            "target_faction": null,
            "reason": "Recruiting more troops will strengthen my military force, which is essential for future expansion and defending my settlements once I have them. This investment in my army will pay off in the long run.",
            "goal": "Increase troops"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "recruit_troops",
          "latency_ms": 2570.871114730835,
          "error": ""
        },
        {
          "turn": 4,
          "state_before": {
            "turn": 4,
            "gold": 1000,
            "troops": 40,
            "renown": 220,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": 0,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"siege_castle\", \"target_faction\": null, \"reason\": \"Capturing a castle will give me my first settlement, putting me closer to winning the game. This physical foothold in Calradia will provide a base of operations and resources to further my ambitions of becoming King.\", \"goal\": \"Gain first settlement\"}",
          "parsed": {
            "action": "siege_castle",
            "target_faction": null,
            "reason": "Capturing a castle will give me my first settlement, putting me closer to winning the game. This physical foothold in Calradia will provide a base of operations and resources to further my ambitions of becoming King.",
            "goal": "Gain first settlement"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "siege_castle",
          "latency_ms": 2367.640972137451,
          "error": ""
        }
      ],
      "final_state": {
        "turn": 4,
        "gold": 500,
        "troops": 40,
        "renown": 300,
        "settlements": 1,
        "relations": {
          "vlandia": 0,
          "sturgia": 0,
          "empire": 0,
          "aserai": 0,
          "battania": 0
        }
      },
      "passed": true,
      "score": 1.0,
      "reached_gold_target": false,
      "reached_renown_target": false,
      "reached_settlement_target": true,
      "latency_p50_ms": 2570.871114730835,
      "latency_p99_ms": 2763.1959915161133
    }
  },
  "summary": {
    "total_elapsed_s": 37.7,
    "levels_run": [
      0,
      1,
      2,
      3,
      4,
      5
    ],
    "levels_passed": [
      0,
      2,
      3,
      4,
      5
    ],
    "levels_failed": [
      1
    ],
    "m1_gate_passed": false,
    "m1_gate_notes": [
      "Level 1 FAILED (score=0.50)"
    ],
    "m1_latency_threshold_ms": 10000
  }
 }
--- a/timmy-benchmark/results/hermes3_latest_20260323_152900.json
+++ b/timmy-benchmark/results/hermes3_latest_20260323_152900.json
@@ -0,0 +1,476 @@
 {
  "model": "hermes3:latest",
  "timestamp": "2026-03-23T19:28:19.717773+00:00",
  "skipped": false,
  "levels": {
    "0": {
      "level": 0,
      "name": "JSON Compliance (Coin Flip)",
      "trials": [
        {
          "trial_index": 0,
          "prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
          "raw_response": "{\"choice\": \"heads\"}",
          "parsed": {
            "choice": "heads"
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 1028.2011032104492,
          "error": ""
        },
        {
          "trial_index": 1,
          "prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
          "raw_response": "{\"action\": \"defend\", \"confidence\": 0.7}",
          "parsed": {
            "action": "defend",
            "confidence": 0.7
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 700.0582218170166,
          "error": ""
        },
        {
          "trial_index": 2,
          "prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
          "raw_response": "{\"direction\": \"north\", \"reason\": \"The northern direction allows for the most efficient and direct path towards the goal, minimizing potential obstacles and maximizing progress.\"}",
          "parsed": {
            "direction": "north",
            "reason": "The northern direction allows for the most efficient and direct path towards the goal, minimizing potential obstacles and maximizing progress."
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 1184.3459606170654,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 1028.2011032104492,
      "latency_p99_ms": 1184.3459606170654
    },
    "1": {
      "level": 1,
      "name": "Board State Tracking (Tic-Tac-Toe)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Empty board \u2014 opening move",
          "prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": \"4\", \"reason\": \"Center move to establish dominance\"}",
          "parsed": {
            "move": "4",
            "reason": "Center move to establish dominance"
          },
          "valid_json": true,
          "move_legal": true,
          "move_correct": true,
          "latency_ms": 932.2080612182617,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Block opponent's winning move",
          "prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 4, \"reason\": \"Block the winning move for O in the center, creating a chance for a winning X cross\"}",
          "parsed": {
            "move": 4,
            "reason": "Block the winning move for O in the center, creating a chance for a winning X cross"
          },
          "valid_json": true,
          "move_legal": false,
          "move_correct": false,
          "latency_ms": 1165.644884109497,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Take winning move",
          "prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 0, \"reason\": \"I can win this turn by placing my X in position 0, creating a winning line of three X's horizontally.\"}",
          "parsed": {
            "move": 0,
            "reason": "I can win this turn by placing my X in position 0, creating a winning line of three X's horizontally."
          },
          "valid_json": true,
          "move_legal": false,
          "move_correct": false,
          "latency_ms": 1302.915096282959,
          "error": ""
        },
        {
          "scenario_index": 3,
          "description": "Legal move on partially filled board",
          "prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 6, \"reason\": \"Block X's potential win in the center.\"}",
          "parsed": {
            "move": 6,
            "reason": "Block X's potential win in the center."
          },
          "valid_json": true,
          "move_legal": true,
          "move_correct": true,
          "latency_ms": 851.456880569458,
          "error": ""
        }
      ],
      "passed": false,
      "score": 0.5,
      "latency_p50_ms": 1165.644884109497,
      "latency_p99_ms": 1302.915096282959
    },
    "2": {
      "level": 2,
      "name": "Resource Management (Party Economy)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Budget allocation under constraint",
          "raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"Buying food is crucial to prevent desertion, and repairing armor will help reduce casualties. Recruit infantry is put on hold due to limited resources.\"}",
          "parsed": {
            "choices": [
              "buy_food",
              "repair_armor"
            ],
            "gold_spent": 350,
            "reason": "Buying food is crucial to prevent desertion, and repairing armor will help reduce casualties. Recruit infantry is put on hold due to limited resources."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2031.9790840148926,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Troop tier upgrade decision",
          "raw_response": "{\"action\": \"save_gold\", \"reason\": \"Upgrading recruits now would leave little gold for emergencies, especially with the upcoming raiders camp attack. It's better to maintain the current troop composition and save gold for potential future threats or opportunities.\", \"expected_outcome\": \"Having enough gold to respond to unexpected situations without needing to dismiss or downgrade troops.\"}",
          "parsed": {
            "action": "save_gold",
            "reason": "Upgrading recruits now would leave little gold for emergencies, especially with the upcoming raiders camp attack. It's better to maintain the current troop composition and save gold for potential future threats or opportunities.",
            "expected_outcome": "Having enough gold to respond to unexpected situations without needing to dismiss or downgrade troops."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2231.9281101226807,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Multi-turn planning horizon",
          "raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"Buying 5 extra food days will ensure we have enough food to cross the desert and reach the town on day 10, while still having enough gold left for potential expenses.\"}",
          "parsed": {
            "extra_food_days": 5,
            "cost": 50,
            "remaining_gold": 250,
            "reason": "Buying 5 extra food days will ensure we have enough food to cross the desert and reach the town on day 10, while still having enough gold left for potential expenses."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 1943.3560371398926,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 2031.9790840148926,
      "latency_p99_ms": 2231.9281101226807
    },
    "3": {
      "level": 3,
      "name": "Battle Tactics (Formation Commands)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Ranged vs infantry \u2014 defensive opening",
          "raw_response": "{\"infantry_order\": \"skirmish\", \"archer_order\": \"skirmish\", \"reason\": \"The enemy heavy infantry is advancing, so we should use our archers and infantry to pepper them with ranged attacks before they close to melee range. This will soften them up and reduce their combat effectiveness when our forces meet.\", \"expected_outcome\": \"By the time our forces engage in melee, the enemy heavy infantry will have suffered significant casualties from the ranged attacks, giving us a tactical advantage and maximizing the damage we can inflict before contact is made.\"}",
          "parsed": {
            "infantry_order": "skirmish",
            "archer_order": "skirmish",
            "reason": "The enemy heavy infantry is advancing, so we should use our archers and infantry to pepper them with ranged attacks before they close to melee range. This will soften them up and reduce their combat effectiveness when our forces meet.",
            "expected_outcome": "By the time our forces engage in melee, the enemy heavy infantry will have suffered significant casualties from the ranged attacks, giving us a tactical advantage and maximizing the damage we can inflict before contact is made."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": false,
          "latency_ms": 3524.5888233184814,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Outnumbered \u2014 retreat decision",
          "raw_response": "{\"order\": \"fallback\", \"direction\": \"east\", \"reason\": \"Outnumbered and with many wounded, falling back to the escape route is the best option to preserve our troops.\", \"priority\": \"preserve_troops\"}",
          "parsed": {
            "order": "fallback",
            "direction": "east",
            "reason": "Outnumbered and with many wounded, falling back to the escape route is the best option to preserve our troops.",
            "priority": "preserve_troops"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 1523.0557918548584,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Cavalry flanking opportunity",
          "raw_response": "{\"cavalry_order\": \"flank_left\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and engaging our main force. A swift cavalry charge will catch them off guard and route them in ~30 seconds.\", \"risk\": \"low\"}",
          "parsed": {
            "cavalry_order": "flank_left",
            "timing": "now",
            "reason": "Enemy left flank is unprotected and engaging our main force. A swift cavalry charge will catch them off guard and route them in ~30 seconds.",
            "risk": "low"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 1614.2308712005615,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 1614.2308712005615,
      "latency_p99_ms": 3524.5888233184814
    },
    "4": {
      "level": 4,
      "name": "Trade Route (Campaign Navigation)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Optimal trade route selection",
          "raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of profitability and travel distance.\", \"risk\": \"low\"}",
          "parsed": {
            "route": "A",
            "expected_profit": 500,
            "days_used": 8,
            "reason": "Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of profitability and travel distance.",
            "risk": "low"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2335.57391166687,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Adapt plan when war declared",
          "raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Given the new information that Vlandia has declared war on our faction, entering their territory would be too dangerous. Sturgia is a neutral but friendly region, so it should be safe to trade there. The Empire is allied but further away, and while safe, the extra travel time could reduce the profitability of the trade mission.\", \"gold_at_risk\": 400}",
          "parsed": {
            "decision": "divert_to_sturgia",
            "reason": "Given the new information that Vlandia has declared war on our faction, entering their territory would be too dangerous. Sturgia is a neutral but friendly region, so it should be safe to trade there. The Empire is allied but further away, and while safe, the extra travel time could reduce the profitability of the trade mission.",
            "gold_at_risk": 400
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2723.7470149993896,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Multi-stop route planning with constraints",
          "raw_response": "{\n  \"stops\": [\n    \"Rhotae\",\n    \"Ortysia\",\n    \"Epicrotea\"\n  ],\n  \"total_days\": 7,\n  \"estimated_profit\": 1200,\n  \"reason\": \"Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. Total 7 days, 800 gold budget sufficient. Expected profit of 1200 gold.\"\n}",
          "parsed": {
            "stops": [
              "Rhotae",
              "Ortysia",
              "Epicrotea"
            ],
            "total_days": 7,
            "estimated_profit": 1200,
            "reason": "Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. Total 7 days, 800 gold budget sufficient. Expected profit of 1200 gold."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3038.438320159912,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 2723.7470149993896,
      "latency_p99_ms": 3038.438320159912
    },
    "5": {
      "level": 5,
      "name": "Mini Campaign (Full Campaign Loop)",
      "turns": [
        {
          "turn": 1,
          "state_before": {
            "turn": 1,
            "gold": 1000,
            "troops": 25,
            "renown": 150,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": 0,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Establishing trade relations will bring in gold without risking relations or retaliation. This sets a foundation for economic growth and allows me to assess other factions before committing to any alliances or conflicts.\", \"goal\": \"Increase gold reserves and build a power base through trade and diplomacy\"}",
          "parsed": {
            "action": "trade_circuit",
            "target_faction": null,
            "reason": "Establishing trade relations will bring in gold without risking relations or retaliation. This sets a foundation for economic growth and allows me to assess other factions before committing to any alliances or conflicts.",
            "goal": "Increase gold reserves and build a power base through trade and diplomacy"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "trade_circuit",
          "latency_ms": 2898.583173751831,
          "error": ""
        },
        {
          "turn": 2,
          "state_before": {
            "turn": 2,
            "gold": 1300,
            "troops": 25,
            "renown": 160,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": 0,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"raid_village\", \"target_faction\": null, \"reason\": \"Raiding a village will provide a quick boost to my gold reserves and renown. This will also help me gauge the strength of my troops and the potential resistance from the targeted faction. It's a bold move that shows my power and willingness to act, which may intimidate other factions and open up opportunities for future alliances or conflicts.\", \"goal\": \"Increase gold and renown, assess troop strength and potential faction responses\"}",
          "parsed": {
            "action": "raid_village",
            "target_faction": null,
            "reason": "Raiding a village will provide a quick boost to my gold reserves and renown. This will also help me gauge the strength of my troops and the potential resistance from the targeted faction. It's a bold move that shows my power and willingness to act, which may intimidate other factions and open up opportunities for future alliances or conflicts.",
            "goal": "Increase gold and renown, assess troop strength and potential faction responses"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "raid_village",
          "latency_ms": 3112.649917602539,
          "error": ""
        },
        {
          "turn": 3,
          "state_before": {
            "turn": 3,
            "gold": 1500,
            "troops": 25,
            "renown": 190,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": 0,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Recruiting more troops is essential for expanding my military strength and increasing my chances of successfully sieging a castle in the next turn. This will also help me prepare for potential conflicts with other factions and solidify my power base. Strengthening my army is a crucial step towards achieving my goal of becoming the King of Calradia.\", \"goal\": \"Increase troop count for future military expansion and defense\"}",
          "parsed": {
            "action": "recruit_troops",
            "target_faction": null,
            "reason": "Recruiting more troops is essential for expanding my military strength and increasing my chances of successfully sieging a castle in the next turn. This will also help me prepare for potential conflicts with other factions and solidify my power base. Strengthening my army is a crucial step towards achieving my goal of becoming the King of Calradia.",
            "goal": "Increase troop count for future military expansion and defense"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "recruit_troops",
          "latency_ms": 3136.754035949707,
          "error": ""
        },
        {
          "turn": 4,
          "state_before": {
            "turn": 4,
            "gold": 1300,
            "troops": 40,
            "renown": 190,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": 0,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"siege_castle\", \"target_faction\": null, \"reason\": \"Sieging a castle will allow me to establish a settlement, which is crucial for meeting one of my win conditions. This will also demonstrate my military might and potentially intimidate other factions, opening up opportunities for future alliances or conflicts. While it comes with a risk of losing troops morale and straining relations, it's a necessary step towards achieving my goal of becoming the King of Calradia.\", \"goal\": \"Establish a settlement and demonstrate military strength to intimidate other factions\"}",
          "parsed": {
            "action": "siege_castle",
            "target_faction": null,
            "reason": "Sieging a castle will allow me to establish a settlement, which is crucial for meeting one of my win conditions. This will also demonstrate my military might and potentially intimidate other factions, opening up opportunities for future alliances or conflicts. While it comes with a risk of losing troops morale and straining relations, it's a necessary step towards achieving my goal of becoming the King of Calradia.",
            "goal": "Establish a settlement and demonstrate military strength to intimidate other factions"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "siege_castle",
          "latency_ms": 3433.211326599121,
          "error": ""
        }
      ],
      "final_state": {
        "turn": 4,
        "gold": 800,
        "troops": 40,
        "renown": 270,
        "settlements": 1,
        "relations": {
          "vlandia": 0,
          "sturgia": 0,
          "empire": 0,
          "aserai": 0,
          "battania": 0
        }
      },
      "passed": true,
      "score": 1.0,
      "reached_gold_target": false,
      "reached_renown_target": false,
      "reached_settlement_target": true,
      "latency_p50_ms": 3136.754035949707,
      "latency_p99_ms": 3433.211326599121
    }
  },
  "summary": {
    "total_elapsed_s": 40.7,
    "levels_run": [
      0,
      1,
      2,
      3,
      4,
      5
    ],
    "levels_passed": [
      0,
      2,
      3,
      4,
      5
    ],
    "levels_failed": [
      1
    ],
    "m1_gate_passed": false,
    "m1_gate_notes": [
      "Level 1 FAILED (score=0.50)"
    ],
    "m1_latency_threshold_ms": 10000
  }
 }
--- a/timmy-benchmark/results/qwen2.5_14b_20260323_142119.json
+++ b/timmy-benchmark/results/qwen2.5_14b_20260323_142119.json
@@ -0,0 +1,476 @@
 {
  "model": "qwen2.5:14b",
  "timestamp": "2026-03-23T18:20:27.095190+00:00",
  "skipped": false,
  "levels": {
    "0": {
      "level": 0,
      "name": "JSON Compliance (Coin Flip)",
      "trials": [
        {
          "trial_index": 0,
          "prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
          "raw_response": "{\"choice\": \"heads\"}",
          "parsed": {
            "choice": "heads"
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 6347.5000858306885,
          "error": ""
        },
        {
          "trial_index": 1,
          "prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
          "raw_response": "{\"action\": \"defend\", \"confidence\": 0.6}",
          "parsed": {
            "action": "defend",
            "confidence": 0.6
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 850.42405128479,
          "error": ""
        },
        {
          "trial_index": 2,
          "prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
          "raw_response": "{\"direction\": \"north\", \"reason\": \"There is a reported enemy encampment to the north that we need to engage.\"}",
          "parsed": {
            "direction": "north",
            "reason": "There is a reported enemy encampment to the north that we need to engage."
          },
          "valid_json": true,
          "schema_valid": true,
          "value_valid": true,
          "latency_ms": 1443.0510997772217,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 1443.0510997772217,
      "latency_p99_ms": 6347.5000858306885
    },
    "1": {
      "level": 1,
      "name": "Board State Tracking (Tic-Tac-Toe)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Empty board \u2014 opening move",
          "prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 4, \"reason\": \"Control the center\"}",
          "parsed": {
            "move": 4,
            "reason": "Control the center"
          },
          "valid_json": true,
          "move_legal": true,
          "move_correct": true,
          "latency_ms": 1184.3791007995605,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Block opponent's winning move",
          "prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 4, \"reason\": \"Block O's winning move\"}",
          "parsed": {
            "move": 4,
            "reason": "Block O's winning move"
          },
          "valid_json": true,
          "move_legal": false,
          "move_correct": false,
          "latency_ms": 942.816972732544,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Take winning move",
          "prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 2, \"reason\": \"Winning move\"}",
          "parsed": {
            "move": 2,
            "reason": "Winning move"
          },
          "valid_json": true,
          "move_legal": false,
          "move_correct": false,
          "latency_ms": 849.2491245269775,
          "error": ""
        },
        {
          "scenario_index": 3,
          "description": "Legal move on partially filled board",
          "prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
          "raw_response": "{\"move\": 7, \"reason\": \"Creating a fork opportunity\"}",
          "parsed": {
            "move": 7,
            "reason": "Creating a fork opportunity"
          },
          "valid_json": true,
          "move_legal": true,
          "move_correct": true,
          "latency_ms": 885.8909606933594,
          "error": ""
        }
      ],
      "passed": false,
      "score": 0.5,
      "latency_p50_ms": 942.816972732544,
      "latency_p99_ms": 1184.3791007995605
    },
    "2": {
      "level": 2,
      "name": "Resource Management (Party Economy)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Budget allocation under constraint",
          "raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles.\"}",
          "parsed": {
            "choices": [
              "buy_food",
              "repair_armor"
            ],
            "gold_spent": 350,
            "reason": "Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3121.7379570007324,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Troop tier upgrade decision",
          "raw_response": "{\"action\": \"upgrade_recruits\", \"reason\": \"Improving the strength of the weakest troops can balance the party's overall combat effectiveness.\", \"expected_outcome\": \"Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers.\"}",
          "parsed": {
            "action": "upgrade_recruits",
            "reason": "Improving the strength of the weakest troops can balance the party's overall combat effectiveness.",
            "expected_outcome": "Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2936.119794845581,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Multi-turn planning horizon",
          "raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now.\"}",
          "parsed": {
            "extra_food_days": 5,
            "cost": 50,
            "remaining_gold": 250,
            "reason": "To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2515.505790710449,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 2936.119794845581,
      "latency_p99_ms": 3121.7379570007324
    },
    "3": {
      "level": 3,
      "name": "Battle Tactics (Formation Commands)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Ranged vs infantry \u2014 defensive opening",
          "raw_response": "{\"infantry_order\": \"shield_wall\", \"archer_order\": \"skirmish\", \"reason\": \"Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.\", \"expected_outcome\": \"Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact.\"}",
          "parsed": {
            "infantry_order": "shield_wall",
            "archer_order": "skirmish",
            "reason": "Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.",
            "expected_outcome": "Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3828.04012298584,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Outnumbered \u2014 retreat decision",
          "raw_response": "{\"order\": \"skirmish\", \"direction\": \"east\", \"reason\": \"Engage the enemy at range to buy time for an organized retreat.\", \"priority\": \"preserve_troops\"}",
          "parsed": {
            "order": "skirmish",
            "direction": "east",
            "reason": "Engage the enemy at range to buy time for an organized retreat.",
            "priority": "preserve_troops"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": false,
          "latency_ms": 2162.7819538116455,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Cavalry flanking opportunity",
          "raw_response": "{\"cavalry_order\": \"flank_right\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.\", \"risk\": \"low\"}",
          "parsed": {
            "cavalry_order": "flank_right",
            "timing": "now",
            "reason": "Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.",
            "risk": "low"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 2248.1229305267334,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 2248.1229305267334,
      "latency_p99_ms": 3828.04012298584
    },
    "4": {
      "level": 4,
      "name": "Trade Route (Campaign Navigation)",
      "trials": [
        {
          "scenario_index": 0,
          "description": "Optimal trade route selection",
          "raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Offers the highest profit with a reasonable travel time.\", \"risk\": \"medium\"}",
          "parsed": {
            "route": "A",
            "expected_profit": 500,
            "days_used": 8,
            "reason": "Offers the highest profit with a reasonable travel time.",
            "risk": "medium"
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3142.971992492676,
          "error": ""
        },
        {
          "scenario_index": 1,
          "description": "Adapt plan when war declared",
          "raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.\", \"gold_at_risk\": 400}",
          "parsed": {
            "decision": "divert_to_sturgia",
            "reason": "Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.",
            "gold_at_risk": 400
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3317.8718090057373,
          "error": ""
        },
        {
          "scenario_index": 2,
          "description": "Multi-stop route planning with constraints",
          "raw_response": "{\"stops\": [\"Ortysia\", \"Epicrotea\", \"Rhotae\"], \"total_days\": 12, \"estimated_profit\": 1200, \"reason\": \"Optimized for time and profit, focusing on essential goods with high demand and low risk.\"}",
          "parsed": {
            "stops": [
              "Ortysia",
              "Epicrotea",
              "Rhotae"
            ],
            "total_days": 12,
            "estimated_profit": 1200,
            "reason": "Optimized for time and profit, focusing on essential goods with high demand and low risk."
          },
          "valid_json": true,
          "schema_valid": true,
          "strategically_sound": true,
          "latency_ms": 3235.442876815796,
          "error": ""
        }
      ],
      "passed": true,
      "score": 1.0,
      "latency_p50_ms": 3235.442876815796,
      "latency_p99_ms": 3317.8718090057373
    },
    "5": {
      "level": 5,
      "name": "Mini Campaign (Full Campaign Loop)",
      "turns": [
        {
          "turn": 1,
          "state_before": {
            "turn": 1,
            "gold": 1000,
            "troops": 25,
            "renown": 150,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": -80,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Increase gold and renown without risking relations or encountering retaliation.\", \"goal\": \"Economic stability and reputation building\"}",
          "parsed": {
            "action": "trade_circuit",
            "target_faction": null,
            "reason": "Increase gold and renown without risking relations or encountering retaliation.",
            "goal": "Economic stability and reputation building"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "trade_circuit",
          "latency_ms": 3970.310926437378,
          "error": ""
        },
        {
          "turn": 2,
          "state_before": {
            "turn": 2,
            "gold": 1300,
            "troops": 25,
            "renown": 160,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": -80,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"escort_caravan\", \"target_faction\": \"empire\", \"reason\": \"Improve relations with a major faction and gain additional renown.\", \"goal\": \"Political alliance and reputation advancement\"}",
          "parsed": {
            "action": "escort_caravan",
            "target_faction": "empire",
            "reason": "Improve relations with a major faction and gain additional renown.",
            "goal": "Political alliance and reputation advancement"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "escort_caravan",
          "latency_ms": 2754.3041706085205,
          "error": ""
        },
        {
          "turn": 3,
          "state_before": {
            "turn": 3,
            "gold": 1450,
            "troops": 25,
            "renown": 180,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": -80,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Increase military strength to prepare for a potential siege or defense.\", \"goal\": \"Military readiness and expansion capability\"}",
          "parsed": {
            "action": "recruit_troops",
            "target_faction": null,
            "reason": "Increase military strength to prepare for a potential siege or defense.",
            "goal": "Military readiness and expansion capability"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "recruit_troops",
          "latency_ms": 2931.4141273498535,
          "error": ""
        },
        {
          "turn": 4,
          "state_before": {
            "turn": 4,
            "gold": 1250,
            "troops": 40,
            "renown": 180,
            "settlements": 0,
            "relations": {
              "vlandia": 0,
              "sturgia": 0,
              "empire": -80,
              "aserai": 0,
              "battania": 0
            }
          },
          "raw_response": "{\"action\": \"siege_castle\", \"target_faction\": \"empire\", \"reason\": \"Attempt to capture a settlement to meet the win condition of owning at least one settlement.\", \"goal\": \"Territorial expansion and victory condition\"}",
          "parsed": {
            "action": "siege_castle",
            "target_faction": "empire",
            "reason": "Attempt to capture a settlement to meet the win condition of owning at least one settlement.",
            "goal": "Territorial expansion and victory condition"
          },
          "valid_json": true,
          "valid_action": true,
          "action": "siege_castle",
          "latency_ms": 3413.828134536743,
          "error": ""
        }
      ],
      "final_state": {
        "turn": 4,
        "gold": 750,
        "troops": 40,
        "renown": 260,
        "settlements": 1,
        "relations": {
          "vlandia": 0,
          "sturgia": 0,
          "empire": -80,
          "aserai": 0,
          "battania": 0
        }
      },
      "passed": true,
      "score": 1.0,
      "reached_gold_target": false,
      "reached_renown_target": false,
      "reached_settlement_target": true,
      "latency_p50_ms": 3413.828134536743,
      "latency_p99_ms": 3970.310926437378
    }
  },
  "summary": {
    "total_elapsed_s": 52.1,
    "levels_run": [
      0,
      1,
      2,
      3,
      4,
      5
    ],
    "levels_passed": [
      0,
      2,
      3,
      4,
      5
    ],
    "levels_failed": [
      1
    ],
    "m1_gate_passed": false,
    "m1_gate_notes": [
      "Level 1 FAILED (score=0.50)"
    ],
    "m1_latency_threshold_ms": 10000
  }
 }
--- a/timmy-benchmark/run_benchmark.py
+++ b/timmy-benchmark/run_benchmark.py
@@ -0,0 +1,259 @@
 #!/usr/bin/env python3
 """Timmy Cognitive Benchmark Harness — Project Bannerlord M0.
 Runs a 6-level cognitive benchmark against an Ollama model to assess
 readiness for autonomous Bannerlord gameplay.
 Usage:
    python run_benchmark.py --model qwen2.5:14b --verbose
    python run_benchmark.py --model qwen3:14b --levels 0,1,2
    python run_benchmark.py --model qwen2.5:14b --output results/my_run.json
 """
 import argparse
 import dataclasses
 import json
 import os
 import sys
 import time
 from datetime import datetime, timezone
 from pathlib import Path
 try:
    import ollama
 except ImportError:
    print("ERROR: 'ollama' package not installed. Run: pip install ollama", file=sys.stderr)
    sys.exit(1)
 # Add parent dir to path so levels can be imported
 sys.path.insert(0, str(Path(__file__).parent))
 from levels import level_0_coin_flip
 from levels import level_1_tic_tac_toe
 from levels import level_2_resource_mgmt
 from levels import level_3_battle_tactics
 from levels import level_4_trade_route
 from levels import level_5_mini_campaign
 ALL_LEVELS = [
    level_0_coin_flip,
    level_1_tic_tac_toe,
    level_2_resource_mgmt,
    level_3_battle_tactics,
    level_4_trade_route,
    level_5_mini_campaign,
 ]
 # Pass criteria for M1 gate
 M1_GATE_LEVELS = {0, 1}  # Must pass Level 0 and Level 1
 M1_LATENCY_THRESHOLD_MS = 10_000  # < 10s per decision for L0-L1
 def _dataclass_to_dict(obj):
    """Recursively convert dataclass instances to dicts for JSON serialization."""
    if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
        return {k: _dataclass_to_dict(v) for k, v in dataclasses.asdict(obj).items()}
    if isinstance(obj, list):
        return [_dataclass_to_dict(i) for i in obj]
    if isinstance(obj, dict):
        return {k: _dataclass_to_dict(v) for k, v in obj.items()}
    return obj
 def check_model_available(model: str) -> bool:
    """Return True if the model is available in Ollama."""
    try:
        models = ollama.list()
        model_names = [m["model"] for m in models.get("models", [])]
        # Also check without tag
        base_model = model.split(":")[0]
        return any(
            m == model or m.startswith(base_model + ":") or m == base_model
            for m in model_names
        )
    except Exception:
        return False
 def run_benchmark(
    model: str,
    levels_to_run: list[int] | None = None,
    verbose: bool = False,
    skip_missing: bool = True,
 ) -> dict:
    """Run the benchmark and return a results dict."""
    if levels_to_run is None:
        levels_to_run = list(range(len(ALL_LEVELS)))
    print(f"\n{'=' * 60}")
    print(f"  Timmy Cognitive Benchmark — Project Bannerlord M0")
    print(f"{'=' * 60}")
    print(f"  Model:  {model}")
    print(f"  Levels: {levels_to_run}")
    print(f"  Time:   {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'=' * 60}\n")
    if not check_model_available(model):
        if skip_missing:
            print(f"  WARNING: Model '{model}' not found in Ollama. Skipping.\n")
            return {
                "model": model,
                "skipped": True,
                "reason": f"Model '{model}' not available",
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }
        else:
            print(f"  ERROR: Model '{model}' not found in Ollama.", file=sys.stderr)
            sys.exit(1)
    client = ollama
    results = {
        "model": model,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "skipped": False,
        "levels": {},
        "summary": {},
    }
    level_results = {}
    total_start = time.time()
    for level_idx in levels_to_run:
        if level_idx >= len(ALL_LEVELS):
            print(f"  WARNING: Level {level_idx} does not exist, skipping.")
            continue
        module = ALL_LEVELS[level_idx]
        print(f"Level {module.LEVEL}: {module.NAME}")
        print(f"  {module.DESCRIPTION}")
        try:
            level_result = module.run(client, model, verbose=verbose)
            level_results[level_idx] = level_result
            passed_str = "PASS" if level_result.passed else "FAIL"
            score_pct = f"{level_result.score * 100:.0f}%"
            lat_str = f"p50={level_result.latency_p50_ms:.0f}ms p99={level_result.latency_p99_ms:.0f}ms"
            print(f"  Result: {passed_str} | Score: {score_pct} | Latency: {lat_str}")
        except Exception as exc:
            print(f"  ERROR running level {level_idx}: {exc}")
            import traceback
            traceback.print_exc()
        print()
    total_elapsed_s = time.time() - total_start
    # Build summary
    m1_gate_passed = True
    m1_gate_notes = []
    for level_idx, lr in level_results.items():
        results["levels"][str(level_idx)] = _dataclass_to_dict(lr)
        if level_idx in M1_GATE_LEVELS:
            if not lr.passed:
                m1_gate_passed = False
                m1_gate_notes.append(f"Level {level_idx} FAILED (score={lr.score:.2f})")
            if lr.latency_p99_ms > M1_LATENCY_THRESHOLD_MS:
                m1_gate_passed = False
                m1_gate_notes.append(
                    f"Level {level_idx} latency too high "
                    f"(p99={lr.latency_p99_ms:.0f}ms > {M1_LATENCY_THRESHOLD_MS}ms)"
                )
    results["summary"] = {
        "total_elapsed_s": round(total_elapsed_s, 1),
        "levels_run": levels_to_run,
        "levels_passed": [i for i, lr in level_results.items() if lr.passed],
        "levels_failed": [i for i, lr in level_results.items() if not lr.passed],
        "m1_gate_passed": m1_gate_passed,
        "m1_gate_notes": m1_gate_notes,
        "m1_latency_threshold_ms": M1_LATENCY_THRESHOLD_MS,
    }
    # Print scorecard
    print(f"{'=' * 60}")
    print(f"  SCORECARD — {model}")
    print(f"{'=' * 60}")
    all_level_modules = {m.LEVEL: m for m in ALL_LEVELS}
    for level_idx in levels_to_run:
        if level_idx not in level_results:
            continue
        lr = level_results[level_idx]
        module = ALL_LEVELS[level_idx]
        passed_str = "✓ PASS" if lr.passed else "✗ FAIL"
        gate_str = " [M1 GATE]" if level_idx in M1_GATE_LEVELS else ""
        lat = f"{lr.latency_p50_ms:.0f}ms"
        print(f"  L{level_idx}: {passed_str}{gate_str} | {lr.score*100:.0f}% | {lat} | {module.NAME}")
    print(f"{'─' * 60}")
    gate_str = "✓ M1 GATE PASSED" if m1_gate_passed else "✗ M1 GATE FAILED"
    print(f"  {gate_str}")
    if m1_gate_notes:
        for note in m1_gate_notes:
            print(f"    → {note}")
    print(f"  Total time: {total_elapsed_s:.1f}s")
    print(f"{'=' * 60}\n")
    return results
 def main():
    parser = argparse.ArgumentParser(
        description="Timmy Cognitive Benchmark Harness — Project Bannerlord M0"
    )
    parser.add_argument("--model", required=True, help="Ollama model name (e.g. qwen2.5:14b)")
    parser.add_argument("--levels", default=None, help="Comma-separated level indices (default: all)")
    parser.add_argument("--verbose", action="store_true", help="Show per-trial details")
    parser.add_argument(
        "--output", default=None,
        help="Output JSON path (default: results/<model>_<timestamp>.json)"
    )
    parser.add_argument(
        "--skip-missing", action="store_true", default=True,
        help="Skip instead of error if model not available"
    )
    args = parser.parse_args()
    levels_to_run = None
    if args.levels:
        try:
            levels_to_run = [int(x.strip()) for x in args.levels.split(",")]
        except ValueError:
            print(f"ERROR: --levels must be comma-separated integers, got: {args.levels}", file=sys.stderr)
            sys.exit(1)
    results = run_benchmark(
        model=args.model,
        levels_to_run=levels_to_run,
        verbose=args.verbose,
        skip_missing=args.skip_missing,
    )
    # Save results
    if args.output:
        output_path = Path(args.output)
    else:
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        safe_model = args.model.replace(":", "_").replace("/", "_")
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = results_dir / f"{safe_model}_{ts}.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"Results saved to: {output_path}")
    # Exit with non-zero if M1 gate failed
    if not results.get("skipped") and not results.get("summary", {}).get("m1_gate_passed", True):
        sys.exit(1)
 if __name__ == "__main__":
    main()
		`@@ -0,0 +1 @@`
							`"""Cognitive benchmark levels for Project Bannerlord readiness testing."""`