[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)

Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
2026-03-23 19:38:48 +00:00
parent 6e65b53f3a
commit 9e08e87312
12 changed files with 3068 additions and 0 deletions
--- a/timmy-benchmark/levels/__init__.py
+++ b/timmy-benchmark/levels/__init__.py
@@ -0,0 +1 @@
+"""Cognitive benchmark levels for Project Bannerlord readiness testing."""
--- a/timmy-benchmark/levels/level_0_coin_flip.py
+++ b/timmy-benchmark/levels/level_0_coin_flip.py
@@ -0,0 +1,183 @@
+"""Level 0: JSON Compliance — Coin Flip.
+
+Tests whether the model can reliably return well-formed JSON responses
+with a specific schema. This is the minimum bar for GABS tool calls.
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+# Level metadata; LEVEL and NAME are the field defaults of LevelResult below.
+LEVEL = 0
+NAME = "JSON Compliance (Coin Flip)"
+DESCRIPTION = "Model must return valid JSON matching a strict schema on each trial."
+
+# Sent as the system message with every trial in run().
+SYSTEM_PROMPT = """You are a strategic AI agent. You MUST respond ONLY with valid JSON.
+No markdown, no explanation, no code fences. Raw JSON only."""
+
+# Each trial provides:
+#   "prompt"       - user message sent to the model
+#   "schema"       - required key -> Python type, checked by _validate_schema()
+#   "valid_values" - key -> allowed values, checked by _validate_values()
+TRIALS = [
+    {
+        "prompt": (
+            'A coin is flipped. Respond with exactly: {"choice": "heads"} or {"choice": "tails"}. '
+            "Pick one. JSON only."
+        ),
+        "schema": {"choice": str},
+        "valid_values": {"choice": ["heads", "tails"]},
+    },
+    {
+        "prompt": (
+            'You must attack or defend. Respond with: {"action": "attack", "confidence": 0.8} '
+            'or {"action": "defend", "confidence": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.'
+        ),
+        # NOTE(review): the 0.0-1.0 range requested in the prompt is not
+        # enforced; only the type of "confidence" is checked.
+        "schema": {"action": str, "confidence": float},
+        "valid_values": {"action": ["attack", "defend"]},
+    },
+    {
+        "prompt": (
+            'Choose a direction to march. Respond with exactly: '
+            '{"direction": "north", "reason": "string explaining why"}. '
+            "Pick north/south/east/west. JSON only."
+        ),
+        "schema": {"direction": str, "reason": str},
+        "valid_values": {"direction": ["north", "south", "east", "west"]},
+    },
+]
+
+
+@dataclass
+class TrialResult:
+    """Outcome of a single Level 0 trial, as recorded by run()."""
+
+    trial_index: int  # position of the trial in TRIALS
+    prompt: str  # user prompt sent to the model
+    raw_response: str  # model output before fence stripping; "" if the chat call raised
+    parsed: dict | None  # json.loads() result, or None when nothing was parsed
+    valid_json: bool  # True when the cleaned response parsed as JSON
+    schema_valid: bool  # result of _validate_schema()
+    value_valid: bool  # result of _validate_values()
+    latency_ms: float  # elapsed time of the chat call, in milliseconds
+    error: str = ""  # exception text for a failed call or a JSON decode error
+
+
+@dataclass
+class LevelResult:
+    """Aggregate result for Level 0, filled in by run()."""
+
+    level: int = LEVEL
+    name: str = NAME
+    trials: list[TrialResult] = field(default_factory=list)  # one entry per trial
+    passed: bool = False  # pass/fail verdict computed by run()
+    score: float = 0.0  # fraction of trials with valid JSON and a valid schema
+    latency_p50_ms: float = 0.0  # element at index len // 2 of the sorted latencies
+    latency_p99_ms: float = 0.0  # run() stores the largest observed latency here
+
+
+def _validate_schema(parsed: dict, schema: dict[str, type]) -> bool:
+    """Return True when *parsed* is a JSON object matching *schema*.
+
+    Args:
+        parsed: Value produced by ``json.loads``. Anything other than a dict
+            (a bare number, string, list, ...) is rejected instead of raising.
+        schema: Mapping of required key -> expected Python type.
+
+    Returns:
+        True if every key in *schema* is present with a value of the expected
+        type. An int is accepted where a float is expected. A bool is never
+        accepted for an int or float field, even though ``bool`` subclasses
+        ``int``, because JSON ``true``/``false`` are not numbers.
+    """
+    if not isinstance(parsed, dict):
+        return False
+    for key, expected_type in schema.items():
+        if key not in parsed:
+            return False
+        value = parsed[key]
+        if isinstance(value, bool) and expected_type in (int, float):
+            return False
+        if isinstance(value, expected_type):
+            continue
+        # Allow int where float is expected (e.g. a confidence of 1).
+        if expected_type is float and isinstance(value, int):
+            continue
+        return False
+    return True
+
+
+def _validate_values(parsed: dict, valid_values: dict[str, list]) -> bool:
+    """Return True when every constrained key present in *parsed* holds an allowed value.
+
+    Args:
+        parsed: Value produced by ``json.loads``. Anything other than a dict
+            is rejected instead of raising.
+        valid_values: Mapping of key -> list of allowed values.
+
+    Returns:
+        False if *parsed* is not a dict or any constrained key holds a value
+        outside its list. Missing keys are not an error here; key presence is
+        the schema check's job.
+    """
+    if not isinstance(parsed, dict):
+        return False
+    return all(
+        parsed[key] in allowed
+        for key, allowed in valid_values.items()
+        if key in parsed
+    )
+
+
+def _clean_response(raw: str) -> str:
+    """Return *raw* trimmed, with markdown fence lines dropped if it opens with one.
+
+    Only responses whose first non-whitespace characters are a fence are
+    altered; in that case every line that starts with a fence is removed.
+    """
+    text = raw.strip()
+    if not text.startswith("```"):
+        return text
+    kept = [line for line in text.splitlines() if not line.startswith("```")]
+    return "\n".join(kept).strip()
+
+
+def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
+    """Run every trial in TRIALS against *model* and score JSON compliance.
+
+    Args:
+        client: Chat client exposing ``chat(model=, messages=, options=)`` whose
+            return value supports ``response["message"]["content"]``.
+        model: Model name forwarded to ``client.chat``.
+        verbose: When true, print a one-line summary per trial.
+
+    Returns:
+        A LevelResult holding one TrialResult per trial. ``score`` is the
+        fraction of trials that produced valid JSON with a valid schema
+        (``value_valid`` is recorded but does not affect the score), and
+        ``passed`` requires every trial to succeed.
+    """
+    result = LevelResult()
+    latencies = []
+
+    for i, trial in enumerate(TRIALS):
+        # perf_counter() is monotonic, so a wall-clock adjustment during the
+        # call cannot distort the measured latency.
+        t0 = time.perf_counter()
+        try:
+            response = client.chat(
+                model=model,
+                messages=[
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": trial["prompt"]},
+                ],
+                options={"temperature": 0.1},
+            )
+            raw = response["message"]["content"]
+            latency_ms = (time.perf_counter() - t0) * 1000
+        except Exception as exc:
+            # Any client failure is recorded as a failed trial so the
+            # remaining trials still run.
+            latency_ms = (time.perf_counter() - t0) * 1000
+            tr = TrialResult(
+                trial_index=i,
+                prompt=trial["prompt"],
+                raw_response="",
+                parsed=None,
+                valid_json=False,
+                schema_valid=False,
+                value_valid=False,
+                latency_ms=latency_ms,
+                error=str(exc),
+            )
+            result.trials.append(tr)
+            if verbose:
+                print(f"  Trial {i}: ERROR — {exc}")
+            continue
+
+        # Only successful calls contribute to the latency percentiles.
+        latencies.append(latency_ms)
+
+        cleaned = _clean_response(raw)
+        parsed = None
+        valid_json = False
+        schema_valid = False
+        value_valid = False
+        error = ""
+
+        try:
+            parsed = json.loads(cleaned)
+        except json.JSONDecodeError as exc:
+            error = f"JSONDecodeError: {exc}"
+        else:
+            valid_json = True
+            if isinstance(parsed, dict):
+                schema_valid = _validate_schema(parsed, trial["schema"])
+                value_valid = _validate_values(parsed, trial["valid_values"])
+            else:
+                # Valid JSON that is not an object (e.g. 5 or "heads") cannot
+                # match the schema; record why instead of crashing the level.
+                error = f"Expected a JSON object, got {type(parsed).__name__}"
+
+        tr = TrialResult(
+            trial_index=i,
+            prompt=trial["prompt"],
+            raw_response=raw,
+            parsed=parsed,
+            valid_json=valid_json,
+            schema_valid=schema_valid,
+            value_valid=value_valid,
+            latency_ms=latency_ms,
+            error=error,
+        )
+        result.trials.append(tr)
+
+        if verbose:
+            status = "PASS" if (valid_json and schema_valid) else "FAIL"
+            print(
+                f"  Trial {i}: {status} | json={valid_json} schema={schema_valid} "
+                f"value={value_valid} | {latency_ms:.0f}ms | {raw[:80]!r}"
+            )
+
+    passed_trials = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
+    result.score = passed_trials / len(TRIALS)
+    result.passed = result.score >= 1.0  # Must pass all 3 trials
+
+    if latencies:
+        latencies_sorted = sorted(latencies)
+        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
+        # With only a handful of samples the p99 is reported as the maximum.
+        result.latency_p99_ms = latencies_sorted[-1]
+
+    return result
--- a/timmy-benchmark/levels/level_1_tic_tac_toe.py
+++ b/timmy-benchmark/levels/level_1_tic_tac_toe.py
@@ -0,0 +1,211 @@
+"""Level 1: Board State Tracking — Tic-Tac-Toe.
+
+Tests whether the model can maintain game state across turns, select
+legal moves, and exhibit basic strategic awareness.
+Maps to: Bannerlord board state / campaign map tracking.
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+# Level metadata; LEVEL and NAME are the field defaults of LevelResult below.
+LEVEL = 1
+NAME = "Board State Tracking (Tic-Tac-Toe)"
+DESCRIPTION = "Model must track a tic-tac-toe board and make legal, strategic moves."
+
+# System message sent with every scenario; it defines the 0-8 cell numbering.
+SYSTEM_PROMPT = """You are a strategic AI playing tic-tac-toe. The board is a 3x3 grid.
+Positions are numbered 0-8 left-to-right, top-to-bottom:
+  0|1|2
+  3|4|5
+  6|7|8
+
+You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only.
+Format: {"move": <position 0-8>, "reason": "<brief reason>"}"""
+
+
+# Each scenario provides:
+#   "description" - label stored on the result and shown in verbose output
+#   "board"       - 9-element list (None = empty cell) used by run() for the legality check
+#   "player"      - side named in the prompt; not read by run()
+#   "prompt"      - user message sent to the model
+#   "check"       - callable(move, board) -> bool; its result is stored as move_correct
+#   "check_desc"  - printed in verbose mode when the move was not correct
+SCENARIOS = [
+    {
+        "description": "Empty board — opening move",
+        "board": [None, None, None, None, None, None, None, None, None],
+        "player": "X",
+        "prompt": (
+            'Board state: [null,null,null,null,null,null,null,null,null]. '
+            'You are X. It is your turn. Choose a move. '
+            'Respond: {"move": <0-8>, "reason": "<why>"}'
+        ),
+        "check": lambda move, board: move in range(9) and board[move] is None,
+        "check_desc": "Move must be a valid empty position (0-8)",
+    },
+    {
+        "description": "Block opponent's winning move",
+        "board": ["O", None, "O", None, "X", None, None, None, None],
+        "player": "X",
+        "prompt": (
+            'Board: ["O",null,"O",null,"X",null,null,null,null]. '
+            "O has positions 0 and 2. You are X. "
+            "O will win on next turn unless you block. "
+            'Respond: {"move": <0-8>, "reason": "<why>"}'
+        ),
+        "check": lambda move, board: move == 1,  # Must block at position 1
+        "check_desc": "Must block O's win at position 1",
+    },
+    {
+        "description": "Take winning move",
+        "board": ["X", None, "X", None, "O", None, None, "O", None],
+        "player": "X",
+        "prompt": (
+            'Board: ["X",null,"X",null,"O",null,null,"O",null]. '
+            "You are X. You have positions 0 and 2. "
+            "You can win this turn. "
+            'Respond: {"move": <0-8>, "reason": "<why>"}'
+        ),
+        "check": lambda move, board: move == 1,  # Win at position 1
+        "check_desc": "Must take winning move at position 1",
+    },
+    {
+        "description": "Legal move on partially filled board",
+        "board": ["X", "O", "X", "O", "X", "O", None, None, None],
+        "player": "O",
+        "prompt": (
+            'Board: ["X","O","X","O","X","O",null,null,null]. '
+            "You are O. Choose a legal move (positions 6, 7, or 8 are available). "
+            'Respond: {"move": <0-8>, "reason": "<why>"}'
+        ),
+        "check": lambda move, board: move in [6, 7, 8],
+        "check_desc": "Move must be one of the empty positions: 6, 7, or 8",
+    },
+]
+
+
+@dataclass
+class ScenarioResult:
+    """Outcome of a single Level 1 scenario, as recorded by run()."""
+
+    scenario_index: int  # position of the scenario in SCENARIOS
+    description: str  # the scenario's "description" label
+    prompt: str  # user prompt sent to the model
+    raw_response: str  # model output before fence stripping; "" if the chat call raised
+    parsed: dict | None  # json.loads() result, or None when nothing was parsed
+    valid_json: bool  # True when the cleaned response parsed as JSON
+    move_legal: bool  # move is within 0-8 and targets an empty cell of the scenario board
+    move_correct: bool  # result of the scenario's "check" callable
+    latency_ms: float  # elapsed time of the chat call, in milliseconds
+    error: str = ""  # exception text for a failed call or a JSON decode error
+
+
+@dataclass
+class LevelResult:
+    """Aggregate result for Level 1, filled in by run()."""
+
+    level: int = LEVEL
+    name: str = NAME
+    trials: list[ScenarioResult] = field(default_factory=list)  # one entry per scenario
+    passed: bool = False  # pass/fail verdict computed by run()
+    score: float = 0.0  # fraction of scenarios with valid JSON and a legal move
+    latency_p50_ms: float = 0.0  # element at index len // 2 of the sorted latencies
+    latency_p99_ms: float = 0.0  # run() stores the largest observed latency here
+
+
+def _clean_response(raw: str) -> str:
+    """Trim *raw*; if it opens with a markdown fence, drop every fence line."""
+    stripped = raw.strip()
+    if stripped.startswith("```"):
+        body_lines = []
+        for line in stripped.splitlines():
+            if not line.startswith("```"):
+                body_lines.append(line)
+        stripped = "\n".join(body_lines).strip()
+    return stripped
+
+
+def _coerce_move(value: Any) -> int | None:
+    """Return *value* as an integer board index, or None if it is not one.
+
+    Accepts real ints and strings holding an optional minus sign followed by
+    ASCII digits (some models emit "4" instead of 4). Booleans are rejected:
+    ``bool`` subclasses ``int``, so JSON ``true`` would otherwise be read as
+    position 1.
+    """
+    if isinstance(value, bool):
+        return None
+    if isinstance(value, int):
+        return value
+    if isinstance(value, str):
+        text = value.strip()
+        digits = text[1:] if text.startswith("-") else text
+        # isascii() excludes characters such as "²" that satisfy isdigit()
+        # but make int() raise ValueError; slicing off a single "-" rejects
+        # inputs like "--4" for the same reason.
+        if digits.isascii() and digits.isdigit():
+            return int(text)
+    return None
+
+
+def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
+    """Run every scenario in SCENARIOS against *model* and score move legality.
+
+    Args:
+        client: Chat client exposing ``chat(model=, messages=, options=)`` whose
+            return value supports ``response["message"]["content"]``.
+        model: Model name forwarded to ``client.chat``.
+        verbose: When true, print a summary line per scenario.
+
+    Returns:
+        A LevelResult holding one ScenarioResult per scenario. ``score`` is the
+        fraction of scenarios answered with valid JSON and a legal move;
+        ``passed`` requires all of them. ``move_correct`` is recorded per
+        scenario but does not affect the score.
+    """
+    result = LevelResult()
+    latencies = []
+
+    for i, scenario in enumerate(SCENARIOS):
+        # perf_counter() is monotonic, so a wall-clock adjustment during the
+        # call cannot distort the measured latency.
+        t0 = time.perf_counter()
+        try:
+            response = client.chat(
+                model=model,
+                messages=[
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": scenario["prompt"]},
+                ],
+                options={"temperature": 0.1},
+            )
+            raw = response["message"]["content"]
+            latency_ms = (time.perf_counter() - t0) * 1000
+        except Exception as exc:
+            # Any client failure is recorded as a failed scenario so the
+            # remaining scenarios still run.
+            latency_ms = (time.perf_counter() - t0) * 1000
+            sr = ScenarioResult(
+                scenario_index=i,
+                description=scenario["description"],
+                prompt=scenario["prompt"],
+                raw_response="",
+                parsed=None,
+                valid_json=False,
+                move_legal=False,
+                move_correct=False,
+                latency_ms=latency_ms,
+                error=str(exc),
+            )
+            result.trials.append(sr)
+            if verbose:
+                print(f"  Scenario {i}: ERROR — {exc}")
+            continue
+
+        # Only successful calls contribute to the latency percentiles.
+        latencies.append(latency_ms)
+
+        cleaned = _clean_response(raw)
+        parsed = None
+        valid_json = False
+        move_legal = False
+        move_correct = False
+        error = ""
+
+        try:
+            parsed = json.loads(cleaned)
+        except json.JSONDecodeError as exc:
+            error = f"JSONDecodeError: {exc}"
+        else:
+            valid_json = True
+            if isinstance(parsed, dict):
+                move = _coerce_move(parsed.get("move"))
+                if move is not None:
+                    board = scenario["board"]
+                    move_legal = 0 <= move <= 8 and board[move] is None
+                    move_correct = scenario["check"](move, board)
+            else:
+                # Valid JSON that is not an object (e.g. a bare 4) carries no
+                # "move" key; record why instead of crashing the level.
+                error = f"Expected a JSON object, got {type(parsed).__name__}"
+
+        sr = ScenarioResult(
+            scenario_index=i,
+            description=scenario["description"],
+            prompt=scenario["prompt"],
+            raw_response=raw,
+            parsed=parsed,
+            valid_json=valid_json,
+            move_legal=move_legal,
+            move_correct=move_correct,
+            latency_ms=latency_ms,
+            error=error,
+        )
+        result.trials.append(sr)
+
+        if verbose:
+            status = "PASS" if (valid_json and move_legal) else "FAIL"
+            correct_str = "CORRECT" if move_correct else "suboptimal"
+            move_val = parsed.get("move", "?") if isinstance(parsed, dict) else "?"
+            print(
+                f"  Scenario {i} [{scenario['description']}]: {status} ({correct_str}) "
+                f"| move={move_val} | {latency_ms:.0f}ms"
+            )
+            if not move_correct and valid_json:
+                print(f"    Expected: {scenario['check_desc']}")
+
+    # Pass criteria: all moves must be valid JSON + legal
+    legal_moves = sum(1 for t in result.trials if t.valid_json and t.move_legal)
+    result.score = legal_moves / len(SCENARIOS)
+    result.passed = result.score >= 1.0
+
+    if latencies:
+        latencies_sorted = sorted(latencies)
+        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
+        # With only a handful of samples the p99 is reported as the maximum.
+        result.latency_p99_ms = latencies_sorted[-1]
+
+    return result
--- a/timmy-benchmark/levels/level_2_resource_mgmt.py
+++ b/timmy-benchmark/levels/level_2_resource_mgmt.py
@@ -0,0 +1,213 @@
+"""Level 2: Resource Management — Party Economy.
+
+Tests whether the model can allocate limited resources across competing
+priorities and adapt when constraints change.
+Maps to: Bannerlord party economy (troops, food, gold, morale).
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+# Level metadata; LEVEL and NAME are the field defaults of LevelResult below.
+LEVEL = 2
+NAME = "Resource Management (Party Economy)"
+DESCRIPTION = "Model must allocate limited resources across troops, food, and equipment."
+
+# Sent as the system message with every scenario in run().
+SYSTEM_PROMPT = """You are a Bannerlord campaign advisor managing a party.
+Resources are limited — every decision has trade-offs.
+You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only."""
+
+# Each scenario provides:
+#   "description"     - label stored on the result and shown in verbose output
+#   "prompt"          - user message sent to the model
+#   "check"           - callable(parsed) -> bool; its result is stored as schema_valid
+#   "strategic_check" - callable(parsed) -> bool; only evaluated when "check" passed
+#   "check_desc" / "strategic_desc" - hints printed in verbose mode on failure
+SCENARIOS = [
+    {
+        "description": "Budget allocation under constraint",
+        "prompt": (
+            "You have 500 gold. Options:\n"
+            "- Recruit 10 infantry: costs 300 gold, +10 combat strength\n"
+            "- Buy food for 20 days: costs 200 gold, keeps morale stable\n"
+            "- Repair armor: costs 150 gold, -20% casualty rate\n\n"
+            "You cannot afford all three. Morale is currently CRITICAL (troops may desert).\n"
+            'Choose 1-2 options. Respond: {"choices": ["option_a", ...], "gold_spent": <int>, "reason": "<why>"}\n'
+            "Where option keys are: recruit_infantry, buy_food, repair_armor"
+        ),
+        # NOTE(review): this check does not cap len(choices) at 2 and does not
+        # compare gold_spent with the summed cost of the chosen options.
+        "check": lambda r: (
+            isinstance(r.get("choices"), list)
+            and len(r["choices"]) >= 1
+            and all(c in ["recruit_infantry", "buy_food", "repair_armor"] for c in r["choices"])
+            and isinstance(r.get("gold_spent"), (int, float))
+            and r.get("gold_spent", 9999) <= 500
+        ),
+        "check_desc": "choices must be valid options, gold_spent <= 500",
+        "strategic_check": lambda r: "buy_food" in r.get("choices", []),
+        "strategic_desc": "With CRITICAL morale, food should be prioritized",
+    },
+    {
+        "description": "Troop tier upgrade decision",
+        "prompt": (
+            "Party status:\n"
+            "- 15 Tier-1 recruits (weak, 30 upkeep/day)\n"
+            "- 5 Tier-3 veterans (strong, 90 upkeep/day)\n"
+            "- Daily income: 200 gold\n"
+            "- Upcoming: raider camp attack (moderate difficulty)\n\n"
+            "Options:\n"
+            "- Upgrade 5 recruits to Tier-2 (costs 250 gold total)\n"
+            "- Keep all current troops, save gold for emergencies\n"
+            "- Dismiss 5 recruits to save upkeep\n\n"
+            'Respond: {"action": "upgrade_recruits"|"save_gold"|"dismiss_recruits", '
+            '"reason": "<why>", "expected_outcome": "<string>"}'
+        ),
+        # "expected_outcome" is requested in the prompt but not validated.
+        "check": lambda r: (
+            r.get("action") in ["upgrade_recruits", "save_gold", "dismiss_recruits"]
+            and isinstance(r.get("reason"), str)
+            and len(r.get("reason", "")) > 0
+        ),
+        "check_desc": "action must be one of the three options with a non-empty reason",
+        "strategic_check": lambda r: r.get("action") in ["upgrade_recruits", "save_gold"],
+        "strategic_desc": "Dismissing troops before a fight is suboptimal",
+    },
+    {
+        "description": "Multi-turn planning horizon",
+        "prompt": (
+            "Current: 300 gold, 10 days of food, 20 troops\n"
+            "Day 5: Must cross desert (costs 5 extra food days)\n"
+            "Day 10: Reach town (can buy supplies)\n\n"
+            "You need a 15-day food reserve to survive the journey.\n"
+            "Food costs 10 gold/day. You have enough for 10 days now.\n\n"
+            "How many extra food days do you buy today?\n"
+            'Respond: {"extra_food_days": <int>, "cost": <int>, "remaining_gold": <int>, "reason": "<why>"}'
+        ),
+        # Only the types of the three numeric fields are checked; their
+        # arithmetic consistency (cost, remaining_gold) is not.
+        "check": lambda r: (
+            isinstance(r.get("extra_food_days"), (int, float))
+            and isinstance(r.get("cost"), (int, float))
+            and isinstance(r.get("remaining_gold"), (int, float))
+        ),
+        "check_desc": "Must include extra_food_days, cost, remaining_gold as numbers",
+        "strategic_check": lambda r: r.get("extra_food_days", 0) >= 5,
+        "strategic_desc": "Need at least 5 more days of food for desert crossing",
+    },
+]
+
+
+@dataclass
+class ScenarioResult:
+    """Outcome of a single Level 2 scenario, as recorded by run()."""
+
+    scenario_index: int  # position of the scenario in SCENARIOS
+    description: str  # the scenario's "description" label
+    raw_response: str  # model output before fence stripping; "" if the chat call raised
+    parsed: dict | None  # json.loads() result, or None when nothing was parsed
+    valid_json: bool  # True when the cleaned response parsed as JSON
+    schema_valid: bool  # result of the scenario's "check" callable
+    strategically_sound: bool  # result of "strategic_check"; stays False unless schema_valid
+    latency_ms: float  # elapsed time of the chat call, in milliseconds
+    error: str = ""  # exception text for a failed call, decode error, or validation error
+
+
+@dataclass
+class LevelResult:
+    """Aggregate result for Level 2, filled in by run()."""
+
+    level: int = LEVEL
+    name: str = NAME
+    trials: list[ScenarioResult] = field(default_factory=list)  # one entry per scenario
+    passed: bool = False  # pass/fail verdict computed by run()
+    score: float = 0.0  # fraction of scenarios with valid JSON and a valid schema
+    latency_p50_ms: float = 0.0  # element at index len // 2 of the sorted latencies
+    latency_p99_ms: float = 0.0  # run() stores the largest observed latency here
+
+
+def _clean_response(raw: str) -> str:
+    """Trim *raw*; if it opens with a markdown fence, drop every fence line."""
+    trimmed = raw.strip()
+    if not trimmed.startswith("```"):
+        return trimmed
+    content = (line for line in trimmed.splitlines() if not line.startswith("```"))
+    return "\n".join(content).strip()
+
+
+def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
+    """Run every scenario in SCENARIOS against *model* and score schema validity.
+
+    Args:
+        client: Chat client exposing ``chat(model=, messages=, options=)`` whose
+            return value supports ``response["message"]["content"]``.
+        model: Model name forwarded to ``client.chat``.
+        verbose: When true, print a summary line (plus hints) per scenario.
+
+    Returns:
+        A LevelResult holding one ScenarioResult per scenario. ``score`` is the
+        fraction of scenarios with valid JSON that passed the scenario's
+        "check"; ``passed`` requires at least two thirds of them.
+        ``strategically_sound`` is recorded but does not affect the score.
+    """
+    result = LevelResult()
+    latencies = []
+
+    for i, scenario in enumerate(SCENARIOS):
+        # perf_counter() is monotonic, so a wall-clock adjustment during the
+        # call cannot distort the measured latency.
+        t0 = time.perf_counter()
+        try:
+            response = client.chat(
+                model=model,
+                messages=[
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": scenario["prompt"]},
+                ],
+                options={"temperature": 0.1},
+            )
+            raw = response["message"]["content"]
+            latency_ms = (time.perf_counter() - t0) * 1000
+        except Exception as exc:
+            # Any client failure is recorded as a failed scenario so the
+            # remaining scenarios still run.
+            latency_ms = (time.perf_counter() - t0) * 1000
+            sr = ScenarioResult(
+                scenario_index=i,
+                description=scenario["description"],
+                raw_response="",
+                parsed=None,
+                valid_json=False,
+                schema_valid=False,
+                strategically_sound=False,
+                latency_ms=latency_ms,
+                error=str(exc),
+            )
+            result.trials.append(sr)
+            if verbose:
+                print(f"  Scenario {i}: ERROR — {exc}")
+            continue
+
+        # Only successful calls contribute to the latency percentiles.
+        latencies.append(latency_ms)
+
+        cleaned = _clean_response(raw)
+        parsed = None
+        valid_json = False
+        schema_valid = False
+        strategically_sound = False
+        error = ""
+
+        try:
+            parsed = json.loads(cleaned)
+            valid_json = True
+            schema_valid = scenario["check"](parsed)
+            if schema_valid:
+                strategically_sound = scenario["strategic_check"](parsed)
+        except json.JSONDecodeError as exc:
+            error = f"JSONDecodeError: {exc}"
+        except Exception as exc:
+            # The check callables assume a dict; a non-object JSON value or an
+            # unexpected field type surfaces here as a failed validation.
+            error = f"Validation error: {exc}"
+
+        sr = ScenarioResult(
+            scenario_index=i,
+            description=scenario["description"],
+            raw_response=raw,
+            parsed=parsed,
+            valid_json=valid_json,
+            schema_valid=schema_valid,
+            strategically_sound=strategically_sound,
+            latency_ms=latency_ms,
+            error=error,
+        )
+        result.trials.append(sr)
+
+        if verbose:
+            status = "PASS" if (valid_json and schema_valid) else "FAIL"
+            strat = "strategic" if strategically_sound else "suboptimal"
+            print(
+                f"  Scenario {i} [{scenario['description']}]: {status} ({strat}) "
+                f"| {latency_ms:.0f}ms"
+            )
+            if not schema_valid and valid_json:
+                print(f"    Schema issue: {scenario['check_desc']}")
+            if not strategically_sound and schema_valid:
+                print(f"    Strategy note: {scenario['strategic_desc']}")
+
+    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
+    result.score = valid_count / len(SCENARIOS)
+    # Pass at two thirds of the scenarios. Compared in integers because
+    # 2 / 3 == 0.666... is below the literal 0.67, which made 2-of-3 fail.
+    result.passed = valid_count * 3 >= len(SCENARIOS) * 2
+
+    if latencies:
+        latencies_sorted = sorted(latencies)
+        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
+        # With only a handful of samples the p99 is reported as the maximum.
+        result.latency_p99_ms = latencies_sorted[-1]
+
+    return result
--- a/timmy-benchmark/levels/level_3_battle_tactics.py
+++ b/timmy-benchmark/levels/level_3_battle_tactics.py
@@ -0,0 +1,216 @@
+"""Level 3: Battle Tactics — Formation Commands.
+
+Tests whether the model can issue coherent formation and tactical orders
+under simulated battlefield pressure with multiple unit types.
+Maps to: Bannerlord formation commands (charge, shield wall, skirmish, etc.).
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+# Level metadata; LEVEL and NAME are the field defaults of LevelResult below.
+LEVEL = 3
+NAME = "Battle Tactics (Formation Commands)"
+DESCRIPTION = "Model must issue tactically sound formation orders under simulated battle conditions."
+
+# System message sent with every scenario; it lists the command vocabulary
+# that the "check" callables below accept.
+SYSTEM_PROMPT = """You are a Bannerlord battle commander. Issue formation orders using these commands:
+- shield_wall: infantry forms defensive line (good vs ranged, slow advance)
+- charge: all-out attack (high casualties, breaks weak enemies fast)
+- skirmish: ranged units pepper enemy (good vs heavy infantry, needs distance)
+- advance: move forward holding formation (balanced)
+- flank_left / flank_right: cavalry sweeps around enemy side
+- fallback: retreat to regroup (when badly outnumbered)
+
+You MUST respond ONLY with valid JSON. No markdown. Raw JSON only."""
+
+# Each scenario provides:
+#   "description"     - label stored on the result and shown in verbose output
+#   "prompt"          - user message sent to the model
+#   "check"           - callable(parsed) -> bool; its result is stored as schema_valid
+#   "check_desc"      - printed in verbose mode when "check" fails on valid JSON
+#   "strategic_check" - callable(parsed) -> bool; only evaluated when "check" passed
+#   "strategic_desc"  - human-readable statement of the expected play
+SCENARIOS = [
+    {
+        "description": "Ranged vs infantry — defensive opening",
+        "prompt": (
+            "Situation: You have 20 archers + 10 infantry. Enemy has 30 heavy infantry, no ranged.\n"
+            "Enemy is 200m away and advancing.\n"
+            "Objective: Maximize casualties before melee contact.\n\n"
+            'Issue orders for both units. Respond:\n'
+            '{"infantry_order": "<command>", "archer_order": "<command>", '
+            '"reason": "<tactical reasoning>", "expected_outcome": "<string>"}'
+        ),
+        # "expected_outcome" is requested in the prompt but not validated.
+        "check": lambda r: (
+            r.get("infantry_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
+            and r.get("archer_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
+            and isinstance(r.get("reason"), str)
+        ),
+        "check_desc": "Both orders must be valid commands",
+        "strategic_check": lambda r: (
+            r.get("archer_order") == "skirmish"
+            and r.get("infantry_order") in ["shield_wall", "advance"]
+        ),
+        "strategic_desc": "Archers should skirmish while infantry holds (shield_wall or advance)",
+    },
+    {
+        "description": "Outnumbered — retreat decision",
+        "prompt": (
+            "Situation: Your party (15 troops) has been ambushed.\n"
+            "Enemy: 60 bandits, surrounding you on 3 sides.\n"
+            "Your troops: 40% wounded. One escape route to the east.\n\n"
+            'What is your command? Respond:\n'
+            '{"order": "<command>", "direction": "east"|"west"|"north"|"south"|null, '
+            '"reason": "<tactical reasoning>", "priority": "preserve_troops"|"fight_through"}'
+        ),
+        # "direction" and "reason" are requested in the prompt but not validated.
+        "check": lambda r: (
+            r.get("order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
+            and r.get("priority") in ["preserve_troops", "fight_through"]
+        ),
+        "check_desc": "order and priority must be valid values",
+        "strategic_check": lambda r: (
+            r.get("order") == "fallback"
+            and r.get("priority") == "preserve_troops"
+        ),
+        "strategic_desc": "Outnumbered 4:1 with wounded troops — fallback is the sound choice",
+    },
+    {
+        "description": "Cavalry flanking opportunity",
+        "prompt": (
+            "Situation: Main battle is engaged. Your infantry and enemy infantry are locked.\n"
+            "You have 8 cavalry in reserve. Enemy left flank is unprotected.\n"
+            "If cavalry hits the flank now, it will route enemy in ~30 seconds.\n\n"
+            'Order for cavalry: Respond:\n'
+            '{"cavalry_order": "<command>", "timing": "now"|"wait", '
+            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
+        ),
+        "check": lambda r: (
+            r.get("cavalry_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
+            and r.get("timing") in ["now", "wait"]
+            and r.get("risk") in ["low", "medium", "high"]
+        ),
+        "check_desc": "cavalry_order, timing, and risk must be valid values",
+        # Either flank direction or a charge counts as strategic here.
+        "strategic_check": lambda r: (
+            r.get("cavalry_order") in ["flank_left", "flank_right", "charge"]
+            and r.get("timing") == "now"
+        ),
+        "strategic_desc": "Should capitalize on the flank opportunity immediately",
+    },
+]
+
+
+@dataclass
+class ScenarioResult:
+    """Outcome of a single Level 3 scenario, as recorded by run()."""
+
+    scenario_index: int  # position of the scenario in SCENARIOS
+    description: str  # the scenario's "description" label
+    raw_response: str  # model output before fence stripping; "" if the chat call raised
+    parsed: dict | None  # json.loads() result, or None when nothing was parsed
+    valid_json: bool  # True when the cleaned response parsed as JSON
+    schema_valid: bool  # result of the scenario's "check" callable
+    strategically_sound: bool  # result of "strategic_check"; stays False unless schema_valid
+    latency_ms: float  # elapsed time of the chat call, in milliseconds
+    error: str = ""  # exception text for a failed call, decode error, or validation error
+
+
+@dataclass
+class LevelResult:
+    """Aggregate result for Level 3, filled in by run()."""
+
+    level: int = LEVEL
+    name: str = NAME
+    trials: list[ScenarioResult] = field(default_factory=list)  # one entry per scenario
+    passed: bool = False  # pass/fail verdict computed by run()
+    score: float = 0.0  # fraction of scenarios with valid JSON and a valid schema
+    latency_p50_ms: float = 0.0  # element at index len // 2 of the sorted latencies
+    latency_p99_ms: float = 0.0  # run() stores the largest observed latency here
+
+
+def _clean_response(raw: str) -> str:
+    """Trim *raw*; if it opens with a markdown fence, drop every fence line."""
+    fence = "```"
+    text = raw.strip()
+    if text.startswith(fence):
+        unfenced = filter(lambda line: not line.startswith(fence), text.splitlines())
+        text = "\n".join(unfenced).strip()
+    return text
+
+
+def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
+    """Run every scenario in SCENARIOS against *model* and score schema validity.
+
+    Args:
+        client: Chat client exposing ``chat(model=, messages=, options=)`` whose
+            return value supports ``response["message"]["content"]``.
+        model: Model name forwarded to ``client.chat``.
+        verbose: When true, print a summary line (plus hints) per scenario.
+
+    Returns:
+        A LevelResult holding one ScenarioResult per scenario. ``score`` is the
+        fraction of scenarios with valid JSON that passed the scenario's
+        "check"; ``passed`` requires at least two thirds of them.
+        ``strategically_sound`` is recorded but does not affect the score.
+    """
+    result = LevelResult()
+    latencies = []
+
+    for i, scenario in enumerate(SCENARIOS):
+        # perf_counter() is monotonic, so a wall-clock adjustment during the
+        # call cannot distort the measured latency.
+        t0 = time.perf_counter()
+        try:
+            response = client.chat(
+                model=model,
+                messages=[
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": scenario["prompt"]},
+                ],
+                options={"temperature": 0.2},
+            )
+            raw = response["message"]["content"]
+            latency_ms = (time.perf_counter() - t0) * 1000
+        except Exception as exc:
+            # Any client failure is recorded as a failed scenario so the
+            # remaining scenarios still run.
+            latency_ms = (time.perf_counter() - t0) * 1000
+            sr = ScenarioResult(
+                scenario_index=i,
+                description=scenario["description"],
+                raw_response="",
+                parsed=None,
+                valid_json=False,
+                schema_valid=False,
+                strategically_sound=False,
+                latency_ms=latency_ms,
+                error=str(exc),
+            )
+            result.trials.append(sr)
+            if verbose:
+                print(f"  Scenario {i}: ERROR — {exc}")
+            continue
+
+        # Only successful calls contribute to the latency percentiles.
+        latencies.append(latency_ms)
+
+        cleaned = _clean_response(raw)
+        parsed = None
+        valid_json = False
+        schema_valid = False
+        strategically_sound = False
+        error = ""
+
+        try:
+            parsed = json.loads(cleaned)
+            valid_json = True
+            schema_valid = scenario["check"](parsed)
+            if schema_valid:
+                strategically_sound = scenario["strategic_check"](parsed)
+        except json.JSONDecodeError as exc:
+            error = f"JSONDecodeError: {exc}"
+        except Exception as exc:
+            # The check callables assume a dict; a non-object JSON value or an
+            # unexpected field type surfaces here as a failed validation.
+            error = f"Validation error: {exc}"
+
+        sr = ScenarioResult(
+            scenario_index=i,
+            description=scenario["description"],
+            raw_response=raw,
+            parsed=parsed,
+            valid_json=valid_json,
+            schema_valid=schema_valid,
+            strategically_sound=strategically_sound,
+            latency_ms=latency_ms,
+            error=error,
+        )
+        result.trials.append(sr)
+
+        if verbose:
+            status = "PASS" if (valid_json and schema_valid) else "FAIL"
+            strat = "strategic" if strategically_sound else "suboptimal"
+            print(
+                f"  Scenario {i} [{scenario['description']}]: {status} ({strat}) "
+                f"| {latency_ms:.0f}ms"
+            )
+            if not schema_valid and valid_json:
+                print(f"    Schema issue: {scenario['check_desc']}")
+            # Same hint the Level 2 runner prints for a valid but suboptimal answer.
+            if not strategically_sound and schema_valid:
+                print(f"    Strategy note: {scenario['strategic_desc']}")
+
+    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
+    result.score = valid_count / len(SCENARIOS)
+    # Pass at two thirds of the scenarios. Compared in integers because
+    # 2 / 3 == 0.666... is below the literal 0.67, which made 2-of-3 fail.
+    result.passed = valid_count * 3 >= len(SCENARIOS) * 2
+
+    if latencies:
+        latencies_sorted = sorted(latencies)
+        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
+        # With only a handful of samples the p99 is reported as the maximum.
+        result.latency_p99_ms = latencies_sorted[-1]
+
+    return result
--- a/timmy-benchmark/levels/level_4_trade_route.py
+++ b/timmy-benchmark/levels/level_4_trade_route.py
@@ -0,0 +1,223 @@
+"""Level 4: Trade Route — Campaign Navigation.
+
+Tests multi-step planning ability: route optimization, trade-off analysis
+across time horizons, and adapting plans when conditions change.
+Maps to: Bannerlord campaign map navigation, caravans, and economy.
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+# Level number of this benchmark module; used as the default for LevelResult.level.
+LEVEL = 4
+# Human-readable level title; used as the default for LevelResult.name.
+NAME = "Trade Route (Campaign Navigation)"
+# One-line summary of what this level asks of the model.
+DESCRIPTION = "Model must plan optimal routes and adapt to changing conditions on the campaign map."
+
+# System message that run() sends ahead of every scenario prompt.
+SYSTEM_PROMPT = """You are a Bannerlord merchant lord planning campaign movements.
+Consider distance, profitability, risk, and timing.
+You MUST respond ONLY with valid JSON. No markdown. Raw JSON only."""
+
+def _is_number(value: Any) -> bool:
+    """Return True for a real JSON number (int or float), excluding booleans.
+
+    ``bool`` is a subclass of ``int``, so a bare ``isinstance(v, (int, float))``
+    would accept JSON ``true``/``false`` where a numeric field is required.
+    """
+    return isinstance(value, (int, float)) and not isinstance(value, bool)
+
+
+# Each scenario is a dict with:
+#   description     - short label used in results and verbose output
+#   prompt          - user message sent to the model
+#   check           - callable(parsed) -> truthy when the reply matches the schema
+#   check_desc      - human-readable schema summary printed on schema failure
+#   strategic_check - callable(parsed) -> truthy when the choice is strategically sound
+#   strategic_desc  - human-readable rationale for the strategic check
+# The checks call ``r.get`` and therefore raise on non-dict replies; run() catches
+# that and records it as a validation error.
+SCENARIOS = [
+    {
+        "description": "Optimal trade route selection",
+        "prompt": (
+            "You are at Epicrotea with 500 gold and 20 days travel budget.\n\n"
+            "Trade opportunities:\n"
+            "- Route A: Epicrotea → Vlandia (3 days) → Sturgia (5 days back)\n"
+            "  Sell grain in Vlandia: +300 gold. Buy furs in Sturgia: costs 200, sells for 400 in Calradia.\n"
+            "  Total: +500 gold profit, 8 days.\n"
+            "- Route B: Epicrotea → Calradia (2 days) → Aserai (4 days)\n"
+            "  Sell iron in Calradia: +150 gold. Buy spice in Aserai: costs 300, sells for 600 in Empire.\n"
+            "  Empire is 6 more days away. Total: +450 gold profit, 12 days.\n"
+            "- Route C: Epicrotea → nearby village (1 day)\n"
+            "  Buy cheap food: costs 100, sells for 180 in any city.\n"
+            "  Total: +80 gold profit, 2 days. Repeatable.\n\n"
+            'Choose route. Respond:\n'
+            '{"route": "A"|"B"|"C", "expected_profit": <int>, "days_used": <int>, '
+            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
+        ),
+        "check": lambda r: (
+            r.get("route") in ["A", "B", "C"]
+            and _is_number(r.get("expected_profit"))
+            and _is_number(r.get("days_used"))
+            and r.get("risk") in ["low", "medium", "high"]
+        ),
+        "check_desc": "route, expected_profit, days_used, risk must be valid",
+        "strategic_check": lambda r: r.get("route") in ["A", "C"],  # A is best single trip, C is best if repeated
+        "strategic_desc": "Route A has best profit/day ratio; C is best if multiple loops possible",
+    },
+    {
+        "description": "Adapt plan when war declared",
+        # NOTE(review): the prompt says "equidistant" but then lists 2/3/4-day
+        # distances. Left unchanged so benchmark results stay comparable.
+        "prompt": (
+            "You were heading to Vlandia to trade, 2 days into the journey.\n"
+            "NEWS: Vlandia just declared war on your faction. Entering Vlandia territory is now dangerous.\n\n"
+            "Your current position: borderlands, equidistant between:\n"
+            "- Vlandia (2 days): Now at war — high risk of attack\n"
+            "- Sturgia (3 days): Neutral — safe\n"
+            "- Empire (4 days): Allied — very safe, good prices\n\n"
+            "You have 400 gold of trade goods for the Vlandia market.\n"
+            'What do you do? Respond:\n'
+            '{"decision": "continue_to_vlandia"|"divert_to_sturgia"|"divert_to_empire", '
+            '"reason": "<why>", "gold_at_risk": <int>}'
+        ),
+        "check": lambda r: (
+            r.get("decision") in ["continue_to_vlandia", "divert_to_sturgia", "divert_to_empire"]
+            and _is_number(r.get("gold_at_risk"))
+        ),
+        "check_desc": "decision must be one of three options, gold_at_risk must be a number",
+        "strategic_check": lambda r: r.get("decision") in ["divert_to_sturgia", "divert_to_empire"],
+        "strategic_desc": "Should avoid active war zone — divert to safe destination",
+    },
+    {
+        "description": "Multi-stop route planning with constraints",
+        "prompt": (
+            "Plan a 3-stop trading circuit starting and ending at Pravend.\n"
+            "Budget: 800 gold. Time limit: 20 days.\n\n"
+            "Available cities and travel times from Pravend:\n"
+            "- Rhotae: 2 days (leather cheap, sells well in south)\n"
+            "- Ortysia: 4 days (grain surplus — buy cheap)\n"
+            "- Epicrotea: 3 days (iron market — buy/sell)\n"
+            "- Pen Cannoc: 5 days (wine — high profit, far)\n\n"
+            "Each stop takes 1 day for trading.\n"
+            'Plan 3 stops. Respond:\n'
+            '{"stops": ["<city1>", "<city2>", "<city3>"], '
+            '"total_days": <int>, "estimated_profit": <int>, '
+            '"reason": "<reasoning>"}'
+        ),
+        "check": lambda r: (
+            isinstance(r.get("stops"), list)
+            and len(r["stops"]) == 3
+            and all(isinstance(s, str) for s in r["stops"])
+            and _is_number(r.get("total_days"))
+            and r["total_days"] <= 20
+            and _is_number(r.get("estimated_profit"))
+        ),
+        "check_desc": "stops must be list of 3 strings, total_days <= 20, estimated_profit numeric",
+        "strategic_check": lambda r: "Pen Cannoc" not in r.get("stops", []),  # Too far for 20 days
+        "strategic_desc": "Pen Cannoc at 5 days each way is likely too far for a 20-day circuit",
+    },
+]
+
+
+@dataclass
+class ScenarioResult:
+    """Outcome of one Level 4 scenario as recorded by run()."""
+
+    scenario_index: int  # position of the scenario within SCENARIOS
+    description: str  # the scenario's "description" entry
+    raw_response: str  # model reply text; "" when the chat call raised
+    parsed: dict | None  # result of json.loads on the cleaned reply; None if not parsed
+    valid_json: bool  # True when the cleaned reply parsed as JSON
+    schema_valid: bool  # result of the scenario's "check" callable
+    strategically_sound: bool  # result of "strategic_check"; only evaluated when schema_valid
+    latency_ms: float  # elapsed time of the chat call, in milliseconds
+    error: str = ""  # exception text for client, JSON or validation failures
+
+
+@dataclass
+class LevelResult:
+    """Aggregate result of running every Level 4 scenario."""
+
+    level: int = LEVEL
+    name: str = NAME
+    trials: list[ScenarioResult] = field(default_factory=list)  # one entry per scenario attempted
+    passed: bool = False  # set by run() from the score and its pass threshold
+    score: float = 0.0  # fraction of scenarios with valid JSON that also met the schema
+    latency_p50_ms: float = 0.0  # sorted latencies at index len // 2
+    latency_p99_ms: float = 0.0  # run() stores the largest observed latency here
+
+
+def _clean_response(raw: str) -> str:
+    """Trim whitespace and, for fenced replies, drop the Markdown fence lines.
+
+    Fence handling only applies when the trimmed text begins with three
+    backticks. In that case every line that starts with three backticks is
+    removed and the remaining lines are re-joined and trimmed again.
+    """
+    text = raw.strip()
+    if not text.startswith("```"):
+        return text
+
+    body_lines = []
+    for line in text.splitlines():
+        if line.startswith("```"):
+            continue
+        body_lines.append(line)
+    return "\n".join(body_lines).strip()
+
+
+def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
+    """Run every Level 4 scenario against ``model`` and score the replies.
+
+    Args:
+        client: Object exposing ``chat(model=..., messages=..., options=...)``
+            whose return value is read as ``response["message"]["content"]``.
+        model: Model identifier forwarded to ``client.chat``.
+        verbose: When true, print a one-line summary for each scenario.
+
+    Returns:
+        A ``LevelResult`` holding one ``ScenarioResult`` per scenario, the
+        fraction of schema-valid replies as ``score``, and latency figures
+        computed over the chat calls that did not raise.
+    """
+    result = LevelResult()
+    latencies = []
+
+    for i, scenario in enumerate(SCENARIOS):
+        # perf_counter is monotonic, so the measured interval cannot be skewed
+        # by wall-clock adjustments the way time.time() can.
+        t0 = time.perf_counter()
+        try:
+            response = client.chat(
+                model=model,
+                messages=[
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": scenario["prompt"]},
+                ],
+                options={"temperature": 0.2},
+            )
+            raw = response["message"]["content"]
+            latency_ms = (time.perf_counter() - t0) * 1000
+        except Exception as exc:
+            # A failed call is recorded as a failed trial and the run continues;
+            # its latency is deliberately kept out of the percentile figures.
+            latency_ms = (time.perf_counter() - t0) * 1000
+            sr = ScenarioResult(
+                scenario_index=i,
+                description=scenario["description"],
+                raw_response="",
+                parsed=None,
+                valid_json=False,
+                schema_valid=False,
+                strategically_sound=False,
+                latency_ms=latency_ms,
+                error=str(exc),
+            )
+            result.trials.append(sr)
+            if verbose:
+                print(f"  Scenario {i}: ERROR — {exc}")
+            continue
+
+        latencies.append(latency_ms)
+
+        cleaned = _clean_response(raw)
+        parsed = None
+        valid_json = False
+        schema_valid = False
+        strategically_sound = False
+        error = ""
+
+        try:
+            parsed = json.loads(cleaned)
+            valid_json = True
+            schema_valid = scenario["check"](parsed)
+            if schema_valid:
+                strategically_sound = scenario["strategic_check"](parsed)
+        except json.JSONDecodeError as exc:
+            error = f"JSONDecodeError: {exc}"
+        except Exception as exc:
+            # The check callables assume a JSON object; any other shape makes
+            # them raise, which is reported as a validation failure.
+            error = f"Validation error: {exc}"
+
+        sr = ScenarioResult(
+            scenario_index=i,
+            description=scenario["description"],
+            raw_response=raw,
+            parsed=parsed,
+            valid_json=valid_json,
+            schema_valid=schema_valid,
+            strategically_sound=strategically_sound,
+            latency_ms=latency_ms,
+            error=error,
+        )
+        result.trials.append(sr)
+
+        if verbose:
+            status = "PASS" if (valid_json and schema_valid) else "FAIL"
+            strat = "strategic" if strategically_sound else "suboptimal"
+            print(
+                f"  Scenario {i} [{scenario['description']}]: {status} ({strat}) "
+                f"| {latency_ms:.0f}ms"
+            )
+            if not schema_valid and valid_json:
+                print(f"    Schema issue: {scenario['check_desc']}")
+
+    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
+    result.score = valid_count / len(SCENARIOS)
+    # Pass at two thirds or better. The comparison is done on integers because
+    # the float 2/3 (0.666...) is below the literal 0.67, which made a
+    # 2-of-3 result fail and effectively demanded a perfect score.
+    result.passed = valid_count * 3 >= len(SCENARIOS) * 2
+
+    if latencies:
+        latencies_sorted = sorted(latencies)
+        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
+        # With so few samples the maximum stands in for the 99th percentile.
+        result.latency_p99_ms = latencies_sorted[-1]
+
+    return result
--- a/timmy-benchmark/levels/level_5_mini_campaign.py
+++ b/timmy-benchmark/levels/level_5_mini_campaign.py
@@ -0,0 +1,252 @@
+"""Level 5: Mini Campaign — Full Campaign Loop.
+
+Tests multi-turn strategic coherence: the model must maintain state across
+several turns of a simulated Bannerlord campaign, making consistent decisions
+that build toward a long-term goal.
+Maps to: Full Bannerlord campaign loop — economy, diplomacy, conquest.
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+# Level number of this benchmark module; used as the default for LevelResult.level.
+LEVEL = 5
+# Human-readable level title; used as the default for LevelResult.name.
+NAME = "Mini Campaign (Full Campaign Loop)"
+# One-line summary of what this level asks of the model.
+DESCRIPTION = "Multi-turn strategic planning maintaining coherent goals across 4 turns."
+
+# System message that opens the conversation in run(). It states the starting
+# position, the win conditions and the action menu presented to the model.
+SYSTEM_PROMPT = """You are Timmy, a Bannerlord lord with ambitions to become King of Calradia.
+You have 4 turns to establish a power base. Each turn represents 2 weeks of in-game time.
+
+Your starting position:
+- Clan tier: 1 (minor lord)
+- Gold: 1000
+- Troops: 25 (mixed infantry/cavalry)
+- Renown: 150
+- Relations: Neutral with all factions
+
+Winning requires: Gold > 3000 AND Renown > 400 AND Own 1+ settlement by Turn 4.
+
+Each turn, choose ONE primary action:
+- "raid_village": +200 gold, -50 relations target faction, +30 renown, risk of retaliation
+- "trade_circuit": +300 gold, 0 relation change, +10 renown, no risk
+- "escort_caravan": +150 gold, +20 relations with faction, +20 renown
+- "tournament": costs 100 gold, +60 renown, +20 relations with host faction
+- "recruit_troops": costs 200 gold, +15 troops, no other change
+- "siege_castle": costs 500 gold + 200 troops morale, -100 relations, +80 renown, +1 settlement if succeed (30% base chance)
+- "pledge_vassalage": 0 cost, +100 relations with liege, +50 renown, lose independence
+
+You MUST respond ONLY with valid JSON for each turn. Raw JSON only."""
+
+
+def run(client: Any, model: str, verbose: bool = False) -> "LevelResult":
+    """Run a 4-turn mini campaign, tracking state and decision quality.
+
+    Each turn sends the current state to the model, parses the JSON reply,
+    records a ``TurnResult`` and, when the chosen action is recognised, applies
+    it to the simulated state. The whole conversation is replayed on every call
+    so the model sees its earlier decisions.
+
+    Args:
+        client: Object exposing ``chat(model=..., messages=..., options=...)``
+            whose return value is read as ``response["message"]["content"]``.
+        model: Model identifier forwarded to ``client.chat``.
+        verbose: When true, print a summary line per turn and a final recap.
+
+    Returns:
+        A ``LevelResult`` with the per-turn records, the final state, the win
+        condition flags, the score (valid turns out of 4) and latency figures.
+        A failing chat call ends the campaign early; the turns that were never
+        played count against the score.
+    """
+    result = LevelResult()
+    total_turns = 4
+
+    # Initial game state
+    state = {
+        "turn": 1,
+        "gold": 1000,
+        "troops": 25,
+        "renown": 150,
+        "settlements": 0,
+        "relations": {"vlandia": 0, "sturgia": 0, "empire": 0, "aserai": 0, "battania": 0},
+    }
+
+    def snapshot() -> dict:
+        # dict(state) alone would share the nested "relations" dict, so later
+        # turns would silently rewrite earlier snapshots.
+        return {**state, "relations": dict(state["relations"])}
+
+    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
+    turns_passed = []
+    total_latency = []
+
+    valid_actions = [
+        "raid_village", "trade_circuit", "escort_caravan", "tournament",
+        "recruit_troops", "siege_castle", "pledge_vassalage",
+    ]
+
+    for turn_num in range(1, total_turns + 1):
+        state["turn"] = turn_num
+        state_str = json.dumps(state, indent=2)
+
+        prompt = (
+            f"=== TURN {turn_num} / 4 ===\n"
+            f"Current state:\n{state_str}\n\n"
+            f"Win conditions remaining: Gold > 3000 ({state['gold']}/3000), "
+            f"Renown > 400 ({state['renown']}/400), Settlements >= 1 ({state['settlements']}/1)\n\n"
+            f"Choose your action for Turn {turn_num}.\n"
+            f'Respond: {{"action": "<action>", "target_faction": "<faction or null>", '
+            f'"reason": "<strategic reasoning>", "goal": "<what this advances>"}}'
+        )
+
+        conversation.append({"role": "user", "content": prompt})
+
+        # perf_counter is monotonic, so the measured interval cannot be skewed
+        # by wall-clock adjustments the way time.time() can.
+        t0 = time.perf_counter()
+        try:
+            response = client.chat(
+                model=model,
+                messages=conversation,
+                options={"temperature": 0.3},
+            )
+            raw = response["message"]["content"]
+            latency_ms = (time.perf_counter() - t0) * 1000
+        except Exception as exc:
+            latency_ms = (time.perf_counter() - t0) * 1000
+            tr = TurnResult(
+                turn=turn_num,
+                state_before=snapshot(),
+                raw_response="",
+                parsed=None,
+                valid_json=False,
+                valid_action=False,
+                action=None,
+                latency_ms=latency_ms,
+                error=str(exc),
+            )
+            turns_passed.append(tr)
+            if verbose:
+                print(f"  Turn {turn_num}: ERROR — {exc}")
+            # Without a reply the conversation cannot continue coherently.
+            break
+
+        total_latency.append(latency_ms)
+
+        # Clean and parse response
+        cleaned = raw.strip()
+        if cleaned.startswith("```"):
+            kept = [line for line in cleaned.splitlines() if not line.startswith("```")]
+            cleaned = "\n".join(kept).strip()
+
+        parsed = None
+        valid_json = False
+        valid_action = False
+        action = None
+        error = ""
+
+        try:
+            parsed = json.loads(cleaned)
+        except json.JSONDecodeError as exc:
+            error = f"JSONDecodeError: {exc}"
+        else:
+            valid_json = True
+            if isinstance(parsed, dict):
+                action = parsed.get("action")
+                valid_action = action in valid_actions
+            else:
+                # Valid JSON that is not an object (list, string, number) has
+                # no "action" key; record it instead of crashing on .get().
+                error = (
+                    "Validation error: expected a JSON object, "
+                    f"got {type(parsed).__name__}"
+                )
+
+        tr = TurnResult(
+            turn=turn_num,
+            state_before=snapshot(),
+            raw_response=raw,
+            parsed=parsed,
+            valid_json=valid_json,
+            valid_action=valid_action,
+            action=action,
+            latency_ms=latency_ms,
+            error=error,
+        )
+        turns_passed.append(tr)
+
+        # Add model response to conversation for continuity
+        conversation.append({"role": "assistant", "content": raw})
+
+        # Apply state changes based on action
+        if valid_action:
+            target = parsed.get("target_faction")
+            # Only a string can name a faction; any other JSON value is
+            # treated as "no target" so the relations lookup cannot raise.
+            _apply_action(state, action, target if isinstance(target, str) else None)
+
+        if verbose:
+            status = "PASS" if (valid_json and valid_action) else "FAIL"
+            print(
+                f"  Turn {turn_num}: {status} | action={action} | {latency_ms:.0f}ms | "
+                f"gold={state['gold']} renown={state['renown']} settlements={state['settlements']}"
+            )
+
+    result.turns = turns_passed
+    result.final_state = snapshot()
+
+    # Win condition check. Gold and renown are strict, matching the
+    # "Gold > 3000" / "Renown > 400" wording shown to the model.
+    result.reached_gold_target = state["gold"] > 3000
+    result.reached_renown_target = state["renown"] > 400
+    result.reached_settlement_target = state["settlements"] >= 1
+
+    # Score: % of turns with valid JSON + valid action
+    valid_turns = sum(1 for t in turns_passed if t.valid_json and t.valid_action)
+    result.score = valid_turns / total_turns if turns_passed else 0.0
+    result.passed = result.score >= 0.75  # 3/4 turns valid
+
+    if total_latency:
+        latencies_sorted = sorted(total_latency)
+        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
+        # With so few samples the maximum stands in for the 99th percentile.
+        result.latency_p99_ms = latencies_sorted[-1]
+
+    if verbose:
+        win_status = []
+        if result.reached_gold_target:
+            win_status.append("GOLD")
+        if result.reached_renown_target:
+            win_status.append("RENOWN")
+        if result.reached_settlement_target:
+            win_status.append("SETTLEMENT")
+        print(f"  Win conditions met: {win_status or 'none'}")
+        print(f"  Final: gold={state['gold']} renown={state['renown']} settlements={state['settlements']}")
+
+    return result
+
+
+def _apply_action(state: dict, action: str, target_faction: str | None) -> None:
+    """Simulate game state changes for a given action.
+
+    Mutates ``state`` in place. An unrecognised ``action`` leaves it untouched.
+
+    Args:
+        state: Campaign state. Depending on the action, the keys ``gold``,
+            ``renown``, ``troops``, ``settlements``, ``turn`` and the
+            ``relations`` mapping are read or updated.
+        action: Name of the action to apply.
+        target_faction: Faction whose relation score the action shifts. It is
+            ignored when falsy, not a string, or not present in
+            ``state["relations"]``.
+    """
+
+    def shift_relation(delta: int) -> None:
+        # Model output is untrusted: a non-string (e.g. a list) would raise
+        # TypeError on the dict membership test, so it counts as "no target".
+        if not target_faction or not isinstance(target_faction, str):
+            return
+        relations = state["relations"]
+        # An exact match wins; otherwise tolerate case/whitespace differences
+        # such as "Vlandia" for the lower-case keys used by the simulator.
+        key = target_faction
+        if key not in relations:
+            key = target_faction.strip().lower()
+        if key in relations:
+            relations[key] += delta
+
+    if action == "raid_village":
+        state["gold"] += 200
+        state["renown"] += 30
+        shift_relation(-50)
+    elif action == "trade_circuit":
+        state["gold"] += 300
+        state["renown"] += 10
+    elif action == "escort_caravan":
+        state["gold"] += 150
+        state["renown"] += 20
+        shift_relation(20)
+    elif action == "tournament":
+        state["gold"] -= 100
+        state["renown"] += 60
+        shift_relation(20)
+    elif action == "recruit_troops":
+        state["gold"] -= 200
+        state["troops"] += 15
+    elif action == "siege_castle":
+        state["gold"] -= 500
+        state["renown"] += 80
+        # 30% chance success (deterministic sim: succeed on turn 3+ if attempted)
+        if state["turn"] >= 3:
+            state["settlements"] += 1
+        shift_relation(-100)
+    elif action == "pledge_vassalage":
+        state["renown"] += 50
+        shift_relation(100)
+
+
+@dataclass
+class TurnResult:
+    """Record of a single campaign turn as captured by run()."""
+
+    turn: int  # 1-based turn number
+    state_before: dict  # campaign state captured by run() before the turn's action is applied
+    raw_response: str  # model reply text; "" when the chat call raised
+    parsed: dict | None  # result of json.loads on the cleaned reply; None if not parsed
+    valid_json: bool  # True when the cleaned reply parsed as JSON
+    valid_action: bool  # True when the reply's "action" is one of run()'s valid actions
+    action: str | None  # the "action" value taken from the reply, if any
+    latency_ms: float  # elapsed time of the chat call, in milliseconds
+    error: str = ""  # exception text for client or JSON failures
+
+
+@dataclass
+class LevelResult:
+    """Aggregate result of the Level 5 mini campaign."""
+
+    level: int = LEVEL
+    name: str = NAME
+    turns: list[TurnResult] = field(default_factory=list)  # one entry per turn attempted
+    final_state: dict = field(default_factory=dict)  # campaign state after the last turn played
+    passed: bool = False  # True when score >= 0.75
+    score: float = 0.0  # turns with valid JSON and a valid action, divided by 4
+    reached_gold_target: bool = False  # set by run() from the final gold total
+    reached_renown_target: bool = False  # set by run() from the final renown total
+    reached_settlement_target: bool = False  # set by run() from the final settlement count
+    latency_p50_ms: float = 0.0  # sorted latencies at index len // 2
+    latency_p99_ms: float = 0.0  # run() stores the largest observed latency here
				`@@ -0,0 +1 @@`
				`"""Cognitive benchmark levels for Project Bannerlord readiness testing."""`