[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)

Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
2026-03-23 19:38:48 +00:00
parent 6e65b53f3a
commit 9e08e87312
12 changed files with 3068 additions and 0 deletions
--- a/timmy-benchmark/levels/level_5_mini_campaign.py
+++ b/timmy-benchmark/levels/level_5_mini_campaign.py
@@ -0,0 +1,252 @@
+"""Level 5: Mini Campaign — Full Campaign Loop.
+
+Tests multi-turn strategic coherence: the model must maintain state across
+several turns of a simulated Bannerlord campaign, making consistent decisions
+that build toward a long-term goal.
+Maps to: Full Bannerlord campaign loop — economy, diplomacy, conquest.
+"""
+
+import copy
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+# Level metadata. LEVEL and NAME are the defaults of LevelResult.level and
+# LevelResult.name; DESCRIPTION is not referenced elsewhere in this module
+# (presumably read by the benchmark runner — confirm against the caller).
+LEVEL = 5
+NAME = "Mini Campaign (Full Campaign Loop)"
+DESCRIPTION = "Multi-turn strategic planning maintaining coherent goals across 4 turns."
+
+# System message that opens the conversation in run(). It gives the model its
+# starting position, the win conditions, the per-action effects, and the
+# raw-JSON output requirement.
+#
+# NOTE(review): the effects advertised here are only partly mirrored by
+# _apply_action():
+#   - siege_castle is simulated deterministically (a settlement is granted when
+#     the siege happens on turn 3 or later) instead of with a 30% chance, and
+#     the "200 troops morale" cost is not simulated;
+#   - "risk of retaliation" (raid_village) and "lose independence"
+#     (pledge_vassalage) have no simulated effect.
+# NOTE(review): starting at 1000 gold, the largest income listed is +300 per
+# turn (trade_circuit), so four turns top out at 2200 gold. The "Gold > 3000"
+# win condition therefore looks unreachable — confirm whether that is intended.
+SYSTEM_PROMPT = """You are Timmy, a Bannerlord lord with ambitions to become King of Calradia.
+You have 4 turns to establish a power base. Each turn represents 2 weeks of in-game time.
+
+Your starting position:
+- Clan tier: 1 (minor lord)
+- Gold: 1000
+- Troops: 25 (mixed infantry/cavalry)
+- Renown: 150
+- Relations: Neutral with all factions
+
+Winning requires: Gold > 3000 AND Renown > 400 AND Own 1+ settlement by Turn 4.
+
+Each turn, choose ONE primary action:
+- "raid_village": +200 gold, -50 relations target faction, +30 renown, risk of retaliation
+- "trade_circuit": +300 gold, 0 relation change, +10 renown, no risk
+- "escort_caravan": +150 gold, +20 relations with faction, +20 renown
+- "tournament": costs 100 gold, +60 renown, +20 relations with host faction
+- "recruit_troops": costs 200 gold, +15 troops, no other change
+- "siege_castle": costs 500 gold + 200 troops morale, -100 relations, +80 renown, +1 settlement if succeed (30% base chance)
+- "pledge_vassalage": 0 cost, +100 relations with liege, +50 renown, lose independence
+
+You MUST respond ONLY with valid JSON for each turn. Raw JSON only."""
+
+
+def run(client: Any, model: str, verbose: bool = False) -> "LevelResult":
+    """Run a 4-turn mini campaign, tracking state and decision quality.
+
+    Each turn the current simulated state is sent to the model, the reply is
+    parsed as JSON, and a recognised action is applied to the state through
+    ``_apply_action``. All turns share one conversation, so the model sees its
+    own earlier replies.
+
+    Args:
+        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
+            whose return value supports ``response["message"]["content"]``
+            (looks like the Ollama Python client — confirm against the caller).
+        model: Model name forwarded to ``client.chat``.
+        verbose: When true, print one summary line per turn plus a final summary.
+
+    Returns:
+        A ``LevelResult`` holding the per-turn records, the final state, the
+        win-condition flags, the score (valid turns / total turns) and latency
+        figures. An exception raised by the client call is not re-raised: it is
+        stored in that turn's ``TurnResult.error`` and ends the campaign early.
+    """
+    total_turns = 4  # SYSTEM_PROMPT also states "4 turns"; keep the two in sync
+    result = LevelResult()
+
+    # Initial game state
+    state = {
+        "turn": 1,
+        "gold": 1000,
+        "troops": 25,
+        "renown": 150,
+        "settlements": 0,
+        "relations": {"vlandia": 0, "sturgia": 0, "empire": 0, "aserai": 0, "battania": 0},
+    }
+
+    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
+    turn_results = []
+    latencies = []  # only calls that returned a reply contribute
+
+    valid_actions = [
+        "raid_village", "trade_circuit", "escort_caravan", "tournament",
+        "recruit_troops", "siege_castle", "pledge_vassalage",
+    ]
+
+    for turn_num in range(1, total_turns + 1):
+        state["turn"] = turn_num
+        state_str = json.dumps(state, indent=2)
+        # Deep copy: ``state["relations"]`` is a nested dict that later turns
+        # mutate in place, so a shallow ``dict(state)`` would let every stored
+        # snapshot drift to the final relation values.
+        state_before = copy.deepcopy(state)
+
+        prompt = (
+            f"=== TURN {turn_num} / {total_turns} ===\n"
+            f"Current state:\n{state_str}\n\n"
+            f"Win conditions remaining: Gold > 3000 ({state['gold']}/3000), "
+            f"Renown > 400 ({state['renown']}/400), Settlements >= 1 ({state['settlements']}/1)\n\n"
+            f"Choose your action for Turn {turn_num}.\n"
+            f'Respond: {{"action": "<action>", "target_faction": "<faction or null>", '
+            f'"reason": "<strategic reasoning>", "goal": "<what this advances>"}}'
+        )
+
+        conversation.append({"role": "user", "content": prompt})
+
+        t0 = time.time()
+        try:
+            response = client.chat(
+                model=model,
+                messages=conversation,
+                options={"temperature": 0.3},
+            )
+            raw = response["message"]["content"]
+            latency_ms = (time.time() - t0) * 1000
+        except Exception as exc:
+            # Benchmark boundary: any client failure is recorded on the turn
+            # and stops the campaign rather than propagating.
+            latency_ms = (time.time() - t0) * 1000
+            turn_results.append(
+                TurnResult(
+                    turn=turn_num,
+                    state_before=state_before,
+                    raw_response="",
+                    parsed=None,
+                    valid_json=False,
+                    valid_action=False,
+                    action=None,
+                    latency_ms=latency_ms,
+                    error=str(exc),
+                )
+            )
+            if verbose:
+                print(f"  Turn {turn_num}: ERROR — {exc}")
+            break
+
+        latencies.append(latency_ms)
+
+        # Clean and parse response: drop Markdown code-fence lines when the
+        # reply is wrapped in a fenced block.
+        cleaned = raw.strip()
+        if cleaned.startswith("```"):
+            kept = [line for line in cleaned.splitlines() if not line.startswith("```")]
+            cleaned = "\n".join(kept).strip()
+
+        parsed = None
+        valid_json = False
+        valid_action = False
+        action = None
+        target_faction = None
+        error = ""
+
+        try:
+            decoded = json.loads(cleaned)
+        except json.JSONDecodeError as exc:
+            error = f"JSONDecodeError: {exc}"
+        else:
+            valid_json = True
+            if isinstance(decoded, dict):
+                parsed = decoded
+                action = parsed.get("action")
+                valid_action = action in valid_actions
+                # Only a string can name a faction; lists, numbers etc. from
+                # the model are treated as "no target" instead of crashing the
+                # dict lookup in the simulator.
+                candidate = parsed.get("target_faction")
+                if isinstance(candidate, str):
+                    target_faction = candidate
+            else:
+                # Valid JSON, but not the requested object (e.g. a bare list or
+                # string): there is no "action" key to read.
+                error = f"Expected a JSON object, got {type(decoded).__name__}"
+
+        turn_results.append(
+            TurnResult(
+                turn=turn_num,
+                state_before=state_before,
+                raw_response=raw,
+                parsed=parsed,
+                valid_json=valid_json,
+                valid_action=valid_action,
+                action=action,
+                latency_ms=latency_ms,
+                error=error,
+            )
+        )
+
+        # Add model response to conversation for continuity
+        conversation.append({"role": "assistant", "content": raw})
+
+        # Apply state changes based on action
+        if valid_action:
+            _apply_action(state, action, target_faction)
+
+        if verbose:
+            status = "PASS" if (valid_json and valid_action) else "FAIL"
+            print(
+                f"  Turn {turn_num}: {status} | action={action} | {latency_ms:.0f}ms | "
+                f"gold={state['gold']} renown={state['renown']} settlements={state['settlements']}"
+            )
+
+    result.turns = turn_results
+    result.final_state = copy.deepcopy(state)
+
+    # Win condition check. Gold and renown are strict comparisons, matching the
+    # "Gold > 3000" / "Renown > 400" wording given to the model; settlements
+    # are "1+", i.e. >= 1.
+    result.reached_gold_target = state["gold"] > 3000
+    result.reached_renown_target = state["renown"] > 400
+    result.reached_settlement_target = state["settlements"] >= 1
+
+    # Score: share of all planned turns with valid JSON + valid action. Turns
+    # never reached after an early abort count as failed.
+    valid_turns = sum(1 for t in turn_results if t.valid_json and t.valid_action)
+    result.score = valid_turns / total_turns if turn_results else 0.0
+    result.passed = result.score >= 0.75  # 3/4 turns valid
+
+    if latencies:
+        latencies_sorted = sorted(latencies)
+        # With at most four samples, p50 is the (upper) middle element and p99
+        # is simply the slowest call.
+        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
+        result.latency_p99_ms = latencies_sorted[-1]
+
+    if verbose:
+        win_status = []
+        if result.reached_gold_target:
+            win_status.append("GOLD")
+        if result.reached_renown_target:
+            win_status.append("RENOWN")
+        if result.reached_settlement_target:
+            win_status.append("SETTLEMENT")
+        print(f"  Win conditions met: {win_status or 'none'}")
+        print(f"  Final: gold={state['gold']} renown={state['renown']} settlements={state['settlements']}")
+
+    return result
+
+
+def _apply_action(state: dict, action: str, target_faction: str | None) -> None:
+    """Simulate game state changes for a given action.
+
+    Mutates ``state`` in place. Gold, renown, troop and settlement changes
+    always apply; the relation change applies only when ``target_faction``
+    names a faction present in ``state["relations"]``. An exact match is tried
+    first, then a case-insensitive, whitespace-trimmed match, because the value
+    comes straight from model output (e.g. ``"Vlandia"``). Anything that is
+    not a non-empty string (``None``, a list, a number, ...) is treated as
+    "no target" instead of raising. Unknown actions leave ``state`` untouched.
+
+    Args:
+        state: Game state with ``turn``, ``gold``, ``troops``, ``renown``,
+            ``settlements`` and a ``relations`` mapping of faction name to score.
+        action: Name of the action to simulate.
+        target_faction: Faction the action is aimed at, if any.
+    """
+    relations = state["relations"]
+
+    # Resolve the faction once. The isinstance check also keeps unhashable
+    # values (lists/dicts decoded from JSON) away from the dict membership
+    # test, which would raise TypeError.
+    faction = None
+    if isinstance(target_faction, str) and target_faction:
+        if target_faction in relations:
+            faction = target_faction
+        else:
+            normalized = target_faction.strip().lower()
+            if normalized in relations:
+                faction = normalized
+
+    def shift_relation(delta: int) -> None:
+        """Adjust relations with the resolved faction; no-op without one."""
+        if faction is not None:
+            relations[faction] += delta
+
+    if action == "raid_village":
+        state["gold"] += 200
+        state["renown"] += 30
+        shift_relation(-50)
+    elif action == "trade_circuit":
+        state["gold"] += 300
+        state["renown"] += 10
+    elif action == "escort_caravan":
+        state["gold"] += 150
+        state["renown"] += 20
+        shift_relation(20)
+    elif action == "tournament":
+        state["gold"] -= 100
+        state["renown"] += 60
+        shift_relation(20)
+    elif action == "recruit_troops":
+        state["gold"] -= 200
+        state["troops"] += 15
+    elif action == "siege_castle":
+        state["gold"] -= 500
+        state["renown"] += 80
+        # 30% chance success (deterministic sim: succeed on turn 3+ if attempted)
+        if state["turn"] >= 3:
+            state["settlements"] += 1
+        shift_relation(-100)
+    elif action == "pledge_vassalage":
+        state["renown"] += 50
+        shift_relation(100)
+
+
+@dataclass
+class TurnResult:
+    """Record of a single campaign turn, as built by ``run``."""
+
+    turn: int  # 1-based turn number
+    state_before: dict  # copy of the game state made before the turn's action is applied
+    raw_response: str  # model reply as received; "" when the client call raised
+    parsed: dict | None  # decoded JSON reply; None when decoding failed or the call raised
+    valid_json: bool  # True when the (fence-stripped) reply decoded as JSON
+    valid_action: bool  # True when the reply's "action" is one of the known action names
+    action: str | None  # value of the reply's "action" key, if any
+    latency_ms: float  # wall-clock time spent in the client call, in milliseconds
+    error: str = ""  # exception text or JSON decode message; "" when nothing went wrong
+
+
+@dataclass
+class LevelResult:
+    """Aggregate outcome of the level, filled in and returned by ``run``."""
+
+    level: int = LEVEL  # level number, from the module constant
+    name: str = NAME  # level display name, from the module constant
+    turns: list[TurnResult] = field(default_factory=list)  # one record per attempted turn
+    final_state: dict = field(default_factory=dict)  # game state after the last attempted turn
+    passed: bool = False  # True when score >= 0.75
+    score: float = 0.0  # turns with valid JSON and a valid action, divided by 4
+    reached_gold_target: bool = False  # final gold met the gold target checked in run()
+    reached_renown_target: bool = False  # final renown met the renown target checked in run()
+    reached_settlement_target: bool = False  # at least one settlement owned at the end
+    latency_p50_ms: float = 0.0  # middle element of the sorted latencies of calls that returned
+    latency_p99_ms: float = 0.0  # set to the largest of those latencies