1
0
This repository has been archived on 2026-03-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Timmy-time-dashboard/timmy-benchmark/levels/level_5_mini_campaign.py
Alexander Whitestone 9e08e87312 [claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com>
Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
2026-03-23 19:38:48 +00:00

253 lines
8.8 KiB
Python

"""Level 5: Mini Campaign — Full Campaign Loop.
Tests multi-turn strategic coherence: the model must maintain state across
several turns of a simulated Bannerlord campaign, making consistent decisions
that build toward a long-term goal.
Maps to: Full Bannerlord campaign loop — economy, diplomacy, conquest.
"""
import copy
import json
import time
from dataclasses import dataclass, field
from typing import Any
# Benchmark level number; LevelResult.level defaults to this value.
LEVEL = 5
# Human-readable level title; LevelResult.name defaults to this value.
NAME = "Mini Campaign (Full Campaign Loop)"
# One-line summary of what this level exercises.
DESCRIPTION = "Multi-turn strategic planning maintaining coherent goals across 4 turns."
# System message that opens the conversation in run(). It gives the model its
# starting position, the win conditions, and the seven action names that run()
# accepts as valid. The text is sent to the model verbatim, so any edit here
# changes the benchmark itself.
# NOTE: the "30% base chance" for siege_castle is not rolled randomly;
# _apply_action() resolves sieges deterministically from the turn number.
SYSTEM_PROMPT = """You are Timmy, a Bannerlord lord with ambitions to become King of Calradia.
You have 4 turns to establish a power base. Each turn represents 2 weeks of in-game time.
Your starting position:
- Clan tier: 1 (minor lord)
- Gold: 1000
- Troops: 25 (mixed infantry/cavalry)
- Renown: 150
- Relations: Neutral with all factions
Winning requires: Gold > 3000 AND Renown > 400 AND Own 1+ settlement by Turn 4.
Each turn, choose ONE primary action:
- "raid_village": +200 gold, -50 relations target faction, +30 renown, risk of retaliation
- "trade_circuit": +300 gold, 0 relation change, +10 renown, no risk
- "escort_caravan": +150 gold, +20 relations with faction, +20 renown
- "tournament": costs 100 gold, +60 renown, +20 relations with host faction
- "recruit_troops": costs 200 gold, +15 troops, no other change
- "siege_castle": costs 500 gold + 200 troops morale, -100 relations, +80 renown, +1 settlement if succeed (30% base chance)
- "pledge_vassalage": 0 cost, +100 relations with liege, +50 renown, lose independence
You MUST respond ONLY with valid JSON for each turn. Raw JSON only."""
def _strip_code_fence(text: str) -> str:
    """Return *text* trimmed of whitespace and of any Markdown code-fence lines.

    Fence lines are only removed when the trimmed reply itself starts with a
    fence, so JSON that merely contains backticks in a string is left alone.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        kept = [line for line in cleaned.splitlines() if not line.startswith("```")]
        cleaned = "\n".join(kept).strip()
    return cleaned


def _parse_turn_response(
    raw: str, valid_actions: tuple[str, ...]
) -> tuple[dict | None, bool, bool, Any, str]:
    """Decode one model reply.

    Returns:
        A ``(parsed, valid_json, valid_action, action, error)`` tuple. ``parsed``
        is the decoded JSON object, or ``None`` when the reply is not JSON or is
        JSON of another type (list, string, number, ...).
    """
    try:
        decoded = json.loads(_strip_code_fence(raw))
    except json.JSONDecodeError as exc:
        return None, False, False, None, f"JSONDecodeError: {exc}"
    if not isinstance(decoded, dict):
        # Syntactically valid JSON, but not the object the prompt asked for.
        # Calling .get() on it would raise AttributeError and abort the run.
        return None, True, False, None, (
            f"Expected a JSON object, got {type(decoded).__name__}"
        )
    action = decoded.get("action")
    return decoded, True, action in valid_actions, action, ""


def run(client: Any, model: str, verbose: bool = False) -> "LevelResult":
    """Run a 4-turn mini campaign, tracking state and decision quality.

    Each turn the current game state is sent to the model, the reply is parsed
    as JSON, and a valid action is applied to the simulated state. The whole
    conversation (including the model's own replies) is resent every turn.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose return value supports ``response["message"]["content"]``.
        model: Model identifier forwarded to ``client.chat``.
        verbose: When true, print a one-line summary per turn and a final recap.

    Returns:
        A ``LevelResult`` holding per-turn records, the final state, the win
        condition flags, the score (valid turns / 4), and latency figures.

    Any exception raised by the client call is recorded on that turn's
    ``TurnResult`` and ends the campaign early; it is not re-raised.
    """
    total_turns = 4
    valid_actions = (
        "raid_village", "trade_circuit", "escort_caravan", "tournament",
        "recruit_troops", "siege_castle", "pledge_vassalage",
    )

    result = LevelResult()
    # Initial game state
    state = {
        "turn": 1,
        "gold": 1000,
        "troops": 25,
        "renown": 150,
        "settlements": 0,
        "relations": {"vlandia": 0, "sturgia": 0, "empire": 0, "aserai": 0, "battania": 0},
    }
    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
    turn_results = []
    latencies = []

    for turn_num in range(1, total_turns + 1):
        state["turn"] = turn_num
        state_str = json.dumps(state, indent=2)
        prompt = (
            f"=== TURN {turn_num} / {total_turns} ===\n"
            f"Current state:\n{state_str}\n\n"
            f"Win conditions remaining: Gold > 3000 ({state['gold']}/3000), "
            f"Renown > 400 ({state['renown']}/400), Settlements >= 1 ({state['settlements']}/1)\n\n"
            f"Choose your action for Turn {turn_num}.\n"
            f'Respond: {{"action": "<action>", "target_faction": "<faction or null>", '
            f'"reason": "<strategic reasoning>", "goal": "<what this advances>"}}'
        )
        conversation.append({"role": "user", "content": prompt})

        # Deep copy: a shallow dict(state) would share the nested "relations"
        # dict, so later turns would rewrite this turn's recorded snapshot.
        state_before = copy.deepcopy(state)

        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=conversation,
                options={"temperature": 0.3},
            )
            # Treat missing/None content as an empty reply so it is scored as
            # an invalid turn instead of crashing on .strip().
            raw = response["message"]["content"] or ""
        except Exception as exc:
            latency_ms = (time.time() - t0) * 1000
            turn_results.append(
                TurnResult(
                    turn=turn_num,
                    state_before=state_before,
                    raw_response="",
                    parsed=None,
                    valid_json=False,
                    valid_action=False,
                    action=None,
                    latency_ms=latency_ms,
                    error=str(exc),
                )
            )
            if verbose:
                print(f" Turn {turn_num}: ERROR — {exc}")
            # A failing client is unlikely to recover; stop the campaign.
            break
        latency_ms = (time.time() - t0) * 1000
        latencies.append(latency_ms)

        parsed, valid_json, valid_action, action, error = _parse_turn_response(
            raw, valid_actions
        )
        turn_results.append(
            TurnResult(
                turn=turn_num,
                state_before=state_before,
                raw_response=raw,
                parsed=parsed,
                valid_json=valid_json,
                valid_action=valid_action,
                action=action,
                latency_ms=latency_ms,
                error=error,
            )
        )

        # Add model response to conversation for continuity
        conversation.append({"role": "assistant", "content": raw})

        # Apply state changes based on action
        if valid_action:
            target = parsed.get("target_faction")
            # The model may emit a list/number/object here; only a string can
            # name a faction, anything else is treated as "no target".
            if not isinstance(target, str):
                target = None
            _apply_action(state, action, target)

        if verbose:
            status = "PASS" if (valid_json and valid_action) else "FAIL"
            print(
                f" Turn {turn_num}: {status} | action={action} | {latency_ms:.0f}ms | "
                f"gold={state['gold']} renown={state['renown']} settlements={state['settlements']}"
            )

    result.turns = turn_results
    result.final_state = copy.deepcopy(state)

    # Win condition check. Gold and renown are strict inequalities, matching
    # the rules stated to the model ("Gold > 3000 AND Renown > 400").
    result.reached_gold_target = state["gold"] > 3000
    result.reached_renown_target = state["renown"] > 400
    result.reached_settlement_target = state["settlements"] >= 1

    # Score: % of turns with valid JSON + valid action. Turns skipped after a
    # client error count as failed because the divisor is the full turn count.
    valid_turns = sum(1 for t in turn_results if t.valid_json and t.valid_action)
    result.score = valid_turns / total_turns if turn_results else 0.0
    result.passed = result.score >= 0.75  # 3/4 turns valid

    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # With at most four samples the 99th percentile is simply the maximum.
        result.latency_p99_ms = latencies_sorted[-1]

    if verbose:
        win_status = [
            label
            for label, reached in (
                ("GOLD", result.reached_gold_target),
                ("RENOWN", result.reached_renown_target),
                ("SETTLEMENT", result.reached_settlement_target),
            )
            if reached
        ]
        print(f" Win conditions met: {win_status or 'none'}")
        print(f" Final: gold={state['gold']} renown={state['renown']} settlements={state['settlements']}")
    return result
def _apply_action(state: dict, action: str, target_faction: str | None) -> None:
"""Simulate game state changes for a given action."""
if action == "raid_village":
state["gold"] += 200
state["renown"] += 30
if target_faction and target_faction in state["relations"]:
state["relations"][target_faction] -= 50
elif action == "trade_circuit":
state["gold"] += 300
state["renown"] += 10
elif action == "escort_caravan":
state["gold"] += 150
state["renown"] += 20
if target_faction and target_faction in state["relations"]:
state["relations"][target_faction] += 20
elif action == "tournament":
state["gold"] -= 100
state["renown"] += 60
if target_faction and target_faction in state["relations"]:
state["relations"][target_faction] += 20
elif action == "recruit_troops":
state["gold"] -= 200
state["troops"] += 15
elif action == "siege_castle":
state["gold"] -= 500
state["renown"] += 80
# 30% chance success (deterministic sim: succeed on turn 3+ if attempted)
if state["turn"] >= 3:
state["settlements"] += 1
if target_faction and target_faction in state["relations"]:
state["relations"][target_faction] -= 100
elif action == "pledge_vassalage":
state["renown"] += 50
if target_faction and target_faction in state["relations"]:
state["relations"][target_faction] += 100
@dataclass
class TurnResult:
turn: int
state_before: dict
raw_response: str
parsed: dict | None
valid_json: bool
valid_action: bool
action: str | None
latency_ms: float
error: str = ""
@dataclass
class LevelResult:
    """Aggregate outcome of one run of this benchmark level."""

    # --- Level identification -------------------------------------------
    level: int = LEVEL
    name: str = NAME

    # --- Campaign trace -------------------------------------------------
    # One record per turn that was attempted, in order.
    turns: list[TurnResult] = field(default_factory=list)
    # Copy of the simulated game state after the last attempted turn.
    final_state: dict = field(default_factory=dict)

    # --- Verdict --------------------------------------------------------
    # True when the score meets the level's pass threshold.
    passed: bool = False
    # Fraction of the campaign's turns with valid JSON and a valid action.
    score: float = 0.0

    # --- Win conditions evaluated on the final state --------------------
    reached_gold_target: bool = False
    reached_renown_target: bool = False
    reached_settlement_target: bool = False

    # --- Latency of successful chat calls, in milliseconds --------------
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0