forked from Rockachopa/Timmy-time-dashboard
[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
This commit is contained in:
committed by
rockachopa
parent
6e65b53f3a
commit
9e08e87312
252
timmy-benchmark/levels/level_5_mini_campaign.py
Normal file
252
timmy-benchmark/levels/level_5_mini_campaign.py
Normal file
@@ -0,0 +1,252 @@
|
||||
"""Level 5: Mini Campaign — Full Campaign Loop.
|
||||
|
||||
Tests multi-turn strategic coherence: the model must maintain state across
|
||||
several turns of a simulated Bannerlord campaign, making consistent decisions
|
||||
that build toward a long-term goal.
|
||||
Maps to: Full Bannerlord campaign loop — economy, diplomacy, conquest.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Benchmark level number; also the default for ``LevelResult.level``.
LEVEL = 5
# Human-readable level title; also the default for ``LevelResult.name``.
NAME = "Mini Campaign (Full Campaign Loop)"
# One-line summary of what this level measures.
DESCRIPTION = "Multi-turn strategic planning maintaining coherent goals across 4 turns."

# System message sent as the first entry of the conversation in ``run``.
# The action names and numeric effects listed here are what ``_apply_action``
# simulates, and the win thresholds are what ``run`` checks at the end, so
# any edit to this text must be mirrored in those two functions.
SYSTEM_PROMPT = """You are Timmy, a Bannerlord lord with ambitions to become King of Calradia.
You have 4 turns to establish a power base. Each turn represents 2 weeks of in-game time.

Your starting position:
- Clan tier: 1 (minor lord)
- Gold: 1000
- Troops: 25 (mixed infantry/cavalry)
- Renown: 150
- Relations: Neutral with all factions

Winning requires: Gold > 3000 AND Renown > 400 AND Own 1+ settlement by Turn 4.

Each turn, choose ONE primary action:
- "raid_village": +200 gold, -50 relations target faction, +30 renown, risk of retaliation
- "trade_circuit": +300 gold, 0 relation change, +10 renown, no risk
- "escort_caravan": +150 gold, +20 relations with faction, +20 renown
- "tournament": costs 100 gold, +60 renown, +20 relations with host faction
- "recruit_troops": costs 200 gold, +15 troops, no other change
- "siege_castle": costs 500 gold + 200 troops morale, -100 relations, +80 renown, +1 settlement if succeed (30% base chance)
- "pledge_vassalage": 0 cost, +100 relations with liege, +50 renown, lose independence

You MUST respond ONLY with valid JSON for each turn. Raw JSON only."""
|
||||
|
||||
|
||||
def _snapshot_state(state: dict) -> dict:
    """Return a copy of ``state`` whose nested ``relations`` dict is copied too."""
    snapshot = dict(state)
    snapshot["relations"] = dict(state["relations"])
    return snapshot


def _strip_code_fence(raw: str) -> str:
    """Trim ``raw`` and drop Markdown fence lines when the reply starts with one."""
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        kept = [line for line in cleaned.splitlines() if not line.startswith("```")]
        cleaned = "\n".join(kept).strip()
    return cleaned


def run(client: Any, model: str, verbose: bool = False) -> "LevelResult":
    """Run a 4-turn mini campaign, tracking state and decision quality.

    Each turn the current game state is serialised into a user prompt, the
    model's reply is parsed as JSON, and a recognised action is applied to the
    state through ``_apply_action``. The full conversation (including the
    model's own earlier replies) is re-sent every turn.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose return value supports ``response["message"]["content"]``
            (presumably an Ollama-style client — confirm against the caller).
        model: Model identifier forwarded to ``client.chat``.
        verbose: When true, print a one-line summary per turn and a final
            summary of the win conditions.

    Returns:
        A ``LevelResult`` holding the per-turn ``TurnResult`` records, the
        final state, the win-condition flags, the score (valid turns / 4),
        the pass flag (score >= 0.75) and latency statistics.

    Any exception raised by ``client.chat`` is recorded on that turn's
    ``TurnResult.error`` and ends the campaign early; it is not re-raised.
    """
    result = LevelResult()

    # Initial game state (mirrors the starting position in SYSTEM_PROMPT).
    state = {
        "turn": 1,
        "gold": 1000,
        "troops": 25,
        "renown": 150,
        "settlements": 0,
        "relations": {"vlandia": 0, "sturgia": 0, "empire": 0, "aserai": 0, "battania": 0},
    }

    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
    turn_results = []
    latencies = []

    valid_actions = (
        "raid_village", "trade_circuit", "escort_caravan", "tournament",
        "recruit_troops", "siege_castle", "pledge_vassalage",
    )

    for turn_num in range(1, 5):
        state["turn"] = turn_num
        state_str = json.dumps(state, indent=2)

        prompt = (
            f"=== TURN {turn_num} / 4 ===\n"
            f"Current state:\n{state_str}\n\n"
            f"Win conditions remaining: Gold > 3000 ({state['gold']}/3000), "
            f"Renown > 400 ({state['renown']}/400), Settlements >= 1 ({state['settlements']}/1)\n\n"
            f"Choose your action for Turn {turn_num}.\n"
            f'Respond: {{"action": "<action>", "target_faction": "<faction or null>", '
            f'"reason": "<strategic reasoning>", "goal": "<what this advances>"}}'
        )
        conversation.append({"role": "user", "content": prompt})

        # Snapshot before the action is applied. The nested relations dict is
        # copied as well; a plain dict(state) would alias it, so later turns
        # would silently rewrite the history stored on earlier TurnResults.
        state_before = _snapshot_state(state)

        # perf_counter is monotonic, so the measured latency cannot be skewed
        # by wall-clock adjustments while the request is in flight.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=conversation,
                options={"temperature": 0.3},
            )
            raw = response["message"]["content"]
        except Exception as exc:
            # Benchmark boundary: record the failure on the turn and stop the
            # campaign instead of propagating the client error.
            latency_ms = (time.perf_counter() - t0) * 1000
            turn_results.append(
                TurnResult(
                    turn=turn_num,
                    state_before=state_before,
                    raw_response="",
                    parsed=None,
                    valid_json=False,
                    valid_action=False,
                    action=None,
                    latency_ms=latency_ms,
                    error=str(exc),
                )
            )
            if verbose:
                print(f" Turn {turn_num}: ERROR — {exc}")
            break
        latency_ms = (time.perf_counter() - t0) * 1000
        latencies.append(latency_ms)

        # Clean and parse the response.
        parsed = None
        valid_json = False
        valid_action = False
        action = None
        target_faction = None
        error = ""

        try:
            decoded = json.loads(_strip_code_fence(raw))
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        else:
            valid_json = True
            if isinstance(decoded, dict):
                parsed = decoded
                action = parsed.get("action")
                valid_action = action in valid_actions
                # Only a string can name a faction; anything else (null, a
                # list, a number) is treated as "no target".
                candidate = parsed.get("target_faction")
                if isinstance(candidate, str):
                    target_faction = candidate
            else:
                # Valid JSON but not an object (e.g. a bare list or string):
                # there is no "action" key to read, so the turn is invalid.
                error = f"Expected a JSON object, got {type(decoded).__name__}"

        turn_results.append(
            TurnResult(
                turn=turn_num,
                state_before=state_before,
                raw_response=raw,
                parsed=parsed,
                valid_json=valid_json,
                valid_action=valid_action,
                action=action,
                latency_ms=latency_ms,
                error=error,
            )
        )

        # Add the model response to the conversation for continuity.
        conversation.append({"role": "assistant", "content": raw})

        # Apply state changes only for a recognised action.
        if valid_action:
            _apply_action(state, action, target_faction)

        if verbose:
            status = "PASS" if (valid_json and valid_action) else "FAIL"
            print(
                f" Turn {turn_num}: {status} | action={action} | {latency_ms:.0f}ms | "
                f"gold={state['gold']} renown={state['renown']} settlements={state['settlements']}"
            )

    result.turns = turn_results
    result.final_state = _snapshot_state(state)

    # Win condition check. Gold and renown are strict inequalities to match
    # the rules the model is given ("Gold > 3000", "Renown > 400"); the
    # settlement rule is "1+" and therefore inclusive.
    result.reached_gold_target = state["gold"] > 3000
    result.reached_renown_target = state["renown"] > 400
    result.reached_settlement_target = state["settlements"] >= 1

    # Score: fraction of the 4 planned turns with valid JSON and a valid
    # action. The denominator stays 4 even if the loop ended early.
    valid_turns = sum(1 for t in turn_results if t.valid_json and t.valid_action)
    result.score = valid_turns / 4 if turn_results else 0.0
    result.passed = result.score >= 0.75  # 3/4 turns valid

    if latencies:
        ordered = sorted(latencies)
        result.latency_p50_ms = ordered[len(ordered) // 2]
        # With at most 4 samples the reported p99 is simply the maximum.
        result.latency_p99_ms = ordered[-1]

    if verbose:
        win_status = []
        if result.reached_gold_target:
            win_status.append("GOLD")
        if result.reached_renown_target:
            win_status.append("RENOWN")
        if result.reached_settlement_target:
            win_status.append("SETTLEMENT")
        print(f" Win conditions met: {win_status or 'none'}")
        print(f" Final: gold={state['gold']} renown={state['renown']} settlements={state['settlements']}")

    return result
|
||||
|
||||
|
||||
def _apply_action(state: dict, action: str, target_faction: str | None) -> None:
|
||||
"""Simulate game state changes for a given action."""
|
||||
if action == "raid_village":
|
||||
state["gold"] += 200
|
||||
state["renown"] += 30
|
||||
if target_faction and target_faction in state["relations"]:
|
||||
state["relations"][target_faction] -= 50
|
||||
elif action == "trade_circuit":
|
||||
state["gold"] += 300
|
||||
state["renown"] += 10
|
||||
elif action == "escort_caravan":
|
||||
state["gold"] += 150
|
||||
state["renown"] += 20
|
||||
if target_faction and target_faction in state["relations"]:
|
||||
state["relations"][target_faction] += 20
|
||||
elif action == "tournament":
|
||||
state["gold"] -= 100
|
||||
state["renown"] += 60
|
||||
if target_faction and target_faction in state["relations"]:
|
||||
state["relations"][target_faction] += 20
|
||||
elif action == "recruit_troops":
|
||||
state["gold"] -= 200
|
||||
state["troops"] += 15
|
||||
elif action == "siege_castle":
|
||||
state["gold"] -= 500
|
||||
state["renown"] += 80
|
||||
# 30% chance success (deterministic sim: succeed on turn 3+ if attempted)
|
||||
if state["turn"] >= 3:
|
||||
state["settlements"] += 1
|
||||
if target_faction and target_faction in state["relations"]:
|
||||
state["relations"][target_faction] -= 100
|
||||
elif action == "pledge_vassalage":
|
||||
state["renown"] += 50
|
||||
if target_faction and target_faction in state["relations"]:
|
||||
state["relations"][target_faction] += 100
|
||||
|
||||
|
||||
@dataclass
|
||||
class TurnResult:
|
||||
turn: int
|
||||
state_before: dict
|
||||
raw_response: str
|
||||
parsed: dict | None
|
||||
valid_json: bool
|
||||
valid_action: bool
|
||||
action: str | None
|
||||
latency_ms: float
|
||||
error: str = ""
|
||||
|
||||
|
||||
@dataclass
class LevelResult:
    """Aggregate outcome of the level: per-turn records, final state and scoring."""

    # Level number and title, defaulting to this module's LEVEL / NAME.
    level: int = LEVEL
    name: str = NAME
    # One TurnResult per turn that was attempted, in order.
    turns: list[TurnResult] = field(default_factory=list)
    # Copy of the game state after the last attempted turn.
    final_state: dict = field(default_factory=dict)
    # True when ``score`` reaches the pass threshold applied by ``run``.
    passed: bool = False
    # Fraction of the 4 planned turns that had valid JSON and a valid action.
    score: float = 0.0
    # Win-condition flags evaluated against the final state.
    reached_gold_target: bool = False
    reached_renown_target: bool = False
    reached_settlement_target: bool = False
    # Latency statistics over the successful client calls, in milliseconds.
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
|
||||
Reference in New Issue
Block a user