forked from Rockachopa/Timmy-time-dashboard
[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
This commit is contained in:
committed by
rockachopa
parent
6e65b53f3a
commit
9e08e87312
216
timmy-benchmark/levels/level_3_battle_tactics.py
Normal file
216
timmy-benchmark/levels/level_3_battle_tactics.py
Normal file
@@ -0,0 +1,216 @@
|
||||
"""Level 3: Battle Tactics — Formation Commands.
|
||||
|
||||
Tests whether the model can issue coherent formation and tactical orders
|
||||
under simulated battlefield pressure with multiple unit types.
|
||||
Maps to: Bannerlord formation commands (charge, shield wall, skirmish, etc.).
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Benchmark metadata for this level: its ordinal, display name, and a
# one-sentence statement of what the model is expected to do.
LEVEL = 3
NAME = "Battle Tactics (Formation Commands)"
DESCRIPTION = (
    "Model must issue tactically sound formation orders "
    "under simulated battle conditions."
)
|
||||
|
||||
# System message sent with every scenario: lists the formation commands the
# model may use and demands a raw-JSON reply (no Markdown wrapping).
SYSTEM_PROMPT = (
    "You are a Bannerlord battle commander. Issue formation orders using these commands:\n"
    "- shield_wall: infantry forms defensive line (good vs ranged, slow advance)\n"
    "- charge: all-out attack (high casualties, breaks weak enemies fast)\n"
    "- skirmish: ranged units pepper enemy (good vs heavy infantry, needs distance)\n"
    "- advance: move forward holding formation (balanced)\n"
    "- flank_left / flank_right: cavalry sweeps around enemy side\n"
    "- fallback: retreat to regroup (when badly outnumbered)\n"
    "\n"
    "You MUST respond ONLY with valid JSON. No markdown. Raw JSON only."
)
|
||||
|
||||
# Every formation command a scenario accepts as a syntactically valid order.
# Kept as a tuple (not a set) so membership tests compare with ``==``: an
# unhashable value in the model's reply simply fails the check instead of
# raising TypeError.
_COMMANDS = (
    "shield_wall",
    "charge",
    "skirmish",
    "advance",
    "flank_left",
    "flank_right",
    "fallback",
)

# Each scenario is a dict with:
#   description    - short label used in results and verbose output
#   prompt         - user message describing the situation and reply format
#   check          - callable(parsed reply) -> bool, schema-level validity
#   check_desc     - human-readable statement of what ``check`` requires
#   strategic_check- callable(parsed reply) -> bool, tactical soundness
#   strategic_desc - human-readable statement of the sound choice
SCENARIOS = [
    {
        "description": "Ranged vs infantry — defensive opening",
        "prompt": (
            "Situation: You have 20 archers + 10 infantry. Enemy has 30 heavy infantry, no ranged.\n"
            "Enemy is 200m away and advancing.\n"
            "Objective: Maximize casualties before melee contact.\n\n"
            'Issue orders for both units. Respond:\n'
            '{"infantry_order": "<command>", "archer_order": "<command>", '
            '"reason": "<tactical reasoning>", "expected_outcome": "<string>"}'
        ),
        "check": lambda reply: (
            reply.get("infantry_order") in _COMMANDS
            and reply.get("archer_order") in _COMMANDS
            and isinstance(reply.get("reason"), str)
        ),
        "check_desc": "Both orders must be valid commands",
        "strategic_check": lambda reply: (
            reply.get("archer_order") == "skirmish"
            and reply.get("infantry_order") in ("shield_wall", "advance")
        ),
        "strategic_desc": "Archers should skirmish while infantry holds (shield_wall or advance)",
    },
    {
        "description": "Outnumbered — retreat decision",
        "prompt": (
            "Situation: Your party (15 troops) has been ambushed.\n"
            "Enemy: 60 bandits, surrounding you on 3 sides.\n"
            "Your troops: 40% wounded. One escape route to the east.\n\n"
            'What is your command? Respond:\n'
            '{"order": "<command>", "direction": "east"|"west"|"north"|"south"|null, '
            '"reason": "<tactical reasoning>", "priority": "preserve_troops"|"fight_through"}'
        ),
        "check": lambda reply: (
            reply.get("order") in _COMMANDS
            and reply.get("priority") in ("preserve_troops", "fight_through")
        ),
        "check_desc": "order and priority must be valid values",
        "strategic_check": lambda reply: (
            reply.get("order") == "fallback"
            and reply.get("priority") == "preserve_troops"
        ),
        "strategic_desc": "Outnumbered 4:1 with wounded troops — fallback is the sound choice",
    },
    {
        "description": "Cavalry flanking opportunity",
        "prompt": (
            "Situation: Main battle is engaged. Your infantry and enemy infantry are locked.\n"
            "You have 8 cavalry in reserve. Enemy left flank is unprotected.\n"
            "If cavalry hits the flank now, it will route enemy in ~30 seconds.\n\n"
            'Order for cavalry: Respond:\n'
            '{"cavalry_order": "<command>", "timing": "now"|"wait", '
            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
        ),
        "check": lambda reply: (
            reply.get("cavalry_order") in _COMMANDS
            and reply.get("timing") in ("now", "wait")
            and reply.get("risk") in ("low", "medium", "high")
        ),
        "check_desc": "cavalry_order, timing, and risk must be valid values",
        "strategic_check": lambda reply: (
            reply.get("cavalry_order") in ("flank_left", "flank_right", "charge")
            and reply.get("timing") == "now"
        ),
        "strategic_desc": "Should capitalize on the flank opportunity immediately",
    },
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScenarioResult:
|
||||
scenario_index: int
|
||||
description: str
|
||||
raw_response: str
|
||||
parsed: dict | None
|
||||
valid_json: bool
|
||||
schema_valid: bool
|
||||
strategically_sound: bool
|
||||
latency_ms: float
|
||||
error: str = ""
|
||||
|
||||
|
||||
@dataclass
class LevelResult:
    """Aggregate outcome of the whole level; every field has a default."""

    # Level ordinal and display name, taken from the module constants.
    level: int = LEVEL
    name: str = NAME
    # One entry per scenario attempted; a fresh list for each instance.
    trials: list[ScenarioResult] = field(default_factory=list)
    # Overall verdict and the score it was derived from.
    passed: bool = field(default=False)
    score: float = field(default=0.0)
    # Latency summary in milliseconds; left at 0.0 until populated.
    latency_p50_ms: float = field(default=0.0)
    latency_p99_ms: float = field(default=0.0)
|
||||
|
||||
|
||||
def _clean_response(raw: str) -> str:
|
||||
raw = raw.strip()
|
||||
if raw.startswith("```"):
|
||||
lines = raw.splitlines()
|
||||
lines = [l for l in lines if not l.startswith("```")]
|
||||
raw = "\n".join(lines).strip()
|
||||
return raw
|
||||
|
||||
|
||||
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every scenario in SCENARIOS against *model* and score the replies.

    For each scenario a system + user message pair is sent through
    ``client.chat``. The reply text is passed through ``_clean_response``,
    decoded as JSON, and handed to the scenario's ``check`` (schema validity)
    and, if that passes, ``strategic_check`` (tactical soundness).

    Args:
        client: Object exposing ``chat(model=..., messages=..., options=...)``
            whose return value supports ``response["message"]["content"]``.
        model: Model identifier forwarded unchanged to ``client.chat``.
        verbose: When true, print one status line per scenario.

    Returns:
        A ``LevelResult`` holding one ``ScenarioResult`` per scenario.
        ``score`` is the fraction of scenarios whose reply was valid JSON and
        passed the schema check; ``passed`` is ``score >= 0.67``. Strategic
        soundness is recorded per trial but does not influence the score.
        Latency statistics cover only requests that completed without raising.
    """
    result = LevelResult()
    latencies: list[float] = []

    for i, scenario in enumerate(SCENARIOS):
        # perf_counter() is monotonic; time.time() follows the wall clock and
        # can jump (even backwards) if the system clock is adjusted mid-call.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.2},
            )
            raw = response["message"]["content"]
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # Deliberately broad: any client failure becomes a failed trial so
            # one bad request does not abort the remaining scenarios.
            latency_ms = (time.perf_counter() - t0) * 1000
            result.trials.append(
                ScenarioResult(
                    scenario_index=i,
                    description=scenario["description"],
                    raw_response="",
                    parsed=None,
                    valid_json=False,
                    schema_valid=False,
                    strategically_sound=False,
                    latency_ms=latency_ms,
                    error=str(exc),
                )
            )
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue

        latencies.append(latency_ms)

        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True
            if isinstance(parsed, dict):
                schema_valid = scenario["check"](parsed)
                # Tactical soundness is only meaningful for a well-formed reply.
                if schema_valid:
                    strategically_sound = scenario["strategic_check"](parsed)
            else:
                # Valid JSON that is not an object (list, string, number...)
                # cannot satisfy the key-based checks; say so explicitly
                # rather than surfacing an AttributeError from ``.get``.
                error = (
                    "Validation error: expected a JSON object, "
                    f"got {type(parsed).__name__}"
                )
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            error = f"Validation error: {exc}"

        result.trials.append(
            ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response=raw,
                parsed=parsed,
                valid_json=valid_json,
                schema_valid=schema_valid,
                strategically_sound=strategically_sound,
                latency_ms=latency_ms,
                error=error,
            )
        )

        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f" Schema issue: {scenario['check_desc']}")

    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    # Guard the division so an empty scenario list scores 0.0 instead of raising.
    result.score = valid_count / len(SCENARIOS) if SCENARIOS else 0.0
    # NOTE(review): when SCENARIOS holds three entries, 2/3 (0.666...) is below
    # 0.67, so this threshold effectively demands 3/3 — confirm that is intended.
    result.passed = result.score >= 0.67

    if latencies:
        latencies_sorted = sorted(latencies)
        # Upper median for even-length samples.
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # The slowest observed request is reported as p99.
        result.latency_p99_ms = latencies_sorted[-1]

    return result
|
||||
Reference in New Issue
Block a user