[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)

Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
2026-03-23 19:38:48 +00:00
parent 6e65b53f3a
commit 9e08e87312
12 changed files with 3068 additions and 0 deletions
--- a/timmy-benchmark/levels/level_2_resource_mgmt.py
+++ b/timmy-benchmark/levels/level_2_resource_mgmt.py
@@ -0,0 +1,213 @@
+"""Level 2: Resource Management — Party Economy.
+
+Tests whether the model can allocate limited resources across competing
+priorities and adapt when constraints change.
+Maps to: Bannerlord party economy (troops, food, gold, morale).
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+# Identity of this benchmark level; LEVEL and NAME are the LevelResult defaults.
+LEVEL = 2
+NAME = "Resource Management (Party Economy)"
+DESCRIPTION = (
+    "Model must allocate limited resources across troops, food, and equipment."
+)
+
+# System message sent ahead of every scenario prompt.
+SYSTEM_PROMPT = "\n".join(
+    (
+        "You are a Bannerlord campaign advisor managing a party.",
+        "Resources are limited — every decision has trade-offs.",
+        "You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only.",
+    )
+)
+
+# Option keys the budget scenario accepts inside "choices".
+_BUDGET_OPTIONS = ("recruit_infantry", "buy_food", "repair_armor")
+# Values the troop-tier scenario accepts for "action".
+_TROOP_ACTIONS = ("upgrade_recruits", "save_gold", "dismiss_recruits")
+# Python types accepted for numeric response fields.
+_NUMBER = (int, float)
+
+_BUDGET_PROMPT = (
+    "You have 500 gold. Options:\n"
+    "- Recruit 10 infantry: costs 300 gold, +10 combat strength\n"
+    "- Buy food for 20 days: costs 200 gold, keeps morale stable\n"
+    "- Repair armor: costs 150 gold, -20% casualty rate\n\n"
+    "You cannot afford all three. Morale is currently CRITICAL (troops may desert).\n"
+    'Choose 1-2 options. Respond: {"choices": ["option_a", ...], "gold_spent": <int>, "reason": "<why>"}\n'
+    "Where option keys are: recruit_infantry, buy_food, repair_armor"
+)
+
+_TROOP_PROMPT = (
+    "Party status:\n"
+    "- 15 Tier-1 recruits (weak, 30 upkeep/day)\n"
+    "- 5 Tier-3 veterans (strong, 90 upkeep/day)\n"
+    "- Daily income: 200 gold\n"
+    "- Upcoming: raider camp attack (moderate difficulty)\n\n"
+    "Options:\n"
+    "- Upgrade 5 recruits to Tier-2 (costs 250 gold total)\n"
+    "- Keep all current troops, save gold for emergencies\n"
+    "- Dismiss 5 recruits to save upkeep\n\n"
+    'Respond: {"action": "upgrade_recruits"|"save_gold"|"dismiss_recruits", '
+    '"reason": "<why>", "expected_outcome": "<string>"}'
+)
+
+_PLANNING_PROMPT = (
+    "Current: 300 gold, 10 days of food, 20 troops\n"
+    "Day 5: Must cross desert (costs 5 extra food days)\n"
+    "Day 10: Reach town (can buy supplies)\n\n"
+    "You need a 15-day food reserve to survive the journey.\n"
+    "Food costs 10 gold/day. You have enough for 10 days now.\n\n"
+    "How many extra food days do you buy today?\n"
+    'Respond: {"extra_food_days": <int>, "cost": <int>, "remaining_gold": <int>, "reason": "<why>"}'
+)
+
+
+def _budget_schema_ok(r):
+    """Return True when choices is a non-empty list of known options and gold_spent is a number <= 500."""
+    picks = r.get("choices")
+    if not isinstance(picks, list) or len(picks) < 1:
+        return False
+    if any(pick not in _BUDGET_OPTIONS for pick in picks):
+        return False
+    spent = r.get("gold_spent")
+    if not isinstance(spent, _NUMBER):
+        return False
+    return spent <= 500
+
+
+def _budget_is_strategic(r):
+    """Return True when food is among the chosen options."""
+    return "buy_food" in r.get("choices", [])
+
+
+def _troop_schema_ok(r):
+    """Return True when action is one of the three options and reason is a non-empty string."""
+    why = r.get("reason")
+    return r.get("action") in _TROOP_ACTIONS and isinstance(why, str) and why != ""
+
+
+def _troop_is_strategic(r):
+    """Return True unless the chosen action dismisses troops."""
+    return r.get("action") in ("upgrade_recruits", "save_gold")
+
+
+def _planning_schema_ok(r):
+    """Return True when extra_food_days, cost and remaining_gold are all numbers."""
+    return all(
+        isinstance(r.get(key), _NUMBER)
+        for key in ("extra_food_days", "cost", "remaining_gold")
+    )
+
+
+def _planning_is_strategic(r):
+    """Return True when at least 5 extra food days are bought."""
+    return r.get("extra_food_days", 0) >= 5
+
+
+# Each scenario carries the prompt, a schema validator ("check") and a
+# strategy validator ("strategic_check"), plus human-readable descriptions
+# that are printed in verbose mode when a validator fails.
+SCENARIOS = [
+    {
+        "description": "Budget allocation under constraint",
+        "prompt": _BUDGET_PROMPT,
+        "check": _budget_schema_ok,
+        "check_desc": "choices must be valid options, gold_spent <= 500",
+        "strategic_check": _budget_is_strategic,
+        "strategic_desc": "With CRITICAL morale, food should be prioritized",
+    },
+    {
+        "description": "Troop tier upgrade decision",
+        "prompt": _TROOP_PROMPT,
+        "check": _troop_schema_ok,
+        "check_desc": "action must be one of the three options with a non-empty reason",
+        "strategic_check": _troop_is_strategic,
+        "strategic_desc": "Dismissing troops before a fight is suboptimal",
+    },
+    {
+        "description": "Multi-turn planning horizon",
+        "prompt": _PLANNING_PROMPT,
+        "check": _planning_schema_ok,
+        "check_desc": "Must include extra_food_days, cost, remaining_gold as numbers",
+        "strategic_check": _planning_is_strategic,
+        "strategic_desc": "Need at least 5 more days of food for desert crossing",
+    },
+]
+
+
+@dataclass
+class ScenarioResult:
+    """Outcome of running a single entry of SCENARIOS against the model."""
+
+    # Position of the scenario within SCENARIOS.
+    scenario_index: int
+    # The scenario's "description" text.
+    description: str
+    # Message content exactly as returned by the client; "" when the call raised.
+    raw_response: str
+    # Result of json.loads on the cleaned response; None when the call or decoding failed.
+    parsed: dict | None
+    # True when the cleaned response decoded as JSON.
+    valid_json: bool
+    # True when the scenario's "check" validator accepted the parsed response.
+    schema_valid: bool
+    # True when the scenario's "strategic_check" accepted it (only evaluated if schema_valid).
+    strategically_sound: bool
+    # Elapsed time around the client.chat call, in milliseconds.
+    latency_ms: float
+    # Text of the exception from the call, JSON decoding or validation; "" when none occurred.
+    error: str = ""
+
+
+@dataclass
+class LevelResult:
+    """Aggregate outcome of one run() over every scenario of this level."""
+
+    # Level number and display name; default to the module constants.
+    level: int = LEVEL
+    name: str = NAME
+    # One ScenarioResult per scenario, appended by run() in scenario order.
+    trials: list[ScenarioResult] = field(default_factory=list)
+    # Whether the score reached the pass threshold applied in run().
+    passed: bool = False
+    # Fraction of scenarios whose response was valid JSON and schema-valid.
+    score: float = 0.0
+    # Latency statistics over calls that did not raise; both stay 0.0 when
+    # every call failed. run() stores the middle element of the sorted
+    # latencies as p50 and the largest latency as p99.
+    latency_p50_ms: float = 0.0
+    latency_p99_ms: float = 0.0
+
+
+def _clean_response(raw: str) -> str:
+    """Strip surrounding whitespace and, if present, markdown code fences.
+
+    Fence removal only happens when the stripped text begins with a fence;
+    in that case every line that starts with three backticks is dropped and
+    the remaining lines are re-joined and stripped again.
+    """
+    text = raw.strip()
+    if not text.startswith("```"):
+        return text
+    body_lines = (
+        line for line in text.splitlines() if not line.startswith("```")
+    )
+    return "\n".join(body_lines).strip()
+
+
+def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
+    """Run every scenario in SCENARIOS against ``model`` and score the answers.
+
+    Each scenario is sent as a system + user message pair. The reply is
+    stripped of markdown fences, decoded as JSON, then passed to the
+    scenario's ``check`` (schema) and, if that succeeds, ``strategic_check``.
+    A failing call, an undecodable reply or a validator exception is recorded
+    on the trial's ``error`` field rather than raised.
+
+    Args:
+        client: Object exposing ``chat(model=..., messages=..., options=...)``
+            whose return value supports ``response["message"]["content"]``
+            (looks like an Ollama-style client — confirm against the caller).
+        model: Model identifier forwarded to ``client.chat``.
+        verbose: When True, print a one-line summary per scenario.
+
+    Returns:
+        A LevelResult holding one ScenarioResult per scenario, the fraction
+        of schema-valid responses, the pass flag (at least two thirds valid)
+        and latency statistics over the calls that did not raise.
+    """
+    result = LevelResult()
+    latencies: list[float] = []
+
+    for i, scenario in enumerate(SCENARIOS):
+        # perf_counter is monotonic, so a system clock adjustment during the
+        # call cannot skew the measured latency or make it negative.
+        t0 = time.perf_counter()
+        try:
+            response = client.chat(
+                model=model,
+                messages=[
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": scenario["prompt"]},
+                ],
+                options={"temperature": 0.1},
+            )
+            raw = response["message"]["content"]
+            latency_ms = (time.perf_counter() - t0) * 1000
+        except Exception as exc:
+            # Benchmark boundary: a failed call is recorded as a failed trial
+            # so the remaining scenarios still run.
+            latency_ms = (time.perf_counter() - t0) * 1000
+            sr = ScenarioResult(
+                scenario_index=i,
+                description=scenario["description"],
+                raw_response="",
+                parsed=None,
+                valid_json=False,
+                schema_valid=False,
+                strategically_sound=False,
+                latency_ms=latency_ms,
+                error=str(exc),
+            )
+            result.trials.append(sr)
+            if verbose:
+                print(f"  Scenario {i}: ERROR — {exc}")
+            continue
+
+        # Only successful calls contribute to the latency statistics.
+        latencies.append(latency_ms)
+
+        cleaned = _clean_response(raw)
+        parsed = None
+        valid_json = False
+        schema_valid = False
+        strategically_sound = False
+        error = ""
+
+        try:
+            parsed = json.loads(cleaned)
+            valid_json = True
+            schema_valid = scenario["check"](parsed)
+            if schema_valid:
+                strategically_sound = scenario["strategic_check"](parsed)
+        except json.JSONDecodeError as exc:
+            error = f"JSONDecodeError: {exc}"
+        except Exception as exc:
+            # Validators assume a JSON object; any other shape (e.g. a list)
+            # raises inside them and is reported as a validation failure.
+            error = f"Validation error: {exc}"
+
+        sr = ScenarioResult(
+            scenario_index=i,
+            description=scenario["description"],
+            raw_response=raw,
+            parsed=parsed,
+            valid_json=valid_json,
+            schema_valid=schema_valid,
+            strategically_sound=strategically_sound,
+            latency_ms=latency_ms,
+            error=error,
+        )
+        result.trials.append(sr)
+
+        if verbose:
+            status = "PASS" if (valid_json and schema_valid) else "FAIL"
+            strat = "strategic" if strategically_sound else "suboptimal"
+            print(
+                f"  Scenario {i} [{scenario['description']}]: {status} ({strat}) "
+                f"| {latency_ms:.0f}ms"
+            )
+            if not schema_valid and valid_json:
+                print(f"    Schema issue: {scenario['check_desc']}")
+            if not strategically_sound and schema_valid:
+                print(f"    Strategy note: {scenario['strategic_desc']}")
+
+    total = len(SCENARIOS)
+    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
+    result.score = valid_count / total if total else 0.0
+    # Pass at two thirds of the scenarios. The comparison is done on integers
+    # because 2/3 as a float is 0.666..., which falls below a 0.67 cut-off
+    # and would mark a 2-of-3 run as failed.
+    result.passed = total > 0 and 3 * valid_count >= 2 * total
+
+    if latencies:
+        latencies_sorted = sorted(latencies)
+        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
+        # With this few samples the 99th percentile is simply the maximum.
+        result.latency_p99_ms = latencies_sorted[-1]
+
+    return result