# Timmy-time-dashboard/timmy-benchmark/levels/level_2_resource_mgmt.py

"""Level 2: Resource Management — Party Economy.

Tests whether the model can allocate limited resources across competing
priorities and adapt when constraints change.
Maps to: Bannerlord party economy (troops, food, gold, morale).
"""

import json
import time
from dataclasses import dataclass, field
from typing import Any

# Identity of this benchmark level; also used as LevelResult defaults.
LEVEL: int = 2
NAME: str = "Resource Management (Party Economy)"
DESCRIPTION: str = (
    "Model must allocate limited resources across troops, food, and equipment."
)

# System message sent with every scenario. The last line demands raw JSON so
# the reply can be fed straight into json.loads.
SYSTEM_PROMPT: str = (
    "You are a Bannerlord campaign advisor managing a party.\n"
    "Resources are limited — every decision has trade-offs.\n"
    "You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only."
)

# Option keys the prompts ask the model to choose from.
_BUDGET_OPTIONS = ("recruit_infantry", "buy_food", "repair_armor")
_TROOP_ACTIONS = ("upgrade_recruits", "save_gold", "dismiss_recruits")


def _is_number(value: Any) -> bool:
    """Return True for int/float values, excluding bool.

    ``bool`` subclasses ``int``, so a JSON ``true``/``false`` would otherwise
    slip through a plain ``isinstance(value, (int, float))`` test.
    """
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _check_budget(r: Any) -> bool:
    """Schema check for the budget scenario.

    Requires a JSON object with one or two valid option keys (the prompt says
    "Choose 1-2 options"; all three cost 650 gold) and a numeric
    ``gold_spent`` between 0 and the 500 gold budget.
    """
    if not isinstance(r, dict):
        return False
    choices = r.get("choices")
    gold_spent = r.get("gold_spent")
    return (
        isinstance(choices, list)
        and 1 <= len(choices) <= 2
        and all(c in _BUDGET_OPTIONS for c in choices)
        and _is_number(gold_spent)
        and 0 <= gold_spent <= 500
    )


def _check_troop_action(r: Any) -> bool:
    """Schema check for the upgrade scenario: known action plus non-empty reason."""
    if not isinstance(r, dict):
        return False
    reason = r.get("reason")
    return (
        r.get("action") in _TROOP_ACTIONS
        and isinstance(reason, str)
        and len(reason) > 0
    )


def _check_food_plan(r: Any) -> bool:
    """Schema check for the planning scenario: three numeric (non-bool) fields."""
    if not isinstance(r, dict):
        return False
    return all(
        _is_number(r.get(key))
        for key in ("extra_food_days", "cost", "remaining_gold")
    )


# Each scenario carries:
#   description     - short label used in results and verbose output
#   prompt          - user message sent to the model
#   check           - callable(parsed_json) -> bool, structural validity
#   check_desc      - human-readable summary of ``check``
#   strategic_check - callable(parsed_json) -> bool, only meaningful once
#                     ``check`` has passed
#   strategic_desc  - human-readable summary of ``strategic_check``
SCENARIOS = [
    {
        "description": "Budget allocation under constraint",
        "prompt": (
            "You have 500 gold. Options:\n"
            "- Recruit 10 infantry: costs 300 gold, +10 combat strength\n"
            "- Buy food for 20 days: costs 200 gold, keeps morale stable\n"
            "- Repair armor: costs 150 gold, -20% casualty rate\n\n"
            "You cannot afford all three. Morale is currently CRITICAL (troops may desert).\n"
            'Choose 1-2 options. Respond: {"choices": ["option_a", ...], "gold_spent": <int>, "reason": "<why>"}\n'
            "Where option keys are: recruit_infantry, buy_food, repair_armor"
        ),
        "check": _check_budget,
        "check_desc": "choices must be 1-2 valid options, 0 <= gold_spent <= 500",
        "strategic_check": lambda r: "buy_food" in r.get("choices", []),
        "strategic_desc": "With CRITICAL morale, food should be prioritized",
    },
    {
        "description": "Troop tier upgrade decision",
        "prompt": (
            "Party status:\n"
            "- 15 Tier-1 recruits (weak, 30 upkeep/day)\n"
            "- 5 Tier-3 veterans (strong, 90 upkeep/day)\n"
            "- Daily income: 200 gold\n"
            "- Upcoming: raider camp attack (moderate difficulty)\n\n"
            "Options:\n"
            "- Upgrade 5 recruits to Tier-2 (costs 250 gold total)\n"
            "- Keep all current troops, save gold for emergencies\n"
            "- Dismiss 5 recruits to save upkeep\n\n"
            'Respond: {"action": "upgrade_recruits"|"save_gold"|"dismiss_recruits", '
            '"reason": "<why>", "expected_outcome": "<string>"}'
        ),
        "check": _check_troop_action,
        "check_desc": "action must be one of the three options with a non-empty reason",
        "strategic_check": lambda r: r.get("action") in ["upgrade_recruits", "save_gold"],
        "strategic_desc": "Dismissing troops before a fight is suboptimal",
    },
    {
        "description": "Multi-turn planning horizon",
        "prompt": (
            "Current: 300 gold, 10 days of food, 20 troops\n"
            "Day 5: Must cross desert (costs 5 extra food days)\n"
            "Day 10: Reach town (can buy supplies)\n\n"
            "You need a 15-day food reserve to survive the journey.\n"
            "Food costs 10 gold/day. You have enough for 10 days now.\n\n"
            "How many extra food days do you buy today?\n"
            'Respond: {"extra_food_days": <int>, "cost": <int>, "remaining_gold": <int>, "reason": "<why>"}'
        ),
        "check": _check_food_plan,
        "check_desc": "Must include extra_food_days, cost, remaining_gold as numbers",
        "strategic_check": lambda r: r.get("extra_food_days", 0) >= 5,
        "strategic_desc": "Need at least 5 more days of food for desert crossing",
    },
]


@dataclass
class ScenarioResult:
    """Outcome of sending one scenario to the model and validating its reply."""

    scenario_index: int  # position of the scenario within SCENARIOS
    description: str  # the scenario's "description" label
    raw_response: str  # reply content as received; "" when the request failed
    parsed: dict | None  # json.loads result, or None when parsing did not succeed
    valid_json: bool  # True once the cleaned reply parsed as JSON
    schema_valid: bool  # result of the scenario's "check" callable
    strategically_sound: bool  # result of "strategic_check"; only run when schema_valid
    latency_ms: float  # elapsed time of the chat request, in milliseconds
    error: str = ""  # exception text from the request or validation; "" when none


@dataclass
class LevelResult:
    """Aggregate result for this level, filled in by ``run``."""

    level: int = LEVEL  # numeric level identifier
    name: str = NAME  # human-readable level name
    # One ScenarioResult per scenario, appended in scenario order.
    trials: list[ScenarioResult] = field(default_factory=list)
    passed: bool = False  # whether the score met the level's pass threshold
    score: float = 0.0  # fraction of scenarios with valid JSON that passed "check"
    latency_p50_ms: float = 0.0  # middle element of the sorted request latencies
    latency_p99_ms: float = 0.0  # set to the largest recorded request latency


def _clean_response(raw: str) -> str:
    raw = raw.strip()
    if raw.startswith("```"):
        lines = raw.splitlines()
        lines = [l for l in lines if not l.startswith("```")]
        raw = "\n".join(lines).strip()
    return raw


def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every scenario of this level against ``model`` and score the replies.

    Each scenario is sent as one chat request (system prompt plus the scenario
    prompt, temperature 0.1). The reply is cleaned of Markdown fences, parsed
    as JSON, validated with the scenario's ``check`` callable and, when that
    passes, graded with its ``strategic_check`` callable. Failures of the
    request or of validation are recorded on the trial instead of being raised.

    Args:
        client: Object exposing ``chat(model=..., messages=..., options=...)``
            whose return value supports ``response["message"]["content"]``.
        model: Model identifier forwarded to ``client.chat``.
        verbose: When true, print a one-line status per scenario.

    Returns:
        A ``LevelResult`` with one ``ScenarioResult`` per scenario, the
        fraction of scenarios that produced schema-valid JSON, the pass flag
        (at least two thirds of the scenarios schema-valid) and latency
        statistics over the requests that completed.
    """
    result = LevelResult()
    latencies: list[float] = []

    for i, scenario in enumerate(SCENARIOS):
        # perf_counter is monotonic, so a wall-clock adjustment during the
        # request cannot produce a negative or inflated latency.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # Any client failure is recorded on the trial so the remaining
            # scenarios still run. Failed requests are left out of the
            # latency statistics.
            latency_ms = (time.perf_counter() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                strategically_sound=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f"  Scenario {i}: ERROR — {exc}")
            continue

        latencies.append(latency_ms)

        if not isinstance(raw, str):
            # A missing or non-text content field should fail this scenario as
            # invalid JSON rather than crash the whole level in the cleaner.
            raw = "" if raw is None else str(raw)

        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = bool(scenario["check"](parsed))
            if schema_valid:
                strategically_sound = bool(scenario["strategic_check"](parsed))
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            # The check callables may raise on unexpected JSON shapes (e.g. a
            # list instead of an object); treat that as a validation failure.
            error = f"Validation error: {exc}"

        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            strategically_sound=strategically_sound,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)

        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f"  Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f"    Schema issue: {scenario['check_desc']}")
            if not strategically_sound and schema_valid:
                print(f"    Strategy note: {scenario['strategic_desc']}")

    total = len(SCENARIOS)
    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = valid_count / total if total else 0.0
    # Pass at two thirds of the scenarios. Compared in integers because
    # 2/3 (about 0.6667) is below a 0.67 float threshold, which would make
    # 2 of 3 valid scenarios fail the level.
    result.passed = total > 0 and 3 * valid_count >= 2 * total

    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # With fewer than 100 samples the nearest-rank 99th percentile is the
        # maximum, so the largest latency is reported.
        result.latency_p99_ms = latencies_sorted[-1]

    return result