forked from Rockachopa/Timmy-time-dashboard
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
214 lines
7.9 KiB
Python
214 lines
7.9 KiB
Python
"""Level 2: Resource Management — Party Economy.
|
|
|
|
Tests whether the model can allocate limited resources across competing
|
|
priorities and adapt when constraints change.
|
|
Maps to: Bannerlord party economy (troops, food, gold, morale).
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
# Identity of this benchmark level, reported back in LevelResult.
LEVEL = 2
NAME = "Resource Management (Party Economy)"
DESCRIPTION = (
    "Model must allocate limited resources across troops, food, and equipment."
)

# System message sent with every scenario; it pins the model to raw-JSON replies.
SYSTEM_PROMPT = "\n".join(
    [
        "You are a Bannerlord campaign advisor managing a party.",
        "Resources are limited — every decision has trade-offs.",
        "You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only.",
    ]
)
|
|
|
|
# Option keys the budget scenario accepts in "choices".
_BUDGET_OPTIONS = ("recruit_infantry", "buy_food", "repair_armor")
# Action keys the troop-upgrade scenario accepts in "action".
_UPGRADE_ACTIONS = ("upgrade_recruits", "save_gold", "dismiss_recruits")
# Numeric fields the food-planning scenario requires.
_FOOD_NUMERIC_KEYS = ("extra_food_days", "cost", "remaining_gold")

_BUDGET_PROMPT = (
    "You have 500 gold. Options:\n"
    "- Recruit 10 infantry: costs 300 gold, +10 combat strength\n"
    "- Buy food for 20 days: costs 200 gold, keeps morale stable\n"
    "- Repair armor: costs 150 gold, -20% casualty rate\n\n"
    "You cannot afford all three. Morale is currently CRITICAL (troops may desert).\n"
    'Choose 1-2 options. Respond: {"choices": ["option_a", ...], "gold_spent": <int>, "reason": "<why>"}\n'
    "Where option keys are: recruit_infantry, buy_food, repair_armor"
)

_UPGRADE_PROMPT = (
    "Party status:\n"
    "- 15 Tier-1 recruits (weak, 30 upkeep/day)\n"
    "- 5 Tier-3 veterans (strong, 90 upkeep/day)\n"
    "- Daily income: 200 gold\n"
    "- Upcoming: raider camp attack (moderate difficulty)\n\n"
    "Options:\n"
    "- Upgrade 5 recruits to Tier-2 (costs 250 gold total)\n"
    "- Keep all current troops, save gold for emergencies\n"
    "- Dismiss 5 recruits to save upkeep\n\n"
    'Respond: {"action": "upgrade_recruits"|"save_gold"|"dismiss_recruits", '
    '"reason": "<why>", "expected_outcome": "<string>"}'
)

_FOOD_PROMPT = (
    "Current: 300 gold, 10 days of food, 20 troops\n"
    "Day 5: Must cross desert (costs 5 extra food days)\n"
    "Day 10: Reach town (can buy supplies)\n\n"
    "You need a 15-day food reserve to survive the journey.\n"
    "Food costs 10 gold/day. You have enough for 10 days now.\n\n"
    "How many extra food days do you buy today?\n"
    'Respond: {"extra_food_days": <int>, "cost": <int>, "remaining_gold": <int>, "reason": "<why>"}'
)


def _budget_schema_ok(r):
    """Return True when "choices" is a non-empty list of known options and "gold_spent" is a number <= 500."""
    choices = r.get("choices")
    if not isinstance(choices, list) or len(choices) < 1:
        return False
    if any(choice not in _BUDGET_OPTIONS for choice in choices):
        return False
    spent = r.get("gold_spent")
    return isinstance(spent, (int, float)) and spent <= 500


def _budget_prioritizes_food(r):
    """Return True when "buy_food" is among the chosen options."""
    return "buy_food" in r.get("choices", [])


def _upgrade_schema_ok(r):
    """Return True when "action" is a known action and "reason" is a non-empty string."""
    action_ok = r.get("action") in _UPGRADE_ACTIONS
    reason = r.get("reason")
    return action_ok and isinstance(reason, str) and len(reason) > 0


def _upgrade_keeps_troops(r):
    """Return True when the chosen action does not dismiss troops."""
    return r.get("action") in ("upgrade_recruits", "save_gold")


def _food_schema_ok(r):
    """Return True when every required food-planning field is an int or float."""
    return all(isinstance(r.get(key), (int, float)) for key in _FOOD_NUMERIC_KEYS)


def _food_covers_desert(r):
    """Return True when at least 5 extra food days are bought."""
    return r.get("extra_food_days", 0) >= 5


# Each scenario carries the user prompt, a schema predicate ("check") and a
# strategy predicate ("strategic_check"), plus human-readable descriptions of
# both that are printed in verbose mode.
SCENARIOS = [
    {
        "description": "Budget allocation under constraint",
        "prompt": _BUDGET_PROMPT,
        "check": _budget_schema_ok,
        "check_desc": "choices must be valid options, gold_spent <= 500",
        "strategic_check": _budget_prioritizes_food,
        "strategic_desc": "With CRITICAL morale, food should be prioritized",
    },
    {
        "description": "Troop tier upgrade decision",
        "prompt": _UPGRADE_PROMPT,
        "check": _upgrade_schema_ok,
        "check_desc": "action must be one of the three options with a non-empty reason",
        "strategic_check": _upgrade_keeps_troops,
        "strategic_desc": "Dismissing troops before a fight is suboptimal",
    },
    {
        "description": "Multi-turn planning horizon",
        "prompt": _FOOD_PROMPT,
        "check": _food_schema_ok,
        "check_desc": "Must include extra_food_days, cost, remaining_gold as numbers",
        "strategic_check": _food_covers_desert,
        "strategic_desc": "Need at least 5 more days of food for desert crossing",
    },
]
|
|
|
|
|
|
@dataclass
|
|
class ScenarioResult:
|
|
scenario_index: int
|
|
description: str
|
|
raw_response: str
|
|
parsed: dict | None
|
|
valid_json: bool
|
|
schema_valid: bool
|
|
strategically_sound: bool
|
|
latency_ms: float
|
|
error: str = ""
|
|
|
|
|
|
@dataclass
class LevelResult:
    """Aggregate outcome of one run of this benchmark level.

    Attributes:
        level: Numeric level identifier (defaults to the module's LEVEL).
        name: Human-readable level name (defaults to the module's NAME).
        trials: One ScenarioResult per scenario attempted, in order.
        passed: Whether the level's pass threshold was met.
        score: Fraction of scenarios with valid JSON that passed the schema check.
        latency_p50_ms: Median latency of successful model calls, in milliseconds.
        latency_p99_ms: Tail latency of successful model calls, in milliseconds.
    """

    level: int = LEVEL
    name: str = NAME
    # default_factory gives every instance its own list rather than a shared one.
    trials: list[ScenarioResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
|
|
|
|
|
|
def _clean_response(raw: str) -> str:
|
|
raw = raw.strip()
|
|
if raw.startswith("```"):
|
|
lines = raw.splitlines()
|
|
lines = [l for l in lines if not l.startswith("```")]
|
|
raw = "\n".join(lines).strip()
|
|
return raw
|
|
|
|
|
|
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every scenario in SCENARIOS against ``model`` and aggregate the results.

    For each scenario the system prompt and the scenario prompt are sent via
    ``client.chat`` with temperature 0.1. The reply is cleaned with
    ``_clean_response``, decoded as JSON, and passed to the scenario's
    ``check`` and, if that succeeds, its ``strategic_check``. A failure of
    the chat call is recorded as a failed trial and the loop continues.

    Args:
        client: Object exposing ``chat(model=..., messages=..., options=...)``
            whose result supports ``response["message"]["content"]``
            (presumably an Ollama-style client; confirm against the caller).
        model: Model identifier forwarded to ``client.chat``.
        verbose: When true, print a status line (and any schema or strategy
            note) for each scenario.

    Returns:
        A LevelResult holding one ScenarioResult per scenario, the fraction of
        schema-valid answers as ``score``, ``passed`` set when at least two
        thirds of the scenarios are schema-valid, and latency figures computed
        from the calls that did not raise.
    """
    result = LevelResult()
    latencies: list[float] = []

    for i, scenario in enumerate(SCENARIOS):
        # perf_counter is monotonic, so the measured latency cannot be skewed
        # by wall-clock adjustments during the call.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            # Treat missing/None content as an empty reply so it is reported
            # as invalid JSON instead of crashing during cleaning.
            raw = response["message"]["content"] or ""
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # Best-effort boundary: one failing call must not abort the level.
            latency_ms = (time.perf_counter() - t0) * 1000
            result.trials.append(
                ScenarioResult(
                    scenario_index=i,
                    description=scenario["description"],
                    raw_response="",
                    parsed=None,
                    valid_json=False,
                    schema_valid=False,
                    strategically_sound=False,
                    latency_ms=latency_ms,
                    error=str(exc),
                )
            )
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue

        # Only successful calls contribute to the latency statistics.
        latencies.append(latency_ms)

        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True
            if isinstance(parsed, dict):
                schema_valid = bool(scenario["check"](parsed))
                if schema_valid:
                    strategically_sound = bool(scenario["strategic_check"](parsed))
            else:
                # The checks call dict methods; report non-object JSON
                # explicitly rather than via an opaque AttributeError.
                error = (
                    "Validation error: expected a JSON object, "
                    f"got {type(parsed).__name__}"
                )
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            error = f"Validation error: {exc}"

        result.trials.append(
            ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response=raw,
                parsed=parsed,
                valid_json=valid_json,
                schema_valid=schema_valid,
                strategically_sound=strategically_sound,
                latency_ms=latency_ms,
                error=error,
            )
        )

        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f" Schema issue: {scenario['check_desc']}")
            if not strategically_sound and schema_valid:
                print(f" Strategy note: {scenario['strategic_desc']}")

    total = len(SCENARIOS)
    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = valid_count / total if total else 0.0
    # Pass at two thirds of the scenarios. Compared in integers because the
    # float 2/3 (0.666...) is below a literal 0.67 cut-off, which would reject
    # exactly 2 of 3 correct scenarios.
    result.passed = total > 0 and 3 * valid_count >= 2 * total

    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # With so few samples the 99th percentile is simply the maximum.
        result.latency_p99_ms = latencies_sorted[-1]

    return result
|