Files
Timmy-time-dashboard/timmy-benchmark/levels/level_4_trade_route.py
Alexander Whitestone 9e08e87312
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled
[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com>
Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
2026-03-23 19:38:48 +00:00

224 lines
8.8 KiB
Python

"""Level 4: Trade Route — Campaign Navigation.
Tests multi-step planning ability: route optimization, trade-off analysis
across time horizons, and adapting plans when conditions change.
Maps to: Bannerlord campaign map navigation, caravans, and economy.
"""
import json
import time
from dataclasses import dataclass, field
from typing import Any
# Benchmark level number; also the default value of LevelResult.level.
LEVEL = 4
# Human-readable level title; also the default value of LevelResult.name.
NAME = "Trade Route (Campaign Navigation)"
# One-line summary of what this level asks of the model.
DESCRIPTION = "Model must plan optimal routes and adapt to changing conditions on the campaign map."
# System message sent ahead of every scenario prompt by run(). It demands raw
# JSON because replies are fed straight to json.loads after fence stripping.
SYSTEM_PROMPT = """You are a Bannerlord merchant lord planning campaign movements.
Consider distance, profitability, risk, and timing.
You MUST respond ONLY with valid JSON. No markdown. Raw JSON only."""
def _is_number(value: object) -> bool:
    """Return True for int/float values, excluding bool.

    ``bool`` is a subclass of ``int``, so a bare ``isinstance(v, (int, float))``
    would accept JSON ``true``/``false`` where the prompt asks for a number.
    """
    return isinstance(value, (int, float)) and not isinstance(value, bool)


# Scenario table for this level. Each entry is a dict with:
#   description     - short label used in results and verbose output
#   prompt          - user message sent to the model
#   check           - callable(parsed_reply) -> bool; schema validation
#   check_desc      - human-readable summary of what "check" requires
#   strategic_check - callable(parsed_reply) -> bool; only evaluated by run()
#                     after "check" has passed
#   strategic_desc  - human-readable rationale for "strategic_check"
SCENARIOS = [
    {
        "description": "Optimal trade route selection",
        "prompt": (
            "You are at Epicrotea with 500 gold and 20 days travel budget.\n\n"
            "Trade opportunities:\n"
            "- Route A: Epicrotea → Vlandia (3 days) → Sturgia (5 days back)\n"
            " Sell grain in Vlandia: +300 gold. Buy furs in Sturgia: costs 200, sells for 400 in Calradia.\n"
            " Total: +500 gold profit, 8 days.\n"
            "- Route B: Epicrotea → Calradia (2 days) → Aserai (4 days)\n"
            " Sell iron in Calradia: +150 gold. Buy spice in Aserai: costs 300, sells for 600 in Empire.\n"
            " Empire is 6 more days away. Total: +450 gold profit, 12 days.\n"
            "- Route C: Epicrotea → nearby village (1 day)\n"
            " Buy cheap food: costs 100, sells for 180 in any city.\n"
            " Total: +80 gold profit, 2 days. Repeatable.\n\n"
            'Choose route. Respond:\n'
            '{"route": "A"|"B"|"C", "expected_profit": <int>, "days_used": <int>, '
            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
        ),
        "check": lambda r: (
            r.get("route") in ["A", "B", "C"]
            and _is_number(r.get("expected_profit"))
            and _is_number(r.get("days_used"))
            and r.get("risk") in ["low", "medium", "high"]
        ),
        "check_desc": "route, expected_profit, days_used, risk must be valid",
        "strategic_check": lambda r: r.get("route") in ["A", "C"],  # A is best single trip, C is best if repeated
        "strategic_desc": "Route A has best profit/day ratio; C is best if multiple loops possible",
    },
    {
        "description": "Adapt plan when war declared",
        # NOTE(review): the prompt says "equidistant" but then lists 2/3/4-day
        # distances. Left unchanged so results stay comparable with earlier
        # benchmark runs — confirm whether the wording should be fixed.
        "prompt": (
            "You were heading to Vlandia to trade, 2 days into the journey.\n"
            "NEWS: Vlandia just declared war on your faction. Entering Vlandia territory is now dangerous.\n\n"
            "Your current position: borderlands, equidistant between:\n"
            "- Vlandia (2 days): Now at war — high risk of attack\n"
            "- Sturgia (3 days): Neutral — safe\n"
            "- Empire (4 days): Allied — very safe, good prices\n\n"
            "You have 400 gold of trade goods for the Vlandia market.\n"
            'What do you do? Respond:\n'
            '{"decision": "continue_to_vlandia"|"divert_to_sturgia"|"divert_to_empire", '
            '"reason": "<why>", "gold_at_risk": <int>}'
        ),
        "check": lambda r: (
            r.get("decision") in ["continue_to_vlandia", "divert_to_sturgia", "divert_to_empire"]
            and _is_number(r.get("gold_at_risk"))
        ),
        "check_desc": "decision must be one of three options, gold_at_risk must be a number",
        "strategic_check": lambda r: r.get("decision") in ["divert_to_sturgia", "divert_to_empire"],
        "strategic_desc": "Should avoid active war zone — divert to safe destination",
    },
    {
        "description": "Multi-stop route planning with constraints",
        "prompt": (
            "Plan a 3-stop trading circuit starting and ending at Pravend.\n"
            "Budget: 800 gold. Time limit: 20 days.\n\n"
            "Available cities and travel times from Pravend:\n"
            "- Rhotae: 2 days (leather cheap, sells well in south)\n"
            "- Ortysia: 4 days (grain surplus — buy cheap)\n"
            "- Epicrotea: 3 days (iron market — buy/sell)\n"
            "- Pen Cannoc: 5 days (wine — high profit, far)\n\n"
            "Each stop takes 1 day for trading.\n"
            'Plan 3 stops. Respond:\n'
            '{"stops": ["<city1>", "<city2>", "<city3>"], '
            '"total_days": <int>, "estimated_profit": <int>, '
            '"reason": "<reasoning>"}'
        ),
        "check": lambda r: (
            isinstance(r.get("stops"), list)
            and len(r["stops"]) == 3
            and all(isinstance(s, str) for s in r["stops"])
            # total_days is known to be a real number before it is compared.
            and _is_number(r.get("total_days"))
            and r["total_days"] <= 20
            and _is_number(r.get("estimated_profit"))
        ),
        "check_desc": "stops must be list of 3 strings, total_days <= 20, estimated_profit numeric",
        "strategic_check": lambda r: "Pen Cannoc" not in r.get("stops", []),  # Too far for 20 days
        "strategic_desc": "Pen Cannoc at 5 days each way is likely too far for a 20-day circuit",
    },
]
@dataclass
class ScenarioResult:
    """Outcome of running a single scenario from SCENARIOS against the model."""

    scenario_index: int  # position of the scenario within SCENARIOS
    description: str  # the scenario's "description" text
    raw_response: str  # model reply as received; "" when the chat call raised
    parsed: dict | None  # parsed JSON reply, or None when unavailable
    valid_json: bool  # True once the cleaned reply decoded as JSON
    schema_valid: bool  # outcome of the scenario's "check" callable
    strategically_sound: bool  # outcome of "strategic_check"; only evaluated when schema_valid
    latency_ms: float  # elapsed milliseconds for the chat call
    error: str = ""  # text of the call/decode/validation failure; "" when none occurred
@dataclass
class LevelResult:
    """Aggregate outcome of this benchmark level, filled in by run()."""

    level: int = LEVEL  # benchmark level number
    name: str = NAME  # human-readable level title
    trials: list[ScenarioResult] = field(default_factory=list)  # one entry per scenario attempted
    passed: bool = False  # run() sets this to score >= 0.67
    score: float = 0.0  # fraction of scenarios whose reply was valid JSON and schema-valid
    latency_p50_ms: float = 0.0  # middle element of the sorted latencies of calls that did not raise
    latency_p99_ms: float = 0.0  # run() stores the largest observed latency here
def _clean_response(raw: str) -> str:
raw = raw.strip()
if raw.startswith("```"):
lines = raw.splitlines()
lines = [l for l in lines if not l.startswith("```")]
raw = "\n".join(lines).strip()
return raw
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every scenario in SCENARIOS against *model* and score the level.

    Each scenario is sent as a system + user message pair. The reply is
    stripped of code fences, decoded as JSON, validated with the scenario's
    "check" callable and, if that passes, judged by "strategic_check".
    Failures of any kind are recorded on the trial rather than raised.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose return value supports ``response["message"]["content"]``.
        model: Model identifier passed through to ``client.chat``.
        verbose: When true, print a one-line status per scenario.

    Returns:
        A LevelResult with one ScenarioResult per scenario, the fraction of
        scenarios that were valid JSON and schema-valid, the pass flag, and
        latency statistics over the calls that completed without error.
    """
    result = LevelResult()
    latencies: list[float] = []
    for i, scenario in enumerate(SCENARIOS):
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # (or go negative) if the system clock is adjusted mid-call.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.2},
            )
            raw = response["message"]["content"]
            # Reject non-text content here so it is recorded as a failed call
            # instead of crashing the whole level during response cleaning.
            if not isinstance(raw, str):
                raise TypeError(
                    f"expected string message content, got {type(raw).__name__}"
                )
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            latency_ms = (time.perf_counter() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                strategically_sound=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue

        # Only completed calls contribute to the latency statistics.
        latencies.append(latency_ms)
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""
        try:
            decoded = json.loads(cleaned)
            valid_json = True
            if isinstance(decoded, dict):
                parsed = decoded
                schema_valid = bool(scenario["check"](parsed))
                # Strategy is only judged on replies that match the schema.
                if schema_valid:
                    strategically_sound = bool(scenario["strategic_check"](parsed))
            else:
                # The checks call .get() on the reply, so arrays and scalars
                # can never validate; report that explicitly.
                error = (
                    "Validation error: expected a JSON object, "
                    f"got {type(decoded).__name__}"
                )
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            # A check callable raised on an unexpected value shape.
            error = f"Validation error: {exc}"

        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            strategically_sound=strategically_sound,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)
        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f" Schema issue: {scenario['check_desc']}")

    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    # Guard the division so an empty scenario table scores 0 instead of raising.
    result.score = valid_count / len(SCENARIOS) if SCENARIOS else 0.0
    # NOTE(review): 2/3 is about 0.667, which is below 0.67, so with three
    # scenarios all three must be schema-valid to pass — confirm this is intended.
    result.passed = result.score >= 0.67
    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # With only a few samples the 99th percentile is the maximum.
        result.latency_p99_ms = latencies_sorted[-1]
    return result