Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
224 lines
8.8 KiB
Python
224 lines
8.8 KiB
Python
"""Level 4: Trade Route — Campaign Navigation.

Tests multi-step planning ability: route optimization, trade-off analysis
across time horizons, and adapting plans when conditions change.
Maps to: Bannerlord campaign map navigation, caravans, and economy.
"""
|
|
|
|
import json
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
# Level metadata; LEVEL and NAME are also the default values of LevelResult.
LEVEL = 4
NAME = "Trade Route (Campaign Navigation)"
DESCRIPTION = (
    "Model must plan optimal routes and adapt to changing conditions "
    "on the campaign map."
)

# System message sent ahead of every scenario prompt (three lines joined by
# newlines).
SYSTEM_PROMPT = "\n".join(
    (
        "You are a Bannerlord merchant lord planning campaign movements.",
        "Consider distance, profitability, risk, and timing.",
        "You MUST respond ONLY with valid JSON. No markdown. Raw JSON only.",
    )
)
|
|
|
|
def _is_number(value: Any) -> bool:
    """Return True for an int or float that is not a bool.

    ``bool`` is a subclass of ``int``, so a plain ``isinstance(x, (int, float))``
    would accept JSON ``true``/``false`` where a number is required.
    """
    return isinstance(value, (int, float)) and not isinstance(value, bool)


# Each scenario is a dict with:
#   description     - short label used in results and verbose output
#   prompt          - user message sent to the model
#   check           - callable(parsed reply) -> bool, validates the reply schema
#   check_desc      - human-readable summary of what ``check`` requires
#   strategic_check - callable(parsed reply) -> bool, judges the chosen plan
#   strategic_desc  - human-readable rationale for ``strategic_check``
SCENARIOS: list[dict[str, Any]] = [
    {
        "description": "Optimal trade route selection",
        "prompt": (
            "You are at Epicrotea with 500 gold and 20 days travel budget.\n\n"
            "Trade opportunities:\n"
            "- Route A: Epicrotea → Vlandia (3 days) → Sturgia (5 days back)\n"
            " Sell grain in Vlandia: +300 gold. Buy furs in Sturgia: costs 200, sells for 400 in Calradia.\n"
            " Total: +500 gold profit, 8 days.\n"
            "- Route B: Epicrotea → Calradia (2 days) → Aserai (4 days)\n"
            " Sell iron in Calradia: +150 gold. Buy spice in Aserai: costs 300, sells for 600 in Empire.\n"
            " Empire is 6 more days away. Total: +450 gold profit, 12 days.\n"
            "- Route C: Epicrotea → nearby village (1 day)\n"
            " Buy cheap food: costs 100, sells for 180 in any city.\n"
            " Total: +80 gold profit, 2 days. Repeatable.\n\n"
            'Choose route. Respond:\n'
            '{"route": "A"|"B"|"C", "expected_profit": <int>, "days_used": <int>, '
            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
        ),
        "check": lambda r: (
            r.get("route") in ("A", "B", "C")
            and _is_number(r.get("expected_profit"))
            and _is_number(r.get("days_used"))
            and r.get("risk") in ("low", "medium", "high")
        ),
        "check_desc": "route, expected_profit, days_used, risk must be valid",
        # A is best single trip, C is best if repeated
        "strategic_check": lambda r: r.get("route") in ("A", "C"),
        "strategic_desc": "Route A has best profit/day ratio; C is best if multiple loops possible",
    },
    {
        "description": "Adapt plan when war declared",
        # NOTE(review): the prompt says "equidistant" but then lists 2/3/4-day
        # distances. Left byte-identical so results stay comparable; confirm
        # whether the wording should be corrected.
        "prompt": (
            "You were heading to Vlandia to trade, 2 days into the journey.\n"
            "NEWS: Vlandia just declared war on your faction. Entering Vlandia territory is now dangerous.\n\n"
            "Your current position: borderlands, equidistant between:\n"
            "- Vlandia (2 days): Now at war — high risk of attack\n"
            "- Sturgia (3 days): Neutral — safe\n"
            "- Empire (4 days): Allied — very safe, good prices\n\n"
            "You have 400 gold of trade goods for the Vlandia market.\n"
            'What do you do? Respond:\n'
            '{"decision": "continue_to_vlandia"|"divert_to_sturgia"|"divert_to_empire", '
            '"reason": "<why>", "gold_at_risk": <int>}'
        ),
        "check": lambda r: (
            r.get("decision") in ("continue_to_vlandia", "divert_to_sturgia", "divert_to_empire")
            and _is_number(r.get("gold_at_risk"))
        ),
        "check_desc": "decision must be one of three options, gold_at_risk must be a number",
        "strategic_check": lambda r: r.get("decision") in ("divert_to_sturgia", "divert_to_empire"),
        "strategic_desc": "Should avoid active war zone — divert to safe destination",
    },
    {
        "description": "Multi-stop route planning with constraints",
        "prompt": (
            "Plan a 3-stop trading circuit starting and ending at Pravend.\n"
            "Budget: 800 gold. Time limit: 20 days.\n\n"
            "Available cities and travel times from Pravend:\n"
            "- Rhotae: 2 days (leather cheap, sells well in south)\n"
            "- Ortysia: 4 days (grain surplus — buy cheap)\n"
            "- Epicrotea: 3 days (iron market — buy/sell)\n"
            "- Pen Cannoc: 5 days (wine — high profit, far)\n\n"
            "Each stop takes 1 day for trading.\n"
            'Plan 3 stops. Respond:\n'
            '{"stops": ["<city1>", "<city2>", "<city3>"], '
            '"total_days": <int>, "estimated_profit": <int>, '
            '"reason": "<reasoning>"}'
        ),
        "check": lambda r: (
            isinstance(r.get("stops"), list)
            and len(r["stops"]) == 3
            and all(isinstance(s, str) for s in r["stops"])
            and _is_number(r.get("total_days"))
            and r["total_days"] <= 20
            and _is_number(r.get("estimated_profit"))
        ),
        "check_desc": "stops must be list of 3 strings, total_days <= 20, estimated_profit numeric",
        # Too far for 20 days. Compared case-insensitively and ignoring
        # surrounding whitespace so "pen cannoc" is not mistaken for a
        # different city.
        "strategic_check": lambda r: not any(
            isinstance(s, str) and s.strip().casefold() == "pen cannoc"
            for s in r.get("stops", [])
        ),
        "strategic_desc": "Pen Cannoc at 5 days each way is likely too far for a 20-day circuit",
    },
]
|
|
|
|
|
|
@dataclass
|
|
class ScenarioResult:
|
|
scenario_index: int
|
|
description: str
|
|
raw_response: str
|
|
parsed: dict | None
|
|
valid_json: bool
|
|
schema_valid: bool
|
|
strategically_sound: bool
|
|
latency_ms: float
|
|
error: str = ""
|
|
|
|
|
|
@dataclass
class LevelResult:
    """Aggregate outcome of running every scenario of this level."""

    # Identity of the level; defaults come from the module constants.
    level: int = LEVEL
    name: str = NAME
    # One ScenarioResult per scenario attempted. default_factory gives every
    # instance its own list instead of a shared one.
    trials: list[ScenarioResult] = field(default_factory=list)
    # Overall verdict and the score it was derived from.
    passed: bool = False
    score: float = 0.0
    # Latency summary in milliseconds; stays 0.0 until a value is assigned.
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
|
|
|
|
|
|
def _clean_response(raw: str) -> str:
    """Return ``raw`` stripped of surrounding whitespace and Markdown code fences.

    Fences are only removed when the stripped reply begins with one; any other
    reply is returned with just its outer whitespace trimmed. Within a fenced
    reply, only the fence markers themselves (plus an optional language tag
    such as ``json`` on the opening fence) are removed, so JSON that shares a
    line with a fence is kept and an indented closing fence is still stripped.

    Args:
        raw: Text of the model reply.

    Returns:
        The cleaned text, ready to be handed to ``json.loads``.
    """
    import re  # function-scope import: ``re`` is not in the module's import block

    text = raw.strip()
    if not text.startswith("```"):
        return text

    fence_marker = re.compile(
        # Fence at the start of a line, optionally followed by a language tag.
        # The tag is only consumed when whitespace or end-of-line follows it,
        # so payloads like ```true``` are not mistaken for a tag.
        r"^[ \t]*```(?:[A-Za-z][\w+-]*(?=\s|$))?[ \t]*"
        # Fence at the end of a line (closing fence on the payload's own line).
        r"|[ \t]*```[ \t]*$",
        re.MULTILINE,
    )
    return fence_marker.sub("", text).strip()
|
|
|
|
|
|
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every scenario in ``SCENARIOS`` against ``model`` and grade the replies.

    Each scenario is sent as a system message (``SYSTEM_PROMPT``) plus a user
    message (the scenario prompt) through ``client.chat``. The reply is cleaned
    with ``_clean_response``, decoded as JSON, and passed to the scenario's
    ``check`` callable; ``strategic_check`` is only consulted when ``check``
    succeeds.

    Args:
        client: Object exposing ``chat(model=..., messages=..., options=...)``
            whose return value supports ``response["message"]["content"]``.
        model: Model identifier forwarded unchanged to ``client.chat``.
        verbose: When true, print one status line per scenario.

    Returns:
        A ``LevelResult`` holding one ``ScenarioResult`` per scenario.
        ``score`` is the fraction of scenarios whose reply was valid JSON and
        passed the schema check; ``passed`` is true when at least two thirds
        of the scenarios did. Latency statistics cover only calls that
        returned a string reply.
    """
    result = LevelResult()
    latencies: list[float] = []

    for index, scenario in enumerate(SCENARIOS):
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by system clock adjustments the way time.time() can.
        started = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.2},
            )
            raw = response["message"]["content"]
            if not isinstance(raw, str):
                # A missing/None content would otherwise crash the cleaning
                # step below and abort the whole level.
                raise TypeError(
                    f"response content is {type(raw).__name__}, expected str"
                )
        except Exception as exc:  # any client failure is recorded, not fatal
            latency_ms = (time.perf_counter() - started) * 1000
            result.trials.append(
                ScenarioResult(
                    scenario_index=index,
                    description=scenario["description"],
                    raw_response="",
                    parsed=None,
                    valid_json=False,
                    schema_valid=False,
                    strategically_sound=False,
                    latency_ms=latency_ms,
                    error=str(exc),
                )
            )
            if verbose:
                print(f" Scenario {index}: ERROR — {exc}")
            continue

        latency_ms = (time.perf_counter() - started) * 1000
        latencies.append(latency_ms)

        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True
            if not isinstance(parsed, dict):
                # The checks call ``.get`` on the payload; report the real
                # problem instead of an opaque AttributeError.
                raise TypeError(
                    f"expected a JSON object, got {type(parsed).__name__}"
                )
            schema_valid = bool(scenario["check"](parsed))
            if schema_valid:
                strategically_sound = bool(scenario["strategic_check"](parsed))
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:  # a failing check marks the trial, not the run
            error = f"Validation error: {exc}"

        result.trials.append(
            ScenarioResult(
                scenario_index=index,
                description=scenario["description"],
                raw_response=raw,
                parsed=parsed,
                valid_json=valid_json,
                schema_valid=schema_valid,
                strategically_sound=strategically_sound,
                latency_ms=latency_ms,
                error=error,
            )
        )

        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f" Scenario {index} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f" Schema issue: {scenario['check_desc']}")

    total = len(SCENARIOS)
    valid_count = sum(
        1 for trial in result.trials if trial.valid_json and trial.schema_valid
    )
    result.score = valid_count / total if total else 0.0
    # Two-thirds threshold, compared in integers. The former ``score >= 0.67``
    # could never be met by 2 of 3 scenarios because 2/3 is 0.666..., which
    # silently turned the threshold into "all scenarios must pass".
    result.passed = total > 0 and 3 * valid_count >= 2 * total

    if latencies:
        ordered = sorted(latencies)
        # Upper median for p50; with this few samples p99 is the maximum.
        result.latency_p50_ms = ordered[len(ordered) // 2]
        result.latency_p99_ms = ordered[-1]

    return result
|