# Timmy-time-dashboard/timmy-benchmark/levels/level_4_trade_route.py

"""Level 4: Trade Route — Campaign Navigation.

Tests multi-step planning ability: route optimization, trade-off analysis
across time horizons, and adapting plans when conditions change.
Maps to: Bannerlord campaign map navigation, caravans, and economy.
"""

import json
import time
from dataclasses import dataclass, field
from typing import Any

# Identity of this benchmark level; LEVEL and NAME are the defaults of LevelResult.
LEVEL = 4
NAME = "Trade Route (Campaign Navigation)"
DESCRIPTION = (
    "Model must plan optimal routes and adapt to changing conditions "
    "on the campaign map."
)

# System message sent ahead of every scenario prompt; it pins the persona and
# demands a bare JSON reply so the answer can be decoded directly.
SYSTEM_PROMPT = "\n".join(
    (
        "You are a Bannerlord merchant lord planning campaign movements.",
        "Consider distance, profitability, risk, and timing.",
        "You MUST respond ONLY with valid JSON. No markdown. Raw JSON only.",
    )
)

def _is_number(value: Any) -> bool:
    """Return True when *value* is an int or float, but not a bool.

    ``bool`` is a subclass of ``int`` in Python, so a plain
    ``isinstance(value, (int, float))`` would accept a JSON ``true``/``false``
    where the prompt asks for a number.
    """
    return isinstance(value, (int, float)) and not isinstance(value, bool)


# Each scenario is a dict with:
#   description     - short label used in results and verbose output
#   prompt          - user message sent to the model
#   check           - predicate over the parsed reply: does it match the requested schema?
#   check_desc      - human-readable summary of what `check` requires
#   strategic_check - predicate over the parsed reply: is the choice a sensible one?
#   strategic_desc  - human-readable rationale for `strategic_check`
# The predicates call `.get` on the parsed reply, so they expect a dict.
SCENARIOS: list[dict[str, Any]] = [
    {
        "description": "Optimal trade route selection",
        "prompt": (
            "You are at Epicrotea with 500 gold and 20 days travel budget.\n\n"
            "Trade opportunities:\n"
            "- Route A: Epicrotea → Vlandia (3 days) → Sturgia (5 days back)\n"
            "  Sell grain in Vlandia: +300 gold. Buy furs in Sturgia: costs 200, sells for 400 in Calradia.\n"
            "  Total: +500 gold profit, 8 days.\n"
            "- Route B: Epicrotea → Calradia (2 days) → Aserai (4 days)\n"
            "  Sell iron in Calradia: +150 gold. Buy spice in Aserai: costs 300, sells for 600 in Empire.\n"
            "  Empire is 6 more days away. Total: +450 gold profit, 12 days.\n"
            "- Route C: Epicrotea → nearby village (1 day)\n"
            "  Buy cheap food: costs 100, sells for 180 in any city.\n"
            "  Total: +80 gold profit, 2 days. Repeatable.\n\n"
            'Choose route. Respond:\n'
            '{"route": "A"|"B"|"C", "expected_profit": <int>, "days_used": <int>, '
            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
        ),
        "check": lambda r: (
            r.get("route") in ["A", "B", "C"]
            and _is_number(r.get("expected_profit"))
            and _is_number(r.get("days_used"))
            and r.get("risk") in ["low", "medium", "high"]
        ),
        "check_desc": "route, expected_profit, days_used, risk must be valid",
        "strategic_check": lambda r: r.get("route") in ["A", "C"],  # A is best single trip, C is best if repeated
        "strategic_desc": "Route A has best profit/day ratio; C is best if multiple loops possible",
    },
    {
        "description": "Adapt plan when war declared",
        # NOTE(review): the prompt says "equidistant" but then lists 2/3/4-day
        # distances. The wording is left untouched so results stay comparable
        # with earlier runs; confirm whether it should be reworded.
        "prompt": (
            "You were heading to Vlandia to trade, 2 days into the journey.\n"
            "NEWS: Vlandia just declared war on your faction. Entering Vlandia territory is now dangerous.\n\n"
            "Your current position: borderlands, equidistant between:\n"
            "- Vlandia (2 days): Now at war — high risk of attack\n"
            "- Sturgia (3 days): Neutral — safe\n"
            "- Empire (4 days): Allied — very safe, good prices\n\n"
            "You have 400 gold of trade goods for the Vlandia market.\n"
            'What do you do? Respond:\n'
            '{"decision": "continue_to_vlandia"|"divert_to_sturgia"|"divert_to_empire", '
            '"reason": "<why>", "gold_at_risk": <int>}'
        ),
        "check": lambda r: (
            r.get("decision") in ["continue_to_vlandia", "divert_to_sturgia", "divert_to_empire"]
            and _is_number(r.get("gold_at_risk"))
        ),
        "check_desc": "decision must be one of three options, gold_at_risk must be a number",
        "strategic_check": lambda r: r.get("decision") in ["divert_to_sturgia", "divert_to_empire"],
        "strategic_desc": "Should avoid active war zone — divert to safe destination",
    },
    {
        "description": "Multi-stop route planning with constraints",
        "prompt": (
            "Plan a 3-stop trading circuit starting and ending at Pravend.\n"
            "Budget: 800 gold. Time limit: 20 days.\n\n"
            "Available cities and travel times from Pravend:\n"
            "- Rhotae: 2 days (leather cheap, sells well in south)\n"
            "- Ortysia: 4 days (grain surplus — buy cheap)\n"
            "- Epicrotea: 3 days (iron market — buy/sell)\n"
            "- Pen Cannoc: 5 days (wine — high profit, far)\n\n"
            "Each stop takes 1 day for trading.\n"
            'Plan 3 stops. Respond:\n'
            '{"stops": ["<city1>", "<city2>", "<city3>"], '
            '"total_days": <int>, "estimated_profit": <int>, '
            '"reason": "<reasoning>"}'
        ),
        "check": lambda r: (
            isinstance(r.get("stops"), list)
            and len(r["stops"]) == 3
            and all(isinstance(s, str) for s in r["stops"])
            # The type test must come first so the comparison below never
            # sees a missing or non-numeric value.
            and _is_number(r.get("total_days"))
            and r["total_days"] <= 20
            and _is_number(r.get("estimated_profit"))
        ),
        "check_desc": "stops must be list of 3 strings, total_days <= 20, estimated_profit numeric",
        "strategic_check": lambda r: "Pen Cannoc" not in r.get("stops", []),  # Too far for 20 days
        "strategic_desc": "Pen Cannoc at 5 days each way is likely too far for a 20-day circuit",
    },
]


@dataclass
class ScenarioResult:
    """Outcome of running a single entry of SCENARIOS against the model."""

    scenario_index: int  # position of the scenario within SCENARIOS
    description: str  # the scenario's "description" text
    raw_response: str  # model output as returned; "" when the chat call raised
    parsed: dict | None  # decoded JSON reply; None when the call or the decoding failed
    valid_json: bool  # True when the cleaned reply decoded as JSON
    schema_valid: bool  # True when the scenario's "check" predicate accepted the reply
    strategically_sound: bool  # True when "strategic_check" accepted it (only evaluated once schema_valid)
    latency_ms: float  # elapsed time of the chat call, in milliseconds
    error: str = ""  # failure text for call / decode / validation problems; "" when none


@dataclass
class LevelResult:
    """Aggregate outcome of one run of this benchmark level."""

    level: int = LEVEL  # numeric id of this level
    name: str = NAME  # display name of this level
    trials: list[ScenarioResult] = field(default_factory=list)  # one entry per scenario, in run order
    passed: bool = False  # overall verdict, derived by run() from the scenario outcomes
    score: float = 0.0  # fraction of scenarios whose reply was valid JSON and schema-valid
    latency_p50_ms: float = 0.0  # middle latency over the calls that did not raise
    latency_p99_ms: float = 0.0  # run() stores the slowest non-failing call here


def _clean_response(raw: str) -> str:
    raw = raw.strip()
    if raw.startswith("```"):
        lines = raw.splitlines()
        lines = [l for l in lines if not l.startswith("```")]
        raw = "\n".join(lines).strip()
    return raw


def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every scenario in SCENARIOS against *model* and aggregate the outcome.

    For each scenario the system prompt and the scenario prompt are sent via
    ``client.chat``. The reply is stripped of code fences, decoded as JSON, and
    passed to the scenario's ``check`` and, if that succeeds, its
    ``strategic_check``. A failure in any step is recorded on the trial rather
    than raised, so one bad reply never aborts the level.

    Args:
        client: Object exposing ``chat(model=..., messages=..., options=...)``
            whose result supports ``result["message"]["content"]``
            (presumably an Ollama-style client; confirm against the caller).
        model: Model identifier forwarded to ``client.chat``.
        verbose: When True, print one status line per scenario.

    Returns:
        A LevelResult with one ScenarioResult per scenario, the fraction of
        schema-valid replies as ``score``, the pass verdict, and latency
        figures computed over the calls that did not fail.
    """
    result = LevelResult()
    latencies: list[float] = []

    for i, scenario in enumerate(SCENARIOS):
        # perf_counter is monotonic, so a wall-clock adjustment during the
        # call cannot produce a negative or inflated latency.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.2},
            )
            raw = response["message"]["content"]
            latency_ms = (time.perf_counter() - t0) * 1000
            if not isinstance(raw, str):
                # Treat a missing / non-text body as a failed call instead of
                # letting it crash the text clean-up further down.
                raise TypeError(
                    f"response content must be a string, got {type(raw).__name__}"
                )
        except Exception as exc:
            # Deliberate catch-all: any client failure becomes a failed trial.
            latency_ms = (time.perf_counter() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                strategically_sound=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f"  Scenario {i}: ERROR — {exc}")
            continue

        # Only calls that produced a reply contribute to the latency figures.
        latencies.append(latency_ms)

        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True
            if isinstance(parsed, dict):
                schema_valid = bool(scenario["check"](parsed))
                if schema_valid:
                    strategically_sound = bool(scenario["strategic_check"](parsed))
            else:
                # The scenario predicates expect an object; say so plainly
                # rather than surfacing an AttributeError from inside them.
                error = (
                    "Validation error: expected a JSON object, "
                    f"got {type(parsed).__name__}"
                )
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            error = f"Validation error: {exc}"

        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            strategically_sound=strategically_sound,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)

        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f"  Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f"    Schema issue: {scenario['check_desc']}")

    total = len(SCENARIOS)
    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = valid_count / total if total else 0.0
    # Pass when at least two thirds of the scenarios are schema-valid. The
    # comparison is done on integers: the former `score >= 0.67` could not be
    # met by 2 of 3 scenarios (2/3 ~= 0.6667), which silently demanded 3/3.
    result.passed = total > 0 and 3 * valid_count >= 2 * total

    if latencies:
        latencies.sort()
        # With an even number of samples this picks the upper of the two
        # middle values.
        result.latency_p50_ms = latencies[len(latencies) // 2]
        # With so few samples per level the maximum stands in for p99.
        result.latency_p99_ms = latencies[-1]

    return result