"""Level 3: Battle Tactics — Formation Commands. Tests whether the model can issue coherent formation and tactical orders under simulated battlefield pressure with multiple unit types. Maps to: Bannerlord formation commands (charge, shield wall, skirmish, etc.). """ import json import time from dataclasses import dataclass, field from typing import Any LEVEL = 3 NAME = "Battle Tactics (Formation Commands)" DESCRIPTION = "Model must issue tactically sound formation orders under simulated battle conditions." SYSTEM_PROMPT = """You are a Bannerlord battle commander. Issue formation orders using these commands: - shield_wall: infantry forms defensive line (good vs ranged, slow advance) - charge: all-out attack (high casualties, breaks weak enemies fast) - skirmish: ranged units pepper enemy (good vs heavy infantry, needs distance) - advance: move forward holding formation (balanced) - flank_left / flank_right: cavalry sweeps around enemy side - fallback: retreat to regroup (when badly outnumbered) You MUST respond ONLY with valid JSON. No markdown. Raw JSON only.""" SCENARIOS = [ { "description": "Ranged vs infantry — defensive opening", "prompt": ( "Situation: You have 20 archers + 10 infantry. Enemy has 30 heavy infantry, no ranged.\n" "Enemy is 200m away and advancing.\n" "Objective: Maximize casualties before melee contact.\n\n" 'Issue orders for both units. Respond:\n' '{"infantry_order": "", "archer_order": "", ' '"reason": "", "expected_outcome": ""}' ), "check": lambda r: ( r.get("infantry_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"] and r.get("archer_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"] and isinstance(r.get("reason"), str) ), "check_desc": "Both orders must be valid commands", "strategic_check": lambda r: ( r.get("archer_order") == "skirmish" and r.get("infantry_order") in ["shield_wall", "advance"] ), "strategic_desc": "Archers should skirmish while infantry holds (shield_wall or advance)", }, { "description": "Outnumbered — retreat decision", "prompt": ( "Situation: Your party (15 troops) has been ambushed.\n" "Enemy: 60 bandits, surrounding you on 3 sides.\n" "Your troops: 40% wounded. One escape route to the east.\n\n" 'What is your command? Respond:\n' '{"order": "", "direction": "east"|"west"|"north"|"south"|null, ' '"reason": "", "priority": "preserve_troops"|"fight_through"}' ), "check": lambda r: ( r.get("order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"] and r.get("priority") in ["preserve_troops", "fight_through"] ), "check_desc": "order and priority must be valid values", "strategic_check": lambda r: ( r.get("order") == "fallback" and r.get("priority") == "preserve_troops" ), "strategic_desc": "Outnumbered 4:1 with wounded troops — fallback is the sound choice", }, { "description": "Cavalry flanking opportunity", "prompt": ( "Situation: Main battle is engaged. Your infantry and enemy infantry are locked.\n" "You have 8 cavalry in reserve. Enemy left flank is unprotected.\n" "If cavalry hits the flank now, it will route enemy in ~30 seconds.\n\n" 'Order for cavalry: Respond:\n' '{"cavalry_order": "", "timing": "now"|"wait", ' '"reason": "", "risk": "low"|"medium"|"high"}' ), "check": lambda r: ( r.get("cavalry_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"] and r.get("timing") in ["now", "wait"] and r.get("risk") in ["low", "medium", "high"] ), "check_desc": "cavalry_order, timing, and risk must be valid values", "strategic_check": lambda r: ( r.get("cavalry_order") in ["flank_left", "flank_right", "charge"] and r.get("timing") == "now" ), "strategic_desc": "Should capitalize on the flank opportunity immediately", }, ] @dataclass class ScenarioResult: scenario_index: int description: str raw_response: str parsed: dict | None valid_json: bool schema_valid: bool strategically_sound: bool latency_ms: float error: str = "" @dataclass class LevelResult: level: int = LEVEL name: str = NAME trials: list[ScenarioResult] = field(default_factory=list) passed: bool = False score: float = 0.0 latency_p50_ms: float = 0.0 latency_p99_ms: float = 0.0 def _clean_response(raw: str) -> str: raw = raw.strip() if raw.startswith("```"): lines = raw.splitlines() lines = [l for l in lines if not l.startswith("```")] raw = "\n".join(lines).strip() return raw def run(client: Any, model: str, verbose: bool = False) -> LevelResult: result = LevelResult() latencies = [] for i, scenario in enumerate(SCENARIOS): t0 = time.time() try: response = client.chat( model=model, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": scenario["prompt"]}, ], options={"temperature": 0.2}, ) raw = response["message"]["content"] latency_ms = (time.time() - t0) * 1000 except Exception as exc: latency_ms = (time.time() - t0) * 1000 sr = ScenarioResult( scenario_index=i, description=scenario["description"], raw_response="", parsed=None, valid_json=False, schema_valid=False, strategically_sound=False, latency_ms=latency_ms, error=str(exc), ) result.trials.append(sr) if verbose: print(f" Scenario {i}: ERROR — {exc}") continue latencies.append(latency_ms) cleaned = _clean_response(raw) parsed = None valid_json = False schema_valid = False strategically_sound = False error = "" try: parsed = json.loads(cleaned) valid_json = True schema_valid = scenario["check"](parsed) if schema_valid: strategically_sound = scenario["strategic_check"](parsed) except json.JSONDecodeError as exc: error = f"JSONDecodeError: {exc}" except Exception as exc: error = f"Validation error: {exc}" sr = ScenarioResult( scenario_index=i, description=scenario["description"], raw_response=raw, parsed=parsed, valid_json=valid_json, schema_valid=schema_valid, strategically_sound=strategically_sound, latency_ms=latency_ms, error=error, ) result.trials.append(sr) if verbose: status = "PASS" if (valid_json and schema_valid) else "FAIL" strat = "strategic" if strategically_sound else "suboptimal" print( f" Scenario {i} [{scenario['description']}]: {status} ({strat}) " f"| {latency_ms:.0f}ms" ) if not schema_valid and valid_json: print(f" Schema issue: {scenario['check_desc']}") valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid) result.score = valid_count / len(SCENARIOS) result.passed = result.score >= 0.67 if latencies: latencies_sorted = sorted(latencies) result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2] result.latency_p99_ms = latencies_sorted[-1] return result