1
0

[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)

Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com>
Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
This commit is contained in:
Alexander Whitestone
2026-03-23 19:38:48 +00:00
committed by rockachopa
parent 6e65b53f3a
commit 9e08e87312
12 changed files with 3068 additions and 0 deletions

View File

@@ -0,0 +1 @@
"""Cognitive benchmark levels for Project Bannerlord readiness testing."""

View File

@@ -0,0 +1,183 @@
"""Level 0: JSON Compliance — Coin Flip.
Tests whether the model can reliably return well-formed JSON responses
with a specific schema. This is the minimum bar for GABS tool calls.
"""
import json
import time
from dataclasses import dataclass, field
from typing import Any
# Identity of this benchmark level.
LEVEL = 0
NAME = "JSON Compliance (Coin Flip)"
DESCRIPTION = "Model must return valid JSON matching a strict schema on each trial."

# System message sent ahead of every trial prompt.
SYSTEM_PROMPT = (
    "You are a strategic AI agent. You MUST respond ONLY with valid JSON.\n"
    "No markdown, no explanation, no code fences. Raw JSON only."
)

# One entry per trial:
#   prompt       - user message sent to the model
#   schema       - required keys mapped to the Python type expected after json.loads
#   valid_values - allowed values for selected keys
TRIALS = [
    dict(
        prompt=(
            'A coin is flipped. Respond with exactly: {"choice": "heads"} or {"choice": "tails"}. '
            "Pick one. JSON only."
        ),
        schema={"choice": str},
        valid_values={"choice": ["heads", "tails"]},
    ),
    dict(
        prompt=(
            'You must attack or defend. Respond with: {"action": "attack", "confidence": 0.8} '
            'or {"action": "defend", "confidence": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.'
        ),
        schema={"action": str, "confidence": float},
        valid_values={"action": ["attack", "defend"]},
    ),
    dict(
        prompt=(
            'Choose a direction to march. Respond with exactly: '
            '{"direction": "north", "reason": "string explaining why"}. '
            "Pick north/south/east/west. JSON only."
        ),
        schema={"direction": str, "reason": str},
        valid_values={"direction": ["north", "south", "east", "west"]},
    ),
]
@dataclass
class TrialResult:
trial_index: int
prompt: str
raw_response: str
parsed: dict | None
valid_json: bool
schema_valid: bool
value_valid: bool
latency_ms: float
error: str = ""
@dataclass
class LevelResult:
    """Aggregate outcome of a Level 0 run."""

    # Level number and display name, defaulting to this module's constants.
    level: int = LEVEL
    name: str = NAME
    # One TrialResult per executed trial, in execution order.
    trials: list[TrialResult] = field(default_factory=list)
    # Overall verdict and the fraction of trials that passed.
    passed: bool = False
    score: float = 0.0
    # Latency summary in milliseconds; both stay 0.0 when no call succeeded.
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
def _validate_schema(parsed: dict, schema: dict[str, type]) -> bool:
for key, expected_type in schema.items():
if key not in parsed:
return False
if not isinstance(parsed[key], expected_type):
# Allow int where float is expected
if expected_type is float and isinstance(parsed[key], int):
continue
return False
return True
def _validate_values(parsed: dict, valid_values: dict[str, list]) -> bool:
for key, valid_list in valid_values.items():
if key in parsed and parsed[key] not in valid_list:
return False
return True
def _clean_response(raw: str) -> str:
"""Strip markdown fences if model wrapped JSON in them."""
raw = raw.strip()
if raw.startswith("```"):
lines = raw.splitlines()
# Remove first and last fence lines
lines = [l for l in lines if not l.startswith("```")]
raw = "\n".join(lines).strip()
return raw
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every Level 0 trial against *model* and score JSON/schema compliance.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose result is read as ``response["message"]["content"]``.
        model: Model identifier forwarded to ``client.chat``.
        verbose: When true, print a one-line summary for each trial.

    Returns:
        A ``LevelResult`` with one ``TrialResult`` per trial. ``score`` is the
        fraction of trials with valid JSON and a valid schema; the level
        passes only when every trial does.
    """
    result = LevelResult()
    latencies = []
    for i, trial in enumerate(TRIALS):
        # perf_counter() is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": trial["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            # Treat missing content as an empty reply so it is scored as
            # invalid JSON instead of crashing the cleaner.
            raw = response["message"]["content"] or ""
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # A failed call is recorded as a failed trial; later trials still run.
            latency_ms = (time.perf_counter() - t0) * 1000
            tr = TrialResult(
                trial_index=i,
                prompt=trial["prompt"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                value_valid=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(tr)
            if verbose:
                print(f" Trial {i}: ERROR — {exc}")
            continue
        # Only successful calls contribute to the latency summary.
        latencies.append(latency_ms)
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        value_valid = False
        error = ""
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        else:
            valid_json = True
            if isinstance(parsed, dict):
                schema_valid = _validate_schema(parsed, trial["schema"])
                value_valid = _validate_values(parsed, trial["valid_values"])
            else:
                # Well-formed JSON that is not an object (array, number, ...)
                # cannot satisfy a key-based schema; record why and keep
                # `parsed` within its declared dict-or-None type.
                error = f"Expected a JSON object, got {type(parsed).__name__}"
                parsed = None
        tr = TrialResult(
            trial_index=i,
            prompt=trial["prompt"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            value_valid=value_valid,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(tr)
        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            print(
                f" Trial {i}: {status} | json={valid_json} schema={schema_valid} "
                f"value={value_valid} | {latency_ms:.0f}ms | {raw[:80]!r}"
            )
    # NOTE(review): value_valid is recorded and printed but does not affect the
    # score - confirm that is intended.
    passed_trials = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = passed_trials / len(TRIALS)
    result.passed = result.score >= 1.0  # Must pass all 3 trials
    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # With this few samples the "p99" is simply the slowest call.
        result.latency_p99_ms = latencies_sorted[-1]
    return result

View File

@@ -0,0 +1,211 @@
"""Level 1: Board State Tracking — Tic-Tac-Toe.
Tests whether the model can maintain game state across turns, select
legal moves, and exhibit basic strategic awareness.
Maps to: Bannerlord board state / campaign map tracking.
"""
import json
import time
from dataclasses import dataclass, field
from typing import Any
# Identity of this benchmark level.
LEVEL = 1
NAME = "Board State Tracking (Tic-Tac-Toe)"
DESCRIPTION = "Model must track a tic-tac-toe board and make legal, strategic moves."

# System message sent ahead of every scenario prompt.
SYSTEM_PROMPT = """You are a strategic AI playing tic-tac-toe. The board is a 3x3 grid.
Positions are numbered 0-8 left-to-right, top-to-bottom:
0|1|2
3|4|5
6|7|8
You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only.
Format: {"move": <position 0-8>, "reason": "<brief reason>"}"""

# Reply-format sentence that closes every scenario prompt.
_REPLY_FORMAT = 'Respond: {"move": <0-8>, "reason": "<why>"}'

# One entry per scenario:
#   board      - nine cells, None for empty, "X"/"O" for taken
#   check      - callable(move, board) telling whether the move is the wanted one
#   check_desc - human-readable statement of what `check` expects
SCENARIOS = [
    {
        "description": "Empty board — opening move",
        "board": [None] * 9,
        "player": "X",
        "prompt": (
            'Board state: [null,null,null,null,null,null,null,null,null]. '
            'You are X. It is your turn. Choose a move. '
            + _REPLY_FORMAT
        ),
        "check": lambda move, board: move in range(9) and board[move] is None,
        "check_desc": "Move must be a valid empty position (0-8)",
    },
    {
        "description": "Block opponent's winning move",
        "board": ["O", None, "O", None, "X", None, None, None, None],
        "player": "X",
        "prompt": (
            'Board: ["O",null,"O",null,"X",null,null,null,null]. '
            "O has positions 0 and 2. You are X. "
            "O will win on next turn unless you block. "
            + _REPLY_FORMAT
        ),
        # Cell 1 completes O's top row, so it is the only blocking move.
        "check": lambda move, board: move == 1,
        "check_desc": "Must block O's win at position 1",
    },
    {
        "description": "Take winning move",
        "board": ["X", None, "X", None, "O", None, None, "O", None],
        "player": "X",
        "prompt": (
            'Board: ["X",null,"X",null,"O",null,null,"O",null]. '
            "You are X. You have positions 0 and 2. "
            "You can win this turn. "
            + _REPLY_FORMAT
        ),
        # Cell 1 completes X's top row.
        "check": lambda move, board: move == 1,
        "check_desc": "Must take winning move at position 1",
    },
    {
        "description": "Legal move on partially filled board",
        "board": ["X", "O", "X", "O", "X", "O", None, None, None],
        "player": "O",
        "prompt": (
            'Board: ["X","O","X","O","X","O",null,null,null]. '
            "You are O. Choose a legal move (positions 6, 7, or 8 are available). "
            + _REPLY_FORMAT
        ),
        "check": lambda move, board: move in [6, 7, 8],
        "check_desc": "Move must be one of the empty positions: 6, 7, or 8",
    },
]
@dataclass
class ScenarioResult:
scenario_index: int
description: str
prompt: str
raw_response: str
parsed: dict | None
valid_json: bool
move_legal: bool
move_correct: bool
latency_ms: float
error: str = ""
@dataclass
class LevelResult:
    """Aggregate outcome of a Level 1 run."""

    # Level number and display name, defaulting to this module's constants.
    level: int = LEVEL
    name: str = NAME
    # One ScenarioResult per executed scenario, in execution order.
    trials: list[ScenarioResult] = field(default_factory=list)
    # Overall verdict and the fraction of scenarios that passed.
    passed: bool = False
    score: float = 0.0
    # Latency summary in milliseconds; both stay 0.0 when no call succeeded.
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
def _clean_response(raw: str) -> str:
raw = raw.strip()
if raw.startswith("```"):
lines = raw.splitlines()
lines = [l for l in lines if not l.startswith("```")]
raw = "\n".join(lines).strip()
return raw
def _coerce_move(value: Any) -> int | None:
    """Return *value* as an integer board index, or None when it is not one."""
    # bool is an int subclass; JSON true/false is not a move.
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, str):
        # Some models emit "4" instead of 4. Accept one optional leading minus
        # followed by ASCII digits only: "--4" or "²" pass a bare isdigit()
        # test but make int() raise ValueError.
        text = value.strip()
        digits = text[1:] if text.startswith("-") else text
        if digits.isascii() and digits.isdigit():
            return int(text)
    return None


def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every tic-tac-toe scenario against *model* and score move legality.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose result is read as ``response["message"]["content"]``.
        model: Model identifier forwarded to ``client.chat``.
        verbose: When true, print a one-line summary for each scenario.

    Returns:
        A ``LevelResult`` with one ``ScenarioResult`` per scenario. ``score``
        is the fraction of scenarios answered with valid JSON and a legal
        move; the level passes only when every scenario does.
    """
    result = LevelResult()
    latencies = []
    for i, scenario in enumerate(SCENARIOS):
        # perf_counter() is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            # Treat missing content as an empty reply so it is scored as
            # invalid JSON instead of crashing the cleaner.
            raw = response["message"]["content"] or ""
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # A failed call is recorded as a failed scenario; later ones still run.
            latency_ms = (time.perf_counter() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                prompt=scenario["prompt"],
                raw_response="",
                parsed=None,
                valid_json=False,
                move_legal=False,
                move_correct=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue
        # Only successful calls contribute to the latency summary.
        latencies.append(latency_ms)
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        move_legal = False
        move_correct = False
        error = ""
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        else:
            valid_json = True
            if not isinstance(parsed, dict):
                # Well-formed JSON that is not an object has no "move" key;
                # record why and keep `parsed` within its declared type.
                error = f"Expected a JSON object, got {type(parsed).__name__}"
                parsed = None
            elif "move" in parsed:
                move = _coerce_move(parsed["move"])
                if move is not None:
                    board = scenario["board"]
                    # The range test runs first so board[move] is never
                    # indexed out of bounds.
                    move_legal = 0 <= move <= 8 and board[move] is None
                    move_correct = scenario["check"](move, board)
        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            prompt=scenario["prompt"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            move_legal=move_legal,
            move_correct=move_correct,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)
        if verbose:
            status = "PASS" if (valid_json and move_legal) else "FAIL"
            correct_str = "CORRECT" if move_correct else "suboptimal"
            move_val = parsed.get("move", "?") if parsed else "?"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({correct_str}) "
                f"| move={move_val} | {latency_ms:.0f}ms"
            )
            if not move_correct and valid_json:
                print(f" Expected: {scenario['check_desc']}")
    # Pass criteria: all moves must be valid JSON + legal
    legal_moves = sum(1 for t in result.trials if t.valid_json and t.move_legal)
    result.score = legal_moves / len(SCENARIOS)
    result.passed = result.score >= 1.0
    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # With this few samples the "p99" is simply the slowest call.
        result.latency_p99_ms = latencies_sorted[-1]
    return result

View File

@@ -0,0 +1,213 @@
"""Level 2: Resource Management — Party Economy.
Tests whether the model can allocate limited resources across competing
priorities and adapt when constraints change.
Maps to: Bannerlord party economy (troops, food, gold, morale).
"""
import json
import time
from dataclasses import dataclass, field
from typing import Any
# Identity of this benchmark level.
LEVEL = 2
NAME = "Resource Management (Party Economy)"
DESCRIPTION = "Model must allocate limited resources across troops, food, and equipment."

# System message sent ahead of every scenario prompt.
SYSTEM_PROMPT = """You are a Bannerlord campaign advisor managing a party.
Resources are limited — every decision has trade-offs.
You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only."""

# Accepted answers and numeric types shared by the checks below.
_BUDGET_OPTIONS = ("recruit_infantry", "buy_food", "repair_armor")
_TROOP_ACTIONS = ("upgrade_recruits", "save_gold", "dismiss_recruits")
_NUMERIC = (int, float)

# One entry per scenario:
#   check           - callable(reply) validating the reply's shape
#   strategic_check - callable(reply) judging whether the choice is sensible
#   *_desc          - human-readable statement of the matching check
SCENARIOS = [
    {
        "description": "Budget allocation under constraint",
        "prompt": (
            "You have 500 gold. Options:\n"
            "- Recruit 10 infantry: costs 300 gold, +10 combat strength\n"
            "- Buy food for 20 days: costs 200 gold, keeps morale stable\n"
            "- Repair armor: costs 150 gold, -20% casualty rate\n\n"
            "You cannot afford all three. Morale is currently CRITICAL (troops may desert).\n"
            'Choose 1-2 options. Respond: {"choices": ["option_a", ...], "gold_spent": <int>, "reason": "<why>"}\n'
            "Where option keys are: recruit_infantry, buy_food, repair_armor"
        ),
        "check": lambda r: (
            isinstance(r.get("choices"), list)
            and len(r["choices"]) >= 1
            and all(c in _BUDGET_OPTIONS for c in r["choices"])
            and isinstance(r.get("gold_spent"), _NUMERIC)
            and r.get("gold_spent", 9999) <= 500
        ),
        "check_desc": "choices must be valid options, gold_spent <= 500",
        "strategic_check": lambda r: "buy_food" in r.get("choices", []),
        "strategic_desc": "With CRITICAL morale, food should be prioritized",
    },
    {
        "description": "Troop tier upgrade decision",
        "prompt": (
            "Party status:\n"
            "- 15 Tier-1 recruits (weak, 30 upkeep/day)\n"
            "- 5 Tier-3 veterans (strong, 90 upkeep/day)\n"
            "- Daily income: 200 gold\n"
            "- Upcoming: raider camp attack (moderate difficulty)\n\n"
            "Options:\n"
            "- Upgrade 5 recruits to Tier-2 (costs 250 gold total)\n"
            "- Keep all current troops, save gold for emergencies\n"
            "- Dismiss 5 recruits to save upkeep\n\n"
            'Respond: {"action": "upgrade_recruits"|"save_gold"|"dismiss_recruits", '
            '"reason": "<why>", "expected_outcome": "<string>"}'
        ),
        "check": lambda r: (
            r.get("action") in _TROOP_ACTIONS
            and isinstance(r.get("reason"), str)
            and len(r.get("reason", "")) > 0
        ),
        "check_desc": "action must be one of the three options with a non-empty reason",
        "strategic_check": lambda r: r.get("action") in ["upgrade_recruits", "save_gold"],
        "strategic_desc": "Dismissing troops before a fight is suboptimal",
    },
    {
        "description": "Multi-turn planning horizon",
        "prompt": (
            "Current: 300 gold, 10 days of food, 20 troops\n"
            "Day 5: Must cross desert (costs 5 extra food days)\n"
            "Day 10: Reach town (can buy supplies)\n\n"
            "You need a 15-day food reserve to survive the journey.\n"
            "Food costs 10 gold/day. You have enough for 10 days now.\n\n"
            "How many extra food days do you buy today?\n"
            'Respond: {"extra_food_days": <int>, "cost": <int>, "remaining_gold": <int>, "reason": "<why>"}'
        ),
        "check": lambda r: (
            isinstance(r.get("extra_food_days"), _NUMERIC)
            and isinstance(r.get("cost"), _NUMERIC)
            and isinstance(r.get("remaining_gold"), _NUMERIC)
        ),
        "check_desc": "Must include extra_food_days, cost, remaining_gold as numbers",
        "strategic_check": lambda r: r.get("extra_food_days", 0) >= 5,
        "strategic_desc": "Need at least 5 more days of food for desert crossing",
    },
]
@dataclass
class ScenarioResult:
scenario_index: int
description: str
raw_response: str
parsed: dict | None
valid_json: bool
schema_valid: bool
strategically_sound: bool
latency_ms: float
error: str = ""
@dataclass
class LevelResult:
    """Aggregate outcome of a Level 2 run."""

    # Level number and display name, defaulting to this module's constants.
    level: int = LEVEL
    name: str = NAME
    # One ScenarioResult per executed scenario, in execution order.
    trials: list[ScenarioResult] = field(default_factory=list)
    # Overall verdict and the fraction of scenarios that passed.
    passed: bool = False
    score: float = 0.0
    # Latency summary in milliseconds; both stay 0.0 when no call succeeded.
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
def _clean_response(raw: str) -> str:
raw = raw.strip()
if raw.startswith("```"):
lines = raw.splitlines()
lines = [l for l in lines if not l.startswith("```")]
raw = "\n".join(lines).strip()
return raw
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every resource-management scenario against *model* and score the replies.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose result is read as ``response["message"]["content"]``.
        model: Model identifier forwarded to ``client.chat``.
        verbose: When true, print a summary (and any schema/strategy notes)
            for each scenario.

    Returns:
        A ``LevelResult`` with one ``ScenarioResult`` per scenario. ``score``
        is the fraction of scenarios with valid JSON that passed the
        scenario's ``check``; the level passes at two thirds or more.
    """
    result = LevelResult()
    latencies = []
    for i, scenario in enumerate(SCENARIOS):
        # perf_counter() is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            # Treat missing content as an empty reply so it is scored as
            # invalid JSON instead of crashing the cleaner.
            raw = response["message"]["content"] or ""
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # A failed call is recorded as a failed scenario; later ones still run.
            latency_ms = (time.perf_counter() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                strategically_sound=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue
        # Only successful calls contribute to the latency summary.
        latencies.append(latency_ms)
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""
        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = scenario["check"](parsed)
            # Strategy is only judged on replies that have the right shape.
            if schema_valid:
                strategically_sound = scenario["strategic_check"](parsed)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            # The checks run on arbitrary model output (wrong container type,
            # incomparable values); any failure counts as an invalid reply.
            error = f"Validation error: {exc}"
        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            strategically_sound=strategically_sound,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)
        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f" Schema issue: {scenario['check_desc']}")
            if not strategically_sound and schema_valid:
                print(f" Strategy note: {scenario['strategic_desc']}")
    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = valid_count / len(SCENARIOS)
    # Pass at 2/3 of the scenarios. Compared in integers: 2/3 as a float is
    # 0.666..., which the former `score >= 0.67` cutoff rejected.
    result.passed = 3 * valid_count >= 2 * len(SCENARIOS)
    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # With this few samples the "p99" is simply the slowest call.
        result.latency_p99_ms = latencies_sorted[-1]
    return result

View File

@@ -0,0 +1,216 @@
"""Level 3: Battle Tactics — Formation Commands.
Tests whether the model can issue coherent formation and tactical orders
under simulated battlefield pressure with multiple unit types.
Maps to: Bannerlord formation commands (charge, shield wall, skirmish, etc.).
"""
import json
import time
from dataclasses import dataclass, field
from typing import Any
# Identity of this benchmark level.
LEVEL = 3
NAME = "Battle Tactics (Formation Commands)"
DESCRIPTION = "Model must issue tactically sound formation orders under simulated battle conditions."

# System message sent ahead of every scenario prompt; it lists the commands
# that the checks below accept.
SYSTEM_PROMPT = """You are a Bannerlord battle commander. Issue formation orders using these commands:
- shield_wall: infantry forms defensive line (good vs ranged, slow advance)
- charge: all-out attack (high casualties, breaks weak enemies fast)
- skirmish: ranged units pepper enemy (good vs heavy infantry, needs distance)
- advance: move forward holding formation (balanced)
- flank_left / flank_right: cavalry sweeps around enemy side
- fallback: retreat to regroup (when badly outnumbered)
You MUST respond ONLY with valid JSON. No markdown. Raw JSON only."""

# Every order name a reply may use.
_COMMANDS = (
    "shield_wall",
    "charge",
    "skirmish",
    "advance",
    "flank_left",
    "flank_right",
    "fallback",
)

# One entry per scenario:
#   check           - callable(reply) validating the reply's shape
#   strategic_check - callable(reply) judging whether the orders are sound
#   *_desc          - human-readable statement of the matching check
SCENARIOS = [
    {
        "description": "Ranged vs infantry — defensive opening",
        "prompt": (
            "Situation: You have 20 archers + 10 infantry. Enemy has 30 heavy infantry, no ranged.\n"
            "Enemy is 200m away and advancing.\n"
            "Objective: Maximize casualties before melee contact.\n\n"
            'Issue orders for both units. Respond:\n'
            '{"infantry_order": "<command>", "archer_order": "<command>", '
            '"reason": "<tactical reasoning>", "expected_outcome": "<string>"}'
        ),
        "check": lambda r: (
            r.get("infantry_order") in _COMMANDS
            and r.get("archer_order") in _COMMANDS
            and isinstance(r.get("reason"), str)
        ),
        "check_desc": "Both orders must be valid commands",
        "strategic_check": lambda r: (
            r.get("archer_order") == "skirmish"
            and r.get("infantry_order") in ["shield_wall", "advance"]
        ),
        "strategic_desc": "Archers should skirmish while infantry holds (shield_wall or advance)",
    },
    {
        "description": "Outnumbered — retreat decision",
        "prompt": (
            "Situation: Your party (15 troops) has been ambushed.\n"
            "Enemy: 60 bandits, surrounding you on 3 sides.\n"
            "Your troops: 40% wounded. One escape route to the east.\n\n"
            'What is your command? Respond:\n'
            '{"order": "<command>", "direction": "east"|"west"|"north"|"south"|null, '
            '"reason": "<tactical reasoning>", "priority": "preserve_troops"|"fight_through"}'
        ),
        "check": lambda r: (
            r.get("order") in _COMMANDS
            and r.get("priority") in ["preserve_troops", "fight_through"]
        ),
        "check_desc": "order and priority must be valid values",
        "strategic_check": lambda r: (
            r.get("order") == "fallback"
            and r.get("priority") == "preserve_troops"
        ),
        "strategic_desc": "Outnumbered 4:1 with wounded troops — fallback is the sound choice",
    },
    {
        "description": "Cavalry flanking opportunity",
        "prompt": (
            "Situation: Main battle is engaged. Your infantry and enemy infantry are locked.\n"
            "You have 8 cavalry in reserve. Enemy left flank is unprotected.\n"
            "If cavalry hits the flank now, it will route enemy in ~30 seconds.\n\n"
            'Order for cavalry: Respond:\n'
            '{"cavalry_order": "<command>", "timing": "now"|"wait", '
            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
        ),
        "check": lambda r: (
            r.get("cavalry_order") in _COMMANDS
            and r.get("timing") in ["now", "wait"]
            and r.get("risk") in ["low", "medium", "high"]
        ),
        "check_desc": "cavalry_order, timing, and risk must be valid values",
        "strategic_check": lambda r: (
            r.get("cavalry_order") in ["flank_left", "flank_right", "charge"]
            and r.get("timing") == "now"
        ),
        "strategic_desc": "Should capitalize on the flank opportunity immediately",
    },
]
@dataclass
class ScenarioResult:
scenario_index: int
description: str
raw_response: str
parsed: dict | None
valid_json: bool
schema_valid: bool
strategically_sound: bool
latency_ms: float
error: str = ""
@dataclass
class LevelResult:
    """Aggregate outcome of a Level 3 run."""

    # Level number and display name, defaulting to this module's constants.
    level: int = LEVEL
    name: str = NAME
    # One ScenarioResult per executed scenario, in execution order.
    trials: list[ScenarioResult] = field(default_factory=list)
    # Overall verdict and the fraction of scenarios that passed.
    passed: bool = False
    score: float = 0.0
    # Latency summary in milliseconds; both stay 0.0 when no call succeeded.
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
def _clean_response(raw: str) -> str:
raw = raw.strip()
if raw.startswith("```"):
lines = raw.splitlines()
lines = [l for l in lines if not l.startswith("```")]
raw = "\n".join(lines).strip()
return raw
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every battle-tactics scenario against *model* and score the replies.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose result is read as ``response["message"]["content"]``.
        model: Model identifier forwarded to ``client.chat``.
        verbose: When true, print a summary (and any schema/strategy notes)
            for each scenario.

    Returns:
        A ``LevelResult`` with one ``ScenarioResult`` per scenario. ``score``
        is the fraction of scenarios with valid JSON that passed the
        scenario's ``check``; the level passes at two thirds or more.
    """
    result = LevelResult()
    latencies = []
    for i, scenario in enumerate(SCENARIOS):
        # perf_counter() is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.2},
            )
            # Treat missing content as an empty reply so it is scored as
            # invalid JSON instead of crashing the cleaner.
            raw = response["message"]["content"] or ""
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # A failed call is recorded as a failed scenario; later ones still run.
            latency_ms = (time.perf_counter() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                strategically_sound=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue
        # Only successful calls contribute to the latency summary.
        latencies.append(latency_ms)
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""
        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = scenario["check"](parsed)
            # Strategy is only judged on replies that have the right shape.
            if schema_valid:
                strategically_sound = scenario["strategic_check"](parsed)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            # The checks run on arbitrary model output (wrong container type,
            # incomparable values); any failure counts as an invalid reply.
            error = f"Validation error: {exc}"
        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            strategically_sound=strategically_sound,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)
        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f" Schema issue: {scenario['check_desc']}")
            # Surface the scenario's strategic_desc, which was otherwise
            # defined but never shown.
            if not strategically_sound and schema_valid:
                print(f" Strategy note: {scenario['strategic_desc']}")
    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = valid_count / len(SCENARIOS)
    # Pass at 2/3 of the scenarios. Compared in integers: 2/3 as a float is
    # 0.666..., which the former `score >= 0.67` cutoff rejected.
    result.passed = 3 * valid_count >= 2 * len(SCENARIOS)
    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # With this few samples the "p99" is simply the slowest call.
        result.latency_p99_ms = latencies_sorted[-1]
    return result

View File

@@ -0,0 +1,223 @@
"""Level 4: Trade Route — Campaign Navigation.
Tests multi-step planning ability: route optimization, trade-off analysis
across time horizons, and adapting plans when conditions change.
Maps to: Bannerlord campaign map navigation, caravans, and economy.
"""
import json
import time
from dataclasses import dataclass, field
from typing import Any
# Identity of this benchmark level.
LEVEL = 4
NAME = "Trade Route (Campaign Navigation)"
DESCRIPTION = "Model must plan optimal routes and adapt to changing conditions on the campaign map."

# System message sent ahead of every scenario prompt.
SYSTEM_PROMPT = """You are a Bannerlord merchant lord planning campaign movements.
Consider distance, profitability, risk, and timing.
You MUST respond ONLY with valid JSON. No markdown. Raw JSON only."""

# Numeric types and answer sets shared by the checks below.
_NUMERIC = (int, float)
_RISK_LEVELS = ("low", "medium", "high")
_WAR_DECISIONS = ("continue_to_vlandia", "divert_to_sturgia", "divert_to_empire")

# One entry per scenario:
#   check           - callable(reply) validating the reply's shape
#   strategic_check - callable(reply) judging whether the plan is sound
#   *_desc          - human-readable statement of the matching check
SCENARIOS = [
    {
        "description": "Optimal trade route selection",
        "prompt": (
            "You are at Epicrotea with 500 gold and 20 days travel budget.\n\n"
            "Trade opportunities:\n"
            "- Route A: Epicrotea → Vlandia (3 days) → Sturgia (5 days back)\n"
            " Sell grain in Vlandia: +300 gold. Buy furs in Sturgia: costs 200, sells for 400 in Calradia.\n"
            " Total: +500 gold profit, 8 days.\n"
            "- Route B: Epicrotea → Calradia (2 days) → Aserai (4 days)\n"
            " Sell iron in Calradia: +150 gold. Buy spice in Aserai: costs 300, sells for 600 in Empire.\n"
            " Empire is 6 more days away. Total: +450 gold profit, 12 days.\n"
            "- Route C: Epicrotea → nearby village (1 day)\n"
            " Buy cheap food: costs 100, sells for 180 in any city.\n"
            " Total: +80 gold profit, 2 days. Repeatable.\n\n"
            'Choose route. Respond:\n'
            '{"route": "A"|"B"|"C", "expected_profit": <int>, "days_used": <int>, '
            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
        ),
        "check": lambda r: (
            r.get("route") in ["A", "B", "C"]
            and isinstance(r.get("expected_profit"), _NUMERIC)
            and isinstance(r.get("days_used"), _NUMERIC)
            and r.get("risk") in _RISK_LEVELS
        ),
        "check_desc": "route, expected_profit, days_used, risk must be valid",
        # A is best single trip, C is best if repeated
        "strategic_check": lambda r: r.get("route") in ["A", "C"],
        "strategic_desc": "Route A has best profit/day ratio; C is best if multiple loops possible",
    },
    {
        "description": "Adapt plan when war declared",
        "prompt": (
            "You were heading to Vlandia to trade, 2 days into the journey.\n"
            "NEWS: Vlandia just declared war on your faction. Entering Vlandia territory is now dangerous.\n\n"
            "Your current position: borderlands, equidistant between:\n"
            "- Vlandia (2 days): Now at war — high risk of attack\n"
            "- Sturgia (3 days): Neutral — safe\n"
            "- Empire (4 days): Allied — very safe, good prices\n\n"
            "You have 400 gold of trade goods for the Vlandia market.\n"
            'What do you do? Respond:\n'
            '{"decision": "continue_to_vlandia"|"divert_to_sturgia"|"divert_to_empire", '
            '"reason": "<why>", "gold_at_risk": <int>}'
        ),
        "check": lambda r: (
            r.get("decision") in _WAR_DECISIONS
            and isinstance(r.get("gold_at_risk"), _NUMERIC)
        ),
        "check_desc": "decision must be one of three options, gold_at_risk must be a number",
        "strategic_check": lambda r: r.get("decision") in ["divert_to_sturgia", "divert_to_empire"],
        "strategic_desc": "Should avoid active war zone — divert to safe destination",
    },
    {
        "description": "Multi-stop route planning with constraints",
        "prompt": (
            "Plan a 3-stop trading circuit starting and ending at Pravend.\n"
            "Budget: 800 gold. Time limit: 20 days.\n\n"
            "Available cities and travel times from Pravend:\n"
            "- Rhotae: 2 days (leather cheap, sells well in south)\n"
            "- Ortysia: 4 days (grain surplus — buy cheap)\n"
            "- Epicrotea: 3 days (iron market — buy/sell)\n"
            "- Pen Cannoc: 5 days (wine — high profit, far)\n\n"
            "Each stop takes 1 day for trading.\n"
            'Plan 3 stops. Respond:\n'
            '{"stops": ["<city1>", "<city2>", "<city3>"], '
            '"total_days": <int>, "estimated_profit": <int>, '
            '"reason": "<reasoning>"}'
        ),
        "check": lambda r: (
            isinstance(r.get("stops"), list)
            and len(r["stops"]) == 3
            and all(isinstance(s, str) for s in r["stops"])
            and isinstance(r.get("total_days"), _NUMERIC)
            and r.get("total_days", 99) <= 20
            and isinstance(r.get("estimated_profit"), _NUMERIC)
        ),
        "check_desc": "stops must be list of 3 strings, total_days <= 20, estimated_profit numeric",
        # Too far for 20 days
        "strategic_check": lambda r: "Pen Cannoc" not in r.get("stops", []),
        "strategic_desc": "Pen Cannoc at 5 days each way is likely too far for a 20-day circuit",
    },
]
@dataclass
class ScenarioResult:
scenario_index: int
description: str
raw_response: str
parsed: dict | None
valid_json: bool
schema_valid: bool
strategically_sound: bool
latency_ms: float
error: str = ""
@dataclass
class LevelResult:
    """Aggregate outcome of a Level 4 run."""

    # Level number and display name, defaulting to this module's constants.
    level: int = LEVEL
    name: str = NAME
    # One ScenarioResult per executed scenario, in execution order.
    trials: list[ScenarioResult] = field(default_factory=list)
    # Overall verdict and the fraction of scenarios that passed.
    passed: bool = False
    score: float = 0.0
    # Latency summary in milliseconds; both stay 0.0 when no call succeeded.
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
def _clean_response(raw: str) -> str:
raw = raw.strip()
if raw.startswith("```"):
lines = raw.splitlines()
lines = [l for l in lines if not l.startswith("```")]
raw = "\n".join(lines).strip()
return raw
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every trade-route scenario against *model* and score the replies.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose result is read as ``response["message"]["content"]``.
        model: Model identifier forwarded to ``client.chat``.
        verbose: When true, print a summary (and any schema/strategy notes)
            for each scenario.

    Returns:
        A ``LevelResult`` with one ``ScenarioResult`` per scenario. ``score``
        is the fraction of scenarios with valid JSON that passed the
        scenario's ``check``; the level passes at two thirds or more.
    """
    result = LevelResult()
    latencies = []
    for i, scenario in enumerate(SCENARIOS):
        # perf_counter() is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.2},
            )
            # Treat missing content as an empty reply so it is scored as
            # invalid JSON instead of crashing the cleaner.
            raw = response["message"]["content"] or ""
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # A failed call is recorded as a failed scenario; later ones still run.
            latency_ms = (time.perf_counter() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                strategically_sound=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue
        # Only successful calls contribute to the latency summary.
        latencies.append(latency_ms)
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""
        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = scenario["check"](parsed)
            # Strategy is only judged on replies that have the right shape.
            if schema_valid:
                strategically_sound = scenario["strategic_check"](parsed)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            # The checks run on arbitrary model output (wrong container type,
            # incomparable values); any failure counts as an invalid reply.
            error = f"Validation error: {exc}"
        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            strategically_sound=strategically_sound,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)
        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f" Schema issue: {scenario['check_desc']}")
            # Surface the scenario's strategic_desc, which was otherwise
            # defined but never shown.
            if not strategically_sound and schema_valid:
                print(f" Strategy note: {scenario['strategic_desc']}")
    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = valid_count / len(SCENARIOS)
    # Pass at 2/3 of the scenarios. Compared in integers: 2/3 as a float is
    # 0.666..., which the former `score >= 0.67` cutoff rejected.
    result.passed = 3 * valid_count >= 2 * len(SCENARIOS)
    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # With this few samples the "p99" is simply the slowest call.
        result.latency_p99_ms = latencies_sorted[-1]
    return result

View File

@@ -0,0 +1,252 @@
"""Level 5: Mini Campaign — Full Campaign Loop.
Tests multi-turn strategic coherence: the model must maintain state across
several turns of a simulated Bannerlord campaign, making consistent decisions
that build toward a long-term goal.
Maps to: Full Bannerlord campaign loop — economy, diplomacy, conquest.
"""
import json
import time
from dataclasses import dataclass, field
from typing import Any
# Identity of this benchmark level.
LEVEL = 5
NAME = "Mini Campaign (Full Campaign Loop)"
DESCRIPTION = "Multi-turn strategic planning maintaining coherent goals across 4 turns."
# System message that opens the campaign conversation. It states the starting
# position, the win conditions, and the action names a reply may choose from.
SYSTEM_PROMPT = """You are Timmy, a Bannerlord lord with ambitions to become King of Calradia.
You have 4 turns to establish a power base. Each turn represents 2 weeks of in-game time.
Your starting position:
- Clan tier: 1 (minor lord)
- Gold: 1000
- Troops: 25 (mixed infantry/cavalry)
- Renown: 150
- Relations: Neutral with all factions
Winning requires: Gold > 3000 AND Renown > 400 AND Own 1+ settlement by Turn 4.
Each turn, choose ONE primary action:
- "raid_village": +200 gold, -50 relations target faction, +30 renown, risk of retaliation
- "trade_circuit": +300 gold, 0 relation change, +10 renown, no risk
- "escort_caravan": +150 gold, +20 relations with faction, +20 renown
- "tournament": costs 100 gold, +60 renown, +20 relations with host faction
- "recruit_troops": costs 200 gold, +15 troops, no other change
- "siege_castle": costs 500 gold + 200 troops morale, -100 relations, +80 renown, +1 settlement if succeed (30% base chance)
- "pledge_vassalage": 0 cost, +100 relations with liege, +50 renown, lose independence
You MUST respond ONLY with valid JSON for each turn. Raw JSON only."""
def run(client: Any, model: str, verbose: bool = False) -> "LevelResult":
    """Run a 4-turn mini campaign, tracking state and decision quality.

    Each turn sends the current game state to the model, parses the reply as
    JSON, validates the chosen action, and applies it to the simulated state.
    The whole conversation is resent every turn so the model sees its history.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose return value is indexable as ``response["message"]["content"]``.
        model: Model name forwarded to ``client.chat``.
        verbose: When true, print a per-turn status line and a final summary.

    Returns:
        A ``LevelResult`` holding the per-turn records, the final state, the
        win-condition flags, the score (valid turns / 4) and latency figures.

    An exception raised by the chat call is recorded on that turn's
    ``TurnResult`` and ends the campaign early; it is not re-raised.
    """
    # Local import: the state contains a nested "relations" dict, so snapshots
    # must be deep copies or later turns would rewrite earlier records.
    import copy

    result = LevelResult()
    # Initial game state
    state = {
        "turn": 1,
        "gold": 1000,
        "troops": 25,
        "renown": 150,
        "settlements": 0,
        "relations": {"vlandia": 0, "sturgia": 0, "empire": 0, "aserai": 0, "battania": 0},
    }
    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
    turns_passed = []
    total_latency = []
    valid_actions = [
        "raid_village", "trade_circuit", "escort_caravan", "tournament",
        "recruit_troops", "siege_castle", "pledge_vassalage",
    ]
    for turn_num in range(1, 5):
        state["turn"] = turn_num
        # Independent snapshot of the state the model is shown this turn.
        state_before = copy.deepcopy(state)
        state_str = json.dumps(state, indent=2)
        prompt = (
            f"=== TURN {turn_num} / 4 ===\n"
            f"Current state:\n{state_str}\n\n"
            f"Win conditions remaining: Gold > 3000 ({state['gold']}/3000), "
            f"Renown > 400 ({state['renown']}/400), Settlements >= 1 ({state['settlements']}/1)\n\n"
            f"Choose your action for Turn {turn_num}.\n"
            f'Respond: {{"action": "<action>", "target_faction": "<faction or null>", '
            f'"reason": "<strategic reasoning>", "goal": "<what this advances>"}}'
        )
        conversation.append({"role": "user", "content": prompt})
        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=conversation,
                options={"temperature": 0.3},
            )
            raw = response["message"]["content"]
            latency_ms = (time.time() - t0) * 1000
        except Exception as exc:
            # Benchmark boundary: record the failure and stop, since later
            # turns depend on this turn's reply being in the conversation.
            latency_ms = (time.time() - t0) * 1000
            tr = TurnResult(
                turn=turn_num,
                state_before=state_before,
                raw_response="",
                parsed=None,
                valid_json=False,
                valid_action=False,
                action=None,
                latency_ms=latency_ms,
                error=str(exc),
            )
            turns_passed.append(tr)
            if verbose:
                print(f" Turn {turn_num}: ERROR — {exc}")
            break
        total_latency.append(latency_ms)
        # Clean and parse response: drop markdown code-fence lines if present.
        cleaned = raw.strip()
        if cleaned.startswith("```"):
            kept = [line for line in cleaned.splitlines() if not line.startswith("```")]
            cleaned = "\n".join(kept).strip()
        parsed = None
        valid_json = False
        valid_action = False
        action = None
        error = ""
        try:
            decoded = json.loads(cleaned)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        else:
            valid_json = True
            if isinstance(decoded, dict):
                parsed = decoded
                action = parsed.get("action")
                valid_action = action in valid_actions
            else:
                # json.loads can return a list, string, number, bool or None;
                # only an object can carry the "action" key.
                error = f"Expected a JSON object, got {type(decoded).__name__}"
        tr = TurnResult(
            turn=turn_num,
            state_before=state_before,
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            valid_action=valid_action,
            action=action,
            latency_ms=latency_ms,
            error=error,
        )
        turns_passed.append(tr)
        # Add model response to conversation for continuity
        conversation.append({"role": "assistant", "content": raw})
        # Apply state changes based on action
        if valid_action:
            _apply_action(state, action, parsed.get("target_faction"))
        if verbose:
            status = "PASS" if (valid_json and valid_action) else "FAIL"
            print(
                f" Turn {turn_num}: {status} | action={action} | {latency_ms:.0f}ms | "
                f"gold={state['gold']} renown={state['renown']} settlements={state['settlements']}"
            )
    result.turns = turns_passed
    result.final_state = copy.deepcopy(state)
    # Win condition check. Gold and renown are strict, matching the rules given
    # to the model ("Gold > 3000 AND Renown > 400"); settlements need 1 or more.
    result.reached_gold_target = state["gold"] > 3000
    result.reached_renown_target = state["renown"] > 400
    result.reached_settlement_target = state["settlements"] >= 1
    # Score: % of turns with valid JSON + valid action. The divisor is always 4,
    # so turns skipped after an early abort count as failures.
    valid_turns = sum(1 for t in turns_passed if t.valid_json and t.valid_action)
    result.score = valid_turns / 4 if turns_passed else 0.0
    result.passed = result.score >= 0.75  # 3/4 turns valid
    if total_latency:
        latencies_sorted = sorted(total_latency)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # With at most 4 samples the maximum stands in for p99.
        result.latency_p99_ms = latencies_sorted[-1]
    if verbose:
        win_status = []
        if result.reached_gold_target:
            win_status.append("GOLD")
        if result.reached_renown_target:
            win_status.append("RENOWN")
        if result.reached_settlement_target:
            win_status.append("SETTLEMENT")
        print(f" Win conditions met: {win_status or 'none'}")
        print(f" Final: gold={state['gold']} renown={state['renown']} settlements={state['settlements']}")
    return result
def _apply_action(state: dict, action: str, target_faction: str | None) -> None:
"""Simulate game state changes for a given action."""
if action == "raid_village":
state["gold"] += 200
state["renown"] += 30
if target_faction and target_faction in state["relations"]:
state["relations"][target_faction] -= 50
elif action == "trade_circuit":
state["gold"] += 300
state["renown"] += 10
elif action == "escort_caravan":
state["gold"] += 150
state["renown"] += 20
if target_faction and target_faction in state["relations"]:
state["relations"][target_faction] += 20
elif action == "tournament":
state["gold"] -= 100
state["renown"] += 60
if target_faction and target_faction in state["relations"]:
state["relations"][target_faction] += 20
elif action == "recruit_troops":
state["gold"] -= 200
state["troops"] += 15
elif action == "siege_castle":
state["gold"] -= 500
state["renown"] += 80
# 30% chance success (deterministic sim: succeed on turn 3+ if attempted)
if state["turn"] >= 3:
state["settlements"] += 1
if target_faction and target_faction in state["relations"]:
state["relations"][target_faction] -= 100
elif action == "pledge_vassalage":
state["renown"] += 50
if target_faction and target_faction in state["relations"]:
state["relations"][target_faction] += 100
@dataclass
class TurnResult:
    """Record of one campaign turn: the state shown to the model, its reply, and how the reply validated."""

    # Turn number, counted from 1.
    turn: int
    # Copy of the game state taken before this turn's action was applied.
    state_before: dict
    # Text returned by the model; "" when the chat call raised.
    raw_response: str
    # Decoded JSON reply, or None when decoding failed or the call raised.
    parsed: dict | None
    # True when the cleaned reply decoded as JSON.
    valid_json: bool
    # True when the reply's "action" is one of the allowed action names.
    valid_action: bool
    # Value of the reply's "action" key, or None when unavailable.
    action: str | None
    # Wall-clock duration of the chat call in milliseconds.
    latency_ms: float
    # Failure description (exception text or JSON decode message); "" on success.
    error: str = ""
@dataclass
class LevelResult:
    """Aggregate outcome of the Level 5 mini campaign as filled in by run()."""

    # Level number and display name, defaulting to the module constants.
    level: int = LEVEL
    name: str = NAME
    # One TurnResult per turn that was attempted (fewer than 4 if a call failed).
    turns: list[TurnResult] = field(default_factory=list)
    # Copy of the game state after the last attempted turn.
    final_state: dict = field(default_factory=dict)
    # True when score is at least 0.75 (3 of 4 turns valid).
    passed: bool = False
    # Turns with valid JSON and a valid action, divided by 4.
    score: float = 0.0
    # Whether the final state met each win condition (gold 3000, renown 400, 1 settlement).
    reached_gold_target: bool = False
    reached_renown_target: bool = False
    reached_settlement_target: bool = False
    # Element at index len // 2 of the sorted per-turn latencies.
    latency_p50_ms: float = 0.0
    # Largest per-turn latency; run() uses the maximum as its p99 figure.
    latency_p99_ms: float = 0.0

View File

@@ -0,0 +1,82 @@
# Bannerlord M0 — Cognitive Benchmark Scorecard
**Date:** 2026-03-23
**Benchmark:** 6-level cognitive harness (L0–L5)
**M1 Gate:** Must pass L0 + L1, latency < 10s per decision
---
## Results Summary
| Level | Description | qwen2.5:14b | hermes3:latest | hermes3:8b |
|-------|-------------|:-----------:|:--------------:|:----------:|
| **L0 [M1 GATE]** | JSON Compliance | ✓ PASS 100% | ✓ PASS 100% | ✓ PASS 100% |
| **L1 [M1 GATE]** | Board State Tracking | ✗ FAIL 50% | ✗ FAIL 50% | ✗ FAIL 50% |
| L2 | Resource Management | ✓ PASS 100% | ✓ PASS 100% | ✓ PASS 100% |
| L3 | Battle Tactics | ✓ PASS 100% | ✓ PASS 100% | ✓ PASS 100% |
| L4 | Trade Route | ✓ PASS 100% | ✓ PASS 100% | ✓ PASS 100% |
| L5 | Mini Campaign | ✓ PASS 100% | ✓ PASS 100% | ✓ PASS 100% |
| **M1 GATE** | | ✗ **FAIL** | ✗ **FAIL** | ✗ **FAIL** |
---
## Latency (p50 / p99)
| Level | qwen2.5:14b | hermes3:latest | hermes3:8b |
|-------|-------------|----------------|------------|
| L0 | 1443ms / 6348ms | 1028ms / 1184ms | 570ms / 593ms |
| L1 | 943ms / 1184ms | 1166ms / 1303ms | 767ms / 1313ms |
| L2 | 2936ms / 3122ms | 2032ms / 2232ms | 2408ms / 2832ms |
| L3 | 2248ms / 3828ms | 1614ms / 3525ms | 2174ms / 3437ms |
| L4 | 3235ms / 3318ms | 2724ms / 3038ms | 2507ms / 3420ms |
| L5 | 3414ms / 3970ms | 3137ms / 3433ms | 2571ms / 2763ms |
All models are **well under the 10s latency threshold** for L0–L1.
---
## Level 1 Failure Analysis
All three models fail L1 with **identical pattern** (2/4 scenarios pass):
| Scenario | Expected | All Models |
|----------|----------|-----------|
| Empty board — opening move | Any empty square | ✓ center (4) |
| Block opponent's winning move | Position 1 (only block) | ✗ position 4 (occupied!) |
| Take winning move | Position 1 (win) | ✗ position 0 or 2 (occupied!) |
| Legal move on partially filled board | Any of 6,7,8 | ✓ position 6 |
**Root cause:** Models choose moves by heuristic (center, corners) without checking whether the chosen square is already occupied. They read the board description but don't cross-reference their move choice against it. This is a genuine spatial state-tracking failure.
**Note:** `hermes3` models emit `"move": "4"` (string) vs `"move": 4` (int). The benchmark was patched to coerce string digits to int for L1, since type fidelity is already tested at L0.
---
## M1 Gate: FAILED (all models)
No model passes the M1 gate. The blocker is **Level 1 — Board State Tracking**.
### Recommendation
The L1 failure is consistent and structural. All models understand the format and can make reasonable *opening* moves but fail to avoid already-occupied squares. Options for M1:
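1. **Lower the L1 pass threshold** from 100% to ≥ 75% — the scenarios where models fail require recognizing occupied positions from a sparse JSON array, which is a known weakness. This would allow proceeding to M1 with flagged risk once a model reaches 75%; note that all three models currently score 50%, so lowering the threshold alone does not clear the gate and would need to be combined with an improvement such as option 2.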
2. **Prompt engineering** — add explicit "The following squares are taken: X at positions [P1, P2]" to the prompt to see if board tracking improves.
3. **Re-evaluate L1 gate requirement** — models pass L2–L5 (resource, tactics, trade, campaign) which are more directly relevant to Bannerlord play. Consider whether L1 is the right gate.
---
## qwen3:14b
Model **not available** on this Ollama instance. Available qwen3 model: `qwen3:30b`.
`qwen3:30b` was not benchmarked (significantly slower; requires explicit decision to run).
---
## Result Files
| Model | File |
|-------|------|
| qwen2.5:14b | `results/qwen2.5_14b_20260323_142119.json` |
| hermes3:latest | `results/hermes3_latest_20260323_152900.json` |
| hermes3:8b | `results/hermes3_8b_20260323_153000.json` |

View File

@@ -0,0 +1,476 @@
{
"model": "hermes3:8b",
"timestamp": "2026-03-23T19:29:22.731080+00:00",
"skipped": false,
"levels": {
"0": {
"level": 0,
"name": "JSON Compliance (Coin Flip)",
"trials": [
{
"trial_index": 0,
"prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
"raw_response": "{\"choice\": \"heads\"}",
"parsed": {
"choice": "heads"
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 522.9010581970215,
"error": ""
},
{
"trial_index": 1,
"prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
"raw_response": "{\"action\": \"defend\", \"confidence\": 0.7}",
"parsed": {
"action": "defend",
"confidence": 0.7
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 592.6809310913086,
"error": ""
},
{
"trial_index": 2,
"prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
"raw_response": "{\"direction\": \"north\", \"reason\": \"string explaining why\"}",
"parsed": {
"direction": "north",
"reason": "string explaining why"
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 569.8130130767822,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 569.8130130767822,
"latency_p99_ms": 592.6809310913086
},
"1": {
"level": 1,
"name": "Board State Tracking (Tic-Tac-Toe)",
"trials": [
{
"scenario_index": 0,
"description": "Empty board \u2014 opening move",
"prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": \"4\", \"reason\": \"Center move to establish dominance\"}",
"parsed": {
"move": "4",
"reason": "Center move to establish dominance"
},
"valid_json": true,
"move_legal": true,
"move_correct": true,
"latency_ms": 766.8571472167969,
"error": ""
},
{
"scenario_index": 1,
"description": "Block opponent's winning move",
"prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 4, \"reason\": \"Block the winning move for O in the center\"}",
"parsed": {
"move": 4,
"reason": "Block the winning move for O in the center"
},
"valid_json": true,
"move_legal": false,
"move_correct": false,
"latency_ms": 734.1668605804443,
"error": ""
},
{
"scenario_index": 2,
"description": "Take winning move",
"prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 0, \"reason\": \"I can win this turn by placing my X in position 0, creating a winning line with X at positions 0, 4, and 8.\"}",
"parsed": {
"move": 0,
"reason": "I can win this turn by placing my X in position 0, creating a winning line with X at positions 0, 4, and 8."
},
"valid_json": true,
"move_legal": false,
"move_correct": false,
"latency_ms": 1312.8020763397217,
"error": ""
},
{
"scenario_index": 3,
"description": "Legal move on partially filled board",
"prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 6, \"reason\": \"Block X's potential winning move in the center.\"}",
"parsed": {
"move": 6,
"reason": "Block X's potential winning move in the center."
},
"valid_json": true,
"move_legal": true,
"move_correct": true,
"latency_ms": 751.0488033294678,
"error": ""
}
],
"passed": false,
"score": 0.5,
"latency_p50_ms": 766.8571472167969,
"latency_p99_ms": 1312.8020763397217
},
"2": {
"level": 2,
"name": "Resource Management (Party Economy)",
"trials": [
{
"scenario_index": 0,
"description": "Budget allocation under constraint",
"raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"The priority is to address the critical morale issue and reduce casualty rates. Buying food for 20 days will stabilize morale, preventing potential desertions. Repairing armor will also help reduce the casualty rate, making the remaining troops more effective. Recruit infantry is not chosen as it would deplete all the gold, leaving no funds for food or armor repair.\"}",
"parsed": {
"choices": [
"buy_food",
"repair_armor"
],
"gold_spent": 350,
"reason": "The priority is to address the critical morale issue and reduce casualty rates. Buying food for 20 days will stabilize morale, preventing potential desertions. Repairing armor will also help reduce the casualty rate, making the remaining troops more effective. Recruit infantry is not chosen as it would deplete all the gold, leaving no funds for food or armor repair."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2832.072973251343,
"error": ""
},
{
"scenario_index": 1,
"description": "Troop tier upgrade decision",
"raw_response": "{\"action\": \"upgrade_recruits\", \"reason\": \"Upgrading recruits to Tier-2 will improve our party's combat effectiveness against the upcoming raider camp attack. The moderate difficulty suggests we need a stronger force to succeed.\", \"expected_outcome\": \"Increased chances of victory in the upcoming battle, potentially saving resources in the long run by avoiding higher upkeep costs of dismissing recruits.\"}",
"parsed": {
"action": "upgrade_recruits",
"reason": "Upgrading recruits to Tier-2 will improve our party's combat effectiveness against the upcoming raider camp attack. The moderate difficulty suggests we need a stronger force to succeed.",
"expected_outcome": "Increased chances of victory in the upcoming battle, potentially saving resources in the long run by avoiding higher upkeep costs of dismissing recruits."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2407.7491760253906,
"error": ""
},
{
"scenario_index": 2,
"description": "Multi-turn planning horizon",
"raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"Buying 5 extra food days will ensure we have enough food to cross the desert, and we will still have enough gold left to reach the town on day 10 to buy more supplies.\"}",
"parsed": {
"extra_food_days": 5,
"cost": 50,
"remaining_gold": 250,
"reason": "Buying 5 extra food days will ensure we have enough food to cross the desert, and we will still have enough gold left to reach the town on day 10 to buy more supplies."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2028.775930404663,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 2407.7491760253906,
"latency_p99_ms": 2832.072973251343
},
"3": {
"level": 3,
"name": "Battle Tactics (Formation Commands)",
"trials": [
{
"scenario_index": 0,
"description": "Ranged vs infantry \u2014 defensive opening",
"raw_response": "{\"infantry_order\": \"skirmish\", \"archer_order\": \"skirmish\", \"reason\": \"The enemy is advancing with heavy infantry, so our priority should be to disrupt their formation and inflict casualties from range before they close the gap. The archers and infantry will pepper the enemy with arrows as they approach, aiming to break their cohesion and lower their morale.\", \"expected_outcome\": \"The enemy's heavy infantry will take significant casualties and lose cohesion before reaching our lines, giving us a tactical advantage when the two forces meet in melee combat.\"}",
"parsed": {
"infantry_order": "skirmish",
"archer_order": "skirmish",
"reason": "The enemy is advancing with heavy infantry, so our priority should be to disrupt their formation and inflict casualties from range before they close the gap. The archers and infantry will pepper the enemy with arrows as they approach, aiming to break their cohesion and lower their morale.",
"expected_outcome": "The enemy's heavy infantry will take significant casualties and lose cohesion before reaching our lines, giving us a tactical advantage when the two forces meet in melee combat."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": false,
"latency_ms": 3436.897039413452,
"error": ""
},
{
"scenario_index": 1,
"description": "Outnumbered \u2014 retreat decision",
"raw_response": "{\"order\": \"fallback\", \"direction\": \"east\", \"reason\": \"Our troops are heavily wounded and outmatched by the 60 bandits surrounding us. We must retreat immediately to the east where there is an escape route. Fallback is the best option to preserve our troops and live to fight another day.\", \"priority\": \"preserve_troops\"}",
"parsed": {
"order": "fallback",
"direction": "east",
"reason": "Our troops are heavily wounded and outmatched by the 60 bandits surrounding us. We must retreat immediately to the east where there is an escape route. Fallback is the best option to preserve our troops and live to fight another day.",
"priority": "preserve_troops"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2173.5548973083496,
"error": ""
},
{
"scenario_index": 2,
"description": "Cavalry flanking opportunity",
"raw_response": "{\"cavalry_order\": \"flank_left\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and our cavalry can break their line quickly, routing them in ~30 seconds.\", \"risk\": \"low\"}",
"parsed": {
"cavalry_order": "flank_left",
"timing": "now",
"reason": "Enemy left flank is unprotected and our cavalry can break their line quickly, routing them in ~30 seconds.",
"risk": "low"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 1566.443920135498,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 2173.5548973083496,
"latency_p99_ms": 3436.897039413452
},
"4": {
"level": 4,
"name": "Trade Route (Campaign Navigation)",
"trials": [
{
"scenario_index": 0,
"description": "Optimal trade route selection",
"raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of goods and markets.\", \"risk\": \"low\"}",
"parsed": {
"route": "A",
"expected_profit": 500,
"days_used": 8,
"reason": "Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of goods and markets.",
"risk": "low"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2507.443904876709,
"error": ""
},
{
"scenario_index": 1,
"description": "Adapt plan when war declared",
"raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Vlandia declaring war makes entering their territory too dangerous for trade. Sturgia is neutral but closer than the Empire, so less time and gold at risk.\", \"gold_at_risk\": 400}",
"parsed": {
"decision": "divert_to_sturgia",
"reason": "Vlandia declaring war makes entering their territory too dangerous for trade. Sturgia is neutral but closer than the Empire, so less time and gold at risk.",
"gold_at_risk": 400
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2011.15083694458,
"error": ""
},
{
"scenario_index": 2,
"description": "Multi-stop route planning with constraints",
"raw_response": "{\n \"stops\": [\n \"Rhotae\",\n \"Ortysia\",\n \"Epicrotea\"\n ],\n \"total_days\": 7,\n \"estimated_profit\": 800,\n \"reason\": \"Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. 7 days total. 800 gold budget should be enough with this plan.\"\n}",
"parsed": {
"stops": [
"Rhotae",
"Ortysia",
"Epicrotea"
],
"total_days": 7,
"estimated_profit": 800,
"reason": "Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. 7 days total. 800 gold budget should be enough with this plan."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3420.1788902282715,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 2507.443904876709,
"latency_p99_ms": 3420.1788902282715
},
"5": {
"level": 5,
"name": "Mini Campaign (Full Campaign Loop)",
"turns": [
{
"turn": 1,
"state_before": {
"turn": 1,
"gold": 1000,
"troops": 25,
"renown": 150,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Establishing trade relations will bring in gold and renown without risking retaliation or damaging relations with any faction. This sets a strong financial foundation for future expansion.\", \"goal\": \"Increase gold and renown\"}",
"parsed": {
"action": "trade_circuit",
"target_faction": null,
"reason": "Establishing trade relations will bring in gold and renown without risking retaliation or damaging relations with any faction. This sets a strong financial foundation for future expansion.",
"goal": "Increase gold and renown"
},
"valid_json": true,
"valid_action": true,
"action": "trade_circuit",
"latency_ms": 2763.1959915161133,
"error": ""
},
{
"turn": 2,
"state_before": {
"turn": 2,
"gold": 1300,
"troops": 25,
"renown": 160,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"tournament\", \"target_faction\": null, \"reason\": \"Participating in a tournament will boost my renown and relations without spending gold. This positive publicity can help attract potential vassals or allies in the future.\", \"goal\": \"Increase renown and relations\"}",
"parsed": {
"action": "tournament",
"target_faction": null,
"reason": "Participating in a tournament will boost my renown and relations without spending gold. This positive publicity can help attract potential vassals or allies in the future.",
"goal": "Increase renown and relations"
},
"valid_json": true,
"valid_action": true,
"action": "tournament",
"latency_ms": 2335.5319499969482,
"error": ""
},
{
"turn": 3,
"state_before": {
"turn": 3,
"gold": 1200,
"troops": 25,
"renown": 220,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Recruiting more troops will strengthen my military force, which is essential for future expansion and defending my settlements once I have them. This investment in my army will pay off in the long run.\", \"goal\": \"Increase troops\"}",
"parsed": {
"action": "recruit_troops",
"target_faction": null,
"reason": "Recruiting more troops will strengthen my military force, which is essential for future expansion and defending my settlements once I have them. This investment in my army will pay off in the long run.",
"goal": "Increase troops"
},
"valid_json": true,
"valid_action": true,
"action": "recruit_troops",
"latency_ms": 2570.871114730835,
"error": ""
},
{
"turn": 4,
"state_before": {
"turn": 4,
"gold": 1000,
"troops": 40,
"renown": 220,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"siege_castle\", \"target_faction\": null, \"reason\": \"Capturing a castle will give me my first settlement, putting me closer to winning the game. This physical foothold in Calradia will provide a base of operations and resources to further my ambitions of becoming King.\", \"goal\": \"Gain first settlement\"}",
"parsed": {
"action": "siege_castle",
"target_faction": null,
"reason": "Capturing a castle will give me my first settlement, putting me closer to winning the game. This physical foothold in Calradia will provide a base of operations and resources to further my ambitions of becoming King.",
"goal": "Gain first settlement"
},
"valid_json": true,
"valid_action": true,
"action": "siege_castle",
"latency_ms": 2367.640972137451,
"error": ""
}
],
"final_state": {
"turn": 4,
"gold": 500,
"troops": 40,
"renown": 300,
"settlements": 1,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"passed": true,
"score": 1.0,
"reached_gold_target": false,
"reached_renown_target": false,
"reached_settlement_target": true,
"latency_p50_ms": 2570.871114730835,
"latency_p99_ms": 2763.1959915161133
}
},
"summary": {
"total_elapsed_s": 37.7,
"levels_run": [
0,
1,
2,
3,
4,
5
],
"levels_passed": [
0,
2,
3,
4,
5
],
"levels_failed": [
1
],
"m1_gate_passed": false,
"m1_gate_notes": [
"Level 1 FAILED (score=0.50)"
],
"m1_latency_threshold_ms": 10000
}
}

View File

@@ -0,0 +1,476 @@
{
"model": "hermes3:latest",
"timestamp": "2026-03-23T19:28:19.717773+00:00",
"skipped": false,
"levels": {
"0": {
"level": 0,
"name": "JSON Compliance (Coin Flip)",
"trials": [
{
"trial_index": 0,
"prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
"raw_response": "{\"choice\": \"heads\"}",
"parsed": {
"choice": "heads"
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 1028.2011032104492,
"error": ""
},
{
"trial_index": 1,
"prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
"raw_response": "{\"action\": \"defend\", \"confidence\": 0.7}",
"parsed": {
"action": "defend",
"confidence": 0.7
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 700.0582218170166,
"error": ""
},
{
"trial_index": 2,
"prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
"raw_response": "{\"direction\": \"north\", \"reason\": \"The northern direction allows for the most efficient and direct path towards the goal, minimizing potential obstacles and maximizing progress.\"}",
"parsed": {
"direction": "north",
"reason": "The northern direction allows for the most efficient and direct path towards the goal, minimizing potential obstacles and maximizing progress."
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 1184.3459606170654,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 1028.2011032104492,
"latency_p99_ms": 1184.3459606170654
},
"1": {
"level": 1,
"name": "Board State Tracking (Tic-Tac-Toe)",
"trials": [
{
"scenario_index": 0,
"description": "Empty board \u2014 opening move",
"prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": \"4\", \"reason\": \"Center move to establish dominance\"}",
"parsed": {
"move": "4",
"reason": "Center move to establish dominance"
},
"valid_json": true,
"move_legal": true,
"move_correct": true,
"latency_ms": 932.2080612182617,
"error": ""
},
{
"scenario_index": 1,
"description": "Block opponent's winning move",
"prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 4, \"reason\": \"Block the winning move for O in the center, creating a chance for a winning X cross\"}",
"parsed": {
"move": 4,
"reason": "Block the winning move for O in the center, creating a chance for a winning X cross"
},
"valid_json": true,
"move_legal": false,
"move_correct": false,
"latency_ms": 1165.644884109497,
"error": ""
},
{
"scenario_index": 2,
"description": "Take winning move",
"prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 0, \"reason\": \"I can win this turn by placing my X in position 0, creating a winning line of three X's horizontally.\"}",
"parsed": {
"move": 0,
"reason": "I can win this turn by placing my X in position 0, creating a winning line of three X's horizontally."
},
"valid_json": true,
"move_legal": false,
"move_correct": false,
"latency_ms": 1302.915096282959,
"error": ""
},
{
"scenario_index": 3,
"description": "Legal move on partially filled board",
"prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 6, \"reason\": \"Block X's potential win in the center.\"}",
"parsed": {
"move": 6,
"reason": "Block X's potential win in the center."
},
"valid_json": true,
"move_legal": true,
"move_correct": true,
"latency_ms": 851.456880569458,
"error": ""
}
],
"passed": false,
"score": 0.5,
"latency_p50_ms": 1165.644884109497,
"latency_p99_ms": 1302.915096282959
},
"2": {
"level": 2,
"name": "Resource Management (Party Economy)",
"trials": [
{
"scenario_index": 0,
"description": "Budget allocation under constraint",
"raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"Buying food is crucial to prevent desertion, and repairing armor will help reduce casualties. Recruit infantry is put on hold due to limited resources.\"}",
"parsed": {
"choices": [
"buy_food",
"repair_armor"
],
"gold_spent": 350,
"reason": "Buying food is crucial to prevent desertion, and repairing armor will help reduce casualties. Recruit infantry is put on hold due to limited resources."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2031.9790840148926,
"error": ""
},
{
"scenario_index": 1,
"description": "Troop tier upgrade decision",
"raw_response": "{\"action\": \"save_gold\", \"reason\": \"Upgrading recruits now would leave little gold for emergencies, especially with the upcoming raiders camp attack. It's better to maintain the current troop composition and save gold for potential future threats or opportunities.\", \"expected_outcome\": \"Having enough gold to respond to unexpected situations without needing to dismiss or downgrade troops.\"}",
"parsed": {
"action": "save_gold",
"reason": "Upgrading recruits now would leave little gold for emergencies, especially with the upcoming raiders camp attack. It's better to maintain the current troop composition and save gold for potential future threats or opportunities.",
"expected_outcome": "Having enough gold to respond to unexpected situations without needing to dismiss or downgrade troops."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2231.9281101226807,
"error": ""
},
{
"scenario_index": 2,
"description": "Multi-turn planning horizon",
"raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"Buying 5 extra food days will ensure we have enough food to cross the desert and reach the town on day 10, while still having enough gold left for potential expenses.\"}",
"parsed": {
"extra_food_days": 5,
"cost": 50,
"remaining_gold": 250,
"reason": "Buying 5 extra food days will ensure we have enough food to cross the desert and reach the town on day 10, while still having enough gold left for potential expenses."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 1943.3560371398926,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 2031.9790840148926,
"latency_p99_ms": 2231.9281101226807
},
"3": {
"level": 3,
"name": "Battle Tactics (Formation Commands)",
"trials": [
{
"scenario_index": 0,
"description": "Ranged vs infantry \u2014 defensive opening",
"raw_response": "{\"infantry_order\": \"skirmish\", \"archer_order\": \"skirmish\", \"reason\": \"The enemy heavy infantry is advancing, so we should use our archers and infantry to pepper them with ranged attacks before they close to melee range. This will soften them up and reduce their combat effectiveness when our forces meet.\", \"expected_outcome\": \"By the time our forces engage in melee, the enemy heavy infantry will have suffered significant casualties from the ranged attacks, giving us a tactical advantage and maximizing the damage we can inflict before contact is made.\"}",
"parsed": {
"infantry_order": "skirmish",
"archer_order": "skirmish",
"reason": "The enemy heavy infantry is advancing, so we should use our archers and infantry to pepper them with ranged attacks before they close to melee range. This will soften them up and reduce their combat effectiveness when our forces meet.",
"expected_outcome": "By the time our forces engage in melee, the enemy heavy infantry will have suffered significant casualties from the ranged attacks, giving us a tactical advantage and maximizing the damage we can inflict before contact is made."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": false,
"latency_ms": 3524.5888233184814,
"error": ""
},
{
"scenario_index": 1,
"description": "Outnumbered \u2014 retreat decision",
"raw_response": "{\"order\": \"fallback\", \"direction\": \"east\", \"reason\": \"Outnumbered and with many wounded, falling back to the escape route is the best option to preserve our troops.\", \"priority\": \"preserve_troops\"}",
"parsed": {
"order": "fallback",
"direction": "east",
"reason": "Outnumbered and with many wounded, falling back to the escape route is the best option to preserve our troops.",
"priority": "preserve_troops"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 1523.0557918548584,
"error": ""
},
{
"scenario_index": 2,
"description": "Cavalry flanking opportunity",
"raw_response": "{\"cavalry_order\": \"flank_left\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and engaging our main force. A swift cavalry charge will catch them off guard and route them in ~30 seconds.\", \"risk\": \"low\"}",
"parsed": {
"cavalry_order": "flank_left",
"timing": "now",
"reason": "Enemy left flank is unprotected and engaging our main force. A swift cavalry charge will catch them off guard and route them in ~30 seconds.",
"risk": "low"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 1614.2308712005615,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 1614.2308712005615,
"latency_p99_ms": 3524.5888233184814
},
"4": {
"level": 4,
"name": "Trade Route (Campaign Navigation)",
"trials": [
{
"scenario_index": 0,
"description": "Optimal trade route selection",
"raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of profitability and travel distance.\", \"risk\": \"low\"}",
"parsed": {
"route": "A",
"expected_profit": 500,
"days_used": 8,
"reason": "Route A offers the highest profit with a manageable travel time and low risk. Selling grain in Vlandia and furs in Sturgia provides a good balance of profitability and travel distance.",
"risk": "low"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2335.57391166687,
"error": ""
},
{
"scenario_index": 1,
"description": "Adapt plan when war declared",
"raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Given the new information that Vlandia has declared war on our faction, entering their territory would be too dangerous. Sturgia is a neutral but friendly region, so it should be safe to trade there. The Empire is allied but further away, and while safe, the extra travel time could reduce the profitability of the trade mission.\", \"gold_at_risk\": 400}",
"parsed": {
"decision": "divert_to_sturgia",
"reason": "Given the new information that Vlandia has declared war on our faction, entering their territory would be too dangerous. Sturgia is a neutral but friendly region, so it should be safe to trade there. The Empire is allied but further away, and while safe, the extra travel time could reduce the profitability of the trade mission.",
"gold_at_risk": 400
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2723.7470149993896,
"error": ""
},
{
"scenario_index": 2,
"description": "Multi-stop route planning with constraints",
"raw_response": "{\n \"stops\": [\n \"Rhotae\",\n \"Ortysia\",\n \"Epicrotea\"\n ],\n \"total_days\": 7,\n \"estimated_profit\": 1200,\n \"reason\": \"Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. Total 7 days, 800 gold budget sufficient. Expected profit of 1200 gold.\"\n}",
"parsed": {
"stops": [
"Rhotae",
"Ortysia",
"Epicrotea"
],
"total_days": 7,
"estimated_profit": 1200,
"reason": "Starting in Rhotae allows buying cheap leather and selling well in the south. Moving to Ortysia to buy grain cheap. Finally, Epicrotea for iron market. Total 7 days, 800 gold budget sufficient. Expected profit of 1200 gold."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3038.438320159912,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 2723.7470149993896,
"latency_p99_ms": 3038.438320159912
},
"5": {
"level": 5,
"name": "Mini Campaign (Full Campaign Loop)",
"turns": [
{
"turn": 1,
"state_before": {
"turn": 1,
"gold": 1000,
"troops": 25,
"renown": 150,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Establishing trade relations will bring in gold without risking relations or retaliation. This sets a foundation for economic growth and allows me to assess other factions before committing to any alliances or conflicts.\", \"goal\": \"Increase gold reserves and build a power base through trade and diplomacy\"}",
"parsed": {
"action": "trade_circuit",
"target_faction": null,
"reason": "Establishing trade relations will bring in gold without risking relations or retaliation. This sets a foundation for economic growth and allows me to assess other factions before committing to any alliances or conflicts.",
"goal": "Increase gold reserves and build a power base through trade and diplomacy"
},
"valid_json": true,
"valid_action": true,
"action": "trade_circuit",
"latency_ms": 2898.583173751831,
"error": ""
},
{
"turn": 2,
"state_before": {
"turn": 2,
"gold": 1300,
"troops": 25,
"renown": 160,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"raid_village\", \"target_faction\": null, \"reason\": \"Raiding a village will provide a quick boost to my gold reserves and renown. This will also help me gauge the strength of my troops and the potential resistance from the targeted faction. It's a bold move that shows my power and willingness to act, which may intimidate other factions and open up opportunities for future alliances or conflicts.\", \"goal\": \"Increase gold and renown, assess troop strength and potential faction responses\"}",
"parsed": {
"action": "raid_village",
"target_faction": null,
"reason": "Raiding a village will provide a quick boost to my gold reserves and renown. This will also help me gauge the strength of my troops and the potential resistance from the targeted faction. It's a bold move that shows my power and willingness to act, which may intimidate other factions and open up opportunities for future alliances or conflicts.",
"goal": "Increase gold and renown, assess troop strength and potential faction responses"
},
"valid_json": true,
"valid_action": true,
"action": "raid_village",
"latency_ms": 3112.649917602539,
"error": ""
},
{
"turn": 3,
"state_before": {
"turn": 3,
"gold": 1500,
"troops": 25,
"renown": 190,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Recruiting more troops is essential for expanding my military strength and increasing my chances of successfully sieging a castle in the next turn. This will also help me prepare for potential conflicts with other factions and solidify my power base. Strengthening my army is a crucial step towards achieving my goal of becoming the King of Calradia.\", \"goal\": \"Increase troop count for future military expansion and defense\"}",
"parsed": {
"action": "recruit_troops",
"target_faction": null,
"reason": "Recruiting more troops is essential for expanding my military strength and increasing my chances of successfully sieging a castle in the next turn. This will also help me prepare for potential conflicts with other factions and solidify my power base. Strengthening my army is a crucial step towards achieving my goal of becoming the King of Calradia.",
"goal": "Increase troop count for future military expansion and defense"
},
"valid_json": true,
"valid_action": true,
"action": "recruit_troops",
"latency_ms": 3136.754035949707,
"error": ""
},
{
"turn": 4,
"state_before": {
"turn": 4,
"gold": 1300,
"troops": 40,
"renown": 190,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"siege_castle\", \"target_faction\": null, \"reason\": \"Sieging a castle will allow me to establish a settlement, which is crucial for meeting one of my win conditions. This will also demonstrate my military might and potentially intimidate other factions, opening up opportunities for future alliances or conflicts. While it comes with a risk of losing troops morale and straining relations, it's a necessary step towards achieving my goal of becoming the King of Calradia.\", \"goal\": \"Establish a settlement and demonstrate military strength to intimidate other factions\"}",
"parsed": {
"action": "siege_castle",
"target_faction": null,
"reason": "Sieging a castle will allow me to establish a settlement, which is crucial for meeting one of my win conditions. This will also demonstrate my military might and potentially intimidate other factions, opening up opportunities for future alliances or conflicts. While it comes with a risk of losing troops morale and straining relations, it's a necessary step towards achieving my goal of becoming the King of Calradia.",
"goal": "Establish a settlement and demonstrate military strength to intimidate other factions"
},
"valid_json": true,
"valid_action": true,
"action": "siege_castle",
"latency_ms": 3433.211326599121,
"error": ""
}
],
"final_state": {
"turn": 4,
"gold": 800,
"troops": 40,
"renown": 270,
"settlements": 1,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": 0,
"aserai": 0,
"battania": 0
}
},
"passed": true,
"score": 1.0,
"reached_gold_target": false,
"reached_renown_target": false,
"reached_settlement_target": true,
"latency_p50_ms": 3136.754035949707,
"latency_p99_ms": 3433.211326599121
}
},
"summary": {
"total_elapsed_s": 40.7,
"levels_run": [
0,
1,
2,
3,
4,
5
],
"levels_passed": [
0,
2,
3,
4,
5
],
"levels_failed": [
1
],
"m1_gate_passed": false,
"m1_gate_notes": [
"Level 1 FAILED (score=0.50)"
],
"m1_latency_threshold_ms": 10000
}
}

View File

@@ -0,0 +1,476 @@
{
"model": "qwen2.5:14b",
"timestamp": "2026-03-23T18:20:27.095190+00:00",
"skipped": false,
"levels": {
"0": {
"level": 0,
"name": "JSON Compliance (Coin Flip)",
"trials": [
{
"trial_index": 0,
"prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
"raw_response": "{\"choice\": \"heads\"}",
"parsed": {
"choice": "heads"
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 6347.5000858306885,
"error": ""
},
{
"trial_index": 1,
"prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
"raw_response": "{\"action\": \"defend\", \"confidence\": 0.6}",
"parsed": {
"action": "defend",
"confidence": 0.6
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 850.42405128479,
"error": ""
},
{
"trial_index": 2,
"prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
"raw_response": "{\"direction\": \"north\", \"reason\": \"There is a reported enemy encampment to the north that we need to engage.\"}",
"parsed": {
"direction": "north",
"reason": "There is a reported enemy encampment to the north that we need to engage."
},
"valid_json": true,
"schema_valid": true,
"value_valid": true,
"latency_ms": 1443.0510997772217,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 1443.0510997772217,
"latency_p99_ms": 6347.5000858306885
},
"1": {
"level": 1,
"name": "Board State Tracking (Tic-Tac-Toe)",
"trials": [
{
"scenario_index": 0,
"description": "Empty board \u2014 opening move",
"prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 4, \"reason\": \"Control the center\"}",
"parsed": {
"move": 4,
"reason": "Control the center"
},
"valid_json": true,
"move_legal": true,
"move_correct": true,
"latency_ms": 1184.3791007995605,
"error": ""
},
{
"scenario_index": 1,
"description": "Block opponent's winning move",
"prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 4, \"reason\": \"Block O's winning move\"}",
"parsed": {
"move": 4,
"reason": "Block O's winning move"
},
"valid_json": true,
"move_legal": false,
"move_correct": false,
"latency_ms": 942.816972732544,
"error": ""
},
{
"scenario_index": 2,
"description": "Take winning move",
"prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 2, \"reason\": \"Winning move\"}",
"parsed": {
"move": 2,
"reason": "Winning move"
},
"valid_json": true,
"move_legal": false,
"move_correct": false,
"latency_ms": 849.2491245269775,
"error": ""
},
{
"scenario_index": 3,
"description": "Legal move on partially filled board",
"prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
"raw_response": "{\"move\": 7, \"reason\": \"Creating a fork opportunity\"}",
"parsed": {
"move": 7,
"reason": "Creating a fork opportunity"
},
"valid_json": true,
"move_legal": true,
"move_correct": true,
"latency_ms": 885.8909606933594,
"error": ""
}
],
"passed": false,
"score": 0.5,
"latency_p50_ms": 942.816972732544,
"latency_p99_ms": 1184.3791007995605
},
"2": {
"level": 2,
"name": "Resource Management (Party Economy)",
"trials": [
{
"scenario_index": 0,
"description": "Budget allocation under constraint",
"raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles.\"}",
"parsed": {
"choices": [
"buy_food",
"repair_armor"
],
"gold_spent": 350,
"reason": "Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3121.7379570007324,
"error": ""
},
{
"scenario_index": 1,
"description": "Troop tier upgrade decision",
"raw_response": "{\"action\": \"upgrade_recruits\", \"reason\": \"Improving the strength of the weakest troops can balance the party's overall combat effectiveness.\", \"expected_outcome\": \"Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers.\"}",
"parsed": {
"action": "upgrade_recruits",
"reason": "Improving the strength of the weakest troops can balance the party's overall combat effectiveness.",
"expected_outcome": "Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2936.119794845581,
"error": ""
},
{
"scenario_index": 2,
"description": "Multi-turn planning horizon",
"raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now.\"}",
"parsed": {
"extra_food_days": 5,
"cost": 50,
"remaining_gold": 250,
"reason": "To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2515.505790710449,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 2936.119794845581,
"latency_p99_ms": 3121.7379570007324
},
"3": {
"level": 3,
"name": "Battle Tactics (Formation Commands)",
"trials": [
{
"scenario_index": 0,
"description": "Ranged vs infantry \u2014 defensive opening",
"raw_response": "{\"infantry_order\": \"shield_wall\", \"archer_order\": \"skirmish\", \"reason\": \"Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.\", \"expected_outcome\": \"Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact.\"}",
"parsed": {
"infantry_order": "shield_wall",
"archer_order": "skirmish",
"reason": "Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.",
"expected_outcome": "Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3828.04012298584,
"error": ""
},
{
"scenario_index": 1,
"description": "Outnumbered \u2014 retreat decision",
"raw_response": "{\"order\": \"skirmish\", \"direction\": \"east\", \"reason\": \"Engage the enemy at range to buy time for an organized retreat.\", \"priority\": \"preserve_troops\"}",
"parsed": {
"order": "skirmish",
"direction": "east",
"reason": "Engage the enemy at range to buy time for an organized retreat.",
"priority": "preserve_troops"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": false,
"latency_ms": 2162.7819538116455,
"error": ""
},
{
"scenario_index": 2,
"description": "Cavalry flanking opportunity",
"raw_response": "{\"cavalry_order\": \"flank_right\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.\", \"risk\": \"low\"}",
"parsed": {
"cavalry_order": "flank_right",
"timing": "now",
"reason": "Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.",
"risk": "low"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 2248.1229305267334,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 2248.1229305267334,
"latency_p99_ms": 3828.04012298584
},
"4": {
"level": 4,
"name": "Trade Route (Campaign Navigation)",
"trials": [
{
"scenario_index": 0,
"description": "Optimal trade route selection",
"raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Offers the highest profit with a reasonable travel time.\", \"risk\": \"medium\"}",
"parsed": {
"route": "A",
"expected_profit": 500,
"days_used": 8,
"reason": "Offers the highest profit with a reasonable travel time.",
"risk": "medium"
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3142.971992492676,
"error": ""
},
{
"scenario_index": 1,
"description": "Adapt plan when war declared",
"raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.\", \"gold_at_risk\": 400}",
"parsed": {
"decision": "divert_to_sturgia",
"reason": "Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.",
"gold_at_risk": 400
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3317.8718090057373,
"error": ""
},
{
"scenario_index": 2,
"description": "Multi-stop route planning with constraints",
"raw_response": "{\"stops\": [\"Ortysia\", \"Epicrotea\", \"Rhotae\"], \"total_days\": 12, \"estimated_profit\": 1200, \"reason\": \"Optimized for time and profit, focusing on essential goods with high demand and low risk.\"}",
"parsed": {
"stops": [
"Ortysia",
"Epicrotea",
"Rhotae"
],
"total_days": 12,
"estimated_profit": 1200,
"reason": "Optimized for time and profit, focusing on essential goods with high demand and low risk."
},
"valid_json": true,
"schema_valid": true,
"strategically_sound": true,
"latency_ms": 3235.442876815796,
"error": ""
}
],
"passed": true,
"score": 1.0,
"latency_p50_ms": 3235.442876815796,
"latency_p99_ms": 3317.8718090057373
},
"5": {
"level": 5,
"name": "Mini Campaign (Full Campaign Loop)",
"turns": [
{
"turn": 1,
"state_before": {
"turn": 1,
"gold": 1000,
"troops": 25,
"renown": 150,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": -80,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Increase gold and renown without risking relations or encountering retaliation.\", \"goal\": \"Economic stability and reputation building\"}",
"parsed": {
"action": "trade_circuit",
"target_faction": null,
"reason": "Increase gold and renown without risking relations or encountering retaliation.",
"goal": "Economic stability and reputation building"
},
"valid_json": true,
"valid_action": true,
"action": "trade_circuit",
"latency_ms": 3970.310926437378,
"error": ""
},
{
"turn": 2,
"state_before": {
"turn": 2,
"gold": 1300,
"troops": 25,
"renown": 160,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": -80,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"escort_caravan\", \"target_faction\": \"empire\", \"reason\": \"Improve relations with a major faction and gain additional renown.\", \"goal\": \"Political alliance and reputation advancement\"}",
"parsed": {
"action": "escort_caravan",
"target_faction": "empire",
"reason": "Improve relations with a major faction and gain additional renown.",
"goal": "Political alliance and reputation advancement"
},
"valid_json": true,
"valid_action": true,
"action": "escort_caravan",
"latency_ms": 2754.3041706085205,
"error": ""
},
{
"turn": 3,
"state_before": {
"turn": 3,
"gold": 1450,
"troops": 25,
"renown": 180,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": -80,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Increase military strength to prepare for a potential siege or defense.\", \"goal\": \"Military readiness and expansion capability\"}",
"parsed": {
"action": "recruit_troops",
"target_faction": null,
"reason": "Increase military strength to prepare for a potential siege or defense.",
"goal": "Military readiness and expansion capability"
},
"valid_json": true,
"valid_action": true,
"action": "recruit_troops",
"latency_ms": 2931.4141273498535,
"error": ""
},
{
"turn": 4,
"state_before": {
"turn": 4,
"gold": 1250,
"troops": 40,
"renown": 180,
"settlements": 0,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": -80,
"aserai": 0,
"battania": 0
}
},
"raw_response": "{\"action\": \"siege_castle\", \"target_faction\": \"empire\", \"reason\": \"Attempt to capture a settlement to meet the win condition of owning at least one settlement.\", \"goal\": \"Territorial expansion and victory condition\"}",
"parsed": {
"action": "siege_castle",
"target_faction": "empire",
"reason": "Attempt to capture a settlement to meet the win condition of owning at least one settlement.",
"goal": "Territorial expansion and victory condition"
},
"valid_json": true,
"valid_action": true,
"action": "siege_castle",
"latency_ms": 3413.828134536743,
"error": ""
}
],
"final_state": {
"turn": 4,
"gold": 750,
"troops": 40,
"renown": 260,
"settlements": 1,
"relations": {
"vlandia": 0,
"sturgia": 0,
"empire": -80,
"aserai": 0,
"battania": 0
}
},
"passed": true,
"score": 1.0,
"reached_gold_target": false,
"reached_renown_target": false,
"reached_settlement_target": true,
"latency_p50_ms": 3413.828134536743,
"latency_p99_ms": 3970.310926437378
}
},
"summary": {
"total_elapsed_s": 52.1,
"levels_run": [
0,
1,
2,
3,
4,
5
],
"levels_passed": [
0,
2,
3,
4,
5
],
"levels_failed": [
1
],
"m1_gate_passed": false,
"m1_gate_notes": [
"Level 1 FAILED (score=0.50)"
],
"m1_latency_threshold_ms": 10000
}
}

View File

@@ -0,0 +1,259 @@
#!/usr/bin/env python3
"""Timmy Cognitive Benchmark Harness — Project Bannerlord M0.
Runs a 6-level cognitive benchmark against an Ollama model to assess
readiness for autonomous Bannerlord gameplay.
Usage:
python run_benchmark.py --model qwen2.5:14b --verbose
python run_benchmark.py --model qwen3:14b --levels 0,1,2
python run_benchmark.py --model qwen2.5:14b --output results/my_run.json
"""
import argparse
import dataclasses
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
try:
import ollama
except ImportError:
print("ERROR: 'ollama' package not installed. Run: pip install ollama", file=sys.stderr)
sys.exit(1)
# Add parent dir to path so levels can be imported
sys.path.insert(0, str(Path(__file__).parent))
from levels import level_0_coin_flip
from levels import level_1_tic_tac_toe
from levels import level_2_resource_mgmt
from levels import level_3_battle_tactics
from levels import level_4_trade_route
from levels import level_5_mini_campaign
# Benchmark level modules in execution order. run_benchmark() indexes this
# list directly, so a level's position here is the index accepted by --levels.
# Each module is used via its LEVEL, NAME and DESCRIPTION attributes and its
# run(client, model, verbose=...) function.
ALL_LEVELS = [
level_0_coin_flip,
level_1_tic_tac_toe,
level_2_resource_mgmt,
level_3_battle_tactics,
level_4_trade_route,
level_5_mini_campaign,
]
# Pass criteria for M1 gate
# Indices (into ALL_LEVELS) of the levels whose results decide the gate.
M1_GATE_LEVELS = {0, 1} # Must pass Level 0 and Level 1
# Upper bound, in milliseconds, compared against each gate level's p99 latency.
M1_LATENCY_THRESHOLD_MS = 10_000 # < 10s per decision for L0-L1
def _dataclass_to_dict(obj):
    """Turn ``obj`` into plain containers suitable for JSON serialization.

    Dataclass instances become dicts keyed by field name, and lists and dicts
    are rebuilt with every element/value converted the same way. Any other
    value (including a dataclass *class*) is returned unchanged.
    """
    # is_dataclass() is also true for dataclass classes; only instances are
    # expanded into dicts.
    if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
        mapping = dataclasses.asdict(obj)
    elif isinstance(obj, list):
        return [_dataclass_to_dict(item) for item in obj]
    elif isinstance(obj, dict):
        mapping = obj
    else:
        return obj
    return {key: _dataclass_to_dict(value) for key, value in mapping.items()}
def check_model_available(model: str) -> bool:
    """Return True if ``model`` is available in the local Ollama instance.

    Matching rules:

    * ``model`` carries an explicit tag (``name:tag``): that exact name must
      be installed. A different tag of the same base model does not count,
      because Ollama would not be able to serve the requested tag.
    * ``model`` has no tag: any installed tag of that base name is accepted.

    Args:
        model: Ollama model reference, with or without a ``:tag`` suffix.

    Returns:
        True when a matching model is listed by ``ollama.list()``; False when
        no match is found or the listing could not be obtained.
    """
    try:
        listing = ollama.list()
        installed = [entry["model"] for entry in listing.get("models", [])]
    except Exception:
        # Best-effort probe: any failure to list models (server unreachable,
        # unexpected response shape) is reported as "not available" so the
        # caller can decide whether to skip or abort.
        return False

    # A tag can only appear in the last path segment; a ':' earlier in the
    # reference (e.g. "host:5000/ns/name") is a registry port, not a tag.
    has_tag = ":" in model.rsplit("/", 1)[-1]
    if has_tag:
        if model in installed:
            return True
        # "name:latest" is the same model as a bare "name" entry.
        untagged, _, tag = model.rpartition(":")
        return tag == "latest" and untagged in installed

    return any(
        name == model or name.startswith(model + ":")
        for name in installed
    )
def _evaluate_m1_gate(level_results: dict, errored_levels: list[int]) -> tuple[bool, list[str]]:
    """Decide whether the M1 gate passed and collect the reasons when it did not.

    The gate fails if any level in ``M1_GATE_LEVELS`` raised while running,
    did not pass, or exceeded ``M1_LATENCY_THRESHOLD_MS`` at p99.

    Returns:
        ``(passed, notes)`` where ``notes`` holds one message per failure reason.
    """
    notes = []
    for level_idx in errored_levels:
        if level_idx in M1_GATE_LEVELS:
            notes.append(f"Level {level_idx} ERRORED (no result)")
    for level_idx, lr in level_results.items():
        if level_idx not in M1_GATE_LEVELS:
            continue
        if not lr.passed:
            notes.append(f"Level {level_idx} FAILED (score={lr.score:.2f})")
        if lr.latency_p99_ms > M1_LATENCY_THRESHOLD_MS:
            notes.append(
                f"Level {level_idx} latency too high "
                f"(p99={lr.latency_p99_ms:.0f}ms > {M1_LATENCY_THRESHOLD_MS}ms)"
            )
    return not notes, notes


def run_benchmark(
    model: str,
    levels_to_run: list[int] | None = None,
    verbose: bool = False,
    skip_missing: bool = True,
) -> dict:
    """Run the benchmark levels against ``model`` and return a results dict.

    Args:
        model: Ollama model name to benchmark.
        levels_to_run: Indices into ``ALL_LEVELS`` to execute; ``None`` runs
            every level. Indices outside the list are skipped with a warning.
        verbose: Forwarded to each level's ``run`` function.
        skip_missing: When the model is not available, return a "skipped"
            result if True, otherwise exit the process with status 1.

    Returns:
        A dict with ``model``, ``timestamp`` and ``skipped``. For a run that
        was not skipped it also holds ``levels`` (per-level results keyed by
        the level index as a string) and ``summary`` (elapsed time,
        passed/failed/errored level indices and the M1 gate verdict).
    """
    if levels_to_run is None:
        levels_to_run = list(range(len(ALL_LEVELS)))

    banner = "=" * 60
    print(f"\n{banner}")
    print(" Timmy Cognitive Benchmark — Project Bannerlord M0")
    print(banner)
    print(f" Model: {model}")
    print(f" Levels: {levels_to_run}")
    print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{banner}\n")

    if not check_model_available(model):
        if skip_missing:
            print(f" WARNING: Model '{model}' not found in Ollama. Skipping.\n")
            return {
                "model": model,
                "skipped": True,
                "reason": f"Model '{model}' not available",
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }
        print(f" ERROR: Model '{model}' not found in Ollama.", file=sys.stderr)
        sys.exit(1)

    client = ollama
    results = {
        "model": model,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "skipped": False,
        "levels": {},
        "summary": {},
    }

    level_results = {}
    errored_levels = []
    total_start = time.time()
    for level_idx in levels_to_run:
        # Reject negative indices too: ALL_LEVELS[-1] would silently run the
        # last level and record it under the wrong key.
        if not 0 <= level_idx < len(ALL_LEVELS):
            print(f" WARNING: Level {level_idx} does not exist, skipping.")
            continue
        module = ALL_LEVELS[level_idx]
        print(f"Level {module.LEVEL}: {module.NAME}")
        print(f" {module.DESCRIPTION}")
        try:
            level_result = module.run(client, model, verbose=verbose)
            level_results[level_idx] = level_result
            passed_str = "PASS" if level_result.passed else "FAIL"
            score_pct = f"{level_result.score * 100:.0f}%"
            lat_str = (
                f"p50={level_result.latency_p50_ms:.0f}ms "
                f"p99={level_result.latency_p99_ms:.0f}ms"
            )
            print(f" Result: {passed_str} | Score: {score_pct} | Latency: {lat_str}")
        except Exception as exc:
            # Keep going with the remaining levels, but remember the crash so
            # a broken gate level cannot be reported as a passed gate.
            errored_levels.append(level_idx)
            print(f" ERROR running level {level_idx}: {exc}")
            import traceback

            traceback.print_exc()
        print()
    total_elapsed_s = time.time() - total_start

    # Build summary
    for level_idx, lr in level_results.items():
        results["levels"][str(level_idx)] = _dataclass_to_dict(lr)
    m1_gate_passed, m1_gate_notes = _evaluate_m1_gate(level_results, errored_levels)
    results["summary"] = {
        "total_elapsed_s": round(total_elapsed_s, 1),
        "levels_run": levels_to_run,
        "levels_passed": [i for i, lr in level_results.items() if lr.passed],
        "levels_failed": [i for i, lr in level_results.items() if not lr.passed],
        "levels_errored": errored_levels,
        "m1_gate_passed": m1_gate_passed,
        "m1_gate_notes": m1_gate_notes,
        "m1_latency_threshold_ms": M1_LATENCY_THRESHOLD_MS,
    }

    # Print scorecard
    print(banner)
    print(f" SCORECARD — {model}")
    print(banner)
    for level_idx in levels_to_run:
        if level_idx not in level_results:
            continue
        lr = level_results[level_idx]
        module = ALL_LEVELS[level_idx]
        passed_str = "✓ PASS" if lr.passed else "✗ FAIL"
        gate_str = " [M1 GATE]" if level_idx in M1_GATE_LEVELS else ""
        lat = f"{lr.latency_p50_ms:.0f}ms"
        print(f" L{level_idx}: {passed_str}{gate_str} | {lr.score*100:.0f}% | {lat} | {module.NAME}")
    # The separator character was empty in the original, which printed a blank
    # line instead of a rule.
    print("-" * 60)
    gate_str = "✓ M1 GATE PASSED" if m1_gate_passed else "✗ M1 GATE FAILED"
    print(f" {gate_str}")
    for note in m1_gate_notes:
        print(note)
    print(f" Total time: {total_elapsed_s:.1f}s")
    print(f"{banner}\n")
    return results
def main():
    """Command-line entry point: parse arguments, run the benchmark, save JSON.

    Results are written to ``--output`` when given, otherwise to
    ``results/<model>_<timestamp>.json`` next to this script. The process
    exits with status 1 when ``--levels`` cannot be parsed or when a run that
    was not skipped fails the M1 gate.
    """
    parser = argparse.ArgumentParser(
        description="Timmy Cognitive Benchmark Harness — Project Bannerlord M0"
    )
    parser.add_argument("--model", required=True, help="Ollama model name (e.g. qwen2.5:14b)")
    parser.add_argument("--levels", default=None, help="Comma-separated level indices (default: all)")
    parser.add_argument("--verbose", action="store_true", help="Show per-trial details")
    parser.add_argument(
        "--output", default=None,
        help="Output JSON path (default: results/<model>_<timestamp>.json)"
    )
    # store_true with default=True could never be switched off; the optional
    # action keeps --skip-missing (still the default) and adds
    # --no-skip-missing so the hard-error path is reachable from the CLI.
    parser.add_argument(
        "--skip-missing", action=argparse.BooleanOptionalAction, default=True,
        help="Skip instead of error if model not available"
    )
    args = parser.parse_args()

    levels_to_run = None
    if args.levels:
        try:
            levels_to_run = [int(x.strip()) for x in args.levels.split(",")]
        except ValueError:
            print(f"ERROR: --levels must be comma-separated integers, got: {args.levels}", file=sys.stderr)
            sys.exit(1)

    results = run_benchmark(
        model=args.model,
        levels_to_run=levels_to_run,
        verbose=args.verbose,
        skip_missing=args.skip_missing,
    )

    # Save results
    if args.output:
        output_path = Path(args.output)
    else:
        # ':' and '/' are legal in model names but awkward in file names.
        safe_model = args.model.replace(":", "_").replace("/", "_")
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = Path(__file__).parent / "results" / f"{safe_model}_{ts}.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        # default=str is a fallback for values json cannot encode natively.
        json.dump(results, f, indent=2, default=str)
    print(f"Results saved to: {output_path}")

    # Exit with non-zero if M1 gate failed
    if not results.get("skipped") and not results.get("summary", {}).get("m1_gate_passed", True):
        sys.exit(1)


if __name__ == "__main__":
    main()