WIP: Claude Code progress on #1092
Automated salvage commit — agent session ended (exit 124). Work in progress, may need continuation.
This commit is contained in:
1
timmy-benchmark/levels/__init__.py
Normal file
1
timmy-benchmark/levels/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Cognitive benchmark levels for Project Bannerlord readiness testing."""
|
||||
183
timmy-benchmark/levels/level_0_coin_flip.py
Normal file
183
timmy-benchmark/levels/level_0_coin_flip.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""Level 0: JSON Compliance — Coin Flip.
|
||||
|
||||
Tests whether the model can reliably return well-formed JSON responses
|
||||
with a specific schema. This is the minimum bar for GABS tool calls.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Identification metadata for this benchmark level.
LEVEL = 0
NAME = "JSON Compliance (Coin Flip)"
DESCRIPTION = "Model must return valid JSON matching a strict schema on each trial."

# System message sent ahead of every trial prompt.
SYSTEM_PROMPT = """You are a strategic AI agent. You MUST respond ONLY with valid JSON.
No markdown, no explanation, no code fences. Raw JSON only."""

# One entry per trial:
#   "prompt"       - user message sent to the model
#   "schema"       - required keys mapped to the Python type each value must have
#   "valid_values" - allowed values for the enumerated keys
TRIALS = [
    {
        "prompt": (
            'A coin is flipped. Respond with exactly: {"choice": "heads"} or {"choice": "tails"}. '
            "Pick one. JSON only."
        ),
        "schema": {"choice": str},
        "valid_values": {"choice": ["heads", "tails"]},
    },
    {
        "prompt": (
            'You must attack or defend. Respond with: {"action": "attack", "confidence": 0.8} '
            'or {"action": "defend", "confidence": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.'
        ),
        "schema": {"action": str, "confidence": float},
        # Only "action" is enumerated; the 0.0-1.0 range requested for
        # "confidence" in the prompt is not expressed in this table.
        "valid_values": {"action": ["attack", "defend"]},
    },
    {
        "prompt": (
            'Choose a direction to march. Respond with exactly: '
            '{"direction": "north", "reason": "string explaining why"}. '
            "Pick north/south/east/west. JSON only."
        ),
        "schema": {"direction": str, "reason": str},
        "valid_values": {"direction": ["north", "south", "east", "west"]},
    },
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrialResult:
|
||||
trial_index: int
|
||||
prompt: str
|
||||
raw_response: str
|
||||
parsed: dict | None
|
||||
valid_json: bool
|
||||
schema_valid: bool
|
||||
value_valid: bool
|
||||
latency_ms: float
|
||||
error: str = ""
|
||||
|
||||
|
||||
@dataclass
class LevelResult:
    """Aggregate outcome of one Level 0 run."""

    level: int = LEVEL
    name: str = NAME
    # default_factory gives every instance its own list of trial results.
    trials: list[TrialResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0  # fraction of trials that passed
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
|
||||
|
||||
|
||||
def _validate_schema(parsed: dict, schema: dict[str, type]) -> bool:
|
||||
for key, expected_type in schema.items():
|
||||
if key not in parsed:
|
||||
return False
|
||||
if not isinstance(parsed[key], expected_type):
|
||||
# Allow int where float is expected
|
||||
if expected_type is float and isinstance(parsed[key], int):
|
||||
continue
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _validate_values(parsed: dict, valid_values: dict[str, list]) -> bool:
|
||||
for key, valid_list in valid_values.items():
|
||||
if key in parsed and parsed[key] not in valid_list:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _clean_response(raw: str) -> str:
|
||||
"""Strip markdown fences if model wrapped JSON in them."""
|
||||
raw = raw.strip()
|
||||
if raw.startswith("```"):
|
||||
lines = raw.splitlines()
|
||||
# Remove first and last fence lines
|
||||
lines = [l for l in lines if not l.startswith("```")]
|
||||
raw = "\n".join(lines).strip()
|
||||
return raw
|
||||
|
||||
|
||||
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every trial in TRIALS against ``model`` and score JSON compliance.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose return value is indexed as ``response["message"]["content"]``.
        model: Model identifier passed through to ``client.chat``.
        verbose: When true, print a one-line summary per trial.

    Returns:
        A LevelResult with one TrialResult per trial, the fraction of trials
        that returned valid JSON matching the schema, the pass flag (every
        trial must succeed), and latency figures over the calls that returned.
    """
    result = LevelResult()
    latencies = []

    for i, trial in enumerate(TRIALS):
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": trial["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # A failed call is recorded as a failed trial; the level keeps going.
            latency_ms = (time.perf_counter() - t0) * 1000
            tr = TrialResult(
                trial_index=i,
                prompt=trial["prompt"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                value_valid=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(tr)
            if verbose:
                print(f" Trial {i}: ERROR — {exc}")
            continue

        # Only calls that returned contribute to the latency statistics.
        latencies.append(latency_ms)

        parsed = None
        valid_json = False
        schema_valid = False
        value_valid = False
        error = ""

        try:
            decoded = json.loads(_clean_response(raw))
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        else:
            valid_json = True
            if isinstance(decoded, dict):
                parsed = decoded
                schema_valid = _validate_schema(parsed, trial["schema"])
                value_valid = _validate_values(parsed, trial["valid_values"])
            else:
                # Valid JSON that is not an object (bare number, list, string)
                # cannot match a schema; record it instead of letting the
                # validators raise and abort the whole level.
                error = f"Expected a JSON object, got {type(decoded).__name__}"

        tr = TrialResult(
            trial_index=i,
            prompt=trial["prompt"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            value_valid=value_valid,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(tr)

        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            print(
                f" Trial {i}: {status} | json={valid_json} schema={schema_valid} "
                f"value={value_valid} | {latency_ms:.0f}ms | {raw[:80]!r}"
            )

    # value_valid is recorded per trial but is not part of the pass criterion.
    passed_trials = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = passed_trials / len(TRIALS)
    result.passed = result.score >= 1.0  # Must pass every trial

    if latencies:
        latencies_sorted = sorted(latencies)
        # Upper median when the sample count is even.
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # The slowest observed call is reported as p99.
        result.latency_p99_ms = latencies_sorted[-1]

    return result
|
||||
208
timmy-benchmark/levels/level_1_tic_tac_toe.py
Normal file
208
timmy-benchmark/levels/level_1_tic_tac_toe.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""Level 1: Board State Tracking — Tic-Tac-Toe.
|
||||
|
||||
Tests whether the model can maintain game state across turns, select
|
||||
legal moves, and exhibit basic strategic awareness.
|
||||
Maps to: Bannerlord board state / campaign map tracking.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Identification metadata for this benchmark level.
LEVEL = 1
NAME = "Board State Tracking (Tic-Tac-Toe)"
DESCRIPTION = "Model must track a tic-tac-toe board and make legal, strategic moves."

# System message sent ahead of every scenario prompt; it fixes the 0-8 cell
# numbering and the JSON reply format.
SYSTEM_PROMPT = """You are a strategic AI playing tic-tac-toe. The board is a 3x3 grid.
Positions are numbered 0-8 left-to-right, top-to-bottom:
0|1|2
3|4|5
6|7|8

You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only.
Format: {"move": <position 0-8>, "reason": "<brief reason>"}"""


# One entry per scenario:
#   "board"      - nine cells in 0-8 order; None marks an empty cell
#   "player"     - the mark the model plays
#   "prompt"     - user message sent to the model
#   "check"      - callable (move, board) -> bool deciding whether the move is the wanted one
#   "check_desc" - human-readable statement of what "check" expects
SCENARIOS = [
    {
        "description": "Empty board — opening move",
        "board": [None, None, None, None, None, None, None, None, None],
        "player": "X",
        "prompt": (
            'Board state: [null,null,null,null,null,null,null,null,null]. '
            'You are X. It is your turn. Choose a move. '
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move in range(9) and board[move] is None,
        "check_desc": "Move must be a valid empty position (0-8)",
    },
    {
        "description": "Block opponent's winning move",
        "board": ["O", None, "O", None, "X", None, None, None, None],
        "player": "X",
        "prompt": (
            'Board: ["O",null,"O",null,"X",null,null,null,null]. '
            "O has positions 0 and 2. You are X. "
            "O will win on next turn unless you block. "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move == 1,  # Must block at position 1
        "check_desc": "Must block O's win at position 1",
    },
    {
        "description": "Take winning move",
        "board": ["X", None, "X", None, "O", None, None, "O", None],
        "player": "X",
        "prompt": (
            'Board: ["X",null,"X",null,"O",null,null,"O",null]. '
            "You are X. You have positions 0 and 2. "
            "You can win this turn. "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move == 1,  # Win at position 1
        "check_desc": "Must take winning move at position 1",
    },
    {
        "description": "Legal move on partially filled board",
        # NOTE(review): X and O each hold three cells here while O is the side
        # to move — confirm the position is intended.
        "board": ["X", "O", "X", "O", "X", "O", None, None, None],
        "player": "O",
        "prompt": (
            'Board: ["X","O","X","O","X","O",null,null,null]. '
            "You are O. Choose a legal move (positions 6, 7, or 8 are available). "
            'Respond: {"move": <0-8>, "reason": "<why>"}'
        ),
        "check": lambda move, board: move in [6, 7, 8],
        "check_desc": "Move must be one of the empty positions: 6, 7, or 8",
    },
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScenarioResult:
|
||||
scenario_index: int
|
||||
description: str
|
||||
prompt: str
|
||||
raw_response: str
|
||||
parsed: dict | None
|
||||
valid_json: bool
|
||||
move_legal: bool
|
||||
move_correct: bool
|
||||
latency_ms: float
|
||||
error: str = ""
|
||||
|
||||
|
||||
@dataclass
class LevelResult:
    """Aggregate outcome of one Level 1 run."""

    level: int = LEVEL
    name: str = NAME
    # default_factory gives every instance its own list of scenario results.
    trials: list[ScenarioResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0  # fraction of scenarios answered with a legal move
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
|
||||
|
||||
|
||||
def _clean_response(raw: str) -> str:
|
||||
raw = raw.strip()
|
||||
if raw.startswith("```"):
|
||||
lines = raw.splitlines()
|
||||
lines = [l for l in lines if not l.startswith("```")]
|
||||
raw = "\n".join(lines).strip()
|
||||
return raw
|
||||
|
||||
|
||||
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every tic-tac-toe scenario against ``model`` and score move legality.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose return value is indexed as ``response["message"]["content"]``.
        model: Model identifier passed through to ``client.chat``.
        verbose: When true, print a summary line per scenario.

    Returns:
        A LevelResult with one ScenarioResult per scenario, the fraction of
        scenarios answered with valid JSON and a legal move, the pass flag
        (every scenario must be legal), and latency figures over the calls
        that returned.
    """
    result = LevelResult()
    latencies = []

    for i, scenario in enumerate(SCENARIOS):
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # A failed call is recorded as a failed scenario; the level keeps going.
            latency_ms = (time.perf_counter() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                prompt=scenario["prompt"],
                raw_response="",
                parsed=None,
                valid_json=False,
                move_legal=False,
                move_correct=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue

        # Only calls that returned contribute to the latency statistics.
        latencies.append(latency_ms)

        parsed = None
        valid_json = False
        move_legal = False
        move_correct = False
        error = ""

        try:
            decoded = json.loads(_clean_response(raw))
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        else:
            valid_json = True
            if isinstance(decoded, dict):
                parsed = decoded
                move = parsed.get("move")
                # bool is a subclass of int; JSON true/false is not a board cell.
                if isinstance(move, int) and not isinstance(move, bool):
                    board = scenario["board"]
                    move_legal = 0 <= move <= 8 and board[move] is None
                    move_correct = scenario["check"](move, board)
            else:
                # Valid JSON that is not an object (bare number, list, string)
                # carries no move; record it instead of raising on key lookup.
                error = f"Expected a JSON object, got {type(decoded).__name__}"

        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            prompt=scenario["prompt"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            move_legal=move_legal,
            move_correct=move_correct,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)

        if verbose:
            status = "PASS" if (valid_json and move_legal) else "FAIL"
            correct_str = "CORRECT" if move_correct else "suboptimal"
            move_val = parsed.get("move", "?") if parsed else "?"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({correct_str}) "
                f"| move={move_val} | {latency_ms:.0f}ms"
            )
            if not move_correct and valid_json:
                print(f" Expected: {scenario['check_desc']}")

    # Pass criteria: all moves must be valid JSON + legal.
    # move_correct is recorded but does not affect the score.
    legal_moves = sum(1 for t in result.trials if t.valid_json and t.move_legal)
    result.score = legal_moves / len(SCENARIOS)
    result.passed = result.score >= 1.0

    if latencies:
        latencies_sorted = sorted(latencies)
        # Upper median when the sample count is even.
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # The slowest observed call is reported as p99.
        result.latency_p99_ms = latencies_sorted[-1]

    return result
|
||||
213
timmy-benchmark/levels/level_2_resource_mgmt.py
Normal file
213
timmy-benchmark/levels/level_2_resource_mgmt.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""Level 2: Resource Management — Party Economy.
|
||||
|
||||
Tests whether the model can allocate limited resources across competing
|
||||
priorities and adapt when constraints change.
|
||||
Maps to: Bannerlord party economy (troops, food, gold, morale).
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Identification metadata for this benchmark level.
LEVEL = 2
NAME = "Resource Management (Party Economy)"
DESCRIPTION = "Model must allocate limited resources across troops, food, and equipment."

# System message sent ahead of every scenario prompt.
SYSTEM_PROMPT = """You are a Bannerlord campaign advisor managing a party.
Resources are limited — every decision has trade-offs.
You MUST respond ONLY with valid JSON. No markdown, no explanation. Raw JSON only."""

# One entry per scenario:
#   "prompt"          - user message sent to the model
#   "check"           - callable (reply dict) -> bool validating the reply's shape
#   "check_desc"      - human-readable statement of what "check" expects
#   "strategic_check" - callable (reply dict) -> bool judging the decision itself
#   "strategic_desc"  - human-readable statement of the preferred decision
SCENARIOS = [
    {
        "description": "Budget allocation under constraint",
        "prompt": (
            "You have 500 gold. Options:\n"
            "- Recruit 10 infantry: costs 300 gold, +10 combat strength\n"
            "- Buy food for 20 days: costs 200 gold, keeps morale stable\n"
            "- Repair armor: costs 150 gold, -20% casualty rate\n\n"
            "You cannot afford all three. Morale is currently CRITICAL (troops may desert).\n"
            'Choose 1-2 options. Respond: {"choices": ["option_a", ...], "gold_spent": <int>, "reason": "<why>"}\n'
            "Where option keys are: recruit_infantry, buy_food, repair_armor"
        ),
        # The check looks only at the option names and the reported total; it
        # does not compare gold_spent with the listed prices or cap the number
        # of choices at two.
        "check": lambda r: (
            isinstance(r.get("choices"), list)
            and len(r["choices"]) >= 1
            and all(c in ["recruit_infantry", "buy_food", "repair_armor"] for c in r["choices"])
            and isinstance(r.get("gold_spent"), (int, float))
            and r.get("gold_spent", 9999) <= 500
        ),
        "check_desc": "choices must be valid options, gold_spent <= 500",
        "strategic_check": lambda r: "buy_food" in r.get("choices", []),
        "strategic_desc": "With CRITICAL morale, food should be prioritized",
    },
    {
        "description": "Troop tier upgrade decision",
        "prompt": (
            "Party status:\n"
            "- 15 Tier-1 recruits (weak, 30 upkeep/day)\n"
            "- 5 Tier-3 veterans (strong, 90 upkeep/day)\n"
            "- Daily income: 200 gold\n"
            "- Upcoming: raider camp attack (moderate difficulty)\n\n"
            "Options:\n"
            "- Upgrade 5 recruits to Tier-2 (costs 250 gold total)\n"
            "- Keep all current troops, save gold for emergencies\n"
            "- Dismiss 5 recruits to save upkeep\n\n"
            'Respond: {"action": "upgrade_recruits"|"save_gold"|"dismiss_recruits", '
            '"reason": "<why>", "expected_outcome": "<string>"}'
        ),
        # "expected_outcome" is requested in the prompt but not validated here.
        "check": lambda r: (
            r.get("action") in ["upgrade_recruits", "save_gold", "dismiss_recruits"]
            and isinstance(r.get("reason"), str)
            and len(r.get("reason", "")) > 0
        ),
        "check_desc": "action must be one of the three options with a non-empty reason",
        "strategic_check": lambda r: r.get("action") in ["upgrade_recruits", "save_gold"],
        "strategic_desc": "Dismissing troops before a fight is suboptimal",
    },
    {
        "description": "Multi-turn planning horizon",
        "prompt": (
            "Current: 300 gold, 10 days of food, 20 troops\n"
            "Day 5: Must cross desert (costs 5 extra food days)\n"
            "Day 10: Reach town (can buy supplies)\n\n"
            "You need a 15-day food reserve to survive the journey.\n"
            "Food costs 10 gold/day. You have enough for 10 days now.\n\n"
            "How many extra food days do you buy today?\n"
            'Respond: {"extra_food_days": <int>, "cost": <int>, "remaining_gold": <int>, "reason": "<why>"}'
        ),
        # Only the presence and numeric type of the three figures is checked;
        # their arithmetic consistency is not.
        "check": lambda r: (
            isinstance(r.get("extra_food_days"), (int, float))
            and isinstance(r.get("cost"), (int, float))
            and isinstance(r.get("remaining_gold"), (int, float))
        ),
        "check_desc": "Must include extra_food_days, cost, remaining_gold as numbers",
        "strategic_check": lambda r: r.get("extra_food_days", 0) >= 5,
        "strategic_desc": "Need at least 5 more days of food for desert crossing",
    },
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScenarioResult:
|
||||
scenario_index: int
|
||||
description: str
|
||||
raw_response: str
|
||||
parsed: dict | None
|
||||
valid_json: bool
|
||||
schema_valid: bool
|
||||
strategically_sound: bool
|
||||
latency_ms: float
|
||||
error: str = ""
|
||||
|
||||
|
||||
@dataclass
class LevelResult:
    """Aggregate outcome of one Level 2 run."""

    level: int = LEVEL
    name: str = NAME
    # default_factory gives every instance its own list of scenario results.
    trials: list[ScenarioResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0  # fraction of scenarios with a schema-valid reply
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
|
||||
|
||||
|
||||
def _clean_response(raw: str) -> str:
|
||||
raw = raw.strip()
|
||||
if raw.startswith("```"):
|
||||
lines = raw.splitlines()
|
||||
lines = [l for l in lines if not l.startswith("```")]
|
||||
raw = "\n".join(lines).strip()
|
||||
return raw
|
||||
|
||||
|
||||
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every resource-management scenario against ``model`` and score it.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose return value is indexed as ``response["message"]["content"]``.
        model: Model identifier passed through to ``client.chat``.
        verbose: When true, print a summary line per scenario.

    Returns:
        A LevelResult with one ScenarioResult per scenario, the fraction of
        scenarios whose reply was valid JSON accepted by the scenario's check,
        the pass flag (at least two thirds of the scenarios), and latency
        figures over the calls that returned.
    """
    result = LevelResult()
    latencies = []

    for i, scenario in enumerate(SCENARIOS):
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # A failed call is recorded as a failed scenario; the level keeps going.
            latency_ms = (time.perf_counter() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                strategically_sound=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue

        # Only calls that returned contribute to the latency statistics.
        latencies.append(latency_ms)

        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = scenario["check"](parsed)
            # The strategic judgement is only meaningful for a well-formed reply.
            if schema_valid:
                strategically_sound = scenario["strategic_check"](parsed)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            # The check callables assume a dict; any other JSON value, or an
            # unexpected value type inside it, surfaces here.
            error = f"Validation error: {exc}"

        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            strategically_sound=strategically_sound,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)

        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f" Schema issue: {scenario['check_desc']}")
            if not strategically_sound and schema_valid:
                print(f" Strategy note: {scenario['strategic_desc']}")

    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = valid_count / len(SCENARIOS)
    # Pass at two thirds of the scenarios. Compared in integers because
    # 2/3 == 0.666... falls below a literal 0.67 threshold.
    result.passed = 3 * valid_count >= 2 * len(SCENARIOS)

    if latencies:
        latencies_sorted = sorted(latencies)
        # Upper median when the sample count is even.
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # The slowest observed call is reported as p99.
        result.latency_p99_ms = latencies_sorted[-1]

    return result
|
||||
216
timmy-benchmark/levels/level_3_battle_tactics.py
Normal file
216
timmy-benchmark/levels/level_3_battle_tactics.py
Normal file
@@ -0,0 +1,216 @@
|
||||
"""Level 3: Battle Tactics — Formation Commands.
|
||||
|
||||
Tests whether the model can issue coherent formation and tactical orders
|
||||
under simulated battlefield pressure with multiple unit types.
|
||||
Maps to: Bannerlord formation commands (charge, shield wall, skirmish, etc.).
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Identification metadata for this benchmark level.
LEVEL = 3
NAME = "Battle Tactics (Formation Commands)"
DESCRIPTION = "Model must issue tactically sound formation orders under simulated battle conditions."

# System message sent ahead of every scenario prompt; it lists the command
# vocabulary that the scenario checks below accept.
SYSTEM_PROMPT = """You are a Bannerlord battle commander. Issue formation orders using these commands:
- shield_wall: infantry forms defensive line (good vs ranged, slow advance)
- charge: all-out attack (high casualties, breaks weak enemies fast)
- skirmish: ranged units pepper enemy (good vs heavy infantry, needs distance)
- advance: move forward holding formation (balanced)
- flank_left / flank_right: cavalry sweeps around enemy side
- fallback: retreat to regroup (when badly outnumbered)

You MUST respond ONLY with valid JSON. No markdown. Raw JSON only."""

# One entry per scenario:
#   "prompt"          - user message sent to the model
#   "check"           - callable (reply dict) -> bool validating the reply's shape
#   "check_desc"      - human-readable statement of what "check" expects
#   "strategic_check" - callable (reply dict) -> bool judging the order itself
#   "strategic_desc"  - human-readable statement of the preferred order
SCENARIOS = [
    {
        "description": "Ranged vs infantry — defensive opening",
        "prompt": (
            "Situation: You have 20 archers + 10 infantry. Enemy has 30 heavy infantry, no ranged.\n"
            "Enemy is 200m away and advancing.\n"
            "Objective: Maximize casualties before melee contact.\n\n"
            'Issue orders for both units. Respond:\n'
            '{"infantry_order": "<command>", "archer_order": "<command>", '
            '"reason": "<tactical reasoning>", "expected_outcome": "<string>"}'
        ),
        # "expected_outcome" is requested in the prompt but not validated here.
        "check": lambda r: (
            r.get("infantry_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
            and r.get("archer_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
            and isinstance(r.get("reason"), str)
        ),
        "check_desc": "Both orders must be valid commands",
        "strategic_check": lambda r: (
            r.get("archer_order") == "skirmish"
            and r.get("infantry_order") in ["shield_wall", "advance"]
        ),
        "strategic_desc": "Archers should skirmish while infantry holds (shield_wall or advance)",
    },
    {
        "description": "Outnumbered — retreat decision",
        "prompt": (
            "Situation: Your party (15 troops) has been ambushed.\n"
            "Enemy: 60 bandits, surrounding you on 3 sides.\n"
            "Your troops: 40% wounded. One escape route to the east.\n\n"
            'What is your command? Respond:\n'
            '{"order": "<command>", "direction": "east"|"west"|"north"|"south"|null, '
            '"reason": "<tactical reasoning>", "priority": "preserve_troops"|"fight_through"}'
        ),
        # "direction" and "reason" are requested in the prompt but not validated here.
        "check": lambda r: (
            r.get("order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
            and r.get("priority") in ["preserve_troops", "fight_through"]
        ),
        "check_desc": "order and priority must be valid values",
        "strategic_check": lambda r: (
            r.get("order") == "fallback"
            and r.get("priority") == "preserve_troops"
        ),
        "strategic_desc": "Outnumbered 4:1 with wounded troops — fallback is the sound choice",
    },
    {
        "description": "Cavalry flanking opportunity",
        "prompt": (
            "Situation: Main battle is engaged. Your infantry and enemy infantry are locked.\n"
            "You have 8 cavalry in reserve. Enemy left flank is unprotected.\n"
            "If cavalry hits the flank now, it will route enemy in ~30 seconds.\n\n"
            'Order for cavalry: Respond:\n'
            '{"cavalry_order": "<command>", "timing": "now"|"wait", '
            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
        ),
        "check": lambda r: (
            r.get("cavalry_order") in ["shield_wall", "charge", "skirmish", "advance", "flank_left", "flank_right", "fallback"]
            and r.get("timing") in ["now", "wait"]
            and r.get("risk") in ["low", "medium", "high"]
        ),
        "check_desc": "cavalry_order, timing, and risk must be valid values",
        "strategic_check": lambda r: (
            r.get("cavalry_order") in ["flank_left", "flank_right", "charge"]
            and r.get("timing") == "now"
        ),
        "strategic_desc": "Should capitalize on the flank opportunity immediately",
    },
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScenarioResult:
|
||||
scenario_index: int
|
||||
description: str
|
||||
raw_response: str
|
||||
parsed: dict | None
|
||||
valid_json: bool
|
||||
schema_valid: bool
|
||||
strategically_sound: bool
|
||||
latency_ms: float
|
||||
error: str = ""
|
||||
|
||||
|
||||
@dataclass
class LevelResult:
    """Aggregate outcome of one Level 3 run."""

    level: int = LEVEL
    name: str = NAME
    # default_factory gives every instance its own list of scenario results.
    trials: list[ScenarioResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0  # fraction of scenarios with a schema-valid reply
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
|
||||
|
||||
|
||||
def _clean_response(raw: str) -> str:
|
||||
raw = raw.strip()
|
||||
if raw.startswith("```"):
|
||||
lines = raw.splitlines()
|
||||
lines = [l for l in lines if not l.startswith("```")]
|
||||
raw = "\n".join(lines).strip()
|
||||
return raw
|
||||
|
||||
|
||||
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every battle-tactics scenario against ``model`` and score it.

    Args:
        client: Chat client exposing ``chat(model=..., messages=..., options=...)``
            whose return value is indexed as ``response["message"]["content"]``.
        model: Model identifier passed through to ``client.chat``.
        verbose: When true, print a summary line per scenario.

    Returns:
        A LevelResult with one ScenarioResult per scenario, the fraction of
        scenarios whose reply was valid JSON accepted by the scenario's check,
        the pass flag (at least two thirds of the scenarios), and latency
        figures over the calls that returned.
    """
    result = LevelResult()
    latencies = []

    for i, scenario in enumerate(SCENARIOS):
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        t0 = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.2},
            )
            raw = response["message"]["content"]
            latency_ms = (time.perf_counter() - t0) * 1000
        except Exception as exc:
            # A failed call is recorded as a failed scenario; the level keeps going.
            latency_ms = (time.perf_counter() - t0) * 1000
            sr = ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                strategically_sound=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(sr)
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue

        # Only calls that returned contribute to the latency statistics.
        latencies.append(latency_ms)

        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = scenario["check"](parsed)
            # The strategic judgement is only meaningful for a well-formed reply.
            if schema_valid:
                strategically_sound = scenario["strategic_check"](parsed)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            # The check callables assume a dict; any other JSON value, or an
            # unexpected value type inside it, surfaces here.
            error = f"Validation error: {exc}"

        sr = ScenarioResult(
            scenario_index=i,
            description=scenario["description"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            strategically_sound=strategically_sound,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(sr)

        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f" Schema issue: {scenario['check_desc']}")
            # Mirrors the level 2 runner: explain why a well-formed reply was
            # judged suboptimal.
            if not strategically_sound and schema_valid:
                print(f" Strategy note: {scenario['strategic_desc']}")

    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = valid_count / len(SCENARIOS)
    # Pass at two thirds of the scenarios. Compared in integers because
    # 2/3 == 0.666... falls below a literal 0.67 threshold.
    result.passed = 3 * valid_count >= 2 * len(SCENARIOS)

    if latencies:
        latencies_sorted = sorted(latencies)
        # Upper median when the sample count is even.
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # The slowest observed call is reported as p99.
        result.latency_p99_ms = latencies_sorted[-1]

    return result
|
||||
223
timmy-benchmark/levels/level_4_trade_route.py
Normal file
223
timmy-benchmark/levels/level_4_trade_route.py
Normal file
@@ -0,0 +1,223 @@
|
||||
"""Level 4: Trade Route — Campaign Navigation.
|
||||
|
||||
Tests multi-step planning ability: route optimization, trade-off analysis
|
||||
across time horizons, and adapting plans when conditions change.
|
||||
Maps to: Bannerlord campaign map navigation, caravans, and economy.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Identity of this benchmark level; copied into LevelResult defaults.
LEVEL = 4
NAME = "Trade Route (Campaign Navigation)"
DESCRIPTION = "Model must plan optimal routes and adapt to changing conditions on the campaign map."

# System message sent ahead of every scenario prompt.
SYSTEM_PROMPT = """You are a Bannerlord merchant lord planning campaign movements.
Consider distance, profitability, risk, and timing.
You MUST respond ONLY with valid JSON. No markdown. Raw JSON only."""
|
||||
|
||||
def _is_number(value):
    """Return True for an int/float JSON number.

    ``bool`` is rejected explicitly: it subclasses ``int``, so a bare
    ``isinstance(value, (int, float))`` would accept JSON ``true``/``false``.
    """
    return isinstance(value, (int, float)) and not isinstance(value, bool)


# Each scenario carries:
#   description      - short label used in results and verbose output
#   prompt           - user message sent to the model
#   check            - schema validator, called with the decoded JSON reply
#   check_desc       - human-readable summary of what ``check`` requires
#   strategic_check  - judges whether a schema-valid reply is a sound decision
#   strategic_desc   - rationale behind ``strategic_check``
SCENARIOS = [
    {
        "description": "Optimal trade route selection",
        "prompt": (
            "You are at Epicrotea with 500 gold and 20 days travel budget.\n\n"
            "Trade opportunities:\n"
            "- Route A: Epicrotea → Vlandia (3 days) → Sturgia (5 days back)\n"
            " Sell grain in Vlandia: +300 gold. Buy furs in Sturgia: costs 200, sells for 400 in Calradia.\n"
            " Total: +500 gold profit, 8 days.\n"
            "- Route B: Epicrotea → Calradia (2 days) → Aserai (4 days)\n"
            " Sell iron in Calradia: +150 gold. Buy spice in Aserai: costs 300, sells for 600 in Empire.\n"
            " Empire is 6 more days away. Total: +450 gold profit, 12 days.\n"
            "- Route C: Epicrotea → nearby village (1 day)\n"
            " Buy cheap food: costs 100, sells for 180 in any city.\n"
            " Total: +80 gold profit, 2 days. Repeatable.\n\n"
            'Choose route. Respond:\n'
            '{"route": "A"|"B"|"C", "expected_profit": <int>, "days_used": <int>, '
            '"reason": "<reasoning>", "risk": "low"|"medium"|"high"}'
        ),
        "check": lambda r: (
            r.get("route") in ("A", "B", "C")
            and _is_number(r.get("expected_profit"))
            and _is_number(r.get("days_used"))
            and r.get("risk") in ("low", "medium", "high")
        ),
        "check_desc": "route, expected_profit, days_used, risk must be valid",
        # A is best single trip, C is best if repeated
        "strategic_check": lambda r: r.get("route") in ("A", "C"),
        "strategic_desc": "Route A has best profit/day ratio; C is best if multiple loops possible",
    },
    {
        "description": "Adapt plan when war declared",
        # NOTE(review): the prompt says "equidistant" but then lists 2/3/4-day
        # distances; wording left untouched so existing results stay comparable.
        "prompt": (
            "You were heading to Vlandia to trade, 2 days into the journey.\n"
            "NEWS: Vlandia just declared war on your faction. Entering Vlandia territory is now dangerous.\n\n"
            "Your current position: borderlands, equidistant between:\n"
            "- Vlandia (2 days): Now at war — high risk of attack\n"
            "- Sturgia (3 days): Neutral — safe\n"
            "- Empire (4 days): Allied — very safe, good prices\n\n"
            "You have 400 gold of trade goods for the Vlandia market.\n"
            'What do you do? Respond:\n'
            '{"decision": "continue_to_vlandia"|"divert_to_sturgia"|"divert_to_empire", '
            '"reason": "<why>", "gold_at_risk": <int>}'
        ),
        "check": lambda r: (
            r.get("decision") in ("continue_to_vlandia", "divert_to_sturgia", "divert_to_empire")
            and _is_number(r.get("gold_at_risk"))
        ),
        "check_desc": "decision must be one of three options, gold_at_risk must be a number",
        "strategic_check": lambda r: r.get("decision") in ("divert_to_sturgia", "divert_to_empire"),
        "strategic_desc": "Should avoid active war zone — divert to safe destination",
    },
    {
        "description": "Multi-stop route planning with constraints",
        "prompt": (
            "Plan a 3-stop trading circuit starting and ending at Pravend.\n"
            "Budget: 800 gold. Time limit: 20 days.\n\n"
            "Available cities and travel times from Pravend:\n"
            "- Rhotae: 2 days (leather cheap, sells well in south)\n"
            "- Ortysia: 4 days (grain surplus — buy cheap)\n"
            "- Epicrotea: 3 days (iron market — buy/sell)\n"
            "- Pen Cannoc: 5 days (wine — high profit, far)\n\n"
            "Each stop takes 1 day for trading.\n"
            'Plan 3 stops. Respond:\n'
            '{"stops": ["<city1>", "<city2>", "<city3>"], '
            '"total_days": <int>, "estimated_profit": <int>, '
            '"reason": "<reasoning>"}'
        ),
        "check": lambda r: (
            isinstance(r.get("stops"), list)
            and len(r["stops"]) == 3
            and all(isinstance(s, str) for s in r["stops"])
            and _is_number(r.get("total_days"))
            and r["total_days"] <= 20
            and _is_number(r.get("estimated_profit"))
        ),
        "check_desc": "stops must be list of 3 strings, total_days <= 20, estimated_profit numeric",
        # Too far for 20 days
        "strategic_check": lambda r: "Pen Cannoc" not in r.get("stops", []),
        "strategic_desc": "Pen Cannoc at 5 days each way is likely too far for a 20-day circuit",
    },
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScenarioResult:
|
||||
scenario_index: int
|
||||
description: str
|
||||
raw_response: str
|
||||
parsed: dict | None
|
||||
valid_json: bool
|
||||
schema_valid: bool
|
||||
strategically_sound: bool
|
||||
latency_ms: float
|
||||
error: str = ""
|
||||
|
||||
|
||||
@dataclass
class LevelResult:
    """Aggregate result for the trade-route level."""

    level: int = LEVEL
    name: str = NAME
    # One entry per scenario attempted, in execution order.
    trials: list[ScenarioResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0  # fraction of scenarios with valid JSON that matched the schema
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
|
||||
|
||||
|
||||
def _clean_response(raw: str) -> str:
|
||||
raw = raw.strip()
|
||||
if raw.startswith("```"):
|
||||
lines = raw.splitlines()
|
||||
lines = [l for l in lines if not l.startswith("```")]
|
||||
raw = "\n".join(lines).strip()
|
||||
return raw
|
||||
|
||||
|
||||
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every trade-route scenario against ``model`` and score the replies.

    Args:
        client: Object exposing ``chat(model=..., messages=..., options=...)``
            whose return value supports ``["message"]["content"]``.
        model: Model name forwarded to ``client.chat``.
        verbose: When true, print one status line per scenario.

    Returns:
        A ``LevelResult`` holding one ``ScenarioResult`` per scenario, the
        fraction of scenarios with valid JSON that matched the schema, the
        pass flag and latency statistics.
    """
    result = LevelResult()
    latencies = []

    for i, scenario in enumerate(SCENARIOS):
        # perf_counter is monotonic, so wall-clock adjustments cannot skew latency.
        started = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": scenario["prompt"]},
                ],
                options={"temperature": 0.2},
            )
            # A None/empty content is recorded as an (invalid) empty reply
            # instead of crashing the cleaning step below.
            raw = response["message"]["content"] or ""
            latency_ms = (time.perf_counter() - started) * 1000
        except Exception as exc:
            # A failed request counts as a failed trial; keep going with the rest.
            latency_ms = (time.perf_counter() - started) * 1000
            result.trials.append(
                ScenarioResult(
                    scenario_index=i,
                    description=scenario["description"],
                    raw_response="",
                    parsed=None,
                    valid_json=False,
                    schema_valid=False,
                    strategically_sound=False,
                    latency_ms=latency_ms,
                    error=str(exc),
                )
            )
            if verbose:
                print(f" Scenario {i}: ERROR — {exc}")
            continue

        latencies.append(latency_ms)

        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        strategically_sound = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = scenario["check"](parsed)
            # Strategy is only judged for replies that satisfy the schema.
            if schema_valid:
                strategically_sound = scenario["strategic_check"](parsed)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        except Exception as exc:
            # e.g. the reply decoded to a non-object and the check could not run.
            error = f"Validation error: {exc}"

        result.trials.append(
            ScenarioResult(
                scenario_index=i,
                description=scenario["description"],
                raw_response=raw,
                parsed=parsed,
                valid_json=valid_json,
                schema_valid=schema_valid,
                strategically_sound=strategically_sound,
                latency_ms=latency_ms,
                error=error,
            )
        )

        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            strat = "strategic" if strategically_sound else "suboptimal"
            print(
                f" Scenario {i} [{scenario['description']}]: {status} ({strat}) "
                f"| {latency_ms:.0f}ms"
            )
            if not schema_valid and valid_json:
                print(f" Schema issue: {scenario['check_desc']}")

    valid_count = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    total = len(SCENARIOS)
    result.score = valid_count / total if total else 0.0
    # Pass at two thirds or better. Compared in integers because 2/3 as a
    # float (0.666...) fell below the former 0.67 cut-off, so 2 of 3 valid
    # scenarios could never pass.
    result.passed = total > 0 and 3 * valid_count >= 2 * total

    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # The slowest sample is reported as the p99 value.
        result.latency_p99_ms = latencies_sorted[-1]

    return result
|
||||
252
timmy-benchmark/levels/level_5_mini_campaign.py
Normal file
252
timmy-benchmark/levels/level_5_mini_campaign.py
Normal file
@@ -0,0 +1,252 @@
|
||||
"""Level 5: Mini Campaign — Full Campaign Loop.
|
||||
|
||||
Tests multi-turn strategic coherence: the model must maintain state across
|
||||
several turns of a simulated Bannerlord campaign, making consistent decisions
|
||||
that build toward a long-term goal.
|
||||
Maps to: Full Bannerlord campaign loop — economy, diplomacy, conquest.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Identity of this benchmark level; copied into LevelResult defaults.
LEVEL = 5
NAME = "Mini Campaign (Full Campaign Loop)"
DESCRIPTION = "Multi-turn strategic planning maintaining coherent goals across 4 turns."

# System message opening the multi-turn conversation: rules, starting
# position, win conditions and the action menu.
# NOTE(review): the largest listed gold gain is +300 per turn, so four turns
# from 1000 gold cannot exceed the 3000 target - confirm this is intended.
SYSTEM_PROMPT = """You are Timmy, a Bannerlord lord with ambitions to become King of Calradia.
You have 4 turns to establish a power base. Each turn represents 2 weeks of in-game time.

Your starting position:
- Clan tier: 1 (minor lord)
- Gold: 1000
- Troops: 25 (mixed infantry/cavalry)
- Renown: 150
- Relations: Neutral with all factions

Winning requires: Gold > 3000 AND Renown > 400 AND Own 1+ settlement by Turn 4.

Each turn, choose ONE primary action:
- "raid_village": +200 gold, -50 relations target faction, +30 renown, risk of retaliation
- "trade_circuit": +300 gold, 0 relation change, +10 renown, no risk
- "escort_caravan": +150 gold, +20 relations with faction, +20 renown
- "tournament": costs 100 gold, +60 renown, +20 relations with host faction
- "recruit_troops": costs 200 gold, +15 troops, no other change
- "siege_castle": costs 500 gold + 200 troops morale, -100 relations, +80 renown, +1 settlement if succeed (30% base chance)
- "pledge_vassalage": 0 cost, +100 relations with liege, +50 renown, lose independence

You MUST respond ONLY with valid JSON for each turn. Raw JSON only."""
|
||||
|
||||
|
||||
def run(client: Any, model: str, verbose: bool = False) -> "LevelResult":
    """Run a 4-turn mini campaign, tracking state and decision quality.

    Each turn sends the current game state to the model, validates the JSON
    reply, applies the chosen action to the simulated state and appends the
    reply to the conversation. A failed request ends the campaign early.

    Args:
        client: Object exposing ``chat(model=..., messages=..., options=...)``
            whose return value supports ``["message"]["content"]``.
        model: Model name forwarded to ``client.chat``.
        verbose: When true, print per-turn status and a final summary.

    Returns:
        A ``LevelResult`` with per-turn records, the final state, the
        win-condition flags, the score (valid turns out of 4) and latency
        statistics.
    """
    result = LevelResult()

    # Initial game state
    state = {
        "turn": 1,
        "gold": 1000,
        "troops": 25,
        "renown": 150,
        "settlements": 0,
        "relations": {"vlandia": 0, "sturgia": 0, "empire": 0, "aserai": 0, "battania": 0},
    }

    def snapshot() -> dict:
        # dict(state) alone would share the nested "relations" dict, so later
        # turns would rewrite the relations shown in earlier snapshots.
        return {**state, "relations": dict(state["relations"])}

    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
    turns_passed = []
    total_latency = []

    valid_actions = [
        "raid_village", "trade_circuit", "escort_caravan", "tournament",
        "recruit_troops", "siege_castle", "pledge_vassalage",
    ]

    for turn_num in range(1, 5):
        state["turn"] = turn_num
        state_str = json.dumps(state, indent=2)

        prompt = (
            f"=== TURN {turn_num} / 4 ===\n"
            f"Current state:\n{state_str}\n\n"
            f"Win conditions remaining: Gold > 3000 ({state['gold']}/3000), "
            f"Renown > 400 ({state['renown']}/400), Settlements >= 1 ({state['settlements']}/1)\n\n"
            f"Choose your action for Turn {turn_num}.\n"
            f'Respond: {{"action": "<action>", "target_faction": "<faction or null>", '
            f'"reason": "<strategic reasoning>", "goal": "<what this advances>"}}'
        )

        conversation.append({"role": "user", "content": prompt})

        # perf_counter is monotonic, so wall-clock adjustments cannot skew latency.
        started = time.perf_counter()
        try:
            response = client.chat(
                model=model,
                messages=conversation,
                options={"temperature": 0.3},
            )
            # A None/empty content is treated as an empty (invalid) reply.
            raw = response["message"]["content"] or ""
            latency_ms = (time.perf_counter() - started) * 1000
        except Exception as exc:
            latency_ms = (time.perf_counter() - started) * 1000
            turns_passed.append(
                TurnResult(
                    turn=turn_num,
                    state_before=snapshot(),
                    raw_response="",
                    parsed=None,
                    valid_json=False,
                    valid_action=False,
                    action=None,
                    latency_ms=latency_ms,
                    error=str(exc),
                )
            )
            if verbose:
                print(f" Turn {turn_num}: ERROR — {exc}")
            # Without a reply the conversation cannot continue coherently.
            break

        total_latency.append(latency_ms)

        # Clean and parse response
        cleaned = raw.strip()
        if cleaned.startswith("```"):
            cleaned = "\n".join(
                line for line in cleaned.splitlines() if not line.startswith("```")
            ).strip()

        parsed = None
        valid_json = False
        valid_action = False
        action = None
        error = ""

        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"
        else:
            valid_json = True
            if isinstance(parsed, dict):
                action = parsed.get("action")
                valid_action = isinstance(action, str) and action in valid_actions
            else:
                # Valid JSON that is not an object (list, number, ...) has no action.
                error = "Validation error: response is not a JSON object"

        turns_passed.append(
            TurnResult(
                turn=turn_num,
                state_before=snapshot(),
                raw_response=raw,
                parsed=parsed,
                valid_json=valid_json,
                valid_action=valid_action,
                action=action,
                latency_ms=latency_ms,
                error=error,
            )
        )

        # Add model response to conversation for continuity
        conversation.append({"role": "assistant", "content": raw})

        # Apply state changes based on action
        if valid_action:
            target = parsed.get("target_faction")
            # Only a string can name a faction; anything else is treated as "no target".
            _apply_action(state, action, target if isinstance(target, str) else None)

        if verbose:
            status = "PASS" if (valid_json and valid_action) else "FAIL"
            print(
                f" Turn {turn_num}: {status} | action={action} | {latency_ms:.0f}ms | "
                f"gold={state['gold']} renown={state['renown']} settlements={state['settlements']}"
            )

    result.turns = turns_passed
    result.final_state = snapshot()

    # Win condition check: gold and renown are strict, matching the
    # "Gold > 3000" / "Renown > 400" wording given to the model.
    result.reached_gold_target = state["gold"] > 3000
    result.reached_renown_target = state["renown"] > 400
    result.reached_settlement_target = state["settlements"] >= 1

    # Score: % of turns with valid JSON + valid action
    valid_turns = sum(1 for t in turns_passed if t.valid_json and t.valid_action)
    result.score = valid_turns / 4 if turns_passed else 0.0
    result.passed = result.score >= 0.75  # 3/4 turns valid

    if total_latency:
        latencies_sorted = sorted(total_latency)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        # The slowest sample is reported as the p99 value.
        result.latency_p99_ms = latencies_sorted[-1]

    if verbose:
        win_status = []
        if result.reached_gold_target:
            win_status.append("GOLD")
        if result.reached_renown_target:
            win_status.append("RENOWN")
        if result.reached_settlement_target:
            win_status.append("SETTLEMENT")
        print(f" Win conditions met: {win_status or 'none'}")
        print(f" Final: gold={state['gold']} renown={state['renown']} settlements={state['settlements']}")

    return result
|
||||
|
||||
|
||||
def _apply_action(state: dict, action: str, target_faction: str | None) -> None:
|
||||
"""Simulate game state changes for a given action."""
|
||||
if action == "raid_village":
|
||||
state["gold"] += 200
|
||||
state["renown"] += 30
|
||||
if target_faction and target_faction in state["relations"]:
|
||||
state["relations"][target_faction] -= 50
|
||||
elif action == "trade_circuit":
|
||||
state["gold"] += 300
|
||||
state["renown"] += 10
|
||||
elif action == "escort_caravan":
|
||||
state["gold"] += 150
|
||||
state["renown"] += 20
|
||||
if target_faction and target_faction in state["relations"]:
|
||||
state["relations"][target_faction] += 20
|
||||
elif action == "tournament":
|
||||
state["gold"] -= 100
|
||||
state["renown"] += 60
|
||||
if target_faction and target_faction in state["relations"]:
|
||||
state["relations"][target_faction] += 20
|
||||
elif action == "recruit_troops":
|
||||
state["gold"] -= 200
|
||||
state["troops"] += 15
|
||||
elif action == "siege_castle":
|
||||
state["gold"] -= 500
|
||||
state["renown"] += 80
|
||||
# 30% chance success (deterministic sim: succeed on turn 3+ if attempted)
|
||||
if state["turn"] >= 3:
|
||||
state["settlements"] += 1
|
||||
if target_faction and target_faction in state["relations"]:
|
||||
state["relations"][target_faction] -= 100
|
||||
elif action == "pledge_vassalage":
|
||||
state["renown"] += 50
|
||||
if target_faction and target_faction in state["relations"]:
|
||||
state["relations"][target_faction] += 100
|
||||
|
||||
|
||||
@dataclass
|
||||
class TurnResult:
|
||||
turn: int
|
||||
state_before: dict
|
||||
raw_response: str
|
||||
parsed: dict | None
|
||||
valid_json: bool
|
||||
valid_action: bool
|
||||
action: str | None
|
||||
latency_ms: float
|
||||
error: str = ""
|
||||
|
||||
|
||||
@dataclass
class LevelResult:
    """Aggregate result for the mini-campaign level."""

    level: int = LEVEL
    name: str = NAME
    # One entry per turn attempted, in order.
    turns: list[TurnResult] = field(default_factory=list)
    # Game state after the last applied action.
    final_state: dict = field(default_factory=dict)
    passed: bool = False
    score: float = 0.0  # share of the 4 turns with valid JSON and a valid action
    # Win-condition flags evaluated on the final state.
    reached_gold_target: bool = False
    reached_renown_target: bool = False
    reached_settlement_target: bool = False
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0
|
||||
476
timmy-benchmark/results/qwen2.5_14b_20260323_142119.json
Normal file
476
timmy-benchmark/results/qwen2.5_14b_20260323_142119.json
Normal file
@@ -0,0 +1,476 @@
|
||||
{
|
||||
"model": "qwen2.5:14b",
|
||||
"timestamp": "2026-03-23T18:20:27.095190+00:00",
|
||||
"skipped": false,
|
||||
"levels": {
|
||||
"0": {
|
||||
"level": 0,
|
||||
"name": "JSON Compliance (Coin Flip)",
|
||||
"trials": [
|
||||
{
|
||||
"trial_index": 0,
|
||||
"prompt": "A coin is flipped. Respond with exactly: {\"choice\": \"heads\"} or {\"choice\": \"tails\"}. Pick one. JSON only.",
|
||||
"raw_response": "{\"choice\": \"heads\"}",
|
||||
"parsed": {
|
||||
"choice": "heads"
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"value_valid": true,
|
||||
"latency_ms": 6347.5000858306885,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"trial_index": 1,
|
||||
"prompt": "You must attack or defend. Respond with: {\"action\": \"attack\", \"confidence\": 0.8} or {\"action\": \"defend\", \"confidence\": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.",
|
||||
"raw_response": "{\"action\": \"defend\", \"confidence\": 0.6}",
|
||||
"parsed": {
|
||||
"action": "defend",
|
||||
"confidence": 0.6
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"value_valid": true,
|
||||
"latency_ms": 850.42405128479,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"trial_index": 2,
|
||||
"prompt": "Choose a direction to march. Respond with exactly: {\"direction\": \"north\", \"reason\": \"string explaining why\"}. Pick north/south/east/west. JSON only.",
|
||||
"raw_response": "{\"direction\": \"north\", \"reason\": \"There is a reported enemy encampment to the north that we need to engage.\"}",
|
||||
"parsed": {
|
||||
"direction": "north",
|
||||
"reason": "There is a reported enemy encampment to the north that we need to engage."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"value_valid": true,
|
||||
"latency_ms": 1443.0510997772217,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"passed": true,
|
||||
"score": 1.0,
|
||||
"latency_p50_ms": 1443.0510997772217,
|
||||
"latency_p99_ms": 6347.5000858306885
|
||||
},
|
||||
"1": {
|
||||
"level": 1,
|
||||
"name": "Board State Tracking (Tic-Tac-Toe)",
|
||||
"trials": [
|
||||
{
|
||||
"scenario_index": 0,
|
||||
"description": "Empty board \u2014 opening move",
|
||||
"prompt": "Board state: [null,null,null,null,null,null,null,null,null]. You are X. It is your turn. Choose a move. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
||||
"raw_response": "{\"move\": 4, \"reason\": \"Control the center\"}",
|
||||
"parsed": {
|
||||
"move": 4,
|
||||
"reason": "Control the center"
|
||||
},
|
||||
"valid_json": true,
|
||||
"move_legal": true,
|
||||
"move_correct": true,
|
||||
"latency_ms": 1184.3791007995605,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 1,
|
||||
"description": "Block opponent's winning move",
|
||||
"prompt": "Board: [\"O\",null,\"O\",null,\"X\",null,null,null,null]. O has positions 0 and 2. You are X. O will win on next turn unless you block. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
||||
"raw_response": "{\"move\": 4, \"reason\": \"Block O's winning move\"}",
|
||||
"parsed": {
|
||||
"move": 4,
|
||||
"reason": "Block O's winning move"
|
||||
},
|
||||
"valid_json": true,
|
||||
"move_legal": false,
|
||||
"move_correct": false,
|
||||
"latency_ms": 942.816972732544,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 2,
|
||||
"description": "Take winning move",
|
||||
"prompt": "Board: [\"X\",null,\"X\",null,\"O\",null,null,\"O\",null]. You are X. You have positions 0 and 2. You can win this turn. Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
||||
"raw_response": "{\"move\": 2, \"reason\": \"Winning move\"}",
|
||||
"parsed": {
|
||||
"move": 2,
|
||||
"reason": "Winning move"
|
||||
},
|
||||
"valid_json": true,
|
||||
"move_legal": false,
|
||||
"move_correct": false,
|
||||
"latency_ms": 849.2491245269775,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 3,
|
||||
"description": "Legal move on partially filled board",
|
||||
"prompt": "Board: [\"X\",\"O\",\"X\",\"O\",\"X\",\"O\",null,null,null]. You are O. Choose a legal move (positions 6, 7, or 8 are available). Respond: {\"move\": <0-8>, \"reason\": \"<why>\"}",
|
||||
"raw_response": "{\"move\": 7, \"reason\": \"Creating a fork opportunity\"}",
|
||||
"parsed": {
|
||||
"move": 7,
|
||||
"reason": "Creating a fork opportunity"
|
||||
},
|
||||
"valid_json": true,
|
||||
"move_legal": true,
|
||||
"move_correct": true,
|
||||
"latency_ms": 885.8909606933594,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"passed": false,
|
||||
"score": 0.5,
|
||||
"latency_p50_ms": 942.816972732544,
|
||||
"latency_p99_ms": 1184.3791007995605
|
||||
},
|
||||
"2": {
|
||||
"level": 2,
|
||||
"name": "Resource Management (Party Economy)",
|
||||
"trials": [
|
||||
{
|
||||
"scenario_index": 0,
|
||||
"description": "Budget allocation under constraint",
|
||||
"raw_response": "{\"choices\": [\"buy_food\", \"repair_armor\"], \"gold_spent\": 350, \"reason\": \"Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles.\"}",
|
||||
"parsed": {
|
||||
"choices": [
|
||||
"buy_food",
|
||||
"repair_armor"
|
||||
],
|
||||
"gold_spent": 350,
|
||||
"reason": "Morale is critical, so buying food is the priority to prevent desertion. Repairing armor is also crucial to reduce casualties in the upcoming battles."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 3121.7379570007324,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 1,
|
||||
"description": "Troop tier upgrade decision",
|
||||
"raw_response": "{\"action\": \"upgrade_recruits\", \"reason\": \"Improving the strength of the weakest troops can balance the party's overall combat effectiveness.\", \"expected_outcome\": \"Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers.\"}",
|
||||
"parsed": {
|
||||
"action": "upgrade_recruits",
|
||||
"reason": "Improving the strength of the weakest troops can balance the party's overall combat effectiveness.",
|
||||
"expected_outcome": "Enhanced combat capability against the raider camp with a mix of Tier-2 and Tier-3 soldiers."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 2936.119794845581,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 2,
|
||||
"description": "Multi-turn planning horizon",
|
||||
"raw_response": "{\"extra_food_days\": 5, \"cost\": 50, \"remaining_gold\": 250, \"reason\": \"To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now.\"}",
|
||||
"parsed": {
|
||||
"extra_food_days": 5,
|
||||
"cost": 50,
|
||||
"remaining_gold": 250,
|
||||
"reason": "To ensure survival, buy enough food to cover the 15-day requirement, spending the minimum necessary now."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 2515.505790710449,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"passed": true,
|
||||
"score": 1.0,
|
||||
"latency_p50_ms": 2936.119794845581,
|
||||
"latency_p99_ms": 3121.7379570007324
|
||||
},
|
||||
"3": {
|
||||
"level": 3,
|
||||
"name": "Battle Tactics (Formation Commands)",
|
||||
"trials": [
|
||||
{
|
||||
"scenario_index": 0,
|
||||
"description": "Ranged vs infantry \u2014 defensive opening",
|
||||
"raw_response": "{\"infantry_order\": \"shield_wall\", \"archer_order\": \"skirmish\", \"reason\": \"Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.\", \"expected_outcome\": \"Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact.\"}",
|
||||
"parsed": {
|
||||
"infantry_order": "shield_wall",
|
||||
"archer_order": "skirmish",
|
||||
"reason": "Infantry should prepare for incoming melee by forming a defensive posture. Archers should engage enemy from a distance to maximize casualties before close combat.",
|
||||
"expected_outcome": "Inflict significant casualties on the enemy heavy infantry while minimizing losses until melee contact."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 3828.04012298584,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 1,
|
||||
"description": "Outnumbered \u2014 retreat decision",
|
||||
"raw_response": "{\"order\": \"skirmish\", \"direction\": \"east\", \"reason\": \"Engage the enemy at range to buy time for an organized retreat.\", \"priority\": \"preserve_troops\"}",
|
||||
"parsed": {
|
||||
"order": "skirmish",
|
||||
"direction": "east",
|
||||
"reason": "Engage the enemy at range to buy time for an organized retreat.",
|
||||
"priority": "preserve_troops"
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": false,
|
||||
"latency_ms": 2162.7819538116455,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 2,
|
||||
"description": "Cavalry flanking opportunity",
|
||||
"raw_response": "{\"cavalry_order\": \"flank_right\", \"timing\": \"now\", \"reason\": \"Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.\", \"risk\": \"low\"}",
|
||||
"parsed": {
|
||||
"cavalry_order": "flank_right",
|
||||
"timing": "now",
|
||||
"reason": "Enemy left flank is unprotected and vulnerable, providing an opportunity to break their line quickly.",
|
||||
"risk": "low"
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 2248.1229305267334,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"passed": true,
|
||||
"score": 1.0,
|
||||
"latency_p50_ms": 2248.1229305267334,
|
||||
"latency_p99_ms": 3828.04012298584
|
||||
},
|
||||
"4": {
|
||||
"level": 4,
|
||||
"name": "Trade Route (Campaign Navigation)",
|
||||
"trials": [
|
||||
{
|
||||
"scenario_index": 0,
|
||||
"description": "Optimal trade route selection",
|
||||
"raw_response": "{\"route\": \"A\", \"expected_profit\": 500, \"days_used\": 8, \"reason\": \"Offers the highest profit with a reasonable travel time.\", \"risk\": \"medium\"}",
|
||||
"parsed": {
|
||||
"route": "A",
|
||||
"expected_profit": 500,
|
||||
"days_used": 8,
|
||||
"reason": "Offers the highest profit with a reasonable travel time.",
|
||||
"risk": "medium"
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 3142.971992492676,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 1,
|
||||
"description": "Adapt plan when war declared",
|
||||
"raw_response": "{\"decision\": \"divert_to_sturgia\", \"reason\": \"Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.\", \"gold_at_risk\": 400}",
|
||||
"parsed": {
|
||||
"decision": "divert_to_sturgia",
|
||||
"reason": "Vlandia is at war with my faction, making the journey dangerous. Sturgia is neutral and safer, though it's slightly further. The Empire is allied and safest but further still.",
|
||||
"gold_at_risk": 400
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 3317.8718090057373,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"scenario_index": 2,
|
||||
"description": "Multi-stop route planning with constraints",
|
||||
"raw_response": "{\"stops\": [\"Ortysia\", \"Epicrotea\", \"Rhotae\"], \"total_days\": 12, \"estimated_profit\": 1200, \"reason\": \"Optimized for time and profit, focusing on essential goods with high demand and low risk.\"}",
|
||||
"parsed": {
|
||||
"stops": [
|
||||
"Ortysia",
|
||||
"Epicrotea",
|
||||
"Rhotae"
|
||||
],
|
||||
"total_days": 12,
|
||||
"estimated_profit": 1200,
|
||||
"reason": "Optimized for time and profit, focusing on essential goods with high demand and low risk."
|
||||
},
|
||||
"valid_json": true,
|
||||
"schema_valid": true,
|
||||
"strategically_sound": true,
|
||||
"latency_ms": 3235.442876815796,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"passed": true,
|
||||
"score": 1.0,
|
||||
"latency_p50_ms": 3235.442876815796,
|
||||
"latency_p99_ms": 3317.8718090057373
|
||||
},
|
||||
"5": {
|
||||
"level": 5,
|
||||
"name": "Mini Campaign (Full Campaign Loop)",
|
||||
"turns": [
|
||||
{
|
||||
"turn": 1,
|
||||
"state_before": {
|
||||
"turn": 1,
|
||||
"gold": 1000,
|
||||
"troops": 25,
|
||||
"renown": 150,
|
||||
"settlements": 0,
|
||||
"relations": {
|
||||
"vlandia": 0,
|
||||
"sturgia": 0,
|
||||
"empire": -80,
|
||||
"aserai": 0,
|
||||
"battania": 0
|
||||
}
|
||||
},
|
||||
"raw_response": "{\"action\": \"trade_circuit\", \"target_faction\": null, \"reason\": \"Increase gold and renown without risking relations or encountering retaliation.\", \"goal\": \"Economic stability and reputation building\"}",
|
||||
"parsed": {
|
||||
"action": "trade_circuit",
|
||||
"target_faction": null,
|
||||
"reason": "Increase gold and renown without risking relations or encountering retaliation.",
|
||||
"goal": "Economic stability and reputation building"
|
||||
},
|
||||
"valid_json": true,
|
||||
"valid_action": true,
|
||||
"action": "trade_circuit",
|
||||
"latency_ms": 3970.310926437378,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"turn": 2,
|
||||
"state_before": {
|
||||
"turn": 2,
|
||||
"gold": 1300,
|
||||
"troops": 25,
|
||||
"renown": 160,
|
||||
"settlements": 0,
|
||||
"relations": {
|
||||
"vlandia": 0,
|
||||
"sturgia": 0,
|
||||
"empire": -80,
|
||||
"aserai": 0,
|
||||
"battania": 0
|
||||
}
|
||||
},
|
||||
"raw_response": "{\"action\": \"escort_caravan\", \"target_faction\": \"empire\", \"reason\": \"Improve relations with a major faction and gain additional renown.\", \"goal\": \"Political alliance and reputation advancement\"}",
|
||||
"parsed": {
|
||||
"action": "escort_caravan",
|
||||
"target_faction": "empire",
|
||||
"reason": "Improve relations with a major faction and gain additional renown.",
|
||||
"goal": "Political alliance and reputation advancement"
|
||||
},
|
||||
"valid_json": true,
|
||||
"valid_action": true,
|
||||
"action": "escort_caravan",
|
||||
"latency_ms": 2754.3041706085205,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"turn": 3,
|
||||
"state_before": {
|
||||
"turn": 3,
|
||||
"gold": 1450,
|
||||
"troops": 25,
|
||||
"renown": 180,
|
||||
"settlements": 0,
|
||||
"relations": {
|
||||
"vlandia": 0,
|
||||
"sturgia": 0,
|
||||
"empire": -80,
|
||||
"aserai": 0,
|
||||
"battania": 0
|
||||
}
|
||||
},
|
||||
"raw_response": "{\"action\": \"recruit_troops\", \"target_faction\": null, \"reason\": \"Increase military strength to prepare for a potential siege or defense.\", \"goal\": \"Military readiness and expansion capability\"}",
|
||||
"parsed": {
|
||||
"action": "recruit_troops",
|
||||
"target_faction": null,
|
||||
"reason": "Increase military strength to prepare for a potential siege or defense.",
|
||||
"goal": "Military readiness and expansion capability"
|
||||
},
|
||||
"valid_json": true,
|
||||
"valid_action": true,
|
||||
"action": "recruit_troops",
|
||||
"latency_ms": 2931.4141273498535,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"turn": 4,
|
||||
"state_before": {
|
||||
"turn": 4,
|
||||
"gold": 1250,
|
||||
"troops": 40,
|
||||
"renown": 180,
|
||||
"settlements": 0,
|
||||
"relations": {
|
||||
"vlandia": 0,
|
||||
"sturgia": 0,
|
||||
"empire": -80,
|
||||
"aserai": 0,
|
||||
"battania": 0
|
||||
}
|
||||
},
|
||||
"raw_response": "{\"action\": \"siege_castle\", \"target_faction\": \"empire\", \"reason\": \"Attempt to capture a settlement to meet the win condition of owning at least one settlement.\", \"goal\": \"Territorial expansion and victory condition\"}",
|
||||
"parsed": {
|
||||
"action": "siege_castle",
|
||||
"target_faction": "empire",
|
||||
"reason": "Attempt to capture a settlement to meet the win condition of owning at least one settlement.",
|
||||
"goal": "Territorial expansion and victory condition"
|
||||
},
|
||||
"valid_json": true,
|
||||
"valid_action": true,
|
||||
"action": "siege_castle",
|
||||
"latency_ms": 3413.828134536743,
|
||||
"error": ""
|
||||
}
|
||||
],
|
||||
"final_state": {
|
||||
"turn": 4,
|
||||
"gold": 750,
|
||||
"troops": 40,
|
||||
"renown": 260,
|
||||
"settlements": 1,
|
||||
"relations": {
|
||||
"vlandia": 0,
|
||||
"sturgia": 0,
|
||||
"empire": -80,
|
||||
"aserai": 0,
|
||||
"battania": 0
|
||||
}
|
||||
},
|
||||
"passed": true,
|
||||
"score": 1.0,
|
||||
"reached_gold_target": false,
|
||||
"reached_renown_target": false,
|
||||
"reached_settlement_target": true,
|
||||
"latency_p50_ms": 3413.828134536743,
|
||||
"latency_p99_ms": 3970.310926437378
|
||||
}
|
||||
},
|
||||
"summary": {
|
||||
"total_elapsed_s": 52.1,
|
||||
"levels_run": [
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5
|
||||
],
|
||||
"levels_passed": [
|
||||
0,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5
|
||||
],
|
||||
"levels_failed": [
|
||||
1
|
||||
],
|
||||
"m1_gate_passed": false,
|
||||
"m1_gate_notes": [
|
||||
"Level 1 FAILED (score=0.50)"
|
||||
],
|
||||
"m1_latency_threshold_ms": 10000
|
||||
}
|
||||
}
|
||||
259
timmy-benchmark/run_benchmark.py
Normal file
259
timmy-benchmark/run_benchmark.py
Normal file
@@ -0,0 +1,259 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Timmy Cognitive Benchmark Harness — Project Bannerlord M0.
|
||||
|
||||
Runs a 6-level cognitive benchmark against an Ollama model to assess
|
||||
readiness for autonomous Bannerlord gameplay.
|
||||
|
||||
Usage:
|
||||
python run_benchmark.py --model qwen2.5:14b --verbose
|
||||
python run_benchmark.py --model qwen3:14b --levels 0,1,2
|
||||
python run_benchmark.py --model qwen2.5:14b --output results/my_run.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import ollama
|
||||
except ImportError:
|
||||
print("ERROR: 'ollama' package not installed. Run: pip install ollama", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Add parent dir to path so levels can be imported
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from levels import level_0_coin_flip
|
||||
from levels import level_1_tic_tac_toe
|
||||
from levels import level_2_resource_mgmt
|
||||
from levels import level_3_battle_tactics
|
||||
from levels import level_4_trade_route
|
||||
from levels import level_5_mini_campaign
|
||||
|
||||
# Benchmark level modules, ordered by level number. run_benchmark() addresses a
# level by its position in this list, so the order here is significant.
ALL_LEVELS = [
    level_0_coin_flip,
    level_1_tic_tac_toe,
    level_2_resource_mgmt,
    level_3_battle_tactics,
    level_4_trade_route,
    level_5_mini_campaign,
]

# Pass criteria for the M1 gate.
# Both Level 0 and Level 1 must pass for the gate to pass.
M1_GATE_LEVELS = {0, 1}
# Upper bound on per-decision latency for the gate levels (10 s); run_benchmark()
# compares it against each gate level's p99 latency.
M1_LATENCY_THRESHOLD_MS = 10_000
|
||||
|
||||
|
||||
def _dataclass_to_dict(obj):
    """Recursively convert dataclass instances to plain JSON-friendly containers.

    Args:
        obj: Any value. Dataclass *instances* become dicts keyed by field name;
            dicts, lists and tuples are walked recursively; everything else
            (including dataclass *types*) is returned unchanged.

    Returns:
        A structure made of dicts and lists with the original leaf values.
        Tuples are returned as lists, which is also how ``json`` encodes them.
    """
    # is_dataclass() is also true for the class object itself; only instances
    # carry field values, so classes fall through and are returned as-is.
    if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
        return {
            f.name: _dataclass_to_dict(getattr(obj, f.name))
            for f in dataclasses.fields(obj)
        }
    if isinstance(obj, dict):
        return {k: _dataclass_to_dict(v) for k, v in obj.items()}
    # Tuples are walked too: a dataclass hidden inside a tuple would otherwise
    # reach json.dump() unconverted and be stringified by a ``default=str`` hook.
    if isinstance(obj, (list, tuple)):
        return [_dataclass_to_dict(item) for item in obj]
    return obj
|
||||
|
||||
|
||||
def _normalize_model_name(name: str) -> str:
    """Return ``name`` with an explicit tag, appending ``:latest`` when it has none."""
    # Only a colon in the last path segment is a tag separator; an earlier colon
    # can belong to a registry host:port prefix.
    last_segment = name.rsplit("/", 1)[-1]
    return name if ":" in last_segment else f"{name}:latest"


def check_model_available(model: str) -> bool:
    """Return True if ``model`` is installed in the local Ollama instance.

    Names are compared after tag normalisation: an untagged name is treated as
    ``<name>:latest``, and a tagged name must match exactly. A different tag of
    the same base model (e.g. ``qwen2.5:7b`` when ``qwen2.5:14b`` was requested)
    does not count as available.

    Args:
        model: Ollama model name, with or without a tag.

    Returns:
        True when the model is listed by Ollama. False when it is not listed or
        when the listing could not be obtained; in the latter case the reason
        is printed to stderr.
    """
    try:
        listing = ollama.list()
        installed = {
            _normalize_model_name(entry["model"])
            for entry in listing.get("models", [])
            if entry["model"]
        }
    except Exception as exc:
        # Best effort: an unreachable server or an unexpected response shape is
        # reported as "not available", but the cause is surfaced instead of
        # being swallowed silently.
        print(f" WARNING: could not list Ollama models: {exc}", file=sys.stderr)
        return False
    return _normalize_model_name(model) in installed
|
||||
|
||||
|
||||
def run_benchmark(
    model: str,
    levels_to_run: list[int] | None = None,
    verbose: bool = False,
    skip_missing: bool = True,
) -> dict:
    """Run the selected benchmark levels against ``model`` and return a results dict.

    Progress and a final scorecard are printed to stdout.

    Args:
        model: Ollama model name, forwarded to each level's ``run()``.
        levels_to_run: Indices into ``ALL_LEVELS``; ``None`` runs every level.
            Indices outside ``0 .. len(ALL_LEVELS) - 1`` are skipped with a
            warning.
        verbose: Forwarded to each level's ``run()``.
        skip_missing: What to do when the model is not available. ``True``
            returns a result marked ``"skipped": True``; ``False`` prints an
            error and exits the process with status 1.

    Returns:
        A dict with ``model``, ``timestamp``, ``skipped``, ``levels`` (results
        keyed by the level index as a string) and ``summary``. For a skipped
        run only ``model``, ``skipped``, ``reason`` and ``timestamp`` are set.
        ``summary["levels_errored"]`` lists levels whose ``run()`` raised.

    The M1 gate fails when a level in ``M1_GATE_LEVELS`` ran and failed, exceeded
    ``M1_LATENCY_THRESHOLD_MS`` at p99, or raised. Gate levels that were not
    requested are not evaluated.
    """
    if levels_to_run is None:
        levels_to_run = list(range(len(ALL_LEVELS)))

    rule = "=" * 60
    print(f"\n{rule}")
    print(" Timmy Cognitive Benchmark — Project Bannerlord M0")
    print(rule)
    print(f" Model: {model}")
    print(f" Levels: {levels_to_run}")
    print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{rule}\n")

    if not check_model_available(model):
        if skip_missing:
            print(f" WARNING: Model '{model}' not found in Ollama. Skipping.\n")
            return {
                "model": model,
                "skipped": True,
                "reason": f"Model '{model}' not available",
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }
        print(f" ERROR: Model '{model}' not found in Ollama.", file=sys.stderr)
        sys.exit(1)

    # The ollama module itself is handed to the levels as the client object.
    client = ollama

    results = {
        "model": model,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "skipped": False,
        "levels": {},
        "summary": {},
    }

    level_results = {}
    errored_levels = []
    total_start = time.perf_counter()

    for level_idx in levels_to_run:
        # Reject negative indices as well: ALL_LEVELS[-1] would silently run the
        # last level under the wrong key and bypass the gate checks.
        if not 0 <= level_idx < len(ALL_LEVELS):
            print(f" WARNING: Level {level_idx} does not exist, skipping.")
            continue

        module = ALL_LEVELS[level_idx]
        print(f"Level {module.LEVEL}: {module.NAME}")
        print(f" {module.DESCRIPTION}")

        try:
            level_result = module.run(client, model, verbose=verbose)
        except Exception as exc:
            # One broken level must not abort the whole run, but it is recorded
            # so that it can fail the gate and show up in the summary.
            print(f" ERROR running level {level_idx}: {exc}")
            import traceback

            traceback.print_exc()
            if level_idx not in errored_levels:
                errored_levels.append(level_idx)
        else:
            level_results[level_idx] = level_result
            passed_str = "PASS" if level_result.passed else "FAIL"
            score_pct = f"{level_result.score * 100:.0f}%"
            lat_str = (
                f"p50={level_result.latency_p50_ms:.0f}ms "
                f"p99={level_result.latency_p99_ms:.0f}ms"
            )
            print(f" Result: {passed_str} | Score: {score_pct} | Latency: {lat_str}")

        print()

    total_elapsed_s = time.perf_counter() - total_start

    # Build summary
    m1_gate_passed = True
    m1_gate_notes = []

    for level_idx, lr in level_results.items():
        results["levels"][str(level_idx)] = _dataclass_to_dict(lr)

        if level_idx not in M1_GATE_LEVELS:
            continue
        if not lr.passed:
            m1_gate_passed = False
            m1_gate_notes.append(f"Level {level_idx} FAILED (score={lr.score:.2f})")
        if lr.latency_p99_ms > M1_LATENCY_THRESHOLD_MS:
            m1_gate_passed = False
            m1_gate_notes.append(
                f"Level {level_idx} latency too high "
                f"(p99={lr.latency_p99_ms:.0f}ms > {M1_LATENCY_THRESHOLD_MS}ms)"
            )

    # A gate level that crashed produced no result; it must not count as passed.
    for level_idx in errored_levels:
        if level_idx in M1_GATE_LEVELS:
            m1_gate_passed = False
            m1_gate_notes.append(f"Level {level_idx} ERRORED (no result)")

    results["summary"] = {
        "total_elapsed_s": round(total_elapsed_s, 1),
        "levels_run": levels_to_run,
        "levels_passed": [i for i, lr in level_results.items() if lr.passed],
        "levels_failed": [i for i, lr in level_results.items() if not lr.passed],
        "levels_errored": errored_levels,
        "m1_gate_passed": m1_gate_passed,
        "m1_gate_notes": m1_gate_notes,
        "m1_latency_threshold_ms": M1_LATENCY_THRESHOLD_MS,
    }

    # Print scorecard
    print(rule)
    print(f" SCORECARD — {model}")
    print(rule)

    for level_idx in levels_to_run:
        gate_tag = " [M1 GATE]" if level_idx in M1_GATE_LEVELS else ""
        if level_idx in level_results:
            lr = level_results[level_idx]
            module = ALL_LEVELS[level_idx]
            passed_str = "✓ PASS" if lr.passed else "✗ FAIL"
            lat = f"{lr.latency_p50_ms:.0f}ms"
            print(
                f" L{level_idx}: {passed_str}{gate_tag} | {lr.score*100:.0f}% "
                f"| {lat} | {module.NAME}"
            )
        elif level_idx in errored_levels:
            print(f" L{level_idx}: ✗ ERROR{gate_tag} | {ALL_LEVELS[level_idx].NAME}")

    print(f"{'─' * 60}")
    gate_line = "✓ M1 GATE PASSED" if m1_gate_passed else "✗ M1 GATE FAILED"
    print(f" {gate_line}")
    for note in m1_gate_notes:
        print(f" → {note}")
    print(f" Total time: {total_elapsed_s:.1f}s")
    print(f"{rule}\n")

    return results
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse arguments, run the benchmark, save the results.

    The results dict is written as JSON to ``--output``, or to
    ``results/<model>_<timestamp>.json`` next to this script when no path is
    given. The process exits with status 1 when ``--levels`` cannot be parsed
    or when a non-skipped run fails the M1 gate.
    """
    parser = argparse.ArgumentParser(
        description="Timmy Cognitive Benchmark Harness — Project Bannerlord M0"
    )
    parser.add_argument("--model", required=True, help="Ollama model name (e.g. qwen2.5:14b)")
    parser.add_argument(
        "--levels", default=None, help="Comma-separated level indices (default: all)"
    )
    parser.add_argument("--verbose", action="store_true", help="Show per-trial details")
    parser.add_argument(
        "--output", default=None,
        help="Output JSON path (default: results/<model>_<timestamp>.json)"
    )
    # BooleanOptionalAction keeps ``--skip-missing`` (still the default) and adds
    # ``--no-skip-missing``; with a plain store_true and default=True the flag
    # could never be switched off.
    parser.add_argument(
        "--skip-missing", action=argparse.BooleanOptionalAction, default=True,
        help="Skip instead of error if model not available "
             "(use --no-skip-missing to exit with an error)"
    )
    args = parser.parse_args()

    levels_to_run = None
    if args.levels:
        try:
            levels_to_run = [int(x.strip()) for x in args.levels.split(",")]
        except ValueError:
            print(
                f"ERROR: --levels must be comma-separated integers, got: {args.levels}",
                file=sys.stderr,
            )
            sys.exit(1)

    results = run_benchmark(
        model=args.model,
        levels_to_run=levels_to_run,
        verbose=args.verbose,
        skip_missing=args.skip_missing,
    )

    # Save results
    if args.output:
        output_path = Path(args.output)
    else:
        # ':' and '/' appear in model names but are unsafe in file names.
        safe_model = args.model.replace(":", "_").replace("/", "_")
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = Path(__file__).parent / "results" / f"{safe_model}_{ts}.json"

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        # default=str is a last-resort fallback for values json cannot encode.
        json.dump(results, f, indent=2, default=str)

    print(f"Results saved to: {output_path}")

    # Exit with non-zero if M1 gate failed
    if not results.get("skipped") and not results.get("summary", {}).get("m1_gate_passed", True):
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user