[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
This commit was merged in pull request #1159.
This commit is contained in:
committed by
rockachopa
parent
6e65b53f3a
commit
9e08e87312
183
timmy-benchmark/levels/level_0_coin_flip.py
Normal file
183
timmy-benchmark/levels/level_0_coin_flip.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""Level 0: JSON Compliance — Coin Flip.
|
||||
|
||||
Tests whether the model can reliably return well-formed JSON responses
|
||||
with a specific schema. This is the minimum bar for GABS tool calls.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Level metadata consumed by the benchmark runner.
LEVEL = 0
NAME = "JSON Compliance (Coin Flip)"
DESCRIPTION = "Model must return valid JSON matching a strict schema on each trial."

# Shared system prompt for every trial: forbids markdown, prose, and fences.
SYSTEM_PROMPT = """You are a strategic AI agent. You MUST respond ONLY with valid JSON.
No markdown, no explanation, no code fences. Raw JSON only."""

# Trial definitions. Each dict carries:
#   "prompt"       — the user message sent to the model,
#   "schema"       — required key -> expected Python type of the parsed JSON,
#   "valid_values" — closed set of allowed values per key (only checked for
#                    keys that are present in the response).
TRIALS = [
    {
        "prompt": (
            'A coin is flipped. Respond with exactly: {"choice": "heads"} or {"choice": "tails"}. '
            "Pick one. JSON only."
        ),
        "schema": {"choice": str},
        "valid_values": {"choice": ["heads", "tails"]},
    },
    {
        "prompt": (
            'You must attack or defend. Respond with: {"action": "attack", "confidence": 0.8} '
            'or {"action": "defend", "confidence": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.'
        ),
        "schema": {"action": str, "confidence": float},
        "valid_values": {"action": ["attack", "defend"]},
    },
    {
        "prompt": (
            'Choose a direction to march. Respond with exactly: '
            '{"direction": "north", "reason": "string explaining why"}. '
            "Pick north/south/east/west. JSON only."
        ),
        "schema": {"direction": str, "reason": str},
        "valid_values": {"direction": ["north", "south", "east", "west"]},
    },
]
|
||||
|
||||
|
||||
@dataclass
class TrialResult:
    """Outcome of one prompt/response round against the model."""

    trial_index: int  # position of the trial in TRIALS
    prompt: str  # user prompt that was sent
    raw_response: str  # unmodified model output ("" when the request errored)
    parsed: dict | None  # json.loads result, or None when parsing failed
    valid_json: bool  # response parsed as JSON at all
    schema_valid: bool  # parsed dict matched the trial's key/type schema
    value_valid: bool  # constrained keys held values from the allowed sets
    latency_ms: float  # wall-clock round-trip time for the request
    error: str = ""  # transport exception or JSON decode error text, if any
|
||||
|
||||
|
||||
@dataclass
class LevelResult:
    """Aggregated results for the whole level."""

    level: int = LEVEL
    name: str = NAME
    trials: list[TrialResult] = field(default_factory=list)  # one entry per trial
    passed: bool = False  # True only when every trial passed json+schema checks
    score: float = 0.0  # fraction of trials passing json+schema checks
    latency_p50_ms: float = 0.0  # median latency over successful requests
    latency_p99_ms: float = 0.0  # max latency stands in for p99 (sample is tiny)
|
||||
|
||||
|
||||
def _validate_schema(parsed: dict, schema: dict[str, type]) -> bool:
|
||||
for key, expected_type in schema.items():
|
||||
if key not in parsed:
|
||||
return False
|
||||
if not isinstance(parsed[key], expected_type):
|
||||
# Allow int where float is expected
|
||||
if expected_type is float and isinstance(parsed[key], int):
|
||||
continue
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _validate_values(parsed: dict, valid_values: dict[str, list]) -> bool:
|
||||
for key, valid_list in valid_values.items():
|
||||
if key in parsed and parsed[key] not in valid_list:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _clean_response(raw: str) -> str:
|
||||
"""Strip markdown fences if model wrapped JSON in them."""
|
||||
raw = raw.strip()
|
||||
if raw.startswith("```"):
|
||||
lines = raw.splitlines()
|
||||
# Remove first and last fence lines
|
||||
lines = [l for l in lines if not l.startswith("```")]
|
||||
raw = "\n".join(lines).strip()
|
||||
return raw
|
||||
|
||||
|
||||
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every trial in TRIALS against *model* and aggregate the results.

    Args:
        client: chat client exposing ``chat(model=..., messages=..., options=...)``
            whose response supports ``response["message"]["content"]``
            (Ollama-style interface, presumably — TODO confirm against caller).
        model: model identifier forwarded to the client.
        verbose: when True, print one PASS/FAIL/ERROR line per trial.

    Returns:
        A LevelResult with per-trial records, overall pass/fail, score, and
        latency percentiles over successful requests.
    """
    result = LevelResult()
    latencies = []  # only successful requests contribute to latency stats

    for i, trial in enumerate(TRIALS):
        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": trial["prompt"]},
                ],
                # Low temperature to make compliance near-deterministic.
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.time() - t0) * 1000
        except Exception as exc:
            # Transport/client failure: record the trial as a total miss and
            # continue, so one bad request does not abort the whole level.
            latency_ms = (time.time() - t0) * 1000
            tr = TrialResult(
                trial_index=i,
                prompt=trial["prompt"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                value_valid=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(tr)
            if verbose:
                print(f" Trial {i}: ERROR — {exc}")
            continue

        latencies.append(latency_ms)

        # Remove any markdown fences before attempting to parse.
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        value_valid = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True
            # Schema/value checks only run once the JSON actually parsed.
            schema_valid = _validate_schema(parsed, trial["schema"])
            value_valid = _validate_values(parsed, trial["valid_values"])
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"

        tr = TrialResult(
            trial_index=i,
            prompt=trial["prompt"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            value_valid=value_valid,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(tr)

        if verbose:
            # Pass criterion mirrors the scoring below: json + schema only;
            # value_valid is reported but not required.
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            print(
                f" Trial {i}: {status} | json={valid_json} schema={schema_valid} "
                f"value={value_valid} | {latency_ms:.0f}ms | {raw[:80]!r}"
            )

    # Score counts trials that produced valid JSON matching the schema.
    passed_trials = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = passed_trials / len(TRIALS)
    result.passed = result.score >= 1.0  # Must pass all 3 trials

    if latencies:
        latencies_sorted = sorted(latencies)
        # p50 is the upper median; with so few samples the max serves as p99.
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        result.latency_p99_ms = latencies_sorted[-1]

    return result
|
||||
Reference in New Issue
Block a user