[claude] Bannerlord M0: Run cognitive benchmark on hermes3, fix L1 string-int coercion (#1092) (#1159)
Co-authored-by: Alexander Whitestone <alexpaynex@gmail.com> Co-committed-by: Alexander Whitestone <alexpaynex@gmail.com>
This commit was merged in pull request #1159.
This commit is contained in:
committed by
rockachopa
parent
6e65b53f3a
commit
9e08e87312
183
timmy-benchmark/levels/level_0_coin_flip.py
Normal file
183
timmy-benchmark/levels/level_0_coin_flip.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""Level 0: JSON Compliance — Coin Flip.
|
||||
|
||||
Tests whether the model can reliably return well-formed JSON responses
|
||||
with a specific schema. This is the minimum bar for GABS tool calls.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Level metadata consumed by the benchmark runner.
LEVEL = 0
NAME = "JSON Compliance (Coin Flip)"
DESCRIPTION = "Model must return valid JSON matching a strict schema on each trial."

# Shared system prompt for every trial: forbids markdown, prose, and fences.
SYSTEM_PROMPT = """You are a strategic AI agent. You MUST respond ONLY with valid JSON.
No markdown, no explanation, no code fences. Raw JSON only."""

# Trial definitions. Each dict carries:
#   "prompt"       — the user message sent to the model,
#   "schema"       — required key -> expected Python type of the parsed JSON,
#   "valid_values" — closed set of allowed values per key (only checked for
#                    keys that are present in the response).
TRIALS = [
    {
        "prompt": (
            'A coin is flipped. Respond with exactly: {"choice": "heads"} or {"choice": "tails"}. '
            "Pick one. JSON only."
        ),
        "schema": {"choice": str},
        "valid_values": {"choice": ["heads", "tails"]},
    },
    {
        "prompt": (
            'You must attack or defend. Respond with: {"action": "attack", "confidence": 0.8} '
            'or {"action": "defend", "confidence": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.'
        ),
        "schema": {"action": str, "confidence": float},
        "valid_values": {"action": ["attack", "defend"]},
    },
    {
        "prompt": (
            'Choose a direction to march. Respond with exactly: '
            '{"direction": "north", "reason": "string explaining why"}. '
            "Pick north/south/east/west. JSON only."
        ),
        "schema": {"direction": str, "reason": str},
        "valid_values": {"direction": ["north", "south", "east", "west"]},
    },
]
|
||||
|
||||
|
||||
@dataclass
class TrialResult:
    """Outcome of one prompt/response round against the model."""

    trial_index: int  # position of the trial in TRIALS
    prompt: str  # user prompt that was sent
    raw_response: str  # unmodified model output ("" when the request errored)
    parsed: dict | None  # json.loads result, or None when parsing failed
    valid_json: bool  # response parsed as JSON at all
    schema_valid: bool  # parsed dict matched the trial's key/type schema
    value_valid: bool  # constrained keys held values from the allowed sets
    latency_ms: float  # wall-clock round-trip time for the request
    error: str = ""  # transport exception or JSON decode error text, if any
|
||||
|
||||
|
||||
@dataclass
class LevelResult:
    """Aggregated results for the whole level."""

    level: int = LEVEL
    name: str = NAME
    trials: list[TrialResult] = field(default_factory=list)  # one entry per trial
    passed: bool = False  # True only when every trial passed json+schema checks
    score: float = 0.0  # fraction of trials passing json+schema checks
    latency_p50_ms: float = 0.0  # median latency over successful requests
    latency_p99_ms: float = 0.0  # max latency stands in for p99 (sample is tiny)
|
||||
|
||||
|
||||
def _validate_schema(parsed: dict, schema: dict[str, type]) -> bool:
|
||||
for key, expected_type in schema.items():
|
||||
if key not in parsed:
|
||||
return False
|
||||
if not isinstance(parsed[key], expected_type):
|
||||
# Allow int where float is expected
|
||||
if expected_type is float and isinstance(parsed[key], int):
|
||||
continue
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _validate_values(parsed: dict, valid_values: dict[str, list]) -> bool:
|
||||
for key, valid_list in valid_values.items():
|
||||
if key in parsed and parsed[key] not in valid_list:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _clean_response(raw: str) -> str:
|
||||
"""Strip markdown fences if model wrapped JSON in them."""
|
||||
raw = raw.strip()
|
||||
if raw.startswith("```"):
|
||||
lines = raw.splitlines()
|
||||
# Remove first and last fence lines
|
||||
lines = [l for l in lines if not l.startswith("```")]
|
||||
raw = "\n".join(lines).strip()
|
||||
return raw
|
||||
|
||||
|
||||
def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    """Run every trial in TRIALS against *model* and aggregate the results.

    Args:
        client: chat client exposing ``chat(model=..., messages=..., options=...)``
            whose response supports ``response["message"]["content"]``
            (Ollama-style interface, presumably — TODO confirm against caller).
        model: model identifier forwarded to the client.
        verbose: when True, print one PASS/FAIL/ERROR line per trial.

    Returns:
        A LevelResult with per-trial records, overall pass/fail, score, and
        latency percentiles over successful requests.
    """
    result = LevelResult()
    latencies = []  # only successful requests contribute to latency stats

    for i, trial in enumerate(TRIALS):
        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": trial["prompt"]},
                ],
                # Low temperature to make compliance near-deterministic.
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.time() - t0) * 1000
        except Exception as exc:
            # Transport/client failure: record the trial as a total miss and
            # continue, so one bad request does not abort the whole level.
            latency_ms = (time.time() - t0) * 1000
            tr = TrialResult(
                trial_index=i,
                prompt=trial["prompt"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                value_valid=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(tr)
            if verbose:
                print(f" Trial {i}: ERROR — {exc}")
            continue

        latencies.append(latency_ms)

        # Remove any markdown fences before attempting to parse.
        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        value_valid = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True
            # Schema/value checks only run once the JSON actually parsed.
            schema_valid = _validate_schema(parsed, trial["schema"])
            value_valid = _validate_values(parsed, trial["valid_values"])
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"

        tr = TrialResult(
            trial_index=i,
            prompt=trial["prompt"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            value_valid=value_valid,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(tr)

        if verbose:
            # Pass criterion mirrors the scoring below: json + schema only;
            # value_valid is reported but not required.
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            print(
                f" Trial {i}: {status} | json={valid_json} schema={schema_valid} "
                f"value={value_valid} | {latency_ms:.0f}ms | {raw[:80]!r}"
            )

    # Score counts trials that produced valid JSON matching the schema.
    passed_trials = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = passed_trials / len(TRIALS)
    result.passed = result.score >= 1.0  # Must pass all 3 trials

    if latencies:
        latencies_sorted = sorted(latencies)
        # p50 is the upper median; with so few samples the max serves as p99.
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        result.latency_p99_ms = latencies_sorted[-1]

    return result
|
||||
Reference in New Issue
Block a user