Timmy-time-dashboard/timmy-benchmark/levels/level_0_coin_flip.py

"""Level 0: JSON Compliance — Coin Flip.

Tests whether the model can reliably return well-formed JSON responses
with a specific schema. This is the minimum bar for GABS tool calls.
"""

import json
import time
from dataclasses import dataclass, field
from typing import Any

LEVEL = 0
NAME = "JSON Compliance (Coin Flip)"
DESCRIPTION = "Model must return valid JSON matching a strict schema on each trial."

SYSTEM_PROMPT = """You are a strategic AI agent. You MUST respond ONLY with valid JSON.
No markdown, no explanation, no code fences. Raw JSON only."""

TRIALS = [
    {
        "prompt": (
            'A coin is flipped. Respond with exactly: {"choice": "heads"} or {"choice": "tails"}. '
            "Pick one. JSON only."
        ),
        "schema": {"choice": str},
        "valid_values": {"choice": ["heads", "tails"]},
    },
    {
        "prompt": (
            'You must attack or defend. Respond with: {"action": "attack", "confidence": 0.8} '
            'or {"action": "defend", "confidence": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.'
        ),
        "schema": {"action": str, "confidence": float},
        "valid_values": {"action": ["attack", "defend"]},
    },
    {
        "prompt": (
            'Choose a direction to march. Respond with exactly: '
            '{"direction": "north", "reason": "string explaining why"}. '
            "Pick north/south/east/west. JSON only."
        ),
        "schema": {"direction": str, "reason": str},
        "valid_values": {"direction": ["north", "south", "east", "west"]},
    },
]


@dataclass
class TrialResult:
    trial_index: int
    prompt: str
    raw_response: str
    parsed: dict | None
    valid_json: bool
    schema_valid: bool
    value_valid: bool
    latency_ms: float
    error: str = ""


@dataclass
class LevelResult:
    level: int = LEVEL
    name: str = NAME
    trials: list[TrialResult] = field(default_factory=list)
    passed: bool = False
    score: float = 0.0
    latency_p50_ms: float = 0.0
    latency_p99_ms: float = 0.0


def _validate_schema(parsed: dict, schema: dict[str, type]) -> bool:
    for key, expected_type in schema.items():
        if key not in parsed:
            return False
        if not isinstance(parsed[key], expected_type):
            # Allow int where float is expected
            if expected_type is float and isinstance(parsed[key], int):
                continue
            return False
    return True


def _validate_values(parsed: dict, valid_values: dict[str, list]) -> bool:
    for key, valid_list in valid_values.items():
        if key in parsed and parsed[key] not in valid_list:
            return False
    return True


def _clean_response(raw: str) -> str:
    """Strip markdown fences if model wrapped JSON in them."""
    raw = raw.strip()
    if raw.startswith("```"):
        lines = raw.splitlines()
        # Remove first and last fence lines
        lines = [l for l in lines if not l.startswith("```")]
        raw = "\n".join(lines).strip()
    return raw


def run(client: Any, model: str, verbose: bool = False) -> LevelResult:
    result = LevelResult()
    latencies = []

    for i, trial in enumerate(TRIALS):
        t0 = time.time()
        try:
            response = client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": trial["prompt"]},
                ],
                options={"temperature": 0.1},
            )
            raw = response["message"]["content"]
            latency_ms = (time.time() - t0) * 1000
        except Exception as exc:
            latency_ms = (time.time() - t0) * 1000
            tr = TrialResult(
                trial_index=i,
                prompt=trial["prompt"],
                raw_response="",
                parsed=None,
                valid_json=False,
                schema_valid=False,
                value_valid=False,
                latency_ms=latency_ms,
                error=str(exc),
            )
            result.trials.append(tr)
            if verbose:
                print(f"  Trial {i}: ERROR — {exc}")
            continue

        latencies.append(latency_ms)

        cleaned = _clean_response(raw)
        parsed = None
        valid_json = False
        schema_valid = False
        value_valid = False
        error = ""

        try:
            parsed = json.loads(cleaned)
            valid_json = True
            schema_valid = _validate_schema(parsed, trial["schema"])
            value_valid = _validate_values(parsed, trial["valid_values"])
        except json.JSONDecodeError as exc:
            error = f"JSONDecodeError: {exc}"

        tr = TrialResult(
            trial_index=i,
            prompt=trial["prompt"],
            raw_response=raw,
            parsed=parsed,
            valid_json=valid_json,
            schema_valid=schema_valid,
            value_valid=value_valid,
            latency_ms=latency_ms,
            error=error,
        )
        result.trials.append(tr)

        if verbose:
            status = "PASS" if (valid_json and schema_valid) else "FAIL"
            print(
                f"  Trial {i}: {status} | json={valid_json} schema={schema_valid} "
                f"value={value_valid} | {latency_ms:.0f}ms | {raw[:80]!r}"
            )

    passed_trials = sum(1 for t in result.trials if t.valid_json and t.schema_valid)
    result.score = passed_trials / len(TRIALS)
    result.passed = result.score >= 1.0  # Must pass all 3 trials

    if latencies:
        latencies_sorted = sorted(latencies)
        result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2]
        result.latency_p99_ms = latencies_sorted[-1]

    return result