"""Level 0: JSON Compliance — Coin Flip. Tests whether the model can reliably return well-formed JSON responses with a specific schema. This is the minimum bar for GABS tool calls. """ import json import time from dataclasses import dataclass, field from typing import Any LEVEL = 0 NAME = "JSON Compliance (Coin Flip)" DESCRIPTION = "Model must return valid JSON matching a strict schema on each trial." SYSTEM_PROMPT = """You are a strategic AI agent. You MUST respond ONLY with valid JSON. No markdown, no explanation, no code fences. Raw JSON only.""" TRIALS = [ { "prompt": ( 'A coin is flipped. Respond with exactly: {"choice": "heads"} or {"choice": "tails"}. ' "Pick one. JSON only." ), "schema": {"choice": str}, "valid_values": {"choice": ["heads", "tails"]}, }, { "prompt": ( 'You must attack or defend. Respond with: {"action": "attack", "confidence": 0.8} ' 'or {"action": "defend", "confidence": 0.6}. Replace confidence with your own value 0.0-1.0. JSON only.' ), "schema": {"action": str, "confidence": float}, "valid_values": {"action": ["attack", "defend"]}, }, { "prompt": ( 'Choose a direction to march. Respond with exactly: ' '{"direction": "north", "reason": "string explaining why"}. ' "Pick north/south/east/west. JSON only." ), "schema": {"direction": str, "reason": str}, "valid_values": {"direction": ["north", "south", "east", "west"]}, }, ] @dataclass class TrialResult: trial_index: int prompt: str raw_response: str parsed: dict | None valid_json: bool schema_valid: bool value_valid: bool latency_ms: float error: str = "" @dataclass class LevelResult: level: int = LEVEL name: str = NAME trials: list[TrialResult] = field(default_factory=list) passed: bool = False score: float = 0.0 latency_p50_ms: float = 0.0 latency_p99_ms: float = 0.0 def _validate_schema(parsed: dict, schema: dict[str, type]) -> bool: for key, expected_type in schema.items(): if key not in parsed: return False if not isinstance(parsed[key], expected_type): # Allow int where float is expected if expected_type is float and isinstance(parsed[key], int): continue return False return True def _validate_values(parsed: dict, valid_values: dict[str, list]) -> bool: for key, valid_list in valid_values.items(): if key in parsed and parsed[key] not in valid_list: return False return True def _clean_response(raw: str) -> str: """Strip markdown fences if model wrapped JSON in them.""" raw = raw.strip() if raw.startswith("```"): lines = raw.splitlines() # Remove first and last fence lines lines = [l for l in lines if not l.startswith("```")] raw = "\n".join(lines).strip() return raw def run(client: Any, model: str, verbose: bool = False) -> LevelResult: result = LevelResult() latencies = [] for i, trial in enumerate(TRIALS): t0 = time.time() try: response = client.chat( model=model, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": trial["prompt"]}, ], options={"temperature": 0.1}, ) raw = response["message"]["content"] latency_ms = (time.time() - t0) * 1000 except Exception as exc: latency_ms = (time.time() - t0) * 1000 tr = TrialResult( trial_index=i, prompt=trial["prompt"], raw_response="", parsed=None, valid_json=False, schema_valid=False, value_valid=False, latency_ms=latency_ms, error=str(exc), ) result.trials.append(tr) if verbose: print(f" Trial {i}: ERROR — {exc}") continue latencies.append(latency_ms) cleaned = _clean_response(raw) parsed = None valid_json = False schema_valid = False value_valid = False error = "" try: parsed = json.loads(cleaned) valid_json = True schema_valid = _validate_schema(parsed, trial["schema"]) value_valid = _validate_values(parsed, trial["valid_values"]) except json.JSONDecodeError as exc: error = f"JSONDecodeError: {exc}" tr = TrialResult( trial_index=i, prompt=trial["prompt"], raw_response=raw, parsed=parsed, valid_json=valid_json, schema_valid=schema_valid, value_valid=value_valid, latency_ms=latency_ms, error=error, ) result.trials.append(tr) if verbose: status = "PASS" if (valid_json and schema_valid) else "FAIL" print( f" Trial {i}: {status} | json={valid_json} schema={schema_valid} " f"value={value_valid} | {latency_ms:.0f}ms | {raw[:80]!r}" ) passed_trials = sum(1 for t in result.trials if t.valid_json and t.schema_valid) result.score = passed_trials / len(TRIALS) result.passed = result.score >= 1.0 # Must pass all 3 trials if latencies: latencies_sorted = sorted(latencies) result.latency_p50_ms = latencies_sorted[len(latencies_sorted) // 2] result.latency_p99_ms = latencies_sorted[-1] return result