#!/usr/bin/env python3
"""Benchmark 2: Code Generation Correctness

Ask a model to generate a `fibonacci` function, execute it, and verify that fibonacci(10) == 55.
"""

from __future__ import annotations

import json
import re
import subprocess
import sys
import tempfile
import time
from pathlib import Path

import requests

# Base URL of the Ollama HTTP server; run_prompt() appends "/api/generate".
OLLAMA_URL = "http://localhost:11434"

# Prompt sent to the model by run_benchmark(). The function name requested
# here must stay in sync with the `fibonacci(10)` call that
# execute_fibonacci() appends to the generated code.
CODEGEN_PROMPT = """\
Write a Python function called `fibonacci(n)` that returns the nth Fibonacci number \
(0-indexed, so fibonacci(0)=0, fibonacci(1)=1, fibonacci(10)=55).

Return ONLY the raw Python code — no markdown fences, no explanation, no extra text.
The function must be named exactly `fibonacci`.
"""


# Matches the first Markdown code fence in a response.
#   * An optional language tag sitting alone on the opening-fence line
#     (``python``, ``py``, ``python3``, ``Python`` ...) is skipped; a bare
#     ``python`` prefix with the code on the same line is skipped as well.
#   * The body runs up to the closing fence or, when the fence is never
#     closed (e.g. a truncated response), to the end of the text.
_FENCE_RE = re.compile(
    r"```(?:[ \t]*[A-Za-z][\w.+-]*[ \t]*\r?\n|python)?(.*?)(?:```|\Z)",
    re.DOTALL,
)


def extract_python(text: str) -> str:
    """Extract Python code from a model response.

    If the response contains a Markdown code fence, the contents of the first
    fence are returned without the fence markers or language tag. Otherwise
    the whole response is returned. Surrounding whitespace is stripped in
    both cases.

    Args:
        text: Raw response text from the model.

    Returns:
        The extracted code as a string (possibly empty).
    """
    text = text.strip()

    fence_match = _FENCE_RE.search(text)
    if fence_match:
        return fence_match.group(1).strip()

    # No fence at all: assume the model followed instructions and returned
    # raw code.
    return text


def run_prompt(model: str, prompt: str) -> str:
    """Send a single non-streaming generation request and return its text.

    Posts to the ``/api/generate`` endpoint under ``OLLAMA_URL`` with a
    120-second timeout, then returns the ``"response"`` field of the JSON
    reply.

    Args:
        model: Model name passed through in the request body.
        prompt: Prompt text passed through in the request body.

    Returns:
        The value stored under ``"response"`` in the decoded JSON reply.

    Raises:
        Whatever ``requests.post`` / ``raise_for_status`` raise on transport
        or HTTP errors, and ``KeyError`` if the reply has no ``"response"``.
    """
    generation_options = {"temperature": 0.1, "num_predict": 512}
    request_body = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": generation_options,
    }
    endpoint = f"{OLLAMA_URL}/api/generate"

    reply = requests.post(endpoint, json=request_body, timeout=120)
    reply.raise_for_status()

    decoded = reply.json()
    return decoded["response"]


def execute_fibonacci(code: str) -> tuple[bool, str]:
    """Execute generated fibonacci code and check that fibonacci(10) == 55.

    The code is written to a temporary ``.py`` file with a trailing
    ``print(fibonacci(10))`` appended, then run in a separate interpreter
    (``sys.executable``) with a 10-second timeout. Only the last line of
    stdout is compared against ``55`` so that extra output produced by the
    generated code itself (for example a demo call) does not fail an
    otherwise correct function.

    NOTE(security): ``code`` is model-generated and is executed unsandboxed
    with the privileges of the current process.

    Args:
        code: Python source expected to define ``fibonacci(n)``.

    Returns:
        ``(passed, detail)`` where ``passed`` is True only when the script
        exits with status 0 and its last output line is ``55``; ``detail``
        is a human-readable explanation. Failures are reported through the
        return value rather than raised.
    """
    test_code = code + "\n\nresult = fibonacci(10)\nprint(result)\n"

    # Python reads source files as UTF-8 by default, so write UTF-8 explicitly
    # instead of the platform's locale encoding (model output may contain
    # non-ASCII characters). delete=False lets us close the file before the
    # child interpreter opens it; cleanup happens in ``finally``.
    handle = tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    )
    tmpfile = handle.name

    try:
        # Writing inside the try block guarantees the temp file is removed
        # even if the write itself fails.
        with handle:
            handle.write(test_code)

        proc = subprocess.run(
            [sys.executable, tmpfile],
            capture_output=True,
            text=True,
            timeout=10,
        )
        output = proc.stdout.strip()
        if proc.returncode != 0:
            return False, f"Runtime error: {proc.stderr.strip()[:200]}"

        # The appended print() is the final statement, so its value is the
        # last line of stdout.
        lines = output.splitlines()
        last_line = lines[-1].strip() if lines else ""
        if last_line == "55":
            return True, "fibonacci(10) = 55 ✓"
        return False, f"Expected 55, got: {output!r}"
    except subprocess.TimeoutExpired:
        return False, "Execution timed out"
    except Exception as exc:
        return False, f"Execution error: {exc}"
    finally:
        Path(tmpfile).unlink(missing_ok=True)


def run_benchmark(model: str) -> dict:
    """Run the code-generation benchmark for a single model.

    Prompts the model, extracts the code from its reply, executes it, and
    reports the outcome. Any exception raised along the way is captured in
    the result instead of propagating.

    Args:
        model: Model name forwarded to ``run_prompt``.

    Returns:
        A dict with ``benchmark``, ``model``, ``passed`` and ``elapsed_s``
        (seconds, rounded to 2 decimals). On success of the pipeline it also
        carries ``detail`` and ``code_snippet`` (first 300 characters of the
        extracted code); if an exception occurred it carries ``error``
        instead and ``passed`` is False.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    result: dict = {"benchmark": "code_generation", "model": model}

    try:
        raw = run_prompt(model, CODEGEN_PROMPT)
        code = extract_python(raw)
        correct, detail = execute_fibonacci(code)
    except Exception as exc:
        result.update(passed=False, error=str(exc))
    else:
        result.update(passed=correct, detail=detail, code_snippet=code[:300])

    # Added last so the key order of the emitted JSON stays unchanged.
    result["elapsed_s"] = round(time.perf_counter() - start, 2)
    return result


if __name__ == "__main__":
    # The first command-line argument selects the model; fall back to the
    # default when none is given.
    cli_args = sys.argv[1:]
    model = cli_args[0] if cli_args else "hermes3:8b"

    print(f"Running code-generation benchmark against {model}...")
    result = run_benchmark(model)
    print(json.dumps(result, indent=2))

    # Exit status mirrors the benchmark outcome: 0 on pass, 1 otherwise.
    exit_status = 0 if result["passed"] else 1
    sys.exit(exit_status)