121 lines
3.4 KiB
Python
121 lines
3.4 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Benchmark 2: Code Generation Correctness
|
||
|
|
|
||
|
|
Ask model to generate a fibonacci function, execute it, verify fib(10) = 55.
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
import re
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
import tempfile
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import requests
|
||
|
|
|
||
|
|
# Base URL of the local Ollama HTTP API server (default install port).
OLLAMA_URL = "http://localhost:11434"

# Prompt sent to the model under test. It pins the exact function name so
# the execution harness can call it, and asks for raw code (no fences) so
# extraction is trivial; extract_python still strips fences defensively.
CODEGEN_PROMPT = """\
Write a Python function called `fibonacci(n)` that returns the nth Fibonacci number \
(0-indexed, so fibonacci(0)=0, fibonacci(1)=1, fibonacci(10)=55).

Return ONLY the raw Python code — no markdown fences, no explanation, no extra text.
The function must be named exactly `fibonacci`.
"""
|
||
|
|
|
||
|
|
|
||
|
|
def extract_python(text: str) -> str:
    """Extract Python code from a model response.

    Strips surrounding whitespace and, if the response is wrapped in a
    markdown code fence (``` or ```python), returns only the fenced
    contents. Otherwise the stripped text is returned unchanged.

    Args:
        text: Raw model response text.

    Returns:
        The extracted Python source as a string.
    """
    text = text.strip()

    # Prefer the contents of a markdown fence if one is present, even
    # though the prompt asks for raw code — models often fence anyway.
    fence_match = re.search(r"```(?:python)?\s*(.*?)```", text, re.DOTALL)
    if fence_match:
        return fence_match.group(1).strip()

    # No fence: return the stripped text as-is. (The original also
    # special-cased `"def " in text`, but both branches returned the same
    # value, so that check was dead code and has been removed.)
    return text
|
||
|
|
|
||
|
|
|
||
|
|
def run_prompt(model: str, prompt: str) -> str:
    """Send a single non-streaming generation request to Ollama.

    Args:
        model: Name of the Ollama model to query.
        prompt: Prompt text to send.

    Returns:
        The model's complete response text.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # Low temperature keeps codegen near-deterministic; 512 tokens is
    # ample for a short fibonacci function.
    request_body = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 512},
    }
    response = requests.post(
        f"{OLLAMA_URL}/api/generate", json=request_body, timeout=120
    )
    response.raise_for_status()
    return response.json()["response"]
|
||
|
|
|
||
|
|
|
||
|
|
def execute_fibonacci(code: str) -> tuple[bool, str]:
    """Execute generated code in a subprocess and verify fibonacci(10) == 55.

    The candidate code is written to a temporary file together with a tiny
    harness that prints ``fibonacci(10)``; the file is run with the current
    interpreter and the printed value is compared against 55.

    Args:
        code: Python source expected to define ``fibonacci(n)``.

    Returns:
        A ``(passed, detail)`` pair, where ``detail`` is a human-readable
        explanation of the outcome.
    """
    harness = "\n\nresult = fibonacci(10)\nprint(result)\n"

    # delete=False so the file survives the `with` block for subprocess use;
    # cleanup happens in the `finally` below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as handle:
        handle.write(code + harness)
        script_path = handle.name

    try:
        completed = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if completed.returncode != 0:
            # Truncate stderr so a huge traceback doesn't bloat the report.
            return False, f"Runtime error: {completed.stderr.strip()[:200]}"
        printed = completed.stdout.strip()
        if printed == "55":
            return True, "fibonacci(10) = 55 ✓"
        return False, f"Expected 55, got: {printed!r}"
    except subprocess.TimeoutExpired:
        return False, "Execution timed out"
    except Exception as exc:
        return False, f"Execution error: {exc}"
    finally:
        Path(script_path).unlink(missing_ok=True)
|
||
|
|
|
||
|
|
|
||
|
|
def run_benchmark(model: str) -> dict:
    """Run the code-generation benchmark for a single model.

    Prompts the model for a fibonacci implementation, extracts the code
    from the response, and executes it to verify ``fibonacci(10) == 55``.

    Args:
        model: Name of the model to benchmark.

    Returns:
        A result dict with pass/fail status and timing; on infrastructure
        failure it carries an ``error`` field, otherwise ``detail`` and a
        truncated ``code_snippet``.
    """
    started_at = time.time()
    try:
        response_text = run_prompt(model, CODEGEN_PROMPT)
        candidate = extract_python(response_text)
        passed, detail = execute_fibonacci(candidate)
    except Exception as exc:
        # Prompting/extraction failed (e.g. server unreachable): report the
        # error rather than crashing the benchmark run.
        return {
            "benchmark": "code_generation",
            "model": model,
            "passed": False,
            "error": str(exc),
            "elapsed_s": round(time.time() - started_at, 2),
        }

    return {
        "benchmark": "code_generation",
        "model": model,
        "passed": passed,
        "detail": detail,
        "code_snippet": candidate[:300],  # keep the report compact
        "elapsed_s": round(time.time() - started_at, 2),
    }
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Model name comes from the first CLI argument, defaulting to hermes3:8b.
    chosen_model = sys.argv[1] if len(sys.argv) > 1 else "hermes3:8b"
    print(f"Running code-generation benchmark against {chosen_model}...")
    outcome = run_benchmark(chosen_model)
    print(json.dumps(outcome, indent=2))
    # Exit status mirrors the benchmark result so CI can gate on it.
    sys.exit(0 if outcome["passed"] else 1)
|