Files

121 lines
3.4 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""Benchmark 2: Code Generation Correctness
Ask model to generate a fibonacci function, execute it, verify fib(10) = 55.
"""
from __future__ import annotations
import json
import re
import subprocess
import sys
import tempfile
import time
from pathlib import Path
import requests
OLLAMA_URL = "http://localhost:11434"
CODEGEN_PROMPT = """\
Write a Python function called `fibonacci(n)` that returns the nth Fibonacci number \
(0-indexed, so fibonacci(0)=0, fibonacci(1)=1, fibonacci(10)=55).
Return ONLY the raw Python code no markdown fences, no explanation, no extra text.
The function must be named exactly `fibonacci`.
"""
def extract_python(text: str) -> str:
"""Extract Python code from a response."""
text = text.strip()
# Remove markdown fences
fence_match = re.search(r"```(?:python)?\s*(.*?)```", text, re.DOTALL)
if fence_match:
return fence_match.group(1).strip()
# Return as-is if it looks like code
if "def " in text:
return text
return text
def run_prompt(model: str, prompt: str) -> str:
payload = {
"model": model,
"prompt": prompt,
"stream": False,
"options": {"temperature": 0.1, "num_predict": 512},
}
resp = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=120)
resp.raise_for_status()
return resp.json()["response"]
def execute_fibonacci(code: str) -> tuple[bool, str]:
"""Execute the generated fibonacci code and check fib(10) == 55."""
test_code = code + "\n\nresult = fibonacci(10)\nprint(result)\n"
with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
f.write(test_code)
tmpfile = f.name
try:
proc = subprocess.run(
[sys.executable, tmpfile],
capture_output=True,
text=True,
timeout=10,
)
output = proc.stdout.strip()
if proc.returncode != 0:
return False, f"Runtime error: {proc.stderr.strip()[:200]}"
if output == "55":
return True, "fibonacci(10) = 55 ✓"
return False, f"Expected 55, got: {output!r}"
except subprocess.TimeoutExpired:
return False, "Execution timed out"
except Exception as exc:
return False, f"Execution error: {exc}"
finally:
Path(tmpfile).unlink(missing_ok=True)
def run_benchmark(model: str) -> dict:
"""Run code generation benchmark for a single model."""
start = time.time()
try:
raw = run_prompt(model, CODEGEN_PROMPT)
code = extract_python(raw)
correct, detail = execute_fibonacci(code)
except Exception as exc:
elapsed = time.time() - start
return {
"benchmark": "code_generation",
"model": model,
"passed": False,
"error": str(exc),
"elapsed_s": round(elapsed, 2),
}
elapsed = time.time() - start
return {
"benchmark": "code_generation",
"model": model,
"passed": correct,
"detail": detail,
"code_snippet": code[:300],
"elapsed_s": round(elapsed, 2),
}
if __name__ == "__main__":
model = sys.argv[1] if len(sys.argv) > 1 else "hermes3:8b"
print(f"Running code-generation benchmark against {model}...")
result = run_benchmark(model)
print(json.dumps(result, indent=2))
sys.exit(0 if result["passed"] else 1)