121 lines
3.4 KiB
Python
121 lines
3.4 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Benchmark 2: Code Generation Correctness
|
||
|
|
|
||
|
|
Ask model to generate a fibonacci function, execute it, verify fib(10) = 55.
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
import re
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
import tempfile
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import requests
|
||
|
|
|
||
|
|
# Base URL of the local Ollama HTTP API server (default install port).
OLLAMA_URL = "http://localhost:11434"

# Prompt sent to the model under test. It pins the exact function name so
# the execution harness can call it, and asks for raw code (no fences) so
# extraction is trivial; extract_python still strips fences defensively.
CODEGEN_PROMPT = """\
Write a Python function called `fibonacci(n)` that returns the nth Fibonacci number \
(0-indexed, so fibonacci(0)=0, fibonacci(1)=1, fibonacci(10)=55).

Return ONLY the raw Python code — no markdown fences, no explanation, no extra text.
The function must be named exactly `fibonacci`.
"""
|
||
|
|
|
||
|
|
|
||
|
|
def extract_python(text: str) -> str:
    """Extract Python code from a model response.

    Strips surrounding whitespace and, if the response is wrapped in a
    markdown code fence (``` or ```python), returns only the fenced
    contents. Otherwise the stripped text is returned unchanged.

    Args:
        text: Raw model response text.

    Returns:
        The extracted Python source as a string.
    """
    text = text.strip()

    # Prefer the contents of a markdown fence if one is present, even
    # though the prompt asks for raw code — models often fence anyway.
    fence_match = re.search(r"```(?:python)?\s*(.*?)```", text, re.DOTALL)
    if fence_match:
        return fence_match.group(1).strip()

    # No fence: return the stripped text as-is. (The original also
    # special-cased `"def " in text`, but both branches returned the same
    # value, so that check was dead code and has been removed.)
    return text
|
||
|
|
|
||
|
|
|
||
|
|
def run_prompt(model: str, prompt: str) -> str:
    """Send a single non-streaming generation request to Ollama.

    Args:
        model: Name of the Ollama model to query.
        prompt: Prompt text to send.

    Returns:
        The model's complete response text.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # Low temperature keeps codegen near-deterministic; 512 tokens is
    # ample for a short fibonacci function.
    request_body = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 512},
    }
    response = requests.post(
        f"{OLLAMA_URL}/api/generate", json=request_body, timeout=120
    )
    response.raise_for_status()
    return response.json()["response"]
|
||
|
|
|
||
|
|
|
||
|
|
def execute_fibonacci(code: str) -> tuple[bool, str]:
    """Execute generated code in a subprocess and verify fibonacci(10) == 55.

    The candidate code is written to a temporary file together with a tiny
    harness that prints ``fibonacci(10)``; the file is run with the current
    interpreter and the printed value is compared against 55.

    Args:
        code: Python source expected to define ``fibonacci(n)``.

    Returns:
        A ``(passed, detail)`` pair, where ``detail`` is a human-readable
        explanation of the outcome.
    """
    harness = "\n\nresult = fibonacci(10)\nprint(result)\n"

    # delete=False so the file survives the `with` block for subprocess use;
    # cleanup happens in the `finally` below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as handle:
        handle.write(code + harness)
        script_path = handle.name

    try:
        completed = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if completed.returncode != 0:
            # Truncate stderr so a huge traceback doesn't bloat the report.
            return False, f"Runtime error: {completed.stderr.strip()[:200]}"
        printed = completed.stdout.strip()
        if printed == "55":
            return True, "fibonacci(10) = 55 ✓"
        return False, f"Expected 55, got: {printed!r}"
    except subprocess.TimeoutExpired:
        return False, "Execution timed out"
    except Exception as exc:
        return False, f"Execution error: {exc}"
    finally:
        Path(script_path).unlink(missing_ok=True)
|
||
|
|
|
||
|
|
|
||
|
|
def run_benchmark(model: str) -> dict:
    """Run the code-generation benchmark for a single model.

    Prompts the model for a fibonacci implementation, extracts the code
    from the response, and executes it to verify ``fibonacci(10) == 55``.

    Args:
        model: Name of the model to benchmark.

    Returns:
        A result dict with pass/fail status and timing; on infrastructure
        failure it carries an ``error`` field, otherwise ``detail`` and a
        truncated ``code_snippet``.
    """
    started_at = time.time()
    try:
        response_text = run_prompt(model, CODEGEN_PROMPT)
        candidate = extract_python(response_text)
        passed, detail = execute_fibonacci(candidate)
    except Exception as exc:
        # Prompting/extraction failed (e.g. server unreachable): report the
        # error rather than crashing the benchmark run.
        return {
            "benchmark": "code_generation",
            "model": model,
            "passed": False,
            "error": str(exc),
            "elapsed_s": round(time.time() - started_at, 2),
        }

    return {
        "benchmark": "code_generation",
        "model": model,
        "passed": passed,
        "detail": detail,
        "code_snippet": candidate[:300],  # keep the report compact
        "elapsed_s": round(time.time() - started_at, 2),
    }
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Model name comes from the first CLI argument, defaulting to hermes3:8b.
    chosen_model = sys.argv[1] if len(sys.argv) > 1 else "hermes3:8b"
    print(f"Running code-generation benchmark against {chosen_model}...")
    outcome = run_benchmark(chosen_model)
    print(json.dumps(outcome, indent=2))
    # Exit status mirrors the benchmark result so CI can gate on it.
    sys.exit(0 if outcome["passed"] else 1)
|