#!/usr/bin/env python3
"""Benchmark 2: Code Generation Correctness

Ask model to generate a fibonacci function, execute it, verify fib(10) = 55.
"""

from __future__ import annotations

import json
import re
import subprocess
import sys
import tempfile
import time
from pathlib import Path

import requests

OLLAMA_URL = "http://localhost:11434"

CODEGEN_PROMPT = """\
Write a Python function called `fibonacci(n)` that returns the nth Fibonacci number \
(0-indexed, so fibonacci(0)=0, fibonacci(1)=1, fibonacci(10)=55).
Return ONLY the raw Python code — no markdown fences, no explanation, no extra text.
The function must be named exactly `fibonacci`.
"""

# Matches the first markdown code fence in a response.
#   * The optional language tag (python / python3 / py, any case) is consumed
#     so it never ends up as the first token of the extracted code.
#   * The closing fence may be missing (responses are capped by num_predict
#     and can be cut off), in which case everything up to the end is taken.
_FENCE_RE = re.compile(
    r"```(?:(?i:python3?|py)\b)?\s*(.*?)(?:```|\Z)",
    re.DOTALL,
)


def extract_python(text: str) -> str:
    """Extract Python code from a model response.

    Args:
        text: Raw text returned by the model.

    Returns:
        The contents of the first markdown code fence (language tag and
        surrounding whitespace removed) when one is present and non-empty;
        otherwise the whole response with surrounding whitespace stripped.
    """
    text = text.strip()
    fence_match = _FENCE_RE.search(text)
    if fence_match:
        fenced = fence_match.group(1).strip()
        # An empty capture means a stray fence with nothing after it; fall
        # back to the full text rather than discarding the response.
        if fenced:
            return fenced
    return text


def run_prompt(model: str, prompt: str) -> str:
    """Send *prompt* to the Ollama generate endpoint and return the reply text.

    Args:
        model: Name of the model to query.
        prompt: Prompt text to send.

    Returns:
        The ``"response"`` field of the JSON body returned by the server.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status (via ``raise_for_status``).
        KeyError: If the JSON body has no ``"response"`` field.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 512},
    }
    resp = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=120)
    resp.raise_for_status()
    return resp.json()["response"]


def execute_fibonacci(code: str) -> tuple[bool, str]:
    """Execute the generated fibonacci code and check fib(10) == 55.

    The code is written to a temporary script together with a small harness
    that prints ``fibonacci(10)``, then run in a separate interpreter process.

    NOTE(security): this runs model-generated code with the privileges of the
    current user. Only use it in an environment where that is acceptable.

    Args:
        code: Python source expected to define ``fibonacci(n)``.

    Returns:
        A ``(passed, detail)`` tuple; ``detail`` is a human-readable
        description of the outcome.
    """
    test_code = code + "\n\nresult = fibonacci(10)\nprint(result)\n"
    tmpfile = None
    try:
        # Python reads source files as UTF-8 by default, so write the script
        # as UTF-8 explicitly instead of relying on the locale encoding.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", delete=False, encoding="utf-8"
        ) as f:
            tmpfile = f.name
            f.write(test_code)

        proc = subprocess.run(
            [sys.executable, tmpfile],
            capture_output=True,
            text=True,
            timeout=10,
            # Generated code that reads stdin gets EOF immediately instead of
            # blocking on the parent's terminal until the timeout fires.
            stdin=subprocess.DEVNULL,
        )
        output = proc.stdout.strip()
        if proc.returncode != 0:
            return False, f"Runtime error: {proc.stderr.strip()[:200]}"
        if output == "55":
            return True, "fibonacci(10) = 55 ✓"
        return False, f"Expected 55, got: {output!r}"
    except subprocess.TimeoutExpired:
        return False, "Execution timed out"
    except Exception as exc:
        # Best-effort boundary: any other failure is reported as a failed run.
        return False, f"Execution error: {exc}"
    finally:
        # The file is created with delete=False so the child process can open
        # it; remove it here whether or not the run succeeded.
        if tmpfile is not None:
            Path(tmpfile).unlink(missing_ok=True)


def run_benchmark(model: str) -> dict:
    """Run code generation benchmark for a single model.

    Args:
        model: Name of the model to benchmark.

    Returns:
        A result dict with ``benchmark``, ``model``, ``passed`` and
        ``elapsed_s``; on success paths also ``detail`` and ``code_snippet``
        (first 300 characters of the extracted code), on failure to obtain or
        process a response an ``error`` message instead.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()
    result: dict = {"benchmark": "code_generation", "model": model}
    try:
        raw = run_prompt(model, CODEGEN_PROMPT)
        code = extract_python(raw)
        correct, detail = execute_fibonacci(code)
    except Exception as exc:
        # Top-level boundary: a benchmark run always yields a result record.
        result.update(passed=False, error=str(exc))
    else:
        result.update(passed=correct, detail=detail, code_snippet=code[:300])
    result["elapsed_s"] = round(time.perf_counter() - start, 2)
    return result


def main() -> int:
    """Run the benchmark from the command line and return the exit status.

    The model name is taken from the first command-line argument, defaulting
    to ``hermes3:8b``. Returns 0 when the benchmark passed, 1 otherwise.
    """
    model = sys.argv[1] if len(sys.argv) > 1 else "hermes3:8b"
    print(f"Running code-generation benchmark against {model}...")
    result = run_benchmark(model)
    print(json.dumps(result, indent=2))
    return 0 if result["passed"] else 1


if __name__ == "__main__":
    sys.exit(main())