[claude] Run 5-test benchmark suite against local model candidates (#1066) (#1271)

2026-03-24 01:38:59 +00:00
parent 1cce28d1bb
commit 7dfbf05867
7 changed files with 2399 additions and 0 deletions
--- a/scripts/benchmarks/02_code_generation.py
+++ b/scripts/benchmarks/02_code_generation.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+"""Benchmark 2: Code Generation Correctness
+
+Ask the model to generate a fibonacci function, execute it, and verify that fibonacci(10) == 55.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+
+import requests
+
+# Base URL of the Ollama HTTP server; run_prompt() posts to its /api/generate path.
+OLLAMA_URL = "http://localhost:11434"
+
+# Prompt sent verbatim to the model. It pins the function name and the
+# 0-indexed convention so that the harness can call fibonacci(10) and expect 55.
+# The trailing backslashes join lines inside the literal without a newline.
+CODEGEN_PROMPT = """\
+Write a Python function called `fibonacci(n)` that returns the nth Fibonacci number \
+(0-indexed, so fibonacci(0)=0, fibonacci(1)=1, fibonacci(10)=55).
+
+Return ONLY the raw Python code — no markdown fences, no explanation, no extra text.
+The function must be named exactly `fibonacci`.
+"""
+
+
+def extract_python(text: str) -> str:
+    """Extract Python source code from a raw model response.
+
+    Models often wrap code in markdown fences even when told not to. This
+    helper returns the contents of the first fenced block if there is one,
+    otherwise the stripped response unchanged.
+
+    Handled shapes:
+      * a closed fence with any (or no) language tag on the opening line,
+        e.g. "```python", "```py", "```python3" or a bare "```";
+      * a single-line fence such as "```def f(): ...```";
+      * an opening fence that is never closed (e.g. the reply was cut off by
+        the token limit) -- everything after the opening line is returned.
+
+    Args:
+        text: The raw text returned by the model.
+
+    Returns:
+        The extracted code with surrounding whitespace removed.
+    """
+    text = text.strip()
+
+    # Closed fence. The first alternative consumes a whole info line (any
+    # language tag followed by a newline) so tags such as "py" do not end up
+    # inside the code; the second keeps the old handling of "```python code```".
+    fenced = re.search(
+        r"```(?:[\w+-]*[ \t]*\r?\n|python\s*)?(.*?)```", text, re.DOTALL
+    )
+    if fenced:
+        return fenced.group(1).strip()
+
+    # Unclosed fence: drop the opening line and keep the rest.
+    opener = re.search(r"```[\w+-]*[ \t]*\r?\n", text)
+    if opener:
+        return text[opener.end():].strip()
+
+    return text
+
+
+def run_prompt(model: str, prompt: str) -> str:
+    """Send ``prompt`` to ``model`` through the Ollama generate endpoint.
+
+    The request is non-streaming, uses temperature 0.1, caps generation at
+    512 tokens and times out after 120 seconds.
+
+    Args:
+        model: Name of the model to query.
+        prompt: Prompt text to send.
+
+    Returns:
+        The value of the "response" field of the JSON reply.
+
+    Raises:
+        Whatever ``requests`` raises for connection problems, timeouts or an
+        HTTP error status (via ``raise_for_status``); a lookup error if the
+        reply has no "response" field.
+    """
+    endpoint = f"{OLLAMA_URL}/api/generate"
+    generation_options = {"temperature": 0.1, "num_predict": 512}
+    request_body = {
+        "model": model,
+        "prompt": prompt,
+        "stream": False,
+        "options": generation_options,
+    }
+
+    response = requests.post(endpoint, json=request_body, timeout=120)
+    response.raise_for_status()
+    reply = response.json()
+    return reply["response"]
+
+
+def execute_fibonacci(code: str) -> tuple[bool, str]:
+    """Run generated code in a subprocess and check that fibonacci(10) is 55.
+
+    The code is written to a temporary ``.py`` file together with a small
+    harness that prints ``fibonacci(10)``, then run with the current
+    interpreter under a 10 second timeout. The temporary file is always
+    removed afterwards.
+
+    Args:
+        code: Python source expected to define ``fibonacci(n)``.
+
+    Returns:
+        ``(passed, detail)`` where ``passed`` is True only if the process
+        exits with status 0 and the last line it prints is ``55``;
+        ``detail`` is a human-readable explanation. Failures (non-zero exit,
+        wrong output, timeout, any other error) are reported through the
+        tuple rather than raised.
+    """
+    # The harness print comes last, so the final stdout line is our result
+    # even if the generated code prints examples of its own.
+    test_code = code + "\n\nresult = fibonacci(10)\nprint(result)\n"
+
+    tmp_path = None
+    try:
+        # Python reads source files as UTF-8 by default, so write UTF-8
+        # explicitly instead of relying on the locale encoding.
+        with tempfile.NamedTemporaryFile(
+            mode="w", suffix=".py", delete=False, encoding="utf-8"
+        ) as f:
+            tmp_path = Path(f.name)
+            f.write(test_code)
+
+        # NOTE(security): this executes model-generated code with the
+        # privileges of the current process; only the timeout bounds it.
+        proc = subprocess.run(
+            [sys.executable, str(tmp_path)],
+            capture_output=True,
+            text=True,
+            errors="replace",  # never fail on undecodable child output
+            timeout=10,
+        )
+        if proc.returncode != 0:
+            return False, f"Runtime error: {proc.stderr.strip()[:200]}"
+
+        output = proc.stdout.strip()
+        lines = output.splitlines()
+        if lines and lines[-1].strip() == "55":
+            return True, "fibonacci(10) = 55 ✓"
+        return False, f"Expected 55, got: {output!r}"
+    except subprocess.TimeoutExpired:
+        return False, "Execution timed out"
+    except Exception as exc:
+        return False, f"Execution error: {exc}"
+    finally:
+        # tmp_path stays None if the file could not even be created.
+        if tmp_path is not None:
+            tmp_path.unlink(missing_ok=True)
+
+
+def run_benchmark(model: str) -> dict:
+    """Run the code-generation benchmark for a single model.
+
+    Prompts the model, extracts the code from its reply, executes it and
+    records whether ``fibonacci(10)`` produced 55.
+
+    Args:
+        model: Name of the model to benchmark.
+
+    Returns:
+        A dict that always contains "benchmark", "model", "passed" and
+        "elapsed_s" (seconds, rounded to 2 decimals). On success of the
+        pipeline it also contains "detail" and "code_snippet" (the first 300
+        characters of the extracted code); if any step raised, it contains
+        "error" with the exception text and "passed" is False.
+    """
+    # perf_counter is monotonic, so the measured duration cannot be skewed
+    # by system clock adjustments the way time.time() differences can.
+    start = time.perf_counter()
+    result = {"benchmark": "code_generation", "model": model}
+
+    try:
+        raw = run_prompt(model, CODEGEN_PROMPT)
+        code = extract_python(raw)
+        correct, detail = execute_fibonacci(code)
+    except Exception as exc:
+        # Any failure (network, parsing, execution) is reported, not raised,
+        # so a single broken model does not abort the caller.
+        result.update(passed=False, error=str(exc))
+    else:
+        result.update(passed=correct, detail=detail, code_snippet=code[:300])
+
+    result["elapsed_s"] = round(time.perf_counter() - start, 2)
+    return result
+
+
+if __name__ == "__main__":
+    # The first CLI argument selects the model; fall back to the default.
+    if len(sys.argv) > 1:
+        model = sys.argv[1]
+    else:
+        model = "hermes3:8b"
+
+    print(f"Running code-generation benchmark against {model}...")
+    result = run_benchmark(model)
+    print(json.dumps(result, indent=2))
+
+    # Exit status mirrors the benchmark outcome: 0 on pass, 1 on failure.
+    exit_status = 0 if result["passed"] else 1
+    sys.exit(exit_status)