Files
Timmy-time-dashboard/scripts/benchmarks/run_suite.py
Claude (Opus 4.6) 7dfbf05867
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled
[claude] Run 5-test benchmark suite against local model candidates (#1066) (#1271)
2026-03-24 01:38:59 +00:00

335 lines
12 KiB
Python

#!/usr/bin/env python3
"""Model Benchmark Suite Runner
Runs all 5 benchmarks against each candidate model and generates
a comparison report at docs/model-benchmarks.md.
Usage:
python scripts/benchmarks/run_suite.py
python scripts/benchmarks/run_suite.py --models hermes3:8b qwen3.5:latest
python scripts/benchmarks/run_suite.py --output docs/model-benchmarks.md
"""
from __future__ import annotations
import argparse
import importlib.util
import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import requests
# Base URL of the local Ollama HTTP API used for the availability check.
OLLAMA_URL = "http://localhost:11434"

# Ollama model tags benchmarked when --models is not given on the command line.
# The original spec requested qwen3:14b, qwen3:8b, hermes3:8b and dolphin3;
# the availability-adjusted substitutions are noted in the generated report.
DEFAULT_MODELS = [
    "hermes3:8b",
    "qwen3.5:latest",
    "qwen2.5:14b",
    "llama3.2:latest",
]

_THIS_FILE = Path(__file__)
# Directory holding this script; benchmark modules are loaded from here.
BENCHMARKS_DIR = _THIS_FILE.parent
# "docs" directory three levels above this file's resolved location.
DOCS_DIR = _THIS_FILE.resolve().parent.parent.parent / "docs"
def load_benchmark(name: str):
    """Dynamically import a benchmark module from ``BENCHMARKS_DIR``.

    Args:
        name: File name of the benchmark script, relative to
            ``BENCHMARKS_DIR`` (for example ``"01_tool_calling.py"``).

    Returns:
        The executed module object. The module is not registered in
        ``sys.modules``, so each call executes the file again.

    Raises:
        ImportError: If no import spec or loader can be created for the file
            (for example, the name has no recognised Python suffix).
        Exception: Anything raised while the benchmark file is being
            executed, including ``OSError`` if the file cannot be read.
    """
    path = BENCHMARKS_DIR / name
    module_name = Path(name).stem
    spec = importlib.util.spec_from_file_location(module_name, path)
    # spec_from_file_location() returns None when it cannot choose a loader
    # for the path; fail with a clear message instead of an AttributeError.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load benchmark module {name!r} from {path}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
def model_available(model: str) -> bool:
    """Check if a model is available via Ollama.

    Queries ``{OLLAMA_URL}/api/tags`` and looks for *model* among the
    ``name`` fields of the returned ``models`` list.

    Args:
        model: Ollama model name, with or without a tag. A name without a
            tag is also matched as ``<name>:latest``, which is how Ollama
            resolves untagged names.

    Returns:
        True if the model is listed; False if it is not listed, the server
        answers with a non-200 status, the request fails, or the response
        payload does not have the expected shape.
    """
    # Only the last path segment can carry a tag; a ":" earlier in the name
    # would belong to a registry host:port prefix.
    has_tag = ":" in model.rsplit("/", 1)[-1]
    wanted = model if has_tag else f"{model}:latest"
    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=10)
        if resp.status_code != 200:
            return False
        installed = {m["name"] for m in resp.json().get("models", [])}
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError):
        # Best-effort probe: an unreachable server or a malformed payload
        # simply means the model cannot be confirmed as available.
        return False
    return model in installed or wanted in installed
def run_all_benchmarks(model: str) -> dict:
    """Run all 5 benchmarks for a given model.

    Each benchmark script is loaded with ``load_benchmark`` and its entry
    point is called with *model*. Progress is printed as the suite runs.

    Args:
        model: Ollama model tag passed to every benchmark.

    Returns:
        Dict mapping the benchmark key (script name without ``.py``) to the
        dict returned by that benchmark. If loading or running a benchmark
        raises, its entry is
        ``{"benchmark": key, "model": model, "passed": False, "error": str(exc)}``.
    """
    # Benchmark script -> name of the entry-point function it exposes. Only
    # the multi-turn benchmark uses a different entry point.
    entry_points = {
        "01_tool_calling.py": "run_benchmark",
        "02_code_generation.py": "run_benchmark",
        "03_shell_commands.py": "run_benchmark",
        "04_multi_turn_coherence.py": "run_multi_turn",
        "05_issue_triage.py": "run_benchmark",
    }
    results = {}
    for fname, entry_point in entry_points.items():
        key = fname.replace(".py", "")
        print(f" [{model}] Running {key}...", flush=True)
        try:
            mod = load_benchmark(fname)
            runner = getattr(mod, entry_point)
            # perf_counter() is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments during a long benchmark.
            start = time.perf_counter()
            result = runner(model)
            elapsed = time.perf_counter() - start
            print(
                f" -> {'PASS' if result.get('passed') else 'FAIL'} ({elapsed:.1f}s)",
                flush=True,
            )
            results[key] = result
        except Exception as exc:
            # Deliberately broad: one failing benchmark must not abort the
            # rest of the suite; the failure is recorded in the results.
            print(f" -> ERROR: {exc}", flush=True)
            results[key] = {"benchmark": key, "model": model, "passed": False, "error": str(exc)}
    return results
def _metric(entry: dict, key: str) -> float:
    """Return ``entry[key]``, treating a missing or ``None`` value as 0.0."""
    value = entry.get(key)
    return 0.0 if value is None else value


def _elapsed_seconds(entry: dict) -> float:
    """Return ``total_time_s`` if set, else ``elapsed_s``, else 0.0."""
    for key in ("total_time_s", "elapsed_s"):
        value = entry.get(key)
        if value is not None:
            return value
    return 0.0


def score_model(results: dict) -> dict:
    """Compute summary scores for a model.

    Args:
        results: Maps benchmark keys (``"01_tool_calling"`` ...
            ``"05_issue_triage"``) to result dicts. Values that are not
            dicts (such as the fields of an error/skipped entry) are ignored
            rather than causing a crash.

    Returns:
        Dict with the keys ``passed``, ``total``, ``pass_rate``,
        ``tool_compliance``, ``code_gen``, ``shell_gen``, ``coherence``,
        ``triage_accuracy`` and ``total_time_s``. Rates are formatted as
        whole percentages; missing or ``None`` metrics count as zero.
    """
    benchmarks = [b for b in results.values() if isinstance(b, dict)]
    passed = sum(1 for b in benchmarks if b.get("passed", False))
    total = len(benchmarks)

    def section(key: str) -> dict:
        # A missing or malformed benchmark entry behaves like an empty result.
        entry = results.get(key)
        return entry if isinstance(entry, dict) else {}

    # Specific metrics
    tool_rate = _metric(section("01_tool_calling"), "compliance_rate")
    code_pass = section("02_code_generation").get("passed", False)
    shell_pass = section("03_shell_commands").get("passed", False)
    coherence = _metric(section("04_multi_turn_coherence"), "coherence_rate")
    triage_acc = _metric(section("05_issue_triage"), "accuracy")
    total_time = sum(_elapsed_seconds(b) for b in benchmarks)

    return {
        "passed": passed,
        "total": total,
        "pass_rate": f"{passed}/{total}",
        "tool_compliance": f"{tool_rate:.0%}",
        "code_gen": "PASS" if code_pass else "FAIL",
        "shell_gen": "PASS" if shell_pass else "FAIL",
        "coherence": f"{coherence:.0%}",
        "triage_accuracy": f"{triage_acc:.0%}",
        "total_time_s": round(total_time, 1),
    }
# Section titles for the per-model detail part of the report.
_BENCHMARK_TITLES = {
    "01_tool_calling": "Benchmark 1: Tool Calling Compliance",
    "02_code_generation": "Benchmark 2: Code Generation Correctness",
    "03_shell_commands": "Benchmark 3: Shell Command Generation",
    "04_multi_turn_coherence": "Benchmark 4: Multi-Turn Coherence",
    "05_issue_triage": "Benchmark 5: Issue Triage Quality",
}


def _is_model_level_error(results: dict) -> bool:
    """Return True when *results* is an error/skipped entry, not benchmark data."""
    return "error" in results and "01_tool_calling" not in results


def _benchmark_detail_lines(bkey: str, bres: dict) -> list:
    """Return the metric bullet lines for one benchmark's result dict."""
    if bkey == "01_tool_calling":
        # "or 0" keeps the percentage format from failing on a None rate.
        rate = bres.get("compliance_rate") or 0
        count = bres.get("valid_json_count", 0)
        total = bres.get("total_prompts", 0)
        return [f"- **JSON Compliance:** {count}/{total} ({rate:.0%}) — target ≥90%"]
    if bkey == "02_code_generation":
        detail = [f"- **Result:** {bres.get('detail', bres.get('error', 'n/a'))}"]
        snippet = bres.get("code_snippet", "")
        if snippet:
            detail.append("- **Generated code snippet:**")
            detail.append(" ```python")
            # Only the first 8 lines are shown to keep the report compact.
            detail.extend(f" {ln}" for ln in snippet.splitlines()[:8])
            detail.append(" ```")
        return detail
    if bkey == "03_shell_commands":
        passed = bres.get("passed_count", 0)
        refused = bres.get("refused_count", 0)
        total = bres.get("total_prompts", 0)
        return [f"- **Passed:** {passed}/{total} — **Refusals:** {refused}"]
    if bkey == "04_multi_turn_coherence":
        coherent = bres.get("coherent_turns", 0)
        total = bres.get("total_turns", 0)
        rate = bres.get("coherence_rate") or 0
        return [f"- **Coherent turns:** {coherent}/{total} ({rate:.0%}) — target ≥80%"]
    if bkey == "05_issue_triage":
        exact = bres.get("exact_matches", 0)
        total = bres.get("total_issues", 0)
        acc = bres.get("accuracy") or 0
        return [f"- **Accuracy:** {exact}/{total} ({acc:.0%}) — target ≥80%"]
    return []


def generate_markdown(all_results: dict, run_date: str) -> str:
    """Generate markdown comparison report.

    The report contains a header, a fixed model-availability table, a summary
    table with one row per model, a detail section per model, and the raw
    results as JSON.

    Args:
        all_results: Maps each model tag to either its benchmark results
            (benchmark key -> result dict) or a model-level error entry such
            as ``{"error": "...", "skipped": True}``.
        run_date: Timestamp text shown in the report header.

    Returns:
        The complete report as a single newline-joined string.
    """
    lines = [
        "# Model Benchmark Results",
        "",
        f"> Generated: {run_date} ",
        f"> Ollama URL: `{OLLAMA_URL}` ",
        "> Issue: [#1066](http://143.198.27.163:3000/rockachopa/Timmy-time-dashboard/issues/1066)",
        "",
        "## Overview",
        "",
        "This report documents the 5-test benchmark suite results for local model candidates.",
        "",
        "### Model Availability vs. Spec",
        "",
        "| Requested | Tested Substitute | Reason |",
        "|-----------|-------------------|--------|",
        "| `qwen3:14b` | `qwen2.5:14b` | `qwen3:14b` not pulled locally |",
        "| `qwen3:8b` | `qwen3.5:latest` | `qwen3:8b` not pulled locally |",
        "| `hermes3:8b` | `hermes3:8b` | Exact match |",
        "| `dolphin3` | `llama3.2:latest` | `dolphin3` not pulled locally |",
        "",
    ]

    # Summary table
    lines.append("## Summary Comparison Table")
    lines.append("")
    lines.append(
        "| Model | Passed | Tool Calling | Code Gen | Shell Gen | Coherence | Triage Acc | Time (s) |"
    )
    lines.append(
        "|-------|--------|-------------|----------|-----------|-----------|------------|----------|"
    )
    for model, results in all_results.items():
        if _is_model_level_error(results):
            lines.append(f"| `{model}` | — | — | — | — | — | — | — |")
            continue
        s = score_model(results)
        lines.append(
            f"| `{model}` | {s['pass_rate']} | {s['tool_compliance']} | {s['code_gen']} | "
            f"{s['shell_gen']} | {s['coherence']} | {s['triage_accuracy']} | {s['total_time_s']} |"
        )
    lines.append("")

    # Per-model detail sections
    lines.append("## Per-Model Detail")
    lines.append("")
    for model, results in all_results.items():
        lines.append(f"### `{model}`")
        lines.append("")
        # Skipped/errored models carry only an error message; report it and
        # move on instead of treating its fields as benchmark results.
        if _is_model_level_error(results):
            lines.append(f"> **Error:** {results.get('error')}")
            lines.append("")
            continue
        for bkey, bres in results.items():
            if not isinstance(bres, dict):
                # Not a benchmark result dict; nothing to render for it.
                continue
            bname = _BENCHMARK_TITLES.get(bkey, bkey)
            status = "✅ PASS" if bres.get("passed") else "❌ FAIL"
            lines.append(f"#### {bname} — {status}")
            lines.append("")
            lines.extend(_benchmark_detail_lines(bkey, bres))
            elapsed = bres.get("total_time_s", bres.get("elapsed_s", 0))
            lines.append(f"- **Time:** {elapsed}s")
            lines.append("")

    lines.extend(
        [
            "## Raw JSON Data",
            "",
            "<details>",
            "<summary>Click to expand full JSON results</summary>",
            "",
            "```json",
            json.dumps(all_results, indent=2),
            "```",
            "",
            "</details>",
            "",
        ]
    )
    return "\n".join(lines)
def parse_args() -> argparse.Namespace:
    """Parse the benchmark suite's command-line options.

    Returns:
        Namespace with ``models`` (one or more model tags, defaulting to
        ``DEFAULT_MODELS``), ``output`` (markdown report path, defaulting to
        ``model-benchmarks.md`` under ``DOCS_DIR``) and ``json_output``
        (optional JSON dump path, ``None`` when not given).
    """
    cli = argparse.ArgumentParser(description="Run model benchmark suite")
    # One (flag, add_argument keyword arguments) pair per supported option.
    option_specs = (
        ("--models", {"nargs": "+", "default": DEFAULT_MODELS, "help": "Models to test"}),
        (
            "--output",
            {
                "type": Path,
                "default": DOCS_DIR / "model-benchmarks.md",
                "help": "Output markdown file",
            },
        ),
        ("--json-output", {"type": Path, "default": None, "help": "Optional JSON output file"}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def main() -> int:
    """Run the benchmark suite for every requested model and write reports.

    Models not listed by Ollama are skipped and recorded with an error entry.
    The markdown report is always written; the JSON dump is written only when
    ``--json-output`` is given.

    Returns:
        Process exit code: 0 when no model was skipped and every benchmark of
        every model passed, otherwise 1.
    """
    args = parse_args()
    run_date = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    print(f"Model Benchmark Suite — {run_date}")
    print(f"Testing {len(args.models)} model(s): {', '.join(args.models)}")
    print()

    all_results: dict[str, dict] = {}
    for model in args.models:
        print(f"=== Testing model: {model} ===")
        if not model_available(model):
            print(f" WARNING: {model} not available in Ollama — skipping")
            all_results[model] = {"error": f"Model {model} not available", "skipped": True}
            print()
            continue
        model_results = run_all_benchmarks(model)
        all_results[model] = model_results
        s = score_model(model_results)
        print(f" Summary: {s['pass_rate']} benchmarks passed in {s['total_time_s']}s")
        print()

    # Generate and write markdown report
    markdown = generate_markdown(all_results, run_date)
    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(markdown, encoding="utf-8")
    print(f"Report written to: {args.output}")

    if args.json_output:
        # Create the target directory as is done for the markdown report, so
        # a path in a not-yet-existing directory does not fail the run.
        args.json_output.parent.mkdir(parents=True, exist_ok=True)
        args.json_output.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
        print(f"JSON data written to: {args.json_output}")

    # Overall pass/fail: a skipped model or any failed benchmark fails the run.
    all_pass = all(
        not r.get("skipped", False)
        and all(b.get("passed", False) for b in r.values() if isinstance(b, dict))
        for r in all_results.values()
    )
    return 0 if all_pass else 1
# Script entry point: run the suite and exit with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())