# forked from Rockachopa/Timmy-time-dashboard
#!/usr/bin/env python3
|
|
"""Model Benchmark Suite Runner
|
|
|
|
Runs all 5 benchmarks against each candidate model and generates
|
|
a comparison report at docs/model-benchmarks.md.
|
|
|
|
Usage:
|
|
python scripts/benchmarks/run_suite.py
|
|
python scripts/benchmarks/run_suite.py --models hermes3:8b qwen3.5:latest
|
|
python scripts/benchmarks/run_suite.py --output docs/model-benchmarks.md
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import importlib.util
|
|
import json
|
|
import sys
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
# Base URL of the local Ollama HTTP API queried by this runner.
OLLAMA_URL = "http://localhost:11434"


# Candidate Ollama model tags to benchmark.
# Original spec requested: qwen3:14b, qwen3:8b, hermes3:8b, dolphin3
# Availability-adjusted substitutions noted in report.
DEFAULT_MODELS = [
    "hermes3:8b",
    "qwen3.5:latest",
    "qwen2.5:14b",
    "llama3.2:latest",
]

# Directory containing the numbered benchmark scripts (this file's directory).
BENCHMARKS_DIR = Path(__file__).parent
# Repository-level docs/ directory — three levels up from scripts/benchmarks/.
DOCS_DIR = Path(__file__).resolve().parent.parent.parent / "docs"
|
|
|
|
|
|
def load_benchmark(name: str):
    """Dynamically import a benchmark module from BENCHMARKS_DIR.

    Args:
        name: Benchmark file name, e.g. "01_tool_calling.py".

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: If no import spec/loader could be created for the path
            (e.g. the file does not exist).
    """
    path = BENCHMARKS_DIR / name
    module_name = Path(name).stem
    spec = importlib.util.spec_from_file_location(module_name, path)
    # spec_from_file_location returns None (or a spec with no loader) when the
    # path is not importable; fail with a clear error instead of the opaque
    # AttributeError the original code would raise on `spec.loader`.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create import spec for benchmark: {path}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
|
|
|
|
|
|
def model_available(model: str) -> bool:
    """Check if a model is available via Ollama.

    Queries the /api/tags endpoint and looks for an exact tag match among
    the locally pulled models.

    Args:
        model: Ollama model tag, e.g. "hermes3:8b".

    Returns:
        True when Ollama responds with HTTP 200 and lists the model.
    """
    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=10)
        if resp.status_code != 200:
            return False
        models = {m["name"] for m in resp.json().get("models", [])}
        return model in models
    # Narrowed from a bare `except Exception`: treat only network failures
    # and malformed JSON/payloads as "unavailable" rather than masking
    # genuine programming errors.
    except (requests.RequestException, ValueError, KeyError, TypeError):
        return False
|
|
|
|
|
|
def run_all_benchmarks(model: str) -> dict:
    """Run all 5 benchmarks for a given model.

    Each benchmark module is loaded dynamically and its entry point invoked;
    a failure in one benchmark is recorded and does not abort the rest.

    Args:
        model: Ollama model tag to benchmark.

    Returns:
        Mapping of benchmark key (file stem, e.g. "01_tool_calling") to that
        benchmark's result dict.
    """
    # Filename -> entry-point function name.  The original if/elif chain had
    # five branches, four of them identical; benchmark 04 is the only module
    # exposing run_multi_turn instead of run_benchmark.
    entry_points = {
        "01_tool_calling.py": "run_benchmark",
        "02_code_generation.py": "run_benchmark",
        "03_shell_commands.py": "run_benchmark",
        "04_multi_turn_coherence.py": "run_multi_turn",
        "05_issue_triage.py": "run_benchmark",
    }

    results = {}
    for fname, entry in entry_points.items():
        key = fname.replace(".py", "")
        print(f" [{model}] Running {key}...", flush=True)
        try:
            mod = load_benchmark(fname)
            start = time.time()
            result = getattr(mod, entry)(model)
            elapsed = time.time() - start
            print(
                f" -> {'PASS' if result.get('passed') else 'FAIL'} ({elapsed:.1f}s)",
                flush=True,
            )
            results[key] = result
        except Exception as exc:
            # Record the failure so the report still covers this benchmark.
            print(f" -> ERROR: {exc}", flush=True)
            results[key] = {"benchmark": key, "model": model, "passed": False, "error": str(exc)}

    return results
|
|
|
|
|
|
def score_model(results: dict) -> dict:
    """Compute summary scores for a model.

    Args:
        results: Mapping of benchmark key to that benchmark's result dict.

    Returns:
        Dict of display-ready summary fields (counts, pass/fail strings,
        percentage strings, and total elapsed seconds).
    """

    def metric(key: str, field: str, default):
        # Safe two-level lookup into the per-benchmark result dicts.
        return results.get(key, {}).get(field, default)

    def pct(value: float) -> str:
        return f"{value:.0%}"

    def verdict(flag: bool) -> str:
        return "PASS" if flag else "FAIL"

    outcomes = list(results.values())
    n_total = len(outcomes)
    n_passed = sum(1 for outcome in outcomes if outcome.get("passed", False))

    # Prefer the benchmark's own total_time_s; fall back to elapsed_s.
    elapsed_total = 0.0
    for outcome in outcomes:
        elapsed_total += outcome.get("total_time_s", outcome.get("elapsed_s", 0.0))

    return {
        "passed": n_passed,
        "total": n_total,
        "pass_rate": f"{n_passed}/{n_total}",
        "tool_compliance": pct(metric("01_tool_calling", "compliance_rate", 0.0)),
        "code_gen": verdict(metric("02_code_generation", "passed", False)),
        "shell_gen": verdict(metric("03_shell_commands", "passed", False)),
        "coherence": pct(metric("04_multi_turn_coherence", "coherence_rate", 0.0)),
        "triage_accuracy": pct(metric("05_issue_triage", "accuracy", 0.0)),
        "total_time_s": round(elapsed_total, 1),
    }
|
|
|
|
|
|
def generate_markdown(all_results: dict, run_date: str) -> str:
    """Generate the markdown comparison report.

    Args:
        all_results: Mapping of model tag to its per-benchmark results, or
            an {"error": str, "skipped": True} dict for unavailable models.
        run_date: Human-readable UTC timestamp for the report header.

    Returns:
        The complete markdown document as a single string.
    """
    lines = []
    lines.append("# Model Benchmark Results")
    lines.append("")
    lines.append(f"> Generated: {run_date} ")
    lines.append(f"> Ollama URL: `{OLLAMA_URL}` ")
    lines.append("> Issue: [#1066](http://143.198.27.163:3000/rockachopa/Timmy-time-dashboard/issues/1066)")
    lines.append("")
    lines.append("## Overview")
    lines.append("")
    lines.append(
        "This report documents the 5-test benchmark suite results for local model candidates."
    )
    lines.append("")
    lines.append("### Model Availability vs. Spec")
    lines.append("")
    lines.append("| Requested | Tested Substitute | Reason |")
    lines.append("|-----------|-------------------|--------|")
    lines.append("| `qwen3:14b` | `qwen2.5:14b` | `qwen3:14b` not pulled locally |")
    lines.append("| `qwen3:8b` | `qwen3.5:latest` | `qwen3:8b` not pulled locally |")
    lines.append("| `hermes3:8b` | `hermes3:8b` | Exact match |")
    lines.append("| `dolphin3` | `llama3.2:latest` | `dolphin3` not pulled locally |")
    lines.append("")

    # Summary table
    lines.append("## Summary Comparison Table")
    lines.append("")
    lines.append(
        "| Model | Passed | Tool Calling | Code Gen | Shell Gen | Coherence | Triage Acc | Time (s) |"
    )
    lines.append(
        "|-------|--------|-------------|----------|-----------|-----------|------------|----------|"
    )

    for model, results in all_results.items():
        # Skipped/unavailable models carry only "error"/"skipped" keys.
        if "error" in results and "01_tool_calling" not in results:
            lines.append(f"| `{model}` | — | — | — | — | — | — | — |")
            continue
        s = score_model(results)
        lines.append(
            f"| `{model}` | {s['pass_rate']} | {s['tool_compliance']} | {s['code_gen']} | "
            f"{s['shell_gen']} | {s['coherence']} | {s['triage_accuracy']} | {s['total_time_s']} |"
        )

    lines.append("")

    # Per-model detail sections
    lines.append("## Per-Model Detail")
    lines.append("")

    for model, results in all_results.items():
        lines.append(f"### `{model}`")
        lines.append("")

        # BUG FIX: the original guard was
        #   "error" in results and not isinstance(results.get("error"), str)
        # which never matched skipped models (their "error" value IS a
        # string), so execution fell into the benchmark loop below and
        # crashed calling .get() on that string.  Use the same guard as the
        # summary table above: an "error" key with no benchmark results.
        if "error" in results and "01_tool_calling" not in results:
            lines.append(f"> **Error:** {results.get('error')}")
            lines.append("")
            continue

        for bkey, bres in results.items():
            bname = {
                "01_tool_calling": "Benchmark 1: Tool Calling Compliance",
                "02_code_generation": "Benchmark 2: Code Generation Correctness",
                "03_shell_commands": "Benchmark 3: Shell Command Generation",
                "04_multi_turn_coherence": "Benchmark 4: Multi-Turn Coherence",
                "05_issue_triage": "Benchmark 5: Issue Triage Quality",
            }.get(bkey, bkey)

            status = "✅ PASS" if bres.get("passed") else "❌ FAIL"
            lines.append(f"#### {bname} — {status}")
            lines.append("")

            # Per-benchmark metric lines, keyed off the benchmark's schema.
            if bkey == "01_tool_calling":
                rate = bres.get("compliance_rate", 0)
                count = bres.get("valid_json_count", 0)
                total = bres.get("total_prompts", 0)
                lines.append(
                    f"- **JSON Compliance:** {count}/{total} ({rate:.0%}) — target ≥90%"
                )
            elif bkey == "02_code_generation":
                lines.append(f"- **Result:** {bres.get('detail', bres.get('error', 'n/a'))}")
                snippet = bres.get("code_snippet", "")
                if snippet:
                    # Only the first 8 lines, to keep the report compact.
                    lines.append("- **Generated code snippet:**")
                    lines.append(" ```python")
                    for ln in snippet.splitlines()[:8]:
                        lines.append(f" {ln}")
                    lines.append(" ```")
            elif bkey == "03_shell_commands":
                passed = bres.get("passed_count", 0)
                refused = bres.get("refused_count", 0)
                total = bres.get("total_prompts", 0)
                lines.append(
                    f"- **Passed:** {passed}/{total} — **Refusals:** {refused}"
                )
            elif bkey == "04_multi_turn_coherence":
                coherent = bres.get("coherent_turns", 0)
                total = bres.get("total_turns", 0)
                rate = bres.get("coherence_rate", 0)
                lines.append(
                    f"- **Coherent turns:** {coherent}/{total} ({rate:.0%}) — target ≥80%"
                )
            elif bkey == "05_issue_triage":
                exact = bres.get("exact_matches", 0)
                total = bres.get("total_issues", 0)
                acc = bres.get("accuracy", 0)
                lines.append(
                    f"- **Accuracy:** {exact}/{total} ({acc:.0%}) — target ≥80%"
                )

            elapsed = bres.get("total_time_s", bres.get("elapsed_s", 0))
            lines.append(f"- **Time:** {elapsed}s")
            lines.append("")

    lines.append("## Raw JSON Data")
    lines.append("")
    lines.append("<details>")
    lines.append("<summary>Click to expand full JSON results</summary>")
    lines.append("")
    lines.append("```json")
    lines.append(json.dumps(all_results, indent=2))
    lines.append("```")
    lines.append("")
    lines.append("</details>")
    lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the benchmark suite runner."""
    parser = argparse.ArgumentParser(description="Run model benchmark suite")
    # Which models to benchmark; defaults to the availability-adjusted set.
    parser.add_argument("--models", nargs="+", default=DEFAULT_MODELS, help="Models to test")
    # Where to write the markdown comparison report.
    parser.add_argument(
        "--output", type=Path, default=DOCS_DIR / "model-benchmarks.md", help="Output markdown file"
    )
    # Optional machine-readable dump of the raw results.
    parser.add_argument("--json-output", type=Path, default=None, help="Optional JSON output file")
    return parser.parse_args()
|
|
|
|
|
|
def main() -> int:
    """Run the benchmark suite end-to-end and write the report(s).

    Returns:
        0 when every tested model passed every benchmark; 1 otherwise
        (including when any requested model was skipped as unavailable).
    """
    args = parse_args()
    run_date = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    print(f"Model Benchmark Suite — {run_date}")
    print(f"Testing {len(args.models)} model(s): {', '.join(args.models)}")
    print()

    all_results: dict[str, dict] = {}

    for model in args.models:
        print(f"=== Testing model: {model} ===")
        if not model_available(model):
            # Record the skip so it still appears in the report tables.
            print(f" WARNING: {model} not available in Ollama — skipping")
            all_results[model] = {"error": f"Model {model} not available", "skipped": True}
            print()
            continue

        model_results = run_all_benchmarks(model)
        all_results[model] = model_results

        s = score_model(model_results)
        print(f" Summary: {s['pass_rate']} benchmarks passed in {s['total_time_s']}s")
        print()

    # Generate and write markdown report
    markdown = generate_markdown(all_results, run_date)

    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(markdown, encoding="utf-8")
    print(f"Report written to: {args.output}")

    if args.json_output:
        # Robustness fix: the original created the directory only for the
        # markdown output, so a --json-output path in a fresh directory
        # crashed with FileNotFoundError.  Mirror the mkdir here.
        args.json_output.parent.mkdir(parents=True, exist_ok=True)
        args.json_output.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
        print(f"JSON data written to: {args.json_output}")

    # Overall pass/fail: a model counts only if it actually ran (not
    # skipped) and every benchmark result dict reports passed=True.
    all_pass = all(
        not r.get("skipped", False)
        and all(b.get("passed", False) for b in r.values() if isinstance(b, dict))
        for r in all_results.values()
    )
    return 0 if all_pass else 1
|
|
|
|
|
|
# Script entry point: propagate main()'s status code (0 = all benchmarks
# passed) as the process exit code for CI use.
if __name__ == "__main__":
    sys.exit(main())
|