#!/usr/bin/env python3
"""Model Benchmark Suite Runner

Runs all 5 benchmarks against each candidate model and generates
a comparison report at docs/model-benchmarks.md.

Usage:
    python scripts/benchmarks/run_suite.py
    python scripts/benchmarks/run_suite.py --models hermes3:8b qwen3.5:latest
    python scripts/benchmarks/run_suite.py --output docs/model-benchmarks.md
"""

from __future__ import annotations

import argparse
import importlib.util
import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

OLLAMA_URL = "http://localhost:11434"

# Ollama model tags tested when --models is not given.
# Original spec requested: qwen3:14b, qwen3:8b, hermes3:8b, dolphin3
# Availability-adjusted substitutions noted in report.
DEFAULT_MODELS = [
    "hermes3:8b",
    "qwen3.5:latest",
    "qwen2.5:14b",
    "llama3.2:latest",
]

BENCHMARKS_DIR = Path(__file__).parent
DOCS_DIR = Path(__file__).resolve().parent.parent.parent / "docs"

# (module file, entry-point function name) for every benchmark, in run order.
# Only the multi-turn benchmark exposes a differently named entry point.
_BENCHMARKS = (
    ("01_tool_calling.py", "run_benchmark"),
    ("02_code_generation.py", "run_benchmark"),
    ("03_shell_commands.py", "run_benchmark"),
    ("04_multi_turn_coherence.py", "run_multi_turn"),
    ("05_issue_triage.py", "run_benchmark"),
)

# Human-readable section titles used in the per-model detail report.
_BENCHMARK_TITLES = {
    "01_tool_calling": "Benchmark 1: Tool Calling Compliance",
    "02_code_generation": "Benchmark 2: Code Generation Correctness",
    "03_shell_commands": "Benchmark 3: Shell Command Generation",
    "04_multi_turn_coherence": "Benchmark 4: Multi-Turn Coherence",
    "05_issue_triage": "Benchmark 5: Issue Triage Quality",
}


def load_benchmark(name: str):
    """Dynamically import a benchmark module from ``BENCHMARKS_DIR``.

    Args:
        name: File name of the benchmark module, e.g. ``"01_tool_calling.py"``.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: If no import spec/loader can be built for the file.
    """
    path = BENCHMARKS_DIR / name
    spec = importlib.util.spec_from_file_location(path.stem, path)
    # spec_from_file_location returns None when no loader matches the path.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load benchmark module from {path}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod


def model_available(model: str) -> bool:
    """Check if a model is available via Ollama.

    Queries ``{OLLAMA_URL}/api/tags`` and looks for ``model`` among the
    returned names. A name given without a tag is also matched against
    ``<name>:latest``.

    Args:
        model: Ollama model name, with or without a ``:tag`` suffix.

    Returns:
        True if the model is listed; False if it is not listed, the server
        answers with a non-200 status, or the request/response fails.
    """
    # Imported here (its only user) and outside the ``try`` below: the report
    # helpers stay importable without the HTTP client, while a missing
    # dependency still fails loudly instead of reading as "not available".
    import requests

    candidates = {model}
    if ":" not in model:
        candidates.add(f"{model}:latest")

    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=10)
        if resp.status_code != 200:
            return False
        listed = {m.get("name") for m in resp.json().get("models", [])}
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        # Best-effort probe: network failure, invalid JSON, or an unexpected
        # payload shape all mean the model cannot be confirmed.
        return False
    return not candidates.isdisjoint(listed)


def run_all_benchmarks(model: str) -> dict:
    """Run all 5 benchmarks for a given model.

    Each benchmark module is loaded and its entry point called with
    ``model``. Any exception raised while loading or running a benchmark is
    caught and recorded as a failed result so the remaining benchmarks
    still run.

    Args:
        model: Ollama model name passed to every benchmark.

    Returns:
        Dict mapping benchmark key (file name without ``.py``) to the result
        dict returned by the benchmark, or to an error record.
    """
    results = {}
    for fname, entry_point in _BENCHMARKS:
        key = Path(fname).stem
        print(f" [{model}] Running {key}...", flush=True)
        try:
            mod = load_benchmark(fname)
            start = time.perf_counter()
            result = getattr(mod, entry_point)(model)
            elapsed = time.perf_counter() - start
            print(
                f" -> {'PASS' if result.get('passed') else 'FAIL'} ({elapsed:.1f}s)",
                flush=True,
            )
            results[key] = result
        except Exception as exc:
            # Deliberate boundary: one broken benchmark must not abort the suite.
            print(f" -> ERROR: {exc}", flush=True)
            results[key] = {
                "benchmark": key,
                "model": model,
                "passed": False,
                "error": str(exc),
            }
    return results


def score_model(results: dict) -> dict:
    """Compute summary scores for a model.

    Args:
        results: Mapping of benchmark key to result dict, as produced by
            ``run_all_benchmarks``. Non-dict values (e.g. the fields of a
            skipped-model record) are ignored.

    Returns:
        Dict with ``passed``/``total`` counts, a ``pass_rate`` string, the
        formatted per-benchmark metrics and ``total_time_s`` rounded to one
        decimal place.
    """
    benchmarks = [b for b in results.values() if isinstance(b, dict)]
    passed = sum(1 for b in benchmarks if b.get("passed", False))
    total = len(benchmarks)

    # Specific metrics
    tool_rate = results.get("01_tool_calling", {}).get("compliance_rate", 0.0)
    code_pass = results.get("02_code_generation", {}).get("passed", False)
    shell_pass = results.get("03_shell_commands", {}).get("passed", False)
    coherence = results.get("04_multi_turn_coherence", {}).get("coherence_rate", 0.0)
    triage_acc = results.get("05_issue_triage", {}).get("accuracy", 0.0)

    # Results report their duration under either key; prefer total_time_s.
    total_time = sum(
        r.get("total_time_s", r.get("elapsed_s", 0.0)) for r in benchmarks
    )

    return {
        "passed": passed,
        "total": total,
        "pass_rate": f"{passed}/{total}",
        "tool_compliance": f"{tool_rate:.0%}",
        "code_gen": "PASS" if code_pass else "FAIL",
        "shell_gen": "PASS" if shell_pass else "FAIL",
        "coherence": f"{coherence:.0%}",
        "triage_accuracy": f"{triage_acc:.0%}",
        "total_time_s": round(total_time, 1),
    }


def _model_level_error(results: dict) -> str | None:
    """Return the error text if ``results`` is a model-level failure record.

    A model-level record (e.g. a skipped model) carries an ``error`` key and
    no benchmark entries; per-benchmark errors live inside benchmark dicts.
    """
    if "error" in results and "01_tool_calling" not in results:
        return str(results["error"])
    return None


def _benchmark_detail_lines(bkey: str, bres: dict) -> list[str]:
    """Build the benchmark-specific bullet lines for one benchmark result."""
    lines = []
    if bkey == "01_tool_calling":
        rate = bres.get("compliance_rate", 0)
        count = bres.get("valid_json_count", 0)
        total = bres.get("total_prompts", 0)
        lines.append(
            f"- **JSON Compliance:** {count}/{total} ({rate:.0%}) — target ≥90%"
        )
    elif bkey == "02_code_generation":
        lines.append(f"- **Result:** {bres.get('detail', bres.get('error', 'n/a'))}")
        snippet = bres.get("code_snippet", "")
        if snippet:
            lines.append("- **Generated code snippet:**")
            # Two-space indent keeps the fenced block inside the list item.
            lines.append("  ```python")
            # Only the first 8 lines are shown to keep the report compact.
            lines.extend(f"  {ln}" for ln in snippet.splitlines()[:8])
            lines.append("  ```")
    elif bkey == "03_shell_commands":
        passed = bres.get("passed_count", 0)
        refused = bres.get("refused_count", 0)
        total = bres.get("total_prompts", 0)
        lines.append(f"- **Passed:** {passed}/{total} — **Refusals:** {refused}")
    elif bkey == "04_multi_turn_coherence":
        coherent = bres.get("coherent_turns", 0)
        total = bres.get("total_turns", 0)
        rate = bres.get("coherence_rate", 0)
        lines.append(
            f"- **Coherent turns:** {coherent}/{total} ({rate:.0%}) — target ≥80%"
        )
    elif bkey == "05_issue_triage":
        exact = bres.get("exact_matches", 0)
        total = bres.get("total_issues", 0)
        acc = bres.get("accuracy", 0)
        lines.append(f"- **Accuracy:** {exact}/{total} ({acc:.0%}) — target ≥80%")
    return lines


def generate_markdown(all_results: dict, run_date: str) -> str:
    """Generate markdown comparison report.

    Args:
        all_results: Mapping of model name to either its benchmark results
            (benchmark key -> result dict) or a model-level error record
            such as ``{"error": "...", "skipped": True}``.
        run_date: Timestamp string shown in the report header.

    Returns:
        The complete report as one newline-joined markdown string.
    """
    lines = []
    lines.append("# Model Benchmark Results")
    lines.append("")
    # Two trailing spaces are a Markdown hard line break inside the blockquote.
    lines.append(f"> Generated: {run_date}  ")
    lines.append(f"> Ollama URL: `{OLLAMA_URL}`  ")
    lines.append(
        "> Issue: [#1066](http://143.198.27.163:3000/rockachopa/Timmy-time-dashboard/issues/1066)"
    )
    lines.append("")
    lines.append("## Overview")
    lines.append("")
    lines.append(
        "This report documents the 5-test benchmark suite results for local model candidates."
    )
    lines.append("")
    lines.append("### Model Availability vs. Spec")
    lines.append("")
    lines.append("| Requested | Tested Substitute | Reason |")
    lines.append("|-----------|-------------------|--------|")
    lines.append("| `qwen3:14b` | `qwen2.5:14b` | `qwen3:14b` not pulled locally |")
    lines.append("| `qwen3:8b` | `qwen3.5:latest` | `qwen3:8b` not pulled locally |")
    lines.append("| `hermes3:8b` | `hermes3:8b` | Exact match |")
    lines.append("| `dolphin3` | `llama3.2:latest` | `dolphin3` not pulled locally |")
    lines.append("")

    # Summary table
    lines.append("## Summary Comparison Table")
    lines.append("")
    lines.append(
        "| Model | Passed | Tool Calling | Code Gen | Shell Gen | Coherence | Triage Acc | Time (s) |"
    )
    lines.append(
        "|-------|--------|-------------|----------|-----------|-----------|------------|----------|"
    )
    for model, results in all_results.items():
        if _model_level_error(results) is not None:
            lines.append(f"| `{model}` | — | — | — | — | — | — | — |")
            continue
        s = score_model(results)
        lines.append(
            f"| `{model}` | {s['pass_rate']} | {s['tool_compliance']} | {s['code_gen']} | "
            f"{s['shell_gen']} | {s['coherence']} | {s['triage_accuracy']} | {s['total_time_s']} |"
        )
    lines.append("")

    # Per-model detail sections
    lines.append("## Per-Model Detail")
    lines.append("")
    for model, results in all_results.items():
        lines.append(f"### `{model}`")
        lines.append("")

        # Skipped/failed models have no benchmark dicts to iterate; show the
        # error instead of treating its string fields as benchmark results.
        error = _model_level_error(results)
        if error is not None:
            lines.append(f"> **Error:** {error}")
            lines.append("")
            continue

        for bkey, bres in results.items():
            if not isinstance(bres, dict):
                continue
            bname = _BENCHMARK_TITLES.get(bkey, bkey)
            status = "✅ PASS" if bres.get("passed") else "❌ FAIL"
            lines.append(f"#### {bname} — {status}")
            lines.append("")
            lines.extend(_benchmark_detail_lines(bkey, bres))
            elapsed = bres.get("total_time_s", bres.get("elapsed_s", 0))
            lines.append(f"- **Time:** {elapsed}s")
            lines.append("")

    # Raw data goes in a collapsible HTML block so it does not dominate the page.
    lines.append("## Raw JSON Data")
    lines.append("")
    lines.append("<details>")
    lines.append("<summary>Click to expand full JSON results</summary>")
    lines.append("")
    lines.append("```json")
    lines.append(json.dumps(all_results, indent=2))
    lines.append("```")
    lines.append("")
    lines.append("</details>")
    lines.append("")

    return "\n".join(lines)


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the suite runner."""
    parser = argparse.ArgumentParser(description="Run model benchmark suite")
    parser.add_argument(
        "--models",
        nargs="+",
        default=DEFAULT_MODELS,
        help="Models to test",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=DOCS_DIR / "model-benchmarks.md",
        help="Output markdown file",
    )
    parser.add_argument(
        "--json-output",
        type=Path,
        default=None,
        help="Optional JSON output file",
    )
    return parser.parse_args()


def main() -> int:
    """Run the suite for every requested model and write the report(s).

    Returns:
        Process exit code: 0 when no model was skipped and every benchmark
        of every model passed, otherwise 1.
    """
    args = parse_args()
    run_date = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    print(f"Model Benchmark Suite — {run_date}")
    print(f"Testing {len(args.models)} model(s): {', '.join(args.models)}")
    print()

    all_results: dict[str, dict] = {}

    for model in args.models:
        print(f"=== Testing model: {model} ===")
        if not model_available(model):
            print(f" WARNING: {model} not available in Ollama — skipping")
            all_results[model] = {"error": f"Model {model} not available", "skipped": True}
            print()
            continue

        model_results = run_all_benchmarks(model)
        all_results[model] = model_results

        s = score_model(model_results)
        print(f" Summary: {s['pass_rate']} benchmarks passed in {s['total_time_s']}s")
        print()

    # Generate and write markdown report
    markdown = generate_markdown(all_results, run_date)
    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(markdown, encoding="utf-8")
    print(f"Report written to: {args.output}")

    if args.json_output:
        # Mirror the markdown path handling: create missing parent directories.
        args.json_output.parent.mkdir(parents=True, exist_ok=True)
        args.json_output.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
        print(f"JSON data written to: {args.json_output}")

    # Overall pass/fail
    all_pass = all(
        not r.get("skipped", False)
        and all(b.get("passed", False) for b in r.values() if isinstance(b, dict))
        for r in all_results.values()
    )
    return 0 if all_pass else 1


if __name__ == "__main__":
    sys.exit(main())