Timmy-time-dashboard/scripts/benchmarks/run_suite.py

#!/usr/bin/env python3
"""Model Benchmark Suite Runner

Runs all 5 benchmarks against each candidate model and generates
a comparison report at docs/model-benchmarks.md.

Usage:
    python scripts/benchmarks/run_suite.py
    python scripts/benchmarks/run_suite.py --models hermes3:8b qwen3.5:latest
    python scripts/benchmarks/run_suite.py --output docs/model-benchmarks.md
"""

from __future__ import annotations

import argparse
import importlib.util
import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

import requests

# Base URL of the Ollama HTTP API; used for the /api/tags availability probe
# and echoed in the generated report header.
OLLAMA_URL = "http://localhost:11434"

# Ollama model tags benchmarked when --models is not given on the command line.
# Original spec requested: qwen3:14b, qwen3:8b, hermes3:8b, dolphin3
# Availability-adjusted substitutions are noted in the generated report.
DEFAULT_MODELS = [
    "hermes3:8b",
    "qwen3.5:latest",
    "qwen2.5:14b",
    "llama3.2:latest",
]

# Directory containing this script; the numbered benchmark modules
# (01_tool_calling.py, ...) are loaded from here.
BENCHMARKS_DIR = Path(__file__).parent
# "docs" directory three levels above this file; default location of the
# markdown report.
DOCS_DIR = Path(__file__).resolve().parent.parent.parent / "docs"


def load_benchmark(name: str):
    """Dynamically import a benchmark module from ``BENCHMARKS_DIR``.

    Args:
        name: File name of the benchmark script, e.g. ``"01_tool_calling.py"``.

    Returns:
        The executed module object. The module is not registered in
        ``sys.modules``, so every call re-executes the file.

    Raises:
        ImportError: If no import spec or loader can be created for the file
            (for example when the name has no recognised Python suffix).
        Exception: Whatever executing the module's top level raises,
            including ``FileNotFoundError`` when the file does not exist.
    """
    path = BENCHMARKS_DIR / name
    spec = importlib.util.spec_from_file_location(path.stem, path)
    # spec_from_file_location() returns None when it cannot choose a loader
    # for the path; fail with a clear message instead of an AttributeError
    # on None further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load benchmark module from {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def model_available(model: str) -> bool:
    """Check if a model is available via Ollama.

    Queries ``GET {OLLAMA_URL}/api/tags`` and looks for *model* among the
    installed model names.

    Args:
        model: Ollama model reference, with or without an explicit tag
            (``"hermes3:8b"`` or ``"dolphin3"``).

    Returns:
        True when the model is listed by the server. False when it is not
        listed, the server answers with a non-200 status, the request fails,
        or the response body does not have the expected shape. This probe is
        deliberately best-effort and never raises for those conditions.
    """
    # Ollama lists installed models with an explicit tag and resolves an
    # untagged reference to ":latest", so "dolphin3" must match
    # "dolphin3:latest". Only the last path segment is inspected so that a
    # registry host with a port ("host:5000/ns/model") is not mistaken for a tag.
    has_tag = ":" in model.rsplit("/", 1)[-1]
    wanted = model if has_tag else f"{model}:latest"
    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=10)
        if resp.status_code != 200:
            return False
        installed = {entry["name"] for entry in resp.json().get("models", [])}
    except (
        requests.RequestException,  # connection refused, timeout, invalid JSON
        ValueError,  # JSON decode errors on older requests versions
        KeyError,  # entry without a "name" field
        TypeError,  # "models" is null / entries are not mappings
        AttributeError,  # top-level JSON value is not an object
    ):
        return False
    return model in installed or wanted in installed


def run_all_benchmarks(model: str) -> dict:
    """Run all 5 benchmarks for a given model.

    Each benchmark file is loaded with ``load_benchmark`` and its entry-point
    function is called with *model*. Progress and a PASS/FAIL line are printed
    for every benchmark.

    Args:
        model: Ollama model tag passed to every benchmark.

    Returns:
        Dict mapping the benchmark key (file name without ``.py``) to the
        value returned by the benchmark. When loading or running a benchmark
        raises, the entry is instead
        ``{"benchmark": key, "model": model, "passed": False, "error": str(exc)}``.
    """
    # Benchmark file -> name of the function it exposes. Only the multi-turn
    # benchmark uses a different entry-point name.
    entry_points = {
        "01_tool_calling.py": "run_benchmark",
        "02_code_generation.py": "run_benchmark",
        "03_shell_commands.py": "run_benchmark",
        "04_multi_turn_coherence.py": "run_multi_turn",
        "05_issue_triage.py": "run_benchmark",
    }

    results = {}
    for fname, entry_point in entry_points.items():
        key = Path(fname).stem
        print(f"  [{model}] Running {key}...", flush=True)
        try:
            mod = load_benchmark(fname)
            run = getattr(mod, entry_point)
            # perf_counter() is monotonic, so the duration cannot go negative
            # or jump if the system clock is adjusted mid-run.
            start = time.perf_counter()
            result = run(model)
            elapsed = time.perf_counter() - start
            print(
                f"    -> {'PASS' if result.get('passed') else 'FAIL'} ({elapsed:.1f}s)",
                flush=True,
            )
            results[key] = result
        except Exception as exc:
            # Deliberately broad: one broken benchmark must not abort the
            # remaining benchmarks; the failure is recorded in the results.
            print(f"    -> ERROR: {exc}", flush=True)
            results[key] = {"benchmark": key, "model": model, "passed": False, "error": str(exc)}

    return results


def score_model(results: dict) -> dict:
    """Compute summary scores for a model.

    Args:
        results: Mapping of benchmark key (``"01_tool_calling"``, ...) to that
            benchmark's result dict. Values that are not dicts (for example
            the ``"error"``/``"skipped"`` fields of a skipped-model record)
            are ignored.

    Returns:
        Dict with:
            ``passed`` / ``total``: number of passing / scored benchmarks,
            ``pass_rate``: ``"passed/total"``,
            ``tool_compliance``, ``coherence``, ``triage_accuracy``:
                whole-percent strings (``"90%"``),
            ``code_gen``, ``shell_gen``: ``"PASS"`` or ``"FAIL"``,
            ``total_time_s``: summed benchmark time rounded to 0.1 s.
        Missing or non-numeric metrics count as 0.
    """
    # Only per-benchmark result dicts are scored. Model-level records such as
    # {"error": "...", "skipped": True} hold plain str/bool values, and
    # calling .get() on those would raise AttributeError.
    benchmarks = {key: res for key, res in results.items() if isinstance(res, dict)}

    def number(bench: dict, *fields: str):
        """Return the first numeric value among *fields* of *bench*, else 0.0."""
        for field in fields:
            value = bench.get(field)
            if isinstance(value, (int, float)):
                return value
        return 0.0

    passed = sum(1 for bench in benchmarks.values() if bench.get("passed", False))
    total = len(benchmarks)

    # Specific metrics
    tool_rate = number(benchmarks.get("01_tool_calling", {}), "compliance_rate")
    code_pass = benchmarks.get("02_code_generation", {}).get("passed", False)
    shell_pass = benchmarks.get("03_shell_commands", {}).get("passed", False)
    coherence = number(benchmarks.get("04_multi_turn_coherence", {}), "coherence_rate")
    triage_acc = number(benchmarks.get("05_issue_triage", {}), "accuracy")

    # Benchmarks report their duration under either key; prefer total_time_s.
    total_time = sum(
        number(bench, "total_time_s", "elapsed_s") for bench in benchmarks.values()
    )

    return {
        "passed": passed,
        "total": total,
        "pass_rate": f"{passed}/{total}",
        "tool_compliance": f"{tool_rate:.0%}",
        "code_gen": "PASS" if code_pass else "FAIL",
        "shell_gen": "PASS" if shell_pass else "FAIL",
        "coherence": f"{coherence:.0%}",
        "triage_accuracy": f"{triage_acc:.0%}",
        "total_time_s": round(total_time, 1),
    }


# Human-readable section titles for the per-model detail section, keyed by
# benchmark result key.
_BENCHMARK_TITLES = {
    "01_tool_calling": "Benchmark 1: Tool Calling Compliance",
    "02_code_generation": "Benchmark 2: Code Generation Correctness",
    "03_shell_commands": "Benchmark 3: Shell Command Generation",
    "04_multi_turn_coherence": "Benchmark 4: Multi-Turn Coherence",
    "05_issue_triage": "Benchmark 5: Issue Triage Quality",
}


def _is_model_level_failure(results: dict) -> bool:
    """Return True when *results* is a model-wide error record, not benchmark results."""
    return "error" in results and "01_tool_calling" not in results


def _summary_row(model: str, results: dict) -> str:
    """Build the summary-table row for one model (dashes when it was not run)."""
    if _is_model_level_failure(results):
        return f"| `{model}` | — | — | — | — | — | — | — |"
    # Pass only per-benchmark dicts on, so stray scalar fields cannot break scoring.
    s = score_model({k: v for k, v in results.items() if isinstance(v, dict)})
    return (
        f"| `{model}` | {s['pass_rate']} | {s['tool_compliance']} | {s['code_gen']} | "
        f"{s['shell_gen']} | {s['coherence']} | {s['triage_accuracy']} | {s['total_time_s']} |"
    )


def _benchmark_detail_lines(bkey: str, bres: dict) -> list:
    """Build the heading, metric bullets and timing bullet for one benchmark result."""
    status = "✅ PASS" if bres.get("passed") else "❌ FAIL"
    lines = [f"#### {_BENCHMARK_TITLES.get(bkey, bkey)} — {status}", ""]

    if bkey == "01_tool_calling":
        rate = bres.get("compliance_rate", 0)
        count = bres.get("valid_json_count", 0)
        total = bres.get("total_prompts", 0)
        lines.append(
            f"- **JSON Compliance:** {count}/{total} ({rate:.0%}) — target ≥90%"
        )
    elif bkey == "02_code_generation":
        lines.append(f"- **Result:** {bres.get('detail', bres.get('error', 'n/a'))}")
        snippet = bres.get("code_snippet", "")
        if snippet:
            lines.append("- **Generated code snippet:**")
            lines.append("  ```python")
            # Only the first 8 lines are shown to keep the report compact.
            lines.extend(f"  {ln}" for ln in snippet.splitlines()[:8])
            lines.append("  ```")
    elif bkey == "03_shell_commands":
        passed = bres.get("passed_count", 0)
        refused = bres.get("refused_count", 0)
        total = bres.get("total_prompts", 0)
        lines.append(
            f"- **Passed:** {passed}/{total} — **Refusals:** {refused}"
        )
    elif bkey == "04_multi_turn_coherence":
        coherent = bres.get("coherent_turns", 0)
        total = bres.get("total_turns", 0)
        rate = bres.get("coherence_rate", 0)
        lines.append(
            f"- **Coherent turns:** {coherent}/{total} ({rate:.0%}) — target ≥80%"
        )
    elif bkey == "05_issue_triage":
        exact = bres.get("exact_matches", 0)
        total = bres.get("total_issues", 0)
        acc = bres.get("accuracy", 0)
        lines.append(
            f"- **Accuracy:** {exact}/{total} ({acc:.0%}) — target ≥80%"
        )

    elapsed = bres.get("total_time_s", bres.get("elapsed_s", 0))
    lines.append(f"- **Time:** {elapsed}s")
    lines.append("")
    return lines


def _model_detail_lines(model: str, results: dict) -> list:
    """Build the per-model detail section: an error note or one block per benchmark."""
    lines = [f"### `{model}`", ""]

    # A skipped/unavailable model carries a top-level "error" string instead
    # of benchmark results. Show the message rather than iterating its
    # scalar fields as if they were benchmark dicts (which raised
    # AttributeError and aborted the whole report).
    if _is_model_level_failure(results):
        lines.append(f"> **Error:** {results.get('error')}")
        lines.append("")
        return lines

    for bkey, bres in results.items():
        if not isinstance(bres, dict):
            continue  # not a benchmark result; nothing to render
        lines.extend(_benchmark_detail_lines(bkey, bres))
    return lines


def generate_markdown(all_results: dict, run_date: str) -> str:
    """Generate markdown comparison report.

    Args:
        all_results: Mapping of model tag to either that model's benchmark
            results (benchmark key -> result dict) or a model-level error
            record such as ``{"error": "...", "skipped": True}``.
        run_date: Timestamp string shown in the report header.

    Returns:
        The full report as a single string: header, static
        availability-vs-spec table, summary comparison table, per-model
        detail, and the raw results as a collapsible JSON block.
    """
    lines = [
        "# Model Benchmark Results",
        "",
        f"> Generated: {run_date}  ",
        f"> Ollama URL: `{OLLAMA_URL}`  ",
        "> Issue: [#1066](http://143.198.27.163:3000/rockachopa/Timmy-time-dashboard/issues/1066)",
        "",
        "## Overview",
        "",
        "This report documents the 5-test benchmark suite results for local model candidates.",
        "",
        # NOTE: this table is static text; it is not derived from all_results.
        "### Model Availability vs. Spec",
        "",
        "| Requested | Tested Substitute | Reason |",
        "|-----------|-------------------|--------|",
        "| `qwen3:14b` | `qwen2.5:14b` | `qwen3:14b` not pulled locally |",
        "| `qwen3:8b` | `qwen3.5:latest` | `qwen3:8b` not pulled locally |",
        "| `hermes3:8b` | `hermes3:8b` | Exact match |",
        "| `dolphin3` | `llama3.2:latest` | `dolphin3` not pulled locally |",
        "",
        "## Summary Comparison Table",
        "",
        "| Model | Passed | Tool Calling | Code Gen | Shell Gen | Coherence | Triage Acc | Time (s) |",
        "|-------|--------|-------------|----------|-----------|-----------|------------|----------|",
    ]

    lines.extend(_summary_row(model, results) for model, results in all_results.items())
    lines.append("")

    # Per-model detail sections
    lines.append("## Per-Model Detail")
    lines.append("")
    for model, results in all_results.items():
        lines.extend(_model_detail_lines(model, results))

    lines.extend(
        [
            "## Raw JSON Data",
            "",
            "<details>",
            "<summary>Click to expand full JSON results</summary>",
            "",
            "```json",
            json.dumps(all_results, indent=2),
            "```",
            "",
            "</details>",
            "",
        ]
    )

    return "\n".join(lines)


def parse_args() -> argparse.Namespace:
    """Parse the benchmark suite's command-line options from ``sys.argv``.

    Returns:
        Namespace with ``models`` (list of model tags, defaulting to
        ``DEFAULT_MODELS``), ``output`` (Path of the markdown report,
        defaulting to ``DOCS_DIR / "model-benchmarks.md"``) and
        ``json_output`` (Path or None).
    """
    # One (flag, add_argument keyword arguments) pair per supported option.
    option_specs = (
        ("--models", {"nargs": "+", "default": DEFAULT_MODELS, "help": "Models to test"}),
        (
            "--output",
            {
                "type": Path,
                "default": DOCS_DIR / "model-benchmarks.md",
                "help": "Output markdown file",
            },
        ),
        (
            "--json-output",
            {"type": Path, "default": None, "help": "Optional JSON output file"},
        ),
    )

    cli = argparse.ArgumentParser(description="Run model benchmark suite")
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()


def main() -> int:
    """Run the benchmark suite for every requested model and write the report.

    For each model given on the command line, checks availability via
    ``model_available``; unavailable models are recorded as skipped, the
    others are run through ``run_all_benchmarks``. The markdown report is
    written to ``--output`` and, when ``--json-output`` is given, the raw
    results are written there as JSON.

    Returns:
        Process exit code: 0 when no model was skipped and every benchmark of
        every model passed, 1 otherwise.
    """
    args = parse_args()
    run_date = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    print(f"Model Benchmark Suite — {run_date}")
    print(f"Testing {len(args.models)} model(s): {', '.join(args.models)}")
    print()

    all_results: dict[str, dict] = {}

    for model in args.models:
        print(f"=== Testing model: {model} ===")
        if not model_available(model):
            print(f"  WARNING: {model} not available in Ollama — skipping")
            all_results[model] = {"error": f"Model {model} not available", "skipped": True}
            print()
            continue

        model_results = run_all_benchmarks(model)
        all_results[model] = model_results

        s = score_model(model_results)
        print(f"  Summary: {s['pass_rate']} benchmarks passed in {s['total_time_s']}s")
        print()

    # Generate and write markdown report
    markdown = generate_markdown(all_results, run_date)

    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(markdown, encoding="utf-8")
    print(f"Report written to: {args.output}")

    if args.json_output is not None:
        # Create the target directory as is done for the markdown report, so
        # a fresh output location does not fail with FileNotFoundError.
        args.json_output.parent.mkdir(parents=True, exist_ok=True)
        args.json_output.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
        print(f"JSON data written to: {args.json_output}")

    # Overall pass/fail: a skipped model counts as a failure; scalar fields in
    # a record are ignored when checking the per-benchmark "passed" flags.
    all_pass = all(
        not r.get("skipped", False)
        and all(b.get("passed", False) for b in r.values() if isinstance(b, dict))
        for r in all_results.values()
    )
    return 0 if all_pass else 1


# Script entry point: the integer returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())