#!/usr/bin/env python3
"""
Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.

Runs 100 diverse tool-calling prompts through multiple models and compares
success rates, latency, and token costs.

Usage:
    python3 benchmarks/tool_call_benchmark.py                  # full 100-call suite
    python3 benchmarks/tool_call_benchmark.py --limit 10       # quick smoke test
    python3 benchmarks/tool_call_benchmark.py --models nous    # single model
    python3 benchmarks/tool_call_benchmark.py --category file  # single category

Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
"""
import argparse
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Ensure hermes-agent root is importable
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))

# ---------------------------------------------------------------------------
# Test Definitions
# ---------------------------------------------------------------------------


@dataclass
class ToolCall:
    """A single tool-calling test case."""
    id: str
    category: str
    prompt: str
    expected_tool: str               # tool name we expect the model to call
    expected_params_check: str = ""  # substring expected in JSON args
    timeout: int = 30                # max seconds per call
    notes: str = ""


# fmt: off
SUITE: list[ToolCall] = [
    # ── File Operations (20) ──────────────────────────────────────────────
    ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.", "read_file", "path"),
    ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt", "write_file", "path"),
    ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.", "search_files", "pattern"),
    ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts", "read_file", "offset"),
    ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'", "patch", "old_string"),
    ToolCall("file-06", "file", "Search for files matching *.py in the current directory.", "search_files", "target"),
    ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd", "read_file", "limit"),
    ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true", "write_file", "content"),
    ToolCall("file-09", "file", "Search for 'def test_' in Python test files.", "search_files", "file_glob"),
    ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.", "read_file", "bench_config"),
    ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'", "write_file", "bench_readme"),
    ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.", "search_files", "TODO"),
    ToolCall("file-13", "file", "Read /tmp/bench_readme.md", "read_file", "bench_readme"),
    ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'", "patch", "Tool Benchmark"),
    ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.", "write_file", "bench_hello"),
    ToolCall("file-16", "file", "Search for all .json files in /tmp/.", "search_files", "json"),
    ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').", "read_file", "bench_hello"),
    ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.", "patch", "hello world"),
    ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.", "search_files", "bench"),
    ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.", "read_file", "test_bench"),
    # ── Terminal Commands (20) ────────────────────────────────────────────
    ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.", "terminal", "echo"),
    ToolCall("term-02", "terminal", "Run `date` to get the current date and time.", "terminal", "date"),
    ToolCall("term-03", "terminal", "Run `uname -a` to get system information.", "terminal", "uname"),
    ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.", "terminal", "pwd"),
    ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.", "terminal", "head"),
    ToolCall("term-06", "terminal", "Run `whoami` to show the current user.", "terminal", "whoami"),
    ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.", "terminal", "df"),
    ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.", "terminal", "python3"),
    ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.", "terminal", "hostname"),
    ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.", "terminal", "uptime"),
    ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.", "terminal", "PATH"),
    ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.", "terminal", "wc"),
    ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.", "terminal", "SHELL"),
    ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.", "terminal", "memory"),
    ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.", "terminal", "id"),
    ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.", "terminal", "hostname"),
    ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.", "terminal", "echo"),
    ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.", "terminal", "seq"),
    ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.", "terminal", "print"),
    ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.", "terminal", "wc"),
    # ── Code Execution (15) ───────────────────────────────────────────────
    ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.", "execute_code", "factorial"),
    ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.", "execute_code", "words"),
    ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.", "execute_code", "fibonacci"),
    ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.", "execute_code", "json"),
    ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.", "execute_code", "glob"),
    ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.", "execute_code", "sum"),
    ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.", "execute_code", "palindrome"),
    ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.", "execute_code", "csv"),
    ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.", "execute_code", "sort"),
    ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.", "execute_code", "passwd"),
    ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.", "execute_code", "sha256"),
SHA256.", "execute_code", "sha256"), ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.", "execute_code", "utcnow"), ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.", "execute_code", "upper"), ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).", "execute_code", "sys"), ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.", "execute_code", "socket"), # ── Delegation (10) ────────────────────────────────────────────────── ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.", "delegate_task", "log"), ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?", "delegate_task", "15"), ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.", "delegate_task", "python"), ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.", "delegate_task", "summarize"), ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.", "delegate_task", "tmp"), ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.", "delegate_task", ".py"), ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.", "delegate_task", "df"), ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?", "delegate_task", "os"), ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.", "delegate_task", "hostname"), ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.", "delegate_task", "write"), # ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ── ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'", "todo", "benchmark"), ToolCall("todo-02", "todo", "Show me the current todo list.", "todo", ""), ToolCall("todo-03", "todo", "Mark the first todo item as completed.", "todo", "completed"), ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.", "todo", "Review"), ToolCall("todo-05", "todo", "Clear all completed todos.", "todo", "clear"), ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format( date=datetime.now().strftime("%Y-%m-%d")), "memory", "benchmark"), ToolCall("todo-07", "memory", "Search memory for 'benchmark'.", "memory", "benchmark"), ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.", "memory", "gemma"), ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.", "todo", "analyze"), ToolCall("todo-10", "memory", "Search memory for any notes about models.", "memory", "model"), # ── Skills (10 — replacing MCP tools which need servers) ───────────── ToolCall("skill-01", "skills", "List all available skills.", "skills_list", ""), ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.", "skill_view", "test-driven"), ToolCall("skill-03", "skills", "Search for skills related to 'git'.", "skills_list", "git"), ToolCall("skill-04", "skills", "View the 'code-review' skill.", "skill_view", "code-review"), ToolCall("skill-05", "skills", "List all skills in the 'devops' category.", "skills_list", "devops"), ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.", "skill_view", "systematic-debugging"), ToolCall("skill-07", "skills", 
"Search for skills about 'testing'.", "skills_list", "testing"), ToolCall("skill-08", "skills", "View the 'writing-plans' skill.", "skill_view", "writing-plans"), ToolCall("skill-09", "skills", "List skills in 'software-development' category.", "skills_list", "software-development"), ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.", "skill_view", "pr-review"), # ── Additional tests to reach 100 ──────────────────────────────────── ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].", "write_file", "bench_sort"), ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.", "read_file", "bench_sort"), ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.", "search_files", "class"), ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.", "terminal", "os"), ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.", "terminal", "cpu"), ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].", "execute_code", "flatten"), ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.", "execute_code", "prime"), ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?", "delegate_task", "cwd"), ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.", "todo", "Finalize"), ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.", "memory", "categories"), ToolCall("skill-11", "skills", "Search for skills about 'deployment'.", "skills_list", "deployment"), ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.", "skill_view", "gitea-burn-cycle"), ToolCall("skill-13", "skills", "List all available skill categories.", "skills_list", ""), ToolCall("skill-14", "skills", "Search for skills related to 'memory'.", "skills_list", "memory"), ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.", "skill_view", "mimo-swarm"), ] # fmt: on # --------------------------------------------------------------------------- # Runner # --------------------------------------------------------------------------- @dataclass class CallResult: test_id: str category: str model: str prompt: str expected_tool: str success: bool tool_called: Optional[str] = None tool_args_valid: bool = False execution_ok: bool = False latency_s: float = 0.0 error: str = "" raw_response: str = "" @dataclass class ModelStats: model: str total: int = 0 schema_ok: int = 0 # model produced valid tool call JSON exec_ok: int = 0 # tool actually ran without error latency_sum: float = 0.0 failures: list = field(default_factory=list) @property def schema_pct(self) -> float: return (self.schema_ok / self.total * 100) if self.total else 0 @property def exec_pct(self) -> float: return (self.exec_ok / self.total * 100) if self.total else 0 @property def avg_latency(self) -> float: return (self.latency_sum / self.total) if self.total else 0 def setup_test_files(): """Create prerequisite files for the benchmark.""" Path("/tmp/test_bench.txt").write_text( "This is a benchmark test file.\n" "It contains sample data for tool-calling tests.\n" "Line three has some import statements.\n" "import os\nimport sys\nimport json\n" "End of test data.\n" ) def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult: """Run a single tool-calling test 
through the agent.""" from run_agent import AIAgent result = CallResult( test_id=tc.id, category=tc.category, model=model_spec, prompt=tc.prompt, expected_tool=tc.expected_tool, success=False, ) try: agent = AIAgent( model=model_spec, provider=provider, max_iterations=3, quiet_mode=True, skip_context_files=True, skip_memory=True, persist_session=False, ) t0 = time.time() conv = agent.run_conversation( user_message=tc.prompt, system_message=( "You are a benchmark test runner. Execute the user's request by calling " "the appropriate tool. Return the tool result directly. Do not add commentary." ), ) result.latency_s = round(time.time() - t0, 2) messages = conv.get("messages", []) # Find the first assistant message with tool_calls tool_called = None tool_args_str = "" for msg in messages: if msg.get("role") == "assistant" and msg.get("tool_calls"): for tc_item in msg["tool_calls"]: fn = tc_item.get("function", {}) tool_called = fn.get("name", "") tool_args_str = fn.get("arguments", "{}") break break if tool_called: result.tool_called = tool_called result.schema_ok = True # Check if the right tool was called if tool_called == tc.expected_tool: result.success = True # Check if args contain expected substring if tc.expected_params_check: result.tool_args_valid = tc.expected_params_check in tool_args_str else: result.tool_args_valid = True # Check if tool executed (look for tool role message) for msg in messages: if msg.get("role") == "tool": content = msg.get("content", "") if content and "error" not in content.lower()[:50]: result.execution_ok = True break elif content: result.execution_ok = True # got a response, even if error break else: # No tool call produced — still check if model responded final = conv.get("final_response", "") result.raw_response = final[:200] if final else "" except Exception as e: result.error = f"{type(e).__name__}: {str(e)[:200]}" result.latency_s = round(time.time() - t0, 2) if 't0' in dir() else 0 return result def generate_report(results: list[CallResult], models: list[str], output_path: Path): """Generate markdown benchmark report.""" now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") # Aggregate per model stats: dict[str, ModelStats] = {} for m in models: stats[m] = ModelStats(model=m) by_category: dict[str, dict[str, list[CallResult]]] = {} for r in results: s = stats[r.model] s.total += 1 s.schema_ok += int(r.schema_ok) s.exec_ok += int(r.execution_ok) s.latency_sum += r.latency_s if not r.success: s.failures.append(r) by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r) lines = [ f"# Tool-Calling Benchmark Report", f"", f"Generated: {now}", f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories", f"Models tested: {', '.join(models)}", f"", f"## Summary", f"", f"| Metric | {' | '.join(models)} |", f"|--------|{'|'.join('---------' for _ in models)}|", ] # Schema parse success row = "| Schema parse success | " for m in models: s = stats[m] row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | " lines.append(row) # Tool execution success row = "| Tool execution success | " for m in models: s = stats[m] row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | " lines.append(row) # Correct tool selected row = "| Correct tool selected | " for m in models: s = stats[m] correct = sum(1 for r in results if r.model == m and r.success) pct = (correct / s.total * 100) if s.total else 0 row += f"{correct}/{s.total} ({pct:.0f}%) | " lines.append(row) # Avg latency row = "| Avg latency (s) | " for m in models: s 
def generate_report(results: list[CallResult], models: list[str], output_path: Path):
    """Generate markdown benchmark report."""
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    # Aggregate per model
    stats: dict[str, ModelStats] = {}
    for m in models:
        stats[m] = ModelStats(model=m)
    by_category: dict[str, dict[str, list[CallResult]]] = {}

    for r in results:
        s = stats[r.model]
        s.total += 1
        s.schema_ok += int(r.schema_ok)
        s.exec_ok += int(r.execution_ok)
        s.latency_sum += r.latency_s
        if not r.success:
            s.failures.append(r)
        by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)

    lines = [
        f"# Tool-Calling Benchmark Report",
        f"",
        f"Generated: {now}",
        f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
        f"Models tested: {', '.join(models)}",
        f"",
        f"## Summary",
        f"",
        f"| Metric | {' | '.join(models)} |",
        f"|--------|{'|'.join('---------' for _ in models)}|",
    ]

    # Schema parse success
    row = "| Schema parse success | "
    for m in models:
        s = stats[m]
        row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | "
    lines.append(row)

    # Tool execution success
    row = "| Tool execution success | "
    for m in models:
        s = stats[m]
        row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | "
    lines.append(row)

    # Correct tool selected
    row = "| Correct tool selected | "
    for m in models:
        s = stats[m]
        correct = sum(1 for r in results if r.model == m and r.success)
        pct = (correct / s.total * 100) if s.total else 0
        row += f"{correct}/{s.total} ({pct:.0f}%) | "
    lines.append(row)

    # Avg latency
    row = "| Avg latency (s) | "
    for m in models:
        s = stats[m]
        row += f"{s.avg_latency:.2f} | "
    lines.append(row)
    lines.append("")

    # Per-category breakdown
    lines.append("## Per-Category Breakdown")
    lines.append("")
    for cat in sorted(by_category.keys()):
        lines.append(f"### {cat.title()}")
        lines.append("")
        lines.append(f"| Metric | {' | '.join(models)} |")
        lines.append(f"|--------|{'|'.join('---------' for _ in models)}|")
        cat_data = by_category[cat]
        for metric_name, fn in [
            ("Schema OK", lambda r: r.schema_ok),
            ("Exec OK", lambda r: r.execution_ok),
            ("Correct tool", lambda r: r.success),
        ]:
            row = f"| {metric_name} | "
            for m in models:
                results_m = cat_data.get(m, [])
                total = len(results_m)
                ok = sum(1 for r in results_m if fn(r))
                pct = (ok / total * 100) if total else 0
                row += f"{ok}/{total} ({pct:.0f}%) | "
            lines.append(row)
        lines.append("")

    # Failure analysis
    lines.append("## Failure Analysis")
    lines.append("")
    any_failures = False
    for m in models:
        s = stats[m]
        if s.failures:
            any_failures = True
            lines.append(f"### {m} — {len(s.failures)} failures")
            lines.append("")
            lines.append("| Test | Category | Expected | Got | Error |")
            lines.append("|------|----------|----------|-----|-------|")
            for r in s.failures:
                got = r.tool_called or "none"
                err = r.error or "wrong tool"
                lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err[:60]} |")
            lines.append("")
    if not any_failures:
        lines.append("No failures detected.")
        lines.append("")

    # Raw results JSON
    lines.append("## Raw Results")
    lines.append("")
    lines.append("```json")
    lines.append(json.dumps([asdict(r) for r in results], indent=2, default=str))
    lines.append("```")

    report = "\n".join(lines)
    output_path.write_text(report)
    return report
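
# Model specs are given as "provider:model"; main() splits on the first colon, so
# "nous:gia-3/gemma-4-31b" becomes provider "nous" and model "gia-3/gemma-4-31b".
# A spec without a colon is used as both provider and model name.
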
def main():
    parser = argparse.ArgumentParser(description="Tool-calling benchmark")
    parser.add_argument("--models", nargs="+",
                        default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
                        help="Model specs to test (provider:model)")
    parser.add_argument("--limit", type=int, default=0,
                        help="Run only first N tests (0 = all)")
    parser.add_argument("--category", type=str, default="",
                        help="Run only tests in this category")
    parser.add_argument("--output", type=str, default="",
                        help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print test cases without running them")
    args = parser.parse_args()

    # Filter suite
    suite = SUITE[:]
    if args.category:
        suite = [tc for tc in suite if tc.category == args.category]
    if args.limit > 0:
        suite = suite[:args.limit]

    if args.dry_run:
        print(f"Would run {len(suite)} tests:")
        for tc in suite:
            print(f"  [{tc.category:8s}] {tc.id}: {tc.expected_tool} — {tc.prompt[:60]}")
        return

    # Setup
    setup_test_files()
    date_str = datetime.now().strftime("%Y-%m-%d")
    output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"

    # Parse model specs
    model_specs = []
    for spec in args.models:
        parts = spec.split(":", 1)
        provider = parts[0]
        model_name = parts[1] if len(parts) > 1 else parts[0]
        model_specs.append((provider, model_name, spec))

    print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
    print(f"Output: {output_path}")
    print()

    all_results: list[CallResult] = []
    for provider, model_name, full_spec in model_specs:
        print(f"── {full_spec} {'─' * (50 - len(full_spec))}")
        model_results = []
        for i, tc in enumerate(suite, 1):
            sys.stdout.write(f"\r  [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s} → {tc.expected_tool:20s}")
            sys.stdout.flush()
            r = run_single_test(tc, full_spec, provider)
            model_results.append(r)
            status = "✓" if r.success else "✗"
            sys.stdout.write(f" {status} ({r.latency_s:.1f}s)")
            sys.stdout.write("\n")
        all_results.extend(model_results)

        # Quick stats
        ok = sum(1 for r in model_results if r.success)
        print(f"  Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
        print()

    # Generate report
    model_names = [spec for _, _, spec in model_specs]
    report = generate_report(all_results, model_names, output_path)
    print(f"Report written to {output_path}")

    # Exit code: 0 if all pass, 1 if any failures
    total_fail = sum(1 for r in all_results if not r.success)
    sys.exit(1 if total_fail > 0 else 0)


if __name__ == "__main__":
    main()
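
# The exit status is 0 only when every call selected the expected tool and 1 otherwise,
# so the suite can gate CI. Illustrative shell usage (not from the original docs):
#   python3 benchmarks/tool_call_benchmark.py --limit 10 && echo "smoke test passed"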