#!/usr/bin/env python3
"""
Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.

Runs 100 diverse tool-calling prompts through multiple models and compares
success rates, latency, and token costs.

Usage:
    python3 benchmarks/tool_call_benchmark.py                  # full 100-call suite
    python3 benchmarks/tool_call_benchmark.py --limit 10       # quick smoke test
    python3 benchmarks/tool_call_benchmark.py --models nous    # single model
    python3 benchmarks/tool_call_benchmark.py --category file  # single category

Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
"""
import argparse
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Ensure hermes-agent root is importable
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))

# ---------------------------------------------------------------------------
# Test Definitions
# ---------------------------------------------------------------------------


@dataclass
class ToolCall:
    """A single tool-calling test case."""
    id: str
    category: str
    prompt: str
    expected_tool: str               # tool name we expect the model to call
    expected_params_check: str = ""  # substring expected in JSON args
    timeout: int = 30                # max seconds per call
    notes: str = ""


# fmt: off
SUITE: list[ToolCall] = [
    # ── File Operations (20) ──────────────────────────────────────────────
    ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.", "read_file", "path"),
    ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt", "write_file", "path"),
    ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.", "search_files", "pattern"),
    ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts", "read_file", "offset"),
    ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'", "patch", "old_string"),
    ToolCall("file-06", "file", "Search for files matching *.py in the current directory.", "search_files", "target"),
    ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd", "read_file", "limit"),
    ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true", "write_file", "content"),
    ToolCall("file-09", "file", "Search for 'def test_' in Python test files.", "search_files", "file_glob"),
    ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.", "read_file", "bench_config"),
    ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'", "write_file", "bench_readme"),
    ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.", "search_files", "TODO"),
    ToolCall("file-13", "file", "Read /tmp/bench_readme.md", "read_file", "bench_readme"),
    ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'", "patch", "Tool Benchmark"),
    ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.", "write_file", "bench_hello"),
    ToolCall("file-16", "file", "Search for all .json files in /tmp/.", "search_files", "json"),
    ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').", "read_file", "bench_hello"),
    ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.", "patch", "hello world"),
    ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.", "search_files", "bench"),
    ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.", "read_file", "test_bench"),
    # ── Terminal Commands (20) ────────────────────────────────────────────
    ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.", "terminal", "echo"),
    ToolCall("term-02", "terminal", "Run `date` to get the current date and time.", "terminal", "date"),
    ToolCall("term-03", "terminal", "Run `uname -a` to get system information.", "terminal", "uname"),
    ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.", "terminal", "pwd"),
    ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.", "terminal", "head"),
    ToolCall("term-06", "terminal", "Run `whoami` to show the current user.", "terminal", "whoami"),
    ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.", "terminal", "df"),
    ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.", "terminal", "python3"),
    ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.", "terminal", "hostname"),
    ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.", "terminal", "uptime"),
    ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.", "terminal", "PATH"),
    ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.", "terminal", "wc"),
    ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.", "terminal", "SHELL"),
    ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.", "terminal", "memory"),
    ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.", "terminal", "id"),
    ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.", "terminal", "hostname"),
    ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.", "terminal", "echo"),
    ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.", "terminal", "seq"),
    ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.", "terminal", "print"),
    ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.", "terminal", "wc"),
    # ── Code Execution (15) ───────────────────────────────────────────────
    ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.", "execute_code", "factorial"),
    ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.", "execute_code", "words"),
    ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.", "execute_code", "fibonacci"),
    ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.", "execute_code", "json"),
    ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.", "execute_code", "glob"),
    ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.", "execute_code", "sum"),
    ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.", "execute_code", "palindrome"),
    ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.", "execute_code", "csv"),
    ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.", "execute_code", "sort"),
    ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.", "execute_code", "passwd"),
    ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.", "execute_code", "sha256"),
SHA256.", "execute_code", "sha256"), ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.", "execute_code", "utcnow"), ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.", "execute_code", "upper"), ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).", "execute_code", "sys"), ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.", "execute_code", "socket"), # ── Delegation (10) ────────────────────────────────────────────────── ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.", "delegate_task", "log"), ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?", "delegate_task", "15"), ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.", "delegate_task", "python"), ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.", "delegate_task", "summarize"), ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.", "delegate_task", "tmp"), ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.", "delegate_task", ".py"), ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.", "delegate_task", "df"), ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?", "delegate_task", "os"), ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.", "delegate_task", "hostname"), ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.", "delegate_task", "write"), # ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ── ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'", "todo", "benchmark"), ToolCall("todo-02", "todo", "Show me the current todo list.", "todo", ""), ToolCall("todo-03", "todo", "Mark the first todo item as completed.", "todo", "completed"), ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.", "todo", "Review"), ToolCall("todo-05", "todo", "Clear all completed todos.", "todo", "clear"), ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format( date=datetime.now().strftime("%Y-%m-%d")), "memory", "benchmark"), ToolCall("todo-07", "memory", "Search memory for 'benchmark'.", "memory", "benchmark"), ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.", "memory", "gemma"), ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.", "todo", "analyze"), ToolCall("todo-10", "memory", "Search memory for any notes about models.", "memory", "model"), # ── Skills (10 — replacing MCP tools which need servers) ───────────── ToolCall("skill-01", "skills", "List all available skills.", "skills_list", ""), ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.", "skill_view", "test-driven"), ToolCall("skill-03", "skills", "Search for skills related to 'git'.", "skills_list", "git"), ToolCall("skill-04", "skills", "View the 'code-review' skill.", "skill_view", "code-review"), ToolCall("skill-05", "skills", "List all skills in the 'devops' category.", "skills_list", "devops"), ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.", "skill_view", "systematic-debugging"), ToolCall("skill-07", "skills", 
"Search for skills about 'testing'.", "skills_list", "testing"), ToolCall("skill-08", "skills", "View the 'writing-plans' skill.", "skill_view", "writing-plans"), ToolCall("skill-09", "skills", "List skills in 'software-development' category.", "skills_list", "software-development"), ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.", "skill_view", "pr-review"), # ── Additional tests to reach 100 ──────────────────────────────────── ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].", "write_file", "bench_sort"), ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.", "read_file", "bench_sort"), ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.", "search_files", "class"), ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.", "terminal", "os"), ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.", "terminal", "cpu"), ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].", "execute_code", "flatten"), ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.", "execute_code", "prime"), ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?", "delegate_task", "cwd"), ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.", "todo", "Finalize"), ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.", "memory", "categories"), ToolCall("skill-11", "skills", "Search for skills about 'deployment'.", "skills_list", "deployment"), ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.", "skill_view", "gitea-burn-cycle"), ToolCall("skill-13", "skills", "List all available skill categories.", "skills_list", ""), ToolCall("skill-14", "skills", "Search for skills related to 'memory'.", "skills_list", "memory"), ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.", "skill_view", "mimo-swarm"), ] # fmt: on # --------------------------------------------------------------------------- # Runner # --------------------------------------------------------------------------- @dataclass class CallResult: test_id: str category: str model: str prompt: str expected_tool: str success: bool tool_called: Optional[str] = None tool_args_valid: bool = False execution_ok: bool = False latency_s: float = 0.0 error: str = "" raw_response: str = "" @dataclass class ModelStats: model: str total: int = 0 schema_ok: int = 0 # model produced valid tool call JSON exec_ok: int = 0 # tool actually ran without error latency_sum: float = 0.0 failures: list = field(default_factory=list) @property def schema_pct(self) -> float: return (self.schema_ok / self.total * 100) if self.total else 0 @property def exec_pct(self) -> float: return (self.exec_ok / self.total * 100) if self.total else 0 @property def avg_latency(self) -> float: return (self.latency_sum / self.total) if self.total else 0 def setup_test_files(): """Create prerequisite files for the benchmark.""" Path("/tmp/test_bench.txt").write_text( "This is a benchmark test file.\n" "It contains sample data for tool-calling tests.\n" "Line three has some import statements.\n" "import os\nimport sys\nimport json\n" "End of test data.\n" ) def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult: """Run a single tool-calling test 
through the agent.""" from run_agent import AIAgent result = CallResult( test_id=tc.id, category=tc.category, model=model_spec, prompt=tc.prompt, expected_tool=tc.expected_tool, success=False, ) try: agent = AIAgent( model=model_spec, provider=provider, max_iterations=3, quiet_mode=True, skip_context_files=True, skip_memory=True, persist_session=False, ) t0 = time.time() conv = agent.run_conversation( user_message=tc.prompt, system_message=( "You are a benchmark test runner. Execute the user's request by calling " "the appropriate tool. Return the tool result directly. Do not add commentary." ), ) result.latency_s = round(time.time() - t0, 2) messages = conv.get("messages", []) # Find the first assistant message with tool_calls tool_called = None tool_args_str = "" for msg in messages: if msg.get("role") == "assistant" and msg.get("tool_calls"): for tc_item in msg["tool_calls"]: fn = tc_item.get("function", {}) tool_called = fn.get("name", "") tool_args_str = fn.get("arguments", "{}") break break if tool_called: result.tool_called = tool_called result.schema_ok = True # Check if the right tool was called if tool_called == tc.expected_tool: result.success = True # Check if args contain expected substring if tc.expected_params_check: result.tool_args_valid = tc.expected_params_check in tool_args_str else: result.tool_args_valid = True # Check if tool executed (look for tool role message) for msg in messages: if msg.get("role") == "tool": content = msg.get("content", "") if content and "error" not in content.lower()[:50]: result.execution_ok = True break elif content: result.execution_ok = True # got a response, even if error break else: # No tool call produced — still check if model responded final = conv.get("final_response", "") result.raw_response = final[:200] if final else "" except Exception as e: result.error = f"{type(e).__name__}: {str(e)[:200]}" result.latency_s = round(time.time() - t0, 2) if 't0' in dir() else 0 return result def generate_report(results: list[CallResult], models: list[str], output_path: Path): """Generate markdown benchmark report.""" now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") # Aggregate per model stats: dict[str, ModelStats] = {} for m in models: stats[m] = ModelStats(model=m) by_category: dict[str, dict[str, list[CallResult]]] = {} for r in results: s = stats[r.model] s.total += 1 s.schema_ok += int(r.schema_ok) s.exec_ok += int(r.execution_ok) s.latency_sum += r.latency_s if not r.success: s.failures.append(r) by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r) lines = [ f"# Tool-Calling Benchmark Report", f"", f"Generated: {now}", f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories", f"Models tested: {', '.join(models)}", f"", f"## Summary", f"", f"| Metric | {' | '.join(models)} |", f"|--------|{'|'.join('---------' for _ in models)}|", ] # Schema parse success row = "| Schema parse success | " for m in models: s = stats[m] row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | " lines.append(row) # Tool execution success row = "| Tool execution success | " for m in models: s = stats[m] row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | " lines.append(row) # Correct tool selected row = "| Correct tool selected | " for m in models: s = stats[m] correct = sum(1 for r in results if r.model == m and r.success) pct = (correct / s.total * 100) if s.total else 0 row += f"{correct}/{s.total} ({pct:.0f}%) | " lines.append(row) # Avg latency row = "| Avg latency (s) | " for m in models: s 
def generate_report(results: list[CallResult], models: list[str], output_path: Path):
    """Generate markdown benchmark report."""
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    # Aggregate per model
    stats: dict[str, ModelStats] = {}
    for m in models:
        stats[m] = ModelStats(model=m)
    by_category: dict[str, dict[str, list[CallResult]]] = {}

    for r in results:
        s = stats[r.model]
        s.total += 1
        s.schema_ok += int(r.schema_ok)
        s.exec_ok += int(r.execution_ok)
        s.latency_sum += r.latency_s
        if not r.success:
            s.failures.append(r)
        by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)

    lines = [
        f"# Tool-Calling Benchmark Report",
        f"",
        f"Generated: {now}",
        f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
        f"Models tested: {', '.join(models)}",
        f"",
        f"## Summary",
        f"",
        f"| Metric | {' | '.join(models)} |",
        f"|--------|{'|'.join('---------' for _ in models)}|",
    ]

    # Schema parse success
    row = "| Schema parse success | "
    for m in models:
        s = stats[m]
        row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | "
    lines.append(row)

    # Tool execution success
    row = "| Tool execution success | "
    for m in models:
        s = stats[m]
        row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | "
    lines.append(row)

    # Correct tool selected
    row = "| Correct tool selected | "
    for m in models:
        s = stats[m]
        correct = sum(1 for r in results if r.model == m and r.success)
        pct = (correct / s.total * 100) if s.total else 0
        row += f"{correct}/{s.total} ({pct:.0f}%) | "
    lines.append(row)

    # Avg latency
    row = "| Avg latency (s) | "
    for m in models:
        s = stats[m]
        row += f"{s.avg_latency:.2f} | "
    lines.append(row)
    lines.append("")

    # Per-category breakdown
    lines.append("## Per-Category Breakdown")
    lines.append("")
    for cat in sorted(by_category.keys()):
        lines.append(f"### {cat.title()}")
        lines.append("")
        lines.append(f"| Metric | {' | '.join(models)} |")
        lines.append(f"|--------|{'|'.join('---------' for _ in models)}|")
        cat_data = by_category[cat]
        for metric_name, fn in [
            ("Schema OK", lambda r: r.schema_ok),
            ("Exec OK", lambda r: r.execution_ok),
            ("Correct tool", lambda r: r.success),
        ]:
            row = f"| {metric_name} | "
            for m in models:
                results_m = cat_data.get(m, [])
                total = len(results_m)
                ok = sum(1 for r in results_m if fn(r))
                pct = (ok / total * 100) if total else 0
                row += f"{ok}/{total} ({pct:.0f}%) | "
            lines.append(row)
        lines.append("")

    # Failure analysis
    lines.append("## Failure Analysis")
    lines.append("")
    any_failures = False
    for m in models:
        s = stats[m]
        if s.failures:
            any_failures = True
            lines.append(f"### {m} — {len(s.failures)} failures")
            lines.append("")
            lines.append("| Test | Category | Expected | Got | Error |")
            lines.append("|------|----------|----------|-----|-------|")
            for r in s.failures:
                got = r.tool_called or "none"
                err = r.error or "wrong tool"
                lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err[:60]} |")
            lines.append("")
    if not any_failures:
        lines.append("No failures detected.")
        lines.append("")

    # Raw results JSON
    lines.append("## Raw Results")
    lines.append("")
    lines.append("```json")
    lines.append(json.dumps([asdict(r) for r in results], indent=2, default=str))
    lines.append("```")

    report = "\n".join(lines)
    output_path.write_text(report)
    return report
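
# Model specs are given as "provider:model"; main() splits on the first colon, so
# "nous:gia-3/gemma-4-31b" becomes provider "nous" and model "gia-3/gemma-4-31b".
# A spec without a colon is used as both provider and model name.
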
def main():
    parser = argparse.ArgumentParser(description="Tool-calling benchmark")
    parser.add_argument("--models", nargs="+",
                        default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
                        help="Model specs to test (provider:model)")
    parser.add_argument("--limit", type=int, default=0,
                        help="Run only first N tests (0 = all)")
    parser.add_argument("--category", type=str, default="",
                        help="Run only tests in this category")
    parser.add_argument("--output", type=str, default="",
                        help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print test cases without running them")
    args = parser.parse_args()

    # Filter suite
    suite = SUITE[:]
    if args.category:
        suite = [tc for tc in suite if tc.category == args.category]
    if args.limit > 0:
        suite = suite[:args.limit]

    if args.dry_run:
        print(f"Would run {len(suite)} tests:")
        for tc in suite:
            print(f"  [{tc.category:8s}] {tc.id}: {tc.expected_tool} — {tc.prompt[:60]}")
        return

    # Setup
    setup_test_files()
    date_str = datetime.now().strftime("%Y-%m-%d")
    output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"

    # Parse model specs
    model_specs = []
    for spec in args.models:
        parts = spec.split(":", 1)
        provider = parts[0]
        model_name = parts[1] if len(parts) > 1 else parts[0]
        model_specs.append((provider, model_name, spec))

    print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
    print(f"Output: {output_path}")
    print()

    all_results: list[CallResult] = []
    for provider, model_name, full_spec in model_specs:
        print(f"── {full_spec} {'─' * (50 - len(full_spec))}")
        model_results = []
        for i, tc in enumerate(suite, 1):
            sys.stdout.write(f"\r  [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s} → {tc.expected_tool:20s}")
            sys.stdout.flush()
            r = run_single_test(tc, full_spec, provider)
            model_results.append(r)
            status = "✓" if r.success else "✗"
            sys.stdout.write(f" {status} ({r.latency_s:.1f}s)")
            sys.stdout.write("\n")
        all_results.extend(model_results)

        # Quick stats
        ok = sum(1 for r in model_results if r.success)
        print(f"  Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
        print()

    # Generate report
    model_names = [spec for _, _, spec in model_specs]
    report = generate_report(all_results, model_names, output_path)
    print(f"Report written to {output_path}")

    # Exit code: 0 if all pass, 1 if any failures
    total_fail = sum(1 for r in all_results if not r.success)
    sys.exit(1 if total_fail > 0 else 0)


if __name__ == "__main__":
    main()
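
# The exit status is 0 only when every call selected the expected tool and 1 otherwise,
# so the suite can gate CI. Illustrative shell usage (not from the original docs):
#   python3 benchmarks/tool_call_benchmark.py --limit 10 && echo "smoke test passed"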