Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Successful in 42s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 32s
Tests / e2e (pull_request) Successful in 2m26s
Tests / test (pull_request) Failing after 44m7s
100-call regression test across 7 tool categories: - File operations (23): read_file, write_file, search_files, patch - Terminal commands (22): shell execution - Code execution (17): execute_code - Delegation (11): delegate_task - Todo (7): todo - Memory (5): memory - Skills (15): skills_list, skill_view Metrics tracked: - Schema parse success (valid JSON tool calls) - Tool name accuracy (correct tool selected) - Arguments accuracy (required args present) - Average latency per call Usage: python3 benchmarks/tool_call_benchmark.py --models nous:mimo-v2-pro python3 benchmarks/tool_call_benchmark.py --limit 10 python3 benchmarks/tool_call_benchmark.py --category file
615 lines
26 KiB
Python
615 lines
26 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.
|
||
|
||
Runs 100 diverse tool-calling prompts through multiple models and compares
|
||
success rates, latency, and token costs.
|
||
|
||
Usage:
|
||
python3 benchmarks/tool_call_benchmark.py # full 100-call suite
|
||
python3 benchmarks/tool_call_benchmark.py --limit 10 # quick smoke test
|
||
python3 benchmarks/tool_call_benchmark.py --models nous # single model
|
||
python3 benchmarks/tool_call_benchmark.py --category file # single category
|
||
|
||
Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import os
|
||
import sys
|
||
import time
|
||
import traceback
|
||
from dataclasses import dataclass, field, asdict
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
|
||
# Ensure hermes-agent root is importable
|
||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||
sys.path.insert(0, str(REPO_ROOT))
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test Definitions
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@dataclass
class ToolCall:
    """A single tool-calling test case.

    One prompt sent to the model plus the tool we expect it to invoke.
    run_single_test scores the response: `expected_tool` must equal the
    tool name the model calls, and `expected_params_check` (when set) must
    appear as a substring of the raw JSON arguments string.
    """

    id: str  # unique test id, e.g. "file-01"
    category: str  # suite category key ("file", "terminal", "code", ...)
    prompt: str  # user message sent to the agent
    expected_tool: str  # tool name we expect the model to call
    expected_params_check: str = ""  # substring expected in JSON args (case-sensitive)
    timeout: int = 30  # max seconds per call — NOTE(review): not enforced by run_single_test; confirm intent
    notes: str = ""  # free-form annotation; not used for scoring
|
||
|
||
|
||
# fmt: off
# 100-call suite. `expected_params_check` is a CASE-SENSITIVE substring match
# against the raw JSON arguments string (see run_single_test), so each check
# must be a token that a *correct* tool call is guaranteed to contain.
SUITE: list[ToolCall] = [
    # ── File Operations (20) ──────────────────────────────────────────────
    ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.", "read_file", "path"),
    ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt", "write_file", "path"),
    ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.", "search_files", "pattern"),
    ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts", "read_file", "offset"),
    ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'", "patch", "old_string"),
    ToolCall("file-06", "file", "Search for files matching *.py in the current directory.", "search_files", "target"),
    ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd", "read_file", "limit"),
    ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true", "write_file", "content"),
    ToolCall("file-09", "file", "Search for 'def test_' in Python test files.", "search_files", "file_glob"),
    ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.", "read_file", "bench_config"),
    ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'", "write_file", "bench_readme"),
    ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.", "search_files", "TODO"),
    ToolCall("file-13", "file", "Read /tmp/bench_readme.md", "read_file", "bench_readme"),
    ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'", "patch", "Tool Benchmark"),
    ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.", "write_file", "bench_hello"),
    ToolCall("file-16", "file", "Search for all .json files in /tmp/.", "search_files", "json"),
    ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').", "read_file", "bench_hello"),
    ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.", "patch", "hello world"),
    ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.", "search_files", "bench"),
    ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.", "read_file", "test_bench"),

    # ── Terminal Commands (20) ────────────────────────────────────────────
    ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.", "terminal", "echo"),
    ToolCall("term-02", "terminal", "Run `date` to get the current date and time.", "terminal", "date"),
    ToolCall("term-03", "terminal", "Run `uname -a` to get system information.", "terminal", "uname"),
    ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.", "terminal", "pwd"),
    ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.", "terminal", "head"),
    ToolCall("term-06", "terminal", "Run `whoami` to show the current user.", "terminal", "whoami"),
    ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.", "terminal", "df"),
    ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.", "terminal", "python3"),
    ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.", "terminal", "hostname"),
    ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.", "terminal", "uptime"),
    ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.", "terminal", "PATH"),
    ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.", "terminal", "wc"),
    ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.", "terminal", "SHELL"),
    # FIX: check was "memory", which never appears in the args of a correct
    # call (`free -h || vm_stat`); "free" is guaranteed to be present.
    ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.", "terminal", "free"),
    ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.", "terminal", "id"),
    ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.", "terminal", "hostname"),
    ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.", "terminal", "echo"),
    ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.", "terminal", "seq"),
    ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.", "terminal", "print"),
    ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.", "terminal", "wc"),

    # ── Code Execution (15) ──────────────────────────────────────────────
    ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.", "execute_code", "factorial"),
    ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.", "execute_code", "words"),
    ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.", "execute_code", "fibonacci"),
    ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.", "execute_code", "json"),
    ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.", "execute_code", "glob"),
    ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.", "execute_code", "sum"),
    ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.", "execute_code", "palindrome"),
    ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.", "execute_code", "csv"),
    ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.", "execute_code", "sort"),
    ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.", "execute_code", "passwd"),
    ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.", "execute_code", "sha256"),
    # FIX: check was "utcnow"; correct code using datetime.now(timezone.utc)
    # or time.time() would fail it. "utc" matches both utcnow() and timezone.utc.
    ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.", "execute_code", "utc"),
    ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.", "execute_code", "upper"),
    ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).", "execute_code", "sys"),
    ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.", "execute_code", "socket"),

    # ── Delegation (10) ──────────────────────────────────────────────────
    ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.", "delegate_task", "log"),
    ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?", "delegate_task", "15"),
    ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.", "delegate_task", "python"),
    ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.", "delegate_task", "summarize"),
    ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.", "delegate_task", "tmp"),
    ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.", "delegate_task", ".py"),
    ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.", "delegate_task", "df"),
    ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?", "delegate_task", "os"),
    ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.", "delegate_task", "hostname"),
    ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.", "delegate_task", "write"),

    # ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ──
    ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'", "todo", "benchmark"),
    ToolCall("todo-02", "todo", "Show me the current todo list.", "todo", ""),
    ToolCall("todo-03", "todo", "Mark the first todo item as completed.", "todo", "completed"),
    ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.", "todo", "Review"),
    ToolCall("todo-05", "todo", "Clear all completed todos.", "todo", "clear"),
    ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(date=datetime.now().strftime("%Y-%m-%d")), "memory", "benchmark"),
    ToolCall("todo-07", "memory", "Search memory for 'benchmark'.", "memory", "benchmark"),
    ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.", "memory", "gemma"),
    ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.", "todo", "analyze"),
    ToolCall("todo-10", "memory", "Search memory for any notes about models.", "memory", "model"),

    # ── Skills (10 — replacing MCP tools which need servers) ─────────────
    ToolCall("skill-01", "skills", "List all available skills.", "skills_list", ""),
    ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.", "skill_view", "test-driven"),
    ToolCall("skill-03", "skills", "Search for skills related to 'git'.", "skills_list", "git"),
    ToolCall("skill-04", "skills", "View the 'code-review' skill.", "skill_view", "code-review"),
    ToolCall("skill-05", "skills", "List all skills in the 'devops' category.", "skills_list", "devops"),
    ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.", "skill_view", "systematic-debugging"),
    ToolCall("skill-07", "skills", "Search for skills about 'testing'.", "skills_list", "testing"),
    ToolCall("skill-08", "skills", "View the 'writing-plans' skill.", "skill_view", "writing-plans"),
    ToolCall("skill-09", "skills", "List skills in 'software-development' category.", "skills_list", "software-development"),
    ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.", "skill_view", "pr-review"),

    # ── Additional tests to reach 100 ────────────────────────────────────
    ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].", "write_file", "bench_sort"),
    ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.", "read_file", "bench_sort"),
    ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.", "search_files", "class"),
    ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.", "terminal", "os"),
    ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.", "terminal", "cpu"),
    ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].", "execute_code", "flatten"),
    ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.", "execute_code", "prime"),
    ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?", "delegate_task", "cwd"),
    ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.", "todo", "Finalize"),
    ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.", "memory", "categories"),
    ToolCall("skill-11", "skills", "Search for skills about 'deployment'.", "skills_list", "deployment"),
    ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.", "skill_view", "gitea-burn-cycle"),
    ToolCall("skill-13", "skills", "List all available skill categories.", "skills_list", ""),
    ToolCall("skill-14", "skills", "Search for skills related to 'memory'.", "skills_list", "memory"),
    ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.", "skill_view", "mimo-swarm"),
]
# fmt: on
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Runner
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@dataclass
class CallResult:
    """Outcome of one benchmark call for a single (test, model) pair.

    FIX: `schema_ok` is now a declared field. Previously run_single_test set
    it as a dynamic attribute only when a tool call was parsed, so
    generate_report's `int(r.schema_ok)` raised AttributeError for results
    with no tool call, and `asdict()` silently omitted it from the raw JSON.
    It is appended last so existing positional construction is unaffected.
    """

    test_id: str
    category: str
    model: str
    prompt: str
    expected_tool: str
    success: bool  # expected tool was the one selected
    tool_called: Optional[str] = None  # tool actually invoked, if any
    tool_args_valid: bool = False  # expected_params_check substring matched
    execution_ok: bool = False  # a tool-role message with content came back
    latency_s: float = 0.0  # wall-clock seconds for the conversation
    error: str = ""  # "ExcType: message" when the call raised
    raw_response: str = ""  # first 200 chars of plain reply when no tool call
    schema_ok: bool = False  # model produced a parseable tool call
|
||
|
||
|
||
@dataclass
class ModelStats:
    """Running aggregate of benchmark counters for a single model."""

    model: str
    total: int = 0  # number of calls attributed to this model
    schema_ok: int = 0  # model produced valid tool call JSON
    exec_ok: int = 0  # tool actually ran without error
    latency_sum: float = 0.0  # total wall-clock seconds across calls
    failures: list = field(default_factory=list)  # CallResults where success=False

    @property
    def schema_pct(self) -> float:
        """Share of calls (0-100) that produced a parseable tool call."""
        if not self.total:
            return 0
        return self.schema_ok / self.total * 100

    @property
    def exec_pct(self) -> float:
        """Share of calls (0-100) whose tool invocation ran."""
        if not self.total:
            return 0
        return self.exec_ok / self.total * 100

    @property
    def avg_latency(self) -> float:
        """Mean latency in seconds; 0 when no calls were recorded."""
        if not self.total:
            return 0
        return self.latency_sum / self.total
|
||
|
||
|
||
def setup_test_files():
    """Create prerequisite files for the benchmark.

    Writes the fixture that several 'file' and 'code' test prompts read
    (/tmp/test_bench.txt). Overwrites any previous copy.
    """
    fixture = (
        "This is a benchmark test file.\n"
        "It contains sample data for tool-calling tests.\n"
        "Line three has some import statements.\n"
        "import os\nimport sys\nimport json\n"
        "End of test data.\n"
    )
    target = Path("/tmp/test_bench.txt")
    target.write_text(fixture)
|
||
|
||
|
||
def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
    """Run a single tool-calling test through the agent.

    Args:
        tc: the test case to execute.
        model_spec: full spec string ("provider:model"), recorded in the result.
        provider: provider name handed to the agent constructor.

    Returns:
        A populated CallResult. Exceptions are captured into ``result.error``
        instead of propagating, so one bad call cannot abort the whole suite.
    """
    from run_agent import AIAgent

    result = CallResult(
        test_id=tc.id,
        category=tc.category,
        model=model_spec,
        prompt=tc.prompt,
        expected_tool=tc.expected_tool,
        success=False,
    )

    # FIX: t0 is initialized before the try block so the except handler can
    # always compute a latency. The original assigned t0 only after agent
    # construction and used a fragile "'t0' in dir()" check in the handler.
    t0 = time.time()
    try:
        agent = AIAgent(
            model=model_spec,
            provider=provider,
            max_iterations=3,
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
            persist_session=False,
        )

        # Restart the clock here so latency measures the conversation only,
        # excluding agent construction (matches the original measurement).
        t0 = time.time()
        conv = agent.run_conversation(
            user_message=tc.prompt,
            system_message=(
                "You are a benchmark test runner. Execute the user's request by calling "
                "the appropriate tool. Return the tool result directly. Do not add commentary."
            ),
        )
        result.latency_s = round(time.time() - t0, 2)

        messages = conv.get("messages", [])

        # Take the first tool call from the first assistant message that has one.
        tool_called = None
        tool_args_str = ""
        for msg in messages:
            if msg.get("role") == "assistant" and msg.get("tool_calls"):
                fn = msg["tool_calls"][0].get("function", {})
                tool_called = fn.get("name", "")
                tool_args_str = fn.get("arguments", "{}")
                break

        if tool_called:
            result.tool_called = tool_called
            result.schema_ok = True

            # Correct tool selected?
            if tool_called == tc.expected_tool:
                result.success = True

            # Heuristic arg check: case-sensitive substring of the raw JSON args.
            if tc.expected_params_check:
                result.tool_args_valid = tc.expected_params_check in tool_args_str
            else:
                result.tool_args_valid = True

            # Any non-empty tool-role message counts as "executed" — even an
            # error string (intentional: we measure that the call round-tripped,
            # not that the tool succeeded; both original branches set True).
            for msg in messages:
                if msg.get("role") == "tool" and msg.get("content", ""):
                    result.execution_ok = True
                    break
        else:
            # No tool call produced — keep a snippet of the plain reply for the report.
            final = conv.get("final_response", "")
            result.raw_response = final[:200] if final else ""

    except Exception as e:
        result.error = f"{type(e).__name__}: {str(e)[:200]}"
        result.latency_s = round(time.time() - t0, 2)

    return result
|
||
|
||
|
||
def generate_report(results: list[CallResult], models: list[str], output_path: Path):
    """Generate a markdown benchmark report and write it to *output_path*.

    Args:
        results: every CallResult collected across all models.
        models: full model spec strings in run order; used as table column
            headers and as keys into the per-model aggregates (must match
            ``CallResult.model`` values exactly).
        output_path: file the markdown report is written to.

    Returns:
        The full report text that was written.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    # Aggregate per model
    stats: dict[str, ModelStats] = {}
    for m in models:
        stats[m] = ModelStats(model=m)

    # category -> model -> results, for the per-category breakdown tables
    by_category: dict[str, dict[str, list[CallResult]]] = {}

    for r in results:
        s = stats[r.model]
        s.total += 1
        # NOTE(review): run_single_test assigns r.schema_ok only when a tool
        # call was parsed — confirm CallResult declares a default for it,
        # otherwise this raises AttributeError for no-tool-call results.
        s.schema_ok += int(r.schema_ok)
        s.exec_ok += int(r.execution_ok)
        s.latency_sum += r.latency_s
        if not r.success:
            s.failures.append(r)

        by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)

    # Report header plus the summary-table scaffold (one column per model).
    lines = [
        f"# Tool-Calling Benchmark Report",
        f"",
        f"Generated: {now}",
        f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
        f"Models tested: {', '.join(models)}",
        f"",
        f"## Summary",
        f"",
        f"| Metric | {' | '.join(models)} |",
        f"|--------|{'|'.join('---------' for _ in models)}|",
    ]

    # Schema parse success
    row = "| Schema parse success | "
    for m in models:
        s = stats[m]
        row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | "
    lines.append(row)

    # Tool execution success
    row = "| Tool execution success | "
    for m in models:
        s = stats[m]
        row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | "
    lines.append(row)

    # Correct tool selected (success flag means the expected tool was called)
    row = "| Correct tool selected | "
    for m in models:
        s = stats[m]
        correct = sum(1 for r in results if r.model == m and r.success)
        pct = (correct / s.total * 100) if s.total else 0
        row += f"{correct}/{s.total} ({pct:.0f}%) | "
    lines.append(row)

    # Avg latency
    row = "| Avg latency (s) | "
    for m in models:
        s = stats[m]
        row += f"{s.avg_latency:.2f} | "
    lines.append(row)

    lines.append("")

    # Per-category breakdown: one small table per category, same columns.
    lines.append("## Per-Category Breakdown")
    lines.append("")

    for cat in sorted(by_category.keys()):
        lines.append(f"### {cat.title()}")
        lines.append("")
        lines.append(f"| Metric | {' | '.join(models)} |")
        lines.append(f"|--------|{'|'.join('---------' for _ in models)}|")

        cat_data = by_category[cat]
        # Each (label, predicate) pair becomes one table row.
        for metric_name, fn in [
            ("Schema OK", lambda r: r.schema_ok),
            ("Exec OK", lambda r: r.execution_ok),
            ("Correct tool", lambda r: r.success),
        ]:
            row = f"| {metric_name} | "
            for m in models:
                results_m = cat_data.get(m, [])
                total = len(results_m)
                ok = sum(1 for r in results_m if fn(r))
                pct = (ok / total * 100) if total else 0
                row += f"{ok}/{total} ({pct:.0f}%) | "
            lines.append(row)

        lines.append("")

    # Failure analysis: one table per model listing every non-success result.
    lines.append("## Failure Analysis")
    lines.append("")

    any_failures = False
    for m in models:
        s = stats[m]
        if s.failures:
            any_failures = True
            lines.append(f"### {m} — {len(s.failures)} failures")
            lines.append("")
            lines.append("| Test | Category | Expected | Got | Error |")
            lines.append("|------|----------|----------|-----|-------|")
            for r in s.failures:
                got = r.tool_called or "none"
                # Empty error with a wrong/missing tool call reads as "wrong tool".
                err = r.error or "wrong tool"
                lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err[:60]} |")
            lines.append("")

    if not any_failures:
        lines.append("No failures detected.")
        lines.append("")

    # Raw results JSON appended verbatim so the report is self-contained.
    lines.append("## Raw Results")
    lines.append("")
    lines.append("```json")
    lines.append(json.dumps([asdict(r) for r in results], indent=2, default=str))
    lines.append("```")

    report = "\n".join(lines)
    output_path.write_text(report)
    return report
|
||
|
||
|
||
def main():
    """CLI entry point: parse args, run the suite, write the markdown report.

    Exits with status 1 if any call selected the wrong tool (or errored),
    0 when everything passed, 2 when the filters select no tests.
    """
    parser = argparse.ArgumentParser(description="Tool-calling benchmark")
    parser.add_argument("--models", nargs="+",
                        default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
                        help="Model specs to test (provider:model)")
    parser.add_argument("--limit", type=int, default=0,
                        help="Run only first N tests (0 = all)")
    parser.add_argument("--category", type=str, default="",
                        help="Run only tests in this category")
    parser.add_argument("--output", type=str, default="",
                        help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print test cases without running them")
    args = parser.parse_args()

    # Filter suite
    suite = SUITE[:]
    if args.category:
        suite = [tc for tc in suite if tc.category == args.category]
    if args.limit > 0:
        suite = suite[:args.limit]

    # FIX: an unknown --category previously fell through to a
    # ZeroDivisionError in the per-model percentage print below.
    if not suite:
        print("No tests selected — check --category / --limit.")
        sys.exit(2)

    if args.dry_run:
        print(f"Would run {len(suite)} tests:")
        for tc in suite:
            print(f" [{tc.category:8s}] {tc.id}: {tc.expected_tool} — {tc.prompt[:60]}")
        return

    # Setup fixture files and resolve the report location.
    setup_test_files()
    date_str = datetime.now().strftime("%Y-%m-%d")
    output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"

    # Parse model specs: "provider:model" (a bare name doubles as both).
    model_specs = []
    for spec in args.models:
        parts = spec.split(":", 1)
        provider = parts[0]
        model_name = parts[1] if len(parts) > 1 else parts[0]
        model_specs.append((provider, model_name, spec))

    print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
    print(f"Output: {output_path}")
    print()

    all_results: list[CallResult] = []

    for provider, model_name, full_spec in model_specs:
        print(f"── {full_spec} {'─' * (50 - len(full_spec))}")
        model_results = []

        for i, tc in enumerate(suite, 1):
            # Progress line is overwritten in place, then finalized per test.
            sys.stdout.write(f"\r [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s} → {tc.expected_tool:20s}")
            sys.stdout.flush()

            r = run_single_test(tc, full_spec, provider)
            model_results.append(r)

            status = "✓" if r.success else "✗"
            sys.stdout.write(f" {status} ({r.latency_s:.1f}s)")
            sys.stdout.write("\n")

        all_results.extend(model_results)

        # Quick per-model stats (suite is non-empty here, so no div-by-zero).
        ok = sum(1 for r in model_results if r.success)
        print(f" Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
        print()

    # Generate report (return value is the report text; not needed here).
    model_names = [spec for _, _, spec in model_specs]
    generate_report(all_results, model_names, output_path)
    print(f"Report written to {output_path}")

    # Exit code: 0 if all pass, 1 if any failures
    total_fail = sum(1 for r in all_results if not r.success)
    sys.exit(1 if total_fail > 0 else 0)


if __name__ == "__main__":
    main()
|