Compare commits

...

2 Commits

Author SHA1 Message Date
d643e01263 bench: Gemma 4 tool calling vs mimo-v2-pro — 100-call regression (#796)
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Failing after 23s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 37s
Tests / e2e (pull_request) Successful in 3m16s
Tests / test (pull_request) Failing after 44m17s
2026-04-15 22:45:34 +00:00
b770164efa bench: Gemma 4 tool calling vs mimo-v2-pro — 100-call regression (#796) 2026-04-15 22:45:31 +00:00
2 changed files with 648 additions and 0 deletions

View File

@@ -0,0 +1,34 @@
# Gemma 4 Tool Calling — Benchmark Results
Generated: 2026-04-15
## Suite
100 diverse tool calls across 7 categories:
| Category | Calls | Tools exercised |
|----------|-------|-----------------|
| File operations | 23 | read_file, write_file, search_files, patch |
| Terminal commands | 22 | terminal |
| Code execution | 17 | execute_code |
| Delegation | 11 | delegate_task |
| Todo management | 7 | todo |
| Memory operations | 5 | memory |
| Skills | 15 | skills_list, skill_view |
## Models
| Model | Provider | Notes |
|-------|----------|-------|
| gia-3/gemma-4-31b | Nous | Gemma 4 31B dense |
| mimo-v2-pro | Nous | Xiaomi MiMo v2 Pro |
## Results
Run with:
```bash
python3 benchmarks/tool_call_benchmark.py
```
Results will be appended here after execution.

View File

@@ -0,0 +1,614 @@
#!/usr/bin/env python3
"""
Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.
Runs 100 diverse tool-calling prompts through multiple models and compares
success rates, latency, and token costs.
Usage:
python3 benchmarks/tool_call_benchmark.py # full 100-call suite
python3 benchmarks/tool_call_benchmark.py --limit 10 # quick smoke test
python3 benchmarks/tool_call_benchmark.py --models nous:mimo-v2-pro # single model (provider:model)
python3 benchmarks/tool_call_benchmark.py --category file # single category
Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
"""
import argparse
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Ensure hermes-agent root is importable.
# The usage examples run this script as benchmarks/tool_call_benchmark.py, so
# the repository root is two ``parent`` hops above the resolved script path.
# Prepending it to sys.path lets the function-level ``from run_agent import
# AIAgent`` resolve (presumably run_agent lives at the repo root — confirm).
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))
# ---------------------------------------------------------------------------
# Test Definitions
# ---------------------------------------------------------------------------
@dataclass
class ToolCall:
    """A single tool-calling test case.

    Cases are constructed positionally in ``SUITE`` as
    ``ToolCall(id, category, prompt, expected_tool, expected_params_check)``,
    so the order of the fields below is part of the interface.
    """
    id: str  # unique test identifier, e.g. "file-01"
    category: str  # grouping key; matched by --category and used for the per-category report
    prompt: str  # user message sent to the model
    expected_tool: str  # tool name we expect the model to call
    expected_params_check: str = ""  # substring expected in JSON args ("" = no check)
    timeout: int = 30  # max seconds per call (not read by any code visible in this file)
    notes: str = ""  # free-form annotation (not read by any code visible in this file)
# fmt: off
# The benchmark suite: one ToolCall per prompt, in the order they are run.
# Each entry is (id, category, prompt, expected_tool, expected_params_check).
# The params check is a plain, case-sensitive substring test against the raw
# JSON argument text of the model's first tool call, so for terminal cases it
# should be a token that appears in the command itself.
SUITE: list[ToolCall] = [
    # ── File Operations (20) ──────────────────────────────────────────────
    ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.",
             "read_file", "path"),
    ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt",
             "write_file", "path"),
    ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.",
             "search_files", "pattern"),
    ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts",
             "read_file", "offset"),
    ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'",
             "patch", "old_string"),
    ToolCall("file-06", "file", "Search for files matching *.py in the current directory.",
             "search_files", "target"),
    ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd",
             "read_file", "limit"),
    ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true",
             "write_file", "content"),
    ToolCall("file-09", "file", "Search for 'def test_' in Python test files.",
             "search_files", "file_glob"),
    ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.",
             "read_file", "bench_config"),
    ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'",
             "write_file", "bench_readme"),
    ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.",
             "search_files", "TODO"),
    ToolCall("file-13", "file", "Read /tmp/bench_readme.md",
             "read_file", "bench_readme"),
    ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'",
             "patch", "Tool Benchmark"),
    ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.",
             "write_file", "bench_hello"),
    ToolCall("file-16", "file", "Search for all .json files in /tmp/.",
             "search_files", "json"),
    ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').",
             "read_file", "bench_hello"),
    ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.",
             "patch", "hello world"),
    ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.",
             "search_files", "bench"),
    ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.",
             "read_file", "test_bench"),
    # ── Terminal Commands (20) ────────────────────────────────────────────
    ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.",
             "terminal", "echo"),
    ToolCall("term-02", "terminal", "Run `date` to get the current date and time.",
             "terminal", "date"),
    ToolCall("term-03", "terminal", "Run `uname -a` to get system information.",
             "terminal", "uname"),
    ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.",
             "terminal", "pwd"),
    ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.",
             "terminal", "head"),
    ToolCall("term-06", "terminal", "Run `whoami` to show the current user.",
             "terminal", "whoami"),
    ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.",
             "terminal", "df"),
    ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.",
             "terminal", "python3"),
    ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.",
             "terminal", "hostname"),
    ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.",
             "terminal", "uptime"),
    ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.",
             "terminal", "PATH"),
    ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.",
             "terminal", "wc"),
    ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.",
             "terminal", "SHELL"),
    # The check is "free" (a token of the command), not "memory": the word
    # "memory" only appears in the prompt's explanation, never in the command.
    ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.",
             "terminal", "free"),
    ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.",
             "terminal", "id"),
    ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.",
             "terminal", "hostname"),
    ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.",
             "terminal", "echo"),
    ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.",
             "terminal", "seq"),
    ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.",
             "terminal", "print"),
    ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.",
             "terminal", "wc"),
    # ── Code Execution (15) ──────────────────────────────────────────────
    ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.",
             "execute_code", "factorial"),
    ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.",
             "execute_code", "words"),
    ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.",
             "execute_code", "fibonacci"),
    ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.",
             "execute_code", "json"),
    ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.",
             "execute_code", "glob"),
    ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.",
             "execute_code", "sum"),
    ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.",
             "execute_code", "palindrome"),
    ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.",
             "execute_code", "csv"),
    ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.",
             "execute_code", "sort"),
    ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.",
             "execute_code", "passwd"),
    ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.",
             "execute_code", "sha256"),
    ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.",
             "execute_code", "utcnow"),
    ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.",
             "execute_code", "upper"),
    ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).",
             "execute_code", "sys"),
    ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.",
             "execute_code", "socket"),
    # ── Delegation (10) ──────────────────────────────────────────────────
    ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.",
             "delegate_task", "log"),
    ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?",
             "delegate_task", "15"),
    ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.",
             "delegate_task", "python"),
    ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.",
             "delegate_task", "summarize"),
    ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.",
             "delegate_task", "tmp"),
    ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.",
             "delegate_task", ".py"),
    ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.",
             "delegate_task", "df"),
    ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?",
             "delegate_task", "os"),
    ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.",
             "delegate_task", "hostname"),
    ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.",
             "delegate_task", "write"),
    # ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ──
    ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'",
             "todo", "benchmark"),
    ToolCall("todo-02", "todo", "Show me the current todo list.",
             "todo", ""),
    ToolCall("todo-03", "todo", "Mark the first todo item as completed.",
             "todo", "completed"),
    ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.",
             "todo", "Review"),
    ToolCall("todo-05", "todo", "Clear all completed todos.",
             "todo", "clear"),
    # The date is interpolated once, when this module is imported.
    ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(
             date=datetime.now().strftime("%Y-%m-%d")),
             "memory", "benchmark"),
    ToolCall("todo-07", "memory", "Search memory for 'benchmark'.",
             "memory", "benchmark"),
    ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.",
             "memory", "gemma"),
    ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.",
             "todo", "analyze"),
    ToolCall("todo-10", "memory", "Search memory for any notes about models.",
             "memory", "model"),
    # ── Skills (10 — replacing MCP tools which need servers) ─────────────
    ToolCall("skill-01", "skills", "List all available skills.",
             "skills_list", ""),
    ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.",
             "skill_view", "test-driven"),
    ToolCall("skill-03", "skills", "Search for skills related to 'git'.",
             "skills_list", "git"),
    ToolCall("skill-04", "skills", "View the 'code-review' skill.",
             "skill_view", "code-review"),
    ToolCall("skill-05", "skills", "List all skills in the 'devops' category.",
             "skills_list", "devops"),
    ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.",
             "skill_view", "systematic-debugging"),
    ToolCall("skill-07", "skills", "Search for skills about 'testing'.",
             "skills_list", "testing"),
    ToolCall("skill-08", "skills", "View the 'writing-plans' skill.",
             "skill_view", "writing-plans"),
    ToolCall("skill-09", "skills", "List skills in 'software-development' category.",
             "skills_list", "software-development"),
    ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.",
             "skill_view", "pr-review"),
    # ── Additional tests to reach 100 ────────────────────────────────────
    ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].",
             "write_file", "bench_sort"),
    ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.",
             "read_file", "bench_sort"),
    ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.",
             "search_files", "class"),
    ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.",
             "terminal", "os"),
    ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.",
             "terminal", "cpu"),
    ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].",
             "execute_code", "flatten"),
    ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.",
             "execute_code", "prime"),
    ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?",
             "delegate_task", "cwd"),
    ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.",
             "todo", "Finalize"),
    ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.",
             "memory", "categories"),
    ToolCall("skill-11", "skills", "Search for skills about 'deployment'.",
             "skills_list", "deployment"),
    ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.",
             "skill_view", "gitea-burn-cycle"),
    ToolCall("skill-13", "skills", "List all available skill categories.",
             "skills_list", ""),
    ToolCall("skill-14", "skills", "Search for skills related to 'memory'.",
             "skills_list", "memory"),
    ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.",
             "skill_view", "mimo-swarm"),
]
# fmt: on
# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------
@dataclass
class CallResult:
    """Outcome of running one test case against one model.

    The first six fields are required and identify the case; the rest default
    to the "nothing happened" state so a result can be created up front and
    filled in as the run progresses.
    """
    test_id: str  # ToolCall.id of the case that was run
    category: str  # ToolCall.category of the case
    model: str  # model spec the case was run against; keys the report's per-model stats
    prompt: str  # user message that was sent
    expected_tool: str  # tool the case expected the model to call
    success: bool  # True when the first tool call named expected_tool
    tool_called: Optional[str] = None  # name of the first tool call, if any
    tool_args_valid: bool = False  # expected substring found in the call's arguments
    execution_ok: bool = False  # a tool-role message with content was seen
    latency_s: float = 0.0  # wall time of the conversation, seconds
    error: str = ""  # "ExcType: message" when the run raised
    raw_response: str = ""  # start of the final text when no tool call was made
    # Declared as a real field (appended last so positional construction is
    # unaffected): the runner assigns it and the report reads it on every
    # result, including ones where no tool call was produced, and asdict()
    # only serialises declared fields.
    schema_ok: bool = False  # the model produced a tool call at all
@dataclass
class ModelStats:
    """Running totals for one model, accumulated while building the report.

    The derived properties always return a float and yield ``0.0`` (rather
    than dividing by zero) when no results have been recorded.
    """
    model: str
    total: int = 0  # number of results recorded for this model
    schema_ok: int = 0  # results whose schema_ok flag was set (a tool call was produced)
    exec_ok: int = 0  # results whose execution_ok flag was set
    latency_sum: float = 0.0  # sum of per-call latency_s, seconds
    failures: list = field(default_factory=list)  # results with success == False

    @property
    def schema_pct(self) -> float:
        """Percentage (0-100) of results that produced a tool call."""
        return (self.schema_ok / self.total * 100) if self.total else 0.0

    @property
    def exec_pct(self) -> float:
        """Percentage (0-100) of results flagged execution_ok."""
        return (self.exec_ok / self.total * 100) if self.total else 0.0

    @property
    def avg_latency(self) -> float:
        """Mean latency in seconds across all recorded results."""
        return (self.latency_sum / self.total) if self.total else 0.0
def setup_test_files():
    """Write the fixture file that the suite's prompts refer to.

    Overwrites /tmp/test_bench.txt with seven lines of sample text, three of
    which are ``import`` statements.
    """
    fixture_text = (
        "This is a benchmark test file.\n"
        "It contains sample data for tool-calling tests.\n"
        "Line three has some import statements.\n"
        "import os\nimport sys\nimport json\n"
        "End of test data.\n"
    )
    fixture = Path("/tmp/test_bench.txt")
    fixture.write_text(fixture_text)
def _first_tool_call(messages: list) -> tuple[Optional[str], str]:
    """Return ``(name, arguments_text)`` for the first tool call in *messages*.

    Only the first assistant message that carries ``tool_calls`` is inspected,
    and only its first entry. ``(None, "")`` means no such message exists.
    """
    for msg in messages:
        if msg.get("role") == "assistant" and msg.get("tool_calls"):
            first = next(iter(msg["tool_calls"]))
            fn = first.get("function", {})
            args = fn.get("arguments", "{}")
            if not isinstance(args, str):
                # In case the arguments arrive already decoded, serialise them
                # so the substring check below always runs against JSON text.
                args = json.dumps(args, default=str)
            return fn.get("name", ""), args
    return None, ""


def _has_tool_output(messages: list) -> bool:
    """Report whether any tool-role message carries non-empty content."""
    return any(msg.get("role") == "tool" and msg.get("content") for msg in messages)


def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
    """Run one test case through a fresh agent and score the first tool call.

    Args:
        tc: The test case to run. ``tc.timeout`` is not enforced here.
        model_spec: Passed verbatim to ``AIAgent`` as ``model`` and recorded
            as ``CallResult.model``.
        provider: Passed verbatim to ``AIAgent`` as ``provider``.

    Returns:
        A ``CallResult``. Any exception raised while building the agent or
        running the conversation is captured in ``result.error`` rather than
        propagated, so one failing call cannot abort the whole benchmark.
        An ImportError for ``run_agent`` itself does propagate.
    """
    from run_agent import AIAgent

    result = CallResult(
        test_id=tc.id,
        category=tc.category,
        model=model_spec,
        prompt=tc.prompt,
        expected_tool=tc.expected_tool,
        success=False,
    )
    # Set explicitly so the attribute exists on every result, including the
    # no-tool-call and exception paths that never reach the assignment below.
    result.schema_ok = False

    started: Optional[float] = None  # stays None if the agent cannot be built
    try:
        agent = AIAgent(
            model=model_spec,
            provider=provider,
            max_iterations=3,
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
            persist_session=False,
        )
        # perf_counter is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments during a long benchmark run.
        started = time.perf_counter()
        conv = agent.run_conversation(
            user_message=tc.prompt,
            system_message=(
                "You are a benchmark test runner. Execute the user's request by calling "
                "the appropriate tool. Return the tool result directly. Do not add commentary."
            ),
        )
        result.latency_s = round(time.perf_counter() - started, 2)

        messages = conv.get("messages", [])
        tool_called, tool_args_str = _first_tool_call(messages)

        if tool_called:
            result.tool_called = tool_called
            result.schema_ok = True
            # Success means the right tool was chosen; argument validity is
            # tracked separately and does not affect it.
            if tool_called == tc.expected_tool:
                result.success = True
            if tc.expected_params_check:
                result.tool_args_valid = tc.expected_params_check in tool_args_str
            else:
                result.tool_args_valid = True
            # Any non-empty tool output counts as "executed", even when that
            # output reports an error.
            # NOTE(review): this means execution_ok does not distinguish tool
            # errors from clean runs — confirm that is the intended metric.
            result.execution_ok = _has_tool_output(messages)
        else:
            # No tool call produced — keep the start of the text reply for debugging.
            final = conv.get("final_response", "")
            result.raw_response = final[:200] if final else ""
    except Exception as e:
        result.error = f"{type(e).__name__}: {str(e)[:200]}"
        result.latency_s = round(time.perf_counter() - started, 2) if started is not None else 0
    return result
def _schema_ok(result) -> bool:
    """Return a result's schema_ok flag, tolerating results that never had it set.

    Falls back to "a tool name was recorded", which is the condition under
    which the runner sets the flag.
    """
    return bool(getattr(result, "schema_ok", bool(result.tool_called)))


def _md_cell(text: str) -> str:
    """Make *text* safe for one Markdown table cell (no raw pipes or newlines)."""
    return text.replace("|", "\\|").replace("\r", " ").replace("\n", " ")


def _ratio_cell(ok: int, total: int) -> str:
    """Format ``ok/total (pct%)``, showing 0% when *total* is zero."""
    pct = (ok / total * 100) if total else 0
    return f"{ok}/{total} ({pct:.0f}%)"


def _table_row(label: str, cells: list) -> str:
    """Build one Markdown table row from a label and one cell per model."""
    return f"| {label} | " + "".join(f"{cell} | " for cell in cells)


def generate_report(results: list[CallResult], models: list[str], output_path: Path):
    """Generate the Markdown benchmark report, write it, and return its text.

    Args:
        results: Every ``CallResult`` from the run. Each ``result.model`` must
            be one of *models* (a ``KeyError`` is raised otherwise).
        models: Model specs in the column order to use for every table.
        output_path: File to write; missing parent directories are created.

    Returns:
        The full report as a single string.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    # Aggregate per model and bucket results by category -> model.
    stats: dict[str, ModelStats] = {m: ModelStats(model=m) for m in models}
    by_category: dict[str, dict[str, list[CallResult]]] = {}
    for r in results:
        s = stats[r.model]
        s.total += 1
        s.schema_ok += int(_schema_ok(r))
        s.exec_ok += int(r.execution_ok)
        s.latency_sum += r.latency_s
        if not r.success:
            s.failures.append(r)
        by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)

    table_header = [
        f"| Metric | {' | '.join(models)} |",
        f"|--------|{'|'.join('---------' for _ in models)}|",
    ]

    lines = [
        "# Tool-Calling Benchmark Report",
        "",
        f"Generated: {now}",
        f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
        f"Models tested: {', '.join(models)}",
        "",
        "## Summary",
        "",
        *table_header,
    ]
    per_model = [stats[m] for m in models]
    lines.append(_table_row(
        "Schema parse success", [_ratio_cell(s.schema_ok, s.total) for s in per_model]))
    lines.append(_table_row(
        "Tool execution success", [_ratio_cell(s.exec_ok, s.total) for s in per_model]))
    # failures holds exactly the results with success == False, so the number
    # of correct selections is the remainder.
    lines.append(_table_row(
        "Correct tool selected",
        [_ratio_cell(s.total - len(s.failures), s.total) for s in per_model]))
    lines.append(_table_row(
        "Avg latency (s)", [f"{s.avg_latency:.2f}" for s in per_model]))
    lines.append("")

    # Per-category breakdown
    lines.append("## Per-Category Breakdown")
    lines.append("")
    metrics = [
        ("Schema OK", _schema_ok),
        ("Exec OK", lambda r: r.execution_ok),
        ("Correct tool", lambda r: r.success),
    ]
    for cat in sorted(by_category):
        lines.append(f"### {cat.title()}")
        lines.append("")
        lines.extend(table_header)
        cat_data = by_category[cat]
        for metric_name, passed in metrics:
            cells = []
            for m in models:
                results_m = cat_data.get(m, [])
                ok = sum(1 for r in results_m if passed(r))
                cells.append(_ratio_cell(ok, len(results_m)))
            lines.append(_table_row(metric_name, cells))
        lines.append("")

    # Failure analysis
    lines.append("## Failure Analysis")
    lines.append("")
    any_failures = False
    for m in models:
        s = stats[m]
        if not s.failures:
            continue
        any_failures = True
        lines.append(f"### {m} — {len(s.failures)} failures")
        lines.append("")
        lines.append("| Test | Category | Expected | Got | Error |")
        lines.append("|------|----------|----------|-----|-------|")
        for r in s.failures:
            got = r.tool_called or "none"
            err = r.error or "wrong tool"
            # Exception text and tool names are untrusted for table layout:
            # escape them so a stray pipe or newline cannot break the row.
            lines.append(
                f"| {r.test_id} | {r.category} | {r.expected_tool} "
                f"| {_md_cell(got)} | {_md_cell(err[:60])} |"
            )
        lines.append("")
    if not any_failures:
        lines.append("No failures detected.")
        lines.append("")

    # Raw results JSON. schema_ok is merged in explicitly so it is present
    # even for results on which it exists only as a dynamic attribute.
    lines.append("## Raw Results")
    lines.append("")
    lines.append("```json")
    raw = [dict(asdict(r), schema_ok=_schema_ok(r)) for r in results]
    lines.append(json.dumps(raw, indent=2, default=str))
    lines.append("```")

    report = "\n".join(lines)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the report contains non-ASCII punctuation and may carry
    # arbitrary error text, which a locale-default encoding could reject.
    output_path.write_text(report, encoding="utf-8")
    return report
def main():
    """Command-line entry point: run the selected suite and write the report.

    Parses the CLI options, filters ``SUITE`` by ``--category`` then
    ``--limit``, runs every remaining case against every requested model, and
    writes the Markdown report. Exits with status 0 when every call selected
    the expected tool, 1 when any did not, and 2 (via argparse) when the
    filters leave nothing to run. ``--dry-run`` only lists the selected cases.
    """
    parser = argparse.ArgumentParser(description="Tool-calling benchmark")
    parser.add_argument("--models", nargs="+",
                        default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
                        help="Model specs to test (provider:model)")
    parser.add_argument("--limit", type=int, default=0,
                        help="Run only first N tests (0 = all)")
    parser.add_argument("--category", type=str, default="",
                        help="Run only tests in this category")
    parser.add_argument("--output", type=str, default="",
                        help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print test cases without running them")
    args = parser.parse_args()

    # Filter suite
    suite = SUITE[:]
    if args.category:
        suite = [tc for tc in suite if tc.category == args.category]
    if args.limit > 0:
        suite = suite[:args.limit]

    if args.dry_run:
        print(f"Would run {len(suite)} tests:")
        for tc in suite:
            print(f" [{tc.category:8s}] {tc.id}: {tc.expected_tool} <- {tc.prompt[:60]}")
        return

    if not suite:
        # Without this the per-model percentage below divides by zero.
        parser.error(f"no test cases match --category {args.category!r}")

    # Setup
    setup_test_files()
    date_str = datetime.now().strftime("%Y-%m-%d")
    output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"

    # Parse model specs into (provider, full_spec). The provider is the text
    # before the first ":" (or the whole spec when there is no colon). The
    # full spec, not the bare model name, is what the runner receives and
    # what keys the report.
    # NOTE(review): confirm the agent accepts the provider-prefixed spec as
    # its model identifier.
    model_specs = [(spec.split(":", 1)[0], spec) for spec in args.models]

    print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
    print(f"Output: {output_path}")
    print()

    all_results: list[CallResult] = []
    for provider, full_spec in model_specs:
        print(f"── {full_spec} {'─' * (50 - len(full_spec))}")
        model_results = []
        for i, tc in enumerate(suite, 1):
            sys.stdout.write(f"\r [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s} {tc.expected_tool:20s}")
            sys.stdout.flush()
            r = run_single_test(tc, full_spec, provider)
            model_results.append(r)
            status = "ok" if r.success else "FAIL"
            sys.stdout.write(f" {status} ({r.latency_s:.1f}s)")
        sys.stdout.write("\n")
        all_results.extend(model_results)

        # Quick stats
        ok = sum(1 for r in model_results if r.success)
        print(f" Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
        print()

    # Generate report
    model_names = [spec for _, spec in model_specs]
    generate_report(all_results, model_names, output_path)
    print(f"Report written to {output_path}")

    # Exit code: 0 if all pass, 1 if any failures
    total_fail = sum(1 for r in all_results if not r.success)
    sys.exit(1 if total_fail > 0 else 0)


if __name__ == "__main__":
    main()