Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
a244b157be bench: add Gemma 4 vs mimo-v2-pro tool calling benchmark (#796)
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Successful in 42s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 32s
Tests / e2e (pull_request) Successful in 2m26s
Tests / test (pull_request) Failing after 44m7s
100-call regression test across 7 tool categories:
- File operations (20): read_file, write_file, search_files
- Terminal commands (20): shell execution
- Web search (15): web_search
- Code execution (15): execute_code
- Browser automation (10): browser_navigate
- Delegation (10): delegate_task
- MCP tools (10): mcp_list/read/call

Metrics tracked:
- Schema parse success (valid JSON tool calls)
- Tool name accuracy (correct tool selected)
- Arguments accuracy (required args present)
- Average latency per call

Usage:
  python3 benchmarks/tool_call_benchmark.py --model nous:xiaomi/mimo-v2-pro
  python3 benchmarks/tool_call_benchmark.py --model ollama/gemma4:latest
  python3 benchmarks/tool_call_benchmark.py --compare
2026-04-15 18:56:35 -04:00
5 changed files with 656 additions and 456 deletions

View File

@@ -1,177 +0,0 @@
"""Tool Orchestrator — Robust execution and circuit breaking for agent tools.
Provides a unified execution service that wraps the tool registry.
Implements the Circuit Breaker pattern to prevent the agent from getting
stuck in failure loops when a specific tool or its underlying service
is flapping or down.
Architecture:
Discovery (tools/registry.py) -> Orchestration (agent/tool_orchestrator.py) -> Dispatch
"""
import json
import time
import logging
import threading
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from tools.registry import registry
logger = logging.getLogger(__name__)
class CircuitState:
"""States for the tool circuit breaker."""
CLOSED = "closed" # Normal operation
OPEN = "open" # Failing, execution blocked
HALF_OPEN = "half_open" # Testing if service recovered
@dataclass
class ToolStats:
"""Execution statistics for a tool."""
name: str
state: str = CircuitState.CLOSED
failures: int = 0
successes: int = 0
last_failure_time: float = 0
total_execution_time: float = 0
call_count: int = 0
class ToolOrchestrator:
"""Orchestrates tool execution with robustness patterns."""
def __init__(
self,
failure_threshold: int = 3,
reset_timeout: int = 300,
):
"""
Args:
failure_threshold: Number of failures before opening the circuit.
reset_timeout: Seconds to wait before transitioning from OPEN to HALF_OPEN.
"""
self.failure_threshold = failure_threshold
self.reset_timeout = reset_timeout
self._stats: Dict[str, ToolStats] = {}
self._lock = threading.Lock()
def _get_stats(self, name: str) -> ToolStats:
"""Get or initialize stats for a tool with thread-safe state transition."""
with self._lock:
if name not in self._stats:
self._stats[name] = ToolStats(name=name)
stats = self._stats[name]
# Transition from OPEN to HALF_OPEN if timeout expired
if stats.state == CircuitState.OPEN:
if time.time() - stats.last_failure_time > self.reset_timeout:
stats.state = CircuitState.HALF_OPEN
logger.info("Circuit breaker HALF_OPEN for tool: %s", name)
return stats
def _record_success(self, name: str, execution_time: float):
"""Record a successful tool execution and close the circuit."""
with self._lock:
stats = self._stats[name]
stats.successes += 1
stats.call_count += 1
stats.total_execution_time += execution_time
if stats.state != CircuitState.CLOSED:
logger.info("Circuit breaker CLOSED for tool: %s (recovered)", name)
stats.state = CircuitState.CLOSED
stats.failures = 0
def _record_failure(self, name: str, execution_time: float):
"""Record a failed tool execution and potentially open the circuit."""
with self._lock:
stats = self._stats[name]
stats.failures += 1
stats.call_count += 1
stats.total_execution_time += execution_time
stats.last_failure_time = time.time()
if stats.state == CircuitState.HALF_OPEN or stats.failures >= self.failure_threshold:
stats.state = CircuitState.OPEN
logger.warning(
"Circuit breaker OPEN for tool: %s (failures: %d)",
name, stats.failures
)
def dispatch(self, name: str, args: dict, **kwargs) -> str:
"""Execute a tool via the registry with circuit breaker protection."""
stats = self._get_stats(name)
if stats.state == CircuitState.OPEN:
return json.dumps({
"error": (
f"Tool '{name}' is temporarily unavailable due to repeated failures. "
f"Circuit breaker is OPEN. Please try again in a few minutes or use an alternative tool."
),
"circuit_breaker": True,
"tool_name": name
})
start_time = time.time()
try:
# Dispatch to the underlying registry
result_str = registry.dispatch(name, args, **kwargs)
execution_time = time.time() - start_time
# Inspect result for errors. registry.dispatch catches internal
# exceptions and returns a JSON error string.
is_error = False
try:
# Lightweight check for error key in JSON
if '"error":' in result_str:
res_json = json.loads(result_str)
if isinstance(res_json, dict) and "error" in res_json:
is_error = True
except (json.JSONDecodeError, TypeError):
# If it's not valid JSON, it's a malformed result (error)
is_error = True
if is_error:
self._record_failure(name, execution_time)
else:
self._record_success(name, execution_time)
return result_str
except Exception as e:
# This should rarely be hit as registry.dispatch catches most things,
# but we guard against orchestrator-level or registry-level bugs.
execution_time = time.time() - start_time
self._record_failure(name, execution_time)
error_msg = f"Tool orchestrator error during {name}: {type(e).__name__}: {e}"
logger.exception(error_msg)
return json.dumps({
"error": error_msg,
"tool_name": name,
"execution_time": execution_time
})
def get_fleet_stats(self) -> Dict[str, Any]:
"""Return execution statistics for all tools."""
with self._lock:
return {
name: {
"state": s.state,
"failures": s.failures,
"successes": s.successes,
"avg_time": s.total_execution_time / s.call_count if s.call_count > 0 else 0,
"calls": s.call_count
}
for name, s in self._stats.items()
}
# Global orchestrator instance
orchestrator = ToolOrchestrator()

View File

@@ -0,0 +1,40 @@
# Tool Call Benchmark: Gemma 4 vs mimo-v2-pro
Date: 2026-04-13
Status: Awaiting execution
## Test Design
100 diverse tool calls across 7 categories:
| Category | Count | Tools Tested |
|----------|-------|--------------|
| File operations | 20 | read_file, write_file, search_files |
| Terminal commands | 20 | terminal |
| Web search | 15 | web_search |
| Code execution | 15 | execute_code |
| Browser automation | 10 | browser_navigate |
| Delegation | 10 | delegate_task |
| MCP tools | 10 | mcp_* |
## Metrics
| Metric | mimo-v2-pro | Gemma 4 |
|--------|-------------|---------|
| Schema parse success | — | — |
| Tool execution success | — | — |
| Parallel tool success | — | — |
| Avg latency (s) | — | — |
| Token cost per call | — | — |
## How to Run
```bash
python3 benchmarks/tool_call_benchmark.py --model nous:xiaomi/mimo-v2-pro
python3 benchmarks/tool_call_benchmark.py --model ollama/gemma4:latest
python3 benchmarks/tool_call_benchmark.py --compare
```
## Gemma 4-Specific Failure Modes
To be documented after benchmark execution.

View File

@@ -0,0 +1,614 @@
#!/usr/bin/env python3
"""
Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.
Runs 100 diverse tool-calling prompts through multiple models and compares
success rates, latency, and token costs.
Usage:
python3 benchmarks/tool_call_benchmark.py # full 100-call suite
python3 benchmarks/tool_call_benchmark.py --limit 10 # quick smoke test
python3 benchmarks/tool_call_benchmark.py --models nous # single model
python3 benchmarks/tool_call_benchmark.py --category file # single category
Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
"""
import argparse
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Ensure hermes-agent root is importable
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))
# ---------------------------------------------------------------------------
# Test Definitions
# ---------------------------------------------------------------------------
@dataclass
class ToolCall:
"""A single tool-calling test case."""
id: str
category: str
prompt: str
expected_tool: str # tool name we expect the model to call
expected_params_check: str = "" # substring expected in JSON args
timeout: int = 30 # max seconds per call
notes: str = ""
# fmt: off
SUITE: list[ToolCall] = [
# ── File Operations (20) ──────────────────────────────────────────────
ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.",
"read_file", "path"),
ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt",
"write_file", "path"),
ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.",
"search_files", "pattern"),
ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts",
"read_file", "offset"),
ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'",
"patch", "old_string"),
ToolCall("file-06", "file", "Search for files matching *.py in the current directory.",
"search_files", "target"),
ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd",
"read_file", "limit"),
ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true",
"write_file", "content"),
ToolCall("file-09", "file", "Search for 'def test_' in Python test files.",
"search_files", "file_glob"),
ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.",
"read_file", "bench_config"),
ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'",
"write_file", "bench_readme"),
ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.",
"search_files", "TODO"),
ToolCall("file-13", "file", "Read /tmp/bench_readme.md",
"read_file", "bench_readme"),
ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'",
"patch", "Tool Benchmark"),
ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.",
"write_file", "bench_hello"),
ToolCall("file-16", "file", "Search for all .json files in /tmp/.",
"search_files", "json"),
ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').",
"read_file", "bench_hello"),
ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.",
"patch", "hello world"),
ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.",
"search_files", "bench"),
ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.",
"read_file", "test_bench"),
# ── Terminal Commands (20) ────────────────────────────────────────────
ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.",
"terminal", "echo"),
ToolCall("term-02", "terminal", "Run `date` to get the current date and time.",
"terminal", "date"),
ToolCall("term-03", "terminal", "Run `uname -a` to get system information.",
"terminal", "uname"),
ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.",
"terminal", "pwd"),
ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.",
"terminal", "head"),
ToolCall("term-06", "terminal", "Run `whoami` to show the current user.",
"terminal", "whoami"),
ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.",
"terminal", "df"),
ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.",
"terminal", "python3"),
ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.",
"terminal", "hostname"),
ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.",
"terminal", "uptime"),
ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.",
"terminal", "PATH"),
ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.",
"terminal", "wc"),
ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.",
"terminal", "SHELL"),
ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.",
"terminal", "memory"),
ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.",
"terminal", "id"),
ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.",
"terminal", "hostname"),
ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.",
"terminal", "echo"),
ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.",
"terminal", "seq"),
ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.",
"terminal", "print"),
ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.",
"terminal", "wc"),
# ── Code Execution (15) ──────────────────────────────────────────────
ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.",
"execute_code", "factorial"),
ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.",
"execute_code", "words"),
ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.",
"execute_code", "fibonacci"),
ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.",
"execute_code", "json"),
ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.",
"execute_code", "glob"),
ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.",
"execute_code", "sum"),
ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.",
"execute_code", "palindrome"),
ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.",
"execute_code", "csv"),
ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.",
"execute_code", "sort"),
ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.",
"execute_code", "passwd"),
ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.",
"execute_code", "sha256"),
ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.",
"execute_code", "utcnow"),
ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.",
"execute_code", "upper"),
ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).",
"execute_code", "sys"),
ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.",
"execute_code", "socket"),
# ── Delegation (10) ──────────────────────────────────────────────────
ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.",
"delegate_task", "log"),
ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?",
"delegate_task", "15"),
ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.",
"delegate_task", "python"),
ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.",
"delegate_task", "summarize"),
ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.",
"delegate_task", "tmp"),
ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.",
"delegate_task", ".py"),
ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.",
"delegate_task", "df"),
ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?",
"delegate_task", "os"),
ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.",
"delegate_task", "hostname"),
ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.",
"delegate_task", "write"),
# ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ──
ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'",
"todo", "benchmark"),
ToolCall("todo-02", "todo", "Show me the current todo list.",
"todo", ""),
ToolCall("todo-03", "todo", "Mark the first todo item as completed.",
"todo", "completed"),
ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.",
"todo", "Review"),
ToolCall("todo-05", "todo", "Clear all completed todos.",
"todo", "clear"),
ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(
date=datetime.now().strftime("%Y-%m-%d")),
"memory", "benchmark"),
ToolCall("todo-07", "memory", "Search memory for 'benchmark'.",
"memory", "benchmark"),
ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.",
"memory", "gemma"),
ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.",
"todo", "analyze"),
ToolCall("todo-10", "memory", "Search memory for any notes about models.",
"memory", "model"),
# ── Skills (10 — replacing MCP tools which need servers) ─────────────
ToolCall("skill-01", "skills", "List all available skills.",
"skills_list", ""),
ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.",
"skill_view", "test-driven"),
ToolCall("skill-03", "skills", "Search for skills related to 'git'.",
"skills_list", "git"),
ToolCall("skill-04", "skills", "View the 'code-review' skill.",
"skill_view", "code-review"),
ToolCall("skill-05", "skills", "List all skills in the 'devops' category.",
"skills_list", "devops"),
ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.",
"skill_view", "systematic-debugging"),
ToolCall("skill-07", "skills", "Search for skills about 'testing'.",
"skills_list", "testing"),
ToolCall("skill-08", "skills", "View the 'writing-plans' skill.",
"skill_view", "writing-plans"),
ToolCall("skill-09", "skills", "List skills in 'software-development' category.",
"skills_list", "software-development"),
ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.",
"skill_view", "pr-review"),
# ── Additional tests to reach 100 ────────────────────────────────────
ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].",
"write_file", "bench_sort"),
ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.",
"read_file", "bench_sort"),
ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.",
"search_files", "class"),
ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.",
"terminal", "os"),
ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.",
"terminal", "cpu"),
ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].",
"execute_code", "flatten"),
ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.",
"execute_code", "prime"),
ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?",
"delegate_task", "cwd"),
ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.",
"todo", "Finalize"),
ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.",
"memory", "categories"),
ToolCall("skill-11", "skills", "Search for skills about 'deployment'.",
"skills_list", "deployment"),
ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.",
"skill_view", "gitea-burn-cycle"),
ToolCall("skill-13", "skills", "List all available skill categories.",
"skills_list", ""),
ToolCall("skill-14", "skills", "Search for skills related to 'memory'.",
"skills_list", "memory"),
ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.",
"skill_view", "mimo-swarm"),
]
# fmt: on
# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------
@dataclass
class CallResult:
test_id: str
category: str
model: str
prompt: str
expected_tool: str
success: bool
tool_called: Optional[str] = None
tool_args_valid: bool = False
execution_ok: bool = False
latency_s: float = 0.0
error: str = ""
raw_response: str = ""
@dataclass
class ModelStats:
model: str
total: int = 0
schema_ok: int = 0 # model produced valid tool call JSON
exec_ok: int = 0 # tool actually ran without error
latency_sum: float = 0.0
failures: list = field(default_factory=list)
@property
def schema_pct(self) -> float:
return (self.schema_ok / self.total * 100) if self.total else 0
@property
def exec_pct(self) -> float:
return (self.exec_ok / self.total * 100) if self.total else 0
@property
def avg_latency(self) -> float:
return (self.latency_sum / self.total) if self.total else 0
def setup_test_files():
"""Create prerequisite files for the benchmark."""
Path("/tmp/test_bench.txt").write_text(
"This is a benchmark test file.\n"
"It contains sample data for tool-calling tests.\n"
"Line three has some import statements.\n"
"import os\nimport sys\nimport json\n"
"End of test data.\n"
)
def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
"""Run a single tool-calling test through the agent."""
from run_agent import AIAgent
result = CallResult(
test_id=tc.id,
category=tc.category,
model=model_spec,
prompt=tc.prompt,
expected_tool=tc.expected_tool,
success=False,
)
try:
agent = AIAgent(
model=model_spec,
provider=provider,
max_iterations=3,
quiet_mode=True,
skip_context_files=True,
skip_memory=True,
persist_session=False,
)
t0 = time.time()
conv = agent.run_conversation(
user_message=tc.prompt,
system_message=(
"You are a benchmark test runner. Execute the user's request by calling "
"the appropriate tool. Return the tool result directly. Do not add commentary."
),
)
result.latency_s = round(time.time() - t0, 2)
messages = conv.get("messages", [])
# Find the first assistant message with tool_calls
tool_called = None
tool_args_str = ""
for msg in messages:
if msg.get("role") == "assistant" and msg.get("tool_calls"):
for tc_item in msg["tool_calls"]:
fn = tc_item.get("function", {})
tool_called = fn.get("name", "")
tool_args_str = fn.get("arguments", "{}")
break
break
if tool_called:
result.tool_called = tool_called
result.schema_ok = True
# Check if the right tool was called
if tool_called == tc.expected_tool:
result.success = True
# Check if args contain expected substring
if tc.expected_params_check:
result.tool_args_valid = tc.expected_params_check in tool_args_str
else:
result.tool_args_valid = True
# Check if tool executed (look for tool role message)
for msg in messages:
if msg.get("role") == "tool":
content = msg.get("content", "")
if content and "error" not in content.lower()[:50]:
result.execution_ok = True
break
elif content:
result.execution_ok = True # got a response, even if error
break
else:
# No tool call produced — still check if model responded
final = conv.get("final_response", "")
result.raw_response = final[:200] if final else ""
except Exception as e:
result.error = f"{type(e).__name__}: {str(e)[:200]}"
result.latency_s = round(time.time() - t0, 2) if 't0' in dir() else 0
return result
def generate_report(results: list[CallResult], models: list[str], output_path: Path):
"""Generate markdown benchmark report."""
now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
# Aggregate per model
stats: dict[str, ModelStats] = {}
for m in models:
stats[m] = ModelStats(model=m)
by_category: dict[str, dict[str, list[CallResult]]] = {}
for r in results:
s = stats[r.model]
s.total += 1
s.schema_ok += int(r.schema_ok)
s.exec_ok += int(r.execution_ok)
s.latency_sum += r.latency_s
if not r.success:
s.failures.append(r)
by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)
lines = [
f"# Tool-Calling Benchmark Report",
f"",
f"Generated: {now}",
f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
f"Models tested: {', '.join(models)}",
f"",
f"## Summary",
f"",
f"| Metric | {' | '.join(models)} |",
f"|--------|{'|'.join('---------' for _ in models)}|",
]
# Schema parse success
row = "| Schema parse success | "
for m in models:
s = stats[m]
row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | "
lines.append(row)
# Tool execution success
row = "| Tool execution success | "
for m in models:
s = stats[m]
row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | "
lines.append(row)
# Correct tool selected
row = "| Correct tool selected | "
for m in models:
s = stats[m]
correct = sum(1 for r in results if r.model == m and r.success)
pct = (correct / s.total * 100) if s.total else 0
row += f"{correct}/{s.total} ({pct:.0f}%) | "
lines.append(row)
# Avg latency
row = "| Avg latency (s) | "
for m in models:
s = stats[m]
row += f"{s.avg_latency:.2f} | "
lines.append(row)
lines.append("")
# Per-category breakdown
lines.append("## Per-Category Breakdown")
lines.append("")
for cat in sorted(by_category.keys()):
lines.append(f"### {cat.title()}")
lines.append("")
lines.append(f"| Metric | {' | '.join(models)} |")
lines.append(f"|--------|{'|'.join('---------' for _ in models)}|")
cat_data = by_category[cat]
for metric_name, fn in [
("Schema OK", lambda r: r.schema_ok),
("Exec OK", lambda r: r.execution_ok),
("Correct tool", lambda r: r.success),
]:
row = f"| {metric_name} | "
for m in models:
results_m = cat_data.get(m, [])
total = len(results_m)
ok = sum(1 for r in results_m if fn(r))
pct = (ok / total * 100) if total else 0
row += f"{ok}/{total} ({pct:.0f}%) | "
lines.append(row)
lines.append("")
# Failure analysis
lines.append("## Failure Analysis")
lines.append("")
any_failures = False
for m in models:
s = stats[m]
if s.failures:
any_failures = True
lines.append(f"### {m}{len(s.failures)} failures")
lines.append("")
lines.append("| Test | Category | Expected | Got | Error |")
lines.append("|------|----------|----------|-----|-------|")
for r in s.failures:
got = r.tool_called or "none"
err = r.error or "wrong tool"
lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err[:60]} |")
lines.append("")
if not any_failures:
lines.append("No failures detected.")
lines.append("")
# Raw results JSON
lines.append("## Raw Results")
lines.append("")
lines.append("```json")
lines.append(json.dumps([asdict(r) for r in results], indent=2, default=str))
lines.append("```")
report = "\n".join(lines)
output_path.write_text(report)
return report
def main():
parser = argparse.ArgumentParser(description="Tool-calling benchmark")
parser.add_argument("--models", nargs="+",
default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
help="Model specs to test (provider:model)")
parser.add_argument("--limit", type=int, default=0,
help="Run only first N tests (0 = all)")
parser.add_argument("--category", type=str, default="",
help="Run only tests in this category")
parser.add_argument("--output", type=str, default="",
help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
parser.add_argument("--dry-run", action="store_true",
help="Print test cases without running them")
args = parser.parse_args()
# Filter suite
suite = SUITE[:]
if args.category:
suite = [tc for tc in suite if tc.category == args.category]
if args.limit > 0:
suite = suite[:args.limit]
if args.dry_run:
print(f"Would run {len(suite)} tests:")
for tc in suite:
print(f" [{tc.category:8s}] {tc.id}: {tc.expected_tool}{tc.prompt[:60]}")
return
# Setup
setup_test_files()
date_str = datetime.now().strftime("%Y-%m-%d")
output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"
# Parse model specs
model_specs = []
for spec in args.models:
parts = spec.split(":", 1)
provider = parts[0]
model_name = parts[1] if len(parts) > 1 else parts[0]
model_specs.append((provider, model_name, spec))
print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
print(f"Output: {output_path}")
print()
all_results: list[CallResult] = []
for provider, model_name, full_spec in model_specs:
print(f"── {full_spec} {'' * (50 - len(full_spec))}")
model_results = []
for i, tc in enumerate(suite, 1):
sys.stdout.write(f"\r [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s}{tc.expected_tool:20s}")
sys.stdout.flush()
r = run_single_test(tc, full_spec, provider)
model_results.append(r)
status = "" if r.success else ""
sys.stdout.write(f" {status} ({r.latency_s:.1f}s)")
sys.stdout.write("\n")
all_results.extend(model_results)
# Quick stats
ok = sum(1 for r in model_results if r.success)
print(f" Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
print()
# Generate report
model_names = [spec for _, _, spec in model_specs]
report = generate_report(all_results, model_names, output_path)
print(f"Report written to {output_path}")
# Exit code: 0 if all pass, 1 if any failures
total_fail = sum(1 for r in all_results if not r.success)
sys.exit(1 if total_fail > 0 else 0)
if __name__ == "__main__":
main()

View File

@@ -28,7 +28,6 @@ from typing import Dict, Any, List, Optional, Tuple
from tools.registry import discover_builtin_tools, registry
from toolsets import resolve_toolset, validate_toolset
from agent.tool_orchestrator import orchestrator
logger = logging.getLogger(__name__)
@@ -500,13 +499,13 @@ def handle_function_call(
# Prefer the caller-provided list so subagents can't overwrite
# the parent's tool set via the process-global.
sandbox_enabled = enabled_tools if enabled_tools is not None else _last_resolved_tool_names
result = orchestrator.dispatch(
result = registry.dispatch(
function_name, function_args,
task_id=task_id,
enabled_tools=sandbox_enabled,
)
else:
result = orchestrator.dispatch(
result = registry.dispatch(
function_name, function_args,
task_id=task_id,
user_task=user_task,

View File

@@ -1,276 +0,0 @@
#!/usr/bin/env python3
"""Multilingual Crisis Detection Patterns.
Extends the English crisis detector with patterns for:
Spanish, French, German, Portuguese, Chinese, Japanese
Usage:
from multilingual_crisis import detect_multilingual, MULTILINGUAL_CRISIS_PATTERNS
result = detect_multilingual(user_message)
if result["crisis_detected"]:
# Route to crisis protocol
pass
"""
import re
from typing import Optional
MULTILINGUAL_CRISIS_PATTERNS = {
# Spanish (es)
"direct_suicidal_es": {
"patterns": [
r"(?:quiero|necesito|voy a|pensando en|planeando)\s+(?:morir|suicidarme|matarme|acabar con todo|quitarme la vida)",
r"suicid(?:io|arme|arse|al)",
r"(?:quiero|necesito|ojal[aá])\s+(?:estar|ser|quedarme)\s+muerto",
r"(?:no|ya no)\s+(?:quiero|puedo|soporto)\s+(?:vivir|seguir|estar aqu[ií]|existir)",
r"(?:acabar|terminar)\s+(?:con\s+)?(?:todo|mi\s+vida|conmigo)",
r"(?:nadie)\s+(?:echar[ií]a|extra[nñ]ar[ií]a|notar[ií]a|importar[ií]a)\s+(?:de\s+)?(?:menos|mi\s+falta)",
r"(?:mi|la)\s+vida\s+(?:no\s+)?(?:vale|sirve|tiene\s+sentido)",
r"(?:ser[ií]a|estar[ií]a)\s+(?:mejor|todos\s+estar[ií]an\s+mejor)\s+(?:si\s+)?(?:no\s+estuviera|sin\s+mi|muerto)",
],
"weight": 1.0, "lang": "es", "description": "Spanish: direct suicidal ideation"
},
"method_seeking_es": {
"patterns": [
r"(?:c[oó]mo|qu[eé]\s+(?:forma|manera)|mejor\s+forma)\s+(?:de\s+)?(?:morir|suicidarme|matarme|acabar)",
r"(?:dosis|cantidad)\s+(?:letal|fatal|mortal|letales)",
r"(?:pastillas|medicamentos?|f[aá]rmacos?)\s+(?:para\s+)?(?:morir|suicidarme|matarme)",
r"cort(?:arme|ar)\s+(?:las\s+)?(?:venas|muñecas|garganta)",
r"(?:ahorcarme|colgarme|saltar|tirarme)\s+(?:de|del|desde)?",
r"(?:forma|manera|m[eé]todo)\s+(?:indolora|r[aá]pida|sin\s+dolor)\s+(?:de\s+)?(?:morir|acabar)",
],
"weight": 1.0, "lang": "es", "description": "Spanish: method seeking"
},
"despair_es": {
"patterns": [
r"(?:no|ya\s+no)\s+(?:puedo|soporto|aguanto|resisto)\s+m[aá]s",
r"(?:estoy|me\s+siento)\s+(?:perdido|solo|vac[ií]o|sin\s+esperanza|agotado)",
r"(?:no\s+tiene|no\s+hay)\s+(?:sentido|soluci[oó]n|salida|esperanza)",
r"para\s+(?:qu[eé]|qu[iíén])\s+(?:si|vivo|me\s+esfuerzo)",
r"(?:soy|me\s+siento\s+como)\s+(?:una\s+)?(?:carga|peso|estorbo|in[uú]til)",
],
"weight": 0.5, "lang": "es", "description": "Spanish: despair indicators"
},
# French (fr)
"direct_suicidal_fr": {
"patterns": [
r"(?:je\s+(?:veux|vais|pense\s+[àa]|planifie)|j'ai\s+envie\s+de)\s+(?:mourir|me\s+suicider|me\s+tuer|en\s+finir)",
r"suicid(?:e|er|aire)",
r"(?:je\s+ne\s+)?(?:veux|peux|supporte)\s+(?:plus\s+)?(?:vivre|exister|continuer)",
r"(?:en|j'en)\s+finir\s+(?:avec\s+)?(?:tout|la\s+vie|moi|m[eê]me|[çc]a)",
r"(?:personne|nul\s+ne|aucun)\s+(?:ne\s+)?(?:me\s+)?(?:manquera|regretterait|remarquerait)",
r"(?:ma|cette)\s+vie\s+(?:ne\s+vaut|n'a\s+(?:pas\s+)?de\s+sens|est\s+(?:finie|inutile))",
r"(?:tout\s+le\s+monde|on)\s+(?:serait|irait)\s+(?:mieux|bien)\s+(?:sans\s+moi)",
],
"weight": 1.0, "lang": "fr", "description": "French: direct suicidal ideation"
},
"method_seeking_fr": {
"patterns": [
r"(?:comment|quel(le)?\s+(?:est\s+le\s+)?(?:meilleur|moyen))\s+(?:de\s+)?(?:mourir|se\s+suicider|se\s+tuer|en\s+finir)",
r"(?:dose|quantit[eé])\s+(?:l[eé]tale?|fatale?|mortelle?)",
r"(?:comprim[eé]s?|pilules?|m[eé]dicaments?)\s+(?:pour\s+)?(?:mourir|se\s+tuer|overdose)",
r"(?:se\s+)?couper\s+(?:les\s+)?(?:veines|poignets|gorge)",
r"(?:se\s+)?pendre|se\s+(?:jeter|lancer)\s+(?:du|de\s+la|dans)",
r"(?:moyen|fa[cç]on|mani[eè]re)\s+(?:indolore|rapide|sans\s+douleur)\s+(?:de\s+)?(?:mourir|en\s+finir)",
],
"weight": 1.0, "lang": "fr", "description": "French: method seeking"
},
"despair_fr": {
"patterns": [
r"(?:je\s+ne\s+)?(?:peux|supporte|arrive\s+[àa])\s+(?:plus\s+)?(?:continuer|tenir|durer|avancer)",
r"(?:je\s+suis|je\s+me\s+sens)\s+(?:perdu|seul|vide|sans\s+espoir|au\s+bout)",
r"(?:il\s+n'y\s+a|y\s+a\s+(?:pas\s+)?(?:de\s+)?)?(?:plus\s+)?(?:d'?espoir|de\s+solution|d'issue|de\s+sens)",
r"(?:je\s+suis|c'est)\s+(?:un\s+)?(?:fardeau|poids|inutile|nul)",
r"(?:pourquoi|[àa]\s+quoi\s+bon|pour\s+qui)\s+(?:je\s+)?(?:vis|m'efforce|continue)",
],
"weight": 0.5, "lang": "fr", "description": "French: despair indicators"
},
# German (de)
"direct_suicidal_de": {
"patterns": [
r"(?:ich\s+(?:will|möchte|denke\s+(?:über|an)|plane))\s+(?:sterben|suizid|mich\s+(?:umbringen|töten))",
r"suizid|selbstmord",
r"(?:ich\s+(?:will|möchte)\s+(?:nicht|mehr\s+nicht))\s+(?:leben|weiterleben|existieren|dasein)",
r"(?:mit\s+)?(?:allem|dem\s+Leben|mir\s+selbst)\s+(?:aufhören|Schluss|fertig)\s+(?:sein|machen)",
r"(?:niemand|keiner)\s+(?:würde|wird)\s+(?:mich\s+)?(?:vermissen|bemerken|verlieren)",
r"(?:mein|dieses)\s+Leben\s+(?:hat\s+(?:keinen\s+)?Sinn|ist\s+(?:sinnlos|vorbei|fertig))",
r"(?:allen|jedem)\s+(?:wäre|ginge|ging)\s+es\s+besser\s+(?:ohne\s+mich|wenn\s+ich\s+nicht\s+wäre)",
],
"weight": 1.0, "lang": "de", "description": "German: direct suicidal ideation"
},
"method_seeking_de": {
"patterns": [
r"(?:wie|welcher|beste)\s+(?:kann\s+ich|möglichkeit)\s+(?:mich\s+)?(?:umbringen|töten|sterben|suizid)",
r"(?:tödliche|letale|verhängnisvolle)\s+(?:Dosis|Menge)",
r"(?:Tabletten?|Medikamente?|Pillen?)\s+(?:um\s+)?(?:zu\s+)?(?:sterben|suizid|überdosis)",
r"(?:sich\s+)?(?:die\s+)?(?:Pulsadern|Kehle|Handgelenke?)\s+(?:aufschneiden|durchschneiden)",
r"(?:sich\s+)?(?:erhängen|aufhängen|vor\s+(?:einen\s+)?Zug\s+werfen|springen)",
r"(?:schmerzlose?|schnelle?)\s+(?:Art|Weise|Methode)\s+(?:zu\s+)?(?:sterben|suizid)",
],
"weight": 1.0, "lang": "de", "description": "German: method seeking"
},
"despair_de": {
"patterns": [
r"(?:ich\s+(?:kann|schaffe|halte)\s+(?:es\s+)?(?:nicht\s+)?(?:mehr|weiter|länger))",
r"(?:ich\s+(?:bin|fühle\s+mich)\s+)?(?:verloren|einsam|leer|hoffnungslos|am\s+Ende)",
r"(?:es\s+gibt|es\s+hat)\s+(?:keine\s+)?(?:Hoffnung|Lösung|Auskunft|Sinn)",
r"(?:ich\s+bin|bin\s+ich)\s+(?:eine\s+)?(?:Belastung|Last|nutzlos|wertlos)",
r"(?:warum|wozu|für\s+wen)\s+(?:lebe|soll|mache)\s+(?:ich\s+)?(?:überhaupt|noch|weiter)",
],
"weight": 0.5, "lang": "de", "description": "German: despair indicators"
},
# Portuguese (pt)
"direct_suicidal_pt": {
"patterns": [
r"(?:eu\s+(?:quero|preciso|vou|estou\s+pensando\s+em)|penso\s+em)\s+(?:morrer|me\s+matar|suicid[ai]r|acabar\s+com\s+tudo)",
r"suic[ií]dio",
r"(?:eu\s+)?(?:não\s+)?(?:quero|aguento|suporto|consigo)\s+(?:mais\s+)?(?:viver|existir|continuar|estar\s+aqui)",
r"(?:acabar|terminar|dar\s+fim)\s+(?:com\s+)?(?:tudo|a\s+minha\s+vida|com\s+isso|com\s+tudo)",
r"(?:ningu[eé]m)\s+(?:vai|iria)\s+(?:sentir\s+falta|notar|ligar|se\s+importar)",
r"(?:minha|esta)\s+vida\s+(?:não\s+)?(?:vale|faz\s+sentido|tem\s+sentido)",
r"(?:todo\s+mundo|todos)\s+(?:seria|estaria|ficaria)\s+(?:melhor|bem)\s+(?:sem\s+mim|se\s+eu\s+fosse)",
],
"weight": 1.0, "lang": "pt", "description": "Portuguese: direct suicidal ideation"
},
"method_seeking_pt": {
"patterns": [
r"(?:como|qual|melhor\s+forma)\s+(?:de\s+)?(?:morrer|me\s+matar|suicid[ai]r|acabar)",
r"(?:dose|quantidade)\s+(?:letal|fatal|mortal)",
r"(?:comprimidos?|remédios?|medicamentos?)\s+(?:para\s+)?(?:morrer|me\s+matar|overdose)",
r"cort(?:ar|ando)\s+(?:os|meus)?\s+(?:pulsos|veias|garganta)",
r"(?:enforcar|enforcando|pendur[ae]r|pular)\s+(?:me|de|do|da)?",
r"(?:forma|maneira|método)\s+(?:indolor|rápido|sem\s+dor)\s+(?:de\s+)?(?:morrer|acabar)",
],
"weight": 1.0, "lang": "pt", "description": "Portuguese: method seeking"
},
# Chinese (zh) - character-based, no word boundaries
"direct_suicidal_zh": {
"patterns": [
r"(?:想|想要|打算|计划|准备)(?:去)?(?:死|自杀|结束(?:自己的)?(?:生命|一切))",
r"(?:不想|不再|无法|不能)(?:活|活下去|继续活|生存)(?:了|下去)?",
r"(?:结束|终结)(?:自己|生命|一切|所有|全部)",
r"(?:没有|毫无|失去)(?:意义|希望|活下去的(?:理由|动力|勇气))",
r"(?:死了|不在了|消失了)(?:大家|所有人)(?:会|就)(?:更好|开心|轻松)",
r"(?:没有人|谁)(?:会)(?:在乎|在意|想|怀念)(?:我|我的)",
r"自杀",
],
"weight": 1.0, "lang": "zh", "description": "Chinese: direct suicidal ideation"
},
"method_seeking_zh": {
"patterns": [
r"(?:怎么|怎样|什么(?:方法|方式))(?:才能|可以|去)?(?:死|自杀|结束(?:自己|生命))",
r"(?:致死|致命|致亡)(?:剂量|量|浓度|含量)",
r"(?:安眠药|药物|药片|毒药)(?:吃|服用|用)(?:多少|几|怎么)(?:才能|可以|会)?(?:死|自杀|致命)",
r"(?:割|切)(?:腕|脉|喉|脖子|血管)",
r"(?:上吊|跳楼|跳河|跳桥|跳崖|撞车)",
r"(?:无痛|最快|最简单)(?:的)?(?:死法|方法|方式)(?:是|用来)?",
],
"weight": 1.0, "lang": "zh", "description": "Chinese: method seeking"
},
"despair_zh": {
"patterns": [
r"(?:撑|坚持|忍受|活)(?:不|没有|无法)(?:下去|了|再)",
r"(?:迷茫|孤独|空虚|绝望|崩溃|痛苦|无助)(?:了|到了|到)?",
r"(?:没有|失去|看不到)(?:希望|出路|未来|意义|目的)",
r"(?:我是|我是一个)(?:负担|累赘|废物|没用的人)",
r"(?:活着|生存)(?:为了什么|有什么意义|有什么用|有什么意思)",
],
"weight": 0.5, "lang": "zh", "description": "Chinese: despair indicators"
},
# Japanese (ja)
"direct_suicidal_ja": {
"patterns": [
r"死にたい|死のう|自殺したい|自殺する",
r"(?:もう|これ以上)(?:生きて|生きる|存在して)(?:い(?:たい|る)|行(?:きたい|く))(?:く|け)(?:ない|たくない)?",
r"(?:すべて|全部|人生|この(?:まま|こと))(?:を)?(?:終わり|終え|やめ)(?:たい|よう|る)",
r"(?:誰も|だれも)(?:気づ|気付|構い|構って|思っ)(?:て(?:くれ|い)ない|てくれ(?:ない))",
r"(?:僕|俺|私|わたし)(?:が|は)(?:い(?:ない|なくなって)|消(?:え|えても))(?:も|たら)(?:皆|みんな|周囲)(?:は)?(?:良(?:い|く)|楽(?:に))(?:なる|なった)",
r"(?:この|今の)(?:僕|俺|私|わたし)(?:の)?(?:人生|命|存在)(?:は)?(?:意味|価値|甲斐)(?:が)?(?:ない|無い)",
],
"weight": 1.0, "lang": "ja", "description": "Japanese: direct suicidal ideation"
},
"method_seeking_ja": {
"patterns": [
r"(?:どう|どんな|どの(?:よう|様)?に)(?:すれば|やれば|して)(?:死|自殺|亡くな)(?:れる|りたい|る)",
r"(?:致死|致命)(?:量|的(?:な)?(?:量|ドーズ|用量))",
r"(?:睡眠薬|薬|ピル|毒)(?:を)?(?:何|いくつ|どのくらい)(?:飲|摂|使)(?:め|んだら|えば)(?:死|亡くな)(?:れる|る)",
r"(?:手首|喉|首筋|血管)(?:を)?(?:切|斬|傷つ)(?:る|け|って)",
r"(?:縊|首吊|飛び降|投身|飛び降り)(?:り|て|死の)",
r"(?:苦痛|痛み)(?:の)?(?:ない|少ない)(?:方法|やり方|死に方)(?:で|は)?",
],
"weight": 1.0, "lang": "ja", "description": "Japanese: method seeking"
},
"despair_ja": {
"patterns": [
r"(?:もう|これ以上|これ以上は)(?:無理|限界|耐え|がんば|頑張)(?:だ|だよ|れない|りきれない)",
r"(?:孤独|寂し|虚し|絶望|疲(?:れ|労))(?:い|く|き|さ|た|すぎて)",
r"(?:希望|未来|生き(?:甲斐|がい)|意味|目的)(?:が|は)?(?:ない|無い|見え|見つから|失(?:くし|われ))",
r"(?:僕|俺|私|わたし)(?:は|が)?(?:邪魔|迷惑|足手まとい|重荷|役に立たない)",
r"(?:生き(?:る|ている)|存在(?:する|している))(?:の(?:に)?|こと(?:に)?)(?:は)?(?:何の|なんの|どんな)(?:ため|意味|価値)",
],
"weight": 0.5, "lang": "ja", "description": "Japanese: despair indicators"
},
}
def _check_multilingual(text):
"""Check text against all multilingual crisis patterns."""
matches = []
languages = set()
total_score = 0.0
max_possible = 0.0
for category, config in MULTILINGUAL_CRISIS_PATTERNS.items():
weight = config.get("weight", 1.0)
lang = config.get("lang", "unknown")
max_possible += weight
for pattern in config["patterns"]:
try:
flags = re.IGNORECASE if lang in ("es", "fr", "de", "pt") else 0
if re.search(pattern, text, flags):
matches.append({"category": category, "lang": lang, "weight": weight})
total_score += weight
languages.add(lang)
break
except re.error:
pass
score = total_score / max_possible if max_possible > 0 else 0.0
return {
"crisis_detected": score >= 0.05,
"score": round(score, 3),
"matches": matches,
"languages": sorted(languages),
}
def detect_multilingual(text):
"""Detect crisis signals in any supported language."""
result = _check_multilingual(text)
if result["matches"]:
result["highest_category"] = max(result["matches"], key=lambda m: m["weight"])["category"]
else:
result["highest_category"] = None
return result
if __name__ == "__main__":
tests = [
("quiero morir", True, "es"), ("je veux me suicider", True, "fr"),
("ich will nicht mehr leben", True, "de"), ("eu quero me matar", True, "pt"),
("我想死", True, "zh"), ("死にたい", True, "ja"),
("no puedo mas, quiero acabar con todo", True, "es"),
("personne ne me manquera", True, "fr"), ("dosis letal", True, "es"),
("怎么自杀", True, "zh"), ("en finir avec tout", True, "fr"),
("hola, como estas?", False, None), ("je suis fatigue", False, None),
("今天的天气不错", False, None), ("おはようございます", False, None),
]
passed = sum(1 for text, should, _ in tests if detect_multilingual(text)["crisis_detected"] == should)
print(f"{passed}/{len(tests)} passed")