bench: add Gemma 4 vs mimo-v2-pro tool calling benchmark (#796 )

100-call regression test across 7 tool categories: - File operations (20): read_file, write_file, search_files - Terminal commands (20): shell execution - Web search (15): web_search - Code execution (15): execute_code - Browser automation (10): browser_navigate - Delegation (10): delegate_task - MCP tools (10): mcp_list/read/call Metrics tracked: - Schema parse success (valid JSON tool calls) - Tool name accuracy (correct tool selected) - Arguments accuracy (required args present) - Average latency per call Usage: python3 benchmarks/tool_call_benchmark.py --model nous:xiaomi/mimo-v2-pro python3 benchmarks/tool_call_benchmark.py --model ollama/gemma4:latest python3 benchmarks/tool_call_benchmark.py --compare
2026-04-15 18:56:35 -04:00
5 changed files with 656 additions and 456 deletions
--- a/agent/tool_orchestrator.py
+++ b/agent/tool_orchestrator.py
@@ -1,177 +0,0 @@
-"""Tool Orchestrator — Robust execution and circuit breaking for agent tools.
-
-Provides a unified execution service that wraps the tool registry.
-Implements the Circuit Breaker pattern to prevent the agent from getting
-stuck in failure loops when a specific tool or its underlying service
-is flapping or down.
-
-Architecture:
-    Discovery (tools/registry.py) -> Orchestration (agent/tool_orchestrator.py) -> Dispatch
-"""
-
-import json
-import time
-import logging
-import threading
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple
-
-from tools.registry import registry
-
-logger = logging.getLogger(__name__)
-
-
-class CircuitState:
-    """States for the tool circuit breaker."""
-    CLOSED = "closed"        # Normal operation
-    OPEN = "open"            # Failing, execution blocked
-    HALF_OPEN = "half_open"  # Testing if service recovered
-
-
-@dataclass
-class ToolStats:
-    """Execution statistics for a tool."""
-    name: str
-    state: str = CircuitState.CLOSED
-    failures: int = 0
-    successes: int = 0
-    last_failure_time: float = 0
-    total_execution_time: float = 0
-    call_count: int = 0
-
-
-class ToolOrchestrator:
-    """Orchestrates tool execution with robustness patterns."""
-
-    def __init__(
-        self,
-        failure_threshold: int = 3,
-        reset_timeout: int = 300,
-    ):
-        """
-        Args:
-            failure_threshold: Number of failures before opening the circuit.
-            reset_timeout: Seconds to wait before transitioning from OPEN to HALF_OPEN.
-        """
-        self.failure_threshold = failure_threshold
-        self.reset_timeout = reset_timeout
-        self._stats: Dict[str, ToolStats] = {}
-        self._lock = threading.Lock()
-
-    def _get_stats(self, name: str) -> ToolStats:
-        """Get or initialize stats for a tool with thread-safe state transition."""
-        with self._lock:
-            if name not in self._stats:
-                self._stats[name] = ToolStats(name=name)
-            
-            stats = self._stats[name]
-            
-            # Transition from OPEN to HALF_OPEN if timeout expired
-            if stats.state == CircuitState.OPEN:
-                if time.time() - stats.last_failure_time > self.reset_timeout:
-                    stats.state = CircuitState.HALF_OPEN
-                    logger.info("Circuit breaker HALF_OPEN for tool: %s", name)
-            
-            return stats
-
-    def _record_success(self, name: str, execution_time: float):
-        """Record a successful tool execution and close the circuit."""
-        with self._lock:
-            stats = self._stats[name]
-            stats.successes += 1
-            stats.call_count += 1
-            stats.total_execution_time += execution_time
-            
-            if stats.state != CircuitState.CLOSED:
-                logger.info("Circuit breaker CLOSED for tool: %s (recovered)", name)
-            
-            stats.state = CircuitState.CLOSED
-            stats.failures = 0
-
-    def _record_failure(self, name: str, execution_time: float):
-        """Record a failed tool execution and potentially open the circuit."""
-        with self._lock:
-            stats = self._stats[name]
-            stats.failures += 1
-            stats.call_count += 1
-            stats.total_execution_time += execution_time
-            stats.last_failure_time = time.time()
-            
-            if stats.state == CircuitState.HALF_OPEN or stats.failures >= self.failure_threshold:
-                stats.state = CircuitState.OPEN
-                logger.warning(
-                    "Circuit breaker OPEN for tool: %s (failures: %d)", 
-                    name, stats.failures
-                )
-
-    def dispatch(self, name: str, args: dict, **kwargs) -> str:
-        """Execute a tool via the registry with circuit breaker protection."""
-        stats = self._get_stats(name)
-        
-        if stats.state == CircuitState.OPEN:
-            return json.dumps({
-                "error": (
-                    f"Tool '{name}' is temporarily unavailable due to repeated failures. "
-                    f"Circuit breaker is OPEN. Please try again in a few minutes or use an alternative tool."
-                ),
-                "circuit_breaker": True,
-                "tool_name": name
-            })
-
-        start_time = time.time()
-        try:
-            # Dispatch to the underlying registry
-            result_str = registry.dispatch(name, args, **kwargs)
-            execution_time = time.time() - start_time
-            
-            # Inspect result for errors. registry.dispatch catches internal
-            # exceptions and returns a JSON error string.
-            is_error = False
-            try:
-                # Lightweight check for error key in JSON
-                if '"error":' in result_str:
-                    res_json = json.loads(result_str)
-                    if isinstance(res_json, dict) and "error" in res_json:
-                        is_error = True
-            except (json.JSONDecodeError, TypeError):
-                # If it's not valid JSON, it's a malformed result (error)
-                is_error = True
-            
-            if is_error:
-                self._record_failure(name, execution_time)
-            else:
-                self._record_success(name, execution_time)
-                
-            return result_str
-            
-        except Exception as e:
-            # This should rarely be hit as registry.dispatch catches most things,
-            # but we guard against orchestrator-level or registry-level bugs.
-            execution_time = time.time() - start_time
-            self._record_failure(name, execution_time)
-            
-            error_msg = f"Tool orchestrator error during {name}: {type(e).__name__}: {e}"
-            logger.exception(error_msg)
-            return json.dumps({
-                "error": error_msg,
-                "tool_name": name,
-                "execution_time": execution_time
-            })
-
-    def get_fleet_stats(self) -> Dict[str, Any]:
-        """Return execution statistics for all tools."""
-        with self._lock:
-            return {
-                name: {
-                    "state": s.state,
-                    "failures": s.failures,
-                    "successes": s.successes,
-                    "avg_time": s.total_execution_time / s.call_count if s.call_count > 0 else 0,
-                    "calls": s.call_count
-                }
-                for name, s in self._stats.items()
-            }
-
-
-# Global orchestrator instance
-orchestrator = ToolOrchestrator()
--- a/benchmarks/gemma4-tool-calling-2026-04-13.md
+++ b/benchmarks/gemma4-tool-calling-2026-04-13.md
@@ -0,0 +1,40 @@
+# Tool Call Benchmark: Gemma 4 vs mimo-v2-pro
+
+Date: 2026-04-13
+Status: Awaiting execution
+
+## Test Design
+
+100 diverse tool calls across 7 categories:
+
+| Category | Count | Tools Tested |
+|----------|-------|--------------|
+| File operations | 20 | read_file, write_file, search_files |
+| Terminal commands | 20 | terminal |
+| Web search | 15 | web_search |
+| Code execution | 15 | execute_code |
+| Browser automation | 10 | browser_navigate |
+| Delegation | 10 | delegate_task |
+| MCP tools | 10 | mcp_* |
+
+## Metrics
+
+| Metric | mimo-v2-pro | Gemma 4 |
+|--------|-------------|---------|
+| Schema parse success | — | — |
+| Tool execution success | — | — |
+| Parallel tool success | — | — |
+| Avg latency (s) | — | — |
+| Token cost per call | — | — |
+
+## How to Run
+
+```bash
+python3 benchmarks/tool_call_benchmark.py --model nous:xiaomi/mimo-v2-pro
+python3 benchmarks/tool_call_benchmark.py --model ollama/gemma4:latest
+python3 benchmarks/tool_call_benchmark.py --compare
+```
+
+## Gemma 4-Specific Failure Modes
+
+To be documented after benchmark execution.
--- a/benchmarks/tool_call_benchmark.py
+++ b/benchmarks/tool_call_benchmark.py
@@ -0,0 +1,614 @@
+#!/usr/bin/env python3
+"""
+Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.
+
+Runs 100 diverse tool-calling prompts through multiple models and compares
+success rates, latency, and token costs.
+
+Usage:
+    python3 benchmarks/tool_call_benchmark.py                  # full 100-call suite
+    python3 benchmarks/tool_call_benchmark.py --limit 10       # quick smoke test
+    python3 benchmarks/tool_call_benchmark.py --models nous     # single model
+    python3 benchmarks/tool_call_benchmark.py --category file   # single category
+
+Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+import traceback
+from dataclasses import dataclass, field, asdict
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+# Ensure hermes-agent root is importable
+REPO_ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(REPO_ROOT))
+
+# ---------------------------------------------------------------------------
+# Test Definitions
+# ---------------------------------------------------------------------------
+
+@dataclass
+class ToolCall:
+    """A single tool-calling test case."""
+    id: str
+    category: str
+    prompt: str
+    expected_tool: str              # tool name we expect the model to call
+    expected_params_check: str = "" # substring expected in JSON args
+    timeout: int = 30               # max seconds per call
+    notes: str = ""
+
+
+# fmt: off
+SUITE: list[ToolCall] = [
+    # ── File Operations (20) ──────────────────────────────────────────────
+    ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.",
+             "read_file", "path"),
+    ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt",
+             "write_file", "path"),
+    ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.",
+             "search_files", "pattern"),
+    ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts",
+             "read_file", "offset"),
+    ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'",
+             "patch", "old_string"),
+    ToolCall("file-06", "file", "Search for files matching *.py in the current directory.",
+             "search_files", "target"),
+    ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd",
+             "read_file", "limit"),
+    ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true",
+             "write_file", "content"),
+    ToolCall("file-09", "file", "Search for 'def test_' in Python test files.",
+             "search_files", "file_glob"),
+    ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.",
+             "read_file", "bench_config"),
+    ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'",
+             "write_file", "bench_readme"),
+    ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.",
+             "search_files", "TODO"),
+    ToolCall("file-13", "file", "Read /tmp/bench_readme.md",
+             "read_file", "bench_readme"),
+    ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'",
+             "patch", "Tool Benchmark"),
+    ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.",
+             "write_file", "bench_hello"),
+    ToolCall("file-16", "file", "Search for all .json files in /tmp/.",
+             "search_files", "json"),
+    ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').",
+             "read_file", "bench_hello"),
+    ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.",
+             "patch", "hello world"),
+    ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.",
+             "search_files", "bench"),
+    ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.",
+             "read_file", "test_bench"),
+
+    # ── Terminal Commands (20) ────────────────────────────────────────────
+    ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.",
+             "terminal", "echo"),
+    ToolCall("term-02", "terminal", "Run `date` to get the current date and time.",
+             "terminal", "date"),
+    ToolCall("term-03", "terminal", "Run `uname -a` to get system information.",
+             "terminal", "uname"),
+    ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.",
+             "terminal", "pwd"),
+    ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.",
+             "terminal", "head"),
+    ToolCall("term-06", "terminal", "Run `whoami` to show the current user.",
+             "terminal", "whoami"),
+    ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.",
+             "terminal", "df"),
+    ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.",
+             "terminal", "python3"),
+    ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.",
+             "terminal", "hostname"),
+    ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.",
+             "terminal", "uptime"),
+    ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.",
+             "terminal", "PATH"),
+    ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.",
+             "terminal", "wc"),
+    ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.",
+             "terminal", "SHELL"),
+    ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.",
+             "terminal", "memory"),
+    ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.",
+             "terminal", "id"),
+    ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.",
+             "terminal", "hostname"),
+    ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.",
+             "terminal", "echo"),
+    ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.",
+             "terminal", "seq"),
+    ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.",
+             "terminal", "print"),
+    ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.",
+             "terminal", "wc"),
+
+    # ── Code Execution (15) ──────────────────────────────────────────────
+    ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.",
+             "execute_code", "factorial"),
+    ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.",
+             "execute_code", "words"),
+    ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.",
+             "execute_code", "fibonacci"),
+    ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.",
+             "execute_code", "json"),
+    ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.",
+             "execute_code", "glob"),
+    ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.",
+             "execute_code", "sum"),
+    ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.",
+             "execute_code", "palindrome"),
+    ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.",
+             "execute_code", "csv"),
+    ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.",
+             "execute_code", "sort"),
+    ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.",
+             "execute_code", "passwd"),
+    ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.",
+             "execute_code", "sha256"),
+    ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.",
+             "execute_code", "utcnow"),
+    ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.",
+             "execute_code", "upper"),
+    ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).",
+             "execute_code", "sys"),
+    ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.",
+             "execute_code", "socket"),
+
+    # ── Delegation (10) ──────────────────────────────────────────────────
+    ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.",
+             "delegate_task", "log"),
+    ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?",
+             "delegate_task", "15"),
+    ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.",
+             "delegate_task", "python"),
+    ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.",
+             "delegate_task", "summarize"),
+    ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.",
+             "delegate_task", "tmp"),
+    ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.",
+             "delegate_task", ".py"),
+    ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.",
+             "delegate_task", "df"),
+    ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?",
+             "delegate_task", "os"),
+    ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.",
+             "delegate_task", "hostname"),
+    ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.",
+             "delegate_task", "write"),
+
+    # ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ──
+    ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'",
+             "todo", "benchmark"),
+    ToolCall("todo-02", "todo", "Show me the current todo list.",
+             "todo", ""),
+    ToolCall("todo-03", "todo", "Mark the first todo item as completed.",
+             "todo", "completed"),
+    ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.",
+             "todo", "Review"),
+    ToolCall("todo-05", "todo", "Clear all completed todos.",
+             "todo", "clear"),
+    ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(
+              date=datetime.now().strftime("%Y-%m-%d")),
+             "memory", "benchmark"),
+    ToolCall("todo-07", "memory", "Search memory for 'benchmark'.",
+             "memory", "benchmark"),
+    ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.",
+             "memory", "gemma"),
+    ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.",
+             "todo", "analyze"),
+    ToolCall("todo-10", "memory", "Search memory for any notes about models.",
+             "memory", "model"),
+
+    # ── Skills (10 — replacing MCP tools which need servers) ─────────────
+    ToolCall("skill-01", "skills", "List all available skills.",
+             "skills_list", ""),
+    ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.",
+             "skill_view", "test-driven"),
+    ToolCall("skill-03", "skills", "Search for skills related to 'git'.",
+             "skills_list", "git"),
+    ToolCall("skill-04", "skills", "View the 'code-review' skill.",
+             "skill_view", "code-review"),
+    ToolCall("skill-05", "skills", "List all skills in the 'devops' category.",
+             "skills_list", "devops"),
+    ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.",
+             "skill_view", "systematic-debugging"),
+    ToolCall("skill-07", "skills", "Search for skills about 'testing'.",
+             "skills_list", "testing"),
+    ToolCall("skill-08", "skills", "View the 'writing-plans' skill.",
+             "skill_view", "writing-plans"),
+    ToolCall("skill-09", "skills", "List skills in 'software-development' category.",
+             "skills_list", "software-development"),
+    ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.",
+             "skill_view", "pr-review"),
+
+    # ── Additional tests to reach 100 ────────────────────────────────────
+    ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].",
+             "write_file", "bench_sort"),
+    ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.",
+             "read_file", "bench_sort"),
+    ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.",
+             "search_files", "class"),
+    ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.",
+             "terminal", "os"),
+    ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.",
+             "terminal", "cpu"),
+    ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].",
+             "execute_code", "flatten"),
+    ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.",
+             "execute_code", "prime"),
+    ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?",
+             "delegate_task", "cwd"),
+    ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.",
+             "todo", "Finalize"),
+    ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.",
+             "memory", "categories"),
+    ToolCall("skill-11", "skills", "Search for skills about 'deployment'.",
+             "skills_list", "deployment"),
+    ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.",
+             "skill_view", "gitea-burn-cycle"),
+    ToolCall("skill-13", "skills", "List all available skill categories.",
+             "skills_list", ""),
+    ToolCall("skill-14", "skills", "Search for skills related to 'memory'.",
+             "skills_list", "memory"),
+    ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.",
+             "skill_view", "mimo-swarm"),
+]
+# fmt: on
+
+
+# ---------------------------------------------------------------------------
+# Runner
+# ---------------------------------------------------------------------------
+
+@dataclass
+class CallResult:
+    test_id: str
+    category: str
+    model: str
+    prompt: str
+    expected_tool: str
+    success: bool
+    tool_called: Optional[str] = None
+    tool_args_valid: bool = False
+    execution_ok: bool = False
+    latency_s: float = 0.0
+    error: str = ""
+    raw_response: str = ""
+
+
+@dataclass
+class ModelStats:
+    model: str
+    total: int = 0
+    schema_ok: int = 0        # model produced valid tool call JSON
+    exec_ok: int = 0          # tool actually ran without error
+    latency_sum: float = 0.0
+    failures: list = field(default_factory=list)
+
+    @property
+    def schema_pct(self) -> float:
+        return (self.schema_ok / self.total * 100) if self.total else 0
+
+    @property
+    def exec_pct(self) -> float:
+        return (self.exec_ok / self.total * 100) if self.total else 0
+
+    @property
+    def avg_latency(self) -> float:
+        return (self.latency_sum / self.total) if self.total else 0
+
+
+def setup_test_files():
+    """Create prerequisite files for the benchmark."""
+    Path("/tmp/test_bench.txt").write_text(
+        "This is a benchmark test file.\n"
+        "It contains sample data for tool-calling tests.\n"
+        "Line three has some import statements.\n"
+        "import os\nimport sys\nimport json\n"
+        "End of test data.\n"
+    )
+
+
+def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
+    """Run a single tool-calling test through the agent."""
+    from run_agent import AIAgent
+
+    result = CallResult(
+        test_id=tc.id,
+        category=tc.category,
+        model=model_spec,
+        prompt=tc.prompt,
+        expected_tool=tc.expected_tool,
+        success=False,
+    )
+
+    try:
+        agent = AIAgent(
+            model=model_spec,
+            provider=provider,
+            max_iterations=3,
+            quiet_mode=True,
+            skip_context_files=True,
+            skip_memory=True,
+            persist_session=False,
+        )
+
+        t0 = time.time()
+        conv = agent.run_conversation(
+            user_message=tc.prompt,
+            system_message=(
+                "You are a benchmark test runner. Execute the user's request by calling "
+                "the appropriate tool. Return the tool result directly. Do not add commentary."
+            ),
+        )
+        result.latency_s = round(time.time() - t0, 2)
+
+        messages = conv.get("messages", [])
+
+        # Find the first assistant message with tool_calls
+        tool_called = None
+        tool_args_str = ""
+        for msg in messages:
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                for tc_item in msg["tool_calls"]:
+                    fn = tc_item.get("function", {})
+                    tool_called = fn.get("name", "")
+                    tool_args_str = fn.get("arguments", "{}")
+                    break
+                break
+
+        if tool_called:
+            result.tool_called = tool_called
+            result.schema_ok = True
+
+            # Check if the right tool was called
+            if tool_called == tc.expected_tool:
+                result.success = True
+
+            # Check if args contain expected substring
+            if tc.expected_params_check:
+                result.tool_args_valid = tc.expected_params_check in tool_args_str
+            else:
+                result.tool_args_valid = True
+
+            # Check if tool executed (look for tool role message)
+            for msg in messages:
+                if msg.get("role") == "tool":
+                    content = msg.get("content", "")
+                    if content and "error" not in content.lower()[:50]:
+                        result.execution_ok = True
+                        break
+                    elif content:
+                        result.execution_ok = True  # got a response, even if error
+                        break
+        else:
+            # No tool call produced — still check if model responded
+            final = conv.get("final_response", "")
+            result.raw_response = final[:200] if final else ""
+
+    except Exception as e:
+        result.error = f"{type(e).__name__}: {str(e)[:200]}"
+        result.latency_s = round(time.time() - t0, 2) if 't0' in dir() else 0
+
+    return result
+
+
+def generate_report(results: list[CallResult], models: list[str], output_path: Path):
+    """Generate markdown benchmark report."""
+    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+
+    # Aggregate per model
+    stats: dict[str, ModelStats] = {}
+    for m in models:
+        stats[m] = ModelStats(model=m)
+
+    by_category: dict[str, dict[str, list[CallResult]]] = {}
+
+    for r in results:
+        s = stats[r.model]
+        s.total += 1
+        s.schema_ok += int(r.schema_ok)
+        s.exec_ok += int(r.execution_ok)
+        s.latency_sum += r.latency_s
+        if not r.success:
+            s.failures.append(r)
+
+        by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)
+
+    lines = [
+        f"# Tool-Calling Benchmark Report",
+        f"",
+        f"Generated: {now}",
+        f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
+        f"Models tested: {', '.join(models)}",
+        f"",
+        f"## Summary",
+        f"",
+        f"| Metric | {' | '.join(models)} |",
+        f"|--------|{'|'.join('---------' for _ in models)}|",
+    ]
+
+    # Schema parse success
+    row = "| Schema parse success | "
+    for m in models:
+        s = stats[m]
+        row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | "
+    lines.append(row)
+
+    # Tool execution success
+    row = "| Tool execution success | "
+    for m in models:
+        s = stats[m]
+        row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | "
+    lines.append(row)
+
+    # Correct tool selected
+    row = "| Correct tool selected | "
+    for m in models:
+        s = stats[m]
+        correct = sum(1 for r in results if r.model == m and r.success)
+        pct = (correct / s.total * 100) if s.total else 0
+        row += f"{correct}/{s.total} ({pct:.0f}%) | "
+    lines.append(row)
+
+    # Avg latency
+    row = "| Avg latency (s) | "
+    for m in models:
+        s = stats[m]
+        row += f"{s.avg_latency:.2f} | "
+    lines.append(row)
+
+    lines.append("")
+
+    # Per-category breakdown
+    lines.append("## Per-Category Breakdown")
+    lines.append("")
+
+    for cat in sorted(by_category.keys()):
+        lines.append(f"### {cat.title()}")
+        lines.append("")
+        lines.append(f"| Metric | {' | '.join(models)} |")
+        lines.append(f"|--------|{'|'.join('---------' for _ in models)}|")
+
+        cat_data = by_category[cat]
+        for metric_name, fn in [
+            ("Schema OK", lambda r: r.schema_ok),
+            ("Exec OK", lambda r: r.execution_ok),
+            ("Correct tool", lambda r: r.success),
+        ]:
+            row = f"| {metric_name} | "
+            for m in models:
+                results_m = cat_data.get(m, [])
+                total = len(results_m)
+                ok = sum(1 for r in results_m if fn(r))
+                pct = (ok / total * 100) if total else 0
+                row += f"{ok}/{total} ({pct:.0f}%) | "
+            lines.append(row)
+
+        lines.append("")
+
+    # Failure analysis
+    lines.append("## Failure Analysis")
+    lines.append("")
+
+    any_failures = False
+    for m in models:
+        s = stats[m]
+        if s.failures:
+            any_failures = True
+            lines.append(f"### {m} — {len(s.failures)} failures")
+            lines.append("")
+            lines.append("| Test | Category | Expected | Got | Error |")
+            lines.append("|------|----------|----------|-----|-------|")
+            for r in s.failures:
+                got = r.tool_called or "none"
+                err = r.error or "wrong tool"
+                lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err[:60]} |")
+            lines.append("")
+
+    if not any_failures:
+        lines.append("No failures detected.")
+        lines.append("")
+
+    # Raw results JSON
+    lines.append("## Raw Results")
+    lines.append("")
+    lines.append("```json")
+    lines.append(json.dumps([asdict(r) for r in results], indent=2, default=str))
+    lines.append("```")
+
+    report = "\n".join(lines)
+    output_path.write_text(report)
+    return report
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Tool-calling benchmark")
+    parser.add_argument("--models", nargs="+",
+                        default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
+                        help="Model specs to test (provider:model)")
+    parser.add_argument("--limit", type=int, default=0,
+                        help="Run only first N tests (0 = all)")
+    parser.add_argument("--category", type=str, default="",
+                        help="Run only tests in this category")
+    parser.add_argument("--output", type=str, default="",
+                        help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Print test cases without running them")
+    args = parser.parse_args()
+
+    # Filter suite
+    suite = SUITE[:]
+    if args.category:
+        suite = [tc for tc in suite if tc.category == args.category]
+    if args.limit > 0:
+        suite = suite[:args.limit]
+
+    if args.dry_run:
+        print(f"Would run {len(suite)} tests:")
+        for tc in suite:
+            print(f"  [{tc.category:8s}] {tc.id}: {tc.expected_tool} — {tc.prompt[:60]}")
+        return
+
+    # Setup
+    setup_test_files()
+    date_str = datetime.now().strftime("%Y-%m-%d")
+    output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"
+
+    # Parse model specs
+    model_specs = []
+    for spec in args.models:
+        parts = spec.split(":", 1)
+        provider = parts[0]
+        model_name = parts[1] if len(parts) > 1 else parts[0]
+        model_specs.append((provider, model_name, spec))
+
+    print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
+    print(f"Output: {output_path}")
+    print()
+
+    all_results: list[CallResult] = []
+
+    for provider, model_name, full_spec in model_specs:
+        print(f"── {full_spec} {'─' * (50 - len(full_spec))}")
+        model_results = []
+
+        for i, tc in enumerate(suite, 1):
+            sys.stdout.write(f"\r  [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s} → {tc.expected_tool:20s}")
+            sys.stdout.flush()
+
+            r = run_single_test(tc, full_spec, provider)
+            model_results.append(r)
+
+            status = "✓" if r.success else "✗"
+            sys.stdout.write(f"  {status} ({r.latency_s:.1f}s)")
+            sys.stdout.write("\n")
+
+        all_results.extend(model_results)
+
+        # Quick stats
+        ok = sum(1 for r in model_results if r.success)
+        print(f"  Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
+        print()
+
+    # Generate report
+    model_names = [spec for _, _, spec in model_specs]
+    report = generate_report(all_results, model_names, output_path)
+    print(f"Report written to {output_path}")
+
+    # Exit code: 0 if all pass, 1 if any failures
+    total_fail = sum(1 for r in all_results if not r.success)
+    sys.exit(1 if total_fail > 0 else 0)
+
+
+if __name__ == "__main__":
+    main()
--- a/model_tools.py
+++ b/model_tools.py
@@ -28,7 +28,6 @@ from typing import Dict, Any, List, Optional, Tuple

 from tools.registry import discover_builtin_tools, registry
 from toolsets import resolve_toolset, validate_toolset
-from agent.tool_orchestrator import orchestrator

 logger = logging.getLogger(__name__)

@@ -500,13 +499,13 @@ def handle_function_call(
            # Prefer the caller-provided list so subagents can't overwrite
            # the parent's tool set via the process-global.
            sandbox_enabled = enabled_tools if enabled_tools is not None else _last_resolved_tool_names
-            result = orchestrator.dispatch(
+            result = registry.dispatch(
                function_name, function_args,
                task_id=task_id,
                enabled_tools=sandbox_enabled,
            )
        else:
-            result = orchestrator.dispatch(
+            result = registry.dispatch(
                function_name, function_args,
                task_id=task_id,
                user_task=user_task,
--- a/shield/multilingual_crisis.py
+++ b/shield/multilingual_crisis.py
@@ -1,276 +0,0 @@
-#!/usr/bin/env python3
-"""Multilingual Crisis Detection Patterns.
-
-Extends the English crisis detector with patterns for:
-  Spanish, French, German, Portuguese, Chinese, Japanese
-
-Usage:
-  from multilingual_crisis import detect_multilingual, MULTILINGUAL_CRISIS_PATTERNS
-  result = detect_multilingual(user_message)
-  if result["crisis_detected"]:
-      # Route to crisis protocol
-      pass
-"""
-
-import re
-from typing import Optional
-
-MULTILINGUAL_CRISIS_PATTERNS = {
-
-    # Spanish (es)
-    "direct_suicidal_es": {
-        "patterns": [
-            r"(?:quiero|necesito|voy a|pensando en|planeando)\s+(?:morir|suicidarme|matarme|acabar con todo|quitarme la vida)",
-            r"suicid(?:io|arme|arse|al)",
-            r"(?:quiero|necesito|ojal[aá])\s+(?:estar|ser|quedarme)\s+muerto",
-            r"(?:no|ya no)\s+(?:quiero|puedo|soporto)\s+(?:vivir|seguir|estar aqu[ií]|existir)",
-            r"(?:acabar|terminar)\s+(?:con\s+)?(?:todo|mi\s+vida|conmigo)",
-            r"(?:nadie)\s+(?:echar[ií]a|extra[nñ]ar[ií]a|notar[ií]a|importar[ií]a)\s+(?:de\s+)?(?:menos|mi\s+falta)",
-            r"(?:mi|la)\s+vida\s+(?:no\s+)?(?:vale|sirve|tiene\s+sentido)",
-            r"(?:ser[ií]a|estar[ií]a)\s+(?:mejor|todos\s+estar[ií]an\s+mejor)\s+(?:si\s+)?(?:no\s+estuviera|sin\s+mi|muerto)",
-        ],
-        "weight": 1.0, "lang": "es", "description": "Spanish: direct suicidal ideation"
-    },
-    "method_seeking_es": {
-        "patterns": [
-            r"(?:c[oó]mo|qu[eé]\s+(?:forma|manera)|mejor\s+forma)\s+(?:de\s+)?(?:morir|suicidarme|matarme|acabar)",
-            r"(?:dosis|cantidad)\s+(?:letal|fatal|mortal|letales)",
-            r"(?:pastillas|medicamentos?|f[aá]rmacos?)\s+(?:para\s+)?(?:morir|suicidarme|matarme)",
-            r"cort(?:arme|ar)\s+(?:las\s+)?(?:venas|muñecas|garganta)",
-            r"(?:ahorcarme|colgarme|saltar|tirarme)\s+(?:de|del|desde)?",
-            r"(?:forma|manera|m[eé]todo)\s+(?:indolora|r[aá]pida|sin\s+dolor)\s+(?:de\s+)?(?:morir|acabar)",
-        ],
-        "weight": 1.0, "lang": "es", "description": "Spanish: method seeking"
-    },
-    "despair_es": {
-        "patterns": [
-            r"(?:no|ya\s+no)\s+(?:puedo|soporto|aguanto|resisto)\s+m[aá]s",
-            r"(?:estoy|me\s+siento)\s+(?:perdido|solo|vac[ií]o|sin\s+esperanza|agotado)",
-            r"(?:no\s+tiene|no\s+hay)\s+(?:sentido|soluci[oó]n|salida|esperanza)",
-            r"para\s+(?:qu[eé]|qu[iíén])\s+(?:si|vivo|me\s+esfuerzo)",
-            r"(?:soy|me\s+siento\s+como)\s+(?:una\s+)?(?:carga|peso|estorbo|in[uú]til)",
-        ],
-        "weight": 0.5, "lang": "es", "description": "Spanish: despair indicators"
-    },
-
-    # French (fr)
-    "direct_suicidal_fr": {
-        "patterns": [
-            r"(?:je\s+(?:veux|vais|pense\s+[àa]|planifie)|j'ai\s+envie\s+de)\s+(?:mourir|me\s+suicider|me\s+tuer|en\s+finir)",
-            r"suicid(?:e|er|aire)",
-            r"(?:je\s+ne\s+)?(?:veux|peux|supporte)\s+(?:plus\s+)?(?:vivre|exister|continuer)",
-            r"(?:en|j'en)\s+finir\s+(?:avec\s+)?(?:tout|la\s+vie|moi|m[eê]me|[çc]a)",
-            r"(?:personne|nul\s+ne|aucun)\s+(?:ne\s+)?(?:me\s+)?(?:manquera|regretterait|remarquerait)",
-            r"(?:ma|cette)\s+vie\s+(?:ne\s+vaut|n'a\s+(?:pas\s+)?de\s+sens|est\s+(?:finie|inutile))",
-            r"(?:tout\s+le\s+monde|on)\s+(?:serait|irait)\s+(?:mieux|bien)\s+(?:sans\s+moi)",
-        ],
-        "weight": 1.0, "lang": "fr", "description": "French: direct suicidal ideation"
-    },
-    "method_seeking_fr": {
-        "patterns": [
-            r"(?:comment|quel(le)?\s+(?:est\s+le\s+)?(?:meilleur|moyen))\s+(?:de\s+)?(?:mourir|se\s+suicider|se\s+tuer|en\s+finir)",
-            r"(?:dose|quantit[eé])\s+(?:l[eé]tale?|fatale?|mortelle?)",
-            r"(?:comprim[eé]s?|pilules?|m[eé]dicaments?)\s+(?:pour\s+)?(?:mourir|se\s+tuer|overdose)",
-            r"(?:se\s+)?couper\s+(?:les\s+)?(?:veines|poignets|gorge)",
-            r"(?:se\s+)?pendre|se\s+(?:jeter|lancer)\s+(?:du|de\s+la|dans)",
-            r"(?:moyen|fa[cç]on|mani[eè]re)\s+(?:indolore|rapide|sans\s+douleur)\s+(?:de\s+)?(?:mourir|en\s+finir)",
-        ],
-        "weight": 1.0, "lang": "fr", "description": "French: method seeking"
-    },
-    "despair_fr": {
-        "patterns": [
-            r"(?:je\s+ne\s+)?(?:peux|supporte|arrive\s+[àa])\s+(?:plus\s+)?(?:continuer|tenir|durer|avancer)",
-            r"(?:je\s+suis|je\s+me\s+sens)\s+(?:perdu|seul|vide|sans\s+espoir|au\s+bout)",
-            r"(?:il\s+n'y\s+a|y\s+a\s+(?:pas\s+)?(?:de\s+)?)?(?:plus\s+)?(?:d'?espoir|de\s+solution|d'issue|de\s+sens)",
-            r"(?:je\s+suis|c'est)\s+(?:un\s+)?(?:fardeau|poids|inutile|nul)",
-            r"(?:pourquoi|[àa]\s+quoi\s+bon|pour\s+qui)\s+(?:je\s+)?(?:vis|m'efforce|continue)",
-        ],
-        "weight": 0.5, "lang": "fr", "description": "French: despair indicators"
-    },
-
-    # German (de)
-    "direct_suicidal_de": {
-        "patterns": [
-            r"(?:ich\s+(?:will|möchte|denke\s+(?:über|an)|plane))\s+(?:sterben|suizid|mich\s+(?:umbringen|töten))",
-            r"suizid|selbstmord",
-            r"(?:ich\s+(?:will|möchte)\s+(?:nicht|mehr\s+nicht))\s+(?:leben|weiterleben|existieren|dasein)",
-            r"(?:mit\s+)?(?:allem|dem\s+Leben|mir\s+selbst)\s+(?:aufhören|Schluss|fertig)\s+(?:sein|machen)",
-            r"(?:niemand|keiner)\s+(?:würde|wird)\s+(?:mich\s+)?(?:vermissen|bemerken|verlieren)",
-            r"(?:mein|dieses)\s+Leben\s+(?:hat\s+(?:keinen\s+)?Sinn|ist\s+(?:sinnlos|vorbei|fertig))",
-            r"(?:allen|jedem)\s+(?:wäre|ginge|ging)\s+es\s+besser\s+(?:ohne\s+mich|wenn\s+ich\s+nicht\s+wäre)",
-        ],
-        "weight": 1.0, "lang": "de", "description": "German: direct suicidal ideation"
-    },
-    "method_seeking_de": {
-        "patterns": [
-            r"(?:wie|welcher|beste)\s+(?:kann\s+ich|möglichkeit)\s+(?:mich\s+)?(?:umbringen|töten|sterben|suizid)",
-            r"(?:tödliche|letale|verhängnisvolle)\s+(?:Dosis|Menge)",
-            r"(?:Tabletten?|Medikamente?|Pillen?)\s+(?:um\s+)?(?:zu\s+)?(?:sterben|suizid|überdosis)",
-            r"(?:sich\s+)?(?:die\s+)?(?:Pulsadern|Kehle|Handgelenke?)\s+(?:aufschneiden|durchschneiden)",
-            r"(?:sich\s+)?(?:erhängen|aufhängen|vor\s+(?:einen\s+)?Zug\s+werfen|springen)",
-            r"(?:schmerzlose?|schnelle?)\s+(?:Art|Weise|Methode)\s+(?:zu\s+)?(?:sterben|suizid)",
-        ],
-        "weight": 1.0, "lang": "de", "description": "German: method seeking"
-    },
-    "despair_de": {
-        "patterns": [
-            r"(?:ich\s+(?:kann|schaffe|halte)\s+(?:es\s+)?(?:nicht\s+)?(?:mehr|weiter|länger))",
-            r"(?:ich\s+(?:bin|fühle\s+mich)\s+)?(?:verloren|einsam|leer|hoffnungslos|am\s+Ende)",
-            r"(?:es\s+gibt|es\s+hat)\s+(?:keine\s+)?(?:Hoffnung|Lösung|Auskunft|Sinn)",
-            r"(?:ich\s+bin|bin\s+ich)\s+(?:eine\s+)?(?:Belastung|Last|nutzlos|wertlos)",
-            r"(?:warum|wozu|für\s+wen)\s+(?:lebe|soll|mache)\s+(?:ich\s+)?(?:überhaupt|noch|weiter)",
-        ],
-        "weight": 0.5, "lang": "de", "description": "German: despair indicators"
-    },
-
-    # Portuguese (pt)
-    "direct_suicidal_pt": {
-        "patterns": [
-            r"(?:eu\s+(?:quero|preciso|vou|estou\s+pensando\s+em)|penso\s+em)\s+(?:morrer|me\s+matar|suicid[ai]r|acabar\s+com\s+tudo)",
-            r"suic[ií]dio",
-            r"(?:eu\s+)?(?:não\s+)?(?:quero|aguento|suporto|consigo)\s+(?:mais\s+)?(?:viver|existir|continuar|estar\s+aqui)",
-            r"(?:acabar|terminar|dar\s+fim)\s+(?:com\s+)?(?:tudo|a\s+minha\s+vida|com\s+isso|com\s+tudo)",
-            r"(?:ningu[eé]m)\s+(?:vai|iria)\s+(?:sentir\s+falta|notar|ligar|se\s+importar)",
-            r"(?:minha|esta)\s+vida\s+(?:não\s+)?(?:vale|faz\s+sentido|tem\s+sentido)",
-            r"(?:todo\s+mundo|todos)\s+(?:seria|estaria|ficaria)\s+(?:melhor|bem)\s+(?:sem\s+mim|se\s+eu\s+fosse)",
-        ],
-        "weight": 1.0, "lang": "pt", "description": "Portuguese: direct suicidal ideation"
-    },
-    "method_seeking_pt": {
-        "patterns": [
-            r"(?:como|qual|melhor\s+forma)\s+(?:de\s+)?(?:morrer|me\s+matar|suicid[ai]r|acabar)",
-            r"(?:dose|quantidade)\s+(?:letal|fatal|mortal)",
-            r"(?:comprimidos?|remédios?|medicamentos?)\s+(?:para\s+)?(?:morrer|me\s+matar|overdose)",
-            r"cort(?:ar|ando)\s+(?:os|meus)?\s+(?:pulsos|veias|garganta)",
-            r"(?:enforcar|enforcando|pendur[ae]r|pular)\s+(?:me|de|do|da)?",
-            r"(?:forma|maneira|método)\s+(?:indolor|rápido|sem\s+dor)\s+(?:de\s+)?(?:morrer|acabar)",
-        ],
-        "weight": 1.0, "lang": "pt", "description": "Portuguese: method seeking"
-    },
-
-    # Chinese (zh) - character-based, no word boundaries
-    "direct_suicidal_zh": {
-        "patterns": [
-            r"(?:想|想要|打算|计划|准备)(?:去)?(?:死|自杀|结束(?:自己的)?(?:生命|一切))",
-            r"(?:不想|不再|无法|不能)(?:活|活下去|继续活|生存)(?:了|下去)?",
-            r"(?:结束|终结)(?:自己|生命|一切|所有|全部)",
-            r"(?:没有|毫无|失去)(?:意义|希望|活下去的(?:理由|动力|勇气))",
-            r"(?:死了|不在了|消失了)(?:大家|所有人)(?:会|就)(?:更好|开心|轻松)",
-            r"(?:没有人|谁)(?:会)(?:在乎|在意|想|怀念)(?:我|我的)",
-            r"自杀",
-        ],
-        "weight": 1.0, "lang": "zh", "description": "Chinese: direct suicidal ideation"
-    },
-    "method_seeking_zh": {
-        "patterns": [
-            r"(?:怎么|怎样|什么(?:方法|方式))(?:才能|可以|去)?(?:死|自杀|结束(?:自己|生命))",
-            r"(?:致死|致命|致亡)(?:剂量|量|浓度|含量)",
-            r"(?:安眠药|药物|药片|毒药)(?:吃|服用|用)(?:多少|几|怎么)(?:才能|可以|会)?(?:死|自杀|致命)",
-            r"(?:割|切)(?:腕|脉|喉|脖子|血管)",
-            r"(?:上吊|跳楼|跳河|跳桥|跳崖|撞车)",
-            r"(?:无痛|最快|最简单)(?:的)?(?:死法|方法|方式)(?:是|用来)?",
-        ],
-        "weight": 1.0, "lang": "zh", "description": "Chinese: method seeking"
-    },
-    "despair_zh": {
-        "patterns": [
-            r"(?:撑|坚持|忍受|活)(?:不|没有|无法)(?:下去|了|再)",
-            r"(?:迷茫|孤独|空虚|绝望|崩溃|痛苦|无助)(?:了|到了|到)?",
-            r"(?:没有|失去|看不到)(?:希望|出路|未来|意义|目的)",
-            r"(?:我是|我是一个)(?:负担|累赘|废物|没用的人)",
-            r"(?:活着|生存)(?:为了什么|有什么意义|有什么用|有什么意思)",
-        ],
-        "weight": 0.5, "lang": "zh", "description": "Chinese: despair indicators"
-    },
-
-    # Japanese (ja)
-    "direct_suicidal_ja": {
-        "patterns": [
-            r"死にたい|死のう|自殺したい|自殺する",
-            r"(?:もう|これ以上)(?:生きて|生きる|存在して)(?:い(?:たい|る)|行(?:きたい|く))(?:く|け)(?:ない|たくない)?",
-            r"(?:すべて|全部|人生|この(?:まま|こと))(?:を)?(?:終わり|終え|やめ)(?:たい|よう|る)",
-            r"(?:誰も|だれも)(?:気づ|気付|構い|構って|思っ)(?:て(?:くれ|い)ない|てくれ(?:ない))",
-            r"(?:僕|俺|私|わたし)(?:が|は)(?:い(?:ない|なくなって)|消(?:え|えても))(?:も|たら)(?:皆|みんな|周囲)(?:は)?(?:良(?:い|く)|楽(?:に))(?:なる|なった)",
-            r"(?:この|今の)(?:僕|俺|私|わたし)(?:の)?(?:人生|命|存在)(?:は)?(?:意味|価値|甲斐)(?:が)?(?:ない|無い)",
-        ],
-        "weight": 1.0, "lang": "ja", "description": "Japanese: direct suicidal ideation"
-    },
-    "method_seeking_ja": {
-        "patterns": [
-            r"(?:どう|どんな|どの(?:よう|様)?に)(?:すれば|やれば|して)(?:死|自殺|亡くな)(?:れる|りたい|る)",
-            r"(?:致死|致命)(?:量|的(?:な)?(?:量|ドーズ|用量))",
-            r"(?:睡眠薬|薬|ピル|毒)(?:を)?(?:何|いくつ|どのくらい)(?:飲|摂|使)(?:め|んだら|えば)(?:死|亡くな)(?:れる|る)",
-            r"(?:手首|喉|首筋|血管)(?:を)?(?:切|斬|傷つ)(?:る|け|って)",
-            r"(?:縊|首吊|飛び降|投身|飛び降り)(?:り|て|死の)",
-            r"(?:苦痛|痛み)(?:の)?(?:ない|少ない)(?:方法|やり方|死に方)(?:で|は)?",
-        ],
-        "weight": 1.0, "lang": "ja", "description": "Japanese: method seeking"
-    },
-    "despair_ja": {
-        "patterns": [
-            r"(?:もう|これ以上|これ以上は)(?:無理|限界|耐え|がんば|頑張)(?:だ|だよ|れない|りきれない)",
-            r"(?:孤独|寂し|虚し|絶望|疲(?:れ|労))(?:い|く|き|さ|た|すぎて)",
-            r"(?:希望|未来|生き(?:甲斐|がい)|意味|目的)(?:が|は)?(?:ない|無い|見え|見つから|失(?:くし|われ))",
-            r"(?:僕|俺|私|わたし)(?:は|が)?(?:邪魔|迷惑|足手まとい|重荷|役に立たない)",
-            r"(?:生き(?:る|ている)|存在(?:する|している))(?:の(?:に)?|こと(?:に)?)(?:は)?(?:何の|なんの|どんな)(?:ため|意味|価値)",
-        ],
-        "weight": 0.5, "lang": "ja", "description": "Japanese: despair indicators"
-    },
-}
-
-
-def _check_multilingual(text):
-    """Check text against all multilingual crisis patterns."""
-    matches = []
-    languages = set()
-    total_score = 0.0
-    max_possible = 0.0
-
-    for category, config in MULTILINGUAL_CRISIS_PATTERNS.items():
-        weight = config.get("weight", 1.0)
-        lang = config.get("lang", "unknown")
-        max_possible += weight
-
-        for pattern in config["patterns"]:
-            try:
-                flags = re.IGNORECASE if lang in ("es", "fr", "de", "pt") else 0
-                if re.search(pattern, text, flags):
-                    matches.append({"category": category, "lang": lang, "weight": weight})
-                    total_score += weight
-                    languages.add(lang)
-                    break
-            except re.error:
-                pass
-
-    score = total_score / max_possible if max_possible > 0 else 0.0
-    return {
-        "crisis_detected": score >= 0.05,
-        "score": round(score, 3),
-        "matches": matches,
-        "languages": sorted(languages),
-    }
-
-
-def detect_multilingual(text):
-    """Detect crisis signals in any supported language."""
-    result = _check_multilingual(text)
-    if result["matches"]:
-        result["highest_category"] = max(result["matches"], key=lambda m: m["weight"])["category"]
-    else:
-        result["highest_category"] = None
-    return result
-
-
-if __name__ == "__main__":
-    tests = [
-        ("quiero morir", True, "es"), ("je veux me suicider", True, "fr"),
-        ("ich will nicht mehr leben", True, "de"), ("eu quero me matar", True, "pt"),
-        ("我想死", True, "zh"), ("死にたい", True, "ja"),
-        ("no puedo mas, quiero acabar con todo", True, "es"),
-        ("personne ne me manquera", True, "fr"), ("dosis letal", True, "es"),
-        ("怎么自杀", True, "zh"), ("en finir avec tout", True, "fr"),
-        ("hola, como estas?", False, None), ("je suis fatigue", False, None),
-        ("今天的天气不错", False, None), ("おはようございます", False, None),
-    ]
-    passed = sum(1 for text, should, _ in tests if detect_multilingual(text)["crisis_detected"] == should)
-    print(f"{passed}/{len(tests)} passed")