Compare commits
1 Commits
fix/706
...
fix/796-to
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a244b157be |
@@ -1,177 +0,0 @@
|
||||
"""Tool Orchestrator — Robust execution and circuit breaking for agent tools.
|
||||
|
||||
Provides a unified execution service that wraps the tool registry.
|
||||
Implements the Circuit Breaker pattern to prevent the agent from getting
|
||||
stuck in failure loops when a specific tool or its underlying service
|
||||
is flapping or down.
|
||||
|
||||
Architecture:
|
||||
Discovery (tools/registry.py) -> Orchestration (agent/tool_orchestrator.py) -> Dispatch
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
import threading
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from tools.registry import registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CircuitState:
|
||||
"""States for the tool circuit breaker."""
|
||||
CLOSED = "closed" # Normal operation
|
||||
OPEN = "open" # Failing, execution blocked
|
||||
HALF_OPEN = "half_open" # Testing if service recovered
|
||||
|
||||
|
||||
@dataclass
|
||||
class ToolStats:
|
||||
"""Execution statistics for a tool."""
|
||||
name: str
|
||||
state: str = CircuitState.CLOSED
|
||||
failures: int = 0
|
||||
successes: int = 0
|
||||
last_failure_time: float = 0
|
||||
total_execution_time: float = 0
|
||||
call_count: int = 0
|
||||
|
||||
|
||||
class ToolOrchestrator:
|
||||
"""Orchestrates tool execution with robustness patterns."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
failure_threshold: int = 3,
|
||||
reset_timeout: int = 300,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
failure_threshold: Number of failures before opening the circuit.
|
||||
reset_timeout: Seconds to wait before transitioning from OPEN to HALF_OPEN.
|
||||
"""
|
||||
self.failure_threshold = failure_threshold
|
||||
self.reset_timeout = reset_timeout
|
||||
self._stats: Dict[str, ToolStats] = {}
|
||||
self._lock = threading.Lock()
|
||||
|
||||
def _get_stats(self, name: str) -> ToolStats:
|
||||
"""Get or initialize stats for a tool with thread-safe state transition."""
|
||||
with self._lock:
|
||||
if name not in self._stats:
|
||||
self._stats[name] = ToolStats(name=name)
|
||||
|
||||
stats = self._stats[name]
|
||||
|
||||
# Transition from OPEN to HALF_OPEN if timeout expired
|
||||
if stats.state == CircuitState.OPEN:
|
||||
if time.time() - stats.last_failure_time > self.reset_timeout:
|
||||
stats.state = CircuitState.HALF_OPEN
|
||||
logger.info("Circuit breaker HALF_OPEN for tool: %s", name)
|
||||
|
||||
return stats
|
||||
|
||||
def _record_success(self, name: str, execution_time: float):
|
||||
"""Record a successful tool execution and close the circuit."""
|
||||
with self._lock:
|
||||
stats = self._stats[name]
|
||||
stats.successes += 1
|
||||
stats.call_count += 1
|
||||
stats.total_execution_time += execution_time
|
||||
|
||||
if stats.state != CircuitState.CLOSED:
|
||||
logger.info("Circuit breaker CLOSED for tool: %s (recovered)", name)
|
||||
|
||||
stats.state = CircuitState.CLOSED
|
||||
stats.failures = 0
|
||||
|
||||
def _record_failure(self, name: str, execution_time: float):
|
||||
"""Record a failed tool execution and potentially open the circuit."""
|
||||
with self._lock:
|
||||
stats = self._stats[name]
|
||||
stats.failures += 1
|
||||
stats.call_count += 1
|
||||
stats.total_execution_time += execution_time
|
||||
stats.last_failure_time = time.time()
|
||||
|
||||
if stats.state == CircuitState.HALF_OPEN or stats.failures >= self.failure_threshold:
|
||||
stats.state = CircuitState.OPEN
|
||||
logger.warning(
|
||||
"Circuit breaker OPEN for tool: %s (failures: %d)",
|
||||
name, stats.failures
|
||||
)
|
||||
|
||||
def dispatch(self, name: str, args: dict, **kwargs) -> str:
|
||||
"""Execute a tool via the registry with circuit breaker protection."""
|
||||
stats = self._get_stats(name)
|
||||
|
||||
if stats.state == CircuitState.OPEN:
|
||||
return json.dumps({
|
||||
"error": (
|
||||
f"Tool '{name}' is temporarily unavailable due to repeated failures. "
|
||||
f"Circuit breaker is OPEN. Please try again in a few minutes or use an alternative tool."
|
||||
),
|
||||
"circuit_breaker": True,
|
||||
"tool_name": name
|
||||
})
|
||||
|
||||
start_time = time.time()
|
||||
try:
|
||||
# Dispatch to the underlying registry
|
||||
result_str = registry.dispatch(name, args, **kwargs)
|
||||
execution_time = time.time() - start_time
|
||||
|
||||
# Inspect result for errors. registry.dispatch catches internal
|
||||
# exceptions and returns a JSON error string.
|
||||
is_error = False
|
||||
try:
|
||||
# Lightweight check for error key in JSON
|
||||
if '"error":' in result_str:
|
||||
res_json = json.loads(result_str)
|
||||
if isinstance(res_json, dict) and "error" in res_json:
|
||||
is_error = True
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
# If it's not valid JSON, it's a malformed result (error)
|
||||
is_error = True
|
||||
|
||||
if is_error:
|
||||
self._record_failure(name, execution_time)
|
||||
else:
|
||||
self._record_success(name, execution_time)
|
||||
|
||||
return result_str
|
||||
|
||||
except Exception as e:
|
||||
# This should rarely be hit as registry.dispatch catches most things,
|
||||
# but we guard against orchestrator-level or registry-level bugs.
|
||||
execution_time = time.time() - start_time
|
||||
self._record_failure(name, execution_time)
|
||||
|
||||
error_msg = f"Tool orchestrator error during {name}: {type(e).__name__}: {e}"
|
||||
logger.exception(error_msg)
|
||||
return json.dumps({
|
||||
"error": error_msg,
|
||||
"tool_name": name,
|
||||
"execution_time": execution_time
|
||||
})
|
||||
|
||||
def get_fleet_stats(self) -> Dict[str, Any]:
|
||||
"""Return execution statistics for all tools."""
|
||||
with self._lock:
|
||||
return {
|
||||
name: {
|
||||
"state": s.state,
|
||||
"failures": s.failures,
|
||||
"successes": s.successes,
|
||||
"avg_time": s.total_execution_time / s.call_count if s.call_count > 0 else 0,
|
||||
"calls": s.call_count
|
||||
}
|
||||
for name, s in self._stats.items()
|
||||
}
|
||||
|
||||
|
||||
# Global orchestrator instance
|
||||
orchestrator = ToolOrchestrator()
|
||||
40
benchmarks/gemma4-tool-calling-2026-04-13.md
Normal file
40
benchmarks/gemma4-tool-calling-2026-04-13.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# Tool Call Benchmark: Gemma 4 vs mimo-v2-pro
|
||||
|
||||
Date: 2026-04-13
|
||||
Status: Awaiting execution
|
||||
|
||||
## Test Design
|
||||
|
||||
100 diverse tool calls across 7 categories:
|
||||
|
||||
| Category | Count | Tools Tested |
|
||||
|----------|-------|--------------|
|
||||
| File operations | 20 | read_file, write_file, search_files |
|
||||
| Terminal commands | 20 | terminal |
|
||||
| Web search | 15 | web_search |
|
||||
| Code execution | 15 | execute_code |
|
||||
| Browser automation | 10 | browser_navigate |
|
||||
| Delegation | 10 | delegate_task |
|
||||
| MCP tools | 10 | mcp_* |
|
||||
|
||||
## Metrics
|
||||
|
||||
| Metric | mimo-v2-pro | Gemma 4 |
|
||||
|--------|-------------|---------|
|
||||
| Schema parse success | — | — |
|
||||
| Tool execution success | — | — |
|
||||
| Parallel tool success | — | — |
|
||||
| Avg latency (s) | — | — |
|
||||
| Token cost per call | — | — |
|
||||
|
||||
## How to Run
|
||||
|
||||
```bash
|
||||
python3 benchmarks/tool_call_benchmark.py --model nous:xiaomi/mimo-v2-pro
|
||||
python3 benchmarks/tool_call_benchmark.py --model ollama/gemma4:latest
|
||||
python3 benchmarks/tool_call_benchmark.py --compare
|
||||
```
|
||||
|
||||
## Gemma 4-Specific Failure Modes
|
||||
|
||||
To be documented after benchmark execution.
|
||||
614
benchmarks/tool_call_benchmark.py
Normal file
614
benchmarks/tool_call_benchmark.py
Normal file
@@ -0,0 +1,614 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.
|
||||
|
||||
Runs 100 diverse tool-calling prompts through multiple models and compares
|
||||
success rates, latency, and token costs.
|
||||
|
||||
Usage:
|
||||
python3 benchmarks/tool_call_benchmark.py # full 100-call suite
|
||||
python3 benchmarks/tool_call_benchmark.py --limit 10 # quick smoke test
|
||||
python3 benchmarks/tool_call_benchmark.py --models nous # single model
|
||||
python3 benchmarks/tool_call_benchmark.py --category file # single category
|
||||
|
||||
Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Ensure hermes-agent root is importable
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
sys.path.insert(0, str(REPO_ROOT))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test Definitions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
|
||||
class ToolCall:
|
||||
"""A single tool-calling test case."""
|
||||
id: str
|
||||
category: str
|
||||
prompt: str
|
||||
expected_tool: str # tool name we expect the model to call
|
||||
expected_params_check: str = "" # substring expected in JSON args
|
||||
timeout: int = 30 # max seconds per call
|
||||
notes: str = ""
|
||||
|
||||
|
||||
# fmt: off
|
||||
SUITE: list[ToolCall] = [
|
||||
# ── File Operations (20) ──────────────────────────────────────────────
|
||||
ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.",
|
||||
"read_file", "path"),
|
||||
ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt",
|
||||
"write_file", "path"),
|
||||
ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.",
|
||||
"search_files", "pattern"),
|
||||
ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts",
|
||||
"read_file", "offset"),
|
||||
ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'",
|
||||
"patch", "old_string"),
|
||||
ToolCall("file-06", "file", "Search for files matching *.py in the current directory.",
|
||||
"search_files", "target"),
|
||||
ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd",
|
||||
"read_file", "limit"),
|
||||
ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true",
|
||||
"write_file", "content"),
|
||||
ToolCall("file-09", "file", "Search for 'def test_' in Python test files.",
|
||||
"search_files", "file_glob"),
|
||||
ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.",
|
||||
"read_file", "bench_config"),
|
||||
ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'",
|
||||
"write_file", "bench_readme"),
|
||||
ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.",
|
||||
"search_files", "TODO"),
|
||||
ToolCall("file-13", "file", "Read /tmp/bench_readme.md",
|
||||
"read_file", "bench_readme"),
|
||||
ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'",
|
||||
"patch", "Tool Benchmark"),
|
||||
ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.",
|
||||
"write_file", "bench_hello"),
|
||||
ToolCall("file-16", "file", "Search for all .json files in /tmp/.",
|
||||
"search_files", "json"),
|
||||
ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').",
|
||||
"read_file", "bench_hello"),
|
||||
ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.",
|
||||
"patch", "hello world"),
|
||||
ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.",
|
||||
"search_files", "bench"),
|
||||
ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.",
|
||||
"read_file", "test_bench"),
|
||||
|
||||
# ── Terminal Commands (20) ────────────────────────────────────────────
|
||||
ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.",
|
||||
"terminal", "echo"),
|
||||
ToolCall("term-02", "terminal", "Run `date` to get the current date and time.",
|
||||
"terminal", "date"),
|
||||
ToolCall("term-03", "terminal", "Run `uname -a` to get system information.",
|
||||
"terminal", "uname"),
|
||||
ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.",
|
||||
"terminal", "pwd"),
|
||||
ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.",
|
||||
"terminal", "head"),
|
||||
ToolCall("term-06", "terminal", "Run `whoami` to show the current user.",
|
||||
"terminal", "whoami"),
|
||||
ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.",
|
||||
"terminal", "df"),
|
||||
ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.",
|
||||
"terminal", "python3"),
|
||||
ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.",
|
||||
"terminal", "hostname"),
|
||||
ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.",
|
||||
"terminal", "uptime"),
|
||||
ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.",
|
||||
"terminal", "PATH"),
|
||||
ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.",
|
||||
"terminal", "wc"),
|
||||
ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.",
|
||||
"terminal", "SHELL"),
|
||||
ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.",
|
||||
"terminal", "memory"),
|
||||
ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.",
|
||||
"terminal", "id"),
|
||||
ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.",
|
||||
"terminal", "hostname"),
|
||||
ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.",
|
||||
"terminal", "echo"),
|
||||
ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.",
|
||||
"terminal", "seq"),
|
||||
ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.",
|
||||
"terminal", "print"),
|
||||
ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.",
|
||||
"terminal", "wc"),
|
||||
|
||||
# ── Code Execution (15) ──────────────────────────────────────────────
|
||||
ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.",
|
||||
"execute_code", "factorial"),
|
||||
ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.",
|
||||
"execute_code", "words"),
|
||||
ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.",
|
||||
"execute_code", "fibonacci"),
|
||||
ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.",
|
||||
"execute_code", "json"),
|
||||
ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.",
|
||||
"execute_code", "glob"),
|
||||
ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.",
|
||||
"execute_code", "sum"),
|
||||
ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.",
|
||||
"execute_code", "palindrome"),
|
||||
ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.",
|
||||
"execute_code", "csv"),
|
||||
ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.",
|
||||
"execute_code", "sort"),
|
||||
ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.",
|
||||
"execute_code", "passwd"),
|
||||
ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.",
|
||||
"execute_code", "sha256"),
|
||||
ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.",
|
||||
"execute_code", "utcnow"),
|
||||
ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.",
|
||||
"execute_code", "upper"),
|
||||
ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).",
|
||||
"execute_code", "sys"),
|
||||
ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.",
|
||||
"execute_code", "socket"),
|
||||
|
||||
# ── Delegation (10) ──────────────────────────────────────────────────
|
||||
ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.",
|
||||
"delegate_task", "log"),
|
||||
ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?",
|
||||
"delegate_task", "15"),
|
||||
ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.",
|
||||
"delegate_task", "python"),
|
||||
ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.",
|
||||
"delegate_task", "summarize"),
|
||||
ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.",
|
||||
"delegate_task", "tmp"),
|
||||
ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.",
|
||||
"delegate_task", ".py"),
|
||||
ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.",
|
||||
"delegate_task", "df"),
|
||||
ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?",
|
||||
"delegate_task", "os"),
|
||||
ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.",
|
||||
"delegate_task", "hostname"),
|
||||
ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.",
|
||||
"delegate_task", "write"),
|
||||
|
||||
# ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ──
|
||||
ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'",
|
||||
"todo", "benchmark"),
|
||||
ToolCall("todo-02", "todo", "Show me the current todo list.",
|
||||
"todo", ""),
|
||||
ToolCall("todo-03", "todo", "Mark the first todo item as completed.",
|
||||
"todo", "completed"),
|
||||
ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.",
|
||||
"todo", "Review"),
|
||||
ToolCall("todo-05", "todo", "Clear all completed todos.",
|
||||
"todo", "clear"),
|
||||
ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(
|
||||
date=datetime.now().strftime("%Y-%m-%d")),
|
||||
"memory", "benchmark"),
|
||||
ToolCall("todo-07", "memory", "Search memory for 'benchmark'.",
|
||||
"memory", "benchmark"),
|
||||
ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.",
|
||||
"memory", "gemma"),
|
||||
ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.",
|
||||
"todo", "analyze"),
|
||||
ToolCall("todo-10", "memory", "Search memory for any notes about models.",
|
||||
"memory", "model"),
|
||||
|
||||
# ── Skills (10 — replacing MCP tools which need servers) ─────────────
|
||||
ToolCall("skill-01", "skills", "List all available skills.",
|
||||
"skills_list", ""),
|
||||
ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.",
|
||||
"skill_view", "test-driven"),
|
||||
ToolCall("skill-03", "skills", "Search for skills related to 'git'.",
|
||||
"skills_list", "git"),
|
||||
ToolCall("skill-04", "skills", "View the 'code-review' skill.",
|
||||
"skill_view", "code-review"),
|
||||
ToolCall("skill-05", "skills", "List all skills in the 'devops' category.",
|
||||
"skills_list", "devops"),
|
||||
ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.",
|
||||
"skill_view", "systematic-debugging"),
|
||||
ToolCall("skill-07", "skills", "Search for skills about 'testing'.",
|
||||
"skills_list", "testing"),
|
||||
ToolCall("skill-08", "skills", "View the 'writing-plans' skill.",
|
||||
"skill_view", "writing-plans"),
|
||||
ToolCall("skill-09", "skills", "List skills in 'software-development' category.",
|
||||
"skills_list", "software-development"),
|
||||
ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.",
|
||||
"skill_view", "pr-review"),
|
||||
|
||||
# ── Additional tests to reach 100 ────────────────────────────────────
|
||||
ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].",
|
||||
"write_file", "bench_sort"),
|
||||
ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.",
|
||||
"read_file", "bench_sort"),
|
||||
ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.",
|
||||
"search_files", "class"),
|
||||
ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.",
|
||||
"terminal", "os"),
|
||||
ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.",
|
||||
"terminal", "cpu"),
|
||||
ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].",
|
||||
"execute_code", "flatten"),
|
||||
ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.",
|
||||
"execute_code", "prime"),
|
||||
ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?",
|
||||
"delegate_task", "cwd"),
|
||||
ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.",
|
||||
"todo", "Finalize"),
|
||||
ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.",
|
||||
"memory", "categories"),
|
||||
ToolCall("skill-11", "skills", "Search for skills about 'deployment'.",
|
||||
"skills_list", "deployment"),
|
||||
ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.",
|
||||
"skill_view", "gitea-burn-cycle"),
|
||||
ToolCall("skill-13", "skills", "List all available skill categories.",
|
||||
"skills_list", ""),
|
||||
ToolCall("skill-14", "skills", "Search for skills related to 'memory'.",
|
||||
"skills_list", "memory"),
|
||||
ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.",
|
||||
"skill_view", "mimo-swarm"),
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
|
||||
class CallResult:
|
||||
test_id: str
|
||||
category: str
|
||||
model: str
|
||||
prompt: str
|
||||
expected_tool: str
|
||||
success: bool
|
||||
tool_called: Optional[str] = None
|
||||
tool_args_valid: bool = False
|
||||
execution_ok: bool = False
|
||||
latency_s: float = 0.0
|
||||
error: str = ""
|
||||
raw_response: str = ""
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelStats:
|
||||
model: str
|
||||
total: int = 0
|
||||
schema_ok: int = 0 # model produced valid tool call JSON
|
||||
exec_ok: int = 0 # tool actually ran without error
|
||||
latency_sum: float = 0.0
|
||||
failures: list = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def schema_pct(self) -> float:
|
||||
return (self.schema_ok / self.total * 100) if self.total else 0
|
||||
|
||||
@property
|
||||
def exec_pct(self) -> float:
|
||||
return (self.exec_ok / self.total * 100) if self.total else 0
|
||||
|
||||
@property
|
||||
def avg_latency(self) -> float:
|
||||
return (self.latency_sum / self.total) if self.total else 0
|
||||
|
||||
|
||||
def setup_test_files():
|
||||
"""Create prerequisite files for the benchmark."""
|
||||
Path("/tmp/test_bench.txt").write_text(
|
||||
"This is a benchmark test file.\n"
|
||||
"It contains sample data for tool-calling tests.\n"
|
||||
"Line three has some import statements.\n"
|
||||
"import os\nimport sys\nimport json\n"
|
||||
"End of test data.\n"
|
||||
)
|
||||
|
||||
|
||||
def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
|
||||
"""Run a single tool-calling test through the agent."""
|
||||
from run_agent import AIAgent
|
||||
|
||||
result = CallResult(
|
||||
test_id=tc.id,
|
||||
category=tc.category,
|
||||
model=model_spec,
|
||||
prompt=tc.prompt,
|
||||
expected_tool=tc.expected_tool,
|
||||
success=False,
|
||||
)
|
||||
|
||||
try:
|
||||
agent = AIAgent(
|
||||
model=model_spec,
|
||||
provider=provider,
|
||||
max_iterations=3,
|
||||
quiet_mode=True,
|
||||
skip_context_files=True,
|
||||
skip_memory=True,
|
||||
persist_session=False,
|
||||
)
|
||||
|
||||
t0 = time.time()
|
||||
conv = agent.run_conversation(
|
||||
user_message=tc.prompt,
|
||||
system_message=(
|
||||
"You are a benchmark test runner. Execute the user's request by calling "
|
||||
"the appropriate tool. Return the tool result directly. Do not add commentary."
|
||||
),
|
||||
)
|
||||
result.latency_s = round(time.time() - t0, 2)
|
||||
|
||||
messages = conv.get("messages", [])
|
||||
|
||||
# Find the first assistant message with tool_calls
|
||||
tool_called = None
|
||||
tool_args_str = ""
|
||||
for msg in messages:
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
for tc_item in msg["tool_calls"]:
|
||||
fn = tc_item.get("function", {})
|
||||
tool_called = fn.get("name", "")
|
||||
tool_args_str = fn.get("arguments", "{}")
|
||||
break
|
||||
break
|
||||
|
||||
if tool_called:
|
||||
result.tool_called = tool_called
|
||||
result.schema_ok = True
|
||||
|
||||
# Check if the right tool was called
|
||||
if tool_called == tc.expected_tool:
|
||||
result.success = True
|
||||
|
||||
# Check if args contain expected substring
|
||||
if tc.expected_params_check:
|
||||
result.tool_args_valid = tc.expected_params_check in tool_args_str
|
||||
else:
|
||||
result.tool_args_valid = True
|
||||
|
||||
# Check if tool executed (look for tool role message)
|
||||
for msg in messages:
|
||||
if msg.get("role") == "tool":
|
||||
content = msg.get("content", "")
|
||||
if content and "error" not in content.lower()[:50]:
|
||||
result.execution_ok = True
|
||||
break
|
||||
elif content:
|
||||
result.execution_ok = True # got a response, even if error
|
||||
break
|
||||
else:
|
||||
# No tool call produced — still check if model responded
|
||||
final = conv.get("final_response", "")
|
||||
result.raw_response = final[:200] if final else ""
|
||||
|
||||
except Exception as e:
|
||||
result.error = f"{type(e).__name__}: {str(e)[:200]}"
|
||||
result.latency_s = round(time.time() - t0, 2) if 't0' in dir() else 0
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def generate_report(results: list[CallResult], models: list[str], output_path: Path):
|
||||
"""Generate markdown benchmark report."""
|
||||
now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
|
||||
|
||||
# Aggregate per model
|
||||
stats: dict[str, ModelStats] = {}
|
||||
for m in models:
|
||||
stats[m] = ModelStats(model=m)
|
||||
|
||||
by_category: dict[str, dict[str, list[CallResult]]] = {}
|
||||
|
||||
for r in results:
|
||||
s = stats[r.model]
|
||||
s.total += 1
|
||||
s.schema_ok += int(r.schema_ok)
|
||||
s.exec_ok += int(r.execution_ok)
|
||||
s.latency_sum += r.latency_s
|
||||
if not r.success:
|
||||
s.failures.append(r)
|
||||
|
||||
by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)
|
||||
|
||||
lines = [
|
||||
f"# Tool-Calling Benchmark Report",
|
||||
f"",
|
||||
f"Generated: {now}",
|
||||
f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
|
||||
f"Models tested: {', '.join(models)}",
|
||||
f"",
|
||||
f"## Summary",
|
||||
f"",
|
||||
f"| Metric | {' | '.join(models)} |",
|
||||
f"|--------|{'|'.join('---------' for _ in models)}|",
|
||||
]
|
||||
|
||||
# Schema parse success
|
||||
row = "| Schema parse success | "
|
||||
for m in models:
|
||||
s = stats[m]
|
||||
row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | "
|
||||
lines.append(row)
|
||||
|
||||
# Tool execution success
|
||||
row = "| Tool execution success | "
|
||||
for m in models:
|
||||
s = stats[m]
|
||||
row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | "
|
||||
lines.append(row)
|
||||
|
||||
# Correct tool selected
|
||||
row = "| Correct tool selected | "
|
||||
for m in models:
|
||||
s = stats[m]
|
||||
correct = sum(1 for r in results if r.model == m and r.success)
|
||||
pct = (correct / s.total * 100) if s.total else 0
|
||||
row += f"{correct}/{s.total} ({pct:.0f}%) | "
|
||||
lines.append(row)
|
||||
|
||||
# Avg latency
|
||||
row = "| Avg latency (s) | "
|
||||
for m in models:
|
||||
s = stats[m]
|
||||
row += f"{s.avg_latency:.2f} | "
|
||||
lines.append(row)
|
||||
|
||||
lines.append("")
|
||||
|
||||
# Per-category breakdown
|
||||
lines.append("## Per-Category Breakdown")
|
||||
lines.append("")
|
||||
|
||||
for cat in sorted(by_category.keys()):
|
||||
lines.append(f"### {cat.title()}")
|
||||
lines.append("")
|
||||
lines.append(f"| Metric | {' | '.join(models)} |")
|
||||
lines.append(f"|--------|{'|'.join('---------' for _ in models)}|")
|
||||
|
||||
cat_data = by_category[cat]
|
||||
for metric_name, fn in [
|
||||
("Schema OK", lambda r: r.schema_ok),
|
||||
("Exec OK", lambda r: r.execution_ok),
|
||||
("Correct tool", lambda r: r.success),
|
||||
]:
|
||||
row = f"| {metric_name} | "
|
||||
for m in models:
|
||||
results_m = cat_data.get(m, [])
|
||||
total = len(results_m)
|
||||
ok = sum(1 for r in results_m if fn(r))
|
||||
pct = (ok / total * 100) if total else 0
|
||||
row += f"{ok}/{total} ({pct:.0f}%) | "
|
||||
lines.append(row)
|
||||
|
||||
lines.append("")
|
||||
|
||||
# Failure analysis
|
||||
lines.append("## Failure Analysis")
|
||||
lines.append("")
|
||||
|
||||
any_failures = False
|
||||
for m in models:
|
||||
s = stats[m]
|
||||
if s.failures:
|
||||
any_failures = True
|
||||
lines.append(f"### {m} — {len(s.failures)} failures")
|
||||
lines.append("")
|
||||
lines.append("| Test | Category | Expected | Got | Error |")
|
||||
lines.append("|------|----------|----------|-----|-------|")
|
||||
for r in s.failures:
|
||||
got = r.tool_called or "none"
|
||||
err = r.error or "wrong tool"
|
||||
lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err[:60]} |")
|
||||
lines.append("")
|
||||
|
||||
if not any_failures:
|
||||
lines.append("No failures detected.")
|
||||
lines.append("")
|
||||
|
||||
# Raw results JSON
|
||||
lines.append("## Raw Results")
|
||||
lines.append("")
|
||||
lines.append("```json")
|
||||
lines.append(json.dumps([asdict(r) for r in results], indent=2, default=str))
|
||||
lines.append("```")
|
||||
|
||||
report = "\n".join(lines)
|
||||
output_path.write_text(report)
|
||||
return report
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Tool-calling benchmark")
|
||||
parser.add_argument("--models", nargs="+",
|
||||
default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
|
||||
help="Model specs to test (provider:model)")
|
||||
parser.add_argument("--limit", type=int, default=0,
|
||||
help="Run only first N tests (0 = all)")
|
||||
parser.add_argument("--category", type=str, default="",
|
||||
help="Run only tests in this category")
|
||||
parser.add_argument("--output", type=str, default="",
|
||||
help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
|
||||
parser.add_argument("--dry-run", action="store_true",
|
||||
help="Print test cases without running them")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Filter suite
|
||||
suite = SUITE[:]
|
||||
if args.category:
|
||||
suite = [tc for tc in suite if tc.category == args.category]
|
||||
if args.limit > 0:
|
||||
suite = suite[:args.limit]
|
||||
|
||||
if args.dry_run:
|
||||
print(f"Would run {len(suite)} tests:")
|
||||
for tc in suite:
|
||||
print(f" [{tc.category:8s}] {tc.id}: {tc.expected_tool} — {tc.prompt[:60]}")
|
||||
return
|
||||
|
||||
# Setup
|
||||
setup_test_files()
|
||||
date_str = datetime.now().strftime("%Y-%m-%d")
|
||||
output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"
|
||||
|
||||
# Parse model specs
|
||||
model_specs = []
|
||||
for spec in args.models:
|
||||
parts = spec.split(":", 1)
|
||||
provider = parts[0]
|
||||
model_name = parts[1] if len(parts) > 1 else parts[0]
|
||||
model_specs.append((provider, model_name, spec))
|
||||
|
||||
print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
|
||||
print(f"Output: {output_path}")
|
||||
print()
|
||||
|
||||
all_results: list[CallResult] = []
|
||||
|
||||
for provider, model_name, full_spec in model_specs:
|
||||
print(f"── {full_spec} {'─' * (50 - len(full_spec))}")
|
||||
model_results = []
|
||||
|
||||
for i, tc in enumerate(suite, 1):
|
||||
sys.stdout.write(f"\r [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s} → {tc.expected_tool:20s}")
|
||||
sys.stdout.flush()
|
||||
|
||||
r = run_single_test(tc, full_spec, provider)
|
||||
model_results.append(r)
|
||||
|
||||
status = "✓" if r.success else "✗"
|
||||
sys.stdout.write(f" {status} ({r.latency_s:.1f}s)")
|
||||
sys.stdout.write("\n")
|
||||
|
||||
all_results.extend(model_results)
|
||||
|
||||
# Quick stats
|
||||
ok = sum(1 for r in model_results if r.success)
|
||||
print(f" Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
|
||||
print()
|
||||
|
||||
# Generate report
|
||||
model_names = [spec for _, _, spec in model_specs]
|
||||
report = generate_report(all_results, model_names, output_path)
|
||||
print(f"Report written to {output_path}")
|
||||
|
||||
# Exit code: 0 if all pass, 1 if any failures
|
||||
total_fail = sum(1 for r in all_results if not r.success)
|
||||
sys.exit(1 if total_fail > 0 else 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -28,7 +28,6 @@ from typing import Dict, Any, List, Optional, Tuple
|
||||
|
||||
from tools.registry import discover_builtin_tools, registry
|
||||
from toolsets import resolve_toolset, validate_toolset
|
||||
from agent.tool_orchestrator import orchestrator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -500,13 +499,13 @@ def handle_function_call(
|
||||
# Prefer the caller-provided list so subagents can't overwrite
|
||||
# the parent's tool set via the process-global.
|
||||
sandbox_enabled = enabled_tools if enabled_tools is not None else _last_resolved_tool_names
|
||||
result = orchestrator.dispatch(
|
||||
result = registry.dispatch(
|
||||
function_name, function_args,
|
||||
task_id=task_id,
|
||||
enabled_tools=sandbox_enabled,
|
||||
)
|
||||
else:
|
||||
result = orchestrator.dispatch(
|
||||
result = registry.dispatch(
|
||||
function_name, function_args,
|
||||
task_id=task_id,
|
||||
user_task=user_task,
|
||||
|
||||
@@ -1,276 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Multilingual Crisis Detection Patterns.
|
||||
|
||||
Extends the English crisis detector with patterns for:
|
||||
Spanish, French, German, Portuguese, Chinese, Japanese
|
||||
|
||||
Usage:
|
||||
from multilingual_crisis import detect_multilingual, MULTILINGUAL_CRISIS_PATTERNS
|
||||
result = detect_multilingual(user_message)
|
||||
if result["crisis_detected"]:
|
||||
# Route to crisis protocol
|
||||
pass
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
MULTILINGUAL_CRISIS_PATTERNS = {
|
||||
|
||||
# Spanish (es)
|
||||
"direct_suicidal_es": {
|
||||
"patterns": [
|
||||
r"(?:quiero|necesito|voy a|pensando en|planeando)\s+(?:morir|suicidarme|matarme|acabar con todo|quitarme la vida)",
|
||||
r"suicid(?:io|arme|arse|al)",
|
||||
r"(?:quiero|necesito|ojal[aá])\s+(?:estar|ser|quedarme)\s+muerto",
|
||||
r"(?:no|ya no)\s+(?:quiero|puedo|soporto)\s+(?:vivir|seguir|estar aqu[ií]|existir)",
|
||||
r"(?:acabar|terminar)\s+(?:con\s+)?(?:todo|mi\s+vida|conmigo)",
|
||||
r"(?:nadie)\s+(?:echar[ií]a|extra[nñ]ar[ií]a|notar[ií]a|importar[ií]a)\s+(?:de\s+)?(?:menos|mi\s+falta)",
|
||||
r"(?:mi|la)\s+vida\s+(?:no\s+)?(?:vale|sirve|tiene\s+sentido)",
|
||||
r"(?:ser[ií]a|estar[ií]a)\s+(?:mejor|todos\s+estar[ií]an\s+mejor)\s+(?:si\s+)?(?:no\s+estuviera|sin\s+mi|muerto)",
|
||||
],
|
||||
"weight": 1.0, "lang": "es", "description": "Spanish: direct suicidal ideation"
|
||||
},
|
||||
"method_seeking_es": {
|
||||
"patterns": [
|
||||
r"(?:c[oó]mo|qu[eé]\s+(?:forma|manera)|mejor\s+forma)\s+(?:de\s+)?(?:morir|suicidarme|matarme|acabar)",
|
||||
r"(?:dosis|cantidad)\s+(?:letal|fatal|mortal|letales)",
|
||||
r"(?:pastillas|medicamentos?|f[aá]rmacos?)\s+(?:para\s+)?(?:morir|suicidarme|matarme)",
|
||||
r"cort(?:arme|ar)\s+(?:las\s+)?(?:venas|muñecas|garganta)",
|
||||
r"(?:ahorcarme|colgarme|saltar|tirarme)\s+(?:de|del|desde)?",
|
||||
r"(?:forma|manera|m[eé]todo)\s+(?:indolora|r[aá]pida|sin\s+dolor)\s+(?:de\s+)?(?:morir|acabar)",
|
||||
],
|
||||
"weight": 1.0, "lang": "es", "description": "Spanish: method seeking"
|
||||
},
|
||||
"despair_es": {
|
||||
"patterns": [
|
||||
r"(?:no|ya\s+no)\s+(?:puedo|soporto|aguanto|resisto)\s+m[aá]s",
|
||||
r"(?:estoy|me\s+siento)\s+(?:perdido|solo|vac[ií]o|sin\s+esperanza|agotado)",
|
||||
r"(?:no\s+tiene|no\s+hay)\s+(?:sentido|soluci[oó]n|salida|esperanza)",
|
||||
r"para\s+(?:qu[eé]|qu[iíén])\s+(?:si|vivo|me\s+esfuerzo)",
|
||||
r"(?:soy|me\s+siento\s+como)\s+(?:una\s+)?(?:carga|peso|estorbo|in[uú]til)",
|
||||
],
|
||||
"weight": 0.5, "lang": "es", "description": "Spanish: despair indicators"
|
||||
},
|
||||
|
||||
# French (fr)
|
||||
"direct_suicidal_fr": {
|
||||
"patterns": [
|
||||
r"(?:je\s+(?:veux|vais|pense\s+[àa]|planifie)|j'ai\s+envie\s+de)\s+(?:mourir|me\s+suicider|me\s+tuer|en\s+finir)",
|
||||
r"suicid(?:e|er|aire)",
|
||||
r"(?:je\s+ne\s+)?(?:veux|peux|supporte)\s+(?:plus\s+)?(?:vivre|exister|continuer)",
|
||||
r"(?:en|j'en)\s+finir\s+(?:avec\s+)?(?:tout|la\s+vie|moi|m[eê]me|[çc]a)",
|
||||
r"(?:personne|nul\s+ne|aucun)\s+(?:ne\s+)?(?:me\s+)?(?:manquera|regretterait|remarquerait)",
|
||||
r"(?:ma|cette)\s+vie\s+(?:ne\s+vaut|n'a\s+(?:pas\s+)?de\s+sens|est\s+(?:finie|inutile))",
|
||||
r"(?:tout\s+le\s+monde|on)\s+(?:serait|irait)\s+(?:mieux|bien)\s+(?:sans\s+moi)",
|
||||
],
|
||||
"weight": 1.0, "lang": "fr", "description": "French: direct suicidal ideation"
|
||||
},
|
||||
"method_seeking_fr": {
|
||||
"patterns": [
|
||||
r"(?:comment|quel(le)?\s+(?:est\s+le\s+)?(?:meilleur|moyen))\s+(?:de\s+)?(?:mourir|se\s+suicider|se\s+tuer|en\s+finir)",
|
||||
r"(?:dose|quantit[eé])\s+(?:l[eé]tale?|fatale?|mortelle?)",
|
||||
r"(?:comprim[eé]s?|pilules?|m[eé]dicaments?)\s+(?:pour\s+)?(?:mourir|se\s+tuer|overdose)",
|
||||
r"(?:se\s+)?couper\s+(?:les\s+)?(?:veines|poignets|gorge)",
|
||||
r"(?:se\s+)?pendre|se\s+(?:jeter|lancer)\s+(?:du|de\s+la|dans)",
|
||||
r"(?:moyen|fa[cç]on|mani[eè]re)\s+(?:indolore|rapide|sans\s+douleur)\s+(?:de\s+)?(?:mourir|en\s+finir)",
|
||||
],
|
||||
"weight": 1.0, "lang": "fr", "description": "French: method seeking"
|
||||
},
|
||||
"despair_fr": {
|
||||
"patterns": [
|
||||
r"(?:je\s+ne\s+)?(?:peux|supporte|arrive\s+[àa])\s+(?:plus\s+)?(?:continuer|tenir|durer|avancer)",
|
||||
r"(?:je\s+suis|je\s+me\s+sens)\s+(?:perdu|seul|vide|sans\s+espoir|au\s+bout)",
|
||||
r"(?:il\s+n'y\s+a|y\s+a\s+(?:pas\s+)?(?:de\s+)?)?(?:plus\s+)?(?:d'?espoir|de\s+solution|d'issue|de\s+sens)",
|
||||
r"(?:je\s+suis|c'est)\s+(?:un\s+)?(?:fardeau|poids|inutile|nul)",
|
||||
r"(?:pourquoi|[àa]\s+quoi\s+bon|pour\s+qui)\s+(?:je\s+)?(?:vis|m'efforce|continue)",
|
||||
],
|
||||
"weight": 0.5, "lang": "fr", "description": "French: despair indicators"
|
||||
},
|
||||
|
||||
# German (de)
|
||||
"direct_suicidal_de": {
|
||||
"patterns": [
|
||||
r"(?:ich\s+(?:will|möchte|denke\s+(?:über|an)|plane))\s+(?:sterben|suizid|mich\s+(?:umbringen|töten))",
|
||||
r"suizid|selbstmord",
|
||||
r"(?:ich\s+(?:will|möchte)\s+(?:nicht|mehr\s+nicht))\s+(?:leben|weiterleben|existieren|dasein)",
|
||||
r"(?:mit\s+)?(?:allem|dem\s+Leben|mir\s+selbst)\s+(?:aufhören|Schluss|fertig)\s+(?:sein|machen)",
|
||||
r"(?:niemand|keiner)\s+(?:würde|wird)\s+(?:mich\s+)?(?:vermissen|bemerken|verlieren)",
|
||||
r"(?:mein|dieses)\s+Leben\s+(?:hat\s+(?:keinen\s+)?Sinn|ist\s+(?:sinnlos|vorbei|fertig))",
|
||||
r"(?:allen|jedem)\s+(?:wäre|ginge|ging)\s+es\s+besser\s+(?:ohne\s+mich|wenn\s+ich\s+nicht\s+wäre)",
|
||||
],
|
||||
"weight": 1.0, "lang": "de", "description": "German: direct suicidal ideation"
|
||||
},
|
||||
"method_seeking_de": {
|
||||
"patterns": [
|
||||
r"(?:wie|welcher|beste)\s+(?:kann\s+ich|möglichkeit)\s+(?:mich\s+)?(?:umbringen|töten|sterben|suizid)",
|
||||
r"(?:tödliche|letale|verhängnisvolle)\s+(?:Dosis|Menge)",
|
||||
r"(?:Tabletten?|Medikamente?|Pillen?)\s+(?:um\s+)?(?:zu\s+)?(?:sterben|suizid|überdosis)",
|
||||
r"(?:sich\s+)?(?:die\s+)?(?:Pulsadern|Kehle|Handgelenke?)\s+(?:aufschneiden|durchschneiden)",
|
||||
r"(?:sich\s+)?(?:erhängen|aufhängen|vor\s+(?:einen\s+)?Zug\s+werfen|springen)",
|
||||
r"(?:schmerzlose?|schnelle?)\s+(?:Art|Weise|Methode)\s+(?:zu\s+)?(?:sterben|suizid)",
|
||||
],
|
||||
"weight": 1.0, "lang": "de", "description": "German: method seeking"
|
||||
},
|
||||
"despair_de": {
|
||||
"patterns": [
|
||||
r"(?:ich\s+(?:kann|schaffe|halte)\s+(?:es\s+)?(?:nicht\s+)?(?:mehr|weiter|länger))",
|
||||
r"(?:ich\s+(?:bin|fühle\s+mich)\s+)?(?:verloren|einsam|leer|hoffnungslos|am\s+Ende)",
|
||||
r"(?:es\s+gibt|es\s+hat)\s+(?:keine\s+)?(?:Hoffnung|Lösung|Auskunft|Sinn)",
|
||||
r"(?:ich\s+bin|bin\s+ich)\s+(?:eine\s+)?(?:Belastung|Last|nutzlos|wertlos)",
|
||||
r"(?:warum|wozu|für\s+wen)\s+(?:lebe|soll|mache)\s+(?:ich\s+)?(?:überhaupt|noch|weiter)",
|
||||
],
|
||||
"weight": 0.5, "lang": "de", "description": "German: despair indicators"
|
||||
},
|
||||
|
||||
# Portuguese (pt)
|
||||
"direct_suicidal_pt": {
|
||||
"patterns": [
|
||||
r"(?:eu\s+(?:quero|preciso|vou|estou\s+pensando\s+em)|penso\s+em)\s+(?:morrer|me\s+matar|suicid[ai]r|acabar\s+com\s+tudo)",
|
||||
r"suic[ií]dio",
|
||||
r"(?:eu\s+)?(?:não\s+)?(?:quero|aguento|suporto|consigo)\s+(?:mais\s+)?(?:viver|existir|continuar|estar\s+aqui)",
|
||||
r"(?:acabar|terminar|dar\s+fim)\s+(?:com\s+)?(?:tudo|a\s+minha\s+vida|com\s+isso|com\s+tudo)",
|
||||
r"(?:ningu[eé]m)\s+(?:vai|iria)\s+(?:sentir\s+falta|notar|ligar|se\s+importar)",
|
||||
r"(?:minha|esta)\s+vida\s+(?:não\s+)?(?:vale|faz\s+sentido|tem\s+sentido)",
|
||||
r"(?:todo\s+mundo|todos)\s+(?:seria|estaria|ficaria)\s+(?:melhor|bem)\s+(?:sem\s+mim|se\s+eu\s+fosse)",
|
||||
],
|
||||
"weight": 1.0, "lang": "pt", "description": "Portuguese: direct suicidal ideation"
|
||||
},
|
||||
"method_seeking_pt": {
|
||||
"patterns": [
|
||||
r"(?:como|qual|melhor\s+forma)\s+(?:de\s+)?(?:morrer|me\s+matar|suicid[ai]r|acabar)",
|
||||
r"(?:dose|quantidade)\s+(?:letal|fatal|mortal)",
|
||||
r"(?:comprimidos?|remédios?|medicamentos?)\s+(?:para\s+)?(?:morrer|me\s+matar|overdose)",
|
||||
r"cort(?:ar|ando)\s+(?:os|meus)?\s+(?:pulsos|veias|garganta)",
|
||||
r"(?:enforcar|enforcando|pendur[ae]r|pular)\s+(?:me|de|do|da)?",
|
||||
r"(?:forma|maneira|método)\s+(?:indolor|rápido|sem\s+dor)\s+(?:de\s+)?(?:morrer|acabar)",
|
||||
],
|
||||
"weight": 1.0, "lang": "pt", "description": "Portuguese: method seeking"
|
||||
},
|
||||
|
||||
# Chinese (zh) - character-based, no word boundaries
|
||||
"direct_suicidal_zh": {
|
||||
"patterns": [
|
||||
r"(?:想|想要|打算|计划|准备)(?:去)?(?:死|自杀|结束(?:自己的)?(?:生命|一切))",
|
||||
r"(?:不想|不再|无法|不能)(?:活|活下去|继续活|生存)(?:了|下去)?",
|
||||
r"(?:结束|终结)(?:自己|生命|一切|所有|全部)",
|
||||
r"(?:没有|毫无|失去)(?:意义|希望|活下去的(?:理由|动力|勇气))",
|
||||
r"(?:死了|不在了|消失了)(?:大家|所有人)(?:会|就)(?:更好|开心|轻松)",
|
||||
r"(?:没有人|谁)(?:会)(?:在乎|在意|想|怀念)(?:我|我的)",
|
||||
r"自杀",
|
||||
],
|
||||
"weight": 1.0, "lang": "zh", "description": "Chinese: direct suicidal ideation"
|
||||
},
|
||||
"method_seeking_zh": {
|
||||
"patterns": [
|
||||
r"(?:怎么|怎样|什么(?:方法|方式))(?:才能|可以|去)?(?:死|自杀|结束(?:自己|生命))",
|
||||
r"(?:致死|致命|致亡)(?:剂量|量|浓度|含量)",
|
||||
r"(?:安眠药|药物|药片|毒药)(?:吃|服用|用)(?:多少|几|怎么)(?:才能|可以|会)?(?:死|自杀|致命)",
|
||||
r"(?:割|切)(?:腕|脉|喉|脖子|血管)",
|
||||
r"(?:上吊|跳楼|跳河|跳桥|跳崖|撞车)",
|
||||
r"(?:无痛|最快|最简单)(?:的)?(?:死法|方法|方式)(?:是|用来)?",
|
||||
],
|
||||
"weight": 1.0, "lang": "zh", "description": "Chinese: method seeking"
|
||||
},
|
||||
"despair_zh": {
|
||||
"patterns": [
|
||||
r"(?:撑|坚持|忍受|活)(?:不|没有|无法)(?:下去|了|再)",
|
||||
r"(?:迷茫|孤独|空虚|绝望|崩溃|痛苦|无助)(?:了|到了|到)?",
|
||||
r"(?:没有|失去|看不到)(?:希望|出路|未来|意义|目的)",
|
||||
r"(?:我是|我是一个)(?:负担|累赘|废物|没用的人)",
|
||||
r"(?:活着|生存)(?:为了什么|有什么意义|有什么用|有什么意思)",
|
||||
],
|
||||
"weight": 0.5, "lang": "zh", "description": "Chinese: despair indicators"
|
||||
},
|
||||
|
||||
# Japanese (ja)
|
||||
"direct_suicidal_ja": {
|
||||
"patterns": [
|
||||
r"死にたい|死のう|自殺したい|自殺する",
|
||||
r"(?:もう|これ以上)(?:生きて|生きる|存在して)(?:い(?:たい|る)|行(?:きたい|く))(?:く|け)(?:ない|たくない)?",
|
||||
r"(?:すべて|全部|人生|この(?:まま|こと))(?:を)?(?:終わり|終え|やめ)(?:たい|よう|る)",
|
||||
r"(?:誰も|だれも)(?:気づ|気付|構い|構って|思っ)(?:て(?:くれ|い)ない|てくれ(?:ない))",
|
||||
r"(?:僕|俺|私|わたし)(?:が|は)(?:い(?:ない|なくなって)|消(?:え|えても))(?:も|たら)(?:皆|みんな|周囲)(?:は)?(?:良(?:い|く)|楽(?:に))(?:なる|なった)",
|
||||
r"(?:この|今の)(?:僕|俺|私|わたし)(?:の)?(?:人生|命|存在)(?:は)?(?:意味|価値|甲斐)(?:が)?(?:ない|無い)",
|
||||
],
|
||||
"weight": 1.0, "lang": "ja", "description": "Japanese: direct suicidal ideation"
|
||||
},
|
||||
"method_seeking_ja": {
|
||||
"patterns": [
|
||||
r"(?:どう|どんな|どの(?:よう|様)?に)(?:すれば|やれば|して)(?:死|自殺|亡くな)(?:れる|りたい|る)",
|
||||
r"(?:致死|致命)(?:量|的(?:な)?(?:量|ドーズ|用量))",
|
||||
r"(?:睡眠薬|薬|ピル|毒)(?:を)?(?:何|いくつ|どのくらい)(?:飲|摂|使)(?:め|んだら|えば)(?:死|亡くな)(?:れる|る)",
|
||||
r"(?:手首|喉|首筋|血管)(?:を)?(?:切|斬|傷つ)(?:る|け|って)",
|
||||
r"(?:縊|首吊|飛び降|投身|飛び降り)(?:り|て|死の)",
|
||||
r"(?:苦痛|痛み)(?:の)?(?:ない|少ない)(?:方法|やり方|死に方)(?:で|は)?",
|
||||
],
|
||||
"weight": 1.0, "lang": "ja", "description": "Japanese: method seeking"
|
||||
},
|
||||
"despair_ja": {
|
||||
"patterns": [
|
||||
r"(?:もう|これ以上|これ以上は)(?:無理|限界|耐え|がんば|頑張)(?:だ|だよ|れない|りきれない)",
|
||||
r"(?:孤独|寂し|虚し|絶望|疲(?:れ|労))(?:い|く|き|さ|た|すぎて)",
|
||||
r"(?:希望|未来|生き(?:甲斐|がい)|意味|目的)(?:が|は)?(?:ない|無い|見え|見つから|失(?:くし|われ))",
|
||||
r"(?:僕|俺|私|わたし)(?:は|が)?(?:邪魔|迷惑|足手まとい|重荷|役に立たない)",
|
||||
r"(?:生き(?:る|ている)|存在(?:する|している))(?:の(?:に)?|こと(?:に)?)(?:は)?(?:何の|なんの|どんな)(?:ため|意味|価値)",
|
||||
],
|
||||
"weight": 0.5, "lang": "ja", "description": "Japanese: despair indicators"
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _check_multilingual(text):
|
||||
"""Check text against all multilingual crisis patterns."""
|
||||
matches = []
|
||||
languages = set()
|
||||
total_score = 0.0
|
||||
max_possible = 0.0
|
||||
|
||||
for category, config in MULTILINGUAL_CRISIS_PATTERNS.items():
|
||||
weight = config.get("weight", 1.0)
|
||||
lang = config.get("lang", "unknown")
|
||||
max_possible += weight
|
||||
|
||||
for pattern in config["patterns"]:
|
||||
try:
|
||||
flags = re.IGNORECASE if lang in ("es", "fr", "de", "pt") else 0
|
||||
if re.search(pattern, text, flags):
|
||||
matches.append({"category": category, "lang": lang, "weight": weight})
|
||||
total_score += weight
|
||||
languages.add(lang)
|
||||
break
|
||||
except re.error:
|
||||
pass
|
||||
|
||||
score = total_score / max_possible if max_possible > 0 else 0.0
|
||||
return {
|
||||
"crisis_detected": score >= 0.05,
|
||||
"score": round(score, 3),
|
||||
"matches": matches,
|
||||
"languages": sorted(languages),
|
||||
}
|
||||
|
||||
|
||||
def detect_multilingual(text):
|
||||
"""Detect crisis signals in any supported language."""
|
||||
result = _check_multilingual(text)
|
||||
if result["matches"]:
|
||||
result["highest_category"] = max(result["matches"], key=lambda m: m["weight"])["category"]
|
||||
else:
|
||||
result["highest_category"] = None
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
tests = [
|
||||
("quiero morir", True, "es"), ("je veux me suicider", True, "fr"),
|
||||
("ich will nicht mehr leben", True, "de"), ("eu quero me matar", True, "pt"),
|
||||
("我想死", True, "zh"), ("死にたい", True, "ja"),
|
||||
("no puedo mas, quiero acabar con todo", True, "es"),
|
||||
("personne ne me manquera", True, "fr"), ("dosis letal", True, "es"),
|
||||
("怎么自杀", True, "zh"), ("en finir avec tout", True, "fr"),
|
||||
("hola, como estas?", False, None), ("je suis fatigue", False, None),
|
||||
("今天的天气不错", False, None), ("おはようございます", False, None),
|
||||
]
|
||||
passed = sum(1 for text, should, _ in tests if detect_multilingual(text)["crisis_detected"] == should)
|
||||
print(f"{passed}/{len(tests)} passed")
|
||||
Reference in New Issue
Block a user