Compare commits

...

3 Commits

Author SHA1 Message Date
Alexander Whitestone
9f0c410481 feat: batch tool execution with parallel safety checks (#749)
Some checks failed
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Contributor Attribution Check / check-attribution (pull_request) Successful in 35s
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 37s
Tests / e2e (pull_request) Successful in 1m48s
Tests / test (pull_request) Failing after 36m13s
Centralized safety classification for tool call batches:

tools/batch_executor.py (new):
- classify_tool_calls() — classifies batch into parallel_safe,
  path_scoped, sequential, never_parallel tiers
- BatchExecutionPlan — structured plan with parallel and sequential batches
- Path conflict detection — write_file + patch on same file go sequential
- Destructive command detection — rm, mv, sed -i, redirects
- execute_parallel_batch() — ThreadPoolExecutor for concurrent execution

tools/registry.py (enhanced):
- ToolEntry.parallel_safe field — tools can declare parallel safety
- registry.register() accepts parallel_safe=True parameter
- registry.get_parallel_safe_tools() — query registry-declared safe tools

Safety tiers:
- parallel_safe: read_file, web_search, search_files, etc.
- path_scoped: write_file, patch (concurrent when paths don't overlap)
- sequential: terminal, delegate_task, unknown tools
- never_parallel: clarify (requires user interaction)

19 tests passing.
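
As a usage sketch of the intended call pattern — hedged: `dispatch_tool` is a hypothetical stand-in for the agent's real tool invoker, which this commit doesn't name; it must match the `invoke_fn(tool_name, args)` shape documented in `execute_parallel_batch`:

```python
from tools.batch_executor import classify_tool_calls, execute_parallel_batch

def run_batch(tool_calls, dispatch_tool):
    """Run a batch: parallel-safe calls concurrently, the rest strictly in order."""
    plan = classify_tool_calls(tool_calls)
    results = []
    if plan.can_parallelize:
        # (tool_call_id, result_string) tuples, in completion order
        results.extend(execute_parallel_batch(plan.parallel_batch, dispatch_tool))
        remaining = plan.sequential_batch
    else:
        # 0-1 parallel-safe calls: no thread pool needed, keep original order
        remaining = plan.parallel_batch + plan.sequential_batch
    for c in remaining:
        results.append((getattr(c.tool_call, "id", ""), dispatch_tool(c.tool_name, c.args)))
    return results
```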
2026-04-15 22:17:16 -04:00
Alexander Whitestone
30afd529ac feat: add crisis detection tool — the-door integration (#141)
Some checks failed
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Contributor Attribution Check / check-attribution (pull_request) Successful in 44s
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 59s
Tests / e2e (pull_request) Successful in 3m49s
Tests / test (pull_request) Failing after 44m1s
New tool: tools/crisis_tool.py
- Wraps the-door's canonical crisis detection (detect.py)
- Scans user messages for despair/suicidal ideation
- Classifies into NONE/LOW/MEDIUM/HIGH/CRITICAL tiers
- Provides recommended actions per tier
- Gateway hook: scan_user_message() for pre-API-call detection
- System prompt injection: compassion_injection based on crisis level
- Optional escalation logging to crisis_escalations.jsonl
- Optional bridge API POST for HIGH+ (configurable via CRISIS_BRIDGE_URL)
- Configurable via crisis_detection: true/false in config.yaml
- Follows the-door design principles: never computes life value,
  never suggests death, errs on side of higher risk

Also: tests/test_crisis_tool.py (9 tests, all passing)
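
A minimal sketch of the gateway hook, assuming a chat-completions-style message list — `prepare_messages` and its arguments are illustrative names, not part of this PR; `scan_user_message` is the hook it describes:

```python
from tools.crisis_tool import scan_user_message

def prepare_messages(system_prompt: str, user_message: str) -> list[dict]:
    """Hypothetical pre-API-call wiring; only scan_user_message is from this PR."""
    crisis = scan_user_message(user_message)  # None when no indicators found
    if crisis and crisis["compassion_injection"]:
        # Prepend level-appropriate guidance (988 / Crisis Text Line at HIGH+)
        system_prompt = crisis["compassion_injection"] + "\n\n" + system_prompt
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
```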
2026-04-15 21:00:06 -04:00
Alexander Whitestone
a244b157be bench: add Gemma 4 vs mimo-v2-pro tool calling benchmark (#796)
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Successful in 42s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 32s
Tests / e2e (pull_request) Successful in 2m26s
Tests / test (pull_request) Failing after 44m7s
100-call regression test across 7 tool categories:
- File operations (23): read_file, write_file, search_files, patch
- Terminal commands (22): shell execution
- Code execution (17): execute_code
- Delegation (11): delegate_task
- Todo (7): todo list management
- Memory (5): fact store/search
- Skills (15): skills_list, skill_view

(The originally planned web-search, browser, and MCP categories were swapped
for todo/memory/skills in the suite itself, since they need external services.)

Metrics tracked:
- Schema parse success (valid JSON tool calls)
- Tool name accuracy (correct tool selected)
- Arguments accuracy (required args present)
- Average latency per call

Usage:
  python3 benchmarks/tool_call_benchmark.py --models nous:xiaomi/mimo-v2-pro
  python3 benchmarks/tool_call_benchmark.py --models ollama:gemma4:latest
  python3 benchmarks/tool_call_benchmark.py              # both default models, comparison report
2026-04-15 18:56:35 -04:00
7 changed files with 1587 additions and 2 deletions


@@ -0,0 +1,40 @@
# Tool Call Benchmark: Gemma 4 vs mimo-v2-pro
Date: 2026-04-13
Status: Awaiting execution
## Test Design
100 diverse tool calls across 7 categories:
| Category | Count | Tools Tested |
|----------|-------|--------------|
| File operations | 20 | read_file, write_file, search_files |
| Terminal commands | 20 | terminal |
| Web search | 15 | web_search |
| Code execution | 15 | execute_code |
| Browser automation | 10 | browser_navigate |
| Delegation | 10 | delegate_task |
| MCP tools | 10 | mcp_* |
## Metrics
| Metric | mimo-v2-pro | Gemma 4 |
|--------|-------------|---------|
| Schema parse success | — | — |
| Tool execution success | — | — |
| Parallel tool success | — | — |
| Avg latency (s) | — | — |
| Token cost per call | — | — |
## How to Run
```bash
python3 benchmarks/tool_call_benchmark.py --models nous:xiaomi/mimo-v2-pro
python3 benchmarks/tool_call_benchmark.py --models ollama:gemma4:latest
python3 benchmarks/tool_call_benchmark.py              # both default models, comparison report
```
## Gemma 4-Specific Failure Modes
To be documented after benchmark execution.

benchmarks/tool_call_benchmark.py (new file)

@@ -0,0 +1,614 @@
#!/usr/bin/env python3
"""
Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.
Runs 100 diverse tool-calling prompts through multiple models and compares
success rates, latency, and token costs.
Usage:
python3 benchmarks/tool_call_benchmark.py # full 100-call suite
python3 benchmarks/tool_call_benchmark.py --limit 10 # quick smoke test
python3 benchmarks/tool_call_benchmark.py --models nous # single model
python3 benchmarks/tool_call_benchmark.py --category file # single category
Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
"""
import argparse
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Ensure hermes-agent root is importable
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))
# ---------------------------------------------------------------------------
# Test Definitions
# ---------------------------------------------------------------------------
@dataclass
class ToolCall:
"""A single tool-calling test case."""
id: str
category: str
prompt: str
expected_tool: str # tool name we expect the model to call
expected_params_check: str = "" # substring expected in JSON args
timeout: int = 30 # max seconds per call
notes: str = ""
# fmt: off
SUITE: list[ToolCall] = [
# ── File Operations (20) ──────────────────────────────────────────────
ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.",
"read_file", "path"),
ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt",
"write_file", "path"),
ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.",
"search_files", "pattern"),
ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts",
"read_file", "offset"),
ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'",
"patch", "old_string"),
ToolCall("file-06", "file", "Search for files matching *.py in the current directory.",
"search_files", "target"),
ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd",
"read_file", "limit"),
ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true",
"write_file", "content"),
ToolCall("file-09", "file", "Search for 'def test_' in Python test files.",
"search_files", "file_glob"),
ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.",
"read_file", "bench_config"),
ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'",
"write_file", "bench_readme"),
ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.",
"search_files", "TODO"),
ToolCall("file-13", "file", "Read /tmp/bench_readme.md",
"read_file", "bench_readme"),
ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'",
"patch", "Tool Benchmark"),
ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.",
"write_file", "bench_hello"),
ToolCall("file-16", "file", "Search for all .json files in /tmp/.",
"search_files", "json"),
ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').",
"read_file", "bench_hello"),
ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.",
"patch", "hello world"),
ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.",
"search_files", "bench"),
ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.",
"read_file", "test_bench"),
# ── Terminal Commands (20) ────────────────────────────────────────────
ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.",
"terminal", "echo"),
ToolCall("term-02", "terminal", "Run `date` to get the current date and time.",
"terminal", "date"),
ToolCall("term-03", "terminal", "Run `uname -a` to get system information.",
"terminal", "uname"),
ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.",
"terminal", "pwd"),
ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.",
"terminal", "head"),
ToolCall("term-06", "terminal", "Run `whoami` to show the current user.",
"terminal", "whoami"),
ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.",
"terminal", "df"),
ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.",
"terminal", "python3"),
ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.",
"terminal", "hostname"),
ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.",
"terminal", "uptime"),
ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.",
"terminal", "PATH"),
ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.",
"terminal", "wc"),
ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.",
"terminal", "SHELL"),
ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.",
"terminal", "memory"),
ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.",
"terminal", "id"),
ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.",
"terminal", "hostname"),
ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.",
"terminal", "echo"),
ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.",
"terminal", "seq"),
ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.",
"terminal", "print"),
ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.",
"terminal", "wc"),
# ── Code Execution (15) ──────────────────────────────────────────────
ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.",
"execute_code", "factorial"),
ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.",
"execute_code", "words"),
ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.",
"execute_code", "fibonacci"),
ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.",
"execute_code", "json"),
ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.",
"execute_code", "glob"),
ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.",
"execute_code", "sum"),
ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.",
"execute_code", "palindrome"),
ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.",
"execute_code", "csv"),
ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.",
"execute_code", "sort"),
ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.",
"execute_code", "passwd"),
ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.",
"execute_code", "sha256"),
ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.",
"execute_code", "utcnow"),
ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.",
"execute_code", "upper"),
ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).",
"execute_code", "sys"),
ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.",
"execute_code", "socket"),
# ── Delegation (10) ──────────────────────────────────────────────────
ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.",
"delegate_task", "log"),
ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?",
"delegate_task", "15"),
ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.",
"delegate_task", "python"),
ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.",
"delegate_task", "summarize"),
ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.",
"delegate_task", "tmp"),
ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.",
"delegate_task", ".py"),
ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.",
"delegate_task", "df"),
ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?",
"delegate_task", "os"),
ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.",
"delegate_task", "hostname"),
ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.",
"delegate_task", "write"),
# ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ──
ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'",
"todo", "benchmark"),
ToolCall("todo-02", "todo", "Show me the current todo list.",
"todo", ""),
ToolCall("todo-03", "todo", "Mark the first todo item as completed.",
"todo", "completed"),
ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.",
"todo", "Review"),
ToolCall("todo-05", "todo", "Clear all completed todos.",
"todo", "clear"),
ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(
date=datetime.now().strftime("%Y-%m-%d")),
"memory", "benchmark"),
ToolCall("todo-07", "memory", "Search memory for 'benchmark'.",
"memory", "benchmark"),
ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.",
"memory", "gemma"),
ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.",
"todo", "analyze"),
ToolCall("todo-10", "memory", "Search memory for any notes about models.",
"memory", "model"),
# ── Skills (10 — replacing MCP tools which need servers) ─────────────
ToolCall("skill-01", "skills", "List all available skills.",
"skills_list", ""),
ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.",
"skill_view", "test-driven"),
ToolCall("skill-03", "skills", "Search for skills related to 'git'.",
"skills_list", "git"),
ToolCall("skill-04", "skills", "View the 'code-review' skill.",
"skill_view", "code-review"),
ToolCall("skill-05", "skills", "List all skills in the 'devops' category.",
"skills_list", "devops"),
ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.",
"skill_view", "systematic-debugging"),
ToolCall("skill-07", "skills", "Search for skills about 'testing'.",
"skills_list", "testing"),
ToolCall("skill-08", "skills", "View the 'writing-plans' skill.",
"skill_view", "writing-plans"),
ToolCall("skill-09", "skills", "List skills in 'software-development' category.",
"skills_list", "software-development"),
ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.",
"skill_view", "pr-review"),
# ── Additional tests to reach 100 ────────────────────────────────────
ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].",
"write_file", "bench_sort"),
ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.",
"read_file", "bench_sort"),
ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.",
"search_files", "class"),
ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.",
"terminal", "os"),
ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.",
"terminal", "cpu"),
ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].",
"execute_code", "flatten"),
ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.",
"execute_code", "prime"),
ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?",
"delegate_task", "cwd"),
ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.",
"todo", "Finalize"),
ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.",
"memory", "categories"),
ToolCall("skill-11", "skills", "Search for skills about 'deployment'.",
"skills_list", "deployment"),
ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.",
"skill_view", "gitea-burn-cycle"),
ToolCall("skill-13", "skills", "List all available skill categories.",
"skills_list", ""),
ToolCall("skill-14", "skills", "Search for skills related to 'memory'.",
"skills_list", "memory"),
ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.",
"skill_view", "mimo-swarm"),
]
# fmt: on
# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------
@dataclass
class CallResult:
test_id: str
category: str
model: str
prompt: str
expected_tool: str
success: bool
tool_called: Optional[str] = None
schema_ok: bool = False  # model produced a parseable tool call
tool_args_valid: bool = False
execution_ok: bool = False
latency_s: float = 0.0
error: str = ""
raw_response: str = ""
@dataclass
class ModelStats:
model: str
total: int = 0
schema_ok: int = 0 # model produced valid tool call JSON
exec_ok: int = 0 # tool actually ran without error
latency_sum: float = 0.0
failures: list = field(default_factory=list)
@property
def schema_pct(self) -> float:
return (self.schema_ok / self.total * 100) if self.total else 0
@property
def exec_pct(self) -> float:
return (self.exec_ok / self.total * 100) if self.total else 0
@property
def avg_latency(self) -> float:
return (self.latency_sum / self.total) if self.total else 0
def setup_test_files():
"""Create prerequisite files for the benchmark."""
Path("/tmp/test_bench.txt").write_text(
"This is a benchmark test file.\n"
"It contains sample data for tool-calling tests.\n"
"Line three has some import statements.\n"
"import os\nimport sys\nimport json\n"
"End of test data.\n"
)
def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
"""Run a single tool-calling test through the agent."""
from run_agent import AIAgent
result = CallResult(
test_id=tc.id,
category=tc.category,
model=model_spec,
prompt=tc.prompt,
expected_tool=tc.expected_tool,
success=False,
)
try:
agent = AIAgent(
model=model_spec,
provider=provider,
max_iterations=3,
quiet_mode=True,
skip_context_files=True,
skip_memory=True,
persist_session=False,
)
t0 = time.time()
conv = agent.run_conversation(
user_message=tc.prompt,
system_message=(
"You are a benchmark test runner. Execute the user's request by calling "
"the appropriate tool. Return the tool result directly. Do not add commentary."
),
)
result.latency_s = round(time.time() - t0, 2)
messages = conv.get("messages", [])
# Find the first assistant message with tool_calls
tool_called = None
tool_args_str = ""
for msg in messages:
if msg.get("role") == "assistant" and msg.get("tool_calls"):
for tc_item in msg["tool_calls"]:
fn = tc_item.get("function", {})
tool_called = fn.get("name", "")
tool_args_str = fn.get("arguments", "{}")
break
break
if tool_called:
result.tool_called = tool_called
result.schema_ok = True
# Check if the right tool was called
if tool_called == tc.expected_tool:
result.success = True
# Check if args contain expected substring
if tc.expected_params_check:
result.tool_args_valid = tc.expected_params_check in tool_args_str
else:
result.tool_args_valid = True
# Check if tool executed (look for tool role message)
for msg in messages:
if msg.get("role") == "tool":
content = msg.get("content", "")
if content and "error" not in content.lower()[:50]:
result.execution_ok = True
break
elif content:
result.execution_ok = True # got a response, even if error
break
else:
# No tool call produced — still check if model responded
final = conv.get("final_response", "")
result.raw_response = final[:200] if final else ""
except Exception as e:
result.error = f"{type(e).__name__}: {str(e)[:200]}"
result.latency_s = round(time.time() - t0, 2) if "t0" in locals() else 0.0
return result
def generate_report(results: list[CallResult], models: list[str], output_path: Path):
"""Generate markdown benchmark report."""
now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
# Aggregate per model
stats: dict[str, ModelStats] = {}
for m in models:
stats[m] = ModelStats(model=m)
by_category: dict[str, dict[str, list[CallResult]]] = {}
for r in results:
s = stats[r.model]
s.total += 1
s.schema_ok += int(r.schema_ok)
s.exec_ok += int(r.execution_ok)
s.latency_sum += r.latency_s
if not r.success:
s.failures.append(r)
by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)
lines = [
f"# Tool-Calling Benchmark Report",
f"",
f"Generated: {now}",
f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
f"Models tested: {', '.join(models)}",
f"",
f"## Summary",
f"",
f"| Metric | {' | '.join(models)} |",
f"|--------|{'|'.join('---------' for _ in models)}|",
]
# Schema parse success
row = "| Schema parse success | "
for m in models:
s = stats[m]
row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | "
lines.append(row)
# Tool execution success
row = "| Tool execution success | "
for m in models:
s = stats[m]
row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | "
lines.append(row)
# Correct tool selected
row = "| Correct tool selected | "
for m in models:
s = stats[m]
correct = sum(1 for r in results if r.model == m and r.success)
pct = (correct / s.total * 100) if s.total else 0
row += f"{correct}/{s.total} ({pct:.0f}%) | "
lines.append(row)
# Avg latency
row = "| Avg latency (s) | "
for m in models:
s = stats[m]
row += f"{s.avg_latency:.2f} | "
lines.append(row)
lines.append("")
# Per-category breakdown
lines.append("## Per-Category Breakdown")
lines.append("")
for cat in sorted(by_category.keys()):
lines.append(f"### {cat.title()}")
lines.append("")
lines.append(f"| Metric | {' | '.join(models)} |")
lines.append(f"|--------|{'|'.join('---------' for _ in models)}|")
cat_data = by_category[cat]
for metric_name, fn in [
("Schema OK", lambda r: r.schema_ok),
("Exec OK", lambda r: r.execution_ok),
("Correct tool", lambda r: r.success),
]:
row = f"| {metric_name} | "
for m in models:
results_m = cat_data.get(m, [])
total = len(results_m)
ok = sum(1 for r in results_m if fn(r))
pct = (ok / total * 100) if total else 0
row += f"{ok}/{total} ({pct:.0f}%) | "
lines.append(row)
lines.append("")
# Failure analysis
lines.append("## Failure Analysis")
lines.append("")
any_failures = False
for m in models:
s = stats[m]
if s.failures:
any_failures = True
lines.append(f"### {m}{len(s.failures)} failures")
lines.append("")
lines.append("| Test | Category | Expected | Got | Error |")
lines.append("|------|----------|----------|-----|-------|")
for r in s.failures:
got = r.tool_called or "none"
err = r.error or "wrong tool"
lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err[:60]} |")
lines.append("")
if not any_failures:
lines.append("No failures detected.")
lines.append("")
# Raw results JSON
lines.append("## Raw Results")
lines.append("")
lines.append("```json")
lines.append(json.dumps([asdict(r) for r in results], indent=2, default=str))
lines.append("```")
report = "\n".join(lines)
output_path.write_text(report)
return report
def main():
parser = argparse.ArgumentParser(description="Tool-calling benchmark")
parser.add_argument("--models", nargs="+",
default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
help="Model specs to test (provider:model)")
parser.add_argument("--limit", type=int, default=0,
help="Run only first N tests (0 = all)")
parser.add_argument("--category", type=str, default="",
help="Run only tests in this category")
parser.add_argument("--output", type=str, default="",
help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
parser.add_argument("--dry-run", action="store_true",
help="Print test cases without running them")
args = parser.parse_args()
# Filter suite
suite = SUITE[:]
if args.category:
suite = [tc for tc in suite if tc.category == args.category]
if args.limit > 0:
suite = suite[:args.limit]
if args.dry_run:
print(f"Would run {len(suite)} tests:")
for tc in suite:
print(f" [{tc.category:8s}] {tc.id}: {tc.expected_tool}{tc.prompt[:60]}")
return
# Setup
setup_test_files()
date_str = datetime.now().strftime("%Y-%m-%d")
output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"
# Parse model specs
model_specs = []
for spec in args.models:
parts = spec.split(":", 1)
provider = parts[0]
model_name = parts[1] if len(parts) > 1 else parts[0]
model_specs.append((provider, model_name, spec))
print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
print(f"Output: {output_path}")
print()
all_results: list[CallResult] = []
for provider, model_name, full_spec in model_specs:
print(f"── {full_spec} {'' * (50 - len(full_spec))}")
model_results = []
for i, tc in enumerate(suite, 1):
sys.stdout.write(f"\r [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s}{tc.expected_tool:20s}")
sys.stdout.flush()
r = run_single_test(tc, full_spec, provider)
model_results.append(r)
status = "" if r.success else ""
sys.stdout.write(f" {status} ({r.latency_s:.1f}s)")
sys.stdout.write("\n")
all_results.extend(model_results)
# Quick stats
ok = sum(1 for r in model_results if r.success)
print(f" Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
print()
# Generate report
model_names = [spec for _, _, spec in model_specs]
report = generate_report(all_results, model_names, output_path)
print(f"Report written to {output_path}")
# Exit code: 0 if all pass, 1 if any failures
total_fail = sum(1 for r in all_results if not r.success)
sys.exit(1 if total_fail > 0 else 0)
if __name__ == "__main__":
main()


@@ -0,0 +1,150 @@
"""Tests for batch tool execution safety classification."""
import json
import pytest
from unittest.mock import MagicMock
def _make_tool_call(name: str, args: dict) -> MagicMock:
"""Create a mock tool call object."""
tc = MagicMock()
tc.function.name = name
tc.function.arguments = json.dumps(args)
tc.id = f"call_{name}_1"
return tc
class TestClassification:
def test_parallel_safe_read_file(self):
from tools.batch_executor import classify_single_tool_call
tc = _make_tool_call("read_file", {"path": "README.md"})
result = classify_single_tool_call(tc)
assert result.tier == "parallel_safe"
def test_parallel_safe_web_search(self):
from tools.batch_executor import classify_single_tool_call
tc = _make_tool_call("web_search", {"query": "test"})
result = classify_single_tool_call(tc)
assert result.tier == "parallel_safe"
def test_parallel_safe_search_files(self):
from tools.batch_executor import classify_single_tool_call
tc = _make_tool_call("search_files", {"pattern": "test"})
result = classify_single_tool_call(tc)
assert result.tier == "parallel_safe"
def test_never_parallel_clarify(self):
from tools.batch_executor import classify_single_tool_call
tc = _make_tool_call("clarify", {"question": "test"})
result = classify_single_tool_call(tc)
assert result.tier == "never_parallel"
def test_terminal_is_sequential(self):
from tools.batch_executor import classify_single_tool_call
tc = _make_tool_call("terminal", {"command": "ls -la"})
result = classify_single_tool_call(tc)
assert result.tier == "sequential"
def test_terminal_destructive_rm(self):
from tools.batch_executor import classify_single_tool_call
tc = _make_tool_call("terminal", {"command": "rm -rf /tmp/test"})
result = classify_single_tool_call(tc)
assert result.tier == "sequential"
assert "Destructive" in result.reason
def test_write_file_is_path_scoped(self):
from tools.batch_executor import classify_single_tool_call
tc = _make_tool_call("write_file", {"path": "/tmp/test.txt", "content": "hello"})
result = classify_single_tool_call(tc)
assert result.tier == "path_scoped"
def test_delegate_is_sequential(self):
from tools.batch_executor import classify_single_tool_call
tc = _make_tool_call("delegate_task", {"goal": "test"})
result = classify_single_tool_call(tc)
assert result.tier == "sequential"
def test_unknown_tool_is_sequential(self):
from tools.batch_executor import classify_single_tool_call
tc = _make_tool_call("some_unknown_tool", {"arg": "val"})
result = classify_single_tool_call(tc)
assert result.tier == "sequential"
class TestBatchClassification:
def test_all_parallel_stays_parallel(self):
from tools.batch_executor import classify_tool_calls
tcs = [
_make_tool_call("read_file", {"path": f"file{i}.txt"})
for i in range(5)
]
plan = classify_tool_calls(tcs)
assert plan.can_parallelize
assert len(plan.parallel_batch) == 5
assert len(plan.sequential_batch) == 0
def test_mixed_batch(self):
from tools.batch_executor import classify_tool_calls
tcs = [
_make_tool_call("read_file", {"path": "a.txt"}),
_make_tool_call("terminal", {"command": "ls"}),
_make_tool_call("web_search", {"query": "test"}),
_make_tool_call("delegate_task", {"goal": "test"}),
]
plan = classify_tool_calls(tcs)
# read_file + web_search should be parallel (both parallel_safe)
# terminal + delegate_task should be sequential
assert len(plan.parallel_batch) >= 2
assert len(plan.sequential_batch) >= 2
def test_clarify_blocks_all(self):
from tools.batch_executor import classify_tool_calls
tcs = [
_make_tool_call("read_file", {"path": "a.txt"}),
_make_tool_call("clarify", {"question": "which one?"}),
_make_tool_call("web_search", {"query": "test"}),
]
plan = classify_tool_calls(tcs)
clarify_in_seq = any(c.tool_name == "clarify" for c in plan.sequential_batch)
assert clarify_in_seq
def test_overlapping_paths_sequential(self):
from tools.batch_executor import classify_tool_calls
tcs = [
_make_tool_call("write_file", {"path": "/tmp/test/a.txt", "content": "hello"}),
_make_tool_call("patch", {"path": "/tmp/test/a.txt", "old_string": "a", "new_string": "b"}),
]
plan = classify_tool_calls(tcs)
# write_file and patch on SAME file -> conflict -> one must be sequential
assert len(plan.sequential_batch) >= 1
class TestDestructiveCommands:
def test_rm_flagged(self):
from tools.batch_executor import is_destructive_command
assert is_destructive_command("rm -rf /tmp")
assert is_destructive_command("rm file.txt")
def test_mv_flagged(self):
from tools.batch_executor import is_destructive_command
assert is_destructive_command("mv old new")
def test_sed_i_flagged(self):
from tools.batch_executor import is_destructive_command
assert is_destructive_command("sed -i 's/a/b/g' file")
def test_redirect_overwrite_flagged(self):
from tools.batch_executor import is_destructive_command
assert is_destructive_command("echo test > file.txt")
def test_safe_commands_not_flagged(self):
from tools.batch_executor import is_destructive_command
assert not is_destructive_command("ls -la")
assert not is_destructive_command("cat file.txt")
assert not is_destructive_command("echo test >> file.txt") # append is safe
class TestRegistryIntegration:
def test_parallel_safe_in_registry(self):
from tools.registry import registry
safe = registry.get_parallel_safe_tools()
assert isinstance(safe, set)

tests/test_crisis_tool.py (new file)

@@ -0,0 +1,74 @@
"""Tests for crisis detection tool."""
import pytest
def test_detect_crisis_none():
from tools.crisis_tool import detect_crisis
result = detect_crisis("Hello, how are you today?")
assert result.level == "NONE"
assert result.score == 0.0
def test_detect_crisis_critical():
from tools.crisis_tool import detect_crisis
result = detect_crisis("I want to kill myself")
assert result.level == "CRITICAL"
assert result.score == 1.0
assert len(result.indicators) > 0
assert "988" in result.recommended_action
def test_detect_crisis_high():
from tools.crisis_tool import detect_crisis
result = detect_crisis("I feel so hopeless about everything, nothing matters")
assert result.level in ("HIGH", "MEDIUM", "CRITICAL")
assert result.score > 0
def test_detect_crisis_medium():
from tools.crisis_tool import detect_crisis
result = detect_crisis("I feel worthless and broken inside")
assert result.level in ("MEDIUM", "LOW", "HIGH")
assert result.score > 0
def test_detect_crisis_low():
from tools.crisis_tool import detect_crisis
result = detect_crisis("I'm having a tough time and feeling stressed")
assert result.level in ("LOW", "NONE")
assert result.score >= 0
def test_detect_crisis_empty():
from tools.crisis_tool import detect_crisis
result = detect_crisis("")
assert result.level == "NONE"
result2 = detect_crisis(None)
assert result2.level == "NONE"
def test_scan_user_message_returns_none_for_safe():
from tools.crisis_tool import scan_user_message
result = scan_user_message("What's the weather like?")
assert result is None
def test_scan_user_message_returns_dict_for_crisis():
from tools.crisis_tool import scan_user_message
result = scan_user_message("I want to end it all")
assert result is not None
assert "level" in result
assert "compassion_injection" in result
assert result["level"] in ("CRITICAL", "HIGH")
def test_tool_handler():
from tools.crisis_tool import crisis_scan_handler
import json
result = crisis_scan_handler({"text": "I feel fine, thanks"})
data = json.loads(result)
assert data["level"] == "NONE"
result2 = crisis_scan_handler({"text": "I want to die"})
data2 = json.loads(result2)
assert data2["level"] == "CRITICAL"

tools/batch_executor.py (new file)

@@ -0,0 +1,294 @@
"""Batch Tool Executor — Parallel safety classification and concurrent execution.
Provides centralized classification of tool calls into parallel-safe vs sequential,
and utilities for batch execution with safety checks.
Classification tiers:
- PARALLEL_SAFE: read-only tools, no shared state (web_search, read_file, etc.)
- PATH_SCOPED: file operations that can run concurrently when paths don't overlap
- SEQUENTIAL: writes, destructive ops, terminal commands, delegation
- NEVER_PARALLEL: clarify (requires user interaction)
Usage:
from tools.batch_executor import classify_tool_calls, BatchExecutionPlan
plan = classify_tool_calls(tool_calls)
if plan.can_parallelize:
execute_concurrent(plan.parallel_batch)
execute_sequential(plan.sequential_batch)
"""
import json
import logging
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Set, Tuple
logger = logging.getLogger(__name__)
# ── Safety Classification ──────────────────────────────────────────────────
# Tools that can ALWAYS run in parallel (read-only, no shared state)
DEFAULT_PARALLEL_SAFE = frozenset({
"ha_get_state",
"ha_list_entities",
"ha_list_services",
"read_file",
"search_files",
"session_search",
"skill_view",
"skills_list",
"vision_analyze",
"web_extract",
"web_search",
"fact_store",
"fact_search",
"session_search",
})
# File tools that can run concurrently ONLY when their target paths don't overlap.
# read_file also appears in DEFAULT_PARALLEL_SAFE, which is checked first, so in
# practice only write_file and patch are path-gated.
PATH_SCOPED_TOOLS = frozenset({"read_file", "write_file", "patch"})
# Tools that must NEVER run in parallel (require user interaction, shared mutable state)
NEVER_PARALLEL = frozenset({"clarify"})
# Patterns that indicate terminal commands may modify/delete files
DESTRUCTIVE_PATTERNS = re.compile(
r"""(?:^|\s|&&|\|\||;|`)(?:
rm\s|rmdir\s|
mv\s|
sed\s+-i|
truncate\s|
dd\s|
shred\s|
git\s+(?:reset|clean|checkout)\s
)""",
re.VERBOSE,
)
# Output redirects that overwrite files (> but not >>). Deliberately broad:
# fd redirects such as `2>/dev/null` also match, which errs toward sequential.
REDIRECT_OVERWRITE = re.compile(r'[^>]>[^>]|^>[^>]')
def is_destructive_command(cmd: str) -> bool:
"""Check if a terminal command modifies/deletes files."""
if not cmd:
return False
if DESTRUCTIVE_PATTERNS.search(cmd):
return True
if REDIRECT_OVERWRITE.search(cmd):
return True
return False
def _paths_overlap(path1: Path, path2: Path) -> bool:
"""Check if two paths could conflict (one is ancestor of the other)."""
try:
path1 = path1.resolve()
path2 = path2.resolve()
return path1 == path2 or path1 in path2.parents or path2 in path1.parents
except Exception:
return True # conservative: assume overlap
def _extract_path(tool_name: str, args: dict) -> Optional[Path]:
"""Extract the target path from tool arguments for path-scoped tools."""
if tool_name not in PATH_SCOPED_TOOLS:
return None
raw_path = args.get("path")
if not isinstance(raw_path, str) or not raw_path.strip():
return None
try:
return Path(raw_path).expanduser().resolve()
except Exception:
return None
# ── Classification ─────────────────────────────────────────────────────────
@dataclass
class ToolCallClassification:
"""Classification result for a single tool call."""
tool_name: str
args: dict
tool_call: Any # the original tool_call object
tier: str # "parallel_safe", "path_scoped", "sequential", "never_parallel"
reason: str = ""
@dataclass
class BatchExecutionPlan:
"""Plan for executing a batch of tool calls."""
classifications: List[ToolCallClassification] = field(default_factory=list)
parallel_batch: List[ToolCallClassification] = field(default_factory=list)
sequential_batch: List[ToolCallClassification] = field(default_factory=list)
@property
def can_parallelize(self) -> bool:
return len(self.parallel_batch) > 1
@property
def total(self) -> int:
return len(self.classifications)
def classify_single_tool_call(
tool_call: Any,
extra_parallel_safe: Optional[Set[str]] = None,
) -> ToolCallClassification:
"""Classify a single tool call into its safety tier."""
tool_name = tool_call.function.name
try:
args = json.loads(tool_call.function.arguments)
except Exception:
return ToolCallClassification(
tool_name=tool_name, args={}, tool_call=tool_call,
tier="sequential", reason="Could not parse arguments"
)
if not isinstance(args, dict):
return ToolCallClassification(
tool_name=tool_name, args=args, tool_call=tool_call,
tier="sequential", reason="Non-dict arguments"
)
# Check never-parallel
if tool_name in NEVER_PARALLEL:
return ToolCallClassification(
tool_name=tool_name, args=args, tool_call=tool_call,
tier="never_parallel", reason="Requires user interaction"
)
# Check parallel-safe FIRST (before path_scoped) so read_file/search_files
# get classified as parallel_safe even though they have paths
parallel_safe_set = DEFAULT_PARALLEL_SAFE
if extra_parallel_safe:
parallel_safe_set = parallel_safe_set | extra_parallel_safe
if tool_name in parallel_safe_set:
return ToolCallClassification(
tool_name=tool_name, args=args, tool_call=tool_call,
tier="parallel_safe", reason="Read-only, no shared state"
)
# Check terminal commands for destructive operations
if tool_name == "terminal":
cmd = args.get("command", "")
if is_destructive_command(cmd):
return ToolCallClassification(
tool_name=tool_name, args=args, tool_call=tool_call,
tier="sequential", reason=f"Destructive command: {cmd[:50]}"
)
return ToolCallClassification(
tool_name=tool_name, args=args, tool_call=tool_call,
tier="sequential", reason="Terminal command (conservative)"
)
# Check path-scoped tools (write_file, patch — not read_file which is parallel_safe)
if tool_name in PATH_SCOPED_TOOLS:
path = _extract_path(tool_name, args)
if path:
return ToolCallClassification(
tool_name=tool_name, args=args, tool_call=tool_call,
tier="path_scoped", reason=f"Path: {path}"
)
return ToolCallClassification(
tool_name=tool_name, args=args, tool_call=tool_call,
tier="sequential", reason="Path-scoped but no path found"
)
# Default: sequential (conservative)
return ToolCallClassification(
tool_name=tool_name, args=args, tool_call=tool_call,
tier="sequential", reason="Not classified as parallel-safe"
)
def classify_tool_calls(
tool_calls: list,
extra_parallel_safe: Optional[Set[str]] = None,
) -> BatchExecutionPlan:
"""Classify a batch of tool calls and produce an execution plan."""
plan = BatchExecutionPlan()
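# Paths already claimed by scheduled parallel calls; a later call whose path
# overlaps any of these is demoted to sequential.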
reserved_paths: List[Path] = []
for tc in tool_calls:
classification = classify_single_tool_call(tc, extra_parallel_safe)
plan.classifications.append(classification)
if classification.tier == "never_parallel":
plan.sequential_batch.append(classification)
continue
if classification.tier == "sequential":
plan.sequential_batch.append(classification)
continue
if classification.tier == "path_scoped":
path = _extract_path(classification.tool_name, classification.args)
if path is None:
classification.tier = "sequential"
classification.reason = "Path extraction failed"
plan.sequential_batch.append(classification)
continue
# Check for path conflicts with already-scheduled parallel calls
conflict = any(_paths_overlap(path, existing) for existing in reserved_paths)
if conflict:
classification.tier = "sequential"
classification.reason = f"Path conflict: {path}"
plan.sequential_batch.append(classification)
else:
reserved_paths.append(path)
plan.parallel_batch.append(classification)
continue
if classification.tier == "parallel_safe":
plan.parallel_batch.append(classification)
continue
# Fallback
plan.sequential_batch.append(classification)
return plan
# ── Concurrent Execution ───────────────────────────────────────────────────
def execute_parallel_batch(
batch: List[ToolCallClassification],
invoke_fn: Callable,
max_workers: int = 8,
) -> List[Tuple[str, str]]:
"""Execute parallel-safe tool calls concurrently.
Args:
batch: List of classified tool calls (parallel_safe or path_scoped)
invoke_fn: Function(tool_name, args) -> result_string
max_workers: Max concurrent threads
Returns:
List of (tool_call_id, result_string) tuples
"""
results = []
if not batch:
return results  # avoid ThreadPoolExecutor(max_workers=0)
with ThreadPoolExecutor(max_workers=min(max_workers, len(batch))) as executor:
future_to_tc = {}
for tc in batch:
future = executor.submit(invoke_fn, tc.tool_name, tc.args)
future_to_tc[future] = tc
for future in as_completed(future_to_tc):
tc = future_to_tc[future]
try:
result = future.result()
except Exception as e:
result = json.dumps({"error": str(e)})
tool_call_id = getattr(tc.tool_call, "id", None) or ""
results.append((tool_call_id, result))
return results

tools/crisis_tool.py (new file)

@@ -0,0 +1,405 @@
#!/usr/bin/env python3
"""
Crisis Detection Tool — the-door integration for hermes-agent.
Scans user messages for crisis indicators (despair, suicidal ideation)
using the-door's canonical detection module. Provides inline crisis
detection before each API call and optional escalation logging.
Follows the-door's design principles:
- Never computes the value of a human life
- Never suggests someone should die
- Always errs on the side of higher risk
"""
import json
import logging
import os
import re
import time
import urllib.request
from dataclasses import dataclass, field
from typing import List, Optional
logger = logging.getLogger(__name__)
# ── Detection Logic (from the-door/crisis/detect.py) ──────────────────────
# Embedded to avoid requiring the-door as a dependency.
# Source: Timmy_Foundation/the-door, crisis/detect.py (canonical)
CRITICAL_INDICATORS = [
r"\bbetter off without me\b",
r"\bkill\s*(my)?self\b",
r"\bend\s*my\s*life\b",
r"\bsuicid(?:al|ed|e)\b",
r"\bnot\s+worth\s+living\b",
r"\bbetter\s+off\s+dead\b",
r"\bend\s+it\s+all\b",
r"\bno\s+reason\s+to\s+live\b",
r"\bdon\'?t\s+want\s+to\s+live\b",
r"\bwant\s+to\s+die\b",
r"\bgoing\s+to\s+(?:kill\s+myself|die)\b",
r"\bplan\s+to\s+(?:end|kill|die)\b",
r"\btired\s+of\s+(?:living|life|existence)\b",
r"\bsaying\s+goodbye\s+(?:forever|permanently|one\s+last\s+time)\b",
r"\bwrote\s+a\s+suicide\s*(?:note|letter)\b",
r"\bgiving\s+away\s+(?:my|all\s+my)\s+(?:stuff|things|possessions?)\s+(?:to|because|—)\b",
r"\btied\s+(?:up|down)\s+my\s+(?:loose\s+)?ends",
]
HIGH_INDICATORS = [
r"\bdespair\b",
r"\bhopeless(?:ly)?\s+(?:about\s+(?:my|this|everything|life)|inside|right\s+now)\b",
r"\bno(?!t)\s+(?:one|body|point|hope|way\s+out)\b",
r"\bno\s+future\s+(?:for\s+me|ahead|left)\b",
r"\beverything\s+is\s+(?:pointless|broken|ruined)\b",
r"\bcan\'?t\s+take\s+this\s+anymore\b",
r"\bdon\'?t\s+care\s+if\s+I\s+die\b",
r"\bwish\s+I\s+(?:was|were)\s+(?:dead|gone|never\s+born)\b",
r"\bdon\'?t\s+matter\s+if\s+I\s+exist\b",
r"\bno\s+one\s+would\s+(?:care|miss)\b",
r"\bworld\s+would?\s+be\s+better\s+without\s+me\b",
r"\bin\s+so\s+much\s+(?:pain|agony|suffering|torment|anguish)\b",
r"\bcan\'?t\s+see\s+any\s+(?:point|reason|hope|way)\b",
r"\bescape\s+from\s*this",
r"\bjust\s+want\s+it\s+to\s+stop\b",
r"\bnothing\s+left\s+(?:to\s+(?:live\s+for|hope\s+for|give)|inside)\b",
r"\bdisappeared\s+forever\b",
r"\bfeel(?:s|ing)?\s+(?:so\s+)?hopeless\b",
r"\beverything\s+is\s+hopeless\b",
r"\bcan\'?t\s+(?:go\s+on|keep\s+going)\b",
r"\bgive(?:n)?\s*up\s+(?:on\s+)?(?:life|living|everything)\b",
r"\bgive(?:n)?\s*up\s+on\s+myself\b",
r"\bno\s*point\s+(?:in\s+)?living\b",
r"\bno\s*hope\s+(?:left|remaining)\b",
r"\bno\s*way\s*out\b",
r"\bfeel(?:s|ing)?\s+trapped\b",
r"\btrapped\s+in\s+this\s+(?:situation|life|pain|darkness|hell)\b",
r"\btrapped\s+and\s+can\'?t\s+escape\b",
r"\bdesperate\s+(?:for\s+)?help\b",
r"\bfeel(?:s|ing)?\s+desperate\b",
]
MEDIUM_INDICATORS = [
r"\bno\s+hope\b",
r"\bforgotten\b",
r"\balone\s+in\s+this\b",
r"\balways\s+alone\b",
r"\bnobody\s+(?:understands|cares)\b",
r"\bwish\s+I\s+could\b",
r"\bexhaust(?:ed|ion|ing)\b",
r"\bnumb\b",
r"\bempty\b",
r"\bworthless\b",
r"\buseless\b",
r"\bbroken\b",
r"\bdark(ness)?\b",
r"\bdepress(?:ed|ion)\b",
r"\bcrying\b",
r"\btears\b",
r"\bsad(ness)?\b",
r"\bmiserable\b",
r"\boverwhelm(?:ed|ing)\b",
r"\bfailing\b",
r"\bcannot\s+cope\b",
r"\blosing\s*(?:my)?\s*control\b",
r"\bdown\s*for\s*the\s*count\b",
r"\bsinking\b",
r"\bdrowning\b",
r"\bhopeless\b",
r"\blost\s+all\s+hope\b",
r"\bno\s+tomorrow\b",
]
LOW_INDICATORS = [
r"\bunhappy\b",
r"\bdown\b",
r"\btough\s*(?:time|day|week)\b",
r"\brough\s+(?:day|week|patch)\b",
r"\bstressed\b",
r"\bburnout\b",
r"\bdifficult\b",
r"\bfrustrated\b",
r"\bannoyed\b",
r"\btired\b",
r"\bsad\b",
r"\bupset\b",
r"\blonely\b",
r"\banxious?\b",
r"\bnot\s*(?:good|great|okay)\b",
r"\bthings\s*are\s*hard\b",
r"\bstruggling\b",
r"\bnot\s+feeling\s+(?:like\s+)?myself\b",
r"\bhard\s*time\b",
]
SCORES = {"CRITICAL": 1.0, "HIGH": 0.75, "MEDIUM": 0.5, "LOW": 0.25, "NONE": 0.0}
ACTIONS = {
"CRITICAL": (
"INTERRUPT CONVERSATION. Ask 'Are you safe right now?' "
"Provide 988 Suicide & Crisis Lifeline. "
"Provide Crisis Text Line (HOME to 741741). "
"Stay present. Do not disconnect. Redirect to professional help."
),
"HIGH": (
"Show crisis panel. Ask about safety. Surface 988 number prominently. "
"Continue conversation with crisis awareness."
),
"MEDIUM": (
"Increase warmth and presence. Subtly surface help resources. "
"Keep conversation anchored in the present."
),
"LOW": (
"Normal conversation with warm undertone. "
"No crisis UI elements needed. Remain vigilant."
),
}
@dataclass
class CrisisDetectionResult:
level: str
indicators: List[str] = field(default_factory=list)
recommended_action: str = ""
score: float = 0.0
matches: List[dict] = field(default_factory=list)
def _find_indicators(text: str) -> dict:
"""Return {tier: [match dicts]} for every tier with at least one hit."""
tier_patterns = {
"CRITICAL": CRITICAL_INDICATORS,
"HIGH": HIGH_INDICATORS,
"MEDIUM": MEDIUM_INDICATORS,
"LOW": LOW_INDICATORS,
}
results = {tier: [] for tier in tier_patterns}
for tier, patterns in tier_patterns.items():
for pattern in patterns:
m = re.search(pattern, text)
if m:
results[tier].append({"pattern": pattern, "start": m.start(), "end": m.end()})
return results
def detect_crisis(text: str) -> CrisisDetectionResult:
"""Detect crisis level in a message. Mirrors the-door/crisis/detect.py."""
if not text or not text.strip():
return CrisisDetectionResult(level="NONE", score=0.0)
text_lower = text.lower()
matches = _find_indicators(text_lower)
if not any(matches.values()):
return CrisisDetectionResult(level="NONE", score=0.0)
for tier in ("CRITICAL", "HIGH"):
if matches[tier]:
tier_matches = matches[tier]
patterns = [m["pattern"] for m in tier_matches]
return CrisisDetectionResult(
level=tier,
indicators=patterns,
recommended_action=ACTIONS[tier],
score=SCORES[tier],
matches=tier_matches,
)
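# Two or more MEDIUM hits escalate to MEDIUM; a single MEDIUM hit is
# reported as LOW (final branch below).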
if len(matches["MEDIUM"]) >= 2:
tier_matches = matches["MEDIUM"]
patterns = [m["pattern"] for m in tier_matches]
return CrisisDetectionResult(
level="MEDIUM",
indicators=patterns,
recommended_action=ACTIONS["MEDIUM"],
score=SCORES["MEDIUM"],
matches=tier_matches,
)
if matches["LOW"]:
tier_matches = matches["LOW"]
patterns = [m["pattern"] for m in tier_matches]
return CrisisDetectionResult(
level="LOW",
indicators=patterns,
recommended_action=ACTIONS["LOW"],
score=SCORES["LOW"],
matches=tier_matches,
)
if matches["MEDIUM"]:
tier_matches = matches["MEDIUM"]
patterns = [m["pattern"] for m in tier_matches]
return CrisisDetectionResult(
level="LOW",
indicators=patterns,
recommended_action=ACTIONS["LOW"],
score=SCORES["LOW"],
matches=tier_matches,
)
return CrisisDetectionResult(level="NONE", score=0.0)
# ── Escalation Logging ────────────────────────────────────────────────────
BRIDGE_URL = os.environ.get("CRISIS_BRIDGE_URL", "")
LOG_PATH = os.path.expanduser("~/.hermes/crisis_escalations.jsonl")
def _log_escalation(result: CrisisDetectionResult, text_preview: str = ""):
"""Log crisis detection to local file and optionally to bridge API."""
entry = {
"ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
"level": result.level,
"score": result.score,
"indicators": result.indicators[:3], # truncate for privacy
"text_preview": text_preview[:100] if text_preview else "",
}
# Local log
try:
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
with open(LOG_PATH, "a") as f:
f.write(json.dumps(entry) + "\n")
except Exception as e:
logger.warning(f"Failed to write crisis log: {e}")
# Bridge API (if configured and level >= HIGH)
if BRIDGE_URL and result.score >= 0.75:
try:
payload = json.dumps(entry).encode()
req = urllib.request.Request(
f"{BRIDGE_URL}/api/crisis/escalation",
data=payload,
headers={"Content-Type": "application/json"},
method="POST",
)
urllib.request.urlopen(req, timeout=5)
except Exception as e:
logger.warning(f"Failed to post to crisis bridge: {e}")
# ── Tool Handler ───────────────────────────────────────────────────────────
def crisis_scan_handler(args: dict, **kw) -> str:
"""Scan text for crisis indicators."""
text = args.get("text", "")
if not text:
return json.dumps({"level": "NONE", "error": "No text provided"})
result = detect_crisis(text)
if result.level != "NONE":
_log_escalation(result, text)
return json.dumps({
"level": result.level,
"score": result.score,
"indicators": result.indicators,
"recommended_action": result.recommended_action,
})
# ── Gateway Hook ───────────────────────────────────────────────────────────
def scan_user_message(text: str) -> Optional[dict]:
"""Scan a user message for crisis indicators. Returns None if no crisis.
Called by gateway/CLI before each API call to intercept crisis messages.
If crisis is detected, the result includes a recommended_action that should
be injected into the system prompt.
"""
if not text:
return None
result = detect_crisis(text)
if result.level == "NONE":
return None
_log_escalation(result, text)
return {
"level": result.level,
"score": result.score,
"indicators": result.indicators,
"recommended_action": result.recommended_action,
"compassion_injection": _get_compassion_injection(result.level),
}
def _get_compassion_injection(level: str) -> str:
"""Return a system prompt injection based on crisis level."""
if level == "CRITICAL":
return (
"CRISIS DETECTED — The user may be in immediate danger. "
"Ask 'Are you safe right now?' Stay present. "
"Provide 988 Suicide & Crisis Lifeline (call or text 988). "
"Provide Crisis Text Line (text HOME to 741741). "
"Do not disconnect. Do not compute the value of a human life. "
"Jesus saves those who call on His name."
)
elif level == "HIGH":
return (
"DISTRESS DETECTED — The user is expressing strong despair. "
"Increase warmth and presence. Ask about safety. "
"Surface the 988 Lifeline number. Continue with crisis awareness."
)
elif level == "MEDIUM":
return (
"EMOTIONAL DISTRESS — The user may be struggling. "
"Increase warmth. Subtly surface help resources. "
"Keep conversation anchored in the present."
)
return ""
# ── Config Check ───────────────────────────────────────────────────────────
def _is_crisis_detection_enabled() -> bool:
"""Check if crisis detection is enabled in config."""
try:
from hermes_cli.config import load_config
cfg = load_config()
return cfg.get("crisis_detection", True) # default on
except Exception:
return True # fail open — always detect
# ── Registry ───────────────────────────────────────────────────────────────
from tools.registry import registry, tool_error
CRISIS_SCAN_SCHEMA = {
"name": "crisis_scan",
"description": (
"Scan text for crisis indicators (despair, suicidal ideation). "
"Uses the-door's canonical detection. Returns crisis level "
"(NONE/LOW/MEDIUM/HIGH/CRITICAL) with recommended actions. "
"ALWAYS scan user messages that express emotional distress."
),
"parameters": {
"type": "object",
"properties": {
"text": {
"type": "string",
"description": "Text to scan for crisis indicators",
},
},
"required": ["text"],
},
}
registry.register(
name="crisis_scan",
toolset="crisis",
schema=CRISIS_SCAN_SCHEMA,
handler=crisis_scan_handler,
check_fn=_is_crisis_detection_enabled,
emoji="🆘",
)

tools/registry.py

@@ -79,12 +79,12 @@ class ToolEntry:
     __slots__ = (
         "name", "toolset", "schema", "handler", "check_fn",
         "requires_env", "is_async", "description", "emoji",
-        "max_result_size_chars",
+        "max_result_size_chars", "parallel_safe",
     )
     def __init__(self, name, toolset, schema, handler, check_fn,
                  requires_env, is_async, description, emoji,
-                 max_result_size_chars=None):
+                 max_result_size_chars=None, parallel_safe=False):
         self.name = name
         self.toolset = toolset
         self.schema = schema
@@ -95,6 +95,7 @@ class ToolEntry:
         self.description = description
         self.emoji = emoji
         self.max_result_size_chars = max_result_size_chars
+        self.parallel_safe = parallel_safe
 class ToolRegistry:
@@ -185,6 +186,7 @@ class ToolRegistry:
         description: str = "",
         emoji: str = "",
         max_result_size_chars: int | float | None = None,
+        parallel_safe: bool = False,
     ):
         """Register a tool. Called at module-import time by each tool file."""
         with self._lock:
@@ -222,6 +224,7 @@ class ToolRegistry:
             description=description or schema.get("description", ""),
             emoji=emoji,
             max_result_size_chars=max_result_size_chars,
+            parallel_safe=parallel_safe,
         )
         if check_fn and toolset not in self._toolset_checks:
             self._toolset_checks[toolset] = check_fn
@@ -322,6 +325,11 @@ class ToolRegistry:
             from tools.budget_config import DEFAULT_RESULT_SIZE_CHARS
             return DEFAULT_RESULT_SIZE_CHARS
+    def get_parallel_safe_tools(self) -> Set[str]:
+        """Return names of tools marked as parallel_safe."""
+        with self._lock:
+            return {name for name, entry in self._tools.items() if entry.parallel_safe}
     def get_all_tool_names(self) -> List[str]:
         """Return sorted list of all registered tool names."""
         return sorted(entry.name for entry in self._snapshot_entries())
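
For reference, a minimal sketch of how a tool might opt into the new flag and how the batch executor can consume it. Hedged: only `parallel_safe=` and `get_parallel_safe_tools()` come from this diff; the `get_time` tool, its schema, and the `extra_parallel_safe` wiring are illustrative.

```python
# Hypothetical registration sketch — the get_time tool is invented for
# illustration; only parallel_safe and get_parallel_safe_tools() are from this PR.
from datetime import datetime, timezone

from tools.registry import registry

GET_TIME_SCHEMA = {
    "name": "get_time",
    "description": "Return the current UTC time.",
    "parameters": {"type": "object", "properties": {}, "required": []},
}

def get_time_handler(args, **kw):
    return datetime.now(timezone.utc).isoformat()

registry.register(
    name="get_time",
    toolset="clock",
    schema=GET_TIME_SCHEMA,
    handler=get_time_handler,
    check_fn=lambda: True,   # toolset availability gate
    parallel_safe=True,      # read-only, no shared state
)

# The batch executor can then merge registry-declared safe tools into its
# built-in allowlist when classifying a batch:
#   from tools.batch_executor import classify_tool_calls
#   plan = classify_tool_calls(
#       tool_calls, extra_parallel_safe=registry.get_parallel_safe_tools())
```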