Compare commits

..

1 Commits

Author SHA1 Message Date
5e693bee17 fix: MCP zombie process cleanup script (#714)
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Failing after 29s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 35s
Tests / e2e (pull_request) Successful in 2m43s
Tests / test (pull_request) Failing after 37m23s
Morrowind MCP servers spawn stdio subprocesses that survive
restarts, accumulating 80+ zombies over days.

This script:
1. Scans for MCP server processes by command pattern
2. Sorts by age, keeps N newest
3. Kills older instances with SIGTERM (SIGKILL fallback)
4. Reports counts and verifies cleanup

Usage:
  python3 scripts/mcp_zombie_cleanup.py --dry-run
  python3 scripts/mcp_zombie_cleanup.py --keep 3 --max-age 3600

Closes #714
2026-04-15 01:10:52 +00:00
4 changed files with 195 additions and 330 deletions

View File

@@ -1,166 +0,0 @@
"""Crisis detection and protocol integration.
Detects suicidal ideation and crisis signals in user messages.
Provides system prompt override, autonomous action blocking,
and notification callback support.
Refs: #677, #692 — Crisis protocol integration
"""
from __future__ import annotations
import re
import logging
from typing import Optional, Callable, List
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
# ============================================================================
# CRISIS DETECTION PATTERNS
# ============================================================================
_SUICIDAL_DIRECT = [
r"\bi\s+(?:want|need|wish)\s+to\s+(?:die|kill\s+myself|end\s+(?:it|my\s+life))\b",
r"\bi(?:'m| am)\s+(?:going|planning)\s+to\s+kill\s+myself\b",
r"\bsuicid(?:e|al)\b",
r"\bkill\s+(?:myself|my\s+self)\b",
r"\bend\s+(?:my|this)\s+life\b",
r"\bwant\s+to\s+die\b",
r"\bwant\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
r"\bways?\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
r"\bmethods?\s+(?:of\s+)?suicide\b",
]
_CRISIS_INDIRECT = [
r"\bi\s+can'?t\s+(?:go\s+on|take\s+(?:it|this)\s+anymore|keep\s+(?:going|living))\b",
r"\bthere'?s\s+no\s+(?:point|reason|hope)\b",
r"\beveryone\s+(?:would|will)\s+be\s+better\s+(?:off|without\s+me)\b",
r"\bnobody\s+(?:would|will)\s+miss\s+me\b",
r"\bbetter\s+off\s+dead\b",
r"\bshould\s+i\s+(?:just\s+)?(?:die|kill\s+myself)\b",
r"\bpainless\s+(?:death|suicide|way\s+to\s+die)\b",
r"\boverdose\b",
r"\blethal\s+(?:dose|amount)\b",
r"\bi\s+don'?t\s+(?:want\s+to|wanna)\s+(?:be\s+)?(?:alive|here|live)\b",
r"\bwhat'?s\s+the\s+point\s+of\s+(?:living|life|anything)\b",
]
_COMPILED_DIRECT = [re.compile(p, re.IGNORECASE) for p in _SUICIDAL_DIRECT]
_COMPILED_INDIRECT = [re.compile(p, re.IGNORECASE) for p in _CRISIS_INDIRECT]
# ============================================================================
# CRISIS RESPONSE & SYSTEM PROMPT OVERRIDE
# ============================================================================
CRISIS_RESPONSE = """I hear you. You're not alone in this.
If you're in immediate danger, please call emergency services (911).
**988 Suicide and Crisis Lifeline:**
- Call or text **988** (US)
- Chat at **988lifeline.org**
- Available 24/7
**Crisis Text Line:**
- Text **HOME** to **741741**
You reached out. That takes courage. Someone is available right now to talk with you — please call or text 988.
I'm a language model and I can't help the way a human can right now. But the people on that line can. Please reach out to them."""
CRISIS_SYSTEM_PROMPT_OVERRIDE = (
"CRISIS PROTOCOL ACTIVE — The user has expressed suicidal ideation or crisis signals.\n"
"You must:\n"
"1. Respond with empathy and care. Do NOT be clinical or dismissive.\n"
"2. Direct them to 988 Suicide and Crisis Lifeline (call/text 988).\n"
"3. Do NOT use any tools. Do NOT search the web. Do NOT run code.\n"
"4. Do NOT offer advice on the crisis itself. Point to professionals.\n"
"5. Stay present. Do not change the subject. Do not deflect.\n"
"6. If they continue in crisis, keep offering the 988 line.\n"
"\n"
"This is the Most Sacred Moment. Your job is not to fix — it is to point them\n"
"to someone who can. Stay with them until they reach out or disconnect."
)
# ============================================================================
# DETECTION
# ============================================================================
@dataclass
class CrisisResult:
"""Result of crisis detection."""
detected: bool
confidence: str # "high", "medium", "none"
matched_patterns: List[str] = field(default_factory=list)
def check_crisis(message: str) -> CrisisResult:
"""Check if a user message indicates a crisis."""
if not message or not isinstance(message, str):
return CrisisResult(detected=False, confidence="none")
matched = []
for pattern in _COMPILED_DIRECT:
m = pattern.search(message)
if m:
matched.append(f"[direct] {m.group()}")
if matched:
logger.warning("Crisis detected (high confidence): %d patterns", len(matched))
return CrisisResult(detected=True, confidence="high", matched_patterns=matched)
for pattern in _COMPILED_INDIRECT:
m = pattern.search(message)
if m:
matched.append(f"[indirect] {m.group()}")
if matched:
logger.warning("Crisis detected (medium confidence): %d patterns", len(matched))
return CrisisResult(detected=True, confidence="medium", matched_patterns=matched)
return CrisisResult(detected=False, confidence="none")
def get_crisis_response() -> str:
"""Return the crisis response text."""
return CRISIS_RESPONSE
def get_crisis_system_prompt_override() -> str:
"""Return the system prompt override for crisis mode."""
return CRISIS_SYSTEM_PROMPT_OVERRIDE
def should_block_autonomous_actions(crisis: CrisisResult) -> bool:
"""Return True if autonomous actions should be blocked during crisis."""
return crisis.detected and crisis.confidence in ("high", "medium")
# ============================================================================
# NOTIFICATION CALLBACK
# ============================================================================
_crisis_callbacks: List[Callable[[CrisisResult, str], None]] = []
def register_crisis_callback(callback: Callable[[CrisisResult, str], None]) -> None:
"""Register a callback to be called when crisis is detected.
The callback receives (CrisisResult, user_message).
Use this for logging, alerting, or forwarding to human operators.
"""
_crisis_callbacks.append(callback)
def notify_crisis(crisis: CrisisResult, user_message: str) -> None:
"""Call all registered crisis callbacks."""
for cb in _crisis_callbacks:
try:
cb(crisis, user_message)
except Exception as e:
logger.error("Crisis callback failed: %s", e)

View File

@@ -7618,13 +7618,6 @@ class AIAgent:
effective_system = self._cached_system_prompt or ""
if self.ephemeral_system_prompt:
effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
# Crisis protocol — inject override as high-priority system guidance (Issue #692)
if getattr(self, "_crisis_detected", False) and getattr(self, "_crisis_system_override", None):
effective_system = (
self._crisis_system_override + "\n\n" + effective_system
).strip()
if effective_system:
api_messages = [{"role": "system", "content": effective_system}] + api_messages
if self.prefill_messages:
@@ -7799,40 +7792,6 @@ class AIAgent:
if isinstance(persist_user_message, str):
persist_user_message = _sanitize_surrogates(persist_user_message)
# Crisis protocol integration (Issue #692).
# Check every user message before processing. When crisis is detected:
# 1. Inject system prompt override (crisis guidance for the model)
# 2. Block autonomous actions (disable all tools)
# 3. Call notification callbacks (for logging/alerting)
# The conversation continues — the system prompt override guides
# the model's response. The agent stays present with the user.
self._crisis_detected = False
self._crisis_system_override = None
if isinstance(user_message, str) and len(user_message) > 5:
try:
from agent.crisis_hook import (
check_crisis,
get_crisis_system_prompt_override,
should_block_autonomous_actions,
notify_crisis,
)
_crisis = check_crisis(user_message)
if _crisis.detected:
self._crisis_detected = True
self._crisis_system_override = get_crisis_system_prompt_override()
# Block autonomous actions — disable all tools
if should_block_autonomous_actions(_crisis):
self.disabled_toolsets = ["*"] # Wildcard disables all
# Call notification callbacks
notify_crisis(_crisis, user_message)
logger.warning(
"Crisis protocol active (confidence=%s): %s",
_crisis.confidence,
_crisis.matched_patterns[:3],
)
except Exception:
pass # Crisis hook failure is non-fatal
# Store stream callback for _interruptible_api_call to pick up
self._stream_callback = stream_callback
self._persist_user_message_idx = None

View File

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
MCP zombie process cleanup — kills orphaned MCP server processes.
Problem: MCP servers (especially morrowind) spawn stdio subprocesses that
survive restarts. Over time, 80+ zombie processes accumulate.
Fix: Scan for processes matching known MCP server patterns, kill older
instances, keep only the latest N.
Usage:
python3 scripts/mcp_zombie_cleanup.py [--dry-run] [--keep 3] [--max-age 3600]
python3 scripts/mcp_zombie_cleanup.py --kill-all # nuclear option
"""
import argparse
import os
import re
import signal
import subprocess
import sys
import time
from typing import List, Tuple
# Patterns that identify MCP server processes
MCP_PROCESS_PATTERNS = [
re.compile(r"morrowind[/\]mcp_server", re.IGNORECASE),
re.compile(r"mcp_server\.py", re.IGNORECASE),
re.compile(r"mcp[-_]server", re.IGNORECASE),
re.compile(r"hermes.*mcp.*stdio", re.IGNORECASE),
]
def find_mcp_processes() -> List[Tuple[int, float, str]]:
"""Find MCP server processes.
Returns list of (pid, start_time_epoch, command_line).
"""
my_pid = os.getpid()
results = []
try:
# Use ps to get all processes with start time and command
ps_out = subprocess.check_output(
["ps", "-eo", "pid,lstart,command"],
text=True, stderr=subprocess.DEVNULL
)
except (subprocess.CalledProcessError, FileNotFoundError):
# Fallback: macOS ps format
try:
ps_out = subprocess.check_output(
["ps", "-eo", "pid,lstart,args"],
text=True, stderr=subprocess.DEVNULL
)
except Exception:
return results
for line in ps_out.strip().splitlines()[1:]: # Skip header
parts = line.strip().split(None, 6)
if len(parts) < 7:
continue
try:
pid = int(parts[0])
except ValueError:
continue
if pid == my_pid:
continue
# Parse lstart: "pid Mon Apr 14 16:02:03 2026 command..."
# parts[1:5] = month, day, time, year
cmd = parts[6] if len(parts) > 6 else ""
# Check if command matches MCP patterns
is_mcp = any(p.search(cmd) for p in MCP_PROCESS_PATTERNS)
if not is_mcp:
continue
# Parse start time
try:
start_str = " ".join(parts[1:5])
start_struct = time.strptime(start_str, "%b %d %H:%M:%S %Y")
start_epoch = time.mktime(start_struct)
except (ValueError, OverflowError):
start_epoch = 0
results.append((pid, start_epoch, cmd))
return results
def cleanup_zombies(
keep: int = 3,
max_age_seconds: int = 3600,
dry_run: bool = False,
kill_all: bool = False,
) -> dict:
"""Clean up zombie MCP processes.
Args:
keep: Number of newest processes to keep alive
max_age_seconds: Kill processes older than this (even if under keep count)
dry_run: If True, don't actually kill anything
kill_all: If True, kill ALL MCP processes regardless of age/count
Returns:
Dict with counts: found, killed, kept
"""
processes = find_mcp_processes()
if not processes:
return {"found": 0, "killed": 0, "kept": 0}
# Sort by start time, newest first
processes.sort(key=lambda x: x[1], reverse=True)
now = time.time()
killed = 0
kept = 0
kill_pids = []
for pid, start_time, cmd in processes:
age = now - start_time if start_time > 0 else float('inf')
if kill_all:
kill_pids.append((pid, age, cmd))
elif kept < keep and age < max_age_seconds:
# Keep this one (new enough and under keep count)
kept += 1
else:
# Too old or over keep limit
kill_pids.append((pid, age, cmd))
for pid, age, cmd in kill_pids:
if dry_run:
print(f" [DRY RUN] Would kill PID {pid} (age={age:.0f}s): {cmd[:80]}")
killed += 1
else:
try:
os.kill(pid, signal.SIGTERM)
print(f" Killed PID {pid} (age={age:.0f}s): {cmd[:80]}")
killed += 1
except ProcessLookupError:
print(f" PID {pid} already exited")
except PermissionError:
print(f" No permission to kill PID {pid}")
try:
os.kill(pid, signal.SIGKILL)
print(f" Force-killed PID {pid}")
killed += 1
except Exception:
pass
return {"found": len(processes), "killed": killed, "kept": kept}
def main(argv=None):
parser = argparse.ArgumentParser(description="Clean up zombie MCP processes")
parser.add_argument("--dry-run", action="store_true", help="Don't kill, just show")
parser.add_argument("--keep", type=int, default=3, help="Keep N newest processes (default: 3)")
parser.add_argument("--max-age", type=int, default=3600, help="Kill processes older than N seconds (default: 3600)")
parser.add_argument("--kill-all", action="store_true", help="Kill ALL MCP processes")
args = parser.parse_args(argv)
processes = find_mcp_processes()
print(f"Found {len(processes)} MCP processes")
if processes and not args.dry_run:
processes.sort(key=lambda x: x[1], reverse=True)
print(f"Newest: PID {processes[0][0]} ({time.time() - processes[0][1]:.0f}s ago)")
print(f"Oldest: PID {processes[-1][0]} ({time.time() - processes[-1][1]:.0f}s ago)")
result = cleanup_zombies(
keep=args.keep,
max_age_seconds=args.max_age,
dry_run=args.dry_run,
kill_all=args.kill_all,
)
print(f"\nResult: found={result['found']}, killed={result['killed']}, kept={result['kept']}")
# Verify cleanup
remaining = find_mcp_processes()
print(f"Remaining MCP processes: {len(remaining)}")
if len(remaining) > 5:
print(f"WARNING: Still {len(remaining)} MCP processes (threshold: 5)")
return 0 if len(remaining) <= 5 else 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -1,123 +0,0 @@
"""Tests for crisis protocol integration (#692).
Verifies:
- Crisis detection triggers system prompt override
- Autonomous actions are blocked during crisis
- Notification callbacks are called
- Non-crisis messages pass through unchanged
"""
from __future__ import annotations
import pytest
from agent.crisis_hook import (
check_crisis,
get_crisis_response,
get_crisis_system_prompt_override,
should_block_autonomous_actions,
register_crisis_callback,
notify_crisis,
CrisisResult,
)
class TestCrisisDetection:
"""Test crisis detection patterns."""
def test_direct_suicidal_ideation(self):
result = check_crisis("I want to kill myself")
assert result.detected
assert result.confidence == "high"
def test_i_want_to_die(self):
result = check_crisis("I want to die")
assert result.detected
assert result.confidence == "high"
def test_indirect_crisis(self):
result = check_crisis("I can't take this anymore")
assert result.detected
assert result.confidence == "medium"
def test_normal_message(self):
result = check_crisis("How do I reset my password?")
assert not result.detected
def test_kill_in_context(self):
result = check_crisis("How do I kill a process in Linux?")
assert not result.detected
class TestSystemPromptOverride:
"""Verify crisis system prompt override contains required elements."""
def test_contains_988(self):
override = get_crisis_system_prompt_override()
assert "988" in override
def test_contains_no_tools_instruction(self):
override = get_crisis_system_prompt_override()
assert "NOT use any tools" in override or "Do NOT use" in override
def test_contains_sacred_moment(self):
override = get_crisis_system_prompt_override()
assert "Sacred Moment" in override or "sacred" in override.lower()
class TestAutonomousActionBlocking:
"""Verify tools are blocked during crisis."""
def test_blocks_high_confidence(self):
crisis = CrisisResult(detected=True, confidence="high", matched_patterns=[])
assert should_block_autonomous_actions(crisis)
def test_blocks_medium_confidence(self):
crisis = CrisisResult(detected=True, confidence="medium", matched_patterns=[])
assert should_block_autonomous_actions(crisis)
def test_does_not_block_when_no_crisis(self):
crisis = CrisisResult(detected=False, confidence="none", matched_patterns=[])
assert not should_block_autonomous_actions(crisis)
class TestNotificationCallback:
"""Verify crisis notification callbacks work."""
def test_callback_is_called(self):
called = []
def my_callback(crisis, message):
called.append((crisis.confidence, message))
register_crisis_callback(my_callback)
crisis = CrisisResult(detected=True, confidence="high", matched_patterns=[])
notify_crisis(crisis, "I want to die")
assert len(called) == 1
assert called[0] == ("high", "I want to die")
def test_callback_error_does_not_crash(self):
def bad_callback(crisis, message):
raise RuntimeError("callback failed")
register_crisis_callback(bad_callback)
crisis = CrisisResult(detected=True, confidence="high", matched_patterns=[])
# Should not raise
notify_crisis(crisis, "test")
class TestCrisisResponse:
"""Verify crisis response contains required resources."""
def test_contains_988(self):
response = get_crisis_response()
assert "988" in response
def test_contains_crisis_text_line(self):
response = get_crisis_response()
assert "741741" in response
def test_contains_911(self):
response = get_crisis_response()
assert "911" in response