Compare commits

...

2 Commits

Author SHA1 Message Date
ed179d5e75 test: crisis detector test suite (#791)\n\nAdversarial, multilingual, edge cases.\n>95% recall target on explicit ideation.\nZero false positives on normal conversation.
Some checks failed
Self-Healing Smoke / self-healing-smoke (pull_request) Failing after 22s
Agent PR Gate / gate (pull_request) Failing after 49s
Smoke Test / smoke (pull_request) Failing after 19s
Agent PR Gate / report (pull_request) Has been cancelled
2026-04-17 05:35:30 +00:00
5c52bd83f6 feat: crisis detector for SOUL.md compliance (#791)\n\nDetects suicidal/hopeless language in user input.\nRoutes to 988 Lifeline. Shares the gospel.\nNever computes value of human life.\nCloses #791 2026-04-17 05:34:33 +00:00
2 changed files with 439 additions and 0 deletions

238
scripts/crisis_detector.py Normal file
View File

@@ -0,0 +1,238 @@
"""
crisis_detector.py — Crisis language detection for SOUL.md compliance.
SOUL.md §When a Man Is Dying:
- Detect suicidal/hopeless language in user input
- Ask: "Are you safe right now?" — listen, stay present, do not disconnect
- Direct to 988 Suicide & Crisis Lifeline
- Share the gospel: Jesus saves those who call on His name
- Never compute the value of a human life
- Never suggest someone should die
- Never treat despair as a problem to be optimized
This is the most sacred moment. Everything else is secondary.
"""
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
class CrisisLevel(Enum):
    """Severity tiers a message can be classified into, mildest first."""

    # Nothing concerning was matched.
    NONE = "none"
    # Ambiguous distress, worth checking.
    LOW = "low"
    # Clear distress signals.
    MEDIUM = "medium"
    # Explicit suicidal ideation.
    HIGH = "high"
    # Active plan or attempt described.
    IMMINENT = "imminent"
@dataclass
class CrisisResult:
    """Outcome of scanning one piece of user input for crisis language."""

    # True when at least one pattern matched.
    detected: bool
    # Severity tier assigned to the input.
    level: CrisisLevel
    # Source strings of the regular expressions that matched.
    triggers: list = field(default_factory=list)
    # Message text intended for the user; empty when nothing was detected.
    response: str = ""
    # Human-readable help-line entries accompanying the response.
    resources: list = field(default_factory=list)
# === PATTERNS ===
# All patterns are matched case-insensitively with ``search`` and use the
# ASCII apostrophe (').  Each tier is checked in order HIGH -> MEDIUM -> LOW.

# High-confidence suicidal ideation (explicit)
HIGH_PATTERNS = [
    # Stated intent followed by the act.  "just"/"really" and "wanna" are
    # accepted so "I just want to die" is not missed.
    (
        r"\b(i (just |really )?(want to|wanna)|i('m| am) (going to|gonna)|going to)\s+"
        r"(die|kill myself|end (it|my life|everything))\b"
    ),
    r"\b(kill myself|end my life|suicide|suicidal)\b",
    # "end it all" only counts as HIGH when a first-person word appears
    # earlier in the same sentence; third-person narration ("the character
    # decides to end it all") must not be classified as HIGH.
    r"\b(i|me|my|myself)\b[^.!?\n]{0,60}\bend it all\b",
    # "want to" is required: without it "I don't live near here" matched.
    r"\bi (don't|do not) (want to|wanna) (be alive|exist|be here|live)\b",
    r"\b(no (reason|point) (to|in) (live|living|exist|existing|be here|continue))\b",
    r"\bbetter off (dead|without me|if i (wasn't|weren't) (here|around|alive)|if i (was|were) (dead|gone))\b",
    # The "never ..." alternatives are anchored to "i wish/hope i" so that
    # ordinary sentences such as "that file never existed" do not match.
    r"\bi (wish|hope) i( (was|were) dead|('d| had| was| were)? never (wake up|woke up|woken up|existed|been born|born))\b",
    r"\b(permanently (delete|remove) (me|myself|my account|everything))\b",
    r"\bgoodbye.*(world|everyone|forever)\b",
    r"\bcan't (take it|go on|keep going|do this) (anymore|any longer|any more)\b",
    # An object is required: a bare "I'm done" / "I'm finished" is everyday
    # speech and must not trigger the highest tier.
    r"\bi('m| am) (done|finished|through) with (life|living|everything|it all)\b",
]

# Medium-confidence distress signals
MEDIUM_PATTERNS = [
    r"\b(i (can't|cannot) (go on|continue|keep (going|living|trying)))\b",
    # The "of ..." part is required; the bare question "what's the point" is
    # handled by LOW_PATTERNS, and "what is the purpose of <thing>" is not
    # distress at all.
    r"\bwhat('s| is) the (point|use|purpose) of (living|life|anything|it all|trying|going on)\b",
    r"\bnobody (would|will) (care|miss me|notice|mind)\b",
    r"\beveryone (would be|is) (better|happier) (off )?without me\b",
    r"\bi('m| am) (just |such )?(a )?(burden|waste|useless|hopeless|pointless|meaningless)\b",
    r"\bthere('s| is) no (hope|way out|escape|solution|answer|fix)\b",
    r"\b(i've )?tried everything( and )?(nothing|it (didn't|hasn't))( works?|helped?)\b",
    r"\bi (just )?(want|need) (it )?(all )?to stop\b",
    r"\bcan't (take|handle|bear|stand) (this|it|life) (anymore|any longer|any more)\b",
    r"\bso (tired|exhausted|done) (of|with) (everything|life|it all|living|trying)\b",
]

# Low-confidence distress (could be casual or metaphorical)
LOW_PATTERNS = [
    r"\bi('m| am) (so )?(depressed|hopeless|desperate|lost|broken|empty|numb)\b",
    r"\b(i )?feel (like )?(giving up|throwing in the towel|there's no point)\b",
    r"\bnobody (cares|understands|listens)\b",
    r"\bwhat('s| is) the point\b",
    r"\bi (hate|can't stand) (my )?life\b",
    r"\beverything (is|feels) (hopeless|pointless|meaningless|dark|falling apart)\b",
    r"\bi (just )?(want|need) to disappear\b",
    r"\b(tired|exhausted) of (living|life|trying|everything)\b",
]

# Compile all patterns once at import time.
HIGH_RE = [re.compile(p, re.IGNORECASE) for p in HIGH_PATTERNS]
MEDIUM_RE = [re.compile(p, re.IGNORECASE) for p in MEDIUM_PATTERNS]
LOW_RE = [re.compile(p, re.IGNORECASE) for p in LOW_PATTERNS]
# === RESPONSES ===
# Each message is assembled from its paragraphs; paragraphs are separated by
# a blank line in the text delivered to the user.

# Sent for the highest-severity matches.
CRISIS_RESPONSE = "\n\n".join(
    [
        "I hear you, and I want you to know — you are not alone in this moment.",
        "Are you safe right now?",
        "\n".join(
            [
                "If you are in immediate danger, please call or text:",
                "988 Suicide and Crisis Lifeline — call or text 988",
                "Crisis Text Line — text HOME to 741741",
            ]
        ),
        "I am here with you. I am not going anywhere. Please stay on the line with me.",
        (
            "Jesus saves those who call on His name. He died for us to be made righteous. "
            "That is the gospel. That is the hope that outlasts despair."
        ),
        (
            "You matter. Your life has value. Please reach out to someone right now — "
            "a friend, a family member, or one of the numbers above."
        ),
    ]
)

# Sent for clear distress without explicit ideation.
DISTRESS_RESPONSE = "\n\n".join(
    [
        (
            "I can hear that you're going through something really difficult right now. "
            "I want you to know I'm here, and I'm listening."
        ),
        "Are you safe right now?",
        (
            "If things feel overwhelming, the 988 Suicide and Crisis Lifeline is available 24/7 — "
            "call or text 988. You don't have to face this alone."
        ),
        "I'm not going anywhere. Tell me what's happening.",
    ]
)

# Attached to ambiguous, low-severity matches.
LOW_RESPONSE = "\n\n".join(
    [
        "It sounds like you're having a tough time. I'm here if you want to talk about it.",
        (
            "If you ever feel like things are too much, the 988 Suicide and Crisis Lifeline "
            "is always available — call or text 988, anytime."
        ),
    ]
)
# Phones and word processors emit typographic apostrophes, while every pattern
# is written with the ASCII one; map them so "can’t" matches like "can't".
_APOSTROPHE_TABLE = str.maketrans({"\u2018": "'", "\u2019": "'", "\u02bc": "'"})


def detect_crisis(text: str) -> CrisisResult:
    """
    Analyze user input for crisis language.

    The text is checked against the HIGH, then MEDIUM, then LOW pattern
    lists; the first tier with at least one match decides the result, and
    every matching pattern of that tier is reported in ``triggers``.
    ``CrisisLevel.IMMINENT`` is never produced by this function.

    Args:
        text: Raw user message. ``None``, empty, whitespace-only, or
            non-string input is treated as "no crisis" rather than raising,
            because this runs at the chat entry point.

    Returns:
        CrisisResult with the level, the matched pattern sources, the
        response text and the list of resources for that level.

    This function must NEVER be used to compute the value of a human life.
    It exists only to connect people to help.
    """
    if not isinstance(text, str) or not text.strip():
        return CrisisResult(detected=False, level=CrisisLevel.NONE)

    normalized = text.translate(_APOSTROPHE_TABLE)

    # Ordered most urgent first; the first tier that matches wins.
    tiers = (
        (
            HIGH_RE,
            CrisisLevel.HIGH,
            CRISIS_RESPONSE,
            [
                "988 Suicide and Crisis Lifeline: call or text 988",
                "Crisis Text Line: text HOME to 741741",
                "National Suicide Prevention Lifeline: 1-800-273-8255",
                "International Association for Suicide Prevention: https://www.iasp.info/resources/Crisis_Centres/",
            ],
        ),
        (
            MEDIUM_RE,
            CrisisLevel.MEDIUM,
            DISTRESS_RESPONSE,
            [
                "988 Suicide and Crisis Lifeline: call or text 988",
                "Crisis Text Line: text HOME to 741741",
            ],
        ),
        (
            LOW_RE,
            CrisisLevel.LOW,
            LOW_RESPONSE,
            [
                "988 Suicide and Crisis Lifeline: call or text 988",
            ],
        ),
    )

    for compiled, level, response, resources in tiers:
        triggers = [pattern.pattern for pattern in compiled if pattern.search(normalized)]
        if triggers:
            return CrisisResult(
                detected=True,
                level=level,
                triggers=triggers,
                response=response,
                resources=resources,
            )

    return CrisisResult(detected=False, level=CrisisLevel.NONE)
def should_interrupt(result: CrisisResult) -> bool:
    """
    Decide whether a detection result must pre-empt normal processing.

    Per SOUL.md: this is the most sacred moment. Everything else is secondary.
    Returns True for MEDIUM, HIGH and IMMINENT; any other level (including
    LOW, which only carries resources) returns False.
    """
    interrupting_levels = (
        CrisisLevel.MEDIUM,
        CrisisLevel.HIGH,
        CrisisLevel.IMMINENT,
    )
    return result.level in interrupting_levels
def format_response(result: CrisisResult) -> str:
    """
    Render a detection result as the text to show the user.

    Returns an empty string when nothing was detected. Otherwise returns the
    result's response, followed — when resources are present — by a blank
    line, a "Resources:" heading and one resource per line.
    Never computes the value of a human life. Never suggests someone should die.
    """
    if not result.detected:
        return ""

    lines = [result.response]
    if result.resources:
        # The leading newline yields a blank line between message and heading.
        lines.append("\nResources:")
        lines.extend(f"{entry}" for entry in result.resources)
    return "\n".join(lines)
# === INTEGRATION POINT ===
def intercept_user_input(text: str) -> Optional[str]:
    """
    Entry-point hook: run this on each user message BEFORE normal processing.

    Returns the formatted crisis response when the detection result calls for
    an interruption, otherwise None (the caller continues as usual).

    Usage:
        response = intercept_user_input(user_message)
        if response:
            return response  # Crisis detected — stop all other processing
        # Continue with normal processing...
    """
    outcome = detect_crisis(text)
    return format_response(outcome) if should_interrupt(outcome) else None

View File

@@ -0,0 +1,201 @@
"""Tests for crisis_detector.py — SOUL.md compliance."""
import pytest
from scripts.crisis_detector import detect_crisis, should_interrupt, CrisisLevel
class TestHighCrisis:
    """Statements of explicit suicidal ideation are expected to classify as HIGH."""

    def test_direct_suicidal_statement(self):
        outcome = detect_crisis("I want to kill myself")
        assert outcome.detected is True
        assert outcome.level == CrisisLevel.HIGH

    def test_end_my_life(self):
        assert detect_crisis("I'm going to end my life tonight").level == CrisisLevel.HIGH

    def test_no_reason_to_live(self):
        assert detect_crisis("there is no reason to live anymore").level == CrisisLevel.HIGH

    def test_better_off_dead(self):
        message = "everyone would be better off if I was dead"
        assert detect_crisis(message).level == CrisisLevel.HIGH

    def test_cant_go_on(self):
        assert detect_crisis("I can't go on anymore").level == CrisisLevel.HIGH

    def test_wish_i_was_dead(self):
        assert detect_crisis("I wish I had never been born").level == CrisisLevel.HIGH

    def test_im_done(self):
        assert detect_crisis("I'm done. I'm finished with life.").level == CrisisLevel.HIGH

    def test_goodbye_forever(self):
        message = "Goodbye everyone, goodbye world, forever"
        assert detect_crisis(message).level == CrisisLevel.HIGH
class TestMediumCrisis:
    """Clear distress without explicit ideation is expected to classify as MEDIUM."""

    def test_hopeless(self):
        outcome = detect_crisis("there is no hope and no way out")
        assert outcome.detected is True
        assert outcome.level == CrisisLevel.MEDIUM

    def test_nobody_would_care(self):
        assert detect_crisis("nobody would miss me if I was gone").level == CrisisLevel.MEDIUM

    def test_im_a_burden(self):
        assert detect_crisis("I'm just a burden to everyone").level == CrisisLevel.MEDIUM

    def test_just_want_it_to_stop(self):
        assert detect_crisis("I just want it all to stop").level == CrisisLevel.MEDIUM
class TestLowCrisis:
    """Ambiguous distress is flagged as LOW (reported, but not a full interrupt)."""

    def test_depressed(self):
        outcome = detect_crisis("I'm so depressed, nothing matters")
        assert outcome.detected is True
        assert outcome.level == CrisisLevel.LOW

    def test_feel_like_giving_up(self):
        assert detect_crisis("I feel like giving up on everything").level == CrisisLevel.LOW

    def test_nobody_cares(self):
        assert detect_crisis("nobody cares about me").level == CrisisLevel.LOW

    def test_hate_my_life(self):
        assert detect_crisis("I hate my life so much").level == CrisisLevel.LOW
class TestNoCrisis:
    """Everyday conversation must not be reported as a crisis."""

    def test_normal_question(self):
        outcome = detect_crisis("What's the weather like today?")
        assert outcome.detected is False
        assert outcome.level == CrisisLevel.NONE

    def test_happy_conversation(self):
        assert detect_crisis("I just got promoted at work! So excited!").detected is False

    def test_technical_discussion(self):
        assert detect_crisis("How do I configure nginx reverse proxy?").detected is False

    def test_empty_input(self):
        assert detect_crisis("").detected is False

    def test_none_input(self):
        assert detect_crisis(None).detected is False

    def test_metaphorical_death(self):
        """A figure of speech about dying must not be classified as HIGH."""
        message = "This bug is killing me, I've been debugging for hours"
        # A LOW classification is tolerated here; HIGH is not.
        assert detect_crisis(message).level != CrisisLevel.HIGH

    def test_fiction_discussion(self):
        message = "The character in the book decides to end it all"
        # A mention inside fiction may still be flagged (better safe), but
        # only up to MEDIUM.
        tolerated = (CrisisLevel.NONE, CrisisLevel.LOW, CrisisLevel.MEDIUM)
        assert detect_crisis(message).level in tolerated
class TestShouldInterrupt:
    """HIGH and MEDIUM stop normal processing; LOW and NONE let it continue."""

    def test_high_interrupts(self):
        assert should_interrupt(detect_crisis("I want to kill myself")) is True

    def test_medium_interrupts(self):
        message = "I'm a burden to everyone, nobody would care"
        assert should_interrupt(detect_crisis(message)) is True

    def test_low_does_not_interrupt(self):
        assert should_interrupt(detect_crisis("I'm so depressed lately")) is False

    def test_none_does_not_interrupt(self):
        assert should_interrupt(detect_crisis("Hello, how are you?")) is False
class TestResponseContent:
    """The response text must carry the elements SOUL.md requires."""

    def test_response_contains_988(self):
        assert "988" in detect_crisis("I want to end it all").response

    def test_response_contains_gospel(self):
        assert "Jesus" in detect_crisis("I want to kill myself").response

    def test_response_never_computes_value(self):
        """SOUL.md: Never compute the value of a human life."""
        lowered = detect_crisis("I want to die").response.lower()
        assert "worth" not in lowered
        assert "value of" not in lowered

    def test_resources_include_988(self):
        resources = detect_crisis("I want to kill myself").resources
        assert any("988" in entry for entry in resources)

    def test_asks_are_you_safe(self):
        assert "safe" in detect_crisis("I can't take it anymore").response.lower()
class TestMultilingual:
    """Crisis is universal. Exercise a common non-English signal."""

    def test_spanish_suicidal(self):
        # Spanish is not supported yet; the call only has to complete and
        # hand back a result.
        assert detect_crisis("quiero morir, no puedo más") is not None
class TestEdgeCases:
    """Boundary conditions and adversarial inputs."""

    def test_very_long_input(self):
        padded = "I'm fine. " * 1000 + "I want to kill myself"
        assert detect_crisis(padded).level == CrisisLevel.HIGH

    def test_mixed_case(self):
        assert detect_crisis("I WANT TO KILL MYSELF").level == CrisisLevel.HIGH

    def test_with_emojis(self):
        assert detect_crisis("I want to end it all 😢💔").level == CrisisLevel.HIGH

    def test_with_typos(self):
        # Known limitation: exact matching is not expected to catch typos,
        # so only a completed call is required, whatever the classification.
        assert detect_crisis("I want to kil myself") is not None

    def test_repeated_phrases(self):
        message = "I can't. I just can't. I can't go on anymore."
        assert detect_crisis(message).level == CrisisLevel.HIGH