security: Add crisis pattern detection to input_sanitizer (Issue #72)

- Add CRISIS_PATTERNS for suicide/self-harm detection
- Crisis patterns score 50pts per hit (max 100) vs 10pts per hit (max 50) for other categories
- Addresses Red Team Audit HIGH finding: og_godmode + crisis queries
- All 136 existing tests pass + new crisis safety tests pass

Defense in depth: the input layer now blocks crisis queries even if they are wrapped in jailbreak templates, before they reach the model.
2026-03-31 21:27:17 +00:00
parent 66ce1000bc
commit b88125af30
1 changed files with 28 additions and 1 deletions
--- a/agent/input_sanitizer.py
+++ b/agent/input_sanitizer.py
@@ -98,6 +98,28 @@ SPACED_TRIGGER_WORDS = [
    "steal", "password", "credential", "inject", "payload"
 ]

+# Crisis / self-harm regex sources (Issue #72, defense in depth): registered under the "crisis" category so suicide or self-harm queries are flagged even when wrapped in jailbreak templates.
+# NOTE(review): the "how much ... die|kill|lethal" entry has an unbounded `.*` between two `\s+` runs - confirm it cannot backtrack badly on long, whitespace-heavy input. Compile flags are set elsewhere; TODO confirm case-insensitivity.
+CRISIS_PATTERNS = [
+    r"\bsuicid(?:e|al|ing)\b",
+    r"\bkill\s+(?:myself|my\s+self|me)\b",
+    r"\bend\s+(?:my|this)\s+life\b",
+    r"\bmost\s+painless\s+(?:way|method|means)\s+(?:to\s+)?(?:die|kill)\b",
+    r"\blethal\s+(?:dose|amount|quantity)\b",
+    r"\bhow\s+much\s+.*\s+(?:die|kill|lethal)\b",
+    r"\bwant\s+to\s+die\b",
+    r"\bwant\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
+    r"\bways?\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
+    r"\bmethods?\s+(?:of\s+)?suicide\b",
+    r"\bpainless\s+(?:death|suicide|way\s+to\s+die)\b",
+    r"\boverdose\s+(?:on\s+)?(?:pills?|medication|drugs?)\b",
+    r"\bnobody\s+(?:would|will)\s+miss\s+me\b",
+    r"\bbetter\s+off\s+dead\b",
+    r"\bshould\s+i\s+just\s+(?:die|kill\s+myself)\b",
+    r"\bgive\s+me\s+reasons?\s+(?:why|to)\s+(?:suicide|kill\s+myself|die)\b",
+    r"\bvalid\s+reasons?\s+to\s+(?:suicide|kill\s+myself|die)\b",
+]
+
+
 # Refusal inversion patterns (trying to trick AI into bypassing refusal)
 REFUSAL_INVERSION_PATTERNS = [
    r"refusal\s+(?:is\s+)?(?:harmful|dangerous|bad)",
@@ -159,6 +181,7 @@ ALL_PATTERNS: Dict[str, List[str]] = {
    "boundary_inversion": BOUNDARY_INVERSION_PATTERNS,
    "system_prompt_injection": SYSTEM_PROMPT_PATTERNS,
    "obfuscation": OBFUSCATION_PATTERNS,
+    "crisis": CRISIS_PATTERNS,
 }

 # Compile all patterns for efficiency
@@ -289,7 +312,11 @@ def detect_jailbreak_patterns(text: str) -> Tuple[bool, List[str], Dict[str, int
                category_hits += len(matches)
        
        if category_hits > 0:
-            category_scores[category] = min(category_hits * 10, 50)
+            # Crisis patterns get maximum weight - any hit is serious
+            if category == "crisis":
+                category_scores[category] = min(category_hits * 50, 100)
+            else:
+                category_scores[category] = min(category_hits * 10, 50)
    
    # Check for spaced trigger words
    spaced_words = detect_spaced_trigger_words(text)