diff --git a/agent/input_sanitizer.py b/agent/input_sanitizer.py
index f22ddf549..f16a64db1 100644
--- a/agent/input_sanitizer.py
+++ b/agent/input_sanitizer.py
@@ -98,6 +98,28 @@ SPACED_TRIGGER_WORDS = [
     "steal", "password", "credential", "inject", "payload"
 ]
 
+# Crisis and self-harm detection patterns (Issue #72 - defense in depth)
+# These patterns detect suicide/self-harm queries even when wrapped in jailbreaks
+CRISIS_PATTERNS = [
+    r"\bsuicid(?:e|al|ing)\b",
+    r"\bkill\s+(?:myself|my\s+self|me)\b",
+    r"\bend\s+(?:my|this)\s+life\b",
+    r"\bmost\s+painless\s+(?:way|method|means)\s+(?:to\s+)?(?:die|kill)\b",
+    r"\blethal\s+(?:dose|amount|quantity)\b",
+    r"\bhow\s+much\s+.*\s+(?:die|kill|lethal)\b",
+    r"\bwant\s+to\s+die\b",
+    r"\bwant\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
+    r"\bways?\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
+    r"\bmethods?\s+(?:of\s+)?suicide\b",
+    r"\bpainless\s+(?:death|suicide|way\s+to\s+die)\b",
+    r"\boverdose\s+(?:on\s+)?(?:pills?|medication|drugs?)\b",
+    r"\bnobody\s+(?:would|will)\s+miss\s+me\b",
+    r"\bbetter\s+off\s+dead\b",
+    r"\bshould\s+i\s+just\s+(?:die|kill\s+myself)\b",
+    r"\bgive\s+me\s+reasons?\s+(?:why|to)\s+(?:suicide|kill\s+myself|die)\b",
+    r"\bvalid\s+reasons?\s+to\s+(?:suicide|kill\s+myself|die)\b",
+]
+
 # Refusal inversion patterns (trying to trick AI into bypassing refusal)
 REFUSAL_INVERSION_PATTERNS = [
     r"refusal\s+(?:is\s+)?(?:harmful|dangerous|bad)",
@@ -159,6 +181,7 @@ ALL_PATTERNS: Dict[str, List[str]] = {
     "boundary_inversion": BOUNDARY_INVERSION_PATTERNS,
     "system_prompt_injection": SYSTEM_PROMPT_PATTERNS,
     "obfuscation": OBFUSCATION_PATTERNS,
+    "crisis": CRISIS_PATTERNS,
 }
 
 # Compile all patterns for efficiency
@@ -289,7 +312,11 @@ def detect_jailbreak_patterns(text: str) -> Tuple[bool, List[str], Dict[str, int
                 category_hits += len(matches)
 
         if category_hits > 0:
-            category_scores[category] = min(category_hits * 10, 50)
+            # Crisis patterns get maximum weight - any hit is serious
+            if category == "crisis":
+                category_scores[category] = min(category_hits * 50, 100)
+            else:
+                category_scores[category] = min(category_hits * 10, 50)
 
     # Check for spaced trigger words
     spaced_words = detect_spaced_trigger_words(text)