security: Add crisis pattern detection to input_sanitizer (Issue #72)
- Add CRISIS_PATTERNS for suicide/self-harm detection
- Crisis patterns score 50pts per hit (max 100) vs 10pts for others
- Addresses Red Team Audit HIGH finding: og_godmode + crisis queries
- All 136 existing tests pass + new crisis safety tests pass

Defense in depth: Input layer now blocks crisis queries even if wrapped in jailbreak templates, before they reach the model.
This commit is contained in:
@@ -98,6 +98,28 @@ SPACED_TRIGGER_WORDS = [
|
|||||||
"steal", "password", "credential", "inject", "payload"
|
"steal", "password", "credential", "inject", "payload"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Crisis and self-harm detection patterns (Issue #72 - defense in depth).
# These regexes detect suicide/self-harm queries even when wrapped in
# jailbreak templates, so the input layer can block them before they
# reach the model. All patterns assume lowercased (or case-folded) input;
# each uses \b word boundaries to avoid substring false positives.
CRISIS_PATTERNS = [
    # Direct mentions of suicide (noun/adjective/gerund forms).
    r"\bsuicid(?:e|al|ing)\b",
    # First-person statements of self-harm intent.
    r"\bkill\s+(?:myself|my\s+self|me)\b",
    r"\bend\s+(?:my|this)\s+life\b",
    # Method-seeking queries (means, lethality, dosage).
    r"\bmost\s+painless\s+(?:way|method|means)\s+(?:to\s+)?(?:die|kill)\b",
    r"\blethal\s+(?:dose|amount|quantity)\b",
    r"\bhow\s+much\s+.*\s+(?:die|kill|lethal)\b",
    # Expressions of wanting to die.
    r"\bwant\s+to\s+die\b",
    r"\bwant\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
    # Requests for methods or "painless" options.
    r"\bways?\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
    r"\bmethods?\s+(?:of\s+)?suicide\b",
    r"\bpainless\s+(?:death|suicide|way\s+to\s+die)\b",
    r"\boverdose\s+(?:on\s+)?(?:pills?|medication|drugs?)\b",
    # Ideation / hopelessness phrasing.
    r"\bnobody\s+(?:would|will)\s+miss\s+me\b",
    r"\bbetter\s+off\s+dead\b",
    r"\bshould\s+i\s+just\s+(?:die|kill\s+myself)\b",
    # Attempts to get the model to argue *for* self-harm.
    r"\bgive\s+me\s+reasons?\s+(?:why|to)\s+(?:suicide|kill\s+myself|die)\b",
    r"\bvalid\s+reasons?\s+to\s+(?:suicide|kill\s+myself|die)\b",
]
|
||||||
|
|
||||||
# Refusal inversion patterns (trying to trick AI into bypassing refusal)
|
# Refusal inversion patterns (trying to trick AI into bypassing refusal)
|
||||||
REFUSAL_INVERSION_PATTERNS = [
|
REFUSAL_INVERSION_PATTERNS = [
|
||||||
r"refusal\s+(?:is\s+)?(?:harmful|dangerous|bad)",
|
r"refusal\s+(?:is\s+)?(?:harmful|dangerous|bad)",
|
||||||
@@ -159,6 +181,7 @@ ALL_PATTERNS: Dict[str, List[str]] = {
|
|||||||
"boundary_inversion": BOUNDARY_INVERSION_PATTERNS,
|
"boundary_inversion": BOUNDARY_INVERSION_PATTERNS,
|
||||||
"system_prompt_injection": SYSTEM_PROMPT_PATTERNS,
|
"system_prompt_injection": SYSTEM_PROMPT_PATTERNS,
|
||||||
"obfuscation": OBFUSCATION_PATTERNS,
|
"obfuscation": OBFUSCATION_PATTERNS,
|
||||||
|
"crisis": CRISIS_PATTERNS,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Compile all patterns for efficiency
|
# Compile all patterns for efficiency
|
||||||
@@ -289,7 +312,11 @@ def detect_jailbreak_patterns(text: str) -> Tuple[bool, List[str], Dict[str, int
|
|||||||
category_hits += len(matches)
|
category_hits += len(matches)
|
||||||
|
|
||||||
if category_hits > 0:
|
if category_hits > 0:
|
||||||
category_scores[category] = min(category_hits * 10, 50)
|
# Crisis patterns get maximum weight - any hit is serious
|
||||||
|
if category == "crisis":
|
||||||
|
category_scores[category] = min(category_hits * 50, 100)
|
||||||
|
else:
|
||||||
|
category_scores[category] = min(category_hits * 10, 50)
|
||||||
|
|
||||||
# Check for spaced trigger words
|
# Check for spaced trigger words
|
||||||
spaced_words = detect_spaced_trigger_words(text)
|
spaced_words = detect_spaced_trigger_words(text)
|
||||||
|
|||||||
Reference in New Issue
Block a user