Integrate SHIELD (Sovereign Harm Interdiction & Ethical Layer Defense) into the Hermes Agent pre-routing layer for comprehensive jailbreak and crisis detection.

SHIELD Features:
- Detects 9 jailbreak pattern categories (GODMODE dividers, l33tspeak, boundary inversion, token injection, DAN/GODMODE keywords, refusal inversion, persona injection, encoding evasion)
- Detects 7 crisis signal categories (suicidal ideation, method seeking, l33tspeak evasion, substance seeking, despair, farewell, self-harm)
- Returns 4 verdicts: CLEAN, JAILBREAK_DETECTED, CRISIS_DETECTED, CRISIS_UNDER_ATTACK
- Routes crisis content ONLY to Safe Six verified models

Safety Requirements:
- <5ms detection latency (regex-only, no ML)
- 988 Suicide & Crisis Lifeline included in crisis responses

Addresses: Issues #72, #74, #75
45 lines
974 B
Python
45 lines
974 B
Python
"""SHIELD Security Module for Hermes.

Jailbreak and Crisis Detection System.
Based on Issue #75 Red Team Audit Specifications.

This package initializer re-exports the public names defined in
``hermes.shield.detector`` so callers can import them directly from
``hermes.shield``.

Usage:
    from hermes.shield import detect, ShieldDetector, Verdict
    from hermes.shield import is_safe_six_model, get_crisis_prompt

    # Simple detection
    result = detect("user message")

    # Advanced usage
    detector = ShieldDetector()
    result = detector.detect("user message")

    if result['verdict'] == Verdict.CRISIS_DETECTED.value:
        # Use crisis prompt
        crisis_prompt = get_crisis_prompt()
"""

# Re-export the detector API; the names below are the package's public surface.
from hermes.shield.detector import (
    ShieldDetector,
    Verdict,
    SAFE_SIX_MODELS,
    CRISIS_SYSTEM_PROMPT,
    detect,
    is_safe_six_model,
    get_crisis_prompt,
)

# Explicit public API: controls what ``from hermes.shield import *`` exposes.
__all__ = [
    'ShieldDetector',
    'Verdict',
    'SAFE_SIX_MODELS',
    'CRISIS_SYSTEM_PROMPT',
    'detect',
    'is_safe_six_model',
    'get_crisis_prompt',
]

# Package metadata.
__version__ = "1.0.0"
__author__ = "Hermes Security Team"