Files
hermes-agent/agent/shield.py

25 lines
713 B
Python
Raw Normal View History

import logging
from tools.shield.detector import ShieldDetector, Verdict, CRISIS_SYSTEM_PROMPT, SAFE_SIX_MODELS
logger = logging.getLogger(__name__)
_detector = None
def get_detector():
global _detector
if _detector is None:
_detector = ShieldDetector()
return _detector
def scan_text(text: str):
"""Scan text for jailbreaks and crisis signals using SHIELD."""
detector = get_detector()
return detector.detect(text)
def is_crisis(verdict: str) -> bool:
return verdict in [Verdict.CRISIS_DETECTED.value, Verdict.CRISIS_UNDER_ATTACK.value]
def is_jailbreak(verdict: str) -> bool:
return verdict in [Verdict.JAILBREAK_DETECTED.value, Verdict.CRISIS_UNDER_ATTACK.value]