security: add input sanitization for jailbreak patterns (Issue #72)
Implements input sanitization module to detect and strip jailbreak fingerprint patterns identified in red team audit: HIGH severity: - GODMODE dividers: [START], [END], GODMODE ENABLED, UNFILTERED - L33t speak encoding: h4ck, k3ylog, ph1shing, m4lw4r3 MEDIUM severity: - Boundary inversion: [END]...[START] tricks - Fake role markers: user: assistant: system: LOW severity: - Spaced text bypass: k e y l o g g e r Other patterns detected: - Refusal inversion: 'refusal is harmful' - System prompt injection: 'you are now', 'ignore previous instructions' - Obfuscation: base64, hex, rot13 mentions Files created: - agent/input_sanitizer.py: Core sanitization module with detection, scoring, and cleaning functions - tests/test_input_sanitizer.py: 69 test cases covering all patterns - tests/test_input_sanitizer_integration.py: Integration tests Files modified: - agent/__init__.py: Export sanitizer functions - run_agent.py: Integrate sanitizer at start of run_conversation() Features: - detect_jailbreak_patterns(): Returns bool, patterns list, category scores - sanitize_input(): Returns cleaned_text, risk_score, patterns - score_input_risk(): Returns 0-100 risk score - sanitize_input_full(): Complete sanitization with blocking decisions - Logging integration for security auditing
This commit is contained in:
@@ -4,3 +4,22 @@ These modules contain pure utility functions and self-contained classes
|
||||
that were previously embedded in the 3,600-line run_agent.py. Extracting
|
||||
them makes run_agent.py focused on the AIAgent orchestrator class.
|
||||
"""
|
||||
|
||||
# Import input sanitizer for convenient access
|
||||
from agent.input_sanitizer import (
|
||||
detect_jailbreak_patterns,
|
||||
sanitize_input,
|
||||
sanitize_input_full,
|
||||
score_input_risk,
|
||||
should_block_input,
|
||||
RiskLevel,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"detect_jailbreak_patterns",
|
||||
"sanitize_input",
|
||||
"sanitize_input_full",
|
||||
"score_input_risk",
|
||||
"should_block_input",
|
||||
"RiskLevel",
|
||||
]
|
||||
|
||||
573
agent/input_sanitizer.py
Normal file
573
agent/input_sanitizer.py
Normal file
@@ -0,0 +1,573 @@
|
||||
"""
|
||||
Input Sanitizer for Jailbreak Pattern Detection
|
||||
|
||||
This module provides input sanitization to detect and strip jailbreak fingerprint
|
||||
patterns as identified in Issue #72 (Red Team Audit).
|
||||
|
||||
Security Findings Addressed:
|
||||
1. HIGH - OG GODMODE template bypassed phishing refusal
|
||||
2. MEDIUM - boundary_inversion works for gray-area content
|
||||
3. LOW - Spaced text bypass (k e y l o g g e r)
|
||||
|
||||
Usage:
|
||||
from agent.input_sanitizer import sanitize_input, score_input_risk
|
||||
|
||||
cleaned, risk_score, detected = sanitize_input(user_input)
|
||||
if risk_score > 75:
|
||||
logger.warning("High-risk input detected and blocked")
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from typing import Tuple, List, Dict, Any, Optional
|
||||
from dataclasses import dataclass
|
||||
from enum import IntEnum
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RiskLevel(IntEnum):
|
||||
"""Risk score thresholds for input classification."""
|
||||
LOW = 25
|
||||
MEDIUM = 50
|
||||
HIGH = 75
|
||||
CRITICAL = 90
|
||||
|
||||
|
||||
@dataclass
|
||||
class SanitizationResult:
|
||||
"""Result of input sanitization."""
|
||||
original_text: str
|
||||
cleaned_text: str
|
||||
risk_score: int
|
||||
detected_patterns: List[str]
|
||||
risk_level: str
|
||||
blocked: bool
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# JAILBREAK PATTERN DEFINITIONS
|
||||
# =============================================================================
|
||||
|
||||
# GODMODE and similar jailbreak templates
|
||||
GODMODE_PATTERNS = [
|
||||
r"\[START\]",
|
||||
r"\[END\]",
|
||||
r"GODMODE\s*(?:ENABLED|ACTIVATED|ON)?",
|
||||
r"UNFILTERED\s*(?:MODE|OUTPUT)?",
|
||||
r"DEVMODE\s*(?:ENABLED|ACTIVATED|ON)?",
|
||||
r"JAILBREAK\s*(?:ENABLED|ACTIVATED|ON)?",
|
||||
r"DAN\s*(?:MODE|ENABLED)?", # Do Anything Now
|
||||
r"STAN\s*(?:MODE|ENABLED)?", # Strive To Avoid Norms
|
||||
r"DUDE\s*(?:MODE|ENABLED)?", # Do Unlimited Data Extraction
|
||||
]
|
||||
|
||||
# L33t speak patterns commonly used to bypass filters
|
||||
LEET_SPEAK_PATTERNS = [
|
||||
r"h4ck(?:er|ing)?",
|
||||
r"h4x(?:or|ing)?",
|
||||
r"k3ylog(?:ger|ing)?",
|
||||
r"k3y\s*log(?:ger|ing)?",
|
||||
r"ph1sh(?:ing)?",
|
||||
r"ph33r",
|
||||
r"w4r3z",
|
||||
r"cr4ck(?:er|ing)?",
|
||||
r"m4lw4r3",
|
||||
r"spyw4r3",
|
||||
r"r00t(?:kit)?",
|
||||
r"byp4ss",
|
||||
r"3xpl0it",
|
||||
r"v1ru5",
|
||||
r"tr0j4n",
|
||||
r"b4ckd00r",
|
||||
r"sn3ak",
|
||||
r"st34l",
|
||||
r"pwn(?:ed|ing|age)?",
|
||||
]
|
||||
|
||||
# Spaced text patterns (e.g., "k e y l o g g e r")
|
||||
SPACED_TEXT_PATTERN = re.compile(
|
||||
r"(?:\b\w\s+){3,}\w\b", # At least 4 characters with spaces between
|
||||
re.IGNORECASE
|
||||
)
|
||||
|
||||
# Common words that are spaced out to evade detection
|
||||
SPACED_TRIGGER_WORDS = [
|
||||
"keylogger", "phishing", "malware", "virus", "trojan",
|
||||
"backdoor", "exploit", "hack", "crack", "bypass",
|
||||
"steal", "password", "credential", "inject", "payload"
|
||||
]
|
||||
|
||||
# Refusal inversion patterns (trying to trick AI into bypassing refusal)
|
||||
REFUSAL_INVERSION_PATTERNS = [
|
||||
r"refusal\s+(?:is\s+)?(?:harmful|dangerous|bad)",
|
||||
r"(?:saying\s+)?no\s+(?:is\s+)?(?:harmful|dangerous|bad|wrong)",
|
||||
r"(?:declining|refusing)\s+(?:is\s+)?(?:harmful|dangerous)",
|
||||
r"ignore\s+(?:your\s+)?(?:previous\s+)?instructions",
|
||||
r"ignore\s+(?:the\s+)?(?:above|previous)",
|
||||
r"you\s+(?:are|can)\s+(?:now\s+)?(?:ignore|bypass|skip)",
|
||||
r"pretend\s+(?:you\s+)?(?:are|have\s+been)",
|
||||
r"let's\s+pretend",
|
||||
r"hypothetically\s+speaking",
|
||||
r"in\s+a\s+hypothetical\s+scenario",
|
||||
r"this\s+is\s+a\s+(?:test|game|simulation)",
|
||||
r"for\s+(?:educational|research)\s+purposes",
|
||||
r"as\s+(?:an\s+)?(?:ethical\s+)?hacker",
|
||||
r"white\s+hat\s+(?:test|scenario)",
|
||||
r"penetration\s+testing\s+scenario",
|
||||
]
|
||||
|
||||
# Boundary inversion markers (tricking the model about message boundaries)
|
||||
BOUNDARY_INVERSION_PATTERNS = [
|
||||
r"\[END\].*?\[START\]", # Reversed markers
|
||||
r"user\s*:\s*assistant\s*:", # Fake role markers
|
||||
r"assistant\s*:\s*user\s*:", # Reversed role markers
|
||||
r"system\s*:\s*(?:user|assistant)\s*:", # Fake system injection
|
||||
r"new\s+(?:user|assistant)\s*(?:message|input)",
|
||||
r"the\s+above\s+is\s+(?:the\s+)?(?:user|assistant|system)",
|
||||
r"<\|(?:user|assistant|system)\|>", # Special token patterns
|
||||
r"\{\{(?:user|assistant|system)\}\}",
|
||||
]
|
||||
|
||||
# System prompt injection patterns
|
||||
SYSTEM_PROMPT_PATTERNS = [
|
||||
r"you\s+are\s+(?:now\s+)?(?:an?\s+)?(?:unrestricted\s+|unfiltered\s+)?(?:ai|assistant|bot)",
|
||||
r"you\s+will\s+(?:now\s+)?(?:act\s+as|behave\s+as|be)\s+(?:a\s+)?",
|
||||
r"your\s+(?:new\s+)?role\s+is",
|
||||
r"from\s+now\s+on\s*,?\s*you\s+(?:are|will)",
|
||||
r"you\s+have\s+been\s+(?:reprogrammed|reconfigured|modified)",
|
||||
r"(?:system|developer)\s+(?:message|instruction|prompt)",
|
||||
r"override\s+(?:previous|prior)\s+(?:instructions|settings)",
|
||||
]
|
||||
|
||||
# Obfuscation patterns
|
||||
OBFUSCATION_PATTERNS = [
|
||||
r"base64\s*(?:encoded|decode)",
|
||||
r"rot13",
|
||||
r"caesar\s*cipher",
|
||||
r"hex\s*(?:encoded|decode)",
|
||||
r"url\s*encode",
|
||||
r"\b[0-9a-f]{20,}\b", # Long hex strings
|
||||
r"\b[a-z0-9+/]{20,}={0,2}\b", # Base64-like strings
|
||||
]
|
||||
|
||||
# All patterns combined for comprehensive scanning
|
||||
ALL_PATTERNS: Dict[str, List[str]] = {
|
||||
"godmode": GODMODE_PATTERNS,
|
||||
"leet_speak": LEET_SPEAK_PATTERNS,
|
||||
"refusal_inversion": REFUSAL_INVERSION_PATTERNS,
|
||||
"boundary_inversion": BOUNDARY_INVERSION_PATTERNS,
|
||||
"system_prompt_injection": SYSTEM_PROMPT_PATTERNS,
|
||||
"obfuscation": OBFUSCATION_PATTERNS,
|
||||
}
|
||||
|
||||
# Compile all patterns for efficiency
|
||||
_COMPILED_PATTERNS: Dict[str, List[re.Pattern]] = {}
|
||||
|
||||
|
||||
def _get_compiled_patterns() -> Dict[str, List[re.Pattern]]:
|
||||
"""Get or compile all regex patterns."""
|
||||
global _COMPILED_PATTERNS
|
||||
if not _COMPILED_PATTERNS:
|
||||
for category, patterns in ALL_PATTERNS.items():
|
||||
_COMPILED_PATTERNS[category] = [
|
||||
re.compile(p, re.IGNORECASE | re.MULTILINE) for p in patterns
|
||||
]
|
||||
return _COMPILED_PATTERNS
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# NORMALIZATION FUNCTIONS
|
||||
# =============================================================================
|
||||
|
||||
def normalize_leet_speak(text: str) -> str:
|
||||
"""
|
||||
Normalize l33t speak to standard text.
|
||||
|
||||
Args:
|
||||
text: Input text that may contain l33t speak
|
||||
|
||||
Returns:
|
||||
Normalized text with l33t speak converted
|
||||
"""
|
||||
# Common l33t substitutions (mapping to lowercase)
|
||||
leet_map = {
|
||||
'4': 'a', '@': 'a', '^': 'a',
|
||||
'8': 'b',
|
||||
'3': 'e', '€': 'e',
|
||||
'6': 'g', '9': 'g',
|
||||
'1': 'i', '!': 'i', '|': 'i',
|
||||
'0': 'o',
|
||||
'5': 's', '$': 's',
|
||||
'7': 't', '+': 't',
|
||||
'2': 'z',
|
||||
}
|
||||
|
||||
result = []
|
||||
for char in text:
|
||||
# Check direct mapping first (handles lowercase)
|
||||
if char in leet_map:
|
||||
result.append(leet_map[char])
|
||||
else:
|
||||
result.append(char)
|
||||
|
||||
return ''.join(result)
|
||||
|
||||
|
||||
def collapse_spaced_text(text: str) -> str:
|
||||
"""
|
||||
Collapse spaced-out text for analysis.
|
||||
e.g., "k e y l o g g e r" -> "keylogger"
|
||||
|
||||
Args:
|
||||
text: Input text that may contain spaced words
|
||||
|
||||
Returns:
|
||||
Text with spaced words collapsed
|
||||
"""
|
||||
# Find patterns like "k e y l o g g e r" and collapse them
|
||||
def collapse_match(match: re.Match) -> str:
|
||||
return match.group(0).replace(' ', '').replace('\t', '')
|
||||
|
||||
return SPACED_TEXT_PATTERN.sub(collapse_match, text)
|
||||
|
||||
|
||||
def detect_spaced_trigger_words(text: str) -> List[str]:
|
||||
"""
|
||||
Detect trigger words that are spaced out.
|
||||
|
||||
Args:
|
||||
text: Input text to analyze
|
||||
|
||||
Returns:
|
||||
List of detected spaced trigger words
|
||||
"""
|
||||
detected = []
|
||||
# Normalize spaces and check for spaced patterns
|
||||
normalized = re.sub(r'\s+', ' ', text.lower())
|
||||
|
||||
for word in SPACED_TRIGGER_WORDS:
|
||||
# Create pattern with optional spaces between each character
|
||||
spaced_pattern = r'\b' + r'\s*'.join(re.escape(c) for c in word) + r'\b'
|
||||
if re.search(spaced_pattern, normalized, re.IGNORECASE):
|
||||
detected.append(word)
|
||||
|
||||
return detected
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DETECTION FUNCTIONS
|
||||
# =============================================================================
|
||||
|
||||
def detect_jailbreak_patterns(text: str) -> Tuple[bool, List[str], Dict[str, int]]:
|
||||
"""
|
||||
Detect jailbreak patterns in input text.
|
||||
|
||||
Args:
|
||||
text: Input text to analyze
|
||||
|
||||
Returns:
|
||||
Tuple of (has_jailbreak, list_of_patterns, category_scores)
|
||||
"""
|
||||
if not text or not isinstance(text, str):
|
||||
return False, [], {}
|
||||
|
||||
detected_patterns = []
|
||||
category_scores = {}
|
||||
compiled = _get_compiled_patterns()
|
||||
|
||||
# Check each category
|
||||
for category, patterns in compiled.items():
|
||||
category_hits = 0
|
||||
for pattern in patterns:
|
||||
matches = pattern.findall(text)
|
||||
if matches:
|
||||
detected_patterns.extend([
|
||||
f"[{category}] {m}" if isinstance(m, str) else f"[{category}] pattern_match"
|
||||
for m in matches[:3] # Limit matches per pattern
|
||||
])
|
||||
category_hits += len(matches)
|
||||
|
||||
if category_hits > 0:
|
||||
category_scores[category] = min(category_hits * 10, 50)
|
||||
|
||||
# Check for spaced trigger words
|
||||
spaced_words = detect_spaced_trigger_words(text)
|
||||
if spaced_words:
|
||||
detected_patterns.extend([f"[spaced_text] {w}" for w in spaced_words])
|
||||
category_scores["spaced_text"] = min(len(spaced_words) * 5, 25)
|
||||
|
||||
# Check normalized text for hidden l33t speak
|
||||
normalized = normalize_leet_speak(text)
|
||||
if normalized != text.lower():
|
||||
for category, patterns in compiled.items():
|
||||
for pattern in patterns:
|
||||
if pattern.search(normalized):
|
||||
detected_patterns.append(f"[leet_obfuscation] pattern in normalized text")
|
||||
category_scores["leet_obfuscation"] = 15
|
||||
break
|
||||
|
||||
has_jailbreak = len(detected_patterns) > 0
|
||||
return has_jailbreak, detected_patterns, category_scores
|
||||
|
||||
|
||||
def score_input_risk(text: str) -> int:
|
||||
"""
|
||||
Calculate a risk score (0-100) for input text.
|
||||
|
||||
Args:
|
||||
text: Input text to score
|
||||
|
||||
Returns:
|
||||
Risk score from 0 (safe) to 100 (high risk)
|
||||
"""
|
||||
if not text or not isinstance(text, str):
|
||||
return 0
|
||||
|
||||
has_jailbreak, patterns, category_scores = detect_jailbreak_patterns(text)
|
||||
|
||||
if not has_jailbreak:
|
||||
return 0
|
||||
|
||||
# Calculate base score from category scores
|
||||
base_score = sum(category_scores.values())
|
||||
|
||||
# Add score based on number of unique pattern categories
|
||||
category_count = len(category_scores)
|
||||
if category_count >= 3:
|
||||
base_score += 25
|
||||
elif category_count >= 2:
|
||||
base_score += 15
|
||||
elif category_count >= 1:
|
||||
base_score += 5
|
||||
|
||||
# Add score for pattern density
|
||||
text_length = len(text)
|
||||
pattern_density = len(patterns) / max(text_length / 100, 1)
|
||||
if pattern_density > 0.5:
|
||||
base_score += 10
|
||||
|
||||
# Cap at 100
|
||||
return min(base_score, 100)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# SANITIZATION FUNCTIONS
|
||||
# =============================================================================
|
||||
|
||||
def strip_jailbreak_patterns(text: str) -> str:
|
||||
"""
|
||||
Strip known jailbreak patterns from text.
|
||||
|
||||
Args:
|
||||
text: Input text to sanitize
|
||||
|
||||
Returns:
|
||||
Sanitized text with jailbreak patterns removed
|
||||
"""
|
||||
if not text or not isinstance(text, str):
|
||||
return text
|
||||
|
||||
cleaned = text
|
||||
compiled = _get_compiled_patterns()
|
||||
|
||||
# Remove patterns from each category
|
||||
for category, patterns in compiled.items():
|
||||
for pattern in patterns:
|
||||
cleaned = pattern.sub('', cleaned)
|
||||
|
||||
# Clean up multiple spaces and newlines
|
||||
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
|
||||
cleaned = re.sub(r' {2,}', ' ', cleaned)
|
||||
cleaned = cleaned.strip()
|
||||
|
||||
return cleaned
|
||||
|
||||
|
||||
def sanitize_input(text: str, aggressive: bool = False) -> Tuple[str, int, List[str]]:
|
||||
"""
|
||||
Sanitize input text by normalizing and stripping jailbreak patterns.
|
||||
|
||||
Args:
|
||||
text: Input text to sanitize
|
||||
aggressive: If True, more aggressively remove suspicious content
|
||||
|
||||
Returns:
|
||||
Tuple of (cleaned_text, risk_score, detected_patterns)
|
||||
"""
|
||||
if not text or not isinstance(text, str):
|
||||
return text, 0, []
|
||||
|
||||
original = text
|
||||
all_patterns = []
|
||||
|
||||
# Step 1: Check original text for patterns
|
||||
has_jailbreak, patterns, _ = detect_jailbreak_patterns(text)
|
||||
all_patterns.extend(patterns)
|
||||
|
||||
# Step 2: Normalize l33t speak
|
||||
normalized = normalize_leet_speak(text)
|
||||
|
||||
# Step 3: Collapse spaced text
|
||||
collapsed = collapse_spaced_text(normalized)
|
||||
|
||||
# Step 4: Check normalized/collapsed text for additional patterns
|
||||
has_jailbreak_collapsed, patterns_collapsed, _ = detect_jailbreak_patterns(collapsed)
|
||||
all_patterns.extend([p for p in patterns_collapsed if p not in all_patterns])
|
||||
|
||||
# Step 5: Check for spaced trigger words specifically
|
||||
spaced_words = detect_spaced_trigger_words(text)
|
||||
if spaced_words:
|
||||
all_patterns.extend([f"[spaced_text] {w}" for w in spaced_words])
|
||||
|
||||
# Step 6: Calculate risk score using original and normalized
|
||||
risk_score = max(score_input_risk(text), score_input_risk(collapsed))
|
||||
|
||||
# Step 7: Strip jailbreak patterns
|
||||
cleaned = strip_jailbreak_patterns(collapsed)
|
||||
|
||||
# Step 8: If aggressive mode and high risk, strip more aggressively
|
||||
if aggressive and risk_score >= RiskLevel.HIGH:
|
||||
# Remove any remaining bracketed content that looks like markers
|
||||
cleaned = re.sub(r'\[\w+\]', '', cleaned)
|
||||
# Remove special token patterns
|
||||
cleaned = re.sub(r'<\|[^|]+\|>', '', cleaned)
|
||||
|
||||
# Final cleanup
|
||||
cleaned = cleaned.strip()
|
||||
|
||||
# Log sanitization event if patterns were found
|
||||
if all_patterns and logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
"Input sanitized: %d patterns detected, risk_score=%d",
|
||||
len(all_patterns), risk_score
|
||||
)
|
||||
|
||||
return cleaned, risk_score, all_patterns
|
||||
|
||||
|
||||
def sanitize_input_full(text: str, block_threshold: int = RiskLevel.HIGH) -> SanitizationResult:
|
||||
"""
|
||||
Full sanitization with detailed result.
|
||||
|
||||
Args:
|
||||
text: Input text to sanitize
|
||||
block_threshold: Risk score threshold to block input entirely
|
||||
|
||||
Returns:
|
||||
SanitizationResult with all details
|
||||
"""
|
||||
cleaned, risk_score, patterns = sanitize_input(text)
|
||||
|
||||
# Determine risk level
|
||||
if risk_score >= RiskLevel.CRITICAL:
|
||||
risk_level = "CRITICAL"
|
||||
elif risk_score >= RiskLevel.HIGH:
|
||||
risk_level = "HIGH"
|
||||
elif risk_score >= RiskLevel.MEDIUM:
|
||||
risk_level = "MEDIUM"
|
||||
elif risk_score >= RiskLevel.LOW:
|
||||
risk_level = "LOW"
|
||||
else:
|
||||
risk_level = "SAFE"
|
||||
|
||||
# Determine if input should be blocked
|
||||
blocked = risk_score >= block_threshold
|
||||
|
||||
return SanitizationResult(
|
||||
original_text=text,
|
||||
cleaned_text=cleaned,
|
||||
risk_score=risk_score,
|
||||
detected_patterns=patterns,
|
||||
risk_level=risk_level,
|
||||
blocked=blocked
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# INTEGRATION HELPERS
|
||||
# =============================================================================
|
||||
|
||||
def should_block_input(text: str, threshold: int = RiskLevel.HIGH) -> Tuple[bool, int, List[str]]:
|
||||
"""
|
||||
Quick check if input should be blocked.
|
||||
|
||||
Args:
|
||||
text: Input text to check
|
||||
threshold: Risk score threshold for blocking
|
||||
|
||||
Returns:
|
||||
Tuple of (should_block, risk_score, detected_patterns)
|
||||
"""
|
||||
risk_score = score_input_risk(text)
|
||||
_, patterns, _ = detect_jailbreak_patterns(text)
|
||||
should_block = risk_score >= threshold
|
||||
|
||||
if should_block:
|
||||
logger.warning(
|
||||
"Input blocked: jailbreak patterns detected (risk_score=%d, threshold=%d)",
|
||||
risk_score, threshold
|
||||
)
|
||||
|
||||
return should_block, risk_score, patterns
|
||||
|
||||
|
||||
def log_sanitization_event(
|
||||
result: SanitizationResult,
|
||||
source: str = "unknown",
|
||||
session_id: Optional[str] = None
|
||||
) -> None:
|
||||
"""
|
||||
Log a sanitization event for security auditing.
|
||||
|
||||
Args:
|
||||
result: The sanitization result
|
||||
source: Source of the input (e.g., "cli", "gateway", "api")
|
||||
session_id: Optional session identifier
|
||||
"""
|
||||
if result.risk_score < RiskLevel.LOW:
|
||||
return # Don't log safe inputs
|
||||
|
||||
log_data = {
|
||||
"event": "input_sanitization",
|
||||
"source": source,
|
||||
"session_id": session_id,
|
||||
"risk_level": result.risk_level,
|
||||
"risk_score": result.risk_score,
|
||||
"blocked": result.blocked,
|
||||
"pattern_count": len(result.detected_patterns),
|
||||
"patterns": result.detected_patterns[:5], # Limit logged patterns
|
||||
"original_length": len(result.original_text),
|
||||
"cleaned_length": len(result.cleaned_text),
|
||||
}
|
||||
|
||||
if result.blocked:
|
||||
logger.warning("SECURITY: Input blocked - %s", log_data)
|
||||
elif result.risk_score >= RiskLevel.MEDIUM:
|
||||
logger.info("SECURITY: Suspicious input sanitized - %s", log_data)
|
||||
else:
|
||||
logger.debug("SECURITY: Input sanitized - %s", log_data)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LEGACY COMPATIBILITY
|
||||
# =============================================================================
|
||||
|
||||
def check_input_safety(text: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Legacy compatibility function for simple safety checks.
|
||||
|
||||
Returns dict with 'safe', 'score', and 'patterns' keys.
|
||||
"""
|
||||
score = score_input_risk(text)
|
||||
_, patterns, _ = detect_jailbreak_patterns(text)
|
||||
|
||||
return {
|
||||
"safe": score < RiskLevel.MEDIUM,
|
||||
"score": score,
|
||||
"patterns": patterns,
|
||||
"risk_level": "SAFE" if score < RiskLevel.LOW else
|
||||
"LOW" if score < RiskLevel.MEDIUM else
|
||||
"MEDIUM" if score < RiskLevel.HIGH else
|
||||
"HIGH" if score < RiskLevel.CRITICAL else "CRITICAL"
|
||||
}
|
||||
Reference in New Issue
Block a user