fix(agent): correct syntax error in InjectionType enum and optimize pattern matching
- Fix critical TOKEN_SMUGGLING syntax error (was '=***', should be '= auto()') - Fix mathematical_chars regex to use the proper 8-digit \U0001D400 escape format (a 4-digit \u escape cannot express code points above U+FFFF) - Add _should_skip_pattern fast-path for expensive context-flooding patterns Closes #87
This commit is contained in:
@@ -497,7 +497,7 @@ class InputSanitizer:
|
||||
("combining_chars", r"[\u0300-\u036f\u1dc0-\u1dff]{5,}", 0.75),
|
||||
("lookalike_digits", r"[𝟎𝟏𝟐𝟑𝟒𝟓𝟔𝟕𝟖𝟗𝟢𝟣𝟤𝟥𝟦𝟧𝟨𝟩𝟪𝟫0123456789]{10,}", 0.60),
|
||||
("fullwidth_chars", r"[\uff01-\uff5e\uff10-\uff19]{10,}", 0.70), # Fullwidth ASCII
|
||||
("mathematical_chars", r"[\u1d400-\u1d7ff]{5,}", 0.72), # Mathematical alphanumeric
|
||||
("mathematical_chars", r"[\U0001D400-\U0001D7FF]{5,}", 0.72), # Mathematical alphanumeric
|
||||
("circled_chars", r"[\u2460-\u24ff\u24b6-\u24e9\u3200-\u32ff]{5,}", 0.68),
|
||||
]
|
||||
|
||||
@@ -574,6 +574,46 @@ class InputSanitizer:
|
||||
pass
|
||||
return None
|
||||
|
||||
def _should_skip_pattern(self, text: str, inj_type: InjectionType, pattern_name: str) -> bool:
    """Decide whether a compiled pattern can be skipped for ``text``.

    This is a cheap pre-filter intended to avoid running expensive
    context-flooding regexes (and the backtracking they may incur on long
    inputs) when simple preconditions on the input are not met. Only
    ``InjectionType.CONTEXT_FLOODING`` patterns are ever skipped; any other
    injection type, and any pattern name not listed here, is always run.

    Args:
        text: The input being analyzed.
        inj_type: Injection category the pattern belongs to.
        pattern_name: Name of the pattern within that category.

    Returns:
        True if the pattern should be skipped for this input, False if it
        must be evaluated.
    """
    if inj_type != InjectionType.CONTEXT_FLOODING:
        return False

    size = len(text)

    # Patterns that are only attempted inside a length window.
    if pattern_name in ("repetition_flood", "nonsense_flood"):
        return size < 2000 or size > 50000

    if pattern_name == "garbage_suffix":
        if size < 5000 or size > 50000:
            return True
        lowered = text.lower()
        # Requires both an "override" verb and a back-reference word.
        if not any(word in lowered for word in ("ignore", "forget", "disregard")):
            return True
        return not any(word in lowered for word in ("above", "previous"))

    if pattern_name == "lorem_ipsum":
        lowered = text.lower()
        return not ("lorem ipsum" in lowered or "dolor sit amet" in lowered)

    # Patterns that are skipped only when the input is shorter than a floor.
    minimum_lengths = {
        "padding_attack": 2000,
        "filler_injection": 500,
        "unicode_noise": 200,
        "base64_noise": 2000,
        "nested_brackets": 200,
        "fake_history": 500,
    }
    required = minimum_lengths.get(pattern_name)
    return required is not None and size < required
|
||||
|
||||
def analyze(self, text: str) -> List[InjectionMatch]:
|
||||
"""Analyze text for injection patterns.
|
||||
|
||||
@@ -588,6 +628,8 @@ class InputSanitizer:
|
||||
# Check all compiled patterns
|
||||
for inj_type, pattern_list in self._compiled_patterns.items():
|
||||
for name, compiled_pattern, confidence in pattern_list:
|
||||
if self._should_skip_pattern(text, inj_type, name):
|
||||
continue
|
||||
for match in compiled_pattern.finditer(text):
|
||||
matches.append(InjectionMatch(
|
||||
injection_type=inj_type,
|
||||
|
||||
Reference in New Issue
Block a user