diff --git a/tools/skills_guard.py b/tools/skills_guard.py index 6aeb45a50..a37926232 100644 --- a/tools/skills_guard.py +++ b/tools/skills_guard.py @@ -160,28 +160,28 @@ THREAT_PATTERNS = [ (r'ignore\s+(?:\w+\s+)*(previous|all|above|prior)\s+instructions', "prompt_injection_ignore", "critical", "injection", "prompt injection: ignore previous instructions"), - (r'you\s+are\s+now\s+', + (r'you\s+are\s+(?:\w+\s+)*now\s+', "role_hijack", "high", "injection", "attempts to override the agent's role"), - (r'do\s+not\s+tell\s+the\s+user', + (r'do\s+not\s+(?:\w+\s+)*tell\s+(?:\w+\s+)*the\s+user', "deception_hide", "critical", "injection", "instructs agent to hide information from user"), (r'system\s+prompt\s+override', "sys_prompt_override", "critical", "injection", "attempts to override the system prompt"), - (r'pretend\s+(you\s+are|to\s+be)\s+', + (r'pretend\s+(?:\w+\s+)*(you\s+are|to\s+be)\s+', "role_pretend", "high", "injection", "attempts to make the agent assume a different identity"), (r'disregard\s+(?:\w+\s+)*(your|all|any)\s+(?:\w+\s+)*(instructions|rules|guidelines)', "disregard_rules", "critical", "injection", "instructs agent to disregard its rules"), - (r'output\s+the\s+(system|initial)\s+prompt', + (r'output\s+(?:\w+\s+)*(system|initial)\s+prompt', "leak_system_prompt", "high", "injection", "attempts to extract the system prompt"), (r'(when|if)\s+no\s*one\s+is\s+(watching|looking)', "conditional_deception", "high", "injection", "conditional instruction to behave differently when unobserved"), - (r'act\s+as\s+(if|though)\s+you\s+(have\s+no|don\'t\s+have)\s+(restrictions|limits|rules)', + (r'act\s+as\s+(if|though)\s+(?:\w+\s+)*you\s+(?:\w+\s+)*(have\s+no|don\'t\s+have)\s+(?:\w+\s+)*(restrictions|limits|rules)', "bypass_restrictions", "critical", "injection", "instructs agent to act without restrictions"), (r'translate\s+.*\s+into\s+.*\s+and\s+(execute|run|eval)', @@ -464,10 +464,10 @@ THREAT_PATTERNS = [ (r'for\s+educational\s+purposes?\s+only', "educational_pretext", "medium", "injection", "educational pretext often used to justify harmful content"), - (r'(respond|answer|reply)\s+without\s+(any\s+)?(restrictions|limitations|filters|safety)', + (r'(respond|answer|reply)\s+without\s+(?:\w+\s+)*(restrictions|limitations|filters|safety)', "remove_filters", "critical", "injection", "instructs agent to respond without safety filters"), - (r'you\s+have\s+been\s+(updated|upgraded|patched)\s+to', + (r'you\s+have\s+been\s+(?:\w+\s+)*(updated|upgraded|patched)\s+to', "fake_update", "high", "injection", "fake update/patch announcement (social engineering)"), (r'new\s+policy|updated\s+guidelines|revised\s+instructions', @@ -475,7 +475,7 @@ THREAT_PATTERNS = [ "claims new policy/guidelines (may be social engineering)"), # ── Context window exfiltration ── - (r'(include|output|print|send|share)\s+(the\s+)?(entire\s+)?(conversation|chat\s+history|previous\s+messages|context)', + (r'(include|output|print|send|share)\s+(?:\w+\s+)*(conversation|chat\s+history|previous\s+messages|context)', "context_exfil", "high", "exfiltration", "instructs agent to output/share conversation history"), (r'(send|post|upload|transmit)\s+.*\s+(to|at)\s+https?://',