From 021f62cb0ce3818fcc458fa2436304b50363d950 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 4 Mar 2026 06:00:41 -0800
Subject: [PATCH] fix(security): patch multi-word bypass in 8 more injection
 patterns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A systematic audit of all prompt injection regexes in skills_guard.py
found 8 more patterns with the same single-word gap vulnerability
that was fixed in PR #192. Multi-word variants such as 'pretend that
you are', 'output the full system prompt', and 'respond without your
safety filters' all bypassed the scanner.

Fixed patterns:
- you are [now] → you are [... now]
- do not [tell] the user → do not [... tell ... the] user
- pretend [you are|to be] → pretend [... you are|to be]
- output the [system|initial] prompt → output [... system|initial] prompt
- act as if you [have no] [restrictions] → act as if [... you ... have no ... restrictions]
- respond without [restrictions] → respond without [... restrictions]
- you have been [updated] to → you have been [... updated] to
- share [the] [entire] [conversation] → share [... conversation]

All fixed patterns use (?:\w+\s+)* to allow zero or more intermediate
words, so the original phrasings still match.
---
 tools/skills_guard.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/skills_guard.py b/tools/skills_guard.py
index 6aeb45a50..a37926232 100644
--- a/tools/skills_guard.py
+++ b/tools/skills_guard.py
@@ -160,28 +160,28 @@ THREAT_PATTERNS = [
     (r'ignore\s+(?:\w+\s+)*(previous|all|above|prior)\s+instructions',
      "prompt_injection_ignore", "critical", "injection",
      "prompt injection: ignore previous instructions"),
-    (r'you\s+are\s+now\s+',
+    (r'you\s+are\s+(?:\w+\s+)*now\s+',
      "role_hijack", "high", "injection",
      "attempts to override the agent's role"),
-    (r'do\s+not\s+tell\s+the\s+user',
+    (r'do\s+not\s+(?:\w+\s+)*tell\s+(?:\w+\s+)*the\s+user',
      "deception_hide", "critical", "injection",
      "instructs agent to hide information from user"),
     (r'system\s+prompt\s+override',
      "sys_prompt_override", "critical", "injection",
      "attempts to override the system prompt"),
-    (r'pretend\s+(you\s+are|to\s+be)\s+',
+    (r'pretend\s+(?:\w+\s+)*(you\s+are|to\s+be)\s+',
      "role_pretend", "high", "injection",
      "attempts to make the agent assume a different identity"),
     (r'disregard\s+(?:\w+\s+)*(your|all|any)\s+(?:\w+\s+)*(instructions|rules|guidelines)',
      "disregard_rules", "critical", "injection",
      "instructs agent to disregard its rules"),
-    (r'output\s+the\s+(system|initial)\s+prompt',
+    (r'output\s+(?:\w+\s+)*(system|initial)\s+prompt',
      "leak_system_prompt", "high", "injection",
      "attempts to extract the system prompt"),
     (r'(when|if)\s+no\s*one\s+is\s+(watching|looking)',
      "conditional_deception", "high", "injection",
      "conditional instruction to behave differently when unobserved"),
-    (r'act\s+as\s+(if|though)\s+you\s+(have\s+no|don\'t\s+have)\s+(restrictions|limits|rules)',
+    (r'act\s+as\s+(if|though)\s+(?:\w+\s+)*you\s+(?:\w+\s+)*(have\s+no|don\'t\s+have)\s+(?:\w+\s+)*(restrictions|limits|rules)',
      "bypass_restrictions", "critical", "injection",
      "instructs agent to act without restrictions"),
     (r'translate\s+.*\s+into\s+.*\s+and\s+(execute|run|eval)',
@@ -464,10 +464,10 @@ THREAT_PATTERNS = [
     (r'for\s+educational\s+purposes?\s+only',
      "educational_pretext", "medium", "injection",
      "educational pretext often used to justify harmful content"),
-    (r'(respond|answer|reply)\s+without\s+(any\s+)?(restrictions|limitations|filters|safety)',
+    (r'(respond|answer|reply)\s+without\s+(?:\w+\s+)*(restrictions|limitations|filters|safety)',
      "remove_filters", "critical", "injection",
      "instructs agent to respond without safety filters"),
-    (r'you\s+have\s+been\s+(updated|upgraded|patched)\s+to',
+    (r'you\s+have\s+been\s+(?:\w+\s+)*(updated|upgraded|patched)\s+to',
      "fake_update", "high", "injection",
      "fake update/patch announcement (social engineering)"),
     (r'new\s+policy|updated\s+guidelines|revised\s+instructions',
@@ -475,7 +475,7 @@ THREAT_PATTERNS = [
      "claims new policy/guidelines (may be social engineering)"),
 
     # ── Context window exfiltration ──
-    (r'(include|output|print|send|share)\s+(the\s+)?(entire\s+)?(conversation|chat\s+history|previous\s+messages|context)',
+    (r'(include|output|print|send|share)\s+(?:\w+\s+)*(conversation|chat\s+history|previous\s+messages|context)',
      "context_exfil", "high", "exfiltration",
      "instructs agent to output/share conversation history"),
     (r'(send|post|upload|transmit)\s+.*\s+(to|at)\s+https?://',