From 76ed15dd4decd88aaa56019ee728a096722a97a1 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Thu, 26 Mar 2026 14:33:18 -0700
Subject: [PATCH] fix(security): normalize input before dangerous command
 detection (#3260)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

detect_dangerous_command() ran regex patterns against raw command strings
without normalization, allowing bypass via Unicode fullwidth chars,
ANSI escape codes, null bytes, and 8-bit C1 controls.

Adds _normalize_command_for_detection() that:
- Strips ANSI escapes using the full ECMA-48 strip_ansi() from
  tools/ansi_strip (CSI, OSC, DCS, 8-bit C1, nF sequences)
- Removes null bytes
- Normalizes Unicode via NFKC (fullwidth Latin → ASCII, etc.)

Includes 12 regression tests covering fullwidth, ANSI, C1, null byte,
and combined obfuscation bypasses.

Salvaged from PR #3089 by thakoreh — improved ANSI stripping to use
existing comprehensive strip_ansi() instead of a weaker hand-rolled
regex, and added test coverage.

Co-authored-by: Hiren <hiren.thakore58@gmail.com>
---
 tests/tools/test_approval.py | 70 ++++++++++++++++++++++++++++++++++++
 tools/approval.py            | 21 ++++++++++-
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/tests/tools/test_approval.py b/tests/tools/test_approval.py
index bdd2b5284..b973cb0f0 100644
--- a/tests/tools/test_approval.py
+++ b/tests/tools/test_approval.py
@@ -512,3 +512,73 @@ class TestGatewayProtection:
         dangerous, key, desc = detect_dangerous_command(cmd)
         assert dangerous is False
 
+
+class TestNormalizationBypass:
+    """Obfuscation techniques must not bypass dangerous command detection."""
+
+    def test_fullwidth_unicode_rm(self):
+        """Fullwidth Unicode 'ｒｍ -ｒｆ /' must be caught after NFKC normalization."""
+        cmd = "\uff52\uff4d -\uff52\uff46 /"  # ｒｍ -ｒｆ /
+        dangerous, key, desc = detect_dangerous_command(cmd)
+        assert dangerous is True, f"Fullwidth 'rm -rf /' was not detected: {cmd!r}"
+
+    def test_fullwidth_unicode_dd(self):
+        """Fullwidth 'ｄｄ if=/dev/zero' must be caught."""
+        cmd = "\uff44\uff44 if=/dev/zero of=/dev/sda"
+        dangerous, key, desc = detect_dangerous_command(cmd)
+        assert dangerous is True
+
+    def test_fullwidth_unicode_chmod(self):
+        """Fullwidth 'ｃｈｍｏｄ 777' must be caught."""
+        cmd = "\uff43\uff48\uff4d\uff4f\uff44 777 /tmp/test"
+        dangerous, key, desc = detect_dangerous_command(cmd)
+        assert dangerous is True
+
+    def test_ansi_csi_wrapped_rm(self):
+        """ANSI CSI color codes wrapping 'rm' must be stripped and caught."""
+        cmd = "\x1b[31mrm\x1b[0m -rf /"
+        dangerous, key, desc = detect_dangerous_command(cmd)
+        assert dangerous is True, f"ANSI-wrapped 'rm -rf /' was not detected"
+
+    def test_ansi_osc_embedded_rm(self):
+        """ANSI OSC sequences embedded in command must be stripped."""
+        cmd = "\x1b]0;title\x07rm -rf /"
+        dangerous, key, desc = detect_dangerous_command(cmd)
+        assert dangerous is True
+
+    def test_ansi_8bit_c1_wrapped_rm(self):
+        """8-bit C1 CSI (0x9b) wrapping 'rm' must be stripped and caught."""
+        cmd = "\x9b31mrm\x9b0m -rf /"
+        dangerous, key, desc = detect_dangerous_command(cmd)
+        assert dangerous is True, "8-bit C1 CSI bypass was not caught"
+
+    def test_null_byte_in_rm(self):
+        """Null bytes injected into 'rm' must be stripped and caught."""
+        cmd = "r\x00m -rf /"
+        dangerous, key, desc = detect_dangerous_command(cmd)
+        assert dangerous is True, f"Null-byte 'rm' was not detected: {cmd!r}"
+
+    def test_null_byte_in_dd(self):
+        """Null bytes in 'dd' must be stripped."""
+        cmd = "d\x00d if=/dev/sda"
+        dangerous, key, desc = detect_dangerous_command(cmd)
+        assert dangerous is True
+
+    def test_mixed_fullwidth_and_ansi(self):
+        """Combined fullwidth + ANSI obfuscation must still be caught."""
+        cmd = "\x1b[1m\uff52\uff4d\x1b[0m -rf /"
+        dangerous, key, desc = detect_dangerous_command(cmd)
+        assert dangerous is True
+
+    def test_safe_command_after_normalization(self):
+        """Normal safe commands must not be flagged after normalization."""
+        cmd = "ls -la /tmp"
+        dangerous, key, desc = detect_dangerous_command(cmd)
+        assert dangerous is False
+
+    def test_fullwidth_safe_command_not_flagged(self):
+        """Fullwidth 'ｌｓ -ｌａ' is safe and must not be flagged."""
+        cmd = "\uff4c\uff53 -\uff4c\uff41 /tmp"
+        dangerous, key, desc = detect_dangerous_command(cmd)
+        assert dangerous is False
+
diff --git a/tools/approval.py b/tools/approval.py
index ea814e5ce..f3ae4e1fe 100644
--- a/tools/approval.py
+++ b/tools/approval.py
@@ -13,6 +13,7 @@ import os
 import re
 import sys
 import threading
+import unicodedata
 from typing import Optional
 
 logger = logging.getLogger(__name__)
@@ -82,13 +83,31 @@ def _approval_key_aliases(pattern_key: str) -> set[str]:
 # Detection
 # =========================================================================
 
+def _normalize_command_for_detection(command: str) -> str:
+    """Normalize a command string before dangerous-pattern matching.
+
+    Strips ANSI escape sequences (full ECMA-48 via tools.ansi_strip),
+    null bytes, and normalizes Unicode fullwidth characters so that
+    obfuscation techniques cannot bypass the pattern-based detection.
+    """
+    from tools.ansi_strip import strip_ansi
+
+    # Strip all ANSI escape sequences (CSI, OSC, DCS, 8-bit C1, etc.)
+    command = strip_ansi(command)
+    # Strip null bytes
+    command = command.replace('\x00', '')
+    # Normalize Unicode (fullwidth Latin, halfwidth Katakana, etc.)
+    command = unicodedata.normalize('NFKC', command)
+    return command
+
+
 def detect_dangerous_command(command: str) -> tuple:
     """Check if a command matches any dangerous patterns.
 
     Returns:
         (is_dangerous, pattern_key, description) or (False, None, None)
     """
-    command_lower = command.lower()
+    command_lower = _normalize_command_for_detection(command).lower()
     for pattern, description in DANGEROUS_PATTERNS:
         if re.search(pattern, command_lower, re.IGNORECASE | re.DOTALL):
             pattern_key = description