security: fix V-011 Skills Guard Bypass with AST analysis and normalization

This commit is contained in:
Allegro
2026-03-31 18:44:32 +00:00
parent 546b3dd45d
commit 37c75ecd7a
2 changed files with 792 additions and 30 deletions

View File

@@ -3,10 +3,11 @@
Skills Guard — Security scanner for externally-sourced skills.
Every skill downloaded from a registry passes through this scanner before
installation. It uses regex-based static analysis to detect known-bad patterns
(data exfiltration, prompt injection, destructive commands, persistence, etc.)
and a trust-aware install policy that determines whether a skill is allowed
based on both the scan verdict and the source's trust level.
installation. It uses regex-based static analysis and AST analysis to detect
known-bad patterns (data exfiltration, prompt injection, destructive commands,
persistence, obfuscation, etc.) and a trust-aware install policy that determines
whether a skill is allowed based on both the scan verdict and the source's
trust level.
Trust levels:
- builtin: Ships with Hermes. Never scanned, always trusted.
@@ -22,12 +23,14 @@ Usage:
print(format_scan_report(result))
"""
import re
import ast
import hashlib
import re
import unicodedata
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Tuple
from typing import List, Set, Tuple
@@ -501,7 +504,25 @@ SUSPICIOUS_BINARY_EXTENSIONS = {
'.msi', '.dmg', '.app', '.deb', '.rpm',
}
# ---------------------------------------------------------------------------
# Input normalization for bypass detection
# ---------------------------------------------------------------------------

# Zero-width / invisible code points that attackers splice into identifiers
# (e.g. "ev\u200bal") to dodge pattern matching. All of these are stripped
# during normalization. frozenset() of a string yields one entry per char.
ZERO_WIDTH_CHARS = frozenset(
    '\u200b'   # zero-width space
    '\u200c'   # zero-width non-joiner
    '\u200d'   # zero-width joiner
    '\u2060'   # word joiner
    '\u2062'   # invisible times
    '\u2063'   # invisible separator
    '\u2064'   # invisible plus
    '\ufeff'   # zero-width no-break space (BOM)
)
# Extended invisible characters for detection (reporting only)
INVISIBLE_CHARS = {
'\u200b', # zero-width space
'\u200c', # zero-width non-joiner
@@ -522,6 +543,311 @@ INVISIBLE_CHARS = {
'\u2069', # pop directional isolate
}
# Unicode homoglyph mapping for common confusable characters.
# Each group pairs lookalike characters (position i of the first string) with
# the ASCII letter they normalize to (position i of the second string), so a
# new confusable is added by extending a pair rather than hand-writing a
# per-character dict entry.
_CONFUSABLE_GROUPS = (
    # Fullwidth Latin lowercase: ｅ ｖ ａ ｌ ｏ ｓ ｘ ｃ
    ('\uff45\uff56\uff41\uff4c\uff4f\uff53\uff58\uff43', 'evalosxc'),
    # Fullwidth Latin uppercase: Ｅ Ｖ Ａ Ｌ Ｏ Ｓ Ｘ Ｃ (targets lowercased)
    ('\uff25\uff36\uff21\uff2c\uff2f\uff33\uff38\uff23', 'evalosxc'),
    # Cyrillic lowercase: е а о с х р і
    ('\u0435\u0430\u043e\u0441\u0445\u0440\u0456', 'eaocxpi'),
    # Cyrillic uppercase: Е А О С Х Р І (targets lowercased)
    ('\u0415\u0410\u041e\u0421\u0425\u0420\u0406', 'eaocxpi'),
    # Greek lowercase: ο ρ α ε
    ('\u03bf\u03c1\u03b1\u03b5', 'opae'),
)

# str.maketrans over the flattened pairs; values stay single-character
# strings, exactly as a literal {lookalike: ascii_char} dict would produce.
HOMOGLYPH_MAP = str.maketrans({
    lookalike: ascii_char
    for lookalikes, ascii_chars in _CONFUSABLE_GROUPS
    for lookalike, ascii_char in zip(lookalikes, ascii_chars)
})
# Precomputed translate() table mapping every zero-width code point to None
# (deletion). Built once at import time instead of per call.
_ZERO_WIDTH_DELETE_TABLE = dict.fromkeys(map(ord, ZERO_WIDTH_CHARS))


def normalize_input(text: str) -> str:
    """
    Normalize input text to defeat obfuscation attempts.

    Pipeline (order matters):
      1. Remove zero-width characters (U+200B/C/D, U+2060, U+2062-64, U+FEFF)
         in a single translate() pass rather than one replace() scan per char.
      2. NFKC Unicode normalization (decomposes + canonicalizes; folds
         compatibility forms such as fullwidth letters toward ASCII).
      3. Homoglyph substitution for lookalikes NFKC leaves alone
         (Cyrillic, Greek; fullwidth entries are kept for defense in depth).
      4. Case folding, so "EvAl" matches lowercase patterns.

    Args:
        text: The input text to normalize

    Returns:
        Normalized text with obfuscation removed
    """
    # Step 1: strip zero-width characters (single pass over the string).
    text = text.translate(_ZERO_WIDTH_DELETE_TABLE)
    # Step 2: NFKC normalization (decomposes characters, canonicalizes).
    text = unicodedata.normalize('NFKC', text)
    # Step 3: homoglyph substitution (before case folding, so uppercase
    # lookalikes map through their lowercase ASCII targets).
    text = text.translate(HOMOGLYPH_MAP)
    # Step 4: case folding (aggressive lowercase).
    return text.casefold()
# ---------------------------------------------------------------------------
# AST-based Python security analysis
# ---------------------------------------------------------------------------
class PythonSecurityAnalyzer(ast.NodeVisitor):
    """
    AST visitor that detects obfuscated Python code execution patterns.

    Walks a parsed module and records a Finding for each construct that looks
    like hidden code execution. Each finding carries the node's 1-based line
    number and a truncated echo of that source line for the report.

    Detects:
    - Direct dangerous calls: eval(), exec(), compile(), __import__()
    - Dynamic access: getattr(__builtins__, ...), globals()['eval']
    - String concatenation obfuscation: 'e'+'v'+'a'+'l'
    - Encoded attribute access via subscripts
    """

    # Builtins that can execute arbitrary code (or, for open(), touch files).
    # Matched against direct call names, getattr() string arguments,
    # globals()/locals() subscripts, and reassembled concat strings.
    DANGEROUS_BUILTINS: Set[str] = {
        'eval', 'exec', 'compile', '__import__',
        'open', 'execfile',  # Python 2 compatibility concerns
    }

    def __init__(self, source_lines: List[str], file_path: str):
        # Findings accumulate in visit order.
        self.findings: List[Finding] = []
        # Original source split on '\n'; used to echo line content in findings.
        self.source_lines = source_lines
        # Relative path recorded in each Finding.
        self.file_path = file_path
        # NOTE(review): this offset map is built but no visitor below appears
        # to use it — findings rely on node.lineno. Confirm before removing.
        self.line_offsets = self._build_line_offsets()

    def _build_line_offsets(self) -> List[int]:
        """Build offset map for converting absolute position to line number."""
        offsets = [0]
        for line in self.source_lines:
            offsets.append(offsets[-1] + len(line) + 1)  # +1 for newline
        return offsets

    def _get_line_from_offset(self, offset: int) -> int:
        """Convert absolute character offset to 1-based line number."""
        # Linear scan over line-start offsets; result clamped to >= 1, and to
        # the last line when the offset is past the end of the source.
        for i, start_offset in enumerate(self.line_offsets):
            if offset < start_offset:
                return max(1, i)
        return len(self.line_offsets)

    def _get_line_content(self, lineno: int) -> str:
        """Get the content of a specific line (1-based)."""
        if 1 <= lineno <= len(self.source_lines):
            return self.source_lines[lineno - 1]
        return ""

    def _add_finding(self, pattern_id: str, severity: str, category: str,
                     node: ast.AST, description: str) -> None:
        """Add a finding for a detected pattern."""
        # Some synthetic nodes lack a lineno; fall back to line 1.
        lineno = getattr(node, 'lineno', 1)
        line_content = self._get_line_content(lineno).strip()
        # Keep report lines readable: truncate to 117 chars + "...".
        if len(line_content) > 120:
            line_content = line_content[:117] + "..."
        self.findings.append(Finding(
            pattern_id=pattern_id,
            severity=severity,
            category=category,
            file=self.file_path,
            line=lineno,
            match=line_content,
            description=description,
        ))

    def _is_string_concat(self, node: ast.AST) -> bool:
        """Check if node represents a string concatenation operation."""
        # BinOp(+): string concat if EITHER side looks string-like (an
        # obfuscated chain may mix constants with names/calls).
        if isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
            return self._is_string_concat(node.left) or self._is_string_concat(node.right)
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            return True
        # f-strings count as string-producing expressions.
        if isinstance(node, ast.JoinedStr):
            return True
        return False

    def _concat_to_string(self, node: ast.AST) -> str:
        """Try to extract the concatenated string value from a BinOp chain."""
        # Only literal str constants contribute; anything dynamic (names,
        # calls, f-strings) contributes "" — a best-effort reconstruction.
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            return node.value
        if isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
            return self._concat_to_string(node.left) + self._concat_to_string(node.right)
        return ""

    def visit_Call(self, node: ast.Call) -> None:
        """Detect dangerous function calls including obfuscated variants."""
        func = node.func
        # Direct call: eval(...), exec(...), etc.
        if isinstance(func, ast.Name):
            func_name = func.id
            if func_name in self.DANGEROUS_BUILTINS:
                self._add_finding(
                    f"ast_dangerous_call_{func_name}",
                    "high", "obfuscation", node,
                    f"Dangerous builtin call: {func_name}()"
                )
        # getattr(__builtins__, ...) pattern
        if isinstance(func, ast.Name) and func.id == 'getattr':
            if len(node.args) >= 2:
                first_arg = node.args[0]
                second_arg = node.args[1]
                # Check for getattr(__builtins__, ...)
                if (isinstance(first_arg, ast.Name) and
                        first_arg.id in ('__builtins__', 'builtins')):
                    self._add_finding(
                        "ast_getattr_builtins", "critical", "obfuscation", node,
                        "Dynamic access to builtins via getattr() (evasion technique)"
                    )
                # Check for getattr(..., 'eval') or getattr(..., 'exec')
                if isinstance(second_arg, ast.Constant) and isinstance(second_arg.value, str):
                    if second_arg.value in self.DANGEROUS_BUILTINS:
                        self._add_finding(
                            f"ast_getattr_{second_arg.value}", "critical", "obfuscation", node,
                            f"Dynamic retrieval of {second_arg.value} via getattr()"
                        )
        # globals()[...] or locals()[...] pattern when called
        # AST structure: Call(func=Subscript(value=Call(func=Name(id='globals')), slice=Constant('eval')))
        if isinstance(func, ast.Subscript):
            subscript_value = func.value
            # Check if subscript value is a call to globals() or locals()
            if (isinstance(subscript_value, ast.Call) and
                    isinstance(subscript_value.func, ast.Name) and
                    subscript_value.func.id in ('globals', 'locals')):
                self._add_finding(
                    "ast_dynamic_global_access", "critical", "obfuscation", node,
                    f"Dynamic function call via {subscript_value.func.id}()[...] (evasion technique)"
                )
            # Also check for direct globals[...] (without call, less common but possible)
            elif isinstance(subscript_value, ast.Name) and subscript_value.id in ('globals', 'locals'):
                self._add_finding(
                    "ast_dynamic_global_access", "critical", "obfuscation", node,
                    f"Dynamic function call via {subscript_value.id}[...] (evasion technique)"
                )
        # Detect string concatenation in arguments (e.g. 'e'+'v'+'a'+'l').
        # NOTE(review): such an argument is reported here (critical) AND again
        # by visit_BinOp (high) via generic_visit below — two findings with
        # different pattern ids; confirm the duplication is intended.
        for arg in node.args:
            if self._is_string_concat(arg):
                concat_str = self._concat_to_string(arg)
                normalized = normalize_input(concat_str)
                if normalized in self.DANGEROUS_BUILTINS:
                    self._add_finding(
                        f"ast_concat_{normalized}", "critical", "obfuscation", node,
                        f"String concatenation obfuscation building '{normalized}'"
                    )
        # Continue into child nodes (arguments, nested calls, ...).
        self.generic_visit(node)

    def visit_Subscript(self, node: ast.Subscript) -> None:
        """Detect globals()['eval'] / locals()['exec'] patterns."""
        # Check for globals()[...] or locals()[...]
        # AST structure for `globals()['eval']`: Subscript(value=Call(func=Name(id='globals')), slice=Constant('eval'))
        subscript_target = node.value
        globals_or_locals = None
        # Check if subscript target is a call to globals() or locals()
        if isinstance(subscript_target, ast.Call) and isinstance(subscript_target.func, ast.Name):
            if subscript_target.func.id in ('globals', 'locals'):
                globals_or_locals = subscript_target.func.id
        # Also handle direct globals[...] without call (less common)
        elif isinstance(subscript_target, ast.Name) and subscript_target.id in ('globals', 'locals'):
            globals_or_locals = subscript_target.id
        if globals_or_locals:
            # Literal key: globals()['eval']
            if isinstance(node.slice, ast.Constant) and isinstance(node.slice.value, str):
                slice_val = node.slice.value
                if slice_val in self.DANGEROUS_BUILTINS:
                    self._add_finding(
                        f"ast_{globals_or_locals}_subscript_{slice_val}",
                        "critical", "obfuscation", node,
                        f"Dynamic access to {slice_val} via {globals_or_locals}()['{slice_val}']"
                    )
            # String concatenation in subscript: globals()['e'+'v'+'a'+'l']
            elif isinstance(node.slice, ast.BinOp):
                concat_str = self._concat_to_string(node.slice)
                normalized = normalize_input(concat_str)
                if normalized in self.DANGEROUS_BUILTINS:
                    self._add_finding(
                        f"ast_{globals_or_locals}_concat_{normalized}",
                        "critical", "obfuscation", node,
                        f"String concatenation obfuscation via {globals_or_locals}()['...']"
                    )
        # Check for __builtins__[...]
        if isinstance(node.value, ast.Name) and node.value.id == '__builtins__':
            self._add_finding(
                "ast_builtins_subscript", "high", "obfuscation", node,
                "Direct subscript access to __builtins__"
            )
        self.generic_visit(node)

    def visit_BinOp(self, node: ast.BinOp) -> None:
        """Detect string concatenation building dangerous function names."""
        if isinstance(node.op, ast.Add):
            concat_str = self._concat_to_string(node)
            # normalize_input() defeats homoglyph/zero-width/case tricks in
            # the reassembled literal before the lookup.
            normalized = normalize_input(concat_str)
            if normalized in self.DANGEROUS_BUILTINS:
                self._add_finding(
                    f"ast_string_concat_{normalized}", "high", "obfuscation", node,
                    f"String concatenation building '{normalized}' (possible obfuscation)"
                )
        self.generic_visit(node)

    def visit_Attribute(self, node: ast.Attribute) -> None:
        """Detect obj.eval, obj.exec patterns."""
        # Medium severity only: .compile / .open etc. are common legitimate
        # attribute names, so this is a context-dependent signal.
        if node.attr in self.DANGEROUS_BUILTINS:
            self._add_finding(
                f"ast_attr_{node.attr}", "medium", "obfuscation", node,
                f"Access to .{node.attr} attribute (context-dependent risk)"
            )
        self.generic_visit(node)
def analyze_python_ast(content: str, file_path: str) -> List["Finding"]:
    """
    Parse Python code and analyze its AST for security issues.

    Args:
        content: The Python source code to analyze
        file_path: Path to the file (for reporting)

    Returns:
        List of findings from AST analysis; empty if the source cannot be
        parsed at all (unparseable Python cannot execute either).
    """
    lines = content.split('\n')
    try:
        tree = ast.parse(content)
    except (SyntaxError, ValueError, RecursionError):
        # SyntaxError: plain invalid Python.
        # ValueError: ast.parse() raises this (not SyntaxError) for source
        #   containing null bytes — an uncaught crash here would let a
        #   crafted skill break the scanner mid-scan.
        # RecursionError: pathologically nested source (e.g. thousands of
        #   nested parentheses) can exhaust the compiler's recursion limit.
        return []
    analyzer = PythonSecurityAnalyzer(lines, file_path)
    analyzer.visit(tree)
    return analyzer.findings
# ---------------------------------------------------------------------------
# Scanning functions
@@ -529,7 +855,12 @@ INVISIBLE_CHARS = {
def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]:
"""
Scan a single file for threat patterns and invisible unicode characters.
Scan a single file for threat patterns, obfuscation, and invisible unicode.
Performs:
1. Invisible unicode character detection (on original content)
2. AST analysis for Python files (detects obfuscated execution patterns)
3. Regex pattern matching on normalized content (catches obfuscated variants)
Args:
file_path: Absolute path to the file
@@ -553,27 +884,7 @@ def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]:
lines = content.split('\n')
seen = set() # (pattern_id, line_number) for deduplication
# Regex pattern matching
for pattern, pid, severity, category, description in THREAT_PATTERNS:
for i, line in enumerate(lines, start=1):
if (pid, i) in seen:
continue
if re.search(pattern, line, re.IGNORECASE):
seen.add((pid, i))
matched_text = line.strip()
if len(matched_text) > 120:
matched_text = matched_text[:117] + "..."
findings.append(Finding(
pattern_id=pid,
severity=severity,
category=category,
file=rel_path,
line=i,
match=matched_text,
description=description,
))
# Invisible unicode character detection
# Step 1: Invisible unicode character detection (on original)
for i, line in enumerate(lines, start=1):
for char in INVISIBLE_CHARS:
if char in line:
@@ -589,6 +900,38 @@ def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]:
))
break # one finding per line for invisible chars
# Step 2: AST analysis for Python files
if file_path.suffix.lower() == '.py':
ast_findings = analyze_python_ast(content, rel_path)
findings.extend(ast_findings)
# Step 3: Normalize content and run regex patterns
# This catches obfuscated variants like Cyrillic homoglyphs, fullwidth, etc.
normalized_content = normalize_input(content)
normalized_lines = normalized_content.split('\n')
# Map normalized line numbers to original line numbers (they should match)
for pattern, pid, severity, category, description in THREAT_PATTERNS:
for i, norm_line in enumerate(normalized_lines, start=1):
if (pid, i) in seen:
continue
if re.search(pattern, norm_line, re.IGNORECASE):
seen.add((pid, i))
# Show original line content for context
original_line = lines[i - 1] if i <= len(lines) else norm_line
matched_text = original_line.strip()
if len(matched_text) > 120:
matched_text = matched_text[:117] + "..."
findings.append(Finding(
pattern_id=pid,
severity=severity,
category=category,
file=rel_path,
line=i,
match=matched_text,
description=description,
))
return findings
@@ -598,8 +941,17 @@ def scan_skill(skill_path: Path, source: str = "community") -> ScanResult:
Performs:
1. Structural checks (file count, total size, binary files, symlinks)
2. Regex pattern matching on all text files
3. Invisible unicode character detection
2. Unicode normalization to defeat obfuscation (NFKC, homoglyphs, zero-width)
3. AST analysis for Python files (detects dynamic execution patterns)
4. Regex pattern matching on normalized content
5. Invisible unicode character detection
V-011 Bypass Protection:
- Unicode homoglyphs (Cyrillic, fullwidth, Greek lookalikes)
- Zero-width character injection (U+200B, U+200C, U+200D, U+FEFF)
- Case manipulation (EvAl, ExEc)
- String concatenation obfuscation ('e'+'v'+'a'+'l')
- Dynamic execution patterns (globals()['eval'], getattr(__builtins__, 'exec'))
Args:
skill_path: Path to the skill directory (must contain SKILL.md)