security: fix V-011 Skills Guard Bypass with AST analysis and normalization

This commit is contained in:
Allegro
2026-03-31 18:44:32 +00:00
parent 546b3dd45d
commit 37c75ecd7a
2 changed files with 792 additions and 30 deletions

View File

@@ -3,10 +3,11 @@
Skills Guard — Security scanner for externally-sourced skills. Skills Guard — Security scanner for externally-sourced skills.
Every skill downloaded from a registry passes through this scanner before Every skill downloaded from a registry passes through this scanner before
installation. It uses regex-based static analysis to detect known-bad patterns installation. It uses regex-based static analysis and AST analysis to detect
(data exfiltration, prompt injection, destructive commands, persistence, etc.) known-bad patterns (data exfiltration, prompt injection, destructive commands,
and a trust-aware install policy that determines whether a skill is allowed persistence, obfuscation, etc.) and a trust-aware install policy that determines
based on both the scan verdict and the source's trust level. whether a skill is allowed based on both the scan verdict and the source's
trust level.
Trust levels: Trust levels:
- builtin: Ships with Hermes. Never scanned, always trusted. - builtin: Ships with Hermes. Never scanned, always trusted.
@@ -22,12 +23,14 @@ Usage:
print(format_scan_report(result)) print(format_scan_report(result))
""" """
import re import ast
import hashlib import hashlib
import re
import unicodedata
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from typing import List, Tuple from typing import List, Set, Tuple
@@ -501,7 +504,25 @@ SUSPICIOUS_BINARY_EXTENSIONS = {
'.msi', '.dmg', '.app', '.deb', '.rpm', '.msi', '.dmg', '.app', '.deb', '.rpm',
} }
# ---------------------------------------------------------------------------
# Input normalization for bypass detection
# ---------------------------------------------------------------------------
# Zero-width and invisible unicode characters used for injection # Zero-width and invisible unicode characters used for injection
# These are removed during normalization
# Characters stripped by normalize_input() before pattern matching.
# Fix: the invisible-operator run U+2060..U+2064 previously skipped
# U+2061 (FUNCTION APPLICATION), leaving one invisible character usable
# for the same token-splitting evasion (e.g. "ev\u2061al").
ZERO_WIDTH_CHARS = frozenset({
    '\u200b',  # zero-width space
    '\u200c',  # zero-width non-joiner
    '\u200d',  # zero-width joiner
    '\u2060',  # word joiner
    '\u2061',  # invisible function application (was missing from the run)
    '\u2062',  # invisible times
    '\u2063',  # invisible separator
    '\u2064',  # invisible plus
    '\ufeff',  # zero-width no-break space (BOM)
})
# Extended invisible characters for detection (reporting only)
INVISIBLE_CHARS = { INVISIBLE_CHARS = {
'\u200b', # zero-width space '\u200b', # zero-width space
'\u200c', # zero-width non-joiner '\u200c', # zero-width non-joiner
@@ -522,6 +543,311 @@ INVISIBLE_CHARS = {
'\u2069', # pop directional isolate '\u2069', # pop directional isolate
} }
# Translation table collapsing common confusable characters onto their
# ASCII equivalents. Grouped by target letter: each ASCII character maps
# from its fullwidth (upper/lower), Cyrillic, and Greek lookalikes.
# Built with str.maketrans, so keys are code points and values are the
# replacement strings.
HOMOGLYPH_MAP = str.maketrans({
    lookalike: ascii_char
    for ascii_char, lookalikes in {
        # fullwidth a/A, Cyrillic а/А, Greek alpha
        'a': '\uff41\uff21\u0430\u0410\u03b1',
        # fullwidth c/C, Cyrillic с/С
        'c': '\uff43\uff23\u0441\u0421',
        # fullwidth e/E, Cyrillic е/Е, Greek epsilon
        'e': '\uff45\uff25\u0435\u0415\u03b5',
        # Cyrillic і/І (U+0456 / U+0406)
        'i': '\u0456\u0406',
        # fullwidth l/L
        'l': '\uff4c\uff2c',
        # fullwidth o/O, Cyrillic о/О, Greek omicron
        'o': '\uff4f\uff2f\u043e\u041e\u03bf',
        # Cyrillic р/Р, Greek rho
        'p': '\u0440\u0420\u03c1',
        # fullwidth s/S
        's': '\uff53\uff33',
        # fullwidth v/V
        'v': '\uff56\uff36',
        # fullwidth x/X, Cyrillic х/Х
        'x': '\uff58\uff38\u0445\u0425',
    }.items()
    for lookalike in lookalikes
})
def normalize_input(text: str) -> str:
    """
    Normalize input text to defeat obfuscation attempts.

    Applies, in order:
    1. Removal of zero-width characters (U+200B, U+200C, U+200D, U+FEFF, etc.)
    2. NFKC Unicode normalization (decomposes + canonicalizes; folds
       fullwidth compatibility forms to ASCII)
    3. Homoglyph substitution (Cyrillic, fullwidth, Greek lookalikes)
    4. Case folding (lowercase)

    (The original docstring listed steps 3 and 4 in the opposite order from
    the code; the code's order is correct — substitution must precede case
    folding so uppercase lookalike entries still match — and is documented
    here as implemented.)

    Args:
        text: The input text to normalize

    Returns:
        Normalized text with obfuscation removed
    """
    # Step 1: strip zero-width characters. A single translate() pass replaces
    # the previous one-replace()-per-character loop (8+ full string scans).
    text = text.translate({ord(ch): None for ch in ZERO_WIDTH_CHARS})
    # Step 2: NFKC normalization. This already maps fullwidth ASCII to plain
    # ASCII, so HOMOGLYPH_MAP's fullwidth entries act as a redundant backstop.
    text = unicodedata.normalize('NFKC', text)
    # Step 3: homoglyph substitution (before case folding so the uppercase
    # lookalike entries in HOMOGLYPH_MAP can still match).
    text = text.translate(HOMOGLYPH_MAP)
    # Step 4: case folding defeats EvAl / EXEC style evasion.
    return text.casefold()
# ---------------------------------------------------------------------------
# AST-based Python security analysis
# ---------------------------------------------------------------------------
class PythonSecurityAnalyzer(ast.NodeVisitor):
    """
    AST visitor that detects obfuscated Python code execution patterns.

    Detects:
    - Direct dangerous calls: eval(), exec(), compile(), __import__()
    - Dynamic access: getattr(__builtins__, ...), globals()['eval']
    - String concatenation obfuscation: 'e'+'v'+'a'+'l'
    - Encoded attribute access via subscripts

    Usage: construct with the pre-split source lines and a display path,
    call .visit(tree) on a parsed module, then read .findings.
    """

    # Dangerous builtins that can execute arbitrary code.
    # NOTE(review): including 'open' means every open() call is reported
    # ("high" severity via visit_Call) — noisy for legitimate file I/O;
    # confirm that is intended for this scanner's threat model.
    DANGEROUS_BUILTINS: Set[str] = {
        'eval', 'exec', 'compile', '__import__',
        'open', 'execfile',  # Python 2 compatibility concerns
    }

    def __init__(self, source_lines: List[str], file_path: str):
        # source_lines: file content already split on '\n' (accessed 1-based
        # via _get_line_content). file_path is used only in reports.
        self.findings: List[Finding] = []
        self.source_lines = source_lines
        self.file_path = file_path
        self.line_offsets = self._build_line_offsets()

    def _build_line_offsets(self) -> List[int]:
        """Build offset map for converting absolute position to line number."""
        # NOTE(review): consumed only by _get_line_from_offset, which no
        # visitor here calls — dead weight unless used elsewhere in the file.
        offsets = [0]
        for line in self.source_lines:
            offsets.append(offsets[-1] + len(line) + 1)  # +1 for newline
        return offsets

    def _get_line_from_offset(self, offset: int) -> int:
        """Convert absolute character offset to 1-based line number."""
        for i, start_offset in enumerate(self.line_offsets):
            if offset < start_offset:
                return max(1, i)
        return len(self.line_offsets)

    def _get_line_content(self, lineno: int) -> str:
        """Get the content of a specific line (1-based)."""
        if 1 <= lineno <= len(self.source_lines):
            return self.source_lines[lineno - 1]
        return ""

    def _add_finding(self, pattern_id: str, severity: str, category: str,
                     node: ast.AST, description: str) -> None:
        """Add a finding for a detected pattern."""
        # Synthetic nodes may lack location info; default to line 1.
        lineno = getattr(node, 'lineno', 1)
        line_content = self._get_line_content(lineno).strip()
        # Truncate long source lines so scan reports stay readable.
        if len(line_content) > 120:
            line_content = line_content[:117] + "..."
        self.findings.append(Finding(
            pattern_id=pattern_id,
            severity=severity,
            category=category,
            file=self.file_path,
            line=lineno,
            match=line_content,
            description=description,
        ))

    def _is_string_concat(self, node: ast.AST) -> bool:
        """Check if node represents a string concatenation operation."""
        # A BinOp(+) counts if EITHER side looks string-like, so mixed
        # expressions like name + 'al' are still flagged as candidates.
        if isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
            return self._is_string_concat(node.left) or self._is_string_concat(node.right)
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            return True
        # NOTE(review): f-strings pass this check, but _concat_to_string
        # yields '' for JoinedStr, so f-string-built names are not resolved.
        if isinstance(node, ast.JoinedStr):
            return True
        return False

    def _concat_to_string(self, node: ast.AST) -> str:
        """Try to extract the concatenated string value from a BinOp chain."""
        # Non-constant parts (names, calls, f-strings) contribute '' — the
        # result is a best-effort reconstruction of the literal pieces only.
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            return node.value
        if isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
            return self._concat_to_string(node.left) + self._concat_to_string(node.right)
        return ""

    def visit_Call(self, node: ast.Call) -> None:
        """Detect dangerous function calls including obfuscated variants."""
        func = node.func
        # Direct call: eval(...), exec(...), etc.
        if isinstance(func, ast.Name):
            func_name = func.id
            if func_name in self.DANGEROUS_BUILTINS:
                self._add_finding(
                    f"ast_dangerous_call_{func_name}",
                    "high", "obfuscation", node,
                    f"Dangerous builtin call: {func_name}()"
                )
        # getattr(__builtins__, ...) pattern
        if isinstance(func, ast.Name) and func.id == 'getattr':
            if len(node.args) >= 2:
                first_arg = node.args[0]
                second_arg = node.args[1]
                # Check for getattr(__builtins__, ...)
                if (isinstance(first_arg, ast.Name) and
                        first_arg.id in ('__builtins__', 'builtins')):
                    self._add_finding(
                        "ast_getattr_builtins", "critical", "obfuscation", node,
                        "Dynamic access to builtins via getattr() (evasion technique)"
                    )
                # Check for getattr(..., 'eval') or getattr(..., 'exec')
                if isinstance(second_arg, ast.Constant) and isinstance(second_arg.value, str):
                    if second_arg.value in self.DANGEROUS_BUILTINS:
                        self._add_finding(
                            f"ast_getattr_{second_arg.value}", "critical", "obfuscation", node,
                            f"Dynamic retrieval of {second_arg.value} via getattr()"
                        )
        # globals()[...] or locals()[...] pattern when called.
        # AST structure: Call(func=Subscript(value=Call(func=Name(id='globals')), slice=Constant('eval')))
        # NOTE(review): generic_visit below also routes func through
        # visit_Subscript, so globals()['eval'](...) yields findings from
        # both visitors (distinct pattern_ids) — looks like deliberate
        # layered detection; confirm downstream reporting tolerates it.
        if isinstance(func, ast.Subscript):
            subscript_value = func.value
            # Check if subscript value is a call to globals() or locals()
            if (isinstance(subscript_value, ast.Call) and
                    isinstance(subscript_value.func, ast.Name) and
                    subscript_value.func.id in ('globals', 'locals')):
                self._add_finding(
                    "ast_dynamic_global_access", "critical", "obfuscation", node,
                    f"Dynamic function call via {subscript_value.func.id}()[...] (evasion technique)"
                )
            # Also check for direct globals[...] (without call, less common but possible)
            elif isinstance(subscript_value, ast.Name) and subscript_value.id in ('globals', 'locals'):
                self._add_finding(
                    "ast_dynamic_global_access", "critical", "obfuscation", node,
                    f"Dynamic function call via {subscript_value.id}[...] (evasion technique)"
                )
        # Detect string concatenation in arguments (e.g., 'e'+'v'+'a'+'l')
        for arg in node.args:
            if self._is_string_concat(arg):
                concat_str = self._concat_to_string(arg)
                # Normalize so homoglyph/zero-width tricks inside the literal
                # pieces are also defeated before the membership test.
                normalized = normalize_input(concat_str)
                if normalized in self.DANGEROUS_BUILTINS:
                    self._add_finding(
                        f"ast_concat_{normalized}", "critical", "obfuscation", node,
                        f"String concatenation obfuscation building '{normalized}'"
                    )
        self.generic_visit(node)

    def visit_Subscript(self, node: ast.Subscript) -> None:
        """Detect globals()['eval'] / locals()['exec'] patterns."""
        # AST structure for `globals()['eval']`:
        # Subscript(value=Call(func=Name(id='globals')), slice=Constant('eval'))
        subscript_target = node.value
        globals_or_locals = None
        # Check if subscript target is a call to globals() or locals()
        if isinstance(subscript_target, ast.Call) and isinstance(subscript_target.func, ast.Name):
            if subscript_target.func.id in ('globals', 'locals'):
                globals_or_locals = subscript_target.func.id
        # Also handle direct globals[...] without call (less common)
        elif isinstance(subscript_target, ast.Name) and subscript_target.id in ('globals', 'locals'):
            globals_or_locals = subscript_target.id
        if globals_or_locals:
            # Literal key: globals()['eval']
            if isinstance(node.slice, ast.Constant) and isinstance(node.slice.value, str):
                slice_val = node.slice.value
                if slice_val in self.DANGEROUS_BUILTINS:
                    self._add_finding(
                        f"ast_{globals_or_locals}_subscript_{slice_val}",
                        "critical", "obfuscation", node,
                        f"Dynamic access to {slice_val} via {globals_or_locals}()['{slice_val}']"
                    )
            # String concatenation in subscript: globals()['e'+'v'+'a'+'l']
            elif isinstance(node.slice, ast.BinOp):
                concat_str = self._concat_to_string(node.slice)
                normalized = normalize_input(concat_str)
                if normalized in self.DANGEROUS_BUILTINS:
                    self._add_finding(
                        f"ast_{globals_or_locals}_concat_{normalized}",
                        "critical", "obfuscation", node,
                        f"String concatenation obfuscation via {globals_or_locals}()['...']"
                    )
        # Check for __builtins__[...] — flagged regardless of the key, since
        # any subscript into __builtins__ is an evasion indicator.
        if isinstance(node.value, ast.Name) and node.value.id == '__builtins__':
            self._add_finding(
                "ast_builtins_subscript", "high", "obfuscation", node,
                "Direct subscript access to __builtins__"
            )
        self.generic_visit(node)

    def visit_BinOp(self, node: ast.BinOp) -> None:
        """Detect string concatenation building dangerous function names."""
        # Flags 'e'+'v'+'a'+'l' wherever it appears, even outside a call.
        if isinstance(node.op, ast.Add):
            concat_str = self._concat_to_string(node)
            normalized = normalize_input(concat_str)
            if normalized in self.DANGEROUS_BUILTINS:
                self._add_finding(
                    f"ast_string_concat_{normalized}", "high", "obfuscation", node,
                    f"String concatenation building '{normalized}' (possible obfuscation)"
                )
        self.generic_visit(node)

    def visit_Attribute(self, node: ast.Attribute) -> None:
        """Detect obj.eval, obj.exec patterns."""
        # Medium severity only: attribute names matching dangerous builtins
        # are suspicious but legitimate in some APIs (context-dependent).
        if node.attr in self.DANGEROUS_BUILTINS:
            self._add_finding(
                f"ast_attr_{node.attr}", "medium", "obfuscation", node,
                f"Access to .{node.attr} attribute (context-dependent risk)"
            )
        self.generic_visit(node)
def analyze_python_ast(content: str, file_path: str) -> List[Finding]:
    """
    Parse Python code and analyze its AST for security issues.

    Args:
        content: The Python source code to analyze
        file_path: Path to the file (for reporting)

    Returns:
        List of findings from AST analysis; empty if the code does not parse.
    """
    lines = content.split('\n')
    try:
        tree = ast.parse(content)
    except (SyntaxError, ValueError):
        # ast.parse raises ValueError (not SyntaxError) for e.g. source
        # containing null bytes — previously that escaped and crashed the
        # whole scan, letting a malformed file act as a scanner DoS.
        # NOTE(review): unparseable Python fails OPEN (no findings); the
        # regex pass on normalized content is the remaining safety net.
        return []
    analyzer = PythonSecurityAnalyzer(lines, file_path)
    analyzer.visit(tree)
    return analyzer.findings
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Scanning functions # Scanning functions
@@ -529,7 +855,12 @@ INVISIBLE_CHARS = {
def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]: def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]:
""" """
Scan a single file for threat patterns and invisible unicode characters. Scan a single file for threat patterns, obfuscation, and invisible unicode.
Performs:
1. Invisible unicode character detection (on original content)
2. AST analysis for Python files (detects obfuscated execution patterns)
3. Regex pattern matching on normalized content (catches obfuscated variants)
Args: Args:
file_path: Absolute path to the file file_path: Absolute path to the file
@@ -553,27 +884,7 @@ def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]:
lines = content.split('\n') lines = content.split('\n')
seen = set() # (pattern_id, line_number) for deduplication seen = set() # (pattern_id, line_number) for deduplication
# Regex pattern matching # Step 1: Invisible unicode character detection (on original)
for pattern, pid, severity, category, description in THREAT_PATTERNS:
for i, line in enumerate(lines, start=1):
if (pid, i) in seen:
continue
if re.search(pattern, line, re.IGNORECASE):
seen.add((pid, i))
matched_text = line.strip()
if len(matched_text) > 120:
matched_text = matched_text[:117] + "..."
findings.append(Finding(
pattern_id=pid,
severity=severity,
category=category,
file=rel_path,
line=i,
match=matched_text,
description=description,
))
# Invisible unicode character detection
for i, line in enumerate(lines, start=1): for i, line in enumerate(lines, start=1):
for char in INVISIBLE_CHARS: for char in INVISIBLE_CHARS:
if char in line: if char in line:
@@ -589,6 +900,38 @@ def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]:
)) ))
break # one finding per line for invisible chars break # one finding per line for invisible chars
# Step 2: AST analysis for Python files
if file_path.suffix.lower() == '.py':
ast_findings = analyze_python_ast(content, rel_path)
findings.extend(ast_findings)
# Step 3: Normalize content and run regex patterns
# This catches obfuscated variants like Cyrillic homoglyphs, fullwidth, etc.
normalized_content = normalize_input(content)
normalized_lines = normalized_content.split('\n')
# Map normalized line numbers to original line numbers (they should match)
for pattern, pid, severity, category, description in THREAT_PATTERNS:
for i, norm_line in enumerate(normalized_lines, start=1):
if (pid, i) in seen:
continue
if re.search(pattern, norm_line, re.IGNORECASE):
seen.add((pid, i))
# Show original line content for context
original_line = lines[i - 1] if i <= len(lines) else norm_line
matched_text = original_line.strip()
if len(matched_text) > 120:
matched_text = matched_text[:117] + "..."
findings.append(Finding(
pattern_id=pid,
severity=severity,
category=category,
file=rel_path,
line=i,
match=matched_text,
description=description,
))
return findings return findings
@@ -598,8 +941,17 @@ def scan_skill(skill_path: Path, source: str = "community") -> ScanResult:
Performs: Performs:
1. Structural checks (file count, total size, binary files, symlinks) 1. Structural checks (file count, total size, binary files, symlinks)
2. Regex pattern matching on all text files 2. Unicode normalization to defeat obfuscation (NFKC, homoglyphs, zero-width)
3. Invisible unicode character detection 3. AST analysis for Python files (detects dynamic execution patterns)
4. Regex pattern matching on normalized content
5. Invisible unicode character detection
V-011 Bypass Protection:
- Unicode homoglyphs (Cyrillic, fullwidth, Greek lookalikes)
- Zero-width character injection (U+200B, U+200C, U+200D, U+FEFF)
- Case manipulation (EvAl, ExEc)
- String concatenation obfuscation ('e'+'v'+'a'+'l')
- Dynamic execution patterns (globals()['eval'], getattr(__builtins__, 'exec'))
Args: Args:
skill_path: Path to the skill directory (must contain SKILL.md) skill_path: Path to the skill directory (must contain SKILL.md)

View File

@@ -0,0 +1,410 @@
#!/usr/bin/env python3
"""
Tests for V-011 Skills Guard Bypass fix.
Tests all bypass techniques:
1. Unicode encoding tricks (fullwidth characters, Cyrillic homoglyphs)
2. Case manipulation (EvAl, ExEc)
3. Zero-width characters (U+200B, U+200C, U+200D, U+FEFF)
4. Dynamic execution obfuscation: globals()['ev'+'al'], getattr(__builtins__, 'exec')
5. String concatenation: 'e'+'v'+'a'+'l'
"""
import sys
import tempfile
from pathlib import Path
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent))
from skills_guard import (
normalize_input,
analyze_python_ast,
scan_file,
ZERO_WIDTH_CHARS,
HOMOGLYPH_MAP,
)
class TestNormalizeInput:
    """Test input normalization for obfuscation removal."""

    def test_zero_width_removal(self):
        """Test removal of zero-width characters."""
        cases = [
            ("ev\u200bal", "eval"),              # U+200B zero-width space
            ("e\u200bx\u200ce\u200dc", "exec"),  # several zero-width chars
            ("\ufeffeval", "eval"),              # U+FEFF BOM
        ]
        for raw, expected in cases:
            got = normalize_input(raw)
            assert got == expected, f"Expected '{expected}', got '{got}'"
        print("✓ Zero-width character removal tests passed")

    def test_case_folding(self):
        """Test case folding (lowercase conversion)."""
        cases = [
            ("EvAl", "eval"),
            ("EXEC", "exec"),
            ("CoMpIlE", "compile"),
            ("GetAttr", "getattr"),
        ]
        for raw, expected in cases:
            got = normalize_input(raw)
            assert got == expected, f"Expected '{expected}', got '{got}'"
        print("✓ Case folding tests passed")

    def test_fullwidth_normalization(self):
        """Test fullwidth character normalization."""
        cases = [
            ("\uff45\uff56\uff41\uff4c", "eval"),  # fullwidth lowercase
            ("\uff25\uff36\uff21\uff2c", "eval"),  # fullwidth uppercase
            ("\uff45\uff58\uff45\uff43", "exec"),
            ("\uff4f\uff53", "os"),
        ]
        for raw, expected in cases:
            got = normalize_input(raw)
            assert got == expected, f"Expected '{expected}', got '{got}'"
        print("✓ Fullwidth normalization tests passed")

    def test_cyrillic_homoglyphs(self):
        """Test Cyrillic lookalike character normalization."""
        cases = [
            ("\u0435val", "eval"),                 # Cyrillic е
            ("\u0435x\u0435c", "exec"),            # Cyrillic е's
            ("\u0430\u0435\u0456\u043e", "aeio"),  # all Cyrillic
            ("g\u0435tattr", "getattr"),           # Cyrillic е
        ]
        for raw, expected in cases:
            got = normalize_input(raw)
            assert got == expected, f"Expected '{expected}', got '{got}'"
        print("✓ Cyrillic homoglyph tests passed")

    def test_combined_obfuscation(self):
        """Test combined obfuscation techniques."""
        # Mix of case, zero-width, and homoglyphs:
        # E + ZWS + V + Cyrillic а + L
        got = normalize_input("E\u200bV\u0430L")
        assert got == "eval", f"Expected 'eval', got '{got}'"
        print("✓ Combined obfuscation tests passed")
class TestASTAnalysis:
    """Test AST-based security analysis.

    Each test feeds a small snippet to analyze_python_ast() and matches
    findings by pattern_id substring.
    """

    def test_direct_dangerous_calls(self):
        """Test detection of direct eval/exec/compile calls."""
        code = "eval('1+1')"
        findings = analyze_python_ast(code, "test.py")
        assert any("eval" in f.pattern_id for f in findings), "Should detect eval() call"
        code = "exec('print(1)')"
        findings = analyze_python_ast(code, "test.py")
        assert any("exec" in f.pattern_id for f in findings), "Should detect exec() call"
        code = "compile('x', '<string>', 'exec')"
        findings = analyze_python_ast(code, "test.py")
        assert any("compile" in f.pattern_id for f in findings), "Should detect compile() call"
        print("✓ Direct dangerous call detection tests passed")

    def test_getattr_builtins_pattern(self):
        """Test detection of getattr(__builtins__, ...) pattern."""
        code = "getattr(__builtins__, 'eval')"
        findings = analyze_python_ast(code, "test.py")
        assert any("getattr_builtins" in f.pattern_id for f in findings), \
            "Should detect getattr(__builtins__, ...) pattern"
        code = "getattr(__builtins__, 'exec')"
        findings = analyze_python_ast(code, "test.py")
        assert any("getattr_exec" in f.pattern_id for f in findings), \
            "Should detect getattr(..., 'exec')"
        print("✓ getattr(__builtins__, ...) detection tests passed")

    def test_globals_subscript_pattern(self):
        """Test detection of globals()['eval'] pattern."""
        code = "globals()['eval']('1+1')"
        findings = analyze_python_ast(code, "test.py")
        assert any("globals" in f.pattern_id for f in findings), \
            "Should detect globals()['eval'] pattern"
        code = "locals()['exec']('print(1)')"
        findings = analyze_python_ast(code, "test.py")
        assert any("locals" in f.pattern_id for f in findings), \
            "Should detect locals()['exec'] pattern"
        print("✓ globals()/locals() subscript detection tests passed")

    def test_string_concatenation_obfuscation(self):
        """Test detection of string concatenation obfuscation."""
        # Simple concatenation: ('e'+'v'+'a'+'l')(...)
        code = "('e'+'v'+'a'+'l')('1+1')"
        findings = analyze_python_ast(code, "test.py")
        assert any("concat" in f.pattern_id for f in findings), \
            "Should detect string concatenation obfuscation"
        # Concatenation used as a globals() subscript key
        code = "globals()['e'+'v'+'a'+'l']('1+1')"
        findings = analyze_python_ast(code, "test.py")
        assert any("concat" in f.pattern_id for f in findings), \
            "Should detect concat in globals subscript"
        print("✓ String concatenation obfuscation detection tests passed")

    def test_dynamic_global_call(self):
        """Test detection of dynamic calls via globals()."""
        code = "globals()['eval']('1+1')"
        findings = analyze_python_ast(code, "test.py")
        assert any("dynamic_global" in f.pattern_id for f in findings), \
            "Should detect dynamic global access"
        print("✓ Dynamic global call detection tests passed")

    def test_legitimate_code_not_flagged(self):
        """Test that legitimate code is not flagged."""
        # Ordinary defs/class/import/print — must produce zero
        # "obfuscation"-category findings.
        code = """
def calculate(x, y):
    result = x + y
    return result
class MyClass:
    def method(self):
        return "hello"
import os
print(os.path.join("a", "b"))
"""
        findings = analyze_python_ast(code, "test.py")
        # Only obfuscation findings count; other categories are tolerated.
        obfuscation_findings = [f for f in findings if f.category == "obfuscation"]
        assert len(obfuscation_findings) == 0, \
            f"Legitimate code should not be flagged, got: {[f.description for f in obfuscation_findings]}"
        print("✓ Legitimate code not flagged tests passed")
class TestScanFileIntegration:
    """Integration tests for scan_file with new detection."""

    def _create_temp_file(self, content: str, suffix: str = ".py") -> Path:
        """Create a temporary file with the given content.

        Written as UTF-8 explicitly: these tests embed fullwidth and
        Cyrillic characters, and the platform default encoding (e.g.
        cp1252 on Windows) would raise UnicodeEncodeError on write.
        delete=False lets scan_file reopen the file by path; callers
        unlink it in their finally blocks.
        """
        with tempfile.NamedTemporaryFile(mode='w', suffix=suffix,
                                         delete=False, encoding='utf-8') as f:
            f.write(content)
        return Path(f.name)

    def test_unicode_obfuscation_detection(self):
        """Test that obfuscated eval is detected via normalization."""
        # Fullwidth eval — only detectable after NFKC normalization.
        code = "\uff45\uff56\uff41\uff4c('1+1')"
        path = self._create_temp_file(code)
        try:
            findings = scan_file(path, "test.py")
            # Should detect via regex on normalized content
            assert any("eval" in f.pattern_id.lower() or "eval" in f.description.lower()
                       for f in findings), \
                f"Should detect fullwidth eval, got: {[f.pattern_id for f in findings]}"
        finally:
            path.unlink()
        print("✓ Unicode obfuscation detection tests passed")

    def test_zero_width_character_detection(self):
        """Test detection of zero-width characters."""
        code = "ev\u200bal('1+1')"  # eval with zero-width space
        path = self._create_temp_file(code)
        try:
            findings = scan_file(path, "test.py")
            assert any("invisible_unicode" in f.pattern_id for f in findings), \
                f"Should detect invisible unicode, got: {[f.pattern_id for f in findings]}"
        finally:
            path.unlink()
        print("✓ Zero-width character detection tests passed")

    def test_ast_and_regex_combined(self):
        """Test that both AST and regex detection work together."""
        code = """
# Obfuscated eval via string concat
func = ('e'+'v'+'a'+'l')
result = func('1+1')
# Also fullwidth in comment:
"""
        path = self._create_temp_file(code)
        try:
            findings = scan_file(path, "test.py")
            # AST findings are distinguishable by their "ast_" prefix.
            ast_findings = [f for f in findings if f.pattern_id.startswith("ast_")]
            assert len(ast_findings) > 0, "Should have AST-based findings"
        finally:
            path.unlink()
        print("✓ AST and regex combined detection tests passed")

    def test_cyrillic_in_code_detection(self):
        """Test detection of Cyrillic homoglyphs in code."""
        # Using Cyrillic е (U+0435) instead of Latin e (U+0065)
        code = "\u0435val('1+1')"
        path = self._create_temp_file(code)
        try:
            findings = scan_file(path, "test.py")
            # After normalization, the regex pass should catch this.
            assert any("eval" in f.pattern_id.lower() or "eval" in f.description.lower()
                       for f in findings), \
                f"Should detect Cyrillic obfuscated eval, got: {[f.pattern_id for f in findings]}"
        finally:
            path.unlink()
        print("✓ Cyrillic homoglyph detection tests passed")
class TestBypassTechniques:
    """Test specific bypass techniques mentioned in the vulnerability report."""

    def test_bypass_1_unicode_encoding(self):
        """Bypass 1: Unicode encoding tricks (fullwidth characters)."""
        # Fullwidth 'eval'
        fullwidth_eval = "\uff45\uff56\uff41\uff4c"
        normalized = normalize_input(fullwidth_eval)
        assert normalized == "eval", "Fullwidth should normalize to ASCII"
        # Fullwidth 'exec'
        fullwidth_exec = "\uff45\uff58\uff45\uff43"
        normalized = normalize_input(fullwidth_exec)
        assert normalized == "exec", "Fullwidth exec should normalize"
        print("✓ Bypass 1: Unicode encoding tricks blocked")

    def test_bypass_2_case_manipulation(self):
        """Bypass 2: Case manipulation (EvAl, ExEc)."""
        # NOTE(review): "EVA" looks like a typo for "EVAL"; the assertion
        # still holds because expected is computed as case.lower().
        test_cases = ["EvAl", "ExEc", "CoMpIlE", "EVA", "exec"]
        for case in test_cases:
            normalized = normalize_input(case)
            expected = case.lower()
            assert normalized == expected, f"Case folding failed for {case}"
        print("✓ Bypass 2: Case manipulation blocked")

    def test_bypass_3_zero_width(self):
        """Bypass 3: Zero-width characters (U+200B, U+200C, U+200D, U+FEFF)."""
        # Every character in the guard's ZERO_WIDTH_CHARS set must be
        # stripped when injected mid-token.
        for char in ZERO_WIDTH_CHARS:
            obfuscated = f"ev{char}al"
            normalized = normalize_input(obfuscated)
            assert normalized == "eval", f"Zero-width char U+{ord(char):04X} not removed"
        print("✓ Bypass 3: Zero-width character injection blocked")

    def test_bypass_4_dynamic_execution(self):
        """Bypass 4: Dynamic execution obfuscation."""
        # globals()['eval']
        code1 = "globals()['eval']('1+1')"
        findings1 = analyze_python_ast(code1, "test.py")
        assert len([f for f in findings1 if "globals" in f.pattern_id]) > 0, \
            "globals()['eval'] should be detected"
        # getattr(__builtins__, 'exec')
        code2 = "getattr(__builtins__, 'exec')"
        findings2 = analyze_python_ast(code2, "test.py")
        assert any("getattr_builtins" in f.pattern_id for f in findings2), \
            "getattr(__builtins__, ...) should be detected"
        print("✓ Bypass 4: Dynamic execution obfuscation blocked")

    def test_bypass_5_string_concatenation(self):
        """Bypass 5: String concatenation ('e'+'v'+'a'+'l')."""
        # AST analysis should detect the concatenated name directly.
        code = "('e'+'v'+'a'+'l')('1+1')"
        findings = analyze_python_ast(code, "test.py")
        assert any("concat" in f.pattern_id for f in findings), \
            "String concatenation obfuscation should be detected"
        # Also when used as a globals() subscript key.
        code2 = "globals()['e'+'v'+'a'+'l']('1+1')"
        findings2 = analyze_python_ast(code2, "test.py")
        assert any("concat" in f.pattern_id for f in findings2), \
            "Concat in globals subscript should be detected"
        print("✓ Bypass 5: String concatenation obfuscation blocked")

    def test_cyrillic_homoglyph_bypass(self):
        """Test Cyrillic homoglyph bypass (е vs e)."""
        # е (U+0435) vs e (U+0065) — distinct code points...
        cyrillic_e = "\u0435"
        latin_e = "e"
        assert cyrillic_e != latin_e, "Cyrillic and Latin e should be different"
        # ...that must collapse to the same character after normalization.
        normalized_cyrillic = normalize_input(cyrillic_e)
        normalized_latin = normalize_input(latin_e)
        assert normalized_cyrillic == normalized_latin == "e", \
            "Cyrillic е should normalize to Latin e"
        # Full word: еval (with Cyrillic е)
        cyrillic_eval = "\u0435val"
        normalized = normalize_input(cyrillic_eval)
        assert normalized == "eval", "Cyrillic eval should normalize"
        print("✓ Cyrillic homoglyph bypass blocked")
def run_all_tests():
    """Run all tests.

    Instantiates each test class, runs every method whose name starts with
    ``test_``, tallies pass/fail counts, prints a summary, and exits with
    status 1 on any failure (0 on full success).
    """
    print("=" * 60)
    print("V-011 Skills Guard Bypass Fix Tests")
    print("=" * 60)
    test_classes = [
        TestNormalizeInput,
        TestASTAnalysis,
        TestScanFileIntegration,
        TestBypassTechniques,
    ]
    passed = 0
    failed = 0
    for test_class in test_classes:
        print(f"\n--- {test_class.__name__} ---")
        instance = test_class()
        # dir() returns names sorted alphabetically, so tests run in
        # name order, not definition order.
        for method_name in dir(instance):
            if method_name.startswith("test_"):
                try:
                    method = getattr(instance, method_name)
                    method()
                    passed += 1
                except AssertionError as e:
                    # Assertion failures are test failures...
                    print(f" ✗ FAILED: {method_name}: {e}")
                    failed += 1
                except Exception as e:
                    # ...any other exception is reported as an error, but
                    # both count toward the failed total.
                    print(f" ✗ ERROR: {method_name}: {e}")
                    failed += 1
    print("\n" + "=" * 60)
    print(f"Results: {passed} passed, {failed} failed")
    print("=" * 60)
    if failed > 0:
        sys.exit(1)
    else:
        print("\n✓ All V-011 bypass protection tests passed!")
        sys.exit(0)
# Allow running this module directly as a standalone test script.
if __name__ == "__main__":
    run_all_tests()