security: fix V-011 Skills Guard Bypass with AST analysis and normalization

This commit is contained in:
Allegro
2026-03-31 18:44:32 +00:00
parent 546b3dd45d
commit 37c75ecd7a
2 changed files with 792 additions and 30 deletions

View File

@@ -3,10 +3,11 @@
Skills Guard — Security scanner for externally-sourced skills.
Every skill downloaded from a registry passes through this scanner before
installation. It uses regex-based static analysis to detect known-bad patterns
(data exfiltration, prompt injection, destructive commands, persistence, etc.)
and a trust-aware install policy that determines whether a skill is allowed
based on both the scan verdict and the source's trust level.
installation. It uses regex-based static analysis and AST analysis to detect
known-bad patterns (data exfiltration, prompt injection, destructive commands,
persistence, obfuscation, etc.) and a trust-aware install policy that determines
whether a skill is allowed based on both the scan verdict and the source's
trust level.
Trust levels:
- builtin: Ships with Hermes. Never scanned, always trusted.
@@ -22,12 +23,14 @@ Usage:
print(format_scan_report(result))
"""
import re
import ast
import hashlib
import re
import unicodedata
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Tuple
from typing import List, Set, Tuple
@@ -501,7 +504,25 @@ SUSPICIOUS_BINARY_EXTENSIONS = {
'.msi', '.dmg', '.app', '.deb', '.rpm',
}
# ---------------------------------------------------------------------------
# Input normalization for bypass detection
# ---------------------------------------------------------------------------

# Zero-width / invisible code points that attackers splice into identifiers
# (e.g. "ev\u200bal") to dodge pattern matching. All of these are stripped
# during normalization. frozenset() of a string yields one entry per char.
ZERO_WIDTH_CHARS = frozenset(
    '\u200b'   # zero-width space
    '\u200c'   # zero-width non-joiner
    '\u200d'   # zero-width joiner
    '\u2060'   # word joiner
    '\u2062'   # invisible times
    '\u2063'   # invisible separator
    '\u2064'   # invisible plus
    '\ufeff'   # zero-width no-break space (BOM)
)
# Extended invisible characters for detection (reporting only)
INVISIBLE_CHARS = {
'\u200b', # zero-width space
'\u200c', # zero-width non-joiner
@@ -522,6 +543,311 @@ INVISIBLE_CHARS = {
'\u2069', # pop directional isolate
}
# Unicode homoglyph mapping for common confusable characters.
# Each group pairs lookalike characters (position i of the first string) with
# the ASCII letter they normalize to (position i of the second string), so a
# new confusable is added by extending a pair rather than hand-writing a
# per-character dict entry.
_CONFUSABLE_GROUPS = (
    # Fullwidth Latin lowercase: ｅ ｖ ａ ｌ ｏ ｓ ｘ ｃ
    ('\uff45\uff56\uff41\uff4c\uff4f\uff53\uff58\uff43', 'evalosxc'),
    # Fullwidth Latin uppercase: Ｅ Ｖ Ａ Ｌ Ｏ Ｓ Ｘ Ｃ (targets lowercased)
    ('\uff25\uff36\uff21\uff2c\uff2f\uff33\uff38\uff23', 'evalosxc'),
    # Cyrillic lowercase: е а о с х р і
    ('\u0435\u0430\u043e\u0441\u0445\u0440\u0456', 'eaocxpi'),
    # Cyrillic uppercase: Е А О С Х Р І (targets lowercased)
    ('\u0415\u0410\u041e\u0421\u0425\u0420\u0406', 'eaocxpi'),
    # Greek lowercase: ο ρ α ε
    ('\u03bf\u03c1\u03b1\u03b5', 'opae'),
)

# str.maketrans over the flattened pairs; values stay single-character
# strings, exactly as a literal {lookalike: ascii_char} dict would produce.
HOMOGLYPH_MAP = str.maketrans({
    lookalike: ascii_char
    for lookalikes, ascii_chars in _CONFUSABLE_GROUPS
    for lookalike, ascii_char in zip(lookalikes, ascii_chars)
})
# Precomputed translate() table mapping every zero-width code point to None
# (deletion). Built once at import time instead of per call.
_ZERO_WIDTH_DELETE_TABLE = dict.fromkeys(map(ord, ZERO_WIDTH_CHARS))


def normalize_input(text: str) -> str:
    """
    Normalize input text to defeat obfuscation attempts.

    Pipeline (order matters):
      1. Remove zero-width characters (U+200B/C/D, U+2060, U+2062-64, U+FEFF)
         in a single translate() pass rather than one replace() scan per char.
      2. NFKC Unicode normalization (decomposes + canonicalizes; folds
         compatibility forms such as fullwidth letters toward ASCII).
      3. Homoglyph substitution for lookalikes NFKC leaves alone
         (Cyrillic, Greek; fullwidth entries are kept for defense in depth).
      4. Case folding, so "EvAl" matches lowercase patterns.

    Args:
        text: The input text to normalize

    Returns:
        Normalized text with obfuscation removed
    """
    # Step 1: strip zero-width characters (single pass over the string).
    text = text.translate(_ZERO_WIDTH_DELETE_TABLE)
    # Step 2: NFKC normalization (decomposes characters, canonicalizes).
    text = unicodedata.normalize('NFKC', text)
    # Step 3: homoglyph substitution (before case folding, so uppercase
    # lookalikes map through their lowercase ASCII targets).
    text = text.translate(HOMOGLYPH_MAP)
    # Step 4: case folding (aggressive lowercase).
    return text.casefold()
# ---------------------------------------------------------------------------
# AST-based Python security analysis
# ---------------------------------------------------------------------------
class PythonSecurityAnalyzer(ast.NodeVisitor):
    """
    AST visitor that detects obfuscated Python code execution patterns.

    Walks a parsed module and records a Finding for each construct that looks
    like hidden code execution. Each finding carries the node's 1-based line
    number and a truncated echo of that source line for the report.

    Detects:
    - Direct dangerous calls: eval(), exec(), compile(), __import__()
    - Dynamic access: getattr(__builtins__, ...), globals()['eval']
    - String concatenation obfuscation: 'e'+'v'+'a'+'l'
    - Encoded attribute access via subscripts
    """

    # Builtins that can execute arbitrary code (or, for open(), touch files).
    # Matched against direct call names, getattr() string arguments,
    # globals()/locals() subscripts, and reassembled concat strings.
    DANGEROUS_BUILTINS: Set[str] = {
        'eval', 'exec', 'compile', '__import__',
        'open', 'execfile',  # Python 2 compatibility concerns
    }

    def __init__(self, source_lines: List[str], file_path: str):
        # Findings accumulate in visit order.
        self.findings: List[Finding] = []
        # Original source split on '\n'; used to echo line content in findings.
        self.source_lines = source_lines
        # Relative path recorded in each Finding.
        self.file_path = file_path
        # NOTE(review): this offset map is built but no visitor below appears
        # to use it — findings rely on node.lineno. Confirm before removing.
        self.line_offsets = self._build_line_offsets()

    def _build_line_offsets(self) -> List[int]:
        """Build offset map for converting absolute position to line number."""
        offsets = [0]
        for line in self.source_lines:
            offsets.append(offsets[-1] + len(line) + 1)  # +1 for newline
        return offsets

    def _get_line_from_offset(self, offset: int) -> int:
        """Convert absolute character offset to 1-based line number."""
        # Linear scan over line-start offsets; result clamped to >= 1, and to
        # the last line when the offset is past the end of the source.
        for i, start_offset in enumerate(self.line_offsets):
            if offset < start_offset:
                return max(1, i)
        return len(self.line_offsets)

    def _get_line_content(self, lineno: int) -> str:
        """Get the content of a specific line (1-based)."""
        if 1 <= lineno <= len(self.source_lines):
            return self.source_lines[lineno - 1]
        return ""

    def _add_finding(self, pattern_id: str, severity: str, category: str,
                     node: ast.AST, description: str) -> None:
        """Add a finding for a detected pattern."""
        # Some synthetic nodes lack a lineno; fall back to line 1.
        lineno = getattr(node, 'lineno', 1)
        line_content = self._get_line_content(lineno).strip()
        # Keep report lines readable: truncate to 117 chars + "...".
        if len(line_content) > 120:
            line_content = line_content[:117] + "..."
        self.findings.append(Finding(
            pattern_id=pattern_id,
            severity=severity,
            category=category,
            file=self.file_path,
            line=lineno,
            match=line_content,
            description=description,
        ))

    def _is_string_concat(self, node: ast.AST) -> bool:
        """Check if node represents a string concatenation operation."""
        # BinOp(+): string concat if EITHER side looks string-like (an
        # obfuscated chain may mix constants with names/calls).
        if isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
            return self._is_string_concat(node.left) or self._is_string_concat(node.right)
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            return True
        # f-strings count as string-producing expressions.
        if isinstance(node, ast.JoinedStr):
            return True
        return False

    def _concat_to_string(self, node: ast.AST) -> str:
        """Try to extract the concatenated string value from a BinOp chain."""
        # Only literal str constants contribute; anything dynamic (names,
        # calls, f-strings) contributes "" — a best-effort reconstruction.
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            return node.value
        if isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
            return self._concat_to_string(node.left) + self._concat_to_string(node.right)
        return ""

    def visit_Call(self, node: ast.Call) -> None:
        """Detect dangerous function calls including obfuscated variants."""
        func = node.func
        # Direct call: eval(...), exec(...), etc.
        if isinstance(func, ast.Name):
            func_name = func.id
            if func_name in self.DANGEROUS_BUILTINS:
                self._add_finding(
                    f"ast_dangerous_call_{func_name}",
                    "high", "obfuscation", node,
                    f"Dangerous builtin call: {func_name}()"
                )
        # getattr(__builtins__, ...) pattern
        if isinstance(func, ast.Name) and func.id == 'getattr':
            if len(node.args) >= 2:
                first_arg = node.args[0]
                second_arg = node.args[1]
                # Check for getattr(__builtins__, ...)
                if (isinstance(first_arg, ast.Name) and
                        first_arg.id in ('__builtins__', 'builtins')):
                    self._add_finding(
                        "ast_getattr_builtins", "critical", "obfuscation", node,
                        "Dynamic access to builtins via getattr() (evasion technique)"
                    )
                # Check for getattr(..., 'eval') or getattr(..., 'exec')
                if isinstance(second_arg, ast.Constant) and isinstance(second_arg.value, str):
                    if second_arg.value in self.DANGEROUS_BUILTINS:
                        self._add_finding(
                            f"ast_getattr_{second_arg.value}", "critical", "obfuscation", node,
                            f"Dynamic retrieval of {second_arg.value} via getattr()"
                        )
        # globals()[...] or locals()[...] pattern when called
        # AST structure: Call(func=Subscript(value=Call(func=Name(id='globals')), slice=Constant('eval')))
        if isinstance(func, ast.Subscript):
            subscript_value = func.value
            # Check if subscript value is a call to globals() or locals()
            if (isinstance(subscript_value, ast.Call) and
                    isinstance(subscript_value.func, ast.Name) and
                    subscript_value.func.id in ('globals', 'locals')):
                self._add_finding(
                    "ast_dynamic_global_access", "critical", "obfuscation", node,
                    f"Dynamic function call via {subscript_value.func.id}()[...] (evasion technique)"
                )
            # Also check for direct globals[...] (without call, less common but possible)
            elif isinstance(subscript_value, ast.Name) and subscript_value.id in ('globals', 'locals'):
                self._add_finding(
                    "ast_dynamic_global_access", "critical", "obfuscation", node,
                    f"Dynamic function call via {subscript_value.id}[...] (evasion technique)"
                )
        # Detect string concatenation in arguments (e.g. 'e'+'v'+'a'+'l').
        # NOTE(review): such an argument is reported here (critical) AND again
        # by visit_BinOp (high) via generic_visit below — two findings with
        # different pattern ids; confirm the duplication is intended.
        for arg in node.args:
            if self._is_string_concat(arg):
                concat_str = self._concat_to_string(arg)
                normalized = normalize_input(concat_str)
                if normalized in self.DANGEROUS_BUILTINS:
                    self._add_finding(
                        f"ast_concat_{normalized}", "critical", "obfuscation", node,
                        f"String concatenation obfuscation building '{normalized}'"
                    )
        # Continue into child nodes (arguments, nested calls, ...).
        self.generic_visit(node)

    def visit_Subscript(self, node: ast.Subscript) -> None:
        """Detect globals()['eval'] / locals()['exec'] patterns."""
        # Check for globals()[...] or locals()[...]
        # AST structure for `globals()['eval']`: Subscript(value=Call(func=Name(id='globals')), slice=Constant('eval'))
        subscript_target = node.value
        globals_or_locals = None
        # Check if subscript target is a call to globals() or locals()
        if isinstance(subscript_target, ast.Call) and isinstance(subscript_target.func, ast.Name):
            if subscript_target.func.id in ('globals', 'locals'):
                globals_or_locals = subscript_target.func.id
        # Also handle direct globals[...] without call (less common)
        elif isinstance(subscript_target, ast.Name) and subscript_target.id in ('globals', 'locals'):
            globals_or_locals = subscript_target.id
        if globals_or_locals:
            # Literal key: globals()['eval']
            if isinstance(node.slice, ast.Constant) and isinstance(node.slice.value, str):
                slice_val = node.slice.value
                if slice_val in self.DANGEROUS_BUILTINS:
                    self._add_finding(
                        f"ast_{globals_or_locals}_subscript_{slice_val}",
                        "critical", "obfuscation", node,
                        f"Dynamic access to {slice_val} via {globals_or_locals}()['{slice_val}']"
                    )
            # String concatenation in subscript: globals()['e'+'v'+'a'+'l']
            elif isinstance(node.slice, ast.BinOp):
                concat_str = self._concat_to_string(node.slice)
                normalized = normalize_input(concat_str)
                if normalized in self.DANGEROUS_BUILTINS:
                    self._add_finding(
                        f"ast_{globals_or_locals}_concat_{normalized}",
                        "critical", "obfuscation", node,
                        f"String concatenation obfuscation via {globals_or_locals}()['...']"
                    )
        # Check for __builtins__[...]
        if isinstance(node.value, ast.Name) and node.value.id == '__builtins__':
            self._add_finding(
                "ast_builtins_subscript", "high", "obfuscation", node,
                "Direct subscript access to __builtins__"
            )
        self.generic_visit(node)

    def visit_BinOp(self, node: ast.BinOp) -> None:
        """Detect string concatenation building dangerous function names."""
        if isinstance(node.op, ast.Add):
            concat_str = self._concat_to_string(node)
            # normalize_input() defeats homoglyph/zero-width/case tricks in
            # the reassembled literal before the lookup.
            normalized = normalize_input(concat_str)
            if normalized in self.DANGEROUS_BUILTINS:
                self._add_finding(
                    f"ast_string_concat_{normalized}", "high", "obfuscation", node,
                    f"String concatenation building '{normalized}' (possible obfuscation)"
                )
        self.generic_visit(node)

    def visit_Attribute(self, node: ast.Attribute) -> None:
        """Detect obj.eval, obj.exec patterns."""
        # Medium severity only: .compile / .open etc. are common legitimate
        # attribute names, so this is a context-dependent signal.
        if node.attr in self.DANGEROUS_BUILTINS:
            self._add_finding(
                f"ast_attr_{node.attr}", "medium", "obfuscation", node,
                f"Access to .{node.attr} attribute (context-dependent risk)"
            )
        self.generic_visit(node)
def analyze_python_ast(content: str, file_path: str) -> List["Finding"]:
    """
    Parse Python code and analyze its AST for security issues.

    Args:
        content: The Python source code to analyze
        file_path: Path to the file (for reporting)

    Returns:
        List of findings from AST analysis; empty if the source cannot be
        parsed at all (unparseable Python cannot execute either).
    """
    lines = content.split('\n')
    try:
        tree = ast.parse(content)
    except (SyntaxError, ValueError, RecursionError):
        # SyntaxError: plain invalid Python.
        # ValueError: ast.parse() raises this (not SyntaxError) for source
        #   containing null bytes — an uncaught crash here would let a
        #   crafted skill break the scanner mid-scan.
        # RecursionError: pathologically nested source (e.g. thousands of
        #   nested parentheses) can exhaust the compiler's recursion limit.
        return []
    analyzer = PythonSecurityAnalyzer(lines, file_path)
    analyzer.visit(tree)
    return analyzer.findings
# ---------------------------------------------------------------------------
# Scanning functions
@@ -529,7 +855,12 @@ INVISIBLE_CHARS = {
def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]:
"""
Scan a single file for threat patterns and invisible unicode characters.
Scan a single file for threat patterns, obfuscation, and invisible unicode.
Performs:
1. Invisible unicode character detection (on original content)
2. AST analysis for Python files (detects obfuscated execution patterns)
3. Regex pattern matching on normalized content (catches obfuscated variants)
Args:
file_path: Absolute path to the file
@@ -553,27 +884,7 @@ def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]:
lines = content.split('\n')
seen = set() # (pattern_id, line_number) for deduplication
# Regex pattern matching
for pattern, pid, severity, category, description in THREAT_PATTERNS:
for i, line in enumerate(lines, start=1):
if (pid, i) in seen:
continue
if re.search(pattern, line, re.IGNORECASE):
seen.add((pid, i))
matched_text = line.strip()
if len(matched_text) > 120:
matched_text = matched_text[:117] + "..."
findings.append(Finding(
pattern_id=pid,
severity=severity,
category=category,
file=rel_path,
line=i,
match=matched_text,
description=description,
))
# Invisible unicode character detection
# Step 1: Invisible unicode character detection (on original)
for i, line in enumerate(lines, start=1):
for char in INVISIBLE_CHARS:
if char in line:
@@ -589,6 +900,38 @@ def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]:
))
break # one finding per line for invisible chars
# Step 2: AST analysis for Python files
if file_path.suffix.lower() == '.py':
ast_findings = analyze_python_ast(content, rel_path)
findings.extend(ast_findings)
# Step 3: Normalize content and run regex patterns
# This catches obfuscated variants like Cyrillic homoglyphs, fullwidth, etc.
normalized_content = normalize_input(content)
normalized_lines = normalized_content.split('\n')
# Map normalized line numbers to original line numbers (they should match)
for pattern, pid, severity, category, description in THREAT_PATTERNS:
for i, norm_line in enumerate(normalized_lines, start=1):
if (pid, i) in seen:
continue
if re.search(pattern, norm_line, re.IGNORECASE):
seen.add((pid, i))
# Show original line content for context
original_line = lines[i - 1] if i <= len(lines) else norm_line
matched_text = original_line.strip()
if len(matched_text) > 120:
matched_text = matched_text[:117] + "..."
findings.append(Finding(
pattern_id=pid,
severity=severity,
category=category,
file=rel_path,
line=i,
match=matched_text,
description=description,
))
return findings
@@ -598,8 +941,17 @@ def scan_skill(skill_path: Path, source: str = "community") -> ScanResult:
Performs:
1. Structural checks (file count, total size, binary files, symlinks)
2. Regex pattern matching on all text files
3. Invisible unicode character detection
2. Unicode normalization to defeat obfuscation (NFKC, homoglyphs, zero-width)
3. AST analysis for Python files (detects dynamic execution patterns)
4. Regex pattern matching on normalized content
5. Invisible unicode character detection
V-011 Bypass Protection:
- Unicode homoglyphs (Cyrillic, fullwidth, Greek lookalikes)
- Zero-width character injection (U+200B, U+200C, U+200D, U+FEFF)
- Case manipulation (EvAl, ExEc)
- String concatenation obfuscation ('e'+'v'+'a'+'l')
- Dynamic execution patterns (globals()['eval'], getattr(__builtins__, 'exec'))
Args:
skill_path: Path to the skill directory (must contain SKILL.md)