hermes-agent/tools/risk_scoring.py

"""ML-inspired risk scoring for command approval.

Enhances pattern-based dangerous command detection with:
1. Path-aware risk scoring (system paths = higher tier)
2. Context detection (documentation vs execution)
3. Multi-factor risk score calculation

Usage:
    from tools.risk_scoring import score_command_risk, RiskScore
    result = score_command_risk("rm /etc/passwd")
    print(result.tier)      # "CRITICAL"
    print(result.score)     # 95
    print(result.factors)   # ["system_path", "destructive_operation"]
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field
from typing import List, Optional


# ---------------------------------------------------------------------------
# Path risk classification
# ---------------------------------------------------------------------------

# Critical system paths — operations here are almost always dangerous
_SYSTEM_PATHS_CRITICAL = [
    r"/etc/",
    r"/boot/",
    r"/sys/",
    r"/proc/",
    r"/dev/sd",
    r"/dev/nvme",
    r"/usr/bin/",
    r"/usr/sbin/",
    r"/sbin/",
    r"/bin/",
    r"/lib/systemd/",
    r"/var/log/syslog",
    r"/var/log/auth",
]

# Sensitive user paths — important but user-scoped
_SENSITIVE_USER_PATHS = [
    r"\.ssh/",
    r"\.gnupg/",
    r"\.aws/",
    r"\.config/gcloud/",
    r"\.kube/config",
    r"\.docker/config",
    r"\.hermes/\.env",
    r"\.netrc",
    r"\.pgpass",
    r"id_rsa",
    r"id_ed25519",
]

# Safe/temp paths — operations here are usually benign
_SAFE_PATHS = [
    r"/tmp/",
    r"/var/tmp/",
    r"\.cache/",
    r"temp",
    r"tmp",
    r"\.log$",
    r"\.bak$",
    r"\.old$",
    r"\.swp$",
    r"node_modules/",
    r"__pycache__/",
    r"\.pyc$",
]

# Dangerous user paths — home dir but destructive
_DANGEROUS_USER_PATHS = [
    r"~/",
    r"\$HOME/",
    r"/home/\w+/",
]


def classify_path_risk(path: str) -> tuple[int, str]:
    """Classify a filesystem path's risk level.

    Returns (risk_score, category) where risk_score is 0-100.
    """
    path_lower = path.lower()

    # Check critical system paths
    for pattern in _SYSTEM_PATHS_CRITICAL:
        if re.search(pattern, path_lower):
            return 90, "system_path_critical"

    # Check sensitive user paths
    for pattern in _SENSITIVE_USER_PATHS:
        if re.search(pattern, path_lower):
            return 75, "sensitive_user_path"

    # Check safe paths
    for pattern in _SAFE_PATHS:
        if re.search(pattern, path_lower):
            return 10, "safe_path"

    # Check dangerous user paths
    for pattern in _DANGEROUS_USER_PATHS:
        if re.search(pattern, path_lower):
            return 50, "user_path"

    # Default: moderate risk for unknown paths
    return 30, "unknown_path"


# ---------------------------------------------------------------------------
# Context detection
# ---------------------------------------------------------------------------

def detect_context(command: str) -> str:
    """Detect the context of a command string.

    Returns one of:
    - "code_block": Inside a markdown code block (likely documentation)
    - "comment": Shell comment (# ...)
    - "heredoc_content": Content inside a heredoc (documentation)
    - "execution": Normal command execution
    """
    stripped = command.strip()

    # Markdown code fence
    if stripped.startswith("```"):
        return "code_block"

    # Shell comment
    if stripped.startswith("#"):
        return "comment"

    # Inline comment (command followed by #)
    if re.search(r'\s+#\s', command) and not re.search(r'[;&|]\s*#', command):
        # Might be a comment in the middle
        pass

    # Heredoc content indicators
    if re.search(r"<<\s*['\"]?\w+['\"]?", command):
        return "heredoc_content"

    # Documentation indicators
    doc_indicators = [
        r"example:",
        r"e\.g\.",
        r"i\.e\.",
        r"note:",
        r"warning:",
        r"see also:",
        r"documentation",
        r"README",
        r"man page",
        r"help:",
    ]
    for indicator in doc_indicators:
        if re.search(indicator, command, re.IGNORECASE):
            return "documentation"

    return "execution"


# ---------------------------------------------------------------------------
# Operation risk classification
# ---------------------------------------------------------------------------

_OPERATION_RISK = {
    # Destructive operations
    "rm": 70,
    "rmdir": 50,
    "shred": 90,
    "dd": 60,
    "mkfs": 95,
    "fdisk": 85,
    "wipefs": 90,

    # Permission changes
    "chmod": 40,
    "chown": 50,
    "setfacl": 50,

    # System control
    "systemctl": 60,
    "service": 55,
    "reboot": 90,
    "shutdown": 90,
    "halt": 90,
    "poweroff": 90,

    # Process control
    "kill": 45,
    "killall": 55,
    "pkill": 55,

    # Network
    "iptables": 70,
    "ufw": 60,
    "firewall-cmd": 60,

    # Package management
    "apt-get": 30,
    "yum": 30,
    "dnf": 30,
    "pacman": 30,
    "pip": 20,
    "npm": 15,

    # Git
    "git reset --hard": 50, "git reset": 30,
    "git push": 30,
    "git clean": 45,
    "git branch": 20,

    # Dangerous pipes
    "curl": 25,
    "wget": 25,
}


# Read-only operations — low risk even on system paths
_READONLY_OPERATIONS = {
    "cat": 5, "head": 5, "tail": 5, "less": 5, "more": 5,
    "grep": 5, "find": 10, "ls": 3, "dir": 3, "tree": 3,
    "file": 3, "stat": 3, "wc": 3, "diff": 5, "md5sum": 5,
    "sha256sum": 5, "which": 3, "whereis": 3, "type": 3,
    "readlink": 3, "realpath": 3, "basename": 3, "dirname": 3,
}


def get_operation_risk(command: str) -> tuple[int, str]:
    """Get the risk score for the operation in a command.

    Returns (risk_score, operation_name).
    """
    cmd_lower = command.lower().strip()

    # Check read-only operations first (low risk regardless of path)
    for op, score in sorted(_READONLY_OPERATIONS.items(), key=lambda x: -len(x[0])):
        if cmd_lower.startswith(op + " ") or cmd_lower.startswith(op + "\t") or cmd_lower == op:
            return score, op

    # Check compound operations
    for op, score in sorted(_OPERATION_RISK.items(), key=lambda x: -len(x[0])):
        if cmd_lower.startswith(op) or f" {op}" in cmd_lower:
            return score, op

    return 20, "unknown"


# ---------------------------------------------------------------------------
# Risk score calculation
# ---------------------------------------------------------------------------

@dataclass
class RiskScore:
    """Result of risk scoring for a command."""
    command: str
    score: int = 0                    # 0-100 risk score
    tier: str = "SAFE"                # SAFE, LOW, MEDIUM, HIGH, CRITICAL
    factors: List[str] = field(default_factory=list)
    path_risk: int = 0
    operation_risk: int = 0
    context: str = "execution"
    context_modifier: float = 1.0
    recommendation: str = ""

    def __post_init__(self):
        if not self.recommendation:
            self.recommendation = self._generate_recommendation()

    def _generate_recommendation(self) -> str:
        if self.tier == "CRITICAL":
            return "BLOCK — requires explicit user approval"
        elif self.tier == "HIGH":
            return "WARN — confirm with user before executing"
        elif self.tier == "MEDIUM":
            return "CAUTION — log and proceed with care"
        elif self.tier == "LOW":
            return "NOTE — low risk, proceed normally"
        return "OK — safe to execute"


def score_command_risk(command: str) -> RiskScore:
    """Calculate a comprehensive risk score for a command.

    Considers:
    - Pattern-based detection (existing DANGEROUS_PATTERNS)
    - Path risk (system paths, user paths, temp paths)
    - Operation risk (rm vs cat vs echo)
    - Context (documentation vs execution)
    """
    result = RiskScore(command=command)
    factors = []

    # 1. Path analysis
    paths = re.findall(r'[/~$][^\s;&|\'"]*', command)
    max_path_risk = 0
    for path in paths:
        risk, category = classify_path_risk(path)
        if risk > max_path_risk:
            max_path_risk = risk
        if risk >= 50:
            factors.append(f"path:{category}")
    result.path_risk = max_path_risk

    # 2. Operation risk
    op_risk, op_name = get_operation_risk(command)
    result.operation_risk = op_risk
    if op_risk >= 40:
        factors.append(f"operation:{op_name}")

    # 3. Context detection
    ctx = detect_context(command)
    result.context = ctx

    # Context modifiers: documentation contexts reduce risk
    context_modifiers = {
        "execution": 1.0,
        "code_block": 0.3,
        "comment": 0.1,
        "heredoc_content": 0.5,
        "documentation": 0.2,
    }
    result.context_modifier = context_modifiers.get(ctx, 1.0)

    # 4. Special pattern bonuses
    destructive_patterns = [
        (r'\brm\s+-[^s]*r', 20, "recursive_delete"),
        (r'\brm\s+/', 15, "root_delete"),
        (r'\bchmod\s+777', 15, "world_writable"),
        (r'\bDROP\s+TABLE', 25, "sql_drop"),
        (r'\bDELETE\s+FROM(?!.*WHERE)', 20, "sql_delete_no_where"),
        (r'\|\s*(ba)?sh\b', 20, "pipe_to_shell"),
        (r'--force', 10, "force_flag"),
        (r'--no-preserve-root', 30, "no_preserve_root"),
    ]
    for pattern, bonus, factor_name in destructive_patterns:
        if re.search(pattern, command, re.IGNORECASE):
            result.score += bonus
            factors.append(factor_name)

    # 5. Calculate final score
    # Read operations on system paths are safe (just looking, not touching)
    is_read_op = result.operation_risk <= 10

    if is_read_op:
        # Read operations: mostly operation risk, path barely matters
        base_score = result.operation_risk + (result.path_risk * 0.05)
    elif result.path_risk >= 80:
        # Write to system path: very dangerous
        base_score = result.path_risk + (result.operation_risk * 0.5)
    elif result.path_risk <= 15:
        # Write to safe path: mostly operation risk
        base_score = result.path_risk + (result.operation_risk * 0.3)
    else:
        # Moderate path: balanced
        base_score = result.path_risk + (result.operation_risk * 0.4)

    base_score += result.score  # pattern bonuses
    result.score = min(100, int(base_score * result.context_modifier))

    # 6. Determine tier
    if result.score >= 80:
        result.tier = "CRITICAL"
    elif result.score >= 60:
        result.tier = "HIGH"
    elif result.score >= 40:
        result.tier = "MEDIUM"
    elif result.score >= 20:
        result.tier = "LOW"
    else:
        result.tier = "SAFE"

    result.factors = factors
    if not result.recommendation:
        result.recommendation = result._generate_recommendation()

    return result


def compare_commands(cmd1: str, cmd2: str) -> dict:
    """Compare risk scores of two commands.

    Useful for showing why "rm temp.txt" is different from "rm /etc/passwd".
    """
    r1 = score_command_risk(cmd1)
    r2 = score_command_risk(cmd2)
    return {
        "command_1": {"command": cmd1, "score": r1.score, "tier": r1.tier},
        "command_2": {"command": cmd2, "score": r2.score, "tier": r2.tier},
        "difference": abs(r1.score - r2.score),
        "riskier": cmd1 if r1.score > r2.score else cmd2,
    }