# hermes-agent/tools/credential_redact.py

"""
Credential Redaction — Block silent credential exposure in tool outputs

Poka-yoke: Prevent API keys, tokens, passwords from leaking into context.

Issue: #839
"""

import hashlib
import json
import logging
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Tuple

from hermes_constants import get_hermes_home

logger = logging.getLogger(__name__)

# Hermes home directory, as returned by hermes_constants.get_hermes_home().
HERMES_HOME = get_hermes_home()
# Directory for the redaction audit trail; CredentialRedactor._log_redaction
# appends to "redactions.jsonl" inside it.
AUDIT_DIR = HERMES_HOME / "audit"

# Credential patterns to detect and redact.
#
# Each entry is ``(regex, replacement)``. CredentialRedactor.redact applies the
# entries in list order with re.IGNORECASE, each one operating on the output of
# the previous substitution. Two consequences shape this list:
#   * more specific patterns are listed before broader ones, and
#   * a broad pattern must not be able to match a "[REDACTED: ...]" marker
#     produced by an earlier entry (otherwise output is garbled and hits are
#     counted twice).
CREDENTIAL_PATTERNS = [
    # API keys
    # Anthropic keys use a base64url-style alphabet, so "_" is allowed as well
    # as "-"; without it the match stops at the first underscore and leaks the
    # tail of the key.
    (r"sk-ant-[a-zA-Z0-9_-]{20,}", "[REDACTED: Anthropic API key]"),
    # Legacy OpenAI keys are purely alphanumeric; prefixed keys (project,
    # service-account, admin) also contain "-" and "_". The wider alphabet is
    # only enabled behind a known prefix to avoid matching ordinary
    # hyphenated words that happen to contain "sk-".
    (
        r"sk-(?:(?:proj|svcacct|admin)-[a-zA-Z0-9_-]{20,}|[a-zA-Z0-9]{20,})",
        "[REDACTED: OpenAI API key]",
    ),
    (r"ghp_[a-zA-Z0-9]{36}", "[REDACTED: GitHub token]"),
    (r"gho_[a-zA-Z0-9]{36}", "[REDACTED: GitHub OAuth token]"),
    (r"glpat-[a-zA-Z0-9-]{20,}", "[REDACTED: GitLab token]"),

    # Bearer tokens (both spellings kept explicit so the entry also works for
    # a consumer that does not pass re.IGNORECASE)
    (r"(?:Bearer|bearer)\s+[a-zA-Z0-9._-]{20,}", "[REDACTED: Bearer token]"),

    # Generic tokens/passwords
    ("(?:token|TOKEN|Token)[:=]\\s*['\"]?[a-zA-Z0-9._-]{20,}['\"]?", "[REDACTED: Token]"),
    # The password value class is very broad ("anything but whitespace or a
    # quote"), so it would happily match "[REDACTED:" left behind by an earlier
    # entry (e.g. "password=sk-..."). The negative lookahead prevents that.
    (
        "(?:password|PASSWORD|Password)[:=]\\s*['\"]?(?!\\[REDACTED)[^\\s\"']{8,}['\"]?",
        "[REDACTED: Password]",
    ),
    ("(?:secret|SECRET|Secret)[:=]\\s*['\"]?[a-zA-Z0-9._-]{20,}['\"]?", "[REDACTED: Secret]"),
    ("(?:api_key|API_KEY|apiKey|ApiKey)[:=]\\s*['\"]?[a-zA-Z0-9._-]{20,}['\"]?", "[REDACTED: API key]"),

    # AWS keys
    (r"AKIA[0-9A-Z]{16}", "[REDACTED: AWS access key]"),
    ("(?:aws_secret_access_key|AWS_SECRET_ACCESS_KEY)[:=]\\s*['\"]?[a-zA-Z0-9/+=]{40}['\"]?", "[REDACTED: AWS secret]"),

    # Private keys
    # Redact the whole PEM block, not just the BEGIN line: the key material is
    # in the body. If no END marker follows (truncated output), the optional
    # group fails and only the header is replaced, as before. The replacement
    # label is unchanged so existing consumers that look for it keep working.
    (
        r"-----BEGIN (?:RSA |EC |DSA |OPENSSH |ENCRYPTED )?PRIVATE KEY-----"
        r"(?:[\s\S]*?-----END (?:RSA |EC |DSA |OPENSSH |ENCRYPTED )?PRIVATE KEY-----)?",
        "[REDACTED: Private key header]",
    ),

    # Connection strings
    # user and password may not contain whitespace, so a credential-free URL
    # followed later by an unrelated "@" (e.g. an e-mail address) is no longer
    # swallowed. The user part may be empty ("redis://:password@host").
    (
        r"(?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|rediss?)://[^\s:@/]*:[^\s@]+@\S+",
        "[REDACTED: Database connection string]",
    ),
]

# Files that should trigger auto-masking.
#
# Each entry is a regex that CredentialRedactor.should_mask_file applies with
# re.search to the lower-cased path. Entries therefore need to be lower-case,
# and they match anywhere in the path unless anchored with "$".
SENSITIVE_FILE_PATTERNS = [
    r"\.env$",  # path ends in ".env"
    r"\.env\.",  # ".env." anywhere, e.g. ".env.local"
    r"\.secret",  # ".secret" anywhere (also covers ".secrets")
    r"credentials",  # "credentials" anywhere in the path
    r"\.token",  # ".token" anywhere
    r"config\.yaml$",  # path ends in "config.yaml"
    r"config\.yml$",  # path ends in "config.yml"
    r"config\.json$",  # path ends in "config.json"
    r"\.netrc$",  # path ends in ".netrc"
    r"\.pgpass$",  # path ends in ".pgpass"
]


class CredentialRedactor:
    """Redact credential-looking substrings from text and mask sensitive files.

    Redaction is driven by the module-level CREDENTIAL_PATTERNS list; file
    masking is gated by SENSITIVE_FILE_PATTERNS.
    """

    # Substrings that mark the key of a ``key=value`` line as sensitive in
    # mask_file_content. Matching is by containment on the lower-cased key.
    _SENSITIVE_KEY_FRAGMENTS = ("password", "secret", "token", "key", "api", "credential")

    def __init__(self, audit_log: bool = True):
        """Create a redactor.

        Args:
            audit_log: When true, every call to ``redact`` that redacts
                something appends an entry to the audit trail.
        """
        self.audit_log = audit_log
        # Running total of redactions performed by this instance.
        self._redaction_count = 0

    def redact(self, text: str) -> Tuple[str, int]:
        """
        Redact credentials from text.

        Every pattern in CREDENTIAL_PATTERNS is applied in order, case
        insensitively, to the output of the previous one.

        Args:
            text: Text to scan. Falsy input is returned unchanged.

        Returns:
            Tuple of (redacted_text, number_of_redactions)
        """
        if not text:
            return text, 0

        redacted = text
        count = 0

        for pattern, replacement in CREDENTIAL_PATTERNS:
            # subn substitutes and counts in a single scan of the text.
            redacted, hits = re.subn(pattern, replacement, redacted, flags=re.IGNORECASE)
            count += hits

        if count > 0:
            self._redaction_count += count
            if self.audit_log:
                # Only a digest of this preview is written to the audit trail.
                self._log_redaction(count, text[:100])

        return redacted, count

    def redact_tool_output(self, tool_name: str, output: str) -> Tuple[str, str]:
        """
        Redact tool output and return notice if redactions occurred.

        Args:
            tool_name: Name of the tool, used only in the notice text.
            output: Raw tool output.

        Returns:
            Tuple of (redacted_output, notice_or_empty)
        """
        redacted, count = self.redact(output)

        if count > 0:
            notice = f"[REDACTED: {count} credential pattern{'s' if count > 1 else ''} found in {tool_name} output]"
            return redacted, notice

        return redacted, ""

    def should_mask_file(self, file_path: str) -> bool:
        """Check if file should have credentials auto-masked.

        The path is lower-cased and tested against every regex in
        SENSITIVE_FILE_PATTERNS with ``re.search``.
        """
        path_lower = file_path.lower()
        return any(re.search(p, path_lower) for p in SENSITIVE_FILE_PATTERNS)

    def mask_file_content(self, content: str, file_path: str) -> str:
        """Mask credentials in file content while preserving structure.

        Only ``key=value`` lines are handled: when the key contains one of the
        sensitive fragments, the value is replaced with ``[REDACTED]`` and the
        key text (including its original spacing) is kept. Comment lines
        (starting with ``#``) and lines without ``=`` pass through unchanged.

        Args:
            content: Full file content.
            file_path: Path used to decide whether masking applies at all.

        Returns:
            The content unchanged when ``should_mask_file(file_path)`` is
            false, otherwise the content with sensitive values masked.
        """
        if not self.should_mask_file(file_path):
            return content

        masked_lines = []
        for line in content.split("\n"):
            key, separator, _ = line.partition("=")
            is_assignment = bool(separator) and not line.strip().startswith("#")
            key_lower = key.strip().lower()

            if is_assignment and any(frag in key_lower for frag in self._SENSITIVE_KEY_FRAGMENTS):
                masked_lines.append(f"{key}=[REDACTED]")
            else:
                masked_lines.append(line)

        return "\n".join(masked_lines)

    @staticmethod
    def _preview_digest(preview: str) -> str:
        """Return a short, process-independent digest of *preview*.

        The built-in ``hash()`` is salted per interpreter process for strings,
        so its value cannot be compared between audit entries written by
        different runs. A truncated SHA-256 hex digest is stable.
        """
        encoded = preview.encode("utf-8", errors="replace")
        return hashlib.sha256(encoded).hexdigest()[:16]

    def _log_redaction(self, count: int, preview: str):
        """Log redaction event to audit trail.

        Appends one JSON line to ``AUDIT_DIR / "redactions.jsonl"`` containing
        the UTC timestamp, the redaction count and a digest of *preview* (the
        preview text itself is never written). Best effort: any failure is
        logged at debug level and otherwise ignored so that redaction itself
        never fails because of auditing.
        """
        try:
            AUDIT_DIR.mkdir(parents=True, exist_ok=True)
            audit_file = AUDIT_DIR / "redactions.jsonl"

            entry = {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "redactions": count,
                "preview_hash": self._preview_digest(preview),
            }

            with open(audit_file, "a", encoding="utf-8") as f:
                f.write(json.dumps(entry) + "\n")

        except Exception as e:
            logger.debug("Audit log failed: %s", e)


# Module-level redactor shared by the module-level convenience functions;
# constructed with the default arguments, so audit logging is enabled.
_redactor = CredentialRedactor()


def redact_credentials(text: str) -> Tuple[str, int]:
    """Run *text* through the shared module-level redactor.

    Returns:
        Tuple of (redacted_text, number_of_redactions), exactly as produced
        by ``CredentialRedactor.redact``.
    """
    redacted_text, redaction_total = _redactor.redact(text)
    return redacted_text, redaction_total


def redact_tool_output(tool_name: str, output: str) -> Tuple[str, str]:
    """Redact a tool's output with the shared module-level redactor.

    Returns:
        Tuple of (redacted_output, notice), exactly as produced by
        ``CredentialRedactor.redact_tool_output``; the notice is an empty
        string when nothing was redacted.
    """
    cleaned_output, notice = _redactor.redact_tool_output(tool_name, output)
    return cleaned_output, notice


def should_mask_file(file_path: str) -> bool:
    """Report whether *file_path* matches a sensitive-file pattern.

    Delegates to ``CredentialRedactor.should_mask_file`` on the shared
    module-level redactor.
    """
    needs_masking = _redactor.should_mask_file(file_path)
    return needs_masking


def mask_sensitive_file(content: str, file_path: str) -> str:
    """Mask credential values in *content* when *file_path* is sensitive.

    Delegates to ``CredentialRedactor.mask_file_content`` on the shared
    module-level redactor and returns its result.
    """
    masked_content = _redactor.mask_file_content(content, file_path)
    return masked_content