#!/usr/bin/env python3
"""
Secret leak detection script for pre-commit hooks.

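Detects common secret patterns in the files passed on the command line
(typically the staged files handed over by pre-commit):
- API keys (OpenAI/Anthropic sk-*, Stripe sk_live_*/sk_test_*, AWS AKIA*, generic api_key=...)
- Private keys (-----BEGIN PRIVATE KEY-----)
- Hardcoded passwords (password/passwd/pwd assignments)
- GitHub/Gitea/Slack/Telegram tokens, JWTs and bearer tokens
- Database connection strings with credentials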
"""

import argparse
import fnmatch
import re
import sys
from pathlib import Path
from typing import List, Tuple


# Secret patterns to detect.
#
# Maps a short pattern name to a dict with:
#   "pattern"     - regular expression searched (unanchored) within each line
#   "description" - human-readable label used when reporting a finding
SECRET_PATTERNS = {
    "openai_api_key": {
        # Legacy keys are "sk-" followed by alphanumerics only; newer keys carry
        # a "proj-", "svcacct-" or "admin-" prefix and may contain "-" and "_".
        # "-"/"_" are only allowed after one of those prefixes so that ordinary
        # hyphenated words containing "sk-" are not reported.
        "pattern": r"sk-(?:(?:proj|svcacct|admin)-[a-zA-Z0-9_\-]{20,}|[a-zA-Z0-9]{20,})",
        "description": "OpenAI API key",
    },
    "anthropic_api_key": {
        # Keys look like "sk-ant-api03-<token>", where the token may contain
        # "-" and "_", so both must be part of the character class.
        "pattern": r"sk-ant-[a-zA-Z0-9_\-]{32,}",
        "description": "Anthropic API key",
    },
    "generic_api_key": {
        "pattern": r"(?i)(api[_-]?key|apikey)\s*[:=]\s*['\"]?([a-zA-Z0-9_\-]{16,})['\"]?",
        "description": "Generic API key",
    },
    "private_key": {
        "pattern": r"-----BEGIN (RSA |DSA |EC |OPENSSH )?PRIVATE KEY-----",
        "description": "Private key",
    },
    "github_token": {
        "pattern": r"gh[pousr]_[A-Za-z0-9_]{36,}",
        "description": "GitHub token",
    },
    "gitea_token": {
        "pattern": r"gitea_[a-f0-9]{40}",
        "description": "Gitea token",
    },
    "aws_access_key": {
        "pattern": r"AKIA[0-9A-Z]{16}",
        "description": "AWS Access Key ID",
    },
    "aws_secret_key": {
        "pattern": r"(?i)aws[_-]?secret[_-]?(access)?[_-]?key\s*[:=]\s*['\"]?([a-zA-Z0-9/+=]{40})['\"]?",
        "description": "AWS Secret Access Key",
    },
    "database_connection_string": {
        "pattern": r"(?i)(mongodb|mysql|postgresql|postgres|redis)://[^:]+:[^@]+@[^/]+",
        "description": "Database connection string with credentials",
    },
    "password_in_config": {
        # Requires a quoted value of at least four characters.
        "pattern": r"(?i)(password|passwd|pwd)\s*[:=]\s*['\"]([^'\"]{4,})['\"]",
        "description": "Hardcoded password",
    },
    "stripe_key": {
        "pattern": r"sk_(live|test)_[0-9a-zA-Z]{24,}",
        "description": "Stripe API key",
    },
    "slack_token": {
        "pattern": r"xox[baprs]-[0-9a-zA-Z]{10,}",
        "description": "Slack token",
    },
    "telegram_bot_token": {
        "pattern": r"[0-9]{8,10}:[a-zA-Z0-9_-]{35}",
        "description": "Telegram bot token",
    },
    "jwt_token": {
        "pattern": r"eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*",
        "description": "JWT token",
    },
    "bearer_token": {
        "pattern": r"(?i)bearer\s+[a-zA-Z0-9_\-\.=]{20,}",
        "description": "Bearer token",
    },
}

# Files/patterns to exclude from scanning.
#
#   "files"      - exact file names (basename) that are never scanned
#   "extensions" - lower-case file suffixes that are never scanned
#   "paths"      - directory names; files located below them are never scanned
#   "patterns"   - regular expressions searched case-insensitively against a
#                  whole line; a hit marks matches on that line as placeholders
EXCLUSIONS = {
    "files": {
        ".pre-commit-hooks.yaml",
        ".gitignore",
        "poetry.lock",
        "package-lock.json",
        "yarn.lock",
        "Pipfile.lock",
        ".secrets.baseline",
    },
    "extensions": {
        ".md",
        ".svg",
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".ico",
        ".woff",
        ".woff2",
        ".ttf",
        ".eot",
    },
    "paths": {
        ".git/",
        "node_modules/",
        "__pycache__/",
        ".pytest_cache/",
        ".mypy_cache/",
        ".venv/",
        "venv/",
        ".tox/",
        "dist/",
        "build/",
        ".eggs/",
    },
    "patterns": {
        # Placeholder identifiers. The leading \b keeps them from matching in
        # the middle of a longer token: without it "test_[a-z_]+" also matches
        # inside a Stripe key such as "sk_test_<key>" and hides a real finding.
        r"\byour_[a-z_]+_here",
        r"\bexample_[a-z_]+",
        r"\bdummy_[a-z_]+",
        r"\btest_[a-z_]+",
        r"\bfake_[a-z_]+",
        r"password\s*[=:]\s*['\"]?(changeme|password|123456|admin)['\"]?",
        r"#.*(?:example|placeholder|sample)",
        # Local development databases; the scheme list mirrors the one used by
        # the database connection string detector.
        r"(mongodb|mysql|postgresql|postgres|redis)://[^:]+:[^@]+@localhost",
        r"(mongodb|mysql|postgresql|postgres|redis)://[^:]+:[^@]+@127\.0\.0\.1",
    },
}

# Markers for inline exclusions.
# A line that contains any of these strings is skipped by the scanner
# (see has_exclusion_marker); the check is a plain, case-sensitive substring
# test, so the marker must appear exactly as written here.
EXCLUSION_MARKERS = [
    "# pragma: allowlist secret",
    "# noqa: secret",
    "// pragma: allowlist secret",
    "/* pragma: allowlist secret */",
    "# secret-detection:ignore",
]


def should_exclude_file(file_path: str) -> bool:
    """Check if file should be excluded from scanning.

    A file is excluded when its name is listed in ``EXCLUSIONS["files"]``,
    when its (lower-cased) suffix is listed in ``EXCLUSIONS["extensions"]``,
    or when it lives below a directory listed in ``EXCLUSIONS["paths"]``.

    Args:
        file_path: Path of the candidate file, relative or absolute.

    Returns:
        True if the file must be skipped, False otherwise.
    """
    path = Path(file_path)

    if path.name in EXCLUSIONS["files"]:
        return True

    if path.suffix.lower() in EXCLUSIONS["extensions"]:
        return True

    # Match whole directory components only. A raw substring test would also
    # exclude "rebuild/x.py" or "myvenv/x.py" (they contain "build/" and
    # "venv/"), and would never match backslash-separated paths. Working on
    # the POSIX form with a leading "/" anchors every entry on both sides.
    normalized = "/" + path.as_posix()
    return any(
        ("/" + excluded_path.strip("/") + "/") in normalized
        for excluded_path in EXCLUSIONS["paths"]
    )


def has_exclusion_marker(line: str) -> bool:
    """Return True when ``line`` contains one of the inline exclusion markers.

    The markers come from ``EXCLUSION_MARKERS`` and are matched as plain
    substrings anywhere in the line.
    """
    for marker in EXCLUSION_MARKERS:
        if marker in line:
            return True
    return False


def is_excluded_match(line: str, match_str: str) -> bool:
    """Decide whether a match on ``line`` looks like a placeholder.

    The decision is made from the whole line: it is excluded when any regex
    in ``EXCLUSIONS["patterns"]`` is found in it, or when it contains an
    obvious dummy value in quotes (for example ``"changeme"``). All searches
    are case-insensitive.

    Args:
        line: The full source line the match was found on.
        match_str: The matched text. Accepted for interface compatibility;
            the current implementation does not consult it.

    Returns:
        True if the match should be ignored, False otherwise.
    """
    quoted_dummy_value = r"['\"](fake|test|dummy|example|placeholder|changeme)['\"]"

    candidates = list(EXCLUSIONS["patterns"])
    candidates.append(quoted_dummy_value)

    return any(
        re.search(expression, line, re.IGNORECASE) is not None
        for expression in candidates
    )


def scan_file(file_path: str) -> List[Tuple[int, str, str, str]]:
    """Scan a single file for secrets.

    The file is read as UTF-8 with undecodable bytes dropped. Lines carrying
    an inline exclusion marker are skipped. Every remaining line is checked
    against each entry of ``SECRET_PATTERNS``, and each pattern is reported
    at most once per line.

    Args:
        file_path: Path of the file to scan.

    Returns:
        List of tuples: (line_number, line_content, pattern_name, description).
        ``line_number`` is 1-based and ``line_content`` is the stripped line.
        An empty list is returned (after a warning on stderr) when the file
        cannot be read.
    """
    findings: List[Tuple[int, str, str, str]] = []

    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            lines = f.readlines()
    except OSError as e:  # IOError is an alias of OSError
        print(f"Warning: Could not read {file_path}: {e}", file=sys.stderr)
        return findings

    for line_num, line in enumerate(lines, 1):
        if has_exclusion_marker(line):
            continue

        stripped = line.strip()

        for pattern_name, pattern_info in SECRET_PATTERNS.items():
            for match in re.finditer(pattern_info["pattern"], line):
                if is_excluded_match(line, match.group(0)):
                    continue

                findings.append(
                    (line_num, stripped, pattern_name, pattern_info["description"])
                )
                # A finding does not carry the matched text, so further hits
                # of the same pattern on this line would be byte-identical
                # duplicates that only inflate the reported total.
                break

    return findings


def scan_files(file_paths: List[str]) -> dict:
    """Run the scanner over several files and collect the hits per file.

    Files rejected by ``should_exclude_file`` are not opened, and files
    without findings do not appear in the result.

    Returns dict: {file_path: [(line_num, line, pattern, description), ...]}
    """
    flagged = {}

    for candidate in file_paths:
        if not should_exclude_file(candidate):
            hits = scan_file(candidate)
            if hits:
                flagged[candidate] = hits

    return flagged


def print_findings(results: dict) -> None:
    """Write a human-readable report of the findings to stdout.

    ``results`` maps a file path to its list of
    (line_number, line_content, pattern_name, description) tuples. Nothing is
    printed when ``results`` is empty. Reported line content is cut off after
    100 characters.
    """
    if results:
        rule = "=" * 80

        for heading in (rule, "POTENTIAL SECRETS DETECTED!", rule, ""):
            print(heading)

        counter = 0
        for scanned_path, hits in results.items():
            print(f"\nFILE: {scanned_path}")
            print("-" * 40)
            for line_num, content, pattern_name, description in hits:
                counter += 1
                # Mark truncated content with a trailing ellipsis.
                tail = "..." if len(content) > 100 else ""
                print(f"  Line {line_num}: {description}")
                print(f"  Pattern: {pattern_name}")
                print(f"  Content: {content[:100]}{tail}")
                print()

        footer = (
            rule,
            f"Total findings: {counter}",
            rule,
            "",
            "To fix this:",
            "  1. Remove the secret from the file",
            "  2. Use environment variables or a secrets manager",
            "  3. If this is a false positive, add an exclusion marker:",
            "     - Add '# pragma: allowlist secret' to the end of the line",
            "     - Or add '# secret-detection:ignore' to the end of the line",
            "",
        )
        for text in footer:
            print(text)


def _expand_path(file_path: str) -> List[str]:
    """Return the files below ``file_path`` if it is a directory, else the path itself."""
    root = Path(file_path)
    if not root.is_dir():
        return [file_path]
    # Sorted so that the scan order (and therefore the report) is deterministic.
    return sorted(str(entry) for entry in root.rglob("*") if entry.is_file())


def _matches_user_exclude(file_path: str, patterns: List[str]) -> bool:
    """Return True if ``file_path`` or its basename matches any ``--exclude`` glob."""
    base_name = Path(file_path).name
    return any(
        fnmatch.fnmatch(file_path, pattern) or fnmatch.fnmatch(base_name, pattern)
        for pattern in patterns
    )


def main() -> int:
    """Main entry point.

    Parses the command line, expands directory arguments into the files they
    contain, drops everything excluded by the built-in rules or by
    ``--exclude`` globs, scans the remaining files and prints a report.

    Returns:
        0 if no secrets were found, 1 if at least one finding was reported.
        (Usage errors make argparse exit with status 2 before this returns.)
    """
    parser = argparse.ArgumentParser(
        description="Detect secrets in files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s file1.py file2.yaml
  %(prog)s --exclude "*.md" src/

Exit codes:
  0 - No secrets found
  1 - Secrets detected
  2 - Error
        """,
    )
    parser.add_argument(
        "files",
        nargs="+",
        help="Files or directories to scan",
    )
    parser.add_argument(
        "--exclude",
        action="append",
        default=[],
        help="Additional file patterns to exclude",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print verbose output",
    )

    args = parser.parse_args()

    files_to_scan = []
    for given_path in args.files:
        # Directories (as in the "src/" usage example) are walked recursively;
        # plain file arguments are passed through unchanged.
        for file_path in _expand_path(given_path):
            # --exclude globs are honoured in addition to the built-in rules.
            if should_exclude_file(file_path) or _matches_user_exclude(
                file_path, args.exclude
            ):
                if args.verbose:
                    print(f"Skipping excluded file: {file_path}")
                continue
            files_to_scan.append(file_path)

    if args.verbose:
        print(f"Scanning {len(files_to_scan)} files...")

    results = scan_files(files_to_scan)

    if results:
        print_findings(results)
        return 1

    if args.verbose:
        print("No secrets detected!")

    return 0


# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())