scripts/detect_secrets.py

#!/usr/bin/env python3
"""
Secret leak detection script for pre-commit hooks.

Detects common secret patterns in staged files:
- API keys (sk-*, pk_*, etc.)
- Private keys (-----BEGIN PRIVATE KEY-----)
- Passwords in config files
- GitHub/Gitea tokens
- Database connection strings with credentials
"""

import argparse
import re
import sys
from pathlib import Path
from typing import List, Tuple


# Secret patterns to detect.
#
# Maps a stable pattern name to a dict with:
#   "pattern"     - regex source, applied per line with ``re.finditer``
#   "description" - human-readable label printed in the report
# Only the whole match (group 0) is consumed by the scanner, so the grouping
# inside a pattern is free to change.
SECRET_PATTERNS = {
    "openai_api_key": {
        # Legacy keys are "sk-" + alphanumerics. Project / service-account /
        # admin keys add a second prefix and a body that may contain "_" and
        # "-", which the alphanumeric-only form cannot match.
        "pattern": r"sk-(?:(?:proj|svcacct|admin)-[a-zA-Z0-9_\-]{20,}|[a-zA-Z0-9]{20,})",
        "description": "OpenAI API key",
    },
    "anthropic_api_key": {
        # Keys look like "sk-ant-api03-<body>"; the body (and the "api03-"
        # segment itself) contains "-" and "_", so both must be allowed.
        "pattern": r"sk-ant-[a-zA-Z0-9_\-]{32,}",
        "description": "Anthropic API key",
    },
    "generic_api_key": {
        "pattern": r"(?i)(api[_-]?key|apikey)\s*[:=]\s*['\"]?([a-zA-Z0-9_\-]{16,})['\"]?",
        "description": "Generic API key",
    },
    "private_key": {
        # Covers PKCS#1/PKCS#8/OpenSSH PEM headers, encrypted PKCS#8, and
        # ASCII-armored PGP private key blocks.
        "pattern": (
            r"-----BEGIN (?:(?:RSA|DSA|EC|OPENSSH|ENCRYPTED) )?PRIVATE KEY-----"
            r"|-----BEGIN PGP PRIVATE KEY BLOCK-----"
        ),
        "description": "Private key",
    },
    "github_token": {
        # Classic/OAuth/app tokens (ghp_, gho_, ghu_, ghs_, ghr_) plus
        # fine-grained personal access tokens (github_pat_).
        "pattern": r"(?:gh[pousr]_[A-Za-z0-9_]{36,}|github_pat_[A-Za-z0-9_]{36,})",
        "description": "GitHub token",
    },
    "gitea_token": {
        "pattern": r"gitea_[a-f0-9]{40}",
        "description": "Gitea token",
    },
    "aws_access_key": {
        "pattern": r"AKIA[0-9A-Z]{16}",
        "description": "AWS Access Key ID",
    },
    "aws_secret_key": {
        "pattern": r"(?i)aws[_-]?secret[_-]?(access)?[_-]?key\s*[:=]\s*['\"]?([a-zA-Z0-9/+=]{40})['\"]?",
        "description": "AWS Secret Access Key",
    },
    "database_connection_string": {
        # scheme://user:password@host ; "mongodb+srv" is the DNS seed-list
        # variant of the MongoDB scheme.
        "pattern": r"(?i)(mongodb(?:\+srv)?|mysql|postgresql|postgres|redis)://[^:]+:[^@]+@[^/]+",
        "description": "Database connection string with credentials",
    },
    "password_in_config": {
        "pattern": r"(?i)(password|passwd|pwd)\s*[:=]\s*['\"]([^'\"]{4,})['\"]",
        "description": "Hardcoded password",
    },
    "stripe_key": {
        "pattern": r"sk_(live|test)_[0-9a-zA-Z]{24,}",
        "description": "Stripe API key",
    },
    "slack_token": {
        "pattern": r"xox[baprs]-[0-9a-zA-Z]{10,}",
        "description": "Slack token",
    },
    "telegram_bot_token": {
        "pattern": r"[0-9]{8,10}:[a-zA-Z0-9_-]{35}",
        "description": "Telegram bot token",
    },
    "jwt_token": {
        "pattern": r"eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*",
        "description": "JWT token",
    },
    "bearer_token": {
        "pattern": r"(?i)bearer\s+[a-zA-Z0-9_\-\.=]{20,}",
        "description": "Bearer token",
    },
}

# Files/patterns to exclude from scanning.
#
#   "files"      - exact file names that are never scanned
#   "extensions" - lower-case file suffixes that are never scanned
#   "paths"      - directory markers; a file under one of them is skipped
#   "patterns"   - regexes (searched case-insensitively) that mark a finding
#                  as a placeholder / local-only value rather than a secret
EXCLUSIONS = {
    "files": {
        ".pre-commit-hooks.yaml",
        ".gitignore",
        "poetry.lock",
        "package-lock.json",
        "yarn.lock",
        "Pipfile.lock",
        ".secrets.baseline",
    },
    "extensions": {
        ".md",
        ".svg",
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".ico",
        ".woff",
        ".woff2",
        ".ttf",
        ".eot",
    },
    "paths": {
        ".git/",
        "node_modules/",
        "__pycache__/",
        ".pytest_cache/",
        ".mypy_cache/",
        ".venv/",
        "venv/",
        ".tox/",
        "dist/",
        "build/",
        ".eggs/",
    },
    "patterns": {
        r"your_[a-z_]+_here",
        r"example_[a-z_]+",
        r"dummy_[a-z_]+",
        r"test_[a-z_]+",
        r"fake_[a-z_]+",
        r"password\s*[=:]\s*['\"]?(changeme|password|123456|admin)['\"]?",
        r"#.*(?:example|placeholder|sample)",
        # Loopback connection strings. The scheme list mirrors the schemes the
        # detector flags (including "postgres" and "redis"), and the negative
        # lookahead stops "localhost.prod.example.com" or "127.0.0.10" from
        # being treated as the local host.
        r"(mongodb|mysql|postgresql|postgres|redis)://[^:]+:[^@]+@localhost(?![\w.-])",
        r"(mongodb|mysql|postgresql|postgres|redis)://[^:]+:[^@]+@127\.0\.0\.1(?![\w.-])",
    },
}

# Markers for inline exclusions.
# A line containing any of these substrings verbatim is skipped by the scanner.
# The variants cover "#", "//" and "/* */" comment syntaxes.
EXCLUSION_MARKERS = [
    "# pragma: allowlist secret",
    "# noqa: secret",
    "// pragma: allowlist secret",
    "/* pragma: allowlist secret */",
    "# secret-detection:ignore",
]


def should_exclude_file(file_path: str) -> bool:
    """Check if file should be excluded from scanning.

    A file is excluded when its name is listed in ``EXCLUSIONS["files"]``,
    its (lower-cased) suffix is listed in ``EXCLUSIONS["extensions"]``, or
    any directory on its path is one of ``EXCLUSIONS["paths"]``.

    Args:
        file_path: Path of the candidate file, relative or absolute.

    Returns:
        True if the file must be skipped, False if it should be scanned.
    """
    path = Path(file_path)

    if path.name in EXCLUSIONS["files"]:
        return True

    if path.suffix.lower() in EXCLUSIONS["extensions"]:
        return True

    # Compare whole directory components rather than substrings of the path
    # string: a substring test would also skip "rebuild/", "myvenv/" or
    # "mydist/" (leaving them unscanned) and would never match on platforms
    # whose separator is not "/".
    excluded_dirs = {entry.strip("/") for entry in EXCLUSIONS["paths"]}
    return not excluded_dirs.isdisjoint(path.parent.parts)


def has_exclusion_marker(line: str) -> bool:
    """Return True when ``line`` contains one of ``EXCLUSION_MARKERS`` verbatim."""
    for marker in EXCLUSION_MARKERS:
        if marker in line:
            return True
    return False


def is_excluded_match(line: str, match_str: str) -> bool:
    """Decide whether a finding on ``line`` should be dropped as a placeholder.

    The line is searched case-insensitively for every regex in
    ``EXCLUSIONS["patterns"]`` and for a quoted placeholder word such as
    ``"test"`` or ``'changeme'``.

    NOTE(review): ``match_str`` is accepted but not consulted; the decision is
    made from the whole line, so a placeholder-looking token anywhere on the
    line suppresses every finding on that line.

    Args:
        line: Full text of the line the match was found on.
        match_str: The matched text (currently unused).

    Returns:
        True if the finding should be ignored, False otherwise.
    """
    quoted_placeholder = r"['\"](fake|test|dummy|example|placeholder|changeme)['\"]"

    if any(re.search(rule, line, re.IGNORECASE) for rule in EXCLUSIONS["patterns"]):
        return True

    return re.search(quoted_placeholder, line, re.IGNORECASE) is not None


def scan_file(file_path: str) -> List[Tuple[int, str, str, str]]:
    """Look for secret patterns in one file, line by line.

    The file is read as UTF-8 with undecodable bytes dropped. Lines carrying
    an inline exclusion marker are skipped entirely; every remaining regex
    match that is not classified as a placeholder yields one finding.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of ``(line_number, stripped_line, pattern_name, description)``
        tuples, with 1-based line numbers. The list is empty when nothing is
        found or when the file cannot be read (a warning goes to stderr).
    """
    hits = []

    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            content = handle.readlines()
    except (IOError, OSError) as exc:
        print(f"Warning: Could not read {file_path}: {exc}", file=sys.stderr)
        return hits

    for number, text in enumerate(content, start=1):
        if has_exclusion_marker(text):
            continue

        for name, spec in SECRET_PATTERNS.items():
            for found in re.finditer(spec["pattern"], text):
                # One finding per surviving match, so a line can appear
                # several times in the result.
                if not is_excluded_match(text, found.group(0)):
                    hits.append((number, text.strip(), name, spec["description"]))

    return hits


def scan_files(file_paths: List[str]) -> dict:
    """Run ``scan_file`` over every non-excluded path and collect the hits.

    Args:
        file_paths: Paths to consider; excluded files are skipped.

    Returns:
        A dict mapping each file path that produced findings to its list of
        ``(line_num, line, pattern, description)`` tuples. Files without
        findings do not appear.
    """
    scanned = (
        (candidate, scan_file(candidate))
        for candidate in file_paths
        if not should_exclude_file(candidate)
    )
    return {candidate: hits for candidate, hits in scanned if hits}


def print_findings(results: dict) -> None:
    """Write a human-readable report of the findings to stdout.

    Nothing is printed for an empty ``results``. Otherwise the report lists
    each file with its findings (line content truncated to 100 characters),
    a total count, and remediation hints.

    Args:
        results: Mapping of file path to a list of
            ``(line_num, line, pattern_name, description)`` tuples.
    """
    if not results:
        return

    heavy_rule = "=" * 80
    report = [heavy_rule, "POTENTIAL SECRETS DETECTED!", heavy_rule, ""]

    count = 0
    for file_path, findings in results.items():
        report.append(f"\nFILE: {file_path}")
        report.append("-" * 40)
        for line_num, content, pattern_name, description in findings:
            count += 1
            snippet = content[:100]
            if len(content) > 100:
                snippet += "..."
            report.extend(
                [
                    f"  Line {line_num}: {description}",
                    f"  Pattern: {pattern_name}",
                    f"  Content: {snippet}",
                    "",
                ]
            )

    report.extend(
        [
            heavy_rule,
            f"Total findings: {count}",
            heavy_rule,
            "",
            "To fix this:",
            "  1. Remove the secret from the file",
            "  2. Use environment variables or a secrets manager",
            "  3. If this is a false positive, add an exclusion marker:",
            "     - Add '# pragma: allowlist secret' to the end of the line",
            "     - Or add '# secret-detection:ignore' to the end of the line",
            "",
        ]
    )

    for row in report:
        print(row)


def _expand_targets(raw_paths):
    """Yield file paths from ``raw_paths``, recursing into directory arguments."""
    for raw in raw_paths:
        candidate = Path(raw)
        if candidate.is_dir():
            # Sorted so the report order is deterministic across runs.
            for child in sorted(candidate.rglob("*")):
                if child.is_file():
                    yield str(child)
        else:
            # Non-directories (including missing paths) are passed through
            # unchanged; the scanner reports unreadable files itself.
            yield raw


def _matches_user_exclude(file_path, patterns):
    """Return True if ``file_path`` or its base name matches any glob in ``patterns``."""
    import fnmatch

    base_name = Path(file_path).name
    return any(
        fnmatch.fnmatch(file_path, pattern) or fnmatch.fnmatch(base_name, pattern)
        for pattern in patterns
    )


def main() -> int:
    """Main entry point.

    Parses the command line, drops files that are excluded either by the
    built-in rules or by user-supplied ``--exclude`` globs, scans the rest
    and prints a report.

    Returns:
        0 when no secrets are found, 1 when at least one finding is reported.
        (argparse itself exits with status 2 on usage errors.)
    """
    parser = argparse.ArgumentParser(
        description="Detect secrets in files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s file1.py file2.yaml
  %(prog)s --exclude "*.md" src/

Exit codes:
  0 - No secrets found
  1 - Secrets detected
  2 - Error
        """,
    )
    parser.add_argument(
        "files",
        nargs="+",
        help="Files to scan",
    )
    parser.add_argument(
        "--exclude",
        action="append",
        default=[],
        help="Additional file patterns to exclude",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print verbose output",
    )

    args = parser.parse_args()

    files_to_scan = []
    # Directory arguments (as in the epilog example) are expanded to the
    # files they contain; previously they only produced a read warning.
    for file_path in _expand_targets(args.files):
        # --exclude globs were parsed but never applied; honor them here
        # alongside the built-in exclusion rules.
        if should_exclude_file(file_path) or _matches_user_exclude(file_path, args.exclude):
            if args.verbose:
                print(f"Skipping excluded file: {file_path}")
            continue
        files_to_scan.append(file_path)

    if args.verbose:
        print(f"Scanning {len(files_to_scan)} files...")

    results = scan_files(files_to_scan)

    if results:
        print_findings(results)
        return 1

    if args.verbose:
        print("No secrets detected!")

    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())
security: add pre-commit hook for secret leak detection (#384) 2026-04-05 00:27:00 +00:00			`#!/usr/bin/env python3`
			`"""`
			`Secret leak detection script for pre-commit hooks.`

			`Detects common secret patterns in staged files:`
			`- API keys (sk-, pk_, etc.)`
			`- Private keys (-----BEGIN PRIVATE KEY-----)`
			`- Passwords in config files`
			`- GitHub/Gitea tokens`
			`- Database connection strings with credentials`
			`"""`

			`import argparse`
			`import re`
			`import sys`
			`from pathlib import Path`
			`from typing import List, Tuple`


			`# Secret patterns to detect`
			`SECRET_PATTERNS = {`
			`"openai_api_key": {`
			`"pattern": r"sk-[a-zA-Z0-9]{20,}",`
			`"description": "OpenAI API key",`
			`},`
			`"anthropic_api_key": {`
			`"pattern": r"sk-ant-[a-zA-Z0-9]{32,}",`
			`"description": "Anthropic API key",`
			`},`
			`"generic_api_key": {`
			`"pattern": r"(?i)(api[_-]?key\|apikey)\s[:=]\s['\"]?([a-zA-Z0-9_\-]{16,})['\"]?",`
			`"description": "Generic API key",`
			`},`
			`"private_key": {`
			`"pattern": r"-----BEGIN (RSA \|DSA \|EC \|OPENSSH )?PRIVATE KEY-----",`
			`"description": "Private key",`
			`},`
			`"github_token": {`
			`"pattern": r"gh[pousr]_[A-Za-z0-9_]{36,}",`
			`"description": "GitHub token",`
			`},`
			`"gitea_token": {`
			`"pattern": r"gitea_[a-f0-9]{40}",`
			`"description": "Gitea token",`
			`},`
			`"aws_access_key": {`
			`"pattern": r"AKIA[0-9A-Z]{16}",`
			`"description": "AWS Access Key ID",`
			`},`
			`"aws_secret_key": {`
			`"pattern": r"(?i)aws[_-]?secret[_-]?(access)?[_-]?key\s[:=]\s['\"]?([a-zA-Z0-9/+=]{40})['\"]?",`
			`"description": "AWS Secret Access Key",`
			`},`
			`"database_connection_string": {`
			`"pattern": r"(?i)(mongodb\|mysql\|postgresql\|postgres\|redis)://[^:]+:[^@]+@[^/]+",`
			`"description": "Database connection string with credentials",`
			`},`
			`"password_in_config": {`
			`"pattern": r"(?i)(password\|passwd\|pwd)\s[:=]\s['\"]([^'\"]{4,})['\"]",`
			`"description": "Hardcoded password",`
			`},`
			`"stripe_key": {`
			`"pattern": r"sk_(live\|test)_[0-9a-zA-Z]{24,}",`
			`"description": "Stripe API key",`
			`},`
			`"slack_token": {`
			`"pattern": r"xox[baprs]-[0-9a-zA-Z]{10,}",`
			`"description": "Slack token",`
			`},`
			`"telegram_bot_token": {`
			`"pattern": r"[0-9]{8,10}:[a-zA-Z0-9_-]{35}",`
			`"description": "Telegram bot token",`
			`},`
			`"jwt_token": {`
			`"pattern": r"eyJ[a-zA-Z0-9_-]\.eyJ[a-zA-Z0-9_-]\.[a-zA-Z0-9_-]*",`
			`"description": "JWT token",`
			`},`
			`"bearer_token": {`
			`"pattern": r"(?i)bearer\s+[a-zA-Z0-9_\-\.=]{20,}",`
			`"description": "Bearer token",`
			`},`
			`}`

			`# Files/patterns to exclude from scanning`
			`EXCLUSIONS = {`
			`"files": {`
			`".pre-commit-hooks.yaml",`
			`".gitignore",`
			`"poetry.lock",`
			`"package-lock.json",`
			`"yarn.lock",`
			`"Pipfile.lock",`
			`".secrets.baseline",`
			`},`
			`"extensions": {`
			`".md",`
			`".svg",`
			`".png",`
			`".jpg",`
			`".jpeg",`
			`".gif",`
			`".ico",`
			`".woff",`
			`".woff2",`
			`".ttf",`
			`".eot",`
			`},`
			`"paths": {`
			`".git/",`
			`"node_modules/",`
			`"__pycache__/",`
			`".pytest_cache/",`
			`".mypy_cache/",`
			`".venv/",`
			`"venv/",`
			`".tox/",`
			`"dist/",`
			`"build/",`
			`".eggs/",`
			`},`
			`"patterns": {`
			`r"your_[a-z_]+_here",`
			`r"example_[a-z_]+",`
			`r"dummy_[a-z_]+",`
			`r"test_[a-z_]+",`
			`r"fake_[a-z_]+",`
			`r"password\s[=:]\s['\"]?(changeme\|password\|123456\|admin)['\"]?",`
			`r"#.*(?:example\|placeholder\|sample)",`
			`r"(mongodb\|mysql\|postgresql)://[^:]+:[^@]+@localhost",`
			`r"(mongodb\|mysql\|postgresql)://[^:]+:[^@]+@127\.0\.0\.1",`
			`},`
			`}`

			`# Markers for inline exclusions`
			`EXCLUSION_MARKERS = [`
			`"# pragma: allowlist secret",`
			`"# noqa: secret",`
			`"// pragma: allowlist secret",`
			`"/* pragma: allowlist secret */",`
			`"# secret-detection:ignore",`
			`]`


			`def should_exclude_file(file_path: str) -> bool:`
			`"""Check if file should be excluded from scanning."""`
			`path = Path(file_path)`

			`if path.name in EXCLUSIONS["files"]:`
			`return True`

			`if path.suffix.lower() in EXCLUSIONS["extensions"]:`
			`return True`

			`for excluded_path in EXCLUSIONS["paths"]:`
			`if excluded_path in str(path):`
			`return True`

			`return False`


			`def has_exclusion_marker(line: str) -> bool:`
			`"""Check if line has an exclusion marker."""`
			`return any(marker in line for marker in EXCLUSION_MARKERS)`


			`def is_excluded_match(line: str, match_str: str) -> bool:`
			`"""Check if the match should be excluded."""`
			`for pattern in EXCLUSIONS["patterns"]:`
			`if re.search(pattern, line, re.IGNORECASE):`
			`return True`

			`if re.search(r"['\"](fake\|test\|dummy\|example\|placeholder\|changeme)['\"]", line, re.IGNORECASE):`
			`return True`

			`return False`


			`def scan_file(file_path: str) -> List[Tuple[int, str, str, str]]:`
			`"""Scan a single file for secrets.`

			`Returns list of tuples: (line_number, line_content, pattern_name, description)`
			`"""`
			`findings = []`

			`try:`
			`with open(file_path, "r", encoding="utf-8", errors="ignore") as f:`
			`lines = f.readlines()`
			`except (IOError, OSError) as e:`
			`print(f"Warning: Could not read {file_path}: {e}", file=sys.stderr)`
			`return findings`

			`for line_num, line in enumerate(lines, 1):`
			`if has_exclusion_marker(line):`
			`continue`

			`for pattern_name, pattern_info in SECRET_PATTERNS.items():`
			`matches = re.finditer(pattern_info["pattern"], line)`
			`for match in matches:`
			`match_str = match.group(0)`

			`if is_excluded_match(line, match_str):`
			`continue`

			`findings.append(`
			`(line_num, line.strip(), pattern_name, pattern_info["description"])`
			`)`

			`return findings`


			`def scan_files(file_paths: List[str]) -> dict:`
			`"""Scan multiple files for secrets.`

			`Returns dict: {file_path: [(line_num, line, pattern, description), ...]}`
			`"""`
			`results = {}`

			`for file_path in file_paths:`
			`if should_exclude_file(file_path):`
			`continue`

			`findings = scan_file(file_path)`
			`if findings:`
			`results[file_path] = findings`

			`return results`


			`def print_findings(results: dict) -> None:`
			`"""Print secret findings in a readable format."""`
			`if not results:`
			`return`

			`print("=" * 80)`
			`print("POTENTIAL SECRETS DETECTED!")`
			`print("=" * 80)`
			`print()`

			`total_findings = 0`
			`for file_path, findings in results.items():`
			`print(f"\nFILE: {file_path}")`
			`print("-" * 40)`
			`for line_num, line, pattern_name, description in findings:`
			`total_findings += 1`
			`print(f" Line {line_num}: {description}")`
			`print(f" Pattern: {pattern_name}")`
			`print(f" Content: {line[:100]}{'...' if len(line) > 100 else ''}")`
			`print()`

			`print("=" * 80)`
			`print(f"Total findings: {total_findings}")`
			`print("=" * 80)`
			`print()`
			`print("To fix this:")`
			`print(" 1. Remove the secret from the file")`
			`print(" 2. Use environment variables or a secrets manager")`
			`print(" 3. If this is a false positive, add an exclusion marker:")`
			`print(" - Add '# pragma: allowlist secret' to the end of the line")`
			`print(" - Or add '# secret-detection:ignore' to the end of the line")`
			`print()`


			`def main() -> int:`
			`"""Main entry point."""`
			`parser = argparse.ArgumentParser(`
			`description="Detect secrets in files",`
			`formatter_class=argparse.RawDescriptionHelpFormatter,`
			`epilog="""`
			`Examples:`
			`%(prog)s file1.py file2.yaml`
			`%(prog)s --exclude "*.md" src/`

			`Exit codes:`
			`0 - No secrets found`
			`1 - Secrets detected`
			`2 - Error`
			`""",`
			`)`
			`parser.add_argument(`
			`"files",`
			`nargs="+",`
			`help="Files to scan",`
			`)`
			`parser.add_argument(`
			`"--exclude",`
			`action="append",`
			`default=[],`
			`help="Additional file patterns to exclude",`
			`)`
			`parser.add_argument(`
			`"--verbose",`
			`"-v",`
			`action="store_true",`
			`help="Print verbose output",`
			`)`

			`args = parser.parse_args()`

			`files_to_scan = []`
			`for file_path in args.files:`
			`if should_exclude_file(file_path):`
			`if args.verbose:`
			`print(f"Skipping excluded file: {file_path}")`
			`continue`
			`files_to_scan.append(file_path)`

			`if args.verbose:`
			`print(f"Scanning {len(files_to_scan)} files...")`

			`results = scan_files(files_to_scan)`

			`if results:`
			`print_findings(results)`
			`return 1`

			`if args.verbose:`
			`print("No secrets detected!")`

			`return 0`


			`if __name__ == "__main__":`
			`sys.exit(main())`