security: add pre-commit hook for secret leak detection (#384)

2026-04-05 00:27:00 +00:00
parent d5c357df76
commit 5ace1e69ce
4 changed files with 603 additions and 0 deletions
--- a/.pre-commit-hooks.yaml
+++ b/.pre-commit-hooks.yaml
@@ -0,0 +1,42 @@
+# Pre-commit hooks configuration for timmy-home
+# See https://pre-commit.com for more information
+
+repos:
+  # Standard pre-commit hooks
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: trailing-whitespace
+        exclude: '\.(md|txt)$'
+      - id: end-of-file-fixer
+        exclude: '\.(md|txt)$'
+      - id: check-yaml
+      - id: check-json
+      - id: check-added-large-files
+        args: ['--maxkb=5000']
+      - id: check-merge-conflict
+      - id: check-symlinks
+      - id: detect-private-key
+
+  # Secret detection - custom local hook
+  - repo: local
+    hooks:
+      - id: detect-secrets
+        name: Detect Secrets
+        description: Scan for API keys, tokens, and other secrets
+        entry: python3 scripts/detect_secrets.py
+        language: python
+        types: [text]
+        exclude: 
+          '(?x)^(
+            .*\.md$|
+            .*\.svg$|
+            .*\.lock$|
+            .*-lock\..*$|
+            \.gitignore$|
+            \.secrets\.baseline$|
+            tests/test_secret_detection\.py$
+          )'
+        pass_filenames: true
+        require_serial: false
+        verbose: true
--- a/README.md
+++ b/README.md
@@ -0,0 +1,132 @@
+# Timmy Home
+
+Timmy Foundation's home repository for development operations and configurations.
+
+## Security
+
+### Pre-commit Hook for Secret Detection
+
+This repository includes a pre-commit hook that automatically scans for secrets (API keys, tokens, passwords) before allowing commits.
+
+#### Setup
+
+Install pre-commit hooks:
+
+```bash
+pip install pre-commit
+pre-commit install
+```
+
+#### What Gets Scanned
+
+The hook detects:
+- **API Keys**: OpenAI (`sk-*`), Anthropic (`sk-ant-*`), AWS, Stripe
+- **Private Keys**: RSA, DSA, EC, OpenSSH private keys
+- **Tokens**: GitHub (`ghp_*`), Gitea, Slack, Telegram, JWT, Bearer tokens
+- **Database URLs**: Connection strings with embedded credentials
+- **Passwords**: Hardcoded passwords in configuration files
+
+#### How It Works
+
+Before each commit, the hook:
+1. Scans all staged text files
+2. Checks against patterns for common secret formats
+3. Reports any potential secrets found
+4. Blocks the commit if secrets are detected
+
+#### Handling False Positives
+
+If the hook flags something that is not actually a secret (e.g., test fixtures, placeholder values), you can:
+
+**Option 1: Add an exclusion marker to the line**
+
+```python
+# Add one of these markers to the end of the line:
+api_key = "sk-test123"  # pragma: allowlist secret
+api_key = "sk-test123"  # noqa: secret
+api_key = "sk-test123"  # secret-detection:ignore
+```
+
+**Option 2: Use placeholder values (auto-excluded)**
+
+These patterns are automatically excluded:
+- `changeme`, `password`, `123456`, `admin` (common defaults)
+- Values containing `fake_`, `test_`, `dummy_`, `example_`, `placeholder_`
+- URLs with `localhost` or `127.0.0.1`
+
+**Option 3: Skip the hook (emergency only)**
+
+```bash
+git commit --no-verify  # Bypasses all pre-commit hooks
+```
+
+⚠️ **Warning**: Only use `--no-verify` if you are certain no real secrets are being committed.
+
+#### CI/CD Integration
+
+The secret detection script can also be run in CI/CD:
+
+```bash
+# Scan specific files
+python3 scripts/detect_secrets.py file1.py file2.yaml
+
+# Scan with verbose output
+python3 scripts/detect_secrets.py --verbose src/
+
+# Run tests
+python3 tests/test_secret_detection.py
+```
+
+#### Excluded Files
+
+The following are automatically excluded from scanning:
+- Markdown files (`.md`)
+- Lock files (`package-lock.json`, `poetry.lock`, `yarn.lock`)
+- Image and font files
+- `node_modules/`, `__pycache__/`, `.git/`
+
+#### Testing the Detection
+
+To verify the detection works:
+
+```bash
+# Run the test suite
+python3 tests/test_secret_detection.py
+
+# Test with a specific file
+echo "API_KEY=sk-test123456789" > /tmp/test_secret.py
+python3 scripts/detect_secrets.py /tmp/test_secret.py
+# Should report: OpenAI API key detected
+```
+
+## Development
+
+### Running Tests
+
+```bash
+# Run secret detection tests
+python3 tests/test_secret_detection.py
+
+# Run all tests
+pytest tests/
+```
+
+### Project Structure
+
+```
+.
+├── .pre-commit-hooks.yaml    # Pre-commit configuration
+├── scripts/
+│   └── detect_secrets.py     # Secret detection script
+├── tests/
+│   └── test_secret_detection.py  # Test cases
+└── README.md                 # This file
+```
+
+## Contributing
+
+See [CONTRIBUTING.md](CONTRIBUTING.md) for contribution guidelines.
+
+## License
+
+This project is part of the Timmy Foundation.
--- a/scripts/detect_secrets.py
+++ b/scripts/detect_secrets.py
@@ -0,0 +1,323 @@
+#!/usr/bin/env python3
+"""
+Secret leak detection script for pre-commit hooks.
+
+Detects common secret patterns in staged files:
+- API keys (sk-*, pk_*, etc.)
+- Private keys (-----BEGIN PRIVATE KEY-----)
+- Passwords in config files
+- GitHub/Gitea tokens
+- Database connection strings with credentials
+"""
+
+import argparse
+import re
+import sys
+from pathlib import Path
+from typing import List, Tuple
+
+
+# Secret patterns to detect
+SECRET_PATTERNS = {
+    "openai_api_key": {
+        "pattern": r"sk-[a-zA-Z0-9]{20,}",
+        "description": "OpenAI API key",
+    },
+    "anthropic_api_key": {
+        "pattern": r"sk-ant-[a-zA-Z0-9]{32,}",
+        "description": "Anthropic API key",
+    },
+    "generic_api_key": {
+        "pattern": r"(?i)(api[_-]?key|apikey)\s*[:=]\s*['\"]?([a-zA-Z0-9_\-]{16,})['\"]?",
+        "description": "Generic API key",
+    },
+    "private_key": {
+        "pattern": r"-----BEGIN (RSA |DSA |EC |OPENSSH )?PRIVATE KEY-----",
+        "description": "Private key",
+    },
+    "github_token": {
+        "pattern": r"gh[pousr]_[A-Za-z0-9_]{36,}",
+        "description": "GitHub token",
+    },
+    "gitea_token": {
+        "pattern": r"gitea_[a-f0-9]{40}",
+        "description": "Gitea token",
+    },
+    "aws_access_key": {
+        "pattern": r"AKIA[0-9A-Z]{16}",
+        "description": "AWS Access Key ID",
+    },
+    "aws_secret_key": {
+        "pattern": r"(?i)aws[_-]?secret[_-]?(access)?[_-]?key\s*[:=]\s*['\"]?([a-zA-Z0-9/+=]{40})['\"]?",
+        "description": "AWS Secret Access Key",
+    },
+    "database_connection_string": {
+        "pattern": r"(?i)(mongodb|mysql|postgresql|postgres|redis)://[^:]+:[^@]+@[^/]+",
+        "description": "Database connection string with credentials",
+    },
+    "password_in_config": {
+        "pattern": r"(?i)(password|passwd|pwd)\s*[:=]\s*['\"]([^'\"]{4,})['\"]",
+        "description": "Hardcoded password",
+    },
+    "stripe_key": {
+        "pattern": r"sk_(live|test)_[0-9a-zA-Z]{24,}",
+        "description": "Stripe API key",
+    },
+    "slack_token": {
+        "pattern": r"xox[baprs]-[0-9a-zA-Z]{10,}",
+        "description": "Slack token",
+    },
+    "telegram_bot_token": {
+        "pattern": r"[0-9]{8,10}:[a-zA-Z0-9_-]{35}",
+        "description": "Telegram bot token",
+    },
+    "jwt_token": {
+        "pattern": r"eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*",
+        "description": "JWT token",
+    },
+    "bearer_token": {
+        "pattern": r"(?i)bearer\s+[a-zA-Z0-9_\-\.=]{20,}",
+        "description": "Bearer token",
+    },
+}
+
+# Files/patterns to exclude from scanning
+EXCLUSIONS = {
+    "files": {
+        ".pre-commit-hooks.yaml",
+        ".gitignore",
+        "poetry.lock",
+        "package-lock.json",
+        "yarn.lock",
+        "Pipfile.lock",
+        ".secrets.baseline",
+    },
+    "extensions": {
+        ".md",
+        ".svg",
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".gif",
+        ".ico",
+        ".woff",
+        ".woff2",
+        ".ttf",
+        ".eot",
+    },
+    "paths": {
+        ".git/",
+        "node_modules/",
+        "__pycache__/",
+        ".pytest_cache/",
+        ".mypy_cache/",
+        ".venv/",
+        "venv/",
+        ".tox/",
+        "dist/",
+        "build/",
+        ".eggs/",
+    },
+    "patterns": {
+        r"your_[a-z_]+_here",
+        r"example_[a-z_]+",
+        r"dummy_[a-z_]+",
+        r"test_[a-z_]+",
+        r"fake_[a-z_]+",
+        r"password\s*[=:]\s*['\"]?(changeme|password|123456|admin)['\"]?",
+        r"#.*(?:example|placeholder|sample)",
+        r"(mongodb|mysql|postgresql)://[^:]+:[^@]+@localhost",
+        r"(mongodb|mysql|postgresql)://[^:]+:[^@]+@127\.0\.0\.1",
+    },
+}
+
+# Markers for inline exclusions
+EXCLUSION_MARKERS = [
+    "# pragma: allowlist secret",
+    "# noqa: secret",
+    "// pragma: allowlist secret",
+    "/* pragma: allowlist secret */",
+    "# secret-detection:ignore",
+]
+
+
+def should_exclude_file(file_path: str) -> bool:
+    """Check if file should be excluded from scanning."""
+    path = Path(file_path)
+
+    if path.name in EXCLUSIONS["files"]:
+        return True
+
+    if path.suffix.lower() in EXCLUSIONS["extensions"]:
+        return True
+
+    for excluded_path in EXCLUSIONS["paths"]:
+        if excluded_path in str(path):
+            return True
+
+    return False
+
+
+def has_exclusion_marker(line: str) -> bool:
+    """Check if line has an exclusion marker."""
+    return any(marker in line for marker in EXCLUSION_MARKERS)
+
+
+def is_excluded_match(line: str, match_str: str) -> bool:
+    """Check if the match should be excluded."""
+    for pattern in EXCLUSIONS["patterns"]:
+        if re.search(pattern, line, re.IGNORECASE):
+            return True
+
+    if re.search(r"['\"](fake|test|dummy|example|placeholder|changeme)['\"]", line, re.IGNORECASE):
+        return True
+
+    return False
+
+
+def scan_file(file_path: str) -> List[Tuple[int, str, str, str]]:
+    """Scan a single file for secrets.
+    
+    Returns list of tuples: (line_number, line_content, pattern_name, description)
+    """
+    findings = []
+
+    try:
+        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+            lines = f.readlines()
+    except (IOError, OSError) as e:
+        print(f"Warning: Could not read {file_path}: {e}", file=sys.stderr)
+        return findings
+
+    for line_num, line in enumerate(lines, 1):
+        if has_exclusion_marker(line):
+            continue
+
+        for pattern_name, pattern_info in SECRET_PATTERNS.items():
+            matches = re.finditer(pattern_info["pattern"], line)
+            for match in matches:
+                match_str = match.group(0)
+
+                if is_excluded_match(line, match_str):
+                    continue
+
+                findings.append(
+                    (line_num, line.strip(), pattern_name, pattern_info["description"])
+                )
+
+    return findings
+
+
+def scan_files(file_paths: List[str]) -> dict:
+    """Scan multiple files for secrets.
+    
+    Returns dict: {file_path: [(line_num, line, pattern, description), ...]}
+    """
+    results = {}
+
+    for file_path in file_paths:
+        if should_exclude_file(file_path):
+            continue
+
+        findings = scan_file(file_path)
+        if findings:
+            results[file_path] = findings
+
+    return results
+
+
+def print_findings(results: dict) -> None:
+    """Print secret findings in a readable format."""
+    if not results:
+        return
+
+    print("=" * 80)
+    print("POTENTIAL SECRETS DETECTED!")
+    print("=" * 80)
+    print()
+
+    total_findings = 0
+    for file_path, findings in results.items():
+        print(f"\nFILE: {file_path}")
+        print("-" * 40)
+        for line_num, line, pattern_name, description in findings:
+            total_findings += 1
+            print(f"  Line {line_num}: {description}")
+            print(f"  Pattern: {pattern_name}")
+            print(f"  Content: {line[:100]}{'...' if len(line) > 100 else ''}")
+            print()
+
+    print("=" * 80)
+    print(f"Total findings: {total_findings}")
+    print("=" * 80)
+    print()
+    print("To fix this:")
+    print("  1. Remove the secret from the file")
+    print("  2. Use environment variables or a secrets manager")
+    print("  3. If this is a false positive, add an exclusion marker:")
+    print("     - Add '# pragma: allowlist secret' to the end of the line")
+    print("     - Or add '# secret-detection:ignore' to the end of the line")
+    print()
+
+
+def main() -> int:
+    """Main entry point."""
+    parser = argparse.ArgumentParser(
+        description="Detect secrets in files",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  %(prog)s file1.py file2.yaml
+  %(prog)s --exclude "*.md" src/
+
+Exit codes:
+  0 - No secrets found
+  1 - Secrets detected
+  2 - Error
+        """,
+    )
+    parser.add_argument(
+        "files",
+        nargs="+",
+        help="Files to scan",
+    )
+    parser.add_argument(
+        "--exclude",
+        action="append",
+        default=[],
+        help="Additional file patterns to exclude",
+    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Print verbose output",
+    )
+
+    args = parser.parse_args()
+
+    files_to_scan = []
+    for file_path in args.files:
+        if should_exclude_file(file_path):
+            if args.verbose:
+                print(f"Skipping excluded file: {file_path}")
+            continue
+        files_to_scan.append(file_path)
+
+    if args.verbose:
+        print(f"Scanning {len(files_to_scan)} files...")
+
+    results = scan_files(files_to_scan)
+
+    if results:
+        print_findings(results)
+        return 1
+
+    if args.verbose:
+        print("No secrets detected!")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/test_secret_detection.py
+++ b/tests/test_secret_detection.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""
+Test cases for secret detection script.
+
+These tests verify that the detect_secrets.py script correctly:
+1. Detects actual secrets
+2. Ignores false positives
+3. Respects exclusion markers
+"""
+
+import os
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+
+# Add scripts directory to path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "scripts"))
+
+from detect_secrets import (
+    scan_file,
+    scan_files,
+    should_exclude_file,
+    has_exclusion_marker,
+    is_excluded_match,
+    SECRET_PATTERNS,
+)
+
+
+class TestSecretDetection(unittest.TestCase):
+    """Test cases for secret detection."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.test_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        """Clean up test fixtures."""
+        import shutil
+        shutil.rmtree(self.test_dir, ignore_errors=True)
+
+    def _create_test_file(self, content: str, filename: str = "test.txt") -> str:
+        """Create a test file with given content."""
+        file_path = os.path.join(self.test_dir, filename)
+        with open(file_path, "w") as f:
+            f.write(content)
+        return file_path
+
+    def test_detect_openai_api_key(self):
+        """Test detection of OpenAI API keys."""
+        content = "api_key = 'sk-abcdefghijklmnopqrstuvwxyz123456'"
+        file_path = self._create_test_file(content)
+        findings = scan_file(file_path)
+        self.assertTrue(any("openai" in f[2].lower() for f in findings))
+
+    def test_detect_private_key(self):
+        """Test detection of private keys."""
+        content = "-----BEGIN RSA PRIVATE KEY-----\nMIIEpAIBAAKCAQEA0Z3VS5JJcds3xfn/ygWyF8PbnGy0AHB7MhgwMbRvI0MBZhpF\n-----END RSA PRIVATE KEY-----"
+        file_path = self._create_test_file(content)
+        findings = scan_file(file_path)
+        self.assertTrue(any("private" in f[2].lower() for f in findings))
+
+    def test_detect_database_connection_string(self):
+        """Test detection of database connection strings with credentials."""
+        content = "DATABASE_URL=mongodb://admin:secretpassword@mongodb.example.com:27017/db"
+        file_path = self._create_test_file(content)
+        findings = scan_file(file_path)
+        self.assertTrue(any("database" in f[2].lower() for f in findings))
+
+    def test_detect_password_in_config(self):
+        """Test detection of hardcoded passwords."""
+        content = "password = 'mysecretpassword123'"
+        file_path = self._create_test_file(content)
+        findings = scan_file(file_path)
+        self.assertTrue(any("password" in f[2].lower() for f in findings))
+
+    def test_exclude_placeholder_passwords(self):
+        """Test that placeholder passwords are excluded."""
+        content = "password = 'changeme'"
+        file_path = self._create_test_file(content)
+        findings = scan_file(file_path)
+        self.assertEqual(len(findings), 0)
+
+    def test_exclude_localhost_database_url(self):
+        """Test that localhost database URLs are excluded."""
+        content = "DATABASE_URL=mongodb://admin:secret@localhost:27017/db"
+        file_path = self._create_test_file(content)
+        findings = scan_file(file_path)
+        self.assertEqual(len(findings), 0)
+
+    def test_pragma_allowlist_secret(self):
+        """Test '# pragma: allowlist secret' marker."""
+        content = "api_key = 'sk-abcdefghijklmnopqrstuvwxyz123456'  # pragma: allowlist secret"
+        file_path = self._create_test_file(content)
+        findings = scan_file(file_path)
+        self.assertEqual(len(findings), 0)
+
+    def test_empty_file(self):
+        """Test scanning empty file."""
+        file_path = self._create_test_file("")
+        findings = scan_file(file_path)
+        self.assertEqual(len(findings), 0)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)