#!/usr/bin/env python3
"""
Pre-commit hook for detecting secret leaks in staged files.

Scans staged diffs and full file contents for common secret patterns,
token file paths, private keys, and credential strings.

Installation:
    git config core.hooksPath .githooks

To bypass:
    git commit --no-verify
"""

from __future__ import annotations

import re
import subprocess
import sys
from pathlib import Path
from typing import Iterable, List, Callable, Union

# ANSI color codes
RED = "\033[0;31m"
YELLOW = "\033[1;33m"
GREEN = "\033[0;32m"
NC = "\033[0m"


class Finding:
    """Represents a single secret leak finding.

    Two findings compare (and hash) equal when filename, line and message
    all match, which lets callers de-duplicate them.
    """

    def __init__(self, filename: str, line: int, message: str) -> None:
        self.filename = filename
        self.line = line
        self.message = message

    def __repr__(self) -> str:
        return f"Finding({self.filename!r}, {self.line}, {self.message!r})"

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Finding):
            return NotImplemented
        return (
            self.filename == other.filename
            and self.line == other.line
            and self.message == other.message
        )

    def __hash__(self) -> int:
        # Defining __eq__ alone makes a class unhashable; hash the same
        # fields that __eq__ compares so equal findings hash equally.
        return hash((self.filename, self.line, self.message))


# ---------------------------------------------------------------------------
# Regex patterns
# ---------------------------------------------------------------------------

_RE_SK_KEY = re.compile(r"sk-[a-zA-Z0-9]{20,}")
_RE_BEARER = re.compile(r"Bearer\s+[a-zA-Z0-9_-]{20,}")

_RE_ENV_ASSIGN = re.compile(
    r"^(?:export\s+)?"
    r"(OPENAI_API_KEY|GITEA_TOKEN|ANTHROPIC_API_KEY|KIMI_API_KEY"
    r"|TELEGRAM_BOT_TOKEN|DISCORD_TOKEN)"
    r"\s*=\s*(.+)$"
)

_RE_TOKEN_PATHS = re.compile(
    r'(?:^|["\'\s])'
    r"(\.(?:env)"
    r"|(?:secrets|keystore|credentials|token|api_keys)\.json"
    r"|~/\.hermes/credentials/"
    r"|/root/nostr-relay/keystore\.json)"
)

_RE_PRIVATE_KEY = re.compile(
    r"-----BEGIN (PRIVATE KEY|RSA PRIVATE KEY|OPENSSH PRIVATE KEY)-----"
)

# user:password@ inside the authority part of a URL.  Both halves exclude
# whitespace, "/" and "@" so the match cannot run from a "host:port" into an
# unrelated "@" later on the line (e.g. an e-mail address in a comment).
_RE_URL_PASSWORD = re.compile(r"https?://[^\s:/@]+:[^\s/@]+@")

_RE_RAW_TOKEN = re.compile(r'"token"\s*:\s*"([^"]{10,})"')
_RE_RAW_API_KEY = re.compile(r'"api_key"\s*:\s*"([^"]{10,})"')

# Safe patterns (placeholders)
_SAFE_ENV_VALUES = {
    "",
    "***",
    "REDACTED",
}

_RE_DOC_EXAMPLE = re.compile(
    r"\b(?:example|documentation|doc|readme)\b",
    re.IGNORECASE,
)

_RE_OS_ENVIRON = re.compile(r"os\.environ(?:\.get|\[)")

# A shell-style variable reference at the very end of a line ($VAR).
_RE_TRAILING_VAR_REF = re.compile(r"\$\w+\s*$")
# A value that consists of nothing but a variable reference.
_RE_VAR_REF = re.compile(r"\$\w+")

# Unified diff hunk header; group 1 is the first line number on the new side.
_RE_HUNK_HEADER = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@")


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def is_binary_content(content: Union[str, bytes]) -> bool:
    """Return True if content appears to be binary.

    ``str`` input is never binary; ``bytes`` input is binary when it
    contains a NUL byte.
    """
    if isinstance(content, str):
        return False
    return b"\x00" in content


def _looks_like_safe_env_line(line: str) -> bool:
    """Check if a line is a safe env var read or reference."""
    if _RE_OS_ENVIRON.search(line):
        return True
    # Variable expansion like $OPENAI_API_KEY
    if _RE_TRAILING_VAR_REF.search(line.strip()):
        return True
    return False


def _is_placeholder(value: str) -> bool:
    """Check if a value is a known placeholder or empty."""
    stripped = value.strip().strip('"').strip("'")
    if stripped in _SAFE_ENV_VALUES:
        return True
    # Single word references like $VAR
    if _RE_VAR_REF.fullmatch(stripped):
        return True
    return False


def _is_doc_or_example(line: str, value: str | None = None) -> bool:
    """Check if line appears to be documentation or example code."""
    # If the line contains a placeholder value, it's likely documentation
    if value is not None and _is_placeholder(value):
        return True
    # If the line contains doc keywords and no actual secret-looking value
    if _RE_DOC_EXAMPLE.search(line):
        # For env assignments, if value is empty or placeholder
        m = _RE_ENV_ASSIGN.search(line)
        if m and _is_placeholder(m.group(2)):
            return True
    return False


def _diff_target_path(header_path: str) -> str:
    """Turn the text after ``+++ `` in a diff header into a repository path.

    Git appends a tab after paths containing spaces and prefixes the new
    side with ``b/``.  Paths that git chose to quote are returned unchanged.
    """
    path = header_path.rstrip("\t")
    if path.startswith("b/"):
        return path[2:]
    return path


# ---------------------------------------------------------------------------
# Scanning
# ---------------------------------------------------------------------------


def scan_line(line: str, filename: str, line_no: int) -> Iterable[Finding]:
    """Scan a single line for secret leak patterns.

    Yields at most one ``Finding`` per line; the checks run in a fixed
    order and the first one that matches wins.
    """
    stripped = line.rstrip("\n")
    if not stripped:
        return

    # --- API keys ----------------------------------------------------------
    if _RE_SK_KEY.search(stripped):
        yield Finding(filename, line_no, "Potential API key (sk-...) found")
        return  # One finding per line is enough

    if _RE_BEARER.search(stripped):
        yield Finding(filename, line_no, "Potential Bearer token found")
        return

    # --- Env var assignments -----------------------------------------------
    m = _RE_ENV_ASSIGN.search(stripped)
    if m:
        var_name = m.group(1)
        value = m.group(2)
        if _looks_like_safe_env_line(stripped):
            return
        if _is_doc_or_example(stripped, value):
            return
        if not _is_placeholder(value):
            yield Finding(
                filename,
                line_no,
                f"Potential secret assignment: {var_name}=...",
            )
            return

    # --- Token file paths --------------------------------------------------
    if _RE_TOKEN_PATHS.search(stripped):
        yield Finding(filename, line_no, "Potential token file path found")
        return

    # --- Private key blocks ------------------------------------------------
    if _RE_PRIVATE_KEY.search(stripped):
        yield Finding(filename, line_no, "Private key block found")
        return

    # --- Passwords in URLs -------------------------------------------------
    if _RE_URL_PASSWORD.search(stripped):
        yield Finding(filename, line_no, "Password in URL found")
        return

    # --- Raw token patterns ------------------------------------------------
    if _RE_RAW_TOKEN.search(stripped):
        yield Finding(filename, line_no, 'Raw "token" string with long value')
        return

    if _RE_RAW_API_KEY.search(stripped):
        yield Finding(filename, line_no, 'Raw "api_key" string with long value')
        return


def scan_content(content: Union[str, bytes], filename: str) -> List[Finding]:
    """Scan full file content for secrets.

    ``bytes`` content is decoded as UTF-8; content that fails to decode is
    skipped and yields no findings.  Line numbers start at 1.
    """
    if isinstance(content, bytes):
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            return []
    else:
        text = content

    findings: List[Finding] = []
    for line_no, line in enumerate(text.splitlines(), start=1):
        findings.extend(scan_line(line, filename, line_no))
    return findings


def scan_files(
    files: List[str],
    content_reader: Callable[[str], bytes],
) -> List[Finding]:
    """Scan a list of files using the provided content reader.

    Files whose content looks binary are skipped.
    """
    findings: List[Finding] = []
    for filepath in files:
        content = content_reader(filepath)
        if is_binary_content(content):
            continue
        findings.extend(scan_content(content, filepath))
    return findings


def scan_diff(diff_text: str) -> List[Finding]:
    """Scan the added lines of a unified diff for secrets.

    The current file is taken from each ``+++`` header and the line number
    from each ``@@`` hunk header, so findings carry the same filename and
    line number a scan of the resulting file would report.
    """
    findings: List[Finding] = []
    filename = ""
    new_line_no = 0
    in_hunk = False

    for raw in diff_text.splitlines():
        # Hunk body lines always start with "+", "-", " " or "\", so a line
        # starting with "diff --git " or "@@" is unambiguously structural.
        if raw.startswith("diff --git "):
            in_hunk = False
            filename = ""
            continue

        hunk = _RE_HUNK_HEADER.match(raw)
        if hunk:
            in_hunk = True
            new_line_no = int(hunk.group(1))
            continue

        if not in_hunk:
            # File header section: only the new-side path is of interest.
            if raw.startswith("+++ "):
                filename = _diff_target_path(raw[4:])
            continue

        if raw.startswith("+"):
            # Inside a hunk every "+" line is content, even "+++...".
            findings.extend(scan_line(raw[1:], filename, new_line_no))
            new_line_no += 1
        elif raw.startswith(" "):
            # Context line: present in the new file, so it advances the count.
            new_line_no += 1
        # Removed ("-") and "\ No newline" lines do not exist on the new side.

    return findings


# ---------------------------------------------------------------------------
# Git helpers
# ---------------------------------------------------------------------------


def get_staged_files() -> List[str]:
    """Return a list of staged file paths (excluding deletions).

    Returns an empty list when the git command fails.
    """
    # -z gives NUL-terminated, unquoted paths; without it git C-quotes
    # unusual names and the quoted form cannot be passed back to git.
    result = subprocess.run(
        ["git", "diff", "--cached", "--name-only", "-z", "--diff-filter=ACMR"],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        return []
    return [f for f in result.stdout.split("\0") if f]


def get_staged_diff() -> str:
    """Return the diff of staged changes, or "" when the git command fails."""
    result = subprocess.run(
        [
            "git",
            "diff",
            "--cached",
            "--no-color",
            "-U0",
            # Pin the path prefixes so header parsing does not depend on the
            # user's diff.noprefix / diff.mnemonicPrefix configuration.
            "--src-prefix=a/",
            "--dst-prefix=b/",
        ],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        return ""
    return result.stdout


def get_file_content_at_staged(filepath: str) -> bytes:
    """Return the staged content of a file, or b"" when git fails."""
    result = subprocess.run(
        ["git", "show", f":{filepath}"],
        capture_output=True,
    )
    if result.returncode != 0:
        return b""
    return result.stdout


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def main() -> int:
    """Run the scan over the staged changes.

    Returns 0 when nothing is staged or nothing suspicious is found, and 1
    (after printing a report) when at least one potential leak is detected.
    """
    print(f"{GREEN}🔍 Scanning for secret leaks in staged files...{NC}")

    staged_files = get_staged_files()
    if not staged_files:
        print(f"{GREEN}✓ No files staged for commit{NC}")
        return 0

    # Scan both full staged file contents and the diff content
    findings = scan_files(staged_files, get_file_content_at_staged)

    diff_text = get_staged_diff()
    if diff_text:
        findings.extend(scan_diff(diff_text))

    # An added line is normally seen by both scans; report it once,
    # keeping first-seen order.
    findings = list(dict.fromkeys(findings))

    if not findings:
        print(f"{GREEN}✓ No potential secret leaks detected{NC}")
        return 0

    print(f"{RED}✗ Potential secret leaks detected:{NC}\n")
    for finding in findings:
        loc = finding.filename
        print(
            f" {RED}[LEAK]{NC} {loc}:{finding.line} — {finding.message}"
        )

    print()
    print(f"{RED}╔════════════════════════════════════════════════════════════╗{NC}")
    print(f"{RED}║ COMMIT BLOCKED: Potential secrets detected! ║{NC}")
    print(f"{RED}╚════════════════════════════════════════════════════════════╝{NC}")
    print()
    print("Recommendations:")
    print(" 1. Remove secrets from your code")
    print(" 2. Use environment variables or a secrets manager")
    print(" 3. Add sensitive files to .gitignore")
    print(" 4. Rotate any exposed credentials immediately")
    print()
    print("If you are CERTAIN this is a false positive, you can bypass:")
    print(" git commit --no-verify")
    print()

    return 1


if __name__ == "__main__":
    sys.exit(main())