#!/bin/bash
#
# Pre-commit hook for detecting secret leaks in commits
# This hook scans staged files for potential secret leaks including:
# - Private keys (PEM, OpenSSH formats)
# - API keys (OpenAI, Anthropic, HuggingFace, etc.)
# - Token file paths in prompts/conversations
# - Environment variable names in sensitive contexts
# - AWS credentials, database connection strings, etc.
#
# Installation:
#   git config core.hooksPath .githooks
#
# To bypass this hook temporarily:
#   git commit --no-verify
#

# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# ANSI colour sequences. They are stored as literal backslash escapes and are
# only turned into control characters when printed with `echo -e`.
NC="\033[0m"          # reset / no colour
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
RED="\033[0;31m"

# Running totals plus the flag that decides whether the commit is rejected.
BLOCK_COMMIT=0 WARNING_FOUND=0 CRITICAL_FOUND=0

# Accumulates human-readable finding lines for the final report.
declare -a FINDINGS=()

# Get list of staged files (excluding deleted).
# Best-effort: if git itself fails the list is empty and the hook passes.
STAGED_FILES=$(git diff --cached --name-only --diff-filter=ACMR 2>/dev/null || true)

if [ -z "$STAGED_FILES" ]; then
    echo -e "${GREEN}✓ No files staged for commit${NC}"
    exit 0
fi

# Collect the added lines of the staged diff (each still prefixed with '+').
# awk tracks whether we are inside a hunk: file headers such as '+++ b/path'
# only appear before the first '@@' line of a file, so every '+' line seen
# after an '@@' line is real content - including content that itself begins
# with '+', which a plain '^\+[^+]' filter would drop.
STAGED_DIFF=$(git diff --cached --no-color -U0 2>/dev/null \
    | awk '/^diff --git /{in_hunk=0} /^@@ /{in_hunk=1; next} in_hunk && /^\+/' || true)

# Do NOT exit when there are no added text lines: binary or empty files
# (e.g. a .p12 keystore) produce no '+' lines, yet the file-name checks
# further down must still see them.
if [ -z "$STAGED_DIFF" ]; then
    echo "No added text lines to scan; file name checks will still run."
fi

echo "🔍 Scanning for secret leaks in staged files..."
echo ""

# ============================================================================
# PATTERN DEFINITIONS
# ============================================================================

# Critical patterns - will block commit.
# Each entry is an extended regular expression (grep -E syntax).
CRITICAL_PATTERNS=(
    # Private keys. Every pattern opens with '[-]' instead of a bare dash so
    # grep can never mistake the pattern argument for a command-line option.
    # The optional algorithm prefix is grouped so that "PRIVATE KEY" is always
    # required (public-key headers are not secrets).
    '[-]----BEGIN ((RSA|DSA|EC|OPENSSH|PGP|SSH2) )?PRIVATE KEY( BLOCK)?-----'
    '[-]----BEGIN ENCRYPTED PRIVATE KEY-----'
    '[-]----BEGIN CERTIFICATE-----'

    # API Keys - Common prefixes
    'sk-[a-zA-Z0-9]{20,}'                      # OpenAI (legacy format)
    'sk-ant-[a-zA-Z0-9_-]{20,}'                # Anthropic (body contains - and _)
    'sk-proj-[a-zA-Z0-9_-]{20,}'               # OpenAI project keys
    'gsk_[a-zA-Z0-9]{20,}'                     # Groq
    'hf_[a-zA-Z0-9]{20,}'                      # HuggingFace
    'nvapi-[a-zA-Z0-9]{20,}'                   # NVIDIA
    'AIza[0-9A-Za-z_-]{35}'                    # Google/Gemini
    'sk_[a-zA-Z0-9]{20,}'                      # Replicate
    'xai-[a-zA-Z0-9]{20,}'                     # xAI
    'pplx-[a-zA-Z0-9]{20,}'                    # Perplexity
    'anthropic-api-key'                        # Anthropic literal
    'claude-api-key'                           # Claude literal

    # AWS Credentials
    'AKIA[0-9A-Z]{16}'                         # AWS Access Key ID
    'ASIA[0-9A-Z]{16}'                         # AWS Temporary Access Key
    # NOTE(review): the two patterns below are very broad - "aws" followed
    # within ~20 characters by "key"/"id" also occurs in ordinary prose.
    # Consider tightening if they cause false blocks.
    'aws(.{0,20})?(secret(.{0,20})?)?key'
    'aws(.{0,20})?(access(.{0,20})?)?id'

    # Database Connection Strings (with credentials)
    'mongodb(\+srv)?://[^:]+:[^@]+@'           # covers mongodb:// and mongodb+srv://
    'postgres(ql)?://[^:]+:[^@]+@'
    'mysql://[^:]+:[^@]+@'
    'redis://:[^@]+@'
)

# Warning patterns - will warn but not block.
# Each entry is an extended regular expression (grep -E syntax). Only POSIX
# character classes are used: inside a bracket expression, '\d' and '\w' are
# literal characters in ERE, not shorthand classes.
WARNING_PATTERNS=(
    # Token file paths in prompts or conversation contexts
    '(prompt|conversation|context|message).*~/\.hermes/\.env'
    '(prompt|conversation|context|message).*~/\.tokens/'
    '(prompt|conversation|context|message).*~/\.env'
    '(prompt|conversation|context|message).*~/\.netrc'
    '(prompt|conversation|context|message).*~/\.ssh/'
    '(prompt|conversation|context|message).*~/\.aws/'
    '(prompt|conversation|context|message).*~/\.config/'

    # Environment variable names in prompts (suspicious)
    '(prompt|conversation|context|message).*(OPENAI_API_KEY|ANTHROPIC_API_KEY|HF_TOKEN|HF_API_TOKEN)'
    '(prompt|conversation|context|message).*(AWS_ACCESS_KEY_ID|AWS_SECRET_ACCESS_KEY|AZURE_.*_KEY)'
    '(prompt|conversation|context|message).*(DATABASE_URL|DB_PASSWORD|SECRET_KEY)'
    '(prompt|conversation|context|message).*(GITHUB_TOKEN|GITLAB_TOKEN|DOCKER_.*_TOKEN)'

    # GitHub tokens
    'gh[pousr]_[A-Za-z0-9_]{36}'
    'github[_-]?pat[_-]?[a-zA-Z0-9]{22,}'

    # Generic key/secret assignments with a long alphanumeric value.
    # These four are double-quoted so the regex can contain both quote
    # characters (["']) without fragile '\'' splicing; they contain no '$' or
    # backtick, so nothing is expanded by the shell.
    "api[_-]?key[\"']?[[:space:]]*[:=][[:space:]]*[\"']?[a-zA-Z0-9]{32,}"
    "secret[\"']?[[:space:]]*[:=][[:space:]]*[\"']?[a-zA-Z0-9]{32,}"
    "password[\"']?[[:space:]]*[:=][[:space:]]*[\"']?[a-zA-Z0-9]{16,}"
    "token[\"']?[[:space:]]*[:=][[:space:]]*[\"']?[a-zA-Z0-9]{32,}"

    # JWT tokens (3 base64 sections separated by dots)
    'eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*'

    # Slack tokens
    'xox[baprs]-[0-9]{10,13}-[0-9]{10,13}([a-zA-Z0-9-]*)?'

    # Discord tokens
    '[MN][A-Za-z0-9]{23}\.[A-Za-z0-9_-]{6}\.[A-Za-z0-9_-]{27}'

    # Stripe keys
    'sk_live_[0-9a-zA-Z]{24,}'
    'pk_live_[0-9a-zA-Z]{24,}'

    # Twilio
    'SK[0-9a-fA-F]{32}'

    # SendGrid
    'SG\.[a-zA-Z0-9_-]{22}\.[a-zA-Z0-9_-]{43}'

    # Heroku
    '[hH][eE][rR][oO][kK][uU].*[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}'
)

# Path regexes describing files considered relevant (prompts, conversations,
# configuration, logs). Nothing in the visible part of this script reads the
# array; it is kept as declared.
SCAN_FILE_PATTERNS=()
# Source, config and text extensions.
SCAN_FILE_PATTERNS+=('\.(py|js|ts|jsx|tsx|json|yaml|yml|toml|md|txt|sh|bash|zsh|fish)$')
# Names with an LLM-related prefix followed by an underscore.
SCAN_FILE_PATTERNS+=('(prompt|conversation|chat|message|llm|ai)_')
# Log files.
SCAN_FILE_PATTERNS+=('_log\.txt$' '\.log$')
# Any path mentioning these words.
SCAN_FILE_PATTERNS+=('prompt' 'conversation')

# ============================================================================
# SCANNING FUNCTIONS
# ============================================================================

#######################################
# Search text for lines matching an extended regular expression,
# case-insensitively.
# Arguments:
#   $1 - extended regular expression (grep -E syntax)
#   $2 - text to search (may be multi-line)
#   $3 - severity label; accepted for interface compatibility, not used
# Outputs:
#   Up to the first five matching lines on stdout.
# Returns:
#   0 if at least one line matched, 1 otherwise.
#######################################
scan_with_pattern() {
    local pattern="$1"
    local content="$2"
    local matches

    # -e marks the next argument as the pattern even when it begins with '-'
    # (e.g. '-----BEGIN ...'); without it grep parses such a pattern as an
    # option, fails, and the error is hidden by the stderr redirect below.
    # printf is used instead of echo so content that looks like an echo
    # option or contains backslashes is passed through untouched.
    # '|| true' keeps a no-match (or a SIGPIPE from head) from aborting a
    # caller running under 'set -e -o pipefail'.
    matches=$(printf '%s\n' "$content" \
        | grep -iE -e "$pattern" 2>/dev/null \
        | head -5 || true)

    if [[ -n "$matches" ]]; then
        printf '%s\n' "$matches"
        return 0
    fi
    return 1
}

# ============================================================================
# MAIN SCANNING LOGIC
# ============================================================================

# Show the first 20 staged paths and summarise the remainder.
echo "Files being scanned:"
# Count once; the arithmetic expansion also strips any padding wc adds.
staged_total=$(( $(printf '%s\n' "$STAGED_FILES" | wc -l) ))
# sed reads its input to the end, unlike 'head', so the writer is never
# killed by SIGPIPE. Under 'set -e -o pipefail' that SIGPIPE (status 141)
# would abort the whole hook for very long file lists.
printf '%s\n' "$STAGED_FILES" | sed -n '1,20p'
if [ "$staged_total" -gt 20 ]; then
    echo "  ... and $(( staged_total - 20 )) more files"
fi
echo ""

# Pass 1: run every critical pattern over the staged additions. Any hit is
# recorded, printed, and flips the flag that rejects the commit.
echo "Scanning for CRITICAL patterns (will block commit)..."
for pattern in "${CRITICAL_PATTERNS[@]}"; do
    # '|| true' absorbs the helper's non-zero "no match" status under set -e.
    result=$(scan_with_pattern "$pattern" "$STAGED_DIFF" "CRITICAL" || true)
    if [ -z "$result" ]; then
        continue
    fi

    BLOCK_COMMIT=1
    (( CRITICAL_FOUND += 1 ))   # result is >= 1, so the status is always 0
    FINDINGS+=(
        "[CRITICAL] Pattern matched: $pattern"
        "Matches:"
        "$result"
        ""
    )

    echo -e "${RED}✗ CRITICAL: Found potential secret!${NC}"
    printf '  Pattern: %s\n' "$pattern"
    printf '  Matches:\n'
    printf '%s\n' "$result" | sed 's/^/    /'
    echo
done

# Pass 2: run every warning pattern over the staged additions. Hits are
# recorded and printed but do not set the blocking flag.
echo "Scanning for WARNING patterns (will warn but not block)..."
for pattern in "${WARNING_PATTERNS[@]}"; do
    # '|| true' absorbs the helper's non-zero "no match" status under set -e.
    result=$(scan_with_pattern "$pattern" "$STAGED_DIFF" "WARNING" || true)
    if [ -z "$result" ]; then
        continue
    fi

    (( WARNING_FOUND += 1 ))   # result is >= 1, so the status is always 0
    FINDINGS+=(
        "[WARNING] Pattern matched: $pattern"
        "Matches:"
        "$result"
        ""
    )

    echo -e "${YELLOW}⚠ WARNING: Found suspicious pattern${NC}"
    printf '  Pattern: %s\n' "$pattern"
    printf '  Matches:\n'
    printf '%s\n' "$result" | sed 's/^/    /'
    echo
done

# ============================================================================
# FILE-SPECIFIC SCANS
# ============================================================================

echo "Performing file-specific checks..."

# The _select_* helpers read newline-separated paths on stdin and print the
# ones that look sensitive. Each always returns 0 so an empty result does not
# abort a caller running under 'set -e -o pipefail'.

# .env-style files in ANY directory (matched on the file name, not only at
# the repository root), except the conventional non-secret .env.example and
# .envrc.
_select_env_files() {
    grep -E '(^|/)\.env[^/]*$' | grep -vE '(^|/)\.env(\.example|rc)$' || true
}

# Files named like credential stores. Test fixtures are skipped only when a
# path component actually starts with "test_" - a bare substring match would
# also skip names such as "latest_secrets.json".
_select_cred_files() {
    grep -E '(credentials|secrets|tokens)\.?(json|yaml|yml|txt)?$' \
        | grep -vE '(^|/)test_|_test\.|example' || true
}

# Private key / keystore files by extension, with the same fixture rule.
_select_key_files() {
    grep -E '\.(pem|key|ppk|p12|pfx)$' | grep -vE '(^|/)test_|example' || true
}

# Check for .env files being committed (should be in .gitignore but double-check)
ENV_FILES=$(printf '%s\n' "$STAGED_FILES" | _select_env_files)
if [ -n "$ENV_FILES" ]; then
    echo -e "${RED}✗ CRITICAL: Attempting to commit .env file(s):${NC}"
    printf '%s\n' "$ENV_FILES" | sed 's/^/  /'
    FINDINGS+=("[CRITICAL] .env file(s) staged for commit:")
    FINDINGS+=("$ENV_FILES")
    # Count the finding so the summary totals agree with what was printed.
    CRITICAL_FOUND=$((CRITICAL_FOUND + 1))
    BLOCK_COMMIT=1
    echo ""
fi

# Check for credential files
CRED_FILES=$(printf '%s\n' "$STAGED_FILES" | _select_cred_files)
if [ -n "$CRED_FILES" ]; then
    echo -e "${YELLOW}⚠ WARNING: Potential credential file(s) detected:${NC}"
    printf '%s\n' "$CRED_FILES" | sed 's/^/  /'
    FINDINGS+=("[WARNING] Potential credential files staged:")
    FINDINGS+=("$CRED_FILES")
    # Without this the summary would report "no leaks detected" right after
    # printing the warning above.
    WARNING_FOUND=$((WARNING_FOUND + 1))
    echo ""
fi

# Check for private key files
KEY_FILES=$(printf '%s\n' "$STAGED_FILES" | _select_key_files)
if [ -n "$KEY_FILES" ]; then
    echo -e "${RED}✗ CRITICAL: Private key file(s) detected:${NC}"
    printf '%s\n' "$KEY_FILES" | sed 's/^/  /'
    FINDINGS+=("[CRITICAL] Private key files staged for commit:")
    FINDINGS+=("$KEY_FILES")
    CRITICAL_FOUND=$((CRITICAL_FOUND + 1))
    BLOCK_COMMIT=1
    echo ""
fi

# ============================================================================
# PROMPT/CONVERSATION SPECIFIC SCANS
# ============================================================================

# Look for prompts that might contain sensitive data.

# Reads newline-separated paths on stdin and prints those whose name suggests
# prompt/conversation content. Skips test fixtures (a path component starting
# with "test_") and compiled .pyc files. Always returns 0.
_select_prompt_files() {
    grep -iE '(prompt|conversation|chat|message)' \
        | grep -vE '(^|/)test_' | grep -vE '\.pyc$' || true
}

PROMPT_FILES=$(printf '%s\n' "$STAGED_FILES" | _select_prompt_files)
if [ -n "$PROMPT_FILES" ]; then
    echo "Scanning prompt/conversation files for embedded secrets..."

    # Read one path per line so names containing spaces stay intact.
    while IFS= read -r file; do
        [ -n "$file" ] || continue

        # Inspect the STAGED blob (':path'), i.e. what will actually be
        # committed, rather than the working-tree copy, which may differ.
        # Paths git cannot resolve in the index are skipped.
        file_content=$(git show ":$file" 2>/dev/null) || continue

        # Here-strings are used instead of 'echo | grep -q': under pipefail,
        # grep -q exiting on an early match can kill the writer with SIGPIPE
        # and make the whole pipeline look like "no match".

        # Check for common secret patterns in prompts
        if grep -qiE '(api[_-]?key|secret[_-]?key|password|token)[[:space:]]*[:=][[:space:]]*[^[:space:]]{8,}' <<<"$file_content"; then
            echo -e "${YELLOW}⚠ WARNING: Potential secret in prompt file: $file${NC}"
            FINDINGS+=("[WARNING] Potential secret in: $file")
            # Count it so the summary does not claim a clean scan.
            WARNING_FOUND=$((WARNING_FOUND + 1))
        fi

        # Check for file paths in home directory
        if grep -qE '~/\.[[:alnum:]_]+' <<<"$file_content"; then
            echo -e "${YELLOW}⚠ WARNING: Home directory path in prompt file: $file${NC}"
            FINDINGS+=("[WARNING] Home directory path in: $file")
            WARNING_FOUND=$((WARNING_FOUND + 1))
        fi
    done <<<"$PROMPT_FILES"
    echo ""
fi

# ============================================================================
# SUMMARY AND DECISION
# ============================================================================

# Report header.
printf '%s\n' \
    "============================================" \
    "           SCAN SUMMARY" \
    "============================================" \
    ""

# Per-severity totals, shown only when non-zero.
if (( CRITICAL_FOUND > 0 )); then
    echo -e "${RED}✗ $CRITICAL_FOUND CRITICAL finding(s) detected${NC}"
fi

if (( WARNING_FOUND > 0 )); then
    echo -e "${YELLOW}⚠ $WARNING_FOUND WARNING(s) detected${NC}"
fi

# Clean run: nothing blocking and both counters at zero.
if (( BLOCK_COMMIT == 0 && WARNING_FOUND == 0 && CRITICAL_FOUND == 0 )); then
    echo -e "${GREEN}✓ No potential secret leaks detected${NC}"
    echo
    exit 0
fi

echo

# Blocking path: list the [CRITICAL] findings, print guidance, reject.
if (( BLOCK_COMMIT == 1 )); then
    echo -e "${RED}╔════════════════════════════════════════════════════════════╗${NC}"
    echo -e "${RED}║  COMMIT BLOCKED: Potential secrets detected!               ║${NC}"
    echo -e "${RED}╚════════════════════════════════════════════════════════════╝${NC}"
    printf '%s\n' \
        "" \
        "The following issues must be resolved before committing:" \
        ""
    printf '%s\n' "${FINDINGS[@]}" | grep -E '^\[CRITICAL\]'
    printf '%s\n' \
        "" \
        "Recommendations:" \
        "  1. Remove secrets from your code" \
        "  2. Use environment variables or a secrets manager" \
        "  3. Add sensitive files to .gitignore" \
        "  4. Rotate any exposed credentials immediately" \
        "" \
        "If you are CERTAIN this is a false positive, you can bypass:" \
        "  git commit --no-verify" \
        "" \
        "⚠️  WARNING: Bypassing should be done with extreme caution!" \
        ""
    exit 1
fi

# Warning-only path: give the user a short window to abort, then allow.
if (( WARNING_FOUND > 0 )); then
    echo -e "${YELLOW}⚠ WARNINGS found but commit will proceed${NC}"
    printf '%s\n' \
        "" \
        "Please review the warnings above and ensure no sensitive data" \
        "is being included in prompts or configuration files." \
        "" \
        "To cancel this commit, press Ctrl+C within 3 seconds..."
    sleep 3
fi

echo
echo -e "${GREEN}✓ Proceeding with commit${NC}"
exit 0