---
# Workflow that inspects pull request changes for supply chain risk patterns.
name: Supply Chain Audit

# Run when a pull request is opened, reopened, or receives new commits.
on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened

# Token scope: read repository contents; write access to pull requests is
# needed because a later step posts a comment on the PR.
permissions:
  contents: read
  pull-requests: write

jobs:
  scan:
    name: Scan PR for supply chain risks
    runs-on: ubuntu-latest
    steps:
      # Full history is fetched (fetch-depth: 0) because a later step diffs
      # the pull request's base and head commits locally.
      # NOTE(review): consider pinning this action to a full commit SHA.
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          # No step after checkout fetches or pushes with git, so do not
          # leave the job token stored in the local git configuration.
          persist-credentials: false

      # Scans the lines added by the pull request, and the names of the files
      # it changes, for patterns associated with supply chain attacks.
      #
      # Step outputs:
      #   found    - 'true' when at least one finding was recorded
      #   critical - 'true' when at least one CRITICAL finding was recorded
      # Side effect: findings are written as markdown to /tmp/findings.md.
      #
      # This step only reads the local checkout, so it is given no token.
      - name: Scan diff for suspicious patterns
        id: scan
        env:
          # Passed through the environment instead of being interpolated
          # into the script text.
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          set -euo pipefail

          FINDINGS_FILE=/tmp/findings.md
          rm -f "$FINDINGS_FILE"
          CRITICAL=false

          # report HEADING EXPLANATION LABEL MATCHES
          # Appends one markdown section to the findings file.
          report() {
            printf '\n### %s\n%s\n\n**%s**\n```\n%s\n```\n' "$1" "$2" "$3" "$4" >> "$FINDINGS_FILE"
          }

          # Three-dot range: diff from the merge base to the PR head, so that
          # changes made on the base branch after the PR branched off are not
          # attributed to the PR.
          RANGE="${BASE_SHA}...${HEAD_SHA}"

          # A git failure aborts the step (set -e) rather than producing an
          # empty diff that would be reported as a clean scan.
          DIFF=$(git diff "$RANGE" -- . ':!uv.lock' ':!*.lock' ':!package-lock.json' ':!yarn.lock')

          # -z makes git print path names verbatim; without it, names with
          # non-ASCII bytes are quoted and escaped, and would no longer end in
          # the extensions matched below.
          CHANGED_FILES=$(git diff --name-only -z "$RANGE" | tr '\0' '\n')

          # Added lines only, each prefixed with its line number within the
          # diff text ("N:+content").
          ADDED=$(printf '%s\n' "$DIFF" | grep -n '^+' || true)

          # Matches an added line that is entirely a '#' comment. The pattern
          # accounts for the "N:" prefix that grep -n puts before the '+'.
          COMMENT_ONLY='^[0-9]+:\+[[:space:]]*#'

          # NOTE(review): the keyword exclusions below (test_, mock, assert,
          # re.compile, ast.compile) are substring matches on the whole line,
          # so a line can be hidden from a WARNING check by containing one of
          # them. The CRITICAL checks apply no exclusions.

          # --- .pth files (auto-execute on Python startup) ---
          PTH_FILES=$(printf '%s\n' "$CHANGED_FILES" | grep '\.pth$' || true)
          if [ -n "$PTH_FILES" ]; then
            CRITICAL=true
            report \
              '🚨 CRITICAL: .pth file added or modified' \
              'Python `.pth` files in `site-packages/` execute automatically when the interpreter starts — no import required. This is the exact mechanism used in the [litellm supply chain attack](https://github.com/BerriAI/litellm/issues/24512).' \
              'Files:' \
              "$PTH_FILES"
          fi

          # --- base64 + exec/eval on the same line (the litellm attack pattern) ---
          B64_EXEC_HITS=$(printf '%s\n' "$ADDED" | grep -iE 'base64\.(b64decode|decodebytes|urlsafe_b64decode)' | grep -iE 'exec\(|eval\(' | head -10 || true)
          if [ -n "$B64_EXEC_HITS" ]; then
            CRITICAL=true
            report \
              '🚨 CRITICAL: base64 decode + exec/eval combo' \
              'This is the exact pattern used in the [litellm supply chain attack](https://github.com/BerriAI/litellm/issues/24512) — base64-decoded strings passed to exec/eval to hide credential-stealing payloads.' \
              'Matches:' \
              "$B64_EXEC_HITS"
          fi

          # --- base64 decode/encode (alone — legitimate uses exist) ---
          B64_HITS=$(printf '%s\n' "$ADDED" | grep -iE 'base64\.(b64decode|b64encode|decodebytes|encodebytes|urlsafe_b64decode)|atob\(|btoa\(|Buffer\.from\(.*base64' | head -20 || true)
          if [ -n "$B64_HITS" ]; then
            report \
              '⚠️ WARNING: base64 encoding/decoding detected' \
              'Base64 has legitimate uses (images, JWT, etc.) but is also commonly used to obfuscate malicious payloads. Verify the usage is appropriate.' \
              'Matches (first 20):' \
              "$B64_HITS"
          fi

          # --- exec/eval calls ---
          # Only whole-line comments are skipped; a trailing comment on a line
          # of code does not hide the call.
          EXEC_HITS=$(printf '%s\n' "$ADDED" | grep -E '(exec|eval)\s*\(' | grep -vE "$COMMENT_ONLY" | grep -v 'test_\|mock\|assert' | head -20 || true)
          if [ -n "$EXEC_HITS" ]; then
            report \
              '⚠️ WARNING: exec() or eval() usage' \
              'Dynamic code execution can hide malicious behavior, especially when combined with base64 or network fetches.' \
              'Matches (first 20):' \
              "$EXEC_HITS"
          fi

          # --- subprocess with encoded/obfuscated commands ---
          PROC_HITS=$(printf '%s\n' "$ADDED" | grep -E 'subprocess\.(Popen|call|run)\s*\(' | grep -iE 'base64|decode|encode|\\x|chr\(' | head -10 || true)
          if [ -n "$PROC_HITS" ]; then
            CRITICAL=true
            report \
              '🚨 CRITICAL: subprocess with encoded/obfuscated command' \
              'Subprocess calls with encoded arguments are a strong indicator of payload execution.' \
              'Matches:' \
              "$PROC_HITS"
          fi

          # --- Outbound POST/PUT and urlopen calls ---
          EXFIL_HITS=$(printf '%s\n' "$ADDED" | grep -iE 'requests\.(post|put)\(|httpx\.(post|put)\(|urllib\.request\.urlopen' | grep -vE "$COMMENT_ONLY" | grep -v 'test_\|mock\|assert' | head -10 || true)
          if [ -n "$EXFIL_HITS" ]; then
            report \
              '⚠️ WARNING: Outbound network calls (POST/PUT)' \
              'Outbound POST/PUT requests in new code could be data exfiltration. Verify the destination URLs are legitimate.' \
              'Matches (first 10):' \
              "$EXFIL_HITS"
          fi

          # --- setup.py / setup.cfg install hooks ---
          SETUP_HITS=$(printf '%s\n' "$CHANGED_FILES" | grep -E '(setup\.py|setup\.cfg|__init__\.pth|sitecustomize\.py|usercustomize\.py)$' || true)
          if [ -n "$SETUP_HITS" ]; then
            report \
              '⚠️ WARNING: Install hook files modified' \
              'These files can execute code during package installation or interpreter startup.' \
              'Files:' \
              "$SETUP_HITS"
          fi

          # --- Compile/marshal/pickle (code object injection) ---
          MARSHAL_HITS=$(printf '%s\n' "$ADDED" | grep -iE 'marshal\.loads|pickle\.loads|compile\(' | grep -vE "$COMMENT_ONLY" | grep -v 'test_\|re\.compile\|ast\.compile' | head -10 || true)
          if [ -n "$MARSHAL_HITS" ]; then
            report \
              '⚠️ WARNING: marshal/pickle/compile usage' \
              'These can deserialize or construct executable code objects.' \
              'Matches:' \
              "$MARSHAL_HITS"
          fi

          # --- Output results ---
          # CRITICAL is only ever set together with a report call, so it can
          # be emitted as-is.
          if [ -s "$FINDINGS_FILE" ]; then
            echo "found=true" >> "$GITHUB_OUTPUT"
          else
            echo "found=false" >> "$GITHUB_OUTPUT"
          fi
          echo "critical=${CRITICAL}" >> "$GITHUB_OUTPUT"

      # Publishes the findings from /tmp/findings.md: always to the job
      # summary, and as a PR comment when the token is allowed to write one.
      - name: Post warning comment
        if: steps.scan.outputs.found == 'true'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Passed through the environment instead of being interpolated
          # into the script text.
          PR_NUMBER: ${{ github.event.pull_request.number }}
          IS_CRITICAL: ${{ steps.scan.outputs.critical }}
        run: |
          SEVERITY="⚠️ Supply Chain Risk Detected"
          if [ "$IS_CRITICAL" = "true" ]; then
            SEVERITY="🚨 CRITICAL Supply Chain Risk Detected"
          fi

          BODY="## ${SEVERITY}

          This PR contains patterns commonly associated with supply chain attacks. This does **not** mean the PR is malicious — but these patterns require careful human review before merging.

          $(cat /tmp/findings.md)

          ---
          *Automated scan triggered by [supply-chain-audit](/.github/workflows/supply-chain-audit.yml). If this is a false positive, a maintainer can approve after manual review.*"

          # Keep a copy in the job summary so the findings stay visible even
          # when the comment cannot be posted.
          printf '%s\n' "$BODY" >> "$GITHUB_STEP_SUMMARY"

          # Commenting is best-effort: for pull requests from forks the token
          # is read-only and the call is rejected. A failure here must not
          # fail the job on its own, nor skip the later step that fails the
          # job on critical findings.
          if ! gh pr comment "$PR_NUMBER" --body "$BODY"; then
            echo "::warning::Could not post the supply chain findings as a PR comment; see the job summary for details."
          fi

      # Fails the job when the scan step reported a critical finding, after
      # emitting an error annotation.
      - name: Fail on critical findings
        if: ${{ steps.scan.outputs.critical == 'true' }}
        run: |
          printf '%s\n' "::error::CRITICAL supply chain risk patterns detected in this PR. See the PR comment for details."
          exit 1