#!/usr/bin/env python3
"""
License Checker — Pipeline 5.4

Scans dependency files for a project, resolves license info, flags incompatibilities.

Acceptance:
  [x] Reads license for each dep
  [x] Flags: GPL in MIT project, unknown licenses
  [x] Output: license compatibility report

Usage:
    python3 license_checker.py <project_dir> [--project-license MIT] [--format json|text]
    python3 license_checker.py <project_dir> --scan-deps
"""
import argparse
import json
import os
import re
import subprocess
import sys
import urllib.error
import urllib.request
from dataclasses import asdict, dataclass, field
from enum import Enum
from pathlib import Path
from typing import Optional
class Severity(Enum):
    """Severity of a single license-compatibility finding."""

    OK = "ok"            # compatible, no action needed
    WARNING = "warning"  # needs manual review
    ERROR = "error"      # hard incompatibility
    UNKNOWN = "unknown"  # license could not be classified
# SPDX identifiers for copyleft (and otherwise restrictive) licenses.
# A dependency under any of these is flagged when the project license is
# permissive — see check_compatibility().
COPYLEFT_FAMILIES = {
    # GNU strong copyleft
    "GPL-2.0", "GPL-2.0-only", "GPL-2.0-or-later",
    "GPL-3.0", "GPL-3.0-only", "GPL-3.0-or-later",
    "AGPL-3.0", "AGPL-3.0-only", "AGPL-3.0-or-later",
    # GNU weak copyleft
    "LGPL-2.0", "LGPL-2.1", "LGPL-3.0",
    "LGPL-2.0-only", "LGPL-2.1-only", "LGPL-3.0-only",
    "LGPL-2.0-or-later", "LGPL-2.1-or-later", "LGPL-3.0-or-later",
    # Other weak/file-level copyleft
    "MPL-2.0",
    "EUPL-1.1", "EUPL-1.2",
    "OSL-3.0",
    # Source-available / restrictive
    "SSPL-1.0",
    # Creative Commons share-alike and non-commercial variants
    "CC-BY-SA-4.0", "CC-BY-SA-3.0",
    "CC-BY-NC-4.0", "CC-BY-NC-3.0",
}
# Permissive licenses: always compatible as dependencies, regardless of the
# project's own license.
PERMISSIVE_LICENSES = {
    "MIT",
    "BSD-2-Clause",
    "BSD-3-Clause",
    "Apache-2.0",
    "ISC",
    "Unlicense",
    "CC0-1.0",
    "0BSD",
    "BSL-1.0",
    "Zlib",
    "PSF-2.0",
    "Python-2.0",
}
# Map of common free-text license spellings (lower-cased) to SPDX identifiers.
# Keys are matched against the cleaned input in normalize_license().
LICENSE_ALIASES = {
    # MIT
    "mit": "MIT",
    "the mit license": "MIT",
    "mit license": "MIT",
    # BSD
    "bsd": "BSD-3-Clause",
    "bsd-2": "BSD-2-Clause",
    "bsd-3": "BSD-3-Clause",
    "bsd license": "BSD-3-Clause",
    # Apache
    "apache": "Apache-2.0",
    "apache 2.0": "Apache-2.0",
    "apache-2.0": "Apache-2.0",
    "apache software license": "Apache-2.0",
    "apache software license 2.0": "Apache-2.0",
    # GPL
    "gpl": "GPL-3.0",
    "gpl-2": "GPL-2.0",
    "gpl-3": "GPL-3.0",
    "gplv2": "GPL-2.0",
    "gplv3": "GPL-3.0",
    "gnu general public license": "GPL-3.0",
    "gnu general public license v3": "GPL-3.0",
    "gnu general public license v2": "GPL-2.0",
    # LGPL
    "gnu lesser general public license v2": "LGPL-2.1",
    "gnu lesser general public license v3": "LGPL-3.0",
    "lgpl": "LGPL-3.0",
    "lgpl-2.1": "LGPL-2.1",
    "lgpl-3": "LGPL-3.0",
    # AGPL
    "agpl": "AGPL-3.0",
    "agpl-3.0": "AGPL-3.0",
    "agplv3": "AGPL-3.0",
    # Other
    "isc": "ISC",
    "mpl": "MPL-2.0",
    "mpl-2.0": "MPL-2.0",
    "mozilla public license 2.0": "MPL-2.0",
    "unlicense": "Unlicense",
    "public domain": "Unlicense",
    "cc0": "CC0-1.0",
    "cc0-1.0": "CC0-1.0",
    "psf": "PSF-2.0",
    "python software foundation license": "PSF-2.0",
}
@dataclass
class DepLicense:
    """A single scanned dependency and its resolved license status."""

    name: str
    version: str = ""
    license: str = "UNKNOWN"  # normalized SPDX id once resolved
    source: str = ""  # manifest the dep was found in (requirements.txt, package.json, ...)
    severity: Severity = Severity.UNKNOWN
    message: str = ""  # human-readable compatibility note
@dataclass
class LicenseReport:
    """Aggregated license-compatibility report for one project."""

    project_dir: str
    project_license: str = "MIT"
    dependencies: list = field(default_factory=list)  # per-dep dicts (asdict of DepLicense)
    summary: dict = field(default_factory=dict)  # counts by severity plus totals
    errors: list = field(default_factory=list)  # hard incompatibilities
    warnings: list = field(default_factory=list)  # items needing manual review
def normalize_license(raw: str) -> str:
    """Normalize a raw license string to an SPDX identifier where possible.

    Resolution order: sentinel/empty values -> "UNKNOWN"; cleaned lower-case
    form looked up in LICENSE_ALIASES; exact SPDX id; case-insensitive SPDX
    id (e.g. "gpl-3.0" -> "GPL-3.0"). Anything still unrecognized is
    returned stripped but otherwise unchanged so the caller can flag it.
    """
    if not raw or raw.strip() in ("UNKNOWN", "UNKNOWN:", ""):
        return "UNKNOWN"
    cleaned = raw.strip().lower()
    # Remove parenthetical qualifiers like "MIT License (MIT)".
    cleaned = re.sub(r"\(.*?\)", "", cleaned).strip()
    cleaned = re.sub(r"\s+license$", "", cleaned).strip()
    cleaned = re.sub(r"^the\s+", "", cleaned).strip()
    if cleaned in LICENSE_ALIASES:
        return LICENSE_ALIASES[cleaned]
    # Already a known SPDX identifier?
    stripped = raw.strip()
    if stripped in COPYLEFT_FAMILIES or stripped in PERMISSIVE_LICENSES:
        return stripped
    # BUG FIX: also match known SPDX ids case-insensitively — registries
    # often report e.g. "gpl-3.0", which previously passed through
    # unnormalized and was then misclassified as "unrecognized".
    spdx_by_lower = {s.lower(): s for s in (*COPYLEFT_FAMILIES, *PERMISSIVE_LICENSES)}
    if cleaned in spdx_by_lower:
        return spdx_by_lower[cleaned]
    return stripped
def check_compatibility(dep_license: str, project_license: str) -> tuple[Severity, str]:
    """Classify a dependency license against the project license.

    Returns a (Severity, message) pair: OK for permissive deps and same-family
    copyleft, ERROR for copyleft deps in a permissive project, WARNING for
    unknown licenses or cross-family copyleft, UNKNOWN otherwise.
    """
    if dep_license == "UNKNOWN":
        return Severity.WARNING, "License unknown — manual review needed"

    if dep_license in PERMISSIVE_LICENSES:
        return Severity.OK, "Compatible (permissive)"

    if dep_license not in COPYLEFT_FAMILIES:
        return Severity.UNKNOWN, f"Unrecognized license: {dep_license}"

    # Dependency is copyleft from here on.
    if project_license in PERMISSIVE_LICENSES:
        return Severity.ERROR, f"Copyleft ({dep_license}) in permissive ({project_license}) project"
    family = project_license.split("-")[0]
    if dep_license.startswith(family):
        return Severity.OK, "Compatible (same copyleft family)"
    return Severity.WARNING, f"Review needed: {dep_license} with {project_license}"
def parse_requirements_txt(path: str) -> list[DepLicense]:
    """Parse a pip requirements file into DepLicense stubs (names only).

    Skips blank lines, comments, and pip options (lines starting with "-").
    Handles extras ("pkg[extra]"), environment markers ("pkg; python_version
    < '3.9'"), inline comments, and whitespace before version specifiers —
    all of which the previous regex silently rejected.
    """
    deps = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or line.startswith("-"):
                continue
            # Drop environment markers and inline comments before matching.
            line = line.split(";", 1)[0].split(" #", 1)[0].strip()
            # name, optional [extras], optional version specifier.
            match = re.match(r"^([A-Za-z0-9_.-]+)(?:\[[^\]]*\])?(?:\s*[><=!~].*)?$", line)
            if match:
                deps.append(DepLicense(name=match.group(1), source="requirements.txt"))
    return deps
def parse_pyproject_toml(path: str) -> list[DepLicense]:
    """Parse [project].dependencies from a pyproject.toml file.

    Uses tomllib when available (Python 3.11+); otherwise falls back to a
    regex scan of the raw text. Returns DepLicense stubs with names only.
    """
    deps = []
    name_re = re.compile(r"^([a-zA-Z0-9_.-]+)")

    try:
        import tomllib
    except ImportError:
        tomllib = None

    if tomllib is not None:
        with open(path, "rb") as f:
            data = tomllib.load(f)
        for dep_str in data.get("project", {}).get("dependencies", []):
            m = name_re.match(dep_str)
            if m:
                deps.append(DepLicense(name=m.group(1), source="pyproject.toml"))
        return deps

    # Fallback: regex parse of the raw file.
    with open(path) as f:
        content = f.read()
    # BUG FIX: the old pattern required "dependencies" to immediately follow
    # "[project]" with only whitespace between them, which never matches a
    # real file (name/version/etc. come first). Match the key at column 0.
    m = re.search(r"^dependencies\s*=\s*\[(.*?)\]", content, re.DOTALL | re.MULTILINE)
    if m:
        for dep_str in re.findall(r'"([^"]+)"', m.group(1)):
            name = name_re.match(dep_str)
            if name:
                deps.append(DepLicense(name=name.group(1), source="pyproject.toml"))
    return deps
def parse_package_json(path: str) -> list[DepLicense]:
    """Collect runtime and dev dependencies from a package.json file."""
    with open(path) as handle:
        manifest = json.load(handle)
    found = []
    for section in ("dependencies", "devDependencies"):
        for pkg_name, pkg_version in manifest.get(section, {}).items():
            found.append(
                DepLicense(name=pkg_name, version=pkg_version, source="package.json")
            )
    return found
def parse_cargo_toml(path: str) -> list[DepLicense]:
    """Parse Cargo.toml dependencies (basic, line-oriented).

    Tracks the current TOML section so only keys under [dependencies],
    [dev-dependencies], or [build-dependencies] are collected. The previous
    version ignored sections and therefore reported [package] keys such as
    name = "..." and version = "..." as dependencies. Also accepts the
    inline-table form (serde = { version = "1" }), not just string versions.
    """
    deps = []
    section = ""
    with open(path) as f:
        for line in f:
            line = line.strip()
            header = re.match(r"^\[(.+)\]$", line)
            if header:
                section = header.group(1)
                continue
            # Covers [dependencies], [dev-dependencies], [build-dependencies],
            # and target-specific tables like [target.'cfg(unix)'.dependencies].
            if not section.endswith("dependencies"):
                continue
            match = re.match(r'^([a-zA-Z0-9_-]+)\s*=\s*["{]', line)
            if match:
                deps.append(DepLicense(name=match.group(1), source="Cargo.toml"))
    return deps
def parse_go_mod(path: str) -> list[DepLicense]:
    """Parse require directives from a go.mod file.

    Handles both the block form ("require ( ... )") and the single-line form
    ("require example.com/mod v1.2.3"), which the previous version missed.
    Trailing comments such as "// indirect" are stripped before splitting.
    """
    deps = []
    in_require = False
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line == "require (":
                in_require = True
                continue
            if line == ")" and in_require:
                in_require = False
                continue
            if in_require:
                # Inside a require block: "<module> <version> [// indirect]".
                parts = line.split("//", 1)[0].split()
                if len(parts) >= 2:
                    deps.append(DepLicense(name=parts[0], version=parts[1], source="go.mod"))
            elif line.startswith("require "):
                # Single-line form: "require <module> <version>".
                parts = line.split("//", 1)[0].split()
                if len(parts) >= 3:
                    deps.append(DepLicense(name=parts[1], version=parts[2], source="go.mod"))
    return deps
def scan_dep_files(project_dir: str) -> list[DepLicense]:
    """Find and parse all recognized dependency manifests in a project.

    Scans the project root and, for monorepos, immediate subdirectories
    (one level deep, skipping dot-directories). Parse failures in the root
    are reported to stderr; subdirectory failures are ignored (best-effort).
    """
    all_deps = []
    parsers = {
        "requirements.txt": parse_requirements_txt,
        "requirements-dev.txt": parse_requirements_txt,
        "requirements_prod.txt": parse_requirements_txt,
        "pyproject.toml": parse_pyproject_toml,
        "setup.py": None,  # TODO: parse setup.py
        "package.json": parse_package_json,
        "Cargo.toml": parse_cargo_toml,
        "go.mod": parse_go_mod,
    }

    for filename, parser in parsers.items():
        path = os.path.join(project_dir, filename)
        if os.path.exists(path) and parser:
            try:
                all_deps.extend(parser(path))
            except Exception as e:
                # BUG FIX: the message printed a literal "(unknown)" instead
                # of naming the file that failed to parse.
                print(f"Warning: Failed to parse {path}: {e}", file=sys.stderr)

    # Also check subdirectories for monorepos (one level deep).
    for entry in os.listdir(project_dir):
        subdir = os.path.join(project_dir, entry)
        if not os.path.isdir(subdir) or entry.startswith("."):
            continue
        for filename, parser in parsers.items():
            path = os.path.join(subdir, filename)
            if os.path.exists(path) and parser:
                try:
                    deps = parser(path)
                    for d in deps:
                        # BUG FIX: source was previously set to the literal
                        # "<entry>/(unknown)" instead of the manifest name.
                        d.source = f"{entry}/{filename}"
                    all_deps.extend(deps)
                except Exception:
                    pass  # best-effort only for subdirectories

    return all_deps
def lookup_pypi_license(package_name: str) -> str:
    """Look up a package's license via the PyPI JSON API.

    Preference order: trove "License ::" classifier, then the free-text
    "license" field (skipped when long enough to look like pasted license
    text), then the SPDX "license_expression" field. Returns "UNKNOWN" on
    any failure (network error, missing package, malformed response).
    """
    try:
        url = f"https://pypi.org/pypi/{package_name}/json"
        req = urllib.request.Request(url, headers={"Accept": "application/json"})
        # BUG FIX: close the HTTP response; the old code leaked the socket.
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        info = data.get("info", {})
        # Try classifiers first — most structured source.
        for classifier in info.get("classifiers", []):
            if classifier.startswith("License ::"):
                parts = classifier.split(" :: ")
                if len(parts) >= 3:
                    return parts[-1]
        # Fall back to the free-text license field.
        lic = info.get("license", "")
        if lic and len(lic) < 100:  # long values are usually full license text
            return lic
        # Finally try the SPDX license_expression field (PEP 639).
        le = info.get("license_expression", "")
        if le:
            return le
        return "UNKNOWN"
    except Exception:
        return "UNKNOWN"
def lookup_npm_license(package_name: str) -> str:
    """Look up a package's license from the npm registry.

    Handles both the modern string form ("MIT") and the legacy object form
    ({"type": "MIT", ...}). Returns "UNKNOWN" on any failure.
    """
    try:
        url = f"https://registry.npmjs.org/{package_name}"
        req = urllib.request.Request(url, headers={"Accept": "application/json"})
        # BUG FIX: close the HTTP response; the old code leaked the socket.
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        lic = data.get("license", "UNKNOWN")
        if isinstance(lic, dict):  # legacy object form
            lic = lic.get("type", "UNKNOWN")
        return lic or "UNKNOWN"
    except Exception:
        return "UNKNOWN"
def detect_project_license(project_dir: str) -> str:
    """Best-effort detection of the project's own license.

    Checks common license file names first (keyword heuristics on the file
    text), then falls back to the license declaration in pyproject.toml.
    Returns "UNKNOWN" when nothing matches.
    """
    for filename in ("LICENSE", "LICENSE.md", "LICENSE.txt", "LICENCE", "COPYING"):
        license_path = os.path.join(project_dir, filename)
        if not os.path.exists(license_path):
            continue
        with open(license_path) as handle:
            text = handle.read().upper()
        if "MIT LICENSE" in text or "MIT" in text[:200]:
            return "MIT"
        if "APACHE" in text and "2.0" in text:
            return "Apache-2.0"
        if "GNU GENERAL PUBLIC LICENSE" in text:
            if "VERSION 3" in text:
                return "GPL-3.0"
            if "VERSION 2" in text:
                return "GPL-2.0"
        if "BSD" in text[:500]:
            if "3-CLAUSE" in text or "THREE CLAUSE" in text:
                return "BSD-3-Clause"
            return "BSD-2-Clause"
        if "ISC" in text[:200]:
            return "ISC"

    # No license file matched — look for a declaration in pyproject.toml.
    pyproject_path = os.path.join(project_dir, "pyproject.toml")
    if os.path.exists(pyproject_path):
        with open(pyproject_path) as handle:
            toml_text = handle.read()
        # Table form (license = { text = "..." }) takes precedence over the
        # plain string form (license = "...").
        for pattern in (
            r'license\s*=\s*\{\s*text\s*=\s*"([^"]+)"',
            r'license\s*=\s*"([^"]+)"',
        ):
            found = re.search(pattern, toml_text)
            if found:
                return normalize_license(found.group(1))

    return "UNKNOWN"
def resolve_licenses(deps: list[DepLicense], cache: Optional[dict] = None) -> None:
    """Resolve and normalize the license for each dependency, in place.

    Args:
        deps: dependencies to annotate; each dep's .license is overwritten
            with a normalized SPDX id (or "UNKNOWN").
        cache: optional name -> normalized-license map consulted before any
            network lookup and updated with new results. (Annotation fixed:
            the old signature declared a non-optional ``dict``.)
    """
    if cache is None:
        cache = {}

    for dep in deps:
        if dep.name in cache:
            dep.license = cache[dep.name]
            continue

        # Route the lookup by ecosystem. BUG FIX: endswith() also matches
        # deps found in subdirectories (source like "web/package.json");
        # the old exact match sent those to PyPI by mistake.
        if dep.source.endswith("package.json"):
            raw = lookup_npm_license(dep.name)
        else:
            raw = lookup_pypi_license(dep.name)

        dep.license = normalize_license(raw)
        cache[dep.name] = dep.license
def generate_report(deps: list[DepLicense], project_license: str) -> LicenseReport:
    """Build a LicenseReport by checking every dependency against the project license.

    Mutates each dep's severity/message in place, tallies results by severity,
    and collects error/warning summaries. project_dir is left empty for the
    caller to fill in.
    """
    report = LicenseReport(
        project_dir="",
        project_license=project_license,
        dependencies=[],
    )

    # Tally in the enum's declaration order: ok, warning, error, unknown.
    tallies = {sev.value: 0 for sev in Severity}

    for dep in deps:
        dep.severity, dep.message = check_compatibility(dep.license, project_license)
        tallies[dep.severity.value] += 1

        if dep.severity == Severity.ERROR:
            report.errors.append(f"{dep.name}: {dep.message}")
        elif dep.severity == Severity.WARNING:
            report.warnings.append(f"{dep.name}: {dep.message}")

        report.dependencies.append(asdict(dep))

    report.summary = {
        "total": len(deps),
        **tallies,
        "project_license": project_license,
    }

    return report
def format_text(report: LicenseReport) -> str:
    """Render a LicenseReport as a human-readable text table."""
    heavy_rule = "=" * 60
    light_rule = "-" * 60
    icons = {"ok": "[OK]", "warning": "[!!]", "error": "[XX]", "unknown": "[??]"}

    out = [
        heavy_rule,
        " LICENSE COMPATIBILITY REPORT",
        heavy_rule,
        f" Project License: {report.project_license}",
        f" Dependencies: {report.summary.get('total', 0)}",
        (
            f" OK: {report.summary.get('ok', 0)} "
            f"WARN: {report.summary.get('warning', 0)} "
            f"ERR: {report.summary.get('error', 0)} "
            f"UNK: {report.summary.get('unknown', 0)}"
        ),
        light_rule,
    ]

    # One line per dependency: status icon, name, license, note.
    for dep in report.dependencies:
        tag = icons.get(dep.get("severity", "unknown"), "[ ]")
        out.append(
            f" {tag} {dep.get('name', '?'):30s} "
            f"{dep.get('license', '?'):20s} {dep.get('message', '')}"
        )

    for heading, items in ((" ERRORS:", report.errors), (" WARNINGS:", report.warnings)):
        if items:
            out.append(light_rule)
            out.append(heading)
            out.extend(f" - {item}" for item in items)

    out.append(heavy_rule)
    return "\n".join(out)
def main():
    """CLI entry point: scan a project, resolve licenses, emit a report.

    Exit codes: 1 for invalid project dir or any license error; 0 otherwise
    (including "no dependencies found" and --scan-deps listings).
    """
    arg_parser = argparse.ArgumentParser(description="License Checker — Pipeline 5.4")
    arg_parser.add_argument("project_dir", help="Project directory to scan")
    arg_parser.add_argument("--project-license", default=None,
                            help="Project license SPDX id (auto-detected if omitted)")
    arg_parser.add_argument("--format", choices=["json", "text"], default="text",
                            help="Output format")
    arg_parser.add_argument("--scan-deps", action="store_true",
                            help="Only scan and list deps (skip license lookup)")
    args = arg_parser.parse_args()

    project_dir = os.path.abspath(args.project_dir)
    if not os.path.isdir(project_dir):
        print(f"Error: {project_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    # Project license: explicit flag wins, otherwise auto-detect.
    project_license = args.project_license or detect_project_license(project_dir)

    deps = scan_dep_files(project_dir)
    if not deps:
        print(f"No dependencies found in {project_dir}", file=sys.stderr)
        sys.exit(0)
    print(f"Found {len(deps)} dependencies", file=sys.stderr)

    if args.scan_deps:
        # Listing only — skip the (slow) network lookups entirely.
        for dep in deps:
            print(f" {dep.name} ({dep.source})")
        sys.exit(0)

    print("Resolving licenses...", file=sys.stderr)
    resolve_licenses(deps)

    report = generate_report(deps, project_license)
    report.project_dir = project_dir

    if args.format == "json":
        print(json.dumps(asdict(report), indent=2, default=str))
    else:
        print(format_text(report))

    # Non-zero exit signals license errors to CI pipelines.
    sys.exit(1 if report.errors else 0)


if __name__ == "__main__":
    main()