#!/usr/bin/env python3
"""
License Checker — Pipeline 5.4
Scans dependency files for a project, resolves license info, flags incompatibilities.

Acceptance:
  [x] Reads license for each dep
  [x] Flags: GPL in MIT project, unknown licenses
  [x] Output: license compatibility report

Usage:
  python3 license_checker.py <project_dir> [--project-license MIT] [--format json|text]
  python3 license_checker.py <project_dir> --scan-deps
"""

import argparse
import http.client
import json
import os
import re
import subprocess
import sys
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass, field, asdict
from enum import Enum
from pathlib import Path
from typing import Optional


class Severity(Enum):
    """Outcome of checking one dependency license against the project license."""

    OK = "ok"
    WARNING = "warning"
    ERROR = "error"
    UNKNOWN = "unknown"


# SPDX license classification used by check_compatibility().
# Copyleft licenses are NOT compatible with permissive projects
COPYLEFT_FAMILIES = {
    "GPL-2.0", "GPL-2.0-only", "GPL-2.0-or-later",
    "GPL-3.0", "GPL-3.0-only", "GPL-3.0-or-later",
    "AGPL-3.0", "AGPL-3.0-only", "AGPL-3.0-or-later",
    "LGPL-2.0", "LGPL-2.1", "LGPL-3.0",
    "LGPL-2.0-only", "LGPL-2.1-only", "LGPL-3.0-only",
    "LGPL-2.0-or-later", "LGPL-2.1-or-later", "LGPL-3.0-or-later",
    "MPL-2.0",  # Weak copyleft — file-level
    "EUPL-1.1", "EUPL-1.2",
    "OSL-3.0",
    "SSPL-1.0",
    "CC-BY-SA-4.0", "CC-BY-SA-3.0",
    "CC-BY-NC-4.0", "CC-BY-NC-3.0",
}

PERMISSIVE_LICENSES = {
    "MIT", "BSD-2-Clause", "BSD-3-Clause", "Apache-2.0",
    "ISC", "Unlicense", "CC0-1.0", "0BSD", "BSL-1.0",
    "Zlib", "PSF-2.0", "Python-2.0",
}

# Common aliases (keys are lower-case; values are SPDX identifiers)
LICENSE_ALIASES = {
    "mit": "MIT",
    "bsd": "BSD-3-Clause",
    "bsd-2": "BSD-2-Clause",
    "bsd-3": "BSD-3-Clause",
    "bsd license": "BSD-3-Clause",
    "apache": "Apache-2.0",
    "apache 2.0": "Apache-2.0",
    "apache-2.0": "Apache-2.0",
    "apache software license": "Apache-2.0",
    "apache software license 2.0": "Apache-2.0",
    "gpl": "GPL-3.0",
    "gpl-2": "GPL-2.0",
    "gpl-3": "GPL-3.0",
    "gplv2": "GPL-2.0",
    "gplv3": "GPL-3.0",
    "gnu general public license": "GPL-3.0",
    "gnu general public license v3": "GPL-3.0",
    "gnu general public license v2": "GPL-2.0",
    "gnu general public license v3 or later": "GPL-3.0-or-later",
    "gnu general public license v2 or later": "GPL-2.0-or-later",
    "gnu lesser general public license v2": "LGPL-2.1",
    "gnu lesser general public license v3": "LGPL-3.0",
    "gnu lesser general public license v3 or later": "LGPL-3.0-or-later",
    "gnu lesser general public license v2 or later": "LGPL-2.0-or-later",
    "gnu library or lesser general public license": "LGPL-3.0",
    "gnu affero general public license v3": "AGPL-3.0",
    "gnu affero general public license v3 or later": "AGPL-3.0-or-later",
    "lgpl": "LGPL-3.0",
    "lgpl-2.1": "LGPL-2.1",
    "lgpl-3": "LGPL-3.0",
    "agpl": "AGPL-3.0",
    "agpl-3.0": "AGPL-3.0",
    "agplv3": "AGPL-3.0",
    "isc": "ISC",
    "mpl": "MPL-2.0",
    "mpl-2.0": "MPL-2.0",
    "mozilla public license 2.0": "MPL-2.0",
    "unlicense": "Unlicense",
    "public domain": "Unlicense",
    "cc0": "CC0-1.0",
    "cc0-1.0": "CC0-1.0",
    "psf": "PSF-2.0",
    "python software foundation license": "PSF-2.0",
    "the mit license": "MIT",
    "mit license": "MIT",
}

# Case-insensitive index of every SPDX id this tool classifies.
_SPDX_BY_LOWER = {
    spdx.lower(): spdx for spdx in COPYLEFT_FAMILIES | PERMISSIVE_LICENSES
}

# Text-report marker for each severity value.
_SEVERITY_ICONS = {"ok": "[OK]", "warning": "[!!]", "error": "[XX]", "unknown": "[??]"}

# One requirement line: name, optional [extras], optional "<op> version".
# The lookahead rejects URL/VCS/path lines such as "git+https://..." whose
# leading characters merely look like a name.
_REQUIREMENT_RE = re.compile(
    r"^(?P<name>[A-Za-z0-9][A-Za-z0-9_.-]*)"
    r"(?=$|[\s\[<>=!~;@])"
    r"\s*(?:\[[^\]]*\])?"
    r"\s*(?:(?:===|==|~=|!=|>=|<=|>|<)\s*(?P<version>[^\s,;#]+))?"
)

# Tokens inside a TOML array: quoted strings, the closing bracket, comments.
_TOML_ARRAY_TOKEN = re.compile(
    r'"((?:[^"\\\n]|\\.)*)"|\'([^\'\n]*)\'|(\])|#[^\n]*'
)

# Cargo.toml headers: "[dependencies]"-style sections and "[dependencies.x]" tables.
_CARGO_DEP_SECTION = re.compile(r"^\[(?:.+\.)?(?:dev-|build-)?dependencies\]$")
_CARGO_DEP_TABLE = re.compile(
    r"^\[(?:.+\.)?(?:dev-|build-)?dependencies\.([A-Za-z0-9_-]+)\]$"
)

# First GNU license title in a text; group 1 distinguishes AGPL/LGPL from GPL.
_GNU_TITLE_RE = re.compile(
    r"GNU\s+(AFFERO\s+|LESSER\s+|LIBRARY\s+)?GENERAL\s+PUBLIC\s+LICENSE"
)


@dataclass
class DepLicense:
    """One dependency together with its resolved license and verdict."""

    name: str
    version: str = ""
    license: str = "UNKNOWN"
    source: str = ""  # where we found the dep (requirements.txt, package.json, etc.)
    severity: Severity = Severity.UNKNOWN
    message: str = ""


@dataclass
class LicenseReport:
    """Aggregated compatibility report produced by generate_report()."""

    project_dir: str
    project_license: str = "MIT"
    dependencies: list = field(default_factory=list)  # one dict per dependency
    summary: dict = field(default_factory=dict)  # totals per severity value
    errors: list = field(default_factory=list)  # "<name>: <message>" strings
    warnings: list = field(default_factory=list)  # "<name>: <message>" strings


def normalize_license(raw: str) -> str:
    """Normalize a license string to an SPDX identifier.

    The input is matched case-insensitively against the known SPDX ids and
    against LICENSE_ALIASES, first verbatim and then with progressively more
    decoration removed (parenthesised text, a trailing " license", a leading
    "the "). Trying the less-stripped forms first is what lets aliases such
    as "apache software license" match at all.

    Returns "UNKNOWN" for empty, non-string or explicitly unknown input, and
    the stripped input unchanged when nothing matches.
    """
    if not isinstance(raw, str):
        return "UNKNOWN"
    text = raw.strip()
    if not text or text.upper() in ("UNKNOWN", "UNKNOWN:"):
        return "UNKNOWN"

    lowered = text.lower()
    # Remove qualifiers like "MIT License (MIT)"
    no_parens = re.sub(r"\(.*?\)", "", lowered).strip()
    no_suffix = re.sub(r"\s+license$", "", no_parens).strip()
    no_article = re.sub(r"^the\s+", "", no_suffix).strip()

    for candidate in (lowered, no_parens, no_suffix, no_article):
        if candidate in _SPDX_BY_LOWER:
            return _SPDX_BY_LOWER[candidate]
        if candidate in LICENSE_ALIASES:
            return LICENSE_ALIASES[candidate]
    return text


def _license_family(spdx_id: str) -> str:
    """Return an SPDX id without its version suffix (``GPL-3.0-only`` -> ``GPL``)."""
    return re.sub(r"-\d.*$", "", spdx_id)


def check_compatibility(dep_license: str, project_license: str) -> tuple[Severity, str]:
    """Check if a dependency license is compatible with the project license.

    Returns a ``(Severity, message)`` pair:
      * unknown dependency license -> WARNING
      * permissive dependency      -> OK
      * copyleft dependency        -> ERROR in a permissive project, OK when
        both licenses share the same family (id minus version), else WARNING
      * anything else              -> UNKNOWN
    """
    if dep_license == "UNKNOWN":
        return Severity.WARNING, "License unknown — manual review needed"

    if dep_license in PERMISSIVE_LICENSES:
        return Severity.OK, "Compatible (permissive)"

    if dep_license in COPYLEFT_FAMILIES:
        # Copyleft in a permissive project is a problem
        if project_license in PERMISSIVE_LICENSES:
            return Severity.ERROR, f"Copyleft ({dep_license}) in permissive ({project_license}) project"
        # Copyleft in same family is OK. Compare whole family names: a prefix
        # test would accept an empty project license and would lump
        # CC-BY-NC together with CC-BY-SA.
        project_family = _license_family(project_license or "")
        if project_family and _license_family(dep_license) == project_family:
            return Severity.OK, "Compatible (same copyleft family)"
        return Severity.WARNING, f"Review needed: {dep_license} with {project_license}"

    return Severity.UNKNOWN, f"Unrecognized license: {dep_license}"


def parse_requirements_txt(path: str) -> list[DepLicense]:
    """Parse requirements.txt format.

    Blank lines, comments, pip options (lines starting with "-") and
    URL/path requirements are skipped. The version is the operand of the
    first version specifier, or "" when the requirement is unpinned. The
    ``source`` of each entry is the base name of ``path``.
    """
    deps = []
    source = os.path.basename(path)
    with open(path, encoding="utf-8") as f:
        for raw_line in f:
            # Drop inline comments ("pkg==1.0  # why") before matching.
            line = re.sub(r"(^|\s)#.*$", "", raw_line).strip()
            if not line or line.startswith("-"):
                continue
            # Parse name==version or name>=version etc.
            match = _REQUIREMENT_RE.match(line)
            if match:
                deps.append(DepLicense(
                    name=match.group("name"),
                    version=match.group("version") or "",
                    source=source,
                ))
    return deps


def _pyproject_dependency_strings(content: str) -> list[str]:
    """Extract ``[project].dependencies`` strings without a TOML parser.

    Only used when ``tomllib`` is unavailable. The array is tokenised so
    that a "]" inside a requirement string (e.g. ``"pkg[extra]"``) does not
    end it early.
    """
    section = re.search(
        r"^\[project\][ \t]*$(.*?)(?=^\[|\Z)", content, re.MULTILINE | re.DOTALL
    )
    if not section:
        return []
    body = section.group(1)
    start = re.search(r"^dependencies\s*=\s*\[", body, re.MULTILINE)
    if not start:
        return []

    found = []
    for token in _TOML_ARRAY_TOKEN.finditer(body, start.end()):
        if token.group(3):  # unquoted "]" closes the array
            break
        value = token.group(1) if token.group(1) is not None else token.group(2)
        if value is not None:  # comments match with every group unset
            found.append(value)
    return found


def parse_pyproject_toml(path: str) -> list[DepLicense]:
    """Parse pyproject.toml dependencies.

    Reads ``[project].dependencies`` with ``tomllib`` (Python 3.11+) and
    falls back to a regex-based reader on older interpreters. Errors from
    reading or decoding the file propagate to the caller.
    """
    try:
        import tomllib
    except ImportError:
        tomllib = None

    if tomllib is not None:
        with open(path, "rb") as f:
            data = tomllib.load(f)
        project = data.get("project", {})
        dep_strings = project.get("dependencies", []) if isinstance(project, dict) else []
    else:
        with open(path, encoding="utf-8") as f:
            dep_strings = _pyproject_dependency_strings(f.read())

    deps = []
    for dep_str in dep_strings:
        if not isinstance(dep_str, str):
            continue
        name = re.match(r"^([a-zA-Z0-9_.-]+)", dep_str.strip())
        if name:
            deps.append(DepLicense(name=name.group(1), source="pyproject.toml"))
    return deps


def parse_package_json(path: str) -> list[DepLicense]:
    """Parse package.json dependencies ("dependencies" and "devDependencies")."""
    deps = []
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        return deps
    for section in ("dependencies", "devDependencies"):
        entries = data.get(section) or {}
        if not isinstance(entries, dict):
            continue
        for name, version in entries.items():
            deps.append(DepLicense(name=name, version=str(version), source="package.json"))
    return deps


def parse_cargo_toml(path: str) -> list[DepLicense]:
    """Parse Cargo.toml dependencies (basic).

    Only keys inside ``[dependencies]``, ``[dev-dependencies]`` and
    ``[build-dependencies]`` sections (optionally prefixed, e.g.
    ``[target.'cfg(unix)'.dependencies]``) count as dependencies, so
    ``[package]`` fields such as ``name`` and ``version`` are ignored.
    Both ``crate = "1.0"`` and ``crate = { version = "1.0" }`` forms are
    recognised, as are ``[dependencies.crate]`` table headers.
    """
    deps = []
    in_dep_section = False
    with open(path, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("["):
                header = line.split("#", 1)[0].strip()
                in_dep_section = bool(_CARGO_DEP_SECTION.match(header))
                table = _CARGO_DEP_TABLE.match(header)
                if table:
                    deps.append(DepLicense(name=table.group(1), source="Cargo.toml"))
                continue
            if not in_dep_section:
                continue
            entry = re.match(r"^([a-zA-Z0-9_-]+)\s*=\s*(.*)$", line)
            if not entry:
                continue
            name, value = entry.groups()
            version = (re.match(r'^"([^"]*)"', value)
                       or re.search(r'\bversion\s*=\s*"([^"]*)"', value))
            deps.append(DepLicense(
                name=name,
                version=version.group(1) if version else "",
                source="Cargo.toml",
            ))
    return deps


def parse_go_mod(path: str) -> list[DepLicense]:
    """Parse go.mod dependencies.

    Handles ``require ( ... )`` blocks as well as single-line
    ``require module version`` directives. ``//`` comments are removed
    before a line is interpreted.
    """
    deps = []
    in_require = False
    with open(path, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.split("//", 1)[0].strip()
            if not line:
                continue
            if in_require:
                if line == ")":
                    in_require = False
                    continue
                parts = line.split()
            elif re.match(r"^require\s*\($", line):
                in_require = True
                continue
            else:
                single = re.match(r"^require\s+(\S+)\s+(\S+)", line)
                if not single:
                    continue
                parts = list(single.groups())
            if len(parts) >= 2:
                deps.append(DepLicense(name=parts[0], version=parts[1], source="go.mod"))
    return deps


def _parse_dep_file(path: str, parser, label: str) -> list[DepLicense]:
    """Run one parser; on failure warn on stderr and return no dependencies."""
    try:
        return parser(path)
    except Exception as e:  # best-effort: one bad manifest must not abort the scan
        print(f"Warning: Failed to parse {label}: {e}", file=sys.stderr)
        return []


def scan_dep_files(project_dir: str) -> list[DepLicense]:
    """Find and parse all dependency files in a project.

    Looks in ``project_dir`` itself and, for monorepos, one level of
    non-hidden subdirectories (visited in sorted order). Dependencies found
    in a subdirectory get ``source`` set to ``"<subdir>/<filename>"``.
    Files that fail to parse are reported on stderr and skipped.
    """
    all_deps = []
    parsers = {
        "requirements.txt": parse_requirements_txt,
        "requirements-dev.txt": parse_requirements_txt,
        "requirements_prod.txt": parse_requirements_txt,
        "pyproject.toml": parse_pyproject_toml,
        "setup.py": None,  # TODO: parse setup.py
        "package.json": parse_package_json,
        "Cargo.toml": parse_cargo_toml,
        "go.mod": parse_go_mod,
    }

    for filename, parser in parsers.items():
        path = os.path.join(project_dir, filename)
        if parser and os.path.isfile(path):
            all_deps.extend(_parse_dep_file(path, parser, filename))

    # Also check subdirectories for monorepos (one level deep)
    for entry in sorted(os.listdir(project_dir)):
        subdir = os.path.join(project_dir, entry)
        if entry.startswith(".") or not os.path.isdir(subdir):
            continue
        for filename, parser in parsers.items():
            path = os.path.join(subdir, filename)
            if parser and os.path.isfile(path):
                label = f"{entry}/{filename}"
                found = _parse_dep_file(path, parser, label)
                for dep in found:
                    dep.source = label
                all_deps.extend(found)

    return all_deps


def _fetch_json(url: str) -> Optional[dict]:
    """GET ``url`` and decode a JSON object; warn and return None on failure."""
    req = urllib.request.Request(url, headers={"Accept": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # URLError/HTTPError and timeouts are OSError; bad JSON is ValueError.
        print(f"Warning: license lookup failed for {url}: {exc}", file=sys.stderr)
        return None
    return data if isinstance(data, dict) else None


def _license_from_pypi_info(info: dict) -> str:
    """Pick a license string out of the ``info`` object of a PyPI JSON reply."""
    # Try classifiers first
    for classifier in info.get("classifiers") or []:
        if isinstance(classifier, str) and classifier.startswith("License ::"):
            parts = classifier.split(" :: ")
            if len(parts) >= 3:
                return parts[-1]
    # Fall back to license field; long values are skipped (presumably full
    # license texts rather than a name — TODO confirm the 100-char cutoff).
    lic = info.get("license") or ""
    if isinstance(lic, str) and lic and len(lic) < 100:
        return lic
    # Try license_expression
    expression = info.get("license_expression") or ""
    if isinstance(expression, str) and expression:
        return expression
    return "UNKNOWN"


def lookup_pypi_license(package_name: str) -> str:
    """Look up license from PyPI API.

    Returns the raw license string (not normalized), or "UNKNOWN" when the
    request fails or the metadata carries no usable license.
    """
    url = f"https://pypi.org/pypi/{urllib.parse.quote(package_name, safe='')}/json"
    data = _fetch_json(url)
    if data is None:
        return "UNKNOWN"
    info = data.get("info")
    return _license_from_pypi_info(info) if isinstance(info, dict) else "UNKNOWN"


def lookup_npm_license(package_name: str) -> str:
    """Look up license from npm registry.

    Accepts the string form, the ``{"type": ...}`` object form and the
    legacy ``licenses`` array. Returns "UNKNOWN" when nothing usable is found.
    """
    # Scoped names ("@scope/pkg") must keep "@" but encode the slash.
    url = f"https://registry.npmjs.org/{urllib.parse.quote(package_name, safe='@')}"
    data = _fetch_json(url)
    if data is None:
        return "UNKNOWN"

    lic = data.get("license")
    if not lic:
        legacy = data.get("licenses")
        if isinstance(legacy, list) and legacy:
            lic = legacy[0]
    if isinstance(lic, dict):
        lic = lic.get("type")
    return lic if isinstance(lic, str) and lic else "UNKNOWN"


def _classify_license_text(text: str) -> Optional[str]:
    """Guess an SPDX id from the text of a license file; None if unrecognised."""
    content = text.upper()
    head = content[:200]

    # Whole-word match: a substring test would hit "PERMITTED"/"LIMITED",
    # which appear near the top of BSD texts.
    if re.search(r"\bMIT LICENSE\b", content) or re.search(r"\bMIT\b", head):
        return "MIT"
    if "APACHE" in content and "2.0" in content:
        return "Apache-2.0"

    # GNU texts mention each other's names, so classify by whichever title
    # comes FIRST and by the first version number that follows it.
    gnu = _GNU_TITLE_RE.search(content)
    if gnu:
        flavour = (gnu.group(1) or "").strip()
        stated = re.search(r"\bVERSION\s+(\d+(?:\.\d+)?)", content[gnu.end():])
        version = stated.group(1) if stated else ""
        major = version.split(".")[0]
        if flavour == "AFFERO":
            return "AGPL-3.0"
        if flavour in ("LESSER", "LIBRARY"):
            if major == "3":
                return "LGPL-3.0"
            if version == "2.1":
                return "LGPL-2.1"
            if major == "2":
                return "LGPL-2.0"
        else:
            if major == "3":
                return "GPL-3.0"
            if major == "2":
                return "GPL-2.0"
            if "VERSION 3" in content:
                return "GPL-3.0"
            if "VERSION 2" in content:
                return "GPL-2.0"

    if "MOZILLA PUBLIC LICENSE" in content and "2.0" in content:
        return "MPL-2.0"
    if ("BSD" in content[:500]
            or "REDISTRIBUTION AND USE IN SOURCE AND BINARY FORMS" in content):
        # "Neither the name ..." is the wording of the third clause.
        if ("3-CLAUSE" in content or "THREE CLAUSE" in content
                or "NEITHER THE NAME" in content):
            return "BSD-3-Clause"
        return "BSD-2-Clause"
    if re.search(r"\bISC\b", head):
        return "ISC"
    if "FREE AND UNENCUMBERED SOFTWARE RELEASED INTO THE PUBLIC DOMAIN" in content:
        return "Unlicense"
    return None


def detect_project_license(project_dir: str) -> str:
    """Detect the project's own license.

    Checks the usual license file names first, then the ``license`` key of
    ``pyproject.toml``. Returns an SPDX id, or "UNKNOWN" when nothing is
    recognised.
    """
    for name in ("LICENSE", "LICENSE.md", "LICENSE.txt", "LICENCE", "COPYING"):
        path = os.path.join(project_dir, name)
        if os.path.isfile(path):
            with open(path, encoding="utf-8", errors="replace") as f:
                detected = _classify_license_text(f.read())
            if detected:
                return detected

    # Check pyproject.toml
    pypath = os.path.join(project_dir, "pyproject.toml")
    if os.path.isfile(pypath):
        with open(pypath, encoding="utf-8", errors="replace") as f:
            content = f.read()
        match = re.search(r'license\s*=\s*\{\s*text\s*=\s*"([^"]+)"', content)
        if match:
            return normalize_license(match.group(1))
        match = re.search(r'license\s*=\s*"([^"]+)"', content)
        if match:
            return normalize_license(match.group(1))
    return "UNKNOWN"


def _ecosystem_for_source(source: str) -> str:
    """Map a ``DepLicense.source`` to the registry that can answer for it.

    Only the file name matters, so monorepo sources such as
    ``"web/package.json"`` resolve like their top-level counterparts.
    Anything that is not npm, Cargo or Go is treated as PyPI.
    """
    filename = (source or "").replace("\\", "/").rsplit("/", 1)[-1]
    if filename == "package.json":
        return "npm"
    if filename == "Cargo.toml":
        return "cargo"
    if filename == "go.mod":
        return "go"
    return "pypi"


def resolve_licenses(deps: list[DepLicense], cache: Optional[dict] = None) -> None:
    """Resolve license info for all dependencies (mutates ``deps`` in place).

    npm dependencies are looked up on the npm registry and Python ones on
    PyPI. Cargo and Go dependencies have no lookup here and stay "UNKNOWN"
    rather than being matched against an unrelated PyPI package of the same
    name.

    ``cache`` entries written by this function are keyed by
    ``(ecosystem, name)`` so equal names in different registries do not
    collide. Entries keyed by the bare name are still honoured on read for
    caller-supplied caches.
    """
    if cache is None:
        cache = {}

    for dep in deps:
        ecosystem = _ecosystem_for_source(dep.source)
        key = (ecosystem, dep.name)
        if key in cache:
            dep.license = cache[key]
            continue
        if dep.name in cache:
            dep.license = cache[dep.name]
            continue

        if ecosystem == "npm":
            raw = lookup_npm_license(dep.name)
        elif ecosystem == "pypi":
            raw = lookup_pypi_license(dep.name)
        else:
            raw = "UNKNOWN"

        dep.license = normalize_license(raw)
        cache[key] = dep.license


def generate_report(deps: list[DepLicense], project_license: str) -> LicenseReport:
    """Generate the compatibility report.

    Sets ``severity`` and ``message`` on every dependency and returns a
    LicenseReport whose ``dependencies`` are plain dicts. In those dicts
    ``severity`` is the enum's string value, so the text formatter and
    ``json.dumps`` both see "ok"/"warning"/"error"/"unknown".
    """
    report = LicenseReport(
        project_dir="",
        project_license=project_license,
        dependencies=[],
    )

    counts = {"ok": 0, "warning": 0, "error": 0, "unknown": 0}

    for dep in deps:
        severity, message = check_compatibility(dep.license, project_license)
        dep.severity = severity
        dep.message = message
        counts[severity.value] += 1

        if severity == Severity.ERROR:
            report.errors.append(f"{dep.name}: {message}")
        elif severity == Severity.WARNING:
            report.warnings.append(f"{dep.name}: {message}")

        entry = asdict(dep)
        entry["severity"] = severity.value  # asdict() leaves Enum members as-is
        report.dependencies.append(entry)

    report.summary = {
        "total": len(deps),
        **counts,
        "project_license": project_license,
    }

    return report


def format_text(report: LicenseReport) -> str:
    """Format report as human-readable text."""
    lines = []
    lines.append("=" * 60)
    lines.append(" LICENSE COMPATIBILITY REPORT")
    lines.append("=" * 60)
    lines.append(f" Project License: {report.project_license}")
    lines.append(f" Dependencies: {report.summary.get('total', 0)}")
    lines.append(f" OK: {report.summary.get('ok', 0)} "
                 f"WARN: {report.summary.get('warning', 0)} "
                 f"ERR: {report.summary.get('error', 0)} "
                 f"UNK: {report.summary.get('unknown', 0)}")
    lines.append("-" * 60)

    for dep in report.dependencies:
        sev = dep.get("severity", "unknown")
        sev = getattr(sev, "value", sev)  # tolerate a raw Severity member
        name = str(dep.get("name", "?"))
        lic = str(dep.get("license", "?"))
        msg = dep.get("message", "")
        lines.append(f" {_SEVERITY_ICONS.get(sev, '[ ]')} {name:30s} {lic:20s} {msg}")

    if report.errors:
        lines.append("-" * 60)
        lines.append(" ERRORS:")
        for e in report.errors:
            lines.append(f" - {e}")

    if report.warnings:
        lines.append("-" * 60)
        lines.append(" WARNINGS:")
        for w in report.warnings:
            lines.append(f" - {w}")

    lines.append("=" * 60)
    return "\n".join(lines)


def main():
    """Command-line entry point; exits 1 when the report contains errors."""
    parser = argparse.ArgumentParser(description="License Checker — Pipeline 5.4")
    parser.add_argument("project_dir", help="Project directory to scan")
    parser.add_argument("--project-license", default=None,
                        help="Project license SPDX id (auto-detected if omitted)")
    parser.add_argument("--format", choices=["json", "text"], default="text",
                        help="Output format")
    parser.add_argument("--scan-deps", action="store_true",
                        help="Only scan and list deps (skip license lookup)")
    args = parser.parse_args()

    project_dir = os.path.abspath(args.project_dir)
    if not os.path.isdir(project_dir):
        print(f"Error: {project_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    # Detect project license. A user-supplied value is normalized so that
    # e.g. "mit" is classified as permissive like "MIT".
    if args.project_license:
        project_license = normalize_license(args.project_license)
    else:
        project_license = detect_project_license(project_dir)

    # Scan deps
    deps = scan_dep_files(project_dir)
    if not deps:
        print(f"No dependencies found in {project_dir}", file=sys.stderr)
        sys.exit(0)

    print(f"Found {len(deps)} dependencies", file=sys.stderr)

    if args.scan_deps:
        for d in deps:
            print(f" {d.name} ({d.source})")
        sys.exit(0)

    # Resolve licenses
    print("Resolving licenses...", file=sys.stderr)
    resolve_licenses(deps)

    # Generate report
    report = generate_report(deps, project_license)
    report.project_dir = project_dir

    if args.format == "json":
        print(json.dumps(asdict(report), indent=2, default=str))
    else:
        print(format_text(report))

    # Exit code: 1 if errors, 0 otherwise
    sys.exit(1 if report.errors else 0)


if __name__ == "__main__":
    main()