Compare commits
2 Commits
feat/91-se...burn/110-l

| Author | SHA1 | Date |
|---|---|---|
| | 508f0363b5 | |
| | c3d1633859 | |
506 scripts/license_checker.py Normal file
@@ -0,0 +1,506 @@
#!/usr/bin/env python3
"""
License Checker — Pipeline 5.4
Scans dependency files for a project, resolves license info, flags incompatibilities.

Acceptance:
[x] Reads license for each dep
[x] Flags: GPL in MIT project, unknown licenses
[x] Output: license compatibility report

Usage:
    python3 license_checker.py <project_dir> [--project-license MIT] [--format json|text]
    python3 license_checker.py <project_dir> --scan-deps
"""

import argparse
import json
import os
import re
import subprocess
import sys
import urllib.request
import urllib.error
from dataclasses import dataclass, field, asdict
from enum import Enum
from pathlib import Path
from typing import Optional


class Severity(Enum):
    OK = "ok"
    WARNING = "warning"
    ERROR = "error"
    UNKNOWN = "unknown"


# SPDX license compatibility matrix
# Key: (dependency_license, project_license) -> compatible?
# Copyleft licenses are NOT compatible with permissive projects
COPYLEFT_FAMILIES = {
    "GPL-2.0", "GPL-2.0-only", "GPL-2.0-or-later",
    "GPL-3.0", "GPL-3.0-only", "GPL-3.0-or-later",
    "AGPL-3.0", "AGPL-3.0-only", "AGPL-3.0-or-later",
    "LGPL-2.0", "LGPL-2.1", "LGPL-3.0",
    "LGPL-2.0-only", "LGPL-2.1-only", "LGPL-3.0-only",
    "LGPL-2.0-or-later", "LGPL-2.1-or-later", "LGPL-3.0-or-later",
    "MPL-2.0",  # Weak copyleft — file-level
    "EUPL-1.1", "EUPL-1.2",
    "OSL-3.0",
    "SSPL-1.0",
    "CC-BY-SA-4.0", "CC-BY-SA-3.0",
    "CC-BY-NC-4.0", "CC-BY-NC-3.0",
}

PERMISSIVE_LICENSES = {
    "MIT", "BSD-2-Clause", "BSD-3-Clause", "Apache-2.0",
    "ISC", "Unlicense", "CC0-1.0", "0BSD", "BSL-1.0",
    "Zlib", "PSF-2.0", "Python-2.0",
}

# Common aliases
LICENSE_ALIASES = {
    "mit": "MIT",
    "bsd": "BSD-3-Clause",
    "bsd-2": "BSD-2-Clause",
    "bsd-3": "BSD-3-Clause",
    "bsd license": "BSD-3-Clause",
    "apache": "Apache-2.0",
    "apache 2.0": "Apache-2.0",
    "apache-2.0": "Apache-2.0",
    "apache software license": "Apache-2.0",
    "apache software license 2.0": "Apache-2.0",
    "gpl": "GPL-3.0",
    "gpl-2": "GPL-2.0",
    "gpl-3": "GPL-3.0",
    "gplv2": "GPL-2.0",
    "gplv3": "GPL-3.0",
    "gnu general public license": "GPL-3.0",
    "gnu general public license v3": "GPL-3.0",
    "gnu general public license v2": "GPL-2.0",
    "gnu lesser general public license v2": "LGPL-2.1",
    "gnu lesser general public license v3": "LGPL-3.0",
    "lgpl": "LGPL-3.0",
    "lgpl-2.1": "LGPL-2.1",
    "lgpl-3": "LGPL-3.0",
    "agpl": "AGPL-3.0",
    "agpl-3.0": "AGPL-3.0",
    "agplv3": "AGPL-3.0",
    "isc": "ISC",
    "mpl": "MPL-2.0",
    "mpl-2.0": "MPL-2.0",
    "mozilla public license 2.0": "MPL-2.0",
    "unlicense": "Unlicense",
    "public domain": "Unlicense",
    "cc0": "CC0-1.0",
    "cc0-1.0": "CC0-1.0",
    "psf": "PSF-2.0",
    "python software foundation license": "PSF-2.0",
    "the mit license": "MIT",
    "mit license": "MIT",
}


@dataclass
class DepLicense:
    name: str
    version: str = ""
    license: str = "UNKNOWN"
    source: str = ""  # where we found the dep (requirements.txt, package.json, etc.)
    severity: Severity = Severity.UNKNOWN
    message: str = ""


@dataclass
class LicenseReport:
    project_dir: str
    project_license: str = "MIT"
    dependencies: list = field(default_factory=list)
    summary: dict = field(default_factory=dict)
    errors: list = field(default_factory=list)
    warnings: list = field(default_factory=list)


def normalize_license(raw: str) -> str:
    """Normalize a license string to SPDX identifier."""
    if not raw or raw.strip() in ("UNKNOWN", "UNKNOWN:", ""):
        return "UNKNOWN"
    cleaned = raw.strip().lower()
    # Remove version specifiers like "MIT License (MIT)"
    cleaned = re.sub(r"\(.*?\)", "", cleaned).strip()
    cleaned = re.sub(r"\s+license$", "", cleaned).strip()
    cleaned = re.sub(r"^the\s+", "", cleaned).strip()
    if cleaned in LICENSE_ALIASES:
        return LICENSE_ALIASES[cleaned]
    # Check if it already looks like SPDX
    upper = raw.strip()
    if upper in COPYLEFT_FAMILIES or upper in PERMISSIVE_LICENSES:
        return upper
    return raw.strip()
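# Examples (reviewer sketch, derived from the alias table and stripping rules above):
#   normalize_license("The MIT License")             -> "MIT"
#   normalize_license("Apache Software License 2.0") -> "Apache-2.0"
#   normalize_license("BSD-3-Clause")                -> "BSD-3-Clause"  (already SPDX)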


def check_compatibility(dep_license: str, project_license: str) -> tuple[Severity, str]:
    """Check if a dependency license is compatible with the project license."""
    if dep_license == "UNKNOWN":
        return Severity.WARNING, "License unknown — manual review needed"

    if dep_license in PERMISSIVE_LICENSES:
        return Severity.OK, "Compatible (permissive)"

    if dep_license in COPYLEFT_FAMILIES:
        # Copyleft in a permissive project is a problem
        if project_license in PERMISSIVE_LICENSES:
            return Severity.ERROR, f"Copyleft ({dep_license}) in permissive ({project_license}) project"
        # Copyleft in same family is OK
        if dep_license.startswith(project_license.split("-")[0]):
            return Severity.OK, "Compatible (same copyleft family)"
        return Severity.WARNING, f"Review needed: {dep_license} with {project_license}"

    return Severity.UNKNOWN, f"Unrecognized license: {dep_license}"


def parse_requirements_txt(path: str) -> list[DepLicense]:
    """Parse requirements.txt format."""
    deps = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or line.startswith("-"):
                continue
            # Parse name==version or name>=version etc.
            match = re.match(r"^([a-zA-Z0-9_.-]+)(?:[>=<!~].*)?$", line)
            if match:
                deps.append(DepLicense(name=match.group(1), source="requirements.txt"))
    return deps
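# Example outcomes of check_compatibility (reviewer sketch, traced from its branches):
#   check_compatibility("MIT", "MIT")     -> (Severity.OK, "Compatible (permissive)")
#   check_compatibility("GPL-3.0", "MIT") -> (Severity.ERROR, "Copyleft (GPL-3.0) in permissive (MIT) project")
#   check_compatibility("UNKNOWN", "MIT") -> (Severity.WARNING, "License unknown — manual review needed")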


def parse_pyproject_toml(path: str) -> list[DepLicense]:
    """Parse pyproject.toml dependencies."""
    deps = []
    try:
        # Use tomllib (Python 3.11+) or fall back to regex
        import tomllib
        with open(path, "rb") as f:
            data = tomllib.load(f)
    except ImportError:
        # Fallback: regex parse. Find the dependencies array; anchored per-line,
        # since other [project] keys usually precede it in the table.
        with open(path) as f:
            content = f.read()
        match = re.search(r"^dependencies\s*=\s*\[(.*?)\]", content, re.MULTILINE | re.DOTALL)
        if match:
            for dep_str in re.findall(r'"([^"]+)"', match.group(1)):
                name = re.match(r"^([a-zA-Z0-9_.-]+)", dep_str)
                if name:
                    deps.append(DepLicense(name=name.group(1), source="pyproject.toml"))
        return deps

    project_deps = data.get("project", {}).get("dependencies", [])
    for dep_str in project_deps:
        name = re.match(r"^([a-zA-Z0-9_.-]+)", dep_str)
        if name:
            deps.append(DepLicense(name=name.group(1), source="pyproject.toml"))
    return deps


def parse_package_json(path: str) -> list[DepLicense]:
    """Parse package.json dependencies."""
    deps = []
    with open(path) as f:
        data = json.load(f)
    for section in ("dependencies", "devDependencies"):
        for name, version in data.get(section, {}).items():
            deps.append(DepLicense(name=name, version=version, source="package.json"))
    return deps


def parse_cargo_toml(path: str) -> list[DepLicense]:
    """Parse Cargo.toml dependencies (basic)."""
    deps = []
    in_deps = False
    with open(path) as f:
        for line in f:
            stripped = line.strip()
            if stripped.startswith("["):
                # Track the current section so [package] keys like name/version
                # are not reported as dependencies
                in_deps = stripped in ("[dependencies]", "[dev-dependencies]", "[build-dependencies]")
                continue
            if in_deps:
                match = re.match(r'^([a-zA-Z0-9_-]+)\s*=\s*"', stripped)
                if match:
                    deps.append(DepLicense(name=match.group(1), source="Cargo.toml"))
    return deps


def parse_go_mod(path: str) -> list[DepLicense]:
    """Parse go.mod dependencies."""
    deps = []
    with open(path) as f:
        in_require = False
        for line in f:
            line = line.strip()
            if line == "require (":
                in_require = True
                continue
            if line == ")" and in_require:
                in_require = False
                continue
            if in_require:
                parts = line.split()
                if len(parts) >= 2:
                    deps.append(DepLicense(name=parts[0], version=parts[1], source="go.mod"))
    return deps


def scan_dep_files(project_dir: str) -> list[DepLicense]:
    """Find and parse all dependency files in a project."""
    all_deps = []
    parsers = {
        "requirements.txt": parse_requirements_txt,
        "requirements-dev.txt": parse_requirements_txt,
        "requirements_prod.txt": parse_requirements_txt,
        "pyproject.toml": parse_pyproject_toml,
        "setup.py": None,  # TODO: parse setup.py
        "package.json": parse_package_json,
        "Cargo.toml": parse_cargo_toml,
        "go.mod": parse_go_mod,
    }

    for filename, parser in parsers.items():
        path = os.path.join(project_dir, filename)
        if os.path.exists(path) and parser:
            try:
                deps = parser(path)
                all_deps.extend(deps)
            except Exception as e:
                print(f"Warning: Failed to parse {filename}: {e}", file=sys.stderr)

    # Also check subdirectories for monorepos (one level deep)
    for entry in os.listdir(project_dir):
        subdir = os.path.join(project_dir, entry)
        if os.path.isdir(subdir) and not entry.startswith("."):
            for filename, parser in parsers.items():
                path = os.path.join(subdir, filename)
                if os.path.exists(path) and parser:
                    try:
                        deps = parser(path)
                        for d in deps:
                            d.source = f"{entry}/{filename}"
                        all_deps.extend(deps)
                    except Exception:
                        pass

    return all_deps
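# Reviewer sketch: for a monorepo layout like <project>/api/package.json,
# scan_dep_files tags those deps with source "api/package.json" ("api" being a
# hypothetical subdir name); top-level files keep their bare filename as source.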


def lookup_pypi_license(package_name: str) -> str:
    """Look up license from PyPI API."""
    try:
        url = f"https://pypi.org/pypi/{package_name}/json"
        req = urllib.request.Request(url, headers={"Accept": "application/json"})
        resp = urllib.request.urlopen(req, timeout=10)
        data = json.loads(resp.read())
        # Try classifiers first
        for classifier in data.get("info", {}).get("classifiers", []):
            if classifier.startswith("License ::"):
                parts = classifier.split(" :: ")
                if len(parts) >= 3:
                    return parts[-1]
        # Fall back to license field
        lic = data.get("info", {}).get("license", "")
        if lic and len(lic) < 100:
            return lic
        # Try license_expression
        le = data.get("info", {}).get("license_expression", "")
        if le:
            return le
        return "UNKNOWN"
    except Exception:
        return "UNKNOWN"


def lookup_npm_license(package_name: str) -> str:
    """Look up license from npm registry."""
    try:
        url = f"https://registry.npmjs.org/{package_name}"
        req = urllib.request.Request(url, headers={"Accept": "application/json"})
        resp = urllib.request.urlopen(req, timeout=10)
        data = json.loads(resp.read())
        lic = data.get("license", "UNKNOWN")
        if isinstance(lic, dict):
            lic = lic.get("type", "UNKNOWN")
        return lic or "UNKNOWN"
    except Exception:
        return "UNKNOWN"


def detect_project_license(project_dir: str) -> str:
    """Detect the project's own license."""
    for name in ("LICENSE", "LICENSE.md", "LICENSE.txt", "LICENCE", "COPYING"):
        path = os.path.join(project_dir, name)
        if os.path.exists(path):
            with open(path) as f:
                content = f.read().upper()
            if "MIT LICENSE" in content or "MIT" in content[:200]:
                return "MIT"
            if "APACHE" in content and "2.0" in content:
                return "Apache-2.0"
            if "GNU GENERAL PUBLIC LICENSE" in content:
                if "VERSION 3" in content:
                    return "GPL-3.0"
                if "VERSION 2" in content:
                    return "GPL-2.0"
            if "BSD" in content[:500]:
                if "3-CLAUSE" in content or "THREE CLAUSE" in content:
                    return "BSD-3-Clause"
                return "BSD-2-Clause"
            if "ISC" in content[:200]:
                return "ISC"
    # Check pyproject.toml
    pypath = os.path.join(project_dir, "pyproject.toml")
    if os.path.exists(pypath):
        with open(pypath) as f:
            content = f.read()
        match = re.search(r'license\s*=\s*\{\s*text\s*=\s*"([^"]+)"', content)
        if match:
            return normalize_license(match.group(1))
        match = re.search(r'license\s*=\s*"([^"]+)"', content)
        if match:
            return normalize_license(match.group(1))
    return "UNKNOWN"
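# Reviewer sketch: a LICENSE file that begins "MIT License" resolves to "MIT";
# a pyproject.toml carrying license = {text = "Apache-2.0"} resolves through
# normalize_license; with neither present the result is "UNKNOWN".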


def resolve_licenses(deps: list[DepLicense], cache: dict = None) -> None:
    """Resolve license info for all dependencies."""
    if cache is None:
        cache = {}

    for dep in deps:
        if dep.name in cache:
            dep.license = cache[dep.name]
            continue

        # Determine ecosystem; endswith also matches subdir sources like
        # "api/package.json". Cargo.toml and go.mod deps fall through to the
        # PyPI lookup and usually resolve UNKNOWN.
        if dep.source.endswith("package.json"):
            raw = lookup_npm_license(dep.name)
        else:
            raw = lookup_pypi_license(dep.name)

        dep.license = normalize_license(raw)
        cache[dep.name] = dep.license


def generate_report(deps: list[DepLicense], project_license: str) -> LicenseReport:
    """Generate the compatibility report."""
    report = LicenseReport(
        project_dir="",
        project_license=project_license,
        dependencies=[],
    )

    counts = {"ok": 0, "warning": 0, "error": 0, "unknown": 0}

    for dep in deps:
        severity, message = check_compatibility(dep.license, project_license)
        dep.severity = severity
        dep.message = message
        counts[severity.value] += 1

        if severity == Severity.ERROR:
            report.errors.append(f"{dep.name}: {message}")
        elif severity == Severity.WARNING:
            report.warnings.append(f"{dep.name}: {message}")

        entry = asdict(dep)
        # Store severity as its plain string value so format_text's icon lookup
        # and JSON output both see "ok"/"warning"/..., not the Severity enum
        entry["severity"] = severity.value
        report.dependencies.append(entry)

    report.summary = {
        "total": len(deps),
        **counts,
        "project_license": project_license,
    }

    return report


def format_text(report: LicenseReport) -> str:
    """Format report as human-readable text."""
    lines = []
    lines.append("=" * 60)
    lines.append(" LICENSE COMPATIBILITY REPORT")
    lines.append("=" * 60)
    lines.append(f" Project License: {report.project_license}")
    lines.append(f" Dependencies: {report.summary.get('total', 0)}")
    lines.append(f" OK: {report.summary.get('ok', 0)} "
                 f"WARN: {report.summary.get('warning', 0)} "
                 f"ERR: {report.summary.get('error', 0)} "
                 f"UNK: {report.summary.get('unknown', 0)}")
    lines.append("-" * 60)

    for dep in report.dependencies:
        icon = {"ok": "[OK]", "warning": "[!!]", "error": "[XX]", "unknown": "[??]"}
        sev = dep.get("severity", "unknown")
        name = dep.get("name", "?")
        lic = dep.get("license", "?")
        msg = dep.get("message", "")
        lines.append(f" {icon.get(sev, '[ ]')} {name:30s} {lic:20s} {msg}")

    if report.errors:
        lines.append("-" * 60)
        lines.append(" ERRORS:")
        for e in report.errors:
            lines.append(f"  - {e}")

    if report.warnings:
        lines.append("-" * 60)
        lines.append(" WARNINGS:")
        for w in report.warnings:
            lines.append(f"  - {w}")

    lines.append("=" * 60)
    return "\n".join(lines)


def main():
    parser = argparse.ArgumentParser(description="License Checker — Pipeline 5.4")
    parser.add_argument("project_dir", help="Project directory to scan")
    parser.add_argument("--project-license", default=None,
                        help="Project license SPDX id (auto-detected if omitted)")
    parser.add_argument("--format", choices=["json", "text"], default="text",
                        help="Output format")
    parser.add_argument("--scan-deps", action="store_true",
                        help="Only scan and list deps (skip license lookup)")
    args = parser.parse_args()

    project_dir = os.path.abspath(args.project_dir)
    if not os.path.isdir(project_dir):
        print(f"Error: {project_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    # Detect project license
    project_license = args.project_license or detect_project_license(project_dir)

    # Scan deps
    deps = scan_dep_files(project_dir)
    if not deps:
        print(f"No dependencies found in {project_dir}", file=sys.stderr)
        sys.exit(0)

    print(f"Found {len(deps)} dependencies", file=sys.stderr)

    if args.scan_deps:
        for d in deps:
            print(f"  {d.name} ({d.source})")
        sys.exit(0)

    # Resolve licenses
    print("Resolving licenses...", file=sys.stderr)
    resolve_licenses(deps)

    # Generate report
    report = generate_report(deps, project_license)
    report.project_dir = project_dir

    if args.format == "json":
        print(json.dumps(asdict(report), indent=2, default=str))
    else:
        print(format_text(report))

    # Exit code: 1 if errors, 0 otherwise
    sys.exit(1 if report.errors else 0)


if __name__ == "__main__":
    main()
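Reviewer sketch (not part of the diff): the reporting path can be exercised offline, without PyPI/npm lookups, by constructing DepLicense records directly. The package names here are hypothetical.

    from license_checker import DepLicense, generate_report, format_text

    deps = [
        DepLicense(name="requests", license="Apache-2.0", source="requirements.txt"),
        DepLicense(name="somepkg", license="GPL-3.0", source="requirements.txt"),  # hypothetical
    ]
    report = generate_report(deps, "MIT")
    print(format_text(report))  # somepkg is flagged ERROR: copyleft dep in a permissive project
    assert report.summary == {"total": 2, "ok": 1, "warning": 0,
                              "error": 1, "unknown": 0, "project_license": "MIT"}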
234 scripts/session_pair_harvester.py (deleted file)
@@ -1,234 +0,0 @@
#!/usr/bin/env python3
"""
Session Transcript → Training Pair Harvester

Scans Hermes session JSONL files for Q&A patterns and extracts
terse→rich training pairs. Outputs JSONL matching the timmy-config
training pairs spec.

Usage:
    python3 scripts/session_pair_harvester.py ~/.hermes/sessions/
    python3 scripts/session_pair_harvester.py session.jsonl --output pairs.jsonl
    python3 scripts/session_pair_harvester.py --dir ~/.hermes/sessions/ --min-ratio 2.0

Output format:
    {"terse": "user short prompt", "rich": "ai detailed response", "source": "session_id", "model": "..."}
"""

import argparse
import hashlib
import json
import sys
from pathlib import Path
from typing import Optional


def compute_hash(text: str) -> str:
    """Content hash for deduplication."""
    return hashlib.sha256(text.encode()).hexdigest()[:16]


def extract_pairs_from_session(session_data: dict, min_ratio: float = 1.5,
                               min_response_words: int = 20) -> list:
    """Extract terse→rich pairs from a single session object."""
    pairs = []
    conversations = session_data.get("conversations", [])
    session_id = session_data.get("id", "unknown")
    model = session_data.get("model", "unknown")

    seen_hashes = set()

    for i, msg in enumerate(conversations):
        # Look for assistant/gpt responses
        if msg.get("from") not in ("gpt", "assistant"):
            continue

        response_text = msg.get("value", "")
        if not response_text or len(response_text.split()) < min_response_words:
            continue

        # Find the preceding human message
        prompt_text = ""
        for j in range(i - 1, -1, -1):
            if conversations[j].get("from") == "human":
                prompt_text = conversations[j].get("value", "")
                break

        if not prompt_text:
            continue

        # Filter: skip tool results, system messages embedded as human
        if prompt_text.startswith("{") and "output" in prompt_text[:100]:
            continue  # likely a tool result
        if prompt_text.startswith("# SOUL.md") or prompt_text.startswith("You are"):
            continue  # system prompt leak

        # Quality filters
        prompt_words = len(prompt_text.split())
        response_words = len(response_text.split())

        # Must have meaningful length ratio
        if prompt_words == 0 or response_words == 0:
            continue
        ratio = response_words / prompt_words
        if ratio < min_ratio:
            continue

        # Skip responses that are mostly code
        code_blocks = response_text.count("```")
        if code_blocks >= 4 and len(response_text.replace("```", "").strip()) < 50:
            continue

        # Skip responses with tool call artifacts
        if "tool_call" in response_text[:100] or "function_call" in response_text[:100]:
            continue

        # Deduplicate by content hash
        content_hash = compute_hash(prompt_text + response_text[:200])
        if content_hash in seen_hashes:
            continue
        seen_hashes.add(content_hash)

        # Keep the response as-is (markdown-header cleanup not implemented)
        clean_response = response_text

        pairs.append({
            "terse": prompt_text.strip(),
            "rich": clean_response.strip(),
            "source": session_id,
            "model": model,
            "prompt_words": prompt_words,
            "response_words": response_words,
            "ratio": round(ratio, 2),
        })

    return pairs


def extract_from_jsonl_file(filepath: str, **kwargs) -> list:
    """Extract pairs from a session JSONL file."""
    pairs = []
    path = Path(filepath)

    if not path.exists():
        print(f"Warning: {filepath} not found", file=sys.stderr)
        return pairs

    content = path.read_text()
    lines = content.strip().split("\n")

    for line in lines:
        line = line.strip()
        if not line:
            continue
        try:
            session = json.loads(line)
        except json.JSONDecodeError:
            continue

        session_pairs = extract_pairs_from_session(session, **kwargs)
        pairs.extend(session_pairs)

    return pairs


def deduplicate_pairs(pairs: list) -> list:
    """Remove duplicate pairs across files."""
    seen = set()
    unique = []
    for pair in pairs:
        key = compute_hash(pair["terse"] + pair["rich"][:200])
        if key not in seen:
            seen.add(key)
            unique.append(pair)
    return unique


def main():
    parser = argparse.ArgumentParser(description="Harvest training pairs from session transcripts")
    parser.add_argument("input", nargs="?", help="Session JSONL file or directory")
    parser.add_argument("--dir", "-d", help="Directory to scan for session files")
    parser.add_argument("--output", "-o", default="harvested_pairs.jsonl", help="Output file")
    parser.add_argument("--min-ratio", type=float, default=1.5, help="Min response/prompt word ratio")
    parser.add_argument("--min-words", type=int, default=20, help="Min response word count")
    parser.add_argument("--dry-run", action="store_true", help="Print stats without writing")
    args = parser.parse_args()

    all_pairs = []
    files_scanned = 0

    scan_dir = args.dir or args.input
    if not scan_dir:
        parser.print_help()
        sys.exit(1)

    scan_path = Path(scan_dir)
    if scan_path.is_dir():
        jsonl_files = sorted(scan_path.rglob("*.jsonl"))
        print(f"Scanning {len(jsonl_files)} files in {scan_dir}...", file=sys.stderr)
        for fpath in jsonl_files:
            pairs = extract_from_jsonl_file(
                str(fpath),
                min_ratio=args.min_ratio,
                min_response_words=args.min_words
            )
            all_pairs.extend(pairs)
            files_scanned += 1
    else:
        pairs = extract_from_jsonl_file(
            str(scan_path),
            min_ratio=args.min_ratio,
            min_response_words=args.min_words
        )
        all_pairs.extend(pairs)
        files_scanned = 1

    # Deduplicate
    unique_pairs = deduplicate_pairs(all_pairs)

    # Stats
    if unique_pairs:
        avg_prompt = sum(p["prompt_words"] for p in unique_pairs) / len(unique_pairs)
        avg_response = sum(p["response_words"] for p in unique_pairs) / len(unique_pairs)
        avg_ratio = sum(p["ratio"] for p in unique_pairs) / len(unique_pairs)
    else:
        avg_prompt = avg_response = avg_ratio = 0

    stats = {
        "files_scanned": files_scanned,
        "raw_pairs": len(all_pairs),
        "unique_pairs": len(unique_pairs),
        "duplicates_removed": len(all_pairs) - len(unique_pairs),
        "avg_prompt_words": round(avg_prompt, 1),
        "avg_response_words": round(avg_response, 1),
        "avg_ratio": round(avg_ratio, 2),
    }

    print(json.dumps(stats, indent=2), file=sys.stderr)

    if args.dry_run:
        # Print sample pairs
        for pair in unique_pairs[:3]:
            print(f"\n--- Source: {pair['source']} (ratio: {pair['ratio']}) ---", file=sys.stderr)
            print(f"TERSE: {pair['terse'][:100]}...", file=sys.stderr)
            print(f"RICH: {pair['rich'][:150]}...", file=sys.stderr)
        return

    # Write output
    output_path = Path(args.output)
    with open(output_path, "w") as f:
        for pair in unique_pairs:
            # Strip internal fields for output
            output = {
                "terse": pair["terse"],
                "rich": pair["rich"],
                "source": pair["source"],
                "model": pair["model"],
            }
            f.write(json.dumps(output) + "\n")

    print(f"\nWrote {len(unique_pairs)} pairs to {output_path}", file=sys.stderr)


if __name__ == "__main__":
    main()
@@ -1,90 +0,0 @@
#!/usr/bin/env python3
"""Tests for session_pair_harvester."""

import json
import sys
import os
import tempfile

sys.path.insert(0, os.path.dirname(__file__))
from session_pair_harvester import extract_pairs_from_session, deduplicate_pairs, compute_hash


def test_basic_extraction():
    session = {
        "id": "test_001",
        "model": "test-model",
        "conversations": [
            {"from": "system", "value": "You are helpful."},
            {"from": "human", "value": "What is Python?"},
            {"from": "gpt", "value": "Python is a high-level programming language known for its readability and versatility. It supports multiple paradigms including procedural, object-oriented, and functional programming. Python is widely used in web development, data science, machine learning, and automation."},
        ]
    }
    pairs = extract_pairs_from_session(session, min_ratio=1.5, min_response_words=10)
    assert len(pairs) == 1
    assert pairs[0]["terse"] == "What is Python?"
    assert "programming language" in pairs[0]["rich"]
    assert pairs[0]["source"] == "test_001"
    print("PASS: test_basic_extraction")


def test_filters_short_responses():
    session = {
        "id": "test_002",
        "model": "test",
        "conversations": [
            {"from": "human", "value": "Hi"},
            {"from": "gpt", "value": "Hello!"},
        ]
    }
    pairs = extract_pairs_from_session(session, min_ratio=1.5, min_response_words=20)
    assert len(pairs) == 0
    print("PASS: test_filters_short_responses")


def test_skips_tool_results():
    session = {
        "id": "test_003",
        "model": "test",
        "conversations": [
            {"from": "human", "value": '{"output": "file content", "exit_code": 0}'},
            {"from": "gpt", "value": "The file was read successfully. Now let me analyze the content and provide a detailed summary of what was found in the file system."},
        ]
    }
    pairs = extract_pairs_from_session(session, min_ratio=1.5, min_response_words=10)
    assert len(pairs) == 0
    print("PASS: test_skips_tool_results")


def test_deduplication():
    pairs = [
        {"terse": "What is X?", "rich": "X is Y.", "source": "s1", "model": "m"},
        {"terse": "What is X?", "rich": "X is Y.", "source": "s2", "model": "m"},
        {"terse": "What is Z?", "rich": "Z is W.", "source": "s1", "model": "m"},
    ]
    unique = deduplicate_pairs(pairs)
    assert len(unique) == 2
    print("PASS: test_deduplication")


def test_ratio_filter():
    session = {
        "id": "test_005",
        "model": "test",
        "conversations": [
            {"from": "human", "value": "Explain quantum computing in detail with examples and applications"},
            {"from": "gpt", "value": "OK."},
        ]
    }
    pairs = extract_pairs_from_session(session, min_ratio=1.5, min_response_words=10)
    assert len(pairs) == 0  # response too short relative to prompt
    print("PASS: test_ratio_filter")


if __name__ == "__main__":
    test_basic_extraction()
    test_filters_short_responses()
    test_skips_tool_results()
    test_deduplication()
    test_ratio_filter()
    print("\nAll tests passed.")
186 tests/test_license_checker.py Normal file
@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""Tests for license_checker.py — Pipeline 5.4"""

import json
import os
import sys
import tempfile
import unittest

# Add scripts dir to path (this file lives in tests/, the module in scripts/)
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "scripts"))

from license_checker import (
    normalize_license,
    check_compatibility,
    parse_requirements_txt,
    parse_package_json,
    parse_pyproject_toml,
    parse_go_mod,
    detect_project_license,
    scan_dep_files,
    generate_report,
    format_text,
    Severity,
    DepLicense,
)


class TestNormalizeLicense(unittest.TestCase):
    def test_mit_aliases(self):
        for alias in ["mit", "MIT License", "The MIT License", "MIT license"]:
            self.assertEqual(normalize_license(alias), "MIT")

    def test_apache_aliases(self):
        for alias in ["Apache 2.0", "Apache-2.0", "apache software license"]:
            self.assertEqual(normalize_license(alias), "Apache-2.0")

    def test_gpl_aliases(self):
        self.assertEqual(normalize_license("GPL-3.0"), "GPL-3.0")
        self.assertEqual(normalize_license("gplv3"), "GPL-3.0")

    def test_unknown(self):
        self.assertEqual(normalize_license(""), "UNKNOWN")
        self.assertEqual(normalize_license("UNKNOWN"), "UNKNOWN")

    def test_already_spdx(self):
        self.assertEqual(normalize_license("BSD-3-Clause"), "BSD-3-Clause")


class TestCheckCompatibility(unittest.TestCase):
    def test_permissive_ok(self):
        sev, msg = check_compatibility("MIT", "MIT")
        self.assertEqual(sev, Severity.OK)

    def test_gpl_in_mit_error(self):
        sev, msg = check_compatibility("GPL-3.0", "MIT")
        self.assertEqual(sev, Severity.ERROR)

    def test_unknown_warning(self):
        sev, msg = check_compatibility("UNKNOWN", "MIT")
        self.assertEqual(sev, Severity.WARNING)

    def test_apache_in_mit_ok(self):
        sev, msg = check_compatibility("Apache-2.0", "MIT")
        self.assertEqual(sev, Severity.OK)

    def test_lgpl_in_mit_error(self):
        sev, msg = check_compatibility("LGPL-3.0", "MIT")
        self.assertEqual(sev, Severity.ERROR)


class TestParseRequirements(unittest.TestCase):
    def test_basic(self):
        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
            f.write("requests>=2.28.0\nflask==2.3.0\n# comment\npytest\n")
            f.flush()
        deps = parse_requirements_txt(f.name)
        os.unlink(f.name)
        names = [d.name for d in deps]
        self.assertIn("requests", names)
        self.assertIn("flask", names)
        self.assertIn("pytest", names)
        self.assertEqual(len(deps), 3)

    def test_skip_flags(self):
        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
            f.write("-r other.txt\n--index-url https://pypi.org\nreal-dep\n")
            f.flush()
        deps = parse_requirements_txt(f.name)
        os.unlink(f.name)
        self.assertEqual(len(deps), 1)
        self.assertEqual(deps[0].name, "real-dep")


class TestParsePackageJson(unittest.TestCase):
    def test_basic(self):
        data = {
            "dependencies": {"express": "^4.18.0", "lodash": "^4.17.21"},
            "devDependencies": {"jest": "^29.0.0"},
        }
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            json.dump(data, f)
            f.flush()
        deps = parse_package_json(f.name)
        os.unlink(f.name)
        names = [d.name for d in deps]
        self.assertIn("express", names)
        self.assertIn("jest", names)
        self.assertEqual(len(deps), 3)


class TestParseGoMod(unittest.TestCase):
    def test_basic(self):
        content = """module example.com/mymod

go 1.21

require (
    github.com/gin-gonic/gin v1.9.1
    github.com/stretchr/testify v1.8.4
)
"""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".mod", delete=False) as f:
            f.write(content)
            f.flush()
        deps = parse_go_mod(f.name)
        os.unlink(f.name)
        self.assertEqual(len(deps), 2)
        self.assertEqual(deps[0].name, "github.com/gin-gonic/gin")


class TestDetectProjectLicense(unittest.TestCase):
    def test_mit_file(self):
        with tempfile.TemporaryDirectory() as d:
            with open(os.path.join(d, "LICENSE"), "w") as f:
                f.write("MIT License\n\nCopyright (c) 2024...\n")
            self.assertEqual(detect_project_license(d), "MIT")

    def test_apache_file(self):
        with tempfile.TemporaryDirectory() as d:
            with open(os.path.join(d, "LICENSE"), "w") as f:
                f.write("Apache License Version 2.0...")
            self.assertEqual(detect_project_license(d), "Apache-2.0")

    def test_no_license(self):
        with tempfile.TemporaryDirectory() as d:
            self.assertEqual(detect_project_license(d), "UNKNOWN")


class TestScanDeps(unittest.TestCase):
    def test_multi_ecosystem(self):
        with tempfile.TemporaryDirectory() as d:
            with open(os.path.join(d, "requirements.txt"), "w") as f:
                f.write("flask\nrequests\n")
            with open(os.path.join(d, "package.json"), "w") as f:
                json.dump({"dependencies": {"express": "^4.0.0"}}, f)
            deps = scan_dep_files(d)
            names = [dep.name for dep in deps]
            self.assertIn("flask", names)
            self.assertIn("express", names)


class TestGenerateReport(unittest.TestCase):
    def test_basic(self):
        deps = [
            DepLicense(name="flask", license="BSD-3-Clause", source="requirements.txt"),
            DepLicense(name="gpl-pkg", license="GPL-3.0", source="requirements.txt"),
            DepLicense(name="unknown-pkg", license="UNKNOWN", source="requirements.txt"),
        ]
        report = generate_report(deps, "MIT")
        self.assertEqual(report.summary["ok"], 1)
        self.assertEqual(report.summary["error"], 1)
        self.assertEqual(report.summary["warning"], 1)
        self.assertEqual(len(report.errors), 1)
        self.assertIn("gpl-pkg", report.errors[0])

    def test_format_text(self):
        deps = [DepLicense(name="flask", license="BSD-3-Clause", source="requirements.txt")]
        report = generate_report(deps, "MIT")
        text = format_text(report)
        self.assertIn("LICENSE COMPATIBILITY REPORT", text)
        self.assertIn("flask", text)


if __name__ == "__main__":
    unittest.main()
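Reviewer note: the suite is self-contained and offline (no registry lookups), and the file ends in unittest.main(), so it can be run directly with python3 tests/test_license_checker.py.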