# compounding-intelligence/scripts/release_note_analyzer.py

#!/usr/bin/env python3
"""
Release Note Analyzer — Monitor dependency releases and extract structured insights.

Fetches GitHub releases for configured repositories, parses changelogs,
categorizes changes, and flags breaking changes.

Usage:
    python3 scripts/release_note_analyzer.py --repos owner/repo1,owner/repo2
    python3 scripts/release_note_analyzer.py --repos numpy/numpy --limit 5
    python3 scripts/release_note_analyzer.py --repos owner/repo --output metrics/releases.json
    python3 scripts/release_note_analyzer.py --repos owner/repo --token $GITHUB_TOKEN

Output:
    JSON with per-release structure: version, date, url, categories (features, fixes, other), breaking_change_flags, raw_body
"""

import argparse
import json
import re
import sys
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, field, asdict
import os


@dataclass
class ReleaseAnalysis:
    """Structured record describing one analyzed release.

    The three leading fields are required; the remaining ones default to
    empty containers / an empty string so a record can be built first and
    filled in afterwards.
    """

    # Required identification fields.
    version: str
    date: str
    url: str

    # Category name -> changelog lines assigned to that category.
    categories: Dict[str, List[str]] = field(default_factory=dict)
    # Changelog lines flagged as potential breaking changes.
    breaking_change_flags: List[str] = field(default_factory=list)
    # Release note text kept alongside the analysis.
    raw_body: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dict (built with ``dataclasses.asdict``)."""
        as_plain_dict = asdict(self)
        return as_plain_dict


def fetch_github_releases(repo: str, token: Optional[str] = None, limit: int = 10) -> List[Dict[str, Any]]:
    """Fetch the latest releases of a repository from the GitHub REST API.

    This is a best-effort helper: any failure is reported on stderr and an
    empty list is returned instead of raising.

    Args:
        repo: Repository in ``owner/repo`` form.
        token: Optional GitHub API token, sent in the ``Authorization`` header.
        limit: Maximum number of releases to return. Only a single page is
            requested and GitHub caps ``per_page`` at 100, so at most 100
            releases are returned.

    Returns:
        A list of release objects (dicts) as returned by the API, or ``[]``
        when ``limit`` is not positive, the request fails, or the response
        is not a JSON array.
    """
    import urllib.request
    import urllib.error

    # Nothing to fetch; avoid a pointless (rate-limited) API call.
    if limit <= 0:
        return []

    # GitHub documents 100 as the maximum page size.
    per_page = min(limit, 100)
    url = f"https://api.github.com/repos/{repo}/releases?per_page={per_page}"
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    req = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            data = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        print(f"Error fetching releases for {repo}: HTTP {e.code}", file=sys.stderr)
        return []
    except Exception as e:
        # Deliberately broad: network errors, timeouts and malformed JSON are
        # all treated the same way so one bad repo does not abort the run.
        print(f"Error fetching releases for {repo}: {e}", file=sys.stderr)
        return []

    # Callers iterate the result and call .get() on each item, so anything
    # other than a list of objects must not leak out of this function.
    if not isinstance(data, list):
        print(f"Error fetching releases for {repo}: unexpected response format", file=sys.stderr)
        return []

    return [item for item in data if isinstance(item, dict)][:limit]


def categorize_changelog(body: str) -> Dict[str, List[str]]:
    """Categorize release note lines into features, fixes, and other.

    Lines under a recognised Markdown section header (e.g. ``## Features``,
    ``### Bug Fixes``) are assigned to that section. Lines outside such a
    section are classified by their leading keyword; a list marker
    (``-``, ``*``, ``+`` or ``1.``) in front of the keyword is ignored for
    matching so that bulleted entries are classified as well.

    Args:
        body: Raw release note text; ``None`` or empty yields empty categories.

    Returns:
        Dict with the keys ``"features"``, ``"fixes"`` and ``"other"``. Each
        value lists the stripped source lines (list markers preserved) in
        order of first appearance, without duplicates.
    """
    categories: Dict[str, List[str]] = {
        "features": [],
        "fixes": [],
        "other": [],
    }

    if not body:
        return categories

    # Markdown ATX headers have one to six leading '#'.
    header_re = re.compile(r'^#{1,6}\s+(.+)$')
    # Leading list marker: "- ", "* ", "+ ", "1. " or "1) ".
    bullet_re = re.compile(r'^(?:[-*+]|\d+[.)])\s+')

    # Section header patterns
    feature_header = re.compile(r'^(?:features?|new|add|enhancement)s?', re.IGNORECASE)
    fix_header = re.compile(r'^(?:fix(?:es|ed)?|bug|patch|correction)', re.IGNORECASE)

    # Keyword patterns for lines that are not under a recognised section
    feature_line = re.compile(r'^(?:added|new|feature|introdu)', re.IGNORECASE)
    fix_line = re.compile(r'^(?:fix|bug|patch|resolved)', re.IGNORECASE)

    current_section = None

    for line in body.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue

        # Section headers switch (or reset) the active section and are not
        # recorded as entries themselves.
        header_match = header_re.match(stripped)
        if header_match:
            header = header_match.group(1)
            if feature_header.search(header):
                current_section = "features"
            elif fix_header.search(header):
                current_section = "fixes"
            else:
                current_section = None
            continue

        if current_section:
            categories[current_section].append(stripped)
            continue

        # Infer from keywords, looking past any list marker so that
        # "- Fixed crash" is recognised just like "Fixed crash".
        text = bullet_re.sub('', stripped, count=1)
        if feature_line.search(text):
            categories["features"].append(stripped)
        elif fix_line.search(text):
            categories["fixes"].append(stripped)
        else:
            categories["other"].append(stripped)

    # Deduplicate within categories while keeping first-seen order
    for cat in categories:
        categories[cat] = list(dict.fromkeys(categories[cat]))

    return categories


def detect_breaking_changes(body: str) -> List[str]:
    """Detect and extract potential breaking change indicators.

    Args:
        body: Raw release note text; ``None`` or empty yields no flags.

    Returns:
        The stripped lines of ``body`` that contain one of the breaking-change
        keywords, in their original order.
    """
    # Release bodies can be missing; treat that the same as "nothing flagged"
    # instead of failing on None.
    if not body:
        return []

    # Keywords that suggest breaking changes
    breaking_keywords = re.compile(
        r'\b(?:BREAKING|breaking\s+change|backward\s+incompatible|'
        r'removed\s+.*?API|deprecated.*?removed|'
        r'major\s+version|'
        r'not\s+backward\s+compatible)\b',
        re.IGNORECASE
    )

    return [
        line.strip()
        for line in body.split('\n')
        if breaking_keywords.search(line)
    ]


def analyze_releases(repos: List[str], token: Optional[str] = None, limit: int = 10) -> List[Dict[str, Any]]:
    """Fetch and analyze releases for all configured repos.

    Args:
        repos: Repository names in ``owner/repo`` form; blank entries are skipped.
        token: Optional GitHub API token forwarded to the fetcher.
        limit: Maximum number of releases requested per repository.

    Returns:
        One dict per release (see ``ReleaseAnalysis.to_dict``), in the order
        the repositories were given and the releases were returned.
    """
    all_releases = []

    for repo in repos:
        repo = repo.strip()
        if not repo:
            continue

        for release_data in fetch_github_releases(repo, token=token, limit=limit):
            # Use `or` rather than a .get() default: the API may send an
            # explicit null (e.g. no body, unpublished draft), which a
            # .get() default would not replace.
            body = release_data.get('body') or ""

            analysis = ReleaseAnalysis(
                version=release_data.get('tag_name') or 'unknown',
                date=release_data.get('published_at') or '',
                url=release_data.get('html_url') or '',
                categories=categorize_changelog(body),
                breaking_change_flags=detect_breaking_changes(body),
                raw_body=body[:5000],  # Truncate for output size
            )

            all_releases.append(analysis.to_dict())

    return all_releases


def main():
    """Command-line entry point.

    Parses the arguments, analyzes the releases of every requested
    repository and emits the result as JSON, either to the ``--output``
    file or to stdout. The token is taken from ``--token`` or, failing
    that, from the ``GITHUB_TOKEN`` environment variable.
    """
    parser = argparse.ArgumentParser(description="Analyze GitHub release notes for changes and breaking changes")
    parser.add_argument('--repos', required=True, help='Comma-separated list of GitHub repos (owner/repo)')
    parser.add_argument('--token', help='GitHub API token (or set GITHUB_TOKEN env var)')
    parser.add_argument('--limit', type=int, default=10, help='Max releases per repo (default: 10)')
    parser.add_argument('--output', help='Write JSON output to file (default: stdout)')

    args = parser.parse_args()

    # Drop blank entries (trailing or doubled commas) so the reported
    # "repos" list only names repositories that were actually queried.
    repos = [name for name in (r.strip() for r in args.repos.split(',')) if name]
    token = args.token or os.environ.get('GITHUB_TOKEN')

    results = analyze_releases(repos, token=token, limit=args.limit)

    output = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "repos": repos,
        "release_count": len(results),
        "releases": results
    }

    if args.output:
        # Create the target directory on demand so paths such as
        # "metrics/releases.json" work on a fresh checkout.
        out_dir = os.path.dirname(args.output)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2)
        print(f"Wrote {len(results)} releases to {args.output}")
    else:
        print(json.dumps(output, indent=2))


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()