compounding-intelligence/scripts/github_trending_scanner.py

#!/usr/bin/env python3
"""GitHub Trending Scanner — Scan trending repos in AI/ML.

Extracts: repo description, stars, key features (topics, inferred highlights).
Filters by language and/or topic. Outputs dated JSON for daily scan pipeline.

Usage:
    python3 github_trending_scanner.py --language python --topic ai --output metrics/trending
    python3 github_trending_scanner.py --topic machine-learning --limit 50
    python3 github_trending_scanner.py --language rust --topic artificial-intelligence
"""

import argparse
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List, Dict
import urllib.request
import urllib.parse
import urllib.error

# Runtime configuration. Every value below may be overridden through the
# process environment; the literal second argument is the built-in fallback.

# Root URL of the GitHub REST API used to build request URLs.
GITHUB_API_BASE = os.getenv("GITHUB_API_BASE", "https://api.github.com")
# Default output directory name (TRENDING_OUTPUT_DIR).
DEFAULT_OUTPUT_DIR = os.getenv("TRENDING_OUTPUT_DIR", "metrics/trending")
# Default number of repositories to request (TRENDING_LIMIT).
DEFAULT_LIMIT = int(os.getenv("TRENDING_LIMIT", "30"))
# Default star-count threshold for the search query (TRENDING_MIN_STARS).
DEFAULT_MIN_STARS = int(os.getenv("TRENDING_MIN_STARS", "1000"))


def _rate_limit_wait_seconds(headers) -> int:
    """Return how many seconds to pause after a rate-limited response.

    Prefers the ``Retry-After`` header (delta-seconds form), then falls back
    to ``X-RateLimit-Reset`` (epoch seconds) plus a 5 second safety margin.
    Missing or malformed headers yield the 5 second minimum instead of
    raising.
    """

    def _header_int(name: str) -> Optional[int]:
        raw = headers.get(name) if headers is not None else None
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None

    retry_after = _header_int("Retry-After")
    if retry_after is not None:
        return max(5, retry_after)

    reset_ts = _header_int("X-RateLimit-Reset")
    if reset_ts is not None:
        return max(5, reset_ts - int(time.time()) + 5)

    return 5


def fetch_trending_repos(
    language: Optional[str] = None,
    topic: Optional[str] = None,
    min_stars: int = DEFAULT_MIN_STARS,
    limit: int = DEFAULT_LIMIT,
) -> List[Dict]:
    """Fetch trending-like repositories from GitHub using the search API.

    The request is unauthenticated, so it is subject to GitHub's strict
    search rate limit (10 requests per minute at the time of writing).
    Popularity (star count, descending) is used as a proxy for "trending".

    Args:
        language: Optional ``language:`` qualifier for the search query.
        topic: Optional ``topic:`` qualifier for the search query.
        min_stars: Only repositories with more stars than this are returned.
        limit: Maximum number of repositories to return. A single page is
            requested, so at most 100 results are returned even when a
            larger value is passed. Non-positive values return ``[]``
            without contacting the API.

    Returns:
        The raw repository dicts from the API's ``items`` array, or an
        empty list when the request ultimately fails. Errors are reported
        on stderr rather than raised.
    """
    if limit <= 0:
        return []

    # Build search query: stars threshold + optional language/topic filters
    query = f"stars:>{min_stars}"
    if language:
        query += f" language:{language}"
    if topic:
        query += f" topic:{topic}"

    # Sort by stars descending as a proxy for trending/popular
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": min(limit, 100),  # GitHub max per_page is 100
    }
    url = f"{GITHUB_API_BASE}/search/repositories?{urllib.parse.urlencode(params)}"

    headers = {
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": "Sovereign-Trending-Scanner/1.0",
    }

    max_attempts = 3
    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=30) as resp:
                if resp.status != 200:
                    raise RuntimeError(f"GitHub API returned {resp.status}")
                data = json.loads(resp.read().decode("utf-8"))
            # "items" may be absent or null in an unexpected payload.
            return (data.get("items") or [])[:limit]
        except urllib.error.HTTPError as e:
            # The error body is a stream: read it exactly once and reuse it,
            # otherwise a second read() yields an empty string.
            try:
                body = e.read().decode("utf-8", errors="replace")
            except Exception:
                body = ""

            rate_limited = e.code == 429 or (
                e.code == 403 and "rate limit" in body.lower()
            )
            if rate_limited:
                if is_last_attempt:
                    # Sleeping now would only delay the inevitable empty result.
                    print(
                        f"ERROR: GitHub rate limit still exceeded after {max_attempts} attempts",
                        file=sys.stderr,
                    )
                    return []
                wait_seconds = _rate_limit_wait_seconds(e.headers)
                print(f"Rate limit exceeded — waiting {wait_seconds}s (attempt {attempt+1}/3)...", file=sys.stderr)
                time.sleep(wait_seconds)
                continue

            if e.code >= 500 and not is_last_attempt:
                # Server-side failures are usually transient; back off and retry.
                backoff = 2 ** attempt
                print(f"WARNING: Fetch attempt {attempt+1} failed: {e} — retrying in {backoff}s", file=sys.stderr)
                time.sleep(backoff)
                continue

            print(f"ERROR: GitHub API request failed: {e} — {body[:200]}", file=sys.stderr)
            return []
        except Exception as e:
            # Network errors, timeouts and malformed payloads: retry with
            # exponential backoff, then give up gracefully.
            if not is_last_attempt:
                backoff = 2 ** attempt
                print(f"WARNING: Fetch attempt {attempt+1} failed: {e} — retrying in {backoff}s", file=sys.stderr)
                time.sleep(backoff)
                continue
            print(f"ERROR: All fetch attempts failed: {e}", file=sys.stderr)
            return []

    return []


def extract_repo_features(repo_data: Dict) -> Dict:
    """Extract structured fields for a trending repo.

    Args:
        repo_data: One repository object as returned in the ``items`` array
            of the GitHub search API.

    Returns:
        A flat dict with the repository's name, description, counters,
        language, topics, URL, timestamps, inferred ``key_features`` and the
        UTC time of the scan (``scanned_at``, ISO 8601).

    Fields that are missing *or* explicitly ``null`` in the API payload
    (GitHub reports ``"language": null`` for repositories without a detected
    language) are normalised to the empty default of the matching type, so
    consumers never have to special-case ``None``.
    """
    description = (repo_data.get("description") or "").strip()
    # ``.get(key, default)`` does not cover JSON null, hence the ``or``.
    topics = list(repo_data.get("topics") or [])

    # Infer key features from description and topics
    features = infer_features(description, topics)

    return {
        "name": repo_data.get("full_name") or "",
        "description": description,
        "stars": repo_data.get("stargazers_count") or 0,
        "forks": repo_data.get("forks_count") or 0,
        "open_issues": repo_data.get("open_issues_count") or 0,
        "language": repo_data.get("language") or "",
        "topics": topics,
        "url": repo_data.get("html_url") or "",
        "created_at": repo_data.get("created_at") or "",
        "updated_at": repo_data.get("updated_at") or "",
        "key_features": features,
        "scanned_at": datetime.now(timezone.utc).isoformat(),
    }


def infer_features(description: str, topics: List[str]) -> List[str]:
    """Infer notable capabilities/features from repo metadata.

    Looks for AI/ML-relevant capabilities in topics and description, then
    appends every non-generic topic verbatim.

    Matching rules:
      * *Stems* (e.g. ``"fine-tun"``, ``"train"``) match as case-insensitive
        substrings so that inflected forms are caught.
      * *Acronyms* (e.g. ``"rag"``, ``"dpo"``, ``"gui"``) must appear as a
        whole alphanumeric token. Plain substring matching would flag
        "storage" as RAG, "endpoint" as DPO and "guide" as GUI.

    Args:
        description: Repository description; ``None`` is treated as empty.
        topics: Repository topics; ``None`` is treated as empty and empty
            entries are ignored.

    Returns:
        Up to 10 feature labels, capability labels first, de-duplicated
        case-insensitively while preserving order.
    """
    description = description or ""
    topics = [t for t in (topics or []) if t]
    text = (description + " " + " ".join(topics)).lower()

    # Alphanumeric tokens of the text, used for whole-word acronym matching.
    words = set("".join(ch if ch.isalnum() else " " for ch in text).split())

    # label -> (substring stems, whole-word acronyms)
    capability_keywords = {
        "fine-tuning": (("fine-tun", "finetun"), ()),
        "agent framework": (("agent",), ()),
        "local/offline": (("local", "on-device", "offline"), ()),
        "quantized models": (("quantized", "quantization", "gguf", "gptq"), ()),
        "vision": (("vision", "multimodal", "image", "visual"), ()),
        "speech/audio": (("speech", "audio", "whisper"), ("tts",)),
        "retrieval/rag": (("retrieval", "embedding", "vector"), ("rag",)),
        "training": (("train",), ("sft", "dpo")),
        "gui/playground": (("playground", "webui", "interface"), ("gui",)),
        "sota": (("state-of-the-art", "latest"), ("sota",)),
    }

    features = [
        label
        for label, (stems, acronyms) in capability_keywords.items()
        if any(stem in text for stem in stems)
        or any(acronym in words for acronym in acronyms)
    ]

    # Also include non-generic topics as features
    generic_topics = {"ai", "ml", "machine-learning", "deep-learning", "llm", "python", "pytorch", "tensorflow"}
    features.extend(t for t in topics if t.lower() not in generic_topics)

    # Deduplicate while preserving order, return up to 10
    seen = set()
    unique = []
    for feature in features:
        key = feature.lower()
        if key not in seen:
            seen.add(key)
            unique.append(feature)
    return unique[:10]


def save_trending(repos: List[Dict], output_dir: str = "metrics/trending") -> str:
    """Save trending results to a dated JSON file.

    The file is named ``github-trending-YYYY-MM-DD.json`` (UTC date) and is
    overwritten if it already exists. ``output_dir`` is created when missing.

    Args:
        repos: Extracted repository records to persist.
        output_dir: Directory that receives the JSON file.

    Returns:
        The path of the written file.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Take a single timestamp so the filename date and "scanned_at" can never
    # disagree when the call straddles midnight UTC.
    now = datetime.now(timezone.utc)
    date_str = now.strftime("%Y-%m-%d")
    filename = output_path / f"github-trending-{date_str}.json"

    output_data = {
        "scanned_at": now.isoformat(),
        "count": len(repos),
        "repos": repos,
    }

    # ensure_ascii=False emits raw non-ASCII text, so the encoding must be
    # pinned; the platform default (e.g. cp1252) may not be able to encode it.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    return str(filename)


def main() -> int:
    """Command-line entry point: fetch, extract, save and summarise.

    Parses the CLI options, queries GitHub, converts each raw repository
    into the structured record, writes the dated JSON file and prints a
    short summary of the top five repositories.

    Returns:
        Process exit status (always 0; a failed fetch is reported as a
        warning on stderr and still produces an output file with an empty
        result set).
    """
    parser = argparse.ArgumentParser(
        description="Scan GitHub trending repositories in AI/ML"
    )
    parser.add_argument(
        "--language",
        help="Filter by programming language (e.g., python, rust, go)",
    )
    parser.add_argument(
        "--topic",
        help="Filter by GitHub topic (e.g., ai, machine-learning, llm)",
    )
    parser.add_argument(
        "--since",
        default="daily",
        choices=["daily", "weekly", "monthly"],
        help="Trending period (daily/weekly/monthly) — informational only",
    )
    parser.add_argument(
        "--output",
        # Honour TRENDING_OUTPUT_DIR like the other env-backed defaults.
        default=DEFAULT_OUTPUT_DIR,
        help=f"Output directory for results (default: {DEFAULT_OUTPUT_DIR})",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=DEFAULT_LIMIT,
        help=f"Maximum repos to fetch (default: {DEFAULT_LIMIT})",
    )
    parser.add_argument(
        "--min-stars",
        type=int,
        default=DEFAULT_MIN_STARS,
        help=f"Minimum star count for relevance (default: {DEFAULT_MIN_STARS})",
    )
    args = parser.parse_args()

    print(
        f"Fetching trending repos "
        f"(language={args.language or 'any'}, topic={args.topic or 'any'}, period={args.since})..."
    )

    repos_raw = fetch_trending_repos(
        language=args.language,
        topic=args.topic,
        min_stars=args.min_stars,
        limit=args.limit,
    )

    if not repos_raw:
        print("WARNING: No repos fetched — check network or rate limits", file=sys.stderr)

    repos = [extract_repo_features(r) for r in repos_raw]

    output_file = save_trending(repos, args.output)
    print(f"Saved {len(repos)} trending repos to {output_file}")

    # Brief human-readable summary
    if repos:
        print("\nTop repos:")
        for repo in repos[:5]:
            features_preview = ", ".join(repo["key_features"][:3])
            print(f"  ★ {repo['stars']:>7}  {repo['name']}")
            if repo["description"]:
                desc = repo["description"][:80]
                print(f"         {desc}{'...' if len(repo['description']) > 80 else ''}")
            if features_preview:
                print(f"         Features: {features_preview}")

    return 0


if __name__ == "__main__":
    # Propagate main()'s integer status as the process exit code.
    sys.exit(main())