#!/usr/bin/env python3 """GitHub Trending Scanner — Scan trending repos in AI/ML. Extracts: repo description, stars, key features (topics, inferred highlights). Filters by language and/or topic. Outputs dated JSON for daily scan pipeline. Usage: python3 github_trending_scanner.py --language python --topic ai --output metrics/trending python3 github_trending_scanner.py --topic machine-learning --limit 50 python3 github_trending_scanner.py --language rust --topic artificial-intelligence """ import argparse import json import os import sys import time from datetime import datetime, timezone from pathlib import Path from typing import Optional, List, Dict import urllib.request import urllib.parse import urllib.error GITHUB_API_BASE = os.environ.get("GITHUB_API_BASE", "https://api.github.com") DEFAULT_OUTPUT_DIR = os.environ.get("TRENDING_OUTPUT_DIR", "metrics/trending") DEFAULT_LIMIT = int(os.environ.get("TRENDING_LIMIT", "30")) DEFAULT_MIN_STARS = int(os.environ.get("TRENDING_MIN_STARS", "1000")) def fetch_trending_repos( language: Optional[str] = None, topic: Optional[str] = None, min_stars: int = DEFAULT_MIN_STARS, limit: int = DEFAULT_LIMIT, ) -> List[Dict]: """Fetch trending-like repositories from GitHub using the search API. GitHub's public search API is unauthenticated-rate-limited (60 req/hr). This function retries on rate-limit backoff and falls back gracefully. """ # Build search query: stars threshold + optional language/topic filters query = f"stars:>{min_stars}" if language: query += f" language:{language}" if topic: query += f" topic:{topic}" # Sort by stars descending as a proxy for trending/popular params = { "q": query, "sort": "stars", "order": "desc", "per_page": min(limit, 100), # GitHub max per_page is 100 } url = f"{GITHUB_API_BASE}/search/repositories?{urllib.parse.urlencode(params)}" headers = { "Accept": "application/vnd.github.v3+json", "User-Agent": "Sovereign-Trending-Scanner/1.0", } for attempt in range(3): try: req = urllib.request.Request(url, headers=headers) with urllib.request.urlopen(req, timeout=30) as resp: if resp.status != 200: raise RuntimeError(f"GitHub API returned {resp.status}") data = json.loads(resp.read().decode("utf-8")) return data.get("items", [])[:limit] except urllib.error.HTTPError as e: if e.code == 403: # Check for rate limit message body = e.read().decode("utf-8", errors="replace").lower() if "rate limit" in body or "api rate limit exceeded" in body: reset_ts = int(e.headers.get("X-RateLimit-Reset", 0)) wait_seconds = max(5, reset_ts - int(time.time()) + 5) print(f"Rate limit exceeded — waiting {wait_seconds}s (attempt {attempt+1}/3)...", file=sys.stderr) time.sleep(wait_seconds) continue print(f"ERROR: GitHub API request failed: {e} — {e.read().decode('utf-8', errors='replace')[:200]}", file=sys.stderr) return [] except Exception as e: if attempt < 2: backoff = 2 ** attempt print(f"WARNING: Fetch attempt {attempt+1} failed: {e} — retrying in {backoff}s", file=sys.stderr) time.sleep(backoff) continue print(f"ERROR: All fetch attempts failed: {e}", file=sys.stderr) return [] return [] def extract_repo_features(repo_data: Dict) -> Dict: """Extract structured fields for a trending repo.""" description = (repo_data.get("description") or "").strip() topics = repo_data.get("topics", []) # Infer key features from description and topics features = infer_features(description, topics) return { "name": repo_data.get("full_name", ""), "description": description, "stars": repo_data.get("stargazers_count", 0), "forks": repo_data.get("forks_count", 0), "open_issues": repo_data.get("open_issues_count", 0), "language": repo_data.get("language", ""), "topics": topics, "url": repo_data.get("html_url", ""), "created_at": repo_data.get("created_at", ""), "updated_at": repo_data.get("updated_at", ""), "key_features": features, "scanned_at": datetime.now(timezone.utc).isoformat(), } def infer_features(description: str, topics: List[str]) -> List[str]: """Infer notable capabilities/features from repo metadata. Looks for AI/ML-relevant capabilities in topics and description. """ features = [] text = (description + " " + " ".join(topics)).lower() # Domain capabilities (keys normalized to lowercase for consistency) capability_keywords = { "fine-tuning": ["fine-tun", "finetun"], "agent framework": ["agent"], "local/offline": ["local", "on-device", "offline"], "quantized models": ["quantized", "quantization", "gguf", "gptq"], "vision": ["vision", "multimodal", "image", "visual"], "speech/audio": ["speech", "audio", "whisper", "tts"], "retrieval/rag": ["rag", "retrieval", "embedding", "vector"], "training": ["train", "training", "sft", "dpo"], "gui/playground": ["gui", "playground", "webui", "interface"], "sota": ["state-of-the-art", "sota", "latest"], } for label, keywords in capability_keywords.items(): if any(kw in text for kw in keywords): features.append(label) # Also include non-generic topics as features generic_topics = {"ai", "ml", "machine-learning", "deep-learning", "llm", "python", "pytorch", "tensorflow"} for topic in topics: if topic.lower() not in generic_topics: features.append(topic) # Deduplicate while preserving order, return up to 10 seen = set() unique = [] for f in features: key = f.lower() if key not in seen: seen.add(key) unique.append(f) return unique[:10] def save_trending(repos: List[Dict], output_dir: str = "metrics/trending") -> str: """Save trending results to a dated JSON file. Returns the path of the written file. """ output_path = Path(output_dir) output_path.mkdir(parents=True, exist_ok=True) date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d") filename = output_path / f"github-trending-{date_str}.json" output_data = { "scanned_at": datetime.now(timezone.utc).isoformat(), "count": len(repos), "repos": repos, } with open(filename, "w") as f: json.dump(output_data, f, indent=2, ensure_ascii=False) return str(filename) def main() -> None: parser = argparse.ArgumentParser( description="Scan GitHub trending repositories in AI/ML" ) parser.add_argument( "--language", help="Filter by programming language (e.g., python, rust, go)", ) parser.add_argument( "--topic", help="Filter by GitHub topic (e.g., ai, machine-learning, llm)", ) parser.add_argument( "--since", default="daily", choices=["daily", "weekly", "monthly"], help="Trending period (daily/weekly/monthly) — informational only", ) parser.add_argument( "--output", default="metrics/trending", help="Output directory for results (default: metrics/trending)", ) parser.add_argument( "--limit", type=int, default=DEFAULT_LIMIT, help=f"Maximum repos to fetch (default: {DEFAULT_LIMIT})", ) parser.add_argument( "--min-stars", type=int, default=DEFAULT_MIN_STARS, help=f"Minimum star count for relevance (default: {DEFAULT_MIN_STARS})", ) args = parser.parse_args() print( f"Fetching trending repos " f"(language={args.language or 'any'}, topic={args.topic or 'any'}, period={args.since})..." ) repos_raw = fetch_trending_repos( language=args.language, topic=args.topic, min_stars=args.min_stars, limit=args.limit, ) if not repos_raw: print("WARNING: No repos fetched — check network or rate limits", file=sys.stderr) repos = [extract_repo_features(r) for r in repos_raw] output_file = save_trending(repos, args.output) print(f"Saved {len(repos)} trending repos to {output_file}") # Brief human-readable summary if repos: print("\nTop repos:") for repo in repos[:5]: features_preview = ", ".join(repo["key_features"][:3]) print(f" ★ {repo['stars']:>7} {repo['name']}") if repo["description"]: desc = repo["description"][:80] print(f" {desc}{'...' if len(repo['description']) > 80 else ''}") if features_preview: print(f" Features: {features_preview}") return 0 if __name__ == "__main__": sys.exit(main())