Compare commits


2 Commits

Author      SHA1        Message / Date
Rockachopa  ec76e9fec3  test(scanner): unit tests for github_trending_scanner (2026-04-26 11:21:02 +00:00)
                        Checks failed: Test / pytest (pull_request), failing after 9s
            38c5862737  feat(scanner): add GitHub Trending Scanner CLI for AI/ML repos (2026-04-26 11:20:51 +00:00)
3 changed files with 383 additions and 131 deletions

New file: github_trending_scanner.py

@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""GitHub Trending Scanner — Scan trending repos in AI/ML.
Extracts: repo description, stars, key features (topics, inferred highlights).
Filters by language and/or topic. Outputs dated JSON for daily scan pipeline.
Usage:
python3 github_trending_scanner.py --language python --topic ai --output metrics/trending
python3 github_trending_scanner.py --topic machine-learning --limit 50
python3 github_trending_scanner.py --language rust --topic artificial-intelligence
"""
import argparse
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List, Dict
import urllib.request
import urllib.parse
import urllib.error
GITHUB_API_BASE = os.environ.get("GITHUB_API_BASE", "https://api.github.com")
DEFAULT_OUTPUT_DIR = os.environ.get("TRENDING_OUTPUT_DIR", "metrics/trending")
DEFAULT_LIMIT = int(os.environ.get("TRENDING_LIMIT", "30"))
DEFAULT_MIN_STARS = int(os.environ.get("TRENDING_MIN_STARS", "1000"))
def fetch_trending_repos(
language: Optional[str] = None,
topic: Optional[str] = None,
min_stars: int = DEFAULT_MIN_STARS,
limit: int = DEFAULT_LIMIT,
) -> List[Dict]:
"""Fetch trending-like repositories from GitHub using the search API.
    Unauthenticated requests to GitHub's search API are tightly rate-limited
    (roughly 10 search requests per minute, 60 core requests per hour).
    On rate limiting this function waits and retries; if every attempt fails
    it returns an empty list instead of raising.
"""
# Build search query: stars threshold + optional language/topic filters
query = f"stars:>{min_stars}"
if language:
query += f" language:{language}"
if topic:
query += f" topic:{topic}"
# Sort by stars descending as a proxy for trending/popular
params = {
"q": query,
"sort": "stars",
"order": "desc",
"per_page": min(limit, 100), # GitHub max per_page is 100
}
url = f"{GITHUB_API_BASE}/search/repositories?{urllib.parse.urlencode(params)}"
headers = {
"Accept": "application/vnd.github.v3+json",
"User-Agent": "Sovereign-Trending-Scanner/1.0",
}
for attempt in range(3):
try:
req = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(req, timeout=30) as resp:
if resp.status != 200:
raise RuntimeError(f"GitHub API returned {resp.status}")
data = json.loads(resp.read().decode("utf-8"))
return data.get("items", [])[:limit]
        except urllib.error.HTTPError as e:
            # Read the error body once; an HTTPError body can only be consumed a single time
            body = e.read().decode("utf-8", errors="replace")
            if e.code == 403 and "rate limit" in body.lower():
                reset_ts = int(e.headers.get("X-RateLimit-Reset", 0))
                wait_seconds = max(5, reset_ts - int(time.time()) + 5)
                print(f"Rate limit exceeded — waiting {wait_seconds}s (attempt {attempt+1}/3)...", file=sys.stderr)
                time.sleep(wait_seconds)
                continue
            print(f"ERROR: GitHub API request failed ({e.code}): {body[:200]}", file=sys.stderr)
            return []
except Exception as e:
if attempt < 2:
backoff = 2 ** attempt
print(f"WARNING: Fetch attempt {attempt+1} failed: {e} — retrying in {backoff}s", file=sys.stderr)
time.sleep(backoff)
continue
print(f"ERROR: All fetch attempts failed: {e}", file=sys.stderr)
return []
return []
def extract_repo_features(repo_data: Dict) -> Dict:
"""Extract structured fields for a trending repo."""
description = (repo_data.get("description") or "").strip()
topics = repo_data.get("topics", [])
# Infer key features from description and topics
features = infer_features(description, topics)
return {
"name": repo_data.get("full_name", ""),
"description": description,
"stars": repo_data.get("stargazers_count", 0),
"forks": repo_data.get("forks_count", 0),
"open_issues": repo_data.get("open_issues_count", 0),
"language": repo_data.get("language", ""),
"topics": topics,
"url": repo_data.get("html_url", ""),
"created_at": repo_data.get("created_at", ""),
"updated_at": repo_data.get("updated_at", ""),
"key_features": features,
"scanned_at": datetime.now(timezone.utc).isoformat(),
}
def infer_features(description: str, topics: List[str]) -> List[str]:
"""Infer notable capabilities/features from repo metadata.
Looks for AI/ML-relevant capabilities in topics and description.
"""
features = []
text = (description + " " + " ".join(topics)).lower()
# Domain capabilities (keys normalized to lowercase for consistency)
capability_keywords = {
"fine-tuning": ["fine-tun", "finetun"],
"agent framework": ["agent"],
"local/offline": ["local", "on-device", "offline"],
"quantized models": ["quantized", "quantization", "gguf", "gptq"],
"vision": ["vision", "multimodal", "image", "visual"],
"speech/audio": ["speech", "audio", "whisper", "tts"],
"retrieval/rag": ["rag", "retrieval", "embedding", "vector"],
"training": ["train", "training", "sft", "dpo"],
"gui/playground": ["gui", "playground", "webui", "interface"],
"sota": ["state-of-the-art", "sota", "latest"],
}
for label, keywords in capability_keywords.items():
if any(kw in text for kw in keywords):
features.append(label)
# Also include non-generic topics as features
generic_topics = {"ai", "ml", "machine-learning", "deep-learning", "llm", "python", "pytorch", "tensorflow"}
for topic in topics:
if topic.lower() not in generic_topics:
features.append(topic)
# Deduplicate while preserving order, return up to 10
seen = set()
unique = []
for f in features:
key = f.lower()
if key not in seen:
seen.add(key)
unique.append(f)
return unique[:10]
def save_trending(repos: List[Dict], output_dir: str = DEFAULT_OUTPUT_DIR) -> str:
"""Save trending results to a dated JSON file.
Returns the path of the written file.
"""
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
filename = output_path / f"github-trending-{date_str}.json"
output_data = {
"scanned_at": datetime.now(timezone.utc).isoformat(),
"count": len(repos),
"repos": repos,
}
    with open(filename, "w", encoding="utf-8") as f:
json.dump(output_data, f, indent=2, ensure_ascii=False)
return str(filename)
def main() -> int:
parser = argparse.ArgumentParser(
description="Scan GitHub trending repositories in AI/ML"
)
parser.add_argument(
"--language",
help="Filter by programming language (e.g., python, rust, go)",
)
parser.add_argument(
"--topic",
help="Filter by GitHub topic (e.g., ai, machine-learning, llm)",
)
parser.add_argument(
"--since",
default="daily",
choices=["daily", "weekly", "monthly"],
help="Trending period (daily/weekly/monthly) — informational only",
)
parser.add_argument(
"--output",
default="metrics/trending",
help="Output directory for results (default: metrics/trending)",
)
parser.add_argument(
"--limit",
type=int,
default=DEFAULT_LIMIT,
help=f"Maximum repos to fetch (default: {DEFAULT_LIMIT})",
)
parser.add_argument(
"--min-stars",
type=int,
default=DEFAULT_MIN_STARS,
help=f"Minimum star count for relevance (default: {DEFAULT_MIN_STARS})",
)
args = parser.parse_args()
print(
f"Fetching trending repos "
f"(language={args.language or 'any'}, topic={args.topic or 'any'}, period={args.since})..."
)
repos_raw = fetch_trending_repos(
language=args.language,
topic=args.topic,
min_stars=args.min_stars,
limit=args.limit,
)
if not repos_raw:
print("WARNING: No repos fetched — check network or rate limits", file=sys.stderr)
repos = [extract_repo_features(r) for r in repos_raw]
output_file = save_trending(repos, args.output)
print(f"Saved {len(repos)} trending repos to {output_file}")
# Brief human-readable summary
if repos:
print("\nTop repos:")
for repo in repos[:5]:
features_preview = ", ".join(repo["key_features"][:3])
print(f"{repo['stars']:>7} {repo['name']}")
if repo["description"]:
desc = repo["description"][:80]
print(f" {desc}{'...' if len(repo['description']) > 80 else ''}")
if features_preview:
print(f" Features: {features_preview}")
return 0
if __name__ == "__main__":
sys.exit(main())
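For context on how the dated output above feeds the daily scan pipeline, here is a minimal consumer sketch. It is not part of this change set: the metrics/trending path and the summary formatting are assumptions, and only the JSON keys (scanned_at, count, repos, plus the per-repo fields from extract_repo_features) come from the scanner itself.

#!/usr/bin/env python3
"""Hypothetical downstream reader for the scanner's dated JSON output."""
import json
from datetime import datetime, timezone
from pathlib import Path

# Assumed output location; matches the scanner's default --output directory
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
report = Path("metrics/trending") / f"github-trending-{date_str}.json"

if report.exists():
    data = json.loads(report.read_text(encoding="utf-8"))
    print(f"{data['count']} repos scanned at {data['scanned_at']}")
    for repo in data["repos"][:5]:
        # Each entry mirrors extract_repo_features(): name, stars, key_features, ...
        top_features = ", ".join(repo["key_features"][:3])
        print(f"{repo['stars']:>7}  {repo['name']}  ({top_features})")
else:
    print(f"No trending report found for {date_str}")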

New file: unit tests for github_trending_scanner

@@ -0,0 +1,125 @@
#!/usr/bin/env python3
"""Tests for github_trending_scanner.py — pure function validation.
Tests the feature inference, extraction, and output formatting logic
without relying on external GitHub API calls.
"""
import json
import sys
import tempfile
from pathlib import Path
# Add scripts dir to path for import
sys.path.insert(0, str(Path(__file__).resolve().parent))
from github_trending_scanner import (
extract_repo_features,
infer_features,
save_trending,
)
def test_infer_features_from_description():
"""Feature inference extracts capabilities from description text."""
desc = "A local, quantized LLM framework for fine-tuning and agent-based RAG with vision."
topics = ["ai", "llm"]
features = infer_features(desc, topics)
# Should include relevant capabilities (case-insensitive comparison)
expected_lower = {"fine-tuning", "local/offline", "quantized models", "agent framework", "vision", "retrieval/rag"}
actual_lower = set(f.lower() for f in features)
assert expected_lower.issubset(actual_lower), f"Missing features. Expected subset of {expected_lower}, got {actual_lower}"
print("PASS: infer_features_from_description")
def test_infer_features_from_topics_only():
"""Topics alone can drive feature detection."""
desc = ""
topics = ["computer-vision", "speech", "pytorch"]
features = infer_features(desc, topics)
# Non-generic topics should appear as features (topics preserved as-is)
assert "computer-vision" in features, f"Expected 'computer-vision' in {features}"
assert "speech" in features, f"Expected 'speech' in {features}"
    # Generic topics (pytorch) are filtered out by infer_features
print(f"PASS: infer_features_from_topics_only → {features}")
def test_extract_repo_features_produces_valid_structure():
"""extract_repo_features returns all required fields."""
mock_repo = {
"full_name": "example/repo",
"description": "An example repository",
"stargazers_count": 1234,
"forks_count": 56,
"open_issues_count": 7,
"language": "Python",
"topics": ["ai", "llm"],
"html_url": "https://github.com/example/repo",
"created_at": "2025-01-01T00:00:00Z",
"updated_at": "2026-01-01T00:00:00Z",
}
result = extract_repo_features(mock_repo)
assert result["name"] == "example/repo"
assert result["description"] == "An example repository"
assert result["stars"] == 1234
assert isinstance(result["key_features"], list)
assert "scanned_at" in result
assert result["url"] == "https://github.com/example/repo"
print("PASS: extract_repo_features_structure")
def test_save_trending_creates_dated_json():
"""save_trending writes a valid JSON file with the expected schema."""
repos = [
{
"name": "test/repo",
"description": "Test repository",
"stars": 999,
"language": "Python",
"topics": ["test"],
"key_features": ["testing"],
"scanned_at": "2026-04-26T00:00:00+00:00",
}
]
with tempfile.TemporaryDirectory() as tmp:
output_file = save_trending(repos, output_dir=tmp)
path = Path(output_file)
assert path.exists(), f"Output file not created: {output_file}"
with open(path) as f:
data = json.load(f)
assert "scanned_at" in data
assert data["count"] == 1
assert isinstance(data["repos"], list)
assert data["repos"][0]["name"] == "test/repo"
print(f"PASS: save_trending → {output_file}")
def test_save_trending_respects_output_dir_creation():
"""Output directory is created if it doesn't exist."""
repos = []
with tempfile.TemporaryDirectory() as tmp:
nested = Path(tmp) / "nested" / "trending"
assert not nested.exists()
output_file = save_trending(repos, output_dir=str(nested))
assert nested.exists()
assert Path(output_file).exists()
print("PASS: output_dir_creation")
if __name__ == "__main__":
test_infer_features_from_description()
test_infer_features_from_topics_only()
test_extract_repo_features_produces_valid_structure()
test_save_trending_creates_dated_json()
test_save_trending_respects_output_dir_creation()
print("\nAll github_trending_scanner tests passed.")

Deleted file: Doc Link Validator script

@@ -1,131 +0,0 @@
#!/usr/bin/env python3
"""
Doc Link Validator — Extract and verify all documentation links.
Issue: #103 — 4.8: Doc Link Validator
Acceptance:
Extracts links from docs | HTTP HEAD check | Reports broken links
(Weekly cron/CI integration out of scope for this minimal script)
"""
import argparse
import json  # needed for the --json report emitted in main()
import re
import sys
from pathlib import Path
from typing import List, Tuple, Optional
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse
# Markdown link patterns
INLINE_LINK_RE = re.compile(r'\[[^\]]*\]\(([^)\s]+)(?:\s+"[^"]*")?\)')
AUTOLINK_RE = re.compile(r'<([^>]+)>')
def extract_links(content: str) -> List[str]:
urls = [m.group(1) for m in INLINE_LINK_RE.finditer(content)]
urls += [m.group(1) for m in AUTOLINK_RE.finditer(content)]
return urls
def is_ignorable(url: str, ignore_prefixes: List[str]) -> bool:
p = urlparse(url)
if p.scheme not in ('http', 'https'):
return True
host = p.netloc.split(':')[0]
if host in ('localhost', '127.0.0.1', '::1'):
return True
# Private IPv4 ranges
if re.match(r'^(10\.|192\.168\.|172\.(1[6-9]|2[0-9]|3[01])\.)', host):
return True
for prefix in ignore_prefixes:
if url.startswith(prefix):
return True
return False
def check_url(url: str, timeout: float = 8.0) -> Tuple[bool, Optional[int], str]:
try:
req = Request(url, method='HEAD')
req.add_header('User-Agent', 'DocLinkValidator/1.0')
try:
with urlopen(req, timeout=timeout) as resp:
return True, resp.getcode(), "OK"
except HTTPError as e:
if e.code in (405, 403, 400):
req2 = Request(url, method='GET')
req2.add_header('User-Agent', 'DocLinkValidator/1.0')
req2.add_header('Range', 'bytes=0-1')
with urlopen(req2, timeout=timeout) as resp2:
return True, resp2.getcode(), "OK via GET"
return False, e.code, e.reason
except URLError as e:
return False, None, str(e.reason) if hasattr(e, 'reason') else str(e)
except Exception as e:
return False, None, str(e)
def main() -> int:
p = argparse.ArgumentParser(description="Validate documentation links")
p.add_argument('--root', default='.', help='Repository root')
p.add_argument('--fail-on-broken', action='store_true', help='Exit non-zero if broken links found')
p.add_argument('--json', action='store_true', help='Emit JSON report')
p.add_argument('--ignore', default='', help='Comma-separated URL prefixes to ignore')
args = p.parse_args()
root = Path(args.root).resolve()
ignore_prefixes = [x.strip() for x in args.ignore.split(',') if x.strip()]
md_files = list(root.rglob('*.md'))
if not md_files:
print("No markdown files found.", file=sys.stderr)
return 1
print(f"Scanning {len(md_files)} markdown files")
all_links: List[Tuple[Path, str]] = []
for md in md_files:
content = md.read_text(errors='replace')
for m in INLINE_LINK_RE.finditer(content):
all_links.append((md, m.group(1)))
for m in AUTOLINK_RE.finditer(content):
all_links.append((md, m.group(1)))
print(f"Raw link occurrences: {len(all_links)}")
# De-duplicate by URL, keep first file context
first_file: dict[str, Path] = {}
unique_urls: List[str] = []
for file, url in all_links:
if url not in first_file:
first_file[url] = file
unique_urls.append(url)
print(f"Unique URLs to check: {len(unique_urls)}")
broken: List[dict] = []
ok_count = 0
for url in unique_urls:
if is_ignorable(url, ignore_prefixes):
continue
ok, code, reason = check_url(url)
if ok:
ok_count += 1
else:
broken.append({"url": url, "file": str(first_file[url]), "error": reason})
print(f"OK: {ok_count} Broken: {len(broken)}")
if broken:
print("\nBroken links:")
for b in broken:
print(f" [{b['file']}] {b['url']}{b['error']}")
if args.json:
print(json.dumps({"scanned": len(unique_urls), "ok": ok_count,
"broken": len(broken), "broken_links": broken}, indent=2))
return 1 if (args.fail_on_broken and broken) else 0
if __name__ == '__main__':
sys.exit(main())