# compounding-intelligence/scripts/citation_tracker.py

#!/usr/bin/env python3
"""
Citation Tracker — Monitor citations of key papers.
Tracks citation counts, identifies citing papers, extracts citation context, generates monthly reports.

Issue: #140 (7.8)
Categories: fact, pattern
"""

import argparse
import json
import sys
import urllib.request
import urllib.error
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional

# Absolute directory containing this script; the other paths are derived from it.
SCRIPT_DIR = Path(__file__).parent.absolute()
# "knowledge" and "metrics" are siblings of the script's directory.
KNOWLEDGE_DIR = SCRIPT_DIR.parent / "knowledge"
METRICS_DIR = SCRIPT_DIR.parent / "metrics"
# JSON knowledge index read by load_index() and written by save_index().
INDEX_PATH = KNOWLEDGE_DIR / "index.json"

# Semantic Scholar API (free, no key required for basic lookups)
# Base URL that fetch_paper() and fetch_citations() append "/paper/..." to.
S2_API_BASE = "https://api.semanticscholar.org/graph/v1"

def fetch_paper(s2_id: str) -> Optional[Dict]:
    """Fetch paper metadata from Semantic Scholar.

    Args:
        s2_id: Paper identifier inserted into the ``/paper/{id}`` URL path.

    Returns:
        The decoded JSON object on success, or ``None`` when the request
        fails or the body is not a JSON object. Failures are reported as a
        warning on stderr so one bad paper does not abort a whole run.
    """
    # Local import: needed only for the exception base class below.
    import http.client

    url = f"{S2_API_BASE}/paper/{s2_id}?fields=title,year,citationCount,externalIds,publicationVenue,publicationTypes"
    try:
        with urllib.request.urlopen(url, timeout=10) as resp:
            payload = json.loads(resp.read())
    except (OSError, ValueError, http.client.HTTPException) as e:
        # OSError covers URLError/HTTPError plus socket timeouts raised while
        # reading the body; ValueError covers json.JSONDecodeError;
        # HTTPException covers truncated responses and invalid URLs.
        print(f"Warning: Failed to fetch {s2_id}: {e}", file=sys.stderr)
        return None

    if not isinstance(payload, dict):
        # Callers use dict methods on the result, so reject anything else.
        print(f"Warning: Failed to fetch {s2_id}: unexpected response format", file=sys.stderr)
        return None
    return payload

def fetch_citations(s2_id: str, limit: int = 50) -> List[Dict]:
    """Fetch citing papers for a paper from Semantic Scholar.

    Args:
        s2_id: Paper identifier inserted into the ``/paper/{id}/citations``
            URL path.
        limit: Maximum number of citation rows to request.

    Returns:
        The ``citingPaper`` objects from the response, in the order the API
        returned them. Returns an empty list (after a warning on stderr) when
        the request fails or the body cannot be decoded.
    """
    # Local import: needed only for the exception base class below.
    import http.client

    url = f"{S2_API_BASE}/paper/{s2_id}/citations?fields=title,year,authors,publicationVenue,publicationTypes&limit={limit}"
    try:
        with urllib.request.urlopen(url, timeout=15) as resp:
            data = json.loads(resp.read())
    except (OSError, ValueError, http.client.HTTPException) as e:
        # OSError covers URLError/HTTPError and read timeouts; ValueError
        # covers invalid JSON; HTTPException covers truncated responses.
        print(f"Warning: Failed to fetch citations for {s2_id}: {e}", file=sys.stderr)
        return []

    rows = data.get("data") if isinstance(data, dict) else None
    # Skip rows without a usable "citingPaper" object instead of raising
    # KeyError and losing every other citation in the batch.
    return [
        row["citingPaper"]
        for row in rows or []
        if isinstance(row, dict) and isinstance(row.get("citingPaper"), dict)
    ]

def load_key_papers() -> List[Dict]:
    """Load the key papers list from ``knowledge/global/citations.yaml``.

    Returns:
        One dict per usable entry under the top-level ``papers`` key, with
        keys ``id`` (the entry's ``s2_id``), ``title`` and ``notes``. An
        empty file or an empty ``papers`` list yields ``[]``.

    Exits:
        Calls ``sys.exit(1)`` when the config file is missing or its top
        level is not a mapping.
    """
    config_path = KNOWLEDGE_DIR / "global" / "citations.yaml"
    if not config_path.exists():
        print(f"Error: {config_path} not found. Create it with key papers list.", file=sys.stderr)
        sys.exit(1)

    # Third-party PyYAML, imported only once the config file is known to exist.
    import yaml
    # Explicit UTF-8: paper titles may contain non-ASCII characters and the
    # platform default encoding is not reliable for them.
    with open(config_path, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    if data is None:
        # safe_load returns None for an empty document.
        data = {}
    if not isinstance(data, dict):
        print(f"Error: {config_path} must contain a mapping with a 'papers' list.", file=sys.stderr)
        sys.exit(1)

    papers = []
    # "papers:" with no items parses to None, hence the `or []`.
    for entry in data.get("papers") or []:
        if not isinstance(entry, dict) or not entry.get("s2_id"):
            # Without an id there is nothing to look up; skip just this entry.
            print(f"Warning: Skipping entry without s2_id in {config_path}: {entry!r}", file=sys.stderr)
            continue
        papers.append({
            "id": entry["s2_id"],
            "title": entry.get("title") or "Unknown",
            "notes": entry.get("notes") or "",
        })
    return papers

def load_index() -> Dict:
    """Return the knowledge index stored at INDEX_PATH.

    When the index file does not exist yet, a fresh empty index skeleton is
    returned instead (nothing is written to disk).
    """
    if not INDEX_PATH.exists():
        return {"version": 1, "last_updated": "", "total_facts": 0, "facts": []}

    with INDEX_PATH.open() as handle:
        stored = json.load(handle)
    return stored

def save_index(index: Dict) -> None:
    """Write the knowledge index to INDEX_PATH as indented JSON.

    The knowledge directory is created first if it does not exist.
    """
    KNOWLEDGE_DIR.mkdir(exist_ok=True, parents=True)
    with INDEX_PATH.open("w") as handle:
        json.dump(index, handle, indent=2)

def add_citation_fact(index: Dict, fact: str, repo: str, confidence: float,
                     tags: List[str], source_count: int = 1) -> None:
    """Record a citation fact in the index, refreshing it if already present.

    Facts are stored under the ``global`` domain and ``fact`` category with
    ids of the form ``global:fact:NNN``. If a ``global:fact`` entry with the
    same text already exists, only its ``last_confirmed`` date is refreshed,
    so repeated runs do not pile up duplicates. Otherwise a new entry is
    appended with the next free sequence number.

    Args:
        index: Knowledge index dict; mutated in place. A missing ``facts``
            list is created.
        fact: Fact sentence to store.
        repo: Accepted for call compatibility; not stored in the entry.
        confidence: Confidence score stored on a new entry.
        tags: Tags stored on a new entry.
        source_count: Source count stored on a new entry.
    """
    domain = "global"
    category = "fact"
    prefix = f"{domain}:{category}:"

    # Take one timestamp so the date fields and last_updated always agree.
    now = datetime.now(timezone.utc)
    today = now.strftime("%Y-%m-%d")

    facts = index.setdefault("facts", [])
    highest_seq = 0
    for existing in facts:
        existing_id = str(existing.get("id", ""))
        if not existing_id.startswith(prefix):
            continue
        if existing.get("fact") == fact:
            # Already known: re-confirm instead of appending a duplicate.
            existing["last_confirmed"] = today
            index["total_facts"] = len(facts)
            index["last_updated"] = now.isoformat()
            return
        try:
            highest_seq = max(highest_seq, int(existing_id.split(":")[-1]))
        except ValueError:
            # Non-numeric suffix: ignore it when choosing the next number.
            continue

    facts.append({
        "id": f"{prefix}{highest_seq + 1:03d}",
        "fact": fact,
        "category": category,
        "domain": domain,
        "confidence": confidence,
        "tags": tags,
        "source_count": source_count,
        "first_seen": today,
        "last_confirmed": today,
    })
    index["total_facts"] = len(facts)
    index["last_updated"] = now.isoformat()

def update_citation_data() -> None:
    """Update citation counts and citing-paper facts for all key papers.

    For each paper from ``load_key_papers()`` this fetches its metadata,
    records a citation-count fact, then fetches up to 20 citing papers and
    records one fact per citing paper. Papers whose metadata cannot be
    fetched are skipped. The index is saved once after all papers have been
    processed.
    """
    papers = load_key_papers()
    index = load_index()
    updated = 0

    for paper in papers:
        s2_id = paper["id"]
        title = paper["title"]

        # Fetch current paper data
        data = fetch_paper(s2_id)
        if not data:
            continue

        # `or` fallbacks: the API may return explicit nulls for these fields.
        citation_count = data.get("citationCount") or 0
        external_ids = data.get("externalIds") or {}
        arxiv_id = external_ids.get("ArXiv")

        # Add citation count fact (high confidence - directly from API)
        count_fact = f"Paper '{title}' (S2:{s2_id}) has {citation_count} citations as of {datetime.now(timezone.utc).strftime('%Y-%m-%d')}"
        if arxiv_id:
            count_fact += f" [arXiv:{arxiv_id}]"

        add_citation_fact(
            index=index,
            fact=count_fact,
            repo="compounding-intelligence",
            confidence=0.95,
            tags=["citation", "tracking", "paper", s2_id],
            source_count=1
        )
        updated += 1

        # Fetch citing papers (limited batch)
        citations = fetch_citations(s2_id, limit=20)
        for citation in citations:
            if not isinstance(citation, dict):
                # Nothing usable to describe; skip rather than crash.
                continue
            citing_title = citation.get("title") or "Unknown"
            citing_year = citation.get("year") or "Unknown year"
            authors = citation.get("authors") or []
            # Name at most three authors, then abbreviate the rest.
            author_names = [(a or {}).get("name") or "" for a in authors[:3]]
            if len(authors) > 3:
                author_names.append("et al.")

            cite_fact = f"Paper '{citing_title}' ({', '.join(author_names)}, {citing_year}) cites '{title}'"
            add_citation_fact(
                index=index,
                fact=cite_fact,
                repo="compounding-intelligence",
                confidence=0.8,
                tags=["citation", "citing-paper", s2_id],
                source_count=1
            )

        print(f"Updated: {title} — {citation_count} citations, {len(citations)} recent")

    save_index(index)
    print(f"\nUpdated {updated} papers. Total facts in index: {index['total_facts']}")

def _citation_fact_paper_id(fact: Dict) -> str:
    """Return the tracked paper id carried in a citation fact's tags.

    Citation facts written by this script carry the tracked paper's id as
    their last tag, after generic tags such as "citation".
    """
    generic_tags = {"citation", "tracking", "paper", "citing-paper"}
    for tag in reversed(fact.get("tags") or []):
        if tag not in generic_tags:
            return str(tag)
    return "unknown"


def _citation_fact_paper_title(fact: Dict, paper_id: str) -> Optional[str]:
    """Best-effort recovery of the tracked paper's title from a fact sentence."""
    text = str(fact.get("fact", ""))
    head = "Paper '"
    marker = f"' (S2:{paper_id}) has "
    # Count facts look like: Paper '<title>' (S2:<id>) has N citations ...
    if text.startswith(head) and marker in text:
        return text[len(head):text.index(marker)]
    # Citing-paper facts end with: ... cites '<title>'
    _, sep, tail = text.rpartition(" cites '")
    if sep and tail.endswith("'"):
        return tail[:-1]
    return None


def generate_monthly_report(month: Optional[str] = None) -> str:
    """Generate a monthly citation report and save it under METRICS_DIR.

    Args:
        month: Month to report on as ``YYYY-MM``; defaults to the current
            UTC month.

    Returns:
        The report as Markdown text. The same text is written to
        ``citation_report_<YYYY-MM>.md`` in the metrics directory.

    Raises:
        ValueError: If ``month`` is not of the form ``YYYY-MM`` with a month
            between 01 and 12.
    """
    from collections import defaultdict

    requested = month or datetime.now(timezone.utc).strftime("%Y-%m")
    try:
        year, mon = map(int, requested.split("-"))
    except ValueError:
        raise ValueError(f"Invalid month {requested!r}; expected YYYY-MM") from None
    if not 1 <= mon <= 12:
        raise ValueError(f"Invalid month {requested!r}; expected YYYY-MM")
    # Normalise so "2024-1" and "2024-01" select the same facts and file name.
    target_month = f"{year:04d}-{mon:02d}"

    index = load_index()
    # Only citation-tagged facts belong in this report; the index may hold
    # other facts. `or ""` tolerates a null last_confirmed.
    monthly_facts = [
        fact for fact in index.get("facts", [])
        if "citation" in (fact.get("tags") or [])
        and str(fact.get("last_confirmed") or "").startswith(target_month)
    ]

    # Build report
    lines = []
    lines.append(f"# Citation Tracker Monthly Report — {target_month}")
    lines.append("")
    lines.append(f"Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
    lines.append(f"Total citation facts this month: {len(monthly_facts)}")
    lines.append("")

    # Group by tracked paper (not by fact text, which would give every fact
    # its own one-item section).
    by_paper = defaultdict(list)
    for fact in monthly_facts:
        by_paper[_citation_fact_paper_id(fact)].append(fact)

    for paper_id, facts in by_paper.items():
        # Use the first title that can be recovered from the group's facts.
        paper_title = next(
            (t for t in (_citation_fact_paper_title(f, paper_id) for f in facts) if t),
            None,
        )
        heading = f"{paper_title} (S2:{paper_id})" if paper_title else f"S2:{paper_id}"
        lines.append(f"## {heading}")
        for f in facts:
            lines.append(f"- {f.get('fact', '')} (confidence: {f.get('confidence', 'n/a')})")
        lines.append("")

    report = "\n".join(lines)

    # Save report. Explicit UTF-8: the text contains an em dash and may
    # contain non-ASCII paper titles.
    METRICS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = METRICS_DIR / f"citation_report_{target_month}.md"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)

    print(f"Monthly report saved to: {report_path}")
    return report

def main() -> None:
    """Parse command-line flags and run the requested actions.

    ``--update`` refreshes citation data and ``--report`` generates the
    monthly report (for ``--month`` if given). When both are passed, the
    update runs first so the report reflects the fresh data. With neither
    flag, the help text is printed.
    """
    parser = argparse.ArgumentParser(description="Citation Tracker — Monitor key paper citations")
    parser.add_argument("--update", action="store_true", help="Fetch latest citation data")
    parser.add_argument("--report", action="store_true", help="Generate monthly report")
    parser.add_argument("--month", type=str, help="Month for report (YYYY-MM), defaults to current")
    args = parser.parse_args()

    if not (args.update or args.report):
        parser.print_help()
        return

    # Independent checks rather than if/elif, so "--update --report" does
    # both instead of silently dropping the report.
    if args.update:
        update_citation_data()
    if args.report:
        generate_monthly_report(args.month)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()