#!/usr/bin/env python3
"""
Citation Tracker — Monitor citations of key papers.

Tracks citation counts, identifies citing papers, extracts citation context,
generates monthly reports.

Issue: #140 (7.8)
Categories: fact, pattern
"""

import argparse
import json
import re
import sys
import urllib.error
import urllib.request
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

SCRIPT_DIR = Path(__file__).parent.absolute()
KNOWLEDGE_DIR = SCRIPT_DIR.parent / "knowledge"
METRICS_DIR = SCRIPT_DIR.parent / "metrics"
INDEX_PATH = KNOWLEDGE_DIR / "index.json"

# Semantic Scholar API (free, no key required for basic lookups)
S2_API_BASE = "https://api.semanticscholar.org/graph/v1"

# Failures treated as "lookup unavailable" rather than fatal.
# OSError covers urllib.error.URLError/HTTPError and socket timeouts raised
# while reading the body; ValueError covers json.JSONDecodeError and
# undecodable response bytes.
_FETCH_ERRORS = (urllib.error.HTTPError, urllib.error.URLError, OSError, ValueError)

# Patterns matching the two fact formats written by update_citation_data();
# used by the monthly report to recover the tracked paper's title.
_COUNT_FACT_RE = re.compile(
    r"^Paper '(?P<title>.*)' \(S2:.*?\) has \S+ citations as of ", re.DOTALL
)
_CITING_FACT_RE = re.compile(r" cites '(?P<title>.*)'$", re.DOTALL)

# Report heading for citation facts whose text matches neither pattern.
_UNGROUPED_HEADING = "Other"


def _get_json(url: str, timeout: float) -> Any:
    """Fetch *url* and return the decoded JSON body."""
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return json.loads(resp.read())


def fetch_paper(s2_id: str) -> Optional[Dict]:
    """Fetch paper metadata from Semantic Scholar.

    Args:
        s2_id: Paper identifier, inserted as-is into the request path.

    Returns:
        The decoded JSON response, or ``None`` when the request fails or the
        body is not valid JSON (a warning is printed to stderr).
    """
    url = (
        f"{S2_API_BASE}/paper/{s2_id}"
        "?fields=title,year,citationCount,externalIds,publicationVenue,publicationTypes"
    )
    try:
        return _get_json(url, timeout=10)
    except _FETCH_ERRORS as e:
        print(f"Warning: Failed to fetch {s2_id}: {e}", file=sys.stderr)
        return None


def fetch_citations(s2_id: str, limit: int = 50) -> List[Dict]:
    """Fetch recent citing papers from Semantic Scholar.

    Args:
        s2_id: Paper identifier, inserted as-is into the request path.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        The ``citingPaper`` object of each returned row; rows without one are
        skipped. An empty list is returned when the request fails (a warning
        is printed to stderr).
    """
    url = (
        f"{S2_API_BASE}/paper/{s2_id}/citations"
        f"?fields=title,year,authors,publicationVenue,publicationTypes&limit={limit}"
    )
    try:
        payload = _get_json(url, timeout=15)
    except _FETCH_ERRORS as e:
        print(f"Warning: Failed to fetch citations for {s2_id}: {e}", file=sys.stderr)
        return []
    # "data" may be absent or null, and individual rows may lack a citing paper.
    rows = payload.get("data") or []
    return [row["citingPaper"] for row in rows if row.get("citingPaper")]


def load_key_papers() -> List[Dict]:
    """Load key papers list from citations.yaml.

    Exits the process with status 1 when the config file does not exist.

    Returns:
        One dict per configured paper with the keys ``id`` (from ``s2_id``),
        ``title`` (default ``"Unknown"``) and ``notes`` (default ``""``).

    Raises:
        KeyError: If an entry has no ``s2_id``.
    """
    config_path = KNOWLEDGE_DIR / "global" / "citations.yaml"
    if not config_path.exists():
        print(
            f"Error: {config_path} not found. Create it with key papers list.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Imported here rather than at module level, so only this function
    # requires the yaml package.
    import yaml

    with open(config_path, encoding="utf-8") as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    return [
        {
            "id": entry["s2_id"],
            "title": entry.get("title", "Unknown"),
            "notes": entry.get("notes", ""),
        }
        for entry in data.get("papers") or []
    ]


def load_index() -> Dict:
    """Load or initialize knowledge index.

    Returns:
        The parsed contents of ``INDEX_PATH``, or a fresh empty index when the
        file does not exist.
    """
    if INDEX_PATH.exists():
        with open(INDEX_PATH, encoding="utf-8") as f:
            return json.load(f)
    return {"version": 1, "last_updated": "", "total_facts": 0, "facts": []}


def save_index(index: Dict) -> None:
    """Save knowledge index to ``INDEX_PATH``, creating its directory if needed."""
    KNOWLEDGE_DIR.mkdir(parents=True, exist_ok=True)
    with open(INDEX_PATH, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2)


def add_citation_fact(index: Dict, fact: str, repo: str, confidence: float,
                      tags: List[str], source_count: int = 1) -> None:
    """Add a citation fact to the index, or re-confirm an identical one.

    If a ``global:fact:`` entry with exactly the same text already exists, its
    ``last_confirmed`` date is refreshed instead of appending a duplicate, so
    repeated update runs do not grow the index with copies of the same fact.

    Args:
        index: Knowledge index; mutated in place.
        fact: Fact text.
        repo: Accepted for interface compatibility; not stored in the entry.
        confidence: Confidence score stored with a new entry.
        tags: Tags stored with a new entry.
        source_count: Source count stored with a new entry.
    """
    domain = "global"
    category = "fact"
    prefix = f"{domain}:{category}:"

    now = datetime.now(timezone.utc)
    today = now.strftime("%Y-%m-%d")
    facts = index.setdefault("facts", [])

    # Single pass: look for an identical fact and find the highest sequence
    # number already used under this prefix.
    existing = None
    highest_seq = 0
    for entry in facts:
        entry_id = str(entry.get("id", ""))
        if not entry_id.startswith(prefix):
            continue
        if existing is None and entry.get("fact") == fact:
            existing = entry
        try:
            highest_seq = max(highest_seq, int(entry_id.rsplit(":", 1)[-1]))
        except ValueError:
            # Non-numeric suffix: not part of the numbered sequence.
            continue

    if existing is not None:
        existing["last_confirmed"] = today
    else:
        facts.append({
            "id": f"{prefix}{highest_seq + 1:03d}",
            "fact": fact,
            "category": category,
            "domain": domain,
            "confidence": confidence,
            "tags": list(tags),
            "source_count": source_count,
            "first_seen": today,
            "last_confirmed": today,
        })

    index["total_facts"] = len(facts)
    index["last_updated"] = now.isoformat()


def _count_fact_text(title: str, s2_id: str, citation_count: int,
                     external_ids: Optional[Dict], as_of: str) -> str:
    """Build the citation-count fact text for a tracked paper."""
    text = f"Paper '{title}' (S2:{s2_id}) has {citation_count} citations as of {as_of}"
    arxiv_id = external_ids.get("ArXiv") if external_ids else None
    if arxiv_id:
        text += f" [arXiv:{arxiv_id}]"
    return text


def _citing_fact_text(citation: Dict, tracked_title: str) -> str:
    """Build the "X cites Y" fact text for one citing paper."""
    # The API may send explicit nulls, so fall back on falsy values as well
    # as on missing keys.
    citing_title = citation.get("title") or "Unknown"
    citing_year = citation.get("year") or "Unknown year"
    authors = citation.get("authors") or []

    author_names = [a["name"] for a in authors[:3] if a.get("name")]
    if len(authors) > 3:
        author_names.append("et al.")

    byline = ", ".join(
        part for part in (", ".join(author_names), str(citing_year)) if part
    )
    return f"Paper '{citing_title}' ({byline}) cites '{tracked_title}'"


def update_citation_data() -> None:
    """Update citation counts and facts for all key papers.

    For each configured paper this records one citation-count fact and one
    fact per citing paper returned by ``fetch_citations`` (batch of 20), then
    saves the index. Papers whose metadata lookup fails are skipped.
    """
    papers = load_key_papers()
    index = load_index()
    as_of = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    updated = 0

    for paper in papers:
        s2_id = paper["id"]
        title = paper["title"]

        # Fetch current paper data
        data = fetch_paper(s2_id)
        if not data:
            continue

        citation_count = data.get("citationCount") or 0

        # Citation count fact (high confidence - directly from API)
        add_citation_fact(
            index=index,
            fact=_count_fact_text(
                title, s2_id, citation_count, data.get("externalIds"), as_of
            ),
            repo="compounding-intelligence",
            confidence=0.95,
            tags=["citation", "tracking", "paper", s2_id],
            source_count=1,
        )
        updated += 1

        # Fetch recent citations (context extraction - limited batch)
        citations = fetch_citations(s2_id, limit=20)
        for citation in citations:
            add_citation_fact(
                index=index,
                fact=_citing_fact_text(citation, title),
                repo="compounding-intelligence",
                confidence=0.8,
                tags=["citation", "citing-paper", s2_id],
                source_count=1,
            )

        print(f"Updated: {title} — {citation_count} citations, {len(citations)} recent")

    save_index(index)
    total_facts = index.get("total_facts", len(index.get("facts", [])))
    print(f"\nUpdated {updated} papers. Total facts in index: {total_facts}")


def _parse_month(month: str) -> Tuple[int, int]:
    """Parse a ``YYYY-MM`` string into ``(year, month)``.

    Raises:
        ValueError: If the text is not two dash-separated integers or the
            values are out of range.
    """
    try:
        year_text, month_text = month.split("-")
        year, mon = int(year_text), int(month_text)
    except ValueError:
        raise ValueError(f"Invalid month {month!r}; expected YYYY-MM") from None
    if not (1 <= year <= 9999 and 1 <= mon <= 12):
        raise ValueError(f"Invalid month {month!r}; expected YYYY-MM")
    return year, mon


def _tracked_paper_title(fact_text: str) -> str:
    """Return the tracked paper's title embedded in a citation fact's text."""
    for pattern in (_COUNT_FACT_RE, _CITING_FACT_RE):
        match = pattern.search(fact_text)
        if match:
            return match.group("title")
    return _UNGROUPED_HEADING


def generate_monthly_report(month: Optional[str] = None) -> str:
    """Generate a monthly citation report.

    Selects index facts tagged ``citation`` whose ``last_confirmed`` date falls
    in the target month, groups them under the tracked paper they refer to, and
    writes the Markdown report to ``METRICS_DIR``.

    Args:
        month: Target month as ``YYYY-MM``; defaults to the current UTC month.

    Returns:
        The report text.

    Raises:
        ValueError: If *month* is not a valid ``YYYY-MM`` value.
    """
    now = datetime.now(timezone.utc)
    year, mon = _parse_month(month) if month else (now.year, now.month)
    # Normalised so that e.g. "2024-1" matches stored dates and yields the
    # same file name as "2024-01".
    target_month = f"{year:04d}-{mon:02d}"

    index = load_index()
    monthly_facts = [
        fact
        for fact in index.get("facts", [])
        # The index is initialised as a general "knowledge index", so keep
        # only entries carrying the tag this script puts on every fact.
        if "citation" in (fact.get("tags") or [])
        and str(fact.get("last_confirmed", "")).startswith(target_month)
    ]

    # Build report
    lines = [
        f"# Citation Tracker Monthly Report — {target_month}",
        "",
        f"Generated: {now.strftime('%Y-%m-%d %H:%M UTC')}",
        f"Total citation facts this month: {len(monthly_facts)}",
        "",
    ]

    # Group by tracked paper
    by_paper = defaultdict(list)
    for fact in monthly_facts:
        by_paper[_tracked_paper_title(fact.get("fact", ""))].append(fact)

    for paper_title, facts in by_paper.items():
        lines.append(f"## {paper_title}")
        lines.extend(
            f"- {f.get('fact', '')} (confidence: {f.get('confidence', 'n/a')})"
            for f in facts
        )
        lines.append("")

    report = "\n".join(lines)

    # Save report
    METRICS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = METRICS_DIR / f"citation_report_{target_month}.md"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)

    print(f"Monthly report saved to: {report_path}")
    return report


def main() -> None:
    """Parse command-line arguments and run the requested actions.

    ``--update`` and ``--report`` may be combined; the update runs first so the
    report reflects the fresh data. With neither flag, usage help is printed.
    """
    parser = argparse.ArgumentParser(description="Citation Tracker — Monitor key paper citations")
    parser.add_argument("--update", action="store_true", help="Fetch latest citation data")
    parser.add_argument("--report", action="store_true", help="Generate monthly report")
    parser.add_argument("--month", type=str, help="Month for report (YYYY-MM), defaults to current")
    args = parser.parse_args()

    if not (args.update or args.report):
        parser.print_help()
        return

    # Reject a malformed --month before doing any network or file work.
    if args.report and args.month is not None:
        try:
            _parse_month(args.month)
        except ValueError as exc:
            parser.error(str(exc))

    if args.update:
        update_citation_data()
    if args.report:
        generate_monthly_report(args.month)


if __name__ == "__main__":
    main()