[DEEP-DIVE] Scaffold component — #830
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
This commit is contained in:
105
scaffold/deep-dive/aggregator/arxiv_fetcher.py
Normal file
105
scaffold/deep-dive/aggregator/arxiv_fetcher.py
Normal file
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
arXiv Source Aggregator for Deep Dive
|
||||
Fetches daily RSS feeds for cs.AI, cs.CL, cs.LG
|
||||
"""
|
||||
|
||||
import feedparser
|
||||
import requests
|
||||
from datetime import datetime, timedelta
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
import re
|
||||
|
||||
@dataclass
class Paper:
    """A single arXiv paper built from one RSS feed entry."""

    title: str              # entry title as provided by the feed
    authors: List[str]      # author display names; "" for entries missing a name
    abstract: str           # RSS summary text (may contain HTML markup)
    url: str                # abstract page link, e.g. https://arxiv.org/abs/<id>
    pdf_url: str            # PDF link derived from arxiv_id; "" when id is unknown
    published: datetime     # publication timestamp parsed from the feed entry
    categories: List[str]   # arXiv category terms attached to the entry
    arxiv_id: str           # id taken from the "/abs/" link; "" when not present
|
||||
|
||||
# Map of arXiv category key -> RSS feed URL.  arXiv serves these feeds over
# HTTPS; the old http:// form only worked through a redirect.
ARXIV_RSS_URLS = {
    "cs.AI": "https://export.arxiv.org/rss/cs.AI",
    "cs.CL": "https://export.arxiv.org/rss/cs.CL",
    "cs.LG": "https://export.arxiv.org/rss/cs.LG",
}

# Hermes/Timmy relevant keywords.  All lowercase: they are substring-matched
# against the lowercased title + abstract in keyword_score().
RELEVANCE_KEYWORDS = [
    "agent", "llm", "large language model", "rag", "retrieval",
    "fine-tuning", "rlhf", "reinforcement learning", "transformer",
    "attention", "gpt", "claude", "embedding", "vector",
    "reasoning", "chain-of-thought", "tool use", "mcp",
    "orchestration", "multi-agent", "swarm", "fleet",
]
|
||||
|
||||
def fetch_arxiv_category(category: str, days_back: int = 1) -> List[Paper]:
    """Fetch recent papers from one arXiv category RSS feed.

    Args:
        category: Key into ARXIV_RSS_URLS (e.g. "cs.AI").
        days_back: Keep only entries published within this many days.

    Returns:
        List of Paper records; empty when the category is not configured.
    """
    url = ARXIV_RSS_URLS.get(category)
    if not url:
        return []

    feed = feedparser.parse(url)
    papers: List[Paper] = []
    cutoff = datetime.now() - timedelta(days=days_back)

    for entry in feed.entries:
        # Prefer feedparser's pre-parsed struct_time: strptime's "%Z" only
        # matches a handful of timezone names and rejects numeric offsets
        # like "+0000", which would have sent every such entry to the
        # fallback branch.
        parsed = entry.get("published_parsed")
        if parsed is not None:
            published = datetime(*parsed[:6])
        else:
            try:
                published = datetime.strptime(
                    entry.published, "%a, %d %b %Y %H:%M:%S %Z"
                )
            except (AttributeError, ValueError):
                # Undated entries are treated as fresh rather than dropped.
                published = datetime.now()

        if published < cutoff:
            continue

        # arXiv abstract links look like .../abs/<id>; derive the PDF URL.
        arxiv_id = entry.link.split("/abs/")[-1] if "/abs/" in entry.link else ""
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else ""

        papers.append(
            Paper(
                title=entry.title,
                authors=[a.get("name", "") for a in entry.get("authors", [])],
                abstract=entry.get("summary", ""),
                url=entry.link,
                pdf_url=pdf_url,
                published=published,
                categories=[t.get("term", "") for t in entry.get("tags", [])],
                arxiv_id=arxiv_id,
            )
        )

    return papers
|
||||
|
||||
def keyword_score(paper: Paper) -> float:
    """Return the fraction of RELEVANCE_KEYWORDS found in title + abstract."""
    haystack = f"{paper.title} {paper.abstract}".lower()
    hits = sum(1 for kw in RELEVANCE_KEYWORDS if kw.lower() in haystack)
    return hits / len(RELEVANCE_KEYWORDS)
|
||||
|
||||
def fetch_all_sources(days_back: int = 1) -> List[Paper]:
    """Fetch from all configured arXiv categories."""
    # Flatten the per-category result lists into one combined list,
    # preserving category order as declared in ARXIV_RSS_URLS.
    return [
        paper
        for category in ARXIV_RSS_URLS
        for paper in fetch_arxiv_category(category, days_back)
    ]
|
||||
|
||||
if __name__ == "__main__":
    fetched = fetch_all_sources(days_back=1)
    print(f"Fetched {len(fetched)} papers")

    # Rank by keyword relevance (highest first) and show the top ten.
    ranked = sorted(
        ((keyword_score(p), p) for p in fetched),
        key=lambda pair: pair[0],
        reverse=True,
    )
    for relevance, paper in ranked[:10]:
        print(f"\n[{relevance:.2f}] {paper.title}")
        print(f" {paper.url}")
|
||||
Reference in New Issue
Block a user