#!/usr/bin/env python3
"""
arXiv Source Aggregator for Deep Dive

Fetches daily RSS feeds for cs.AI, cs.CL, cs.LG
"""

import feedparser
|
|
import requests
|
|
from datetime import datetime, timedelta
|
|
from dataclasses import dataclass
|
|
from typing import List
|
|
import re
|
|
|
|
@dataclass
class Paper:
    """A single arXiv paper parsed from one RSS feed entry."""

    title: str
    authors: List[str]      # author display names from the feed entry
    abstract: str           # entry summary text
    url: str                # abstract page link (entry.link)
    pdf_url: str            # derived "https://arxiv.org/pdf/<id>.pdf", "" if id unknown
    published: datetime     # publication timestamp parsed from the feed
    categories: List[str]   # arXiv category tags, e.g. ["cs.AI"]
    arxiv_id: str           # "" when the entry link has no "/abs/" segment
# Daily RSS feeds polled by fetch_arxiv_category, keyed by category name.
# Served over HTTPS: arXiv supports TLS on export.arxiv.org, so there is no
# reason to fetch the feed in plaintext.
ARXIV_RSS_URLS = {
    "cs.AI": "https://export.arxiv.org/rss/cs.AI",
    "cs.CL": "https://export.arxiv.org/rss/cs.CL",
    "cs.LG": "https://export.arxiv.org/rss/cs.LG",
}
# Hermes/Timmy relevant keywords.
# Each entry is matched case-insensitively as a plain substring of the
# paper's title + abstract (see keyword_score); multi-word phrases must
# appear verbatim.
RELEVANCE_KEYWORDS = [
    "agent", "llm", "large language model", "rag", "retrieval",
    "fine-tuning", "rlhf", "reinforcement learning", "transformer",
    "attention", "gpt", "claude", "embedding", "vector",
    "reasoning", "chain-of-thought", "tool use", "mcp",
    "orchestration", "multi-agent", "swarm", "fleet",
]
def fetch_arxiv_category(category: str, days_back: int = 1) -> List[Paper]:
    """Fetch recent papers from an arXiv category RSS feed.

    Args:
        category: arXiv category key, e.g. "cs.AI"; must be a key of
            ARXIV_RSS_URLS, otherwise no fetch is attempted.
        days_back: keep only papers published within this many days.

    Returns:
        List of Paper records for entries newer than the cutoff; empty
        list for an unknown category.
    """
    url = ARXIV_RSS_URLS.get(category)
    if not url:
        return []

    feed = feedparser.parse(url)
    papers: List[Paper] = []
    cutoff = datetime.now() - timedelta(days=days_back)

    for entry in feed.entries:
        # Prefer feedparser's pre-parsed timestamp: it normalizes many
        # RFC 822 date variants that a single strptime format misses.
        parsed = entry.get("published_parsed")
        if parsed is not None:
            published = datetime(*parsed[:6])
        else:
            try:
                published = datetime.strptime(
                    entry.published, "%a, %d %b %Y %H:%M:%S %Z"
                )
            except (AttributeError, ValueError):
                # Missing or malformed date: treat the entry as freshly
                # published rather than silently dropping it. (Was a bare
                # except, which also swallowed KeyboardInterrupt/SystemExit.)
                published = datetime.now()

        if published < cutoff:
            continue

        # arXiv abstract links look like https://arxiv.org/abs/<id>;
        # everything after "/abs/" is the arXiv identifier.
        arxiv_id = entry.link.split("/abs/")[-1] if "/abs/" in entry.link else ""
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else ""

        papers.append(Paper(
            title=entry.title,
            authors=[a.get("name", "") for a in entry.get("authors", [])],
            abstract=entry.get("summary", ""),
            url=entry.link,
            pdf_url=pdf_url,
            published=published,
            categories=[t.get("term", "") for t in entry.get("tags", [])],
            arxiv_id=arxiv_id,
        ))

    return papers
def keyword_score(paper: Paper) -> float:
    """Return the fraction of RELEVANCE_KEYWORDS that appear in the
    paper's title or abstract (case-insensitive substring match)."""
    haystack = f"{paper.title} {paper.abstract}".lower()
    hits = sum(1 for kw in RELEVANCE_KEYWORDS if kw.lower() in haystack)
    return hits / len(RELEVANCE_KEYWORDS)
def fetch_all_sources(days_back: int = 1) -> List[Paper]:
    """Aggregate papers from every configured arXiv category feed."""
    return [
        paper
        for category in ARXIV_RSS_URLS
        for paper in fetch_arxiv_category(category, days_back)
    ]
if __name__ == "__main__":
    fetched = fetch_all_sources(days_back=1)
    print(f"Fetched {len(fetched)} papers")

    # Rank by keyword relevance, highest first; sorted() is stable, so
    # equal-score papers keep their fetch order.
    ranked = sorted(
        ((keyword_score(p), p) for p in fetched),
        key=lambda pair: pair[0],
        reverse=True,
    )

    for score, paper in ranked[:10]:
        print(f"\n[{score:.2f}] {paper.title}")
        print(f" {paper.url}")