#!/usr/bin/env python3
"""
arXiv Source Aggregator for Deep Dive.

Fetches daily RSS feeds for cs.AI, cs.CL, cs.LG and scores each paper
against a fixed keyword list. Run as a script to print the top 10 most
relevant papers fetched in the last day.
"""

import feedparser
import requests  # NOTE(review): currently unused here — kept for downstream use; confirm
from datetime import datetime, timedelta, timezone
from dataclasses import dataclass
from email.utils import parsedate_to_datetime
from typing import List
import re  # NOTE(review): currently unused here — kept for downstream use; confirm


@dataclass
class Paper:
    """One arXiv paper pulled from an RSS feed."""
    title: str
    authors: List[str]
    abstract: str
    url: str
    pdf_url: str
    # Timezone-aware UTC timestamp (see _parse_entry_date).
    published: datetime
    categories: List[str]
    arxiv_id: str


ARXIV_RSS_URLS = {
    "cs.AI": "http://export.arxiv.org/rss/cs.AI",
    "cs.CL": "http://export.arxiv.org/rss/cs.CL",
    "cs.LG": "http://export.arxiv.org/rss/cs.LG",
}

# Hermes/Timmy relevant keywords. All lowercase; matched as plain
# substrings of the lowercased title + abstract.
RELEVANCE_KEYWORDS = [
    "agent", "llm", "large language model", "rag", "retrieval",
    "fine-tuning", "rlhf", "reinforcement learning", "transformer",
    "attention", "gpt", "claude", "embedding", "vector",
    "reasoning", "chain-of-thought", "tool use", "mcp",
    "orchestration", "multi-agent", "swarm", "fleet",
]


def _parse_entry_date(entry) -> datetime:
    """Parse an RSS entry's published date into an aware UTC datetime.

    BUGFIX: the previous strptime("%a, %d %b %Y %H:%M:%S %Z") cannot parse
    the numeric offsets (e.g. "-0400") that RSS pubDate fields commonly
    carry, so the bare except silently stamped nearly every entry with
    datetime.now() — which made the days_back cutoff a no-op.
    email.utils.parsedate_to_datetime handles both named zones and numeric
    offsets. Falls back to the current UTC time only when the field is
    missing or genuinely unparseable.
    """
    try:
        parsed = parsedate_to_datetime(entry.get("published", ""))
    except (TypeError, ValueError):
        return datetime.now(timezone.utc)
    if parsed.tzinfo is None:
        # Treat naive timestamps as UTC so comparisons never mix
        # naive and aware datetimes.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def fetch_arxiv_category(category: str, days_back: int = 1) -> List[Paper]:
    """Fetch recent papers from one arXiv category RSS feed.

    Args:
        category: Key into ARXIV_RSS_URLS (e.g. "cs.AI").
        days_back: Keep only papers published within this many days.

    Returns:
        List of Paper records; empty for an unknown category.
    """
    url = ARXIV_RSS_URLS.get(category)
    if not url:
        return []

    feed = feedparser.parse(url)
    cutoff = datetime.now(timezone.utc) - timedelta(days=days_back)
    papers: List[Paper] = []

    for entry in feed.entries:
        published = _parse_entry_date(entry)
        if published < cutoff:
            continue

        # The abstract-page link looks like https://arxiv.org/abs/<id>;
        # derive the PDF URL from the extracted id.
        link = entry.get("link", "")
        arxiv_id = link.split("/abs/")[-1] if "/abs/" in link else ""
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else ""

        papers.append(Paper(
            title=entry.get("title", ""),
            authors=[a.get("name", "") for a in entry.get("authors", [])],
            abstract=entry.get("summary", ""),
            url=link,
            pdf_url=pdf_url,
            published=published,
            categories=[t.get("term", "") for t in entry.get("tags", [])],
            arxiv_id=arxiv_id,
        ))

    return papers


def keyword_score(paper: Paper) -> float:
    """Return the fraction of RELEVANCE_KEYWORDS found in the paper.

    Case-insensitive substring match over title + abstract; result is
    in [0.0, 1.0].
    """
    text = f"{paper.title} {paper.abstract}".lower()
    hits = sum(1 for kw in RELEVANCE_KEYWORDS if kw in text)
    return hits / len(RELEVANCE_KEYWORDS)


def fetch_all_sources(days_back: int = 1) -> List[Paper]:
    """Fetch from every configured arXiv category and concatenate results."""
    all_papers: List[Paper] = []
    for category in ARXIV_RSS_URLS:
        all_papers.extend(fetch_arxiv_category(category, days_back))
    return all_papers


if __name__ == "__main__":
    papers = fetch_all_sources(days_back=1)
    print(f"Fetched {len(papers)} papers")

    # Sort by keyword relevance, most relevant first.
    scored = [(p, keyword_score(p)) for p in papers]
    scored.sort(key=lambda x: x[1], reverse=True)

    for paper, score in scored[:10]:
        print(f"\n[{score:.2f}] {paper.title}")
        print(f"  {paper.url}")