#!/usr/bin/env python3
"""
arXiv Source Aggregator for Deep Dive.

Fetches daily RSS feeds for cs.AI, cs.CL, cs.LG and scores each paper
against a fixed keyword list. Run as a script to print the top 10 most
relevant papers fetched in the last day.
"""

import feedparser
import requests  # NOTE(review): currently unused here — kept for downstream use; confirm
from datetime import datetime, timedelta, timezone
from dataclasses import dataclass
from email.utils import parsedate_to_datetime
from typing import List
import re  # NOTE(review): currently unused here — kept for downstream use; confirm


@dataclass
class Paper:
    """One arXiv paper pulled from an RSS feed."""
    title: str
    authors: List[str]
    abstract: str
    url: str
    pdf_url: str
    # Timezone-aware UTC timestamp (see _parse_entry_date).
    published: datetime
    categories: List[str]
    arxiv_id: str


ARXIV_RSS_URLS = {
    "cs.AI": "http://export.arxiv.org/rss/cs.AI",
    "cs.CL": "http://export.arxiv.org/rss/cs.CL",
    "cs.LG": "http://export.arxiv.org/rss/cs.LG",
}

# Hermes/Timmy relevant keywords. All lowercase; matched as plain
# substrings of the lowercased title + abstract.
RELEVANCE_KEYWORDS = [
    "agent", "llm", "large language model", "rag", "retrieval",
    "fine-tuning", "rlhf", "reinforcement learning", "transformer",
    "attention", "gpt", "claude", "embedding", "vector",
    "reasoning", "chain-of-thought", "tool use", "mcp",
    "orchestration", "multi-agent", "swarm", "fleet",
]


def _parse_entry_date(entry) -> datetime:
    """Parse an RSS entry's published date into an aware UTC datetime.

    BUGFIX: the previous strptime("%a, %d %b %Y %H:%M:%S %Z") cannot parse
    the numeric offsets (e.g. "-0400") that RSS pubDate fields commonly
    carry, so the bare except silently stamped nearly every entry with
    datetime.now() — which made the days_back cutoff a no-op.
    email.utils.parsedate_to_datetime handles both named zones and numeric
    offsets. Falls back to the current UTC time only when the field is
    missing or genuinely unparseable.
    """
    try:
        parsed = parsedate_to_datetime(entry.get("published", ""))
    except (TypeError, ValueError):
        return datetime.now(timezone.utc)
    if parsed.tzinfo is None:
        # Treat naive timestamps as UTC so comparisons never mix
        # naive and aware datetimes.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def fetch_arxiv_category(category: str, days_back: int = 1) -> List[Paper]:
    """Fetch recent papers from one arXiv category RSS feed.

    Args:
        category: Key into ARXIV_RSS_URLS (e.g. "cs.AI").
        days_back: Keep only papers published within this many days.

    Returns:
        List of Paper records; empty for an unknown category.
    """
    url = ARXIV_RSS_URLS.get(category)
    if not url:
        return []

    feed = feedparser.parse(url)
    cutoff = datetime.now(timezone.utc) - timedelta(days=days_back)
    papers: List[Paper] = []

    for entry in feed.entries:
        published = _parse_entry_date(entry)
        if published < cutoff:
            continue

        # The abstract-page link looks like https://arxiv.org/abs/<id>;
        # derive the PDF URL from the extracted id.
        link = entry.get("link", "")
        arxiv_id = link.split("/abs/")[-1] if "/abs/" in link else ""
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else ""

        papers.append(Paper(
            title=entry.get("title", ""),
            authors=[a.get("name", "") for a in entry.get("authors", [])],
            abstract=entry.get("summary", ""),
            url=link,
            pdf_url=pdf_url,
            published=published,
            categories=[t.get("term", "") for t in entry.get("tags", [])],
            arxiv_id=arxiv_id,
        ))

    return papers


def keyword_score(paper: Paper) -> float:
    """Return the fraction of RELEVANCE_KEYWORDS found in the paper.

    Case-insensitive substring match over title + abstract; result is
    in [0.0, 1.0].
    """
    text = f"{paper.title} {paper.abstract}".lower()
    hits = sum(1 for kw in RELEVANCE_KEYWORDS if kw in text)
    return hits / len(RELEVANCE_KEYWORDS)


def fetch_all_sources(days_back: int = 1) -> List[Paper]:
    """Fetch from every configured arXiv category and concatenate results."""
    all_papers: List[Paper] = []
    for category in ARXIV_RSS_URLS:
        all_papers.extend(fetch_arxiv_category(category, days_back))
    return all_papers


if __name__ == "__main__":
    papers = fetch_all_sources(days_back=1)
    print(f"Fetched {len(papers)} papers")

    # Sort by keyword relevance, most relevant first.
    scored = [(p, keyword_score(p)) for p in papers]
    scored.sort(key=lambda x: x[1], reverse=True)

    for paper, score in scored[:10]:
        print(f"\n[{score:.2f}] {paper.title}")
        print(f"  {paper.url}")