[DEEP-DIVE] Scaffold component — #830
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled

This commit is contained in:
2026-04-05 07:42:24 +00:00
parent 245f8a9c41
commit 3014d83462

View File

@@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""
arXiv Source Aggregator for Deep Dive
Fetches daily RSS feeds for cs.AI, cs.CL, cs.LG
"""
import feedparser
import requests
from datetime import datetime, timedelta
from dataclasses import dataclass
from typing import List
import re
@dataclass
class Paper:
    """A single arXiv paper parsed from one RSS feed entry."""
    title: str
    authors: List[str]      # author display names; "" for entries missing a name
    abstract: str           # RSS summary text (may contain HTML)
    url: str                # abstract page link, e.g. https://arxiv.org/abs/<id>
    pdf_url: str            # direct PDF link derived from arxiv_id; "" if id unknown
    published: datetime     # feed publication time (naive local time after parsing)
    categories: List[str]   # arXiv category terms from the entry's tags
    arxiv_id: str           # identifier extracted from the /abs/ link; "" if absent
# RSS feed endpoints for each arXiv category this aggregator tracks.
ARXIV_RSS_URLS = {
    "cs.AI": "http://export.arxiv.org/rss/cs.AI",
    "cs.CL": "http://export.arxiv.org/rss/cs.CL",
    "cs.LG": "http://export.arxiv.org/rss/cs.LG",
}
# Hermes/Timmy relevant keywords
# Matched case-insensitively as plain substrings of title + abstract
# (see keyword_score); the score is the fraction of this list that matches.
RELEVANCE_KEYWORDS = [
    "agent", "llm", "large language model", "rag", "retrieval",
    "fine-tuning", "rlhf", "reinforcement learning", "transformer",
    "attention", "gpt", "claude", "embedding", "vector",
    "reasoning", "chain-of-thought", "tool use", "mcp",
    "orchestration", "multi-agent", "swarm", "fleet",
]
def fetch_arxiv_category(category: str, days_back: int = 1) -> List[Paper]:
    """Fetch recent papers from one arXiv category RSS feed.

    Args:
        category: Key into ARXIV_RSS_URLS (e.g. "cs.AI").
        days_back: Keep only papers published within this many days.

    Returns:
        A list of Paper records; empty for an unknown category.
    """
    url = ARXIV_RSS_URLS.get(category)
    if not url:
        return []
    feed = feedparser.parse(url)
    papers: List[Paper] = []
    cutoff = datetime.now() - timedelta(days=days_back)
    for entry in feed.entries:
        # Parse the RFC-822-style feed date. Use .get() so a missing field
        # yields "" (and thus a ValueError) instead of an AttributeError.
        try:
            published = datetime.strptime(
                entry.get("published", ""), "%a, %d %b %Y %H:%M:%S %Z"
            )
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; catch only the parse failures we expect. Unparseable
        # dates fall back to "now" so the paper is kept (best-effort).
        except (ValueError, TypeError):
            published = datetime.now()
        if published < cutoff:
            continue
        # arXiv abstract links look like https://arxiv.org/abs/<id>.
        arxiv_id = entry.link.split("/abs/")[-1] if "/abs/" in entry.link else ""
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else ""
        paper = Paper(
            title=entry.title,
            authors=[a.get("name", "") for a in entry.get("authors", [])],
            abstract=entry.get("summary", ""),
            url=entry.link,
            pdf_url=pdf_url,
            published=published,
            categories=[t.get("term", "") for t in entry.get("tags", [])],
            arxiv_id=arxiv_id,
        )
        papers.append(paper)
    return papers
def keyword_score(paper: Paper) -> float:
    """Return the fraction of RELEVANCE_KEYWORDS found in the paper's text.

    Keywords are matched case-insensitively as plain substrings of the
    concatenated title and abstract.
    """
    haystack = f"{paper.title} {paper.abstract}".lower()
    hits = sum(1 for keyword in RELEVANCE_KEYWORDS if keyword.lower() in haystack)
    return hits / len(RELEVANCE_KEYWORDS)
def fetch_all_sources(days_back: int = 1) -> List[Paper]:
    """Fetch and concatenate papers from every configured arXiv category."""
    collected: List[Paper] = []
    for category in ARXIV_RSS_URLS:
        collected.extend(fetch_arxiv_category(category, days_back))
    return collected
if __name__ == "__main__":
    fetched = fetch_all_sources(days_back=1)
    print(f"Fetched {len(fetched)} papers")
    # Rank papers by keyword relevance, highest score first.
    ranked = sorted(
        ((p, keyword_score(p)) for p in fetched),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Show the ten most relevant papers.
    for paper, score in ranked[:10]:
        print(f"\n[{score:.2f}] {paper.title}")
        print(f" {paper.url}")