[DEEP-DIVE] Scaffold component — #830
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
This commit is contained in:
105
scaffold/deep-dive/aggregator/arxiv_fetcher.py
Normal file
105
scaffold/deep-dive/aggregator/arxiv_fetcher.py
Normal file
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
arXiv Source Aggregator for Deep Dive
|
||||
Fetches daily RSS feeds for cs.AI, cs.CL, cs.LG
|
||||
"""
|
||||
|
||||
import feedparser
|
||||
import requests
|
||||
from datetime import datetime, timedelta
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
import re
|
||||
|
||||
@dataclass
class Paper:
    """A single arXiv paper built from one RSS feed entry."""

    title: str              # entry title as provided by the feed
    authors: List[str]      # author display names; "" for entries missing a name
    abstract: str           # RSS summary text (may contain HTML markup)
    url: str                # abstract page link, e.g. https://arxiv.org/abs/<id>
    pdf_url: str            # PDF link derived from arxiv_id; "" when id is unknown
    published: datetime     # publication timestamp parsed from the feed entry
    categories: List[str]   # arXiv category terms attached to the entry
    arxiv_id: str           # id taken from the "/abs/" link; "" when not present
|
||||
|
||||
# Map of arXiv category key -> RSS feed URL.  arXiv serves these feeds over
# HTTPS; the old http:// form only worked through a redirect.
ARXIV_RSS_URLS = {
    "cs.AI": "https://export.arxiv.org/rss/cs.AI",
    "cs.CL": "https://export.arxiv.org/rss/cs.CL",
    "cs.LG": "https://export.arxiv.org/rss/cs.LG",
}

# Hermes/Timmy relevant keywords.  All lowercase: they are substring-matched
# against the lowercased title + abstract in keyword_score().
RELEVANCE_KEYWORDS = [
    "agent", "llm", "large language model", "rag", "retrieval",
    "fine-tuning", "rlhf", "reinforcement learning", "transformer",
    "attention", "gpt", "claude", "embedding", "vector",
    "reasoning", "chain-of-thought", "tool use", "mcp",
    "orchestration", "multi-agent", "swarm", "fleet",
]
|
||||
|
||||
def fetch_arxiv_category(category: str, days_back: int = 1) -> List[Paper]:
    """Fetch recent papers from one arXiv category RSS feed.

    Args:
        category: Key into ARXIV_RSS_URLS (e.g. "cs.AI").
        days_back: Keep only entries published within this many days.

    Returns:
        List of Paper records; empty when the category is not configured.
    """
    url = ARXIV_RSS_URLS.get(category)
    if not url:
        return []

    feed = feedparser.parse(url)
    papers: List[Paper] = []
    cutoff = datetime.now() - timedelta(days=days_back)

    for entry in feed.entries:
        # Prefer feedparser's pre-parsed struct_time: strptime's "%Z" only
        # matches a handful of timezone names and rejects numeric offsets
        # like "+0000", which would have sent every such entry to the
        # fallback branch.
        parsed = entry.get("published_parsed")
        if parsed is not None:
            published = datetime(*parsed[:6])
        else:
            try:
                published = datetime.strptime(
                    entry.published, "%a, %d %b %Y %H:%M:%S %Z"
                )
            except (AttributeError, ValueError):
                # Undated entries are treated as fresh rather than dropped.
                published = datetime.now()

        if published < cutoff:
            continue

        # arXiv abstract links look like .../abs/<id>; derive the PDF URL.
        arxiv_id = entry.link.split("/abs/")[-1] if "/abs/" in entry.link else ""
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else ""

        papers.append(
            Paper(
                title=entry.title,
                authors=[a.get("name", "") for a in entry.get("authors", [])],
                abstract=entry.get("summary", ""),
                url=entry.link,
                pdf_url=pdf_url,
                published=published,
                categories=[t.get("term", "") for t in entry.get("tags", [])],
                arxiv_id=arxiv_id,
            )
        )

    return papers
|
||||
|
||||
def keyword_score(paper: Paper) -> float:
    """Return the fraction of RELEVANCE_KEYWORDS found in title + abstract."""
    haystack = f"{paper.title} {paper.abstract}".lower()
    hits = sum(1 for kw in RELEVANCE_KEYWORDS if kw.lower() in haystack)
    return hits / len(RELEVANCE_KEYWORDS)
|
||||
|
||||
def fetch_all_sources(days_back: int = 1) -> List[Paper]:
    """Fetch from all configured arXiv categories."""
    # Flatten the per-category result lists into one combined list,
    # preserving category order as declared in ARXIV_RSS_URLS.
    return [
        paper
        for category in ARXIV_RSS_URLS
        for paper in fetch_arxiv_category(category, days_back)
    ]
|
||||
|
||||
if __name__ == "__main__":
    fetched = fetch_all_sources(days_back=1)
    print(f"Fetched {len(fetched)} papers")

    # Rank by keyword relevance (highest first) and show the top ten.
    ranked = sorted(
        ((keyword_score(p), p) for p in fetched),
        key=lambda pair: pair[0],
        reverse=True,
    )
    for relevance, paper in ranked[:10]:
        print(f"\n[{relevance:.2f}] {paper.title}")
        print(f" {paper.url}")
|
||||
Reference in New Issue
Block a user