#!/usr/bin/env python3
"""
arXiv Source Aggregator for Deep Dive

Fetches daily RSS feeds for cs.AI, cs.CL, cs.LG
"""

import feedparser
|
|
import requests
|
|
from datetime import datetime, timedelta
|
|
from dataclasses import dataclass
|
|
from typing import List
|
|
import re
|
|
|
|
@dataclass
class Paper:
    """A single arXiv paper parsed from one RSS feed entry."""

    title: str
    authors: List[str]      # author display names from the feed entry
    abstract: str           # entry summary text
    url: str                # abstract page link (entry.link)
    pdf_url: str            # derived "https://arxiv.org/pdf/<id>.pdf", "" if id unknown
    published: datetime     # publication timestamp parsed from the feed
    categories: List[str]   # arXiv category tags, e.g. ["cs.AI"]
    arxiv_id: str           # "" when the entry link has no "/abs/" segment
# Daily RSS feeds polled by fetch_arxiv_category, keyed by category name.
# Served over HTTPS: arXiv supports TLS on export.arxiv.org, so there is no
# reason to fetch the feed in plaintext.
ARXIV_RSS_URLS = {
    "cs.AI": "https://export.arxiv.org/rss/cs.AI",
    "cs.CL": "https://export.arxiv.org/rss/cs.CL",
    "cs.LG": "https://export.arxiv.org/rss/cs.LG",
}
# Hermes/Timmy relevant keywords.
# Each entry is matched case-insensitively as a plain substring of the
# paper's title + abstract (see keyword_score); multi-word phrases must
# appear verbatim.
RELEVANCE_KEYWORDS = [
    "agent", "llm", "large language model", "rag", "retrieval",
    "fine-tuning", "rlhf", "reinforcement learning", "transformer",
    "attention", "gpt", "claude", "embedding", "vector",
    "reasoning", "chain-of-thought", "tool use", "mcp",
    "orchestration", "multi-agent", "swarm", "fleet",
]
def fetch_arxiv_category(category: str, days_back: int = 1) -> List[Paper]:
    """Fetch recent papers from an arXiv category RSS feed.

    Args:
        category: arXiv category key, e.g. "cs.AI"; must be a key of
            ARXIV_RSS_URLS, otherwise no fetch is attempted.
        days_back: keep only papers published within this many days.

    Returns:
        List of Paper records for entries newer than the cutoff; empty
        list for an unknown category.
    """
    url = ARXIV_RSS_URLS.get(category)
    if not url:
        return []

    feed = feedparser.parse(url)
    papers: List[Paper] = []
    cutoff = datetime.now() - timedelta(days=days_back)

    for entry in feed.entries:
        # Prefer feedparser's pre-parsed timestamp: it normalizes many
        # RFC 822 date variants that a single strptime format misses.
        parsed = entry.get("published_parsed")
        if parsed is not None:
            published = datetime(*parsed[:6])
        else:
            try:
                published = datetime.strptime(
                    entry.published, "%a, %d %b %Y %H:%M:%S %Z"
                )
            except (AttributeError, ValueError):
                # Missing or malformed date: treat the entry as freshly
                # published rather than silently dropping it. (Was a bare
                # except, which also swallowed KeyboardInterrupt/SystemExit.)
                published = datetime.now()

        if published < cutoff:
            continue

        # arXiv abstract links look like https://arxiv.org/abs/<id>;
        # everything after "/abs/" is the arXiv identifier.
        arxiv_id = entry.link.split("/abs/")[-1] if "/abs/" in entry.link else ""
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else ""

        papers.append(Paper(
            title=entry.title,
            authors=[a.get("name", "") for a in entry.get("authors", [])],
            abstract=entry.get("summary", ""),
            url=entry.link,
            pdf_url=pdf_url,
            published=published,
            categories=[t.get("term", "") for t in entry.get("tags", [])],
            arxiv_id=arxiv_id,
        ))

    return papers
def keyword_score(paper: Paper) -> float:
    """Return the fraction of RELEVANCE_KEYWORDS that appear in the
    paper's title or abstract (case-insensitive substring match)."""
    haystack = f"{paper.title} {paper.abstract}".lower()
    hits = sum(1 for kw in RELEVANCE_KEYWORDS if kw.lower() in haystack)
    return hits / len(RELEVANCE_KEYWORDS)
def fetch_all_sources(days_back: int = 1) -> List[Paper]:
    """Aggregate papers from every configured arXiv category feed."""
    return [
        paper
        for category in ARXIV_RSS_URLS
        for paper in fetch_arxiv_category(category, days_back)
    ]
if __name__ == "__main__":
    fetched = fetch_all_sources(days_back=1)
    print(f"Fetched {len(fetched)} papers")

    # Rank by keyword relevance, highest first; sorted() is stable, so
    # equal-score papers keep their fetch order.
    ranked = sorted(
        ((keyword_score(p), p) for p in fetched),
        key=lambda pair: pair[0],
        reverse=True,
    )

    for score, paper in ranked[:10]:
        print(f"\n[{score:.2f}] {paper.title}")
        print(f" {paper.url}")