---
# Deep Dive Source Configuration
# Path: config/deepdive_sources.yaml
#
# Define RSS feeds, API endpoints, and scrapers for content aggregation.
#
# Keys used by the entries in this file:
#   name                 human-readable label for the source
#   url                  feed, page, or API base URL
#   type                 one of: rss, scraper, api
#   poll_interval_hours  presumably the gap between fetches (confirm in the
#                        consumer); only present under `feeds`
#   enabled              true/false switch for the source
#   selectors            scraper entries only: CSS selectors per field
#   auth_required        api entries only: whether credentials are needed
#   api_key_env          api entries only: name of the environment variable
#                        expected to hold the key (the key itself is never
#                        stored in this file)

feeds:
  # arXiv Categories
  # arXiv publishes its RSS feeds over HTTPS at rss.arxiv.org.
  arxiv_ai:
    name: "arXiv Artificial Intelligence"
    url: "https://rss.arxiv.org/rss/cs.AI"
    type: rss
    poll_interval_hours: 24
    enabled: true

  arxiv_cl:
    name: "arXiv Computation and Language"
    url: "https://rss.arxiv.org/rss/cs.CL"
    type: rss
    poll_interval_hours: 24
    enabled: true

  arxiv_lg:
    name: "arXiv Machine Learning"
    url: "https://rss.arxiv.org/rss/cs.LG"
    type: rss
    poll_interval_hours: 24
    enabled: true

  # NOTE(review): arXiv has no dedicated "Large Language Models" category,
  # and this entry targets the same cs.LG feed as arxiv_lg. The key is kept
  # for compatibility but the entry is disabled so the same feed is not
  # listed as two enabled sources. Re-enable once it has a distinct URL.
  arxiv_lm:
    name: "arXiv Large Language Models"
    url: "https://rss.arxiv.org/rss/cs.LG"
    type: rss
    poll_interval_hours: 24
    enabled: false

  # AI Lab Blogs
  openai_blog:
    name: "OpenAI Blog"
    url: "https://openai.com/blog/rss.xml"
    type: rss
    poll_interval_hours: 6
    enabled: true

  deepmind_news:
    name: "Google DeepMind News"
    url: "https://deepmind.google/news/rss.xml"
    type: rss
    poll_interval_hours: 12
    enabled: true

  google_research:
    name: "Google Research Blog"
    url: "https://research.google/blog/rss/"
    type: rss
    poll_interval_hours: 12
    enabled: true

  anthropic_news:
    name: "Anthropic News"
    url: "https://www.anthropic.com/news"
    type: scraper  # Custom scraper required
    poll_interval_hours: 12
    enabled: false  # Enable when scraper implemented
    # CSS selectors for the scraper; comma-separated alternatives are
    # standard CSS selector lists.
    selectors:
      container: "article"
      title: "h2, .title"
      link: "a[href^='/news']"
      date: "time"
      summary: ".summary, p"

  # Newsletters
  importai:
    name: "Import AI"
    url: "https://importai.substack.com/feed"
    type: rss
    poll_interval_hours: 24
    enabled: true

  tldr_ai:
    name: "TLDR AI"
    url: "https://tldr.tech/ai/rss"
    type: rss
    poll_interval_hours: 24
    enabled: true

  the_batch:
    name: "The Batch (DeepLearning.AI)"
    url: "https://read.deeplearning.ai/the-batch/rss"
    type: rss
    poll_interval_hours: 24
    enabled: false

# API Sources (for future expansion)
# Both entries are currently disabled.
api_sources:
  huggingface_papers:
    name: "Hugging Face Daily Papers"
    url: "https://huggingface.co/api/daily_papers"
    type: api
    enabled: false
    auth_required: false

  semanticscholar:
    name: "Semantic Scholar"
    url: "https://api.semanticscholar.org/graph/v1/"
    type: api
    enabled: false
    auth_required: true
    api_key_env: "SEMANTIC_SCHOLAR_API_KEY"

# Global settings
# NOTE(review): presumably applied to every source above; confirm how the
# consumer interprets each limit.
settings:
  max_entries_per_source: 50
  min_summary_length: 100
  request_timeout_seconds: 30
  user_agent: "DeepDive-Bot/1.0 (Research Aggregation)"
  respect_robots_txt: true
  rate_limit_delay_seconds: 2