[ezra] Deep Dive source configuration template (#830)
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
This commit is contained in:
115
config/deepdive_sources.yaml
Normal file
115
config/deepdive_sources.yaml
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
# Deep Dive Source Configuration
|
||||||
|
# Define RSS feeds, API endpoints, and scrapers for content aggregation
|
||||||
|
|
||||||
|
# Feed sources, keyed by a short source id. Every entry carries a display
# `name`, a `url`, a `type` (`rss`, or `scraper` for pages that need CSS
# selectors), a `poll_interval_hours` value and an `enabled` switch.
feeds:
  # arXiv Categories
  # NOTE(review): these URLs use plain http; consider https once the
  # endpoint is confirmed to serve it.
  arxiv_ai:
    name: 'arXiv Artificial Intelligence'
    url: 'http://export.arxiv.org/rss/cs.AI'
    type: rss
    poll_interval_hours: 24
    enabled: true

  arxiv_cl:
    name: 'arXiv Computation and Language'
    url: 'http://export.arxiv.org/rss/cs.CL'
    type: rss
    poll_interval_hours: 24
    enabled: true

  arxiv_lg:
    name: 'arXiv Learning'
    url: 'http://export.arxiv.org/rss/cs.LG'
    type: rss
    poll_interval_hours: 24
    enabled: true

  # arXiv has no dedicated large-language-model category, and this entry
  # points at the same cs.LG feed as arxiv_lg, so both ids fetched one feed.
  # Disabled until it is given a distinct URL; arxiv_cl and arxiv_lg remain
  # enabled in the meantime.
  arxiv_lm:
    name: 'arXiv Large Language Models'
    url: 'http://export.arxiv.org/rss/cs.LG'
    type: rss
    poll_interval_hours: 24
    enabled: false

  # AI Lab Blogs
  openai_blog:
    name: 'OpenAI Blog'
    url: 'https://openai.com/blog/rss.xml'
    type: rss
    poll_interval_hours: 6
    enabled: true

  deepmind_news:
    name: 'Google DeepMind News'
    url: 'https://deepmind.google/news/rss.xml'
    type: rss
    poll_interval_hours: 12
    enabled: true

  google_research:
    name: 'Google Research Blog'
    url: 'https://research.google/blog/rss/'
    type: rss
    poll_interval_hours: 12
    enabled: true

  anthropic_news:
    name: 'Anthropic News'
    url: 'https://www.anthropic.com/news'
    type: scraper  # Custom scraper required
    poll_interval_hours: 12
    enabled: false  # Enable when scraper implemented
    # CSS selectors for the scraper; `link` keeps double quotes because the
    # selector itself contains single quotes.
    selectors:
      container: 'article'
      title: 'h2, .title'
      link: "a[href^='/news']"
      date: 'time'
      summary: '.summary, p'

  # Newsletters
  importai:
    name: 'Import AI'
    url: 'https://importai.substack.com/feed'
    type: rss
    poll_interval_hours: 24
    enabled: true

  tldr_ai:
    name: 'TLDR AI'
    url: 'https://tldr.tech/ai/rss'
    type: rss
    poll_interval_hours: 24
    enabled: true

  the_batch:
    name: 'The Batch (DeepLearning.AI)'
    url: 'https://read.deeplearning.ai/the-batch/rss'
    type: rss
    poll_interval_hours: 24
    enabled: false
|
||||||
|
|
||||||
|
# API Sources (for future expansion)
# Both entries are currently disabled. `auth_required` marks whether a
# credential is needed; `api_key_env` names an environment variable
# (presumably where the key is read from; confirm against the consumer).
api_sources:
  huggingface_papers:
    name: 'Hugging Face Daily Papers'
    url: 'https://huggingface.co/api/daily_papers'
    type: api
    enabled: false
    auth_required: false

  semanticscholar:
    name: 'Semantic Scholar'
    url: 'https://api.semanticscholar.org/graph/v1/'
    type: api
    enabled: false
    auth_required: true
    api_key_env: 'SEMANTIC_SCHOLAR_API_KEY'
|
||||||
|
|
||||||
|
# Global settings
# Values that are not tied to a single source. Units are taken from the key
# names (`*_seconds`); how each value is enforced is up to the consumer.
settings:
  max_entries_per_source: 50
  min_summary_length: 100
  request_timeout_seconds: 30
  user_agent: 'DeepDive-Bot/1.0 (Research Aggregation)'
  respect_robots_txt: true
  rate_limit_delay_seconds: 2
|
||||||
Reference in New Issue
Block a user