[DEEP-DIVE] Scaffold component — #830
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
This commit is contained in:
112
scaffold/deep-dive/aggregator/blog_fetcher.py
Normal file
112
scaffold/deep-dive/aggregator/blog_fetcher.py
Normal file
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AI Lab Blog Aggregator
|
||||
Scrapes RSS/feeds from major AI labs
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import List, Optional
from urllib.parse import urljoin

import feedparser
import requests
from bs4 import BeautifulSoup
|
||||
|
||||
@dataclass
class BlogPost:
    """A single blog post fetched from one of the configured AI-lab sources."""

    # Post headline as published by the source.
    title: str
    source: str  # "openai", "anthropic", "deepmind", etc.
    # Canonical URL of the post.
    url: str
    # Publication timestamp; fetchers fall back to the current time when
    # the source does not expose a parseable date.
    published: datetime
    # Short excerpt (fetchers truncate to 500 chars); may be empty.
    summary: str
    # Full article body; no fetcher in this module populates it yet.
    content: Optional[str] = None
|
||||
|
||||
# Per-source feed configuration.  Entries with an "rss" key are consumed by
# fetch_rss_source(); "deepmind" has no clean RSS feed, so fetch_all_blogs()
# routes it to the dedicated fetch_deepmind() scraper instead.
BLOG_SOURCES = {
    "openai": {
        "rss": "https://openai.com/blog/rss.xml",
        # NOTE(review): "fallback_url" is not read by any visible fetcher —
        # presumably reserved for a future HTML-scraping fallback; confirm.
        "fallback_url": "https://openai.com/blog/",
    },
    "anthropic": {
        "rss": "https://www.anthropic.com/rss.xml",
        "fallback_url": "https://www.anthropic.com/news",
    },
    "deepmind": {
        # DeepMind doesn't have a clean RSS, requires scraping
        "url": "https://deepmind.google/research/highlighted/",
        "selector": "article",
    }
}
|
||||
|
||||
def fetch_rss_source(name: str, config: dict) -> List[BlogPost]:
|
||||
"""Fetch posts from an RSS feed."""
|
||||
url = config.get("rss")
|
||||
if not url:
|
||||
return []
|
||||
|
||||
feed = feedparser.parse(url)
|
||||
posts = []
|
||||
|
||||
for entry in feed.entries[:10]: # Limit to recent 10
|
||||
try:
|
||||
published = datetime.strptime(
|
||||
entry.published, "%a, %d %b %Y %H:%M:%S %Z"
|
||||
)
|
||||
except:
|
||||
published = datetime.now()
|
||||
|
||||
posts.append(BlogPost(
|
||||
title=entry.title,
|
||||
source=name,
|
||||
url=entry.link,
|
||||
published=published,
|
||||
summary=entry.get("summary", "")[:500]
|
||||
))
|
||||
|
||||
return posts
|
||||
|
||||
def fetch_deepmind() -> List[BlogPost]:
|
||||
"""Specialized scraper for DeepMind (no RSS)."""
|
||||
url = BLOG_SOURCES["deepmind"]["url"]
|
||||
try:
|
||||
resp = requests.get(url, timeout=30)
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
posts = []
|
||||
|
||||
for article in soup.select("article")[:10]:
|
||||
title_elem = article.select_one("h3, h2")
|
||||
link_elem = article.select_one("a")
|
||||
|
||||
if title_elem and link_elem:
|
||||
posts.append(BlogPost(
|
||||
title=title_elem.get_text(strip=True),
|
||||
source="deepmind",
|
||||
url=f"https://deepmind.google{link_elem['href']}",
|
||||
published=datetime.now(), # DeepMind doesn't expose dates easily
|
||||
summary=""
|
||||
))
|
||||
|
||||
return posts
|
||||
except Exception as e:
|
||||
print(f"DeepMind fetch error: {e}")
|
||||
return []
|
||||
|
||||
def fetch_all_blogs() -> List[BlogPost]:
    """Gather posts from every configured source, newest first.

    "deepmind" is routed to its dedicated scraper; every other entry in
    BLOG_SOURCES is treated as an RSS feed.
    """
    collected: List[BlogPost] = []

    for source_name, source_config in BLOG_SOURCES.items():
        fetched = (
            fetch_deepmind()
            if source_name == "deepmind"
            else fetch_rss_source(source_name, source_config)
        )
        collected.extend(fetched)

    # Sort by date (newest first)
    return sorted(collected, key=lambda post: post.published, reverse=True)
|
||||
|
||||
def _main() -> None:
    """CLI entry point: fetch everything and preview the five newest posts."""
    posts = fetch_all_blogs()
    print(f"Fetched {len(posts)} blog posts")
    for post in posts[:5]:
        print(f"\n[{post.source}] {post.title}")
        print(f"  {post.url}")


if __name__ == "__main__":
    _main()
||||
Reference in New Issue
Block a user