#!/usr/bin/env python3
"""
AI Lab Blog Aggregator
Scrapes RSS/feeds from major AI labs
"""

import feedparser
import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional
from urllib.parse import urljoin


@dataclass
class BlogPost:
    """One post fetched from a configured source."""
    title: str
    source: str  # "openai", "anthropic", "deepmind", etc.
    url: str
    published: datetime
    summary: str
    content: Optional[str] = None


BLOG_SOURCES = {
    "openai": {
        "rss": "https://openai.com/blog/rss.xml",
        "fallback_url": "https://openai.com/blog/",
    },
    "anthropic": {
        "rss": "https://www.anthropic.com/rss.xml",
        "fallback_url": "https://www.anthropic.com/news",
    },
    "deepmind": {
        # DeepMind doesn't have a clean RSS, requires scraping
        "url": "https://deepmind.google/research/highlighted/",
        "selector": "article",
    },
}


def _parse_entry_date(entry) -> datetime:
    """Best-effort parse of a feed entry's publication date.

    Prefers feedparser's pre-parsed ``published_parsed`` struct_time,
    which handles far more date formats (and timezone names) than
    ``strptime`` with ``%Z`` does. Falls back to parsing the raw
    ``published`` string, and finally to "now" so sorting still works.
    """
    parsed = entry.get("published_parsed")
    if parsed is not None:
        return datetime(*parsed[:6])
    try:
        return datetime.strptime(
            entry.get("published", ""), "%a, %d %b %Y %H:%M:%S %Z"
        )
    except ValueError:
        # Unknown or missing date format — use fetch time so the
        # post still sorts into the aggregate list.
        return datetime.now()


def fetch_rss_source(name: str, config: dict) -> List[BlogPost]:
    """Fetch posts from an RSS feed.

    Returns an empty list when the source has no "rss" URL configured.
    Limits output to the 10 most recent entries.
    """
    url = config.get("rss")
    if not url:
        return []

    feed = feedparser.parse(url)
    posts = []

    for entry in feed.entries[:10]:  # Limit to recent 10
        posts.append(BlogPost(
            # .get() rather than attribute access: entries from a
            # malformed feed may lack title/link entirely.
            title=entry.get("title", ""),
            source=name,
            url=entry.get("link", ""),
            published=_parse_entry_date(entry),
            summary=entry.get("summary", "")[:500],
        ))

    return posts


def fetch_deepmind() -> List[BlogPost]:
    """Specialized scraper for DeepMind (no RSS).

    Returns an empty list on any network/parse failure rather than
    propagating, so one broken source doesn't abort the aggregate run.
    """
    config = BLOG_SOURCES["deepmind"]
    url = config["url"]
    try:
        resp = requests.get(url, timeout=30)
        # Fail fast on HTTP errors instead of scraping an error page.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        posts = []

        for article in soup.select(config.get("selector", "article"))[:10]:
            title_elem = article.select_one("h3, h2")
            link_elem = article.select_one("a")

            if title_elem and link_elem and link_elem.get("href"):
                posts.append(BlogPost(
                    title=title_elem.get_text(strip=True),
                    source="deepmind",
                    # urljoin handles both site-relative and
                    # already-absolute hrefs correctly.
                    url=urljoin("https://deepmind.google/", link_elem["href"]),
                    published=datetime.now(),  # DeepMind doesn't expose dates easily
                    summary="",
                ))

        return posts
    except Exception as e:
        # Deliberate best-effort boundary: log and degrade to empty.
        print(f"DeepMind fetch error: {e}")
        return []


def fetch_all_blogs() -> List[BlogPost]:
    """Fetch from all configured blog sources, newest first."""
    all_posts: List[BlogPost] = []

    for name, config in BLOG_SOURCES.items():
        if name == "deepmind":
            posts = fetch_deepmind()
        else:
            posts = fetch_rss_source(name, config)
        all_posts.extend(posts)

    # Sort by date (newest first)
    all_posts.sort(key=lambda x: x.published, reverse=True)
    return all_posts


if __name__ == "__main__":
    posts = fetch_all_blogs()
    print(f"Fetched {len(posts)} blog posts")
    for post in posts[:5]:
        print(f"\n[{post.source}] {post.title}")
        print(f" {post.url}")