#!/usr/bin/env python3
"""
AI Lab Blog Aggregator

Scrapes RSS feeds (and scrapes pages, where no feed exists) from major AI labs.
"""

from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional
from urllib.parse import urljoin

import feedparser
import requests
from bs4 import BeautifulSoup


@dataclass
class BlogPost:
    """A single article fetched from one of the configured AI lab blogs."""

    title: str  # headline as published by the source
    source: str  # "openai", "anthropic", "deepmind", etc.
    url: str  # canonical link to the full article
    published: datetime  # publication time; fetchers fall back to datetime.now()
    summary: str  # short excerpt; fetchers truncate to 500 chars
    content: Optional[str] = None  # full article body, if/when fetched separately
# Per-source fetch configuration:
#   "rss"          -> feed URL consumed by fetch_rss_source
#   "fallback_url" -> human-browsable page (not read by any code in this file)
#   "url"          -> page to scrape for sources without a usable feed
#   "selector"     -> CSS selector for scraped sources
#     NOTE(review): fetch_deepmind currently hardcodes "article" rather than
#     reading this "selector" value.
BLOG_SOURCES = {
    "openai": {
        "rss": "https://openai.com/blog/rss.xml",
        "fallback_url": "https://openai.com/blog/",
    },
    "anthropic": {
        "rss": "https://www.anthropic.com/rss.xml",
        "fallback_url": "https://www.anthropic.com/news",
    },
    "deepmind": {
        # DeepMind doesn't have a clean RSS, requires scraping
        "url": "https://deepmind.google/research/highlighted/",
        "selector": "article",
    }
}
def fetch_rss_source(name: str, config: dict) -> List[BlogPost]:
    """Fetch recent posts from a source's RSS/Atom feed.

    Args:
        name: Source key (e.g. "openai"); stamped onto each BlogPost.
        config: Source configuration; only the "rss" key is used here.

    Returns:
        Up to 10 BlogPost objects (newest entries first, as ordered by the
        feed), or an empty list when no "rss" URL is configured.
    """
    url = config.get("rss")
    if not url:
        return []

    feed = feedparser.parse(url)
    posts = []

    for entry in feed.entries[:10]:  # limit to the 10 most recent entries
        # Prefer feedparser's pre-parsed timestamp: it understands many
        # date formats, whereas a single strptime pattern does not.
        parsed = entry.get("published_parsed")
        if parsed:
            published = datetime(*parsed[:6])
        else:
            try:
                published = datetime.strptime(
                    entry.published, "%a, %d %b %Y %H:%M:%S %Z"
                )
            except (AttributeError, ValueError):
                # Entry has no date, or an unparseable one; fall back to
                # "now" so sorting downstream still works. (Narrowed from a
                # bare except, which also swallowed KeyboardInterrupt.)
                published = datetime.now()

        posts.append(BlogPost(
            title=entry.title,
            source=name,
            url=entry.link,
            published=published,
            summary=entry.get("summary", "")[:500]  # cap excerpt length
        ))

    return posts
def fetch_deepmind() -> List[BlogPost]:
    """Specialized scraper for DeepMind, which exposes no usable RSS feed.

    Returns:
        Up to 10 BlogPost objects scraped from the highlights page, or an
        empty list if the page cannot be fetched or parsed (best-effort).
    """
    config = BLOG_SOURCES["deepmind"]
    url = config["url"]
    try:
        resp = requests.get(url, timeout=30)
        # Fail fast on HTTP errors instead of scraping an error page.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        posts = []

        # Honor the configured selector (was hardcoded to "article").
        for article in soup.select(config.get("selector", "article"))[:10]:
            title_elem = article.select_one("h3, h2")
            link_elem = article.select_one("a")

            if title_elem and link_elem:
                posts.append(BlogPost(
                    title=title_elem.get_text(strip=True),
                    source="deepmind",
                    # urljoin handles both relative and absolute hrefs;
                    # naive f-string concatenation mangled absolute URLs.
                    url=urljoin("https://deepmind.google", link_elem["href"]),
                    published=datetime.now(),  # DeepMind doesn't expose dates easily
                    summary=""
                ))

        return posts
    except Exception as e:
        # Best-effort boundary: a scraping failure for one source should
        # not abort the whole aggregation run.
        print(f"DeepMind fetch error: {e}")
        return []
def fetch_all_blogs() -> List[BlogPost]:
    """Collect posts from every configured source, newest first."""
    collected: List[BlogPost] = []

    for source_name, source_cfg in BLOG_SOURCES.items():
        # DeepMind has no feed, so it routes through the dedicated scraper.
        fetched = (
            fetch_deepmind()
            if source_name == "deepmind"
            else fetch_rss_source(source_name, source_cfg)
        )
        collected.extend(fetched)

    # Newest first across all sources.
    return sorted(collected, key=lambda post: post.published, reverse=True)
if __name__ == "__main__":
    # Smoke run: pull everything and preview the five most recent posts.
    all_fetched = fetch_all_blogs()
    print(f"Fetched {len(all_fetched)} blog posts")
    for item in all_fetched[:5]:
        print(f"\n[{item.source}] {item.title}")
        print(f"  {item.url}")
|