[DEEP-DIVE] Scaffold component — #830
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
This commit is contained in:
112
scaffold/deep-dive/aggregator/blog_fetcher.py
Normal file
112
scaffold/deep-dive/aggregator/blog_fetcher.py
Normal file
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AI Lab Blog Aggregator
|
||||
Scrapes RSS/feeds from major AI labs
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import List, Optional
from urllib.parse import urljoin

import feedparser
import requests
from bs4 import BeautifulSoup
|
||||
|
||||
@dataclass
class BlogPost:
    """A single blog post fetched from one of the configured AI-lab sources."""

    # Post headline as published by the source.
    title: str
    source: str  # "openai", "anthropic", "deepmind", etc.
    # Canonical URL of the post.
    url: str
    # Publication timestamp; fetchers fall back to the current time when
    # the source does not expose a parseable date.
    published: datetime
    # Short excerpt (fetchers truncate to 500 chars); may be empty.
    summary: str
    # Full article body; no fetcher in this module populates it yet.
    content: Optional[str] = None
|
||||
|
||||
# Per-source feed configuration.  Entries with an "rss" key are consumed by
# fetch_rss_source(); "deepmind" has no clean RSS feed, so fetch_all_blogs()
# routes it to the dedicated fetch_deepmind() scraper instead.
BLOG_SOURCES = {
    "openai": {
        "rss": "https://openai.com/blog/rss.xml",
        # NOTE(review): "fallback_url" is not read by any visible fetcher —
        # presumably reserved for a future HTML-scraping fallback; confirm.
        "fallback_url": "https://openai.com/blog/",
    },
    "anthropic": {
        "rss": "https://www.anthropic.com/rss.xml",
        "fallback_url": "https://www.anthropic.com/news",
    },
    "deepmind": {
        # DeepMind doesn't have a clean RSS, requires scraping
        "url": "https://deepmind.google/research/highlighted/",
        "selector": "article",
    }
}
|
||||
|
||||
def fetch_rss_source(name: str, config: dict) -> List[BlogPost]:
|
||||
"""Fetch posts from an RSS feed."""
|
||||
url = config.get("rss")
|
||||
if not url:
|
||||
return []
|
||||
|
||||
feed = feedparser.parse(url)
|
||||
posts = []
|
||||
|
||||
for entry in feed.entries[:10]: # Limit to recent 10
|
||||
try:
|
||||
published = datetime.strptime(
|
||||
entry.published, "%a, %d %b %Y %H:%M:%S %Z"
|
||||
)
|
||||
except:
|
||||
published = datetime.now()
|
||||
|
||||
posts.append(BlogPost(
|
||||
title=entry.title,
|
||||
source=name,
|
||||
url=entry.link,
|
||||
published=published,
|
||||
summary=entry.get("summary", "")[:500]
|
||||
))
|
||||
|
||||
return posts
|
||||
|
||||
def fetch_deepmind() -> List[BlogPost]:
|
||||
"""Specialized scraper for DeepMind (no RSS)."""
|
||||
url = BLOG_SOURCES["deepmind"]["url"]
|
||||
try:
|
||||
resp = requests.get(url, timeout=30)
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
posts = []
|
||||
|
||||
for article in soup.select("article")[:10]:
|
||||
title_elem = article.select_one("h3, h2")
|
||||
link_elem = article.select_one("a")
|
||||
|
||||
if title_elem and link_elem:
|
||||
posts.append(BlogPost(
|
||||
title=title_elem.get_text(strip=True),
|
||||
source="deepmind",
|
||||
url=f"https://deepmind.google{link_elem['href']}",
|
||||
published=datetime.now(), # DeepMind doesn't expose dates easily
|
||||
summary=""
|
||||
))
|
||||
|
||||
return posts
|
||||
except Exception as e:
|
||||
print(f"DeepMind fetch error: {e}")
|
||||
return []
|
||||
|
||||
def fetch_all_blogs() -> List[BlogPost]:
    """Gather posts from every configured source, newest first.

    "deepmind" is routed to its dedicated scraper; every other entry in
    BLOG_SOURCES is treated as an RSS feed.
    """
    collected: List[BlogPost] = []

    for source_name, source_config in BLOG_SOURCES.items():
        fetched = (
            fetch_deepmind()
            if source_name == "deepmind"
            else fetch_rss_source(source_name, source_config)
        )
        collected.extend(fetched)

    # Sort by date (newest first)
    return sorted(collected, key=lambda post: post.published, reverse=True)
|
||||
|
||||
def _main() -> None:
    """CLI entry point: fetch everything and preview the five newest posts."""
    posts = fetch_all_blogs()
    print(f"Fetched {len(posts)} blog posts")
    for post in posts[:5]:
        print(f"\n[{post.source}] {post.title}")
        print(f"  {post.url}")


if __name__ == "__main__":
    _main()
||||
Reference in New Issue
Block a user