[DEEP-DIVE] Scaffold component — #830
Some checks failed:
Deploy Nexus / deploy (push) — has been cancelled

This commit is contained in:
2026-04-05 07:42:25 +00:00
parent 3014d83462
commit db262ec764

View File

@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""
AI Lab Blog Aggregator
Scrapes RSS/feeds from major AI labs
"""
from dataclasses import dataclass
from datetime import datetime
from email.utils import parsedate_to_datetime
from typing import List, Optional

import feedparser
import requests
from bs4 import BeautifulSoup
@dataclass
class BlogPost:
    """A single blog post collected from one of the configured AI-lab sources."""

    title: str
    source: str  # source key, e.g. "openai", "anthropic", "deepmind"
    url: str
    published: datetime  # publication timestamp (falls back to fetch time upstream)
    summary: str
    content: Optional[str] = None  # full article body, if/when fetched
# Registry of blog sources. RSS-backed sources carry an "rss" URL (plus a
# human-readable "fallback_url"); scrape-only sources carry "url"/"selector"
# and are dispatched to a specialized fetcher in fetch_all_blogs().
BLOG_SOURCES = {
    "openai": {
        "rss": "https://openai.com/blog/rss.xml",
        "fallback_url": "https://openai.com/blog/",
    },
    "anthropic": {
        "rss": "https://www.anthropic.com/rss.xml",
        "fallback_url": "https://www.anthropic.com/news",
    },
    "deepmind": {
        # DeepMind doesn't have a clean RSS, requires scraping
        "url": "https://deepmind.google/research/highlighted/",
        "selector": "article",
    }
}
def _parse_published(entry) -> datetime:
    """Best-effort publication-date parse for a feedparser entry.

    Returns a *naive* datetime so results stay comparable with the naive
    datetimes produced elsewhere in this module (e.g. fetch_deepmind's
    datetime.now()); falls back to the current time when no date parses.
    """
    # feedparser already parses most date formats into a time.struct_time.
    parsed = entry.get("published_parsed")
    if parsed:
        return datetime(*parsed[:6])
    raw = entry.get("published")
    if raw:
        try:
            # RFC 2822 dates ("Mon, 02 Jan 2006 15:04:05 GMT"); strptime with
            # %Z only recognizes a handful of zone names, so use the stdlib
            # email parser instead, then drop tzinfo to stay naive.
            return parsedate_to_datetime(raw).replace(tzinfo=None)
        except (TypeError, ValueError):
            pass
    return datetime.now()


def fetch_rss_source(name: str, config: dict) -> List[BlogPost]:
    """Fetch recent posts from an RSS feed.

    Args:
        name: Source key (e.g. "openai") recorded on each BlogPost.
        config: Source configuration; only the "rss" key is used here.

    Returns:
        Up to 10 BlogPost entries (newest as ordered by the feed), or []
        when the source has no RSS URL configured.
    """
    url = config.get("rss")
    if not url:
        return []
    feed = feedparser.parse(url)
    posts = []
    for entry in feed.entries[:10]:  # limit to recent 10
        posts.append(BlogPost(
            # .get() instead of attribute access: feedparser entries may
            # lack any of these fields depending on the feed.
            title=entry.get("title", ""),
            source=name,
            url=entry.get("link", ""),
            published=_parse_published(entry),
            summary=entry.get("summary", "")[:500],
        ))
    return posts
def fetch_deepmind() -> List[BlogPost]:
    """Specialized scraper for DeepMind (no RSS feed available).

    Returns:
        Up to 10 BlogPost entries scraped from the research highlights page,
        or [] on any network/parse failure — this is a best-effort source
        boundary: errors are logged, never raised.
    """
    url = BLOG_SOURCES["deepmind"]["url"]
    try:
        resp = requests.get(url, timeout=30)
        # Fail fast on HTTP errors instead of silently parsing an error page.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        posts = []
        for article in soup.select("article")[:10]:
            title_elem = article.select_one("h3, h2")
            link_elem = article.select_one("a")
            # .get() avoids a KeyError on anchors without an href attribute.
            href = link_elem.get("href") if link_elem else None
            if not (title_elem and href):
                continue
            # Links may already be absolute; only prefix site-relative ones.
            if not href.startswith("http"):
                href = f"https://deepmind.google{href}"
            posts.append(BlogPost(
                title=title_elem.get_text(strip=True),
                source="deepmind",
                url=href,
                published=datetime.now(),  # DeepMind doesn't expose dates easily
                summary="",
            ))
        return posts
    except Exception as e:
        # Broad catch is deliberate here: one failed source must not break
        # the whole aggregator run.
        print(f"DeepMind fetch error: {e}")
        return []
def fetch_all_blogs() -> List[BlogPost]:
    """Fetch from all configured blog sources, newest posts first."""
    collected: List[BlogPost] = []
    for source_name, source_cfg in BLOG_SOURCES.items():
        # DeepMind has no RSS feed, so it gets its dedicated scraper.
        fetched = (
            fetch_deepmind()
            if source_name == "deepmind"
            else fetch_rss_source(source_name, source_cfg)
        )
        collected.extend(fetched)
    # Sort by date (newest first)
    return sorted(collected, key=lambda post: post.published, reverse=True)
if __name__ == "__main__":
    # Quick smoke run: fetch everything and preview the five newest posts.
    fetched = fetch_all_blogs()
    print(f"Fetched {len(fetched)} blog posts")
    for recent in fetched[:5]:
        print(f"\n[{recent.source}] {recent.title}")
        print(f"  {recent.url}")