#!/usr/bin/env python3
"""
AI Lab Blog Aggregator

Scrapes RSS feeds (and scrapes pages, where no feed exists) from major AI labs.
"""

from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional
from urllib.parse import urljoin

import feedparser
import requests
from bs4 import BeautifulSoup


@dataclass
class BlogPost:
    """A single article fetched from one of the configured AI lab blogs."""

    title: str  # headline as published by the source
    source: str  # "openai", "anthropic", "deepmind", etc.
    url: str  # canonical link to the full article
    published: datetime  # publication time; fetchers fall back to datetime.now()
    summary: str  # short excerpt; fetchers truncate to 500 chars
    content: Optional[str] = None  # full article body, if/when fetched separately
# Per-source fetch configuration:
#   "rss"          -> feed URL consumed by fetch_rss_source
#   "fallback_url" -> human-browsable page (not read by any code in this file)
#   "url"          -> page to scrape for sources without a usable feed
#   "selector"     -> CSS selector for scraped sources
#     NOTE(review): fetch_deepmind currently hardcodes "article" rather than
#     reading this "selector" value.
BLOG_SOURCES = {
    "openai": {
        "rss": "https://openai.com/blog/rss.xml",
        "fallback_url": "https://openai.com/blog/",
    },
    "anthropic": {
        "rss": "https://www.anthropic.com/rss.xml",
        "fallback_url": "https://www.anthropic.com/news",
    },
    "deepmind": {
        # DeepMind doesn't have a clean RSS, requires scraping
        "url": "https://deepmind.google/research/highlighted/",
        "selector": "article",
    }
}
def fetch_rss_source(name: str, config: dict) -> List[BlogPost]:
    """Fetch recent posts from a source's RSS/Atom feed.

    Args:
        name: Source key (e.g. "openai"); stamped onto each BlogPost.
        config: Source configuration; only the "rss" key is used here.

    Returns:
        Up to 10 BlogPost objects (newest entries first, as ordered by the
        feed), or an empty list when no "rss" URL is configured.
    """
    url = config.get("rss")
    if not url:
        return []

    feed = feedparser.parse(url)
    posts = []

    for entry in feed.entries[:10]:  # limit to the 10 most recent entries
        # Prefer feedparser's pre-parsed timestamp: it understands many
        # date formats, whereas a single strptime pattern does not.
        parsed = entry.get("published_parsed")
        if parsed:
            published = datetime(*parsed[:6])
        else:
            try:
                published = datetime.strptime(
                    entry.published, "%a, %d %b %Y %H:%M:%S %Z"
                )
            except (AttributeError, ValueError):
                # Entry has no date, or an unparseable one; fall back to
                # "now" so sorting downstream still works. (Narrowed from a
                # bare except, which also swallowed KeyboardInterrupt.)
                published = datetime.now()

        posts.append(BlogPost(
            title=entry.title,
            source=name,
            url=entry.link,
            published=published,
            summary=entry.get("summary", "")[:500]  # cap excerpt length
        ))

    return posts
def fetch_deepmind() -> List[BlogPost]:
    """Specialized scraper for DeepMind, which exposes no usable RSS feed.

    Returns:
        Up to 10 BlogPost objects scraped from the highlights page, or an
        empty list if the page cannot be fetched or parsed (best-effort).
    """
    config = BLOG_SOURCES["deepmind"]
    url = config["url"]
    try:
        resp = requests.get(url, timeout=30)
        # Fail fast on HTTP errors instead of scraping an error page.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        posts = []

        # Honor the configured selector (was hardcoded to "article").
        for article in soup.select(config.get("selector", "article"))[:10]:
            title_elem = article.select_one("h3, h2")
            link_elem = article.select_one("a")

            if title_elem and link_elem:
                posts.append(BlogPost(
                    title=title_elem.get_text(strip=True),
                    source="deepmind",
                    # urljoin handles both relative and absolute hrefs;
                    # naive f-string concatenation mangled absolute URLs.
                    url=urljoin("https://deepmind.google", link_elem["href"]),
                    published=datetime.now(),  # DeepMind doesn't expose dates easily
                    summary=""
                ))

        return posts
    except Exception as e:
        # Best-effort boundary: a scraping failure for one source should
        # not abort the whole aggregation run.
        print(f"DeepMind fetch error: {e}")
        return []
def fetch_all_blogs() -> List[BlogPost]:
    """Collect posts from every configured source, newest first."""
    collected: List[BlogPost] = []

    for source_name, source_cfg in BLOG_SOURCES.items():
        # DeepMind has no feed, so it routes through the dedicated scraper.
        fetched = (
            fetch_deepmind()
            if source_name == "deepmind"
            else fetch_rss_source(source_name, source_cfg)
        )
        collected.extend(fetched)

    # Newest first across all sources.
    return sorted(collected, key=lambda post: post.published, reverse=True)
if __name__ == "__main__":
    # Smoke run: pull everything and preview the five most recent posts.
    all_fetched = fetch_all_blogs()
    print(f"Fetched {len(all_fetched)} blog posts")
    for item in all_fetched[:5]:
        print(f"\n[{item.source}] {item.title}")
        print(f"  {item.url}")
|