#!/usr/bin/env python3
"""Tests for Phase 1: Source Aggregation.

Exercises the RSSAggregator against live arXiv RSS feeds (network required)
and checks hash-based deduplication behavior.
"""

import asyncio
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path

import pytest

# Make the project root importable so `pipeline` resolves when this file
# is run directly from the tests directory.
sys.path.insert(0, str(Path(__file__).parent.parent))

from pipeline import RSSAggregator, FeedItem


class TestRSSAggregator:
    """Test suite for RSS aggregation."""

    @pytest.fixture
    def aggregator(self, tmp_path):
        """Fresh aggregator whose cache lives in a per-test temp dir."""
        return RSSAggregator(cache_dir=tmp_path)

    @pytest.mark.asyncio
    async def test_fetch_arxiv_cs_ai(self, aggregator):
        """Test fetching real arXiv cs.AI feed."""
        items = await aggregator.fetch_feed(
            url="http://export.arxiv.org/rss/cs.AI",
            name="test_arxiv",
            max_items=5,
        )
        assert len(items) > 0, "Should fetch items from arXiv"
        assert all(isinstance(i, FeedItem) for i in items)
        assert all(i.title for i in items)
        assert all(i.url.startswith("http") for i in items)
        print(f"Fetched {len(items)} items from arXiv cs.AI")

    @pytest.mark.asyncio
    async def test_fetch_all_sources(self, aggregator):
        """Test fetching from multiple sources."""
        sources = [
            {"name": "arxiv_ai", "url": "http://export.arxiv.org/rss/cs.AI", "max_items": 3},
            {"name": "arxiv_cl", "url": "http://export.arxiv.org/rss/cs.CL", "max_items": 3},
        ]
        # datetime.utcnow() is deprecated (Python 3.12); use an aware "now"
        # and strip tzinfo so `since` stays naive, matching prior behavior.
        since = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(hours=48)
        items = await aggregator.fetch_all(sources, since=since)
        assert len(items) > 0
        # Check deduplication: every content hash must be unique.
        hashes = [i.content_hash for i in items]
        assert len(hashes) == len(set(hashes)), "Should deduplicate items"

    def test_content_hash_consistency(self):
        """Test that identical content produces identical hashes."""
        agg = RSSAggregator()
        h1 = agg._compute_hash("Test content")
        h2 = agg._compute_hash("Test content")
        h3 = agg._compute_hash("Different content")
        assert h1 == h2
        assert h1 != h3


if __name__ == "__main__":
    pytest.main([__file__, "-v"])