Files
the-nexus/intelligence/deepdive/tests/test_aggregator.py
Ezra ffae1b6285
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
[BURN] #830: Phase 1 tests (arXiv RSS aggregation)
2026-04-05 08:08:08 +00:00

65 lines
2.1 KiB
Python

#!/usr/bin/env python3
"""Tests for Phase 1: Source Aggregation"""
import asyncio
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path

import pytest

# Make the package root importable before pulling in local modules.
sys.path.insert(0, str(Path(__file__).parent.parent))
from pipeline import RSSAggregator, FeedItem
class TestRSSAggregator:
    """Test suite for Phase 1 RSS source aggregation.

    NOTE(review): the async tests hit the live arXiv RSS endpoints, so
    they require network access and can be slow or flaky offline —
    consider a `network` marker so they can be deselected in CI.
    """

    @pytest.fixture
    def aggregator(self, tmp_path):
        """Provide an RSSAggregator caching into a per-test temp dir."""
        return RSSAggregator(cache_dir=tmp_path)

    @pytest.mark.asyncio
    async def test_fetch_arxiv_cs_ai(self, aggregator):
        """Test fetching real arXiv cs.AI feed."""
        items = await aggregator.fetch_feed(
            url="http://export.arxiv.org/rss/cs.AI",
            name="test_arxiv",
            max_items=5,
        )
        assert len(items) > 0, "Should fetch items from arXiv"
        assert all(isinstance(i, FeedItem) for i in items)
        assert all(i.title for i in items), "Every item needs a non-empty title"
        assert all(i.url.startswith("http") for i in items), "Item URLs must be absolute"
        print(f"Fetched {len(items)} items from arXiv cs.AI")

    @pytest.mark.asyncio
    async def test_fetch_all_sources(self, aggregator):
        """Test fetching from multiple sources with dedup across feeds."""
        sources = [
            {"name": "arxiv_ai", "url": "http://export.arxiv.org/rss/cs.AI", "max_items": 3},
            {"name": "arxiv_cl", "url": "http://export.arxiv.org/rss/cs.CL", "max_items": 3},
        ]
        # datetime.utcnow() is deprecated since Python 3.12; build the same
        # naive-UTC value from an aware "now" so comparisons against the
        # pipeline's timestamps (presumably naive UTC — confirm) keep working.
        since = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(hours=48)
        items = await aggregator.fetch_all(sources, since=since)
        assert len(items) > 0
        # Deduplication check: every content hash must be unique.
        hashes = [i.content_hash for i in items]
        assert len(hashes) == len(set(hashes)), "Should deduplicate items"

    def test_content_hash_consistency(self):
        """Test that identical content produces identical hashes."""
        agg = RSSAggregator()
        h1 = agg._compute_hash("Test content")
        h2 = agg._compute_hash("Test content")
        h3 = agg._compute_hash("Different content")
        assert h1 == h2, "Hashing must be deterministic"
        assert h1 != h3, "Different content must hash differently"
if __name__ == "__main__":
    # pytest.main returns an exit code; propagate it so a direct run
    # (and any CI invoking this script) reports failures via the
    # process status instead of always exiting 0.
    sys.exit(pytest.main([__file__, "-v"]))