Some checks failed
Smoke Test / smoke (push) Has been cancelled
Merge PR #639
207 lines
7.0 KiB
Python
207 lines
7.0 KiB
Python
"""Tests for Know Thy Father — Phase 1: Media Indexing."""
|
|
|
|
import json
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from scripts.know_thy_father.index_media import (
|
|
MediaEntry,
|
|
IndexStats,
|
|
load_tweets,
|
|
load_media_manifest,
|
|
filter_target_tweets,
|
|
build_media_entries,
|
|
compute_stats,
|
|
generate_summary_report,
|
|
)
|
|
|
|
|
|
class TestFilterTargetTweets:
    """Expectations for ``filter_target_tweets`` hashtag selection."""

    def test_finds_timmytime(self):
        """A TimmyTime tweet is kept while an unrelated one is dropped."""
        tagged = {"tweet_id": "1", "hashtags": ["TimmyTime"], "full_text": "test"}
        unrelated = {"tweet_id": "2", "hashtags": ["other"], "full_text": "test"}

        kept = filter_target_tweets([tagged, unrelated])

        assert len(kept) == 1
        assert kept[0]["tweet_id"] == "1"

    def test_finds_timmychain(self):
        """A lone TimmyChain tweet is kept."""
        only_tweet = {"tweet_id": "1", "hashtags": ["TimmyChain"], "full_text": "test"}

        kept = filter_target_tweets([only_tweet])

        assert len(kept) == 1

    def test_case_insensitive(self):
        """All-lowercase and all-uppercase spellings are both expected to match."""
        lower = {"tweet_id": "1", "hashtags": ["timmytime"], "full_text": "test"}
        upper = {"tweet_id": "2", "hashtags": ["TIMMYCHAIN"], "full_text": "test"}

        kept = filter_target_tweets([lower, upper])

        assert len(kept) == 2

    def test_finds_both_hashtags(self):
        """A tweet carrying both target hashtags yields exactly one result."""
        double_tagged = {
            "tweet_id": "1",
            "hashtags": ["TimmyTime", "TimmyChain"],
            "full_text": "test",
        }

        kept = filter_target_tweets([double_tagged])

        assert len(kept) == 1

    def test_excludes_non_target(self):
        """Tweets with other hashtags, or none at all, are filtered out."""
        off_topic = {"tweet_id": "1", "hashtags": ["bitcoin"], "full_text": "test"}
        untagged = {"tweet_id": "2", "hashtags": [], "full_text": "test"}

        kept = filter_target_tweets([off_topic, untagged])

        assert len(kept) == 0
|
|
|
|
|
|
class TestBuildMediaEntries:
    """Expectations for ``build_media_entries`` given tweets and a manifest map."""

    @staticmethod
    def _tweet(urls):
        """Return the single target tweet used by every test, with the given urls."""
        return {
            "tweet_id": "100",
            "created_at": "2026-04-01",
            "full_text": "Test",
            "hashtags": ["TimmyTime"],
            "urls": urls,
        }

    @staticmethod
    def _photo():
        """Return a fresh manifest record for photo ``m1``."""
        return {
            "media_id": "m1",
            "media_type": "photo",
            "media_index": 1,
            "local_media_path": "/tmp/m1.jpg",
        }

    def test_maps_tweets_to_media(self):
        """A tweet with one manifest record produces one manifest-sourced entry."""
        tweets = [self._tweet([])]
        manifest = {"100": [self._photo()]}

        entries, without_media = build_media_entries(tweets, manifest)

        assert len(entries) == 1
        first = entries[0]
        assert first.tweet_id == "100"
        assert first.media_type == "photo"
        assert first.source == "media_manifest"
        assert len(without_media) == 0

    def test_handles_no_media(self):
        """With no manifest record and no urls, the tweet is reported as media-less."""
        tweets = [self._tweet([])]

        entries, without_media = build_media_entries(tweets, {})

        assert len(entries) == 0
        assert len(without_media) == 1

    def test_handles_url_only_tweets(self):
        """A tweet with only a url is expected to yield a URL reference entry."""
        tweets = [self._tweet(["https://example.com"])]

        entries, without_media = build_media_entries(tweets, {})

        # Should create a URL reference entry
        assert len(entries) == 1
        reference = entries[0]
        assert reference.media_type == "url_reference"
        assert reference.source == "tweets_only"

    def test_deduplicates_media(self):
        """Two identical manifest records for one tweet collapse to one entry."""
        tweets = [self._tweet([])]
        # Two separate but equal records: the second is the duplicate.
        manifest = {"100": [self._photo(), self._photo()]}

        entries, _ = build_media_entries(tweets, manifest)

        assert len(entries) == 1  # Deduplicated
|
|
|
|
|
|
class TestComputeStats:
    """Expectations for the counters produced by ``compute_stats``."""

    def test_computes_basic_stats(self):
        """Two target tweets, one of which has a media entry, out of 1000 scanned."""
        with_media = {"tweet_id": "100", "hashtags": ["TimmyTime"], "created_at": "2026-04-01"}
        bare = {"tweet_id": "101", "hashtags": ["TimmyChain"], "created_at": "2026-04-02"}
        photo_fields = {
            "tweet_id": "100",
            "created_at": "2026-04-01",
            "full_text": "",
            "hashtags": ["TimmyTime"],
            "media_id": "m1",
            "media_type": "photo",
            "media_index": 1,
            "local_media_path": "/tmp/m1.jpg",
        }

        stats = compute_stats(1000, [with_media, bare], [MediaEntry(**photo_fields)])

        assert stats.total_tweets_scanned == 1000
        assert stats.target_tweets_found == 2
        assert stats.target_tweets_with_media == 1
        assert stats.target_tweets_without_media == 1
        assert stats.total_media_entries == 1

    def test_counts_media_types(self):
        """One photo and one video on the same tweet are tallied per media type."""
        tweets = [{"tweet_id": "100", "hashtags": ["TimmyTime"], "created_at": ""}]
        # media_index runs 1, 2 in step with the (media_id, media_type) pairs.
        attachments = [
            MediaEntry(
                tweet_id="100",
                created_at="",
                full_text="",
                hashtags=[],
                media_id=media_id,
                media_type=kind,
                media_index=position,
                local_media_path="",
            )
            for position, (media_id, kind) in enumerate(
                [("m1", "photo"), ("m2", "video")], start=1
            )
        ]

        stats = compute_stats(100, tweets, attachments)

        assert stats.media_types["photo"] == 1
        assert stats.media_types["video"] == 1
|
|
|
|
|
|
class TestMediaEntry:
    """Expectations for the ``MediaEntry`` record type."""

    def test_to_dict(self):
        """``to_dict`` exposes the given fields plus an ``indexed_at`` key."""
        fields = {
            "tweet_id": "100",
            "created_at": "2026-04-01",
            "full_text": "Test",
            "hashtags": ["TimmyTime"],
            "media_id": "m1",
            "media_type": "photo",
            "media_index": 1,
            "local_media_path": "/tmp/m1.jpg",
        }

        as_dict = MediaEntry(**fields).to_dict()

        assert as_dict["tweet_id"] == "100"
        assert as_dict["media_type"] == "photo"
        # indexed_at is never passed in above, so the entry must supply it itself.
        assert "indexed_at" in as_dict
|
|
|
|
|
|
class TestGenerateSummaryReport:
    """Test report generation."""

    def test_generates_valid_markdown(self):
        """The report contains the title, the key counts and the category names.

        Builds an ``IndexStats`` with known values, renders it through
        ``generate_summary_report`` and checks that the expected fragments
        appear in the returned text.
        """
        # Local import keeps this block self-contained; the file's import
        # section does not bring in ``re``.
        import re

        stats = IndexStats(
            total_tweets_scanned=1000,
            target_tweets_found=100,
            target_tweets_with_media=80,
            target_tweets_without_media=20,
            total_media_entries=150,
            media_types={"photo": 100, "video": 50},
            hashtag_counts={"timmytime": 60, "timmychain": 40},
        )

        report = generate_summary_report(stats)

        assert "# Know Thy Father" in report
        assert "1000" in report
        # "100" is a substring of "1000", so a plain ``in`` check could never
        # fail once the assertion above passed. Require a 100 that is not part
        # of a longer run of digits instead.
        # NOTE(review): assumes the report prints the count as a bare "100".
        assert re.search(r"(?<!\d)100(?!\d)", report)
        assert "photo" in report
        assert "timmytime" in report
|