Files
timmy-home/tests/test_know_thy_father_index.py
Alexander Whitestone 34b74d81dc
Some checks failed
Smoke Test / smoke (push) Has been cancelled
feat(know-thy-father): Phase 1 media indexing (#639)
Merge PR #639
2026-04-14 22:14:17 +00:00

207 lines
7.0 KiB
Python

"""Tests for Know Thy Father — Phase 1: Media Indexing."""
import json
import tempfile
from pathlib import Path
import pytest
from scripts.know_thy_father.index_media import (
MediaEntry,
IndexStats,
load_tweets,
load_media_manifest,
filter_target_tweets,
build_media_entries,
compute_stats,
generate_summary_report,
)
class TestFilterTargetTweets:
    """Check which tweets filter_target_tweets keeps, based on their hashtags."""

    @staticmethod
    def _tweet(tweet_id, hashtags):
        """Build a minimal tweet dict carrying the given id and hashtags."""
        return {"tweet_id": tweet_id, "hashtags": hashtags, "full_text": "test"}

    def test_finds_timmytime(self):
        """Of a TimmyTime tweet and an unrelated one, only the former is kept."""
        candidates = [
            self._tweet("1", ["TimmyTime"]),
            self._tweet("2", ["other"]),
        ]
        kept = filter_target_tweets(candidates)
        assert len(kept) == 1
        assert kept[0]["tweet_id"] == "1"

    def test_finds_timmychain(self):
        """A tweet tagged TimmyChain is kept."""
        candidates = [self._tweet("1", ["TimmyChain"])]
        kept = filter_target_tweets(candidates)
        assert len(kept) == 1

    def test_case_insensitive(self):
        """All-lowercase and all-uppercase spellings of the tags both match."""
        candidates = [
            self._tweet("1", ["timmytime"]),
            self._tweet("2", ["TIMMYCHAIN"]),
        ]
        kept = filter_target_tweets(candidates)
        assert len(kept) == 2

    def test_finds_both_hashtags(self):
        """A tweet carrying both target tags appears once in the result."""
        candidates = [self._tweet("1", ["TimmyTime", "TimmyChain"])]
        kept = filter_target_tweets(candidates)
        assert len(kept) == 1

    def test_excludes_non_target(self):
        """Tweets with an unrelated tag or with no tags are dropped."""
        candidates = [
            self._tweet("1", ["bitcoin"]),
            self._tweet("2", []),
        ]
        kept = filter_target_tweets(candidates)
        assert len(kept) == 0
class TestBuildMediaEntries:
    """Check the entries build_media_entries produces from tweets and a manifest."""

    @staticmethod
    def _target_tweet(urls=()):
        """Return the single target tweet used by every test, with given urls."""
        return {
            "tweet_id": "100",
            "created_at": "2026-04-01",
            "full_text": "Test",
            "hashtags": ["TimmyTime"],
            "urls": list(urls),
        }

    @staticmethod
    def _photo_record():
        """Return a fresh manifest record describing photo ``m1``."""
        return {
            "media_id": "m1",
            "media_type": "photo",
            "media_index": 1,
            "local_media_path": "/tmp/m1.jpg",
        }

    def test_maps_tweets_to_media(self):
        """A tweet with one manifest record yields one manifest-sourced entry."""
        manifest = {"100": [self._photo_record()]}
        entries, without_media = build_media_entries(
            [self._target_tweet()], manifest
        )
        assert len(entries) == 1
        first = entries[0]
        assert first.tweet_id == "100"
        assert first.media_type == "photo"
        assert first.source == "media_manifest"
        assert len(without_media) == 0

    def test_handles_no_media(self):
        """A tweet with no manifest record and no urls lands in without_media."""
        manifest = {}
        entries, without_media = build_media_entries(
            [self._target_tweet()], manifest
        )
        assert len(entries) == 0
        assert len(without_media) == 1

    def test_handles_url_only_tweets(self):
        """A tweet with a url but no manifest record yields a URL reference."""
        manifest = {}
        entries, without_media = build_media_entries(
            [self._target_tweet(urls=["https://example.com"])], manifest
        )
        # Should create a URL reference entry
        assert len(entries) == 1
        assert entries[0].media_type == "url_reference"
        assert entries[0].source == "tweets_only"

    def test_deduplicates_media(self):
        """Two identical manifest records for one tweet yield a single entry."""
        # Two equal-valued but distinct record objects, as in a duplicated row.
        manifest = {"100": [self._photo_record(), self._photo_record()]}
        entries, _ = build_media_entries([self._target_tweet()], manifest)
        assert len(entries) == 1  # Deduplicated
class TestComputeStats:
    """Check the counters that compute_stats reports."""

    def test_computes_basic_stats(self):
        """Two target tweets with one media entry give a 1/1 with/without split."""
        target_tweets = [
            {"tweet_id": "100", "hashtags": ["TimmyTime"], "created_at": "2026-04-01"},
            {"tweet_id": "101", "hashtags": ["TimmyChain"], "created_at": "2026-04-02"},
        ]
        # Only tweet "100" has media; tweet "101" has none.
        photo_entry = MediaEntry(
            tweet_id="100",
            created_at="2026-04-01",
            full_text="",
            hashtags=["TimmyTime"],
            media_id="m1",
            media_type="photo",
            media_index=1,
            local_media_path="/tmp/m1.jpg",
        )

        stats = compute_stats(1000, target_tweets, [photo_entry])

        assert stats.total_tweets_scanned == 1000
        assert stats.target_tweets_found == 2
        assert stats.target_tweets_with_media == 1
        assert stats.target_tweets_without_media == 1
        assert stats.total_media_entries == 1

    def test_counts_media_types(self):
        """One photo and one video entry are each counted once by type."""
        target_tweets = [
            {"tweet_id": "100", "hashtags": ["TimmyTime"], "created_at": ""},
        ]
        media_entries = [
            MediaEntry(
                tweet_id="100",
                created_at="",
                full_text="",
                hashtags=[],
                media_id=media_id,
                media_type=media_type,
                media_index=position,
                local_media_path="",
            )
            for position, (media_id, media_type) in enumerate(
                [("m1", "photo"), ("m2", "video")], start=1
            )
        ]

        stats = compute_stats(100, target_tweets, media_entries)

        assert stats.media_types["photo"] == 1
        assert stats.media_types["video"] == 1
class TestMediaEntry:
    """Check MediaEntry serialization."""

    def test_to_dict(self):
        """to_dict exposes the constructor fields plus an ``indexed_at`` key."""
        fields = {
            "tweet_id": "100",
            "created_at": "2026-04-01",
            "full_text": "Test",
            "hashtags": ["TimmyTime"],
            "media_id": "m1",
            "media_type": "photo",
            "media_index": 1,
            "local_media_path": "/tmp/m1.jpg",
        }
        serialized = MediaEntry(**fields).to_dict()
        assert serialized["tweet_id"] == "100"
        assert serialized["media_type"] == "photo"
        assert "indexed_at" in serialized
class TestGenerateSummaryReport:
    """Check the text returned by generate_summary_report."""

    def test_generates_valid_markdown(self):
        """The report carries the title, key counts, a media type and a hashtag.

        The fixture numbers are chosen so that the target-tweet count (125)
        is not a substring of any other number in the stats. With the earlier
        values, ``"100" in report`` was satisfied by "1000" and by the photo
        count alone, so it could not detect a missing target-tweet count.
        """
        stats = IndexStats(
            total_tweets_scanned=1000,
            target_tweets_found=125,
            target_tweets_with_media=80,
            target_tweets_without_media=45,
            total_media_entries=150,
            media_types={"photo": 100, "video": 50},
            hashtag_counts={"timmytime": 75, "timmychain": 50},
        )
        report = generate_summary_report(stats)
        assert "# Know Thy Father" in report
        assert "1000" in report
        # NOTE(review): assumes the report renders target_tweets_found as
        # plain digits, as the original "100" check intended; confirm against
        # generate_summary_report.
        assert "125" in report
        assert "photo" in report
        assert "timmytime" in report