# timmy-home/tests/test_know_thy_father_index.py

"""Tests for Know Thy Father — Phase 1: Media Indexing."""

import json
import tempfile
from pathlib import Path

import pytest

from scripts.know_thy_father.index_media import (
    MediaEntry,
    IndexStats,
    load_tweets,
    load_media_manifest,
    filter_target_tweets,
    build_media_entries,
    compute_stats,
    generate_summary_report,
)


class TestFilterTargetTweets:
    """Exercise ``filter_target_tweets`` against tweets with various hashtags."""

    @staticmethod
    def _tweet(tweet_id, *hashtags):
        """Return a minimal tweet dict with the given id and hashtag list."""
        return {"tweet_id": tweet_id, "hashtags": list(hashtags), "full_text": "test"}

    def test_finds_timmytime(self):
        """Only the tweet tagged ``TimmyTime`` is expected back."""
        candidates = [
            self._tweet("1", "TimmyTime"),
            self._tweet("2", "other"),
        ]
        matched = filter_target_tweets(candidates)
        assert len(matched) == 1
        assert matched[0]["tweet_id"] == "1"

    def test_finds_timmychain(self):
        """A tweet tagged ``TimmyChain`` is expected back."""
        candidates = [self._tweet("1", "TimmyChain")]
        matched = filter_target_tweets(candidates)
        assert len(matched) == 1

    def test_case_insensitive(self):
        """Lower-case and upper-case spellings of the tags are both expected to match."""
        candidates = [
            self._tweet("1", "timmytime"),
            self._tweet("2", "TIMMYCHAIN"),
        ]
        matched = filter_target_tweets(candidates)
        assert len(matched) == 2

    def test_finds_both_hashtags(self):
        """A tweet carrying both target tags is expected back exactly once."""
        candidates = [self._tweet("1", "TimmyTime", "TimmyChain")]
        matched = filter_target_tweets(candidates)
        assert len(matched) == 1

    def test_excludes_non_target(self):
        """Tweets with unrelated tags, or no tags at all, are expected to be dropped."""
        candidates = [
            self._tweet("1", "bitcoin"),
            self._tweet("2"),
        ]
        matched = filter_target_tweets(candidates)
        assert len(matched) == 0


class TestBuildMediaEntries:
    """Exercise ``build_media_entries`` with tweets and a per-tweet media mapping."""

    @staticmethod
    def _target_tweet(urls=()):
        """Return a fresh target tweet dict (id ``"100"``) with the given URLs."""
        return {
            "tweet_id": "100",
            "created_at": "2026-04-01",
            "full_text": "Test",
            "hashtags": ["TimmyTime"],
            "urls": list(urls),
        }

    @staticmethod
    def _photo_record():
        """Return a fresh manifest record describing photo ``m1``."""
        return {
            "media_id": "m1",
            "media_type": "photo",
            "media_index": 1,
            "local_media_path": "/tmp/m1.jpg",
        }

    def test_maps_tweets_to_media(self):
        """A tweet with one manifest record is expected to yield one manifest-sourced entry."""
        tweets = [self._target_tweet()]
        manifest = {"100": [self._photo_record()]}

        built, missing = build_media_entries(tweets, manifest)

        assert len(built) == 1
        first = built[0]
        assert first.tweet_id == "100"
        assert first.media_type == "photo"
        assert first.source == "media_manifest"
        assert len(missing) == 0

    def test_handles_no_media(self):
        """A tweet with no manifest record and no URLs is expected in the second return value."""
        tweets = [self._target_tweet()]
        manifest = {}

        built, missing = build_media_entries(tweets, manifest)

        assert len(built) == 0
        assert len(missing) == 1

    def test_handles_url_only_tweets(self):
        """A tweet with URLs but no manifest record is expected to yield a URL reference entry."""
        tweets = [self._target_tweet(urls=["https://example.com"])]
        manifest = {}

        built, missing = build_media_entries(tweets, manifest)

        # The URL stands in for the missing media record.
        assert len(built) == 1
        first = built[0]
        assert first.media_type == "url_reference"
        assert first.source == "tweets_only"

    def test_deduplicates_media(self):
        """Two identical manifest records for one tweet are expected to collapse to one entry."""
        tweets = [self._target_tweet()]
        # Two equal-but-distinct dicts: the second is the duplicate.
        manifest = {"100": [self._photo_record(), self._photo_record()]}

        built, _ = build_media_entries(tweets, manifest)

        assert len(built) == 1  # Deduplicated


class TestComputeStats:
    """Exercise ``compute_stats`` on small hand-built inputs."""

    def test_computes_basic_stats(self):
        """Two target tweets, one of which has a media entry, give the expected counters."""
        tagged = [
            {"tweet_id": tweet_id, "hashtags": [tag], "created_at": day}
            for tweet_id, tag, day in (
                ("100", "TimmyTime", "2026-04-01"),
                ("101", "TimmyChain", "2026-04-02"),
            )
        ]
        photo = MediaEntry(
            tweet_id="100",
            created_at="2026-04-01",
            full_text="",
            hashtags=["TimmyTime"],
            media_id="m1",
            media_type="photo",
            media_index=1,
            local_media_path="/tmp/m1.jpg",
        )

        summary = compute_stats(1000, tagged, [photo])

        expected_counters = {
            "total_tweets_scanned": 1000,
            "target_tweets_found": 2,
            "target_tweets_with_media": 1,
            "target_tweets_without_media": 1,
            "total_media_entries": 1,
        }
        for attribute, expected in expected_counters.items():
            assert getattr(summary, attribute) == expected

    def test_counts_media_types(self):
        """One photo and one video entry are each expected to be counted once."""
        tagged = [{"tweet_id": "100", "hashtags": ["TimmyTime"], "created_at": ""}]
        attachments = [
            MediaEntry(
                tweet_id="100",
                created_at="",
                full_text="",
                hashtags=[],
                media_id=media_id,
                media_type=kind,
                media_index=position,
                local_media_path="",
            )
            for position, (media_id, kind) in enumerate(
                (("m1", "photo"), ("m2", "video")), start=1
            )
        ]

        summary = compute_stats(100, tagged, attachments)

        assert summary.media_types["photo"] == 1
        assert summary.media_types["video"] == 1


class TestMediaEntry:
    """Exercise the ``MediaEntry`` record type."""

    def test_to_dict(self):
        """``to_dict`` is expected to expose the fields plus an ``indexed_at`` key."""
        fields = {
            "tweet_id": "100",
            "created_at": "2026-04-01",
            "full_text": "Test",
            "hashtags": ["TimmyTime"],
            "media_id": "m1",
            "media_type": "photo",
            "media_index": 1,
            "local_media_path": "/tmp/m1.jpg",
        }
        payload = MediaEntry(**fields).to_dict()

        assert payload["tweet_id"] == "100"
        assert payload["media_type"] == "photo"
        assert "indexed_at" in payload


class TestGenerateSummaryReport:
    """Test report generation."""

    def test_generates_valid_markdown(self):
        """The report is expected to contain its title, the headline counts and breakdown labels."""
        # ``target_tweets_found`` is deliberately not a substring of any other
        # number passed in. The earlier value (100) is contained in "1000" and
        # equals the photo count, so its assertion could never fail by itself.
        stats = IndexStats(
            total_tweets_scanned=1000,
            target_tweets_found=137,
            target_tweets_with_media=112,
            target_tweets_without_media=25,
            total_media_entries=150,
            media_types={"photo": 100, "video": 50},
            hashtag_counts={"timmytime": 83, "timmychain": 54},
        )

        report = generate_summary_report(stats)
        assert "# Know Thy Father" in report
        assert "1000" in report  # total tweets scanned
        assert "137" in report  # target tweets found
        assert "photo" in report
        assert "timmytime" in report