# timmy-home/tests/test_twitter_video_manifest.py

from __future__ import annotations

import json
from pathlib import Path

from scripts.twitter_archive.extract_media_manifest import (
    build_media_manifest_summary,
    extract_media_records,
)


def test_extract_media_records_preserves_post_text_and_hashtags(tmp_path: Path) -> None:
    """Check the record produced for one tweet carrying a single video.

    A fake media file is written under ``tmp_path / "tweets_media"`` with a
    name made of the tweet id and the media id. ``extract_media_records`` is
    then called with one raw tweet entry, ``tmp_path`` and ``"tweets.js"``,
    and the single returned record is checked for the tweet id, text,
    hashtags, media type, local media path, source file and video variants.
    """
    # File name combines the tweet id ("12345") and media id ("abcde") used
    # below; presumably this is how the extractor finds local media -- confirm
    # against extract_media_records.
    clip_path = tmp_path / "tweets_media" / "12345-abcde.mp4"
    clip_path.parent.mkdir()
    clip_path.write_bytes(b"fake")

    # Assemble the raw archive entry from the innermost piece outwards.
    mp4_variant = {
        "bitrate": 832000,
        "content_type": "video/mp4",
        "url": "https://video.twimg.com/ext_tw_video/abc.mp4",
    }
    video_media = {
        "id_str": "abcde",
        "type": "video",
        "media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/abc.jpg",
        "expanded_url": "https://x.com/rockachopa/status/12345/video/1",
        "video_info": {"variants": [mp4_variant]},
    }
    tweet = {
        "id_str": "12345",
        "created_at": "Sat Mar 28 12:00:00 +0000 2026",
        "full_text": "#timmyTime meets #TimmyChain in motion https://t.co/demo",
        "lang": "en",
        "entities": {
            "hashtags": [{"text": "timmyTime"}, {"text": "TimmyChain"}],
            "urls": [{"expanded_url": "https://example.com/post"}],
        },
        "extended_entities": {"media": [video_media]},
    }

    extracted = extract_media_records([{"tweet": tweet}], tmp_path, "tweets.js")

    # Exactly one media item went in, so exactly one record should come out.
    assert len(extracted) == 1
    entry = extracted[0]

    assert entry["tweet_id"] == "12345"
    assert entry["full_text"].startswith("#timmyTime meets #TimmyChain")
    # Hashtag casing and order are expected to match the input entities.
    assert entry["hashtags"] == ["timmyTime", "TimmyChain"]
    assert entry["media_type"] == "video"
    assert entry["local_media_path"] == str(clip_path)
    assert entry["source_file"] == "tweets.js"
    assert entry["variants"][0]["url"].endswith("abc.mp4")


def test_build_media_manifest_summary_counts_media_types_and_hashtag_families() -> None:
    """Check the counts and metadata in a summary built from two records.

    One ``video`` record and one ``animated_gif`` record are passed to
    ``build_media_manifest_summary``. The summary is expected to report the
    record total, a per-media-type count, hashtag counts keyed by the
    lower-cased tag, a non-empty ``fields`` entry and the source file name.
    """
    video_record = {
        "tweet_id": "1",
        "media_type": "video",
        "hashtags": ["timmyTime"],
        "created_at": "Sat Mar 28 12:00:00 +0000 2026",
    }
    gif_record = {
        "tweet_id": "2",
        "media_type": "animated_gif",
        "hashtags": ["TimmyChain", "timmyTime"],
        "created_at": "Sun Mar 29 12:00:00 +0000 2026",
    }

    result = build_media_manifest_summary(
        [video_record, gif_record],
        source_dir="/tmp/source",
        source_file="tweets.js",
    )

    assert result["media_record_count"] == 2

    # Each media type appears once across the two records.
    type_counts = result["media_type_counts"]
    assert type_counts["video"] == 1
    assert type_counts["animated_gif"] == 1

    # Input tags are mixed-case; the summary is looked up by lower-cased keys.
    tag_counts = result["hashtag_counts"]
    assert tag_counts["timmytime"] == 2
    assert tag_counts["timmychain"] == 1

    assert result["fields"]
    assert result["source_file"] == "tweets.js"


def test_project_doc_mentions_media_manifest_and_hashtag_preservation() -> None:
    """Check that the project document contains the expected key phrases.

    Reads ``twitter-archive/PROJECT.md`` and asserts that it mentions the
    ``media/`` directory, ``manifest.jsonl``, ``hashtags`` and the two
    hashtags ``#timmyTime`` and ``#TimmyChain``.

    Raises:
        FileNotFoundError: if the document is not found at that path.
    """
    # The path is relative, so it resolves against the current working
    # directory -- NOTE(review): presumably the tests run from the directory
    # that contains twitter-archive/; confirm.
    # Decode explicitly as UTF-8: without an encoding, read_text() uses the
    # platform locale, which can fail or mis-decode on non-UTF-8 systems.
    project = Path("twitter-archive/PROJECT.md").read_text(encoding="utf-8")

    required_phrases = (
        "media/",
        "manifest.jsonl",
        "hashtags",
        "#timmyTime",
        "#TimmyChain",
    )
    for phrase in required_phrases:
        # Name the missing phrase so a failure is self-explanatory.
        assert phrase in project, f"PROJECT.md does not mention {phrase!r}"