"""Tests for the Twitter-archive media manifest extraction helpers."""

from __future__ import annotations

import json
from pathlib import Path

from scripts.twitter_archive.extract_media_manifest import (
    build_media_manifest_summary,
    extract_media_records,
)


def test_extract_media_records_preserves_post_text_and_hashtags(tmp_path: Path) -> None:
    """One video tweet yields one record that keeps its text, hashtags and media info."""
    media_dir = tmp_path / "tweets_media"
    media_dir.mkdir()
    # The file name combines the tweet id ("12345") and the media id ("abcde")
    # used in the entry below; presumably that is how the extractor locates the
    # local copy -- confirm against extract_media_records.
    local_media = media_dir / "12345-abcde.mp4"
    local_media.write_bytes(b"fake")

    raw_entries = [
        {
            "tweet": {
                "id_str": "12345",
                "created_at": "Sat Mar 28 12:00:00 +0000 2026",
                "full_text": "#timmyTime meets #TimmyChain in motion https://t.co/demo",
                "lang": "en",
                "entities": {
                    "hashtags": [{"text": "timmyTime"}, {"text": "TimmyChain"}],
                    "urls": [{"expanded_url": "https://example.com/post"}],
                },
                "extended_entities": {
                    "media": [
                        {
                            "id_str": "abcde",
                            "type": "video",
                            "media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/abc.jpg",
                            "expanded_url": "https://x.com/rockachopa/status/12345/video/1",
                            "video_info": {
                                "variants": [
                                    {
                                        "bitrate": 832000,
                                        "content_type": "video/mp4",
                                        "url": "https://video.twimg.com/ext_tw_video/abc.mp4",
                                    }
                                ]
                            },
                        }
                    ]
                },
            }
        }
    ]

    records = extract_media_records(raw_entries, tmp_path, "tweets.js")

    assert len(records) == 1
    record = records[0]
    assert record["tweet_id"] == "12345"
    assert record["full_text"].startswith("#timmyTime meets #TimmyChain")
    # Hashtags are expected in their original casing and order.
    assert record["hashtags"] == ["timmyTime", "TimmyChain"]
    assert record["media_type"] == "video"
    assert record["local_media_path"] == str(local_media)
    assert record["source_file"] == "tweets.js"
    assert record["variants"][0]["url"].endswith("abc.mp4")


def test_build_media_manifest_summary_counts_media_types_and_hashtag_families() -> None:
    """The summary tallies records per media type and hashtags under lowercase keys."""
    records = [
        {
            "tweet_id": "1",
            "media_type": "video",
            "hashtags": ["timmyTime"],
            "created_at": "Sat Mar 28 12:00:00 +0000 2026",
        },
        {
            "tweet_id": "2",
            "media_type": "animated_gif",
            "hashtags": ["TimmyChain", "timmyTime"],
            "created_at": "Sun Mar 29 12:00:00 +0000 2026",
        },
    ]

    summary = build_media_manifest_summary(
        records,
        source_dir="/tmp/source",
        source_file="tweets.js",
    )

    assert summary["media_record_count"] == 2
    assert summary["media_type_counts"]["video"] == 1
    assert summary["media_type_counts"]["animated_gif"] == 1
    # Inputs use mixed case ("timmyTime", "TimmyChain"); the counts are looked
    # up by their lowercase form.
    assert summary["hashtag_counts"]["timmytime"] == 2
    assert summary["hashtag_counts"]["timmychain"] == 1
    assert summary["fields"]
    assert summary["source_file"] == "tweets.js"


def test_project_doc_mentions_media_manifest_and_hashtag_preservation() -> None:
    """PROJECT.md mentions the media directory, the manifest file and both hashtags."""
    # The path is relative, so it resolves against the current working
    # directory: run the tests from the directory containing "twitter-archive/".
    # An explicit encoding keeps the read independent of the platform locale.
    project = Path("twitter-archive/PROJECT.md").read_text(encoding="utf-8")

    assert "media/" in project
    assert "manifest.jsonl" in project
    assert "hashtags" in project
    assert "#timmyTime" in project
    assert "#TimmyChain" in project