Files
timmy-home/tests/test_twitter_video_manifest.py

100 lines
3.5 KiB
Python

from __future__ import annotations
import json
from pathlib import Path
from scripts.twitter_archive.extract_media_manifest import (
build_media_manifest_summary,
extract_media_records,
)
def test_extract_media_records_preserves_post_text_and_hashtags(tmp_path: Path) -> None:
    """Verify one video tweet yields one record that keeps its text and hashtags.

    A placeholder media file is written under ``tmp_path / "tweets_media"`` and
    a single raw archive entry containing one video attachment is passed to
    ``extract_media_records``. The test expects exactly one record whose tweet
    id, hashtags, media type, local media path, source file, post text prefix
    and first variant URL match the fixture.
    """
    # The file name joins the fixture's tweet id and media id; the test expects
    # the extractor to report this exact path back as ``local_media_path``.
    expected_media_file = tmp_path / "tweets_media" / "12345-abcde.mp4"
    expected_media_file.parent.mkdir()
    expected_media_file.write_bytes(b"fake")

    video_variant = {
        "bitrate": 832000,
        "content_type": "video/mp4",
        "url": "https://video.twimg.com/ext_tw_video/abc.mp4",
    }
    media_item = {
        "id_str": "abcde",
        "type": "video",
        "media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/abc.jpg",
        "expanded_url": "https://x.com/rockachopa/status/12345/video/1",
        "video_info": {"variants": [video_variant]},
    }
    tweet = {
        "id_str": "12345",
        "created_at": "Sat Mar 28 12:00:00 +0000 2026",
        "full_text": "#timmyTime meets #TimmyChain in motion https://t.co/demo",
        "lang": "en",
        "entities": {
            "hashtags": [{"text": tag} for tag in ("timmyTime", "TimmyChain")],
            "urls": [{"expanded_url": "https://example.com/post"}],
        },
        "extended_entities": {"media": [media_item]},
    }

    records = extract_media_records([{"tweet": tweet}], tmp_path, "tweets.js")

    assert len(records) == 1
    record = records[0]

    # Fields that must match the fixture exactly.
    exact_expectations = {
        "tweet_id": "12345",
        "hashtags": ["timmyTime", "TimmyChain"],
        "media_type": "video",
        "local_media_path": str(expected_media_file),
        "source_file": "tweets.js",
    }
    for field_name, expected_value in exact_expectations.items():
        assert record[field_name] == expected_value

    # Fields that are only checked by prefix / suffix.
    assert record["full_text"].startswith("#timmyTime meets #TimmyChain")
    first_variant_url = record["variants"][0]["url"]
    assert first_variant_url.endswith("abc.mp4")
def test_build_media_manifest_summary_counts_media_types_and_hashtag_families() -> None:
    """Verify the manifest summary tallies media types and hashtags.

    Two hand-written records (one ``video``, one ``animated_gif``) are passed
    to ``build_media_manifest_summary``. The test expects a record count of 2,
    one entry per media type, hashtag counts looked up under lower-cased keys,
    a truthy ``fields`` entry, and the source file name echoed back.
    """
    video_record = {
        "tweet_id": "1",
        "media_type": "video",
        "hashtags": ["timmyTime"],
        "created_at": "Sat Mar 28 12:00:00 +0000 2026",
    }
    gif_record = {
        "tweet_id": "2",
        "media_type": "animated_gif",
        "hashtags": ["TimmyChain", "timmyTime"],
        "created_at": "Sun Mar 29 12:00:00 +0000 2026",
    }

    summary = build_media_manifest_summary(
        [video_record, gif_record],
        source_dir="/tmp/source",
        source_file="tweets.js",
    )

    assert summary["media_record_count"] == 2

    media_type_counts = summary["media_type_counts"]
    for media_type, expected_count in (("video", 1), ("animated_gif", 1)):
        assert media_type_counts[media_type] == expected_count

    # "timmyTime" appears in both records, "TimmyChain" in one; the summary is
    # queried with lower-cased keys.
    hashtag_counts = summary["hashtag_counts"]
    for hashtag_key, expected_count in (("timmytime", 2), ("timmychain", 1)):
        assert hashtag_counts[hashtag_key] == expected_count

    assert summary["fields"]
    assert summary["source_file"] == "tweets.js"
def test_project_doc_mentions_media_manifest_and_hashtag_preservation() -> None:
    """Verify the project doc mentions the media manifest and hashtag terms.

    Reads ``twitter-archive/PROJECT.md`` (resolved relative to the current
    working directory) and checks that every required snippet occurs in it.

    Raises:
        AssertionError: if one or more required snippets are absent; the
            message lists all of them.
        FileNotFoundError: if the document does not exist at that path.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (a non-ASCII character in the Markdown would
    # otherwise fail to decode on non-UTF-8 locales).
    project = Path("twitter-archive/PROJECT.md").read_text(encoding="utf-8")

    required_snippets = (
        "media/",
        "manifest.jsonl",
        "hashtags",
        "#timmyTime",
        "#TimmyChain",
    )
    # Collect every absent snippet so a single failure reports the full gap.
    missing = [snippet for snippet in required_snippets if snippet not in project]
    assert not missing, f"PROJECT.md is missing expected text: {missing}"