100 lines
3.5 KiB
Python
100 lines
3.5 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
from scripts.twitter_archive.extract_media_manifest import (
|
|
build_media_manifest_summary,
|
|
extract_media_records,
|
|
)
|
|
|
|
|
|
def test_extract_media_records_preserves_post_text_and_hashtags(tmp_path: Path) -> None:
    """Check that one video tweet yields one record carrying its text and hashtags.

    A fake media file named ``12345-abcde.mp4`` is written under
    ``tmp_path / "tweets_media"`` and a single raw archive entry (tweet id
    ``12345``, media id ``abcde``) is passed to ``extract_media_records``.
    The resulting record is expected to expose the tweet id, the original
    text, the hashtags in their original casing, the media type, the path
    of the fake local file, the source file name and the video variants.
    """
    archive_media_dir = tmp_path / "tweets_media"
    archive_media_dir.mkdir()
    video_file = archive_media_dir / "12345-abcde.mp4"
    video_file.write_bytes(b"fake")

    # Assemble the raw entry from the innermost structure outwards.
    mp4_variant = {
        "bitrate": 832000,
        "content_type": "video/mp4",
        "url": "https://video.twimg.com/ext_tw_video/abc.mp4",
    }
    video_media = {
        "id_str": "abcde",
        "type": "video",
        "media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/abc.jpg",
        "expanded_url": "https://x.com/rockachopa/status/12345/video/1",
        "video_info": {"variants": [mp4_variant]},
    }
    tweet_payload = {
        "id_str": "12345",
        "created_at": "Sat Mar 28 12:00:00 +0000 2026",
        "full_text": "#timmyTime meets #TimmyChain in motion https://t.co/demo",
        "lang": "en",
        "entities": {
            "hashtags": [{"text": "timmyTime"}, {"text": "TimmyChain"}],
            "urls": [{"expanded_url": "https://example.com/post"}],
        },
        "extended_entities": {"media": [video_media]},
    }

    extracted = extract_media_records([{"tweet": tweet_payload}], tmp_path, "tweets.js")

    assert len(extracted) == 1
    only_record = extracted[0]
    assert only_record["tweet_id"] == "12345"
    assert only_record["full_text"].startswith("#timmyTime meets #TimmyChain")
    assert only_record["hashtags"] == ["timmyTime", "TimmyChain"]
    assert only_record["media_type"] == "video"
    assert only_record["local_media_path"] == str(video_file)
    assert only_record["source_file"] == "tweets.js"
    first_variant = only_record["variants"][0]
    assert first_variant["url"].endswith("abc.mp4")
|
|
|
|
|
|
def test_build_media_manifest_summary_counts_media_types_and_hashtag_families() -> None:
    """Check the summary totals for two hand-written media records.

    One ``video`` record and one ``animated_gif`` record are summarised.
    The test expects a total of two records, one record per media type,
    hashtag counts keyed by the lower-cased tag (``timmytime`` twice,
    ``timmychain`` once), a truthy ``fields`` entry and the source file
    name echoed back.
    """
    video_record = {
        "tweet_id": "1",
        "media_type": "video",
        "hashtags": ["timmyTime"],
        "created_at": "Sat Mar 28 12:00:00 +0000 2026",
    }
    gif_record = {
        "tweet_id": "2",
        "media_type": "animated_gif",
        "hashtags": ["TimmyChain", "timmyTime"],
        "created_at": "Sun Mar 29 12:00:00 +0000 2026",
    }

    manifest_summary = build_media_manifest_summary(
        [video_record, gif_record],
        source_dir="/tmp/source",
        source_file="tweets.js",
    )

    assert manifest_summary["media_record_count"] == 2

    per_type = manifest_summary["media_type_counts"]
    assert per_type["video"] == 1
    assert per_type["animated_gif"] == 1

    per_tag = manifest_summary["hashtag_counts"]
    assert per_tag["timmytime"] == 2
    assert per_tag["timmychain"] == 1

    assert manifest_summary["fields"]
    assert manifest_summary["source_file"] == "tweets.js"
|
|
|
|
|
|
def test_project_doc_mentions_media_manifest_and_hashtag_preservation() -> None:
    """Check that the project document mentions the media manifest and hashtags.

    Reads ``twitter-archive/PROJECT.md`` and asserts that it contains each
    of the phrases ``media/``, ``manifest.jsonl``, ``hashtags``,
    ``#timmyTime`` and ``#TimmyChain``.

    The path is relative, so it is resolved against the current working
    directory (presumably the repository root -- confirm against how the
    test suite is invoked).
    """
    # Decode explicitly as UTF-8: without an encoding argument, read_text()
    # falls back to the locale's preferred encoding, which makes the test
    # depend on the host configuration.
    project = Path("twitter-archive/PROJECT.md").read_text(encoding="utf-8")

    required_phrases = (
        "media/",
        "manifest.jsonl",
        "hashtags",
        "#timmyTime",
        "#TimmyChain",
    )
    for phrase in required_phrases:
        assert phrase in project, f"PROJECT.md does not mention {phrase!r}"
|