diff --git a/scripts/twitter_archive/extract_media_manifest.py b/scripts/twitter_archive/extract_media_manifest.py
new file mode 100644
index 0000000..3e1efc2
--- /dev/null
+++ b/scripts/twitter_archive/extract_media_manifest.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""Build a local manifest of Twitter archive media with post metadata preserved."""
+
+from __future__ import annotations
+
+import json
+from collections import Counter
+from pathlib import Path
+from typing import Any
+
+from .common import ARCHIVE_DIR, ensure_layout, resolve_source_dir, stable_sha256, write_json
+
+MEDIA_DIR = ARCHIVE_DIR / "media"
+MEDIA_MANIFEST_JSONL = MEDIA_DIR / "manifest.jsonl"
+MEDIA_MANIFEST_SUMMARY = MEDIA_DIR / "manifest_summary.json"
+MEDIA_FILE_DIR_NAMES = ("tweets_media", "community_tweet_media", "deleted_tweets_media")
+
+
+def strip_js_prefix(raw_text: str) -> str:
+    start = raw_text.find("[")
+    if start == -1:
+        raise ValueError("Could not find JSON array in tweets.js")
+    return raw_text[start:].strip().rstrip(";").rstrip()  # tolerate a trailing ';' after the JS assignment
+
+
+def _tweet_payload(raw_entry: dict[str, Any]) -> dict[str, Any]:
+    return raw_entry.get("tweet", raw_entry)
+
+
+def _hashtags(tweet: dict[str, Any]) -> list[str]:
+    entities = tweet.get("entities", {}) or {}
+    return [text for item in entities.get("hashtags", []) if (text := (item.get("text") or "").strip())]  # filter AFTER strip so "" never leaks
+
+
+def _urls(tweet: dict[str, Any]) -> list[str]:
+    entities = tweet.get("entities", {}) or {}
+    return [url for item in entities.get("urls", []) if (url := (item.get("expanded_url") or "").strip())]  # filter AFTER strip so "" never leaks
+
+
+def _media_entries(tweet: dict[str, Any]) -> list[dict[str, Any]]:
+    return ((tweet.get("extended_entities") or {}).get("media") or (tweet.get("entities") or {}).get("media") or [])
+
+
+def _find_local_media_path(source_dir: Path, tweet_id: str) -> str | None:
+    for dirname in MEDIA_FILE_DIR_NAMES:
+        media_dir = source_dir / dirname
+        if not media_dir.exists():
+            continue
+        matches = sorted(media_dir.glob(f"{tweet_id}-*"))
+        if matches:
+            return str(matches[0])
+    return None
+
+
+def _media_variants(media: dict[str, Any]) -> list[dict[str, Any]]:
+    variants = []
+    for variant in ((media.get("video_info") or {}).get("variants") or []):
+        variants.append(
+            {
+                "content_type": variant.get("content_type"),
+                "bitrate": int(variant.get("bitrate") or 0),
+                "url": variant.get("url"),
+            }
+        )
+    return variants
+
+
+def extract_media_records(raw_entries: list[dict[str, Any]], source_dir: Path, source_file: str) -> list[dict[str, Any]]:
+    records: list[dict[str, Any]] = []
+    for raw_entry in raw_entries:
+        tweet = _tweet_payload(raw_entry)
+        tweet_id = str(tweet.get("id_str") or tweet.get("id") or "").strip()
+        if not tweet_id:
+            continue
+        media_entries = _media_entries(tweet)
+        if not media_entries:
+            continue
+        base = {
+            "tweet_id": tweet_id,
+            "created_at": tweet.get("created_at"),
+            "full_text": (tweet.get("full_text") or tweet.get("text") or "").strip(),
+            "hashtags": _hashtags(tweet),
+            "urls": _urls(tweet),
+            "source_file": source_file,
+            "local_media_path": _find_local_media_path(source_dir, tweet_id),
+        }
+        for index, media in enumerate(media_entries, start=1):
+            records.append(
+                {
+                    **base,
+                    "media_index": index,
+                    "media_id": str(media.get("id_str") or media.get("id") or "").strip() or None,
+                    "media_type": media.get("type") or "unknown",
+                    "expanded_url": media.get("expanded_url"),
+                    "media_url_https": media.get("media_url_https") or media.get("media_url"),
+                    "display_url": media.get("display_url"),
+                    "variants": _media_variants(media),
+                }
+            )
+    return records
+
+
+def build_media_manifest_summary(records: list[dict[str, Any]], *, source_dir: str, source_file: str) -> dict[str, Any]:
+    hashtag_counts = Counter()
+    media_type_counts = Counter()
+    for record in records:
+        media_type_counts[record.get("media_type") or "unknown"] += 1
+        for tag in record.get("hashtags", []):
+            hashtag_counts[tag.lower()] += 1
+    return {
+        "schema_version": 1,
+        "source_dir": source_dir,
+        "source_file": source_file,
+        "media_record_count": len(records),
+        "media_type_counts": dict(media_type_counts),
+        "hashtag_counts": dict(hashtag_counts),
+        "fields": [
+            "tweet_id",
+            "created_at",
+            "full_text",
+            "hashtags",
+            "urls",
+            "media_index",
+            "media_id",
+            "media_type",
+            "expanded_url",
+            "media_url_https",
+            "display_url",
+            "variants",
+            "local_media_path",
+            "source_file",
+        ],
+    }
+
+
+def write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w", encoding="utf-8") as handle:  # explicit UTF-8: tweet text is emoji-heavy
+        for row in rows:
+            handle.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n")
+
+
+def main() -> None:
+    ensure_layout()
+    MEDIA_DIR.mkdir(parents=True, exist_ok=True)
+    source_dir = resolve_source_dir()
+    source_path = source_dir / "tweets.js"
+    if not source_path.exists():
+        raise SystemExit(json.dumps({"status": "error", "reason": f"missing {source_path}"}))
+
+    raw_payload = strip_js_prefix(source_path.read_text(encoding="utf-8"))  # archive exports are UTF-8; never locale default
+    raw_entries = json.loads(raw_payload)
+    records = extract_media_records(raw_entries, source_dir, source_path.name)
+    write_jsonl(MEDIA_MANIFEST_JSONL, records)
+    summary = build_media_manifest_summary(records, source_dir=str(source_dir), source_file=source_path.name)
+    summary["source_sha256"] = stable_sha256(source_path)
+    summary["source_size_bytes"] = source_path.stat().st_size
+    write_json(MEDIA_MANIFEST_SUMMARY, summary)
+    print(json.dumps({"status": "ok", **summary}, sort_keys=True))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_twitter_video_manifest.py b/tests/test_twitter_video_manifest.py
new file mode 100644
index 0000000..7ca8309
--- /dev/null
+++ b/tests/test_twitter_video_manifest.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from scripts.twitter_archive.extract_media_manifest import (
+    build_media_manifest_summary,
+    extract_media_records,
+)
+
+
+def test_extract_media_records_preserves_post_text_and_hashtags(tmp_path: Path) -> None:
+    media_dir = tmp_path / "tweets_media"
+    media_dir.mkdir()
+    local_media = media_dir / "12345-abcde.mp4"
+    local_media.write_bytes(b"fake")
+
+    raw_entries = [
+        {
+            "tweet": {
+                "id_str": "12345",
+                "created_at": "Sat Mar 28 12:00:00 +0000 2026",
+                "full_text": "#timmyTime meets #TimmyChain in motion https://t.co/demo",
+                "lang": "en",
+                "entities": {
+                    "hashtags": [{"text": "timmyTime"}, {"text": "TimmyChain"}],
+                    "urls": [{"expanded_url": "https://example.com/post"}],
+                },
+                "extended_entities": {
+                    "media": [
+                        {
+                            "id_str": "abcde",
+                            "type": "video",
+                            "media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/abc.jpg",
+                            "expanded_url": "https://x.com/rockachopa/status/12345/video/1",
+                            "video_info": {
+                                "variants": [
+                                    {
+                                        "bitrate": 832000,
+                                        "content_type": "video/mp4",
+                                        "url": "https://video.twimg.com/ext_tw_video/abc.mp4",
+                                    }
+                                ]
+                            },
+                        }
+                    ]
+                },
+            }
+        }
+    ]
+
+    records = extract_media_records(raw_entries, tmp_path, "tweets.js")
+
+    assert len(records) == 1
+    record = records[0]
+    assert record["tweet_id"] == "12345"
+    assert record["full_text"].startswith("#timmyTime meets #TimmyChain")
+    assert record["hashtags"] == ["timmyTime", "TimmyChain"]
+    assert record["media_type"] == "video"
+    assert record["local_media_path"] == str(local_media)
+    assert record["source_file"] == "tweets.js"
+    assert record["variants"][0]["url"].endswith("abc.mp4")
+
+
+def test_build_media_manifest_summary_counts_media_types_and_hashtag_families() -> None:
+    records = [
+        {
+            "tweet_id": "1",
+            "media_type": "video",
+            "hashtags": ["timmyTime"],
+            "created_at": "Sat Mar 28 12:00:00 +0000 2026",
+        },
+        {
+            "tweet_id": "2",
+            "media_type": "animated_gif",
+            "hashtags": ["TimmyChain", "timmyTime"],
+            "created_at": "Sun Mar 29 12:00:00 +0000 2026",
+        },
+    ]
+
+    summary = build_media_manifest_summary(records, source_dir="/tmp/source", source_file="tweets.js")
+
+    assert summary["media_record_count"] == 2
+    assert summary["media_type_counts"]["video"] == 1
+    assert summary["media_type_counts"]["animated_gif"] == 1
+    assert summary["hashtag_counts"]["timmytime"] == 2
+    assert summary["hashtag_counts"]["timmychain"] == 1
+    assert summary["fields"]
+    assert summary["source_file"] == "tweets.js"
+
+
+def test_project_doc_mentions_media_manifest_and_hashtag_preservation() -> None:
+    project = Path("twitter-archive/PROJECT.md").read_text()
+
+    assert "media/" in project
+    assert "manifest.jsonl" in project
+    assert "hashtags" in project
+    assert "#timmyTime" in project
+    assert "#TimmyChain" in project
diff --git a/twitter-archive/PROJECT.md b/twitter-archive/PROJECT.md
index ccb2078..6b74b33 100644
--- a/twitter-archive/PROJECT.md
+++ b/twitter-archive/PROJECT.md
@@ -117,7 +117,14 @@ Late notes are terse deltas. THAT'S THE PROOF OF GROWTH.
     retweets.jsonl
     likes.jsonl
     manifest.json
-
+
+  media/                    # Local-first media understanding
+    manifest.jsonl          # one row per video/gif with tweet text + hashtags preserved
+    manifest_summary.json   # rollup counts and hashtag families
+    keyframes/              # future extracted frames
+    audio/                  # future demuxed audio
+    style_cards/            # future per-video aesthetic summaries
+
   notes/              # Per-batch observations
     batch_001.md      # early: exploratory
     batch_002.md      # ...
@@ -182,6 +189,11 @@ We extract metrics from session files — no custom telemetry.
 
 4. The adapter should accelerate the spiral: faster taxonomy, sharper deltas, better voice match
 
+### Media metadata rule:
+- tweet posts and hashtags are first-class metadata all the way through the media lane
+- especially preserve and measure `#timmyTime` and `#TimmyChain`
+- raw Twitter videos stay local; only derived local artifacts move through the pipeline
+
 ---
 
 ## Running It