feat: add twitter video media manifest pipeline (#47)
This commit was merged in pull request #47.
This commit is contained in:
163
scripts/twitter_archive/extract_media_manifest.py
Normal file
163
scripts/twitter_archive/extract_media_manifest.py
Normal file
@@ -0,0 +1,163 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build a local manifest of Twitter archive media with post metadata preserved."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .common import ARCHIVE_DIR, ensure_layout, resolve_source_dir, stable_sha256, write_json
|
||||
|
||||
MEDIA_DIR = ARCHIVE_DIR / "media"
|
||||
MEDIA_MANIFEST_JSONL = MEDIA_DIR / "manifest.jsonl"
|
||||
MEDIA_MANIFEST_SUMMARY = MEDIA_DIR / "manifest_summary.json"
|
||||
MEDIA_FILE_DIR_NAMES = ("tweets_media", "community_tweet_media", "deleted_tweets_media")
|
||||
|
||||
|
||||
def strip_js_prefix(raw_text: str) -> str:
    """Drop the JS assignment prefix (``window.YTD... =``) and return the bare JSON array text."""
    bracket_pos = raw_text.find("[")
    if bracket_pos < 0:
        raise ValueError("Could not find JSON array in tweets.js")
    return raw_text[bracket_pos:].strip()
|
||||
|
||||
|
||||
def _tweet_payload(raw_entry: dict[str, Any]) -> dict[str, Any]:
|
||||
return raw_entry.get("tweet", raw_entry)
|
||||
|
||||
|
||||
def _hashtags(tweet: dict[str, Any]) -> list[str]:
|
||||
entities = tweet.get("entities", {}) or {}
|
||||
return [item.get("text", "").strip() for item in entities.get("hashtags", []) if item.get("text")]
|
||||
|
||||
|
||||
def _urls(tweet: dict[str, Any]) -> list[str]:
|
||||
entities = tweet.get("entities", {}) or {}
|
||||
return [item.get("expanded_url", "").strip() for item in entities.get("urls", []) if item.get("expanded_url")]
|
||||
|
||||
|
||||
def _media_entries(tweet: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
return ((tweet.get("extended_entities") or {}).get("media") or (tweet.get("entities") or {}).get("media") or [])
|
||||
|
||||
|
||||
def _find_local_media_path(source_dir: Path, tweet_id: str) -> str | None:
    """Locate the first on-disk archive file named ``<tweet_id>-*``, or None if nothing matches.

    Directories in MEDIA_FILE_DIR_NAMES are searched in order; within one
    directory the lexicographically-first match wins.
    """
    for candidate_dir in (source_dir / name for name in MEDIA_FILE_DIR_NAMES):
        if not candidate_dir.exists():
            continue
        hits = sorted(candidate_dir.glob(f"{tweet_id}-*"))
        if hits:
            return str(hits[0])
    return None
|
||||
|
||||
|
||||
def _media_variants(media: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
variants = []
|
||||
for variant in ((media.get("video_info") or {}).get("variants") or []):
|
||||
variants.append(
|
||||
{
|
||||
"content_type": variant.get("content_type"),
|
||||
"bitrate": int(variant.get("bitrate") or 0),
|
||||
"url": variant.get("url"),
|
||||
}
|
||||
)
|
||||
return variants
|
||||
|
||||
|
||||
def extract_media_records(raw_entries: list[dict[str, Any]], source_dir: Path, source_file: str) -> list[dict[str, Any]]:
    """Flatten archive tweet entries into one manifest record per media item.

    Every record carries the full tweet context (text, hashtags, urls, local
    file path) so media rows stay self-describing downstream. Entries with no
    usable id or no media are skipped.
    """
    out: list[dict[str, Any]] = []
    for entry in raw_entries:
        tweet = _tweet_payload(entry)
        tid = str(tweet.get("id_str") or tweet.get("id") or "").strip()
        if not tid:
            continue
        media_items = _media_entries(tweet)
        if not media_items:
            continue
        # Shared per-tweet context, copied into each media record below.
        shared = {
            "tweet_id": tid,
            "created_at": tweet.get("created_at"),
            "full_text": (tweet.get("full_text") or tweet.get("text") or "").strip(),
            "hashtags": _hashtags(tweet),
            "urls": _urls(tweet),
            "source_file": source_file,
            "local_media_path": _find_local_media_path(source_dir, tid),
        }
        for position, media in enumerate(media_items, start=1):
            record = dict(shared)
            record.update(
                media_index=position,
                media_id=str(media.get("id_str") or media.get("id") or "").strip() or None,
                media_type=media.get("type") or "unknown",
                expanded_url=media.get("expanded_url"),
                media_url_https=media.get("media_url_https") or media.get("media_url"),
                display_url=media.get("display_url"),
                variants=_media_variants(media),
            )
            out.append(record)
    return out
|
||||
|
||||
|
||||
def build_media_manifest_summary(records: list[dict[str, Any]], *, source_dir: str, source_file: str) -> dict[str, Any]:
    """Aggregate manifest rows into a rollup: media-type counts and lowercased hashtag counts."""
    type_tally: Counter[str] = Counter(record.get("media_type") or "unknown" for record in records)
    tag_tally: Counter[str] = Counter(
        tag.lower() for record in records for tag in record.get("hashtags", [])
    )
    return {
        "schema_version": 1,
        "source_dir": source_dir,
        "source_file": source_file,
        "media_record_count": len(records),
        "media_type_counts": dict(type_tally),
        "hashtag_counts": dict(tag_tally),
        # Documented per-record field set, for downstream schema checks.
        "fields": [
            "tweet_id",
            "created_at",
            "full_text",
            "hashtags",
            "urls",
            "media_index",
            "media_id",
            "media_type",
            "expanded_url",
            "media_url_https",
            "display_url",
            "variants",
            "local_media_path",
            "source_file",
        ],
    }
|
||||
|
||||
|
||||
def write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write *rows* to *path* as JSON Lines (one sorted-key object per line).

    Creates parent directories as needed. Fix: the encoding is now pinned to
    UTF-8 instead of relying on the platform default (``open(path, "w")``).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, sort_keys=True) + "\n")
|
||||
|
||||
|
||||
def main() -> None:
    """Run the pipeline: parse tweets.js, write manifest.jsonl plus a summary JSON, print status."""
    ensure_layout()
    MEDIA_DIR.mkdir(parents=True, exist_ok=True)

    source_dir = resolve_source_dir()
    tweets_path = source_dir / "tweets.js"
    if not tweets_path.exists():
        # Machine-readable failure, mirroring the ok-path JSON shape.
        raise SystemExit(json.dumps({"status": "error", "reason": f"missing {tweets_path}"}))

    entries = json.loads(strip_js_prefix(tweets_path.read_text()))
    records = extract_media_records(entries, source_dir, tweets_path.name)
    write_jsonl(MEDIA_MANIFEST_JSONL, records)

    summary = build_media_manifest_summary(records, source_dir=str(source_dir), source_file=tweets_path.name)
    # Provenance: hash + size of the exact tweets.js this manifest was built from.
    summary["source_sha256"] = stable_sha256(tweets_path)
    summary["source_size_bytes"] = tweets_path.stat().st_size
    write_json(MEDIA_MANIFEST_SUMMARY, summary)
    print(json.dumps({"status": "ok", **summary}, sort_keys=True))
|
||||
|
||||
|
||||
# Script entry point (e.g. ``python -m scripts.twitter_archive.extract_media_manifest``).
if __name__ == "__main__":
    main()
|
||||
99
tests/test_twitter_video_manifest.py
Normal file
99
tests/test_twitter_video_manifest.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from scripts.twitter_archive.extract_media_manifest import (
|
||||
build_media_manifest_summary,
|
||||
extract_media_records,
|
||||
)
|
||||
|
||||
|
||||
def test_extract_media_records_preserves_post_text_and_hashtags(tmp_path: Path) -> None:
    """A single archived video tweet yields one record with text, tags, and local path intact."""
    media_dir = tmp_path / "tweets_media"
    media_dir.mkdir()
    local_media = media_dir / "12345-abcde.mp4"
    local_media.write_bytes(b"fake")

    tweet = {
        "id_str": "12345",
        "created_at": "Sat Mar 28 12:00:00 +0000 2026",
        "full_text": "#timmyTime meets #TimmyChain in motion https://t.co/demo",
        "lang": "en",
        "entities": {
            "hashtags": [{"text": "timmyTime"}, {"text": "TimmyChain"}],
            "urls": [{"expanded_url": "https://example.com/post"}],
        },
        "extended_entities": {
            "media": [
                {
                    "id_str": "abcde",
                    "type": "video",
                    "media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/abc.jpg",
                    "expanded_url": "https://x.com/rockachopa/status/12345/video/1",
                    "video_info": {
                        "variants": [
                            {
                                "bitrate": 832000,
                                "content_type": "video/mp4",
                                "url": "https://video.twimg.com/ext_tw_video/abc.mp4",
                            }
                        ]
                    },
                }
            ]
        },
    }

    records = extract_media_records([{"tweet": tweet}], tmp_path, "tweets.js")

    assert len(records) == 1
    record = records[0]
    assert record["tweet_id"] == "12345"
    assert record["full_text"].startswith("#timmyTime meets #TimmyChain")
    assert record["hashtags"] == ["timmyTime", "TimmyChain"]
    assert record["media_type"] == "video"
    assert record["local_media_path"] == str(local_media)
    assert record["source_file"] == "tweets.js"
    assert record["variants"][0]["url"].endswith("abc.mp4")
|
||||
|
||||
|
||||
def test_build_media_manifest_summary_counts_media_types_and_hashtag_families() -> None:
    """Summary rollups count media types once each and fold hashtags case-insensitively."""
    video_record = {
        "tweet_id": "1",
        "media_type": "video",
        "hashtags": ["timmyTime"],
        "created_at": "Sat Mar 28 12:00:00 +0000 2026",
    }
    gif_record = {
        "tweet_id": "2",
        "media_type": "animated_gif",
        "hashtags": ["TimmyChain", "timmyTime"],
        "created_at": "Sun Mar 29 12:00:00 +0000 2026",
    }

    summary = build_media_manifest_summary(
        [video_record, gif_record], source_dir="/tmp/source", source_file="tweets.js"
    )

    assert summary["media_record_count"] == 2
    assert summary["media_type_counts"]["video"] == 1
    assert summary["media_type_counts"]["animated_gif"] == 1
    assert summary["hashtag_counts"]["timmytime"] == 2
    assert summary["hashtag_counts"]["timmychain"] == 1
    assert summary["fields"]
    assert summary["source_file"] == "tweets.js"
|
||||
|
||||
|
||||
def test_project_doc_mentions_media_manifest_and_hashtag_preservation() -> None:
    """PROJECT.md must document the media lane and the two tracked hashtag families."""
    doc_text = Path("twitter-archive/PROJECT.md").read_text()

    for needle in ("media/", "manifest.jsonl", "hashtags", "#timmyTime", "#TimmyChain"):
        assert needle in doc_text
|
||||
@@ -117,7 +117,14 @@ Late notes are terse deltas. THAT'S THE PROOF OF GROWTH.
|
||||
retweets.jsonl
|
||||
likes.jsonl
|
||||
manifest.json
|
||||
|
||||
|
||||
media/ # Local-first media understanding
|
||||
manifest.jsonl # one row per video/gif with tweet text + hashtags preserved
|
||||
manifest_summary.json # rollup counts and hashtag families
|
||||
keyframes/ # future extracted frames
|
||||
audio/ # future demuxed audio
|
||||
style_cards/ # future per-video aesthetic summaries
|
||||
|
||||
notes/ # Per-batch observations
|
||||
batch_001.md # early: exploratory
|
||||
batch_002.md # ...
|
||||
@@ -182,6 +189,11 @@ We extract metrics from session files — no custom telemetry.
|
||||
4. The adapter should accelerate the spiral: faster taxonomy,
|
||||
sharper deltas, better voice match
|
||||
|
||||
### Media metadata rule:
|
||||
- Tweet text and hashtags are first-class metadata all the way through the media lane
|
||||
- especially preserve and measure `#timmyTime` and `#TimmyChain`
|
||||
- raw Twitter videos stay local; only derived local artifacts move through the pipeline
|
||||
|
||||
---
|
||||
|
||||
## Running It
|
||||
|
||||
Reference in New Issue
Block a user