feat: add twitter video media manifest pipeline (#47)

This commit was merged in pull request #47.
This commit is contained in:
2026-03-28 19:58:32 +00:00
parent 0a421e5ba5
commit 25c1b32bfb
3 changed files with 275 additions and 1 deletions

View File

@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""Build a local manifest of Twitter archive media with post metadata preserved."""
from __future__ import annotations
import json
from collections import Counter
from pathlib import Path
from typing import Any
from .common import ARCHIVE_DIR, ensure_layout, resolve_source_dir, stable_sha256, write_json
# Output locations for the derived media artifacts under the archive root.
MEDIA_DIR = ARCHIVE_DIR / "media"
MEDIA_MANIFEST_JSONL = MEDIA_DIR / "manifest.jsonl"
MEDIA_MANIFEST_SUMMARY = MEDIA_DIR / "manifest_summary.json"
# Archive subdirectories that may contain downloaded media files, searched in this order.
MEDIA_FILE_DIR_NAMES = ("tweets_media", "community_tweet_media", "deleted_tweets_media")
def strip_js_prefix(raw_text: str) -> str:
    """Return the bare JSON array text, dropping the JS assignment prefix.

    Twitter archives ship ``tweets.js`` as ``window.YTD... = [...]``; everything
    before the first ``[`` is discarded.

    Raises:
        ValueError: if *raw_text* contains no ``[`` at all.
    """
    bracket_pos = raw_text.find("[")
    if bracket_pos < 0:
        raise ValueError("Could not find JSON array in tweets.js")
    return raw_text[bracket_pos:].strip()
def _tweet_payload(raw_entry: dict[str, Any]) -> dict[str, Any]:
return raw_entry.get("tweet", raw_entry)
def _hashtags(tweet: dict[str, Any]) -> list[str]:
entities = tweet.get("entities", {}) or {}
return [item.get("text", "").strip() for item in entities.get("hashtags", []) if item.get("text")]
def _urls(tweet: dict[str, Any]) -> list[str]:
entities = tweet.get("entities", {}) or {}
return [item.get("expanded_url", "").strip() for item in entities.get("urls", []) if item.get("expanded_url")]
def _media_entries(tweet: dict[str, Any]) -> list[dict[str, Any]]:
return ((tweet.get("extended_entities") or {}).get("media") or (tweet.get("entities") or {}).get("media") or [])
def _find_local_media_path(source_dir: Path, tweet_id: str) -> str | None:
    """Locate an on-disk media file named ``<tweet_id>-*`` under any known media dir.

    Returns the lexicographically first match (deterministic across runs), or
    ``None`` when no media directory holds a matching file.
    """
    for dir_name in MEDIA_FILE_DIR_NAMES:
        candidate_dir = source_dir / dir_name
        if not candidate_dir.exists():
            continue
        hits = sorted(candidate_dir.glob(f"{tweet_id}-*"))
        if hits:
            return str(hits[0])
    return None
def _media_variants(media: dict[str, Any]) -> list[dict[str, Any]]:
variants = []
for variant in ((media.get("video_info") or {}).get("variants") or []):
variants.append(
{
"content_type": variant.get("content_type"),
"bitrate": int(variant.get("bitrate") or 0),
"url": variant.get("url"),
}
)
return variants
def extract_media_records(raw_entries: list[dict[str, Any]], source_dir: Path, source_file: str) -> list[dict[str, Any]]:
    """Flatten archive entries into one record per media attachment.

    Entries without a tweet id or without media are skipped. Each record
    carries the tweet-level context (created_at, full text, hashtags, urls,
    resolved local media path, source file) plus the per-attachment fields,
    so downstream consumers never need to reopen the raw archive.
    """
    records: list[dict[str, Any]] = []
    for raw_entry in raw_entries:
        tweet = _tweet_payload(raw_entry)
        tweet_id = str(tweet.get("id_str") or tweet.get("id") or "").strip()
        if not tweet_id:
            continue  # cannot key a record without a tweet id
        attachments = _media_entries(tweet)
        if not attachments:
            continue  # text-only tweet
        shared_fields = {
            "tweet_id": tweet_id,
            "created_at": tweet.get("created_at"),
            "full_text": (tweet.get("full_text") or tweet.get("text") or "").strip(),
            "hashtags": _hashtags(tweet),
            "urls": _urls(tweet),
            "source_file": source_file,
            "local_media_path": _find_local_media_path(source_dir, tweet_id),
        }
        for position, media in enumerate(attachments, start=1):
            record = dict(shared_fields)
            record.update(
                {
                    "media_index": position,
                    "media_id": str(media.get("id_str") or media.get("id") or "").strip() or None,
                    "media_type": media.get("type") or "unknown",
                    "expanded_url": media.get("expanded_url"),
                    "media_url_https": media.get("media_url_https") or media.get("media_url"),
                    "display_url": media.get("display_url"),
                    "variants": _media_variants(media),
                }
            )
            records.append(record)
    return records
def build_media_manifest_summary(records: list[dict[str, Any]], *, source_dir: str, source_file: str) -> dict[str, Any]:
    """Roll per-media records up into the counts written to manifest_summary.json.

    Hashtags are lowercased before counting so casing variants of the same tag
    aggregate into one family; a missing media type counts as "unknown".
    """
    type_tally: Counter = Counter()
    tag_tally: Counter = Counter()
    for item in records:
        type_tally[item.get("media_type") or "unknown"] += 1
        tag_tally.update(tag.lower() for tag in item.get("hashtags", []))
    return {
        "schema_version": 1,
        "source_dir": source_dir,
        "source_file": source_file,
        "media_record_count": len(records),
        "media_type_counts": dict(type_tally),
        "hashtag_counts": dict(tag_tally),
        # Documented record schema, in manifest column order.
        "fields": [
            "tweet_id",
            "created_at",
            "full_text",
            "hashtags",
            "urls",
            "media_index",
            "media_id",
            "media_type",
            "expanded_url",
            "media_url_https",
            "display_url",
            "variants",
            "local_media_path",
            "source_file",
        ],
    }
def write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write *rows* to *path* as JSON Lines, creating parent directories as needed.

    Keys are sorted so output is byte-stable across runs. Fix: the encoding is
    now pinned to UTF-8 — the original relied on the platform default, which
    can corrupt non-ASCII tweet text on some systems.

    Args:
        path: Destination file; parents are created if missing.
        rows: JSON-serialisable dicts, one per output line.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, sort_keys=True) + "\n")
def main() -> None:
    """Entry point: parse tweets.js and emit manifest.jsonl plus manifest_summary.json."""
    ensure_layout()
    MEDIA_DIR.mkdir(parents=True, exist_ok=True)
    source_dir = resolve_source_dir()
    source_path = source_dir / "tweets.js"
    if not source_path.exists():
        # Machine-readable failure so callers can parse errors like the success path.
        raise SystemExit(json.dumps({"status": "error", "reason": f"missing {source_path}"}))
    payload_text = strip_js_prefix(source_path.read_text())
    entries = json.loads(payload_text)
    media_records = extract_media_records(entries, source_dir, source_path.name)
    write_jsonl(MEDIA_MANIFEST_JSONL, media_records)
    summary = build_media_manifest_summary(
        media_records, source_dir=str(source_dir), source_file=source_path.name
    )
    # Provenance: tie the derived artifacts back to the exact input file.
    summary["source_sha256"] = stable_sha256(source_path)
    summary["source_size_bytes"] = source_path.stat().st_size
    write_json(MEDIA_MANIFEST_SUMMARY, summary)
    print(json.dumps({"status": "ok", **summary}, sort_keys=True))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,99 @@
from __future__ import annotations
import json
from pathlib import Path
from scripts.twitter_archive.extract_media_manifest import (
build_media_manifest_summary,
extract_media_records,
)
def test_extract_media_records_preserves_post_text_and_hashtags(tmp_path: Path) -> None:
    """One video tweet with a matching local file yields one fully-populated record."""
    media_root = tmp_path / "tweets_media"
    media_root.mkdir()
    video_path = media_root / "12345-abcde.mp4"
    video_path.write_bytes(b"fake")
    tweet_payload = {
        "id_str": "12345",
        "created_at": "Sat Mar 28 12:00:00 +0000 2026",
        "full_text": "#timmyTime meets #TimmyChain in motion https://t.co/demo",
        "lang": "en",
        "entities": {
            "hashtags": [{"text": "timmyTime"}, {"text": "TimmyChain"}],
            "urls": [{"expanded_url": "https://example.com/post"}],
        },
        "extended_entities": {
            "media": [
                {
                    "id_str": "abcde",
                    "type": "video",
                    "media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/abc.jpg",
                    "expanded_url": "https://x.com/rockachopa/status/12345/video/1",
                    "video_info": {
                        "variants": [
                            {
                                "bitrate": 832000,
                                "content_type": "video/mp4",
                                "url": "https://video.twimg.com/ext_tw_video/abc.mp4",
                            }
                        ]
                    },
                }
            ]
        },
    }
    records = extract_media_records([{"tweet": tweet_payload}], tmp_path, "tweets.js")
    assert len(records) == 1
    (record,) = records
    assert record["tweet_id"] == "12345"
    assert record["full_text"].startswith("#timmyTime meets #TimmyChain")
    assert record["hashtags"] == ["timmyTime", "TimmyChain"]
    assert record["media_type"] == "video"
    assert record["local_media_path"] == str(video_path)
    assert record["source_file"] == "tweets.js"
    assert record["variants"][0]["url"].endswith("abc.mp4")
def test_build_media_manifest_summary_counts_media_types_and_hashtag_families() -> None:
    """Media types are tallied per record; hashtags are folded case-insensitively."""
    sample_records = [
        {
            "tweet_id": "1",
            "media_type": "video",
            "hashtags": ["timmyTime"],
            "created_at": "Sat Mar 28 12:00:00 +0000 2026",
        },
        {
            "tweet_id": "2",
            "media_type": "animated_gif",
            "hashtags": ["TimmyChain", "timmyTime"],
            "created_at": "Sun Mar 29 12:00:00 +0000 2026",
        },
    ]
    summary = build_media_manifest_summary(
        sample_records, source_dir="/tmp/source", source_file="tweets.js"
    )
    assert summary["media_record_count"] == 2
    assert summary["media_type_counts"]["video"] == 1
    assert summary["media_type_counts"]["animated_gif"] == 1
    assert summary["hashtag_counts"]["timmytime"] == 2
    assert summary["hashtag_counts"]["timmychain"] == 1
    assert summary["fields"]
    assert summary["source_file"] == "tweets.js"
def test_project_doc_mentions_media_manifest_and_hashtag_preservation() -> None:
    """PROJECT.md must document the media lane and the two flagship hashtags."""
    doc_text = Path("twitter-archive/PROJECT.md").read_text()
    for needle in ("media/", "manifest.jsonl", "hashtags", "#timmyTime", "#TimmyChain"):
        assert needle in doc_text

View File

@@ -117,7 +117,14 @@ Late notes are terse deltas. THAT'S THE PROOF OF GROWTH.
retweets.jsonl
likes.jsonl
manifest.json
media/ # Local-first media understanding
manifest.jsonl # one row per video/gif with tweet text + hashtags preserved
manifest_summary.json # rollup counts and hashtag families
keyframes/ # future extracted frames
audio/ # future demuxed audio
style_cards/ # future per-video aesthetic summaries
notes/ # Per-batch observations
batch_001.md # early: exploratory
batch_002.md # ...
@@ -182,6 +189,11 @@ We extract metrics from session files — no custom telemetry.
4. The adapter should accelerate the spiral: faster taxonomy,
sharper deltas, better voice match
### Media metadata rule:
- tweet post text and hashtags are first-class metadata all the way through the media lane
- especially preserve and measure `#timmyTime` and `#TimmyChain`
- raw Twitter videos stay local; only derived local artifacts move through the pipeline
---
## Running It