feat: add twitter video media manifest pipeline (#47)
This commit was merged in pull request #47.
This commit is contained in:
163
scripts/twitter_archive/extract_media_manifest.py
Normal file
163
scripts/twitter_archive/extract_media_manifest.py
Normal file
@@ -0,0 +1,163 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build a local manifest of Twitter archive media with post metadata preserved."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .common import ARCHIVE_DIR, ensure_layout, resolve_source_dir, stable_sha256, write_json
|
||||
|
||||
MEDIA_DIR = ARCHIVE_DIR / "media"
|
||||
MEDIA_MANIFEST_JSONL = MEDIA_DIR / "manifest.jsonl"
|
||||
MEDIA_MANIFEST_SUMMARY = MEDIA_DIR / "manifest_summary.json"
|
||||
MEDIA_FILE_DIR_NAMES = ("tweets_media", "community_tweet_media", "deleted_tweets_media")
|
||||
|
||||
|
||||
def strip_js_prefix(raw_text: str) -> str:
    """Drop the JS assignment prefix (``window.YTD... =``) and return the bare JSON array text."""
    bracket_pos = raw_text.find("[")
    if bracket_pos < 0:
        raise ValueError("Could not find JSON array in tweets.js")
    return raw_text[bracket_pos:].strip()
|
||||
|
||||
|
||||
def _tweet_payload(raw_entry: dict[str, Any]) -> dict[str, Any]:
|
||||
return raw_entry.get("tweet", raw_entry)
|
||||
|
||||
|
||||
def _hashtags(tweet: dict[str, Any]) -> list[str]:
|
||||
entities = tweet.get("entities", {}) or {}
|
||||
return [item.get("text", "").strip() for item in entities.get("hashtags", []) if item.get("text")]
|
||||
|
||||
|
||||
def _urls(tweet: dict[str, Any]) -> list[str]:
|
||||
entities = tweet.get("entities", {}) or {}
|
||||
return [item.get("expanded_url", "").strip() for item in entities.get("urls", []) if item.get("expanded_url")]
|
||||
|
||||
|
||||
def _media_entries(tweet: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
return ((tweet.get("extended_entities") or {}).get("media") or (tweet.get("entities") or {}).get("media") or [])
|
||||
|
||||
|
||||
def _find_local_media_path(source_dir: Path, tweet_id: str) -> str | None:
    """Locate the first on-disk archive file named ``<tweet_id>-*``, or None if nothing matches.

    Directories in MEDIA_FILE_DIR_NAMES are searched in order; within one
    directory the lexicographically-first match wins.
    """
    for candidate_dir in (source_dir / name for name in MEDIA_FILE_DIR_NAMES):
        if not candidate_dir.exists():
            continue
        hits = sorted(candidate_dir.glob(f"{tweet_id}-*"))
        if hits:
            return str(hits[0])
    return None
|
||||
|
||||
|
||||
def _media_variants(media: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
variants = []
|
||||
for variant in ((media.get("video_info") or {}).get("variants") or []):
|
||||
variants.append(
|
||||
{
|
||||
"content_type": variant.get("content_type"),
|
||||
"bitrate": int(variant.get("bitrate") or 0),
|
||||
"url": variant.get("url"),
|
||||
}
|
||||
)
|
||||
return variants
|
||||
|
||||
|
||||
def extract_media_records(raw_entries: list[dict[str, Any]], source_dir: Path, source_file: str) -> list[dict[str, Any]]:
    """Flatten archive tweet entries into one manifest record per media item.

    Every record carries the full tweet context (text, hashtags, urls, local
    file path) so media rows stay self-describing downstream. Entries with no
    usable id or no media are skipped.
    """
    out: list[dict[str, Any]] = []
    for entry in raw_entries:
        tweet = _tweet_payload(entry)
        tid = str(tweet.get("id_str") or tweet.get("id") or "").strip()
        if not tid:
            continue
        media_items = _media_entries(tweet)
        if not media_items:
            continue
        # Shared per-tweet context, copied into each media record below.
        shared = {
            "tweet_id": tid,
            "created_at": tweet.get("created_at"),
            "full_text": (tweet.get("full_text") or tweet.get("text") or "").strip(),
            "hashtags": _hashtags(tweet),
            "urls": _urls(tweet),
            "source_file": source_file,
            "local_media_path": _find_local_media_path(source_dir, tid),
        }
        for position, media in enumerate(media_items, start=1):
            record = dict(shared)
            record.update(
                media_index=position,
                media_id=str(media.get("id_str") or media.get("id") or "").strip() or None,
                media_type=media.get("type") or "unknown",
                expanded_url=media.get("expanded_url"),
                media_url_https=media.get("media_url_https") or media.get("media_url"),
                display_url=media.get("display_url"),
                variants=_media_variants(media),
            )
            out.append(record)
    return out
|
||||
|
||||
|
||||
def build_media_manifest_summary(records: list[dict[str, Any]], *, source_dir: str, source_file: str) -> dict[str, Any]:
    """Aggregate manifest rows into a rollup: media-type counts and lowercased hashtag counts."""
    type_tally: Counter[str] = Counter(record.get("media_type") or "unknown" for record in records)
    tag_tally: Counter[str] = Counter(
        tag.lower() for record in records for tag in record.get("hashtags", [])
    )
    return {
        "schema_version": 1,
        "source_dir": source_dir,
        "source_file": source_file,
        "media_record_count": len(records),
        "media_type_counts": dict(type_tally),
        "hashtag_counts": dict(tag_tally),
        # Documented per-record field set, for downstream schema checks.
        "fields": [
            "tweet_id",
            "created_at",
            "full_text",
            "hashtags",
            "urls",
            "media_index",
            "media_id",
            "media_type",
            "expanded_url",
            "media_url_https",
            "display_url",
            "variants",
            "local_media_path",
            "source_file",
        ],
    }
|
||||
|
||||
|
||||
def write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write *rows* to *path* as JSON Lines (one sorted-key object per line).

    Creates parent directories as needed. Fix: the encoding is now pinned to
    UTF-8 instead of relying on the platform default (``open(path, "w")``).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, sort_keys=True) + "\n")
|
||||
|
||||
|
||||
def main() -> None:
    """Run the pipeline: parse tweets.js, write manifest.jsonl plus a summary JSON, print status."""
    ensure_layout()
    MEDIA_DIR.mkdir(parents=True, exist_ok=True)

    source_dir = resolve_source_dir()
    tweets_path = source_dir / "tweets.js"
    if not tweets_path.exists():
        # Machine-readable failure, mirroring the ok-path JSON shape.
        raise SystemExit(json.dumps({"status": "error", "reason": f"missing {tweets_path}"}))

    entries = json.loads(strip_js_prefix(tweets_path.read_text()))
    records = extract_media_records(entries, source_dir, tweets_path.name)
    write_jsonl(MEDIA_MANIFEST_JSONL, records)

    summary = build_media_manifest_summary(records, source_dir=str(source_dir), source_file=tweets_path.name)
    # Provenance: hash + size of the exact tweets.js this manifest was built from.
    summary["source_sha256"] = stable_sha256(tweets_path)
    summary["source_size_bytes"] = tweets_path.stat().st_size
    write_json(MEDIA_MANIFEST_SUMMARY, summary)
    print(json.dumps({"status": "ok", **summary}, sort_keys=True))
|
||||
|
||||
|
||||
# Script entry point (e.g. ``python -m scripts.twitter_archive.extract_media_manifest``).
if __name__ == "__main__":
    main()
|
||||
99
tests/test_twitter_video_manifest.py
Normal file
99
tests/test_twitter_video_manifest.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from scripts.twitter_archive.extract_media_manifest import (
|
||||
build_media_manifest_summary,
|
||||
extract_media_records,
|
||||
)
|
||||
|
||||
|
||||
def test_extract_media_records_preserves_post_text_and_hashtags(tmp_path: Path) -> None:
    """A single archived video tweet yields one record with text, tags, and local path intact."""
    media_dir = tmp_path / "tweets_media"
    media_dir.mkdir()
    local_media = media_dir / "12345-abcde.mp4"
    local_media.write_bytes(b"fake")

    tweet = {
        "id_str": "12345",
        "created_at": "Sat Mar 28 12:00:00 +0000 2026",
        "full_text": "#timmyTime meets #TimmyChain in motion https://t.co/demo",
        "lang": "en",
        "entities": {
            "hashtags": [{"text": "timmyTime"}, {"text": "TimmyChain"}],
            "urls": [{"expanded_url": "https://example.com/post"}],
        },
        "extended_entities": {
            "media": [
                {
                    "id_str": "abcde",
                    "type": "video",
                    "media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/abc.jpg",
                    "expanded_url": "https://x.com/rockachopa/status/12345/video/1",
                    "video_info": {
                        "variants": [
                            {
                                "bitrate": 832000,
                                "content_type": "video/mp4",
                                "url": "https://video.twimg.com/ext_tw_video/abc.mp4",
                            }
                        ]
                    },
                }
            ]
        },
    }

    records = extract_media_records([{"tweet": tweet}], tmp_path, "tweets.js")

    assert len(records) == 1
    record = records[0]
    assert record["tweet_id"] == "12345"
    assert record["full_text"].startswith("#timmyTime meets #TimmyChain")
    assert record["hashtags"] == ["timmyTime", "TimmyChain"]
    assert record["media_type"] == "video"
    assert record["local_media_path"] == str(local_media)
    assert record["source_file"] == "tweets.js"
    assert record["variants"][0]["url"].endswith("abc.mp4")
|
||||
|
||||
|
||||
def test_build_media_manifest_summary_counts_media_types_and_hashtag_families() -> None:
    """Summary rollups count media types once each and fold hashtags case-insensitively."""
    video_record = {
        "tweet_id": "1",
        "media_type": "video",
        "hashtags": ["timmyTime"],
        "created_at": "Sat Mar 28 12:00:00 +0000 2026",
    }
    gif_record = {
        "tweet_id": "2",
        "media_type": "animated_gif",
        "hashtags": ["TimmyChain", "timmyTime"],
        "created_at": "Sun Mar 29 12:00:00 +0000 2026",
    }

    summary = build_media_manifest_summary(
        [video_record, gif_record], source_dir="/tmp/source", source_file="tweets.js"
    )

    assert summary["media_record_count"] == 2
    assert summary["media_type_counts"]["video"] == 1
    assert summary["media_type_counts"]["animated_gif"] == 1
    assert summary["hashtag_counts"]["timmytime"] == 2
    assert summary["hashtag_counts"]["timmychain"] == 1
    assert summary["fields"]
    assert summary["source_file"] == "tweets.js"
|
||||
|
||||
|
||||
def test_project_doc_mentions_media_manifest_and_hashtag_preservation() -> None:
    """PROJECT.md must document the media lane and the two tracked hashtag families."""
    doc_text = Path("twitter-archive/PROJECT.md").read_text()

    for needle in ("media/", "manifest.jsonl", "hashtags", "#timmyTime", "#TimmyChain"):
        assert needle in doc_text
|
||||
@@ -117,7 +117,14 @@ Late notes are terse deltas. THAT'S THE PROOF OF GROWTH.
|
||||
retweets.jsonl
|
||||
likes.jsonl
|
||||
manifest.json
|
||||
|
||||
|
||||
media/ # Local-first media understanding
|
||||
manifest.jsonl # one row per video/gif with tweet text + hashtags preserved
|
||||
manifest_summary.json # rollup counts and hashtag families
|
||||
keyframes/ # future extracted frames
|
||||
audio/ # future demuxed audio
|
||||
style_cards/ # future per-video aesthetic summaries
|
||||
|
||||
notes/ # Per-batch observations
|
||||
batch_001.md # early: exploratory
|
||||
batch_002.md # ...
|
||||
@@ -182,6 +189,11 @@ We extract metrics from session files — no custom telemetry.
|
||||
4. The adapter should accelerate the spiral: faster taxonomy,
|
||||
sharper deltas, better voice match
|
||||
|
||||
### Media metadata rule:
|
||||
- Tweet text and hashtags are first-class metadata all the way through the media lane
|
||||
- especially preserve and measure `#timmyTime` and `#TimmyChain`
|
||||
- raw Twitter videos stay local; only derived local artifacts move through the pipeline
|
||||
|
||||
---
|
||||
|
||||
## Running It
|
||||
|
||||
Reference in New Issue
Block a user