Compare commits
1 Commits
fix/545
...
burn/583-1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
794cfc96f9 |
224
scripts/twitter_archive/index_timmy_media.py
Normal file
224
scripts/twitter_archive/index_timmy_media.py
Normal file
@@ -0,0 +1,224 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Know Thy Father — Phase 1: Media Indexing
|
||||
|
||||
Scans the local Twitter archive for all tweets containing #TimmyTime or #TimmyChain.
|
||||
Maps these tweets to their associated media files and outputs a manifest of media
|
||||
files to be processed in the multimodal analysis pipeline.
|
||||
|
||||
Usage:
|
||||
python index_timmy_media.py # Scan and write manifest
|
||||
python index_timmy_media.py --status # Quick status only
|
||||
python index_timmy_media.py --json # Output manifest as JSON to stdout
|
||||
|
||||
Output:
|
||||
twitter-archive/know-thy-father/media-manifest.jsonl
|
||||
twitter-archive/know-thy-father/media-index-summary.json
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent for common helpers
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from common import (
|
||||
ARCHIVE_DIR,
|
||||
EXTRACTED_DIR,
|
||||
load_jsonl,
|
||||
write_json,
|
||||
)
|
||||
|
||||
# Hashtags that mark a tweet for indexing; compared case-insensitively
# (tweet hashtags are lowercased before the intersection test).
TARGET_HASHTAGS = {"timmytime", "timmychain"}

# Know Thy Father output directory
KTF_DIR = ARCHIVE_DIR / "know-thy-father"
MEDIA_MANIFEST = KTF_DIR / "media-manifest.jsonl"  # one JSON object per tweet-media pair
MEDIA_SUMMARY = KTF_DIR / "media-index-summary.json"  # aggregate statistics

# Media directories in the archive
# NOTE(review): not referenced anywhere in this file — confirm it is used by
# a later phase or by code outside this view before relying on it.
MEDIA_DIR_NAMES = ("tweets_media", "community_tweet_media", "deleted_tweets_media")
|
||||
|
||||
|
||||
def find_hashtag_tweets(tweets: list[dict], hashtags: set[str]) -> list[dict]:
    """Return the subset of *tweets* whose hashtags intersect *hashtags*.

    Matching is case-insensitive on the tweet side; *hashtags* is expected
    to already be lowercase. Input order is preserved.
    """
    def has_target_tag(tweet: dict) -> bool:
        # Lowercase each tag so "TimmyTime" matches the target "timmytime".
        return any(tag.lower() in hashtags for tag in tweet.get("hashtags", []))

    return [tweet for tweet in tweets if has_target_tag(tweet)]
|
||||
|
||||
|
||||
def build_tweet_media_index(
    hashtag_tweets: list[dict],
    media_manifest: list[dict],
    source_dir: Path,
) -> list[dict]:
    """Map hashtag tweets to their media files.

    Args:
        hashtag_tweets: Tweets already filtered to the target hashtags;
            each must carry "tweet_id".
        media_manifest: Records keyed by "tweet_id"; media fields
            ("media_type", "local_media_path", ...) are read if present.
        source_dir: Currently unused — kept so callers keep working.
            NOTE(review): either use it to resolve relative media paths or
            drop it once callers are updated.

    Returns:
        One manifest entry per tweet-media pair (or a single "no_media"
        entry for tweets without media), sorted by "created_at".
    """
    # Build tweet_id -> media records lookup.
    tweet_media_map: dict[str, list[dict]] = {}
    for record in media_manifest:
        tid = record.get("tweet_id")
        if tid:
            tweet_media_map.setdefault(tid, []).append(record)

    entries = []
    for tweet in hashtag_tweets:
        tid = tweet["tweet_id"]
        media_records = tweet_media_map.get(tid, [])

        if not media_records:
            # Tweet has hashtags but no media — still index it.
            entries.append({
                **_tweet_fields(tweet),
                "media_type": "none",
                "media_path": None,
                "media_id": None,
                "media_count": 0,
                "status": "no_media",
            })
            continue

        for media in media_records:
            local_path = media.get("local_media_path")
            entries.append({
                **_tweet_fields(tweet),
                "media_type": media.get("media_type", "unknown"),
                "media_path": local_path,
                "media_id": media.get("media_id"),
                "media_count": len(media_records),
                "media_url": media.get("media_url_https"),
                # "pending" only when the file actually exists on disk.
                "status": "pending" if local_path and Path(local_path).exists() else "missing_file",
            })

    # Chronological order; None created_at sorts first via the empty string.
    # NOTE(review): string sort is chronological only if created_at uses a
    # sortable format (e.g. ISO-8601); Twitter's "Mon Mar 01 ..." is not.
    entries.sort(key=lambda e: e.get("created_at") or "")
    return entries


def _tweet_fields(tweet: dict) -> dict:
    """Per-tweet fields shared by every manifest entry (text capped at 280 chars)."""
    return {
        "tweet_id": tweet["tweet_id"],
        "created_at": tweet.get("created_at"),
        "full_text": tweet.get("full_text", "")[:280],
        "hashtags": tweet.get("hashtags", []),
    }
|
||||
|
||||
|
||||
def write_manifest_jsonl(entries: list[dict]) -> None:
    """Write *entries* to MEDIA_MANIFEST as JSONL, one object per line.

    Creates KTF_DIR (and parents) if needed. Keys are sorted so repeated
    runs produce byte-stable, diff-friendly output.
    """
    KTF_DIR.mkdir(parents=True, exist_ok=True)
    # Explicit encoding: the default is platform-dependent (e.g. cp1252 on
    # Windows), which would make the manifest non-portable.
    with open(MEDIA_MANIFEST, "w", encoding="utf-8") as f:
        for entry in entries:
            f.write(json.dumps(entry, sort_keys=True) + "\n")
|
||||
|
||||
|
||||
def compute_summary(entries: list[dict]) -> dict:
    """Compute summary statistics for the manifest.

    Args:
        entries: Manifest entries as produced by build_tweet_media_index;
            each must carry "tweet_id", "media_type" and "status".

    Returns:
        Aggregate counts, per-type/per-status breakdowns, and the
        created_at date range (None/None for an empty manifest).
    """
    by_status = Counter(e["status"] for e in entries)
    by_type = Counter(e["media_type"] for e in entries)

    # Unique tweets: a tweet with k media files contributes k entries.
    unique_tweets = len({e["tweet_id"] for e in entries})

    # Date range via min/max (O(n)) instead of sorting the whole list.
    # NOTE(review): string min/max is chronological only for sortable
    # formats (e.g. ISO-8601); Twitter's "Mon Mar 01 ..." format is not —
    # confirm created_at is normalized upstream.
    dates = [e["created_at"] for e in entries if e.get("created_at")]

    # Entries whose media file is actually present on disk.
    files_exist = sum(
        1 for e in entries
        if e.get("media_path") and Path(e["media_path"]).exists()
    )

    return {
        "total_entries": len(entries),
        "unique_tweets": unique_tweets,
        "tweets_with_media": sum(1 for e in entries if e["media_type"] != "none"),
        "tweets_without_media": sum(1 for e in entries if e["media_type"] == "none"),
        "files_exist": files_exist,
        "files_missing": by_status.get("missing_file", 0),
        "pending_processing": by_status.get("pending", 0),
        "by_media_type": dict(by_type),
        "by_status": dict(by_status),
        "date_range": {
            "earliest": min(dates) if dates else None,
            "latest": max(dates) if dates else None,
        },
        "target_hashtags": sorted(TARGET_HASHTAGS),
    }
|
||||
|
||||
|
||||
def print_status(summary: dict) -> None:
    """Render *summary* as a human-readable status report on stdout."""
    print("Know Thy Father — Phase 1: Media Index")
    print("=" * 45)

    # Headline counters, rendered uniformly as "label: value".
    counters = (
        ("Target hashtags", ", ".join(summary["target_hashtags"])),
        ("Unique tweets", summary["unique_tweets"]),
        ("With media", summary["tweets_with_media"]),
        ("Without media", summary["tweets_without_media"]),
        ("Files found", summary["files_exist"]),
        ("Files missing", summary["files_missing"]),
        ("Pending process", summary["pending_processing"]),
    )
    for label, value in counters:
        print(f" {label}: {value}")

    print()
    print("Media types:")
    for media_type, count in summary["by_media_type"].items():
        print(f" {media_type:20s} {count}")
    print()

    earliest = summary["date_range"]["earliest"]
    if earliest:
        latest = summary["date_range"]["latest"]
        print(f"Date range: {earliest[:10]} → {latest[:10]}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: index hashtag tweets, then write manifest and summary.

    Flags:
        --status  compute and print the summary without writing any files
        --json    dump the full manifest to stdout instead of the report
    """
    parser = argparse.ArgumentParser(description="Know Thy Father — Phase 1 Media Indexing")
    parser.add_argument("--status", action="store_true", help="Quick status only (don't write files)")
    parser.add_argument("--json", action="store_true", help="Output manifest as JSON to stdout")
    opts = parser.parse_args()

    # Tweets are mandatory input; bail out if extraction hasn't run.
    tweets = load_jsonl(EXTRACTED_DIR / "tweets.jsonl")
    if not tweets:
        print("Error: No tweets found. Run extract_archive.py first.", file=sys.stderr)
        sys.exit(1)

    # A missing media manifest is non-fatal: entries degrade to "no_media".
    media_manifest = load_jsonl(ARCHIVE_DIR / "media" / "manifest.jsonl")
    if not media_manifest:
        print("Warning: No media manifest found. Run extract_media_manifest.py first.", file=sys.stderr)
        print("Continuing with text-only indexing...", file=sys.stderr)

    hashtag_tweets = find_hashtag_tweets(tweets, TARGET_HASHTAGS)

    # NOTE(review): hard-coded personal Downloads path, and source_dir is not
    # used by build_tweet_media_index in this file — confirm intent.
    source_dir = Path.home() / "Downloads" / "twitter-2026-03-27-d4471cc6eb6703034d592f870933561ebee374d9d9b90c9b8923abff064afc1e" / "data"

    entries = build_tweet_media_index(hashtag_tweets, media_manifest, source_dir)
    summary = compute_summary(entries)

    # --status: report only, touch nothing on disk.
    if opts.status:
        print_status(summary)
        return

    write_manifest_jsonl(entries)
    write_json(MEDIA_SUMMARY, summary)

    if opts.json:
        # Machine-readable mode: keep stdout pure JSON.
        print(json.dumps(entries, indent=2, sort_keys=True))
    else:
        print_status(summary)
        print()
        print(f"Manifest written to: {MEDIA_MANIFEST}")
        print(f"Summary written to: {MEDIA_SUMMARY}")


if __name__ == "__main__":
    main()
|
||||
174
tests/twitter_archive/test_index_timmy_media.py
Normal file
174
tests/twitter_archive/test_index_timmy_media.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""Tests for Know Thy Father Phase 1: Media Indexing."""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "scripts" / "twitter_archive"))
|
||||
|
||||
|
||||
@pytest.fixture
def sample_tweets():
    """Four tweets: two single-tag, one untagged, one carrying both targets."""
    def tweet(tid, created, text, tags):
        return {
            "tweet_id": tid,
            "created_at": created,
            "full_text": text,
            "hashtags": tags,
        }

    return [
        tweet("1001", "Mon Mar 01 12:00:00 +0000 2026", "First #TimmyTime post", ["TimmyTime"]),
        tweet("1002", "Tue Mar 02 12:00:00 +0000 2026", "#TimmyChain block 1", ["TimmyChain"]),
        tweet("1003", "Wed Mar 03 12:00:00 +0000 2026", "Unrelated post", ["bitcoin"]),
        tweet("1004", "Thu Mar 04 12:00:00 +0000 2026", "Both tags #TimmyTime #TimmyChain", ["TimmyTime", "TimmyChain"]),
    ]
|
||||
|
||||
|
||||
@pytest.fixture
def sample_media():
    """Two media records (one video, one photo) for tweets 1001 and 1002."""
    def record(tid, mid, mtype, path, url):
        return {
            "tweet_id": tid,
            "media_id": mid,
            "media_type": mtype,
            "local_media_path": path,
            "media_url_https": url,
        }

    return [
        record("1001", "m1", "video", "/tmp/media/1001-video.mp4",
               "https://pbs.twimg.com/ext_thumb/1001.jpg"),
        record("1002", "m2", "photo", "/tmp/media/1002-photo.jpg",
               "https://pbs.twimg.com/media/1002.jpg"),
    ]
|
||||
|
||||
|
||||
class TestFindHashtagTweets:
    """Hashtag filtering behavior of find_hashtag_tweets."""

    @staticmethod
    def _ids(tweets):
        # Project matched tweets down to their ids for compact assertions.
        return [t["tweet_id"] for t in tweets]

    def test_finds_timmytime(self, sample_tweets):
        from index_timmy_media import find_hashtag_tweets
        ids = self._ids(find_hashtag_tweets(sample_tweets, {"timmytime"}))
        assert "1001" in ids
        assert "1004" in ids
        assert "1003" not in ids

    def test_finds_timmychain(self, sample_tweets):
        from index_timmy_media import find_hashtag_tweets
        ids = self._ids(find_hashtag_tweets(sample_tweets, {"timmychain"}))
        assert "1002" in ids
        assert "1004" in ids

    def test_finds_both(self, sample_tweets):
        from index_timmy_media import find_hashtag_tweets
        ids = self._ids(find_hashtag_tweets(sample_tweets, {"timmytime", "timmychain"}))
        assert len(ids) == 3
        assert "1003" not in ids

    def test_case_insensitive(self):
        from index_timmy_media import find_hashtag_tweets
        shouty = [
            {"tweet_id": "1", "hashtags": ["TIMMYTIME"]},
            {"tweet_id": "2", "hashtags": ["TimmyChain"]},
        ]
        assert len(find_hashtag_tweets(shouty, {"timmytime", "timmychain"})) == 2

    def test_empty_input(self):
        from index_timmy_media import find_hashtag_tweets
        assert find_hashtag_tweets([], {"timmytime"}) == []
|
||||
|
||||
|
||||
class TestBuildTweetMediaIndex:
    """Tweet-to-media mapping, including no-media and missing-file entries.

    Fix: the original tests created files at the hard-coded /tmp/media/...
    paths from the sample_media fixture, leaking files across runs and
    machines. All on-disk fixtures now live under pytest's tmp_path.
    """

    @staticmethod
    def _reroot_media(sample_media, tmp_path):
        """Copy media records with local_media_path moved under tmp_path, creating the files."""
        rerooted = []
        for m in sample_media:
            p = tmp_path / Path(m["local_media_path"]).name
            p.touch()
            rerooted.append({**m, "local_media_path": str(p)})
        return rerooted

    def test_maps_tweets_to_media(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        matched = [t for t in sample_tweets if t["tweet_id"] in ("1001", "1002")]
        media = self._reroot_media(sample_media, tmp_path)

        entries = build_tweet_media_index(matched, media, tmp_path)
        assert len(entries) == 2

        by_id = {e["tweet_id"]: e for e in entries}
        assert by_id["1001"]["media_type"] == "video"
        assert by_id["1002"]["media_type"] == "photo"
        assert by_id["1001"]["status"] == "pending"
        assert by_id["1002"]["status"] == "pending"

    def test_handles_no_media(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        # Tweet 1004 has no media record.
        matched = [t for t in sample_tweets if t["tweet_id"] == "1004"]
        entries = build_tweet_media_index(matched, sample_media, tmp_path)
        assert len(entries) == 1
        assert entries[0]["media_type"] == "none"
        assert entries[0]["status"] == "no_media"

    def test_handles_missing_file(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        matched = [t for t in sample_tweets if t["tweet_id"] == "1001"]
        # A path under the fresh tmp_path is guaranteed absent.
        fake_media = [{
            "tweet_id": "1001",
            "media_id": "m1",
            "media_type": "video",
            "local_media_path": str(tmp_path / "nonexistent" / "1001-FAKE.mp4"),
            "media_url_https": "https://pbs.twimg.com/ext_thumb/1001.jpg",
        }]
        entries = build_tweet_media_index(matched, fake_media, tmp_path)
        assert entries[0]["status"] == "missing_file"

    def test_sorted_by_date(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        matched = [t for t in sample_tweets if t["tweet_id"] in ("1001", "1002")]
        media = self._reroot_media(sample_media, tmp_path)

        entries = build_tweet_media_index(matched, media, tmp_path)
        dates = [e["created_at"] for e in entries]
        assert dates == sorted(dates)
|
||||
|
||||
|
||||
class TestComputeSummary:
    """Aggregate statistics returned by compute_summary."""

    def test_basic_summary(self):
        from index_timmy_media import compute_summary
        # Three entries, two with media; media_path values need not exist on
        # disk — they only affect the files_exist counter, not these asserts.
        entries = [
            {"tweet_id": "1", "media_type": "video", "media_path": "/fake", "status": "pending"},
            {"tweet_id": "2", "media_type": "none", "media_path": None, "status": "no_media"},
            {"tweet_id": "3", "media_type": "photo", "media_path": "/fake2", "status": "pending"},
        ]
        summary = compute_summary(entries)
        assert summary["total_entries"] == 3
        assert summary["unique_tweets"] == 3
        assert summary["tweets_with_media"] == 2
        assert summary["tweets_without_media"] == 1
        assert summary["by_media_type"]["video"] == 1
        assert summary["by_media_type"]["photo"] == 1
        assert summary["by_media_type"]["none"] == 1

    def test_empty_entries(self):
        from index_timmy_media import compute_summary
        # An empty manifest must not crash and must report zero counts.
        summary = compute_summary([])
        assert summary["total_entries"] == 0
        assert summary["unique_tweets"] == 0
|
||||
Reference in New Issue
Block a user