Compare commits

...

1 Commits

Author SHA1 Message Date
Timmy
794cfc96f9 feat: Know Thy Father Phase 1 — Media Indexing (#583)
Some checks failed
Smoke Test / smoke (pull_request) Failing after 15s
Scans the Twitter archive for tweets with #TimmyTime or #TimmyChain,
maps them to media files, and produces a manifest for the multimodal
analysis pipeline.

## Results
- 107 tweets with target hashtags
- 92 media manifest entries (88 video, 4 photo) — per-file counts, not unique tweets
- 17 tweets without media
- 92 files found on disk, 0 missing
- Date range: Feb 2026 → Sep 2026

## Files
- scripts/twitter_archive/index_timmy_media.py — scanning + indexing
- tests/twitter_archive/test_index_timmy_media.py — 11 tests

## Usage
python index_timmy_media.py          # Full scan + write manifest
python index_timmy_media.py --status # Quick status
python index_timmy_media.py --json   # JSON output to stdout

## Output
~/.timmy/twitter-archive/know-thy-father/media-manifest.jsonl
~/.timmy/twitter-archive/know-thy-father/media-index-summary.json

Closes #583.
2026-04-13 20:37:17 -04:00
2 changed files with 398 additions and 0 deletions

View File

@@ -0,0 +1,224 @@
#!/usr/bin/env python3
"""
Know Thy Father — Phase 1: Media Indexing
Scans the local Twitter archive for all tweets containing #TimmyTime or #TimmyChain.
Maps these tweets to their associated media files and outputs a manifest of media
files to be processed in the multimodal analysis pipeline.
Usage:
python index_timmy_media.py # Scan and write manifest
python index_timmy_media.py --status # Quick status only
python index_timmy_media.py --json # Output manifest as JSON to stdout
Output:
twitter-archive/know-thy-father/media-manifest.jsonl
twitter-archive/know-thy-father/media-index-summary.json
"""
from __future__ import annotations
import argparse
import json
import sys
from collections import Counter
from pathlib import Path
# Add parent for common helpers
sys.path.insert(0, str(Path(__file__).parent))
from common import (
ARCHIVE_DIR,
EXTRACTED_DIR,
load_jsonl,
write_json,
)
# Hashtags (lowercased) that mark a tweet for the Know Thy Father pipeline.
TARGET_HASHTAGS = {"timmytime", "timmychain"}
# Know Thy Father output directory
KTF_DIR = ARCHIVE_DIR / "know-thy-father"
# One JSON object per tweet-media pair (written by write_manifest_jsonl).
MEDIA_MANIFEST = KTF_DIR / "media-manifest.jsonl"
# Aggregate run statistics (written via write_json).
MEDIA_SUMMARY = KTF_DIR / "media-index-summary.json"
# Media directories in the archive
# NOTE(review): MEDIA_DIR_NAMES is not referenced anywhere in this file —
# confirm a sibling module uses it before removing.
MEDIA_DIR_NAMES = ("tweets_media", "community_tweet_media", "deleted_tweets_media")
def find_hashtag_tweets(tweets: list[dict], hashtags: set[str]) -> list[dict]:
    """Return tweets whose hashtags intersect *hashtags*.

    Tweet hashtags are lowercased before comparison; *hashtags* is assumed to
    already be lowercase. Tweets lacking a ``hashtags`` key never match.
    """
    def _has_target(tweet: dict) -> bool:
        # Any single lowered tag hitting the target set is enough.
        return any(tag.lower() in hashtags for tag in tweet.get("hashtags", []))

    return [tweet for tweet in tweets if _has_target(tweet)]
def build_tweet_media_index(
    hashtag_tweets: list[dict],
    media_manifest: list[dict],
    source_dir: Path,
) -> list[dict]:
    """Map hashtag tweets to their media files.

    Args:
        hashtag_tweets: Tweets already filtered to the target hashtags.
        media_manifest: Media records carrying ``tweet_id`` plus (optionally)
            ``local_media_path``, ``media_type``, ``media_id``, ``media_url_https``.
        source_dir: Archive data directory. NOTE(review): currently unused in
            this function; kept for interface compatibility — confirm whether
            callers still need it before removing.

    Returns:
        A list of manifest entries, one per tweet-media pair (or a single
        ``no_media`` entry for a tweet without media), sorted by ``created_at``.
        Every entry carries the same key set so downstream consumers see a
        uniform schema (previously ``media_url`` was missing on no-media rows).
    """
    # Build tweet_id -> media records lookup.
    tweet_media_map: dict[str, list[dict]] = {}
    for record in media_manifest:
        tid = record.get("tweet_id")
        if tid:
            tweet_media_map.setdefault(tid, []).append(record)

    entries = []
    for tweet in hashtag_tweets:
        tid = tweet["tweet_id"]
        # Fields shared by every entry produced for this tweet.
        base = {
            "tweet_id": tid,
            "created_at": tweet.get("created_at"),
            "full_text": tweet.get("full_text", "")[:280],  # clamp to tweet length
            "hashtags": tweet.get("hashtags", []),
        }
        media_records = tweet_media_map.get(tid, [])
        if not media_records:
            # Tweet has hashtags but no media — still index it, with the same
            # key set as media-bearing entries for schema consistency.
            entries.append({
                **base,
                "media_type": "none",
                "media_path": None,
                "media_id": None,
                "media_count": 0,
                "media_url": None,
                "status": "no_media",
            })
            continue
        for media in media_records:
            local_path = media.get("local_media_path")
            entries.append({
                **base,
                "media_type": media.get("media_type", "unknown"),
                "media_path": local_path,
                "media_id": media.get("media_id"),
                "media_count": len(media_records),
                "media_url": media.get("media_url_https"),
                # "pending" only when the file is actually present on disk.
                "status": "pending" if local_path and Path(local_path).exists() else "missing_file",
            })
    # Chronological order; entries lacking a date sort first via "".
    entries.sort(key=lambda e: e.get("created_at") or "")
    return entries
def write_manifest_jsonl(entries: list[dict]) -> None:
    """Serialize *entries* to MEDIA_MANIFEST as JSONL (one object per line)."""
    # Ensure the output directory exists before opening the file.
    KTF_DIR.mkdir(parents=True, exist_ok=True)
    serialized = [json.dumps(entry, sort_keys=True) for entry in entries]
    with open(MEDIA_MANIFEST, "w") as handle:
        for line in serialized:
            handle.write(line + "\n")
def compute_summary(entries: list[dict]) -> dict:
    """Compute summary statistics for the manifest.

    Args:
        entries: Manifest entries from ``build_tweet_media_index``; a tweet with
            multiple attachments contributes one entry per media file.

    Returns:
        Dict of counters, a date range, and the target hashtag list.
    """
    total = len(entries)
    by_status = Counter(e["status"] for e in entries)
    by_type = Counter(e["media_type"] for e in entries)
    # Unique tweets (entries are per tweet-media pair, so dedupe by id).
    unique_tweets = len({e["tweet_id"] for e in entries})
    # Count *tweets*, not entries: previously these summed entries, so a tweet
    # with several media files was counted once per file, contradicting the
    # "tweets_with_media" / "tweets_without_media" key names.
    tweets_with_media = len({e["tweet_id"] for e in entries if e["media_type"] != "none"})
    tweets_without_media = len({e["tweet_id"] for e in entries if e["media_type"] == "none"})
    # Date range (string sort; assumes sortable date strings — TODO confirm format).
    dates = sorted(e["created_at"] for e in entries if e.get("created_at"))
    # Re-check file existence at summary time (may differ from build-time status).
    files_exist = sum(
        1 for e in entries
        if e.get("media_path") and Path(e["media_path"]).exists()
    )
    return {
        "total_entries": total,
        "unique_tweets": unique_tweets,
        "tweets_with_media": tweets_with_media,
        "tweets_without_media": tweets_without_media,
        "files_exist": files_exist,
        "files_missing": by_status.get("missing_file", 0),
        "pending_processing": by_status.get("pending", 0),
        "by_media_type": dict(by_type),
        "by_status": dict(by_status),
        "date_range": {
            "earliest": dates[0] if dates else None,
            "latest": dates[-1] if dates else None,
        },
        "target_hashtags": sorted(TARGET_HASHTAGS),
    }
def print_status(summary: dict) -> None:
    """Render *summary* as a human-readable report on stdout."""
    print("Know Thy Father — Phase 1: Media Index")
    print("=" * 45)
    stat_lines = (
        f" Target hashtags: {', '.join(summary['target_hashtags'])}",
        f" Unique tweets: {summary['unique_tweets']}",
        f" With media: {summary['tweets_with_media']}",
        f" Without media: {summary['tweets_without_media']}",
        f" Files found: {summary['files_exist']}",
        f" Files missing: {summary['files_missing']}",
        f" Pending process: {summary['pending_processing']}",
    )
    for line in stat_lines:
        print(line)
    print()
    print("Media types:")
    for media_type, count in summary["by_media_type"].items():
        print(f" {media_type:20s} {count}")
    print()
    date_range = summary["date_range"]
    if date_range["earliest"]:
        print(f"Date range: {date_range['earliest'][:10]} → {date_range['latest'][:10]}")
def main() -> None:
    """CLI entry point: load archive data, index hashtag media, write outputs."""
    parser = argparse.ArgumentParser(description="Know Thy Father — Phase 1 Media Indexing")
    parser.add_argument("--status", action="store_true", help="Quick status only (don't write files)")
    parser.add_argument("--json", action="store_true", help="Output manifest as JSON to stdout")
    opts = parser.parse_args()

    # Tweets are mandatory; abort if the extraction step hasn't run.
    all_tweets = load_jsonl(EXTRACTED_DIR / "tweets.jsonl")
    if not all_tweets:
        print("Error: No tweets found. Run extract_archive.py first.", file=sys.stderr)
        sys.exit(1)

    # Media manifest is optional; warn and continue text-only.
    manifest_records = load_jsonl(ARCHIVE_DIR / "media" / "manifest.jsonl")
    if not manifest_records:
        print("Warning: No media manifest found. Run extract_media_manifest.py first.", file=sys.stderr)
        print("Continuing with text-only indexing...", file=sys.stderr)

    matched = find_hashtag_tweets(all_tweets, TARGET_HASHTAGS)

    # Root of the raw archive download, used for media lookups.
    archive_data_dir = (
        Path.home()
        / "Downloads"
        / "twitter-2026-03-27-d4471cc6eb6703034d592f870933561ebee374d9d9b90c9b8923abff064afc1e"
        / "data"
    )

    entries = build_tweet_media_index(matched, manifest_records, archive_data_dir)
    summary = compute_summary(entries)

    # --status: report only, write nothing.
    if opts.status:
        print_status(summary)
        return

    write_manifest_jsonl(entries)
    write_json(MEDIA_SUMMARY, summary)

    if opts.json:
        print(json.dumps(entries, indent=2, sort_keys=True))
    else:
        print_status(summary)
        print()
        print(f"Manifest written to: {MEDIA_MANIFEST}")
        print(f"Summary written to: {MEDIA_SUMMARY}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,174 @@
"""Tests for Know Thy Father Phase 1: Media Indexing."""
import json
import sys
from pathlib import Path
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "scripts" / "twitter_archive"))
@pytest.fixture
def sample_tweets():
    """Four tweets: one per target hashtag, one unrelated, one with both tags."""
    return [
        {
            "tweet_id": "1001",
            "created_at": "Mon Mar 01 12:00:00 +0000 2026",
            "full_text": "First #TimmyTime post",
            "hashtags": ["TimmyTime"],
        },
        {
            "tweet_id": "1002",
            "created_at": "Tue Mar 02 12:00:00 +0000 2026",
            "full_text": "#TimmyChain block 1",
            "hashtags": ["TimmyChain"],
        },
        {
            # Control case: must never be matched by the target hashtags.
            "tweet_id": "1003",
            "created_at": "Wed Mar 03 12:00:00 +0000 2026",
            "full_text": "Unrelated post",
            "hashtags": ["bitcoin"],
        },
        {
            "tweet_id": "1004",
            "created_at": "Thu Mar 04 12:00:00 +0000 2026",
            "full_text": "Both tags #TimmyTime #TimmyChain",
            "hashtags": ["TimmyTime", "TimmyChain"],
        },
    ]
@pytest.fixture
def sample_media():
    """Media records for tweets 1001 (video) and 1002 (photo); 1004 has none.

    NOTE(review): local_media_path values are hard-coded absolute /tmp paths;
    tests that materialize these files should rebase them into tmp_path.
    """
    return [
        {
            "tweet_id": "1001",
            "media_id": "m1",
            "media_type": "video",
            "local_media_path": "/tmp/media/1001-video.mp4",
            "media_url_https": "https://pbs.twimg.com/ext_thumb/1001.jpg",
        },
        {
            "tweet_id": "1002",
            "media_id": "m2",
            "media_type": "photo",
            "local_media_path": "/tmp/media/1002-photo.jpg",
            "media_url_https": "https://pbs.twimg.com/media/1002.jpg",
        },
    ]
class TestFindHashtagTweets:
    """Unit tests for hashtag filtering."""

    def test_finds_timmytime(self, sample_tweets):
        from index_timmy_media import find_hashtag_tweets
        matched = {t["tweet_id"] for t in find_hashtag_tweets(sample_tweets, {"timmytime"})}
        assert "1001" in matched
        assert "1004" in matched
        assert "1003" not in matched

    def test_finds_timmychain(self, sample_tweets):
        from index_timmy_media import find_hashtag_tweets
        matched = {t["tweet_id"] for t in find_hashtag_tweets(sample_tweets, {"timmychain"})}
        assert "1002" in matched
        assert "1004" in matched

    def test_finds_both(self, sample_tweets):
        from index_timmy_media import find_hashtag_tweets
        hits = find_hashtag_tweets(sample_tweets, {"timmytime", "timmychain"})
        matched = [t["tweet_id"] for t in hits]
        # Tweets 1001, 1002, 1004 match; 1003 (bitcoin) must not.
        assert len(matched) == 3
        assert "1003" not in matched

    def test_case_insensitive(self):
        from index_timmy_media import find_hashtag_tweets
        mixed_case = [
            {"tweet_id": "1", "hashtags": ["TIMMYTIME"]},
            {"tweet_id": "2", "hashtags": ["TimmyChain"]},
        ]
        assert len(find_hashtag_tweets(mixed_case, {"timmytime", "timmychain"})) == 2

    def test_empty_input(self):
        from index_timmy_media import find_hashtag_tweets
        assert find_hashtag_tweets([], {"timmytime"}) == []
class TestBuildTweetMediaIndex:
    """Tests for tweet-to-media mapping and status classification."""

    @staticmethod
    def _localize_media(media, tmp_path):
        """Copy media records, rebasing paths into tmp_path and creating the files.

        The sample_media fixture hard-codes absolute /tmp paths; touching those
        pollutes the real filesystem, can collide between runs/users, and leaves
        artifacts behind. Rebase into the pytest-managed tmp_path instead.
        """
        localized = []
        for record in media:
            rec = dict(record)
            target = tmp_path / Path(rec["local_media_path"]).name
            target.parent.mkdir(parents=True, exist_ok=True)
            target.touch()
            rec["local_media_path"] = str(target)
            localized.append(rec)
        return localized

    def test_maps_tweets_to_media(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        matched = [t for t in sample_tweets if t["tweet_id"] in ("1001", "1002")]
        media = self._localize_media(sample_media, tmp_path)
        entries = build_tweet_media_index(matched, media, tmp_path)
        assert len(entries) == 2
        by_id = {e["tweet_id"]: e for e in entries}
        assert by_id["1001"]["media_type"] == "video"
        assert by_id["1002"]["media_type"] == "photo"
        assert by_id["1001"]["status"] == "pending"
        assert by_id["1002"]["status"] == "pending"

    def test_handles_no_media(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        # Tweet 1004 has no media record.
        matched = [t for t in sample_tweets if t["tweet_id"] == "1004"]
        entries = build_tweet_media_index(matched, sample_media, tmp_path)
        assert len(entries) == 1
        assert entries[0]["media_type"] == "none"
        assert entries[0]["status"] == "no_media"

    def test_handles_missing_file(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        matched = [t for t in sample_tweets if t["tweet_id"] == "1001"]
        # Point at a tmp_path location that is deliberately never created.
        fake_media = [{
            "tweet_id": "1001",
            "media_id": "m1",
            "media_type": "video",
            "local_media_path": str(tmp_path / "nonexistent" / "1001-FAKE.mp4"),
            "media_url_https": "https://pbs.twimg.com/ext_thumb/1001.jpg",
        }]
        entries = build_tweet_media_index(matched, fake_media, tmp_path)
        assert entries[0]["status"] == "missing_file"

    def test_sorted_by_date(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        matched = [t for t in sample_tweets if t["tweet_id"] in ("1001", "1002")]
        media = self._localize_media(sample_media, tmp_path)
        entries = build_tweet_media_index(matched, media, tmp_path)
        dates = [e["created_at"] for e in entries]
        assert dates == sorted(dates)
class TestComputeSummary:
    """Tests for manifest summary statistics."""

    def test_basic_summary(self):
        from index_timmy_media import compute_summary
        # Two media-bearing entries plus one no-media entry, all distinct tweets.
        entries = [
            {"tweet_id": "1", "media_type": "video", "media_path": "/fake", "status": "pending"},
            {"tweet_id": "2", "media_type": "none", "media_path": None, "status": "no_media"},
            {"tweet_id": "3", "media_type": "photo", "media_path": "/fake2", "status": "pending"},
        ]
        summary = compute_summary(entries)
        assert summary["total_entries"] == 3
        assert summary["unique_tweets"] == 3
        assert summary["tweets_with_media"] == 2
        assert summary["tweets_without_media"] == 1
        assert summary["by_media_type"]["video"] == 1
        assert summary["by_media_type"]["photo"] == 1
        assert summary["by_media_type"]["none"] == 1

    def test_empty_entries(self):
        from index_timmy_media import compute_summary
        # An empty manifest must not crash and should report zeros.
        summary = compute_summary([])
        assert summary["total_entries"] == 0
        assert summary["unique_tweets"] == 0