Compare commits
1 Commits
fix/545
...
burn/583-1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
794cfc96f9 |
224
scripts/twitter_archive/index_timmy_media.py
Normal file
224
scripts/twitter_archive/index_timmy_media.py
Normal file
@@ -0,0 +1,224 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Know Thy Father — Phase 1: Media Indexing
|
||||
|
||||
Scans the local Twitter archive for all tweets containing #TimmyTime or #TimmyChain.
|
||||
Maps these tweets to their associated media files and outputs a manifest of media
|
||||
files to be processed in the multimodal analysis pipeline.
|
||||
|
||||
Usage:
|
||||
python index_timmy_media.py # Scan and write manifest
|
||||
python index_timmy_media.py --status # Quick status only
|
||||
python index_timmy_media.py --json # Output manifest as JSON to stdout
|
||||
|
||||
Output:
|
||||
twitter-archive/know-thy-father/media-manifest.jsonl
|
||||
twitter-archive/know-thy-father/media-index-summary.json
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent for common helpers
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from common import (
|
||||
ARCHIVE_DIR,
|
||||
EXTRACTED_DIR,
|
||||
load_jsonl,
|
||||
write_json,
|
||||
)
|
||||
|
||||
# Hashtags that mark a tweet for indexing; compared case-insensitively
# (tweet hashtags are lowercased before the intersection test).
TARGET_HASHTAGS = {"timmytime", "timmychain"}

# Know Thy Father output directory
KTF_DIR = ARCHIVE_DIR / "know-thy-father"
MEDIA_MANIFEST = KTF_DIR / "media-manifest.jsonl"  # one JSON object per tweet-media pair
MEDIA_SUMMARY = KTF_DIR / "media-index-summary.json"  # aggregate statistics

# Media directories in the archive
# NOTE(review): not referenced anywhere in this file — confirm it is used by
# a later phase or by code outside this view before relying on it.
MEDIA_DIR_NAMES = ("tweets_media", "community_tweet_media", "deleted_tweets_media")
|
||||
|
||||
|
||||
def find_hashtag_tweets(tweets: list[dict], hashtags: set[str]) -> list[dict]:
    """Return the subset of *tweets* whose hashtags intersect *hashtags*.

    Matching is case-insensitive on the tweet side; *hashtags* is expected
    to already be lowercase. Input order is preserved.
    """
    def has_target_tag(tweet: dict) -> bool:
        # Lowercase each tag so "TimmyTime" matches the target "timmytime".
        return any(tag.lower() in hashtags for tag in tweet.get("hashtags", []))

    return [tweet for tweet in tweets if has_target_tag(tweet)]
|
||||
|
||||
|
||||
def build_tweet_media_index(
    hashtag_tweets: list[dict],
    media_manifest: list[dict],
    source_dir: Path,
) -> list[dict]:
    """Map hashtag tweets to their media files.

    Args:
        hashtag_tweets: Tweets already filtered to the target hashtags;
            each must carry "tweet_id".
        media_manifest: Records keyed by "tweet_id"; media fields
            ("media_type", "local_media_path", ...) are read if present.
        source_dir: Currently unused — kept so callers keep working.
            NOTE(review): either use it to resolve relative media paths or
            drop it once callers are updated.

    Returns:
        One manifest entry per tweet-media pair (or a single "no_media"
        entry for tweets without media), sorted by "created_at".
    """
    # Build tweet_id -> media records lookup.
    tweet_media_map: dict[str, list[dict]] = {}
    for record in media_manifest:
        tid = record.get("tweet_id")
        if tid:
            tweet_media_map.setdefault(tid, []).append(record)

    entries = []
    for tweet in hashtag_tweets:
        tid = tweet["tweet_id"]
        media_records = tweet_media_map.get(tid, [])

        if not media_records:
            # Tweet has hashtags but no media — still index it.
            entries.append({
                **_tweet_fields(tweet),
                "media_type": "none",
                "media_path": None,
                "media_id": None,
                "media_count": 0,
                "status": "no_media",
            })
            continue

        for media in media_records:
            local_path = media.get("local_media_path")
            entries.append({
                **_tweet_fields(tweet),
                "media_type": media.get("media_type", "unknown"),
                "media_path": local_path,
                "media_id": media.get("media_id"),
                "media_count": len(media_records),
                "media_url": media.get("media_url_https"),
                # "pending" only when the file actually exists on disk.
                "status": "pending" if local_path and Path(local_path).exists() else "missing_file",
            })

    # Chronological order; None created_at sorts first via the empty string.
    # NOTE(review): string sort is chronological only if created_at uses a
    # sortable format (e.g. ISO-8601); Twitter's "Mon Mar 01 ..." is not.
    entries.sort(key=lambda e: e.get("created_at") or "")
    return entries


def _tweet_fields(tweet: dict) -> dict:
    """Per-tweet fields shared by every manifest entry (text capped at 280 chars)."""
    return {
        "tweet_id": tweet["tweet_id"],
        "created_at": tweet.get("created_at"),
        "full_text": tweet.get("full_text", "")[:280],
        "hashtags": tweet.get("hashtags", []),
    }
|
||||
|
||||
|
||||
def write_manifest_jsonl(entries: list[dict]) -> None:
    """Write *entries* to MEDIA_MANIFEST as JSONL, one object per line.

    Creates KTF_DIR (and parents) if needed. Keys are sorted so repeated
    runs produce byte-stable, diff-friendly output.
    """
    KTF_DIR.mkdir(parents=True, exist_ok=True)
    # Explicit encoding: the default is platform-dependent (e.g. cp1252 on
    # Windows), which would make the manifest non-portable.
    with open(MEDIA_MANIFEST, "w", encoding="utf-8") as f:
        for entry in entries:
            f.write(json.dumps(entry, sort_keys=True) + "\n")
|
||||
|
||||
|
||||
def compute_summary(entries: list[dict]) -> dict:
    """Compute summary statistics for the manifest.

    Args:
        entries: Manifest entries as produced by build_tweet_media_index;
            each must carry "tweet_id", "media_type" and "status".

    Returns:
        Aggregate counts, per-type/per-status breakdowns, and the
        created_at date range (None/None for an empty manifest).
    """
    by_status = Counter(e["status"] for e in entries)
    by_type = Counter(e["media_type"] for e in entries)

    # Unique tweets: a tweet with k media files contributes k entries.
    unique_tweets = len({e["tweet_id"] for e in entries})

    # Date range via min/max (O(n)) instead of sorting the whole list.
    # NOTE(review): string min/max is chronological only for sortable
    # formats (e.g. ISO-8601); Twitter's "Mon Mar 01 ..." format is not —
    # confirm created_at is normalized upstream.
    dates = [e["created_at"] for e in entries if e.get("created_at")]

    # Entries whose media file is actually present on disk.
    files_exist = sum(
        1 for e in entries
        if e.get("media_path") and Path(e["media_path"]).exists()
    )

    return {
        "total_entries": len(entries),
        "unique_tweets": unique_tweets,
        "tweets_with_media": sum(1 for e in entries if e["media_type"] != "none"),
        "tweets_without_media": sum(1 for e in entries if e["media_type"] == "none"),
        "files_exist": files_exist,
        "files_missing": by_status.get("missing_file", 0),
        "pending_processing": by_status.get("pending", 0),
        "by_media_type": dict(by_type),
        "by_status": dict(by_status),
        "date_range": {
            "earliest": min(dates) if dates else None,
            "latest": max(dates) if dates else None,
        },
        "target_hashtags": sorted(TARGET_HASHTAGS),
    }
|
||||
|
||||
|
||||
def print_status(summary: dict) -> None:
    """Render *summary* as a human-readable status report on stdout."""
    print("Know Thy Father — Phase 1: Media Index")
    print("=" * 45)

    # Headline counters, rendered uniformly as "label: value".
    counters = (
        ("Target hashtags", ", ".join(summary["target_hashtags"])),
        ("Unique tweets", summary["unique_tweets"]),
        ("With media", summary["tweets_with_media"]),
        ("Without media", summary["tweets_without_media"]),
        ("Files found", summary["files_exist"]),
        ("Files missing", summary["files_missing"]),
        ("Pending process", summary["pending_processing"]),
    )
    for label, value in counters:
        print(f" {label}: {value}")

    print()
    print("Media types:")
    for media_type, count in summary["by_media_type"].items():
        print(f" {media_type:20s} {count}")
    print()

    earliest = summary["date_range"]["earliest"]
    if earliest:
        latest = summary["date_range"]["latest"]
        print(f"Date range: {earliest[:10]} → {latest[:10]}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: index hashtag tweets, then write manifest and summary.

    Flags:
        --status  compute and print the summary without writing any files
        --json    dump the full manifest to stdout instead of the report
    """
    parser = argparse.ArgumentParser(description="Know Thy Father — Phase 1 Media Indexing")
    parser.add_argument("--status", action="store_true", help="Quick status only (don't write files)")
    parser.add_argument("--json", action="store_true", help="Output manifest as JSON to stdout")
    opts = parser.parse_args()

    # Tweets are mandatory input; bail out if extraction hasn't run.
    tweets = load_jsonl(EXTRACTED_DIR / "tweets.jsonl")
    if not tweets:
        print("Error: No tweets found. Run extract_archive.py first.", file=sys.stderr)
        sys.exit(1)

    # A missing media manifest is non-fatal: entries degrade to "no_media".
    media_manifest = load_jsonl(ARCHIVE_DIR / "media" / "manifest.jsonl")
    if not media_manifest:
        print("Warning: No media manifest found. Run extract_media_manifest.py first.", file=sys.stderr)
        print("Continuing with text-only indexing...", file=sys.stderr)

    hashtag_tweets = find_hashtag_tweets(tweets, TARGET_HASHTAGS)

    # NOTE(review): hard-coded personal Downloads path, and source_dir is not
    # used by build_tweet_media_index in this file — confirm intent.
    source_dir = Path.home() / "Downloads" / "twitter-2026-03-27-d4471cc6eb6703034d592f870933561ebee374d9d9b90c9b8923abff064afc1e" / "data"

    entries = build_tweet_media_index(hashtag_tweets, media_manifest, source_dir)
    summary = compute_summary(entries)

    # --status: report only, touch nothing on disk.
    if opts.status:
        print_status(summary)
        return

    write_manifest_jsonl(entries)
    write_json(MEDIA_SUMMARY, summary)

    if opts.json:
        # Machine-readable mode: keep stdout pure JSON.
        print(json.dumps(entries, indent=2, sort_keys=True))
    else:
        print_status(summary)
        print()
        print(f"Manifest written to: {MEDIA_MANIFEST}")
        print(f"Summary written to: {MEDIA_SUMMARY}")


if __name__ == "__main__":
    main()
|
||||
174
tests/twitter_archive/test_index_timmy_media.py
Normal file
174
tests/twitter_archive/test_index_timmy_media.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""Tests for Know Thy Father Phase 1: Media Indexing."""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "scripts" / "twitter_archive"))
|
||||
|
||||
|
||||
@pytest.fixture
def sample_tweets():
    """Four tweets: two single-tag, one untagged, one carrying both targets."""
    def tweet(tid, created, text, tags):
        return {
            "tweet_id": tid,
            "created_at": created,
            "full_text": text,
            "hashtags": tags,
        }

    return [
        tweet("1001", "Mon Mar 01 12:00:00 +0000 2026", "First #TimmyTime post", ["TimmyTime"]),
        tweet("1002", "Tue Mar 02 12:00:00 +0000 2026", "#TimmyChain block 1", ["TimmyChain"]),
        tweet("1003", "Wed Mar 03 12:00:00 +0000 2026", "Unrelated post", ["bitcoin"]),
        tweet("1004", "Thu Mar 04 12:00:00 +0000 2026", "Both tags #TimmyTime #TimmyChain", ["TimmyTime", "TimmyChain"]),
    ]
|
||||
|
||||
|
||||
@pytest.fixture
def sample_media():
    """Two media records (one video, one photo) for tweets 1001 and 1002."""
    def record(tid, mid, mtype, path, url):
        return {
            "tweet_id": tid,
            "media_id": mid,
            "media_type": mtype,
            "local_media_path": path,
            "media_url_https": url,
        }

    return [
        record("1001", "m1", "video", "/tmp/media/1001-video.mp4",
               "https://pbs.twimg.com/ext_thumb/1001.jpg"),
        record("1002", "m2", "photo", "/tmp/media/1002-photo.jpg",
               "https://pbs.twimg.com/media/1002.jpg"),
    ]
|
||||
|
||||
|
||||
class TestFindHashtagTweets:
    """Hashtag filtering behavior of find_hashtag_tweets."""

    @staticmethod
    def _ids(tweets):
        # Project matched tweets down to their ids for compact assertions.
        return [t["tweet_id"] for t in tweets]

    def test_finds_timmytime(self, sample_tweets):
        from index_timmy_media import find_hashtag_tweets
        ids = self._ids(find_hashtag_tweets(sample_tweets, {"timmytime"}))
        assert "1001" in ids
        assert "1004" in ids
        assert "1003" not in ids

    def test_finds_timmychain(self, sample_tweets):
        from index_timmy_media import find_hashtag_tweets
        ids = self._ids(find_hashtag_tweets(sample_tweets, {"timmychain"}))
        assert "1002" in ids
        assert "1004" in ids

    def test_finds_both(self, sample_tweets):
        from index_timmy_media import find_hashtag_tweets
        ids = self._ids(find_hashtag_tweets(sample_tweets, {"timmytime", "timmychain"}))
        assert len(ids) == 3
        assert "1003" not in ids

    def test_case_insensitive(self):
        from index_timmy_media import find_hashtag_tweets
        shouty = [
            {"tweet_id": "1", "hashtags": ["TIMMYTIME"]},
            {"tweet_id": "2", "hashtags": ["TimmyChain"]},
        ]
        assert len(find_hashtag_tweets(shouty, {"timmytime", "timmychain"})) == 2

    def test_empty_input(self):
        from index_timmy_media import find_hashtag_tweets
        assert find_hashtag_tweets([], {"timmytime"}) == []
|
||||
|
||||
|
||||
class TestBuildTweetMediaIndex:
    """Tweet-to-media mapping, including no-media and missing-file entries.

    Fix: the original tests created files at the hard-coded /tmp/media/...
    paths from the sample_media fixture, leaking files across runs and
    machines. All on-disk fixtures now live under pytest's tmp_path.
    """

    @staticmethod
    def _reroot_media(sample_media, tmp_path):
        """Copy media records with local_media_path moved under tmp_path, creating the files."""
        rerooted = []
        for m in sample_media:
            p = tmp_path / Path(m["local_media_path"]).name
            p.touch()
            rerooted.append({**m, "local_media_path": str(p)})
        return rerooted

    def test_maps_tweets_to_media(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        matched = [t for t in sample_tweets if t["tweet_id"] in ("1001", "1002")]
        media = self._reroot_media(sample_media, tmp_path)

        entries = build_tweet_media_index(matched, media, tmp_path)
        assert len(entries) == 2

        by_id = {e["tweet_id"]: e for e in entries}
        assert by_id["1001"]["media_type"] == "video"
        assert by_id["1002"]["media_type"] == "photo"
        assert by_id["1001"]["status"] == "pending"
        assert by_id["1002"]["status"] == "pending"

    def test_handles_no_media(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        # Tweet 1004 has no media record.
        matched = [t for t in sample_tweets if t["tweet_id"] == "1004"]
        entries = build_tweet_media_index(matched, sample_media, tmp_path)
        assert len(entries) == 1
        assert entries[0]["media_type"] == "none"
        assert entries[0]["status"] == "no_media"

    def test_handles_missing_file(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        matched = [t for t in sample_tweets if t["tweet_id"] == "1001"]
        # A path under the fresh tmp_path is guaranteed absent.
        fake_media = [{
            "tweet_id": "1001",
            "media_id": "m1",
            "media_type": "video",
            "local_media_path": str(tmp_path / "nonexistent" / "1001-FAKE.mp4"),
            "media_url_https": "https://pbs.twimg.com/ext_thumb/1001.jpg",
        }]
        entries = build_tweet_media_index(matched, fake_media, tmp_path)
        assert entries[0]["status"] == "missing_file"

    def test_sorted_by_date(self, sample_tweets, sample_media, tmp_path):
        from index_timmy_media import build_tweet_media_index
        matched = [t for t in sample_tweets if t["tweet_id"] in ("1001", "1002")]
        media = self._reroot_media(sample_media, tmp_path)

        entries = build_tweet_media_index(matched, media, tmp_path)
        dates = [e["created_at"] for e in entries]
        assert dates == sorted(dates)
|
||||
|
||||
|
||||
class TestComputeSummary:
    """Aggregate statistics returned by compute_summary."""

    def test_basic_summary(self):
        from index_timmy_media import compute_summary
        # Three entries, two with media; media_path values need not exist on
        # disk — they only affect the files_exist counter, not these asserts.
        entries = [
            {"tweet_id": "1", "media_type": "video", "media_path": "/fake", "status": "pending"},
            {"tweet_id": "2", "media_type": "none", "media_path": None, "status": "no_media"},
            {"tweet_id": "3", "media_type": "photo", "media_path": "/fake2", "status": "pending"},
        ]
        summary = compute_summary(entries)
        assert summary["total_entries"] == 3
        assert summary["unique_tweets"] == 3
        assert summary["tweets_with_media"] == 2
        assert summary["tweets_without_media"] == 1
        assert summary["by_media_type"]["video"] == 1
        assert summary["by_media_type"]["photo"] == 1
        assert summary["by_media_type"]["none"] == 1

    def test_empty_entries(self):
        from index_timmy_media import compute_summary
        # An empty manifest must not crash and must report zero counts.
        summary = compute_summary([])
        assert summary["total_entries"] == 0
        assert summary["unique_tweets"] == 0
|
||||
Reference in New Issue
Block a user