#!/usr/bin/env python3
"""Know Thy Father — Phase 1: Media Indexing

Scans the local Twitter archive for all tweets containing #TimmyTime or #TimmyChain.
Maps these tweets to their associated media files in data/media.
Outputs a manifest of media files to be processed by the multimodal pipeline.

Usage:
    python3 scripts/know_thy_father/index_media.py
    python3 scripts/know_thy_father/index_media.py --tweets twitter-archive/extracted/tweets.jsonl
    python3 scripts/know_thy_father/index_media.py --output twitter-archive/know-thy-father/media_manifest.jsonl

Ref: #582, #583
"""
|
|
|
|
from __future__ import annotations

import argparse
import json
import logging
import os
import sys
from collections import Counter
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
|
# Module-level logger; configured by main() via logging.basicConfig.
logger = logging.getLogger(__name__)

# Target hashtags, lowercased — matching in filter_target_tweets() is
# case-insensitive.
TARGET_HASHTAGS = {"timmytime", "timmychain"}

# Twitter archive default paths (relative to the repository root).
DEFAULT_TWEETS_PATH = Path("twitter-archive/extracted/tweets.jsonl")
DEFAULT_MEDIA_MANIFEST = Path("twitter-archive/media/manifest.jsonl")
DEFAULT_OUTPUT_PATH = Path("twitter-archive/know-thy-father/media_manifest.jsonl")
|
|
|
|
|
|
@dataclass
class MediaEntry:
    """A media file associated with a #TimmyTime/#TimmyChain tweet."""

    tweet_id: str          # ID of the tweet that carried the media
    created_at: str        # tweet timestamp as stored in the archive
    full_text: str         # full tweet text
    hashtags: List[str]    # hashtags of the tweet, as stored in the archive
    media_id: str          # unique media ID (or "url-<tweet_id>" for URL refs)
    media_type: str        # photo, video, animated_gif, url_reference
    media_index: int       # position of the media within the tweet
    local_media_path: str  # path of the downloaded file; "" for URL refs
    media_url_https: str = ""
    expanded_url: str = ""
    source: str = ""       # "media_manifest" or "tweets_only"
    indexed_at: str = ""   # UTC indexing timestamp, "...Z" suffixed

    def __post_init__(self) -> None:
        # Stamp the entry with the indexing time unless one was supplied.
        # datetime.utcnow() is deprecated since Python 3.12; use an aware
        # UTC datetime and keep the legacy trailing-"Z" string format.
        if not self.indexed_at:
            self.indexed_at = (
                datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
            )

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class IndexStats:
    """Statistics from the indexing run."""

    total_tweets_scanned: int = 0          # every tweet read from the archive
    target_tweets_found: int = 0           # tweets carrying a target hashtag
    target_tweets_with_media: int = 0      # target tweets with >= 1 media entry
    target_tweets_without_media: int = 0   # target tweets with no media entry
    total_media_entries: int = 0           # media entries written to the manifest
    # media_type -> count, sorted by type name
    media_types: Dict[str, int] = field(default_factory=dict)
    # lowercased hashtag -> count, sorted by descending count
    hashtag_counts: Dict[str, int] = field(default_factory=dict)
    # {"earliest": ..., "latest": ...} over entry created_at strings
    date_range: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)
|
|
|
|
|
|
def load_tweets(tweets_path: Path) -> List[Dict[str, Any]]:
    """Load tweets from a JSON-lines file, one tweet object per line.

    Args:
        tweets_path: Path to the tweets JSONL file.

    Returns:
        Parsed tweet dicts. An empty list if the file does not exist;
        blank lines are ignored and malformed lines are logged and skipped.
    """
    if not tweets_path.exists():
        logger.error(f"Tweets file not found: {tweets_path}")
        return []

    loaded: List[Dict[str, Any]] = []
    with open(tweets_path) as handle:
        for line_num, raw in enumerate(handle, 1):
            stripped = raw.strip()
            if not stripped:
                continue
            try:
                loaded.append(json.loads(stripped))
            except json.JSONDecodeError as e:
                logger.warning(f"Line {line_num}: invalid JSON: {e}")

    logger.info(f"Loaded {len(loaded)} tweets from {tweets_path}")
    return loaded
|
|
|
|
|
|
def load_media_manifest(manifest_path: Path) -> Dict[str, List[Dict[str, Any]]]:
    """Load the media manifest and index its entries by tweet_id.

    Args:
        manifest_path: Path to the media manifest JSONL file.

    Returns:
        Mapping of tweet_id -> list of manifest entries for that tweet.
        Empty if the manifest is missing; malformed lines are logged and
        skipped, as are entries without a tweet_id.
    """
    if not manifest_path.exists():
        logger.warning(f"Media manifest not found: {manifest_path}")
        return {}

    media_by_tweet: Dict[str, List[Dict[str, Any]]] = {}
    with open(manifest_path) as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            # Keep the try body minimal: only json.loads can raise here.
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                logger.warning(f"Media manifest line {line_num}: invalid JSON: {e}")
                continue
            tweet_id = entry.get("tweet_id", "")
            if tweet_id:
                # setdefault replaces the manual "if key not in dict" dance.
                media_by_tweet.setdefault(tweet_id, []).append(entry)

    logger.info(f"Loaded media manifest: {len(media_by_tweet)} tweets with media")
    return media_by_tweet
|
|
|
|
|
|
def filter_target_tweets(tweets: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the tweets that contain #TimmyTime or #TimmyChain.

    Hashtag matching is case-insensitive; tweets without a "hashtags"
    field are treated as having none.
    """
    matched = [
        tweet
        for tweet in tweets
        # Not-disjoint == at least one hashtag is a target hashtag.
        if not TARGET_HASHTAGS.isdisjoint(
            tag.lower() for tag in tweet.get("hashtags", [])
        )
    ]

    logger.info(f"Found {len(matched)} tweets with target hashtags")
    return matched
|
|
|
|
|
|
def _entry_from_manifest(tweet: Dict[str, Any], media: Dict[str, Any]) -> MediaEntry:
    """Build a MediaEntry from a media-manifest record for *tweet*."""
    return MediaEntry(
        tweet_id=tweet.get("tweet_id", ""),
        created_at=tweet.get("created_at", ""),
        full_text=tweet.get("full_text", ""),
        hashtags=tweet.get("hashtags", []),
        media_id=media.get("media_id", ""),
        media_type=media.get("media_type", "unknown"),
        media_index=media.get("media_index", 0),
        local_media_path=media.get("local_media_path", ""),
        media_url_https=media.get("media_url_https", ""),
        expanded_url=media.get("expanded_url", ""),
        source="media_manifest",
    )


def _entry_from_url(tweet: Dict[str, Any], url: str) -> MediaEntry:
    """Build a URL-reference MediaEntry for a tweet absent from the manifest."""
    tweet_id = tweet.get("tweet_id", "")
    return MediaEntry(
        tweet_id=tweet_id,
        created_at=tweet.get("created_at", ""),
        full_text=tweet.get("full_text", ""),
        hashtags=tweet.get("hashtags", []),
        media_id=f"url-{tweet_id}",
        media_type="url_reference",
        media_index=0,
        local_media_path="",
        expanded_url=url,
        source="tweets_only",
    )


def build_media_entries(
    target_tweets: List[Dict[str, Any]],
    media_by_tweet: Dict[str, List[Dict[str, Any]]],
) -> Tuple[List[MediaEntry], List[Dict[str, Any]]]:
    """Build media entries for target tweets.

    Entries are deduplicated by media_id across all tweets. Tweets with no
    manifest media fall back to a single "url_reference" entry built from
    their first URL, if any.

    Returns:
        Tuple of (media_entries, tweets_without_media).

    NOTE(review): a tweet that gets a url_reference fallback entry is still
    returned in tweets_without_media — preserved from the original behavior;
    confirm downstream consumers expect this.
    """
    media_entries: List[MediaEntry] = []
    tweets_without_media: List[Dict[str, Any]] = []
    seen_media: Set[str] = set()

    for tweet in target_tweets:
        tweet_media = media_by_tweet.get(tweet.get("tweet_id", ""), [])
        if not tweet_media:
            tweets_without_media.append(tweet)
            continue

        for media in tweet_media:
            media_id = media.get("media_id", "")
            # Deduplicate by media_id.
            if media_id in seen_media:
                continue
            seen_media.add(media_id)
            media_entries.append(_entry_from_manifest(tweet, media))

    # For tweets without media in the manifest, check for URL-based media.
    for tweet in tweets_without_media:
        urls = tweet.get("urls", [])
        if urls:
            media_entries.append(_entry_from_url(tweet, urls[0]))

    logger.info(f"Built {len(media_entries)} media entries")
    return media_entries, tweets_without_media
|
|
|
|
|
|
def compute_stats(
    total_tweets: int,
    target_tweets: List[Dict[str, Any]],
    media_entries: List[MediaEntry],
) -> IndexStats:
    """Compute indexing statistics.

    Args:
        total_tweets: Number of tweets scanned overall.
        target_tweets: Tweets that matched the target hashtags.
        media_entries: Entries produced by build_media_entries().

    Returns:
        Populated IndexStats. media_types is sorted by type name;
        hashtag_counts by descending count; date_range spans the
        created_at strings of the media entries.
    """
    stats = IndexStats(
        total_tweets_scanned=total_tweets,
        target_tweets_found=len(target_tweets),
    )

    # Counter (already imported at module level) replaces the manual
    # get(..., 0) + 1 counting dicts; insertion order (first seen) is
    # preserved, so stable sorts below tie-break identically.
    media_type_counts = Counter(entry.media_type for entry in media_entries)
    tweets_with_media = {entry.tweet_id for entry in media_entries}
    dates = [entry.created_at for entry in media_entries if entry.created_at]

    hashtag_counts = Counter(
        h.lower() for tweet in target_tweets for h in tweet.get("hashtags", [])
    )

    stats.target_tweets_with_media = len(tweets_with_media)
    stats.target_tweets_without_media = len(target_tweets) - len(tweets_with_media)
    stats.total_media_entries = len(media_entries)
    stats.media_types = dict(sorted(media_type_counts.items()))
    stats.hashtag_counts = dict(sorted(hashtag_counts.items(), key=lambda x: -x[1]))

    if dates:
        # min/max instead of a full sort — same endpoints, O(n).
        stats.date_range = {
            "earliest": min(dates),
            "latest": max(dates),
        }

    return stats
|
|
|
|
|
|
def generate_summary_report(stats: IndexStats) -> str:
|
|
"""Generate a markdown summary report."""
|
|
lines = [
|
|
"# Know Thy Father — Phase 1: Media Indexing Report",
|
|
"",
|
|
f"**Generated:** {datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')}",
|
|
"",
|
|
"## Summary",
|
|
"",
|
|
"| Metric | Count |",
|
|
"|--------|-------|",
|
|
f"| Total tweets scanned | {stats.total_tweets_scanned} |",
|
|
f"| #TimmyTime/#TimmyChain tweets | {stats.target_tweets_found} |",
|
|
f"| Tweets with media | {stats.target_tweets_with_media} |",
|
|
f"| Tweets without media | {stats.target_tweets_without_media} |",
|
|
f"| Total media entries | {stats.total_media_entries} |",
|
|
"",
|
|
]
|
|
|
|
if stats.date_range:
|
|
lines.extend([
|
|
"## Date Range",
|
|
"",
|
|
f"- Earliest: {stats.date_range.get('earliest', 'N/A')}",
|
|
f"- Latest: {stats.date_range.get('latest', 'N/A')}",
|
|
"",
|
|
])
|
|
|
|
if stats.media_types:
|
|
lines.extend([
|
|
"## Media Types",
|
|
"",
|
|
"| Type | Count |",
|
|
"|------|-------|",
|
|
])
|
|
for mtype, count in sorted(stats.media_types.items(), key=lambda x: -x[1]):
|
|
lines.append(f"| {mtype} | {count} |")
|
|
lines.append("")
|
|
|
|
if stats.hashtag_counts:
|
|
lines.extend([
|
|
"## Hashtag Distribution",
|
|
"",
|
|
"| Hashtag | Count |",
|
|
"|---------|-------|",
|
|
])
|
|
for tag, count in list(stats.hashtag_counts.items())[:15]:
|
|
lines.append(f"| #{tag} | {count} |")
|
|
lines.append("")
|
|
|
|
lines.extend([
|
|
"---",
|
|
"",
|
|
"*Generated by scripts/know_thy_father/index_media.py*",
|
|
"*Ref: #582, #583*",
|
|
"",
|
|
])
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Construct the command-line argument parser."""
    parser = argparse.ArgumentParser(
        description="Know Thy Father — Phase 1: Media Indexing"
    )
    parser.add_argument(
        "--tweets", "-t",
        type=Path,
        default=DEFAULT_TWEETS_PATH,
        help=f"Path to tweets JSONL (default: {DEFAULT_TWEETS_PATH})",
    )
    parser.add_argument(
        "--media-manifest", "-m",
        type=Path,
        default=DEFAULT_MEDIA_MANIFEST,
        help=f"Path to media manifest (default: {DEFAULT_MEDIA_MANIFEST})",
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        default=DEFAULT_OUTPUT_PATH,
        help=f"Output manifest path (default: {DEFAULT_OUTPUT_PATH})",
    )
    parser.add_argument(
        "--report", "-r",
        type=Path,
        default=None,
        help="Output path for summary report (optional)",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    return parser


def _write_manifest(output: Path, entries: List[MediaEntry]) -> None:
    """Write the media entries to *output* as JSON lines."""
    output.parent.mkdir(parents=True, exist_ok=True)
    with open(output, "w") as f:
        for entry in entries:
            f.write(json.dumps(entry.to_dict(), ensure_ascii=False) + "\n")


def _print_summary(stats: IndexStats, output: Path) -> None:
    """Print the run summary to stdout."""
    print("\n=== Phase 1: Media Indexing Complete ===")
    print(f"Total tweets scanned: {stats.total_tweets_scanned}")
    print(f"#TimmyTime/#TimmyChain tweets: {stats.target_tweets_found}")
    print(f"Media entries indexed: {stats.total_media_entries}")
    print(f"  - With media: {stats.target_tweets_with_media}")
    print(f"  - Without media: {stats.target_tweets_without_media}")
    print("\nMedia types:")
    for mtype, count in sorted(stats.media_types.items(), key=lambda x: -x[1]):
        print(f"  {mtype}: {count}")
    print(f"\nOutput: {output}")


def main() -> None:
    """CLI entry point: load, filter, index, write manifest, report."""
    args = _build_parser().parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    # Load data; an empty/missing tweets file is fatal.
    tweets = load_tweets(args.tweets)
    if not tweets:
        print(f"Error: No tweets loaded from {args.tweets}", file=sys.stderr)
        sys.exit(1)

    media_by_tweet = load_media_manifest(args.media_manifest)

    # Filter to target hashtags; an empty result is only a warning.
    target_tweets = filter_target_tweets(tweets)
    if not target_tweets:
        print("Warning: No #TimmyTime/#TimmyChain tweets found", file=sys.stderr)

    media_entries, _ = build_media_entries(target_tweets, media_by_tweet)

    _write_manifest(args.output, media_entries)

    stats = compute_stats(len(tweets), target_tweets, media_entries)

    report = generate_summary_report(stats)
    if args.report:
        args.report.parent.mkdir(parents=True, exist_ok=True)
        args.report.write_text(report)
        print(f"Report written to {args.report}")

    _print_summary(stats, args.output)
|
|
|
|
|
|
# Script entry point: only run when executed directly, not on import.
if __name__ == "__main__":
    main()
|