#!/usr/bin/env python3
"""Know Thy Father — Phase 1: Media Indexing

Scans the local Twitter archive for all tweets containing #TimmyTime or #TimmyChain,
maps those tweets to their associated media files as recorded in the media manifest,
and outputs a manifest of media files to be processed by the multimodal pipeline.

Usage:
    python3 scripts/know_thy_father/index_media.py
    python3 scripts/know_thy_father/index_media.py --tweets twitter-archive/extracted/tweets.jsonl
    python3 scripts/know_thy_father/index_media.py --output twitter-archive/know-thy-father/media_manifest.jsonl

Ref: #582, #583
"""
from __future__ import annotations

import argparse
import json
import logging
import sys
from dataclasses import dataclass, field, asdict
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Set, Tuple

logger = logging.getLogger(__name__)

# Target hashtags (compared case-insensitively)
TARGET_HASHTAGS = {"timmytime", "timmychain"}

# Twitter archive default paths
DEFAULT_TWEETS_PATH = Path("twitter-archive/extracted/tweets.jsonl")
DEFAULT_MEDIA_MANIFEST = Path("twitter-archive/media/manifest.jsonl")
DEFAULT_OUTPUT_PATH = Path("twitter-archive/know-thy-father/media_manifest.jsonl")
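
# Expected input record shapes: a sketch inferred from the fields this script
# reads below; real archive records may carry additional keys, and all values
# here are illustrative placeholders.
#
# tweets.jsonl (one JSON object per line):
#   {"tweet_id": "1234567890", "created_at": "2025-06-01T12:00:00Z",
#    "full_text": "... #TimmyTime", "hashtags": ["TimmyTime"],
#    "urls": ["https://example.com/clip"]}
#
# media manifest (one JSON object per line):
#   {"tweet_id": "1234567890", "media_id": "m-001", "media_type": "photo",
#    "media_index": 0, "local_media_path": "twitter-archive/media/m-001.jpg",
#    "media_url_https": "https://...", "expanded_url": "https://..."}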


@dataclass
class MediaEntry:
    """A media file associated with a #TimmyTime/#TimmyChain tweet."""

    tweet_id: str
    created_at: str
    full_text: str
    hashtags: List[str]
    media_id: str
    media_type: str  # photo, video, animated_gif
    media_index: int
    local_media_path: str
    media_url_https: str = ""
    expanded_url: str = ""
    source: str = ""  # "media_manifest" or "tweets_only"
    indexed_at: str = ""

    def __post_init__(self):
        if not self.indexed_at:
            self.indexed_at = datetime.utcnow().isoformat() + "Z"

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


@dataclass
class IndexStats:
    """Statistics from the indexing run."""

    total_tweets_scanned: int = 0
    target_tweets_found: int = 0
    target_tweets_with_media: int = 0
    target_tweets_without_media: int = 0
    total_media_entries: int = 0
    media_types: Dict[str, int] = field(default_factory=dict)
    hashtag_counts: Dict[str, int] = field(default_factory=dict)
    date_range: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


def load_tweets(tweets_path: Path) -> List[Dict[str, Any]]:
    """Load tweets from a JSONL file, skipping blank and malformed lines."""
    if not tweets_path.exists():
        logger.error(f"Tweets file not found: {tweets_path}")
        return []
    tweets = []
    with open(tweets_path) as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                tweets.append(json.loads(line))
            except json.JSONDecodeError as e:
                logger.warning(f"Line {line_num}: invalid JSON: {e}")
    logger.info(f"Loaded {len(tweets)} tweets from {tweets_path}")
    return tweets


def load_media_manifest(manifest_path: Path) -> Dict[str, List[Dict[str, Any]]]:
    """Load the media manifest and index its entries by tweet_id."""
    if not manifest_path.exists():
        logger.warning(f"Media manifest not found: {manifest_path}")
        return {}
    media_by_tweet: Dict[str, List[Dict[str, Any]]] = {}
    with open(manifest_path) as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
                tweet_id = entry.get("tweet_id", "")
                if tweet_id:
                    media_by_tweet.setdefault(tweet_id, []).append(entry)
            except json.JSONDecodeError as e:
                logger.warning(f"Media manifest line {line_num}: invalid JSON: {e}")
    logger.info(f"Loaded media manifest: {len(media_by_tweet)} tweets with media")
    return media_by_tweet


def filter_target_tweets(tweets: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Filter tweets that contain #TimmyTime or #TimmyChain."""
    target_tweets = []
    for tweet in tweets:
        hashtags = [h.lower() for h in tweet.get("hashtags", [])]
        if any(h in TARGET_HASHTAGS for h in hashtags):
            target_tweets.append(tweet)
    logger.info(f"Found {len(target_tweets)} tweets with target hashtags")
    return target_tweets


def build_media_entries(
    target_tweets: List[Dict[str, Any]],
    media_by_tweet: Dict[str, List[Dict[str, Any]]],
) -> Tuple[List[MediaEntry], List[Dict[str, Any]]]:
    """Build media entries for target tweets.

    Returns:
        Tuple of (media_entries, tweets_without_media)
    """
    media_entries: List[MediaEntry] = []
    tweets_without_media: List[Dict[str, Any]] = []
    seen_media: Set[str] = set()
    for tweet in target_tweets:
        tweet_id = tweet.get("tweet_id", "")
        created_at = tweet.get("created_at", "")
        full_text = tweet.get("full_text", "")
        hashtags = tweet.get("hashtags", [])
        # Get media from the manifest
        tweet_media = media_by_tweet.get(tweet_id, [])
        if not tweet_media:
            tweets_without_media.append(tweet)
            continue
        for media in tweet_media:
            media_id = media.get("media_id", "")
            # Deduplicate by media_id
            if media_id in seen_media:
                continue
            seen_media.add(media_id)
            entry = MediaEntry(
                tweet_id=tweet_id,
                created_at=created_at,
                full_text=full_text,
                hashtags=hashtags,
                media_id=media_id,
                media_type=media.get("media_type", "unknown"),
                media_index=media.get("media_index", 0),
                local_media_path=media.get("local_media_path", ""),
                media_url_https=media.get("media_url_https", ""),
                expanded_url=media.get("expanded_url", ""),
                source="media_manifest",
            )
            media_entries.append(entry)
    # For tweets without media in the manifest, fall back to URL-based references
    for tweet in tweets_without_media:
        urls = tweet.get("urls", [])
        if urls:
            # Create an entry pointing at the tweet's first attached URL
            entry = MediaEntry(
                tweet_id=tweet.get("tweet_id", ""),
                created_at=tweet.get("created_at", ""),
                full_text=tweet.get("full_text", ""),
                hashtags=tweet.get("hashtags", []),
                media_id=f"url-{tweet.get('tweet_id', '')}",
                media_type="url_reference",
                media_index=0,
                local_media_path="",
                expanded_url=urls[0],
                source="tweets_only",
            )
            media_entries.append(entry)
    logger.info(f"Built {len(media_entries)} media entries")
    return media_entries, tweets_without_media
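
# Illustrative output record (placeholder values; the field set mirrors
# MediaEntry, since each manifest line is the asdict() of one entry):
#   {"tweet_id": "1234567890", "created_at": "2025-06-01T12:00:00Z",
#    "full_text": "... #TimmyTime", "hashtags": ["TimmyTime"],
#    "media_id": "m-001", "media_type": "photo", "media_index": 0,
#    "local_media_path": "twitter-archive/media/m-001.jpg", "media_url_https": "",
#    "expanded_url": "", "source": "media_manifest",
#    "indexed_at": "2026-01-01T00:00:00Z"}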


def compute_stats(
    total_tweets: int,
    target_tweets: List[Dict[str, Any]],
    media_entries: List[MediaEntry],
) -> IndexStats:
    """Compute indexing statistics."""
    stats = IndexStats(
        total_tweets_scanned=total_tweets,
        target_tweets_found=len(target_tweets),
    )
    # Count media types, hashtags, and the date range covered
    media_type_counts: Dict[str, int] = {}
    hashtag_counts: Dict[str, int] = {}
    dates: List[str] = []
    tweets_with_media: Set[str] = set()
    # Note: url_reference entries count a tweet as "with media" here.
    for entry in media_entries:
        media_type_counts[entry.media_type] = media_type_counts.get(entry.media_type, 0) + 1
        tweets_with_media.add(entry.tweet_id)
        if entry.created_at:
            dates.append(entry.created_at)
    for tweet in target_tweets:
        for h in tweet.get("hashtags", []):
            h_lower = h.lower()
            hashtag_counts[h_lower] = hashtag_counts.get(h_lower, 0) + 1
    stats.target_tweets_with_media = len(tweets_with_media)
    stats.target_tweets_without_media = len(target_tweets) - len(tweets_with_media)
    stats.total_media_entries = len(media_entries)
    stats.media_types = dict(sorted(media_type_counts.items()))
    stats.hashtag_counts = dict(sorted(hashtag_counts.items(), key=lambda x: -x[1]))
    if dates:
        dates_sorted = sorted(dates)
        stats.date_range = {
            "earliest": dates_sorted[0],
            "latest": dates_sorted[-1],
        }
    return stats


def generate_summary_report(stats: IndexStats) -> str:
    """Generate a markdown summary report."""
    lines = [
        "# Know Thy Father — Phase 1: Media Indexing Report",
        "",
        f"**Generated:** {datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')}",
        "",
        "## Summary",
        "",
        "| Metric | Count |",
        "|--------|-------|",
        f"| Total tweets scanned | {stats.total_tweets_scanned} |",
        f"| #TimmyTime/#TimmyChain tweets | {stats.target_tweets_found} |",
        f"| Tweets with media | {stats.target_tweets_with_media} |",
        f"| Tweets without media | {stats.target_tweets_without_media} |",
        f"| Total media entries | {stats.total_media_entries} |",
        "",
    ]
    if stats.date_range:
        lines.extend([
            "## Date Range",
            "",
            f"- Earliest: {stats.date_range.get('earliest', 'N/A')}",
            f"- Latest: {stats.date_range.get('latest', 'N/A')}",
            "",
        ])
    if stats.media_types:
        lines.extend([
            "## Media Types",
            "",
            "| Type | Count |",
            "|------|-------|",
        ])
        for mtype, count in sorted(stats.media_types.items(), key=lambda x: -x[1]):
            lines.append(f"| {mtype} | {count} |")
        lines.append("")
    if stats.hashtag_counts:
        lines.extend([
            "## Hashtag Distribution",
            "",
            "| Hashtag | Count |",
            "|---------|-------|",
        ])
        for tag, count in list(stats.hashtag_counts.items())[:15]:
            lines.append(f"| #{tag} | {count} |")
        lines.append("")
    lines.extend([
        "---",
        "",
        "*Generated by scripts/know_thy_father/index_media.py*",
        "*Ref: #582, #583*",
        "",
    ])
    return "\n".join(lines)


def main():
    parser = argparse.ArgumentParser(
        description="Know Thy Father — Phase 1: Media Indexing"
    )
    parser.add_argument(
        "--tweets", "-t",
        type=Path,
        default=DEFAULT_TWEETS_PATH,
        help=f"Path to tweets JSONL (default: {DEFAULT_TWEETS_PATH})",
    )
    parser.add_argument(
        "--media-manifest", "-m",
        type=Path,
        default=DEFAULT_MEDIA_MANIFEST,
        help=f"Path to media manifest (default: {DEFAULT_MEDIA_MANIFEST})",
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        default=DEFAULT_OUTPUT_PATH,
        help=f"Output manifest path (default: {DEFAULT_OUTPUT_PATH})",
    )
    parser.add_argument(
        "--report", "-r",
        type=Path,
        default=None,
        help="Output path for summary report (optional)",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    # Load data
    tweets = load_tweets(args.tweets)
    if not tweets:
        print(f"Error: No tweets loaded from {args.tweets}", file=sys.stderr)
        sys.exit(1)
    media_by_tweet = load_media_manifest(args.media_manifest)

    # Filter target tweets
    target_tweets = filter_target_tweets(tweets)
    if not target_tweets:
        print("Warning: No #TimmyTime/#TimmyChain tweets found", file=sys.stderr)

    # Build media entries
    media_entries, tweets_without_media = build_media_entries(target_tweets, media_by_tweet)

    # Write output manifest
    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, "w") as f:
        for entry in media_entries:
            f.write(json.dumps(entry.to_dict(), ensure_ascii=False) + "\n")

    # Compute stats
    stats = compute_stats(len(tweets), target_tweets, media_entries)

    # Generate report
    report = generate_summary_report(stats)
    if args.report:
        args.report.parent.mkdir(parents=True, exist_ok=True)
        args.report.write_text(report)
        print(f"Report written to {args.report}")

    # Print summary (with/without-media counts are tweet counts, so they
    # belong under the target-tweet line, not under media entries)
    print("\n=== Phase 1: Media Indexing Complete ===")
    print(f"Total tweets scanned: {stats.total_tweets_scanned}")
    print(f"#TimmyTime/#TimmyChain tweets: {stats.target_tweets_found}")
    print(f"  - With media: {stats.target_tweets_with_media}")
    print(f"  - Without media: {stats.target_tweets_without_media}")
    print(f"Media entries indexed: {stats.total_media_entries}")
    print("\nMedia types:")
    for mtype, count in sorted(stats.media_types.items(), key=lambda x: -x[1]):
        print(f"  {mtype}: {count}")
    print(f"\nOutput: {args.output}")


if __name__ == "__main__":
    main()