#!/usr/bin/env python3
"""Know Thy Father — Phase 1: Media Indexing

Scans the local Twitter archive for all tweets containing #TimmyTime or #TimmyChain.
Maps these tweets to their associated media files in data/media.
Outputs a manifest of media files to be processed by the multimodal pipeline.

Usage:
    python3 scripts/know_thy_father/index_media.py
    python3 scripts/know_thy_father/index_media.py --tweets twitter-archive/extracted/tweets.jsonl
    python3 scripts/know_thy_father/index_media.py --output twitter-archive/know-thy-father/media_manifest.jsonl

Ref: #582, #583
"""
|
|
|
|
from __future__ import annotations

import argparse
import json
import logging
import os
import sys
from collections import Counter
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
|
# Module-level logger; configured by main() via logging.basicConfig.
logger = logging.getLogger(__name__)

# Target hashtags, lowercased — matching in filter_target_tweets() is
# case-insensitive.
TARGET_HASHTAGS = {"timmytime", "timmychain"}

# Twitter archive default paths (relative to the repository root).
DEFAULT_TWEETS_PATH = Path("twitter-archive/extracted/tweets.jsonl")
DEFAULT_MEDIA_MANIFEST = Path("twitter-archive/media/manifest.jsonl")
DEFAULT_OUTPUT_PATH = Path("twitter-archive/know-thy-father/media_manifest.jsonl")
|
|
|
|
|
|
@dataclass
class MediaEntry:
    """A media file associated with a #TimmyTime/#TimmyChain tweet."""

    tweet_id: str          # ID of the tweet that carried the media
    created_at: str        # tweet timestamp as stored in the archive
    full_text: str         # full tweet text
    hashtags: List[str]    # hashtags of the tweet, as stored in the archive
    media_id: str          # unique media ID (or "url-<tweet_id>" for URL refs)
    media_type: str        # photo, video, animated_gif, url_reference
    media_index: int       # position of the media within the tweet
    local_media_path: str  # path of the downloaded file; "" for URL refs
    media_url_https: str = ""
    expanded_url: str = ""
    source: str = ""       # "media_manifest" or "tweets_only"
    indexed_at: str = ""   # UTC indexing timestamp, "...Z" suffixed

    def __post_init__(self) -> None:
        # Stamp the entry with the indexing time unless one was supplied.
        # datetime.utcnow() is deprecated since Python 3.12; use an aware
        # UTC datetime and keep the legacy trailing-"Z" string format.
        if not self.indexed_at:
            self.indexed_at = (
                datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
            )

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class IndexStats:
    """Statistics from the indexing run."""

    total_tweets_scanned: int = 0          # every tweet read from the archive
    target_tweets_found: int = 0           # tweets carrying a target hashtag
    target_tweets_with_media: int = 0      # target tweets with >= 1 media entry
    target_tweets_without_media: int = 0   # target tweets with no media entry
    total_media_entries: int = 0           # media entries written to the manifest
    # media_type -> count, sorted by type name
    media_types: Dict[str, int] = field(default_factory=dict)
    # lowercased hashtag -> count, sorted by descending count
    hashtag_counts: Dict[str, int] = field(default_factory=dict)
    # {"earliest": ..., "latest": ...} over entry created_at strings
    date_range: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)
|
|
|
|
|
|
def load_tweets(tweets_path: Path) -> List[Dict[str, Any]]:
    """Load tweets from a JSON-lines file, one tweet object per line.

    Args:
        tweets_path: Path to the tweets JSONL file.

    Returns:
        Parsed tweet dicts. An empty list if the file does not exist;
        blank lines are ignored and malformed lines are logged and skipped.
    """
    if not tweets_path.exists():
        logger.error(f"Tweets file not found: {tweets_path}")
        return []

    loaded: List[Dict[str, Any]] = []
    with open(tweets_path) as handle:
        for line_num, raw in enumerate(handle, 1):
            stripped = raw.strip()
            if not stripped:
                continue
            try:
                loaded.append(json.loads(stripped))
            except json.JSONDecodeError as e:
                logger.warning(f"Line {line_num}: invalid JSON: {e}")

    logger.info(f"Loaded {len(loaded)} tweets from {tweets_path}")
    return loaded
|
|
|
|
|
|
def load_media_manifest(manifest_path: Path) -> Dict[str, List[Dict[str, Any]]]:
    """Load the media manifest and index its entries by tweet_id.

    Args:
        manifest_path: Path to the media manifest JSONL file.

    Returns:
        Mapping of tweet_id -> list of manifest entries for that tweet.
        Empty if the manifest is missing; malformed lines are logged and
        skipped, as are entries without a tweet_id.
    """
    if not manifest_path.exists():
        logger.warning(f"Media manifest not found: {manifest_path}")
        return {}

    media_by_tweet: Dict[str, List[Dict[str, Any]]] = {}
    with open(manifest_path) as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            # Keep the try body minimal: only json.loads can raise here.
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                logger.warning(f"Media manifest line {line_num}: invalid JSON: {e}")
                continue
            tweet_id = entry.get("tweet_id", "")
            if tweet_id:
                # setdefault replaces the manual "if key not in dict" dance.
                media_by_tweet.setdefault(tweet_id, []).append(entry)

    logger.info(f"Loaded media manifest: {len(media_by_tweet)} tweets with media")
    return media_by_tweet
|
|
|
|
|
|
def filter_target_tweets(tweets: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the tweets that contain #TimmyTime or #TimmyChain.

    Hashtag matching is case-insensitive; tweets without a "hashtags"
    field are treated as having none.
    """
    matched = [
        tweet
        for tweet in tweets
        # Not-disjoint == at least one hashtag is a target hashtag.
        if not TARGET_HASHTAGS.isdisjoint(
            tag.lower() for tag in tweet.get("hashtags", [])
        )
    ]

    logger.info(f"Found {len(matched)} tweets with target hashtags")
    return matched
|
|
|
|
|
|
def _entry_from_manifest(tweet: Dict[str, Any], media: Dict[str, Any]) -> MediaEntry:
    """Build a MediaEntry from a media-manifest record for *tweet*."""
    return MediaEntry(
        tweet_id=tweet.get("tweet_id", ""),
        created_at=tweet.get("created_at", ""),
        full_text=tweet.get("full_text", ""),
        hashtags=tweet.get("hashtags", []),
        media_id=media.get("media_id", ""),
        media_type=media.get("media_type", "unknown"),
        media_index=media.get("media_index", 0),
        local_media_path=media.get("local_media_path", ""),
        media_url_https=media.get("media_url_https", ""),
        expanded_url=media.get("expanded_url", ""),
        source="media_manifest",
    )


def _entry_from_url(tweet: Dict[str, Any], url: str) -> MediaEntry:
    """Build a URL-reference MediaEntry for a tweet absent from the manifest."""
    tweet_id = tweet.get("tweet_id", "")
    return MediaEntry(
        tweet_id=tweet_id,
        created_at=tweet.get("created_at", ""),
        full_text=tweet.get("full_text", ""),
        hashtags=tweet.get("hashtags", []),
        media_id=f"url-{tweet_id}",
        media_type="url_reference",
        media_index=0,
        local_media_path="",
        expanded_url=url,
        source="tweets_only",
    )


def build_media_entries(
    target_tweets: List[Dict[str, Any]],
    media_by_tweet: Dict[str, List[Dict[str, Any]]],
) -> Tuple[List[MediaEntry], List[Dict[str, Any]]]:
    """Build media entries for target tweets.

    Entries are deduplicated by media_id across all tweets. Tweets with no
    manifest media fall back to a single "url_reference" entry built from
    their first URL, if any.

    Returns:
        Tuple of (media_entries, tweets_without_media).

    NOTE(review): a tweet that gets a url_reference fallback entry is still
    returned in tweets_without_media — preserved from the original behavior;
    confirm downstream consumers expect this.
    """
    media_entries: List[MediaEntry] = []
    tweets_without_media: List[Dict[str, Any]] = []
    seen_media: Set[str] = set()

    for tweet in target_tweets:
        tweet_media = media_by_tweet.get(tweet.get("tweet_id", ""), [])
        if not tweet_media:
            tweets_without_media.append(tweet)
            continue

        for media in tweet_media:
            media_id = media.get("media_id", "")
            # Deduplicate by media_id.
            if media_id in seen_media:
                continue
            seen_media.add(media_id)
            media_entries.append(_entry_from_manifest(tweet, media))

    # For tweets without media in the manifest, check for URL-based media.
    for tweet in tweets_without_media:
        urls = tweet.get("urls", [])
        if urls:
            media_entries.append(_entry_from_url(tweet, urls[0]))

    logger.info(f"Built {len(media_entries)} media entries")
    return media_entries, tweets_without_media
|
|
|
|
|
|
def compute_stats(
    total_tweets: int,
    target_tweets: List[Dict[str, Any]],
    media_entries: List[MediaEntry],
) -> IndexStats:
    """Compute indexing statistics.

    Args:
        total_tweets: Number of tweets scanned overall.
        target_tweets: Tweets that matched the target hashtags.
        media_entries: Entries produced by build_media_entries().

    Returns:
        Populated IndexStats. media_types is sorted by type name;
        hashtag_counts by descending count; date_range spans the
        created_at strings of the media entries.
    """
    stats = IndexStats(
        total_tweets_scanned=total_tweets,
        target_tweets_found=len(target_tweets),
    )

    # Counter (already imported at module level) replaces the manual
    # get(..., 0) + 1 counting dicts; insertion order (first seen) is
    # preserved, so stable sorts below tie-break identically.
    media_type_counts = Counter(entry.media_type for entry in media_entries)
    tweets_with_media = {entry.tweet_id for entry in media_entries}
    dates = [entry.created_at for entry in media_entries if entry.created_at]

    hashtag_counts = Counter(
        h.lower() for tweet in target_tweets for h in tweet.get("hashtags", [])
    )

    stats.target_tweets_with_media = len(tweets_with_media)
    stats.target_tweets_without_media = len(target_tweets) - len(tweets_with_media)
    stats.total_media_entries = len(media_entries)
    stats.media_types = dict(sorted(media_type_counts.items()))
    stats.hashtag_counts = dict(sorted(hashtag_counts.items(), key=lambda x: -x[1]))

    if dates:
        # min/max instead of a full sort — same endpoints, O(n).
        stats.date_range = {
            "earliest": min(dates),
            "latest": max(dates),
        }

    return stats
|
|
|
|
|
|
def generate_summary_report(stats: IndexStats) -> str:
|
|
"""Generate a markdown summary report."""
|
|
lines = [
|
|
"# Know Thy Father — Phase 1: Media Indexing Report",
|
|
"",
|
|
f"**Generated:** {datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')}",
|
|
"",
|
|
"## Summary",
|
|
"",
|
|
"| Metric | Count |",
|
|
"|--------|-------|",
|
|
f"| Total tweets scanned | {stats.total_tweets_scanned} |",
|
|
f"| #TimmyTime/#TimmyChain tweets | {stats.target_tweets_found} |",
|
|
f"| Tweets with media | {stats.target_tweets_with_media} |",
|
|
f"| Tweets without media | {stats.target_tweets_without_media} |",
|
|
f"| Total media entries | {stats.total_media_entries} |",
|
|
"",
|
|
]
|
|
|
|
if stats.date_range:
|
|
lines.extend([
|
|
"## Date Range",
|
|
"",
|
|
f"- Earliest: {stats.date_range.get('earliest', 'N/A')}",
|
|
f"- Latest: {stats.date_range.get('latest', 'N/A')}",
|
|
"",
|
|
])
|
|
|
|
if stats.media_types:
|
|
lines.extend([
|
|
"## Media Types",
|
|
"",
|
|
"| Type | Count |",
|
|
"|------|-------|",
|
|
])
|
|
for mtype, count in sorted(stats.media_types.items(), key=lambda x: -x[1]):
|
|
lines.append(f"| {mtype} | {count} |")
|
|
lines.append("")
|
|
|
|
if stats.hashtag_counts:
|
|
lines.extend([
|
|
"## Hashtag Distribution",
|
|
"",
|
|
"| Hashtag | Count |",
|
|
"|---------|-------|",
|
|
])
|
|
for tag, count in list(stats.hashtag_counts.items())[:15]:
|
|
lines.append(f"| #{tag} | {count} |")
|
|
lines.append("")
|
|
|
|
lines.extend([
|
|
"---",
|
|
"",
|
|
"*Generated by scripts/know_thy_father/index_media.py*",
|
|
"*Ref: #582, #583*",
|
|
"",
|
|
])
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Construct the command-line argument parser."""
    parser = argparse.ArgumentParser(
        description="Know Thy Father — Phase 1: Media Indexing"
    )
    parser.add_argument(
        "--tweets", "-t",
        type=Path,
        default=DEFAULT_TWEETS_PATH,
        help=f"Path to tweets JSONL (default: {DEFAULT_TWEETS_PATH})",
    )
    parser.add_argument(
        "--media-manifest", "-m",
        type=Path,
        default=DEFAULT_MEDIA_MANIFEST,
        help=f"Path to media manifest (default: {DEFAULT_MEDIA_MANIFEST})",
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        default=DEFAULT_OUTPUT_PATH,
        help=f"Output manifest path (default: {DEFAULT_OUTPUT_PATH})",
    )
    parser.add_argument(
        "--report", "-r",
        type=Path,
        default=None,
        help="Output path for summary report (optional)",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    return parser


def _write_manifest(output: Path, entries: List[MediaEntry]) -> None:
    """Write the media entries to *output* as JSON lines."""
    output.parent.mkdir(parents=True, exist_ok=True)
    with open(output, "w") as f:
        for entry in entries:
            f.write(json.dumps(entry.to_dict(), ensure_ascii=False) + "\n")


def _print_summary(stats: IndexStats, output: Path) -> None:
    """Print the run summary to stdout."""
    print("\n=== Phase 1: Media Indexing Complete ===")
    print(f"Total tweets scanned: {stats.total_tweets_scanned}")
    print(f"#TimmyTime/#TimmyChain tweets: {stats.target_tweets_found}")
    print(f"Media entries indexed: {stats.total_media_entries}")
    print(f"  - With media: {stats.target_tweets_with_media}")
    print(f"  - Without media: {stats.target_tweets_without_media}")
    print("\nMedia types:")
    for mtype, count in sorted(stats.media_types.items(), key=lambda x: -x[1]):
        print(f"  {mtype}: {count}")
    print(f"\nOutput: {output}")


def main() -> None:
    """CLI entry point: load, filter, index, write manifest, report."""
    args = _build_parser().parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    # Load data; an empty/missing tweets file is fatal.
    tweets = load_tweets(args.tweets)
    if not tweets:
        print(f"Error: No tweets loaded from {args.tweets}", file=sys.stderr)
        sys.exit(1)

    media_by_tweet = load_media_manifest(args.media_manifest)

    # Filter to target hashtags; an empty result is only a warning.
    target_tweets = filter_target_tweets(tweets)
    if not target_tweets:
        print("Warning: No #TimmyTime/#TimmyChain tweets found", file=sys.stderr)

    media_entries, _ = build_media_entries(target_tweets, media_by_tweet)

    _write_manifest(args.output, media_entries)

    stats = compute_stats(len(tweets), target_tweets, media_entries)

    report = generate_summary_report(stats)
    if args.report:
        args.report.parent.mkdir(parents=True, exist_ok=True)
        args.report.write_text(report)
        print(f"Report written to {args.report}")

    _print_summary(stats, args.output)
|
|
|
|
|
|
# Script entry point: only run when executed directly, not on import.
if __name__ == "__main__":
    main()
|