Compare commits
1 Commits
main
...
whip/583-1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2558920b2c |
@@ -0,0 +1,103 @@
|
||||
---
|
||||
name: know-thy-father-media-indexer
|
||||
description: "Phase 1 of Know Thy Father: scan Twitter archive for #TimmyTime/#TimmyChain tweets, map to local media files, produce manifest for downstream analysis."
|
||||
version: 1.0.0
|
||||
author: Timmy Time
|
||||
license: MIT
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [twitter, indexing, media, manifest, know-thy-father, sovereign]
|
||||
related_skills: [know-thy-father-multimodal, sovereign-meaning-synthesis]
|
||||
---
|
||||
|
||||
# Know Thy Father — Phase 1: Media Indexer
|
||||
|
||||
## Overview
|
||||
|
||||
Scans the local Twitter archive for all tweets containing `#TimmyTime` or `#TimmyChain`. Maps each tweet to its associated media files. Produces a manifest consumed by the Phase 2 multimodal analysis pipeline.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
cd twitter-archive
|
||||
python3 media_indexer.py --archive-dir ~/Downloads/twitter-2026-03-27-.../data
|
||||
```
|
||||
|
||||
### Custom output directory
|
||||
```bash
|
||||
python3 media_indexer.py --archive-dir ~/Downloads/.../data --output-dir ./my-output
|
||||
```
|
||||
|
||||
## Input
|
||||
|
||||
- `tweets.js` — Twitter archive export (JavaScript format with `window.YTD.tweets.part0 = [...]`)
|
||||
- `tweets_media/` — directory of downloaded media files (photos, videos, GIFs)
|
||||
|
||||
## Output
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `manifest.jsonl` | One JSON line per media item with tweet context, local path, variants |
|
||||
| `timmytime_tweets.jsonl` | Filtered tweets containing target hashtags |
|
||||
| `manifest_summary.json` | Rollup: counts, hashtag families, type breakdown |
|
||||
| `hashtag_metrics.json` | Machine-readable metrics per target hashtag |
|
||||
| `hashtag_metrics.md` | Human-readable report with monthly charts |
|
||||
|
||||
## Manifest Entry Format
|
||||
|
||||
```json
|
||||
{
|
||||
"tweet_id": "2034689097986453631",
|
||||
"created_at": "Thu Mar 19 17:50:58 +0000 2026",
|
||||
"full_text": "...",
|
||||
"hashtags": ["timmytime", "stackchain"],
|
||||
"urls": [],
|
||||
"media_index": 1,
|
||||
"media_id": "2034686192428752901",
|
||||
"media_type": "video",
|
||||
"expanded_url": "https://x.com/rockachopa/status/...",
|
||||
"media_url_https": "https://pbs.twimg.com/...",
|
||||
"display_url": "pic.x.com/...",
|
||||
"variants": [{"bitrate": 2176000, "content_type": "video/mp4", "url": "..."}],
|
||||
"local_media_path": "/Users/apayne/Downloads/.../tweets_media/file.mp4",
|
||||
"source_file": "tweets.js"
|
||||
}
|
||||
```
|
||||
|
||||
## Pipeline Flow
|
||||
|
||||
```
|
||||
Phase 0: process_tweets.js (existing)
|
||||
└─ tweets.js → tweets.jsonl, retweets.jsonl
|
||||
|
||||
Phase 1: media_indexer.py (this file)
|
||||
└─ tweets.js → manifest.jsonl (media items with target hashtags)
|
||||
└─ tweets.js → timmytime_tweets.jsonl (filtered tweets)
|
||||
└─ → manifest_summary.json, hashtag_metrics.json, hashtag_metrics.md
|
||||
|
||||
Phase 2: multimodal_pipeline.py
|
||||
└─ manifest.jsonl → analysis/{tweet_id}.json, meaning_kernels.jsonl
|
||||
|
||||
Phase 3: Holographic Synthesis
|
||||
└─ meaning_kernels.jsonl → fact_store
|
||||
|
||||
Phase 4: Cross-Reference Audit
|
||||
└─ meaning_kernels + SOUL.md → audit report
|
||||
```
|
||||
|
||||
## Target Hashtags
|
||||
|
||||
- `#timmyTime` — the movement, the vibe, the people
|
||||
- `#timmyChain` — Bitcoin stacking culture, chain tips, community
|
||||
|
||||
Both are case-insensitive. The indexer normalizes all hashtags to lowercase.
|
||||
|
||||
## Pitfalls
|
||||
|
||||
1. **Archive format** — Twitter exports change format. This script handles the `window.YTD.tweets.part0 = [...]` format with nested `tweet` objects.
|
||||
|
||||
2. **Media file matching** — Local files are named `{tweet_id}-{hash}.ext`. The indexer locates files by substring-matching the media_id against file names. Some media may not have local files (deleted, not downloaded).
|
||||
|
||||
3. **Large archives** — 4,800 tweets with 818 media items. The script loads all into memory. For very large archives (>100K tweets), consider streaming.
|
||||
|
||||
4. **Duplicate media** — The same media file can appear in multiple tweets (retweets, quote tweets). The manifest includes all occurrences; downstream pipeline should deduplicate by media_id.
|
||||
394
twitter-archive/media_indexer.py
Executable file
394
twitter-archive/media_indexer.py
Executable file
@@ -0,0 +1,394 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Know Thy Father — Phase 1: Media Indexer
|
||||
|
||||
Scans the local Twitter archive for all tweets containing #TimmyTime or #TimmyChain.
|
||||
Maps each tweet to its associated media files. Outputs:
|
||||
- media/manifest.jsonl — one row per media item with tweet context
|
||||
- media/manifest_summary.json — rollup counts, hashtag families, type breakdown
|
||||
- media/hashtag_metrics.json — machine-readable metrics per target hashtag
|
||||
- media/hashtag_metrics.md — human-readable local report
|
||||
- media/timmytime_tweets.jsonl — filtered tweets with target hashtags
|
||||
|
||||
Usage:
|
||||
python3 media_indexer.py --archive-dir ~/Downloads/twitter-2026-03-27-.../data
|
||||
python3 media_indexer.py --archive-dir ~/Downloads/twitter-2026-03-27-.../data --output-dir ./media
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ── Config ──────────────────────────────────────────────
# Default archive location used when --archive-dir is not supplied.
DEFAULT_ARCHIVE = os.path.expanduser(
    "~/Downloads/twitter-2026-03-27-d4471cc6eb6703034d592f870933561ebee374d9d9b90c9b8923abff064afc1e/data"
)
# Hashtags that mark a tweet as part of the corpus; compared lowercase.
TARGET_HASHTAGS = {"timmytime", "timmychain"}


def MONTHS(dt):
    """Return the 'YYYY-MM' month bucket for *dt*, or 'unknown' for None.

    Was a lambda assigned to a name (PEP 8 E731); a def keeps the same
    call interface and is introspectable/tracebable.
    """
    return dt.strftime("%Y-%m") if dt else "unknown"
|
||||
|
||||
|
||||
# ── Utilities ───────────────────────────────────────────
|
||||
|
||||
def log(msg: str, level: str = "INFO"):
    """Emit a UTC-timestamped log line to stdout."""
    now = datetime.now(timezone.utc)
    prefix = "[" + now.strftime("%H:%M:%S") + "] [" + level + "]"
    print(prefix, msg)
|
||||
|
||||
|
||||
def parse_twitter_timestamp(ts: str) -> datetime | None:
|
||||
"""Parse Twitter's created_at format: 'Sat Mar 21 17:35:31 +0000 2026'"""
|
||||
try:
|
||||
return datetime.strptime(ts, "%a %b %d %H:%M:%S %z %Y")
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def load_tweets_js(path: str) -> list:
    """Parse tweets.js (Twitter archive format) into a list of tweet dicts.

    The archive stores JSON behind a JavaScript assignment
    (``window.YTD.tweets.part0 = [...]``); that prefix is stripped before
    json.loads.

    Raises json.JSONDecodeError if the remaining payload is not valid JSON.
    """
    log(f"Loading tweets from {path}...")
    # Twitter archive exports are UTF-8; be explicit so the host locale's
    # default encoding cannot break parsing (read_text() without encoding
    # uses locale.getencoding()).
    raw = Path(path).read_text(encoding="utf-8")
    # Strip JS assignment prefix, leaving a bare JSON array
    raw = re.sub(r"^window\.YTD\.tweets\.part\d+\s*=\s*", "", raw.strip())
    data = json.loads(raw)
    log(f"Loaded {len(data)} tweet entries")
    return data
|
||||
|
||||
|
||||
def extract_hashtags(tweet_data: dict) -> list:
    """Extract lowercase hashtags from a tweet.

    Checks entities.hashtags, then extended_entities.hashtags, then falls
    back to scanning full_text with a ``#(\\w+)`` regex. Returns
    deduplicated tags in first-seen order — the previous
    ``list(set(...))`` shuffled order between runs, making the output
    manifest nondeterministic.
    """
    hashtags = []

    # Primary source: entities.hashtags
    for h in tweet_data.get("entities", {}).get("hashtags", []):
        tag = h.get("text", "").lower()
        if tag:
            hashtags.append(tag)

    # Secondary source: extended_entities (sometimes duplicates entities)
    for h in tweet_data.get("extended_entities", {}).get("hashtags", []):
        tag = h.get("text", "").lower()
        if tag and tag not in hashtags:
            hashtags.append(tag)

    # Fallback: parse hashtags straight out of the tweet text
    if not hashtags:
        full_text = tweet_data.get("full_text", "")
        hashtags = [m.group(1).lower() for m in re.finditer(r"#(\w+)", full_text)]

    # Deduplicate while preserving first-seen order for stable output files.
    return list(dict.fromkeys(hashtags))
|
||||
|
||||
|
||||
def extract_media(tweet_data: dict) -> list:
    """Extract media items (photos, videos, GIFs) from a tweet.

    Prefers extended_entities.media — which carries full video_info — and
    falls back to entities.media only when extended_entities has none.
    The previous version appended from BOTH sources, so any media present
    in both (the normal case in Twitter exports, where extended_entities
    duplicates entities.media) produced duplicate manifest entries.

    Returns a list of dicts with media_id / media_type / URL fields, plus
    "variants" and "duration_ms" for items that carry video_info.
    """
    media_items = []

    # extended_entities duplicates entities.media but adds video variants;
    # take it when present, otherwise fall back to entities.
    media_list = (
        tweet_data.get("extended_entities", {}).get("media", [])
        or tweet_data.get("entities", {}).get("media", [])
    )
    for m in media_list:
        item = {
            "media_id": m.get("id_str", ""),
            "media_type": m.get("type", ""),  # photo, video, animated_gif
            "media_url_https": m.get("media_url_https", ""),
            "display_url": m.get("display_url", ""),
            "expanded_url": m.get("expanded_url", ""),
        }

        # For videos/GIFs, capture downloadable variants and duration
        if m.get("video_info"):
            variants = m["video_info"].get("variants", [])
            item["variants"] = [
                {
                    "bitrate": v.get("bitrate", 0),
                    "content_type": v.get("content_type", ""),
                    "url": v.get("url", ""),
                }
                for v in variants
            ]
            item["duration_ms"] = m["video_info"].get("duration_millis", 0)

        media_items.append(item)

    return media_items
|
||||
|
||||
|
||||
def find_local_media(media_id: str, media_dir: str) -> str:
    """Find the local file for a media ID in the tweets_media directory.

    Returns the joined path of the first directory entry whose name
    contains *media_id*, or "" when the directory is missing, the id is
    empty, or nothing matches.
    """
    # Guard against an empty media_id: "" is a substring of EVERY file
    # name, so the old code returned an arbitrary file for media entries
    # that lacked an id_str.
    if not media_id or not os.path.isdir(media_dir):
        return ""

    # Files are named like: {tweet_id}-{hash}.ext
    # We search by media_id substring match.
    for fname in os.listdir(media_dir):
        if media_id in fname:
            return os.path.join(media_dir, fname)

    return ""
|
||||
|
||||
|
||||
# ── Main Indexing Logic ────────────────────────────────
|
||||
|
||||
def index_archive(archive_dir: str, output_dir: str):
    """Main indexing pipeline.

    Reads tweets.js from *archive_dir*, keeps tweets carrying any of
    TARGET_HASHTAGS, resolves each tweet's media to local files under
    archive_dir/tweets_media, and writes five artifacts into *output_dir*:
    manifest.jsonl, timmytime_tweets.jsonl, manifest_summary.json,
    hashtag_metrics.json, and hashtag_metrics.md.

    Exits the process with status 1 when tweets.js is missing.
    """
    tweets_js = os.path.join(archive_dir, "tweets.js")
    media_dir = os.path.join(archive_dir, "tweets_media")

    if not os.path.exists(tweets_js):
        log(f"tweets.js not found at {tweets_js}", "ERROR")
        sys.exit(1)

    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Load all tweets
    raw_tweets = load_tweets_js(tweets_js)

    # Step 2: Filter tweets with target hashtags
    log("Filtering for #timmytime and #timmychain...")
    matching_tweets = []  # list of (tweet_dict, hashtags) pairs
    for entry in raw_tweets:
        tweet = entry.get("tweet", entry)  # Handle both nested and flat formats
        hashtags = extract_hashtags(tweet)
        if TARGET_HASHTAGS & set(hashtags):
            matching_tweets.append((tweet, hashtags))

    log(f"Found {len(matching_tweets)} tweets with target hashtags")

    # Step 3: Build media manifest
    log("Building media manifest...")
    manifest_entries = []  # one dict per media item -> manifest.jsonl rows
    tweet_entries = []  # one dict per matching tweet -> timmytime_tweets.jsonl rows
    hashtag_monthly = defaultdict(lambda: defaultdict(int))  # tag -> "YYYY-MM" -> count
    hashtag_cooccurrence = defaultdict(Counter)  # tag -> co-occurring tag -> count
    media_type_counts = Counter()
    total_media = 0

    for tweet, hashtags in matching_tweets:
        tweet_id = tweet.get("id_str", "")
        full_text = tweet.get("full_text", "")
        created_at = tweet.get("created_at", "")
        dt = parse_twitter_timestamp(created_at)  # None when unparseable

        # Record tweet entry
        tweet_entry = {
            "tweet_id": tweet_id,
            "created_at": created_at,
            "full_text": full_text,
            # Only the target tags here; the complete list goes in all_hashtags.
            "hashtags": [h for h in hashtags if h in TARGET_HASHTAGS],
            "all_hashtags": hashtags,
            "favorite_count": tweet.get("favorite_count", 0),
            "retweet_count": tweet.get("retweet_count", 0),
        }
        tweet_entries.append(tweet_entry)

        # Count hashtag occurrences by month
        for tag in TARGET_HASHTAGS & set(hashtags):
            hashtag_monthly[tag][MONTHS(dt)] += 1

        # Count co-occurring hashtags
        for tag in TARGET_HASHTAGS & set(hashtags):
            for other in hashtags:
                if other != tag:
                    hashtag_cooccurrence[tag][other] += 1

        # Extract and map media
        media_items = extract_media(tweet)
        for idx, media in enumerate(media_items):
            total_media += 1
            media_type = media.get("media_type", "unknown")
            media_type_counts[media_type] += 1

            # Find local file ("" when no local copy matched)
            media_id = media.get("media_id", "")
            local_path = find_local_media(media_id, media_dir)

            entry = {
                "tweet_id": tweet_id,
                "created_at": created_at,
                "full_text": full_text,
                "hashtags": hashtags,
                "urls": [u.get("expanded_url", "") for u in tweet.get("entities", {}).get("urls", [])],
                "media_index": idx + 1,  # 1-based position within the tweet
                "media_id": media_id,
                "media_type": media_type,
                "expanded_url": media.get("expanded_url", ""),
                "media_url_https": media.get("media_url_https", ""),
                "display_url": media.get("display_url", ""),
                "variants": media.get("variants", []),
                "local_media_path": local_path,
                "source_file": "tweets.js",
            }
            manifest_entries.append(entry)

    # Step 4: Write manifest.jsonl (one JSON object per line)
    manifest_path = os.path.join(output_dir, "manifest.jsonl")
    with open(manifest_path, "w") as f:
        for entry in manifest_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    log(f"Wrote {len(manifest_entries)} entries to {manifest_path}")

    # Step 5: Write filtered tweets
    tweets_path = os.path.join(output_dir, "timmytime_tweets.jsonl")
    with open(tweets_path, "w") as f:
        for entry in tweet_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    log(f"Wrote {len(tweet_entries)} tweets to {tweets_path}")

    # Step 6: Build manifest summary
    summary = {
        "schema_version": 1,
        "source_dir": archive_dir,
        "source_file": "tweets.js",
        "source_sha256": "",  # Could compute but skip for speed
        "source_size_bytes": os.path.getsize(tweets_js),
        "media_record_count": len(manifest_entries),
        "tweet_count": len(tweet_entries),
        "media_type_counts": dict(media_type_counts),
        "hashtag_counts": {
            tag: sum(1 for _, tags in matching_tweets if tag in tags)
            for tag in TARGET_HASHTAGS
        },
        # Column order of manifest.jsonl rows, for downstream consumers.
        "fields": [
            "tweet_id", "created_at", "full_text", "hashtags", "urls",
            "media_index", "media_id", "media_type", "expanded_url",
            "media_url_https", "display_url", "variants",
            "local_media_path", "source_file",
        ],
    }
    summary_path = os.path.join(output_dir, "manifest_summary.json")
    with open(summary_path, "w") as f:
        json.dump(summary, f, indent=2)
    log(f"Wrote summary to {summary_path}")

    # Step 7: Build hashtag metrics
    metrics = {
        "schema_version": 1,
        "target_tags": sorted(TARGET_HASHTAGS),
        "targets": {},
    }

    for tag in sorted(TARGET_HASHTAGS):
        tag_tweets = [(t, h) for t, h in matching_tweets if tag in h]
        tag_media = [e for e in manifest_entries if tag in e.get("hashtags", [])]

        metrics["targets"][tag] = {
            "tag": tag,
            "post_count": len(tag_tweets),
            "media_posts": len(tag_media),
            "monthly_post_counts": dict(sorted(hashtag_monthly[tag].items())),
            # Month buckets re-derived by re-parsing each manifest entry's
            # created_at string (not cached from the loop above).
            "monthly_media_counts": dict(sorted(
                defaultdict(int, {
                    m: len([e for e in tag_media if MONTHS(parse_twitter_timestamp(e.get("created_at", ""))) == m])
                    for m in sorted(set(MONTHS(parse_twitter_timestamp(e.get("created_at", ""))) for e in tag_media))
                }).items()
            )),
            "top_cooccurring_hashtags": [
                {"tag": t, "count": c}
                for t, c in hashtag_cooccurrence[tag].most_common(10)
            ],
            "media_types": {
                mt: len([e for e in tag_media if e.get("media_type") == mt])
                for mt in sorted(set(e.get("media_type", "") for e in tag_media))
            },
        }

    # Add overlap between the two tags (tweets/media carrying BOTH)
    overlap_tweets = [
        t for t, h in matching_tweets if TARGET_HASHTAGS.issubset(set(h))
    ]
    metrics["overlap"] = {
        "tweet_posts": len(overlap_tweets),
        "media_posts": len([
            e for e in manifest_entries
            if TARGET_HASHTAGS.issubset(set(e.get("hashtags", [])))
        ]),
    }

    metrics_path = os.path.join(output_dir, "hashtag_metrics.json")
    with open(metrics_path, "w") as f:
        json.dump(metrics, f, indent=2)
    log(f"Wrote metrics to {metrics_path}")

    # Step 8: Generate human-readable report (markdown with bar charts)
    report_lines = []
    report_lines.append("# Know Thy Father — Hashtag Metrics Report")
    report_lines.append(f"\nGenerated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
    report_lines.append(f"\n## Overview")
    report_lines.append(f"- Total media items indexed: **{len(manifest_entries)}**")
    report_lines.append(f"- Total tweets with target hashtags: **{len(tweet_entries)}**")
    report_lines.append(f"- Media types: {', '.join(f'{k} ({v})' for k, v in sorted(media_type_counts.items()))}")

    for tag in sorted(TARGET_HASHTAGS):
        data = metrics["targets"][tag]
        report_lines.append(f"\n## #{tag}")
        report_lines.append(f"- Posts: **{data['post_count']}**")
        report_lines.append(f"- With media: **{data['media_posts']}**")
        report_lines.append(f"\n### Monthly Activity")
        for month, count in sorted(data["monthly_post_counts"].items()):
            # One block character per post in that month.
            bar = "█" * count
            report_lines.append(f" {month}: {bar} ({count})")
        report_lines.append(f"\n### Top Co-occurring Hashtags")
        for item in data["top_cooccurring_hashtags"][:5]:
            report_lines.append(f" - #{item['tag']} ({item['count']}x)")

    report_lines.append(f"\n## Overlap")
    report_lines.append(f"Tweets with BOTH #timmytime AND #timmychain:")
    report_lines.append(f"- Tweet posts: {metrics['overlap']['tweet_posts']}")
    report_lines.append(f"- Media posts: {metrics['overlap']['media_posts']}")

    report_lines.append("\n---")
    report_lines.append("*Generated by Know Thy Father Phase 1: Media Indexer*")

    report_path = os.path.join(output_dir, "hashtag_metrics.md")
    with open(report_path, "w") as f:
        f.write("\n".join(report_lines))
    log(f"Wrote report to {report_path}")

    # Print summary banner to stdout
    print(f"\n{'='*60}")
    print(f" MEDIA INDEXING COMPLETE")
    print(f"{'='*60}")
    print(f" Tweets scanned: {len(raw_tweets)}")
    print(f" Target hashtag hits: {len(matching_tweets)}")
    print(f" Media items indexed: {len(manifest_entries)}")
    print(f" #timmytime posts: {sum(1 for _, h in matching_tweets if 'timmytime' in h)}")
    print(f" #timmychain posts: {sum(1 for _, h in matching_tweets if 'timmychain' in h)}")
    print(f" Photos: {media_type_counts.get('photo', 0)}")
    print(f" Videos: {media_type_counts.get('video', 0)}")
    print(f" GIFs: {media_type_counts.get('animated_gif', 0)}")
    print(f"{'='*60}")
|
||||
|
||||
|
||||
# ── CLI ─────────────────────────────────────────────────
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, validate the archive dir, run the indexer."""
    parser = argparse.ArgumentParser(description="Know Thy Father — Phase 1: Media Indexer")
    parser.add_argument(
        "--archive-dir",
        default=DEFAULT_ARCHIVE,
        help="Path to Twitter archive data directory",
    )
    parser.add_argument(
        "--output-dir",
        default=None,
        help="Output directory (default: archive-dir/../twitter-archive/media)",
    )
    args = parser.parse_args()

    # Bail out early when the archive directory does not exist.
    if not os.path.isdir(args.archive_dir):
        log(f"Archive directory not found: {args.archive_dir}", "ERROR")
        sys.exit(1)

    # Default output: <repo root>/twitter-archive/media, derived from this file.
    if args.output_dir:
        destination = args.output_dir
    else:
        repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        destination = os.path.join(repo_root, "twitter-archive", "media")

    index_archive(args.archive_dir, destination)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # main() has no return statement (returns None), so this exits with
    # status 0 on success; error paths call sys.exit(1) before reaching here.
    sys.exit(main())
|
||||
Reference in New Issue
Block a user