Files
timmy-home/scripts/twitter_archive/extract_archive.py
2026-03-27 18:09:28 -04:00

101 lines
2.8 KiB
Python

#!/usr/bin/env python3
"""Deterministically extract the private Twitter archive into JSONL."""
from __future__ import annotations
import json
from pathlib import Path
from .common import (
ARCHIVE_DIR,
EXTRACTED_DIR,
ensure_layout,
normalize_tweet,
resolve_source_dir,
sort_tweets,
stable_sha256,
write_json,
)
def strip_js_prefix(raw_text: str) -> str:
    """Return the JSON array portion of a ``tweets.js`` payload.

    Everything that precedes the first ``[`` (for example a JavaScript
    assignment wrapping the data) is dropped, and surrounding whitespace is
    trimmed from what remains.

    Args:
        raw_text: Full text content of the ``tweets.js`` file.

    Returns:
        The text from the first ``[`` onwards, stripped of whitespace.

    Raises:
        ValueError: If ``raw_text`` contains no ``[`` character.
    """
    _prefix, bracket, remainder = raw_text.partition("[")
    if not bracket:
        # partition() yields an empty separator when "[" never occurs.
        raise ValueError("Could not find JSON array in tweets.js")
    return (bracket + remainder).strip()
def write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as JSON Lines, one object per line.

    Parent directories are created if missing and any existing file is
    overwritten. Each row is serialized with sorted keys so the same input
    always yields the same text.

    Args:
        path: Destination file.
        rows: JSON-serializable dictionaries, written in the given order.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding and disable newline translation so the output bytes
    # are identical on every platform (no locale-dependent codec, no CRLF
    # on Windows).
    with open(path, "w", encoding="utf-8", newline="\n") as handle:
        handle.writelines(
            json.dumps(row, sort_keys=True) + "\n" for row in rows
        )
def build_manifest(source_path: Path, tweets: list[dict], retweets: list[dict]) -> dict:
    """Assemble the manifest dictionary describing one extraction run.

    Args:
        source_path: Path of the archive file the records were read from.
        tweets: Normalized non-retweet records.
        retweets: Normalized retweet records.

    Returns:
        A dictionary holding the schema version, the source file's location,
        SHA-256 digest and size, record counts, the ``created_at`` values of
        the first and last records in ``sort_tweets`` order (``None`` when
        there are no records), and the list of per-record field names.
    """
    # Date bounds come from the combined timeline, ordered by sort_tweets.
    timeline = sort_tweets(tweets + retweets)
    if timeline:
        first_seen = timeline[0]["created_at"]
        last_seen = timeline[-1]["created_at"]
    else:
        first_seen = last_seen = None

    manifest = {
        "schema_version": 1,
        "source_dir": str(source_path.parent),
        "source_file": source_path.name,
        "source_sha256": stable_sha256(source_path),
        "source_size_bytes": source_path.stat().st_size,
        "tweet_count": len(tweets),
        "retweet_count": len(retweets),
        "earliest_date": first_seen,
        "latest_date": last_seen,
    }
    # Names of the fields listed for each extracted record.
    manifest["fields"] = [
        "tweet_id",
        "created_at",
        "full_text",
        "is_retweet",
        "reply_to_tweet_id",
        "reply_to_user_id",
        "lang",
        "favorite_count",
        "retweet_count",
        "mentions",
        "hashtags",
        "urls",
        "source_file",
    ]
    return manifest
def main() -> None:
    """Extract ``tweets.js`` into JSONL files plus a manifest.

    Reads ``tweets.js`` from the directory returned by
    ``resolve_source_dir()``, normalizes every entry with
    ``normalize_tweet`` (entries that normalize to a falsy value are
    skipped), splits the results into tweets and retweets, and writes each
    group, ordered by ``sort_tweets``, to ``tweets.jsonl`` and
    ``retweets.jsonl`` under ``EXTRACTED_DIR``. The manifest is written to
    both ``EXTRACTED_DIR`` and ``ARCHIVE_DIR``, and a JSON status line is
    printed to stdout.

    Raises:
        SystemExit: With a JSON error payload when ``tweets.js`` is missing.
        ValueError: If the file contains no JSON array or invalid JSON.
    """
    ensure_layout()
    source_path = resolve_source_dir() / "tweets.js"
    if not source_path.exists():
        raise SystemExit(json.dumps({"status": "error", "reason": f"missing {source_path}"}))

    # Decode explicitly as UTF-8 instead of the locale default, so non-ASCII
    # tweet text is read the same way on every platform. This assumes the
    # archive is UTF-8, the standard encoding for JSON text.
    raw_payload = strip_js_prefix(source_path.read_text(encoding="utf-8"))
    tweet_entries = json.loads(raw_payload)

    tweets: list[dict] = []
    retweets: list[dict] = []
    for entry in tweet_entries:
        normalized = normalize_tweet(entry, source_path.name)
        if not normalized:
            continue
        bucket = retweets if normalized["is_retweet"] else tweets
        bucket.append(normalized)

    tweets = sort_tweets(tweets)
    retweets = sort_tweets(retweets)
    write_jsonl(EXTRACTED_DIR / "tweets.jsonl", tweets)
    write_jsonl(EXTRACTED_DIR / "retweets.jsonl", retweets)

    manifest = build_manifest(source_path, tweets, retweets)
    write_json(EXTRACTED_DIR / "manifest.json", manifest)
    write_json(ARCHIVE_DIR / "manifest.json", manifest)
    print(json.dumps({"status": "ok", **manifest}, sort_keys=True))


if __name__ == "__main__":
    main()