#!/usr/bin/env python3
"""Deterministically extract the private Twitter archive into JSONL."""

from __future__ import annotations

import json
from pathlib import Path

from .common import (
    ARCHIVE_DIR,
    EXTRACTED_DIR,
    ensure_layout,
    normalize_tweet,
    resolve_source_dir,
    sort_tweets,
    stable_sha256,
    write_json,
)


def strip_js_prefix(raw_text: str) -> str:
    """Return the JSON array portion of a ``tweets.js`` payload.

    Everything before the first ``[`` (the JavaScript assignment prefix) is
    dropped and surrounding whitespace is stripped from the remainder.

    Raises:
        ValueError: If *raw_text* contains no ``[`` at all.
    """
    start = raw_text.find("[")
    if start == -1:
        raise ValueError("Could not find JSON array in tweets.js")
    return raw_text[start:].strip()


def write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write *rows* to *path* as one JSON object per line.

    Parent directories are created if missing. Keys are sorted so that the
    same rows always serialize to the same text.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding and the line terminator: the platform defaults
    # (locale encoding, "\r\n" translation on Windows) would make the output
    # bytes depend on the machine that ran the extraction.
    with open(path, "w", encoding="utf-8", newline="\n") as handle:
        for row in rows:
            handle.write(json.dumps(row, sort_keys=True) + "\n")


def build_manifest(source_path: Path, tweets: list[dict], retweets: list[dict]) -> dict:
    """Build the manifest dict describing one extraction run.

    Args:
        source_path: Path of the archive file the rows were extracted from.
        tweets: Normalized non-retweet rows.
        retweets: Normalized retweet rows.

    Returns:
        A dict with source file identity (directory, name, SHA-256, size),
        row counts, the ``created_at`` of the first and last row of the
        combined, sorted list (``None`` when there are no rows), and the
        list of field names.
    """
    # NOTE(review): earliest/latest rely on sort_tweets ordering rows
    # chronologically -- confirm against its definition in .common.
    ordered = sort_tweets(tweets + retweets)
    return {
        "schema_version": 1,
        "source_dir": str(source_path.parent),
        "source_file": source_path.name,
        "source_sha256": stable_sha256(source_path),
        "source_size_bytes": source_path.stat().st_size,
        "tweet_count": len(tweets),
        "retweet_count": len(retweets),
        "earliest_date": ordered[0]["created_at"] if ordered else None,
        "latest_date": ordered[-1]["created_at"] if ordered else None,
        "fields": [
            "tweet_id",
            "created_at",
            "full_text",
            "is_retweet",
            "reply_to_tweet_id",
            "reply_to_user_id",
            "lang",
            "favorite_count",
            "retweet_count",
            "mentions",
            "hashtags",
            "urls",
            "source_file",
        ],
    }


def main() -> None:
    """Extract ``tweets.js`` into JSONL files plus a manifest.

    Reads ``tweets.js`` from the resolved source directory, normalizes each
    entry, splits the results into tweets and retweets, and writes
    ``tweets.jsonl``, ``retweets.jsonl`` and ``manifest.json`` under
    ``EXTRACTED_DIR`` (the manifest is also written under ``ARCHIVE_DIR``).
    A JSON status object is printed to stdout on success.

    Raises:
        SystemExit: With a JSON error payload when ``tweets.js`` is missing.
        ValueError: If the file contains no JSON array.
    """
    ensure_layout()
    source_dir = resolve_source_dir()
    source_path = source_dir / "tweets.js"
    if not source_path.exists():
        raise SystemExit(json.dumps({"status": "error", "reason": f"missing {source_path}"}))

    # Decode explicitly as UTF-8 rather than with the locale's default
    # encoding, so non-ASCII tweet text is read the same way everywhere.
    raw_payload = strip_js_prefix(source_path.read_text(encoding="utf-8"))
    tweet_entries = json.loads(raw_payload)

    tweets: list[dict] = []
    retweets: list[dict] = []
    for entry in tweet_entries:
        normalized = normalize_tweet(entry, source_path.name)
        if not normalized:
            # normalize_tweet returned a falsy value; skip this entry.
            continue
        if normalized["is_retweet"]:
            retweets.append(normalized)
        else:
            tweets.append(normalized)

    tweets = sort_tweets(tweets)
    retweets = sort_tweets(retweets)

    write_jsonl(EXTRACTED_DIR / "tweets.jsonl", tweets)
    write_jsonl(EXTRACTED_DIR / "retweets.jsonl", retweets)

    manifest = build_manifest(source_path, tweets, retweets)
    write_json(EXTRACTED_DIR / "manifest.json", manifest)
    write_json(ARCHIVE_DIR / "manifest.json", manifest)
    print(json.dumps({"status": "ok", **manifest}, sort_keys=True))


if __name__ == "__main__":
    main()