#!/usr/bin/env python3
|
||
|
|
"""Deterministically extract the private Twitter archive into JSONL."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from .common import (
|
||
|
|
ARCHIVE_DIR,
|
||
|
|
EXTRACTED_DIR,
|
||
|
|
ensure_layout,
|
||
|
|
normalize_tweet,
|
||
|
|
resolve_source_dir,
|
||
|
|
sort_tweets,
|
||
|
|
stable_sha256,
|
||
|
|
write_json,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
def strip_js_prefix(raw_text: str) -> str:
    """Return the JSON array portion of a tweets.js payload.

    Twitter archives wrap the tweet data in a JavaScript assignment
    (``window.YTD... = [...]``); everything before the first ``[`` is
    discarded and surrounding whitespace is trimmed.

    Raises:
        ValueError: if no ``[`` occurs anywhere in *raw_text*.
    """
    _prefix, bracket, remainder = raw_text.partition("[")
    if not bracket:
        raise ValueError("Could not find JSON array in tweets.js")
    return (bracket + remainder).strip()
|
||
|
|
|
||
|
|
|
||
|
|
def write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write *rows* to *path* as JSON Lines, one object per line.

    Parent directories are created as needed.  The file is opened with an
    explicit UTF-8 encoding and ``\\n`` line endings so the output bytes are
    identical across platforms (the default locale encoding and newline
    translation would make the result platform-dependent, defeating the
    module's goal of a deterministic extraction).  Keys are sorted for the
    same reason.

    Args:
        path: Destination file; created (with parents) if absent.
        rows: JSON-serializable dicts, written in the given order.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8", newline="\n") as handle:
        for row in rows:
            handle.write(json.dumps(row, sort_keys=True) + "\n")
|
||
|
|
|
||
|
|
|
||
|
|
def build_manifest(source_path: Path, tweets: list[dict], retweets: list[dict]) -> dict:
    """Describe one extraction run: source provenance, counts, and date span.

    Args:
        source_path: The tweets.js file the data was extracted from.
        tweets: Normalized non-retweet entries.
        retweets: Normalized retweet entries.

    Returns:
        A JSON-serializable manifest dict (schema_version 1).
    """
    # Chronological ordering of everything combined gives the date span.
    combined = sort_tweets(tweets + retweets)
    earliest = combined[0]["created_at"] if combined else None
    latest = combined[-1]["created_at"] if combined else None

    # Column names present in each JSONL row, in their canonical order.
    field_names = [
        "tweet_id",
        "created_at",
        "full_text",
        "is_retweet",
        "reply_to_tweet_id",
        "reply_to_user_id",
        "lang",
        "favorite_count",
        "retweet_count",
        "mentions",
        "hashtags",
        "urls",
        "source_file",
    ]

    manifest = {
        "schema_version": 1,
        "source_dir": str(source_path.parent),
        "source_file": source_path.name,
        "source_sha256": stable_sha256(source_path),
        "source_size_bytes": source_path.stat().st_size,
        "tweet_count": len(tweets),
        "retweet_count": len(retweets),
        "earliest_date": earliest,
        "latest_date": latest,
        "fields": field_names,
    }
    return manifest
|
||
|
|
|
||
|
|
|
||
|
|
def main() -> None:
    """Extract tweets.js into sorted tweet/retweet JSONL files plus a manifest.

    Exits with a JSON error payload when the archive source file is missing;
    otherwise writes tweets.jsonl, retweets.jsonl and manifest.json under
    EXTRACTED_DIR (plus a manifest copy under ARCHIVE_DIR) and prints the
    manifest with a ``"status": "ok"`` marker.
    """
    ensure_layout()
    source_dir = resolve_source_dir()
    source_path = source_dir / "tweets.js"
    if not source_path.exists():
        raise SystemExit(json.dumps({"status": "error", "reason": f"missing {source_path}"}))

    # Twitter archive exports are UTF-8; decode explicitly so the run does
    # not depend on the platform's default locale encoding (the bare
    # read_text() could raise UnicodeDecodeError on non-UTF-8 locales).
    raw_payload = strip_js_prefix(source_path.read_text(encoding="utf-8"))
    tweet_entries = json.loads(raw_payload)

    tweets: list[dict] = []
    retweets: list[dict] = []
    for entry in tweet_entries:
        normalized = normalize_tweet(entry, source_path.name)
        if not normalized:
            # Entries the normalizer rejects are skipped silently.
            continue
        if normalized["is_retweet"]:
            retweets.append(normalized)
        else:
            tweets.append(normalized)

    # Deterministic ordering so repeated runs produce identical files.
    tweets = sort_tweets(tweets)
    retweets = sort_tweets(retweets)

    write_jsonl(EXTRACTED_DIR / "tweets.jsonl", tweets)
    write_jsonl(EXTRACTED_DIR / "retweets.jsonl", retweets)

    manifest = build_manifest(source_path, tweets, retweets)
    write_json(EXTRACTED_DIR / "manifest.json", manifest)
    write_json(ARCHIVE_DIR / "manifest.json", manifest)

    print(json.dumps({"status": "ok", **manifest}, sort_keys=True))
|
||
|
|
|
||
|
|
|
||
|
|
# Standard script entry guard: run the extraction only when executed
# directly, not when imported as part of the package.
if __name__ == "__main__":
    main()
|