Files
timmy-home/twitter-archive/process_tweets.py

55 lines
1.7 KiB
Python
Raw Normal View History

import json
import os
import sys
from datetime import datetime
# Split a Twitter archive export (data/tweets.js) into original tweets and
# retweets, write each group as JSONL, and write a small manifest with the
# counts and the date range covered.

# Default input: tweets.js inside the unpacked Twitter export.
DEFAULT_TWEETS_JS_PATH = os.path.expanduser("~/Downloads/twitter-2026-03-27-d4471cc6eb6703034d592f870933561ebee374d9d9b90c9b8923abff064afc1e/data/tweets.js")
# Default output root; JSONL files go to <root>/extracted, manifest to <root>.
DEFAULT_ARCHIVE_DIR = os.path.expanduser("~/.timmy/twitter-archive")

# Layout of 'created_at' in Twitter data, e.g. "Wed Oct 10 20:19:24 +0000 2018".
# NOTE(review): taken from the Twitter tweet-object docs; confirm against the
# actual export if the manifest dates come out as null.
CREATED_AT_FORMAT = "%a %b %d %H:%M:%S %z %Y"


def _load_tweets(tweets_js_path):
    """Return the list of entries stored in a ``tweets.js`` export file.

    The file is JavaScript of the form ``window.YTD.tweets.partN = [...]``.
    Parsing starts at the first ``[`` so any ``partN`` prefix (or none at
    all) is accepted and the tweet text itself is never altered.

    Raises:
        ValueError: if the file contains no ``[`` (no JSON array to parse).
        json.JSONDecodeError: if the array is not valid JSON.
    """
    with open(tweets_js_path, 'r', encoding='utf-8') as f:
        raw = f.read()
    start = raw.find('[')
    if start == -1:
        raise ValueError(f"no JSON array found in {tweets_js_path}")
    return json.loads(raw[start:])


def _is_retweet(tweet):
    """Return True if *tweet* (the inner tweet dict) looks like a retweet."""
    if tweet.get('retweeted'):
        return True
    # NOTE(review): archive exports are reported to leave 'retweeted' false
    # even for retweets, which then only show up as text starting with
    # "RT @". Verify against the export being processed.
    text = tweet.get('full_text') or tweet.get('text') or ''
    return isinstance(text, str) and text.startswith('RT @')


def _parse_created_at(raw):
    """Return *raw* parsed as an aware datetime, or None if it does not match."""
    try:
        return datetime.strptime(raw, CREATED_AT_FORMAT)
    except (TypeError, ValueError):
        return None


def _date_range(tweets):
    """Return ``(earliest, latest)`` 'created_at' strings among *tweets*.

    Ordering is chronological (timestamps are parsed first); comparing the
    raw strings would sort by weekday name. The values returned are the
    original, unmodified strings. Tweets whose 'created_at' is missing or
    unparseable are ignored; ``(None, None)`` is returned if none qualify.
    """
    stamped = []
    for tweet in tweets:
        raw = tweet.get('created_at')
        parsed = _parse_created_at(raw)
        if parsed is not None:
            stamped.append((parsed, raw))
    if not stamped:
        return None, None
    return min(stamped)[1], max(stamped)[1]


def _write_jsonl(path, records):
    """Write *records* to *path*, one JSON document per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(record) + '\n' for record in records)


def main(tweets_js_path=DEFAULT_TWEETS_JS_PATH, archive_dir=DEFAULT_ARCHIVE_DIR):
    """Process one export and write the JSONL files and manifest.

    Args:
        tweets_js_path: path to the export's ``tweets.js``.
        archive_dir: output root. ``extracted/tweets.jsonl`` and
            ``extracted/retweets.jsonl`` are written beneath it and
            ``manifest.json`` directly inside it. Created if missing.

    Returns:
        The manifest dict that was written to ``manifest.json``.
    """
    # Each entry wraps the actual tweet under the 'tweet' key.
    tweets = [entry['tweet'] for entry in _load_tweets(tweets_js_path)]

    original_tweets = []
    retweets = []
    for tweet in tweets:
        if _is_retweet(tweet):
            retweets.append(tweet)
        else:
            original_tweets.append(tweet)

    extracted_dir = os.path.join(archive_dir, 'extracted')
    # Also creates archive_dir itself, so a first run on a clean machine works.
    os.makedirs(extracted_dir, exist_ok=True)
    _write_jsonl(os.path.join(extracted_dir, 'tweets.jsonl'), original_tweets)
    _write_jsonl(os.path.join(extracted_dir, 'retweets.jsonl'), retweets)

    earliest_date, latest_date = _date_range(tweets)
    manifest = {
        'original_count': len(original_tweets),
        'retweet_count': len(retweets),
        'earliest_date': earliest_date,
        'latest_date': latest_date
    }
    with open(os.path.join(archive_dir, 'manifest.json'), 'w', encoding='utf-8') as f:
        json.dump(manifest, f)
    return manifest


if __name__ == "__main__":
    main()