- Add connectors/ directory with base infrastructure - Implement SourceEvent unified schema (source/account/thread/author/timestamp/content/attachments/raw_ref/hash/consent_scope) - Create BaseConnector abstract class with checkpoint/dedup/consent gates - Implement TwitterArchiveConnector for official Twitter/X data exports - Add run_connector.py CLI entry point - Add comprehensive test suite (13 tests, all passing) - Add connectors/README.md with usage docs - Add Makefile targets: test-connectors, run-connector, connectors-help - Reference parent EPIC #194 and issue #233 This is the foundational connector pack. Future work: Discord, Slack, WhatsApp, Notion, iMessage, Google.
156 lines
5.4 KiB
Python
156 lines
5.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
connectors/twitter_archive.py — Twitter/X personal archive connector.
|
|
|
|
Parses official Twitter data exports (Twitter's "Download your data" archive).
|
|
Expects the tweet.js / tweet.json files from the archive's data/ directory.
|
|
|
|
Format (Twitter's archived tweets JSON):
|
|
Each entry has: {"tweet": {"id_str": "...", "full_text": "...", "created_at": "...", ...}}
|
|
|
|
Output: normalized SourceEvent with source='twitter'.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Iterator, Optional
|
|
import logging
|
|
|
|
from .base import BaseConnector
|
|
from .schema import SourceEvent, compute_event_hash
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TwitterArchiveConnector(BaseConnector):
    """Connector for Twitter/X official archive exports.

    Parses the tweet.js / tweet.json files from an official
    "Download your data" archive and yields normalized SourceEvents
    with source='twitter'.
    """

    name = "twitter_archive"
    source_glob = "**/tweet*.json"
    default_consent_scope = "memory_only"

    # Twitter's date format in archives: "Wed Oct 10 20:19:24 +0000 2018"
    TWITTER_DATE_FMT = "%a %b %d %H:%M:%S %z %Y"

    def discover_sources(self, root: Path) -> Iterator[Path]:
        """
        Find tweet.js / tweet.json files in a Twitter archive.

        The official Twitter export places these under:
            root/
              data/
                tweet.js          (single-file format, older exports)
              or
              account-XXXX-YYYY/
                tweets.js         (per-month split format)

        Yields:
            Paths to candidate archive files, in rglob order
            (.js candidates first, then .json).
        """
        root = Path(root)
        # Search for .js/.json files whose names start with 'tweet' — these
        # contain the tweet JSON blobs. The two patterns are disjoint, so no
        # file is yielded twice.
        candidates = list(root.rglob("tweet*.js")) + list(root.rglob("tweet*.json"))
        logger.info("Discovered %d Twitter archive files under %s", len(candidates), root)
        for path in candidates:
            yield path

    def parse_source(self, source: Path) -> Iterator[SourceEvent]:
        """
        Parse a Twitter archive file and yield SourceEvents.

        Handles both single-file (old) and per-month split formats.
        Twitter wraps the JSON array in a JS variable assignment:
        `window.YTD.tweet.part0 = [...]`

        Error handling is best-effort: a file-level failure (unreadable
        file, malformed JSON) is logged and swallowed so one bad file does
        not abort a whole run, and a single malformed record is logged and
        skipped rather than aborting the rest of the file.
        """
        try:
            with open(source, 'r', encoding='utf-8') as f:
                raw = f.read()

            # Extract the JSON array from the JS wrapper.
            # BUGFIX: `\[.*?\]` (was `\[.+?\]`) so an empty archive
            # (`window.YTD.tweet.part0 = []`) still matches; with the old
            # pattern the fallback json.loads(raw) would then also fail on
            # the JS wrapper and the whole file was rejected.
            match = re.search(r'=\s*(\[.*?\])\s*;?\s*$', raw, re.DOTALL)
            if match:
                records = json.loads(match.group(1))
            else:
                # Plain JSON array (no wrapper)
                records = json.loads(raw)
        except Exception as e:
            logger.error("Failed to parse %s: %s", source, e)
            return

        logger.debug("Parsing %d tweet records from %s", len(records), source)

        for record in records:
            try:
                event = self._record_to_event(record, source)
            except Exception as e:
                # BUGFIX: previously a single malformed record raised out of
                # the file-level try and silently dropped every remaining
                # record in the file. Skip just the bad record instead.
                logger.warning("Skipping malformed record in %s: %s", source, e)
                continue
            if event:
                yield event

    def _record_to_event(self, record: dict, source_path: Path) -> Optional[SourceEvent]:
        """
        Convert a single tweet record into a SourceEvent.

        Args:
            record: Either the wrapped format {"tweet": {...}} or the bare
                tweet object itself.
            source_path: Archive file the record came from (recorded in
                raw_ref and metadata for provenance).

        Returns:
            A normalized SourceEvent, or None if the record cannot be
            converted. (Currently always returns an event; the Optional
            return leaves room for future filtering.)
        """
        # Unwrap the tweet object (falls back to the record itself for the
        # bare format).
        tweet = record.get('tweet', record)

        # Extract core fields; id_str/full_text may be absent in malformed
        # records, in which case they fall back to the alternate key or ''.
        id_str = tweet.get('id_str') or tweet.get('id')
        full_text = tweet.get('full_text') or tweet.get('text', '')
        created_at = tweet.get('created_at', '')

        # Parse timestamp; note astimezone() renders in the *local* zone,
        # so the ISO string is machine-dependent (instant is preserved).
        try:
            dt = datetime.strptime(created_at, self.TWITTER_DATE_FMT)
            iso_ts = dt.astimezone().isoformat()
        except Exception:
            iso_ts = created_at  # fallback: keep as-is

        # Author is always the account owner (Twitter archives don't include
        # others' DMs by default).
        account = "user_archive"  # normalized account identifier

        # Thread/channel: individual tweets have no thread ID; threads aren't
        # preserved in the basic export, so each tweet is its own "thread".
        thread_id = f"tweet_{id_str}"

        # Attachments: extract media URLs, preferring the https variant.
        attachments = []
        extended_entities = tweet.get('extended_entities', {})
        for media in extended_entities.get('media', []):
            url = media.get('media_url_https') or media.get('media_url')
            if url:
                attachments.append(url)

        # Stable provenance reference back to the archive file + tweet id.
        raw_ref = f"twitter:archive:{source_path.name}:{id_str}"

        # Content hash for dedup (schema-defined helper).
        hash_val = compute_event_hash(
            source="twitter",
            raw_ref=raw_ref,
            content=full_text or "",
            timestamp=iso_ts,
            author=account,
        )

        # Preserve selected metadata for provenance / downstream ranking.
        metadata = {
            "tweet_id": id_str,
            "source_file": str(source_path),
            "favorite_count": tweet.get('favorite_count'),
            "retweet_count": tweet.get('retweet_count'),
            "in_reply_to_status_id": tweet.get('in_reply_to_status_id_str'),
            "lang": tweet.get('lang'),
        }

        return SourceEvent(
            source="twitter",
            account=account,
            thread_or_channel=thread_id,
            author=account,
            timestamp=iso_ts,
            content=full_text,
            attachments=attachments,
            raw_ref=raw_ref,
            # consent_scope presumably resolved by BaseConnector from
            # default_consent_scope/config — confirm against base class.
            consent_scope=self.consent_scope,
            metadata=metadata,
        )
|