compounding-intelligence/connectors/twitter_archive.py
Rockachopa 8628a0d610 feat(connectors): add sovereign personal archive connector pack foundation
- Add connectors/ directory with base infrastructure
- Implement SourceEvent unified schema (source/account/thread/author/timestamp/content/attachments/raw_ref/hash/consent_scope)
- Create BaseConnector abstract class with checkpoint/dedup/consent gates
- Implement TwitterArchiveConnector for official Twitter/X data exports
- Add run_connector.py CLI entry point
- Add comprehensive test suite (13 tests, all passing)
- Add connectors/README.md with usage docs
- Add Makefile targets: test-connectors, run-connector, connectors-help
- Reference parent EPIC #194 and issue #233

This is the foundational connector pack. Future work: Discord, Slack, WhatsApp, Notion, iMessage, Google.
2026-04-26 20:45:07 -04:00
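
The SourceEvent schema and compute_event_hash live in connectors/schema.py, which is not shown on this page. Below is a minimal sketch of what that module plausibly contains, inferred from the fields this connector populates; the types, defaults, and SHA-256 hashing are assumptions, not the committed code.

# Hypothetical reconstruction of connectors/schema.py — inferred, not the committed source.
from dataclasses import dataclass, field
import hashlib

@dataclass
class SourceEvent:
    source: str                 # e.g. "twitter"
    account: str                # normalized account identifier
    thread_or_channel: str      # thread/channel key within the source
    author: str                 # event author
    timestamp: str              # ISO-8601 string (raw value kept as fallback)
    content: str                # main text body
    attachments: list = field(default_factory=list)  # media URLs
    raw_ref: str = ""           # provenance pointer back to the raw record
    hash: str = ""              # dedup hash over identifying fields
    consent_scope: str = "memory_only"
    metadata: dict = field(default_factory=dict)

def compute_event_hash(source: str, raw_ref: str, content: str,
                       timestamp: str, author: str) -> str:
    """One plausible implementation: SHA-256 over the identifying fields."""
    key = "\x1f".join([source, raw_ref, content, timestamp, author])
    return hashlib.sha256(key.encode("utf-8")).hexdigest()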


#!/usr/bin/env python3
"""
connectors/twitter_archive.py — Twitter/X personal archive connector.
Parses official Twitter data exports (Twitter's "Download your data" archive).
Expects the tweet.js / tweet.json files from the archive's data/ directory.
Format (Twitter's archived tweets JSON):
Each entry has: {"tweet": {"id_str": "...", "full_text": "...", "created_at": "...", ...}}
Output: normalized SourceEvent with source='twitter'.
"""
import json
import logging
import re
from datetime import datetime
from pathlib import Path
from typing import Iterator, Optional

from .base import BaseConnector
from .schema import SourceEvent, compute_event_hash

logger = logging.getLogger(__name__)
class TwitterArchiveConnector(BaseConnector):
    """Connector for Twitter/X official archive exports."""

    name = "twitter_archive"
    source_glob = "**/tweet*.js*"  # matches both tweet*.js and tweet*.json exports
    default_consent_scope = "memory_only"

    # Twitter's date format in archives: "Wed Oct 10 20:19:24 +0000 2018"
    TWITTER_DATE_FMT = "%a %b %d %H:%M:%S %z %Y"
    def discover_sources(self, root: Path) -> Iterator[Path]:
        """
        Find tweet.js / tweet.json files in a Twitter archive.
        The official Twitter export places these under:
            root/
                data/
                    tweet.js    (single-file format, older exports)
        or
            root/
                account-XXXX-YYYY/
                    tweets.js   (per-month split format)
        """
        root = Path(root)
        # Search for tweet*.js and tweet*.json files — these contain the tweet JSON blobs
        candidates = list(root.rglob("tweet*.js")) + list(root.rglob("tweet*.json"))
        logger.info("Discovered %d Twitter archive files under %s", len(candidates), root)
        for path in candidates:
            yield path
    def parse_source(self, source: Path) -> Iterator[SourceEvent]:
        """
        Parse a Twitter archive file and yield SourceEvents.
        Handles both single-file (old) and per-month split formats.
        Twitter wraps the JSON array in a JS variable assignment:
            window.YTD.tweet.part0 = [...]
        """
        try:
            with open(source, 'r', encoding='utf-8') as f:
                raw = f.read()
            # Extract the JSON array from the JS wrapper ('.*?' also tolerates an empty array)
            match = re.search(r'=\s*(\[.*?\])\s*;?\s*$', raw, re.DOTALL)
            if match:
                json_str = match.group(1)
                records = json.loads(json_str)
            else:
                # Plain JSON array (no wrapper)
                records = json.loads(raw)
            logger.debug("Parsing %d tweet records from %s", len(records), source)
            for record in records:
                event = self._record_to_event(record, source)
                if event:
                    yield event
        except Exception as e:
            logger.error("Failed to parse %s: %s", source, e)
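
    # Example of the wrapper handling above: a per-month file begins with
    #     window.YTD.tweet.part0 = [ {"tweet": {...}}, ... ]
    # and the regex captures only the bracketed array (dropping the prefix and
    # any trailing semicolon), leaving plain JSON that json.loads() can parse.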
    def _record_to_event(self, record: dict, source_path: Path) -> Optional[SourceEvent]:
        """
        Convert a single tweet record into a SourceEvent.
        The record can be either the wrapped format {"tweet": {...}} or the bare tweet object.
        """
        # Unwrap the tweet object
        tweet = record.get('tweet', record)
        # Extract core fields
        id_str = tweet.get('id_str') or tweet.get('id')
        if not id_str:
            return None  # skip malformed records with no tweet ID
        full_text = tweet.get('full_text') or tweet.get('text', '')
        created_at = tweet.get('created_at', '')
        # Parse timestamp
        try:
            dt = datetime.strptime(created_at, self.TWITTER_DATE_FMT)
            iso_ts = dt.astimezone().isoformat()
        except (ValueError, TypeError):
            iso_ts = created_at  # fallback: keep the raw string as-is
        # Author is always the account owner (Twitter archives don't include others' DMs by default)
        account = "user_archive"  # normalized account identifier
        # Thread/channel: threads aren't preserved in the basic export, so key each tweet by its own ID
        thread_id = f"tweet_{id_str}"
        # Attachments: extract media URLs
        attachments = []
        extended_entities = tweet.get('extended_entities', {})
        for media in extended_entities.get('media', []):
            url = media.get('media_url_https') or media.get('media_url')
            if url:
                attachments.append(url)
        # Build a stable provenance reference
        raw_ref = f"twitter:archive:{source_path.name}:{id_str}"
        # Compute the dedup hash
        content_for_hash = full_text or ""
        hash_val = compute_event_hash(
            source="twitter",
            raw_ref=raw_ref,
            content=content_for_hash,
            timestamp=iso_ts,
            author=account,
        )
        # Preserve metadata for provenance
        metadata = {
            "tweet_id": id_str,
            "source_file": str(source_path),
            "favorite_count": tweet.get('favorite_count'),
            "retweet_count": tweet.get('retweet_count'),
            "in_reply_to_status_id": tweet.get('in_reply_to_status_id_str'),
            "lang": tweet.get('lang'),
        }
        return SourceEvent(
            source="twitter",
            account=account,
            thread_or_channel=thread_id,
            author=account,
            timestamp=iso_ts,
            content=full_text,
            attachments=attachments,
            raw_ref=raw_ref,
            hash=hash_val,
            consent_scope=self.consent_scope,
            metadata=metadata,
        )
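
A quick way to smoke-test the parser by hand, bypassing the checkpoint/dedup machinery in BaseConnector. This is a hypothetical sketch: it assumes the connector can be constructed with no arguments, that consent_scope defaults from default_consent_scope, and the archive path is a placeholder.

from pathlib import Path
from connectors.twitter_archive import TwitterArchiveConnector

connector = TwitterArchiveConnector()  # assumption: zero-arg construction
archive_root = Path("~/Downloads/twitter-archive").expanduser()  # hypothetical path
for source in connector.discover_sources(archive_root):
    for event in connector.parse_source(source):
        print(event.timestamp, event.content[:80], sep="\t")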