- Add connectors/ directory with base infrastructure - Implement SourceEvent unified schema (source/account/thread/author/timestamp/content/attachments/raw_ref/hash/consent_scope) - Create BaseConnector abstract class with checkpoint/dedup/consent gates - Implement TwitterArchiveConnector for official Twitter/X data exports - Add run_connector.py CLI entry point - Add comprehensive test suite (13 tests, all passing) - Add connectors/README.md with usage docs - Add Makefile targets: test-connectors, run-connector, connectors-help - Reference parent EPIC #194 and issue #233 This is the foundational connector pack. Future work: Discord, Slack, WhatsApp, Notion, iMessage, Google.
156 lines
5.4 KiB
Python
156 lines
5.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
connectors/twitter_archive.py — Twitter/X personal archive connector.
|
|
|
|
Parses official Twitter data exports (Twitter's "Download your data" archive).
|
|
Expects the tweet.js / tweet.json files from the archive's data/ directory.
|
|
|
|
Format (Twitter's archived tweets JSON):
|
|
Each entry has: {"tweet": {"id_str": "...", "full_text": "...", "created_at": "...", ...}}
|
|
|
|
Output: normalized SourceEvent with source='twitter'.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Iterator, Optional
|
|
import logging
|
|
|
|
from .base import BaseConnector
|
|
from .schema import SourceEvent, compute_event_hash
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TwitterArchiveConnector(BaseConnector):
    """Connector for Twitter/X official archive exports.

    Parses the tweet.js / tweet.json files from an official
    "Download your data" archive and yields normalized SourceEvents
    with source='twitter'.
    """

    name = "twitter_archive"
    source_glob = "**/tweet*.json"
    default_consent_scope = "memory_only"

    # Twitter's date format in archives: "Wed Oct 10 20:19:24 +0000 2018"
    TWITTER_DATE_FMT = "%a %b %d %H:%M:%S %z %Y"

    def discover_sources(self, root: Path) -> Iterator[Path]:
        """
        Find tweet.js / tweet.json files in a Twitter archive.

        The official Twitter export places these under:
            root/
              data/
                tweet.js          (single-file format, older exports)
              or
              account-XXXX-YYYY/
                tweets.js         (per-month split format)

        Yields:
            Paths to candidate archive files, in rglob order
            (.js candidates first, then .json).
        """
        root = Path(root)
        # Search for .js/.json files whose names start with 'tweet' — these
        # contain the tweet JSON blobs. The two patterns are disjoint, so no
        # file is yielded twice.
        candidates = list(root.rglob("tweet*.js")) + list(root.rglob("tweet*.json"))
        logger.info("Discovered %d Twitter archive files under %s", len(candidates), root)
        for path in candidates:
            yield path

    def parse_source(self, source: Path) -> Iterator[SourceEvent]:
        """
        Parse a Twitter archive file and yield SourceEvents.

        Handles both single-file (old) and per-month split formats.
        Twitter wraps the JSON array in a JS variable assignment:
        `window.YTD.tweet.part0 = [...]`

        Error handling is best-effort: a file-level failure (unreadable
        file, malformed JSON) is logged and swallowed so one bad file does
        not abort a whole run, and a single malformed record is logged and
        skipped rather than aborting the rest of the file.
        """
        try:
            with open(source, 'r', encoding='utf-8') as f:
                raw = f.read()

            # Extract the JSON array from the JS wrapper.
            # BUGFIX: `\[.*?\]` (was `\[.+?\]`) so an empty archive
            # (`window.YTD.tweet.part0 = []`) still matches; with the old
            # pattern the fallback json.loads(raw) would then also fail on
            # the JS wrapper and the whole file was rejected.
            match = re.search(r'=\s*(\[.*?\])\s*;?\s*$', raw, re.DOTALL)
            if match:
                records = json.loads(match.group(1))
            else:
                # Plain JSON array (no wrapper)
                records = json.loads(raw)
        except Exception as e:
            logger.error("Failed to parse %s: %s", source, e)
            return

        logger.debug("Parsing %d tweet records from %s", len(records), source)

        for record in records:
            try:
                event = self._record_to_event(record, source)
            except Exception as e:
                # BUGFIX: previously a single malformed record raised out of
                # the file-level try and silently dropped every remaining
                # record in the file. Skip just the bad record instead.
                logger.warning("Skipping malformed record in %s: %s", source, e)
                continue
            if event:
                yield event

    def _record_to_event(self, record: dict, source_path: Path) -> Optional[SourceEvent]:
        """
        Convert a single tweet record into a SourceEvent.

        Args:
            record: Either the wrapped format {"tweet": {...}} or the bare
                tweet object itself.
            source_path: Archive file the record came from (recorded in
                raw_ref and metadata for provenance).

        Returns:
            A normalized SourceEvent, or None if the record cannot be
            converted. (Currently always returns an event; the Optional
            return leaves room for future filtering.)
        """
        # Unwrap the tweet object (falls back to the record itself for the
        # bare format).
        tweet = record.get('tweet', record)

        # Extract core fields; id_str/full_text may be absent in malformed
        # records, in which case they fall back to the alternate key or ''.
        id_str = tweet.get('id_str') or tweet.get('id')
        full_text = tweet.get('full_text') or tweet.get('text', '')
        created_at = tweet.get('created_at', '')

        # Parse timestamp; note astimezone() renders in the *local* zone,
        # so the ISO string is machine-dependent (instant is preserved).
        try:
            dt = datetime.strptime(created_at, self.TWITTER_DATE_FMT)
            iso_ts = dt.astimezone().isoformat()
        except Exception:
            iso_ts = created_at  # fallback: keep as-is

        # Author is always the account owner (Twitter archives don't include
        # others' DMs by default).
        account = "user_archive"  # normalized account identifier

        # Thread/channel: individual tweets have no thread ID; threads aren't
        # preserved in the basic export, so each tweet is its own "thread".
        thread_id = f"tweet_{id_str}"

        # Attachments: extract media URLs, preferring the https variant.
        attachments = []
        extended_entities = tweet.get('extended_entities', {})
        for media in extended_entities.get('media', []):
            url = media.get('media_url_https') or media.get('media_url')
            if url:
                attachments.append(url)

        # Stable provenance reference back to the archive file + tweet id.
        raw_ref = f"twitter:archive:{source_path.name}:{id_str}"

        # Content hash for dedup (schema-defined helper).
        hash_val = compute_event_hash(
            source="twitter",
            raw_ref=raw_ref,
            content=full_text or "",
            timestamp=iso_ts,
            author=account,
        )

        # Preserve selected metadata for provenance / downstream ranking.
        metadata = {
            "tweet_id": id_str,
            "source_file": str(source_path),
            "favorite_count": tweet.get('favorite_count'),
            "retweet_count": tweet.get('retweet_count'),
            "in_reply_to_status_id": tweet.get('in_reply_to_status_id_str'),
            "lang": tweet.get('lang'),
        }

        return SourceEvent(
            source="twitter",
            account=account,
            thread_or_channel=thread_id,
            author=account,
            timestamp=iso_ts,
            content=full_text,
            attachments=attachments,
            raw_ref=raw_ref,
            # consent_scope presumably resolved by BaseConnector from
            # default_consent_scope/config — confirm against base class.
            consent_scope=self.consent_scope,
            metadata=metadata,
        )
|