- Add connectors/ directory with base infrastructure - Implement SourceEvent unified schema (source/account/thread/author/timestamp/content/attachments/raw_ref/hash/consent_scope) - Create BaseConnector abstract class with checkpoint/dedup/consent gates - Implement TwitterArchiveConnector for official Twitter/X data exports - Add run_connector.py CLI entry point - Add comprehensive test suite (13 tests, all passing) - Add connectors/README.md with usage docs - Add Makefile targets: test-connectors, run-connector, connectors-help - Reference parent EPIC #194 and issue #233 This is the foundational connector pack. Future work: Discord, Slack, WhatsApp, Notion, iMessage, Google.
101 lines · 3.4 KiB · Python
#!/usr/bin/env python3
"""
connectors/schema.py — Unified source-event schema for personal archive connectors.

All connectors must produce events conforming to this schema so downstream
pipelines (harvester → knowledge store) can process them uniformly.
"""

from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Optional, Any, Dict
import hashlib
import json
|
|
|
|
|
|
@dataclass
class SourceEvent:
    """
    Canonical event schema for any ingested personal archive entry.

    Every connector normalizes its platform-specific records into this
    shape so the downstream pipeline can treat all sources uniformly.

    Fields
    ------
    source : str
        Platform identifier: 'twitter', 'discord', 'slack', 'whatsapp',
        'notion', 'imessage', 'google', etc.
    account : str
        User account/channel identifier on the source platform.
    thread_or_channel : str
        Conversation thread, channel, or chat identifier.
    author : str
        Who created this content (may differ from account in group chats).
    timestamp : str
        ISO-8601 timestamp when the event occurred (not when it was ingested).
    content : str
        Primary text content. May be empty for non-text events (images only).
    attachments : list[str]
        List of local file paths or URLs for attached media.
    raw_ref : str
        Pointer to the raw source record (file path, message ID, URL, etc.).
    hash : str
        SHA-256 hash of the raw content for deduplication and provenance.
    consent_scope : str
        Privacy gate: where this content may be used.
        Examples: 'memory_only', 'bootstrap_context', 'training_data'.
        Default: 'memory_only' for ingested personal archives.
    metadata : dict[str, Any]
        Platform-specific fields retained for provenance but not indexed.
    """

    source: str
    account: str
    thread_or_channel: str
    author: str
    timestamp: str
    content: str
    attachments: list[str]
    raw_ref: str
    hash: str
    consent_scope: str = "memory_only"
    metadata: Optional[Dict[str, Any]] = None

    def to_dict(self) -> dict:
        """Convert to plain dict for JSON serialization."""
        payload = asdict(self)
        # Serialize an unset metadata field as an empty dict rather than null.
        if payload["metadata"] is None:
            payload["metadata"] = {}
        return payload

    def to_json(self) -> str:
        """Serialize to JSON line (one event per line)."""
        event_dict = self.to_dict()
        return json.dumps(event_dict, ensure_ascii=False)
|
|
|
|
|
|
def compute_event_hash(source: str, raw_ref: str, content: str,
                       timestamp: str, author: str) -> str:
    """
    Compute deterministic SHA-256 hash for an event.

    Hash inputs: source + raw_ref + content + timestamp + author.
    This ensures identical content always produces the same hash,
    enabling cross-connector deduplication.

    NOTE(review): fields are joined with a literal '|' delimiter, so
    field values containing '|' could in theory collide across different
    field splits. The format is kept as-is because changing it would
    invalidate every previously stored hash.
    """
    parts = (source, raw_ref, content, timestamp, author)
    canonical = "|".join(parts)
    digest = hashlib.sha256(canonical.encode("utf-8"))
    return digest.hexdigest()
|
|
|
|
|
|
def validate_event(event: "SourceEvent") -> bool:
    """
    Minimal structural validation for a SourceEvent.

    Checks that the identity/provenance fields are present and non-blank.
    ``content`` is intentionally NOT required to be non-empty: the schema
    explicitly allows text-free events (e.g. attachment-only messages),
    so requiring it here would reject valid events.

    Returns True if required fields are present and well-formed.
    """
    required = (event.source, event.account, event.thread_or_channel,
                event.author, event.timestamp, event.raw_ref, event.hash)
    # str() guards against non-string values sneaking into the fields;
    # all() rejects any field that is empty or whitespace-only.
    return all(str(value).strip() for value in required)
|
|
|
|
|
|
# Consent scope definitions — canonical values for SourceEvent.consent_scope.
CONSENT_MEMORY_ONLY = "memory_only"  # For retrieval only, not bootstrap
CONSENT_BOOTSTRAP = "bootstrap_context"  # Can seed new sessions
CONSENT_TRAINING = "training_data"  # May be used for model training
|