Files
compounding-intelligence/connectors/schema.py
Rockachopa 8628a0d610 feat(connectors): add sovereign personal archive connector pack foundation
- Add connectors/ directory with base infrastructure
- Implement SourceEvent unified schema (source/account/thread/author/timestamp/content/attachments/raw_ref/hash/consent_scope)
- Create BaseConnector abstract class with checkpoint/dedup/consent gates
- Implement TwitterArchiveConnector for official Twitter/X data exports
- Add run_connector.py CLI entry point
- Add comprehensive test suite (13 tests, all passing)
- Add connectors/README.md with usage docs
- Add Makefile targets: test-connectors, run-connector, connectors-help
- Reference parent EPIC #194 and issue #233

This is the foundational connector pack. Future work: Discord, Slack, WhatsApp, Notion, iMessage, Google.
2026-04-26 20:45:07 -04:00

101 lines
3.4 KiB
Python

#!/usr/bin/env python3
"""
connectors/schema.py — Unified source-event schema for personal archive connectors.
All connectors must produce events conforming to this schema so downstream
pipelines (harvester → knowledge store) can process them uniformly.
"""
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Optional, Any, Dict
import hashlib
import json
@dataclass
class SourceEvent:
    """
    Canonical event schema for any ingested personal archive entry.

    Every connector emits events in this shape so the downstream pipeline
    (harvester → knowledge store) can treat all platforms uniformly.

    Fields
    ------
    source : str
        Platform identifier: 'twitter', 'discord', 'slack', 'whatsapp',
        'notion', 'imessage', 'google', etc.
    account : str
        User account/channel identifier on the source platform.
    thread_or_channel : str
        Conversation thread, channel, or chat identifier.
    author : str
        Who created this content (may differ from account in group chats).
    timestamp : str
        ISO-8601 timestamp when the event occurred (not when it was ingested).
    content : str
        Primary text content. May be empty for non-text events (images only).
    attachments : list[str]
        List of local file paths or URLs for attached media.
    raw_ref : str
        Pointer to the raw source record (file path, message ID, URL, etc.).
    hash : str
        SHA-256 hash of the raw content for deduplication and provenance.
    consent_scope : str
        Privacy gate: where this content may be used.
        Examples: 'memory_only', 'bootstrap_context', 'training_data'.
        Default: 'memory_only' for ingested personal archives.
    metadata : dict[str, Any]
        Platform-specific fields retained for provenance but not indexed.
    """
    source: str
    account: str
    thread_or_channel: str
    author: str
    timestamp: str
    content: str
    attachments: list[str]
    raw_ref: str
    hash: str
    consent_scope: str = "memory_only"
    metadata: Optional[Dict[str, Any]] = None

    def to_dict(self) -> dict:
        """Return a plain-dict view of the event for JSON serialization.

        A ``None`` metadata field is normalized to an empty dict so the
        serialized form always carries a mapping.
        """
        payload = asdict(self)
        payload['metadata'] = payload['metadata'] or {}
        return payload

    def to_json(self) -> str:
        """Serialize the event as one JSON line (JSONL-friendly)."""
        as_mapping = self.to_dict()
        return json.dumps(as_mapping, ensure_ascii=False)
def compute_event_hash(source: str, raw_ref: str, content: str,
                       timestamp: str, author: str) -> str:
    """
    Compute the deterministic SHA-256 hash for an event.

    The hash covers source + raw_ref + content + timestamp + author,
    joined with '|' separators, so identical content always yields the
    same digest and enables cross-connector deduplication.

    NOTE(review): fields containing a literal '|' could in theory produce
    the same canonical string as a different field split — acceptable for
    dedup here, but worth confirming if hashes ever gate anything stricter.
    """
    parts = (source, raw_ref, content, timestamp, author)
    canonical = "|".join(parts)
    return hashlib.sha256(canonical.encode('utf-8')).hexdigest()
def validate_event(event: SourceEvent) -> bool:
    """
    Minimal structural validation for a SourceEvent.

    Checks that the identifying fields are present and non-blank after
    stripping whitespace. ``content`` is deliberately NOT required:
    per the SourceEvent schema, it may be empty for non-text events
    (e.g. image-only messages) — requiring it here would reject valid
    attachment-only events.

    Returns True if all required fields are present and well-formed.
    """
    required = (event.source, event.account, event.thread_or_channel,
                event.author, event.timestamp, event.raw_ref,
                event.hash)
    return all(str(value).strip() for value in required)
# Consent scope definitions — the allowed values for SourceEvent.consent_scope,
# ordered from most to least restrictive.
CONSENT_MEMORY_ONLY = "memory_only" # For retrieval only, not bootstrap
CONSENT_BOOTSTRAP = "bootstrap_context" # Can seed new sessions
CONSENT_TRAINING = "training_data" # May be used for model training