- Add connectors/ directory with base infrastructure - Implement SourceEvent unified schema (source/account/thread/author/timestamp/content/attachments/raw_ref/hash/consent_scope) - Create BaseConnector abstract class with checkpoint/dedup/consent gates - Implement TwitterArchiveConnector for official Twitter/X data exports - Add run_connector.py CLI entry point - Add comprehensive test suite (13 tests, all passing) - Add connectors/README.md with usage docs - Add Makefile targets: test-connectors, run-connector, connectors-help - Reference parent EPIC #194 and issue #233 This is the foundational connector pack. Future work: Discord, Slack, WhatsApp, Notion, iMessage, Google.
101 lines · 3.4 KiB · Python
#!/usr/bin/env python3
"""
connectors/schema.py — Unified source-event schema for personal archive connectors.

All connectors must produce events conforming to this schema so downstream
pipelines (harvester → knowledge store) can process them uniformly.
"""

from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Optional, Any, Dict
import hashlib
import json
|
|
|
|
|
|
@dataclass
class SourceEvent:
    """
    Canonical event schema for any ingested personal archive entry.

    Every connector normalizes its platform-specific records into this
    shape so the downstream pipeline can treat all sources uniformly.

    Fields
    ------
    source : str
        Platform identifier: 'twitter', 'discord', 'slack', 'whatsapp',
        'notion', 'imessage', 'google', etc.
    account : str
        User account/channel identifier on the source platform.
    thread_or_channel : str
        Conversation thread, channel, or chat identifier.
    author : str
        Who created this content (may differ from account in group chats).
    timestamp : str
        ISO-8601 timestamp when the event occurred (not when it was ingested).
    content : str
        Primary text content. May be empty for non-text events (images only).
    attachments : list[str]
        List of local file paths or URLs for attached media.
    raw_ref : str
        Pointer to the raw source record (file path, message ID, URL, etc.).
    hash : str
        SHA-256 hash of the raw content for deduplication and provenance.
    consent_scope : str
        Privacy gate: where this content may be used.
        Examples: 'memory_only', 'bootstrap_context', 'training_data'.
        Default: 'memory_only' for ingested personal archives.
    metadata : dict[str, Any]
        Platform-specific fields retained for provenance but not indexed.
    """

    source: str
    account: str
    thread_or_channel: str
    author: str
    timestamp: str
    content: str
    attachments: list[str]
    raw_ref: str
    hash: str
    consent_scope: str = "memory_only"
    metadata: Optional[Dict[str, Any]] = None

    def to_dict(self) -> dict:
        """Convert to plain dict for JSON serialization."""
        payload = asdict(self)
        # Serialize an unset metadata field as an empty dict rather than null.
        if payload["metadata"] is None:
            payload["metadata"] = {}
        return payload

    def to_json(self) -> str:
        """Serialize to JSON line (one event per line)."""
        event_dict = self.to_dict()
        return json.dumps(event_dict, ensure_ascii=False)
|
|
|
|
|
|
def compute_event_hash(source: str, raw_ref: str, content: str,
                       timestamp: str, author: str) -> str:
    """
    Compute deterministic SHA-256 hash for an event.

    Hash inputs: source + raw_ref + content + timestamp + author.
    This ensures identical content always produces the same hash,
    enabling cross-connector deduplication.

    NOTE(review): fields are joined with a literal '|' delimiter, so
    field values containing '|' could in theory collide across different
    field splits. The format is kept as-is because changing it would
    invalidate every previously stored hash.
    """
    parts = (source, raw_ref, content, timestamp, author)
    canonical = "|".join(parts)
    digest = hashlib.sha256(canonical.encode("utf-8"))
    return digest.hexdigest()
|
|
|
|
|
|
def validate_event(event: "SourceEvent") -> bool:
    """
    Minimal structural validation for a SourceEvent.

    Checks that the identity/provenance fields are present and non-blank.
    ``content`` is intentionally NOT required to be non-empty: the schema
    explicitly allows text-free events (e.g. attachment-only messages),
    so requiring it here would reject valid events.

    Returns True if required fields are present and well-formed.
    """
    required = (event.source, event.account, event.thread_or_channel,
                event.author, event.timestamp, event.raw_ref, event.hash)
    # str() guards against non-string values sneaking into the fields;
    # all() rejects any field that is empty or whitespace-only.
    return all(str(value).strip() for value in required)
|
|
|
|
|
|
# Consent scope definitions — canonical values for SourceEvent.consent_scope.
CONSENT_MEMORY_ONLY = "memory_only"  # For retrieval only, not bootstrap
CONSENT_BOOTSTRAP = "bootstrap_context"  # Can seed new sessions
CONSENT_TRAINING = "training_data"  # May be used for model training
|