#!/usr/bin/env python3
"""
connectors/schema.py — Unified source-event schema for personal archive connectors.

All connectors must produce events conforming to this schema so downstream
pipelines (harvester → knowledge store) can process them uniformly.
"""
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Optional, Any, Dict
import hashlib
import json


@dataclass
class SourceEvent:
    """
    Canonical event schema for any ingested personal archive entry.

    Fields
    ------
    source : str
        Platform identifier: 'twitter', 'discord', 'slack', 'whatsapp',
        'notion', 'imessage', 'google', etc.
    account : str
        User account/channel identifier on the source platform.
    thread_or_channel : str
        Conversation thread, channel, or chat identifier.
    author : str
        Who created this content (may differ from account in group chats).
    timestamp : str
        ISO-8601 timestamp when the event occurred (not when it was ingested).
    content : str
        Primary text content. May be empty for non-text events (images only).
    attachments : list[str]
        List of local file paths or URLs for attached media.
    raw_ref : str
        Pointer to the raw source record (file path, message ID, URL, etc.).
    hash : str
        SHA-256 hash of the raw content for deduplication and provenance.
    consent_scope : str
        Privacy gate: where this content may be used. Examples:
        'memory_only', 'bootstrap_context', 'training_data'.
        Default: 'memory_only' for ingested personal archives.
    metadata : dict[str, Any]
        Platform-specific fields retained for provenance but not indexed.
    """
    source: str
    account: str
    thread_or_channel: str
    author: str
    timestamp: str
    content: str
    attachments: list[str]
    raw_ref: str
    hash: str
    consent_scope: str = "memory_only"
    metadata: Optional[Dict[str, Any]] = None

    def to_dict(self) -> dict:
        """Convert to a plain dict for JSON serialization.

        ``metadata`` is normalized from ``None`` to an empty dict so
        downstream consumers never have to special-case ``null``.
        """
        d = asdict(self)
        if d['metadata'] is None:
            d['metadata'] = {}
        return d

    def to_json(self) -> str:
        """Serialize to a JSON line (one event per line)."""
        return json.dumps(self.to_dict(), ensure_ascii=False)


def compute_event_hash(source: str, raw_ref: str, content: str,
                       timestamp: str, author: str) -> str:
    """
    Compute a deterministic SHA-256 hash for an event.

    Hash inputs: source + raw_ref + content + timestamp + author,
    joined with '|' separators. Identical content always produces the
    same hash, enabling cross-connector deduplication.
    """
    canonical = f"{source}|{raw_ref}|{content}|{timestamp}|{author}"
    return hashlib.sha256(canonical.encode('utf-8')).hexdigest()


def validate_event(event: SourceEvent) -> bool:
    """
    Minimal structural validation for a SourceEvent.

    Returns True if required fields are present and non-blank.
    ``content`` is intentionally NOT required: the schema documents it
    as possibly empty for non-text events (e.g. image-only messages).
    """
    # BUG FIX: the original list included event.content, which rejected
    # legitimate attachment-only events even though the schema explicitly
    # allows empty content for non-text events.
    required = [event.source, event.account, event.thread_or_channel,
                event.author, event.timestamp, event.raw_ref, event.hash]
    return all(str(x).strip() for x in required)


# Consent scope definitions
CONSENT_MEMORY_ONLY = "memory_only"      # For retrieval only, not bootstrap
CONSENT_BOOTSTRAP = "bootstrap_context"  # Can seed new sessions
CONSENT_TRAINING = "training_data"       # May be used for model training