# timmy-config/hermes-sovereign/mempalace/sovereign_store.py

"""Sovereign Memory Store — zero-API, zero-dependency durable memory.

Replaces the third-party `mempalace` CLI and its ONNX requirement with a
self-contained SQLite + FTS5 + HRR (Holographic Reduced Representation)
store. Every operation is local: no network calls, no API keys, no cloud.

Storage: ~/.hermes/palace/sovereign.db

Capabilities:
  - Durable fact storage with rooms, categories, and trust scores
  - Hybrid retrieval: FTS5 keyword search + HRR cosine similarity
  - Reciprocal Rank Fusion to merge keyword and semantic results
  - Trust scoring: facts that get retrieved and confirmed gain trust
  - Graceful numpy degradation: falls back to pure-Python HRR math if missing

Refs: Epic #367, MP-3 #370, MP-4 #371
"""

from __future__ import annotations

import hashlib
import json
import math
import sqlite3
import struct
import time
from pathlib import Path
from typing import Any, Optional

# ---------------------------------------------------------------------------
# HRR (Holographic Reduced Representations) — zero-dependency vectors
# ---------------------------------------------------------------------------
# Phase-encoded vectors via SHA-256. No ONNX, no embeddings API, no numpy
# required (but uses numpy when available for speed).

_TWO_PI = math.tau  # one full turn in radians (identical to 2.0 * math.pi)
_DIM = 512  # Compact default dimension for every phase vector

# numpy is optional: remember whether the import worked so each helper can
# choose between its vectorised and its pure-Python implementation.
try:
    import numpy as np
except ImportError:
    _HAS_NUMPY = False
else:
    _HAS_NUMPY = True


def _encode_atom_np(word: str, dim: int = _DIM) -> "np.ndarray":
    """Return the deterministic phase vector for one token (numpy path).

    SHA-256 digests of ``"{word}:{i}"`` for ``i = 0, 1, ...`` are
    concatenated and read as little-endian unsigned 16-bit integers
    (16 per digest). The first ``dim`` integers are scaled so that the
    16-bit range maps onto phases in ``[0, 2*pi)``.

    Args:
        word: Token to encode.
        dim: Number of phase components to produce.

    Returns:
        A float64 array of ``dim`` phases.
    """
    per_digest = 16  # a 32-byte digest holds sixteen uint16 values
    digest_count = math.ceil(dim / per_digest)
    raw = b"".join(
        hashlib.sha256(f"{word}:{idx}".encode()).digest()
        for idx in range(digest_count)
    )
    phases = np.frombuffer(raw, dtype="<u2")[:dim].astype(np.float64)
    return phases * (_TWO_PI / 65536.0)


def _encode_atom_pure(word: str, dim: int = _DIM) -> list[float]:
    """Return the deterministic phase vector for one token (pure Python).

    SHA-256 digests of ``"{word}:{i}"`` for ``i = 0, 1, ...`` are each
    unpacked into sixteen little-endian unsigned 16-bit integers. The first
    ``dim`` integers are scaled so that the 16-bit range maps onto phases
    in ``[0, 2*pi)``.

    Args:
        word: Token to encode.
        dim: Number of phase components to produce.

    Returns:
        A list of ``dim`` floats.
    """
    digest_total = math.ceil(dim / 16)
    raw_values: list[int] = []
    for block_no in range(digest_total):
        digest = hashlib.sha256(f"{word}:{block_no}".encode()).digest()
        raw_values.extend(struct.unpack("<16H", digest))
    return [raw * (_TWO_PI / 65536.0) for raw in raw_values[:dim]]


def encode_text(text: str, dim: int = _DIM):
    """Encode *text* as one phase vector by bundling its word atoms.

    The text is lower-cased and split on whitespace. Each word is mapped to
    its deterministic atom vector, and the atoms are combined with a
    per-component circular mean: sum the unit phasors, then take the angle
    of the sum. The output always has ``dim`` components, whatever the
    input length. Text with no words is encoded as the single placeholder
    token ``"<empty>"``.

    Args:
        text: Text to encode.
        dim: Number of phase components.

    Returns:
        A numpy array when numpy is available, otherwise a list of floats;
        every phase lies in ``[0, 2*pi)``.
    """
    tokens = text.lower().split() or ["<empty>"]

    if not _HAS_NUMPY:
        # Pure-Python circular mean: accumulate cos/sin per component.
        cos_acc = [0.0] * dim
        sin_acc = [0.0] * dim
        for token in tokens:
            for idx, phase in enumerate(_encode_atom_pure(token, dim)):
                cos_acc[idx] += math.cos(phase)
                sin_acc[idx] += math.sin(phase)
        return [
            math.atan2(sin_part, cos_part) % _TWO_PI
            for sin_part, cos_part in zip(sin_acc, cos_acc)
        ]

    # numpy path: add the unit phasors e^{i*phase} word by word.
    phasor_total = 0
    for token in tokens:
        phasor_total = phasor_total + np.exp(1j * _encode_atom_np(token, dim))
    return np.angle(phasor_total) % _TWO_PI


def cosine_similarity_phase(a, b) -> float:
    """Cosine similarity between two phase vectors.

    For phase vectors, similarity = mean(cos(a - b)): 1.0 for identical
    phases, lower as the phases diverge.

    Both backends now agree on malformed input. Previously the pure-Python
    path ignored trailing elements of a longer ``b`` and divided by zero on
    empty vectors, while the numpy path could broadcast a length-1 vector
    or return NaN.

    Args:
        a: Sequence (list or numpy array) of phases in radians.
        b: Sequence of phases with the same length as ``a``.

    Returns:
        The mean cosine of the component-wise phase differences, or ``0.0``
        when both vectors are empty.

    Raises:
        ValueError: If ``a`` and ``b`` differ in length.
    """
    size = len(a)
    if size != len(b):
        raise ValueError(
            f"phase vectors differ in length: {size} != {len(b)}"
        )
    if size == 0:
        # Nothing to compare; avoid ZeroDivisionError / NaN.
        return 0.0
    if _HAS_NUMPY:
        return float(np.mean(np.cos(np.asarray(a) - np.asarray(b))))
    return sum(math.cos(x - y) for x, y in zip(a, b)) / size


def serialize_vector(vec) -> bytes:
    """Serialize a vector to bytes for SQLite storage.

    Values are written as consecutive 8-byte floats in native byte order,
    the layout ``deserialize_vector`` reads back.

    Args:
        vec: A numpy array or any sequence of floats. Plain lists are
            accepted on the numpy path too; the previous ``vec.astype``
            call raised ``AttributeError`` for them.

    Returns:
        ``len(vec) * 8`` bytes.
    """
    if _HAS_NUMPY:
        # asarray converts lists and leaves float64 arrays uncopied.
        return np.asarray(vec, dtype=np.float64).tobytes()
    return struct.pack(f"{len(vec)}d", *vec)


def deserialize_vector(blob: bytes):
    """Rebuild a vector from the bytes produced by ``serialize_vector``.

    Args:
        blob: Raw bytes holding consecutive 8-byte floats.

    Returns:
        A numpy float64 array viewing ``blob`` when numpy is available,
        otherwise a list of floats.
    """
    count = len(blob) // 8  # each float64 occupies 8 bytes
    if not _HAS_NUMPY:
        return list(struct.unpack(f"{count}d", blob))
    return np.frombuffer(blob, dtype=np.float64)


# ---------------------------------------------------------------------------
# SQLite Schema
# ---------------------------------------------------------------------------

# Schema applied with executescript() every time a store is opened; every
# statement is idempotent (IF NOT EXISTS).
#
# The UPDATE sync trigger is limited to the three columns that are indexed
# by FTS5. Updates that only touch bookkeeping columns (retrieval_count,
# trust_score, updated_at) therefore no longer delete and re-insert the
# full-text entry. Databases created earlier keep the trigger they already
# have, because of IF NOT EXISTS.
_SCHEMA = """
CREATE TABLE IF NOT EXISTS memories (
    memory_id       INTEGER PRIMARY KEY AUTOINCREMENT,
    content         TEXT NOT NULL,
    room            TEXT DEFAULT 'general',
    category        TEXT DEFAULT '',
    trust_score     REAL DEFAULT 0.5,
    retrieval_count INTEGER DEFAULT 0,
    created_at      REAL NOT NULL,
    updated_at      REAL NOT NULL,
    hrr_vector      BLOB
);

CREATE INDEX IF NOT EXISTS idx_memories_room ON memories(room);
CREATE INDEX IF NOT EXISTS idx_memories_trust ON memories(trust_score DESC);

-- FTS5 for fast keyword search
CREATE VIRTUAL TABLE IF NOT EXISTS memories_fts USING fts5(
    content, room, category,
    content=memories, content_rowid=memory_id,
    tokenize='porter unicode61'
);

-- Sync triggers
CREATE TRIGGER IF NOT EXISTS memories_ai AFTER INSERT ON memories BEGIN
    INSERT INTO memories_fts(rowid, content, room, category)
        VALUES (new.memory_id, new.content, new.room, new.category);
END;

CREATE TRIGGER IF NOT EXISTS memories_ad AFTER DELETE ON memories BEGIN
    INSERT INTO memories_fts(memories_fts, rowid, content, room, category)
        VALUES ('delete', old.memory_id, old.content, old.room, old.category);
END;

-- Only re-index when an indexed column is part of the UPDATE
CREATE TRIGGER IF NOT EXISTS memories_au
AFTER UPDATE OF content, room, category ON memories BEGIN
    INSERT INTO memories_fts(memories_fts, rowid, content, room, category)
        VALUES ('delete', old.memory_id, old.content, old.room, old.category);
    INSERT INTO memories_fts(rowid, content, room, category)
        VALUES (new.memory_id, new.content, new.room, new.category);
END;

-- Promotion log: tracks what moved from scratchpad to durable memory
CREATE TABLE IF NOT EXISTS promotion_log (
    log_id      INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id  TEXT NOT NULL,
    scratch_key TEXT NOT NULL,
    memory_id   INTEGER REFERENCES memories(memory_id),
    promoted_at REAL NOT NULL,
    reason      TEXT DEFAULT ''
);
"""


# ---------------------------------------------------------------------------
# SovereignStore
# ---------------------------------------------------------------------------

class SovereignStore:
    """Zero-API durable memory store.

    All operations are local SQLite. No network calls. No API keys.
    HRR vectors provide semantic similarity without embedding models.
    FTS5 provides fast keyword search. RRF merges both rankings.
    """

    def __init__(self, db_path: Optional[str] = None):
        """Open the database, creating its directory and schema if needed.

        Args:
            db_path: Path of the SQLite file. When ``None`` the store lives
                at ``~/.hermes/palace/sovereign.db``.
        """
        if db_path is not None:
            resolved = db_path
        else:
            resolved = str(Path.home() / ".hermes" / "palace" / "sovereign.db")
        self._db_path = resolved
        # The parent directory must exist before SQLite can create the file.
        Path(resolved).parent.mkdir(parents=True, exist_ok=True)
        self._conn = sqlite3.connect(resolved)
        # Row objects allow the query helpers to convert rows with dict().
        self._conn.row_factory = sqlite3.Row
        self._conn.executescript(_SCHEMA)

    def close(self):
        self._conn.close()

    # ------------------------------------------------------------------
    # Store
    # ------------------------------------------------------------------

    def store(
        self,
        content: str,
        room: str = "general",
        category: str = "",
        trust: float = 0.5,
    ) -> int:
        """Persist one fact and return its ``memory_id``.

        The content is encoded into a phase vector that is stored next to
        the text. ``created_at`` and ``updated_at`` are both set to the
        current time, and the insert is committed before returning.

        Args:
            content: Text of the fact.
            room: Room the fact is filed under.
            category: Free-form category label.
            trust: Initial trust score.

        Returns:
            The row id of the new memory.
        """
        timestamp = time.time()
        encoded = serialize_vector(encode_text(content))
        params = (content, room, category, trust, timestamp, timestamp, encoded)
        cursor = self._conn.execute(
            """INSERT INTO memories (content, room, category, trust_score,
               created_at, updated_at, hrr_vector)
               VALUES (?, ?, ?, ?, ?, ?, ?)""",
            params,
        )
        self._conn.commit()
        return cursor.lastrowid

    def store_batch(self, items: list[dict]) -> list[int]:
        """Store multiple facts atomically.

        Each item: {content, room?, category?, trust?}. Missing optional
        keys default to room ``"general"``, category ``""`` and trust
        ``0.5``. All rows share one timestamp.

        The batch is all-or-nothing. Items are validated and encoded before
        any row is written, and the inserts run inside one transaction that
        is rolled back if any of them fails. Previously a failure mid-batch
        left the earlier inserts pending, and the next commit issued by any
        other method would persist them.

        Args:
            items: Dicts describing the facts to store.

        Returns:
            The new ``memory_id`` values, in the order of ``items``.

        Raises:
            KeyError: If an item has no ``"content"`` key.
        """
        now = time.time()
        # Encode up front so a malformed item fails before the database is
        # touched.
        rows = [
            (
                item["content"],
                item.get("room", "general"),
                item.get("category", ""),
                item.get("trust", 0.5),
                now,
                now,
                serialize_vector(encode_text(item["content"])),
            )
            for item in items
        ]

        ids: list[int] = []
        # The connection context manager commits on success and rolls back
        # if the body raises.
        with self._conn:
            for row in rows:
                cur = self._conn.execute(
                    """INSERT INTO memories (content, room, category,
                       trust_score, created_at, updated_at, hrr_vector)
                       VALUES (?, ?, ?, ?, ?, ?, ?)""",
                    row,
                )
                ids.append(cur.lastrowid)
        return ids

    # ------------------------------------------------------------------
    # Search — hybrid FTS5 + HRR with Reciprocal Rank Fusion
    # ------------------------------------------------------------------

    def search(
        self,
        query: str,
        room: Optional[str] = None,
        limit: int = 10,
        min_trust: float = 0.0,
        fts_weight: float = 0.5,
        hrr_weight: float = 0.5,
    ) -> list[dict]:
        """Hybrid search: FTS5 keywords + HRR semantic similarity.

        Uses Reciprocal Rank Fusion (RRF) to merge both rankings.
        Returns list of dicts with content, room, score, trust_score.
        """
        k_rrf = 60  # Standard RRF constant

        # Stage 1: FTS5 candidates
        fts_results = self._fts_search(query, room, min_trust, limit * 3)

        # Stage 2: HRR candidates (scan top N by trust)
        hrr_results = self._hrr_search(query, room, min_trust, limit * 3)

        # Stage 3: RRF fusion
        scores: dict[int, float] = {}
        meta: dict[int, dict] = {}

        for rank, row in enumerate(fts_results):
            mid = row["memory_id"]
            scores[mid] = scores.get(mid, 0) + fts_weight / (k_rrf + rank + 1)
            meta[mid] = dict(row)

        for rank, row in enumerate(hrr_results):
            mid = row["memory_id"]
            scores[mid] = scores.get(mid, 0) + hrr_weight / (k_rrf + rank + 1)
            if mid not in meta:
                meta[mid] = dict(row)

        # Sort by fused score
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:limit]

        results = []
        for mid, score in ranked:
            m = meta[mid]
            # Bump retrieval count
            self._conn.execute(
                "UPDATE memories SET retrieval_count = retrieval_count + 1 WHERE memory_id = ?",
                (mid,),
            )
            results.append({
                "memory_id": mid,
                "content": m["content"],
                "room": m["room"],
                "category": m.get("category", ""),
                "trust_score": m["trust_score"],
                "score": round(score, 6),
            })

        if results:
            self._conn.commit()
        return results

    def _fts_search(
        self, query: str, room: Optional[str], min_trust: float, limit: int
    ) -> list[dict]:
        """FTS5 full-text search."""
        try:
            if room:
                rows = self._conn.execute(
                    """SELECT m.memory_id, m.content, m.room, m.category,
                              m.trust_score, m.retrieval_count
                       FROM memories_fts f
                       JOIN memories m ON f.rowid = m.memory_id
                       WHERE memories_fts MATCH ? AND m.room = ?
                             AND m.trust_score >= ?
                       ORDER BY rank LIMIT ?""",
                    (query, room, min_trust, limit),
                ).fetchall()
            else:
                rows = self._conn.execute(
                    """SELECT m.memory_id, m.content, m.room, m.category,
                              m.trust_score, m.retrieval_count
                       FROM memories_fts f
                       JOIN memories m ON f.rowid = m.memory_id
                       WHERE memories_fts MATCH ?
                             AND m.trust_score >= ?
                       ORDER BY rank LIMIT ?""",
                    (query, min_trust, limit),
                ).fetchall()
            return [dict(r) for r in rows]
        except sqlite3.OperationalError:
            # Bad FTS query syntax — degrade gracefully
            return []

    def _hrr_search(
        self, query: str, room: Optional[str], min_trust: float, limit: int
    ) -> list[dict]:
        """Rank stored memories by phase-vector similarity to *query*.

        This is a brute-force scan: every row that passes the room/trust
        filters and has a stored vector is loaded and compared with the
        encoded query. The ``limit`` most similar rows are returned as
        dicts (including the raw ``hrr_vector`` blob), best match first.

        Args:
            query: Text to compare against.
            room: Only consider this room; a falsy value disables the filter.
            min_trust: Minimum trust score of candidate rows.
            limit: Maximum number of rows to return.
        """
        probe = encode_text(query)

        if not room:
            cursor = self._conn.execute(
                """SELECT memory_id, content, room, category, trust_score,
                          retrieval_count, hrr_vector
                   FROM memories
                   WHERE trust_score >= ? AND hrr_vector IS NOT NULL""",
                (min_trust,),
            )
        else:
            cursor = self._conn.execute(
                """SELECT memory_id, content, room, category, trust_score,
                          retrieval_count, hrr_vector
                   FROM memories
                   WHERE room = ? AND trust_score >= ? AND hrr_vector IS NOT NULL""",
                (room, min_trust),
            )
        candidates = cursor.fetchall()

        # Pair each row with its similarity; the sort is stable, so rows
        # with equal similarity keep their query order.
        by_similarity = sorted(
            (
                (
                    cosine_similarity_phase(
                        probe, deserialize_vector(record["hrr_vector"])
                    ),
                    dict(record),
                )
                for record in candidates
            ),
            key=lambda pair: pair[0],
            reverse=True,
        )
        return [payload for _, payload in by_similarity[:limit]]

    # ------------------------------------------------------------------
    # Trust management
    # ------------------------------------------------------------------

    def boost_trust(self, memory_id: int, delta: float = 0.05) -> None:
        """Increase trust score when a memory proves useful."""
        self._conn.execute(
            """UPDATE memories SET trust_score = MIN(1.0, trust_score + ?),
               updated_at = ? WHERE memory_id = ?""",
            (delta, time.time(), memory_id),
        )
        self._conn.commit()

    def decay_trust(self, memory_id: int, delta: float = 0.02) -> None:
        """Decrease trust score when a memory is contradicted."""
        self._conn.execute(
            """UPDATE memories SET trust_score = MAX(0.0, trust_score - ?),
               updated_at = ? WHERE memory_id = ?""",
            (delta, time.time(), memory_id),
        )
        self._conn.commit()

    # ------------------------------------------------------------------
    # Room operations
    # ------------------------------------------------------------------

    def list_rooms(self) -> list[dict]:
        """List all rooms with fact counts."""
        rows = self._conn.execute(
            """SELECT room, COUNT(*) as count,
                      AVG(trust_score) as avg_trust
               FROM memories GROUP BY room ORDER BY count DESC"""
        ).fetchall()
        return [dict(r) for r in rows]

    def room_contents(self, room: str, limit: int = 50) -> list[dict]:
        """Get all facts in a room, ordered by trust."""
        rows = self._conn.execute(
            """SELECT memory_id, content, category, trust_score,
                      retrieval_count, created_at
               FROM memories WHERE room = ?
               ORDER BY trust_score DESC, created_at DESC LIMIT ?""",
            (room, limit),
        ).fetchall()
        return [dict(r) for r in rows]

    # ------------------------------------------------------------------
    # Stats
    # ------------------------------------------------------------------

    def stats(self) -> dict:
        """Return store statistics."""
        row = self._conn.execute(
            """SELECT COUNT(*) as total,
                      AVG(trust_score) as avg_trust,
                      SUM(retrieval_count) as total_retrievals,
                      COUNT(DISTINCT room) as room_count
               FROM memories"""
        ).fetchone()
        return dict(row)

    # ------------------------------------------------------------------
    # Promotion support (scratchpad → durable)
    # ------------------------------------------------------------------

    def log_promotion(
        self,
        session_id: str,
        scratch_key: str,
        memory_id: int,
        reason: str = "",
    ) -> None:
        """Record a scratchpad-to-palace promotion in the audit log."""
        self._conn.execute(
            """INSERT INTO promotion_log
               (session_id, scratch_key, memory_id, promoted_at, reason)
               VALUES (?, ?, ?, ?, ?)""",
            (session_id, scratch_key, memory_id, time.time(), reason),
        )
        self._conn.commit()

    def recent_promotions(self, limit: int = 20) -> list[dict]:
        """Get recent promotion log entries."""
        rows = self._conn.execute(
            """SELECT p.*, m.content, m.room
               FROM promotion_log p
               LEFT JOIN memories m ON p.memory_id = m.memory_id
               ORDER BY p.promoted_at DESC LIMIT ?""",
            (limit,),
        ).fetchall()
        return [dict(r) for r in rows]