1
0

[loop-cycle-54] refactor: consolidate three memory stores into single table (#37) (#223)

This commit is contained in:
2026-03-15 13:33:24 -04:00
parent 4a68f6cb8b
commit b4cb3e9975
14 changed files with 1425 additions and 1084 deletions

View File

@@ -1,28 +1,29 @@
"""Unified memory database — single SQLite DB for all memory types.
"""Unified memory schema and connection management.
Consolidates three previously separate stores into one:
- **facts**: Long-term knowledge (user preferences, learned patterns)
- **chunks**: Indexed vault documents (markdown files from memory/)
- **episodes**: Runtime memories (conversations, agent observations)
All three tables live in ``data/memory.db``. Existing APIs in
``vector_store.py`` and ``semantic_memory.py`` are updated to point here.
This module provides the central database schema for Timmy's consolidated
memory system. All memory types (facts, conversations, documents, vault chunks)
are stored in a single `memories` table with a `memory_type` discriminator.
"""
import logging
import sqlite3
import uuid
from collections.abc import Generator
from contextlib import closing, contextmanager
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
logger = logging.getLogger(__name__)
DB_PATH = Path(__file__).parent.parent.parent.parent / "data" / "memory.db"
# Paths
PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
DB_PATH = PROJECT_ROOT / "data" / "memory.db"
@contextmanager
def get_connection() -> Generator[sqlite3.Connection, None, None]:
"""Open (and lazily create) the unified memory database."""
"""Get database connection to unified memory database."""
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
with closing(sqlite3.connect(str(DB_PATH))) as conn:
conn.row_factory = sqlite3.Row
@@ -33,56 +34,166 @@ def get_connection() -> Generator[sqlite3.Connection, None, None]:
def _ensure_schema(conn: sqlite3.Connection) -> None:
"""Create the three core tables and indexes if they don't exist."""
# --- facts ---------------------------------------------------------------
"""Create the unified memories table and indexes if they don't exist."""
conn.execute("""
CREATE TABLE IF NOT EXISTS facts (
CREATE TABLE IF NOT EXISTS memories (
id TEXT PRIMARY KEY,
category TEXT NOT NULL DEFAULT 'general',
content TEXT NOT NULL,
confidence REAL NOT NULL DEFAULT 0.8,
memory_type TEXT NOT NULL DEFAULT 'fact',
source TEXT NOT NULL DEFAULT 'agent',
embedding TEXT,
metadata TEXT,
source_hash TEXT,
agent_id TEXT,
task_id TEXT,
session_id TEXT,
confidence REAL NOT NULL DEFAULT 0.8,
tags TEXT NOT NULL DEFAULT '[]',
created_at TEXT NOT NULL,
last_accessed TEXT,
access_count INTEGER NOT NULL DEFAULT 0
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_facts_category ON facts(category)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_facts_confidence ON facts(confidence)")
# Create indexes for efficient querying
conn.execute("CREATE INDEX IF NOT EXISTS idx_memories_type ON memories(memory_type)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_memories_time ON memories(created_at)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_memories_session ON memories(session_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_memories_agent ON memories(agent_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_memories_source ON memories(source)")
conn.commit()
# --- chunks (vault document fragments) -----------------------------------
conn.execute("""
CREATE TABLE IF NOT EXISTS chunks (
id TEXT PRIMARY KEY,
source TEXT NOT NULL,
content TEXT NOT NULL,
embedding TEXT NOT NULL,
created_at TEXT NOT NULL,
source_hash TEXT NOT NULL
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source)")
# Run migration if needed
_migrate_schema(conn)
# --- episodes (runtime memory entries) -----------------------------------
conn.execute("""
CREATE TABLE IF NOT EXISTS episodes (
id TEXT PRIMARY KEY,
content TEXT NOT NULL,
source TEXT NOT NULL,
context_type TEXT NOT NULL DEFAULT 'conversation',
embedding TEXT,
metadata TEXT,
agent_id TEXT,
task_id TEXT,
session_id TEXT,
timestamp TEXT NOT NULL
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_episodes_type ON episodes(context_type)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_episodes_time ON episodes(timestamp)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_episodes_session ON episodes(session_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_episodes_agent ON episodes(agent_id)")
def _migrate_schema(conn: sqlite3.Connection) -> None:
"""Migrate from old three-table schema to unified memories table.
Migration paths:
- episodes table -> memories (context_type -> memory_type)
- chunks table -> memories with memory_type='vault_chunk'
- facts table -> dropped (unused, 0 rows expected)
"""
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
tables = {row[0] for row in cursor.fetchall()}
has_memories = "memories" in tables
has_episodes = "episodes" in tables
has_chunks = "chunks" in tables
has_facts = "facts" in tables
# Check if we need to migrate (old schema exists but new one doesn't fully)
if not has_memories:
logger.info("Migration: Creating unified memories table")
# Schema will be created above
# Migrate episodes -> memories
if has_episodes and has_memories:
logger.info("Migration: Converting episodes table to memories")
try:
cols = _get_table_columns(conn, "episodes")
context_type_col = "context_type" if "context_type" in cols else "'conversation'"
conn.execute(f"""
INSERT INTO memories (
id, content, memory_type, source, embedding,
metadata, agent_id, task_id, session_id,
created_at, access_count, last_accessed
)
SELECT
id, content,
COALESCE({context_type_col}, 'conversation'),
COALESCE(source, 'agent'),
embedding,
metadata, agent_id, task_id, session_id,
COALESCE(timestamp, datetime('now')), 0, NULL
FROM episodes
""")
conn.execute("DROP TABLE episodes")
logger.info("Migration: Migrated episodes to memories")
except sqlite3.Error as exc:
logger.warning("Migration: Failed to migrate episodes: %s", exc)
# Migrate chunks -> memories as vault_chunk
if has_chunks and has_memories:
logger.info("Migration: Converting chunks table to memories")
try:
cols = _get_table_columns(conn, "chunks")
id_col = "id" if "id" in cols else "CAST(rowid AS TEXT)"
content_col = "content" if "content" in cols else "text"
source_col = "filepath" if "filepath" in cols else ("source" if "source" in cols else "'vault'")
embedding_col = "embedding" if "embedding" in cols else "NULL"
created_col = "created_at" if "created_at" in cols else "datetime('now')"
conn.execute(f"""
INSERT INTO memories (
id, content, memory_type, source, embedding,
created_at, access_count
)
SELECT
{id_col}, {content_col}, 'vault_chunk', {source_col},
{embedding_col}, {created_col}, 0
FROM chunks
""")
conn.execute("DROP TABLE chunks")
logger.info("Migration: Migrated chunks to memories")
except sqlite3.Error as exc:
logger.warning("Migration: Failed to migrate chunks: %s", exc)
# Drop old facts table
if has_facts:
try:
conn.execute("DROP TABLE facts")
logger.info("Migration: Dropped old facts table")
except sqlite3.Error as exc:
logger.warning("Migration: Failed to drop facts: %s", exc)
conn.commit()
def _get_table_columns(conn: sqlite3.Connection, table_name: str) -> set[str]:
"""Get the column names for a table."""
cursor = conn.execute(f"PRAGMA table_info({table_name})")
return {row[1] for row in cursor.fetchall()}
# Backward compatibility aliases
get_conn = get_connection
@dataclass
class MemoryEntry:
"""A memory entry with vector embedding.
Note: The DB column is `memory_type` but this field is named `context_type`
for backward API compatibility.
"""
id: str = field(default_factory=lambda: str(uuid.uuid4()))
content: str = "" # The actual text content
source: str = "" # Where it came from (agent, user, system)
context_type: str = "conversation" # API field name; DB column is memory_type
agent_id: str | None = None
task_id: str | None = None
session_id: str | None = None
metadata: dict | None = None
embedding: list[float] | None = None
timestamp: str = field(default_factory=lambda: datetime.now(UTC).isoformat())
relevance_score: float | None = None # Set during search
@dataclass
class MemoryChunk:
"""A searchable chunk of memory."""
id: str
source: str # filepath
content: str
embedding: list[float]
created_at: str
# Note: Functions are available via memory_system module directly
# from timmy.memory_system import store_memory, search_memories, etc.

View File

@@ -1,425 +1,37 @@
"""Vector store for semantic memory using sqlite-vss.
Provides embedding-based similarity search for the Echo agent
to retrieve relevant context from conversation history.
"""
import json
import logging
import sqlite3
import uuid
from collections.abc import Generator
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import UTC, datetime
logger = logging.getLogger(__name__)
def _check_embedding_model() -> bool | None:
"""Check if the canonical embedding model is available."""
try:
from timmy.semantic_memory import _get_embedding_model
model = _get_embedding_model()
return model is not None and model is not False
except (ImportError, AttributeError) as exc:
logger.debug("Embedding model check failed: %s", exc)
return None
def _compute_embedding(text: str) -> list[float]:
"""Compute embedding vector for text.
Delegates to the canonical embedding provider in semantic_memory
to avoid loading the model multiple times.
"""
from timmy.semantic_memory import embed_text
return embed_text(text)
@dataclass
class MemoryEntry:
"""A memory entry with vector embedding."""
id: str = field(default_factory=lambda: str(uuid.uuid4()))
content: str = "" # The actual text content
source: str = "" # Where it came from (agent, user, system)
context_type: str = "conversation" # conversation, document, fact, etc.
agent_id: str | None = None
task_id: str | None = None
session_id: str | None = None
metadata: dict | None = None
embedding: list[float] | None = None
timestamp: str = field(default_factory=lambda: datetime.now(UTC).isoformat())
relevance_score: float | None = None # Set during search
@contextmanager
def _get_conn() -> Generator[sqlite3.Connection, None, None]:
"""Get database connection to unified memory.db."""
from timmy.memory.unified import get_connection
with get_connection() as conn:
yield conn
def store_memory(
content: str,
source: str,
context_type: str = "conversation",
agent_id: str | None = None,
task_id: str | None = None,
session_id: str | None = None,
metadata: dict | None = None,
compute_embedding: bool = True,
) -> MemoryEntry:
"""Store a memory entry with optional embedding.
Args:
content: The text content to store
source: Source of the memory (agent name, user, system)
context_type: Type of context (conversation, document, fact)
agent_id: Associated agent ID
task_id: Associated task ID
session_id: Session identifier
metadata: Additional structured data
compute_embedding: Whether to compute vector embedding
Returns:
The stored MemoryEntry
"""
embedding = None
if compute_embedding:
embedding = _compute_embedding(content)
entry = MemoryEntry(
content=content,
source=source,
context_type=context_type,
agent_id=agent_id,
task_id=task_id,
session_id=session_id,
metadata=metadata,
embedding=embedding,
)
with _get_conn() as conn:
conn.execute(
"""
INSERT INTO episodes
(id, content, source, context_type, agent_id, task_id, session_id,
metadata, embedding, timestamp)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
entry.id,
entry.content,
entry.source,
entry.context_type,
entry.agent_id,
entry.task_id,
entry.session_id,
json.dumps(metadata) if metadata else None,
json.dumps(embedding) if embedding else None,
entry.timestamp,
),
)
conn.commit()
return entry
def search_memories(
query: str,
limit: int = 10,
context_type: str | None = None,
agent_id: str | None = None,
session_id: str | None = None,
min_relevance: float = 0.0,
) -> list[MemoryEntry]:
"""Search for memories by semantic similarity.
Args:
query: Search query text
limit: Maximum results
context_type: Filter by context type
agent_id: Filter by agent
session_id: Filter by session
min_relevance: Minimum similarity score (0-1)
Returns:
List of MemoryEntry objects sorted by relevance
"""
query_embedding = _compute_embedding(query)
# Build query with filters
conditions = []
params = []
if context_type:
conditions.append("context_type = ?")
params.append(context_type)
if agent_id:
conditions.append("agent_id = ?")
params.append(agent_id)
if session_id:
conditions.append("session_id = ?")
params.append(session_id)
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
# Fetch candidates (we'll do in-memory similarity for now)
# For production with sqlite-vss, this would use vector similarity index
query_sql = f"""
SELECT * FROM episodes
{where_clause}
ORDER BY timestamp DESC
LIMIT ?
"""
params.append(limit * 3) # Get more candidates for ranking
with _get_conn() as conn:
rows = conn.execute(query_sql, params).fetchall()
# Compute similarity scores
results = []
for row in rows:
entry = MemoryEntry(
id=row["id"],
content=row["content"],
source=row["source"],
context_type=row["context_type"],
agent_id=row["agent_id"],
task_id=row["task_id"],
session_id=row["session_id"],
metadata=json.loads(row["metadata"]) if row["metadata"] else None,
embedding=json.loads(row["embedding"]) if row["embedding"] else None,
timestamp=row["timestamp"],
)
if entry.embedding:
# Cosine similarity
score = _cosine_similarity(query_embedding, entry.embedding)
entry.relevance_score = score
if score >= min_relevance:
results.append(entry)
else:
# Fallback: check for keyword overlap
score = _keyword_overlap(query, entry.content)
entry.relevance_score = score
if score >= min_relevance:
results.append(entry)
# Sort by relevance and return top results
results.sort(key=lambda x: x.relevance_score or 0, reverse=True)
return results[:limit]
def _cosine_similarity(a: list[float], b: list[float]) -> float:
"""Compute cosine similarity between two vectors."""
dot = sum(x * y for x, y in zip(a, b, strict=False))
norm_a = sum(x * x for x in a) ** 0.5
norm_b = sum(x * x for x in b) ** 0.5
if norm_a == 0 or norm_b == 0:
return 0.0
return dot / (norm_a * norm_b)
def _keyword_overlap(query: str, content: str) -> float:
"""Simple keyword overlap score as fallback."""
query_words = set(query.lower().split())
content_words = set(content.lower().split())
if not query_words:
return 0.0
overlap = len(query_words & content_words)
return overlap / len(query_words)
def get_memory_context(query: str, max_tokens: int = 2000, **filters) -> str:
"""Get relevant memory context as formatted text for LLM prompts.
Args:
query: Search query
max_tokens: Approximate maximum tokens to return
**filters: Additional filters (agent_id, session_id, etc.)
Returns:
Formatted context string for inclusion in prompts
"""
memories = search_memories(query, limit=20, **filters)
context_parts = []
total_chars = 0
max_chars = max_tokens * 4 # Rough approximation
for mem in memories:
formatted = f"[{mem.source}]: {mem.content}"
if total_chars + len(formatted) > max_chars:
break
context_parts.append(formatted)
total_chars += len(formatted)
if not context_parts:
return ""
return "Relevant context from memory:\n" + "\n\n".join(context_parts)
def recall_personal_facts(agent_id: str | None = None) -> list[str]:
"""Recall personal facts about the user or system.
Args:
agent_id: Optional agent filter
Returns:
List of fact strings
"""
with _get_conn() as conn:
if agent_id:
rows = conn.execute(
"""
SELECT content FROM episodes
WHERE context_type = 'fact' AND agent_id = ?
ORDER BY timestamp DESC
LIMIT 100
""",
(agent_id,),
).fetchall()
else:
rows = conn.execute(
"""
SELECT content FROM episodes
WHERE context_type = 'fact'
ORDER BY timestamp DESC
LIMIT 100
""",
).fetchall()
return [r["content"] for r in rows]
def recall_personal_facts_with_ids(agent_id: str | None = None) -> list[dict]:
"""Recall personal facts with their IDs for edit/delete operations."""
with _get_conn() as conn:
if agent_id:
rows = conn.execute(
"SELECT id, content FROM episodes WHERE context_type = 'fact' AND agent_id = ? ORDER BY timestamp DESC LIMIT 100",
(agent_id,),
).fetchall()
else:
rows = conn.execute(
"SELECT id, content FROM episodes WHERE context_type = 'fact' ORDER BY timestamp DESC LIMIT 100",
).fetchall()
return [{"id": r["id"], "content": r["content"]} for r in rows]
def update_personal_fact(memory_id: str, new_content: str) -> bool:
"""Update a personal fact's content."""
with _get_conn() as conn:
cursor = conn.execute(
"UPDATE episodes SET content = ? WHERE id = ? AND context_type = 'fact'",
(new_content, memory_id),
)
conn.commit()
updated = cursor.rowcount > 0
return updated
def store_personal_fact(fact: str, agent_id: str | None = None) -> MemoryEntry:
"""Store a personal fact about the user or system.
Args:
fact: The fact to store
agent_id: Associated agent
Returns:
The stored MemoryEntry
"""
return store_memory(
content=fact,
source="system",
context_type="fact",
agent_id=agent_id,
metadata={"auto_extracted": False},
)
def delete_memory(memory_id: str) -> bool:
"""Delete a memory entry by ID.
Returns:
True if deleted, False if not found
"""
with _get_conn() as conn:
cursor = conn.execute(
"DELETE FROM episodes WHERE id = ?",
(memory_id,),
)
conn.commit()
deleted = cursor.rowcount > 0
return deleted
def get_memory_stats() -> dict:
"""Get statistics about the memory store.
Returns:
Dict with counts by type, total entries, etc.
"""
with _get_conn() as conn:
total = conn.execute("SELECT COUNT(*) as count FROM episodes").fetchone()["count"]
by_type = {}
rows = conn.execute(
"SELECT context_type, COUNT(*) as count FROM episodes GROUP BY context_type"
).fetchall()
for row in rows:
by_type[row["context_type"]] = row["count"]
with_embeddings = conn.execute(
"SELECT COUNT(*) as count FROM episodes WHERE embedding IS NOT NULL"
).fetchone()["count"]
return {
"total_entries": total,
"by_type": by_type,
"with_embeddings": with_embeddings,
"has_embedding_model": _check_embedding_model(),
}
def prune_memories(older_than_days: int = 90, keep_facts: bool = True) -> int:
"""Delete old memories to manage storage.
Args:
older_than_days: Delete memories older than this
keep_facts: Whether to preserve fact-type memories
Returns:
Number of entries deleted
"""
from datetime import timedelta
cutoff = (datetime.now(UTC) - timedelta(days=older_than_days)).isoformat()
with _get_conn() as conn:
if keep_facts:
cursor = conn.execute(
"""
DELETE FROM episodes
WHERE timestamp < ? AND context_type != 'fact'
""",
(cutoff,),
)
else:
cursor = conn.execute(
"DELETE FROM episodes WHERE timestamp < ?",
(cutoff,),
)
deleted = cursor.rowcount
conn.commit()
return deleted
"""Backward compatibility — all memory functions live in memory_system now."""
from timmy.memory_system import (
DB_PATH,
MemoryEntry,
_cosine_similarity,
_keyword_overlap,
delete_memory,
get_memory_context,
get_memory_stats,
get_memory_system,
prune_memories,
recall_personal_facts,
recall_personal_facts_with_ids,
search_memories,
store_memory,
store_personal_fact,
update_personal_fact,
)
__all__ = [
"DB_PATH",
"MemoryEntry",
"delete_memory",
"get_memory_context",
"get_memory_stats",
"get_memory_system",
"prune_memories",
"recall_personal_facts",
"recall_personal_facts_with_ids",
"search_memories",
"store_memory",
"store_personal_fact",
"update_personal_fact",
"_cosine_similarity",
"_keyword_overlap",
]