[loop-cycle-151] refactor: extract embedding functions to memory/embeddings.py (#344) (#355)

2026-03-18 21:24:50 -04:00
parent db7220db5a
commit ce658c841a
3 changed files with 107 additions and 83 deletions
--- a/src/timmy/memory/init.py
+++ b/src/timmy/memory/init.py
@@ -1 +1,7 @@
-"""Memory — Persistent conversation and knowledge memory."""
+"""Memory — Persistent conversation and knowledge memory.
+
+Sub-modules:
+    embeddings  — text-to-vector embedding + similarity functions
+    unified     — unified memory schema and connection management
+    vector_store — backward compatibility re-exports from memory_system
+"""
--- a/src/timmy/memory/embeddings.py
+++ b/src/timmy/memory/embeddings.py
@@ -0,0 +1,88 @@
+"""Embedding functions for Timmy's memory system.
+
+Provides text-to-vector embedding using sentence-transformers (preferred)
+with a deterministic hash-based fallback when the ML library is unavailable.
+
+Also includes vector similarity utilities (cosine similarity, keyword overlap).
+"""
+
+import hashlib
+import logging
+import math
+
+logger = logging.getLogger(__name__)
+
+# Embedding model - small, fast, local.
+# Lazily populated, tri-state cache: None = no load attempted yet,
+# False = use the hash-based fallback, anything else = the loaded model.
+EMBEDDING_MODEL = None
+# NOTE(review): the hash-based fallback in this module emits 128-element
+# vectors, not EMBEDDING_DIM — confirm callers tolerate the mismatch.
+EMBEDDING_DIM = 384  # MiniLM dimension
+
+
+def _get_embedding_model():
+    """Return the cached embedding model, loading it on first call.
+
+    The module-level ``EMBEDDING_MODEL`` doubles as a tri-state cache:
+    ``None`` means no load has been attempted, ``False`` means callers should
+    use the hash-based fallback, and any other value is the loaded
+    ``SentenceTransformer`` instance.
+
+    Returns:
+        The cached model instance, or ``False`` when embeddings are disabled
+        via ``settings.timmy_skip_embeddings`` or ``sentence_transformers``
+        cannot be imported.
+    """
+    global EMBEDDING_MODEL
+    if EMBEDDING_MODEL is not None:
+        # Already resolved: either a model or the ``False`` sentinel.
+        return EMBEDDING_MODEL
+
+    # Check the opt-out flag; a missing ``config`` module means "not disabled".
+    skip_requested = False
+    try:
+        from config import settings
+
+        skip_requested = bool(settings.timmy_skip_embeddings)
+    except ImportError:
+        pass
+    if skip_requested:
+        EMBEDDING_MODEL = False
+        return EMBEDDING_MODEL
+
+    # Both the import and the construction stay inside the ``try`` so that an
+    # ImportError raised by either one selects the fallback.
+    try:
+        from sentence_transformers import SentenceTransformer
+
+        loaded = SentenceTransformer("all-MiniLM-L6-v2")
+    except ImportError:
+        logger.warning("MemorySystem: sentence-transformers not installed, using fallback")
+        EMBEDDING_MODEL = False  # Use fallback
+    else:
+        EMBEDDING_MODEL = loaded
+        logger.info("MemorySystem: Loaded embedding model")
+    return EMBEDDING_MODEL
+
+
+def _simple_hash_embedding(text: str) -> list[float]:
+    """Fallback: deterministic hash-based embedding when transformers unavailable.
+
+    Lower-cases ``text``, splits it on whitespace and, for each of the first
+    50 words, adds the first 8 bytes of the word's MD5 digest (each scaled to
+    the range 0..1) into a 128-slot vector. The 8-slot window advances with
+    the word position and wraps around the vector. The result is
+    L2-normalised.
+
+    Note that the vector has 128 components, not ``EMBEDDING_DIM``.
+
+    Args:
+        text: Input text; may be empty.
+
+    Returns:
+        A 128-element list of floats with unit length, or all zeros when
+        ``text`` contains no words.
+    """
+    dim = 128
+    bytes_per_word = 8
+    vec = [0.0] * dim
+    for i, word in enumerate(text.lower().split()[:50]):  # First 50 words
+        # MD5 is only a cheap, stable bucket function here, not a security
+        # primitive; usedforsecurity=False keeps it usable on FIPS-restricted
+        # Python builds where a plain md5() call raises.
+        digest = hashlib.md5(word.encode(), usedforsecurity=False).digest()
+        for j, byte in enumerate(digest[:bytes_per_word]):
+            vec[(i * bytes_per_word + j) % dim] += byte / 255.0
+    # Normalize; ``or 1.0`` avoids dividing by zero for word-less input.
+    mag = math.sqrt(sum(x * x for x in vec)) or 1.0
+    return [x / mag for x in vec]
+
+
+def embed_text(text: str) -> list[float]:
+    """Return an embedding vector for ``text``.
+
+    Uses the model returned by ``_get_embedding_model()`` when it is truthy;
+    otherwise (for example the ``False`` sentinel) falls back to
+    ``_simple_hash_embedding``.
+    """
+    model = _get_embedding_model()
+    if not model:
+        return _simple_hash_embedding(text)
+    # The encoded result is converted with ``tolist()`` — presumably a NumPy
+    # array; verify against the sentence-transformers version in use.
+    return model.encode(text).tolist()
+
+
+def cosine_similarity(a: list[float], b: list[float]) -> float:
+    """Return the cosine of the angle between vectors ``a`` and ``b``.
+
+    The dot product is taken over the paired elements only (``zip`` with
+    ``strict=False`` stops at the shorter vector), while each magnitude is
+    computed over the full vector. Returns ``0.0`` when either magnitude is
+    zero, which includes empty vectors.
+    """
+    dot_product = sum(left * right for left, right in zip(a, b, strict=False))
+    norm_a = math.sqrt(sum(component * component for component in a))
+    norm_b = math.sqrt(sum(component * component for component in b))
+    if not norm_a or not norm_b:
+        # Direction is undefined for a zero vector; report "no similarity".
+        return 0.0
+    return dot_product / (norm_a * norm_b)
+
+
+# Alias for backward compatibility: keeps the underscore-prefixed name
+# available for code that still refers to `_cosine_similarity`.
+_cosine_similarity = cosine_similarity
+
+
+def _keyword_overlap(query: str, content: str) -> float:
+    """Return the fraction of distinct query words that also occur in content.
+
+    Both strings are lower-cased and split on whitespace; duplicates are
+    collapsed. Serves as a simple fallback score. Returns ``0.0`` when the
+    query contains no words.
+    """
+    query_terms = set(query.lower().split())
+    content_terms = set(content.lower().split())
+    if not query_terms:
+        return 0.0
+    shared_terms = query_terms.intersection(content_terms)
+    return len(shared_terms) / len(query_terms)