"""Embedding functions for Timmy's memory system. Provides text-to-vector embedding using sentence-transformers (preferred) with a deterministic hash-based fallback when the ML library is unavailable. Also includes vector similarity utilities (cosine similarity, keyword overlap). """ import hashlib import logging import math logger = logging.getLogger(__name__) # Embedding model - small, fast, local EMBEDDING_MODEL = None EMBEDDING_DIM = 384 # MiniLM dimension def _get_embedding_model(): """Lazy-load embedding model.""" global EMBEDDING_MODEL if EMBEDDING_MODEL is None: try: from config import settings if settings.timmy_skip_embeddings: EMBEDDING_MODEL = False return EMBEDDING_MODEL except ImportError: pass try: from sentence_transformers import SentenceTransformer EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L6-v2") logger.info("MemorySystem: Loaded embedding model") except ImportError: logger.warning("MemorySystem: sentence-transformers not installed, using fallback") EMBEDDING_MODEL = False # Use fallback return EMBEDDING_MODEL def _simple_hash_embedding(text: str) -> list[float]: """Fallback: Simple hash-based embedding when transformers unavailable.""" words = text.lower().split() vec = [0.0] * 128 for i, word in enumerate(words[:50]): # First 50 words h = hashlib.md5(word.encode()).hexdigest() for j in range(8): idx = (i * 8 + j) % 128 vec[idx] += int(h[j * 2 : j * 2 + 2], 16) / 255.0 # Normalize mag = math.sqrt(sum(x * x for x in vec)) or 1.0 return [x / mag for x in vec] def embed_text(text: str) -> list[float]: """Generate embedding for text.""" model = _get_embedding_model() if model and model is not False: embedding = model.encode(text) return embedding.tolist() return _simple_hash_embedding(text) def cosine_similarity(a: list[float], b: list[float]) -> float: """Calculate cosine similarity between two vectors.""" dot = sum(x * y for x, y in zip(a, b, strict=False)) mag_a = math.sqrt(sum(x * x for x in a)) mag_b = math.sqrt(sum(x * x for x in b)) if mag_a == 0 or mag_b == 0: return 0.0 return dot / (mag_a * mag_b) # Alias for backward compatibility _cosine_similarity = cosine_similarity def _keyword_overlap(query: str, content: str) -> float: """Simple keyword overlap score as fallback.""" query_words = set(query.lower().split()) content_words = set(content.lower().split()) if not query_words: return 0.0 overlap = len(query_words & content_words) return overlap / len(query_words)