# Forked from Rockachopa/Timmy-time-dashboard (89 lines, 2.7 KiB, Python).
"""Embedding functions for Timmy's memory system.

Provides text-to-vector embedding using sentence-transformers (preferred)
with a deterministic hash-based fallback when the ML library is unavailable.

Also includes vector similarity utilities (cosine similarity, keyword overlap).
"""
|
|
|
|
import hashlib
|
|
import logging
|
|
import math
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Embedding model - small, fast, local
# Tri-state cache managed by _get_embedding_model():
#   None  -> not resolved yet (the loader runs on first use)
#   False -> embeddings skipped or sentence-transformers missing; use the hash fallback
#   other -> the loaded SentenceTransformer instance
EMBEDDING_MODEL = None
# Not referenced by the functions visible in this file. Note that the hash
# fallback builds 128-length vectors by default, not 384.
EMBEDDING_DIM = 384  # MiniLM dimension
|
|
|
|
|
|
def _get_embedding_model():
    """Return the cached embedding backend, resolving it on first use.

    The module-level ``EMBEDDING_MODEL`` is a tri-state cache: ``None`` means
    "not resolved yet", ``False`` means "use the hash fallback", and any other
    value is the loaded ``SentenceTransformer``.

    Returns:
        The loaded model, or ``False`` when ``settings.timmy_skip_embeddings``
        is truthy or ``sentence_transformers`` cannot be imported.
    """
    global EMBEDDING_MODEL

    # Already resolved (either a model or False): reuse the cached value.
    if EMBEDDING_MODEL is not None:
        return EMBEDDING_MODEL

    # The project config is optional; when it cannot be imported we simply
    # carry on and try to load the model.
    try:
        from config import settings

        skip_requested = bool(settings.timmy_skip_embeddings)
    except ImportError:
        skip_requested = False

    if skip_requested:
        EMBEDDING_MODEL = False
        return EMBEDDING_MODEL

    # Imported lazily so the module stays importable without the ML library.
    try:
        from sentence_transformers import SentenceTransformer

        backend = SentenceTransformer("all-MiniLM-L6-v2")
        EMBEDDING_MODEL = backend
        logger.info("MemorySystem: Loaded embedding model")
    except ImportError:
        logger.warning("MemorySystem: sentence-transformers not installed, using fallback")
        EMBEDDING_MODEL = False  # Use fallback

    return EMBEDDING_MODEL
|
|
|
|
|
|
def _simple_hash_embedding(text: str) -> list[float]:
|
|
"""Fallback: Simple hash-based embedding when transformers unavailable."""
|
|
words = text.lower().split()
|
|
vec = [0.0] * 128
|
|
for i, word in enumerate(words[:50]): # First 50 words
|
|
h = hashlib.md5(word.encode()).hexdigest()
|
|
for j in range(8):
|
|
idx = (i * 8 + j) % 128
|
|
vec[idx] += int(h[j * 2 : j * 2 + 2], 16) / 255.0
|
|
# Normalize
|
|
mag = math.sqrt(sum(x * x for x in vec)) or 1.0
|
|
return [x / mag for x in vec]
|
|
|
|
|
|
def embed_text(text: str) -> list[float]:
    """Generate an embedding vector for *text*.

    Uses the backend returned by ``_get_embedding_model()`` when it is truthy,
    converting the result of ``encode`` to a plain list via ``tolist()``.
    Otherwise the hash-based fallback produces the vector.
    """
    backend = _get_embedding_model()
    # A falsy backend (None or False) means no model is available.
    if not backend:
        return _simple_hash_embedding(text)
    return backend.encode(text).tolist()
|
|
|
|
|
|
def cosine_similarity(a: list[float], b: list[float]) -> float:
|
|
"""Calculate cosine similarity between two vectors."""
|
|
dot = sum(x * y for x, y in zip(a, b, strict=False))
|
|
mag_a = math.sqrt(sum(x * x for x in a))
|
|
mag_b = math.sqrt(sum(x * x for x in b))
|
|
if mag_a == 0 or mag_b == 0:
|
|
return 0.0
|
|
return dot / (mag_a * mag_b)
|
|
|
|
|
|
# Alias for backward compatibility
|
|
_cosine_similarity = cosine_similarity
|
|
|
|
|
|
def _keyword_overlap(query: str, content: str) -> float:
|
|
"""Simple keyword overlap score as fallback."""
|
|
query_words = set(query.lower().split())
|
|
content_words = set(content.lower().split())
|
|
if not query_words:
|
|
return 0.0
|
|
overlap = len(query_words & content_words)
|
|
return overlap / len(query_words)
|