# Timmy-time-dashboard/src/timmy/memory/embeddings.py

"""Embedding functions for Timmy's memory system.

Provides text-to-vector embedding using sentence-transformers (preferred)
with a deterministic hash-based fallback when the ML library is unavailable.

Also includes vector similarity utilities (cosine similarity, keyword overlap).
"""

import hashlib
import logging
import math

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Embedding model - small, fast, local.
# Lazily populated by _get_embedding_model(); acts as a tri-state cache:
#   None  -> no load attempted yet
#   False -> embeddings skipped via settings, or sentence-transformers is
#            not installed; callers use the hash-based fallback
#   other -> the loaded SentenceTransformer instance
EMBEDDING_MODEL = None
EMBEDDING_DIM = 384  # MiniLM dimension (the hash fallback emits 128-dim vectors instead)


def _get_embedding_model():
    """Return the cached embedding model, attempting to load it on first use.

    The result is stored in the module-level ``EMBEDDING_MODEL`` so the load
    is attempted only while that global is still ``None``.

    Returns:
        The ``SentenceTransformer`` instance when it could be imported and
        constructed, or ``False`` when embeddings are skipped through
        ``settings.timmy_skip_embeddings`` or ``sentence_transformers``
        cannot be imported.
    """
    global EMBEDDING_MODEL

    # Already resolved (either a model or the False sentinel): reuse it.
    if EMBEDDING_MODEL is not None:
        return EMBEDDING_MODEL

    # The project config is optional; if it cannot be imported we simply
    # proceed as though skipping was not requested.
    skip_requested = False
    try:
        from config import settings

        skip_requested = bool(settings.timmy_skip_embeddings)
    except ImportError:
        pass

    if skip_requested:
        EMBEDDING_MODEL = False
        return EMBEDDING_MODEL

    try:
        from sentence_transformers import SentenceTransformer

        EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
        logger.info("MemorySystem: Loaded embedding model")
    except ImportError:
        logger.warning("MemorySystem: sentence-transformers not installed, using fallback")
        # False (not None) so later calls do not retry the import.
        EMBEDDING_MODEL = False

    return EMBEDDING_MODEL


def _simple_hash_embedding(text: str) -> list[float]:
    """Fallback: simple hash-based embedding when transformers unavailable.

    The text is lower-cased and split on whitespace. For each of the first
    50 words, the first eight bytes of the word's MD5 digest (scaled to
    ``[0, 1]``) are added into a 128-slot vector at positions
    ``(i * 8 + j) % 128``, so words beyond the 16th wrap around and
    accumulate onto earlier slots. The vector is then L2-normalized.

    Note that the result has 128 entries, which differs from the module's
    ``EMBEDDING_DIM`` (384); vectors from this fallback and from the real
    model do not have the same length.

    Args:
        text: Input text to embed.

    Returns:
        A list of 128 floats with unit Euclidean length, or all zeros when
        ``text`` contains no words.
    """
    dim = 128
    bytes_per_word = 8

    vec = [0.0] * dim
    for i, word in enumerate(text.lower().split()[:50]):  # First 50 words
        # MD5 is only a cheap, deterministic bucketing hash here, not a
        # security primitive. Declaring that keeps the fallback working on
        # FIPS-restricted Python builds where plain hashlib.md5() raises.
        digest = hashlib.md5(word.encode(), usedforsecurity=False).digest()
        for j in range(bytes_per_word):
            vec[(i * bytes_per_word + j) % dim] += digest[j] / 255.0

    # Normalize; `or 1.0` avoids dividing by zero for an all-zero vector.
    mag = math.sqrt(sum(x * x for x in vec)) or 1.0
    return [x / mag for x in vec]


def embed_text(text: str) -> list[float]:
    """Turn ``text`` into an embedding vector.

    Uses the model returned by ``_get_embedding_model()`` when one is
    available; otherwise falls back to ``_simple_hash_embedding``.

    Args:
        text: Input text to embed.

    Returns:
        The embedding as a list of floats. With the model this is
        ``model.encode(text).tolist()`` (presumably a NumPy array converted
        to a list; confirm against the sentence-transformers API).
    """
    encoder = _get_embedding_model()
    if not encoder:
        # No usable model (None/False): deterministic hash fallback.
        return _simple_hash_embedding(text)
    return encoder.encode(text).tolist()


def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Calculate cosine similarity between two vectors."""
    dot = sum(x * y for x, y in zip(a, b, strict=False))
    mag_a = math.sqrt(sum(x * x for x in a))
    mag_b = math.sqrt(sum(x * x for x in b))
    if mag_a == 0 or mag_b == 0:
        return 0.0
    return dot / (mag_a * mag_b)


# Alias for backward compatibility: the underscore-prefixed name refers to the
# same function object as cosine_similarity (presumably kept for callers that
# still use the older private name; verify before removing).
_cosine_similarity = cosine_similarity


def _keyword_overlap(query: str, content: str) -> float:
    """Score how many of the query's distinct words appear in ``content``.

    Both strings are lower-cased and split on whitespace; duplicates are
    ignored because the words are compared as sets.

    Args:
        query: Search text whose words are looked for.
        content: Text the query words are matched against.

    Returns:
        The fraction of distinct query words found in ``content``, from
        ``0.0`` to ``1.0``; ``0.0`` when the query has no words.
    """
    wanted = {*query.lower().split()}
    available = {*content.lower().split()}
    if len(wanted) == 0:
        return 0.0
    matched = wanted.intersection(available)
    return len(matched) / len(wanted)