# the-nexus/nexus/mnemosyne/embeddings.py

"""Pluggable embedding backends for Mnemosyne semantic search.

Provides an abstract EmbeddingBackend interface and concrete implementations:
- OllamaEmbeddingBackend: local models via Ollama (sovereign, no cloud)
- TfidfEmbeddingBackend: pure-Python TF-IDF fallback (no dependencies)

Usage:
    from nexus.mnemosyne.embeddings import get_embedding_backend
    backend = get_embedding_backend()  # auto-detects best available
    vec = backend.embed("hello world")
    score = backend.similarity(vec_a, vec_b)
"""

from __future__ import annotations

import abc
import json
import math
import os
import re
import urllib.request
from collections import Counter
from typing import Optional


class EmbeddingBackend(abc.ABC):
    """Contract that every Mnemosyne embedding provider has to fulfil.

    Subclasses must implement ``embed`` and ``similarity``. ``name`` and
    ``dimension`` come with defaults that subclasses may override.
    """

    @property
    def name(self) -> str:
        """Label for this backend; by default the concrete class name."""
        return type(self).__name__

    @property
    def dimension(self) -> int:
        """Length of the vectors produced; the base implementation reports 0."""
        return 0

    @abc.abstractmethod
    def embed(self, text: str) -> list[float]:
        """Convert *text* into its embedding vector."""

    @abc.abstractmethod
    def similarity(self, a: list[float], b: list[float]) -> float:
        """Score two vectors by cosine similarity, expressed in [0, 1]."""


def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Compute the cosine of the angle between vectors *a* and *b*.

    Returns 0.0 when either vector has zero magnitude (including the case of
    two empty vectors).

    Raises:
        ValueError: If the two vectors differ in length.
    """
    size_a, size_b = len(a), len(b)
    if size_a != size_b:
        raise ValueError(f"Vector dimension mismatch: {size_a} vs {size_b}")

    magnitude_a = math.sqrt(sum(component * component for component in a))
    magnitude_b = math.sqrt(sum(component * component for component in b))
    # A zero-length direction has no angle; report "no similarity" instead
    # of dividing by zero.
    if not (magnitude_a and magnitude_b):
        return 0.0

    inner = sum(left * right for left, right in zip(a, b))
    return inner / (magnitude_a * magnitude_b)


class OllamaEmbeddingBackend(EmbeddingBackend):
    """Embedding backend that talks to a local Ollama server over HTTP.

    The server location and model can be passed explicitly or picked up from
    the ``OLLAMA_URL`` and ``MNEMOSYNE_EMBED_MODEL`` environment variables.
    Default model: nomic-embed-text (768 dims).
    """

    def __init__(self, base_url: str | None = None, model: str | None = None):
        """Store connection settings; no request is made here.

        Args:
            base_url: Root URL of the Ollama server. Falls back to the
                ``OLLAMA_URL`` environment variable, then to
                ``http://localhost:11434``.
            model: Embedding model name. Falls back to the
                ``MNEMOSYNE_EMBED_MODEL`` environment variable, then to
                ``nomic-embed-text``.
        """
        self.base_url = base_url or os.environ.get("OLLAMA_URL", "http://localhost:11434")
        self.model = model or os.environ.get("MNEMOSYNE_EMBED_MODEL", "nomic-embed-text")
        # Length of the most recent non-empty embedding; 0 until one is seen.
        self._dim: int = 0
        # Tri-state probe cache: None means "not checked yet".
        self._available: bool | None = None

    def _endpoint(self, path: str) -> str:
        """Join *path* onto the base URL without producing a double slash."""
        return f"{self.base_url.rstrip('/')}{path}"

    def _model_listed(self, installed: str) -> bool:
        """Return True if the *installed* model name satisfies ``self.model``."""
        # An exact match covers tagged requests such as
        # "nomic-embed-text:v1.5", which can never match a name whose tag
        # has been stripped.
        if self.model == installed:
            return True
        # Untagged requests keep the permissive substring match against the
        # installed name with its tag removed.
        return self.model in installed.split(":")[0]

    def _check_available(self) -> bool:
        """Probe the server once and cache whether the model is installed.

        Fetches ``/api/tags`` and looks for ``self.model`` among the listed
        model names. The result (True or False) is cached on the instance, so
        later calls do not hit the network again.
        """
        if self._available is not None:
            return self._available
        try:
            req = urllib.request.Request(self._endpoint("/api/tags"), method="GET")
            # The context manager closes the HTTP response even if parsing fails.
            with urllib.request.urlopen(req, timeout=3) as resp:
                tags = json.loads(resp.read())
            installed = [entry["name"] for entry in tags.get("models", [])]
            self._available = any(self._model_listed(n) for n in installed)
        except Exception:
            # Deliberately best-effort: an unreachable server or an
            # unexpected payload simply means "not available".
            self._available = False
        return self._available

    @property
    def name(self) -> str:
        """Backend label that includes the configured model name."""
        return f"Ollama({self.model})"

    @property
    def dimension(self) -> int:
        """Length of the last non-empty embedding returned, or 0 if none yet."""
        return self._dim

    def embed(self, text: str) -> list[float]:
        """Request an embedding for *text* from ``/api/embeddings``.

        Returns:
            The vector found under the ``"embedding"`` key of the response,
            or an empty list when that key is missing or empty.

        Raises:
            RuntimeError: If the availability probe reports that the server
                or the model is not usable.
        """
        if not self._check_available():
            raise RuntimeError(f"Ollama not available or model {self.model} not found")
        payload = json.dumps({"model": self.model, "prompt": text}).encode()
        req = urllib.request.Request(
            self._endpoint("/api/embeddings"),
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read())
        # "or []" also normalizes an explicit null so callers always get a list.
        vec = result.get("embedding") or []
        if vec:
            self._dim = len(vec)
        return vec

    def similarity(self, a: list[float], b: list[float]) -> float:
        """Map cosine similarity from [-1, 1] onto [0, 1].

        Raises:
            ValueError: If the vectors differ in length.
        """
        raw = cosine_similarity(a, b)
        # Clamp so floating-point rounding cannot leave the documented range.
        return min(1.0, max(0.0, (raw + 1.0) / 2.0))


class TfidfEmbeddingBackend(EmbeddingBackend):
    """Pure-Python TF-IDF embedding. No dependencies. Always available.

    The vocabulary and document statistics grow with every ``embed`` call
    that yields at least one token, so vectors produced at different times
    can have different lengths; ``similarity`` zero-pads to compensate.
    """

    def __init__(self) -> None:
        """Start with an empty vocabulary and no document statistics."""
        # term -> vector index, assigned in first-seen order
        self._vocab: dict[str, int] = {}
        # term -> IDF weight as of the last embedded text containing the term
        self._idf: dict[str, float] = {}
        # number of texts that have been folded into the statistics
        self._doc_count: int = 0
        # term -> number of texts containing the term
        self._doc_freq: dict[str, int] = {}

    @property
    def name(self) -> str:
        """Fixed label for this backend."""
        return "TF-IDF (local)"

    @property
    def dimension(self) -> int:
        """Current vocabulary size, i.e. the length of the next vector."""
        return len(self._vocab)

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Lower-case *text* and keep word tokens longer than two characters."""
        return [t for t in re.findall(r"\w+", text.lower()) if len(t) > 2]

    def _update_idf(self, tokens: list[str]):
        """Fold one tokenized text into the document statistics.

        Increments the document count and the document frequency of every
        distinct term, then refreshes the smoothed IDF weight of those terms.
        Only the terms of the current text are refreshed: ``embed`` reads no
        other weights, and recomputing the whole vocabulary on every call
        made embedding a corpus quadratic in its size.
        """
        self._doc_count += 1
        terms = set(tokens)
        for term in terms:
            self._doc_freq[term] = self._doc_freq.get(term, 0) + 1
        numerator = self._doc_count + 1
        for term in terms:
            self._idf[term] = math.log(numerator / (self._doc_freq[term] + 1)) + 1.0

    def embed(self, text: str) -> list[float]:
        """Return the L2-normalized TF-IDF vector of *text*.

        New terms are appended to the vocabulary and the text is counted in
        the document statistics before weighting. Returns an empty list when
        *text* yields no tokens; in that case no state is changed.
        """
        tokens = self._tokenize(text)
        if not tokens:
            return []
        for term in tokens:
            # len() is evaluated before insertion, so it is the next free index.
            self._vocab.setdefault(term, len(self._vocab))
        self._update_idf(tokens)
        total = len(tokens)
        vec = [0.0] * len(self._vocab)
        for term, count in Counter(tokens).items():
            vec[self._vocab[term]] = (count / total) * self._idf.get(term, 1.0)
        norm = math.sqrt(sum(v * v for v in vec))
        if norm > 0:
            vec = [v / norm for v in vec]
        return vec

    def similarity(self, a: list[float], b: list[float]) -> float:
        """Cosine similarity of two TF-IDF vectors, clamped to [0, 1].

        Vectors of different length are zero-padded to the longer one,
        because the vocabulary may have grown between the two ``embed`` calls.
        """
        if len(a) != len(b):
            width = max(len(a), len(b))
            a = a + [0.0] * (width - len(a))
            b = b + [0.0] * (width - len(b))
        # The upper clamp guards against rounding slightly above 1.0.
        return min(1.0, max(0.0, cosine_similarity(a, b)))


def get_embedding_backend(prefer: str | None = None, ollama_url: str | None = None,
                          model: str | None = None) -> EmbeddingBackend:
    """Auto-detect best available embedding backend. Priority: Ollama > TF-IDF.

    Args:
        prefer: ``"ollama"`` or ``"tfidf"`` to force a backend. When omitted
            or blank, the ``MNEMOSYNE_EMBED_BACKEND`` environment variable is
            consulted. Values are matched case-insensitively and surrounding
            whitespace is ignored.
        ollama_url: Passed to ``OllamaEmbeddingBackend`` as ``base_url``.
        model: Passed to ``OllamaEmbeddingBackend`` as ``model``.

    Returns:
        An Ollama backend when it is usable and not ruled out by the
        preference, otherwise a TF-IDF backend. Unrecognized preference
        values fall back to TF-IDF.

    Raises:
        RuntimeError: If Ollama was explicitly requested but is unavailable.
    """
    candidates = (prefer, os.environ.get("MNEMOSYNE_EMBED_BACKEND"))
    # A blank value (e.g. an exported-but-empty environment variable) means
    # "no preference" and must not disable auto-detection.
    effective = next(
        (value.strip().lower() for value in candidates if value and value.strip()),
        None,
    )
    if effective == "tfidf":
        return TfidfEmbeddingBackend()
    if effective in (None, "ollama"):
        ollama = OllamaEmbeddingBackend(base_url=ollama_url, model=model)
        if ollama._check_available():
            return ollama
        if effective == "ollama":
            raise RuntimeError("Ollama backend requested but not available")
    return TfidfEmbeddingBackend()