"""Pluggable embedding backends for Mnemosyne semantic search. Provides an abstract EmbeddingBackend interface and concrete implementations: - OllamaEmbeddingBackend: local models via Ollama (sovereign, no cloud) - TfidfEmbeddingBackend: pure-Python TF-IDF fallback (no dependencies) Usage: from nexus.mnemosyne.embeddings import get_embedding_backend backend = get_embedding_backend() # auto-detects best available vec = backend.embed("hello world") score = backend.similarity(vec_a, vec_b) """ from __future__ import annotations import abc, json, math, os, re, urllib.request from typing import Optional class EmbeddingBackend(abc.ABC): """Abstract interface for embedding-based similarity.""" @abc.abstractmethod def embed(self, text: str) -> list[float]: """Return an embedding vector for the given text.""" @abc.abstractmethod def similarity(self, a: list[float], b: list[float]) -> float: """Return cosine similarity between two vectors, in [0, 1].""" @property def name(self) -> str: return self.__class__.__name__ @property def dimension(self) -> int: return 0 def cosine_similarity(a: list[float], b: list[float]) -> float: """Cosine similarity between two vectors.""" if len(a) != len(b): raise ValueError(f"Vector dimension mismatch: {len(a)} vs {len(b)}") dot = sum(x * y for x, y in zip(a, b)) norm_a = math.sqrt(sum(x * x for x in a)) norm_b = math.sqrt(sum(x * x for x in b)) if norm_a == 0 or norm_b == 0: return 0.0 return dot / (norm_a * norm_b) class OllamaEmbeddingBackend(EmbeddingBackend): """Embedding backend using a local Ollama instance. Default model: nomic-embed-text (768 dims).""" def __init__(self, base_url: str | None = None, model: str | None = None): self.base_url = base_url or os.environ.get("OLLAMA_URL", "http://localhost:11434") self.model = model or os.environ.get("MNEMOSYNE_EMBED_MODEL", "nomic-embed-text") self._dim: int = 0 self._available: bool | None = None def _check_available(self) -> bool: if self._available is not None: return self._available try: req = urllib.request.Request(f"{self.base_url}/api/tags", method="GET") resp = urllib.request.urlopen(req, timeout=3) tags = json.loads(resp.read()) models = [m["name"].split(":")[0] for m in tags.get("models", [])] self._available = any(self.model in m for m in models) except Exception: self._available = False return self._available @property def name(self) -> str: return f"Ollama({self.model})" @property def dimension(self) -> int: return self._dim def embed(self, text: str) -> list[float]: if not self._check_available(): raise RuntimeError(f"Ollama not available or model {self.model} not found") data = json.dumps({"model": self.model, "prompt": text}).encode() req = urllib.request.Request( f"{self.base_url}/api/embeddings", data=data, headers={"Content-Type": "application/json"}, method="POST") resp = urllib.request.urlopen(req, timeout=30) result = json.loads(resp.read()) vec = result.get("embedding", []) if vec: self._dim = len(vec) return vec def similarity(self, a: list[float], b: list[float]) -> float: raw = cosine_similarity(a, b) return (raw + 1.0) / 2.0 class TfidfEmbeddingBackend(EmbeddingBackend): """Pure-Python TF-IDF embedding. No dependencies. Always available.""" def __init__(self): self._vocab: dict[str, int] = {} self._idf: dict[str, float] = {} self._doc_count: int = 0 self._doc_freq: dict[str, int] = {} @property def name(self) -> str: return "TF-IDF (local)" @property def dimension(self) -> int: return len(self._vocab) @staticmethod def _tokenize(text: str) -> list[str]: return [t for t in re.findall(r"\w+", text.lower()) if len(t) > 2] def _update_idf(self, tokens: list[str]): self._doc_count += 1 for t in set(tokens): self._doc_freq[t] = self._doc_freq.get(t, 0) + 1 for t, df in self._doc_freq.items(): self._idf[t] = math.log((self._doc_count + 1) / (df + 1)) + 1.0 def embed(self, text: str) -> list[float]: tokens = self._tokenize(text) if not tokens: return [] for t in tokens: if t not in self._vocab: self._vocab[t] = len(self._vocab) self._update_idf(tokens) dim = len(self._vocab) vec = [0.0] * dim tf = {} for t in tokens: tf[t] = tf.get(t, 0) + 1 for t, count in tf.items(): vec[self._vocab[t]] = (count / len(tokens)) * self._idf.get(t, 1.0) norm = math.sqrt(sum(v * v for v in vec)) if norm > 0: vec = [v / norm for v in vec] return vec def similarity(self, a: list[float], b: list[float]) -> float: if len(a) != len(b): mx = max(len(a), len(b)) a = a + [0.0] * (mx - len(a)) b = b + [0.0] * (mx - len(b)) return max(0.0, cosine_similarity(a, b)) def get_embedding_backend(prefer: str | None = None, ollama_url: str | None = None, model: str | None = None) -> EmbeddingBackend: """Auto-detect best available embedding backend. Priority: Ollama > TF-IDF.""" env_pref = os.environ.get("MNEMOSYNE_EMBED_BACKEND") effective = prefer or env_pref if effective == "tfidf": return TfidfEmbeddingBackend() if effective in (None, "ollama"): ollama = OllamaEmbeddingBackend(base_url=ollama_url, model=model) if ollama._check_available(): return ollama if effective == "ollama": raise RuntimeError("Ollama backend requested but not available") return TfidfEmbeddingBackend()