Files
the-nexus/nexus/mnemosyne/embeddings.py
Alexander Whitestone 3bf8d6e0a6 feat: add pluggable embedding backend (Ollama + TF-IDF)
Implements embedding_backend from FEATURES.yaml:
- Abstract EmbeddingBackend interface
- OllamaEmbeddingBackend for local sovereign models
- TfidfEmbeddingBackend pure-Python fallback
- get_embedding_backend() auto-detection
2026-04-12 05:04:53 +00:00

171 lines
6.0 KiB
Python

"""Pluggable embedding backends for Mnemosyne semantic search.
Provides an abstract EmbeddingBackend interface and concrete implementations:
- OllamaEmbeddingBackend: local models via Ollama (sovereign, no cloud)
- TfidfEmbeddingBackend: pure-Python TF-IDF fallback (no dependencies)
Usage:
from nexus.mnemosyne.embeddings import get_embedding_backend
backend = get_embedding_backend() # auto-detects best available
vec = backend.embed("hello world")
score = backend.similarity(vec_a, vec_b)
"""
from __future__ import annotations
import abc, json, math, os, re, urllib.request
from typing import Optional
class EmbeddingBackend(abc.ABC):
"""Abstract interface for embedding-based similarity."""
@abc.abstractmethod
def embed(self, text: str) -> list[float]:
"""Return an embedding vector for the given text."""
@abc.abstractmethod
def similarity(self, a: list[float], b: list[float]) -> float:
"""Return cosine similarity between two vectors, in [0, 1]."""
@property
def name(self) -> str:
return self.__class__.__name__
@property
def dimension(self) -> int:
return 0
def cosine_similarity(a: list[float], b: list[float]) -> float:
"""Cosine similarity between two vectors."""
if len(a) != len(b):
raise ValueError(f"Vector dimension mismatch: {len(a)} vs {len(b)}")
dot = sum(x * y for x, y in zip(a, b))
norm_a = math.sqrt(sum(x * x for x in a))
norm_b = math.sqrt(sum(x * x for x in b))
if norm_a == 0 or norm_b == 0:
return 0.0
return dot / (norm_a * norm_b)
class OllamaEmbeddingBackend(EmbeddingBackend):
"""Embedding backend using a local Ollama instance.
Default model: nomic-embed-text (768 dims)."""
def __init__(self, base_url: str | None = None, model: str | None = None):
self.base_url = base_url or os.environ.get("OLLAMA_URL", "http://localhost:11434")
self.model = model or os.environ.get("MNEMOSYNE_EMBED_MODEL", "nomic-embed-text")
self._dim: int = 0
self._available: bool | None = None
def _check_available(self) -> bool:
if self._available is not None:
return self._available
try:
req = urllib.request.Request(f"{self.base_url}/api/tags", method="GET")
resp = urllib.request.urlopen(req, timeout=3)
tags = json.loads(resp.read())
models = [m["name"].split(":")[0] for m in tags.get("models", [])]
self._available = any(self.model in m for m in models)
except Exception:
self._available = False
return self._available
@property
def name(self) -> str:
return f"Ollama({self.model})"
@property
def dimension(self) -> int:
return self._dim
def embed(self, text: str) -> list[float]:
if not self._check_available():
raise RuntimeError(f"Ollama not available or model {self.model} not found")
data = json.dumps({"model": self.model, "prompt": text}).encode()
req = urllib.request.Request(
f"{self.base_url}/api/embeddings", data=data,
headers={"Content-Type": "application/json"}, method="POST")
resp = urllib.request.urlopen(req, timeout=30)
result = json.loads(resp.read())
vec = result.get("embedding", [])
if vec:
self._dim = len(vec)
return vec
def similarity(self, a: list[float], b: list[float]) -> float:
raw = cosine_similarity(a, b)
return (raw + 1.0) / 2.0
class TfidfEmbeddingBackend(EmbeddingBackend):
"""Pure-Python TF-IDF embedding. No dependencies. Always available."""
def __init__(self):
self._vocab: dict[str, int] = {}
self._idf: dict[str, float] = {}
self._doc_count: int = 0
self._doc_freq: dict[str, int] = {}
@property
def name(self) -> str:
return "TF-IDF (local)"
@property
def dimension(self) -> int:
return len(self._vocab)
@staticmethod
def _tokenize(text: str) -> list[str]:
return [t for t in re.findall(r"\w+", text.lower()) if len(t) > 2]
def _update_idf(self, tokens: list[str]):
self._doc_count += 1
for t in set(tokens):
self._doc_freq[t] = self._doc_freq.get(t, 0) + 1
for t, df in self._doc_freq.items():
self._idf[t] = math.log((self._doc_count + 1) / (df + 1)) + 1.0
def embed(self, text: str) -> list[float]:
tokens = self._tokenize(text)
if not tokens:
return []
for t in tokens:
if t not in self._vocab:
self._vocab[t] = len(self._vocab)
self._update_idf(tokens)
dim = len(self._vocab)
vec = [0.0] * dim
tf = {}
for t in tokens:
tf[t] = tf.get(t, 0) + 1
for t, count in tf.items():
vec[self._vocab[t]] = (count / len(tokens)) * self._idf.get(t, 1.0)
norm = math.sqrt(sum(v * v for v in vec))
if norm > 0:
vec = [v / norm for v in vec]
return vec
def similarity(self, a: list[float], b: list[float]) -> float:
if len(a) != len(b):
mx = max(len(a), len(b))
a = a + [0.0] * (mx - len(a))
b = b + [0.0] * (mx - len(b))
return max(0.0, cosine_similarity(a, b))
def get_embedding_backend(prefer: str | None = None, ollama_url: str | None = None,
model: str | None = None) -> EmbeddingBackend:
"""Auto-detect best available embedding backend. Priority: Ollama > TF-IDF."""
env_pref = os.environ.get("MNEMOSYNE_EMBED_BACKEND")
effective = prefer or env_pref
if effective == "tfidf":
return TfidfEmbeddingBackend()
if effective in (None, "ollama"):
ollama = OllamaEmbeddingBackend(base_url=ollama_url, model=model)
if ollama._check_available():
return ollama
if effective == "ollama":
raise RuntimeError("Ollama backend requested but not available")
return TfidfEmbeddingBackend()