Implements embedding_backend from FEATURES.yaml:
- Abstract EmbeddingBackend interface
- OllamaEmbeddingBackend for local sovereign models
- TfidfEmbeddingBackend pure-Python fallback
- get_embedding_backend() auto-detection
171 lines
6.0 KiB
Python
171 lines
6.0 KiB
Python
"""Pluggable embedding backends for Mnemosyne semantic search.
|
|
|
|
Provides an abstract EmbeddingBackend interface and concrete implementations:
|
|
- OllamaEmbeddingBackend: local models via Ollama (sovereign, no cloud)
|
|
- TfidfEmbeddingBackend: pure-Python TF-IDF fallback (no dependencies)
|
|
|
|
Usage:
|
|
from nexus.mnemosyne.embeddings import get_embedding_backend
|
|
backend = get_embedding_backend() # auto-detects best available
|
|
vec = backend.embed("hello world")
|
|
score = backend.similarity(vec_a, vec_b)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
import abc, json, math, os, re, urllib.request
|
|
from typing import Optional
|
|
|
|
|
|
class EmbeddingBackend(abc.ABC):
    """Contract that every embedding backend must satisfy.

    Concrete subclasses have to provide :meth:`embed` and :meth:`similarity`.
    The ``name`` and ``dimension`` properties come with defaults that
    subclasses may override.
    """

    @property
    def dimension(self) -> int:
        """Length of the vectors produced; the base default is 0."""
        return 0

    @property
    def name(self) -> str:
        """Display name of the backend; defaults to the concrete class name."""
        return type(self).__name__

    @abc.abstractmethod
    def embed(self, text: str) -> list[float]:
        """Return an embedding vector for the given text."""

    @abc.abstractmethod
    def similarity(self, a: list[float], b: list[float]) -> float:
        """Return cosine similarity between two vectors, in [0, 1]."""
|
|
|
|
|
|
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two equal-length vectors.

    Args:
        a: First vector.
        b: Second vector; must have the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` limited to ``[-1.0, 1.0]``, or ``0.0``
        when either vector has zero norm (this includes two empty vectors).

    Raises:
        ValueError: If the vectors differ in length.
    """
    if len(a) != len(b):
        raise ValueError(f"Vector dimension mismatch: {len(a)} vs {len(b)}")
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    cosine = dot / (norm_a * norm_b)
    # Floating-point rounding can push the ratio marginally past +/-1 for
    # (anti-)parallel vectors. Clamp with comparisons rather than min()/max()
    # so that a NaN result is passed through unchanged.
    if cosine > 1.0:
        return 1.0
    if cosine < -1.0:
        return -1.0
    return cosine
|
|
|
|
|
|
class OllamaEmbeddingBackend(EmbeddingBackend):
    """Embedding backend using a local Ollama instance.

    Default model: nomic-embed-text (768 dims).

    Availability is probed once through ``GET {base_url}/api/tags`` and the
    outcome (positive or negative) is cached on the instance, so an existing
    instance does not notice a server that starts or stops later.
    """

    def __init__(self, base_url: str | None = None, model: str | None = None):
        """Configure the backend without contacting the server.

        Args:
            base_url: Ollama server URL. Falls back to the ``OLLAMA_URL``
                environment variable, then ``http://localhost:11434``.
            model: Embedding model name, optionally with a ``:tag``. Falls
                back to ``MNEMOSYNE_EMBED_MODEL``, then ``nomic-embed-text``.
        """
        resolved_url = base_url or os.environ.get("OLLAMA_URL", "http://localhost:11434")
        # Endpoint paths are appended with a leading "/", so a trailing slash
        # here would produce "//api/...".
        self.base_url = resolved_url.rstrip("/")
        self.model = model or os.environ.get("MNEMOSYNE_EMBED_MODEL", "nomic-embed-text")
        self._dim: int = 0  # learned from the first non-empty embedding
        self._available: bool | None = None  # None means "not probed yet"

    @staticmethod
    def _split_tag(model_name: str) -> tuple[str, str]:
        """Split ``name[:tag]`` into ``(name, tag)``; ``tag`` is "" if absent."""
        base, sep, tag = model_name.rpartition(":")
        # A "/" after the last ":" means that colon belongs to a registry
        # "host:port/" prefix rather than to a tag.
        if not sep or "/" in tag:
            return model_name, ""
        return base, tag

    @classmethod
    def _model_matches(cls, requested: str, installed: list[str]) -> bool:
        """Tell whether ``requested`` names one of the ``installed`` models.

        The base names must be equal. A request without a tag accepts any
        installed tag; a request with a tag requires that exact tag.
        """
        want_base, want_tag = cls._split_tag(requested)
        for candidate in installed:
            have_base, have_tag = cls._split_tag(candidate)
            if have_base == want_base and (not want_tag or want_tag == have_tag):
                return True
        return False

    def _check_available(self) -> bool:
        """Probe the server once and cache whether the model is installed.

        Returns:
            True when ``/api/tags`` answers and lists the configured model;
            False on any failure (unreachable server, malformed response,
            model missing).
        """
        if self._available is not None:
            return self._available
        try:
            req = urllib.request.Request(f"{self.base_url}/api/tags", method="GET")
            # Context manager ensures the HTTP response is closed.
            with urllib.request.urlopen(req, timeout=3) as resp:
                tags = json.loads(resp.read())
            installed = [m["name"] for m in tags.get("models", [])]
            self._available = self._model_matches(self.model, installed)
        except Exception:
            # Deliberately broad: this is a best-effort probe and any failure
            # simply means "not available".
            self._available = False
        return self._available

    @property
    def name(self) -> str:
        """Backend label that includes the configured model name."""
        return f"Ollama({self.model})"

    @property
    def dimension(self) -> int:
        """Length of the last non-empty embedding, or 0 before the first one."""
        return self._dim

    def embed(self, text: str) -> list[float]:
        """Request an embedding for ``text`` from ``POST /api/embeddings``.

        Args:
            text: The text to embed; sent as the ``prompt`` field.

        Returns:
            The ``embedding`` list from the JSON response, or an empty list
            when the response has no such field.

        Raises:
            RuntimeError: If the availability probe failed.

        Errors raised by ``urllib`` or ``json`` during the request itself are
        not caught here and propagate to the caller.
        """
        if not self._check_available():
            raise RuntimeError(f"Ollama not available or model {self.model} not found")
        data = json.dumps({"model": self.model, "prompt": text}).encode()
        req = urllib.request.Request(
            f"{self.base_url}/api/embeddings", data=data,
            headers={"Content-Type": "application/json"}, method="POST")
        with urllib.request.urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read())
        vec = result.get("embedding", [])
        if vec:
            self._dim = len(vec)
        return vec

    def similarity(self, a: list[float], b: list[float]) -> float:
        """Return cosine similarity rescaled from [-1, 1] to [0, 1].

        Delegates to the module-level ``cosine_similarity``, which raises
        ``ValueError`` when the vectors differ in length.
        """
        raw = cosine_similarity(a, b)
        score = (raw + 1.0) / 2.0
        # Rounding in the cosine can leave the score marginally outside the
        # documented range; comparisons (not min/max) keep NaN untouched.
        if score > 1.0:
            return 1.0
        if score < 0.0:
            return 0.0
        return score
|
|
|
|
|
|
class TfidfEmbeddingBackend(EmbeddingBackend):
    """Pure-Python TF-IDF embedding. No dependencies. Always available.

    The vocabulary and document frequencies grow with every call to
    :meth:`embed`. Each embedded text counts as one document, and a vector is
    as long as the vocabulary was when it was produced. Vectors produced
    earlier may therefore be shorter; :meth:`similarity` zero-pads them.
    """

    # Compiled once instead of on every _tokenize call.
    _TOKEN_RE = re.compile(r"\w+")

    def __init__(self):
        """Start with an empty vocabulary and no document statistics."""
        self._vocab: dict[str, int] = {}  # token -> vector index
        self._idf: dict[str, float] = {}  # token -> smoothed IDF weight
        self._doc_count: int = 0  # number of non-empty texts embedded
        self._doc_freq: dict[str, int] = {}  # token -> texts containing it

    @property
    def name(self) -> str:
        """Fixed display name of this backend."""
        return "TF-IDF (local)"

    @property
    def dimension(self) -> int:
        """Current vocabulary size, i.e. the length of the next vector."""
        return len(self._vocab)

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Lower-case ``text`` and return its ``\\w+`` runs longer than 2 chars."""
        return [t for t in TfidfEmbeddingBackend._TOKEN_RE.findall(text.lower()) if len(t) > 2]

    def _update_idf(self, tokens: list[str]) -> None:
        """Count ``tokens`` as one new document and refresh every IDF weight.

        The weight is ``log((N + 1) / (df + 1)) + 1``. All terms are
        recomputed because ``N`` changes on every call.
        """
        self._doc_count += 1
        for t in set(tokens):
            self._doc_freq[t] = self._doc_freq.get(t, 0) + 1
        for t, df in self._doc_freq.items():
            self._idf[t] = math.log((self._doc_count + 1) / (df + 1)) + 1.0

    def embed(self, text: str) -> list[float]:
        """Return an L2-normalised TF-IDF vector for ``text``.

        Side effects: new tokens are added to the vocabulary and the text is
        counted as a document in the IDF statistics.

        Returns:
            A list whose length equals the vocabulary size after this call,
            or ``[]`` when ``text`` yields no tokens (in which case no state
            is modified).
        """
        tokens = self._tokenize(text)
        if not tokens:
            return []
        for t in tokens:
            if t not in self._vocab:
                self._vocab[t] = len(self._vocab)
        self._update_idf(tokens)
        dim = len(self._vocab)
        vec = [0.0] * dim
        tf: dict[str, int] = {}
        for t in tokens:
            tf[t] = tf.get(t, 0) + 1
        for t, count in tf.items():
            vec[self._vocab[t]] = (count / len(tokens)) * self._idf.get(t, 1.0)
        norm = math.sqrt(sum(v * v for v in vec))
        if norm > 0:
            vec = [v / norm for v in vec]
        return vec

    def similarity(self, a: list[float], b: list[float]) -> float:
        """Return the cosine similarity of two vectors, limited to [0, 1].

        Vectors of different length are zero-padded to the longer length
        first, since vectors produced before the vocabulary grew are shorter.
        The inputs themselves are not modified.
        """
        if len(a) != len(b):
            mx = max(len(a), len(b))
            a = a + [0.0] * (mx - len(a))
            b = b + [0.0] * (mx - len(b))
        # Upper bound added: rounding can make the cosine of (near-)identical
        # vectors come out slightly above 1.0.
        return min(1.0, max(0.0, cosine_similarity(a, b)))
|
|
|
|
|
|
def get_embedding_backend(prefer: str | None = None, ollama_url: str | None = None,
                          model: str | None = None) -> EmbeddingBackend:
    """Auto-detect best available embedding backend. Priority: Ollama > TF-IDF.

    Args:
        prefer: ``"ollama"``, ``"tfidf"`` or ``None`` for auto-detection.
            When falsy, the ``MNEMOSYNE_EMBED_BACKEND`` environment variable
            is consulted instead. Matching ignores case and surrounding
            whitespace; an empty value means auto-detection.
        ollama_url: Passed to ``OllamaEmbeddingBackend`` as ``base_url``.
        model: Passed to ``OllamaEmbeddingBackend`` as ``model``.

    Returns:
        An ``OllamaEmbeddingBackend`` when it reports itself available and
        was requested or auto-detected, otherwise a ``TfidfEmbeddingBackend``.

    Raises:
        RuntimeError: If Ollama was explicitly requested but is unavailable.
    """
    requested = prefer or os.environ.get("MNEMOSYNE_EMBED_BACKEND") or ""
    # Normalise so that "Ollama", " tfidf " and an empty/blank environment
    # variable behave the way a user would expect.
    effective = requested.strip().lower() or None

    if effective == "tfidf":
        return TfidfEmbeddingBackend()

    if effective in (None, "ollama"):
        ollama = OllamaEmbeddingBackend(base_url=ollama_url, model=model)
        if ollama._check_available():
            return ollama
        if effective == "ollama":
            raise RuntimeError("Ollama backend requested but not available")

    # Reached when auto-detection found no Ollama, or when the preference is
    # a name this function does not recognise.
    return TfidfEmbeddingBackend()
|