diff --git a/nexus/mnemosyne/FEATURES.yaml b/nexus/mnemosyne/FEATURES.yaml index 6f3ce99..8e8cb87 100644 --- a/nexus/mnemosyne/FEATURES.yaml +++ b/nexus/mnemosyne/FEATURES.yaml @@ -168,12 +168,15 @@ planned: priority: medium embedding_backend: - status: planned + status: shipped + files: [embeddings.py] description: > - Pluggable embedding backend for true semantic search - (replacing Jaccard token similarity). Support local models - via Ollama for sovereignty. + Pluggable embedding backend for true semantic search. + Supports Ollama (local models) and TF-IDF fallback. + Auto-detects best available backend. priority: high + merged_prs: + - "#TBD" # Will be filled when PR is created memory_consolidation: status: planned diff --git a/nexus/mnemosyne/__init__.py b/nexus/mnemosyne/__init__.py index 8a6b176..b7597f8 100644 --- a/nexus/mnemosyne/__init__.py +++ b/nexus/mnemosyne/__init__.py @@ -14,6 +14,12 @@ from nexus.mnemosyne.archive import MnemosyneArchive from nexus.mnemosyne.entry import ArchiveEntry from nexus.mnemosyne.linker import HolographicLinker from nexus.mnemosyne.ingest import ingest_from_mempalace, ingest_event +from nexus.mnemosyne.embeddings import ( + EmbeddingBackend, + OllamaEmbeddingBackend, + TfidfEmbeddingBackend, + get_embedding_backend, +) __all__ = [ "MnemosyneArchive", @@ -21,4 +27,8 @@ __all__ = [ "HolographicLinker", "ingest_from_mempalace", "ingest_event", + "EmbeddingBackend", + "OllamaEmbeddingBackend", + "TfidfEmbeddingBackend", + "get_embedding_backend", ] diff --git a/nexus/mnemosyne/archive.py b/nexus/mnemosyne/archive.py index 50ba7ce..7099879 100644 --- a/nexus/mnemosyne/archive.py +++ b/nexus/mnemosyne/archive.py @@ -13,6 +13,7 @@ from typing import Optional from nexus.mnemosyne.entry import ArchiveEntry, _compute_content_hash from nexus.mnemosyne.linker import HolographicLinker +from nexus.mnemosyne.embeddings import get_embedding_backend, EmbeddingBackend _EXPORT_VERSION = "1" @@ -24,10 +25,21 @@ class MnemosyneArchive: 
MemPalace (ChromaDB) for vector-semantic search. """ - def __init__(self, archive_path: Optional[Path] = None): + def __init__( + self, + archive_path: Optional[Path] = None, + embedding_backend: Optional[EmbeddingBackend] = None, + auto_embed: bool = True, + ): self.path = archive_path or Path.home() / ".hermes" / "mnemosyne" / "archive.json" self.path.parent.mkdir(parents=True, exist_ok=True) - self.linker = HolographicLinker() + self._embedding_backend = embedding_backend + if embedding_backend is None and auto_embed: + try: + self._embedding_backend = get_embedding_backend() + except Exception: + self._embedding_backend = None + self.linker = HolographicLinker(embedding_backend=self._embedding_backend) self._entries: dict[str, ArchiveEntry] = {} self._load() @@ -143,33 +155,51 @@ class MnemosyneArchive: return [e for _, e in scored[:limit]] def semantic_search(self, query: str, limit: int = 10, threshold: float = 0.05) -> list[ArchiveEntry]: - """Semantic search using holographic linker similarity. + """Semantic search using embeddings or holographic linker similarity. - Scores each entry by Jaccard similarity between query tokens and entry - tokens, then boosts entries with more inbound links (more "holographic"). - Falls back to keyword search if no entries meet the similarity threshold. + With an embedding backend: cosine similarity between query vector and + entry vectors, boosted by inbound link count. + Without: Jaccard similarity on tokens with link boost. + Falls back to keyword search if nothing meets the threshold. Args: query: Natural language query string. limit: Maximum number of results to return. - threshold: Minimum Jaccard similarity to be considered a semantic match. + threshold: Minimum similarity score to include in results. Returns: List of ArchiveEntry sorted by combined relevance score, descending. 
""" - query_tokens = HolographicLinker._tokenize(query) - if not query_tokens: - return [] - - # Count inbound links for each entry (how many entries link TO this one) + # Count inbound links for link-boost inbound: dict[str, int] = {eid: 0 for eid in self._entries} for entry in self._entries.values(): for linked_id in entry.links: if linked_id in inbound: inbound[linked_id] += 1 - max_inbound = max(inbound.values(), default=1) or 1 + # Try embedding-based search first + if self._embedding_backend: + query_vec = self._embedding_backend.embed(query) + if query_vec: + scored = [] + for entry in self._entries.values(): + text = f"{entry.title} {entry.content} {' '.join(entry.topics)}" + entry_vec = self._embedding_backend.embed(text) + if not entry_vec: + continue + sim = self._embedding_backend.similarity(query_vec, entry_vec) + if sim >= threshold: + link_boost = inbound[entry.id] / max_inbound * 0.15 + scored.append((sim + link_boost, entry)) + if scored: + scored.sort(key=lambda x: x[0], reverse=True) + return [e for _, e in scored[:limit]] + + # Fallback: Jaccard token similarity + query_tokens = HolographicLinker._tokenize(query) + if not query_tokens: + return [] scored = [] for entry in self._entries.values(): entry_tokens = HolographicLinker._tokenize(f"{entry.title} {entry.content} {' '.join(entry.topics)}") @@ -179,14 +209,13 @@ class MnemosyneArchive: union = query_tokens | entry_tokens jaccard = len(intersection) / len(union) if jaccard >= threshold: - link_boost = inbound[entry.id] / max_inbound * 0.2 # up to 20% boost + link_boost = inbound[entry.id] / max_inbound * 0.2 scored.append((jaccard + link_boost, entry)) - if scored: scored.sort(key=lambda x: x[0], reverse=True) return [e for _, e in scored[:limit]] - # Graceful fallback to keyword search + # Final fallback: keyword search return self.search(query, limit=limit) def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]: diff --git a/nexus/mnemosyne/cli.py 
b/nexus/mnemosyne/cli.py index 3a506bb..af1823d 100644 --- a/nexus/mnemosyne/cli.py +++ b/nexus/mnemosyne/cli.py @@ -25,7 +25,16 @@ def cmd_stats(args): def cmd_search(args): - archive = MnemosyneArchive() + from nexus.mnemosyne.embeddings import get_embedding_backend + backend = None + if getattr(args, "backend", "auto") != "auto": + backend = get_embedding_backend(prefer=args.backend) + elif getattr(args, "semantic", False): + try: + backend = get_embedding_backend() + except Exception: + pass + archive = MnemosyneArchive(embedding_backend=backend) if getattr(args, "semantic", False): results = archive.semantic_search(args.query, limit=args.limit) else: diff --git a/nexus/mnemosyne/embeddings.py b/nexus/mnemosyne/embeddings.py new file mode 100644 index 0000000..b00296e --- /dev/null +++ b/nexus/mnemosyne/embeddings.py @@ -0,0 +1,170 @@ +"""Pluggable embedding backends for Mnemosyne semantic search. + +Provides an abstract EmbeddingBackend interface and concrete implementations: +- OllamaEmbeddingBackend: local models via Ollama (sovereign, no cloud) +- TfidfEmbeddingBackend: pure-Python TF-IDF fallback (no dependencies) + +Usage: + from nexus.mnemosyne.embeddings import get_embedding_backend + backend = get_embedding_backend() # auto-detects best available + vec = backend.embed("hello world") + score = backend.similarity(vec_a, vec_b) +""" + +from __future__ import annotations +import abc, json, math, os, re, urllib.request +from typing import Optional + + +class EmbeddingBackend(abc.ABC): + """Abstract interface for embedding-based similarity.""" + + @abc.abstractmethod + def embed(self, text: str) -> list[float]: + """Return an embedding vector for the given text.""" + + @abc.abstractmethod + def similarity(self, a: list[float], b: list[float]) -> float: + """Return cosine similarity between two vectors, in [0, 1].""" + + @property + def name(self) -> str: + return self.__class__.__name__ + + @property + def dimension(self) -> int: + return 0 + + +def 
cosine_similarity(a: list[float], b: list[float]) -> float: + """Cosine similarity between two vectors.""" + if len(a) != len(b): + raise ValueError(f"Vector dimension mismatch: {len(a)} vs {len(b)}") + dot = sum(x * y for x, y in zip(a, b)) + norm_a = math.sqrt(sum(x * x for x in a)) + norm_b = math.sqrt(sum(x * x for x in b)) + if norm_a == 0 or norm_b == 0: + return 0.0 + return dot / (norm_a * norm_b) + + +class OllamaEmbeddingBackend(EmbeddingBackend): + """Embedding backend using a local Ollama instance. + Default model: nomic-embed-text (768 dims).""" + + def __init__(self, base_url: str | None = None, model: str | None = None): + self.base_url = base_url or os.environ.get("OLLAMA_URL", "http://localhost:11434") + self.model = model or os.environ.get("MNEMOSYNE_EMBED_MODEL", "nomic-embed-text") + self._dim: int = 0 + self._available: bool | None = None + + def _check_available(self) -> bool: + if self._available is not None: + return self._available + try: + req = urllib.request.Request(f"{self.base_url}/api/tags", method="GET") + with urllib.request.urlopen(req, timeout=3) as resp: + tags = json.loads(resp.read()) + models = [m["name"] for m in tags.get("models", [])] + self._available = any(self.model in (m, m.split(":")[0]) for m in models) + except Exception: + self._available = False + return self._available + + @property + def name(self) -> str: + return f"Ollama({self.model})" + + @property + def dimension(self) -> int: + return self._dim + + def embed(self, text: str) -> list[float]: + if not self._check_available(): + raise RuntimeError(f"Ollama not available or model {self.model} not found") + data = json.dumps({"model": self.model, "prompt": text}).encode() + req = urllib.request.Request( + f"{self.base_url}/api/embeddings", data=data, + headers={"Content-Type": "application/json"}, method="POST") + with urllib.request.urlopen(req, timeout=30) as resp: + result = json.loads(resp.read()) + vec = result.get("embedding", []) + if vec: + self._dim = len(vec) + return
vec + + def similarity(self, a: list[float], b: list[float]) -> float: + raw = cosine_similarity(a, b) + return (raw + 1.0) / 2.0 + + +class TfidfEmbeddingBackend(EmbeddingBackend): + """Pure-Python TF-IDF embedding. No dependencies. Always available.""" + + def __init__(self): + self._vocab: dict[str, int] = {} + self._idf: dict[str, float] = {} + self._doc_count: int = 0 + self._doc_freq: dict[str, int] = {} + + @property + def name(self) -> str: + return "TF-IDF (local)" + + @property + def dimension(self) -> int: + return len(self._vocab) + + @staticmethod + def _tokenize(text: str) -> list[str]: + return [t for t in re.findall(r"\w+", text.lower()) if len(t) > 2] + + def _update_idf(self, tokens: list[str]): + self._doc_count += 1 + for t in set(tokens): + self._doc_freq[t] = self._doc_freq.get(t, 0) + 1 + for t, df in self._doc_freq.items(): + self._idf[t] = math.log((self._doc_count + 1) / (df + 1)) + 1.0 + + def embed(self, text: str) -> list[float]: + tokens = self._tokenize(text) + if not tokens: + return [] + for t in tokens: + if t not in self._vocab: + self._vocab[t] = len(self._vocab) + self._update_idf(tokens) + dim = len(self._vocab) + vec = [0.0] * dim + tf = {} + for t in tokens: + tf[t] = tf.get(t, 0) + 1 + for t, count in tf.items(): + vec[self._vocab[t]] = (count / len(tokens)) * self._idf.get(t, 1.0) + norm = math.sqrt(sum(v * v for v in vec)) + if norm > 0: + vec = [v / norm for v in vec] + return vec + + def similarity(self, a: list[float], b: list[float]) -> float: + if len(a) != len(b): + mx = max(len(a), len(b)) + a = a + [0.0] * (mx - len(a)) + b = b + [0.0] * (mx - len(b)) + return max(0.0, cosine_similarity(a, b)) + + +def get_embedding_backend(prefer: str | None = None, ollama_url: str | None = None, + model: str | None = None) -> EmbeddingBackend: + """Auto-detect best available embedding backend. 
Priority: Ollama > TF-IDF.""" + env_pref = os.environ.get("MNEMOSYNE_EMBED_BACKEND") + effective = prefer or env_pref + if effective == "tfidf": + return TfidfEmbeddingBackend() + if effective in (None, "ollama"): + ollama = OllamaEmbeddingBackend(base_url=ollama_url, model=model) + if ollama._check_available(): + return ollama + if effective == "ollama": + raise RuntimeError("Ollama backend requested but not available") + return TfidfEmbeddingBackend() diff --git a/nexus/mnemosyne/linker.py b/nexus/mnemosyne/linker.py index f1f68d6..3b4309c 100644 --- a/nexus/mnemosyne/linker.py +++ b/nexus/mnemosyne/linker.py @@ -2,31 +2,63 @@ Computes semantic similarity between archive entries and creates bidirectional links, forming the holographic graph structure. + +Supports pluggable embedding backends for true semantic search. +Falls back to Jaccard token similarity when no backend is available. """ from __future__ import annotations -from typing import Optional +from typing import Optional, TYPE_CHECKING + from nexus.mnemosyne.entry import ArchiveEntry +if TYPE_CHECKING: + from nexus.mnemosyne.embeddings import EmbeddingBackend + class HolographicLinker: """Links archive entries via semantic similarity. - Phase 1 uses simple keyword overlap as the similarity metric. - Phase 2 will integrate ChromaDB embeddings from MemPalace. + With an embedding backend: cosine similarity on vectors. + Without: Jaccard similarity on token sets (legacy fallback). 
""" - def __init__(self, similarity_threshold: float = 0.15): + def __init__( + self, + similarity_threshold: float = 0.15, + embedding_backend: Optional["EmbeddingBackend"] = None, + ): self.threshold = similarity_threshold + self._backend = embedding_backend + self._embed_cache: dict[str, list[float]] = {} + + @property + def using_embeddings(self) -> bool: + return self._backend is not None + + def _get_embedding(self, entry: ArchiveEntry) -> list[float]: + """Get or compute cached embedding for an entry.""" + if entry.id in self._embed_cache: + return self._embed_cache[entry.id] + text = f"{entry.title} {entry.content}" + vec = self._backend.embed(text) if self._backend else [] + if vec: + self._embed_cache[entry.id] = vec + return vec def compute_similarity(self, a: ArchiveEntry, b: ArchiveEntry) -> float: """Compute similarity score between two entries. - Returns float in [0, 1]. Phase 1: Jaccard similarity on - combined title+content tokens. Phase 2: cosine similarity - on ChromaDB embeddings. + Returns float in [0, 1]. Uses embedding cosine similarity if + a backend is configured, otherwise falls back to Jaccard. """ + if self._backend: + vec_a = self._get_embedding(a) + vec_b = self._get_embedding(b) + if vec_a and vec_b: + return self._backend.similarity(vec_a, vec_b) + # Fallback: Jaccard on tokens tokens_a = self._tokenize(f"{a.title} {a.content}") tokens_b = self._tokenize(f"{b.title} {b.content}") if not tokens_a or not tokens_b: @@ -35,11 +67,10 @@ class HolographicLinker: union = tokens_a | tokens_b return len(intersection) / len(union) - def find_links(self, entry: ArchiveEntry, candidates: list[ArchiveEntry]) -> list[tuple[str, float]]: - """Find entries worth linking to. - - Returns list of (entry_id, similarity_score) tuples above threshold. - """ + def find_links( + self, entry: ArchiveEntry, candidates: list[ArchiveEntry] + ) -> list[tuple[str, float]]: + """Find entries worth linking to. 
Returns (entry_id, score) tuples.""" results = [] for candidate in candidates: if candidate.id == entry.id: @@ -58,16 +89,18 @@ class HolographicLinker: if eid not in entry.links: entry.links.append(eid) new_links += 1 - # Bidirectional for c in candidates: if c.id == eid and entry.id not in c.links: c.links.append(entry.id) return new_links + def clear_cache(self): + """Clear embedding cache (call after bulk entry changes).""" + self._embed_cache.clear() + @staticmethod def _tokenize(text: str) -> set[str]: """Simple whitespace + punctuation tokenizer.""" import re tokens = set(re.findall(r"\w+", text.lower())) - # Remove very short tokens return {t for t in tokens if len(t) > 2} diff --git a/nexus/mnemosyne/tests/test_embeddings.py b/nexus/mnemosyne/tests/test_embeddings.py new file mode 100644 index 0000000..e382523 --- /dev/null +++ b/nexus/mnemosyne/tests/test_embeddings.py @@ -0,0 +1,112 @@ +"""Tests for the embedding backend module.""" + +from __future__ import annotations + +import math +import pytest + +from nexus.mnemosyne.embeddings import ( + EmbeddingBackend, + TfidfEmbeddingBackend, + cosine_similarity, + get_embedding_backend, +) + + +class TestCosineSimilarity: + def test_identical_vectors(self): + a = [1.0, 2.0, 3.0] + assert abs(cosine_similarity(a, a) - 1.0) < 1e-9 + + def test_orthogonal_vectors(self): + a = [1.0, 0.0] + b = [0.0, 1.0] + assert abs(cosine_similarity(a, b) - 0.0) < 1e-9 + + def test_opposite_vectors(self): + a = [1.0, 0.0] + b = [-1.0, 0.0] + assert abs(cosine_similarity(a, b) - (-1.0)) < 1e-9 + + def test_zero_vector(self): + a = [0.0, 0.0] + b = [1.0, 2.0] + assert cosine_similarity(a, b) == 0.0 + + def test_dimension_mismatch(self): + with pytest.raises(ValueError): + cosine_similarity([1.0], [1.0, 2.0]) + + +class TestTfidfEmbeddingBackend: + def test_basic_embed(self): + backend = TfidfEmbeddingBackend() + vec = backend.embed("hello world test") + assert len(vec) > 0 + assert all(isinstance(v, float) for v in vec) + + def 
test_empty_text(self): + backend = TfidfEmbeddingBackend() + vec = backend.embed("") + assert vec == [] + + def test_identical_texts_similar(self): + backend = TfidfEmbeddingBackend() + v1 = backend.embed("the cat sat on the mat") + v2 = backend.embed("the cat sat on the mat") + sim = backend.similarity(v1, v2) + assert sim > 0.99 + + def test_different_texts_less_similar(self): + backend = TfidfEmbeddingBackend() + v1 = backend.embed("python programming language") + v2 = backend.embed("cooking recipes italian food") + sim = backend.similarity(v1, v2) + assert sim < 0.5 + + def test_related_texts_more_similar(self): + backend = TfidfEmbeddingBackend() + v1 = backend.embed("machine learning neural networks") + v2 = backend.embed("deep learning artificial neural nets") + v3 = backend.embed("baking bread sourdough recipe") + sim_related = backend.similarity(v1, v2) + sim_unrelated = backend.similarity(v1, v3) + assert sim_related > sim_unrelated + + def test_name(self): + backend = TfidfEmbeddingBackend() + assert "TF-IDF" in backend.name + + def test_dimension_grows(self): + backend = TfidfEmbeddingBackend() + d1 = backend.dimension + backend.embed("new unique tokens here") + d2 = backend.dimension + assert d2 > d1 + + def test_padding_different_lengths(self): + backend = TfidfEmbeddingBackend() + v1 = backend.embed("short") + v2 = backend.embed("this is a much longer text with many more tokens") + # Should not raise despite different lengths + sim = backend.similarity(v1, v2) + assert 0.0 <= sim <= 1.0 + + +class TestGetEmbeddingBackend: + def test_tfidf_preferred(self): + backend = get_embedding_backend(prefer="tfidf") + assert isinstance(backend, TfidfEmbeddingBackend) + + def test_auto_returns_something(self): + backend = get_embedding_backend() + assert isinstance(backend, EmbeddingBackend) + + def test_ollama_unavailable_falls_back(self): + # Auto-detection falls back to TF-IDF when Ollama is unreachable. + backend = get_embedding_backend(ollama_url="http://localhost:1")
+ assert isinstance(backend, TfidfEmbeddingBackend) + # An explicit prefer="ollama" request must raise rather than fall back, + # so it is checked separately from the auto-detect path. + with pytest.raises(RuntimeError): + get_embedding_backend(prefer="ollama", ollama_url="http://localhost:1")