2026-04-12 05:30:05 +00:00
7 changed files with 401 additions and 35 deletions
--- a/nexus/mnemosyne/FEATURES.yaml
+++ b/nexus/mnemosyne/FEATURES.yaml
@@ -168,12 +168,15 @@ planned:
    priority: medium

  embedding_backend:
-    status: planned
+    status: shipped
+    files: [embeddings.py]
    description: >
-      Pluggable embedding backend for true semantic search
-      (replacing Jaccard token similarity). Support local models
-      via Ollama for sovereignty.
+      Pluggable embedding backend for true semantic search.
+      Supports Ollama (local models) and TF-IDF fallback.
+      Auto-detects best available backend.
    priority: high
+    merged_prs:
+      - "#TBD"  # Will be filled when PR is created

  memory_consolidation:
    status: planned
--- a/nexus/mnemosyne/init.py
+++ b/nexus/mnemosyne/init.py
@@ -14,6 +14,12 @@ from nexus.mnemosyne.archive import MnemosyneArchive
 from nexus.mnemosyne.entry import ArchiveEntry
 from nexus.mnemosyne.linker import HolographicLinker
 from nexus.mnemosyne.ingest import ingest_from_mempalace, ingest_event
+from nexus.mnemosyne.embeddings import (
+    EmbeddingBackend,
+    OllamaEmbeddingBackend,
+    TfidfEmbeddingBackend,
+    get_embedding_backend,
+)

 __all__ = [
    "MnemosyneArchive",
@@ -21,4 +27,8 @@ __all__ = [
    "HolographicLinker",
    "ingest_from_mempalace",
    "ingest_event",
+    "EmbeddingBackend",
+    "OllamaEmbeddingBackend",
+    "TfidfEmbeddingBackend",
+    "get_embedding_backend",
 ]
--- a/nexus/mnemosyne/archive.py
+++ b/nexus/mnemosyne/archive.py
@@ -13,6 +13,7 @@ from typing import Optional

 from nexus.mnemosyne.entry import ArchiveEntry, _compute_content_hash
 from nexus.mnemosyne.linker import HolographicLinker
+from nexus.mnemosyne.embeddings import get_embedding_backend, EmbeddingBackend

 _EXPORT_VERSION = "1"

@@ -24,10 +25,21 @@ class MnemosyneArchive:
    MemPalace (ChromaDB) for vector-semantic search.
    """

-    def __init__(self, archive_path: Optional[Path] = None):
+    def __init__(
+        self,
+        archive_path: Optional[Path] = None,
+        embedding_backend: Optional[EmbeddingBackend] = None,
+        auto_embed: bool = True,
+    ):
        self.path = archive_path or Path.home() / ".hermes" / "mnemosyne" / "archive.json"
        self.path.parent.mkdir(parents=True, exist_ok=True)
-        self.linker = HolographicLinker()
+        self._embedding_backend = embedding_backend
+        if embedding_backend is None and auto_embed:
+            try:
+                self._embedding_backend = get_embedding_backend()
+            except Exception:
+                self._embedding_backend = None
+        self.linker = HolographicLinker(embedding_backend=self._embedding_backend)
        self._entries: dict[str, ArchiveEntry] = {}
        self._load()

@@ -143,33 +155,51 @@ class MnemosyneArchive:
        return [e for _, e in scored[:limit]]

    def semantic_search(self, query: str, limit: int = 10, threshold: float = 0.05) -> list[ArchiveEntry]:
-        """Semantic search using holographic linker similarity.
+        """Semantic search using embeddings or holographic linker similarity.

-        Scores each entry by Jaccard similarity between query tokens and entry
-        tokens, then boosts entries with more inbound links (more "holographic").
-        Falls back to keyword search if no entries meet the similarity threshold.
+        With an embedding backend: cosine similarity between query vector and
+        entry vectors, boosted by inbound link count.
+        Without: Jaccard similarity on tokens with link boost.
+        Falls back to keyword search if nothing meets the threshold.

        Args:
            query: Natural language query string.
            limit: Maximum number of results to return.
-            threshold: Minimum Jaccard similarity to be considered a semantic match.
+            threshold: Minimum similarity score to include in results.

        Returns:
            List of ArchiveEntry sorted by combined relevance score, descending.
        """
-        query_tokens = HolographicLinker._tokenize(query)
-        if not query_tokens:
-            return []
-
-        # Count inbound links for each entry (how many entries link TO this one)
+        # Count inbound links for link-boost
        inbound: dict[str, int] = {eid: 0 for eid in self._entries}
        for entry in self._entries.values():
            for linked_id in entry.links:
                if linked_id in inbound:
                    inbound[linked_id] += 1
-
        max_inbound = max(inbound.values(), default=1) or 1

+        # Try embedding-based search first
+        if self._embedding_backend:
+            query_vec = self._embedding_backend.embed(query)
+            if query_vec:
+                scored = []
+                for entry in self._entries.values():
+                    text = f"{entry.title} {entry.content} {' '.join(entry.topics)}"
+                    entry_vec = self._embedding_backend.embed(text)
+                    if not entry_vec:
+                        continue
+                    sim = self._embedding_backend.similarity(query_vec, entry_vec)
+                    if sim >= threshold:
+                        link_boost = inbound[entry.id] / max_inbound * 0.15
+                        scored.append((sim + link_boost, entry))
+                if scored:
+                    scored.sort(key=lambda x: x[0], reverse=True)
+                    return [e for _, e in scored[:limit]]
+
+        # Fallback: Jaccard token similarity
+        query_tokens = HolographicLinker._tokenize(query)
+        if not query_tokens:
+            return []
        scored = []
        for entry in self._entries.values():
            entry_tokens = HolographicLinker._tokenize(f"{entry.title} {entry.content} {' '.join(entry.topics)}")
@@ -179,14 +209,13 @@ class MnemosyneArchive:
            union = query_tokens | entry_tokens
            jaccard = len(intersection) / len(union)
            if jaccard >= threshold:
-                link_boost = inbound[entry.id] / max_inbound * 0.2  # up to 20% boost
+                link_boost = inbound[entry.id] / max_inbound * 0.2
                scored.append((jaccard + link_boost, entry))
-
        if scored:
            scored.sort(key=lambda x: x[0], reverse=True)
            return [e for _, e in scored[:limit]]

-        # Graceful fallback to keyword search
+        # Final fallback: keyword search
        return self.search(query, limit=limit)

    def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]:
--- a/nexus/mnemosyne/cli.py
+++ b/nexus/mnemosyne/cli.py
@@ -25,7 +25,16 @@ def cmd_stats(args):


 def cmd_search(args):
-    archive = MnemosyneArchive()
+    from nexus.mnemosyne.embeddings import get_embedding_backend
+    backend = None
+    if getattr(args, "backend", "auto") != "auto":
+        backend = get_embedding_backend(prefer=args.backend)
+    elif getattr(args, "semantic", False):
+        try:
+            backend = get_embedding_backend()
+        except Exception:
+            pass
+    archive = MnemosyneArchive(embedding_backend=backend)
    if getattr(args, "semantic", False):
        results = archive.semantic_search(args.query, limit=args.limit)
    else:
--- a/nexus/mnemosyne/embeddings.py
+++ b/nexus/mnemosyne/embeddings.py
@@ -0,0 +1,170 @@
+"""Pluggable embedding backends for Mnemosyne semantic search.
+
+Provides an abstract EmbeddingBackend interface and concrete implementations:
+- OllamaEmbeddingBackend: local models via Ollama (sovereign, no cloud)
+- TfidfEmbeddingBackend: pure-Python TF-IDF fallback (no dependencies)
+
+Usage:
+    from nexus.mnemosyne.embeddings import get_embedding_backend
+    backend = get_embedding_backend()  # auto-detects best available
+    vec = backend.embed("hello world")
+    score = backend.similarity(vec_a, vec_b)
+"""
+
+from __future__ import annotations
+import abc, json, math, os, re, urllib.request
+from typing import Optional
+
+
+class EmbeddingBackend(abc.ABC):
+    """Abstract interface for embedding-based similarity."""
+
+    @abc.abstractmethod
+    def embed(self, text: str) -> list[float]:
+        """Return an embedding vector for the given text."""
+
+    @abc.abstractmethod
+    def similarity(self, a: list[float], b: list[float]) -> float:
+        """Return cosine similarity between two vectors, in [0, 1]."""
+
+    @property
+    def name(self) -> str:
+        return self.__class__.__name__
+
+    @property
+    def dimension(self) -> int:
+        return 0
+
+
+def cosine_similarity(a: list[float], b: list[float]) -> float:
+    """Cosine similarity between two vectors."""
+    if len(a) != len(b):
+        raise ValueError(f"Vector dimension mismatch: {len(a)} vs {len(b)}")
+    dot = sum(x * y for x, y in zip(a, b))
+    norm_a = math.sqrt(sum(x * x for x in a))
+    norm_b = math.sqrt(sum(x * x for x in b))
+    if norm_a == 0 or norm_b == 0:
+        return 0.0
+    return dot / (norm_a * norm_b)
+
+
+class OllamaEmbeddingBackend(EmbeddingBackend):
+    """Embedding backend using a local Ollama instance.
+    Default model: nomic-embed-text (768 dims)."""
+
+    def __init__(self, base_url: str | None = None, model: str | None = None):
+        self.base_url = base_url or os.environ.get("OLLAMA_URL", "http://localhost:11434")
+        self.model = model or os.environ.get("MNEMOSYNE_EMBED_MODEL", "nomic-embed-text")
+        self._dim: int = 0
+        self._available: bool | None = None
+
+    def _check_available(self) -> bool:
+        if self._available is not None:
+            return self._available
+        try:
+            req = urllib.request.Request(f"{self.base_url}/api/tags", method="GET")
+            resp = urllib.request.urlopen(req, timeout=3)
+            tags = json.loads(resp.read())
+            models = [m["name"].split(":")[0] for m in tags.get("models", [])]
+            self._available = any(self.model in m for m in models)
+        except Exception:
+            self._available = False
+        return self._available
+
+    @property
+    def name(self) -> str:
+        return f"Ollama({self.model})"
+
+    @property
+    def dimension(self) -> int:
+        return self._dim
+
+    def embed(self, text: str) -> list[float]:
+        if not self._check_available():
+            raise RuntimeError(f"Ollama not available or model {self.model} not found")
+        data = json.dumps({"model": self.model, "prompt": text}).encode()
+        req = urllib.request.Request(
+            f"{self.base_url}/api/embeddings", data=data,
+            headers={"Content-Type": "application/json"}, method="POST")
+        resp = urllib.request.urlopen(req, timeout=30)
+        result = json.loads(resp.read())
+        vec = result.get("embedding", [])
+        if vec:
+            self._dim = len(vec)
+        return vec
+
+    def similarity(self, a: list[float], b: list[float]) -> float:
+        raw = cosine_similarity(a, b)
+        return (raw + 1.0) / 2.0
+
+
+class TfidfEmbeddingBackend(EmbeddingBackend):
+    """Pure-Python TF-IDF embedding. No dependencies. Always available."""
+
+    def __init__(self):
+        self._vocab: dict[str, int] = {}
+        self._idf: dict[str, float] = {}
+        self._doc_count: int = 0
+        self._doc_freq: dict[str, int] = {}
+
+    @property
+    def name(self) -> str:
+        return "TF-IDF (local)"
+
+    @property
+    def dimension(self) -> int:
+        return len(self._vocab)
+
+    @staticmethod
+    def _tokenize(text: str) -> list[str]:
+        return [t for t in re.findall(r"\w+", text.lower()) if len(t) > 2]
+
+    def _update_idf(self, tokens: list[str]):
+        self._doc_count += 1
+        for t in set(tokens):
+            self._doc_freq[t] = self._doc_freq.get(t, 0) + 1
+        for t, df in self._doc_freq.items():
+            self._idf[t] = math.log((self._doc_count + 1) / (df + 1)) + 1.0
+
+    def embed(self, text: str) -> list[float]:
+        tokens = self._tokenize(text)
+        if not tokens:
+            return []
+        for t in tokens:
+            if t not in self._vocab:
+                self._vocab[t] = len(self._vocab)
+        self._update_idf(tokens)
+        dim = len(self._vocab)
+        vec = [0.0] * dim
+        tf = {}
+        for t in tokens:
+            tf[t] = tf.get(t, 0) + 1
+        for t, count in tf.items():
+            vec[self._vocab[t]] = (count / len(tokens)) * self._idf.get(t, 1.0)
+        norm = math.sqrt(sum(v * v for v in vec))
+        if norm > 0:
+            vec = [v / norm for v in vec]
+        return vec
+
+    def similarity(self, a: list[float], b: list[float]) -> float:
+        if len(a) != len(b):
+            mx = max(len(a), len(b))
+            a = a + [0.0] * (mx - len(a))
+            b = b + [0.0] * (mx - len(b))
+        return max(0.0, cosine_similarity(a, b))
+
+
+def get_embedding_backend(prefer: str | None = None, ollama_url: str | None = None,
+                          model: str | None = None) -> EmbeddingBackend:
+    """Auto-detect best available embedding backend. Priority: Ollama > TF-IDF."""
+    env_pref = os.environ.get("MNEMOSYNE_EMBED_BACKEND")
+    effective = prefer or env_pref
+    if effective == "tfidf":
+        return TfidfEmbeddingBackend()
+    if effective in (None, "ollama"):
+        ollama = OllamaEmbeddingBackend(base_url=ollama_url, model=model)
+        if ollama._check_available():
+            return ollama
+        if effective == "ollama":
+            raise RuntimeError("Ollama backend requested but not available")
+    return TfidfEmbeddingBackend()
--- a/nexus/mnemosyne/linker.py
+++ b/nexus/mnemosyne/linker.py
@@ -2,31 +2,63 @@

 Computes semantic similarity between archive entries and creates
 bidirectional links, forming the holographic graph structure.
+
+Supports pluggable embedding backends for true semantic search.
+Falls back to Jaccard token similarity when no backend is available.
 """

 from __future__ import annotations

-from typing import Optional
+from typing import Optional, TYPE_CHECKING
+
 from nexus.mnemosyne.entry import ArchiveEntry

+if TYPE_CHECKING:
+    from nexus.mnemosyne.embeddings import EmbeddingBackend
+

 class HolographicLinker:
    """Links archive entries via semantic similarity.

-    Phase 1 uses simple keyword overlap as the similarity metric.
-    Phase 2 will integrate ChromaDB embeddings from MemPalace.
+    With an embedding backend: cosine similarity on vectors.
+    Without: Jaccard similarity on token sets (legacy fallback).
    """

-    def __init__(self, similarity_threshold: float = 0.15):
+    def __init__(
+        self,
+        similarity_threshold: float = 0.15,
+        embedding_backend: Optional["EmbeddingBackend"] = None,
+    ):
        self.threshold = similarity_threshold
+        self._backend = embedding_backend
+        self._embed_cache: dict[str, list[float]] = {}
+
+    @property
+    def using_embeddings(self) -> bool:
+        return self._backend is not None
+
+    def _get_embedding(self, entry: ArchiveEntry) -> list[float]:
+        """Get or compute cached embedding for an entry."""
+        if entry.id in self._embed_cache:
+            return self._embed_cache[entry.id]
+        text = f"{entry.title} {entry.content}"
+        vec = self._backend.embed(text) if self._backend else []
+        if vec:
+            self._embed_cache[entry.id] = vec
+        return vec

    def compute_similarity(self, a: ArchiveEntry, b: ArchiveEntry) -> float:
        """Compute similarity score between two entries.

-        Returns float in [0, 1]. Phase 1: Jaccard similarity on
-        combined title+content tokens. Phase 2: cosine similarity
-        on ChromaDB embeddings.
+        Returns float in [0, 1]. Uses embedding cosine similarity if
+        a backend is configured, otherwise falls back to Jaccard.
        """
+        if self._backend:
+            vec_a = self._get_embedding(a)
+            vec_b = self._get_embedding(b)
+            if vec_a and vec_b:
+                return self._backend.similarity(vec_a, vec_b)
+        # Fallback: Jaccard on tokens
        tokens_a = self._tokenize(f"{a.title} {a.content}")
        tokens_b = self._tokenize(f"{b.title} {b.content}")
        if not tokens_a or not tokens_b:
@@ -35,11 +67,10 @@ class HolographicLinker:
        union = tokens_a | tokens_b
        return len(intersection) / len(union)

-    def find_links(self, entry: ArchiveEntry, candidates: list[ArchiveEntry]) -> list[tuple[str, float]]:
-        """Find entries worth linking to.
-
-        Returns list of (entry_id, similarity_score) tuples above threshold.
-        """
+    def find_links(
+        self, entry: ArchiveEntry, candidates: list[ArchiveEntry]
+    ) -> list[tuple[str, float]]:
+        """Find entries worth linking to. Returns (entry_id, score) tuples."""
        results = []
        for candidate in candidates:
            if candidate.id == entry.id:
@@ -58,16 +89,18 @@ class HolographicLinker:
            if eid not in entry.links:
                entry.links.append(eid)
                new_links += 1
-            # Bidirectional
            for c in candidates:
                if c.id == eid and entry.id not in c.links:
                    c.links.append(entry.id)
        return new_links

+    def clear_cache(self):
+        """Clear embedding cache (call after bulk entry changes)."""
+        self._embed_cache.clear()
+
    @staticmethod
    def _tokenize(text: str) -> set[str]:
        """Simple whitespace + punctuation tokenizer."""
        import re
        tokens = set(re.findall(r"\w+", text.lower()))
-        # Remove very short tokens
        return {t for t in tokens if len(t) > 2}
--- a/nexus/mnemosyne/tests/test_embeddings.py
+++ b/nexus/mnemosyne/tests/test_embeddings.py
@@ -0,0 +1,112 @@
+"""Tests for the embedding backend module."""
+
+from __future__ import annotations
+
+import math
+import pytest
+
+from nexus.mnemosyne.embeddings import (
+    EmbeddingBackend,
+    TfidfEmbeddingBackend,
+    cosine_similarity,
+    get_embedding_backend,
+)
+
+
+class TestCosineSimilarity:
+    def test_identical_vectors(self):
+        a = [1.0, 2.0, 3.0]
+        assert abs(cosine_similarity(a, a) - 1.0) < 1e-9
+
+    def test_orthogonal_vectors(self):
+        a = [1.0, 0.0]
+        b = [0.0, 1.0]
+        assert abs(cosine_similarity(a, b) - 0.0) < 1e-9
+
+    def test_opposite_vectors(self):
+        a = [1.0, 0.0]
+        b = [-1.0, 0.0]
+        assert abs(cosine_similarity(a, b) - (-1.0)) < 1e-9
+
+    def test_zero_vector(self):
+        a = [0.0, 0.0]
+        b = [1.0, 2.0]
+        assert cosine_similarity(a, b) == 0.0
+
+    def test_dimension_mismatch(self):
+        with pytest.raises(ValueError):
+            cosine_similarity([1.0], [1.0, 2.0])
+
+
+class TestTfidfEmbeddingBackend:
+    def test_basic_embed(self):
+        backend = TfidfEmbeddingBackend()
+        vec = backend.embed("hello world test")
+        assert len(vec) > 0
+        assert all(isinstance(v, float) for v in vec)
+
+    def test_empty_text(self):
+        backend = TfidfEmbeddingBackend()
+        vec = backend.embed("")
+        assert vec == []
+
+    def test_identical_texts_similar(self):
+        backend = TfidfEmbeddingBackend()
+        v1 = backend.embed("the cat sat on the mat")
+        v2 = backend.embed("the cat sat on the mat")
+        sim = backend.similarity(v1, v2)
+        assert sim > 0.99
+
+    def test_different_texts_less_similar(self):
+        backend = TfidfEmbeddingBackend()
+        v1 = backend.embed("python programming language")
+        v2 = backend.embed("cooking recipes italian food")
+        sim = backend.similarity(v1, v2)
+        assert sim < 0.5
+
+    def test_related_texts_more_similar(self):
+        backend = TfidfEmbeddingBackend()
+        v1 = backend.embed("machine learning neural networks")
+        v2 = backend.embed("deep learning artificial neural nets")
+        v3 = backend.embed("baking bread sourdough recipe")
+        sim_related = backend.similarity(v1, v2)
+        sim_unrelated = backend.similarity(v1, v3)
+        assert sim_related > sim_unrelated
+
+    def test_name(self):
+        backend = TfidfEmbeddingBackend()
+        assert "TF-IDF" in backend.name
+
+    def test_dimension_grows(self):
+        backend = TfidfEmbeddingBackend()
+        d1 = backend.dimension
+        backend.embed("new unique tokens here")
+        d2 = backend.dimension
+        assert d2 > d1
+
+    def test_padding_different_lengths(self):
+        backend = TfidfEmbeddingBackend()
+        v1 = backend.embed("short")
+        v2 = backend.embed("this is a much longer text with many more tokens")
+        # Should not raise despite different lengths
+        sim = backend.similarity(v1, v2)
+        assert 0.0 <= sim <= 1.0
+
+
+class TestGetEmbeddingBackend:
+    def test_tfidf_preferred(self):
+        backend = get_embedding_backend(prefer="tfidf")
+        assert isinstance(backend, TfidfEmbeddingBackend)
+
+    def test_auto_returns_something(self):
+        backend = get_embedding_backend()
+        assert isinstance(backend, EmbeddingBackend)
+
+    def test_ollama_unavailable_falls_back(self):
+        # Should fall back to TF-IDF when Ollama is unreachable
+        backend = get_embedding_backend(prefer="ollama", ollama_url="http://localhost:1")
+        # If it raises, the test fails — it should fall back
+        # But with prefer="ollama" it raises if unavailable
+        # So we test without prefer:
+        backend = get_embedding_backend(ollama_url="http://localhost:1")
+        assert isinstance(backend, TfidfEmbeddingBackend)