diff --git a/nexus/mnemosyne/linker.py b/nexus/mnemosyne/linker.py index f1f68d62..3b4309c8 100644 --- a/nexus/mnemosyne/linker.py +++ b/nexus/mnemosyne/linker.py @@ -2,31 +2,63 @@ Computes semantic similarity between archive entries and creates bidirectional links, forming the holographic graph structure. + +Supports pluggable embedding backends for true semantic search. +Falls back to Jaccard token similarity when no backend is available. """ from __future__ import annotations -from typing import Optional +from typing import Optional, TYPE_CHECKING + from nexus.mnemosyne.entry import ArchiveEntry +if TYPE_CHECKING: + from nexus.mnemosyne.embeddings import EmbeddingBackend + class HolographicLinker: """Links archive entries via semantic similarity. - Phase 1 uses simple keyword overlap as the similarity metric. - Phase 2 will integrate ChromaDB embeddings from MemPalace. + With an embedding backend: cosine similarity on vectors. + Without: Jaccard similarity on token sets (legacy fallback). """ - def __init__(self, similarity_threshold: float = 0.15): + def __init__( + self, + similarity_threshold: float = 0.15, + embedding_backend: Optional["EmbeddingBackend"] = None, + ): self.threshold = similarity_threshold + self._backend = embedding_backend + self._embed_cache: dict[str, list[float]] = {} + + @property + def using_embeddings(self) -> bool: + return self._backend is not None + + def _get_embedding(self, entry: ArchiveEntry) -> list[float]: + """Get or compute cached embedding for an entry.""" + if entry.id in self._embed_cache: + return self._embed_cache[entry.id] + text = f"{entry.title} {entry.content}" + vec = self._backend.embed(text) if self._backend else [] + if vec: + self._embed_cache[entry.id] = vec + return vec def compute_similarity(self, a: ArchiveEntry, b: ArchiveEntry) -> float: """Compute similarity score between two entries. - Returns float in [0, 1]. Phase 1: Jaccard similarity on - combined title+content tokens. Phase 2: cosine similarity - on ChromaDB embeddings. + Returns float in [0, 1]. Uses embedding cosine similarity if + a backend is configured, otherwise falls back to Jaccard. """ + if self._backend: + vec_a = self._get_embedding(a) + vec_b = self._get_embedding(b) + if vec_a and vec_b: + return self._backend.similarity(vec_a, vec_b) + # Fallback: Jaccard on tokens tokens_a = self._tokenize(f"{a.title} {a.content}") tokens_b = self._tokenize(f"{b.title} {b.content}") if not tokens_a or not tokens_b: @@ -35,11 +67,10 @@ class HolographicLinker: union = tokens_a | tokens_b return len(intersection) / len(union) - def find_links(self, entry: ArchiveEntry, candidates: list[ArchiveEntry]) -> list[tuple[str, float]]: - """Find entries worth linking to. - - Returns list of (entry_id, similarity_score) tuples above threshold. - """ + def find_links( + self, entry: ArchiveEntry, candidates: list[ArchiveEntry] + ) -> list[tuple[str, float]]: + """Find entries worth linking to. Returns (entry_id, score) tuples.""" results = [] for candidate in candidates: if candidate.id == entry.id: @@ -58,16 +89,18 @@ class HolographicLinker: if eid not in entry.links: entry.links.append(eid) new_links += 1 - # Bidirectional for c in candidates: if c.id == eid and entry.id not in c.links: c.links.append(entry.id) return new_links + def clear_cache(self): + """Clear embedding cache (call after bulk entry changes).""" + self._embed_cache.clear() + @staticmethod def _tokenize(text: str) -> set[str]: """Simple whitespace + punctuation tokenizer.""" import re tokens = set(re.findall(r"\w+", text.lower())) - # Remove very short tokens return {t for t in tokens if len(t) > 2}