diff --git a/nexus/mnemosyne/archive.py b/nexus/mnemosyne/archive.py
index 50ba7cef..70998796 100644
--- a/nexus/mnemosyne/archive.py
+++ b/nexus/mnemosyne/archive.py
@@ -13,6 +13,7 @@ from typing import Optional
 
 from nexus.mnemosyne.entry import ArchiveEntry, _compute_content_hash
 from nexus.mnemosyne.linker import HolographicLinker
+from nexus.mnemosyne.embeddings import get_embedding_backend, EmbeddingBackend
 
 _EXPORT_VERSION = "1"
 
@@ -24,10 +25,21 @@ class MnemosyneArchive:
     MemPalace (ChromaDB) for vector-semantic search.
     """
 
-    def __init__(self, archive_path: Optional[Path] = None):
+    def __init__(
+        self,
+        archive_path: Optional[Path] = None,
+        embedding_backend: Optional[EmbeddingBackend] = None,
+        auto_embed: bool = True,
+    ):
         self.path = archive_path or Path.home() / ".hermes" / "mnemosyne" / "archive.json"
         self.path.parent.mkdir(parents=True, exist_ok=True)
-        self.linker = HolographicLinker()
+        self._embedding_backend = embedding_backend
+        if embedding_backend is None and auto_embed:
+            try:
+                self._embedding_backend = get_embedding_backend()
+            except Exception:
+                self._embedding_backend = None
+        self.linker = HolographicLinker(embedding_backend=self._embedding_backend)
         self._entries: dict[str, ArchiveEntry] = {}
         self._load()
 
@@ -143,33 +155,51 @@ class MnemosyneArchive:
         return [e for _, e in scored[:limit]]
 
     def semantic_search(self, query: str, limit: int = 10, threshold: float = 0.05) -> list[ArchiveEntry]:
-        """Semantic search using holographic linker similarity.
+        """Semantic search using embeddings or holographic linker similarity.
 
-        Scores each entry by Jaccard similarity between query tokens and entry
-        tokens, then boosts entries with more inbound links (more "holographic").
-        Falls back to keyword search if no entries meet the similarity threshold.
+        With an embedding backend: cosine similarity between query vector and
+        entry vectors, boosted by inbound link count.
+        Without: Jaccard similarity on tokens with link boost.
+        Falls back to keyword search if nothing meets the threshold.
 
         Args:
             query: Natural language query string.
             limit: Maximum number of results to return.
-            threshold: Minimum Jaccard similarity to be considered a semantic match.
+            threshold: Minimum similarity score to include in results.
 
         Returns:
             List of ArchiveEntry sorted by combined relevance score, descending.
         """
-        query_tokens = HolographicLinker._tokenize(query)
-        if not query_tokens:
-            return []
-
-        # Count inbound links for each entry (how many entries link TO this one)
+        # Count inbound links for link-boost
         inbound: dict[str, int] = {eid: 0 for eid in self._entries}
         for entry in self._entries.values():
             for linked_id in entry.links:
                 if linked_id in inbound:
                     inbound[linked_id] += 1
-
         max_inbound = max(inbound.values(), default=1) or 1
 
+        # Try embedding-based search first
+        if self._embedding_backend:
+            query_vec = self._embedding_backend.embed(query)
+            if query_vec:
+                scored = []
+                for entry in self._entries.values():
+                    text = f"{entry.title} {entry.content} {' '.join(entry.topics)}"
+                    entry_vec = self._embedding_backend.embed(text)
+                    if not entry_vec:
+                        continue
+                    sim = self._embedding_backend.similarity(query_vec, entry_vec)
+                    if sim >= threshold:
+                        link_boost = inbound[entry.id] / max_inbound * 0.15
+                        scored.append((sim + link_boost, entry))
+                if scored:
+                    scored.sort(key=lambda x: x[0], reverse=True)
+                    return [e for _, e in scored[:limit]]
+
+        # Fallback: Jaccard token similarity
+        query_tokens = HolographicLinker._tokenize(query)
+        if not query_tokens:
+            return []
         scored = []
         for entry in self._entries.values():
             entry_tokens = HolographicLinker._tokenize(f"{entry.title} {entry.content} {' '.join(entry.topics)}")
@@ -179,14 +209,13 @@ class MnemosyneArchive:
             union = query_tokens | entry_tokens
             jaccard = len(intersection) / len(union)
             if jaccard >= threshold:
-                link_boost = inbound[entry.id] / max_inbound * 0.2  # up to 20% boost
+                link_boost = inbound[entry.id] / max_inbound * 0.2
                 scored.append((jaccard + link_boost, entry))
-
         if scored:
             scored.sort(key=lambda x: x[0], reverse=True)
             return [e for _, e in scored[:limit]]
 
-        # Graceful fallback to keyword search
+        # Final fallback: keyword search
         return self.search(query, limit=limit)
 
     def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]: