feat: archive.py uses embedding backend for semantic search
- MnemosyneArchive.__init__ accepts optional EmbeddingBackend
- Auto-detects best backend via get_embedding_backend()
- semantic_search uses embedding cosine similarity when available
- Falls back to Jaccard token similarity gracefully
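
A rough usage sketch of the new constructor surface follows; the module path nexus.mnemosyne.archive and the file path below are assumptions for illustration, not taken from this diff.

# Sketch only: the three construction modes this commit enables.
from pathlib import Path

from nexus.mnemosyne.archive import MnemosyneArchive  # assumed module path

# Default: auto-detect a backend via get_embedding_backend();
# any detection failure degrades silently to token similarity.
archive = MnemosyneArchive()

# Inject an explicit EmbeddingBackend instance (skips auto-detection):
#   archive = MnemosyneArchive(embedding_backend=my_backend)

# Opt out of embeddings entirely (Jaccard-only semantic search):
offline = MnemosyneArchive(
    archive_path=Path("/tmp/mnemosyne-test.json"),  # hypothetical path
    auto_embed=False,
)

results = archive.semantic_search("embedding fallback behaviour", limit=5)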
@@ -13,6 +13,7 @@
 from typing import Optional
 from nexus.mnemosyne.entry import ArchiveEntry, _compute_content_hash
 from nexus.mnemosyne.linker import HolographicLinker
+from nexus.mnemosyne.embeddings import get_embedding_backend, EmbeddingBackend
 
 _EXPORT_VERSION = "1"
 
@@ -24,10 +25,21 @@ class MnemosyneArchive:
     MemPalace (ChromaDB) for vector-semantic search.
     """
 
-    def __init__(self, archive_path: Optional[Path] = None):
+    def __init__(
+        self,
+        archive_path: Optional[Path] = None,
+        embedding_backend: Optional[EmbeddingBackend] = None,
+        auto_embed: bool = True,
+    ):
         self.path = archive_path or Path.home() / ".hermes" / "mnemosyne" / "archive.json"
         self.path.parent.mkdir(parents=True, exist_ok=True)
-        self.linker = HolographicLinker()
+        self._embedding_backend = embedding_backend
+        if embedding_backend is None and auto_embed:
+            try:
+                self._embedding_backend = get_embedding_backend()
+            except Exception:
+                self._embedding_backend = None
+        self.linker = HolographicLinker(embedding_backend=self._embedding_backend)
         self._entries: dict[str, ArchiveEntry] = {}
         self._load()
 
@@ -143,33 +155,51 @@ class MnemosyneArchive:
         return [e for _, e in scored[:limit]]
 
     def semantic_search(self, query: str, limit: int = 10, threshold: float = 0.05) -> list[ArchiveEntry]:
-        """Semantic search using holographic linker similarity.
+        """Semantic search using embeddings or holographic linker similarity.
 
-        Scores each entry by Jaccard similarity between query tokens and entry
-        tokens, then boosts entries with more inbound links (more "holographic").
-        Falls back to keyword search if no entries meet the similarity threshold.
+        With an embedding backend: cosine similarity between query vector and
+        entry vectors, boosted by inbound link count.
+        Without: Jaccard similarity on tokens with link boost.
+        Falls back to keyword search if nothing meets the threshold.
 
         Args:
             query: Natural language query string.
             limit: Maximum number of results to return.
-            threshold: Minimum Jaccard similarity to be considered a semantic match.
+            threshold: Minimum similarity score to include in results.
 
         Returns:
             List of ArchiveEntry sorted by combined relevance score, descending.
         """
-        query_tokens = HolographicLinker._tokenize(query)
-        if not query_tokens:
-            return []
-
-        # Count inbound links for each entry (how many entries link TO this one)
+        # Count inbound links for link-boost
         inbound: dict[str, int] = {eid: 0 for eid in self._entries}
         for entry in self._entries.values():
             for linked_id in entry.links:
                 if linked_id in inbound:
                     inbound[linked_id] += 1
 
         max_inbound = max(inbound.values(), default=1) or 1
 
+        # Try embedding-based search first
+        if self._embedding_backend:
+            query_vec = self._embedding_backend.embed(query)
+            if query_vec:
+                scored = []
+                for entry in self._entries.values():
+                    text = f"{entry.title} {entry.content} {' '.join(entry.topics)}"
+                    entry_vec = self._embedding_backend.embed(text)
+                    if not entry_vec:
+                        continue
+                    sim = self._embedding_backend.similarity(query_vec, entry_vec)
+                    if sim >= threshold:
+                        link_boost = inbound[entry.id] / max_inbound * 0.15
+                        scored.append((sim + link_boost, entry))
+                if scored:
+                    scored.sort(key=lambda x: x[0], reverse=True)
+                    return [e for _, e in scored[:limit]]
+        # Fallback: Jaccard token similarity
+        query_tokens = HolographicLinker._tokenize(query)
+        if not query_tokens:
+            return []
         scored = []
         for entry in self._entries.values():
             entry_tokens = HolographicLinker._tokenize(f"{entry.title} {entry.content} {' '.join(entry.topics)}")
@@ -179,14 +209,13 @@ class MnemosyneArchive:
             union = query_tokens | entry_tokens
             jaccard = len(intersection) / len(union)
             if jaccard >= threshold:
-                link_boost = inbound[entry.id] / max_inbound * 0.2  # up to 20% boost
+                link_boost = inbound[entry.id] / max_inbound * 0.2
                 scored.append((jaccard + link_boost, entry))
 
         if scored:
             scored.sort(key=lambda x: x[0], reverse=True)
             return [e for _, e in scored[:limit]]
-
-        # Graceful fallback to keyword search
+        # Final fallback: keyword search
         return self.search(query, limit=limit)
 
     def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]:
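
For reference, the combined score on the embedding path is cosine similarity plus a link boost capped at 0.15: an entry with similarity 0.62 and 3 inbound links against a maximum of 5 scores 0.62 + (3/5) * 0.15 = 0.71. The Jaccard path keeps its original 0.2 cap. The single threshold default (0.05) now gates two differently scaled metrics, a near-zero cosine floor on one path and a 5% token-overlap floor on the other, so tuning is path-dependent.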
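
From the call sites in this diff, an EmbeddingBackend needs at least embed() (returning a vector, or something falsy when embedding fails) and similarity(). A minimal conforming sketch, assuming cosine similarity; the class below is illustrative, not the actual nexus.mnemosyne.embeddings implementation.

import math


class HashedBagBackend:
    """Toy EmbeddingBackend: hashed bag-of-words vectors + cosine similarity.

    Satisfies the embed()/similarity() surface semantic_search() relies on.
    Not a real semantic model, and hash() is salted per process, so vectors
    only compare meaningfully within a single interpreter run.
    """

    DIM = 256

    def embed(self, text: str) -> list[float]:
        vec = [0.0] * self.DIM
        for token in text.lower().split():
            vec[hash(token) % self.DIM] += 1.0
        return vec

    def similarity(self, a: list[float], b: list[float]) -> float:
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(x * x for x in b))
        return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0


# Usable with the constructor shown above, e.g.:
#   archive = MnemosyneArchive(embedding_backend=HashedBagBackend())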