feat: linker supports pluggable embedding backend
HolographicLinker now accepts an optional EmbeddingBackend. When a backend is configured it scores entries by cosine similarity on their embeddings; otherwise it falls back to Jaccard token similarity. Embeddings are cached per entry to avoid recomputing them during link operations.
This commit is contained in:
@@ -2,31 +2,63 @@
|
||||
|
||||
Computes semantic similarity between archive entries and creates
|
||||
bidirectional links, forming the holographic graph structure.
|
||||
|
||||
Supports pluggable embedding backends for true semantic search.
|
||||
Falls back to Jaccard token similarity when no backend is available.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
from typing import Optional, TYPE_CHECKING
|
||||
|
||||
from nexus.mnemosyne.entry import ArchiveEntry
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from nexus.mnemosyne.embeddings import EmbeddingBackend
|
||||
|
||||
|
||||
class HolographicLinker:
|
||||
"""Links archive entries via semantic similarity.
|
||||
|
||||
Phase 1 uses simple keyword overlap as the similarity metric.
|
||||
Phase 2 will integrate ChromaDB embeddings from MemPalace.
|
||||
With an embedding backend: cosine similarity on vectors.
|
||||
Without: Jaccard similarity on token sets (legacy fallback).
|
||||
"""
|
||||
|
||||
def __init__(self, similarity_threshold: float = 0.15):
|
||||
def __init__(
|
||||
self,
|
||||
similarity_threshold: float = 0.15,
|
||||
embedding_backend: Optional["EmbeddingBackend"] = None,
|
||||
):
|
||||
self.threshold = similarity_threshold
|
||||
self._backend = embedding_backend
|
||||
self._embed_cache: dict[str, list[float]] = {}
|
||||
|
||||
@property
|
||||
def using_embeddings(self) -> bool:
|
||||
return self._backend is not None
|
||||
|
||||
def _get_embedding(self, entry: ArchiveEntry) -> list[float]:
|
||||
"""Get or compute cached embedding for an entry."""
|
||||
if entry.id in self._embed_cache:
|
||||
return self._embed_cache[entry.id]
|
||||
text = f"{entry.title} {entry.content}"
|
||||
vec = self._backend.embed(text) if self._backend else []
|
||||
if vec:
|
||||
self._embed_cache[entry.id] = vec
|
||||
return vec
|
||||
|
||||
def compute_similarity(self, a: ArchiveEntry, b: ArchiveEntry) -> float:
|
||||
"""Compute similarity score between two entries.
|
||||
|
||||
Returns float in [0, 1]. Phase 1: Jaccard similarity on
|
||||
combined title+content tokens. Phase 2: cosine similarity
|
||||
on ChromaDB embeddings.
|
||||
Returns float in [0, 1]. Uses embedding cosine similarity if
|
||||
a backend is configured, otherwise falls back to Jaccard.
|
||||
"""
|
||||
if self._backend:
|
||||
vec_a = self._get_embedding(a)
|
||||
vec_b = self._get_embedding(b)
|
||||
if vec_a and vec_b:
|
||||
return self._backend.similarity(vec_a, vec_b)
|
||||
# Fallback: Jaccard on tokens
|
||||
tokens_a = self._tokenize(f"{a.title} {a.content}")
|
||||
tokens_b = self._tokenize(f"{b.title} {b.content}")
|
||||
if not tokens_a or not tokens_b:
|
||||
@@ -35,11 +67,10 @@ class HolographicLinker:
|
||||
union = tokens_a | tokens_b
|
||||
return len(intersection) / len(union)
|
||||
|
||||
def find_links(self, entry: ArchiveEntry, candidates: list[ArchiveEntry]) -> list[tuple[str, float]]:
|
||||
"""Find entries worth linking to.
|
||||
|
||||
Returns list of (entry_id, similarity_score) tuples above threshold.
|
||||
"""
|
||||
def find_links(
|
||||
self, entry: ArchiveEntry, candidates: list[ArchiveEntry]
|
||||
) -> list[tuple[str, float]]:
|
||||
"""Find entries worth linking to. Returns (entry_id, score) tuples."""
|
||||
results = []
|
||||
for candidate in candidates:
|
||||
if candidate.id == entry.id:
|
||||
@@ -58,16 +89,18 @@ class HolographicLinker:
|
||||
if eid not in entry.links:
|
||||
entry.links.append(eid)
|
||||
new_links += 1
|
||||
# Bidirectional
|
||||
for c in candidates:
|
||||
if c.id == eid and entry.id not in c.links:
|
||||
c.links.append(entry.id)
|
||||
return new_links
|
||||
|
||||
def clear_cache(self):
|
||||
"""Clear embedding cache (call after bulk entry changes)."""
|
||||
self._embed_cache.clear()
|
||||
|
||||
@staticmethod
|
||||
def _tokenize(text: str) -> set[str]:
|
||||
"""Simple whitespace + punctuation tokenizer."""
|
||||
import re
|
||||
tokens = set(re.findall(r"\w+", text.lower()))
|
||||
# Remove very short tokens
|
||||
return {t for t in tokens if len(t) > 2}
|
||||
|
||||
Reference in New Issue
Block a user