# the-nexus/nexus/mnemosyne/linker.py

"""Holographic link engine.

Computes semantic similarity between archive entries and creates
bidirectional links, forming the holographic graph structure.

Supports pluggable embedding backends for true semantic search.
Falls back to Jaccard token similarity when no backend is available.
"""

from __future__ import annotations

from typing import Optional, TYPE_CHECKING

from nexus.mnemosyne.entry import ArchiveEntry

if TYPE_CHECKING:
    from nexus.mnemosyne.embeddings import EmbeddingBackend


class HolographicLinker:
    """Link archive entries to one another by content similarity.

    With an embedding backend, scoring is delegated to the backend's
    ``embed`` and ``similarity`` methods. Without one -- or when the
    backend yields an empty vector for either entry -- the score is the
    Jaccard index of the entries' token sets (legacy fallback).
    """

    def __init__(
        self,
        similarity_threshold: float = 0.15,
        embedding_backend: Optional["EmbeddingBackend"] = None,
    ):
        """Configure the linker.

        Args:
            similarity_threshold: Minimum score (inclusive) for two entries
                to be considered linked.
            embedding_backend: Optional backend providing ``embed(text)``
                and ``similarity(vec_a, vec_b)``. ``None`` selects the
                Jaccard fallback.
        """
        self.threshold = similarity_threshold
        self._backend = embedding_backend
        # entry.id -> embedding vector; only non-empty vectors are stored.
        self._embed_cache: dict[str, list[float]] = {}

    @property
    def using_embeddings(self) -> bool:
        """Whether an embedding backend was supplied."""
        return self._backend is not None

    def _get_embedding(self, entry: ArchiveEntry) -> list[float]:
        """Return the embedding for *entry*, computing and caching it if needed.

        Returns an empty list when no backend is configured. Empty vectors
        returned by the backend are passed through but never cached, so the
        backend is asked again on the next call.
        """
        cached = self._embed_cache.get(entry.id)
        if cached is not None:
            return cached
        # Identity check (not truthiness): a backend object may legitimately
        # be falsy, e.g. if it defines __len__ and is currently empty.
        if self._backend is None:
            return []
        vec = self._backend.embed(f"{entry.title} {entry.content}")
        if vec:
            self._embed_cache[entry.id] = vec
        return vec

    def compute_similarity(self, a: ArchiveEntry, b: ArchiveEntry) -> float:
        """Compute a similarity score between two entries.

        If a backend is configured and both entries have non-empty
        embeddings, returns ``backend.similarity(vec_a, vec_b)``.
        Otherwise returns the Jaccard index of the two token sets, which
        lies in [0, 1] (0.0 when either entry has no tokens).

        NOTE(review): the backend path returns whatever the backend
        produces; its range is presumably [0, 1] but is not enforced here.
        """
        if self._backend is not None:
            vec_a = self._get_embedding(a)
            vec_b = self._get_embedding(b)
            if vec_a and vec_b:
                return self._backend.similarity(vec_a, vec_b)
        # Fallback: Jaccard on tokens.
        tokens_a = self._tokenize(f"{a.title} {a.content}")
        tokens_b = self._tokenize(f"{b.title} {b.content}")
        if not tokens_a or not tokens_b:
            return 0.0
        return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)

    def find_links(
        self, entry: ArchiveEntry, candidates: list[ArchiveEntry]
    ) -> list[tuple[str, float]]:
        """Find candidates worth linking to *entry*.

        Candidates sharing ``entry.id`` are skipped. Returns
        ``(entry_id, score)`` tuples for every candidate scoring at or
        above the threshold, sorted by descending score (ties keep
        candidate order).
        """
        results: list[tuple[str, float]] = []
        for candidate in candidates:
            if candidate.id == entry.id:
                continue
            score = self.compute_similarity(entry, candidate)
            if score >= self.threshold:
                results.append((candidate.id, score))
        results.sort(key=lambda pair: pair[1], reverse=True)
        return results

    def apply_links(self, entry: ArchiveEntry, candidates: list[ArchiveEntry]) -> int:
        """Auto-link *entry* to related candidates, in both directions.

        Mutates ``entry.links`` and the ``links`` list of each matched
        candidate. Returns the number of ids newly added to
        ``entry.links``; back-links added to candidates are not counted.
        """
        matches = self.find_links(entry, candidates)
        if not matches:
            return 0

        # Index candidates by id once instead of rescanning the whole list
        # for every match. A list per id keeps the behaviour of linking
        # every candidate object that shares a matched id.
        by_id: dict[str, list[ArchiveEntry]] = {}
        for candidate in candidates:
            by_id.setdefault(candidate.id, []).append(candidate)

        new_links = 0
        for eid, _score in matches:
            if eid not in entry.links:
                entry.links.append(eid)
                new_links += 1
            for candidate in by_id.get(eid, ()):
                if entry.id not in candidate.links:
                    candidate.links.append(entry.id)
        return new_links

    def clear_cache(self) -> None:
        """Clear embedding cache (call after bulk entry changes)."""
        self._embed_cache.clear()

    @staticmethod
    def _tokenize(text: str) -> set[str]:
        """Lower-case *text* and return its distinct ``\\w+`` tokens longer than two characters."""
        import re

        return {tok for tok in re.findall(r"\w+", text.lower()) if len(tok) > 2}