"""Holographic link engine.
|
|
|
|
Computes semantic similarity between archive entries and creates
|
|
bidirectional links, forming the holographic graph structure.
|
|
"""
|
|
|
|
from __future__ import annotations

import re
from typing import Optional

from nexus.mnemosyne.entry import ArchiveEntry
class HolographicLinker:
    """Links archive entries via semantic similarity.

    Phase 1 uses simple keyword overlap (Jaccard similarity) as the
    similarity metric. Phase 2 will integrate ChromaDB embeddings
    from MemPalace.
    """

    # Compiled once at class-creation time instead of importing `re` and
    # re-fetching the pattern on every _tokenize call.
    _WORD_RE = re.compile(r"\w+")

    def __init__(self, similarity_threshold: float = 0.15):
        """Create a linker.

        Args:
            similarity_threshold: minimum similarity score in [0, 1] for
                two entries to be considered linkable.
        """
        self.threshold = similarity_threshold

    def compute_similarity(self, a: ArchiveEntry, b: ArchiveEntry) -> float:
        """Compute similarity score between two entries.

        Returns float in [0, 1]. Phase 1: Jaccard similarity on
        combined title+content tokens. Phase 2: cosine similarity
        on ChromaDB embeddings.
        """
        tokens_a = self._tokenize(f"{a.title} {a.content}")
        tokens_b = self._tokenize(f"{b.title} {b.content}")
        if not tokens_a or not tokens_b:
            # Treat empty entries as unrelated; also avoids dividing by an
            # empty union below.
            return 0.0
        intersection = tokens_a & tokens_b
        union = tokens_a | tokens_b
        return len(intersection) / len(union)

    def find_links(self, entry: ArchiveEntry, candidates: list[ArchiveEntry]) -> list[tuple[str, float]]:
        """Find entries worth linking to.

        Returns list of (entry_id, similarity_score) tuples at or above
        the threshold, sorted best-first. The entry never matches itself.
        """
        results = []
        for candidate in candidates:
            if candidate.id == entry.id:
                continue  # never self-link
            score = self.compute_similarity(entry, candidate)
            if score >= self.threshold:
                results.append((candidate.id, score))
        results.sort(key=lambda pair: pair[1], reverse=True)
        return results

    def apply_links(self, entry: ArchiveEntry, candidates: list[ArchiveEntry]) -> int:
        """Auto-link an entry to related entries. Returns count of new links.

        Only links newly added to `entry` are counted; reverse links added
        to candidates are not. A reverse link is still added when the
        forward link already existed (matches prior behavior).
        """
        # Build the lookup once instead of rescanning candidates per match
        # (was O(n*m)). Assumes candidate ids are unique — TODO confirm.
        by_id = {c.id: c for c in candidates}
        new_links = 0
        for eid, _score in self.find_links(entry, candidates):
            if eid not in entry.links:
                entry.links.append(eid)
                new_links += 1
            # Bidirectional: matched entry points back here too.
            other = by_id[eid]
            if entry.id not in other.links:
                other.links.append(entry.id)
        return new_links

    @staticmethod
    def _tokenize(text: str) -> set[str]:
        """Simple whitespace + punctuation tokenizer.

        Lowercases, extracts \\w+ runs, and drops tokens of length <= 2
        (stopword-ish noise like "a", "an", "of").
        """
        tokens = set(HolographicLinker._WORD_RE.findall(text.lower()))
        return {t for t in tokens if len(t) > 2}
|