"""Holographic link engine.
|
|
|
|
Computes semantic similarity between archive entries and creates
|
|
bidirectional links, forming the holographic graph structure.
|
|
"""
|
|
|
|
from __future__ import annotations

import re
from typing import Optional

from nexus.mnemosyne.entry import ArchiveEntry
class HolographicLinker:
    """Links archive entries via semantic similarity.

    Phase 1 uses simple keyword overlap (Jaccard similarity) as the
    similarity metric. Phase 2 will integrate ChromaDB embeddings
    from MemPalace.
    """

    # Compiled once at class-creation time instead of importing `re` and
    # re-fetching the pattern on every _tokenize call.
    _WORD_RE = re.compile(r"\w+")

    def __init__(self, similarity_threshold: float = 0.15):
        """Create a linker.

        Args:
            similarity_threshold: minimum similarity score in [0, 1] for
                two entries to be considered linkable.
        """
        self.threshold = similarity_threshold

    def compute_similarity(self, a: ArchiveEntry, b: ArchiveEntry) -> float:
        """Compute similarity score between two entries.

        Returns float in [0, 1]. Phase 1: Jaccard similarity on
        combined title+content tokens. Phase 2: cosine similarity
        on ChromaDB embeddings.
        """
        tokens_a = self._tokenize(f"{a.title} {a.content}")
        tokens_b = self._tokenize(f"{b.title} {b.content}")
        if not tokens_a or not tokens_b:
            # Treat empty entries as unrelated; also avoids dividing by an
            # empty union below.
            return 0.0
        intersection = tokens_a & tokens_b
        union = tokens_a | tokens_b
        return len(intersection) / len(union)

    def find_links(self, entry: ArchiveEntry, candidates: list[ArchiveEntry]) -> list[tuple[str, float]]:
        """Find entries worth linking to.

        Returns list of (entry_id, similarity_score) tuples at or above
        the threshold, sorted best-first. The entry never matches itself.
        """
        results = []
        for candidate in candidates:
            if candidate.id == entry.id:
                continue  # never self-link
            score = self.compute_similarity(entry, candidate)
            if score >= self.threshold:
                results.append((candidate.id, score))
        results.sort(key=lambda pair: pair[1], reverse=True)
        return results

    def apply_links(self, entry: ArchiveEntry, candidates: list[ArchiveEntry]) -> int:
        """Auto-link an entry to related entries. Returns count of new links.

        Only links newly added to `entry` are counted; reverse links added
        to candidates are not. A reverse link is still added when the
        forward link already existed (matches prior behavior).
        """
        # Build the lookup once instead of rescanning candidates per match
        # (was O(n*m)). Assumes candidate ids are unique — TODO confirm.
        by_id = {c.id: c for c in candidates}
        new_links = 0
        for eid, _score in self.find_links(entry, candidates):
            if eid not in entry.links:
                entry.links.append(eid)
                new_links += 1
            # Bidirectional: matched entry points back here too.
            other = by_id[eid]
            if entry.id not in other.links:
                other.links.append(entry.id)
        return new_links

    @staticmethod
    def _tokenize(text: str) -> set[str]:
        """Simple whitespace + punctuation tokenizer.

        Lowercases, extracts \\w+ runs, and drops tokens of length <= 2
        (stopword-ish noise like "a", "an", "of").
        """
        tokens = set(HolographicLinker._WORD_RE.findall(text.lower()))
        return {t for t in tokens if len(t) > 2}
|