feat(mnemosyne): add semantic search using holographic linker similarity

Closes #1223 - semantic_search() on MnemosyneArchive: Jaccard similarity + link connectivity boost + topic bonus - CLI: mnemosyne search --semantic flag shows relevance scores - 5 new tests: scored results, ranking, link boost, empty archive, threshold filtering
2026-04-11 16:15:09 -04:00
6 changed files with 146 additions and 10 deletions
--- a/nexus/mnemosyne/pycache/archive.cpython-311.pyc
+++ b/nexus/mnemosyne/pycache/archive.cpython-311.pyc
--- a/nexus/mnemosyne/pycache/cli.cpython-311.pyc
+++ b/nexus/mnemosyne/pycache/cli.cpython-311.pyc
--- a/nexus/mnemosyne/archive.py
+++ b/nexus/mnemosyne/archive.py
@@ -72,6 +72,60 @@ class MnemosyneArchive:
        scored.sort(key=lambda x: x[0], reverse=True)
        return [e for _, e in scored[:limit]]

+    def semantic_search(
+        self,
+        query: str,
+        limit: int = 10,
+        threshold: float = 0.05,
+        link_boost: float = 0.1,
+    ) -> list[tuple[ArchiveEntry, float]]:
+        """Semantic search using holographic linker similarity.
+
+        Scores entries by Jaccard similarity between query tokens and entry
+        tokens, boosted by link connectivity (entries with more back-links
+        are more "holographic" and get a relevance bump).
+
+        Args:
+            query: search text
+            limit: max results to return
+            threshold: minimum similarity score to include
+            link_boost: extra score per back-link (capped at 0.3)
+
+        Returns list of (entry, score) tuples sorted by score desc.
+        """
+        from nexus.mnemosyne.entry import ArchiveEntry as _AE
+
+        # Build a synthetic entry for the query so we can use the linker
+        query_entry = _AE(title=query, content="")
+        query_tokens = self.linker._tokenize(query)
+
+        # Count back-links for each entry (incoming link graph)
+        back_links: dict[str, int] = {}
+        for entry in self._entries.values():
+            for lid in entry.links:
+                back_links[lid] = back_links.get(lid, 0) + 1
+
+        scored: list[tuple[float, ArchiveEntry]] = []
+        for entry in self._entries.values():
+            sim = self.linker.compute_similarity(query_entry, entry)
+            if sim < threshold:
+                continue
+            # Link connectivity boost (capped)
+            boost = min(back_links.get(entry.id, 0) * link_boost, 0.3)
+            # Topic overlap bonus
+            topic_bonus = 0.0
+            if entry.topics:
+                topic_tokens = set()
+                for t in entry.topics:
+                    topic_tokens.update(self.linker._tokenize(t))
+                overlap = len(query_tokens & topic_tokens)
+                topic_bonus = overlap * 0.05
+            score = round(sim + boost + topic_bonus, 6)
+            scored.append((score, entry))
+
+        scored.sort(key=lambda x: x[0], reverse=True)
+        return [(e, s) for s, e in scored[:limit]]
+
    def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]:
        """Get entries linked to a given entry, up to specified depth."""
        visited = set()
--- a/nexus/mnemosyne/cli.py
+++ b/nexus/mnemosyne/cli.py
@@ -23,16 +23,28 @@ def cmd_stats(args):

 def cmd_search(args):
    archive = MnemosyneArchive()
-    results = archive.search(args.query, limit=args.limit)
-    if not results:
-        print("No results found.")
-        return
-    for entry in results:
-        linked = len(entry.links)
-        print(f"[{entry.id[:8]}] {entry.title}")
-        print(f"  Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}")
-        print(f"  {entry.content[:120]}...")
-        print()
+    if args.semantic:
+        results = archive.semantic_search(args.query, limit=args.limit)
+        if not results:
+            print("No results found.")
+            return
+        for entry, score in results:
+            linked = len(entry.links)
+            print(f"[{entry.id[:8]}] {entry.title}  (score: {score:.4f})")
+            print(f"  Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}")
+            print(f"  {entry.content[:120]}...")
+            print()
+    else:
+        results = archive.search(args.query, limit=args.limit)
+        if not results:
+            print("No results found.")
+            return
+        for entry in results:
+            linked = len(entry.links)
+            print(f"[{entry.id[:8]}] {entry.title}")
+            print(f"  Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}")
+            print(f"  {entry.content[:120]}...")
+            print()


 def cmd_ingest(args):
@@ -96,6 +108,7 @@ def main():
    s = sub.add_parser("search", help="Search the archive")
    s.add_argument("query", help="Search query")
    s.add_argument("-n", "--limit", type=int, default=10)
+    s.add_argument("--semantic", action="store_true", help="Use semantic similarity search instead of keyword matching")

    i = sub.add_parser("ingest", help="Ingest a new entry")
    i.add_argument("--title", required=True)
--- a/nexus/mnemosyne/tests/pycache/test_archive.cpython-311.pyc
+++ b/nexus/mnemosyne/tests/pycache/test_archive.cpython-311.pyc
--- a/nexus/mnemosyne/tests/test_archive.py
+++ b/nexus/mnemosyne/tests/test_archive.py
@@ -209,3 +209,72 @@ def test_archive_topic_counts():
        assert counts["automation"] == 2
        # sorted by count desc — both tied but must be present
        assert set(counts.keys()) == {"python", "automation"}
+
+
+def test_semantic_search_returns_scored_results():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        ingest_event(archive, title="Python automation", content="Building automation tools in Python scripts")
+        ingest_event(archive, title="Cooking recipes", content="How to make pasta carbonara")
+        results = archive.semantic_search("Python scripting automation")
+        assert len(results) > 0
+        # Python-related entry should score higher than cooking
+        assert results[0][0].title != "Cooking recipes"
+        # Results are (entry, score) tuples
+        assert all(isinstance(s, float) for _, s in results)
+        assert all(s > 0 for _, s in results)
+
+
+def test_semantic_search_ranks_by_similarity():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e1 = ingest_event(archive, title="Web development", content="Building websites with HTML CSS JavaScript")
+        e2 = ingest_event(archive, title="Web scraping", content="Extracting data from websites using Python requests")
+        e3 = ingest_event(archive, title="Astronomy", content="Studying stars planets galaxies")
+        results = archive.semantic_search("web programming internet")
+        titles = [e.title for e, _ in results]
+        # Both web entries should rank above astronomy
+        assert e3.title not in titles or titles.index(e3.title) > titles.index(e1.title)
+
+
+def test_semantic_search_link_boost():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        # Entry A is about Python — will get linked to
+        e_a = ingest_event(archive, title="Python tools", content="Automation scripting development Python")
+        # Entry B is also about Python but with different enough content to link to A
+        e_b = ingest_event(archive, title="Python frameworks", content="Django Flask FastAPI Python development")
+        # Entry C has same keywords but no links (added with auto_link=False)
+        e_c = ArchiveEntry(title="Python tools", content="Automation scripting development Python")
+        archive.add(e_c, auto_link=False)
+
+        results = archive.semantic_search("Python automation")
+        scores = {e.id: s for e, s in results}
+
+        # Entries with more back-links should get the boost
+        # e_a and e_b should be linked to each other
+        if e_a.id in scores and e_c.id in scores:
+            # e_a has auto-links from e_b, e_c does not
+            # e_a should score >= e_c (boosted by link connectivity)
+            assert scores[e_a.id] >= scores[e_c.id]
+
+
+def test_semantic_search_empty_archive():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        results = archive.semantic_search("anything")
+        assert results == []
+
+
+def test_semantic_search_threshold():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        ingest_event(archive, title="Quantum physics", content="Particle wave duality entanglement")
+        # Completely unrelated query — should return empty with high threshold
+        results = archive.semantic_search("cooking pasta carbonara", threshold=0.5)
+        assert results == []