diff --git a/nexus/mnemosyne/archive.py b/nexus/mnemosyne/archive.py
index 847f9a46..8a8a2eb8 100644
--- a/nexus/mnemosyne/archive.py
+++ b/nexus/mnemosyne/archive.py
@@ -72,6 +72,61 @@ class MnemosyneArchive:
         scored.sort(key=lambda x: x[0], reverse=True)
         return [e for _, e in scored[:limit]]
 
+    def semantic_search(self, query: str, limit: int = 10, threshold: float = 0.05) -> list[ArchiveEntry]:
+        """Rank entries by query-token overlap, boosted by inbound links.
+
+        Scores each entry by the Jaccard similarity between the query tokens and
+        the tokens of its title, content and topics. Entries that reach
+        ``threshold`` receive an additive bonus of up to 0.2, scaled by their
+        inbound-link count relative to the most-linked entry. Falls back to
+        keyword search if no entry meets the similarity threshold.
+
+        Args:
+            query: Natural language query string.
+            limit: Maximum number of results to return.
+            threshold: Minimum Jaccard similarity to be considered a semantic match.
+
+        Returns:
+            List of ArchiveEntry sorted by combined relevance score, descending.
+            Empty when the query produces no tokens.
+        """
+        query_tokens = HolographicLinker._tokenize(query)
+        if not query_tokens:
+            return []
+
+        # Count inbound links for each entry: how many OTHER entries link to it.
+        # Self-links and ids repeated within one entry's link list are skipped so
+        # they cannot inflate the boost.
+        inbound: dict[str, int] = {eid: 0 for eid in self._entries}
+        for entry in self._entries.values():
+            for linked_id in set(entry.links):
+                if linked_id != entry.id and linked_id in inbound:
+                    inbound[linked_id] += 1
+
+        # `or 1` keeps the divisor non-zero when no entry has inbound links.
+        max_inbound = max(inbound.values(), default=1) or 1
+
+        scored = []
+        for entry in self._entries.values():
+            entry_tokens = HolographicLinker._tokenize(f"{entry.title} {entry.content} {' '.join(entry.topics)}")
+            if not entry_tokens:
+                continue
+            intersection = query_tokens & entry_tokens
+            union = query_tokens | entry_tokens
+            jaccard = len(intersection) / len(union)
+            if jaccard >= threshold:
+                # Additive bonus in [0, 0.2]. .get() avoids a KeyError should an
+                # entry be stored under a key that differs from its own id.
+                link_boost = inbound.get(entry.id, 0) / max_inbound * 0.2
+                scored.append((jaccard + link_boost, entry))
+
+        if scored:
+            scored.sort(key=lambda x: x[0], reverse=True)
+            return [e for _, e in scored[:limit]]
+
+        # Graceful fallback to keyword search
+        return self.search(query, limit=limit)
+
     def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]:
         """Get entries linked to a given entry, up to specified depth."""
         visited = set()
diff --git a/nexus/mnemosyne/cli.py b/nexus/mnemosyne/cli.py
index 1ab08f15..6691b244 100644
--- a/nexus/mnemosyne/cli.py
+++ b/nexus/mnemosyne/cli.py
@@ -23,7 +23,10 @@ def cmd_stats(args):
 
 def cmd_search(args):
     archive = MnemosyneArchive()
-    results = archive.search(args.query, limit=args.limit)
+    if getattr(args, "semantic", False):
+        results = archive.semantic_search(args.query, limit=args.limit)
+    else:
+        results = archive.search(args.query, limit=args.limit)
     if not results:
         print("No results found.")
         return
@@ -96,6 +99,7 @@ def main():
     s = sub.add_parser("search", help="Search the archive")
     s.add_argument("query", help="Search query")
     s.add_argument("-n", "--limit", type=int, default=10)
+    s.add_argument("--semantic", action="store_true", help="Use holographic linker similarity scoring")
 
     i = sub.add_parser("ingest", help="Ingest a new entry")
     i.add_argument("--title", required=True)
diff --git a/nexus/mnemosyne/tests/test_archive.py b/nexus/mnemosyne/tests/test_archive.py
index 0c2b354b..11371e82 100644
--- a/nexus/mnemosyne/tests/test_archive.py
+++ b/nexus/mnemosyne/tests/test_archive.py
@@ -197,6 +197,71 @@ def test_archive_stats_orphan_count():
         assert s["newest_entry"] is not None
 
 
+def test_semantic_search_returns_results():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        ingest_event(archive, title="Python automation", content="Building automation tools in Python")
+        ingest_event(archive, title="Cooking recipes", content="How to make pasta carbonara with cheese")
+        results = archive.semantic_search("python scripting", limit=5)
+        assert len(results) > 0
+        assert results[0].title == "Python automation"
+
+
+def test_semantic_search_link_boost():
+    """Inbound-link boost does not push the closest matches off the top spot."""
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        # Create two similar entries; manually give one more links
+        e1 = ingest_event(archive, title="Machine learning", content="Neural networks deep learning models")
+        e2 = ingest_event(archive, title="Machine learning basics", content="Neural networks deep learning intro")
+        # Add a third entry that links to e1 so e1 has more inbound links
+        e3 = ingest_event(archive, title="AI overview", content="Artificial intelligence machine learning")
+        # Manually give e1 an extra inbound link by adding e3 -> e1
+        if e1.id not in e3.links:
+            e3.links.append(e1.id)
+            archive._save()
+        results = archive.semantic_search("machine learning neural networks", limit=5)
+        assert len(results) >= 2
+        # The top hit must be one of the two near-duplicate entries
+        assert results[0].id in {e1.id, e2.id}
+
+
+def test_semantic_search_fallback_to_keyword():
+    """Falls back to keyword search when no entry meets Jaccard threshold."""
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        ingest_event(archive, title="Exact match only", content="unique xyzzy token here")
+        # threshold=1.0 ensures no semantic match, triggering fallback
+        results = archive.semantic_search("xyzzy", limit=5, threshold=1.0)
+        # Fallback keyword search should find it
+        assert len(results) == 1
+        assert results[0].title == "Exact match only"
+
+
+def test_semantic_search_empty_archive():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        results = archive.semantic_search("anything", limit=5)
+        assert results == []
+
+
+def test_semantic_search_vs_keyword_relevance():
+    """Semantic search ranks the entry sharing query tokens ahead of an unrelated one."""
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        ingest_event(archive, title="Python scripting", content="Writing scripts with Python for automation tasks")
+        ingest_event(archive, title="Baking bread", content="Mix flour water yeast knead bake oven")
+        # "scripts" and "automation" appear in the scripting entry; no query word appears in the baking entry
+        results = archive.semantic_search("coding scripts automation")
+        assert len(results) > 0
+        assert results[0].title == "Python scripting"
+
+
 def test_archive_topic_counts():
     with tempfile.TemporaryDirectory() as tmp:
         path = Path(tmp) / "test_archive.json"