diff --git a/nexus/mnemosyne/__pycache__/archive.cpython-311.pyc b/nexus/mnemosyne/__pycache__/archive.cpython-311.pyc new file mode 100644 index 00000000..27304a3f Binary files /dev/null and b/nexus/mnemosyne/__pycache__/archive.cpython-311.pyc differ diff --git a/nexus/mnemosyne/__pycache__/cli.cpython-311.pyc b/nexus/mnemosyne/__pycache__/cli.cpython-311.pyc new file mode 100644 index 00000000..f9a793ee Binary files /dev/null and b/nexus/mnemosyne/__pycache__/cli.cpython-311.pyc differ diff --git a/nexus/mnemosyne/archive.py b/nexus/mnemosyne/archive.py index 847f9a46..27f7df87 100644 --- a/nexus/mnemosyne/archive.py +++ b/nexus/mnemosyne/archive.py @@ -72,6 +72,60 @@ class MnemosyneArchive: scored.sort(key=lambda x: x[0], reverse=True) return [e for _, e in scored[:limit]] + def semantic_search( + self, + query: str, + limit: int = 10, + threshold: float = 0.05, + link_boost: float = 0.1, + ) -> list[tuple[ArchiveEntry, float]]: + """Semantic search using holographic linker similarity. + + Scores entries by Jaccard similarity between query tokens and entry + tokens, boosted by link connectivity (entries with more back-links + are more "holographic" and get a relevance bump). + + Args: + query: search text + limit: max results to return + threshold: minimum similarity score to include + link_boost: extra score per back-link (capped at 0.3) + + Returns list of (entry, score) tuples sorted by score desc. + """ + from nexus.mnemosyne.entry import ArchiveEntry as _AE + + # Build a synthetic entry for the query so we can use the linker + query_entry = _AE(title=query, content="") + query_tokens = self.linker._tokenize(query) + + # Count back-links for each entry (incoming link graph) + back_links: dict[str, int] = {} + for entry in self._entries.values(): + for lid in entry.links: + back_links[lid] = back_links.get(lid, 0) + 1 + + scored: list[tuple[float, ArchiveEntry]] = [] + for entry in self._entries.values(): + sim = self.linker.compute_similarity(query_entry, entry) + if sim < threshold: + continue + # Link connectivity boost (capped) + boost = min(back_links.get(entry.id, 0) * link_boost, 0.3) + # Topic overlap bonus + topic_bonus = 0.0 + if entry.topics: + topic_tokens = set() + for t in entry.topics: + topic_tokens.update(self.linker._tokenize(t)) + overlap = len(query_tokens & topic_tokens) + topic_bonus = overlap * 0.05 + score = round(sim + boost + topic_bonus, 6) + scored.append((score, entry)) + + scored.sort(key=lambda x: x[0], reverse=True) + return [(e, s) for s, e in scored[:limit]] + def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]: """Get entries linked to a given entry, up to specified depth.""" visited = set() diff --git a/nexus/mnemosyne/cli.py b/nexus/mnemosyne/cli.py index 1ab08f15..71e741d9 100644 --- a/nexus/mnemosyne/cli.py +++ b/nexus/mnemosyne/cli.py @@ -23,16 +23,28 @@ def cmd_stats(args): def cmd_search(args): archive = MnemosyneArchive() - results = archive.search(args.query, limit=args.limit) - if not results: - print("No results found.") - return - for entry in results: - linked = len(entry.links) - print(f"[{entry.id[:8]}] {entry.title}") - print(f" Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}") - print(f" {entry.content[:120]}...") - print() + if args.semantic: + results = archive.semantic_search(args.query, limit=args.limit) + if not results: + print("No results found.") + return + for entry, score in results: + linked = len(entry.links) + print(f"[{entry.id[:8]}] {entry.title} (score: {score:.4f})") + print(f" Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}") + print(f" {entry.content[:120]}...") + print() + else: + results = archive.search(args.query, limit=args.limit) + if not results: + print("No results found.") + return + for entry in results: + linked = len(entry.links) + print(f"[{entry.id[:8]}] {entry.title}") + print(f" Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}") + print(f" {entry.content[:120]}...") + print() def cmd_ingest(args): @@ -96,6 +108,7 @@ def main(): s = sub.add_parser("search", help="Search the archive") s.add_argument("query", help="Search query") s.add_argument("-n", "--limit", type=int, default=10) + s.add_argument("--semantic", action="store_true", help="Use semantic similarity search instead of keyword matching") i = sub.add_parser("ingest", help="Ingest a new entry") i.add_argument("--title", required=True) diff --git a/nexus/mnemosyne/tests/__pycache__/test_archive.cpython-311.pyc b/nexus/mnemosyne/tests/__pycache__/test_archive.cpython-311.pyc new file mode 100644 index 00000000..6aa536d6 Binary files /dev/null and b/nexus/mnemosyne/tests/__pycache__/test_archive.cpython-311.pyc differ diff --git a/nexus/mnemosyne/tests/test_archive.py b/nexus/mnemosyne/tests/test_archive.py index 0c2b354b..83298b51 100644 --- a/nexus/mnemosyne/tests/test_archive.py +++ b/nexus/mnemosyne/tests/test_archive.py @@ -209,3 +209,72 @@ def test_archive_topic_counts(): assert counts["automation"] == 2 # sorted by count desc — both tied but must be present assert set(counts.keys()) == {"python", "automation"} + + +def test_semantic_search_returns_scored_results(): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "test_archive.json" + archive = MnemosyneArchive(archive_path=path) + ingest_event(archive, title="Python automation", content="Building automation tools in Python scripts") + ingest_event(archive, title="Cooking recipes", content="How to make pasta carbonara") + results = archive.semantic_search("Python scripting automation") + assert len(results) > 0 + # Python-related entry should score higher than cooking + assert results[0][0].title != "Cooking recipes" + # Results are (entry, score) tuples + assert all(isinstance(s, float) for _, s in results) + assert all(s > 0 for _, s in results) + + +def test_semantic_search_ranks_by_similarity(): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "test_archive.json" + archive = MnemosyneArchive(archive_path=path) + e1 = ingest_event(archive, title="Web development", content="Building websites with HTML CSS JavaScript") + e2 = ingest_event(archive, title="Web scraping", content="Extracting data from websites using Python requests") + e3 = ingest_event(archive, title="Astronomy", content="Studying stars planets galaxies") + results = archive.semantic_search("web programming internet") + titles = [e.title for e, _ in results] + # Both web entries should rank above astronomy + assert e3.title not in titles or titles.index(e3.title) > titles.index(e1.title) + + +def test_semantic_search_link_boost(): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "test_archive.json" + archive = MnemosyneArchive(archive_path=path) + # Entry A is about Python — will get linked to + e_a = ingest_event(archive, title="Python tools", content="Automation scripting development Python") + # Entry B is also about Python but with different enough content to link to A + e_b = ingest_event(archive, title="Python frameworks", content="Django Flask FastAPI Python development") + # Entry C has same keywords but no links (added with auto_link=False) + e_c = ArchiveEntry(title="Python tools", content="Automation scripting development Python") + archive.add(e_c, auto_link=False) + + results = archive.semantic_search("Python automation") + scores = {e.id: s for e, s in results} + + # Entries with more back-links should get the boost + # e_a and e_b should be linked to each other + if e_a.id in scores and e_c.id in scores: + # e_a has auto-links from e_b, e_c does not + # e_a should score >= e_c (boosted by link connectivity) + assert scores[e_a.id] >= scores[e_c.id] + + +def test_semantic_search_empty_archive(): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "test_archive.json" + archive = MnemosyneArchive(archive_path=path) + results = archive.semantic_search("anything") + assert results == [] + + +def test_semantic_search_threshold(): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "test_archive.json" + archive = MnemosyneArchive(archive_path=path) + ingest_event(archive, title="Quantum physics", content="Particle wave duality entanglement") + # Completely unrelated query — should return empty with high threshold + results = archive.semantic_search("cooking pasta carbonara", threshold=0.5) + assert results == []