[claude] Mnemosyne: semantic search via holographic linker similarity (#1223) (#1225)

This commit is contained in:
2026-04-11 20:19:52 +00:00
committed by Alexander Whitestone
parent 46da2a8c81
commit b5ae7f4c2c
3 changed files with 117 additions and 1 deletions

View File

@@ -72,6 +72,53 @@ class MnemosyneArchive:
scored.sort(key=lambda x: x[0], reverse=True)
return [e for _, e in scored[:limit]]
def semantic_search(self, query: str, limit: int = 10, threshold: float = 0.05) -> list[ArchiveEntry]:
"""Semantic search using holographic linker similarity.
Scores each entry by Jaccard similarity between query tokens and entry
tokens, then boosts entries with more inbound links (more "holographic").
Falls back to keyword search if no entries meet the similarity threshold.
Args:
query: Natural language query string.
limit: Maximum number of results to return.
threshold: Minimum Jaccard similarity to be considered a semantic match.
Returns:
List of ArchiveEntry sorted by combined relevance score, descending.
"""
query_tokens = HolographicLinker._tokenize(query)
if not query_tokens:
return []
# Count inbound links for each entry (how many entries link TO this one)
inbound: dict[str, int] = {eid: 0 for eid in self._entries}
for entry in self._entries.values():
for linked_id in entry.links:
if linked_id in inbound:
inbound[linked_id] += 1
max_inbound = max(inbound.values(), default=1) or 1
scored = []
for entry in self._entries.values():
entry_tokens = HolographicLinker._tokenize(f"{entry.title} {entry.content} {' '.join(entry.topics)}")
if not entry_tokens:
continue
intersection = query_tokens & entry_tokens
union = query_tokens | entry_tokens
jaccard = len(intersection) / len(union)
if jaccard >= threshold:
link_boost = inbound[entry.id] / max_inbound * 0.2 # up to 20% boost
scored.append((jaccard + link_boost, entry))
if scored:
scored.sort(key=lambda x: x[0], reverse=True)
return [e for _, e in scored[:limit]]
# Graceful fallback to keyword search
return self.search(query, limit=limit)
def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]:
"""Get entries linked to a given entry, up to specified depth."""
visited = set()

View File

@@ -23,7 +23,10 @@ def cmd_stats(args):
def cmd_search(args):
archive = MnemosyneArchive()
results = archive.search(args.query, limit=args.limit)
if getattr(args, "semantic", False):
results = archive.semantic_search(args.query, limit=args.limit)
else:
results = archive.search(args.query, limit=args.limit)
if not results:
print("No results found.")
return
@@ -96,6 +99,7 @@ def main():
s = sub.add_parser("search", help="Search the archive")
s.add_argument("query", help="Search query")
s.add_argument("-n", "--limit", type=int, default=10)
s.add_argument("--semantic", action="store_true", help="Use holographic linker similarity scoring")
i = sub.add_parser("ingest", help="Ingest a new entry")
i.add_argument("--title", required=True)

View File

@@ -197,6 +197,71 @@ def test_archive_stats_orphan_count():
assert s["newest_entry"] is not None
def test_semantic_search_returns_results():
with tempfile.TemporaryDirectory() as tmp:
path = Path(tmp) / "test_archive.json"
archive = MnemosyneArchive(archive_path=path)
ingest_event(archive, title="Python automation", content="Building automation tools in Python")
ingest_event(archive, title="Cooking recipes", content="How to make pasta carbonara with cheese")
results = archive.semantic_search("python scripting", limit=5)
assert len(results) > 0
assert results[0].title == "Python automation"
def test_semantic_search_link_boost():
"""Entries with more inbound links rank higher when Jaccard is equal."""
with tempfile.TemporaryDirectory() as tmp:
path = Path(tmp) / "test_archive.json"
archive = MnemosyneArchive(archive_path=path)
# Create two similar entries; manually give one more links
e1 = ingest_event(archive, title="Machine learning", content="Neural networks deep learning models")
e2 = ingest_event(archive, title="Machine learning basics", content="Neural networks deep learning intro")
# Add a third entry that links to e1 so e1 has more inbound links
e3 = ingest_event(archive, title="AI overview", content="Artificial intelligence machine learning")
# Manually give e1 an extra inbound link by adding e3 -> e1
if e1.id not in e3.links:
e3.links.append(e1.id)
archive._save()
results = archive.semantic_search("machine learning neural networks", limit=5)
assert len(results) >= 2
# e1 should rank at or near top
assert results[0].id in {e1.id, e2.id}
def test_semantic_search_fallback_to_keyword():
"""Falls back to keyword search when no entry meets Jaccard threshold."""
with tempfile.TemporaryDirectory() as tmp:
path = Path(tmp) / "test_archive.json"
archive = MnemosyneArchive(archive_path=path)
ingest_event(archive, title="Exact match only", content="unique xyzzy token here")
# threshold=1.0 ensures no semantic match, triggering fallback
results = archive.semantic_search("xyzzy", limit=5, threshold=1.0)
# Fallback keyword search should find it
assert len(results) == 1
assert results[0].title == "Exact match only"
def test_semantic_search_empty_archive():
with tempfile.TemporaryDirectory() as tmp:
path = Path(tmp) / "test_archive.json"
archive = MnemosyneArchive(archive_path=path)
results = archive.semantic_search("anything", limit=5)
assert results == []
def test_semantic_search_vs_keyword_relevance():
"""Semantic search finds conceptually related entries missed by keyword search."""
with tempfile.TemporaryDirectory() as tmp:
path = Path(tmp) / "test_archive.json"
archive = MnemosyneArchive(archive_path=path)
ingest_event(archive, title="Python scripting", content="Writing scripts with Python for automation tasks")
ingest_event(archive, title="Baking bread", content="Mix flour water yeast knead bake oven")
# "coding" is semantically unrelated to baking but related to python scripting
results = archive.semantic_search("coding scripts automation")
assert len(results) > 0
assert results[0].title == "Python scripting"
def test_archive_topic_counts():
with tempfile.TemporaryDirectory() as tmp:
path = Path(tmp) / "test_archive.json"