This commit is contained in:
@@ -72,6 +72,53 @@ class MnemosyneArchive:
|
||||
scored.sort(key=lambda x: x[0], reverse=True)
|
||||
return [e for _, e in scored[:limit]]
|
||||
|
||||
def semantic_search(self, query: str, limit: int = 10, threshold: float = 0.05) -> list[ArchiveEntry]:
    """Rank entries by token overlap with the query, favouring well-linked ones.

    Each entry's title, content and topics are tokenized with
    ``HolographicLinker._tokenize`` and compared to the query tokens using
    Jaccard similarity (size of intersection over size of union). Entries
    whose similarity reaches ``threshold`` receive an additive bonus of up
    to 0.2, proportional to how many inbound links they have relative to
    the most linked-to entry in the archive. If nothing reaches the
    threshold, the result of ``self.search`` is returned instead.

    Args:
        query: Natural language query string.
        limit: Maximum number of results to return.
        threshold: Minimum Jaccard similarity (before the link bonus) for
            an entry to count as a semantic match.

    Returns:
        Matching entries ordered by similarity plus link bonus, highest
        first. An empty list when the query yields no tokens.
    """
    wanted = HolographicLinker._tokenize(query)
    if not wanted:
        return []

    # Tally, for every known entry id, how many link references point at it.
    # Links to ids that are not in the archive are ignored.
    inbound_counts: dict[str, int] = dict.fromkeys(self._entries, 0)
    for source in self._entries.values():
        for target_id in source.links:
            if target_id in inbound_counts:
                inbound_counts[target_id] += 1

    # Normaliser for the bonus; forced to 1 when the archive is empty or
    # no entry has inbound links, so the division below is always safe.
    busiest = max(inbound_counts.values(), default=1) or 1

    ranked = []
    for candidate in self._entries.values():
        text = f"{candidate.title} {candidate.content} {' '.join(candidate.topics)}"
        tokens = HolographicLinker._tokenize(text)
        if not tokens:
            continue
        shared = wanted & tokens
        combined = wanted | tokens
        similarity = len(shared) / len(combined)
        if similarity >= threshold:
            bonus = inbound_counts[candidate.id] / busiest * 0.2  # additive, at most 0.2
            ranked.append((similarity + bonus, candidate))

    if not ranked:
        # Graceful fallback to keyword search
        return self.search(query, limit=limit)

    best_first = sorted(ranked, key=lambda pair: pair[0], reverse=True)
    return [candidate for _, candidate in best_first[:limit]]
|
||||
|
||||
def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]:
|
||||
"""Get entries linked to a given entry, up to specified depth."""
|
||||
visited = set()
|
||||
|
||||
@@ -23,7 +23,10 @@ def cmd_stats(args):
|
||||
|
||||
def cmd_search(args):
    """Handle the ``search`` subcommand.

    Opens the default archive and runs a single query: the semantic search
    when ``args.semantic`` is truthy, the plain ``search`` otherwise.
    Prints a notice and returns early when nothing matches.

    Args:
        args: Parsed command-line namespace providing ``query`` and
            ``limit``; ``semantic`` is optional and treated as False when
            absent.
    """
    archive = MnemosyneArchive()
    # Run exactly one search. An unconditional archive.search() call before
    # this branch would be wasted work, since its result is always replaced.
    if getattr(args, "semantic", False):
        results = archive.semantic_search(args.query, limit=args.limit)
    else:
        results = archive.search(args.query, limit=args.limit)
    if not results:
        print("No results found.")
        return
|
||||
@@ -96,6 +99,7 @@ def main():
|
||||
s = sub.add_parser("search", help="Search the archive")
|
||||
s.add_argument("query", help="Search query")
|
||||
s.add_argument("-n", "--limit", type=int, default=10)
|
||||
s.add_argument("--semantic", action="store_true", help="Use holographic linker similarity scoring")
|
||||
|
||||
i = sub.add_parser("ingest", help="Ingest a new entry")
|
||||
i.add_argument("--title", required=True)
|
||||
|
||||
@@ -197,6 +197,71 @@ def test_archive_stats_orphan_count():
|
||||
assert s["newest_entry"] is not None
|
||||
|
||||
|
||||
def test_semantic_search_returns_results():
    """The entry whose text overlaps the query is returned as the top hit."""
    with tempfile.TemporaryDirectory() as workdir:
        archive = MnemosyneArchive(archive_path=Path(workdir) / "test_archive.json")
        ingest_event(archive, title="Python automation", content="Building automation tools in Python")
        ingest_event(archive, title="Cooking recipes", content="How to make pasta carbonara with cheese")

        hits = archive.semantic_search("python scripting", limit=5)

        assert len(hits) > 0
        best = hits[0]
        assert best.title == "Python automation"
|
||||
|
||||
|
||||
def test_semantic_search_link_boost():
    """With an extra inbound link on one of two similar entries, one of them ranks first.

    Only checks that both machine-learning entries are returned and that the
    top hit is one of the two; it does not pin which of them wins.
    """
    with tempfile.TemporaryDirectory() as workdir:
        archive = MnemosyneArchive(archive_path=Path(workdir) / "test_archive.json")

        # Two near-duplicate entries plus a third that will point at the first.
        linked = ingest_event(archive, title="Machine learning", content="Neural networks deep learning models")
        sibling = ingest_event(archive, title="Machine learning basics", content="Neural networks deep learning intro")
        hub = ingest_event(archive, title="AI overview", content="Artificial intelligence machine learning")

        # Make sure the hub -> linked edge exists so `linked` has an inbound link.
        if linked.id not in hub.links:
            hub.links.append(linked.id)
            archive._save()

        hits = archive.semantic_search("machine learning neural networks", limit=5)

        assert len(hits) >= 2
        expected_top_ids = {linked.id, sibling.id}
        assert hits[0].id in expected_top_ids
|
||||
|
||||
|
||||
def test_semantic_search_fallback_to_keyword():
    """When no entry reaches the Jaccard threshold, the keyword search result is returned."""
    with tempfile.TemporaryDirectory() as workdir:
        archive = MnemosyneArchive(archive_path=Path(workdir) / "test_archive.json")
        ingest_event(archive, title="Exact match only", content="unique xyzzy token here")

        # A threshold of 1.0 is meant to rule out any semantic match, so the
        # call has to go through the keyword fallback to find the entry.
        hits = archive.semantic_search("xyzzy", limit=5, threshold=1.0)

        assert len(hits) == 1
        only_hit = hits[0]
        assert only_hit.title == "Exact match only"
|
||||
|
||||
|
||||
def test_semantic_search_empty_archive():
    """Searching an archive with no entries yields an empty list."""
    with tempfile.TemporaryDirectory() as workdir:
        archive = MnemosyneArchive(archive_path=Path(workdir) / "test_archive.json")
        hits = archive.semantic_search("anything", limit=5)
        assert hits == []
|
||||
|
||||
|
||||
def test_semantic_search_vs_keyword_relevance():
    """The entry sharing words with the query ranks first; the unrelated one does not."""
    with tempfile.TemporaryDirectory() as workdir:
        archive = MnemosyneArchive(archive_path=Path(workdir) / "test_archive.json")
        ingest_event(archive, title="Python scripting", content="Writing scripts with Python for automation tasks")
        ingest_event(archive, title="Baking bread", content="Mix flour water yeast knead bake oven")

        # The query's words appear in the Python entry's text ("scripts",
        # "automation"); none of them appear in the baking entry.
        hits = archive.semantic_search("coding scripts automation")

        assert len(hits) > 0
        best = hits[0]
        assert best.title == "Python scripting"
|
||||
|
||||
|
||||
def test_archive_topic_counts():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
|
||||
Reference in New Issue
Block a user