Compare commits
1 Commits
mimo/code/
...
feat/mnemo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7e40d909bd |
BIN
nexus/mnemosyne/__pycache__/archive.cpython-311.pyc
Normal file
BIN
nexus/mnemosyne/__pycache__/archive.cpython-311.pyc
Normal file
Binary file not shown.
BIN
nexus/mnemosyne/__pycache__/cli.cpython-311.pyc
Normal file
BIN
nexus/mnemosyne/__pycache__/cli.cpython-311.pyc
Normal file
Binary file not shown.
@@ -72,6 +72,60 @@ class MnemosyneArchive:
|
|||||||
scored.sort(key=lambda x: x[0], reverse=True)
|
scored.sort(key=lambda x: x[0], reverse=True)
|
||||||
return [e for _, e in scored[:limit]]
|
return [e for _, e in scored[:limit]]
|
||||||
|
|
||||||
|
def semantic_search(
    self,
    query: str,
    limit: int = 10,
    threshold: float = 0.05,
    link_boost: float = 0.1,
    boost_cap: float = 0.3,
) -> list[tuple[ArchiveEntry, float]]:
    """Semantic search using holographic linker similarity.

    Scores entries by Jaccard similarity between query tokens and entry
    tokens, boosted by link connectivity (entries with more back-links
    are more "holographic" and get a relevance bump).

    Args:
        query: search text
        limit: max results to return
        threshold: minimum similarity score to include
        link_boost: extra score per back-link
        boost_cap: upper bound on the total link-connectivity boost
            (default 0.3, matching the previous hard-coded cap)

    Returns list of (entry, score) tuples sorted by score desc.
    """
    from nexus.mnemosyne.entry import ArchiveEntry as _AE

    # Build a synthetic entry for the query so we can reuse the linker's
    # similarity machinery instead of re-implementing tokenization here.
    query_entry = _AE(title=query, content="")
    query_tokens = self.linker._tokenize(query)

    # Count back-links for each entry (incoming link graph).
    back_links: dict[str, int] = {}
    for entry in self._entries.values():
        for lid in entry.links:
            back_links[lid] = back_links.get(lid, 0) + 1

    scored: list[tuple[float, ArchiveEntry]] = []
    for entry in self._entries.values():
        sim = self.linker.compute_similarity(query_entry, entry)
        if sim < threshold:
            continue
        # Link connectivity boost (capped so hub entries can't dominate).
        boost = min(back_links.get(entry.id, 0) * link_boost, boost_cap)
        # Topic overlap bonus: +0.05 per query token shared with the
        # entry's topic tokens.
        topic_bonus = 0.0
        if entry.topics:
            topic_tokens = set()
            for t in entry.topics:
                topic_tokens.update(self.linker._tokenize(t))
            overlap = len(query_tokens & topic_tokens)
            topic_bonus = overlap * 0.05
        # Round so scores are stable across serialization round-trips.
        score = round(sim + boost + topic_bonus, 6)
        scored.append((score, entry))

    scored.sort(key=lambda x: x[0], reverse=True)
    return [(e, s) for s, e in scored[:limit]]
|
||||||
|
|
||||||
def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]:
|
def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]:
|
||||||
"""Get entries linked to a given entry, up to specified depth."""
|
"""Get entries linked to a given entry, up to specified depth."""
|
||||||
visited = set()
|
visited = set()
|
||||||
|
|||||||
@@ -23,16 +23,28 @@ def cmd_stats(args):
|
|||||||
|
|
||||||
def cmd_search(args):
    """Search the archive and print matching entries.

    With --semantic, uses holographic similarity search (results carry a
    relevance score); otherwise falls back to keyword matching. Both paths
    share one printing loop instead of duplicating it per branch.
    """
    archive = MnemosyneArchive()
    if args.semantic:
        # semantic_search returns (entry, score) pairs.
        results = archive.semantic_search(args.query, limit=args.limit)
    else:
        # Keyword search returns bare entries; pair with a null score so
        # the printing loop below handles both modes uniformly.
        results = [(entry, None) for entry in archive.search(args.query, limit=args.limit)]
    if not results:
        print("No results found.")
        return
    for entry, score in results:
        linked = len(entry.links)
        if score is not None:
            print(f"[{entry.id[:8]}] {entry.title} (score: {score:.4f})")
        else:
            print(f"[{entry.id[:8]}] {entry.title}")
        print(f" Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}")
        print(f" {entry.content[:120]}...")
        print()
|
||||||
|
|
||||||
|
|
||||||
def cmd_ingest(args):
|
def cmd_ingest(args):
|
||||||
@@ -96,6 +108,7 @@ def main():
|
|||||||
s = sub.add_parser("search", help="Search the archive")
|
s = sub.add_parser("search", help="Search the archive")
|
||||||
s.add_argument("query", help="Search query")
|
s.add_argument("query", help="Search query")
|
||||||
s.add_argument("-n", "--limit", type=int, default=10)
|
s.add_argument("-n", "--limit", type=int, default=10)
|
||||||
|
s.add_argument("--semantic", action="store_true", help="Use semantic similarity search instead of keyword matching")
|
||||||
|
|
||||||
i = sub.add_parser("ingest", help="Ingest a new entry")
|
i = sub.add_parser("ingest", help="Ingest a new entry")
|
||||||
i.add_argument("--title", required=True)
|
i.add_argument("--title", required=True)
|
||||||
|
|||||||
BIN
nexus/mnemosyne/tests/__pycache__/test_archive.cpython-311.pyc
Normal file
BIN
nexus/mnemosyne/tests/__pycache__/test_archive.cpython-311.pyc
Normal file
Binary file not shown.
@@ -209,3 +209,72 @@ def test_archive_topic_counts():
|
|||||||
assert counts["automation"] == 2
|
assert counts["automation"] == 2
|
||||||
# sorted by count desc — both tied but must be present
|
# sorted by count desc — both tied but must be present
|
||||||
assert set(counts.keys()) == {"python", "automation"}
|
assert set(counts.keys()) == {"python", "automation"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_semantic_search_returns_scored_results():
    """Semantic search returns non-empty, relevance-ordered (entry, score) pairs."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive_file = Path(tmpdir) / "test_archive.json"
        archive = MnemosyneArchive(archive_path=archive_file)
        ingest_event(archive, title="Python automation", content="Building automation tools in Python scripts")
        ingest_event(archive, title="Cooking recipes", content="How to make pasta carbonara")

        hits = archive.semantic_search("Python scripting automation")

        assert len(hits) > 0
        # Python-related entry should score higher than cooking
        top_entry, _ = hits[0]
        assert top_entry.title != "Cooking recipes"
        # Results are (entry, score) tuples with positive float scores
        for _, score in hits:
            assert isinstance(score, float)
            assert score > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_semantic_search_ranks_by_similarity():
    """Web-related entries should outrank the unrelated astronomy entry."""
    with tempfile.TemporaryDirectory() as tmp:
        path = Path(tmp) / "test_archive.json"
        archive = MnemosyneArchive(archive_path=path)
        e1 = ingest_event(archive, title="Web development", content="Building websites with HTML CSS JavaScript")
        # Second web entry only pads the corpus; its handle is never used.
        ingest_event(archive, title="Web scraping", content="Extracting data from websites using Python requests")
        e3 = ingest_event(archive, title="Astronomy", content="Studying stars planets galaxies")

        results = archive.semantic_search("web programming internet")
        titles = [e.title for e, _ in results]

        # The relevant entry must actually appear (otherwise the original
        # assertion would have raised ValueError from .index instead of
        # failing cleanly).
        assert e1.title in titles
        # Astronomy, if present at all, must rank below the web entry.
        assert e3.title not in titles or titles.index(e3.title) > titles.index(e1.title)
|
||||||
|
|
||||||
|
|
||||||
|
def test_semantic_search_link_boost():
    """Entries with back-links should score at least as high as unlinked twins."""
    with tempfile.TemporaryDirectory() as tmp:
        path = Path(tmp) / "test_archive.json"
        archive = MnemosyneArchive(archive_path=path)
        # Entry A is about Python — will get linked to
        e_a = ingest_event(archive, title="Python tools", content="Automation scripting development Python")
        # Entry B is also about Python but with different enough content to
        # link to A (its handle is not needed, only the auto-link side effect).
        ingest_event(archive, title="Python frameworks", content="Django Flask FastAPI Python development")
        # Entry C has same keywords but no links (added with auto_link=False)
        e_c = ArchiveEntry(title="Python tools", content="Automation scripting development Python")
        archive.add(e_c, auto_link=False)

        results = archive.semantic_search("Python automation")
        scores = {e.id: s for e, s in results}

        # Entries with more back-links should get the boost:
        # e_a has auto-links from B; e_c has none, so with identical
        # content e_a must score >= e_c (link-connectivity boost).
        if e_a.id in scores and e_c.id in scores:
            assert scores[e_a.id] >= scores[e_c.id]
|
||||||
|
|
||||||
|
|
||||||
|
def test_semantic_search_empty_archive():
    """An archive with no entries yields an empty result list."""
    with tempfile.TemporaryDirectory() as tmpdir:
        empty_archive = MnemosyneArchive(archive_path=Path(tmpdir) / "test_archive.json")
        assert empty_archive.semantic_search("anything") == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_semantic_search_threshold():
    """A high threshold filters out entries unrelated to the query."""
    with tempfile.TemporaryDirectory() as tmpdir:
        store = Path(tmpdir) / "test_archive.json"
        archive = MnemosyneArchive(archive_path=store)
        ingest_event(archive, title="Quantum physics", content="Particle wave duality entanglement")

        # Completely unrelated query — should return empty with high threshold
        hits = archive.semantic_search("cooking pasta carbonara", threshold=0.5)
        assert hits == []
|
||||||
|
|||||||
Reference in New Issue
Block a user