Compare commits
4 Commits
feat/mnemo
...
feat/mnemo
| Author | SHA1 | Date | |
|---|---|---|---|
| 1e170074ce | |||
| 27c211268f | |||
| ddf4f8cd3d | |||
| c0bb6bf918 |
Binary file not shown.
Binary file not shown.
@@ -7,14 +7,13 @@ and provides query interfaces for retrieving connected knowledge.
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from nexus.mnemosyne.entry import ArchiveEntry
|
||||
from nexus.mnemosyne.linker import HolographicLinker
|
||||
|
||||
_EXPORT_VERSION = "1"
|
||||
|
||||
|
||||
class MnemosyneArchive:
|
||||
"""The holographic archive — stores and links entries.
|
||||
@@ -60,6 +59,21 @@ class MnemosyneArchive:
|
||||
def get(self, entry_id: str) -> Optional[ArchiveEntry]:
|
||||
return self._entries.get(entry_id)
|
||||
|
||||
def remove(self, entry_id: str) -> bool:
|
||||
"""Remove an entry and clean all bidirectional links to it.
|
||||
|
||||
Returns True if entry was found and removed, False otherwise.
|
||||
"""
|
||||
if entry_id not in self._entries:
|
||||
return False
|
||||
# Remove bidirectional links
|
||||
for other in self._entries.values():
|
||||
if entry_id in other.links:
|
||||
other.links.remove(entry_id)
|
||||
del self._entries[entry_id]
|
||||
self._save()
|
||||
return True
|
||||
|
||||
def search(self, query: str, limit: int = 10) -> list[ArchiveEntry]:
|
||||
"""Simple keyword search across titles and content."""
|
||||
query_tokens = set(query.lower().split())
|
||||
@@ -72,59 +86,43 @@ class MnemosyneArchive:
|
||||
scored.sort(key=lambda x: x[0], reverse=True)
|
||||
return [e for _, e in scored[:limit]]
|
||||
|
||||
def semantic_search(
|
||||
def export(
|
||||
self,
|
||||
query: str,
|
||||
limit: int = 10,
|
||||
threshold: float = 0.05,
|
||||
link_boost: float = 0.1,
|
||||
) -> list[tuple[ArchiveEntry, float]]:
|
||||
"""Semantic search using holographic linker similarity.
|
||||
|
||||
Scores entries by Jaccard similarity between query tokens and entry
|
||||
tokens, boosted by link connectivity (entries with more back-links
|
||||
are more "holographic" and get a relevance bump).
|
||||
topics: Optional[list[str]] = None,
|
||||
source: Optional[str] = None,
|
||||
since: Optional[str] = None,
|
||||
) -> dict:
|
||||
"""Export archive data, optionally filtered.
|
||||
|
||||
Args:
|
||||
query: search text
|
||||
limit: max results to return
|
||||
threshold: minimum similarity score to include
|
||||
link_boost: extra score per back-link (capped at 0.3)
|
||||
topics: Only include entries with at least one matching topic.
|
||||
source: Only include entries from this source.
|
||||
since: ISO date string — only include entries created after this.
|
||||
|
||||
Returns list of (entry, score) tuples sorted by score desc.
|
||||
Returns:
|
||||
Dict with entries, count, and export metadata.
|
||||
"""
|
||||
from nexus.mnemosyne.entry import ArchiveEntry as _AE
|
||||
entries = list(self._entries.values())
|
||||
|
||||
# Build a synthetic entry for the query so we can use the linker
|
||||
query_entry = _AE(title=query, content="")
|
||||
query_tokens = self.linker._tokenize(query)
|
||||
if topics:
|
||||
topics_lower = {t.lower() for t in topics}
|
||||
entries = [e for e in entries if topics_lower & {t.lower() for t in e.topics}]
|
||||
|
||||
# Count back-links for each entry (incoming link graph)
|
||||
back_links: dict[str, int] = {}
|
||||
for entry in self._entries.values():
|
||||
for lid in entry.links:
|
||||
back_links[lid] = back_links.get(lid, 0) + 1
|
||||
if source:
|
||||
entries = [e for e in entries if e.source == source]
|
||||
|
||||
scored: list[tuple[float, ArchiveEntry]] = []
|
||||
for entry in self._entries.values():
|
||||
sim = self.linker.compute_similarity(query_entry, entry)
|
||||
if sim < threshold:
|
||||
continue
|
||||
# Link connectivity boost (capped)
|
||||
boost = min(back_links.get(entry.id, 0) * link_boost, 0.3)
|
||||
# Topic overlap bonus
|
||||
topic_bonus = 0.0
|
||||
if entry.topics:
|
||||
topic_tokens = set()
|
||||
for t in entry.topics:
|
||||
topic_tokens.update(self.linker._tokenize(t))
|
||||
overlap = len(query_tokens & topic_tokens)
|
||||
topic_bonus = overlap * 0.05
|
||||
score = round(sim + boost + topic_bonus, 6)
|
||||
scored.append((score, entry))
|
||||
if since:
|
||||
entries = [e for e in entries if e.created_at >= since]
|
||||
|
||||
scored.sort(key=lambda x: x[0], reverse=True)
|
||||
return [(e, s) for s, e in scored[:limit]]
|
||||
return {
|
||||
"entries": [e.to_dict() for e in entries],
|
||||
"count": len(entries),
|
||||
"filters": {
|
||||
"topics": topics,
|
||||
"source": source,
|
||||
"since": since,
|
||||
},
|
||||
}
|
||||
|
||||
def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]:
|
||||
"""Get entries linked to a given entry, up to specified depth."""
|
||||
@@ -153,67 +151,13 @@ class MnemosyneArchive:
|
||||
topic_lower = topic.lower()
|
||||
return [e for e in self._entries.values() if topic_lower in [t.lower() for t in e.topics]]
|
||||
|
||||
def remove(self, entry_id: str) -> bool:
|
||||
"""Remove an entry and clean up all bidirectional links.
|
||||
|
||||
Returns True if the entry existed and was removed, False otherwise.
|
||||
"""
|
||||
if entry_id not in self._entries:
|
||||
return False
|
||||
# Remove back-links from all other entries
|
||||
for other in self._entries.values():
|
||||
if entry_id in other.links:
|
||||
other.links.remove(entry_id)
|
||||
del self._entries[entry_id]
|
||||
self._save()
|
||||
return True
|
||||
|
||||
def export(
|
||||
self,
|
||||
query: Optional[str] = None,
|
||||
topics: Optional[list[str]] = None,
|
||||
) -> dict:
|
||||
"""Export a filtered subset of the archive.
|
||||
|
||||
Args:
|
||||
query: keyword filter applied to title + content (case-insensitive)
|
||||
topics: list of topic tags; entries must match at least one
|
||||
|
||||
Returns a JSON-serialisable dict with an ``entries`` list and metadata.
|
||||
"""
|
||||
candidates = list(self._entries.values())
|
||||
|
||||
if topics:
|
||||
lower_topics = {t.lower() for t in topics}
|
||||
candidates = [
|
||||
e for e in candidates
|
||||
if any(t.lower() in lower_topics for t in e.topics)
|
||||
]
|
||||
|
||||
if query:
|
||||
query_tokens = set(query.lower().split())
|
||||
candidates = [
|
||||
e for e in candidates
|
||||
if any(
|
||||
token in f"{e.title} {e.content} {' '.join(e.topics)}".lower()
|
||||
for token in query_tokens
|
||||
)
|
||||
]
|
||||
|
||||
return {
|
||||
"version": _EXPORT_VERSION,
|
||||
"filters": {"query": query, "topics": topics},
|
||||
"count": len(candidates),
|
||||
"entries": [e.to_dict() for e in candidates],
|
||||
}
|
||||
|
||||
def topic_counts(self) -> dict[str, int]:
|
||||
"""Return a dict mapping topic name → entry count, sorted by count desc."""
|
||||
counts: dict[str, int] = {}
|
||||
def list_topics(self) -> dict[str, int]:
|
||||
"""Return dict mapping each topic to its entry count."""
|
||||
topic_counts: dict[str, int] = {}
|
||||
for entry in self._entries.values():
|
||||
for topic in entry.topics:
|
||||
counts[topic] = counts.get(topic, 0) + 1
|
||||
return dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))
|
||||
topic_counts[topic] = topic_counts.get(topic, 0) + 1
|
||||
return dict(sorted(topic_counts.items(), key=lambda x: x[1], reverse=True))
|
||||
|
||||
@property
|
||||
def count(self) -> int:
|
||||
@@ -222,29 +166,30 @@ class MnemosyneArchive:
|
||||
def stats(self) -> dict:
|
||||
entries = list(self._entries.values())
|
||||
total_links = sum(len(e.links) for e in entries)
|
||||
topics: set[str] = set()
|
||||
topics = set()
|
||||
for e in entries:
|
||||
topics.update(e.topics)
|
||||
|
||||
# Orphans: entries with no links at all
|
||||
orphans = sum(1 for e in entries if len(e.links) == 0)
|
||||
# Orphans: entries with no links
|
||||
orphans = sum(1 for e in entries if not e.links)
|
||||
|
||||
# Link density: average links per entry (0 when empty)
|
||||
# Link density: ratio of actual links to possible links
|
||||
n = len(entries)
|
||||
link_density = round(total_links / n, 4) if n else 0.0
|
||||
max_links = n * (n - 1) if n > 1 else 0
|
||||
link_density = total_links / max_links if max_links > 0 else 0.0
|
||||
|
||||
# Age distribution
|
||||
timestamps = sorted(e.created_at for e in entries)
|
||||
oldest_entry = timestamps[0] if timestamps else None
|
||||
newest_entry = timestamps[-1] if timestamps else None
|
||||
# Entry age range
|
||||
timestamps = [e.created_at for e in entries if e.created_at]
|
||||
oldest = min(timestamps) if timestamps else None
|
||||
newest = max(timestamps) if timestamps else None
|
||||
|
||||
return {
|
||||
"entries": n,
|
||||
"entries": len(entries),
|
||||
"total_links": total_links,
|
||||
"orphans": orphans,
|
||||
"link_density": round(link_density, 4),
|
||||
"unique_topics": len(topics),
|
||||
"topics": sorted(topics),
|
||||
"orphans": orphans,
|
||||
"link_density": link_density,
|
||||
"oldest_entry": oldest_entry,
|
||||
"newest_entry": newest_entry,
|
||||
"oldest_entry": oldest,
|
||||
"newest_entry": newest,
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
"""CLI interface for Mnemosyne.
|
||||
|
||||
Provides: mnemosyne ingest, mnemosyne search, mnemosyne link, mnemosyne stats,
|
||||
mnemosyne topics, mnemosyne remove, mnemosyne export
|
||||
Provides: mnemosyne ingest, mnemosyne search, mnemosyne link, mnemosyne stats, mnemosyne topics, mnemosyne remove, mnemosyne export
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -23,28 +22,16 @@ def cmd_stats(args):
|
||||
|
||||
def cmd_search(args):
|
||||
archive = MnemosyneArchive()
|
||||
if args.semantic:
|
||||
results = archive.semantic_search(args.query, limit=args.limit)
|
||||
if not results:
|
||||
print("No results found.")
|
||||
return
|
||||
for entry, score in results:
|
||||
linked = len(entry.links)
|
||||
print(f"[{entry.id[:8]}] {entry.title} (score: {score:.4f})")
|
||||
print(f" Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}")
|
||||
print(f" {entry.content[:120]}...")
|
||||
print()
|
||||
else:
|
||||
results = archive.search(args.query, limit=args.limit)
|
||||
if not results:
|
||||
print("No results found.")
|
||||
return
|
||||
for entry in results:
|
||||
linked = len(entry.links)
|
||||
print(f"[{entry.id[:8]}] {entry.title}")
|
||||
print(f" Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}")
|
||||
print(f" {entry.content[:120]}...")
|
||||
print()
|
||||
results = archive.search(args.query, limit=args.limit)
|
||||
if not results:
|
||||
print("No results found.")
|
||||
return
|
||||
for entry in results:
|
||||
linked = len(entry.links)
|
||||
print(f"[{entry.id[:8]}] {entry.title}")
|
||||
print(f" Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}")
|
||||
print(f" {entry.content[:120]}...")
|
||||
print()
|
||||
|
||||
|
||||
def cmd_ingest(args):
|
||||
@@ -74,29 +61,39 @@ def cmd_link(args):
|
||||
|
||||
def cmd_topics(args):
|
||||
archive = MnemosyneArchive()
|
||||
counts = archive.topic_counts()
|
||||
if not counts:
|
||||
topics = archive.list_topics()
|
||||
if not topics:
|
||||
print("No topics found.")
|
||||
return
|
||||
for topic, count in counts.items():
|
||||
print(f" {topic}: {count}")
|
||||
for topic, count in topics.items():
|
||||
print(f" {topic}: {count} {'entry' if count == 1 else 'entries'}")
|
||||
|
||||
|
||||
def cmd_remove(args):
|
||||
archive = MnemosyneArchive()
|
||||
removed = archive.remove(args.entry_id)
|
||||
if removed:
|
||||
print(f"Removed entry: {args.entry_id}")
|
||||
else:
|
||||
entry = archive.get(args.entry_id)
|
||||
if not entry:
|
||||
print(f"Entry not found: {args.entry_id}")
|
||||
sys.exit(1)
|
||||
title = entry.title
|
||||
if archive.remove(args.entry_id):
|
||||
print(f"Removed: [{args.entry_id[:8]}] {title}")
|
||||
else:
|
||||
print(f"Failed to remove: {args.entry_id}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def cmd_export(args):
|
||||
archive = MnemosyneArchive()
|
||||
topics = [t.strip() for t in args.topics.split(",")] if args.topics else None
|
||||
data = archive.export(query=args.query or None, topics=topics)
|
||||
print(json.dumps(data, indent=2))
|
||||
topics = args.topics.split(",") if args.topics else None
|
||||
data = archive.export(topics=topics, source=args.source, since=args.since)
|
||||
output = json.dumps(data, indent=2)
|
||||
if args.output:
|
||||
with open(args.output, "w") as f:
|
||||
f.write(output)
|
||||
print(f"Exported {data['count']} entries to {args.output}")
|
||||
else:
|
||||
print(output)
|
||||
|
||||
|
||||
def main():
|
||||
@@ -108,7 +105,6 @@ def main():
|
||||
s = sub.add_parser("search", help="Search the archive")
|
||||
s.add_argument("query", help="Search query")
|
||||
s.add_argument("-n", "--limit", type=int, default=10)
|
||||
s.add_argument("--semantic", action="store_true", help="Use semantic similarity search instead of keyword matching")
|
||||
|
||||
i = sub.add_parser("ingest", help="Ingest a new entry")
|
||||
i.add_argument("--title", required=True)
|
||||
@@ -121,28 +117,23 @@ def main():
|
||||
|
||||
sub.add_parser("topics", help="List all topics with entry counts")
|
||||
|
||||
r = sub.add_parser("remove", help="Remove an entry by ID")
|
||||
r.add_argument("entry_id", help="Entry ID to remove")
|
||||
r = sub.add_parser("remove", help="Remove an entry")
|
||||
r.add_argument("entry_id", help="Entry ID (or prefix)")
|
||||
|
||||
ex = sub.add_parser("export", help="Export filtered archive data as JSON")
|
||||
ex.add_argument("-q", "--query", default="", help="Keyword filter")
|
||||
ex.add_argument("-t", "--topics", default="", help="Comma-separated topic filter")
|
||||
e = sub.add_parser("export", help="Export archive data")
|
||||
e.add_argument("--topics", default=None, help="Filter by comma-separated topics")
|
||||
e.add_argument("--source", default=None, help="Filter by source")
|
||||
e.add_argument("--since", default=None, help="Filter by date (ISO format)")
|
||||
e.add_argument("-o", "--output", default=None, help="Output file path")
|
||||
|
||||
args = parser.parse_args()
|
||||
if not args.command:
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
dispatch = {
|
||||
"stats": cmd_stats,
|
||||
"search": cmd_search,
|
||||
"ingest": cmd_ingest,
|
||||
"link": cmd_link,
|
||||
"topics": cmd_topics,
|
||||
"remove": cmd_remove,
|
||||
"export": cmd_export,
|
||||
}
|
||||
dispatch[args.command](args)
|
||||
{"stats": cmd_stats, "search": cmd_search, "ingest": cmd_ingest,
|
||||
"link": cmd_link, "topics": cmd_topics, "remove": cmd_remove,
|
||||
"export": cmd_export}[args.command](args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Binary file not shown.
@@ -73,208 +73,73 @@ def test_archive_persistence():
|
||||
assert len(results) == 1
|
||||
|
||||
|
||||
def test_archive_remove_basic():
|
||||
def test_archive_remove():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
e1 = ingest_event(archive, title="Alpha", content="First entry", topics=["x"])
|
||||
assert archive.count == 1
|
||||
e1 = ingest_event(archive, title="Entry one", content="First content", topics=["test"])
|
||||
e2 = ingest_event(archive, title="Entry two", content="Second content", topics=["test"])
|
||||
assert archive.count == 2
|
||||
|
||||
result = archive.remove(e1.id)
|
||||
assert result is True
|
||||
assert archive.count == 0
|
||||
# Remove e1 — should also clean links from e2
|
||||
assert archive.remove(e1.id) is True
|
||||
assert archive.count == 1
|
||||
assert e1.id not in e2.links
|
||||
assert archive.get(e1.id) is None
|
||||
|
||||
# Remove non-existent
|
||||
assert archive.remove("nonexistent") is False
|
||||
|
||||
def test_archive_remove_nonexistent():
|
||||
|
||||
def test_archive_export():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
result = archive.remove("does-not-exist")
|
||||
assert result is False
|
||||
ingest_event(archive, title="Python entry", content="About Python", topics=["coding", "python"])
|
||||
ingest_event(archive, title="Cooking entry", content="About food", topics=["cooking"])
|
||||
ingest_event(archive, title="Another python", content="More Python", topics=["python"])
|
||||
|
||||
|
||||
def test_archive_remove_cleans_backlinks():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
e1 = ingest_event(archive, title="Python automation", content="Building automation tools in Python")
|
||||
e2 = ingest_event(archive, title="Python scripting", content="Writing automation scripts using Python")
|
||||
# At least one direction should be linked
|
||||
assert e1.id in e2.links or e2.id in e1.links
|
||||
|
||||
# Remove e1; e2 must no longer reference it
|
||||
archive.remove(e1.id)
|
||||
e2_fresh = archive.get(e2.id)
|
||||
assert e2_fresh is not None
|
||||
assert e1.id not in e2_fresh.links
|
||||
|
||||
|
||||
def test_archive_remove_persists():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
a1 = MnemosyneArchive(archive_path=path)
|
||||
e = ingest_event(a1, title="Gone", content="Will be removed")
|
||||
a1.remove(e.id)
|
||||
|
||||
a2 = MnemosyneArchive(archive_path=path)
|
||||
assert a2.count == 0
|
||||
|
||||
|
||||
def test_archive_export_unfiltered():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
ingest_event(archive, title="A", content="content a", topics=["alpha"])
|
||||
ingest_event(archive, title="B", content="content b", topics=["beta"])
|
||||
# Export all
|
||||
data = archive.export()
|
||||
assert data["count"] == 3
|
||||
|
||||
# Export by topic
|
||||
data = archive.export(topics=["python"])
|
||||
assert data["count"] == 2
|
||||
assert len(data["entries"]) == 2
|
||||
assert data["filters"] == {"query": None, "topics": None}
|
||||
|
||||
|
||||
def test_archive_export_by_topic():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
ingest_event(archive, title="A", content="content a", topics=["alpha"])
|
||||
ingest_event(archive, title="B", content="content b", topics=["beta"])
|
||||
data = archive.export(topics=["alpha"])
|
||||
data = archive.export(topics=["cooking"])
|
||||
assert data["count"] == 1
|
||||
assert data["entries"][0]["title"] == "A"
|
||||
|
||||
# Export by non-existent topic
|
||||
data = archive.export(topics=["nonexistent"])
|
||||
assert data["count"] == 0
|
||||
|
||||
|
||||
def test_archive_export_by_query():
|
||||
def test_archive_list_topics():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
ingest_event(archive, title="Hello world", content="greetings", topics=[])
|
||||
ingest_event(archive, title="Goodbye", content="farewell", topics=[])
|
||||
data = archive.export(query="hello")
|
||||
assert data["count"] == 1
|
||||
assert data["entries"][0]["title"] == "Hello world"
|
||||
ingest_event(archive, title="E1", content="c", topics=["alpha", "beta"])
|
||||
ingest_event(archive, title="E2", content="c", topics=["beta", "gamma"])
|
||||
ingest_event(archive, title="E3", content="c", topics=["beta"])
|
||||
|
||||
topics = archive.list_topics()
|
||||
assert topics["beta"] == 3
|
||||
assert topics["alpha"] == 1
|
||||
assert topics["gamma"] == 1
|
||||
|
||||
|
||||
def test_archive_export_combined_filters():
|
||||
def test_archive_richer_stats():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
ingest_event(archive, title="Hello world", content="greetings", topics=["alpha"])
|
||||
ingest_event(archive, title="Hello again", content="greetings again", topics=["beta"])
|
||||
data = archive.export(query="hello", topics=["alpha"])
|
||||
assert data["count"] == 1
|
||||
assert data["entries"][0]["title"] == "Hello world"
|
||||
e1 = ingest_event(archive, title="Linked entry", content="Shared content here", topics=["test"])
|
||||
e2 = ingest_event(archive, title="Linked content", content="Shared entry here", topics=["test"])
|
||||
ingest_event(archive, title="Orphan entry", content="Nothing in common cooking", topics=["food"])
|
||||
|
||||
|
||||
def test_archive_stats_richer():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
# All four new fields present when archive is empty
|
||||
s = archive.stats()
|
||||
assert "orphans" in s
|
||||
assert "link_density" in s
|
||||
assert "oldest_entry" in s
|
||||
assert "newest_entry" in s
|
||||
assert s["orphans"] == 0
|
||||
assert s["link_density"] == 0.0
|
||||
assert s["oldest_entry"] is None
|
||||
assert s["newest_entry"] is None
|
||||
|
||||
|
||||
def test_archive_stats_orphan_count():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
# Two entries with very different content → unlikely to auto-link
|
||||
ingest_event(archive, title="Zebras", content="Zebra stripes savannah Africa", topics=[])
|
||||
ingest_event(archive, title="Compiler", content="Lexer parser AST bytecode", topics=[])
|
||||
s = archive.stats()
|
||||
# At least one should be an orphan (no cross-link between these topics)
|
||||
assert s["orphans"] >= 0 # structural check
|
||||
assert s["link_density"] >= 0.0
|
||||
assert s["oldest_entry"] is not None
|
||||
assert s["newest_entry"] is not None
|
||||
|
||||
|
||||
def test_archive_topic_counts():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
ingest_event(archive, title="A", content="x", topics=["python", "automation"])
|
||||
ingest_event(archive, title="B", content="y", topics=["python"])
|
||||
ingest_event(archive, title="C", content="z", topics=["automation"])
|
||||
counts = archive.topic_counts()
|
||||
assert counts["python"] == 2
|
||||
assert counts["automation"] == 2
|
||||
# sorted by count desc — both tied but must be present
|
||||
assert set(counts.keys()) == {"python", "automation"}
|
||||
|
||||
|
||||
def test_semantic_search_returns_scored_results():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
ingest_event(archive, title="Python automation", content="Building automation tools in Python scripts")
|
||||
ingest_event(archive, title="Cooking recipes", content="How to make pasta carbonara")
|
||||
results = archive.semantic_search("Python scripting automation")
|
||||
assert len(results) > 0
|
||||
# Python-related entry should score higher than cooking
|
||||
assert results[0][0].title != "Cooking recipes"
|
||||
# Results are (entry, score) tuples
|
||||
assert all(isinstance(s, float) for _, s in results)
|
||||
assert all(s > 0 for _, s in results)
|
||||
|
||||
|
||||
def test_semantic_search_ranks_by_similarity():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
e1 = ingest_event(archive, title="Web development", content="Building websites with HTML CSS JavaScript")
|
||||
e2 = ingest_event(archive, title="Web scraping", content="Extracting data from websites using Python requests")
|
||||
e3 = ingest_event(archive, title="Astronomy", content="Studying stars planets galaxies")
|
||||
results = archive.semantic_search("web programming internet")
|
||||
titles = [e.title for e, _ in results]
|
||||
# Both web entries should rank above astronomy
|
||||
assert e3.title not in titles or titles.index(e3.title) > titles.index(e1.title)
|
||||
|
||||
|
||||
def test_semantic_search_link_boost():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
# Entry A is about Python — will get linked to
|
||||
e_a = ingest_event(archive, title="Python tools", content="Automation scripting development Python")
|
||||
# Entry B is also about Python but with different enough content to link to A
|
||||
e_b = ingest_event(archive, title="Python frameworks", content="Django Flask FastAPI Python development")
|
||||
# Entry C has same keywords but no links (added with auto_link=False)
|
||||
e_c = ArchiveEntry(title="Python tools", content="Automation scripting development Python")
|
||||
archive.add(e_c, auto_link=False)
|
||||
|
||||
results = archive.semantic_search("Python automation")
|
||||
scores = {e.id: s for e, s in results}
|
||||
|
||||
# Entries with more back-links should get the boost
|
||||
# e_a and e_b should be linked to each other
|
||||
if e_a.id in scores and e_c.id in scores:
|
||||
# e_a has auto-links from e_b, e_c does not
|
||||
# e_a should score >= e_c (boosted by link connectivity)
|
||||
assert scores[e_a.id] >= scores[e_c.id]
|
||||
|
||||
|
||||
def test_semantic_search_empty_archive():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
results = archive.semantic_search("anything")
|
||||
assert results == []
|
||||
|
||||
|
||||
def test_semantic_search_threshold():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
ingest_event(archive, title="Quantum physics", content="Particle wave duality entanglement")
|
||||
# Completely unrelated query — should return empty with high threshold
|
||||
results = archive.semantic_search("cooking pasta carbonara", threshold=0.5)
|
||||
assert results == []
|
||||
stats = archive.stats()
|
||||
assert stats["entries"] == 3
|
||||
assert stats["orphans"] >= 1 # At least the cooking entry should be orphan
|
||||
assert 0 <= stats["link_density"] <= 1
|
||||
assert stats["oldest_entry"] is not None
|
||||
assert stats["newest_entry"] is not None
|
||||
|
||||
Reference in New Issue
Block a user