Compare commits
2 Commits
mimo/code/
...
feat/mnemo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0aba89e2c3 | ||
|
|
7bf69571ae |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -8,3 +8,5 @@ mempalace/__pycache__/
|
||||
# Prevent agents from writing to wrong path (see issue #1145)
|
||||
public/nexus/
|
||||
test-screenshots/
|
||||
*.pyc
|
||||
__pycache__/
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
Phase 1: Foundation — core archive, entry model, holographic linker,
|
||||
ingestion pipeline, and CLI.
|
||||
|
||||
Phase 1b: Timeline navigation, duplicate detection, entry merging.
|
||||
|
||||
Builds on MemPalace vector memory to create interconnected meaning:
|
||||
entries auto-reference related entries via semantic similarity,
|
||||
forming a living archive that surfaces relevant context autonomously.
|
||||
|
||||
BIN
nexus/mnemosyne/__pycache__/__init__.cpython-311.pyc
Normal file
BIN
nexus/mnemosyne/__pycache__/__init__.cpython-311.pyc
Normal file
Binary file not shown.
BIN
nexus/mnemosyne/__pycache__/archive.cpython-311.pyc
Normal file
BIN
nexus/mnemosyne/__pycache__/archive.cpython-311.pyc
Normal file
Binary file not shown.
BIN
nexus/mnemosyne/__pycache__/entry.cpython-311.pyc
Normal file
BIN
nexus/mnemosyne/__pycache__/entry.cpython-311.pyc
Normal file
Binary file not shown.
BIN
nexus/mnemosyne/__pycache__/ingest.cpython-311.pyc
Normal file
BIN
nexus/mnemosyne/__pycache__/ingest.cpython-311.pyc
Normal file
Binary file not shown.
BIN
nexus/mnemosyne/__pycache__/linker.cpython-311.pyc
Normal file
BIN
nexus/mnemosyne/__pycache__/linker.cpython-311.pyc
Normal file
Binary file not shown.
@@ -7,6 +7,7 @@ and provides query interfaces for retrieving connected knowledge.
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
@@ -146,6 +147,139 @@ class MnemosyneArchive:
|
||||
topic_lower = topic.lower()
|
||||
return [e for e in self._entries.values() if topic_lower in [t.lower() for t in e.topics]]
|
||||
|
||||
def timeline(
    self,
    limit: int = 20,
    since: Optional[str] = None,
    until: Optional[str] = None,
    source: Optional[str] = None,
) -> list[ArchiveEntry]:
    """Get entries sorted by creation date, newest first.

    Args:
        limit: Maximum entries to return.
        since: ISO datetime string — only return entries created after this.
        until: ISO datetime string — only return entries created before this.
        source: Filter by source type (e.g. "mempalace", "event", "manual").

    Returns:
        List of ArchiveEntry sorted by created_at descending.
    """
    entries = list(self._entries.values())

    if source:
        entries = [e for e in entries if e.source == source]

    if since:
        if "T" in since:
            entries = [e for e in entries if e.created_at >= since]
        else:
            # Date-only filter: compare on the date prefix so the whole
            # day qualifies regardless of time-of-day precision.
            entries = [e for e in entries if e.created_at[:10] >= since]

    if until:
        if "T" in until:
            entries = [e for e in entries if e.created_at <= until]
        else:
            # Bug fix: the old code compared against f"{until}T23:59:59",
            # which excluded same-day entries carrying microseconds
            # (e.g. "...T23:59:59.123456" sorts after "...T23:59:59").
            # Comparing the date prefix makes the bound inclusive of the
            # entire day.
            entries = [e for e in entries if e.created_at[:10] <= until]

    entries.sort(key=lambda e: e.created_at, reverse=True)
    return entries[:limit]
|
||||
|
||||
def recent(self, n: int = 10) -> list[ArchiveEntry]:
    """Return the ``n`` most recently created entries, newest first.

    Shorthand for ``timeline(limit=n)`` with no date/source filters.
    """
    newest_first = self.timeline(limit=n)
    return newest_first
|
||||
|
||||
def find_duplicates(self, threshold: float = 0.7) -> list[tuple[ArchiveEntry, ArchiveEntry, float]]:
    """Find pairs of entries with high content similarity.

    Uses the holographic linker's Jaccard similarity on title+content.

    Args:
        threshold: Minimum similarity score to consider a pair duplicates.
            0.7 is recommended for catching near-duplicates.

    Returns:
        List of (entry_a, entry_b, similarity_score) tuples, sorted by
        score descending.
    """
    entries = list(self._entries.values())
    pairs: list[tuple[ArchiveEntry, ArchiveEntry, float]] = []

    # entries[i+1:] already enumerates each unordered pair exactly once,
    # so the old set-based "seen" bookkeeping was dead code and has been
    # removed.
    for i, a in enumerate(entries):
        for b in entries[i + 1:]:
            score = self.linker.compute_similarity(a, b)
            if score >= threshold:
                pairs.append((a, b, score))

    pairs.sort(key=lambda pair: pair[2], reverse=True)
    return pairs
|
||||
|
||||
def merge_entries(
    self,
    primary_id: str,
    duplicate_id: str,
) -> Optional[ArchiveEntry]:
    """Merge two entries into one, keeping the primary.

    The primary entry keeps its ID. The duplicate's unique attributes
    are merged in:
      - Topics are unioned (case-insensitive)
      - Content is concatenated if they differ (primary first)
      - Links are unioned (excluding both entry IDs)
      - Metadata keys from duplicate are added if missing in primary
      - All other entries linking to the duplicate are redirected to primary

    Args:
        primary_id: ID of the entry to keep.
        duplicate_id: ID of the entry to merge and remove.

    Returns:
        The updated primary entry, or None if either ID was not found.
    """
    primary = self._entries.get(primary_id)
    duplicate = self._entries.get(duplicate_id)
    if not primary or not duplicate:
        return None
    if primary_id == duplicate_id:
        # Merging an entry with itself is a no-op.
        return primary

    # Union topics, case-insensitively. Bug fix: the old code never
    # updated the seen-set while appending, so case-variant repeats
    # within duplicate.topics (e.g. ["ml", "ML"]) were both added.
    existing_topics = {t.lower() for t in primary.topics}
    for topic in duplicate.topics:
        if topic.lower() not in existing_topics:
            primary.topics.append(topic)
            existing_topics.add(topic.lower())

    # Append content only when it adds something new.
    if duplicate.content and duplicate.content not in primary.content:
        primary.content = f"{primary.content}\n\n---\n\n{duplicate.content}"

    # Union links, skipping self-references to either merged ID.
    # Bug fix: keep the seen-set current so a repeated ID inside
    # duplicate.links is not appended twice.
    existing_links = set(primary.links)
    for link_id in duplicate.links:
        if link_id not in (primary_id, duplicate_id) and link_id not in existing_links:
            primary.links.append(link_id)
            existing_links.add(link_id)

    # Metadata: the duplicate only fills keys the primary is missing.
    for key, value in duplicate.metadata.items():
        primary.metadata.setdefault(key, value)

    # Redirect inbound links: anything pointing at the duplicate now
    # points at the primary instead. (The old inner `entry.id !=
    # primary_id` re-check was redundant — the loop already skips the
    # primary.)
    for entry in self._entries.values():
        if entry.id == primary_id:
            continue
        if duplicate_id in entry.links:
            entry.links.remove(duplicate_id)
            if primary_id not in entry.links:
                entry.links.append(primary_id)

    # Remove the absorbed entry and persist the new state.
    del self._entries[duplicate_id]

    self._save()
    return primary
|
||||
|
||||
def remove(self, entry_id: str) -> bool:
|
||||
"""Remove an entry and clean up all bidirectional links.
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
"""CLI interface for Mnemosyne.
|
||||
|
||||
Provides: mnemosyne ingest, mnemosyne search, mnemosyne link, mnemosyne stats,
|
||||
mnemosyne topics, mnemosyne remove, mnemosyne export
|
||||
mnemosyne topics, mnemosyne remove, mnemosyne export,
|
||||
mnemosyne timeline, mnemosyne dedup, mnemosyne merge
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -90,6 +91,80 @@ def cmd_export(args):
|
||||
print(json.dumps(data, indent=2))
|
||||
|
||||
|
||||
def cmd_timeline(args):
    """Print archive entries newest-first, honoring the CLI filters.

    Empty --since/--until/--source strings are normalized to None so the
    archive applies no filter for them.
    """
    archive = MnemosyneArchive()
    entries = archive.timeline(
        limit=args.limit,
        since=args.since or None,
        until=args.until or None,
        source=args.source or None,
    )
    if not entries:
        print("No entries found.")
        return
    for entry in entries:
        linked = len(entry.links)
        print(f"[{entry.id[:8]}] {entry.title}")
        print(f" Created: {entry.created_at} | Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}")
        # Bug fix: only show an ellipsis when the preview actually
        # truncates the content (the old code always appended "...").
        suffix = "..." if len(entry.content) > 100 else ""
        print(f" {entry.content[:100]}{suffix}")
        print()
|
||||
|
||||
|
||||
def cmd_dedup(args):
    """Report duplicate entry pairs; merge them unless --dry-run is set."""
    archive = MnemosyneArchive()
    pairs = archive.find_duplicates(threshold=args.threshold)
    if not pairs:
        print("No duplicates found.")
        return

    print(f"Found {len(pairs)} duplicate pair(s) (threshold={args.threshold}):\n")
    for a, b, score in pairs:
        print(f" Score {score:.3f}:")
        print(f" [{a.id[:8]}] {a.title}")
        print(f" [{b.id[:8]}] {b.title}")
        print()

    if not args.dry_run:
        merged_count = 0
        for a, b, _ in pairs:
            # Keep the entry with more content as primary
            primary, duplicate = (a, b) if len(a.content) >= len(b.content) else (b, a)
            # Skip pairs whose members were consumed by an earlier merge
            # in this run (transitive duplicates). Bug fix: the old code
            # only checked the duplicate — a vanished primary made
            # merge_entries fail silently and the pair was never merged.
            if archive.get(duplicate.id) is None or archive.get(primary.id) is None:
                continue
            result = archive.merge_entries(primary.id, duplicate.id)
            if result:
                merged_count += 1
                print(f" Merged [{duplicate.id[:8]}] into [{result.id[:8]}] {result.title}")
        print(f"\nMerged {merged_count} pair(s).")
|
||||
|
||||
|
||||
def cmd_merge(args):
    """Merge two entries by ID; the surviving entry keeps its ID.

    By default the entry with more content survives; --into forces a
    specific survivor (must be one of the two given IDs).
    """
    archive = MnemosyneArchive()
    entry1 = archive.get(args.id1)
    entry2 = archive.get(args.id2)
    if not entry1:
        print(f"Entry not found: {args.id1}")
        sys.exit(1)
    if not entry2:
        print(f"Entry not found: {args.id2}")
        sys.exit(1)

    # Primary is the one with more content unless --into is specified
    if args.into:
        # Bug fix: previously any --into value was accepted as the
        # primary ID, silently merging into an unrelated entry.
        if args.into not in (args.id1, args.id2):
            print(f"--into must be one of the two IDs being merged: {args.id1}, {args.id2}")
            sys.exit(1)
        primary_id, dup_id = args.into, (args.id2 if args.into == args.id1 else args.id1)
    else:
        primary_id, dup_id = (entry1.id, entry2.id) if len(entry1.content) >= len(entry2.content) else (entry2.id, entry1.id)

    result = archive.merge_entries(primary_id, dup_id)
    if result:
        print(f"Merged into [{result.id[:8]}] {result.title}")
        print(f" Topics: {', '.join(result.topics)}")
        print(f" Links: {len(result.links)}")
    else:
        print("Merge failed.")
        sys.exit(1)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(prog="mnemosyne", description="The Living Holographic Archive")
|
||||
sub = parser.add_subparsers(dest="command")
|
||||
@@ -119,6 +194,21 @@ def main():
|
||||
ex.add_argument("-q", "--query", default="", help="Keyword filter")
|
||||
ex.add_argument("-t", "--topics", default="", help="Comma-separated topic filter")
|
||||
|
||||
tl = sub.add_parser("timeline", help="Show entries by creation date (newest first)")
|
||||
tl.add_argument("-n", "--limit", type=int, default=20, help="Max entries to show")
|
||||
tl.add_argument("--since", default="", help="ISO date filter: entries after this date")
|
||||
tl.add_argument("--until", default="", help="ISO date filter: entries before this date")
|
||||
tl.add_argument("--source", default="", help="Filter by source type (mempalace, event, manual)")
|
||||
|
||||
dd = sub.add_parser("dedup", help="Find and merge duplicate entries")
|
||||
dd.add_argument("--threshold", type=float, default=0.7, help="Similarity threshold (0.0-1.0)")
|
||||
dd.add_argument("--dry-run", action="store_true", help="Only list duplicates, don't merge")
|
||||
|
||||
mg = sub.add_parser("merge", help="Merge two entries into one")
|
||||
mg.add_argument("id1", help="First entry ID")
|
||||
mg.add_argument("id2", help="Second entry ID")
|
||||
mg.add_argument("--into", default="", help="Force this ID as the primary (surviving) entry")
|
||||
|
||||
args = parser.parse_args()
|
||||
if not args.command:
|
||||
parser.print_help()
|
||||
@@ -132,6 +222,9 @@ def main():
|
||||
"topics": cmd_topics,
|
||||
"remove": cmd_remove,
|
||||
"export": cmd_export,
|
||||
"timeline": cmd_timeline,
|
||||
"dedup": cmd_dedup,
|
||||
"merge": cmd_merge,
|
||||
}
|
||||
dispatch[args.command](args)
|
||||
|
||||
|
||||
BIN
nexus/mnemosyne/tests/__pycache__/__init__.cpython-311.pyc
Normal file
BIN
nexus/mnemosyne/tests/__pycache__/__init__.cpython-311.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
187
nexus/mnemosyne/tests/test_timeline_dedup.py
Normal file
187
nexus/mnemosyne/tests/test_timeline_dedup.py
Normal file
@@ -0,0 +1,187 @@
|
||||
"""Tests for Mnemosyne Phase 1b: timeline, dedup, merge."""
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from nexus.mnemosyne.archive import MnemosyneArchive
|
||||
from nexus.mnemosyne.entry import ArchiveEntry
|
||||
from nexus.mnemosyne.ingest import ingest_event
|
||||
|
||||
|
||||
def _make_archive(tmp_path: Path) -> MnemosyneArchive:
    """Build a throwaway archive backed by a JSON file under *tmp_path*."""
    backing_file = tmp_path / "test.json"
    return MnemosyneArchive(archive_path=backing_file)
|
||||
|
||||
|
||||
# ── Timeline ──────────────────────────────────────────────
|
||||
|
||||
def test_timeline_returns_newest_first():
    """Timeline orders entries by creation date, descending."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        oldest = ingest_event(archive, title="First", content="oldest entry")
        ingest_event(archive, title="Second", content="middle entry")
        newest = ingest_event(archive, title="Third", content="newest entry")
        ordered = archive.timeline(limit=10)
        assert len(ordered) == 3
        assert ordered[0].id == newest.id  # newest first
        assert ordered[-1].id == oldest.id  # oldest last
|
||||
|
||||
|
||||
def test_timeline_respects_limit():
    """Only ``limit`` entries come back even when more exist."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        for idx in range(5):
            ingest_event(archive, title=f"Entry {idx}", content=f"content {idx}")
        assert len(archive.timeline(limit=2)) == 2
|
||||
|
||||
|
||||
def test_timeline_filters_by_source():
    """The source filter keeps only entries from the named source."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        ingest_event(archive, title="From event", content="event content", source="event")
        ingest_event(archive, title="From manual", content="manual content", source="manual")
        ingest_event(archive, title="Also event", content="another event", source="event")
        only_events = archive.timeline(limit=10, source="event")
        assert len(only_events) == 2
        assert all(e.source == "event" for e in only_events)
|
||||
|
||||
|
||||
def test_timeline_filters_by_since():
    """A future ``since`` excludes everything; a past one includes all."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        ingest_event(archive, title="Recent", content="recent")
        # since far in the future: nothing qualifies.
        assert len(archive.timeline(limit=10, since="2099-01-01")) == 0
        # since far in the past: everything qualifies.
        assert len(archive.timeline(limit=10, since="2020-01-01")) == 1
|
||||
|
||||
|
||||
def test_recent_shorthand():
    """``recent(n)`` is a limit-only shorthand over the timeline."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        for idx in range(10):
            ingest_event(archive, title=f"E{idx}", content=f"content {idx}")
        assert len(archive.recent(n=3)) == 3
|
||||
|
||||
|
||||
# ── Duplicate Detection ───────────────────────────────────
|
||||
|
||||
def test_find_duplicates_finds_near_duplicates():
    """Two near-identical entries surface as the top duplicate pair."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        ingest_event(archive, title="Python automation", content="Building automation tools in Python for developers")
        ingest_event(archive, title="Python automation guide", content="Building automation tools in Python for developers and teams")
        ingest_event(archive, title="Cooking pasta", content="How to make carbonara")
        pairs = archive.find_duplicates(threshold=0.5)
        assert len(pairs) >= 1
        # The python entries should be in the duplicate pair
        top_a, top_b, _score = pairs[0]
        top_titles = {top_a.title, top_b.title}
        assert "Python automation" in top_titles
        assert "Python automation guide" in top_titles
|
||||
|
||||
|
||||
def test_find_duplicates_no_false_positives():
    """Unrelated entries must not be reported as duplicates."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        ingest_event(archive, title="Quantum physics", content="Entanglement superposition wave function")
        ingest_event(archive, title="Baking bread", content="Flour water yeast knead oven")
        assert len(archive.find_duplicates(threshold=0.7)) == 0
|
||||
|
||||
|
||||
def test_find_duplicates_empty_archive():
    """An empty archive yields an empty duplicate list."""
    with tempfile.TemporaryDirectory() as tmp:
        empty = _make_archive(Path(tmp))
        assert empty.find_duplicates() == []
|
||||
|
||||
|
||||
# ── Merge ─────────────────────────────────────────────────
|
||||
|
||||
def test_merge_unions_topics():
    """Merging unions the two topic lists and removes one entry."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        keep = ingest_event(archive, title="Alpha", content="content A", topics=["python", "ai"])
        absorb = ingest_event(archive, title="Beta", content="content B", topics=["ai", "ml"])
        merged = archive.merge_entries(keep.id, absorb.id)
        assert merged is not None
        assert set(merged.topics) == {"python", "ai", "ml"}
        assert archive.count == 1
|
||||
|
||||
|
||||
def test_merge_appends_different_content():
    """Distinct content bodies both survive the merge."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        first = ingest_event(archive, title="Alpha", content="First part")
        second = ingest_event(archive, title="Beta", content="Second part")
        merged = archive.merge_entries(first.id, second.id)
        assert "First part" in merged.content
        assert "Second part" in merged.content
|
||||
|
||||
|
||||
def test_merge_does_not_duplicate_content():
    """Identical content is not concatenated onto itself."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        left = ingest_event(archive, title="Alpha", content="Same content")
        right = ingest_event(archive, title="Beta", content="Same content")
        merged = archive.merge_entries(left.id, right.id)
        assert merged.content.count("Same content") == 1
|
||||
|
||||
|
||||
def test_merge_redirects_links():
    """After a merge, no entry may still link to the removed ID."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        survivor = ingest_event(archive, title="Alpha", content="Python automation scripts")
        absorbed = ingest_event(archive, title="Beta", content="Python coding tools")
        bystander = ingest_event(archive, title="Gamma", content="Python scripting helpers")
        # Gamma might link to Beta; after merge, should link to Alpha
        archive.merge_entries(survivor.id, absorbed.id)
        bystander_fresh = archive.get(bystander.id)
        assert absorbed.id not in bystander_fresh.links
        # Whether Gamma now links to Alpha depends on linker similarity,
        # so only the absence of dangling references is asserted.
|
||||
|
||||
|
||||
def test_merge_removes_duplicate():
    """The absorbed entry vanishes; the primary stays retrievable."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        keep = ingest_event(archive, title="Alpha", content="first")
        drop = ingest_event(archive, title="Beta", content="second")
        archive.merge_entries(keep.id, drop.id)
        assert archive.get(drop.id) is None
        assert archive.get(keep.id) is not None
|
||||
|
||||
|
||||
def test_merge_same_id_is_noop():
    """Merging an entry with itself changes nothing."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        only = ingest_event(archive, title="Alpha", content="content")
        result = archive.merge_entries(only.id, only.id)
        assert result.id == only.id
        assert archive.count == 1
|
||||
|
||||
|
||||
def test_merge_nonexistent_returns_none():
    """Either side missing -> merge refuses and returns None."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        real = ingest_event(archive, title="Alpha", content="content")
        assert archive.merge_entries(real.id, "no-such-id") is None
        assert archive.merge_entries("no-such-id", real.id) is None
|
||||
|
||||
|
||||
def test_merge_persists():
    """A merge survives a round-trip through the on-disk archive file."""
    with tempfile.TemporaryDirectory() as tmp:
        store = Path(tmp) / "persist.json"
        writer = MnemosyneArchive(archive_path=store)
        alpha = ingest_event(writer, title="Alpha", content="first", topics=["x"])
        beta = ingest_event(writer, title="Beta", content="second", topics=["y"])
        writer.merge_entries(alpha.id, beta.id)

        reader = MnemosyneArchive(archive_path=store)
        assert reader.count == 1
        survivor = next(iter(reader._entries.values()))
        assert set(survivor.topics) == {"x", "y"}
|
||||
Reference in New Issue
Block a user