diff --git a/nexus/mnemosyne/__init__.py b/nexus/mnemosyne/__init__.py
index 8a6b1760..47909774 100644
--- a/nexus/mnemosyne/__init__.py
+++ b/nexus/mnemosyne/__init__.py
@@ -3,6 +3,8 @@
 Phase 1: Foundation — core archive, entry model, holographic linker,
 ingestion pipeline, and CLI.
 
+Phase 1b: Timeline navigation, duplicate detection, entry merging.
+
 Builds on MemPalace vector memory to create interconnected meaning:
 entries auto-reference related entries via semantic similarity, forming
 a living archive that surfaces relevant context autonomously.
diff --git a/nexus/mnemosyne/archive.py b/nexus/mnemosyne/archive.py
index 8a8a2eb8..b461fb24 100644
--- a/nexus/mnemosyne/archive.py
+++ b/nexus/mnemosyne/archive.py
@@ -146,6 +146,141 @@ class MnemosyneArchive:
         topic_lower = topic.lower()
         return [e for e in self._entries.values() if topic_lower in [t.lower() for t in e.topics]]
 
+    def timeline(
+        self,
+        limit: int = 20,
+        since: Optional[str] = None,
+        until: Optional[str] = None,
+        source: Optional[str] = None,
+    ) -> list[ArchiveEntry]:
+        """Get entries sorted by creation date, newest first.
+
+        Args:
+            limit: Maximum entries to return.
+            since: ISO date or datetime string — only entries created at or after this.
+            until: ISO date or datetime string — only entries created at or before this.
+            source: Filter by source type (e.g. "mempalace", "event", "manual").
+
+        Returns:
+            List of ArchiveEntry sorted by created_at descending.
+        """
+        entries = list(self._entries.values())
+
+        if source:
+            entries = [e for e in entries if e.source == source]
+
+        # created_at is an ISO-8601 string, so plain string comparison is
+        # chronological as long as all timestamps share one format.
+        if since:
+            since_dt = since if "T" in since else f"{since}T00:00:00"
+            entries = [e for e in entries if e.created_at >= since_dt]
+
+        if until:
+            until_dt = until if "T" in until else f"{until}T23:59:59"
+            entries = [e for e in entries if e.created_at <= until_dt]
+
+        entries.sort(key=lambda e: e.created_at, reverse=True)
+        return entries[:limit]
+
+    def recent(self, n: int = 10) -> list[ArchiveEntry]:
+        """Get the N most recent entries."""
+        return self.timeline(limit=n)
+
+    def find_duplicates(self, threshold: float = 0.7) -> list[tuple[ArchiveEntry, ArchiveEntry, float]]:
+        """Find pairs of entries with high content similarity.
+
+        Uses the holographic linker's Jaccard similarity on title+content.
+
+        Args:
+            threshold: Minimum similarity score to consider a pair duplicates.
+                0.7 is recommended for catching near-duplicates.
+
+        Returns:
+            List of (entry_a, entry_b, similarity_score) tuples, sorted by
+            score descending.
+        """
+        entries = list(self._entries.values())
+        pairs: list[tuple[ArchiveEntry, ArchiveEntry, float]] = []
+
+        # The triangular loop visits each unordered pair exactly once, so
+        # no extra bookkeeping of already-seen pairs is needed.
+        for i, a in enumerate(entries):
+            for b in entries[i + 1:]:
+                score = self.linker.compute_similarity(a, b)
+                if score >= threshold:
+                    pairs.append((a, b, score))
+
+        pairs.sort(key=lambda x: x[2], reverse=True)
+        return pairs
+
+    def merge_entries(
+        self,
+        primary_id: str,
+        duplicate_id: str,
+    ) -> Optional[ArchiveEntry]:
+        """Merge two entries into one; the caller decides which survives.
+
+        The primary entry keeps its ID. The duplicate's unique attributes
+        are merged in:
+        - Topics are unioned
+        - Content is concatenated if they differ (primary first)
+        - Links are unioned (excluding both entry IDs)
+        - Metadata keys from duplicate are added if missing in primary
+        - All other entries linking to the duplicate are redirected to primary
+
+        Args:
+            primary_id: ID of the entry to keep.
+            duplicate_id: ID of the entry to merge and remove.
+
+        Returns:
+            The updated primary entry, or None if either ID was not found.
+        """
+        primary = self._entries.get(primary_id)
+        duplicate = self._entries.get(duplicate_id)
+        if not primary or not duplicate:
+            return None
+        if primary_id == duplicate_id:
+            return primary
+
+        # Union topics (case-insensitive)
+        existing_topics = {t.lower() for t in primary.topics}
+        for topic in duplicate.topics:
+            if topic.lower() not in existing_topics:
+                primary.topics.append(topic)
+
+        # Append content if different
+        if duplicate.content and duplicate.content not in primary.content:
+            primary.content = f"{primary.content}\n\n---\n\n{duplicate.content}"
+
+        # Drop the primary's own link to the duplicate so no dangling
+        # reference survives the merge, then union the remaining links
+        if duplicate_id in primary.links:
+            primary.links.remove(duplicate_id)
+        existing_links = set(primary.links)
+        for link_id in duplicate.links:
+            if link_id not in (primary_id, duplicate_id) and link_id not in existing_links:
+                primary.links.append(link_id)
+
+        # Merge metadata (duplicate fills gaps)
+        for key, value in duplicate.metadata.items():
+            if key not in primary.metadata:
+                primary.metadata[key] = value
+
+        # Redirect all entries linking to duplicate -> primary
+        for entry in self._entries.values():
+            if entry.id == primary_id:
+                continue
+            if duplicate_id in entry.links:
+                entry.links.remove(duplicate_id)
+                if primary_id not in entry.links:
+                    entry.links.append(primary_id)
+
+        # Remove duplicate
+        del self._entries[duplicate_id]
+
+        self._save()
+        return primary
+
     def remove(self, entry_id: str) -> bool:
         """Remove an entry and clean up all bidirectional links.
+ """ + primary = self._entries.get(primary_id) + duplicate = self._entries.get(duplicate_id) + if not primary or not duplicate: + return None + if primary_id == duplicate_id: + return primary + + # Union topics + existing_topics = {t.lower() for t in primary.topics} + for topic in duplicate.topics: + if topic.lower() not in existing_topics: + primary.topics.append(topic) + + # Append content if different + if duplicate.content and duplicate.content not in primary.content: + primary.content = f"{primary.content}\n\n---\n\n{duplicate.content}" + + # Union links (skip self-references) + existing_links = set(primary.links) + for link_id in duplicate.links: + if link_id != primary_id and link_id != duplicate_id and link_id not in existing_links: + primary.links.append(link_id) + + # Merge metadata (duplicate fills gaps) + for key, value in duplicate.metadata.items(): + if key not in primary.metadata: + primary.metadata[key] = value + + # Redirect all entries linking to duplicate -> primary + for entry in self._entries.values(): + if entry.id == primary_id: + continue + if duplicate_id in entry.links: + entry.links.remove(duplicate_id) + if primary_id not in entry.links and entry.id != primary_id: + entry.links.append(primary_id) + + # Remove duplicate + del self._entries[duplicate_id] + + self._save() + return primary + def remove(self, entry_id: str) -> bool: """Remove an entry and clean up all bidirectional links. diff --git a/nexus/mnemosyne/cli.py b/nexus/mnemosyne/cli.py index 6691b244..9a53f203 100644 --- a/nexus/mnemosyne/cli.py +++ b/nexus/mnemosyne/cli.py @@ -1,7 +1,8 @@ """CLI interface for Mnemosyne. Provides: mnemosyne ingest, mnemosyne search, mnemosyne link, mnemosyne stats, - mnemosyne topics, mnemosyne remove, mnemosyne export + mnemosyne topics, mnemosyne remove, mnemosyne export, + mnemosyne timeline, mnemosyne dedup, mnemosyne merge """ from __future__ import annotations @@ -90,6 +91,80 @@ def cmd_export(args): print(json.dumps(data, indent=2)) +def cmd_timeline(args): + archive = MnemosyneArchive() + entries = archive.timeline( + limit=args.limit, + since=args.since or None, + until=args.until or None, + source=args.source or None, + ) + if not entries: + print("No entries found.") + return + for entry in entries: + linked = len(entry.links) + print(f"[{entry.id[:8]}] {entry.title}") + print(f" Created: {entry.created_at} | Source: {entry.source} | Topics: {', '.join(entry.topics)} | Links: {linked}") + print(f" {entry.content[:100]}...") + print() + + +def cmd_dedup(args): + archive = MnemosyneArchive() + pairs = archive.find_duplicates(threshold=args.threshold) + if not pairs: + print("No duplicates found.") + return + + print(f"Found {len(pairs)} duplicate pair(s) (threshold={args.threshold}):\n") + for a, b, score in pairs: + print(f" Score {score:.3f}:") + print(f" [{a.id[:8]}] {a.title}") + print(f" [{b.id[:8]}] {b.title}") + print() + + if not args.dry_run: + merged_count = 0 + for a, b, _ in pairs: + # Keep the entry with more content as primary + primary, duplicate = (a, b) if len(a.content) >= len(b.content) else (b, a) + if archive.get(duplicate.id) is None: + continue # Already merged via transitive merge + result = archive.merge_entries(primary.id, duplicate.id) + if result: + merged_count += 1 + print(f" Merged [{duplicate.id[:8]}] into [{result.id[:8]}] {result.title}") + print(f"\nMerged {merged_count} pair(s).") + + +def cmd_merge(args): + archive = MnemosyneArchive() + entry1 = archive.get(args.id1) + entry2 = archive.get(args.id2) + if not entry1: + 
print(f"Entry not found: {args.id1}") + sys.exit(1) + if not entry2: + print(f"Entry not found: {args.id2}") + sys.exit(1) + + # Primary is the one with more content unless --into is specified + if args.into: + primary_id, dup_id = args.into, (args.id2 if args.into == args.id1 else args.id1) + else: + primary_id, dup_id = (entry1.id, entry2.id) if len(entry1.content) >= len(entry2.content) else (entry2.id, entry1.id) + + result = archive.merge_entries(primary_id, dup_id) + if result: + print(f"Merged into [{result.id[:8]}] {result.title}") + print(f" Topics: {', '.join(result.topics)}") + print(f" Links: {len(result.links)}") + else: + print("Merge failed.") + sys.exit(1) + + def main(): parser = argparse.ArgumentParser(prog="mnemosyne", description="The Living Holographic Archive") sub = parser.add_subparsers(dest="command") @@ -119,6 +194,21 @@ def main(): ex.add_argument("-q", "--query", default="", help="Keyword filter") ex.add_argument("-t", "--topics", default="", help="Comma-separated topic filter") + tl = sub.add_parser("timeline", help="Show entries by creation date (newest first)") + tl.add_argument("-n", "--limit", type=int, default=20, help="Max entries to show") + tl.add_argument("--since", default="", help="ISO date filter: entries after this date") + tl.add_argument("--until", default="", help="ISO date filter: entries before this date") + tl.add_argument("--source", default="", help="Filter by source type (mempalace, event, manual)") + + dd = sub.add_parser("dedup", help="Find and merge duplicate entries") + dd.add_argument("--threshold", type=float, default=0.7, help="Similarity threshold (0.0-1.0)") + dd.add_argument("--dry-run", action="store_true", help="Only list duplicates, don't merge") + + mg = sub.add_parser("merge", help="Merge two entries into one") + mg.add_argument("id1", help="First entry ID") + mg.add_argument("id2", help="Second entry ID") + mg.add_argument("--into", default="", help="Force this ID as the primary (surviving) entry") + args = parser.parse_args() if not args.command: parser.print_help() @@ -132,6 +222,9 @@ def main(): "topics": cmd_topics, "remove": cmd_remove, "export": cmd_export, + "timeline": cmd_timeline, + "dedup": cmd_dedup, + "merge": cmd_merge, } dispatch[args.command](args) diff --git a/nexus/mnemosyne/tests/__pycache__/__init__.cpython-311.pyc b/nexus/mnemosyne/tests/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 00000000..fbc81c66 Binary files /dev/null and b/nexus/mnemosyne/tests/__pycache__/__init__.cpython-311.pyc differ diff --git a/nexus/mnemosyne/tests/__pycache__/test_archive.cpython-311-pytest-9.0.2.pyc b/nexus/mnemosyne/tests/__pycache__/test_archive.cpython-311-pytest-9.0.2.pyc new file mode 100644 index 00000000..33a67ea4 Binary files /dev/null and b/nexus/mnemosyne/tests/__pycache__/test_archive.cpython-311-pytest-9.0.2.pyc differ diff --git a/nexus/mnemosyne/tests/__pycache__/test_timeline_dedup.cpython-311-pytest-9.0.2.pyc b/nexus/mnemosyne/tests/__pycache__/test_timeline_dedup.cpython-311-pytest-9.0.2.pyc new file mode 100644 index 00000000..369cf4d4 Binary files /dev/null and b/nexus/mnemosyne/tests/__pycache__/test_timeline_dedup.cpython-311-pytest-9.0.2.pyc differ diff --git a/nexus/mnemosyne/tests/test_timeline_dedup.py b/nexus/mnemosyne/tests/test_timeline_dedup.py new file mode 100644 index 00000000..e2594833 --- /dev/null +++ b/nexus/mnemosyne/tests/test_timeline_dedup.py @@ -0,0 +1,187 @@ +"""Tests for Mnemosyne Phase 1b: timeline, dedup, merge.""" + +import tempfile +from pathlib import Path 
diff --git a/nexus/mnemosyne/tests/test_timeline_dedup.py b/nexus/mnemosyne/tests/test_timeline_dedup.py
new file mode 100644
index 00000000..e2594833
--- /dev/null
+++ b/nexus/mnemosyne/tests/test_timeline_dedup.py
@@ -0,0 +1,190 @@
+"""Tests for Mnemosyne Phase 1b: timeline, dedup, merge."""
+
+import tempfile
+from pathlib import Path
+
+from nexus.mnemosyne.archive import MnemosyneArchive
+from nexus.mnemosyne.ingest import ingest_event
+
+
+def _make_archive(tmp_path: Path) -> MnemosyneArchive:
+    return MnemosyneArchive(archive_path=tmp_path / "test.json")
+
+
+# ── Timeline ────────────────────────────────────────────
+
+def test_timeline_returns_newest_first():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        e1 = ingest_event(archive, title="First", content="oldest entry")
+        e2 = ingest_event(archive, title="Second", content="middle entry")
+        e3 = ingest_event(archive, title="Third", content="newest entry")
+        timeline = archive.timeline(limit=10)
+        assert len(timeline) == 3
+        # Relies on created_at having sub-second resolution so that
+        # back-to-back ingests don't produce identical timestamps.
+        assert timeline[0].id == e3.id  # newest first
+        assert timeline[1].id == e2.id
+        assert timeline[-1].id == e1.id  # oldest last
+
+
+def test_timeline_respects_limit():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        for i in range(5):
+            ingest_event(archive, title=f"Entry {i}", content=f"content {i}")
+        timeline = archive.timeline(limit=2)
+        assert len(timeline) == 2
+
+
+def test_timeline_filters_by_source():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        ingest_event(archive, title="From event", content="event content", source="event")
+        ingest_event(archive, title="From manual", content="manual content", source="manual")
+        ingest_event(archive, title="Also event", content="another event", source="event")
+        timeline = archive.timeline(limit=10, source="event")
+        assert len(timeline) == 2
+        assert all(e.source == "event" for e in timeline)
+
+
+def test_timeline_filters_by_since():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        e1 = ingest_event(archive, title="Recent", content="recent")
+        # The since filter compares ISO strings, so a date far in the future
+        # excludes everything...
+        timeline = archive.timeline(limit=10, since="2099-01-01")
+        assert len(timeline) == 0
+        # ...and a date far in the past returns all entries.
+        timeline_all = archive.timeline(limit=10, since="2020-01-01")
+        assert len(timeline_all) == 1
+        assert timeline_all[0].id == e1.id
+
+
+def test_recent_shorthand():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        for i in range(10):
+            ingest_event(archive, title=f"E{i}", content=f"content {i}")
+        recent = archive.recent(n=3)
+        assert len(recent) == 3
+
+
+# ── Duplicate Detection ─────────────────────────────────
+
+def test_find_duplicates_finds_near_duplicates():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        ingest_event(archive, title="Python automation", content="Building automation tools in Python for developers")
+        ingest_event(archive, title="Python automation guide", content="Building automation tools in Python for developers and teams")
+        ingest_event(archive, title="Cooking pasta", content="How to make carbonara")
+        pairs = archive.find_duplicates(threshold=0.5)
+        assert len(pairs) >= 1
+        # The two Python entries should form the top-scoring pair
+        pair_titles = {pairs[0][0].title, pairs[0][1].title}
+        assert "Python automation" in pair_titles
+        assert "Python automation guide" in pair_titles
+
+
+def test_find_duplicates_no_false_positives():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        ingest_event(archive, title="Quantum physics", content="Entanglement superposition wave function")
+        ingest_event(archive, title="Baking bread", content="Flour water yeast knead oven")
+        pairs = archive.find_duplicates(threshold=0.7)
+        assert len(pairs) == 0
+
+
+def test_find_duplicates_empty_archive():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        pairs = archive.find_duplicates()
+        assert pairs == []
+
+
+# ── Merge ───────────────────────────────────────────────
+
+def test_merge_unions_topics():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        e1 = ingest_event(archive, title="Alpha", content="content A", topics=["python", "ai"])
+        e2 = ingest_event(archive, title="Beta", content="content B", topics=["ai", "ml"])
+        merged = archive.merge_entries(e1.id, e2.id)
+        assert merged is not None
+        assert set(merged.topics) == {"python", "ai", "ml"}
+        assert archive.count == 1
+
+
+def test_merge_appends_different_content():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        e1 = ingest_event(archive, title="Alpha", content="First part")
+        e2 = ingest_event(archive, title="Beta", content="Second part")
+        merged = archive.merge_entries(e1.id, e2.id)
+        assert merged is not None
+        assert "First part" in merged.content
+        assert "Second part" in merged.content
+
+
+def test_merge_does_not_duplicate_content():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        e1 = ingest_event(archive, title="Alpha", content="Same content")
+        e2 = ingest_event(archive, title="Beta", content="Same content")
+        merged = archive.merge_entries(e1.id, e2.id)
+        assert merged.content.count("Same content") == 1
+
+
+def test_merge_redirects_links():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        e1 = ingest_event(archive, title="Alpha", content="Python automation scripts")
+        e2 = ingest_event(archive, title="Beta", content="Python coding tools")
+        e3 = ingest_event(archive, title="Gamma", content="Python scripting helpers")
+        archive.merge_entries(e1.id, e2.id)
+        e3_fresh = archive.get(e3.id)
+        # Whether e3 auto-linked to e2 depends on similarity scores; at a
+        # minimum the merge must leave no dangling reference to the removed entry.
+        assert e2.id not in e3_fresh.links
+
+
+def test_merge_removes_duplicate():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        e1 = ingest_event(archive, title="Alpha", content="first")
+        e2 = ingest_event(archive, title="Beta", content="second")
+        archive.merge_entries(e1.id, e2.id)
+        assert archive.get(e2.id) is None
+        assert archive.get(e1.id) is not None
+
+
+def test_merge_same_id_is_noop():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        e1 = ingest_event(archive, title="Alpha", content="content")
+        result = archive.merge_entries(e1.id, e1.id)
+        assert result.id == e1.id
+        assert archive.count == 1
+
+
+def test_merge_nonexistent_returns_none():
+    with tempfile.TemporaryDirectory() as tmp:
+        archive = _make_archive(Path(tmp))
+        e1 = ingest_event(archive, title="Alpha", content="content")
+        assert archive.merge_entries(e1.id, "no-such-id") is None
+        assert archive.merge_entries("no-such-id", e1.id) is None
+
+
+def test_merge_persists():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "persist.json"
+        a1 = MnemosyneArchive(archive_path=path)
+        e1 = ingest_event(a1, title="Alpha", content="first", topics=["x"])
+        e2 = ingest_event(a1, title="Beta", content="second", topics=["y"])
+        a1.merge_entries(e1.id, e2.id)
+
+        a2 = MnemosyneArchive(archive_path=path)
+        assert a2.count == 1
+        entry = a2.recent(n=1)[0]
+        assert set(entry.topics) == {"x", "y"}
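Reviewer note: one regression worth covering that this file does not: when the primary already links to the duplicate, the merge should drop that link rather than leave it dangling. A sketch reusing the helpers above; it relies on the same auto-linking behavior the redirect test leans on, and the assertion holds even if no link formed.

    def test_merge_drops_primary_link_to_duplicate():
        with tempfile.TemporaryDirectory() as tmp:
            archive = _make_archive(Path(tmp))
            e1 = ingest_event(archive, title="Alpha", content="Python automation scripts")
            e2 = ingest_event(archive, title="Beta", content="Python automation tooling")
            merged = archive.merge_entries(e1.id, e2.id)
            assert merged is not None
            # The surviving entry must not reference the removed one
            assert e2.id not in merged.links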