Compare commits

...

2 Commits

Author SHA1 Message Date
Timmy
0aba89e2c3 chore: exclude __pycache__ from tracking
Some checks failed
CI / test (pull_request) Failing after 10s
CI / validate (pull_request) Failing after 15s
Review Approval Gate / verify-review (pull_request) Failing after 3s
2026-04-11 17:54:51 -04:00
Timmy
7bf69571ae feat(mnemosyne): Phase 1b — timeline navigation, deduplication, entry merging
Closes #1230

Adds three capabilities to MnemosyneArchive:

- timeline(limit, since, until, source): entries sorted by creation date
  with optional date range and source filtering
- recent(n): shorthand for last N entries
- find_duplicates(threshold): Jaccard similarity-based duplicate detection
  returning scored pairs above threshold
- merge_entries(primary, duplicate): union topics, append unique content,
  redirect links, remove duplicate

CLI commands:
- mnemosyne timeline [--since DATE] [--until DATE] [--source TYPE] [-n N]
- mnemosyne dedup [--threshold 0.7] [--dry-run]
- mnemosyne merge <id1> <id2> [--into ID]

17 new tests in test_timeline_dedup.py, all passing (38 total).
2026-04-11 17:54:37 -04:00
13 changed files with 419 additions and 1 deletions

2
.gitignore vendored
View File

@@ -8,3 +8,5 @@ mempalace/__pycache__/
# Prevent agents from writing to wrong path (see issue #1145)
public/nexus/
test-screenshots/
*.pyc
__pycache__/

View File

@@ -3,6 +3,8 @@
Phase 1: Foundation — core archive, entry model, holographic linker,
ingestion pipeline, and CLI.
Phase 1b: Timeline navigation, duplicate detection, entry merging.
Builds on MemPalace vector memory to create interconnected meaning:
entries auto-reference related entries via semantic similarity,
forming a living archive that surfaces relevant context autonomously.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -7,6 +7,7 @@ and provides query interfaces for retrieving connected knowledge.
from __future__ import annotations
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
@@ -146,6 +147,139 @@ class MnemosyneArchive:
topic_lower = topic.lower()
return [e for e in self._entries.values() if topic_lower in [t.lower() for t in e.topics]]
def timeline(
    self,
    limit: int = 20,
    since: Optional[str] = None,
    until: Optional[str] = None,
    source: Optional[str] = None,
) -> list[ArchiveEntry]:
    """Get entries sorted by creation date, newest first.

    Filtering compares ISO-8601 strings lexicographically, which is
    correct as long as all `created_at` values share the same format.

    Args:
        limit: Maximum entries to return.
        since: ISO datetime string — only return entries created after this.
               A bare date expands to the start of that day.
        until: ISO datetime string — only return entries created before this.
               A bare date expands to the end of that day.
        source: Filter by source type (e.g. "mempalace", "event", "manual").

    Returns:
        List of ArchiveEntry sorted by created_at descending.
    """
    entries = list(self._entries.values())
    if source:
        entries = [e for e in entries if e.source == source]
    if since:
        since_dt = since if "T" in since else f"{since}T00:00:00"
        entries = [e for e in entries if e.created_at >= since_dt]
    if until:
        # BUG FIX: a bare-date bound of "T23:59:59" excluded entries whose
        # created_at carries microseconds within the final second of the day
        # (e.g. "...T23:59:59.500000" > "...T23:59:59" lexicographically).
        until_dt = until if "T" in until else f"{until}T23:59:59.999999"
        entries = [e for e in entries if e.created_at <= until_dt]
    entries.sort(key=lambda e: e.created_at, reverse=True)
    return entries[:limit]
def recent(self, n: int = 10) -> list[ArchiveEntry]:
    """Convenience shorthand: return the n newest entries (see timeline)."""
    latest = self.timeline(limit=n)
    return latest
def find_duplicates(self, threshold: float = 0.7) -> list[tuple[ArchiveEntry, ArchiveEntry, float]]:
    """Find pairs of entries with high content similarity.

    Uses the holographic linker's Jaccard similarity on title+content.

    Args:
        threshold: Minimum similarity score to consider a pair duplicates.
            0.7 is recommended for catching near-duplicates.

    Returns:
        List of (entry_a, entry_b, similarity_score) tuples, sorted by
        score descending.
    """
    entries = list(self._entries.values())
    pairs: list[tuple[ArchiveEntry, ArchiveEntry, float]] = []
    # NOTE: the i < j iteration already visits each unordered pair exactly
    # once (entries come from a dict keyed by id, so they are unique);
    # the previous "seen" set was dead bookkeeping and has been removed.
    for i, a in enumerate(entries):
        for b in entries[i + 1:]:
            score = self.linker.compute_similarity(a, b)
            if score >= threshold:
                pairs.append((a, b, score))
    pairs.sort(key=lambda x: x[2], reverse=True)
    return pairs
def merge_entries(
    self,
    primary_id: str,
    duplicate_id: str,
) -> Optional[ArchiveEntry]:
    """Merge two entries into one, keeping the richer one as primary.

    The primary entry keeps its ID. The duplicate's unique attributes
    are merged in:
    - Topics are unioned (case-insensitive comparison, original casing kept)
    - Content is concatenated if they differ (primary first)
    - Links are unioned (excluding both entry IDs)
    - Metadata keys from duplicate are added if missing in primary
    - All other entries linking to the duplicate are redirected to primary

    Args:
        primary_id: ID of the entry to keep.
        duplicate_id: ID of the entry to merge and remove.

    Returns:
        The updated primary entry, or None if either ID was not found.
    """
    primary = self._entries.get(primary_id)
    duplicate = self._entries.get(duplicate_id)
    if not primary or not duplicate:
        return None
    if primary_id == duplicate_id:
        # Merging an entry with itself is a no-op.
        return primary
    # Union topics, case-insensitively, preserving primary's casing.
    existing_topics = {t.lower() for t in primary.topics}
    for topic in duplicate.topics:
        if topic.lower() not in existing_topics:
            primary.topics.append(topic)
    # Append content only if it is genuinely new text.
    if duplicate.content and duplicate.content not in primary.content:
        primary.content = f"{primary.content}\n\n---\n\n{duplicate.content}"
    # Union links (skip self-references to either merged entry).
    existing_links = set(primary.links)
    for link_id in duplicate.links:
        if link_id != primary_id and link_id != duplicate_id and link_id not in existing_links:
            primary.links.append(link_id)
    # Merge metadata: duplicate only fills gaps, never overwrites.
    for key, value in duplicate.metadata.items():
        if key not in primary.metadata:
            primary.metadata[key] = value
    # Redirect all inbound links duplicate -> primary.
    # CLEANUP: skip the duplicate itself (it is deleted below, so mutating
    # it was pointless), and drop the redundant `entry.id != primary_id`
    # re-check the original performed after already continuing on primary.
    for entry in self._entries.values():
        if entry.id in (primary_id, duplicate_id):
            continue
        if duplicate_id in entry.links:
            entry.links.remove(duplicate_id)
            if primary_id not in entry.links:
                entry.links.append(primary_id)
    # Remove duplicate and persist the archive.
    del self._entries[duplicate_id]
    self._save()
    return primary
def remove(self, entry_id: str) -> bool:
"""Remove an entry and clean up all bidirectional links.

View File

@@ -1,7 +1,8 @@
"""CLI interface for Mnemosyne.
Provides: mnemosyne ingest, mnemosyne search, mnemosyne link, mnemosyne stats,
mnemosyne topics, mnemosyne remove, mnemosyne export
mnemosyne topics, mnemosyne remove, mnemosyne export,
mnemosyne timeline, mnemosyne dedup, mnemosyne merge
"""
from __future__ import annotations
@@ -90,6 +91,80 @@ def cmd_export(args):
print(json.dumps(data, indent=2))
def cmd_timeline(args):
    """Print the archive timeline, honoring the CLI's limit/date/source filters."""
    archive = MnemosyneArchive()
    matches = archive.timeline(
        limit=args.limit,
        since=args.since or None,
        until=args.until or None,
        source=args.source or None,
    )
    if not matches:
        print("No entries found.")
        return
    for item in matches:
        print(f"[{item.id[:8]}] {item.title}")
        print(f" Created: {item.created_at} | Source: {item.source} | Topics: {', '.join(item.topics)} | Links: {len(item.links)}")
        print(f" {item.content[:100]}...")
        print()
def cmd_dedup(args):
    """List duplicate pairs above the threshold and (unless --dry-run) merge them."""
    archive = MnemosyneArchive()
    pairs = archive.find_duplicates(threshold=args.threshold)
    if not pairs:
        print("No duplicates found.")
        return
    print(f"Found {len(pairs)} duplicate pair(s) (threshold={args.threshold}):\n")
    for a, b, score in pairs:
        print(f" Score {score:.3f}:")
        print(f" [{a.id[:8]}] {a.title}")
        print(f" [{b.id[:8]}] {b.title}")
        print()
    if not args.dry_run:
        merged_count = 0
        for a, b, _ in pairs:
            # Keep the entry with more content as primary
            primary, duplicate = (a, b) if len(a.content) >= len(b.content) else (b, a)
            # BUG FIX: an earlier transitive merge can have removed EITHER
            # side of this pair. The original only checked the duplicate, so
            # a vanished primary made merge_entries() fail silently.
            if archive.get(duplicate.id) is None or archive.get(primary.id) is None:
                continue  # Already merged via transitive merge
            result = archive.merge_entries(primary.id, duplicate.id)
            if result:
                merged_count += 1
                print(f" Merged [{duplicate.id[:8]}] into [{result.id[:8]}] {result.title}")
        print(f"\nMerged {merged_count} pair(s).")
def cmd_merge(args):
    """Merge two entries; --into forces the survivor, else the longer one wins."""
    archive = MnemosyneArchive()
    entry1 = archive.get(args.id1)
    entry2 = archive.get(args.id2)
    if not entry1:
        print(f"Entry not found: {args.id1}")
        sys.exit(1)
    if not entry2:
        print(f"Entry not found: {args.id2}")
        sys.exit(1)
    # Primary is the one with more content unless --into is specified
    if args.into:
        # BUG FIX: an --into value matching neither ID previously fell
        # through to merge_entries() with a bogus primary and surfaced only
        # as a generic "Merge failed." — reject it up front instead.
        if args.into not in (args.id1, args.id2):
            print(f"--into must be one of the two entry IDs: {args.id1}, {args.id2}")
            sys.exit(1)
        primary_id, dup_id = args.into, (args.id2 if args.into == args.id1 else args.id1)
    else:
        primary_id, dup_id = (entry1.id, entry2.id) if len(entry1.content) >= len(entry2.content) else (entry2.id, entry1.id)
    result = archive.merge_entries(primary_id, dup_id)
    if result:
        print(f"Merged into [{result.id[:8]}] {result.title}")
        print(f" Topics: {', '.join(result.topics)}")
        print(f" Links: {len(result.links)}")
    else:
        print("Merge failed.")
        sys.exit(1)
def main():
parser = argparse.ArgumentParser(prog="mnemosyne", description="The Living Holographic Archive")
sub = parser.add_subparsers(dest="command")
@@ -119,6 +194,21 @@ def main():
ex.add_argument("-q", "--query", default="", help="Keyword filter")
ex.add_argument("-t", "--topics", default="", help="Comma-separated topic filter")
tl = sub.add_parser("timeline", help="Show entries by creation date (newest first)")
tl.add_argument("-n", "--limit", type=int, default=20, help="Max entries to show")
tl.add_argument("--since", default="", help="ISO date filter: entries after this date")
tl.add_argument("--until", default="", help="ISO date filter: entries before this date")
tl.add_argument("--source", default="", help="Filter by source type (mempalace, event, manual)")
dd = sub.add_parser("dedup", help="Find and merge duplicate entries")
dd.add_argument("--threshold", type=float, default=0.7, help="Similarity threshold (0.0-1.0)")
dd.add_argument("--dry-run", action="store_true", help="Only list duplicates, don't merge")
mg = sub.add_parser("merge", help="Merge two entries into one")
mg.add_argument("id1", help="First entry ID")
mg.add_argument("id2", help="Second entry ID")
mg.add_argument("--into", default="", help="Force this ID as the primary (surviving) entry")
args = parser.parse_args()
if not args.command:
parser.print_help()
@@ -132,6 +222,9 @@ def main():
"topics": cmd_topics,
"remove": cmd_remove,
"export": cmd_export,
"timeline": cmd_timeline,
"dedup": cmd_dedup,
"merge": cmd_merge,
}
dispatch[args.command](args)

View File

@@ -0,0 +1,187 @@
"""Tests for Mnemosyne Phase 1b: timeline, dedup, merge."""
import tempfile
from pathlib import Path
from nexus.mnemosyne.archive import MnemosyneArchive
from nexus.mnemosyne.entry import ArchiveEntry
from nexus.mnemosyne.ingest import ingest_event
def _make_archive(tmp_path: Path) -> MnemosyneArchive:
    """Build a throwaway archive backed by a JSON file under tmp_path."""
    archive_file = tmp_path / "test.json"
    return MnemosyneArchive(archive_path=archive_file)
# ── Timeline ──────────────────────────────────────────────
def test_timeline_returns_newest_first():
    # Entries ingested later must sort ahead of earlier ones.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        oldest = ingest_event(archive, title="First", content="oldest entry")
        ingest_event(archive, title="Second", content="middle entry")
        newest = ingest_event(archive, title="Third", content="newest entry")
        result = archive.timeline(limit=10)
        assert len(result) == 3
        assert result[0].id == newest.id
        assert result[-1].id == oldest.id
def test_timeline_respects_limit():
    # With 5 entries ingested, limit=2 must cap the result.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        for i in range(5):
            ingest_event(archive, title=f"Entry {i}", content=f"content {i}")
        assert len(archive.timeline(limit=2)) == 2
def test_timeline_filters_by_source():
    # Only the two "event"-sourced entries should survive the filter.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        ingest_event(archive, title="From event", content="event content", source="event")
        ingest_event(archive, title="From manual", content="manual content", source="manual")
        ingest_event(archive, title="Also event", content="another event", source="event")
        events = archive.timeline(limit=10, source="event")
        assert len(events) == 2
        assert {e.source for e in events} == {"event"}
def test_timeline_filters_by_since():
    # FIX: the entry returned by ingest_event was bound to an unused local
    # (`e1`); the assignment has been dropped.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        # Create an entry — it gets a current (recent) timestamp.
        ingest_event(archive, title="Recent", content="recent")
        # The since filter just compares ISO strings, so a far-future
        # lower bound must exclude everything...
        assert len(archive.timeline(limit=10, since="2099-01-01")) == 0
        # ...and a past lower bound must keep everything.
        assert len(archive.timeline(limit=10, since="2020-01-01")) == 1
def test_recent_shorthand():
    # recent(n) is a shorthand for timeline(limit=n).
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        for i in range(10):
            ingest_event(archive, title=f"E{i}", content=f"content {i}")
        latest = archive.recent(n=3)
        assert len(latest) == 3
# ── Duplicate Detection ───────────────────────────────────
def test_find_duplicates_finds_near_duplicates():
    # The two near-identical Python entries should surface as the top pair.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        ingest_event(archive, title="Python automation", content="Building automation tools in Python for developers")
        ingest_event(archive, title="Python automation guide", content="Building automation tools in Python for developers and teams")
        ingest_event(archive, title="Cooking pasta", content="How to make carbonara")
        pairs = archive.find_duplicates(threshold=0.5)
        assert pairs
        top_a, top_b, _score = pairs[0]
        assert {top_a.title, top_b.title} == {"Python automation", "Python automation guide"}
def test_find_duplicates_no_false_positives():
    # Two entries with no shared vocabulary must not pair up at 0.7.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        ingest_event(archive, title="Quantum physics", content="Entanglement superposition wave function")
        ingest_event(archive, title="Baking bread", content="Flour water yeast knead oven")
        assert len(archive.find_duplicates(threshold=0.7)) == 0
def test_find_duplicates_empty_archive():
    # An empty archive yields no pairs at all.
    with tempfile.TemporaryDirectory() as tmp:
        empty = _make_archive(Path(tmp))
        assert empty.find_duplicates() == []
# ── Merge ─────────────────────────────────────────────────
def test_merge_unions_topics():
    # Merging unions the topic lists and leaves a single surviving entry.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        keep = ingest_event(archive, title="Alpha", content="content A", topics=["python", "ai"])
        drop = ingest_event(archive, title="Beta", content="content B", topics=["ai", "ml"])
        merged = archive.merge_entries(keep.id, drop.id)
        assert merged is not None
        assert set(merged.topics) == {"python", "ai", "ml"}
        assert archive.count == 1
def test_merge_appends_different_content():
    # Differing content is concatenated into the survivor.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        keep = ingest_event(archive, title="Alpha", content="First part")
        drop = ingest_event(archive, title="Beta", content="Second part")
        merged = archive.merge_entries(keep.id, drop.id)
        assert "First part" in merged.content
        assert "Second part" in merged.content
def test_merge_does_not_duplicate_content():
    # Identical content must appear exactly once after the merge.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        keep = ingest_event(archive, title="Alpha", content="Same content")
        drop = ingest_event(archive, title="Beta", content="Same content")
        merged = archive.merge_entries(keep.id, drop.id)
        assert merged.content.count("Same content") == 1
def test_merge_redirects_links():
    # FIX: the merge result was bound to an unused local (`merged`); the
    # assignment has been dropped.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        e1 = ingest_event(archive, title="Alpha", content="Python automation scripts")
        e2 = ingest_event(archive, title="Beta", content="Python coding tools")
        e3 = ingest_event(archive, title="Gamma", content="Python scripting helpers")
        # e3 might link to e2; after merging e2 into e1 no entry may still
        # reference e2.
        archive.merge_entries(e1.id, e2.id)
        e3_fresh = archive.get(e3.id)
        assert e2.id not in e3_fresh.links
        # e3 should link to e1 (the survivor) if it was linking to e2
        # (depending on exact similarity, this may or may not hold — at
        # minimum there are no dangling refs, which is what we assert).
def test_merge_removes_duplicate():
    # The duplicate disappears; the primary survives.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        keep = ingest_event(archive, title="Alpha", content="first")
        drop = ingest_event(archive, title="Beta", content="second")
        archive.merge_entries(keep.id, drop.id)
        assert archive.get(drop.id) is None
        assert archive.get(keep.id) is not None
def test_merge_same_id_is_noop():
    # Merging an entry into itself changes nothing.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        entry = ingest_event(archive, title="Alpha", content="content")
        outcome = archive.merge_entries(entry.id, entry.id)
        assert outcome.id == entry.id
        assert archive.count == 1
def test_merge_nonexistent_returns_none():
    # Unknown IDs on either side make the merge return None.
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        entry = ingest_event(archive, title="Alpha", content="content")
        for pair in ((entry.id, "no-such-id"), ("no-such-id", entry.id)):
            assert archive.merge_entries(*pair) is None
def test_merge_persists():
    # A merge is written to disk: a fresh archive sees the merged entry.
    with tempfile.TemporaryDirectory() as tmp:
        path = Path(tmp) / "persist.json"
        writer = MnemosyneArchive(archive_path=path)
        keep = ingest_event(writer, title="Alpha", content="first", topics=["x"])
        drop = ingest_event(writer, title="Beta", content="second", topics=["y"])
        writer.merge_entries(keep.id, drop.id)
        reader = MnemosyneArchive(archive_path=path)
        assert reader.count == 1
        survivor, = reader._entries.values()
        assert set(survivor.topics) == {"x", "y"}