Files
the-nexus/nexus/mnemosyne/tests/test_timeline_dedup.py
Timmy 7bf69571ae feat(mnemosyne): Phase 1b — timeline navigation, deduplication, entry merging
Closes #1230

Adds three capabilities to MnemosyneArchive:

- timeline(limit, since, until, source): entries sorted by creation date
  with optional date range and source filtering
- recent(n): shorthand for last N entries
- find_duplicates(threshold): Jaccard similarity-based duplicate detection
  returning scored pairs above threshold
- merge_entries(primary, duplicate): union topics, append unique content,
  redirect links, remove duplicate

CLI commands:
- mnemosyne timeline [--since DATE] [--until DATE] [--source TYPE] [-n N]
- mnemosyne dedup [--threshold 0.7] [--dry-run]
- mnemosyne merge <id1> <id2> [--into ID]

17 new tests in test_timeline_dedup.py, all passing (38 total).
2026-04-11 17:54:37 -04:00

188 lines
8.0 KiB
Python

"""Tests for Mnemosyne Phase 1b: timeline, dedup, merge."""
import tempfile
from pathlib import Path
from nexus.mnemosyne.archive import MnemosyneArchive
from nexus.mnemosyne.entry import ArchiveEntry
from nexus.mnemosyne.ingest import ingest_event
def _make_archive(tmp_path: Path) -> MnemosyneArchive:
    """Create a fresh archive backed by a JSON file inside *tmp_path*."""
    storage = tmp_path / "test.json"
    return MnemosyneArchive(archive_path=storage)
# ── Timeline ──────────────────────────────────────────────
def test_timeline_returns_newest_first():
    """Timeline orders entries reverse-chronologically (newest first)."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        oldest = ingest_event(archive, title="First", content="oldest entry")
        ingest_event(archive, title="Second", content="middle entry")
        newest = ingest_event(archive, title="Third", content="newest entry")

        entries = archive.timeline(limit=10)

        assert len(entries) == 3
        assert entries[0].id == newest.id   # most recent leads
        assert entries[-1].id == oldest.id  # earliest trails
def test_timeline_respects_limit():
    """A limit smaller than the archive size truncates the result."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        for i in range(5):
            ingest_event(archive, title=f"Entry {i}", content=f"content {i}")
        assert len(archive.timeline(limit=2)) == 2
def test_timeline_filters_by_source():
    """The source= filter keeps only entries ingested with that tag."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        ingest_event(archive, title="From event", content="event content", source="event")
        ingest_event(archive, title="From manual", content="manual content", source="manual")
        ingest_event(archive, title="Also event", content="another event", source="event")

        filtered = archive.timeline(limit=10, source="event")

        assert len(filtered) == 2
        for entry in filtered:
            assert entry.source == "event"
def test_timeline_filters_by_since():
    """The since= filter excludes entries created before the given ISO date.

    Fix: the original bound the ingested entry to an unused local ``e1``.
    """
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        # Entries get "now" timestamps; the since filter compares ISO strings,
        # so fixed far-future / far-past dates make the outcome deterministic.
        ingest_event(archive, title="Recent", content="recent")
        # A far-future cutoff excludes everything...
        assert len(archive.timeline(limit=10, since="2099-01-01")) == 0
        # ...and a far-past cutoff includes everything.
        assert len(archive.timeline(limit=10, since="2020-01-01")) == 1
def test_recent_shorthand():
    """recent(n) returns exactly the n most recent entries."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        for i in range(10):
            ingest_event(archive, title=f"E{i}", content=f"content {i}")
        assert len(archive.recent(n=3)) == 3
# ── Duplicate Detection ───────────────────────────────────
def test_find_duplicates_finds_near_duplicates():
    """Two near-identical entries surface as the top duplicate pair."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        ingest_event(archive, title="Python automation", content="Building automation tools in Python for developers")
        ingest_event(archive, title="Python automation guide", content="Building automation tools in Python for developers and teams")
        ingest_event(archive, title="Cooking pasta", content="How to make carbonara")

        pairs = archive.find_duplicates(threshold=0.5)

        assert len(pairs) >= 1
        # Both Python entries should appear in the first reported pair.
        titles = {pairs[0][0].title, pairs[0][1].title}
        assert {"Python automation", "Python automation guide"} <= titles
def test_find_duplicates_no_false_positives():
    """Unrelated entries stay below a high similarity threshold."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        ingest_event(archive, title="Quantum physics", content="Entanglement superposition wave function")
        ingest_event(archive, title="Baking bread", content="Flour water yeast knead oven")
        assert len(archive.find_duplicates(threshold=0.7)) == 0
def test_find_duplicates_empty_archive():
    """An archive with no entries yields no duplicate pairs."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        assert archive.find_duplicates() == []
# ── Merge ─────────────────────────────────────────────────
def test_merge_unions_topics():
    """Merging combines the topic lists of both entries into the survivor."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        primary = ingest_event(archive, title="Alpha", content="content A", topics=["python", "ai"])
        duplicate = ingest_event(archive, title="Beta", content="content B", topics=["ai", "ml"])

        merged = archive.merge_entries(primary.id, duplicate.id)

        assert merged is not None
        assert set(merged.topics) == {"python", "ai", "ml"}
        assert archive.count == 1
def test_merge_appends_different_content():
    """Distinct content from the duplicate is appended to the survivor."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        keep = ingest_event(archive, title="Alpha", content="First part")
        drop = ingest_event(archive, title="Beta", content="Second part")
        merged = archive.merge_entries(keep.id, drop.id)
        for fragment in ("First part", "Second part"):
            assert fragment in merged.content
def test_merge_does_not_duplicate_content():
    """Identical content is not appended a second time during merge."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        keep = ingest_event(archive, title="Alpha", content="Same content")
        drop = ingest_event(archive, title="Beta", content="Same content")
        merged = archive.merge_entries(keep.id, drop.id)
        assert merged.content.count("Same content") == 1
def test_merge_redirects_links():
    """After a merge, no surviving entry links to the removed duplicate.

    Fix: the original bound the merge result to an unused local ``merged``.
    """
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        e1 = ingest_event(archive, title="Alpha", content="Python automation scripts")
        e2 = ingest_event(archive, title="Beta", content="Python coding tools")
        e3 = ingest_event(archive, title="Gamma", content="Python scripting helpers")
        # e3 may have auto-linked to e2 at ingest time; merging e2 into e1
        # must leave no dangling reference to e2 behind.
        archive.merge_entries(e1.id, e2.id)
        e3_fresh = archive.get(e3.id)
        assert e2.id not in e3_fresh.links
        # Whether e3 now links to e1 depends on similarity scoring, so we
        # only require the absence of dangling refs.
def test_merge_removes_duplicate():
    """The duplicate entry disappears while the primary survives."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        survivor = ingest_event(archive, title="Alpha", content="first")
        victim = ingest_event(archive, title="Beta", content="second")
        archive.merge_entries(survivor.id, victim.id)
        assert archive.get(victim.id) is None
        assert archive.get(survivor.id) is not None
def test_merge_same_id_is_noop():
    """Merging an entry with itself leaves the archive untouched."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        entry = ingest_event(archive, title="Alpha", content="content")
        outcome = archive.merge_entries(entry.id, entry.id)
        assert outcome.id == entry.id
        assert archive.count == 1
def test_merge_nonexistent_returns_none():
    """An unknown id on either side makes merge_entries return None."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        entry = ingest_event(archive, title="Alpha", content="content")
        for primary, duplicate in ((entry.id, "no-such-id"), ("no-such-id", entry.id)):
            assert archive.merge_entries(primary, duplicate) is None
def test_merge_persists():
    """A merge survives a round-trip through the on-disk JSON file.

    Fix: the original inspected the private ``_entries`` mapping on the
    reloaded archive; use the public ``recent()`` API instead so the test
    does not depend on internal storage details.
    """
    with tempfile.TemporaryDirectory() as tmp:
        path = Path(tmp) / "persist.json"
        a1 = MnemosyneArchive(archive_path=path)
        e1 = ingest_event(a1, title="Alpha", content="first", topics=["x"])
        e2 = ingest_event(a1, title="Beta", content="second", topics=["y"])
        a1.merge_entries(e1.id, e2.id)
        # Reload from disk and verify through the public API.
        a2 = MnemosyneArchive(archive_path=path)
        assert a2.count == 1
        entry = a2.recent(n=1)[0]
        assert set(entry.topics) == {"x", "y"}