Closes #1230 Adds three capabilities to MnemosyneArchive: - timeline(limit, since, until, source): entries sorted by creation date with optional date range and source filtering - recent(n): shorthand for last N entries - find_duplicates(threshold): Jaccard similarity-based duplicate detection returning scored pairs above threshold - merge_entries(primary, duplicate): union topics, append unique content, redirect links, remove duplicate CLI commands: - mnemosyne timeline [--since DATE] [--until DATE] [--source TYPE] [-n N] - mnemosyne dedup [--threshold 0.7] [--dry-run] - mnemosyne merge <id1> <id2> [--into ID] 17 new tests in test_timeline_dedup.py, all passing (38 total).
188 lines
8.0 KiB
Python
188 lines
8.0 KiB
Python
"""Tests for Mnemosyne Phase 1b: timeline, dedup, merge."""
|
|
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
from nexus.mnemosyne.archive import MnemosyneArchive
|
|
from nexus.mnemosyne.entry import ArchiveEntry
|
|
from nexus.mnemosyne.ingest import ingest_event
|
|
|
|
|
|
def _make_archive(tmp_path: Path) -> MnemosyneArchive:
    """Build a throwaway archive backed by a JSON file under *tmp_path*."""
    return MnemosyneArchive(archive_path=tmp_path / "test.json")
|
|
|
|
|
|
# ── Timeline ──────────────────────────────────────────────
|
|
|
|
def test_timeline_returns_newest_first():
    """timeline() must sort entries by creation date, newest first."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        oldest = ingest_event(archive, title="First", content="oldest entry")
        ingest_event(archive, title="Second", content="middle entry")
        newest = ingest_event(archive, title="Third", content="newest entry")

        entries = archive.timeline(limit=10)

        assert len(entries) == 3
        assert entries[0].id == newest.id  # most recent leads
        assert entries[-1].id == oldest.id  # earliest trails
|
|
|
|
|
|
def test_timeline_respects_limit():
    """timeline(limit=k) returns at most k entries."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        for idx in range(5):
            ingest_event(archive, title=f"Entry {idx}", content=f"content {idx}")

        assert len(archive.timeline(limit=2)) == 2
|
|
|
|
|
|
def test_timeline_filters_by_source():
    """The source= keyword restricts the timeline to matching entries only."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        ingest_event(archive, title="From event", content="event content", source="event")
        ingest_event(archive, title="From manual", content="manual content", source="manual")
        ingest_event(archive, title="Also event", content="another event", source="event")

        filtered = archive.timeline(limit=10, source="event")

        assert len(filtered) == 2
        for entry in filtered:
            assert entry.source == "event"
|
|
|
|
|
|
def test_timeline_filters_by_since():
    """since= excludes entries created before the given ISO date.

    The filter compares ISO-8601 strings, so a far-future cutoff must
    exclude everything and a far-past cutoff must include everything.
    """
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        # Entry gets a "now" timestamp at ingest time.
        ingest_event(archive, title="Recent", content="recent")

        # Future cutoff: nothing qualifies.
        assert len(archive.timeline(limit=10, since="2099-01-01")) == 0

        # Past cutoff: everything qualifies.
        assert len(archive.timeline(limit=10, since="2020-01-01")) == 1
|
|
|
|
|
|
def test_recent_shorthand():
    """recent(n) is a shorthand returning exactly the last n entries."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        for idx in range(10):
            ingest_event(archive, title=f"E{idx}", content=f"content {idx}")

        assert len(archive.recent(n=3)) == 3
|
|
|
|
|
|
# ── Duplicate Detection ───────────────────────────────────
|
|
|
|
def test_find_duplicates_finds_near_duplicates():
    """Two near-identical entries must surface as the top duplicate pair."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        ingest_event(archive, title="Python automation", content="Building automation tools in Python for developers")
        ingest_event(archive, title="Python automation guide", content="Building automation tools in Python for developers and teams")
        ingest_event(archive, title="Cooking pasta", content="How to make carbonara")

        pairs = archive.find_duplicates(threshold=0.5)
        assert len(pairs) >= 1

        # The two Python entries should form the detected duplicate pair.
        first, second = pairs[0][0], pairs[0][1]
        titles = {first.title, second.title}
        assert "Python automation" in titles
        assert "Python automation guide" in titles
|
|
|
|
|
|
def test_find_duplicates_no_false_positives():
    """Entries with no word overlap must not be flagged as duplicates."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        ingest_event(archive, title="Quantum physics", content="Entanglement superposition wave function")
        ingest_event(archive, title="Baking bread", content="Flour water yeast knead oven")

        assert len(archive.find_duplicates(threshold=0.7)) == 0
|
|
|
|
|
|
def test_find_duplicates_empty_archive():
    """An empty archive yields an empty duplicate list, not an error."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        assert archive.find_duplicates() == []
|
|
|
|
|
|
# ── Merge ─────────────────────────────────────────────────
|
|
|
|
def test_merge_unions_topics():
    """Merging two entries unions their topic lists onto the survivor."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        primary = ingest_event(archive, title="Alpha", content="content A", topics=["python", "ai"])
        duplicate = ingest_event(archive, title="Beta", content="content B", topics=["ai", "ml"])

        merged = archive.merge_entries(primary.id, duplicate.id)

        assert merged is not None
        assert set(merged.topics) == {"python", "ai", "ml"}
        assert archive.count == 1  # duplicate was removed
|
|
|
|
|
|
def test_merge_appends_different_content():
    """Distinct duplicate content is appended to the surviving entry."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        primary = ingest_event(archive, title="Alpha", content="First part")
        duplicate = ingest_event(archive, title="Beta", content="Second part")

        merged = archive.merge_entries(primary.id, duplicate.id)

        for fragment in ("First part", "Second part"):
            assert fragment in merged.content
|
|
|
|
|
|
def test_merge_does_not_duplicate_content():
    """Identical content must appear exactly once after a merge."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        primary = ingest_event(archive, title="Alpha", content="Same content")
        duplicate = ingest_event(archive, title="Beta", content="Same content")

        merged = archive.merge_entries(primary.id, duplicate.id)

        assert merged.content.count("Same content") == 1
|
|
|
|
|
|
def test_merge_redirects_links():
    """After a merge, no surviving entry may still reference the removed id.

    e3 may have auto-linked to e2 during ingest (content is similar); merging
    e2 into e1 must redirect or drop that link so nothing dangles.
    """
    with tempfile.TemporaryDirectory() as tmp:
        archive = _make_archive(Path(tmp))
        e1 = ingest_event(archive, title="Alpha", content="Python automation scripts")
        e2 = ingest_event(archive, title="Beta", content="Python coding tools")
        e3 = ingest_event(archive, title="Gamma", content="Python scripting helpers")

        archive.merge_entries(e1.id, e2.id)

        e3_fresh = archive.get(e3.id)
        assert e2.id not in e3_fresh.links

        # Whether or not e3 was linked to e2, no entry anywhere in the
        # archive may keep a dangling reference to the removed duplicate.
        assert all(e2.id not in entry.links for entry in archive._entries.values())
|
|
|
|
|
|
def test_merge_removes_duplicate():
    """The duplicate entry is deleted; the primary survives."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        survivor = ingest_event(archive, title="Alpha", content="first")
        victim = ingest_event(archive, title="Beta", content="second")

        archive.merge_entries(survivor.id, victim.id)

        assert archive.get(victim.id) is None
        assert archive.get(survivor.id) is not None
|
|
|
|
|
|
def test_merge_same_id_is_noop():
    """Merging an entry with itself changes nothing and returns the entry."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        entry = ingest_event(archive, title="Alpha", content="content")

        result = archive.merge_entries(entry.id, entry.id)

        assert result.id == entry.id
        assert archive.count == 1
|
|
|
|
|
|
def test_merge_nonexistent_returns_none():
    """An unknown id on either side makes merge_entries return None."""
    with tempfile.TemporaryDirectory() as tmpdir:
        archive = _make_archive(Path(tmpdir))
        entry = ingest_event(archive, title="Alpha", content="content")

        assert archive.merge_entries(entry.id, "no-such-id") is None
        assert archive.merge_entries("no-such-id", entry.id) is None
|
|
|
|
|
|
def test_merge_persists():
    """A merge survives a save/reload round-trip through the JSON file."""
    with tempfile.TemporaryDirectory() as tmp:
        path = Path(tmp) / "persist.json"
        first = MnemosyneArchive(archive_path=path)
        e1 = ingest_event(first, title="Alpha", content="first", topics=["x"])
        e2 = ingest_event(first, title="Beta", content="second", topics=["y"])
        first.merge_entries(e1.id, e2.id)

        # Reload from disk: only the merged entry should come back.
        reloaded = MnemosyneArchive(archive_path=path)
        assert reloaded.count == 1

        # Use the public recent() API rather than poking the private
        # _entries dict to fetch the surviving entry.
        entry = reloaded.recent(n=1)[0]
        assert set(entry.topics) == {"x", "y"}
|