test: add consolidation tests (#1260)
Covers exact duplicate detection, dry-run, topic/link merging, triple duplicates, and link repair.
This commit is contained in:
137
nexus/mnemosyne/tests/test_consolidation.py
Normal file
137
nexus/mnemosyne/tests/test_consolidation.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""Tests for MnemosyneArchive.consolidate()."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from nexus.mnemosyne.archive import MnemosyneArchive
|
||||
from nexus.mnemosyne.entry import ArchiveEntry
|
||||
|
||||
|
||||
@pytest.fixture
def archive(tmp_path):
    """Build a fresh archive backed by a temp file.

    Embedding is switched off so consolidation results are deterministic.
    """
    archive_file = tmp_path / "test_archive.json"
    return MnemosyneArchive(archive_path=archive_file, auto_embed=False)
|
||||
|
||||
|
||||
class TestConsolidateExactDuplicates:
    """Phase 1: exact duplicate detection by content_hash."""

    @staticmethod
    def _inject_duplicate(archive, original, duplicate):
        """Force *duplicate* to share *original*'s content_hash and insert it
        directly into the archive's entry map, bypassing add()'s own dedup.

        Factored out because six tests below need this identical setup.
        """
        duplicate.content_hash = original.content_hash
        archive._entries[duplicate.id] = duplicate

    def test_finds_exact_duplicates(self, archive):
        """Two entries with the same content_hash produce one merge record."""
        entry_a = ArchiveEntry(title="Hello", content="World")
        entry_b = ArchiveEntry(title="Hello", content="World")
        archive.add(entry_a, auto_link=False)
        archive.add(entry_b, auto_link=False)

        self._inject_duplicate(archive, entry_a, entry_b)

        merges = archive.consolidate()
        assert len(merges) == 1
        assert merges[0]["reason"] == "exact_content_hash"
        assert merges[0]["similarity"] == 1.0

    def test_keeps_older_entry(self, archive):
        """The entry with the earlier created_at survives the merge."""
        entry_a = ArchiveEntry(title="First", content="Data", created_at="2024-01-01T00:00:00+00:00")
        entry_b = ArchiveEntry(title="Second", content="Data", created_at="2024-06-01T00:00:00+00:00")
        archive.add(entry_a, auto_link=False)
        self._inject_duplicate(archive, entry_a, entry_b)

        merges = archive.consolidate()
        assert merges[0]["kept_id"] == entry_a.id
        assert merges[0]["removed_id"] == entry_b.id

    def test_dry_run_does_not_modify(self, archive):
        """dry_run=True reports merges without touching the archive."""
        entry_a = ArchiveEntry(title="A", content="Same")
        entry_b = ArchiveEntry(title="B", content="Same")
        archive.add(entry_a, auto_link=False)
        self._inject_duplicate(archive, entry_a, entry_b)

        count_before = archive.count
        merges = archive.consolidate(dry_run=True)
        assert len(merges) == 1
        assert archive.count == count_before  # unchanged

    def test_no_duplicates_returns_empty(self, archive):
        """Distinct entries produce no merge records."""
        archive.add(ArchiveEntry(title="Unique A", content="Content A"), auto_link=False)
        archive.add(ArchiveEntry(title="Unique B", content="Content B"), auto_link=False)
        merges = archive.consolidate()
        assert merges == []

    def test_merges_topics(self, archive):
        """The keeper inherits topics from the removed duplicate."""
        entry_a = ArchiveEntry(title="A", content="Data", topics=["python"])
        entry_b = ArchiveEntry(title="B", content="Data", topics=["testing"])
        archive.add(entry_a, auto_link=False)
        self._inject_duplicate(archive, entry_a, entry_b)

        archive.consolidate()
        keeper = archive.get(entry_a.id)
        assert "python" in keeper.topics
        assert "testing" in keeper.topics

    def test_merges_links(self, archive):
        """Outgoing links of the removed duplicate are preserved on the keeper."""
        entry_c = ArchiveEntry(title="C", content="Ref")
        archive.add(entry_c, auto_link=False)

        entry_a = ArchiveEntry(title="A", content="Data", links=[entry_c.id])
        entry_b = ArchiveEntry(title="B", content="Data", links=[entry_c.id])
        archive.add(entry_a, auto_link=False)
        self._inject_duplicate(archive, entry_a, entry_b)

        archive.consolidate()
        keeper = archive.get(entry_a.id)
        assert entry_c.id in keeper.links

    def test_removes_duplicate_from_archive(self, archive):
        """The duplicate entry is actually deleted from the archive."""
        entry_a = ArchiveEntry(title="A", content="Same")
        entry_b = ArchiveEntry(title="B", content="Same")
        archive.add(entry_a, auto_link=False)
        self._inject_duplicate(archive, entry_a, entry_b)

        archive.consolidate()
        assert archive.get(entry_a.id) is not None
        assert archive.get(entry_b.id) is None

    def test_fixes_links_pointing_to_removed(self, archive):
        """Links in other entries are rewritten from the removed id to the keeper."""
        entry_a = ArchiveEntry(title="A", content="Same")
        entry_b = ArchiveEntry(title="B", content="Same")
        entry_c = ArchiveEntry(title="C", content="Ref", links=[entry_b.id])
        archive.add(entry_a, auto_link=False)
        archive.add(entry_c, auto_link=False)
        self._inject_duplicate(archive, entry_a, entry_b)

        archive.consolidate()
        survivor = archive.get(entry_c.id)
        assert entry_b.id not in survivor.links
        assert entry_a.id in survivor.links
|
||||
class TestConsolidateTripleDuplicates:
    """Handle 3+ entries with the same content_hash."""

    def test_three_way_merge(self, archive):
        """Three hash-identical entries collapse into the oldest one."""
        timestamps = (
            "2024-01-01T00:00:00+00:00",
            "2024-02-01T00:00:00+00:00",
            "2024-03-01T00:00:00+00:00",
        )
        oldest, middle, newest = (
            ArchiveEntry(title=name, content="Same", created_at=stamp)
            for name, stamp in zip(("A", "B", "C"), timestamps)
        )
        for dup in (middle, newest):
            dup.content_hash = oldest.content_hash

        archive.add(oldest, auto_link=False)
        # Insert the duplicates directly, bypassing add()'s dedup.
        archive._entries[middle.id] = middle
        archive._entries[newest.id] = newest

        merges = archive.consolidate()
        assert len(merges) == 2
        assert all(m["kept_id"] == oldest.id for m in merges)
Reference in New Issue
Block a user