test: add consolidation tests (#1260)

Covers exact duplicate detection, dry-run, topic/link
merging, triple duplicates, and link repair.
This commit is contained in:
2026-04-12 06:20:39 +00:00
parent 1e110922b2
commit bb856765ce

View File

@@ -0,0 +1,137 @@
"""Tests for MnemosyneArchive.consolidate()."""
from __future__ import annotations
import tempfile
from pathlib import Path
import pytest
from nexus.mnemosyne.archive import MnemosyneArchive
from nexus.mnemosyne.entry import ArchiveEntry
@pytest.fixture
def archive(tmp_path):
    """Provide a fresh archive backed by a temp file, with auto_embed off for determinism."""
    archive_file = tmp_path / "test_archive.json"
    return MnemosyneArchive(archive_path=archive_file, auto_embed=False)
class TestConsolidateExactDuplicates:
    """Phase 1: exact duplicate detection by content_hash."""

    def test_finds_exact_duplicates(self, archive):
        original = ArchiveEntry(title="Hello", content="World")
        duplicate = ArchiveEntry(title="Hello", content="World")
        archive.add(original, auto_link=False)
        archive.add(duplicate, auto_link=False)
        # Force both entries to share a content_hash.
        duplicate.content_hash = original.content_hash
        # Insert directly so add()'s dedup cannot reject the copy.
        archive._entries[duplicate.id] = duplicate
        merges = archive.consolidate()
        assert len(merges) == 1
        merge = merges[0]
        assert merge["reason"] == "exact_content_hash"
        assert merge["similarity"] == 1.0

    def test_keeps_older_entry(self, archive):
        older = ArchiveEntry(title="First", content="Data", created_at="2024-01-01T00:00:00+00:00")
        newer = ArchiveEntry(title="Second", content="Data", created_at="2024-06-01T00:00:00+00:00")
        newer.content_hash = older.content_hash
        archive.add(older, auto_link=False)
        archive._entries[newer.id] = newer
        merge = archive.consolidate()[0]
        # The earlier created_at wins; the later entry is dropped.
        assert merge["kept_id"] == older.id
        assert merge["removed_id"] == newer.id

    def test_dry_run_does_not_modify(self, archive):
        original = ArchiveEntry(title="A", content="Same")
        duplicate = ArchiveEntry(title="B", content="Same")
        duplicate.content_hash = original.content_hash
        archive.add(original, auto_link=False)
        archive._entries[duplicate.id] = duplicate
        count_before = archive.count
        merges = archive.consolidate(dry_run=True)
        assert len(merges) == 1
        # A dry run reports merges without touching the archive.
        assert archive.count == count_before

    def test_no_duplicates_returns_empty(self, archive):
        for title, content in (("Unique A", "Content A"), ("Unique B", "Content B")):
            archive.add(ArchiveEntry(title=title, content=content), auto_link=False)
        assert archive.consolidate() == []

    def test_merges_topics(self, archive):
        keeper_entry = ArchiveEntry(title="A", content="Data", topics=["python"])
        duplicate = ArchiveEntry(title="B", content="Data", topics=["testing"])
        duplicate.content_hash = keeper_entry.content_hash
        archive.add(keeper_entry, auto_link=False)
        archive._entries[duplicate.id] = duplicate
        archive.consolidate()
        merged = archive.get(keeper_entry.id)
        # Topics from both entries survive on the keeper.
        assert "python" in merged.topics
        assert "testing" in merged.topics

    def test_merges_links(self, archive):
        target = ArchiveEntry(title="C", content="Ref")
        archive.add(target, auto_link=False)
        keeper_entry = ArchiveEntry(title="A", content="Data", links=[target.id])
        duplicate = ArchiveEntry(title="B", content="Data", links=[target.id])
        duplicate.content_hash = keeper_entry.content_hash
        archive.add(keeper_entry, auto_link=False)
        archive._entries[duplicate.id] = duplicate
        archive.consolidate()
        merged = archive.get(keeper_entry.id)
        assert target.id in merged.links

    def test_removes_duplicate_from_archive(self, archive):
        keeper_entry = ArchiveEntry(title="A", content="Same")
        duplicate = ArchiveEntry(title="B", content="Same")
        duplicate.content_hash = keeper_entry.content_hash
        archive.add(keeper_entry, auto_link=False)
        archive._entries[duplicate.id] = duplicate
        archive.consolidate()
        # Keeper stays retrievable; the duplicate is gone.
        assert archive.get(keeper_entry.id) is not None
        assert archive.get(duplicate.id) is None

    def test_fixes_links_pointing_to_removed(self, archive):
        keeper_entry = ArchiveEntry(title="A", content="Same")
        duplicate = ArchiveEntry(title="B", content="Same")
        referrer = ArchiveEntry(title="C", content="Ref", links=[duplicate.id])
        duplicate.content_hash = keeper_entry.content_hash
        archive.add(keeper_entry, auto_link=False)
        archive.add(referrer, auto_link=False)
        archive._entries[duplicate.id] = duplicate
        archive.consolidate()
        survivor = archive.get(referrer.id)
        # Links are retargeted from the removed duplicate to the keeper.
        assert duplicate.id not in survivor.links
        assert keeper_entry.id in survivor.links
class TestConsolidateTripleDuplicates:
    """Handle 3+ entries with the same content_hash."""

    def test_three_way_merge(self, archive):
        oldest = ArchiveEntry(title="A", content="Same", created_at="2024-01-01T00:00:00+00:00")
        middle = ArchiveEntry(title="B", content="Same", created_at="2024-02-01T00:00:00+00:00")
        newest = ArchiveEntry(title="C", content="Same", created_at="2024-03-01T00:00:00+00:00")
        # Give every duplicate the oldest entry's content_hash.
        for dup in (middle, newest):
            dup.content_hash = oldest.content_hash
        archive.add(oldest, auto_link=False)
        archive._entries[middle.id] = middle
        archive._entries[newest.id] = newest
        merges = archive.consolidate()
        # Two merges, both folding into the single oldest keeper.
        assert len(merges) == 2
        for merge in merges:
            assert merge["kept_id"] == oldest.id