From bb856765ce4d64fc4322cfbd9c0da8091696fc0c Mon Sep 17 00:00:00 2001
From: Alexander Whitestone
Date: Sun, 12 Apr 2026 06:20:39 +0000
Subject: [PATCH] test: add consolidation tests (#1260)

Covers exact duplicate detection, dry-run, topic/link merging, triple
duplicates, and link repair.
---
 nexus/mnemosyne/tests/test_consolidation.py | 137 ++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 nexus/mnemosyne/tests/test_consolidation.py

diff --git a/nexus/mnemosyne/tests/test_consolidation.py b/nexus/mnemosyne/tests/test_consolidation.py
new file mode 100644
index 0000000..e6138e3
--- /dev/null
+++ b/nexus/mnemosyne/tests/test_consolidation.py
@@ -0,0 +1,137 @@
"""Tests for MnemosyneArchive.consolidate()."""

from __future__ import annotations

import tempfile  # NOTE(review): unused in this module — candidate for removal
from pathlib import Path  # NOTE(review): unused — pytest's tmp_path already yields a Path

import pytest

from nexus.mnemosyne.archive import MnemosyneArchive
from nexus.mnemosyne.entry import ArchiveEntry


@pytest.fixture
def archive(tmp_path):
    """Create an archive with auto_embed disabled for deterministic tests."""
    path = tmp_path / "test_archive.json"
    return MnemosyneArchive(archive_path=path, auto_embed=False)


class TestConsolidateExactDuplicates:
    """Phase 1: exact duplicate detection by content_hash."""

    def test_finds_exact_duplicates(self, archive):
        entry_a = ArchiveEntry(title="Hello", content="World")
        entry_b = ArchiveEntry(title="Hello", content="World")
        archive.add(entry_a, auto_link=False)
        archive.add(entry_b, auto_link=False)

        # Force same content_hash
        entry_b.content_hash = entry_a.content_hash

        # Re-add entry_b manually (bypass add() dedup)
        archive._entries[entry_b.id] = entry_b

        merges = archive.consolidate()
        assert len(merges) == 1
        assert merges[0]["reason"] == "exact_content_hash"
        assert merges[0]["similarity"] == 1.0  # exact-hash merges report full similarity

    def test_keeps_older_entry(self, archive):
        entry_a = ArchiveEntry(title="First", content="Data", created_at="2024-01-01T00:00:00+00:00")
        entry_b = ArchiveEntry(title="Second", content="Data", created_at="2024-06-01T00:00:00+00:00")
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b  # inject duplicate directly, bypassing add()

        merges = archive.consolidate()
        assert merges[0]["kept_id"] == entry_a.id  # the entry with the earlier created_at survives
        assert merges[0]["removed_id"] == entry_b.id

    def test_dry_run_does_not_modify(self, archive):
        entry_a = ArchiveEntry(title="A", content="Same")
        entry_b = ArchiveEntry(title="B", content="Same")
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b

        count_before = archive.count
        merges = archive.consolidate(dry_run=True)
        assert len(merges) == 1  # duplicates are still reported in dry-run mode
        assert archive.count == count_before  # unchanged

    def test_no_duplicates_returns_empty(self, archive):
        archive.add(ArchiveEntry(title="Unique A", content="Content A"), auto_link=False)
        archive.add(ArchiveEntry(title="Unique B", content="Content B"), auto_link=False)
        merges = archive.consolidate()
        assert merges == []

    def test_merges_topics(self, archive):
        entry_a = ArchiveEntry(title="A", content="Data", topics=["python"])
        entry_b = ArchiveEntry(title="B", content="Data", topics=["testing"])
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b

        archive.consolidate()
        keeper = archive.get(entry_a.id)
        assert "python" in keeper.topics  # keeper retains its own topics
        assert "testing" in keeper.topics  # ...and absorbs the duplicate's topics

    def test_merges_links(self, archive):
        entry_c = ArchiveEntry(title="C", content="Ref")
        archive.add(entry_c, auto_link=False)

        entry_a = ArchiveEntry(title="A", content="Data", links=[entry_c.id])
        entry_b = ArchiveEntry(title="B", content="Data", links=[entry_c.id])
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b

        archive.consolidate()
        keeper = archive.get(entry_a.id)
        assert entry_c.id in keeper.links

    def test_removes_duplicate_from_archive(self, archive):
        entry_a = ArchiveEntry(title="A", content="Same")
        entry_b = ArchiveEntry(title="B", content="Same")
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b

        archive.consolidate()
        assert archive.get(entry_a.id) is not None  # keeper remains retrievable
        assert archive.get(entry_b.id) is None  # duplicate is gone from the archive

    def test_fixes_links_pointing_to_removed(self, archive):
        entry_a = ArchiveEntry(title="A", content="Same")
        entry_b = ArchiveEntry(title="B", content="Same")
        entry_c = ArchiveEntry(title="C", content="Ref", links=[entry_b.id])  # links to the soon-removed duplicate
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive.add(entry_c, auto_link=False)
        archive._entries[entry_b.id] = entry_b

        archive.consolidate()
        survivor = archive.get(entry_c.id)
        assert entry_b.id not in survivor.links  # dangling link removed
        assert entry_a.id in survivor.links  # ...and rewritten to point at the keeper


class TestConsolidateTripleDuplicates:
    """Handle 3+ entries with the same content_hash."""

    def test_three_way_merge(self, archive):
        entry_a = ArchiveEntry(title="A", content="Same", created_at="2024-01-01T00:00:00+00:00")
        entry_b = ArchiveEntry(title="B", content="Same", created_at="2024-02-01T00:00:00+00:00")
        entry_c = ArchiveEntry(title="C", content="Same", created_at="2024-03-01T00:00:00+00:00")
        entry_b.content_hash = entry_a.content_hash
        entry_c.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b
        archive._entries[entry_c.id] = entry_c

        merges = archive.consolidate()
        assert len(merges) == 2  # one merge record per removed duplicate
        assert all(m["kept_id"] == entry_a.id for m in merges)  # oldest entry kept in every merge