"""Tests for MnemosyneArchive.consolidate() — duplicate/near-duplicate merging."""
import tempfile
from pathlib import Path

from nexus.mnemosyne.archive import MnemosyneArchive
from nexus.mnemosyne.entry import ArchiveEntry
from nexus.mnemosyne.ingest import ingest_event
|
def _archive(tmp: str) -> MnemosyneArchive:
    """Build a fresh archive rooted in *tmp*, with auto-embedding disabled."""
    return MnemosyneArchive(
        archive_path=Path(tmp) / "archive.json",
        auto_embed=False,
    )
|
def test_consolidate_exact_duplicate_removed():
    """Two entries with identical content_hash are merged; only one survives."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _archive(tmp)
        # First copy goes through the normal ingest path (return value unused).
        ingest_event(archive, title="Hello world", content="Exactly the same content", topics=["a"])
        # Manually insert a second entry with the same content hash, bypassing
        # the dedup guard in add(), so consolidate() itself is what gets tested.
        e2 = ArchiveEntry(title="Hello world", content="Exactly the same content", topics=["b"])
        archive._entries[e2.id] = e2
        archive._save()

        assert archive.count == 2
        merges = archive.consolidate(dry_run=False)
        # Exact-hash duplicates merge with a perfect score and shrink the archive.
        assert len(merges) == 1
        assert merges[0]["reason"] == "exact_hash"
        assert merges[0]["score"] == 1.0
        assert archive.count == 1
def test_consolidate_keeps_older_entry():
    """The older entry (earlier created_at) is kept, the newer is removed."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _archive(tmp)
        older = ingest_event(archive, title="Hello world", content="Same content here", topics=[])

        newer = ArchiveEntry(title="Hello world", content="Same content here", topics=[])
        newer.created_at = "2099-01-01T00:00:00+00:00"  # unambiguously later timestamp
        archive._entries[newer.id] = newer
        archive._save()

        merges = archive.consolidate(dry_run=False)
        assert len(merges) == 1
        # The merge record must keep the earlier entry and drop the later one.
        assert merges[0]["kept"] == older.id
        assert merges[0]["removed"] == newer.id
def test_consolidate_merges_topics():
    """Topics from the removed entry are merged (unioned) into the kept entry."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _archive(tmp)
        kept = ingest_event(archive, title="Memory item", content="Shared content body", topics=["alpha"])

        # Duplicate carrying extra topics; forced newer so `kept` survives.
        dup = ArchiveEntry(title="Memory item", content="Shared content body", topics=["beta", "gamma"])
        dup.created_at = "2099-01-01T00:00:00+00:00"
        archive._entries[dup.id] = dup
        archive._save()

        archive.consolidate(dry_run=False)

        survivor = archive.get(kept.id)
        assert survivor is not None
        # The survivor's topic set is the union of both entries' topics.
        lowered = {topic.lower() for topic in survivor.topics}
        for expected in ("alpha", "beta", "gamma"):
            assert expected in lowered
def test_consolidate_merges_metadata():
    """Metadata from the removed entry is merged into the kept entry; kept values win."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _archive(tmp)
        e1 = ArchiveEntry(
            title="Shared", content="Identical body here", topics=[], metadata={"k1": "v1", "shared": "kept"}
        )
        archive._entries[e1.id] = e1
        e2 = ArchiveEntry(
            title="Shared", content="Identical body here", topics=[], metadata={"k2": "v2", "shared": "removed"}
        )
        e2.created_at = "2099-01-01T00:00:00+00:00"  # force e2 to lose as the newer entry
        archive._entries[e2.id] = e2
        archive._save()

        archive.consolidate(dry_run=False)
        survivor = archive.get(e1.id)
        # Guard before dereferencing, matching the sibling tests — a regression
        # should fail the assert, not raise AttributeError on None.
        assert survivor is not None
        assert survivor.metadata["k1"] == "v1"
        assert survivor.metadata["k2"] == "v2"
        assert survivor.metadata["shared"] == "kept"  # kept entry wins on key conflicts
def test_consolidate_dry_run_no_mutation():
    """Dry-run mode returns merge plan but does not alter the archive."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _archive(tmp)
        ingest_event(archive, title="Same", content="Identical content to dedup", topics=[])

        duplicate = ArchiveEntry(title="Same", content="Identical content to dedup", topics=[])
        duplicate.created_at = "2099-01-01T00:00:00+00:00"
        archive._entries[duplicate.id] = duplicate
        archive._save()

        plan = archive.consolidate(dry_run=True)
        assert len(plan) == 1
        assert plan[0]["dry_run"] is True
        # Dry run reports the merge but leaves both entries in place.
        assert archive.count == 2
def test_consolidate_no_duplicates():
    """When no duplicates exist, consolidate returns an empty list."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _archive(tmp)
        # Two entries with no hash collision and low textual overlap.
        ingest_event(archive, title="Unique A", content="This is completely unique content for A")
        ingest_event(archive, title="Unique B", content="Totally different words here for B")

        assert archive.consolidate(threshold=0.9) == []
def test_consolidate_transfers_links():
    """Links from the removed entry are inherited by the kept entry."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _archive(tmp)
        # A third entry acts as the link target.
        target = ingest_event(archive, title="Target", content="The link target entry", topics=[])

        # Older duplicate carries a link to the target (and vice versa).
        linked = ArchiveEntry(title="Dup", content="Exact duplicate body text", topics=[], links=[target.id])
        archive._entries[linked.id] = linked
        target.links.append(linked.id)

        # Newer duplicate has no links of its own.
        unlinked = ArchiveEntry(title="Dup", content="Exact duplicate body text", topics=[])
        unlinked.created_at = "2099-01-01T00:00:00+00:00"
        archive._entries[unlinked.id] = unlinked
        archive._save()

        archive.consolidate(dry_run=False)

        survivor = archive.get(linked.id)
        assert survivor is not None
        assert target.id in survivor.links
def test_consolidate_near_duplicate_semantic():
    """Near-duplicate entries above the similarity threshold are merged."""
    with tempfile.TemporaryDirectory() as tmp:
        archive = _archive(tmp)
        # Bodies differ by a single word, giving very high Jaccard overlap.
        first = ArchiveEntry(
            title="Automator",
            content="python automation scripting building tools workflows",
            topics=[],
        )
        second = ArchiveEntry(
            title="Automator",
            content="python automation scripting building tools workflows tasks",
            topics=[],
        )
        second.created_at = "2099-01-01T00:00:00+00:00"
        archive._entries[first.id] = first
        archive._entries[second.id] = second
        archive._save()

        # A permissive threshold guarantees these near-identical entries match.
        merges = archive.consolidate(threshold=0.7, dry_run=False)
        assert len(merges) >= 1
        assert merges[0]["reason"] == "semantic_similarity"
def test_consolidate_persists_after_reload():
    """After consolidation, the reduced archive survives a save/reload cycle."""
    with tempfile.TemporaryDirectory() as tmp:
        store = Path(tmp) / "archive.json"
        archive = MnemosyneArchive(archive_path=store, auto_embed=False)
        ingest_event(archive, title="Persist test", content="Body to dedup and persist", topics=[])

        duplicate = ArchiveEntry(title="Persist test", content="Body to dedup and persist", topics=[])
        duplicate.created_at = "2099-01-01T00:00:00+00:00"
        archive._entries[duplicate.id] = duplicate
        archive._save()

        archive.consolidate(dry_run=False)
        assert archive.count == 1

        # A fresh instance reading the same file must see the consolidated state.
        reloaded = MnemosyneArchive(archive_path=store, auto_embed=False)
        assert reloaded.count == 1