Files
the-nexus/nexus/mnemosyne/tests/test_consolidation.py
Claude (Opus 4.6) b52c7281f0
Some checks failed
Deploy Nexus / deploy (push) Failing after 3s
Staging Verification Gate / verify-staging (push) Failing after 2s
[claude] Mnemosyne: memory consolidation — auto-merge duplicates (#1260) (#1262)
2026-04-12 06:24:24 +00:00

177 lines
7.3 KiB
Python

"""Tests for MnemosyneArchive.consolidate() — duplicate/near-duplicate merging."""
import tempfile
from pathlib import Path
from nexus.mnemosyne.archive import MnemosyneArchive
from nexus.mnemosyne.entry import ArchiveEntry
from nexus.mnemosyne.ingest import ingest_event
def _archive(tmp: str) -> MnemosyneArchive:
return MnemosyneArchive(archive_path=Path(tmp) / "archive.json", auto_embed=False)
def test_consolidate_exact_duplicate_removed():
"""Two entries with identical content_hash are merged; only one survives."""
with tempfile.TemporaryDirectory() as tmp:
archive = _archive(tmp)
e1 = ingest_event(archive, title="Hello world", content="Exactly the same content", topics=["a"])
# Manually add a second entry with the same hash to simulate a duplicate
e2 = ArchiveEntry(title="Hello world", content="Exactly the same content", topics=["b"])
# Bypass dedup guard so we can test consolidate() rather than add()
archive._entries[e2.id] = e2
archive._save()
assert archive.count == 2
merges = archive.consolidate(dry_run=False)
assert len(merges) == 1
assert merges[0]["reason"] == "exact_hash"
assert merges[0]["score"] == 1.0
assert archive.count == 1
def test_consolidate_keeps_older_entry():
"""The older entry (earlier created_at) is kept, the newer is removed."""
with tempfile.TemporaryDirectory() as tmp:
archive = _archive(tmp)
e1 = ingest_event(archive, title="Hello world", content="Same content here", topics=[])
e2 = ArchiveEntry(title="Hello world", content="Same content here", topics=[])
# Make e2 clearly newer
e2.created_at = "2099-01-01T00:00:00+00:00"
archive._entries[e2.id] = e2
archive._save()
merges = archive.consolidate(dry_run=False)
assert len(merges) == 1
assert merges[0]["kept"] == e1.id
assert merges[0]["removed"] == e2.id
def test_consolidate_merges_topics():
"""Topics from the removed entry are merged (unioned) into the kept entry."""
with tempfile.TemporaryDirectory() as tmp:
archive = _archive(tmp)
e1 = ingest_event(archive, title="Memory item", content="Shared content body", topics=["alpha"])
e2 = ArchiveEntry(title="Memory item", content="Shared content body", topics=["beta", "gamma"])
e2.created_at = "2099-01-01T00:00:00+00:00"
archive._entries[e2.id] = e2
archive._save()
archive.consolidate(dry_run=False)
survivor = archive.get(e1.id)
assert survivor is not None
topic_lower = {t.lower() for t in survivor.topics}
assert "alpha" in topic_lower
assert "beta" in topic_lower
assert "gamma" in topic_lower
def test_consolidate_merges_metadata():
"""Metadata from the removed entry is merged into the kept entry; kept values win."""
with tempfile.TemporaryDirectory() as tmp:
archive = _archive(tmp)
e1 = ArchiveEntry(
title="Shared", content="Identical body here", topics=[], metadata={"k1": "v1", "shared": "kept"}
)
archive._entries[e1.id] = e1
e2 = ArchiveEntry(
title="Shared", content="Identical body here", topics=[], metadata={"k2": "v2", "shared": "removed"}
)
e2.created_at = "2099-01-01T00:00:00+00:00"
archive._entries[e2.id] = e2
archive._save()
archive.consolidate(dry_run=False)
survivor = archive.get(e1.id)
assert survivor.metadata["k1"] == "v1"
assert survivor.metadata["k2"] == "v2"
assert survivor.metadata["shared"] == "kept" # kept entry wins
def test_consolidate_dry_run_no_mutation():
"""Dry-run mode returns merge plan but does not alter the archive."""
with tempfile.TemporaryDirectory() as tmp:
archive = _archive(tmp)
ingest_event(archive, title="Same", content="Identical content to dedup", topics=[])
e2 = ArchiveEntry(title="Same", content="Identical content to dedup", topics=[])
e2.created_at = "2099-01-01T00:00:00+00:00"
archive._entries[e2.id] = e2
archive._save()
merges = archive.consolidate(dry_run=True)
assert len(merges) == 1
assert merges[0]["dry_run"] is True
# Archive must be unchanged
assert archive.count == 2
def test_consolidate_no_duplicates():
"""When no duplicates exist, consolidate returns an empty list."""
with tempfile.TemporaryDirectory() as tmp:
archive = _archive(tmp)
ingest_event(archive, title="Unique A", content="This is completely unique content for A")
ingest_event(archive, title="Unique B", content="Totally different words here for B")
merges = archive.consolidate(threshold=0.9)
assert merges == []
def test_consolidate_transfers_links():
"""Links from the removed entry are inherited by the kept entry."""
with tempfile.TemporaryDirectory() as tmp:
archive = _archive(tmp)
# Create a third entry to act as a link target
target = ingest_event(archive, title="Target", content="The link target entry", topics=[])
e1 = ArchiveEntry(title="Dup", content="Exact duplicate body text", topics=[], links=[target.id])
archive._entries[e1.id] = e1
target.links.append(e1.id)
e2 = ArchiveEntry(title="Dup", content="Exact duplicate body text", topics=[])
e2.created_at = "2099-01-01T00:00:00+00:00"
archive._entries[e2.id] = e2
archive._save()
archive.consolidate(dry_run=False)
survivor = archive.get(e1.id)
assert survivor is not None
assert target.id in survivor.links
def test_consolidate_near_duplicate_semantic():
"""Near-duplicate entries above the similarity threshold are merged."""
with tempfile.TemporaryDirectory() as tmp:
archive = _archive(tmp)
# Entries with very high Jaccard overlap
text_a = "python automation scripting building tools workflows"
text_b = "python automation scripting building tools workflows tasks"
e1 = ArchiveEntry(title="Automator", content=text_a, topics=[])
e2 = ArchiveEntry(title="Automator", content=text_b, topics=[])
e2.created_at = "2099-01-01T00:00:00+00:00"
archive._entries[e1.id] = e1
archive._entries[e2.id] = e2
archive._save()
# Use a low threshold to ensure these very similar entries match
merges = archive.consolidate(threshold=0.7, dry_run=False)
assert len(merges) >= 1
assert merges[0]["reason"] == "semantic_similarity"
def test_consolidate_persists_after_reload():
"""After consolidation, the reduced archive survives a save/reload cycle."""
with tempfile.TemporaryDirectory() as tmp:
path = Path(tmp) / "archive.json"
archive = MnemosyneArchive(archive_path=path, auto_embed=False)
ingest_event(archive, title="Persist test", content="Body to dedup and persist", topics=[])
e2 = ArchiveEntry(title="Persist test", content="Body to dedup and persist", topics=[])
e2.created_at = "2099-01-01T00:00:00+00:00"
archive._entries[e2.id] = e2
archive._save()
archive.consolidate(dry_run=False)
assert archive.count == 1
reloaded = MnemosyneArchive(archive_path=path, auto_embed=False)
assert reloaded.count == 1