Compare commits
5 Commits
mimo/code/
...
feat/memor
| Author | SHA1 | Date | |
|---|---|---|---|
| e7754ce101 | |||
| 2fa8b5d99b | |||
| bb856765ce | |||
| 1e110922b2 | |||
| b308e627b8 |
@@ -182,9 +182,12 @@ planned:
|
|||||||
- "#TBD" # Will be filled when PR is created
|
- "#TBD" # Will be filled when PR is created
|
||||||
|
|
||||||
memory_consolidation:
|
memory_consolidation:
|
||||||
status: planned
|
status: shipped
|
||||||
|
files: [archive.py, cli.py]
|
||||||
description: >
|
description: >
|
||||||
Automatic merging of duplicate/near-duplicate memories
|
Automatic merging of duplicate/near-duplicate memories
|
||||||
using content_hash and semantic similarity. Periodic
|
using content_hash and semantic similarity. Periodic
|
||||||
consolidation pass.
|
consolidation pass.
|
||||||
priority: low
|
priority: low
|
||||||
|
merged_prs:
|
||||||
|
- "#TBD" # Will be filled when PR is created
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
"""nexus.mnemosyne — The Living Holographic Archive.
|
"""nexus.mnemosyne — The Living Holographic Archive.
|
||||||
|
|
||||||
Phase 1: Foundation — core archive, entry model, holographic linker,
|
Phase 1: Foundation — core archive, entry model, holographic linker,
|
||||||
ingestion pipeline, and CLI.
|
ingestion pipeline, memory consolidation, and CLI.
|
||||||
|
|
||||||
Builds on MemPalace vector memory to create interconnected meaning:
|
Builds on MemPalace vector memory to create interconnected meaning:
|
||||||
entries auto-reference related entries via semantic similarity,
|
entries auto-reference related entries via semantic similarity,
|
||||||
|
|||||||
@@ -972,3 +972,123 @@ class MnemosyneArchive:
|
|||||||
|
|
||||||
self._save()
|
self._save()
|
||||||
return total_links
|
return total_links
|
||||||
|
|
||||||
|
def consolidate(
    self,
    similarity_threshold: float = 0.9,
    dry_run: bool = False,
) -> list[dict]:
    """Find and merge duplicate or near-duplicate entries.

    Scans all entries for:
    1. Exact duplicates: same content_hash
    2. Near-duplicates: embedding similarity > threshold (when available)

    When merging, the older entry is kept. Topics, links, and metadata
    from the newer entry are merged into the survivor. The newer entry
    is removed, and links elsewhere in the archive that pointed at it
    are redirected to the survivor.

    Args:
        similarity_threshold: Minimum cosine similarity to consider
            near-duplicate (default 0.9). Only used with embedding backend.
        dry_run: If True, returns merge pairs without modifying the archive.

    Returns:
        List of dicts with keys: kept_id, removed_id, kept_title,
        removed_title, reason, similarity.
    """
    merges: list[dict] = []
    entries = list(self._entries.values())
    removed_ids: set[str] = set()

    def record(keeper, loser, reason, similarity):
        # Record one merge decision and retire the loser from further passes.
        merges.append({
            "kept_id": keeper.id,
            "removed_id": loser.id,
            "kept_title": keeper.title,
            "removed_title": loser.title,
            "reason": reason,
            "similarity": similarity,
        })
        removed_ids.add(loser.id)

    # Phase 1: exact duplicates by content_hash.
    hash_groups: dict[str, list] = {}
    for entry in entries:
        if entry.content_hash:
            hash_groups.setdefault(entry.content_hash, []).append(entry)

    for group in hash_groups.values():
        if len(group) < 2:
            continue
        group.sort(key=lambda e: e.created_at)  # oldest entry survives
        keeper = group[0]
        for dup in group[1:]:
            if dup.id not in removed_ids:
                record(keeper, dup, "exact_content_hash", 1.0)

    # Phase 2: near-duplicates via embedding similarity.
    if self._embedding_backend is not None:
        active = [e for e in entries if e.id not in removed_ids]
        for i, a in enumerate(active):
            if a.id in removed_ids:
                continue
            vec_a = self.linker._get_embedding(a)
            if not vec_a:
                continue
            for b in active[i + 1:]:
                if b.id in removed_ids:
                    continue
                vec_b = self.linker._get_embedding(b)
                if not vec_b:
                    continue
                sim = self._embedding_backend.similarity(vec_a, vec_b)
                if sim < similarity_threshold:
                    continue
                # Older entry wins; ties keep ``a`` (stable with hash phase).
                if a.created_at <= b.created_at:
                    keeper, loser = a, b
                else:
                    keeper, loser = b, a
                record(keeper, loser, "embedding_similarity", round(sim, 4))
                # Fix: once ``a`` itself has been merged away it must stop
                # anchoring comparisons — otherwise an already-removed entry
                # could be recorded as the keeper of later pairs, and those
                # merges would be silently skipped in the execute phase.
                if loser is a:
                    break

    if dry_run:
        return merges

    # Execute merges: fold loser state into the keeper, drop the loser,
    # then rewire archive-wide links that pointed at the loser.
    for merge in merges:
        keeper = self._entries.get(merge["kept_id"])
        loser = self._entries.get(merge["removed_id"])
        if keeper is None or loser is None:
            continue  # pair already resolved by an earlier merge in a chain

        for topic in loser.topics:
            if topic not in keeper.topics:
                keeper.topics.append(topic)

        for link_id in loser.links:
            if link_id != keeper.id and link_id not in keeper.links:
                keeper.links.append(link_id)

        # Keeper's metadata wins on key collisions.
        for key, value in loser.metadata.items():
            keeper.metadata.setdefault(key, value)

        keeper.updated_at = datetime.now(timezone.utc).isoformat()

        del self._entries[loser.id]

        # Redirect dangling links from the removed entry to the survivor.
        for entry in self._entries.values():
            if merge["removed_id"] in entry.links:
                entry.links.remove(merge["removed_id"])
                if merge["kept_id"] not in entry.links and merge["kept_id"] != entry.id:
                    entry.links.append(merge["kept_id"])

    if merges:
        self._save()

    return merges
|
||||||
|
|
||||||
|
|||||||
@@ -154,6 +154,23 @@ def cmd_rebuild(args):
|
|||||||
print(f"Rebuilt links: {total} connections across {archive.count} entries")
|
print(f"Rebuilt links: {total} connections across {archive.count} entries")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_consolidate(args):
    """CLI handler for the ``consolidate`` subcommand.

    Finds and merges duplicate/near-duplicate archive entries, printing a
    human-readable summary of each merged pair.

    Args:
        args: Parsed argparse namespace; uses ``args.threshold`` (float,
            similarity cutoff for near-duplicates) and ``args.dry_run``
            (bool, preview without modifying the archive).
    """
    archive = MnemosyneArchive()
    threshold = args.threshold
    # dry_run=True makes consolidate() report pairs without mutating state.
    merges = archive.consolidate(similarity_threshold=threshold, dry_run=args.dry_run)
    if not merges:
        print("No duplicates found.")
        return
    # Phrase the summary to match whether anything was actually changed.
    action = "Would merge" if args.dry_run else "Merged"
    print(f"{action} {len(merges)} pair(s):\n")
    for m in merges:
        sim = m["similarity"]
        reason = m["reason"]
        # Title truncated to 60 chars, IDs shortened to 8-char prefixes.
        print(f" [{reason}] {m['kept_title'][:60]}")
        print(f" kept: {m['kept_id'][:8]}")
        print(f" removed: {m['removed_id'][:8]} (similarity: {sim})\n")
|
||||||
|
|
||||||
|
|
||||||
def cmd_tag(args):
|
def cmd_tag(args):
|
||||||
archive = MnemosyneArchive()
|
archive = MnemosyneArchive()
|
||||||
tags = [t.strip() for t in args.tags.split(",") if t.strip()]
|
tags = [t.strip() for t in args.tags.split(",") if t.strip()]
|
||||||
@@ -263,6 +280,10 @@ def main():
|
|||||||
rb = sub.add_parser("rebuild", help="Recompute all links from scratch")
|
rb = sub.add_parser("rebuild", help="Recompute all links from scratch")
|
||||||
rb.add_argument("-t", "--threshold", type=float, default=None, help="Similarity threshold override")
|
rb.add_argument("-t", "--threshold", type=float, default=None, help="Similarity threshold override")
|
||||||
|
|
||||||
|
co = sub.add_parser("consolidate", help="Find and merge duplicate/near-duplicate entries")
|
||||||
|
co.add_argument("-t", "--threshold", type=float, default=0.9, help="Similarity threshold for near-duplicates (default: 0.9)")
|
||||||
|
co.add_argument("--dry-run", action="store_true", help="Show what would merge without modifying")
|
||||||
|
|
||||||
tg = sub.add_parser("tag", help="Add tags to an existing entry")
|
tg = sub.add_parser("tag", help="Add tags to an existing entry")
|
||||||
tg.add_argument("entry_id", help="Entry ID")
|
tg.add_argument("entry_id", help="Entry ID")
|
||||||
tg.add_argument("tags", help="Comma-separated tags to add")
|
tg.add_argument("tags", help="Comma-separated tags to add")
|
||||||
@@ -300,6 +321,7 @@ def main():
|
|||||||
"hubs": cmd_hubs,
|
"hubs": cmd_hubs,
|
||||||
"bridges": cmd_bridges,
|
"bridges": cmd_bridges,
|
||||||
"rebuild": cmd_rebuild,
|
"rebuild": cmd_rebuild,
|
||||||
|
"consolidate": cmd_consolidate,
|
||||||
"tag": cmd_tag,
|
"tag": cmd_tag,
|
||||||
"untag": cmd_untag,
|
"untag": cmd_untag,
|
||||||
"retag": cmd_retag,
|
"retag": cmd_retag,
|
||||||
|
|||||||
137
nexus/mnemosyne/tests/test_consolidation.py
Normal file
137
nexus/mnemosyne/tests/test_consolidation.py
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
"""Tests for MnemosyneArchive.consolidate()."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from nexus.mnemosyne.archive import MnemosyneArchive
|
||||||
|
from nexus.mnemosyne.entry import ArchiveEntry
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def archive(tmp_path):
    """Create an archive with auto_embed disabled for deterministic tests."""
    # Each test gets an isolated archive file under pytest's tmp_path,
    # so nothing persists between tests.
    path = tmp_path / "test_archive.json"
    return MnemosyneArchive(archive_path=path, auto_embed=False)
|
||||||
|
|
||||||
|
|
||||||
|
class TestConsolidateExactDuplicates:
    """Phase 1: exact duplicate detection by content_hash."""

    def test_finds_exact_duplicates(self, archive):
        # Two entries with identical content; forcing the hash equal makes
        # them exact duplicates from consolidate()'s point of view.
        entry_a = ArchiveEntry(title="Hello", content="World")
        entry_b = ArchiveEntry(title="Hello", content="World")
        archive.add(entry_a, auto_link=False)
        archive.add(entry_b, auto_link=False)

        # Force same content_hash
        entry_b.content_hash = entry_a.content_hash

        # Re-add entry_b manually (bypass add() dedup)
        archive._entries[entry_b.id] = entry_b

        merges = archive.consolidate()
        assert len(merges) == 1
        assert merges[0]["reason"] == "exact_content_hash"
        assert merges[0]["similarity"] == 1.0

    def test_keeps_older_entry(self, archive):
        # created_at decides the survivor: the older entry must be kept.
        entry_a = ArchiveEntry(title="First", content="Data", created_at="2024-01-01T00:00:00+00:00")
        entry_b = ArchiveEntry(title="Second", content="Data", created_at="2024-06-01T00:00:00+00:00")
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b

        merges = archive.consolidate()
        assert merges[0]["kept_id"] == entry_a.id
        assert merges[0]["removed_id"] == entry_b.id

    def test_dry_run_does_not_modify(self, archive):
        # dry_run=True should report the pair but leave the archive intact.
        entry_a = ArchiveEntry(title="A", content="Same")
        entry_b = ArchiveEntry(title="B", content="Same")
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b

        count_before = archive.count
        merges = archive.consolidate(dry_run=True)
        assert len(merges) == 1
        assert archive.count == count_before  # unchanged

    def test_no_duplicates_returns_empty(self, archive):
        # Distinct content means no merges at all.
        archive.add(ArchiveEntry(title="Unique A", content="Content A"), auto_link=False)
        archive.add(ArchiveEntry(title="Unique B", content="Content B"), auto_link=False)
        merges = archive.consolidate()
        assert merges == []

    def test_merges_topics(self, archive):
        # Topics from the removed entry must be folded into the survivor.
        entry_a = ArchiveEntry(title="A", content="Data", topics=["python"])
        entry_b = ArchiveEntry(title="B", content="Data", topics=["testing"])
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b

        archive.consolidate()
        keeper = archive.get(entry_a.id)
        assert "python" in keeper.topics
        assert "testing" in keeper.topics

    def test_merges_links(self, archive):
        # Links from the removed entry must survive on the keeper.
        entry_c = ArchiveEntry(title="C", content="Ref")
        archive.add(entry_c, auto_link=False)

        entry_a = ArchiveEntry(title="A", content="Data", links=[entry_c.id])
        entry_b = ArchiveEntry(title="B", content="Data", links=[entry_c.id])
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b

        archive.consolidate()
        keeper = archive.get(entry_a.id)
        assert entry_c.id in keeper.links

    def test_removes_duplicate_from_archive(self, archive):
        # After consolidation the duplicate is gone from the archive.
        entry_a = ArchiveEntry(title="A", content="Same")
        entry_b = ArchiveEntry(title="B", content="Same")
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b

        archive.consolidate()
        assert archive.get(entry_a.id) is not None
        assert archive.get(entry_b.id) is None

    def test_fixes_links_pointing_to_removed(self, archive):
        # Other entries that linked to the removed duplicate must be
        # rewired to point at the surviving entry instead.
        entry_a = ArchiveEntry(title="A", content="Same")
        entry_b = ArchiveEntry(title="B", content="Same")
        entry_c = ArchiveEntry(title="C", content="Ref", links=[entry_b.id])
        entry_b.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive.add(entry_c, auto_link=False)
        archive._entries[entry_b.id] = entry_b

        archive.consolidate()
        survivor = archive.get(entry_c.id)
        assert entry_b.id not in survivor.links
        assert entry_a.id in survivor.links
|
||||||
|
|
||||||
|
|
||||||
|
class TestConsolidateTripleDuplicates:
    """Handle 3+ entries with the same content_hash."""

    def test_three_way_merge(self, archive):
        # Three entries share one hash: the oldest (A) must absorb both
        # newer ones, producing two merge records with the same keeper.
        entry_a = ArchiveEntry(title="A", content="Same", created_at="2024-01-01T00:00:00+00:00")
        entry_b = ArchiveEntry(title="B", content="Same", created_at="2024-02-01T00:00:00+00:00")
        entry_c = ArchiveEntry(title="C", content="Same", created_at="2024-03-01T00:00:00+00:00")
        entry_b.content_hash = entry_a.content_hash
        entry_c.content_hash = entry_a.content_hash
        archive.add(entry_a, auto_link=False)
        archive._entries[entry_b.id] = entry_b
        archive._entries[entry_c.id] = entry_c

        merges = archive.consolidate()
        assert len(merges) == 2
        assert all(m["kept_id"] == entry_a.id for m in merges)
|
||||||
Reference in New Issue
Block a user