feat: add consolidate() method to MnemosyneArchive (#1260)
Scans for exact and near-duplicate entries by content_hash and embedding similarity. The older entry survives each merge and absorbs the union of topics, links, and metadata from its duplicates.
This commit is contained in:
@@ -972,3 +972,123 @@ class MnemosyneArchive:
|
||||
|
||||
self._save()
|
||||
return total_links
|
||||
|
||||
def consolidate(
|
||||
self,
|
||||
similarity_threshold: float = 0.9,
|
||||
dry_run: bool = False,
|
||||
) -> list[dict]:
|
||||
"""Find and merge duplicate or near-duplicate entries.
|
||||
|
||||
Scans all entries for:
|
||||
1. Exact duplicates: same content_hash
|
||||
2. Near-duplicates: embedding similarity > threshold (when available)
|
||||
|
||||
When merging, the older entry is kept. Topics, links, and metadata
|
||||
from the newer entry are merged into the survivor. The newer entry
|
||||
is removed.
|
||||
|
||||
Args:
|
||||
similarity_threshold: Minimum cosine similarity to consider
|
||||
near-duplicate (default 0.9). Only used with embedding backend.
|
||||
dry_run: If True, returns merge pairs without modifying the archive.
|
||||
|
||||
Returns:
|
||||
List of dicts with keys: kept_id, removed_id, reason, similarity.
|
||||
"""
|
||||
merges = []
|
||||
entries = list(self._entries.values())
|
||||
removed_ids: set[str] = set()
|
||||
|
||||
# Phase 1: exact duplicates by content_hash
|
||||
hash_groups: dict[str, list[ArchiveEntry]] = {}
|
||||
for entry in entries:
|
||||
if entry.content_hash:
|
||||
hash_groups.setdefault(entry.content_hash, []).append(entry)
|
||||
|
||||
for content_hash, group in hash_groups.items():
|
||||
if len(group) < 2:
|
||||
continue
|
||||
group.sort(key=lambda e: e.created_at)
|
||||
keeper = group[0]
|
||||
for dup in group[1:]:
|
||||
if dup.id in removed_ids:
|
||||
continue
|
||||
merges.append({
|
||||
"kept_id": keeper.id,
|
||||
"removed_id": dup.id,
|
||||
"kept_title": keeper.title,
|
||||
"removed_title": dup.title,
|
||||
"reason": "exact_content_hash",
|
||||
"similarity": 1.0,
|
||||
})
|
||||
removed_ids.add(dup.id)
|
||||
|
||||
# Phase 2: near-duplicates via embedding similarity
|
||||
if self._embedding_backend is not None:
|
||||
active = [e for e in entries if e.id not in removed_ids]
|
||||
for i, a in enumerate(active):
|
||||
if a.id in removed_ids:
|
||||
continue
|
||||
vec_a = self.linker._get_embedding(a)
|
||||
if not vec_a:
|
||||
continue
|
||||
for b in active[i + 1:]:
|
||||
if b.id in removed_ids:
|
||||
continue
|
||||
vec_b = self.linker._get_embedding(b)
|
||||
if not vec_b:
|
||||
continue
|
||||
sim = self._embedding_backend.similarity(vec_a, vec_b)
|
||||
if sim >= similarity_threshold:
|
||||
if a.created_at <= b.created_at:
|
||||
keeper, loser = a, b
|
||||
else:
|
||||
keeper, loser = b, a
|
||||
merges.append({
|
||||
"kept_id": keeper.id,
|
||||
"removed_id": loser.id,
|
||||
"kept_title": keeper.title,
|
||||
"removed_title": loser.title,
|
||||
"reason": "embedding_similarity",
|
||||
"similarity": round(sim, 4),
|
||||
})
|
||||
removed_ids.add(loser.id)
|
||||
|
||||
if dry_run:
|
||||
return merges
|
||||
|
||||
# Execute merges
|
||||
for merge in merges:
|
||||
keeper = self._entries.get(merge["kept_id"])
|
||||
loser = self._entries.get(merge["removed_id"])
|
||||
if keeper is None or loser is None:
|
||||
continue
|
||||
|
||||
for topic in loser.topics:
|
||||
if topic not in keeper.topics:
|
||||
keeper.topics.append(topic)
|
||||
|
||||
for link_id in loser.links:
|
||||
if link_id != keeper.id and link_id not in keeper.links:
|
||||
keeper.links.append(link_id)
|
||||
|
||||
for key, value in loser.metadata.items():
|
||||
if key not in keeper.metadata:
|
||||
keeper.metadata[key] = value
|
||||
|
||||
keeper.updated_at = datetime.now(timezone.utc).isoformat()
|
||||
|
||||
del self._entries[loser.id]
|
||||
|
||||
for entry in self._entries.values():
|
||||
if merge["removed_id"] in entry.links:
|
||||
entry.links.remove(merge["removed_id"])
|
||||
if merge["kept_id"] not in entry.links and merge["kept_id"] != entry.id:
|
||||
entry.links.append(merge["kept_id"])
|
||||
|
||||
if merges:
|
||||
self._save()
|
||||
|
||||
return merges
|
||||
|
||||
|
||||
Reference in New Issue
Block a user