def consolidate(
    self,
    similarity_threshold: float = 0.9,
    dry_run: bool = False,
) -> list[dict]:
    """Find and merge duplicate or near-duplicate entries.

    Scans all entries for:

    1. Exact duplicates: same ``content_hash``.
    2. Near-duplicates: embedding similarity >= ``similarity_threshold``
       (only when an embedding backend is configured).

    When merging, the older entry (by ``created_at``) is kept. Topics,
    links, and metadata from the newer entry are merged into the
    survivor; on metadata key conflicts the keeper's value wins. The
    newer entry is removed and all inbound links archive-wide are
    redirected to the keeper. Merges are executed in discovery order,
    so chained merges (A -> B, then B -> C) cascade correctly.

    Args:
        similarity_threshold: Minimum cosine similarity to consider
            near-duplicate (default 0.9). Only used with embedding backend.
        dry_run: If True, returns merge pairs without modifying the archive.

    Returns:
        List of dicts with keys: kept_id, removed_id, kept_title,
        removed_title, reason, similarity.
    """
    merges: list[dict] = []
    entries = list(self._entries.values())
    removed_ids: set[str] = set()

    # ---- Phase 1: exact duplicates by content_hash ----
    hash_groups: dict[str, list[ArchiveEntry]] = {}
    for entry in entries:
        # Entries without a hash (falsy) cannot be exact-matched.
        if entry.content_hash:
            hash_groups.setdefault(entry.content_hash, []).append(entry)

    for group in hash_groups.values():
        if len(group) < 2:
            continue
        # Oldest entry survives; sort is stable, so ties keep scan order.
        group.sort(key=lambda e: e.created_at)
        keeper = group[0]
        for dup in group[1:]:
            if dup.id in removed_ids:
                continue
            merges.append({
                "kept_id": keeper.id,
                "removed_id": dup.id,
                "kept_title": keeper.title,
                "removed_title": dup.title,
                "reason": "exact_content_hash",
                "similarity": 1.0,
            })
            removed_ids.add(dup.id)

    # ---- Phase 2: near-duplicates via embedding similarity ----
    if self._embedding_backend is not None:
        active = [e for e in entries if e.id not in removed_ids]
        for i, a in enumerate(active):
            if a.id in removed_ids:
                continue
            vec_a = self.linker._get_embedding(a)
            if not vec_a:
                # No embedding available for this entry; skip it.
                continue
            for b in active[i + 1:]:
                if b.id in removed_ids:
                    continue
                vec_b = self.linker._get_embedding(b)
                if not vec_b:
                    continue
                sim = self._embedding_backend.similarity(vec_a, vec_b)
                if sim < similarity_threshold:
                    continue
                # Older entry survives, mirroring Phase 1.
                if a.created_at <= b.created_at:
                    keeper, loser = a, b
                else:
                    keeper, loser = b, a
                merges.append({
                    "kept_id": keeper.id,
                    "removed_id": loser.id,
                    "kept_title": keeper.title,
                    "removed_title": loser.title,
                    "reason": "embedding_similarity",
                    "similarity": round(sim, 4),
                })
                removed_ids.add(loser.id)
                if loser is a:
                    # BUG FIX: once the outer entry `a` has been merged
                    # away, stop pairing it against later entries.
                    # Continuing could record a merge whose keeper is
                    # the already-removed `a`; at execution time that
                    # keeper no longer exists, so the merge is silently
                    # skipped and the other entry is reported removed
                    # while actually surviving un-merged.
                    break

    if dry_run:
        return merges

    # ---- Execute merges (in discovery order so chains cascade) ----
    for merge in merges:
        keeper = self._entries.get(merge["kept_id"])
        loser = self._entries.get(merge["removed_id"])
        if keeper is None or loser is None:
            # Defensive: should not occur now that Phase 2 breaks when
            # the outer entry is removed.
            continue

        for topic in loser.topics:
            if topic not in keeper.topics:
                keeper.topics.append(topic)

        # Carry over outbound links, avoiding self-links and duplicates.
        for link_id in loser.links:
            if link_id != keeper.id and link_id not in keeper.links:
                keeper.links.append(link_id)

        # Keeper's metadata wins on key conflicts.
        for key, value in loser.metadata.items():
            if key not in keeper.metadata:
                keeper.metadata[key] = value

        keeper.updated_at = datetime.now(timezone.utc).isoformat()

        del self._entries[loser.id]

        # Redirect inbound links archive-wide: only entries that
        # actually pointed at the removed entry gain a link to the
        # keeper (never a self-link, never a duplicate).
        for entry in self._entries.values():
            if merge["removed_id"] in entry.links:
                entry.links.remove(merge["removed_id"])
                if merge["kept_id"] not in entry.links and merge["kept_id"] != entry.id:
                    entry.links.append(merge["kept_id"])

    if merges:
        self._save()

    return merges