# hermes-agent/plugins/memory/holographic/dedup.py
"""Cross-tier memory deduplication.
Detects and resolves duplicate facts between MEMORY.md (built-in) and the
holographic fact store. Facts should live in exactly one tier:
Tier 1 — MEMORY.md: Always-on context (compact, <50 entries ideal).
Tier 2 — Fact store: Deep structured storage (unlimited, entity-aware).
Tier 3 — MemPalace: Specialized long-form archives.
Ownership rules:
- user_pref / project / tool facts → fact store (structured, searchable)
- "always-on" operational notes → MEMORY.md (compact, system prompt)
- When in doubt: fact store wins (it has dedup, trust scoring, FTS5)
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
_SIMILARITY_THRESHOLD = 0.85
@dataclass
class DuplicatePair:
    """One suspected cross-tier duplicate: a MEMORY.md line matched against a
    fact-store entry, plus its resolution state."""
    memory_entry: str        # raw MEMORY.md line (un-normalized)
    memory_index: int        # position of the line within the MEMORY.md entry list
    fact_store_id: int       # fact store id (-1 when the fact dict had no "fact_id")
    fact_store_content: str  # raw fact content (un-normalized)
    similarity: float        # SequenceMatcher ratio in [0, 1]
    resolution: str = ""     # e.g. "keep_factstore"; empty until resolved
    resolved: bool = False   # True once resolve_pair() has been applied
@dataclass
class DedupReport:
    """Outcome of one cross-tier duplicate scan.

    Carries the raw counts from both tiers plus the list of matched
    :class:`DuplicatePair` entries found above the similarity threshold.
    """
    total_memory_entries: int = 0
    total_facts: int = 0
    duplicates_found: int = 0
    pairs: List[DuplicatePair] = field(default_factory=list)

    def summary(self) -> str:
        """Render a human-readable multi-line report: header plus one line per pair."""
        header = (
            f"Cross-tier dedup: {self.total_memory_entries} MEMORY.md entries, "
            f"{self.total_facts} fact store entries, "
            f"{self.duplicates_found} duplicates found"
        )
        details = []
        for pair in self.pairs:
            tag = f"[{pair.resolution}]" if pair.resolved else "[PENDING]"
            details.append(
                f"  {tag} sim={pair.similarity:.2f} "
                f"mem[{pair.memory_index}]: {pair.memory_entry[:60]} "
                f"<> fact#{pair.fact_store_id}: {pair.fact_store_content[:60]}"
            )
        return "\n".join([header, *details])
def _normalize(text: str) -> str:
text = text.strip().lower()
text = re.sub(r"^[\\s>*\\-\\u2022]+", "", text)
text = re.sub(r"\\s+", " ", text)
text = text.rstrip(".,;:!?",)
return text
def _similarity(a: str, b: str) -> float:
if not a or not b:
return 0.0
return SequenceMatcher(None, a, b).ratio()
def scan_cross_tier_duplicates(
    memory_entries: List[str],
    fact_store_facts: List[Dict[str, Any]],
    threshold: float = _SIMILARITY_THRESHOLD,
) -> DedupReport:
    """Compare every MEMORY.md entry against every fact-store fact.

    Args:
        memory_entries: raw MEMORY.md lines (Tier 1).
        fact_store_facts: fact dicts; ``content`` and ``fact_id`` keys are read.
        threshold: minimum SequenceMatcher ratio to flag a pair as duplicate.

    Returns:
        A DedupReport listing all pairs at or above *threshold*. Entries or
        facts whose normalized form is shorter than 10 chars are skipped
        (too short to match meaningfully).
    """
    report = DedupReport(
        total_memory_entries=len(memory_entries),
        total_facts=len(fact_store_facts),
    )
    # PERF: normalize each fact once up front instead of once per memory
    # entry (previously O(entries x facts) normalization calls).
    normalized_facts = [
        (fact, _normalize(fact.get("content", "")))
        for fact in fact_store_facts
    ]
    for i, mem_line in enumerate(memory_entries):
        mem_norm = _normalize(mem_line)
        if not mem_norm or len(mem_norm) < 10:
            continue
        for fact, fact_norm in normalized_facts:
            if not fact_norm or len(fact_norm) < 10:
                continue
            sim = _similarity(mem_norm, fact_norm)
            if sim >= threshold:
                report.pairs.append(DuplicatePair(
                    memory_entry=mem_line,
                    memory_index=i,
                    fact_store_id=fact.get("fact_id", -1),
                    fact_store_content=fact.get("content", ""),
                    similarity=sim,
                ))
    report.duplicates_found = len(report.pairs)
    return report
def classify_tier(fact_content: str, category: str = "general") -> str:
    """Decide which tier owns a fact: ``"memory"`` or ``"factstore"``.

    Structured categories (user_pref/project/tool) always belong to the fact
    store. Short notes (<80 chars) containing always-on operational keywords
    belong in MEMORY.md. Everything else defaults to the fact store.
    """
    structured_categories = ("user_pref", "project", "tool")
    memory_keywords = ("todo", "note:", "fix:", "remember:", "always", "never")
    if category in structured_categories:
        return "factstore"
    trimmed = fact_content.strip()
    lowered = trimmed.lower()
    is_short_note = len(trimmed) < 80 and any(kw in lowered for kw in memory_keywords)
    return "memory" if is_short_note else "factstore"
def resolve_pair(pair: DuplicatePair) -> str:
    """Mark *pair* resolved and return the chosen resolution.

    Policy: the fact store always wins (it has dedup, trust scoring, FTS5 —
    see the module docstring), so the resolution is ``"keep_factstore"``.
    """
    resolution = "keep_factstore"
    pair.resolution = resolution
    pair.resolved = True
    return resolution
def resolve_duplicates(
    report: DedupReport,
    memory_entries: List[str],
    fact_store=None,
) -> List[str]:
    """Apply a resolution to every pair and return the cleaned MEMORY.md list.

    ``keep_factstore`` drops the matching MEMORY.md line; ``keep_memory``
    removes the fact from *fact_store* instead (best-effort — removal
    failures are only logged at debug level). *memory_entries* is not
    mutated; a new list is returned.
    """
    drop_indices: set = set()
    for pair in report.pairs:
        resolve_pair(pair)
        if pair.resolution == "keep_factstore":
            drop_indices.add(pair.memory_index)
        elif pair.resolution == "keep_memory" and fact_store:
            try:
                fact_store.remove_fact(pair.fact_store_id)
            except Exception as e:
                logger.debug("Failed to remove fact %d: %s", pair.fact_store_id, e)
    cleaned = [
        entry
        for idx, entry in enumerate(memory_entries)
        if idx not in drop_indices
    ]
    removed_count = len(memory_entries) - len(cleaned)
    if removed_count:
        logger.info("Dedup removed %d duplicate entries from MEMORY.md", removed_count)
    return cleaned
def is_duplicate_before_add(
    content: str,
    existing_facts: List[Dict[str, Any]],
    threshold: float = _SIMILARITY_THRESHOLD,
) -> Optional[Dict[str, Any]]:
    """Check if content is a duplicate of an existing fact BEFORE adding.

    Returns the first matching fact dict if *content* is a duplicate, else
    None. Used by on_memory_write to prevent cross-tier duplication at
    write time. Normalized strings shorter than 10 chars never match.
    """
    candidate = _normalize(content)
    if len(candidate) < 10:
        return None
    for fact in existing_facts:
        normalized = _normalize(fact.get("content", ""))
        if len(normalized) < 10:
            continue
        if _similarity(candidate, normalized) >= threshold:
            return fact
    return None
# Markdown ownership guide mirroring the module docstring's tier rules;
# exposed as a constant so other components can surface it verbatim.
TIER_OWNERSHIP_DOC = """# Memory Tier Ownership
Each fact lives in exactly one tier to prevent duplicate tokens and stale-data divergence.
## Tier 1 — MEMORY.md (built-in)
- Always-on system prompt context (compact, <50 entries ideal).
- Operational notes, active task state, immediate context.
- Managed by: `memory` tool.
## Tier 2 — Fact Store (holographic)
- Deep structured storage with search and reasoning.
- user_pref, project, tool facts; entity-linked knowledge.
- Managed by: `fact_store` tool.
- Has: FTS5 search, trust scoring, entity resolution.
## Tier 3 — MemPalace
- Specialized long-form archives and research.
## Rules
- MEMORY.md entries >100 chars → migrate to fact store.
- Structured categories (user_pref, project, tool) → fact store.
- Duplicate across tiers: fact store wins (it has trust scoring).
- `on_memory_write` checks fact store before mirroring.
"""