# hermes-agent/plugins/memory/holographic/dedup.py

"""Cross-tier memory deduplication.

Detects and resolves duplicate facts between MEMORY.md (built-in) and the
holographic fact store.  Facts should live in exactly one tier:

  Tier 1 — MEMORY.md:  Always-on context (compact, <50 entries ideal).
  Tier 2 — Fact store: Deep structured storage (unlimited, entity-aware).
  Tier 3 — MemPalace:  Specialized long-form archives.

Ownership rules:
  - user_pref / project / tool facts → fact store (structured, searchable)
  - "always-on" operational notes → MEMORY.md (compact, system prompt)
  - When in doubt: fact store wins (it has dedup, trust scoring, FTS5)
"""

from __future__ import annotations

import logging
import re
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from typing import Any, Dict, List, Optional, Tuple

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Default minimum SequenceMatcher ratio (0.0-1.0) at which two normalized
# texts are treated as duplicates; used as the default `threshold` argument
# of the scan / pre-add check functions below.
_SIMILARITY_THRESHOLD = 0.85


@dataclass
class DuplicatePair:
    """One MEMORY.md entry matched against one fact-store fact.

    ``scan_cross_tier_duplicates`` creates an instance for every
    (entry, fact) combination whose similarity reaches the threshold.
    """

    # Original (un-normalized) MEMORY.md entry text.
    memory_entry: str
    # Position of the entry in the list of MEMORY.md entries that was scanned.
    memory_index: int
    # Value of the fact's "fact_id" key; the scanner stores -1 when it is absent.
    fact_store_id: int
    # Original (un-normalized) value of the fact's "content" key.
    fact_store_content: str
    # Similarity ratio computed on the normalized texts.
    similarity: float
    # Decision label written by ``resolve_pair`` (currently "keep_factstore");
    # empty until the pair has been resolved.
    resolution: str = ""
    # Set to True by ``resolve_pair`` once a decision has been recorded.
    resolved: bool = False


@dataclass
class DedupReport:
    """Outcome of a cross-tier duplicate scan.

    Holds the sizes of both inputs, the number of duplicate pairs detected,
    and the pairs themselves.
    """

    total_memory_entries: int = 0
    total_facts: int = 0
    duplicates_found: int = 0
    pairs: List[DuplicatePair] = field(default_factory=list)

    def summary(self) -> str:
        """Return a multi-line, human-readable description of the report.

        The first line carries the totals; each following line describes one
        pair, showing its resolution label (or ``[PENDING]`` when the pair
        has not been resolved) and both texts truncated to 60 characters.
        """

        def render(pair) -> str:
            # Unresolved pairs get a fixed marker instead of a decision label.
            tag = f"[{pair.resolution}]" if pair.resolved else "[PENDING]"
            return (
                f"  {tag} sim={pair.similarity:.2f} "
                f"mem[{pair.memory_index}]: {pair.memory_entry[:60]} "
                f"<> fact#{pair.fact_store_id}: {pair.fact_store_content[:60]}"
            )

        header = (
            f"Cross-tier dedup: {self.total_memory_entries} MEMORY.md entries, "
            f"{self.total_facts} fact store entries, "
            f"{self.duplicates_found} duplicates found"
        )
        return "\n".join([header, *map(render, self.pairs)])


def _normalize(text: str) -> str:
    text = text.strip().lower()
    text = re.sub(r"^[\\s>*\\-\\u2022]+", "", text)
    text = re.sub(r"\\s+", " ", text)
    text = text.rstrip(".,;:!?",)
    return text


def _similarity(a: str, b: str) -> float:
    if not a or not b:
        return 0.0
    return SequenceMatcher(None, a, b).ratio()


def scan_cross_tier_duplicates(
    memory_entries: List[str],
    fact_store_facts: List[Dict[str, Any]],
    threshold: float = _SIMILARITY_THRESHOLD,
) -> DedupReport:
    """Find MEMORY.md entries that duplicate facts in the fact store.

    Every entry is compared with every fact after both texts have been
    normalized. Texts whose normalized form is shorter than 10 characters are
    ignored on either side. One ``DuplicatePair`` is recorded for each
    (entry, fact) combination whose similarity is at least *threshold*, so a
    single entry may appear in several pairs.

    Args:
        memory_entries: MEMORY.md entries, in file order.
        fact_store_facts: Fact dicts; the "content" key is compared and the
            "fact_id" key (``-1`` when missing) is recorded on the pair.
        threshold: Minimum similarity ratio for a pair to be reported.

    Returns:
        A ``DedupReport`` with input sizes, the pairs found, and their count.
    """
    min_len = 10  # normalized texts shorter than this are never compared

    # Normalize and length-filter each fact exactly once. Doing this inside
    # the entry loop repeated the same work for every memory entry.
    candidates = []
    for fact in fact_store_facts:
        fact_norm = _normalize(fact.get("content", ""))
        if len(fact_norm) >= min_len:
            candidates.append((fact, fact_norm))

    report = DedupReport(
        total_memory_entries=len(memory_entries),
        total_facts=len(fact_store_facts),
    )
    for i, mem_line in enumerate(memory_entries):
        mem_norm = _normalize(mem_line)
        if len(mem_norm) < min_len:
            continue
        for fact, fact_norm in candidates:
            sim = _similarity(mem_norm, fact_norm)
            if sim >= threshold:
                report.pairs.append(DuplicatePair(
                    memory_entry=mem_line,
                    memory_index=i,
                    fact_store_id=fact.get("fact_id", -1),
                    fact_store_content=fact.get("content", ""),
                    similarity=sim,
                ))
    report.duplicates_found = len(report.pairs)
    return report


def classify_tier(fact_content: str, category: str = "general") -> str:
    """Decide which tier a fact belongs to.

    Returns ``"factstore"`` for the structured categories (``user_pref``,
    ``project``, ``tool``). Otherwise returns ``"memory"`` only when the
    stripped content is shorter than 80 characters and contains one of the
    operational keywords (case-insensitive substring match); everything else
    goes to ``"factstore"``.
    """
    if category in ("user_pref", "project", "tool"):
        return "factstore"

    trimmed = fact_content.strip()
    if len(trimmed) >= 80:
        return "factstore"

    lowered = trimmed.lower()
    for marker in ("todo", "note:", "fix:", "remember:", "always", "never"):
        if marker in lowered:
            return "memory"
    return "factstore"


def resolve_pair(pair: DuplicatePair) -> str:
    """Record the resolution for *pair* and return it.

    The decision is always ``"keep_factstore"``: the pair is marked resolved
    with that label, which is also the return value.
    """
    decision = "keep_factstore"
    pair.resolved = True
    pair.resolution = decision
    return decision


def resolve_duplicates(
    report: DedupReport,
    memory_entries: List[str],
    fact_store=None,
) -> List[str]:
    """Resolve every pair in *report* and return the surviving entries.

    Each pair is passed to ``resolve_pair``. A ``"keep_factstore"`` decision
    schedules the pair's MEMORY.md entry for removal; a ``"keep_memory"``
    decision instead asks *fact_store* (when one is supplied and truthy) to
    remove the fact, logging any failure at debug level without raising.

    Args:
        report: Scan result whose pairs are resolved in place.
        memory_entries: The entries the report's indices refer to.
        fact_store: Optional object exposing ``remove_fact(fact_id)``.

    Returns:
        A new list containing the entries that were not removed.
    """
    drop = set()
    for dup in report.pairs:
        resolve_pair(dup)
        decision = dup.resolution
        if decision == "keep_factstore":
            drop.add(dup.memory_index)
            continue
        if decision != "keep_memory" or not fact_store:
            continue
        # Best-effort removal: a failing store must not abort the whole pass.
        try:
            fact_store.remove_fact(dup.fact_store_id)
        except Exception as exc:
            logger.debug("Failed to remove fact %d: %s", dup.fact_store_id, exc)

    survivors = []
    for position, entry in enumerate(memory_entries):
        if position not in drop:
            survivors.append(entry)

    dropped_count = len(memory_entries) - len(survivors)
    if dropped_count:
        logger.info("Dedup removed %d duplicate entries from MEMORY.md", dropped_count)
    return survivors


def is_duplicate_before_add(
    content: str,
    existing_facts: List[Dict[str, Any]],
    threshold: float = _SIMILARITY_THRESHOLD,
) -> Optional[Dict[str, Any]]:
    """Look for an existing fact that *content* would duplicate.

    Meant to run before a write (e.g. from ``on_memory_write``) so the same
    information is not stored in two tiers. Both sides are normalized first;
    texts whose normalized form is shorter than 10 characters never match.

    Returns:
        The first fact dict whose similarity to *content* is at least
        *threshold*, or ``None`` when there is no such fact.
    """
    needle = _normalize(content)
    if len(needle) < 10:
        return None

    for candidate in existing_facts:
        haystack = _normalize(candidate.get("content", ""))
        if len(haystack) >= 10 and _similarity(needle, haystack) >= threshold:
            return candidate
    return None


# Markdown description of the tier ownership policy. It is not referenced
# elsewhere in the visible part of this module; presumably exported for
# callers or documentation tooling - TODO confirm.
# NOTE(review): the ">100 chars" rule below differs from the 80-character
# cut-off used by classify_tier(); confirm which value is intended.
TIER_OWNERSHIP_DOC = """# Memory Tier Ownership

Each fact lives in exactly one tier to prevent duplicate tokens and stale-data divergence.

## Tier 1 — MEMORY.md (built-in)
- Always-on system prompt context (compact, <50 entries ideal).
- Operational notes, active task state, immediate context.
- Managed by: `memory` tool.

## Tier 2 — Fact Store (holographic)
- Deep structured storage with search and reasoning.
- user_pref, project, tool facts; entity-linked knowledge.
- Managed by: `fact_store` tool.
- Has: FTS5 search, trust scoring, entity resolution.

## Tier 3 — MemPalace
- Specialized long-form archives and research.

## Rules
- MEMORY.md entries >100 chars → migrate to fact store.
- Structured categories (user_pref, project, tool) → fact store.
- Duplicate across tiers: fact store wins (it has trust scoring).
- `on_memory_write` checks fact store before mirroring.
"""