# Listing metadata from the source viewer: 192 lines, 6.2 KiB, Python.
"""Cross-tier memory deduplication.

Detects and resolves duplicate facts between MEMORY.md (built-in) and the
holographic fact store. Facts should live in exactly one tier:

    Tier 1 — MEMORY.md: Always-on context (compact, <50 entries ideal).
    Tier 2 — Fact store: Deep structured storage (unlimited, entity-aware).
    Tier 3 — MemPalace: Specialized long-form archives.

Ownership rules:
    - user_pref / project / tool facts → fact store (structured, searchable)
    - "always-on" operational notes → MEMORY.md (compact, system prompt)
    - When in doubt: fact store wins (it has dedup, trust scoring, FTS5)
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from difflib import SequenceMatcher
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Default similarity ratio (0.0-1.0) at or above which a MEMORY.md entry and a
# fact-store entry are reported as duplicates; used as the default `threshold`
# of the scanning helpers in this module.
_SIMILARITY_THRESHOLD = 0.85
|
|
|
|
|
|
@dataclass
class DuplicatePair:
    """One MEMORY.md entry matched against one fact-store entry.

    Instances are created by ``scan_cross_tier_duplicates`` and later mutated
    by ``resolve_pair``, which fills in ``resolution`` and ``resolved``.
    """

    # Raw (un-normalised) MEMORY.md line that matched.
    memory_entry: str
    # Position of that line in the scanned list of memory entries.
    memory_index: int
    # The fact's "fact_id" value; the scanner stores -1 when the key is absent.
    fact_store_id: int
    # Raw (un-normalised) "content" of the matching fact.
    fact_store_content: str
    # Similarity ratio between the two normalised texts.
    similarity: float
    # Decision label such as "keep_factstore"; empty until the pair is resolved.
    resolution: str = ""
    # True once a resolution has been recorded.
    resolved: bool = False
|
|
|
|
|
|
@dataclass
class DedupReport:
    """Result of a cross-tier scan: input sizes plus every duplicate pair found."""

    # Number of MEMORY.md entries that were scanned.
    total_memory_entries: int = 0
    # Number of fact-store entries that were scanned.
    total_facts: int = 0
    # Count of duplicate pairs; the scanner sets this to len(pairs).
    duplicates_found: int = 0
    # One record per (memory entry, fact) match.
    pairs: List[DuplicatePair] = field(default_factory=list)

    def summary(self) -> str:
        """Render the report as text: a header line, then one line per pair.

        Each pair line shows its resolution label (or ``[PENDING]`` while
        unresolved), the similarity to two decimals, and the first 60
        characters of both texts.
        """

        def describe(pair: DuplicatePair) -> str:
            # Unresolved pairs get a fixed marker instead of an empty label.
            tag = "[PENDING]"
            if pair.resolved:
                tag = f"[{pair.resolution}]"
            memory_preview = pair.memory_entry[:60]
            fact_preview = pair.fact_store_content[:60]
            return (
                f" {tag} sim={pair.similarity:.2f} "
                f"mem[{pair.memory_index}]: {memory_preview} "
                f"<> fact#{pair.fact_store_id}: {fact_preview}"
            )

        header = (
            f"Cross-tier dedup: {self.total_memory_entries} MEMORY.md entries, "
            f"{self.total_facts} fact store entries, "
            f"{self.duplicates_found} duplicates found"
        )
        return "\n".join([header] + [describe(pair) for pair in self.pairs])
|
|
|
|
|
|
def _normalize(text: str) -> str:
    """Canonicalise a memory or fact line so near-identical texts compare equal.

    Steps, in order:
      1. trim surrounding whitespace and lowercase;
      2. drop leading list/quote markers (whitespace, ``>``, ``*``, ``-`` and
         the bullet character U+2022);
      3. collapse every run of whitespace to a single space;
      4. drop trailing sentence punctuation (``. , ; : ! ?``) together with
         any spaces that removing it exposes.

    Args:
        text: Raw line from MEMORY.md or the ``content`` of a fact.

    Returns:
        The normalised text; may be an empty string.
    """
    text = text.strip().lower()
    # Single backslashes are required inside a raw string: with doubled
    # backslashes the class would match the literal letters "s" and "u", the
    # digits "2" and "0" and a backslash, eating the start of real words.
    text = re.sub(r"^[\s>*\-\u2022]+", "", text)
    text = re.sub(r"\s+", " ", text)
    # Whitespace is already collapsed to plain spaces, so stripping " " along
    # with the punctuation removes every trailing separator in one pass.
    return text.rstrip(".,;:!? ")
|
|
|
|
|
|
def _similarity(a: str, b: str) -> float:
    """Return the ``SequenceMatcher`` ratio of *a* and *b*.

    The ratio lies between 0.0 and 1.0. When either string is empty the result
    is 0.0, so two empty strings never count as a match.
    """
    if a and b:
        return SequenceMatcher(None, a, b).ratio()
    return 0.0
|
|
|
|
|
|
def scan_cross_tier_duplicates(
    memory_entries: List[str],
    fact_store_facts: List[Dict[str, Any]],
    threshold: float = _SIMILARITY_THRESHOLD,
) -> DedupReport:
    """Compare every MEMORY.md entry with every fact and report near-duplicates.

    Both sides are normalised with ``_normalize`` and scored with
    ``_similarity``. Texts whose normalised form is shorter than 10 characters
    are ignored on either side.

    Args:
        memory_entries: Raw MEMORY.md lines.
        fact_store_facts: Fact dicts. The ``"content"`` key is compared and the
            ``"fact_id"`` key is recorded (-1 when absent).
        threshold: Minimum similarity ratio for a pair to be reported.

    Returns:
        A ``DedupReport`` with one ``DuplicatePair`` per match, ordered by
        memory entry and then by fact, and ``duplicates_found`` set to the
        number of pairs.
    """
    min_length = 10

    report = DedupReport(
        total_memory_entries=len(memory_entries),
        total_facts=len(fact_store_facts),
    )

    # Normalise each fact once up front rather than once per memory entry; the
    # result does not depend on the memory entry being compared.
    candidates: List[Tuple[Dict[str, Any], str]] = []
    for fact in fact_store_facts:
        # `or ""` also covers a fact whose content is explicitly None.
        fact_norm = _normalize(fact.get("content") or "")
        if len(fact_norm) >= min_length:
            candidates.append((fact, fact_norm))

    for index, mem_line in enumerate(memory_entries):
        mem_norm = _normalize(mem_line)
        if len(mem_norm) < min_length:
            continue
        for fact, fact_norm in candidates:
            score = _similarity(mem_norm, fact_norm)
            if score >= threshold:
                report.pairs.append(DuplicatePair(
                    memory_entry=mem_line,
                    memory_index=index,
                    fact_store_id=fact.get("fact_id", -1),
                    fact_store_content=fact.get("content", ""),
                    similarity=score,
                ))

    report.duplicates_found = len(report.pairs)
    return report
|
|
|
|
|
|
def classify_tier(fact_content: str, category: str = "general") -> str:
    """Decide which tier should own a fact.

    Args:
        fact_content: Text of the fact.
        category: Fact category; ``user_pref``, ``project`` and ``tool`` always
            go to the fact store.

    Returns:
        ``"memory"`` when the stripped content is under 80 characters and
        contains one of the operational markers (case-insensitive substring
        match); ``"factstore"`` in every other case.
    """
    structured_categories = ("user_pref", "project", "tool")
    operational_markers = ("todo", "note:", "fix:", "remember:", "always", "never")

    if category in structured_categories:
        return "factstore"

    trimmed = fact_content.strip()
    if len(trimmed) >= 80:
        return "factstore"

    lowered = trimmed.lower()
    for marker in operational_markers:
        if marker in lowered:
            return "memory"
    return "factstore"
|
|
|
|
|
|
def resolve_pair(pair: DuplicatePair) -> str:
    """Record the resolution for one duplicate pair and return it.

    The current policy is unconditional: the fact-store copy is kept, so the
    pair is marked resolved with the label ``"keep_factstore"``.
    """
    decision = "keep_factstore"
    pair.resolution = decision
    pair.resolved = True
    return pair.resolution
|
|
|
|
|
|
def resolve_duplicates(
    report: DedupReport,
    memory_entries: List[str],
    fact_store=None,
) -> List[str]:
    """Resolve every pair in *report* and return the de-duplicated memory list.

    Each pair is passed to ``resolve_pair``, which mutates it in place. For a
    ``"keep_factstore"`` resolution the MEMORY.md entry at the pair's index is
    dropped. For a ``"keep_memory"`` resolution the fact is removed through
    ``fact_store.remove_fact(fact_id)`` when a store was supplied.

    Args:
        report: Scan result whose ``pairs`` are resolved in place.
        memory_entries: The MEMORY.md lines the report was built from; this
            list itself is not modified.
        fact_store: Optional object exposing ``remove_fact(fact_id)``.

    Returns:
        A new list containing the memory entries that survive deduplication.
    """
    indices_to_remove = set()
    for pair in report.pairs:
        resolve_pair(pair)
        if pair.resolution == "keep_factstore":
            indices_to_remove.add(pair.memory_index)
        # Test identity, not truthiness: a store object may be falsy (e.g. it
        # defines __len__ and is empty) and should still be asked to remove.
        elif pair.resolution == "keep_memory" and fact_store is not None:
            try:
                fact_store.remove_fact(pair.fact_store_id)
            except Exception as exc:
                # Best effort: a failed removal leaves the duplicate in place,
                # so log it where it will be seen. %s keeps the message
                # well-formed even when the id is not an int.
                logger.warning(
                    "Failed to remove fact %s: %s", pair.fact_store_id, exc
                )

    cleaned = [
        entry
        for index, entry in enumerate(memory_entries)
        if index not in indices_to_remove
    ]
    removed = len(memory_entries) - len(cleaned)
    if removed:
        logger.info("Dedup removed %d duplicate entries from MEMORY.md", removed)
    return cleaned
|
|
|
|
|
|
def is_duplicate_before_add(
    content: str,
    existing_facts: List[Dict[str, Any]],
    threshold: float = _SIMILARITY_THRESHOLD,
) -> Optional[Dict[str, Any]]:
    """Check if content is a duplicate of an existing fact BEFORE adding.

    Both texts are normalised with ``_normalize`` and scored with
    ``_similarity``. Texts whose normalised form is shorter than 10 characters
    are never treated as duplicates.

    Used by on_memory_write to prevent cross-tier duplication at write time.

    Args:
        content: Candidate text about to be written.
        existing_facts: Fact dicts whose ``"content"`` is compared.
        threshold: Minimum similarity ratio that counts as a duplicate.

    Returns:
        The first fact dict, in list order, that reaches the threshold, or
        None when nothing matches.
    """
    min_length = 10

    content_norm = _normalize(content)
    if len(content_norm) < min_length:
        return None

    for fact in existing_facts:
        # `or ""` also covers a fact whose content is explicitly None.
        fact_norm = _normalize(fact.get("content") or "")
        if len(fact_norm) < min_length:
            continue
        if _similarity(content_norm, fact_norm) >= threshold:
            return fact
    return None
|
|
|
|
|
|
# Markdown description of the tier-ownership policy, kept as a module constant.
TIER_OWNERSHIP_DOC = """# Memory Tier Ownership

Each fact lives in exactly one tier to prevent duplicate tokens and stale-data divergence.

## Tier 1 — MEMORY.md (built-in)
- Always-on system prompt context (compact, <50 entries ideal).
- Operational notes, active task state, immediate context.
- Managed by: `memory` tool.

## Tier 2 — Fact Store (holographic)
- Deep structured storage with search and reasoning.
- user_pref, project, tool facts; entity-linked knowledge.
- Managed by: `fact_store` tool.
- Has: FTS5 search, trust scoring, entity resolution.

## Tier 3 — MemPalace
- Specialized long-form archives and research.

## Rules
- MEMORY.md entries >100 chars → migrate to fact store.
- Structured categories (user_pref, project, tool) → fact store.
- Duplicate across tiers: fact store wins (it has trust scoring).
- `on_memory_write` checks fact store before mirroring.
"""
|