Merge pull request 'fix: fact distillation — block garbage and secrets, improve dedup' (#43) from fix/fact-distillation into main
All checks were successful
Tests / lint (push) Successful in 2s
Tests / test (push) Successful in 33s

This commit was merged in pull request #43.
This commit is contained in:
2026-03-14 13:00:59 -04:00
2 changed files with 63 additions and 8 deletions

View File

@@ -436,10 +436,11 @@ def memory_write(content: str, context_type: str = "fact") -> str:
try:
from timmy.memory.vector_store import search_memories, store_memory
-# Dedup check for facts — skip if a very similar fact already exists
+# Dedup check for facts — skip if a similar fact already exists
+# Threshold 0.75 catches paraphrases (was 0.9 which only caught near-exact)
if context_type == "fact":
existing = search_memories(
-content.strip(), limit=3, context_type="fact", min_relevance=0.9
+content.strip(), limit=3, context_type="fact", min_relevance=0.75
)
if existing:
return f"Similar fact already stored (id={existing[0].id[:8]}). Skipping duplicate."

View File

@@ -348,10 +348,23 @@ class ThinkingEngine:
thought_text = "\n".join(f"- [{t.seed_type}] {t.content}" for t in reversed(recent))
distill_prompt = (
-"You are reviewing your own recent thoughts. Extract 0-3 lasting "
-"insights, preferences, or facts worth remembering long-term. "
+"You are reviewing your own recent thoughts. Extract 0-3 facts "
+"worth remembering long-term.\n\n"
+"GOOD facts (store these):\n"
+"- User preferences: 'Alexander prefers YAML config over code changes'\n"
+"- Project decisions: 'Switched from hardcoded personas to agents.yaml'\n"
+"- Learned knowledge: 'Ollama supports concurrent model loading'\n"
+"- User information: 'Alexander is interested in Bitcoin and sovereignty'\n\n"
+"BAD facts (never store these):\n"
+"- Self-referential observations about your own thinking process\n"
+"- Meta-commentary about your memory, timestamps, or internal state\n"
+"- Observations about being idle or having no chat messages\n"
+"- File paths, tokens, API keys, or any credentials\n"
+"- Restatements of your standing rules or system prompt\n\n"
"Return ONLY a JSON array of strings. If nothing is worth saving, "
-"return []. Do not include ephemeral observations or repeated themes.\n\n"
+"return []. Be selective — only store facts about the EXTERNAL WORLD "
+"(the user, the project, technical knowledge), never about your own "
+"internal process.\n\n"
f"Recent thoughts:\n{thought_text}\n\nJSON array:"
)
@@ -373,10 +386,51 @@ class ThinkingEngine:
from timmy.semantic_memory import memory_write
+# Sensitive patterns that must never be stored as facts
+_SENSITIVE_PATTERNS = [
+"token",
+"password",
+"secret",
+"api_key",
+"apikey",
+"credential",
+".config/",
+"/token",
+"access_token",
+"private_key",
+"ssh_key",
+]
for fact in facts[:3]: # Safety cap
-if isinstance(fact, str) and len(fact.strip()) > 10:
-result = memory_write(fact.strip(), context_type="fact")
-logger.info("Distilled fact: %s%s", fact[:60], result[:40])
+if not isinstance(fact, str) or len(fact.strip()) <= 10:
+continue
+fact_lower = fact.lower()
+# Block sensitive information
+if any(pat in fact_lower for pat in _SENSITIVE_PATTERNS):
+logger.warning("Distill: blocked sensitive fact: %s", fact[:60])
+continue
+# Block self-referential meta-observations
+if any(
+phrase in fact_lower
+for phrase in [
+"my own",
+"my thinking",
+"my memory",
+"my working ram",
+"self-declarative",
+"meta-observation",
+"internal state",
+"my pending",
+"my standing rules",
+"thoughts generated",
+"no chat messages",
+"no user interaction",
+]
+):
+logger.debug("Distill: skipped meta-observation: %s", fact[:60])
+continue
+result = memory_write(fact.strip(), context_type="fact")
+logger.info("Distilled fact: %s%s", fact[:60], result[:40])
except Exception as exc:
logger.debug("Thought distillation skipped: %s", exc)