refactor: decompose _maybe_distill() into focused helpers (#151)
All checks were successful
Tests / lint (pull_request) Successful in 4s
Tests / test (pull_request) Successful in 1m1s

Break the 109-line _maybe_distill() into four single-purpose methods:
- _should_distill() -> bool: guard clause logic
- _build_distill_prompt(thoughts) -> str: prompt construction
- _parse_facts_response(raw) -> list[str]: JSON parsing + fence stripping
- _filter_and_store_facts(facts): filtering + storage

Moved _SENSITIVE_PATTERNS and _META_OBSERVATION_PHRASES to module-level constants.
_maybe_distill() is now 15 lines — a slim orchestrator calling the helpers.

No behavior change. 1426 tests pass.
This commit is contained in:
2026-03-15 10:58:52 -04:00
parent 466db7aed2
commit 9e2cfe5df5

View File

@@ -33,6 +33,37 @@ logger = logging.getLogger(__name__)
# Default on-disk location for the thoughts SQLite database.
_DEFAULT_DB = Path("data/thoughts.db")
# Sensitive patterns that must never be stored as facts
# (checked via case-insensitive substring match in _filter_and_store_facts).
_SENSITIVE_PATTERNS = [
"token",
"password",
"secret",
"api_key",
"apikey",
"credential",
".config/",
"/token",
"access_token",
"private_key",
"ssh_key",
]
# Meta-observation phrases to filter out from distilled facts
# (self-referential commentary about the engine's own state is never
# worth persisting; also substring-matched against the lowercased fact).
_META_OBSERVATION_PHRASES = [
"my own",
"my thinking",
"my memory",
"my working ram",
"self-declarative",
"meta-observation",
"internal state",
"my pending",
"my standing rules",
"thoughts generated",
"no chat messages",
"no user interaction",
]
# Seed types for thought generation
SEED_TYPES = (
"existential",
@@ -360,114 +391,115 @@ class ThinkingEngine:
# ── Private helpers ──────────────────────────────────────────────────
def _should_distill(self) -> bool:
    """Check if distillation should run based on interval and thought count.

    Returns:
        True when distillation is enabled (``thinking_distill_every > 0``)
        and the total thought count is a non-zero multiple of that interval;
        False otherwise.
    """
    interval = settings.thinking_distill_every
    if interval <= 0:
        # Distillation disabled via settings.
        return False
    count = self.count_thoughts()
    # Fire exactly every `interval` thoughts, and never on an empty store.
    if count == 0 or count % interval != 0:
        return False
    return True
def _build_distill_prompt(self, thoughts: list[Thought]) -> str:
"""Build the prompt for extracting facts from recent thoughts.
Args:
thoughts: List of recent thoughts to analyze.
Returns:
The formatted prompt string for the LLM.
"""
thought_text = "\n".join(f"- [{t.seed_type}] {t.content}" for t in reversed(thoughts))
return (
"You are reviewing your own recent thoughts. Extract 0-3 facts "
"worth remembering long-term.\n\n"
"GOOD facts (store these):\n"
"- User preferences: 'Alexander prefers YAML config over code changes'\n"
"- Project decisions: 'Switched from hardcoded personas to agents.yaml'\n"
"- Learned knowledge: 'Ollama supports concurrent model loading'\n"
"- User information: 'Alexander is interested in Bitcoin and sovereignty'\n\n"
"BAD facts (never store these):\n"
"- Self-referential observations about your own thinking process\n"
"- Meta-commentary about your memory, timestamps, or internal state\n"
"- Observations about being idle or having no chat messages\n"
"- File paths, tokens, API keys, or any credentials\n"
"- Restatements of your standing rules or system prompt\n\n"
"Return ONLY a JSON array of strings. If nothing is worth saving, "
"return []. Be selective — only store facts about the EXTERNAL WORLD "
"(the user, the project, technical knowledge), never about your own "
"internal process.\n\n"
f"Recent thoughts:\n{thought_text}\n\nJSON array:"
)
def _parse_facts_response(self, raw: str) -> list[str]:
"""Parse JSON array from LLM response, stripping markdown fences.
Args:
raw: Raw response string from the LLM.
Returns:
List of fact strings parsed from the response.
"""
if not raw or not raw.strip():
return []
import json
cleaned = raw.strip()
if cleaned.startswith("```"):
cleaned = cleaned.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
facts = json.loads(cleaned)
if not isinstance(facts, list):
return []
return [f for f in facts if isinstance(f, str)]
def _filter_and_store_facts(self, facts: list[str]) -> None:
    """Filter and store valid facts, blocking sensitive and meta content.

    Args:
        facts: List of fact strings to filter and store. At most the
            first three are considered; non-strings and very short
            entries are silently skipped.
    """
    from timmy.semantic_memory import memory_write

    for candidate in facts[:3]:  # Safety cap
        if not isinstance(candidate, str):
            continue
        stripped = candidate.strip()
        if len(stripped) <= 10:
            # Too short to be a durable fact.
            continue
        lowered = candidate.lower()
        # Block sensitive information
        if any(pattern in lowered for pattern in _SENSITIVE_PATTERNS):
            logger.warning("Distill: blocked sensitive fact: %s", candidate[:60])
            continue
        # Block self-referential meta-observations
        if any(phrase in lowered for phrase in _META_OBSERVATION_PHRASES):
            logger.debug("Distill: skipped meta-observation: %s", candidate[:60])
            continue
        result = memory_write(stripped, context_type="fact")
        logger.info("Distilled fact: %s%s", candidate[:60], result[:40])
async def _maybe_distill(self) -> None:
    """Every N thoughts, extract lasting insights and store as facts.

    Slim orchestrator: checks the interval guard via ``_should_distill``,
    fetches the last N thoughts, asks the LLM to extract durable facts,
    then delegates parsing and filtered storage to the focused helpers.
    Distillation is best-effort — any failure (LLM error, bad JSON,
    storage error) is logged at debug level and swallowed so the
    thinking loop is never interrupted.
    """
    try:
        if not self._should_distill():
            return
        interval = settings.thinking_distill_every
        recent = self.get_recent_thoughts(limit=interval)
        if len(recent) < interval:
            # Not enough history yet to distill a full window.
            return
        raw = await self._call_agent(self._build_distill_prompt(recent))
        if facts := self._parse_facts_response(raw):
            self._filter_and_store_facts(facts)
    except Exception as exc:
        logger.debug("Thought distillation skipped: %s", exc)