refactor: decompose _maybe_distill() into focused helpers (#151)
All checks were successful
Tests / lint (pull_request) Successful in 4s
Tests / test (pull_request) Successful in 1m1s

Break the 109-line _maybe_distill() into four single-purpose methods:
- _should_distill() -> bool: guard clause logic
- _build_distill_prompt(thoughts) -> str: prompt construction
- _parse_facts_response(raw) -> list[str]: JSON parsing + fence stripping
- _filter_and_store_facts(facts): filtering + storage

Moved _SENSITIVE_PATTERNS and _META_OBSERVATION_PHRASES to module-level constants.
_maybe_distill() is now 15 lines — a slim orchestrator calling the helpers.

No behavior change. 1426 tests pass.
This commit is contained in:
2026-03-15 10:58:52 -04:00
parent 466db7aed2
commit 9e2cfe5df5

View File

@@ -33,6 +33,37 @@ logger = logging.getLogger(__name__)
# Default on-disk location for the thoughts SQLite database.
_DEFAULT_DB = Path("data/thoughts.db")
# Sensitive patterns that must never be stored as facts
# (checked via case-insensitive substring match in _filter_and_store_facts).
_SENSITIVE_PATTERNS = [
"token",
"password",
"secret",
"api_key",
"apikey",
"credential",
".config/",
"/token",
"access_token",
"private_key",
"ssh_key",
]
# Meta-observation phrases to filter out from distilled facts
# (self-referential commentary about the engine's own state is never
# worth persisting; also substring-matched against the lowercased fact).
_META_OBSERVATION_PHRASES = [
"my own",
"my thinking",
"my memory",
"my working ram",
"self-declarative",
"meta-observation",
"internal state",
"my pending",
"my standing rules",
"thoughts generated",
"no chat messages",
"no user interaction",
]
# Seed types for thought generation
SEED_TYPES = (
"existential",
@@ -360,114 +391,115 @@ class ThinkingEngine:
# ── Private helpers ──────────────────────────────────────────────────
def _should_distill(self) -> bool:
    """Check if distillation should run based on interval and thought count.

    Returns:
        True when distillation is enabled (``thinking_distill_every > 0``)
        and the total thought count is a non-zero multiple of that interval;
        False otherwise.
    """
    interval = settings.thinking_distill_every
    if interval <= 0:
        # Distillation disabled via settings.
        return False
    count = self.count_thoughts()
    # Fire exactly every `interval` thoughts, and never on an empty store.
    if count == 0 or count % interval != 0:
        return False
    return True
def _build_distill_prompt(self, thoughts: list[Thought]) -> str:
"""Build the prompt for extracting facts from recent thoughts.
Args:
thoughts: List of recent thoughts to analyze.
Returns:
The formatted prompt string for the LLM.
"""
thought_text = "\n".join(f"- [{t.seed_type}] {t.content}" for t in reversed(thoughts))
return (
"You are reviewing your own recent thoughts. Extract 0-3 facts "
"worth remembering long-term.\n\n"
"GOOD facts (store these):\n"
"- User preferences: 'Alexander prefers YAML config over code changes'\n"
"- Project decisions: 'Switched from hardcoded personas to agents.yaml'\n"
"- Learned knowledge: 'Ollama supports concurrent model loading'\n"
"- User information: 'Alexander is interested in Bitcoin and sovereignty'\n\n"
"BAD facts (never store these):\n"
"- Self-referential observations about your own thinking process\n"
"- Meta-commentary about your memory, timestamps, or internal state\n"
"- Observations about being idle or having no chat messages\n"
"- File paths, tokens, API keys, or any credentials\n"
"- Restatements of your standing rules or system prompt\n\n"
"Return ONLY a JSON array of strings. If nothing is worth saving, "
"return []. Be selective — only store facts about the EXTERNAL WORLD "
"(the user, the project, technical knowledge), never about your own "
"internal process.\n\n"
f"Recent thoughts:\n{thought_text}\n\nJSON array:"
)
def _parse_facts_response(self, raw: str) -> list[str]:
"""Parse JSON array from LLM response, stripping markdown fences.
Args:
raw: Raw response string from the LLM.
Returns:
List of fact strings parsed from the response.
"""
if not raw or not raw.strip():
return []
import json
cleaned = raw.strip()
if cleaned.startswith("```"):
cleaned = cleaned.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
facts = json.loads(cleaned)
if not isinstance(facts, list):
return []
return [f for f in facts if isinstance(f, str)]
def _filter_and_store_facts(self, facts: list[str]) -> None:
    """Filter and store valid facts, blocking sensitive and meta content.

    Args:
        facts: List of fact strings to filter and store. At most the
            first three are considered; non-strings and very short
            entries are silently skipped.
    """
    from timmy.semantic_memory import memory_write

    for candidate in facts[:3]:  # Safety cap
        if not isinstance(candidate, str):
            continue
        stripped = candidate.strip()
        if len(stripped) <= 10:
            # Too short to be a durable fact.
            continue
        lowered = candidate.lower()
        # Block sensitive information
        if any(pattern in lowered for pattern in _SENSITIVE_PATTERNS):
            logger.warning("Distill: blocked sensitive fact: %s", candidate[:60])
            continue
        # Block self-referential meta-observations
        if any(phrase in lowered for phrase in _META_OBSERVATION_PHRASES):
            logger.debug("Distill: skipped meta-observation: %s", candidate[:60])
            continue
        result = memory_write(stripped, context_type="fact")
        logger.info("Distilled fact: %s%s", candidate[:60], result[:40])
async def _maybe_distill(self) -> None:
    """Every N thoughts, extract lasting insights and store as facts.

    Slim orchestrator: checks the interval guard via ``_should_distill``,
    fetches the last N thoughts, asks the LLM to extract durable facts,
    then delegates parsing and filtered storage to the focused helpers.
    Distillation is best-effort — any failure (LLM error, bad JSON,
    storage error) is logged at debug level and swallowed so the
    thinking loop is never interrupted.
    """
    try:
        if not self._should_distill():
            return
        interval = settings.thinking_distill_every
        recent = self.get_recent_thoughts(limit=interval)
        if len(recent) < interval:
            # Not enough history yet to distill a full window.
            return
        raw = await self._call_agent(self._build_distill_prompt(recent))
        if facts := self._parse_facts_response(raw):
            self._filter_and_store_facts(facts)
    except Exception as exc:
        logger.debug("Thought distillation skipped: %s", exc)