From 7bc355eed6e28e7c641d75ce9d411e709e8a66ad Mon Sep 17 00:00:00 2001
From: hermes <hermes@timmy.local>
Date: Sun, 15 Mar 2026 14:50:09 -0400
Subject: [PATCH] [loop-cycle-61] fix: strip think tags and harden fact parsing
 (#237) (#254)

---
 src/timmy/thinking.py        | 48 +++++++++++++++---
 tests/timmy/test_thinking.py | 95 ++++++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+), 6 deletions(-)
diff --git a/src/timmy/thinking.py b/src/timmy/thinking.py
index e39a80d..c687567 100644
--- a/src/timmy/thinking.py
+++ b/src/timmy/thinking.py
@@ -19,6 +19,7 @@ Usage::
 
 import logging
 import random
+import re
 import sqlite3
 import uuid
 from collections.abc import Generator
@@ -35,6 +36,9 @@ logger = logging.getLogger(__name__)
 
 _DEFAULT_DB = Path("data/thoughts.db")
 
+# qwen3 and other reasoning models wrap chain-of-thought in <think> tags
+_THINK_TAG_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
+
 # Sensitive patterns that must never be stored as facts
 _SENSITIVE_PATTERNS = [
     "token",
@@ -435,6 +439,9 @@ class ThinkingEngine:
     def _parse_facts_response(self, raw: str) -> list[str]:
         """Parse JSON array from LLM response, stripping markdown fences.
 
+        Resilient to models that prepend reasoning text or wrap the array in
+        prose.  Finds the first ``[...]`` block and parses that.
+
         Args:
             raw: Raw response string from the LLM.
 
@@ -447,14 +454,39 @@ class ThinkingEngine:
         import json
 
         cleaned = raw.strip()
+
+        # Strip markdown code fences
         if cleaned.startswith("```"):
             cleaned = cleaned.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
 
-        facts = json.loads(cleaned)
-        if not isinstance(facts, list):
-            return []
+        # Try direct parse first (fast path)
+        try:
+            facts = json.loads(cleaned)
+            if isinstance(facts, list):
+                return [f for f in facts if isinstance(f, str)]
+        except (json.JSONDecodeError, ValueError):
+            pass
 
-        return [f for f in facts if isinstance(f, str)]
+        # Fallback: extract first JSON array from the text
+        start = cleaned.find("[")
+        if start == -1:
+            return []
+        # Walk to find the matching close bracket
+        depth = 0
+        for i, ch in enumerate(cleaned[start:], start):
+            if ch == "[":
+                depth += 1
+            elif ch == "]":
+                depth -= 1
+                if depth == 0:
+                    try:
+                        facts = json.loads(cleaned[start : i + 1])
+                        if isinstance(facts, list):
+                            return [f for f in facts if isinstance(f, str)]
+                    except (json.JSONDecodeError, ValueError):
+                        pass
+                    break
+        return []
 
     def _filter_and_store_facts(self, facts: list[str]) -> None:
         """Filter and store valid facts, blocking sensitive and meta content.
@@ -498,7 +530,7 @@ class ThinkingEngine:
             if facts := self._parse_facts_response(raw):
                 self._filter_and_store_facts(facts)
         except Exception as exc:
-            logger.debug("Thought distillation skipped: %s", exc)
+            logger.warning("Thought distillation failed: %s", exc)
 
     async def _maybe_file_issues(self) -> None:
         """Every N thoughts, classify recent thoughts and file Gitea issues.
@@ -945,12 +977,16 @@ class ThinkingEngine:
         errors that occur when MCP stdio transports are spawned inside asyncio
         background tasks (#72).  The thinking engine doesn't need Gitea or
         filesystem tools — it only needs the LLM.
+
+        Strips ``<think>`` tags from reasoning models (qwen3, etc.) so that
+        downstream parsers (fact distillation, issue filing) receive clean text.
         """
         from timmy.agent import create_timmy
 
         agent = create_timmy(skip_mcp=True)
         run = await agent.arun(prompt, stream=False)
-        return run.content if hasattr(run, "content") else str(run)
+        raw = run.content if hasattr(run, "content") else str(run)
+        return _THINK_TAG_RE.sub("", raw) if raw else raw
 
     def _store_thought(self, content: str, seed_type: str) -> Thought:
         """Persist a thought to SQLite."""
diff --git a/tests/timmy/test_thinking.py b/tests/timmy/test_thinking.py
index b87e23f..5a0a7be 100644
--- a/tests/timmy/test_thinking.py
+++ b/tests/timmy/test_thinking.py
@@ -885,3 +885,98 @@ async def test_call_agent_does_not_use_session_chat(tmp_path):
         await engine._call_agent("prompt")
 
     mock_session_chat.assert_not_awaited()
+
+
+@pytest.mark.asyncio
+async def test_call_agent_strips_think_tags(tmp_path):
+    """_call_agent must strip <think> tags from reasoning models like qwen3."""
+    engine = _make_engine(tmp_path)
+
+    mock_agent = AsyncMock()
+    mock_run = AsyncMock()
+    mock_run.content = (
+        "<think>Let me reason about this carefully...</think>"
+        "The actual thought content."
+    )
+    mock_agent.arun.return_value = mock_run
+
+    with patch("timmy.agent.create_timmy", return_value=mock_agent):
+        result = await engine._call_agent("test prompt")
+
+    assert "<think>" not in result
+    assert result == "The actual thought content."
+
+
+@pytest.mark.asyncio
+async def test_call_agent_strips_multiline_think_tags(tmp_path):
+    """_call_agent handles multi-line <think> blocks."""
+    engine = _make_engine(tmp_path)
+
+    mock_agent = AsyncMock()
+    mock_run = AsyncMock()
+    mock_run.content = (
+        "<think>\nStep 1: analyze\nStep 2: synthesize\n</think>\n"
+        "Clean output here."
+    )
+    mock_agent.arun.return_value = mock_run
+
+    with patch("timmy.agent.create_timmy", return_value=mock_agent):
+        result = await engine._call_agent("test prompt")
+
+    assert "<think>" not in result
+    assert result == "Clean output here."
+
+
+# ---------------------------------------------------------------------------
+# _parse_facts_response resilience (#237)
+# ---------------------------------------------------------------------------
+
+
+def test_parse_facts_clean_json(tmp_path):
+    """Direct JSON array should parse normally."""
+    engine = _make_engine(tmp_path)
+    result = engine._parse_facts_response('["fact one", "fact two"]')
+    assert result == ["fact one", "fact two"]
+
+
+def test_parse_facts_empty_array(tmp_path):
+    """Empty JSON array should return empty list."""
+    engine = _make_engine(tmp_path)
+    assert engine._parse_facts_response("[]") == []
+
+
+def test_parse_facts_with_prose_prefix(tmp_path):
+    """JSON array preceded by prose should still parse (#237)."""
+    engine = _make_engine(tmp_path)
+    raw = 'Here are the facts:\n["Alexander prefers YAML", "Timmy runs locally"]'
+    result = engine._parse_facts_response(raw)
+    assert result == ["Alexander prefers YAML", "Timmy runs locally"]
+
+
+def test_parse_facts_with_markdown_fences(tmp_path):
+    """JSON wrapped in markdown code fences should parse."""
+    engine = _make_engine(tmp_path)
+    raw = '```json\n["fact in fences"]\n```'
+    result = engine._parse_facts_response(raw)
+    assert result == ["fact in fences"]
+
+
+def test_parse_facts_filters_non_strings(tmp_path):
+    """Non-string entries in the array should be filtered out."""
+    engine = _make_engine(tmp_path)
+    result = engine._parse_facts_response('[42, "valid fact", null, true]')
+    assert result == ["valid fact"]
+
+
+def test_parse_facts_none_and_empty(tmp_path):
+    """None and empty input should return empty list."""
+    engine = _make_engine(tmp_path)
+    assert engine._parse_facts_response(None) == []
+    assert engine._parse_facts_response("") == []
+    assert engine._parse_facts_response("   ") == []
+
+
+def test_parse_facts_invalid_json(tmp_path):
+    """Totally invalid text with no JSON array should return empty list."""
+    engine = _make_engine(tmp_path)
+    assert engine._parse_facts_response("no json here at all") == []