From 8ecd7aed2c3b2d00048814cc79e6aac60dcaf497 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Fri, 27 Mar 2026 09:57:50 -0700
Subject: [PATCH] fix: prevent reasoning box from rendering 3x during
 tool-calling loops (#3405)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two independent bugs caused the reasoning box to appear three times when
the model produced reasoning + tool_calls:

Bug A: _build_assistant_message() re-fired reasoning_callback with the full
reasoning text even when streaming had already displayed it. The original
guard only checked structured reasoning_content deltas, but reasoning also
arrives via content tag extraction (<REASONING_SCRATCHPAD>/<think> tags
in delta.content), which went through _fire_stream_delta not
_fire_reasoning_delta. Fix: skip the callback entirely when streaming is
active — both paths display reasoning during the stream. Any reasoning not
shown during streaming is caught by the CLI post-response fallback.

Bug B: The post-response reasoning display checked _reasoning_stream_started,
but that flag was reset by _reset_stream_state() during intermediate turn
boundaries (when stream_delta_callback(None) fires between tool calls).
Introduced _reasoning_shown_this_turn flag that persists across the tool
loop and is only reset at the start of each user turn.

Live-tested in PTY: reasoning now shows exactly once per API call, no
duplicates across tool-calling loops.
---
 cli.py                          |  14 ++-
 run_agent.py                    |  21 +++--
 tests/test_reasoning_command.py | 155 ++++++++++++++++++++++++++++++++
 3 files changed, 183 insertions(+), 7 deletions(-)
diff --git a/cli.py b/cli.py
index bd7c26c48..c8aeaad8a 100644
--- a/cli.py
+++ b/cli.py
@@ -1625,6 +1625,7 @@ class HermesCLI:
         if not text:
             return
         self._reasoning_stream_started = True
+        self._reasoning_shown_this_turn = True
         if getattr(self, "_stream_box_opened", False):
             return
 
@@ -5545,6 +5546,10 @@ class HermesCLI:
 
             # Reset streaming display state for this turn
             self._reset_stream_state()
+            # Separate from _reset_stream_state because this must persist
+            # across intermediate turn boundaries (tool-calling loops) — only
+            # reset at the start of each user turn.
+            self._reasoning_shown_this_turn = False
 
             # --- Streaming TTS setup ---
             # When ElevenLabs is the TTS provider and sounddevice is available,
@@ -5759,8 +5764,13 @@ class HermesCLI:
             response_previewed = result.get("response_previewed", False) if result else False
 
             # Display reasoning (thinking) box if enabled and available.
-            # Skip when streaming already showed reasoning live.
-            if self.show_reasoning and result and not self._reasoning_stream_started:
+            # Skip when streaming already showed reasoning live.  Use the
+            # turn-persistent flag (_reasoning_shown_this_turn) instead of
+            # _reasoning_stream_started — the latter gets reset during
+            # intermediate turn boundaries (tool-calling loops), which caused
+            # the reasoning box to re-render after the final response.
+            _reasoning_already_shown = getattr(self, '_reasoning_shown_this_turn', False)
+            if self.show_reasoning and result and not _reasoning_already_shown:
                 reasoning = result.get("last_reasoning")
                 if reasoning:
                     w = shutil.get_terminal_size().columns
diff --git a/run_agent.py b/run_agent.py
index e220fa992..457479f6e 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -539,6 +539,7 @@ class AIAgent:
         self.tool_progress_callback = tool_progress_callback
         self.thinking_callback = thinking_callback
         self.reasoning_callback = reasoning_callback
+        self._reasoning_deltas_fired = False  # Set by _fire_reasoning_delta, reset per API call
         self.clarify_callback = clarify_callback
         self.step_callback = step_callback
         self.stream_delta_callback = stream_delta_callback
@@ -3415,6 +3416,7 @@ class AIAgent:
         max_stream_retries = 1
         has_tool_calls = False
         first_delta_fired = False
+        self._reasoning_deltas_fired = False
         for attempt in range(max_stream_retries + 1):
             try:
                 with active_client.responses.stream(**api_kwargs) as stream:
@@ -3691,6 +3693,7 @@ class AIAgent:
 
     def _fire_reasoning_delta(self, text: str) -> None:
         """Fire reasoning callback if registered."""
+        self._reasoning_deltas_fired = True
         cb = self.reasoning_callback
         if cb is not None:
             try:
@@ -3798,6 +3801,9 @@ class AIAgent:
             role = "assistant"
             reasoning_parts: list = []
             usage_obj = None
+            # Reset per-call reasoning tracking so _build_assistant_message
+            # knows whether reasoning was already displayed during streaming.
+            self._reasoning_deltas_fired = False
 
             for chunk in stream:
                 last_chunk_time["t"] = time.time()
@@ -3917,6 +3923,7 @@ class AIAgent:
             works unchanged.
             """
             has_tool_use = False
+            self._reasoning_deltas_fired = False
 
             # Reset stale-stream timer for this attempt
             last_chunk_time["t"] = time.time()
@@ -4630,11 +4637,15 @@ class AIAgent:
             logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {reasoning_text}")
 
         if reasoning_text and self.reasoning_callback:
-            # Skip callback for <think>-extracted reasoning when streaming is active.
-            # _stream_delta() already displayed <think> blocks during streaming;
-            # firing the callback again would cause duplicate display.
-            # Structured reasoning (from reasoning_content field) always fires.
-            if _from_structured or not self.stream_delta_callback:
+            # Skip callback when streaming is active — reasoning was already
+            # displayed during the stream via one of two paths:
+            #   (a) _fire_reasoning_delta (structured reasoning_content deltas)
+            #   (b) _stream_delta tag extraction (<think>/<REASONING_SCRATCHPAD>)
+            # When streaming is NOT active, always fire so non-streaming modes
+            # (gateway, batch, quiet) still get reasoning.
+            # Any reasoning that wasn't shown during streaming is caught by the
+            # CLI post-response display fallback (cli.py _reasoning_shown_this_turn).
+            if not self.stream_delta_callback:
                 try:
                     self.reasoning_callback(reasoning_text)
                 except Exception:
diff --git a/tests/test_reasoning_command.py b/tests/test_reasoning_command.py
index 81d452a27..4270d630d 100644
--- a/tests/test_reasoning_command.py
+++ b/tests/test_reasoning_command.py
@@ -472,6 +472,7 @@ class TestInlineThinkBlockExtraction(unittest.TestCase):
         agent._extract_reasoning = AIAgent._extract_reasoning.__get__(agent)
         agent.verbose_logging = False
         agent.reasoning_callback = None
+        agent.stream_delta_callback = None  # non-streaming by default
         return agent
 
     def test_single_think_block_extracted(self):
@@ -605,5 +606,159 @@ class TestEndToEndPipeline(unittest.TestCase):
         self.assertIsNone(result["last_reasoning"])
 
 
+# ---------------------------------------------------------------------------
+# Duplicate reasoning box prevention (Bug fix: 3 boxes for 1 reasoning)
+# ---------------------------------------------------------------------------
+
+class TestReasoningDeltasFiredFlag(unittest.TestCase):
+    """_build_assistant_message should not re-fire reasoning_callback when
+    reasoning was already streamed via _fire_reasoning_delta."""
+
+    def _make_agent(self):
+        from run_agent import AIAgent
+        agent = AIAgent.__new__(AIAgent)
+        agent.reasoning_callback = None
+        agent.stream_delta_callback = None
+        agent._reasoning_deltas_fired = False
+        agent.verbose_logging = False
+        return agent
+
+    def test_fire_reasoning_delta_sets_flag(self):
+        agent = self._make_agent()
+        captured = []
+        agent.reasoning_callback = lambda t: captured.append(t)
+        self.assertFalse(agent._reasoning_deltas_fired)
+        agent._fire_reasoning_delta("thinking...")
+        self.assertTrue(agent._reasoning_deltas_fired)
+        self.assertEqual(captured, ["thinking..."])
+
+    def test_build_assistant_message_skips_callback_when_already_streamed(self):
+        """When streaming already fired reasoning deltas, the post-stream
+        _build_assistant_message should NOT re-fire the callback."""
+        agent = self._make_agent()
+        captured = []
+        agent.reasoning_callback = lambda t: captured.append(t)
+        agent.stream_delta_callback = lambda t: None  # streaming is active
+
+        # Simulate streaming having fired reasoning
+        agent._reasoning_deltas_fired = True
+
+        msg = SimpleNamespace(
+            content="I'll merge that.",
+            tool_calls=None,
+            reasoning_content="Let me merge the PR.",
+            reasoning=None,
+            reasoning_details=None,
+        )
+        agent._build_assistant_message(msg, "stop")
+
+        # Callback should NOT have been fired again
+        self.assertEqual(captured, [])
+
+    def test_build_assistant_message_skips_callback_when_streaming_active(self):
+        """When streaming is active, callback should NEVER fire from
+        _build_assistant_message — reasoning was already displayed during the
+        stream (either via reasoning_content deltas or content tag extraction).
+        Any missed reasoning is caught by the CLI post-response fallback."""
+        agent = self._make_agent()
+        captured = []
+        agent.reasoning_callback = lambda t: captured.append(t)
+        agent.stream_delta_callback = lambda t: None  # streaming active
+
+        # Even though _reasoning_deltas_fired is False (reasoning came through
+        # content tags, not reasoning_content deltas), callback should not fire
+        agent._reasoning_deltas_fired = False
+
+        msg = SimpleNamespace(
+            content="I'll merge that.",
+            tool_calls=None,
+            reasoning_content="Let me merge the PR.",
+            reasoning=None,
+            reasoning_details=None,
+        )
+        agent._build_assistant_message(msg, "stop")
+
+        # Callback should NOT fire — streaming is active
+        self.assertEqual(captured, [])
+
+    def test_build_assistant_message_fires_callback_without_streaming(self):
+        """When no streaming is active, callback always fires for structured
+        reasoning."""
+        agent = self._make_agent()
+        captured = []
+        agent.reasoning_callback = lambda t: captured.append(t)
+        # No streaming
+        agent.stream_delta_callback = None
+        agent._reasoning_deltas_fired = False
+
+        msg = SimpleNamespace(
+            content="I'll merge that.",
+            tool_calls=None,
+            reasoning_content="Let me merge the PR.",
+            reasoning=None,
+            reasoning_details=None,
+        )
+        agent._build_assistant_message(msg, "stop")
+
+        self.assertEqual(captured, ["Let me merge the PR."])
+
+
+class TestReasoningShownThisTurnFlag(unittest.TestCase):
+    """Post-response reasoning display should be suppressed when reasoning
+    was already shown during streaming in a tool-calling loop."""
+
+    def _make_cli(self):
+        from cli import HermesCLI
+        cli = HermesCLI.__new__(HermesCLI)
+        cli.show_reasoning = True
+        cli.streaming_enabled = True
+        cli._stream_box_opened = False
+        cli._reasoning_box_opened = False
+        cli._reasoning_stream_started = False
+        cli._reasoning_shown_this_turn = False
+        cli._reasoning_buf = ""
+        cli._stream_buf = ""
+        cli._stream_started = False
+        cli._stream_text_ansi = ""
+        cli._stream_prefilt = ""
+        cli._in_reasoning_block = False
+        cli._reasoning_preview_buf = ""
+        return cli
+
+    @patch("cli._cprint")
+    def test_streaming_reasoning_sets_turn_flag(self, mock_cprint):
+        cli = self._make_cli()
+        self.assertFalse(cli._reasoning_shown_this_turn)
+        cli._stream_reasoning_delta("Thinking about it...")
+        self.assertTrue(cli._reasoning_shown_this_turn)
+
+    @patch("cli._cprint")
+    def test_turn_flag_survives_reset_stream_state(self, mock_cprint):
+        """_reasoning_shown_this_turn must NOT be cleared by
+        _reset_stream_state (called at intermediate turn boundaries)."""
+        cli = self._make_cli()
+        cli._stream_reasoning_delta("Thinking...")
+        self.assertTrue(cli._reasoning_shown_this_turn)
+
+        # Simulate intermediate turn boundary (tool call)
+        cli._reset_stream_state()
+
+        # Flag must persist
+        self.assertTrue(cli._reasoning_shown_this_turn)
+
+    @patch("cli._cprint")
+    def test_turn_flag_cleared_before_new_turn(self, mock_cprint):
+        """The turn flag should be reset at the start of a new user turn.
+        This happens outside _reset_stream_state, at the call site."""
+        cli = self._make_cli()
+        cli._reasoning_shown_this_turn = True
+
+        # Simulate new user turn setup
+        cli._reset_stream_state()
+        cli._reasoning_shown_this_turn = False  # done by process_input
+
+        self.assertFalse(cli._reasoning_shown_this_turn)
+
+
 if __name__ == "__main__":
     unittest.main()