fix: prevent reasoning box from rendering 3x during tool-calling loops (#3405)
Two independent bugs caused the reasoning box to appear three times when the model produced reasoning + tool_calls: Bug A: _build_assistant_message() re-fired reasoning_callback with the full reasoning text even when streaming had already displayed it. The original guard only checked structured reasoning_content deltas, but reasoning also arrives via content tag extraction (<REASONING_SCRATCHPAD>/<think> tags in delta.content), which went through _fire_stream_delta not _fire_reasoning_delta. Fix: skip the callback entirely when streaming is active — both paths display reasoning during the stream. Any reasoning not shown during streaming is caught by the CLI post-response fallback. Bug B: The post-response reasoning display checked _reasoning_stream_started, but that flag was reset by _reset_stream_state() during intermediate turn boundaries (when stream_delta_callback(None) fires between tool calls). Introduced _reasoning_shown_this_turn flag that persists across the tool loop and is only reset at the start of each user turn. Live-tested in PTY: reasoning now shows exactly once per API call, no duplicates across tool-calling loops.
This commit is contained in:
14
cli.py
14
cli.py
@@ -1625,6 +1625,7 @@ class HermesCLI:
|
|||||||
if not text:
|
if not text:
|
||||||
return
|
return
|
||||||
self._reasoning_stream_started = True
|
self._reasoning_stream_started = True
|
||||||
|
self._reasoning_shown_this_turn = True
|
||||||
if getattr(self, "_stream_box_opened", False):
|
if getattr(self, "_stream_box_opened", False):
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -5545,6 +5546,10 @@ class HermesCLI:
|
|||||||
|
|
||||||
# Reset streaming display state for this turn
|
# Reset streaming display state for this turn
|
||||||
self._reset_stream_state()
|
self._reset_stream_state()
|
||||||
|
# Separate from _reset_stream_state because this must persist
|
||||||
|
# across intermediate turn boundaries (tool-calling loops) — only
|
||||||
|
# reset at the start of each user turn.
|
||||||
|
self._reasoning_shown_this_turn = False
|
||||||
|
|
||||||
# --- Streaming TTS setup ---
|
# --- Streaming TTS setup ---
|
||||||
# When ElevenLabs is the TTS provider and sounddevice is available,
|
# When ElevenLabs is the TTS provider and sounddevice is available,
|
||||||
@@ -5759,8 +5764,13 @@ class HermesCLI:
|
|||||||
response_previewed = result.get("response_previewed", False) if result else False
|
response_previewed = result.get("response_previewed", False) if result else False
|
||||||
|
|
||||||
# Display reasoning (thinking) box if enabled and available.
|
# Display reasoning (thinking) box if enabled and available.
|
||||||
# Skip when streaming already showed reasoning live.
|
# Skip when streaming already showed reasoning live. Use the
|
||||||
if self.show_reasoning and result and not self._reasoning_stream_started:
|
# turn-persistent flag (_reasoning_shown_this_turn) instead of
|
||||||
|
# _reasoning_stream_started — the latter gets reset during
|
||||||
|
# intermediate turn boundaries (tool-calling loops), which caused
|
||||||
|
# the reasoning box to re-render after the final response.
|
||||||
|
_reasoning_already_shown = getattr(self, '_reasoning_shown_this_turn', False)
|
||||||
|
if self.show_reasoning and result and not _reasoning_already_shown:
|
||||||
reasoning = result.get("last_reasoning")
|
reasoning = result.get("last_reasoning")
|
||||||
if reasoning:
|
if reasoning:
|
||||||
w = shutil.get_terminal_size().columns
|
w = shutil.get_terminal_size().columns
|
||||||
|
|||||||
21
run_agent.py
21
run_agent.py
@@ -539,6 +539,7 @@ class AIAgent:
|
|||||||
self.tool_progress_callback = tool_progress_callback
|
self.tool_progress_callback = tool_progress_callback
|
||||||
self.thinking_callback = thinking_callback
|
self.thinking_callback = thinking_callback
|
||||||
self.reasoning_callback = reasoning_callback
|
self.reasoning_callback = reasoning_callback
|
||||||
|
self._reasoning_deltas_fired = False # Set by _fire_reasoning_delta, reset per API call
|
||||||
self.clarify_callback = clarify_callback
|
self.clarify_callback = clarify_callback
|
||||||
self.step_callback = step_callback
|
self.step_callback = step_callback
|
||||||
self.stream_delta_callback = stream_delta_callback
|
self.stream_delta_callback = stream_delta_callback
|
||||||
@@ -3415,6 +3416,7 @@ class AIAgent:
|
|||||||
max_stream_retries = 1
|
max_stream_retries = 1
|
||||||
has_tool_calls = False
|
has_tool_calls = False
|
||||||
first_delta_fired = False
|
first_delta_fired = False
|
||||||
|
self._reasoning_deltas_fired = False
|
||||||
for attempt in range(max_stream_retries + 1):
|
for attempt in range(max_stream_retries + 1):
|
||||||
try:
|
try:
|
||||||
with active_client.responses.stream(**api_kwargs) as stream:
|
with active_client.responses.stream(**api_kwargs) as stream:
|
||||||
@@ -3691,6 +3693,7 @@ class AIAgent:
|
|||||||
|
|
||||||
def _fire_reasoning_delta(self, text: str) -> None:
|
def _fire_reasoning_delta(self, text: str) -> None:
|
||||||
"""Fire reasoning callback if registered."""
|
"""Fire reasoning callback if registered."""
|
||||||
|
self._reasoning_deltas_fired = True
|
||||||
cb = self.reasoning_callback
|
cb = self.reasoning_callback
|
||||||
if cb is not None:
|
if cb is not None:
|
||||||
try:
|
try:
|
||||||
@@ -3798,6 +3801,9 @@ class AIAgent:
|
|||||||
role = "assistant"
|
role = "assistant"
|
||||||
reasoning_parts: list = []
|
reasoning_parts: list = []
|
||||||
usage_obj = None
|
usage_obj = None
|
||||||
|
# Reset per-call reasoning tracking so _build_assistant_message
|
||||||
|
# knows whether reasoning was already displayed during streaming.
|
||||||
|
self._reasoning_deltas_fired = False
|
||||||
|
|
||||||
for chunk in stream:
|
for chunk in stream:
|
||||||
last_chunk_time["t"] = time.time()
|
last_chunk_time["t"] = time.time()
|
||||||
@@ -3917,6 +3923,7 @@ class AIAgent:
|
|||||||
works unchanged.
|
works unchanged.
|
||||||
"""
|
"""
|
||||||
has_tool_use = False
|
has_tool_use = False
|
||||||
|
self._reasoning_deltas_fired = False
|
||||||
|
|
||||||
# Reset stale-stream timer for this attempt
|
# Reset stale-stream timer for this attempt
|
||||||
last_chunk_time["t"] = time.time()
|
last_chunk_time["t"] = time.time()
|
||||||
@@ -4630,11 +4637,15 @@ class AIAgent:
|
|||||||
logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {reasoning_text}")
|
logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {reasoning_text}")
|
||||||
|
|
||||||
if reasoning_text and self.reasoning_callback:
|
if reasoning_text and self.reasoning_callback:
|
||||||
# Skip callback for <think>-extracted reasoning when streaming is active.
|
# Skip callback when streaming is active — reasoning was already
|
||||||
# _stream_delta() already displayed <think> blocks during streaming;
|
# displayed during the stream via one of two paths:
|
||||||
# firing the callback again would cause duplicate display.
|
# (a) _fire_reasoning_delta (structured reasoning_content deltas)
|
||||||
# Structured reasoning (from reasoning_content field) always fires.
|
# (b) _stream_delta tag extraction (<think>/<REASONING_SCRATCHPAD>)
|
||||||
if _from_structured or not self.stream_delta_callback:
|
# When streaming is NOT active, always fire so non-streaming modes
|
||||||
|
# (gateway, batch, quiet) still get reasoning.
|
||||||
|
# Any reasoning that wasn't shown during streaming is caught by the
|
||||||
|
# CLI post-response display fallback (cli.py _reasoning_shown_this_turn).
|
||||||
|
if not self.stream_delta_callback:
|
||||||
try:
|
try:
|
||||||
self.reasoning_callback(reasoning_text)
|
self.reasoning_callback(reasoning_text)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|||||||
@@ -472,6 +472,7 @@ class TestInlineThinkBlockExtraction(unittest.TestCase):
|
|||||||
agent._extract_reasoning = AIAgent._extract_reasoning.__get__(agent)
|
agent._extract_reasoning = AIAgent._extract_reasoning.__get__(agent)
|
||||||
agent.verbose_logging = False
|
agent.verbose_logging = False
|
||||||
agent.reasoning_callback = None
|
agent.reasoning_callback = None
|
||||||
|
agent.stream_delta_callback = None # non-streaming by default
|
||||||
return agent
|
return agent
|
||||||
|
|
||||||
def test_single_think_block_extracted(self):
|
def test_single_think_block_extracted(self):
|
||||||
@@ -605,5 +606,159 @@ class TestEndToEndPipeline(unittest.TestCase):
|
|||||||
self.assertIsNone(result["last_reasoning"])
|
self.assertIsNone(result["last_reasoning"])
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Duplicate reasoning box prevention (Bug fix: 3 boxes for 1 reasoning)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestReasoningDeltasFiredFlag(unittest.TestCase):
|
||||||
|
"""_build_assistant_message should not re-fire reasoning_callback when
|
||||||
|
reasoning was already streamed via _fire_reasoning_delta."""
|
||||||
|
|
||||||
|
def _make_agent(self):
|
||||||
|
from run_agent import AIAgent
|
||||||
|
agent = AIAgent.__new__(AIAgent)
|
||||||
|
agent.reasoning_callback = None
|
||||||
|
agent.stream_delta_callback = None
|
||||||
|
agent._reasoning_deltas_fired = False
|
||||||
|
agent.verbose_logging = False
|
||||||
|
return agent
|
||||||
|
|
||||||
|
def test_fire_reasoning_delta_sets_flag(self):
|
||||||
|
agent = self._make_agent()
|
||||||
|
captured = []
|
||||||
|
agent.reasoning_callback = lambda t: captured.append(t)
|
||||||
|
self.assertFalse(agent._reasoning_deltas_fired)
|
||||||
|
agent._fire_reasoning_delta("thinking...")
|
||||||
|
self.assertTrue(agent._reasoning_deltas_fired)
|
||||||
|
self.assertEqual(captured, ["thinking..."])
|
||||||
|
|
||||||
|
def test_build_assistant_message_skips_callback_when_already_streamed(self):
|
||||||
|
"""When streaming already fired reasoning deltas, the post-stream
|
||||||
|
_build_assistant_message should NOT re-fire the callback."""
|
||||||
|
agent = self._make_agent()
|
||||||
|
captured = []
|
||||||
|
agent.reasoning_callback = lambda t: captured.append(t)
|
||||||
|
agent.stream_delta_callback = lambda t: None # streaming is active
|
||||||
|
|
||||||
|
# Simulate streaming having fired reasoning
|
||||||
|
agent._reasoning_deltas_fired = True
|
||||||
|
|
||||||
|
msg = SimpleNamespace(
|
||||||
|
content="I'll merge that.",
|
||||||
|
tool_calls=None,
|
||||||
|
reasoning_content="Let me merge the PR.",
|
||||||
|
reasoning=None,
|
||||||
|
reasoning_details=None,
|
||||||
|
)
|
||||||
|
agent._build_assistant_message(msg, "stop")
|
||||||
|
|
||||||
|
# Callback should NOT have been fired again
|
||||||
|
self.assertEqual(captured, [])
|
||||||
|
|
||||||
|
def test_build_assistant_message_skips_callback_when_streaming_active(self):
|
||||||
|
"""When streaming is active, callback should NEVER fire from
|
||||||
|
_build_assistant_message — reasoning was already displayed during the
|
||||||
|
stream (either via reasoning_content deltas or content tag extraction).
|
||||||
|
Any missed reasoning is caught by the CLI post-response fallback."""
|
||||||
|
agent = self._make_agent()
|
||||||
|
captured = []
|
||||||
|
agent.reasoning_callback = lambda t: captured.append(t)
|
||||||
|
agent.stream_delta_callback = lambda t: None # streaming active
|
||||||
|
|
||||||
|
# Even though _reasoning_deltas_fired is False (reasoning came through
|
||||||
|
# content tags, not reasoning_content deltas), callback should not fire
|
||||||
|
agent._reasoning_deltas_fired = False
|
||||||
|
|
||||||
|
msg = SimpleNamespace(
|
||||||
|
content="I'll merge that.",
|
||||||
|
tool_calls=None,
|
||||||
|
reasoning_content="Let me merge the PR.",
|
||||||
|
reasoning=None,
|
||||||
|
reasoning_details=None,
|
||||||
|
)
|
||||||
|
agent._build_assistant_message(msg, "stop")
|
||||||
|
|
||||||
|
# Callback should NOT fire — streaming is active
|
||||||
|
self.assertEqual(captured, [])
|
||||||
|
|
||||||
|
def test_build_assistant_message_fires_callback_without_streaming(self):
|
||||||
|
"""When no streaming is active, callback always fires for structured
|
||||||
|
reasoning."""
|
||||||
|
agent = self._make_agent()
|
||||||
|
captured = []
|
||||||
|
agent.reasoning_callback = lambda t: captured.append(t)
|
||||||
|
# No streaming
|
||||||
|
agent.stream_delta_callback = None
|
||||||
|
agent._reasoning_deltas_fired = False
|
||||||
|
|
||||||
|
msg = SimpleNamespace(
|
||||||
|
content="I'll merge that.",
|
||||||
|
tool_calls=None,
|
||||||
|
reasoning_content="Let me merge the PR.",
|
||||||
|
reasoning=None,
|
||||||
|
reasoning_details=None,
|
||||||
|
)
|
||||||
|
agent._build_assistant_message(msg, "stop")
|
||||||
|
|
||||||
|
self.assertEqual(captured, ["Let me merge the PR."])
|
||||||
|
|
||||||
|
|
||||||
|
class TestReasoningShownThisTurnFlag(unittest.TestCase):
|
||||||
|
"""Post-response reasoning display should be suppressed when reasoning
|
||||||
|
was already shown during streaming in a tool-calling loop."""
|
||||||
|
|
||||||
|
def _make_cli(self):
|
||||||
|
from cli import HermesCLI
|
||||||
|
cli = HermesCLI.__new__(HermesCLI)
|
||||||
|
cli.show_reasoning = True
|
||||||
|
cli.streaming_enabled = True
|
||||||
|
cli._stream_box_opened = False
|
||||||
|
cli._reasoning_box_opened = False
|
||||||
|
cli._reasoning_stream_started = False
|
||||||
|
cli._reasoning_shown_this_turn = False
|
||||||
|
cli._reasoning_buf = ""
|
||||||
|
cli._stream_buf = ""
|
||||||
|
cli._stream_started = False
|
||||||
|
cli._stream_text_ansi = ""
|
||||||
|
cli._stream_prefilt = ""
|
||||||
|
cli._in_reasoning_block = False
|
||||||
|
cli._reasoning_preview_buf = ""
|
||||||
|
return cli
|
||||||
|
|
||||||
|
@patch("cli._cprint")
|
||||||
|
def test_streaming_reasoning_sets_turn_flag(self, mock_cprint):
|
||||||
|
cli = self._make_cli()
|
||||||
|
self.assertFalse(cli._reasoning_shown_this_turn)
|
||||||
|
cli._stream_reasoning_delta("Thinking about it...")
|
||||||
|
self.assertTrue(cli._reasoning_shown_this_turn)
|
||||||
|
|
||||||
|
@patch("cli._cprint")
|
||||||
|
def test_turn_flag_survives_reset_stream_state(self, mock_cprint):
|
||||||
|
"""_reasoning_shown_this_turn must NOT be cleared by
|
||||||
|
_reset_stream_state (called at intermediate turn boundaries)."""
|
||||||
|
cli = self._make_cli()
|
||||||
|
cli._stream_reasoning_delta("Thinking...")
|
||||||
|
self.assertTrue(cli._reasoning_shown_this_turn)
|
||||||
|
|
||||||
|
# Simulate intermediate turn boundary (tool call)
|
||||||
|
cli._reset_stream_state()
|
||||||
|
|
||||||
|
# Flag must persist
|
||||||
|
self.assertTrue(cli._reasoning_shown_this_turn)
|
||||||
|
|
||||||
|
@patch("cli._cprint")
|
||||||
|
def test_turn_flag_cleared_before_new_turn(self, mock_cprint):
|
||||||
|
"""The turn flag should be reset at the start of a new user turn.
|
||||||
|
This happens outside _reset_stream_state, at the call site."""
|
||||||
|
cli = self._make_cli()
|
||||||
|
cli._reasoning_shown_this_turn = True
|
||||||
|
|
||||||
|
# Simulate new user turn setup
|
||||||
|
cli._reset_stream_state()
|
||||||
|
cli._reasoning_shown_this_turn = False # done by process_input
|
||||||
|
|
||||||
|
self.assertFalse(cli._reasoning_shown_this_turn)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user