From 8f6ef042c110a696e5fb107ead4ea329c9671f9b Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 25 Mar 2026 12:16:39 -0700 Subject: [PATCH] fix(cli): buffer reasoning preview chunks and fix duplicate display (#3013) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three improvements to reasoning/thinking display in the CLI: 1. Buffer tiny reasoning chunks: providers like DeepSeek stream reasoning one word at a time, producing a separate [thinking] line per token. Add a buffer that coalesces chunks and flushes at natural boundaries (newlines, sentence endings, terminal width). 2. Fix duplicate reasoning display: centralize callback selection into _current_reasoning_callback() — one place instead of 4 scattered inline ternaries. Prevents both the streaming box AND the preview callback from firing simultaneously. 3. Fix post-response reasoning box guard: change the check from 'not self._stream_started' to 'not self._reasoning_stream_started' so the final reasoning box is only suppressed when reasoning was actually streamed live, not when any text was streamed. Cherry-picked from PR #2781 by juanfradb. --- cli.py | 135 ++++++++++++++++++++++++++------ tests/test_reasoning_command.py | 103 ++++++++++++++++++++++++ 2 files changed, 214 insertions(+), 24 deletions(-) diff --git a/cli.py b/cli.py index 5e256f27..f49fa226 100644 --- a/cli.py +++ b/cli.py @@ -1054,6 +1054,8 @@ class HermesCLI: self._stream_buf = "" # Partial line buffer for line-buffered rendering self._stream_started = False # True once first delta arrives self._stream_box_opened = False # True once the response box header is printed + self._reasoning_stream_started = False # True once live reasoning starts streaming + self._reasoning_preview_buf = "" # Coalesce tiny reasoning chunks for [thinking] output # Configuration - priority: CLI args > env vars > config file # Model comes from: CLI arg or config.yaml (single source of truth). @@ -1478,11 +1480,108 @@ class HermesCLI: def _on_thinking(self, text: str) -> None: """Called by agent when thinking starts/stops. Updates TUI spinner.""" + if not text: + self._flush_reasoning_preview(force=True) self._spinner_text = text or "" self._invalidate() # ── Streaming display ──────────────────────────────────────────────── + def _current_reasoning_callback(self): + """Return the active reasoning display callback for the current mode.""" + if self.show_reasoning and self.streaming_enabled: + return self._stream_reasoning_delta + if self.verbose and not self.show_reasoning: + return self._on_reasoning + return None + + def _emit_reasoning_preview(self, reasoning_text: str) -> None: + """Render a buffered reasoning preview as a single [thinking] block.""" + import re + import textwrap + + preview_text = reasoning_text.strip() + if not preview_text: + return + + try: + term_width = shutil.get_terminal_size().columns + except Exception: + term_width = 80 + prefix = " [thinking] " + wrap_width = max(30, term_width - len(prefix) - 2) + + paragraphs = [] + raw_paragraphs = re.split(r"\n\s*\n+", preview_text.replace("\r\n", "\n")) + for paragraph in raw_paragraphs: + compact = " ".join(line.strip() for line in paragraph.splitlines() if line.strip()) + if compact: + paragraphs.append(textwrap.fill(compact, width=wrap_width)) + preview_text = "\n".join(paragraphs) + if not preview_text: + return + + if self.verbose: + _cprint(f" {_DIM}[thinking] {preview_text}{_RST}") + return + + lines = preview_text.splitlines() + if len(lines) > 5: + preview = "\n".join(lines[:5]) + preview += f"\n ... ({len(lines) - 5} more lines)" + else: + preview = preview_text + _cprint(f" {_DIM}[thinking] {preview}{_RST}") + + def _flush_reasoning_preview(self, *, force: bool = False) -> None: + """Flush buffered reasoning text at natural boundaries. + + Some providers stream reasoning in tiny word or punctuation chunks. + Buffer them here so the preview path does not print one `[thinking]` + line per token. + """ + buf = getattr(self, "_reasoning_preview_buf", "") + if not buf: + return + + try: + term_width = shutil.get_terminal_size().columns + except Exception: + term_width = 80 + target_width = max(40, term_width - len(" [thinking] ") - 4) + + flush_text = "" + + if force: + flush_text = buf + buf = "" + else: + line_break = buf.rfind("\n") + min_newline_flush = max(16, target_width // 3) + if line_break != -1 and ( + line_break >= min_newline_flush + or buf.endswith("\n\n") + or buf.endswith(".\n") + or buf.endswith("!\n") + or buf.endswith("?\n") + or buf.endswith(":\n") + ): + flush_text = buf[: line_break + 1] + buf = buf[line_break + 1 :] + elif len(buf) >= target_width: + search_start = max(20, target_width // 2) + search_end = min(len(buf), max(target_width + (target_width // 3), target_width + 8)) + cut = -1 + for boundary in (" ", "\t", ".", "!", "?", ",", ";", ":"): + cut = max(cut, buf.rfind(boundary, search_start, search_end)) + if cut != -1: + flush_text = buf[: cut + 1] + buf = buf[cut + 1 :] + + self._reasoning_preview_buf = buf.lstrip() if flush_text else buf + if flush_text: + self._emit_reasoning_preview(flush_text) + def _stream_reasoning_delta(self, text: str) -> None: """Stream reasoning/thinking tokens into a dim box above the response. @@ -1496,6 +1595,7 @@ class HermesCLI: """ if not text: return + self._reasoning_stream_started = True if getattr(self, "_stream_box_opened", False): return @@ -1691,11 +1791,13 @@ class HermesCLI: self._stream_buf = "" self._stream_started = False self._stream_box_opened = False + self._reasoning_stream_started = False self._stream_text_ansi = "" self._stream_prefilt = "" self._in_reasoning_block = False self._reasoning_box_opened = False self._reasoning_buf = "" + self._reasoning_preview_buf = "" def _slow_command_status(self, command: str) -> str: """Return a user-facing status message for slower slash commands.""" @@ -1926,11 +2028,7 @@ class HermesCLI: platform="cli", session_db=self._session_db, clarify_callback=self._clarify_callback, - reasoning_callback=( - self._stream_reasoning_delta if (self.streaming_enabled and self.show_reasoning) - else self._on_reasoning if (self.show_reasoning or self.verbose) - else None - ), + reasoning_callback=self._current_reasoning_callback(), honcho_session_key=None, # resolved by run_agent via config sessions map / title fallback_model=self._fallback_model, thinking_callback=self._on_thinking, @@ -4235,11 +4333,7 @@ class HermesCLI: if self.agent: self.agent.verbose_logging = self.verbose self.agent.quiet_mode = not self.verbose - # Auto-enable reasoning display in verbose mode - if self.verbose: - self.agent.reasoning_callback = self._on_reasoning - elif not self.show_reasoning: - self.agent.reasoning_callback = None + self.agent.reasoning_callback = self._current_reasoning_callback() # Use raw ANSI codes via _cprint so the output is routed through # prompt_toolkit's renderer. self.console.print() with Rich markup @@ -4286,7 +4380,7 @@ class HermesCLI: if arg in ("show", "on"): self.show_reasoning = True if self.agent: - self.agent.reasoning_callback = self._on_reasoning + self.agent.reasoning_callback = self._current_reasoning_callback() save_config_value("display.show_reasoning", True) _cprint(f" {_GOLD}✓ Reasoning display: ON (saved){_RST}") _cprint(f" {_DIM} Model thinking will be shown during and after each response.{_RST}") @@ -4294,7 +4388,7 @@ class HermesCLI: if arg in ("hide", "off"): self.show_reasoning = False if self.agent: - self.agent.reasoning_callback = None + self.agent.reasoning_callback = self._current_reasoning_callback() save_config_value("display.show_reasoning", False) _cprint(f" {_GOLD}✓ Reasoning display: OFF (saved){_RST}") return @@ -4317,17 +4411,10 @@ class HermesCLI: def _on_reasoning(self, reasoning_text: str): """Callback for intermediate reasoning display during tool-call loops.""" - if self.verbose: - # Verbose mode: show full reasoning text - _cprint(f" {_DIM}[thinking] {reasoning_text.strip()}{_RST}") - else: - lines = reasoning_text.strip().splitlines() - if len(lines) > 5: - preview = "\n".join(lines[:5]) - preview += f"\n ... ({len(lines) - 5} more lines)" - else: - preview = reasoning_text.strip() - _cprint(f" {_DIM}[thinking] {preview}{_RST}") + if not reasoning_text: + return + self._reasoning_preview_buf = getattr(self, "_reasoning_preview_buf", "") + reasoning_text + self._flush_reasoning_preview(force=False) def _manual_compress(self): """Manually trigger context compression on the current conversation.""" @@ -5628,7 +5715,7 @@ class HermesCLI: # Display reasoning (thinking) box if enabled and available. # Skip when streaming already showed reasoning live. - if self.show_reasoning and result and not self._stream_started: + if self.show_reasoning and result and not self._reasoning_stream_started: reasoning = result.get("last_reasoning") if reasoning: w = shutil.get_terminal_size().columns diff --git a/tests/test_reasoning_command.py b/tests/test_reasoning_command.py index 425e28a5..81d452a2 100644 --- a/tests/test_reasoning_command.py +++ b/tests/test_reasoning_command.py @@ -11,6 +11,7 @@ Combines functionality from: import unittest from types import SimpleNamespace from unittest.mock import MagicMock, patch +import re # --------------------------------------------------------------------------- @@ -295,6 +296,108 @@ class TestReasoningCallback(unittest.TestCase): # No exception = pass +class TestReasoningPreviewBuffering(unittest.TestCase): + def _make_cli(self): + from cli import HermesCLI + + cli = HermesCLI.__new__(HermesCLI) + cli.verbose = True + cli._spinner_text = "" + cli._reasoning_preview_buf = "" + cli._invalidate = lambda *args, **kwargs: None + return cli + + @patch("cli._cprint") + def test_streamed_reasoning_chunks_wait_for_boundary(self, mock_cprint): + cli = self._make_cli() + + cli._on_reasoning("Let") + cli._on_reasoning(" me") + cli._on_reasoning(" think") + + self.assertEqual(mock_cprint.call_count, 0) + + cli._on_reasoning(" about this.\n") + + self.assertEqual(mock_cprint.call_count, 1) + rendered = mock_cprint.call_args[0][0] + self.assertIn("[thinking] Let me think about this.", rendered) + + @patch("cli._cprint") + def test_pending_reasoning_flushes_when_thinking_stops(self, mock_cprint): + cli = self._make_cli() + + cli._on_reasoning("see") + cli._on_reasoning(" how") + cli._on_reasoning(" this") + cli._on_reasoning(" plays") + cli._on_reasoning(" out") + + self.assertEqual(mock_cprint.call_count, 0) + + cli._on_thinking("") + + self.assertEqual(mock_cprint.call_count, 1) + rendered = mock_cprint.call_args[0][0] + self.assertIn("[thinking] see how this plays out", rendered) + + @patch("cli._cprint") + @patch("cli.shutil.get_terminal_size", return_value=SimpleNamespace(columns=50)) + def test_reasoning_preview_compacts_newlines_and_wraps_to_terminal(self, _mock_term, mock_cprint): + cli = self._make_cli() + + cli._emit_reasoning_preview( + "First line\nstill same thought\n\n\nSecond paragraph with more detail here." + ) + + rendered = mock_cprint.call_args[0][0] + plain = re.sub(r"\x1b\[[0-9;]*m", "", rendered) + normalized = " ".join(plain.split()) + self.assertIn("[thinking] First line still same thought", plain) + self.assertIn("Second paragraph with more detail here.", normalized) + self.assertNotIn("\n\n\n", plain) + + @patch("cli.shutil.get_terminal_size", return_value=SimpleNamespace(columns=60)) + def test_reasoning_flush_threshold_tracks_terminal_width(self, _mock_term): + cli = self._make_cli() + + cli._reasoning_preview_buf = "a" * 30 + cli._flush_reasoning_preview(force=False) + self.assertEqual(cli._reasoning_preview_buf, "a" * 30) + + +class TestReasoningDisplayModeSelection(unittest.TestCase): + def _make_cli(self, *, show_reasoning=False, streaming_enabled=False, verbose=False): + from cli import HermesCLI + + cli = HermesCLI.__new__(HermesCLI) + cli.show_reasoning = show_reasoning + cli.streaming_enabled = streaming_enabled + cli.verbose = verbose + cli._stream_reasoning_delta = lambda text: ("stream", text) + cli._on_reasoning = lambda text: ("preview", text) + return cli + + def test_show_reasoning_non_streaming_uses_final_box_only(self): + cli = self._make_cli(show_reasoning=True, streaming_enabled=False, verbose=False) + + self.assertIsNone(cli._current_reasoning_callback()) + + def test_show_reasoning_streaming_uses_live_reasoning_box(self): + cli = self._make_cli(show_reasoning=True, streaming_enabled=True, verbose=False) + + callback = cli._current_reasoning_callback() + self.assertIsNotNone(callback) + self.assertEqual(callback("x"), ("stream", "x")) + + def test_verbose_without_show_reasoning_uses_preview_callback(self): + cli = self._make_cli(show_reasoning=False, streaming_enabled=False, verbose=True) + + callback = cli._current_reasoning_callback() + self.assertIsNotNone(callback) + self.assertEqual(callback("x"), ("preview", "x")) + + # --------------------------------------------------------------------------- # Real provider format extraction # ---------------------------------------------------------------------------