From 8f6ef042c110a696e5fb107ead4ea329c9671f9b Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Wed, 25 Mar 2026 12:16:39 -0700
Subject: [PATCH] fix(cli): buffer reasoning preview chunks and fix duplicate
 display (#3013)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three improvements to reasoning/thinking display in the CLI:

1. Buffer tiny reasoning chunks: providers like DeepSeek stream reasoning
   one word at a time, producing a separate [thinking] line per token.
   Add a buffer that coalesces chunks and flushes at natural boundaries
   (newlines, sentence endings, terminal width).

2. Fix duplicate reasoning display: centralize callback selection into
   _current_reasoning_callback() — one place instead of 4 scattered
   inline ternaries. Prevents both the streaming box AND the preview
   callback from firing simultaneously.

3. Fix post-response reasoning box guard: change the check from
   'not self._stream_started' to 'not self._reasoning_stream_started'
   so the final reasoning box is only suppressed when reasoning was
   actually streamed live, not when any text was streamed.

Cherry-picked from PR #2781 by juanfradb.
---
 cli.py                          | 135 ++++++++++++++++++++++++++------
 tests/test_reasoning_command.py | 103 ++++++++++++++++++++++++
 2 files changed, 214 insertions(+), 24 deletions(-)

diff --git a/cli.py b/cli.py
index 5e256f27..f49fa226 100644
--- a/cli.py
+++ b/cli.py
@@ -1054,6 +1054,8 @@ class HermesCLI:
         self._stream_buf = ""        # Partial line buffer for line-buffered rendering
         self._stream_started = False  # True once first delta arrives
         self._stream_box_opened = False  # True once the response box header is printed
+        self._reasoning_stream_started = False  # True once live reasoning starts streaming
+        self._reasoning_preview_buf = ""  # Coalesce tiny reasoning chunks for [thinking] output
         
         # Configuration - priority: CLI args > env vars > config file
         # Model comes from: CLI arg or config.yaml (single source of truth).
@@ -1478,11 +1480,108 @@ class HermesCLI:
 
     def _on_thinking(self, text: str) -> None:
         """Called by agent when thinking starts/stops. Updates TUI spinner."""
+        if not text:
+            self._flush_reasoning_preview(force=True)
         self._spinner_text = text or ""
         self._invalidate()
 
     # ── Streaming display ────────────────────────────────────────────────
 
+    def _current_reasoning_callback(self):
+        """Return the active reasoning display callback for the current mode."""
+        if self.show_reasoning and self.streaming_enabled:
+            return self._stream_reasoning_delta
+        if self.verbose and not self.show_reasoning:
+            return self._on_reasoning
+        return None
+
+    def _emit_reasoning_preview(self, reasoning_text: str) -> None:
+        """Render a buffered reasoning preview as a single [thinking] block."""
+        import re
+        import textwrap
+
+        preview_text = reasoning_text.strip()
+        if not preview_text:
+            return
+
+        try:
+            term_width = shutil.get_terminal_size().columns
+        except Exception:
+            term_width = 80
+        prefix = "  [thinking] "
+        wrap_width = max(30, term_width - len(prefix) - 2)
+
+        paragraphs = []
+        raw_paragraphs = re.split(r"\n\s*\n+", preview_text.replace("\r\n", "\n"))
+        for paragraph in raw_paragraphs:
+            compact = " ".join(line.strip() for line in paragraph.splitlines() if line.strip())
+            if compact:
+                paragraphs.append(textwrap.fill(compact, width=wrap_width))
+        preview_text = "\n".join(paragraphs)
+        if not preview_text:
+            return
+
+        if self.verbose:
+            _cprint(f"  {_DIM}[thinking] {preview_text}{_RST}")
+            return
+
+        lines = preview_text.splitlines()
+        if len(lines) > 5:
+            preview = "\n".join(lines[:5])
+            preview += f"\n  ... ({len(lines) - 5} more lines)"
+        else:
+            preview = preview_text
+        _cprint(f"  {_DIM}[thinking] {preview}{_RST}")
+
+    def _flush_reasoning_preview(self, *, force: bool = False) -> None:
+        """Flush buffered reasoning text at natural boundaries.
+
+        Some providers stream reasoning in tiny word or punctuation chunks.
+        Buffer them here so the preview path does not print one `[thinking]`
+        line per token.
+        """
+        buf = getattr(self, "_reasoning_preview_buf", "")
+        if not buf:
+            return
+
+        try:
+            term_width = shutil.get_terminal_size().columns
+        except Exception:
+            term_width = 80
+        target_width = max(40, term_width - len("  [thinking] ") - 4)
+
+        flush_text = ""
+
+        if force:
+            flush_text = buf
+            buf = ""
+        else:
+            line_break = buf.rfind("\n")
+            min_newline_flush = max(16, target_width // 3)
+            if line_break != -1 and (
+                line_break >= min_newline_flush
+                or buf.endswith("\n\n")
+                or buf.endswith(".\n")
+                or buf.endswith("!\n")
+                or buf.endswith("?\n")
+                or buf.endswith(":\n")
+            ):
+                flush_text = buf[: line_break + 1]
+                buf = buf[line_break + 1 :]
+            elif len(buf) >= target_width:
+                search_start = max(20, target_width // 2)
+                search_end = min(len(buf), max(target_width + (target_width // 3), target_width + 8))
+                cut = -1
+                for boundary in (" ", "\t", ".", "!", "?", ",", ";", ":"):
+                    cut = max(cut, buf.rfind(boundary, search_start, search_end))
+                if cut != -1:
+                    flush_text = buf[: cut + 1]
+                    buf = buf[cut + 1 :]
+
+        self._reasoning_preview_buf = buf.lstrip() if flush_text else buf
+        if flush_text:
+            self._emit_reasoning_preview(flush_text)
+
     def _stream_reasoning_delta(self, text: str) -> None:
         """Stream reasoning/thinking tokens into a dim box above the response.
 
@@ -1496,6 +1595,7 @@ class HermesCLI:
         """
         if not text:
             return
+        self._reasoning_stream_started = True
         if getattr(self, "_stream_box_opened", False):
             return
 
@@ -1691,11 +1791,13 @@ class HermesCLI:
         self._stream_buf = ""
         self._stream_started = False
         self._stream_box_opened = False
+        self._reasoning_stream_started = False
         self._stream_text_ansi = ""
         self._stream_prefilt = ""
         self._in_reasoning_block = False
         self._reasoning_box_opened = False
         self._reasoning_buf = ""
+        self._reasoning_preview_buf = ""
 
     def _slow_command_status(self, command: str) -> str:
         """Return a user-facing status message for slower slash commands."""
@@ -1926,11 +2028,7 @@ class HermesCLI:
                 platform="cli",
                 session_db=self._session_db,
                 clarify_callback=self._clarify_callback,
-                reasoning_callback=(
-                    self._stream_reasoning_delta if (self.streaming_enabled and self.show_reasoning)
-                    else self._on_reasoning if (self.show_reasoning or self.verbose)
-                    else None
-                ),
+                reasoning_callback=self._current_reasoning_callback(),
                 honcho_session_key=None,  # resolved by run_agent via config sessions map / title
                 fallback_model=self._fallback_model,
                 thinking_callback=self._on_thinking,
@@ -4235,11 +4333,7 @@ class HermesCLI:
         if self.agent:
             self.agent.verbose_logging = self.verbose
             self.agent.quiet_mode = not self.verbose
-            # Auto-enable reasoning display in verbose mode
-            if self.verbose:
-                self.agent.reasoning_callback = self._on_reasoning
-            elif not self.show_reasoning:
-                self.agent.reasoning_callback = None
+            self.agent.reasoning_callback = self._current_reasoning_callback()
 
         # Use raw ANSI codes via _cprint so the output is routed through
         # prompt_toolkit's renderer.  self.console.print() with Rich markup
@@ -4286,7 +4380,7 @@ class HermesCLI:
         if arg in ("show", "on"):
             self.show_reasoning = True
             if self.agent:
-                self.agent.reasoning_callback = self._on_reasoning
+                self.agent.reasoning_callback = self._current_reasoning_callback()
             save_config_value("display.show_reasoning", True)
             _cprint(f"  {_GOLD}✓ Reasoning display: ON (saved){_RST}")
             _cprint(f"  {_DIM}  Model thinking will be shown during and after each response.{_RST}")
@@ -4294,7 +4388,7 @@ class HermesCLI:
         if arg in ("hide", "off"):
             self.show_reasoning = False
             if self.agent:
-                self.agent.reasoning_callback = None
+                self.agent.reasoning_callback = self._current_reasoning_callback()
             save_config_value("display.show_reasoning", False)
             _cprint(f"  {_GOLD}✓ Reasoning display: OFF (saved){_RST}")
             return
@@ -4317,17 +4411,10 @@ class HermesCLI:
 
     def _on_reasoning(self, reasoning_text: str):
         """Callback for intermediate reasoning display during tool-call loops."""
-        if self.verbose:
-            # Verbose mode: show full reasoning text
-            _cprint(f"  {_DIM}[thinking] {reasoning_text.strip()}{_RST}")
-        else:
-            lines = reasoning_text.strip().splitlines()
-            if len(lines) > 5:
-                preview = "\n".join(lines[:5])
-                preview += f"\n  ... ({len(lines) - 5} more lines)"
-            else:
-                preview = reasoning_text.strip()
-            _cprint(f"  {_DIM}[thinking] {preview}{_RST}")
+        if not reasoning_text:
+            return
+        self._reasoning_preview_buf = getattr(self, "_reasoning_preview_buf", "") + reasoning_text
+        self._flush_reasoning_preview(force=False)
 
     def _manual_compress(self):
         """Manually trigger context compression on the current conversation."""
@@ -5628,7 +5715,7 @@ class HermesCLI:
 
             # Display reasoning (thinking) box if enabled and available.
             # Skip when streaming already showed reasoning live.
-            if self.show_reasoning and result and not self._stream_started:
+            if self.show_reasoning and result and not self._reasoning_stream_started:
                 reasoning = result.get("last_reasoning")
                 if reasoning:
                     w = shutil.get_terminal_size().columns
diff --git a/tests/test_reasoning_command.py b/tests/test_reasoning_command.py
index 425e28a5..81d452a2 100644
--- a/tests/test_reasoning_command.py
+++ b/tests/test_reasoning_command.py
@@ -11,6 +11,7 @@ Combines functionality from:
 import unittest
 from types import SimpleNamespace
 from unittest.mock import MagicMock, patch
+import re
 
 
 # ---------------------------------------------------------------------------
@@ -295,6 +296,108 @@ class TestReasoningCallback(unittest.TestCase):
         # No exception = pass
 
 
+class TestReasoningPreviewBuffering(unittest.TestCase):
+    def _make_cli(self):
+        from cli import HermesCLI
+
+        cli = HermesCLI.__new__(HermesCLI)
+        cli.verbose = True
+        cli._spinner_text = ""
+        cli._reasoning_preview_buf = ""
+        cli._invalidate = lambda *args, **kwargs: None
+        return cli
+
+    @patch("cli._cprint")
+    def test_streamed_reasoning_chunks_wait_for_boundary(self, mock_cprint):
+        cli = self._make_cli()
+
+        cli._on_reasoning("Let")
+        cli._on_reasoning(" me")
+        cli._on_reasoning(" think")
+
+        self.assertEqual(mock_cprint.call_count, 0)
+
+        cli._on_reasoning(" about this.\n")
+
+        self.assertEqual(mock_cprint.call_count, 1)
+        rendered = mock_cprint.call_args[0][0]
+        self.assertIn("[thinking] Let me think about this.", rendered)
+
+    @patch("cli._cprint")
+    def test_pending_reasoning_flushes_when_thinking_stops(self, mock_cprint):
+        cli = self._make_cli()
+
+        cli._on_reasoning("see")
+        cli._on_reasoning(" how")
+        cli._on_reasoning(" this")
+        cli._on_reasoning(" plays")
+        cli._on_reasoning(" out")
+
+        self.assertEqual(mock_cprint.call_count, 0)
+
+        cli._on_thinking("")
+
+        self.assertEqual(mock_cprint.call_count, 1)
+        rendered = mock_cprint.call_args[0][0]
+        self.assertIn("[thinking] see how this plays out", rendered)
+
+    @patch("cli._cprint")
+    @patch("cli.shutil.get_terminal_size", return_value=SimpleNamespace(columns=50))
+    def test_reasoning_preview_compacts_newlines_and_wraps_to_terminal(self, _mock_term, mock_cprint):
+        cli = self._make_cli()
+
+        cli._emit_reasoning_preview(
+            "First line\nstill same thought\n\n\nSecond paragraph with more detail here."
+        )
+
+        rendered = mock_cprint.call_args[0][0]
+        plain = re.sub(r"\x1b\[[0-9;]*m", "", rendered)
+        normalized = " ".join(plain.split())
+        self.assertIn("[thinking] First line still same thought", plain)
+        self.assertIn("Second paragraph with more detail here.", normalized)
+        self.assertNotIn("\n\n\n", plain)
+
+    @patch("cli.shutil.get_terminal_size", return_value=SimpleNamespace(columns=60))
+    def test_reasoning_flush_threshold_tracks_terminal_width(self, _mock_term):
+        cli = self._make_cli()
+
+        cli._reasoning_preview_buf = "a" * 30
+        cli._flush_reasoning_preview(force=False)
+        self.assertEqual(cli._reasoning_preview_buf, "a" * 30)
+
+
+class TestReasoningDisplayModeSelection(unittest.TestCase):
+    def _make_cli(self, *, show_reasoning=False, streaming_enabled=False, verbose=False):
+        from cli import HermesCLI
+
+        cli = HermesCLI.__new__(HermesCLI)
+        cli.show_reasoning = show_reasoning
+        cli.streaming_enabled = streaming_enabled
+        cli.verbose = verbose
+        cli._stream_reasoning_delta = lambda text: ("stream", text)
+        cli._on_reasoning = lambda text: ("preview", text)
+        return cli
+
+    def test_show_reasoning_non_streaming_uses_final_box_only(self):
+        cli = self._make_cli(show_reasoning=True, streaming_enabled=False, verbose=False)
+
+        self.assertIsNone(cli._current_reasoning_callback())
+
+    def test_show_reasoning_streaming_uses_live_reasoning_box(self):
+        cli = self._make_cli(show_reasoning=True, streaming_enabled=True, verbose=False)
+
+        callback = cli._current_reasoning_callback()
+        self.assertIsNotNone(callback)
+        self.assertEqual(callback("x"), ("stream", "x"))
+
+    def test_verbose_without_show_reasoning_uses_preview_callback(self):
+        cli = self._make_cli(show_reasoning=False, streaming_enabled=False, verbose=True)
+
+        callback = cli._current_reasoning_callback()
+        self.assertIsNotNone(callback)
+        self.assertEqual(callback("x"), ("preview", "x"))
+
+
 # ---------------------------------------------------------------------------
 # Real provider format extraction
 # ---------------------------------------------------------------------------