feat(cli): live reasoning token streaming — dim box above response

When both display.streaming and display.show_reasoning are enabled, reasoning tokens stream in real time into a dim bordered box. When content tokens start arriving, the reasoning box closes and the response box opens, giving a smooth visual transition.

- _stream_reasoning_delta(): line-buffered rendering in dim text
- _close_reasoning_box(): flush + close, called on first content token (and from _flush_stream() in case no content tokens arrived)
- Reasoning callback routes to the streaming version when both flags are set
- Skips the static post-response reasoning display when reasoning was streamed live
- State is reset per turn via _reset_stream_state()

Works with reasoning_content deltas (OpenRouter reasoning mode) and thinking_delta events (Anthropic extended thinking).
2026-03-16 10:29:55 -07:00
parent d3687d3e81
commit 942950f5b9
1 changed files with 53 additions and 12 deletions
--- a/cli.py
+++ b/cli.py
@@ -1413,15 +1413,43 @@ class HermesCLI:
        self._invalidate()

    # ── Streaming display ────────────────────────────────────────────────
-    #
-    # Future: When display.show_reasoning is also enabled, stream reasoning
-    # tokens into a dim box above the response (like the existing static
-    # reasoning display, but live). The infrastructure exists — reasoning
-    # deltas fire via _fire_reasoning_delta() during streaming. The display
-    # layer needs: a dim reasoning box that opens on first reasoning token,
-    # accumulates live, then transitions to the response box when content
-    # tokens start arriving. See PR #1214 (raulvidis) for gateway-side
-    # reasoning visibility modes as a reference implementation.
+
+    def _stream_reasoning_delta(self, text: str) -> None:
+        """Stream reasoning/thinking tokens into a dim box above the response.
+
+        Prints the box header on the first non-empty delta, then emits each
+        complete line in dim text; a trailing partial line stays buffered
+        until _close_reasoning_box() runs (first content text or flush).
+        """
+        if not text:
+            return
+
+        # Header is drawn once; getattr covers the flag not being set yet
+        if not getattr(self, "_reasoning_box_opened", False):
+            self._reasoning_box_opened = True
+            w = shutil.get_terminal_size().columns
+            r_label = " Reasoning "
+            r_fill = w - 2 - len(r_label)
+            _cprint(f"\n{_DIM}┌─{r_label}{'─' * max(r_fill - 1, 0)}┐{_RST}")
+
+        self._reasoning_buf = getattr(self, "_reasoning_buf", "") + text
+
+        # Emit every complete line; text after the last "\n" stays buffered
+        while "\n" in self._reasoning_buf:
+            line, self._reasoning_buf = self._reasoning_buf.split("\n", 1)
+            _cprint(f"{_DIM}{line}{_RST}")
+
+    def _close_reasoning_box(self) -> None:
+        """Flush buffered reasoning and draw the box footer; no-op if not open."""
+        if getattr(self, "_reasoning_box_opened", False):
+            # Emit the trailing partial line that never got its newline
+            buf = getattr(self, "_reasoning_buf", "")
+            if buf:
+                _cprint(f"{_DIM}{buf}{_RST}")
+                self._reasoning_buf = ""
+            w = shutil.get_terminal_size().columns
+            _cprint(f"{_DIM}└{'─' * (w - 2)}┘{_RST}")
+            self._reasoning_box_opened = False  # a later delta reopens the box

    def _stream_delta(self, text: str) -> None:
        """Line-buffered streaming callback for real-time token rendering.
@@ -1504,6 +1532,9 @@ class HermesCLI:
        if not text:
            return

+        # Close the live reasoning box before opening the response box
+        self._close_reasoning_box()
+
        # Open the response box header on the very first visible text
        if not self._stream_box_opened:
            # Strip leading whitespace/newlines before first visible content
@@ -1530,6 +1561,9 @@ class HermesCLI:

    def _flush_stream(self) -> None:
        """Emit any remaining partial line from the stream buffer and close the box."""
+        # Close reasoning box if still open (in case no content tokens arrived)
+        self._close_reasoning_box()
+
        if self._stream_buf:
            _cprint(self._stream_buf)
            self._stream_buf = ""
@@ -1546,6 +1580,8 @@ class HermesCLI:
        self._stream_box_opened = False
        self._stream_prefilt = ""
        self._in_reasoning_block = False
+        self._reasoning_box_opened = False
+        self._reasoning_buf = ""

    def _slow_command_status(self, command: str) -> str:
        """Return a user-facing status message for slower slash commands."""
@@ -1724,7 +1760,11 @@ class HermesCLI:
                platform="cli",
                session_db=self._session_db,
                clarify_callback=self._clarify_callback,
-                reasoning_callback=self._on_reasoning if (self.show_reasoning or self.verbose) else None,
+                reasoning_callback=(
+                    self._stream_reasoning_delta if (self.streaming_enabled and self.show_reasoning)
+                    else self._on_reasoning if (self.show_reasoning or self.verbose)
+                    else None
+                ),
                honcho_session_key=None,  # resolved by run_agent via config sessions map / title
                fallback_model=self._fallback_model,
                thinking_callback=self._on_thinking,
@@ -4935,8 +4975,9 @@ class HermesCLI:

            response_previewed = result.get("response_previewed", False) if result else False

-            # Display reasoning (thinking) box if enabled and available
-            if self.show_reasoning and result:
+            # Display reasoning (thinking) box if enabled and available.
+            # Skip when streaming already showed reasoning live.
+            if self.show_reasoning and result and not self._stream_started:
                reasoning = result.get("last_reasoning")
                if reasoning:
                    w = shutil.get_terminal_size().columns