From d23e9a9bed94795e5af7919082ec1ecf897f50cb Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Mon, 16 Mar 2026 05:10:15 -0700
Subject: [PATCH] =?UTF-8?q?feat(cli):=20streaming=20token=20display=20?=
 =?UTF-8?q?=E2=80=94=20line-buffered=20rendering=20with=20response=20box?=
 =?UTF-8?q?=20framing?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stage 2 of streaming support. CLI now streams tokens in real-time:

- _stream_delta(): line-buffered rendering via _cprint (prompt_toolkit safe)
- _flush_stream(): emits remaining buffer and closes response box
- Response box opens on first token, closes on flush
- Skip Rich Panel when streaming already displayed content
- Reset streaming state before each agent turn
- Compatible with existing TTS streaming (both can fire simultaneously)
- Uses skin engine for response label branding

Credit: OutThisLife (#798 CLI streaming concept).
---
 cli.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/cli.py b/cli.py
index 47018657..79668876 100755
--- a/cli.py
+++ b/cli.py
@@ -1017,6 +1017,11 @@ class HermesCLI:
         self.show_reasoning = CLI_CONFIG["display"].get("show_reasoning", False)
         self.verbose = verbose if verbose is not None else (self.tool_progress_mode == "verbose")
         
+        # Streaming display state
+        self._stream_buf = ""        # Partial line buffer for line-buffered rendering
+        self._stream_started = False  # True once first delta arrives
+        self._stream_box_opened = False  # True once the response box header is printed
+        
         # Configuration - priority: CLI args > env vars > config file
         # Model comes from: CLI arg or config.yaml (single source of truth).
         # LLM_MODEL/OPENAI_MODEL env vars are NOT checked — config.yaml is
@@ -1403,6 +1408,56 @@ class HermesCLI:
         self._spinner_text = text or ""
         self._invalidate()
 
+    # ── Streaming display ────────────────────────────────────────────────
+
+    def _stream_delta(self, text: str) -> None:
+        """Line-buffered streaming callback for real-time token rendering.
+
+        Empty deltas are ignored; the first non-empty one opens the box.
+        Complete lines are emitted via _cprint (to work reliably with
+        prompt_toolkit's patch_stdout); the tail waits in _stream_buf.
+        """
+        if not text:
+            return
+
+        # Open the response box header on the very first non-empty delta
+        if not self._stream_box_opened:
+            self._stream_box_opened = True  # cleared by _reset_stream_state()
+            try:
+                from hermes_cli.skin_engine import get_active_skin
+                _skin = get_active_skin()
+                label = _skin.get_branding("response_label", "⚕ Hermes")
+            except Exception:  # any skin failure falls back to the default label
+                label = "⚕ Hermes"
+            w = shutil.get_terminal_size().columns
+            fill = w - 2 - len(label)  # columns left after the corner, dash and label
+            _cprint(f"\n{_GOLD}╭─{label}{'─' * max(fill - 1, 0)}╮{_RST}")
+
+        self._stream_started = True
+        self._stream_buf += text
+
+        # Emit complete lines; keep the trailing partial line in the buffer
+        while "\n" in self._stream_buf:
+            line, self._stream_buf = self._stream_buf.split("\n", 1)
+            _cprint(line)
+
+    def _flush_stream(self) -> None:
+        """Print any buffered partial line, then draw the box footer if a header was printed."""
+        if self._stream_buf:
+            _cprint(self._stream_buf)
+            self._stream_buf = ""
+
+        # Close the box. The flag is left set, so a second call would redraw the footer.
+        if self._stream_box_opened:  # still read afterwards to compute already_streamed
+            w = shutil.get_terminal_size().columns
+            _cprint(f"{_GOLD}╰{'─' * (w - 2)}╯{_RST}")
+
+    def _reset_stream_state(self) -> None:
+        """Clear the stream buffer and both streaming flags before each agent invocation."""
+        self._stream_buf = ""  # drop any partial line that was never flushed
+        self._stream_started = False  # no delta received yet
+        self._stream_box_opened = False  # box header not printed yet
+
     def _slow_command_status(self, command: str) -> str:
         """Return a user-facing status message for slower slash commands."""
         cmd_lower = command.lower().strip()
@@ -1588,6 +1643,7 @@ class HermesCLI:
                 checkpoint_max_snapshots=self.checkpoint_max_snapshots,
                 pass_session_id=self.pass_session_id,
                 tool_progress_callback=self._on_tool_progress,
+                stream_delta_callback=self._stream_delta,
             )
             # Apply any pending title now that the session exists in the DB
             if self._pending_title and self._session_db:
@@ -4616,6 +4672,9 @@ class HermesCLI:
             # Run the conversation with interrupt monitoring
             result = None
 
+            # Reset streaming display state for this turn
+            self._reset_stream_state()
+
             # --- Streaming TTS setup ---
             # When ElevenLabs is the TTS provider and sounddevice is available,
             # we stream audio sentence-by-sentence as the agent generates tokens
@@ -4742,6 +4801,9 @@ class HermesCLI:
 
             agent_thread.join()  # Ensure agent thread completes
 
+            # Flush any remaining streamed text and close the box
+            self._flush_stream()
+
             # Signal end-of-text to TTS consumer and wait for it to finish
             if use_streaming_tts and text_queue is not None:
                 text_queue.put(None)  # sentinel
@@ -4816,10 +4878,15 @@ class HermesCLI:
                     _resp_text = "#FFF8DC"
 
                 is_error_response = result and (result.get("failed") or result.get("partial"))
+                already_streamed = self._stream_started and self._stream_box_opened and not is_error_response
                 if use_streaming_tts and _streaming_box_opened and not is_error_response:
                     # Text was already printed sentence-by-sentence; just close the box
                     w = shutil.get_terminal_size().columns
                     _cprint(f"\n{_GOLD}╰{'─' * (w - 2)}╯{_RST}")
+                elif already_streamed:
+                    # Response was already streamed token-by-token with box framing;
+                    # _flush_stream() already closed the box. Skip Rich Panel.
+                    pass
                 else:
                     _chat_console = ChatConsole()
                     _chat_console.print(Panel(