From d23e9a9bed94795e5af7919082ec1ecf897f50cb Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 16 Mar 2026 05:10:15 -0700 Subject: [PATCH] =?UTF-8?q?feat(cli):=20streaming=20token=20display=20?= =?UTF-8?q?=E2=80=94=20line-buffered=20rendering=20with=20response=20box?= =?UTF-8?q?=20framing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 2 of streaming support. CLI now streams tokens in real-time: - _stream_delta(): line-buffered rendering via _cprint (prompt_toolkit safe) - _flush_stream(): emits remaining buffer and closes response box - Response box opens on first token, closes on flush - Skip Rich Panel when streaming already displayed content - Reset streaming state before each agent turn - Compatible with existing TTS streaming (both can fire simultaneously) - Uses skin engine for response label branding Credit: OutThisLife (#798 CLI streaming concept). --- cli.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/cli.py b/cli.py index 47018657..79668876 100755 --- a/cli.py +++ b/cli.py @@ -1017,6 +1017,11 @@ class HermesCLI: self.show_reasoning = CLI_CONFIG["display"].get("show_reasoning", False) self.verbose = verbose if verbose is not None else (self.tool_progress_mode == "verbose") + # Streaming display state + self._stream_buf = "" # Partial line buffer for line-buffered rendering + self._stream_started = False # True once first delta arrives + self._stream_box_opened = False # True once the response box header is printed + # Configuration - priority: CLI args > env vars > config file # Model comes from: CLI arg or config.yaml (single source of truth). # LLM_MODEL/OPENAI_MODEL env vars are NOT checked — config.yaml is @@ -1403,6 +1408,56 @@ class HermesCLI: self._spinner_text = text or "" self._invalidate() + # ── Streaming display ──────────────────────────────────────────────── + + def _stream_delta(self, text: str) -> None: + """Line-buffered streaming callback for real-time token rendering. + + Receives text deltas from the agent as tokens arrive. Buffers + partial lines and emits complete lines via _cprint to work + reliably with prompt_toolkit's patch_stdout. + """ + if not text: + return + + # Open the response box header on the very first delta + if not self._stream_box_opened: + self._stream_box_opened = True + try: + from hermes_cli.skin_engine import get_active_skin + _skin = get_active_skin() + label = _skin.get_branding("response_label", "⚕ Hermes") + except Exception: + label = "⚕ Hermes" + w = shutil.get_terminal_size().columns + fill = w - 2 - len(label) + _cprint(f"\n{_GOLD}╭─{label}{'─' * max(fill - 1, 0)}╮{_RST}") + + self._stream_started = True + self._stream_buf += text + + # Emit complete lines, keep partial remainder in buffer + while "\n" in self._stream_buf: + line, self._stream_buf = self._stream_buf.split("\n", 1) + _cprint(line) + + def _flush_stream(self) -> None: + """Emit any remaining partial line from the stream buffer and close the box.""" + if self._stream_buf: + _cprint(self._stream_buf) + self._stream_buf = "" + + # Close the response box + if self._stream_box_opened: + w = shutil.get_terminal_size().columns + _cprint(f"{_GOLD}╰{'─' * (w - 2)}╯{_RST}") + + def _reset_stream_state(self) -> None: + """Reset streaming state before each agent invocation.""" + self._stream_buf = "" + self._stream_started = False + self._stream_box_opened = False + def _slow_command_status(self, command: str) -> str: """Return a user-facing status message for slower slash commands.""" cmd_lower = command.lower().strip() @@ -1588,6 +1643,7 @@ class HermesCLI: checkpoint_max_snapshots=self.checkpoint_max_snapshots, pass_session_id=self.pass_session_id, tool_progress_callback=self._on_tool_progress, + stream_delta_callback=self._stream_delta, ) # Apply any pending title now that the session exists in the DB if self._pending_title and self._session_db: @@ -4616,6 +4672,9 @@ class HermesCLI: # Run the conversation with interrupt monitoring result = None + # Reset streaming display state for this turn + self._reset_stream_state() + # --- Streaming TTS setup --- # When ElevenLabs is the TTS provider and sounddevice is available, # we stream audio sentence-by-sentence as the agent generates tokens @@ -4742,6 +4801,9 @@ class HermesCLI: agent_thread.join() # Ensure agent thread completes + # Flush any remaining streamed text and close the box + self._flush_stream() + # Signal end-of-text to TTS consumer and wait for it to finish if use_streaming_tts and text_queue is not None: text_queue.put(None) # sentinel @@ -4816,10 +4878,15 @@ class HermesCLI: _resp_text = "#FFF8DC" is_error_response = result and (result.get("failed") or result.get("partial")) + already_streamed = self._stream_started and self._stream_box_opened and not is_error_response if use_streaming_tts and _streaming_box_opened and not is_error_response: # Text was already printed sentence-by-sentence; just close the box w = shutil.get_terminal_size().columns _cprint(f"\n{_GOLD}╰{'─' * (w - 2)}╯{_RST}") + elif already_streamed: + # Response was already streamed token-by-token with box framing; + # _flush_stream() already closed the box. Skip Rich Panel. + pass else: _chat_console = ChatConsole() _chat_console.print(Panel(