diff --git a/cli.py b/cli.py
index 230d1e9ff..3221cbb79 100755
--- a/cli.py
+++ b/cli.py
@@ -4099,6 +4099,7 @@ class HermesCLI:
         # we stream audio sentence-by-sentence as the agent generates tokens
         # instead of waiting for the full response.
         use_streaming_tts = False
+        _streaming_box_opened = False
         text_queue = None
         tts_thread = None
         stream_callback = None
@@ -4123,9 +4124,28 @@ class HermesCLI:
                     text_queue = queue.Queue()
                     stop_event = threading.Event()
 
+                    def display_callback(sentence: str):
+                        """Print one streamed sentence, opening the response box first.
+
+                        Handed to ``stream_tts_to_speaker`` as ``display_callback``
+                        and run on the TTS consumer thread.
+                        """
+                        nonlocal _streaming_box_opened
+                        if not _streaming_box_opened:
+                            _streaming_box_opened = True
+                            # Same width source as the closing border below,
+                            # so the top and bottom edges always line up.
+                            w = shutil.get_terminal_size().columns
+                            label = " ⚕ Hermes "
+                            # "╭─" + label + fill + "╮" is exactly w characters.
+                            fill = max(w - 3 - len(label), 0)
+                            _cprint(f"\n{_GOLD}╭─{label}{'─' * fill}╮{_RST}")
+                        _cprint(sentence.rstrip())
+
                     tts_thread = threading.Thread(
                         target=stream_tts_to_speaker,
                         args=(text_queue, stop_event, self._voice_tts_done),
+                        kwargs={"display_callback": display_callback},
                         daemon=True,
                     )
                     tts_thread.start()
@@ -4244,8 +4264,7 @@ class HermesCLI:
                 _cprint(f"\n{r_top}\n{_DIM}{display_reasoning}{_RST}\n{r_bot}")
 
         if response and not response_previewed:
-            # Use a Rich Panel for the response box — adapts to terminal
-            # width at render time instead of hard-coding border length.
+            # Use skin engine for label/color with fallback
             try:
                 from hermes_cli.skin_engine import get_active_skin
                 _skin = get_active_skin()
@@ -4257,15 +4276,28 @@ class HermesCLI:
                 _resp_color = "#CD7F32"
                 _resp_text = "#FFF8DC"
 
-            _chat_console = ChatConsole()
-            _chat_console.print(Panel(
-                _rich_text_from_ansi(response),
-                title=f"[{_resp_color} bold]{label}[/]",
-                title_align="left",
-                border_style=_resp_color,
-                style=_resp_text,
-                box=rich_box.HORIZONTALS,
-                padding=(1, 2),
-            ))
+            # bool() so the flag is a real True/False rather than a dict/None.
+            is_error_response = bool(
+                result and (result.get("failed") or result.get("partial"))
+            )
+            streamed = use_streaming_tts and _streaming_box_opened
+            if _streaming_box_opened:
+                # Close the box whenever its top border was drawn -- including
+                # the error path -- so the frame is never left dangling.
+                w = shutil.get_terminal_size().columns
+                _cprint(f"\n{_GOLD}╰{'─' * (w - 2)}╯{_RST}")
+            if not streamed or is_error_response:
+                # Nothing was streamed, or the result is failed/partial:
+                # render the whole response in a Rich panel.
+                _chat_console = ChatConsole()
+                _chat_console.print(Panel(
+                    _rich_text_from_ansi(response),
+                    title=f"[{_resp_color} bold]{label}[/]",
+                    title_align="left",
+                    border_style=_resp_color,
+                    style=_resp_text,
+                    box=rich_box.HORIZONTALS,
+                    padding=(1, 2),
+                ))
 
         # Play terminal bell when agent finishes (if enabled).
diff --git a/tools/tts_tool.py b/tools/tts_tool.py
index 31c57ce01..3b8773d49 100644
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@@ -32,7 +32,7 @@ import subprocess
 import tempfile
 import threading
 from pathlib import Path
-from typing import Dict, Any, Optional
+from typing import Callable, Dict, Any, Optional
 
 logger = logging.getLogger(__name__)
 
@@ -469,6 +469,7 @@ def stream_tts_to_speaker(
     text_queue: queue.Queue,
     stop_event: threading.Event,
     tts_done_event: threading.Event,
+    display_callback: Optional[Callable[[str], None]] = None,
 ):
     """Consume text deltas from *text_queue*, buffer them into sentences,
     and stream each sentence through ElevenLabs TTS to the speaker in
@@ -484,34 +485,44 @@ def stream_tts_to_speaker(
     tts_done_event.clear()
 
     try:
+        # --- TTS client setup (optional -- display_callback works without it) ---
+        client = None
+        output_stream = None
+        voice_id = DEFAULT_ELEVENLABS_VOICE_ID
+        model_id = DEFAULT_ELEVENLABS_STREAMING_MODEL_ID
+
         tts_config = _load_tts_config()
         el_config = tts_config.get("elevenlabs", {})
-        voice_id = el_config.get("voice_id", DEFAULT_ELEVENLABS_VOICE_ID)
+        voice_id = el_config.get("voice_id", voice_id)
         model_id = el_config.get("streaming_model_id",
-                                 el_config.get("model_id", DEFAULT_ELEVENLABS_STREAMING_MODEL_ID))
+                                 el_config.get("model_id", model_id))
 
         api_key = os.getenv("ELEVENLABS_API_KEY", "")
         if not api_key:
-            logger.warning("ELEVENLABS_API_KEY not set; streaming TTS disabled")
-            return
+            logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled")
+        elif not _HAS_ELEVENLABS:
+            # Say why there is no audio instead of silently going text-only.
+            logger.warning(
+                "ElevenLabs client unavailable (_HAS_ELEVENLABS is false); "
+                "streaming TTS audio disabled"
+            )
+        else:
+            client = ElevenLabs(api_key=api_key)
 
-        client = ElevenLabs(api_key=api_key)
-
-        # Open a single sounddevice output stream for the lifetime of
-        # this function. ElevenLabs pcm_24000 produces signed 16-bit
-        # little-endian mono PCM at 24 kHz.
-        use_sd = _HAS_AUDIO and sd is not None
-        output_stream = None
-        if use_sd:
-            try:
-                import numpy as _np
-                output_stream = sd.OutputStream(
-                    samplerate=24000, channels=1, dtype="int16",
-                )
-                output_stream.start()
-            except Exception as exc:
-                logger.warning("sounddevice OutputStream failed: %s", exc)
-                output_stream = None
+            # Open a single sounddevice output stream for the lifetime of
+            # this function. ElevenLabs pcm_24000 produces signed 16-bit
+            # little-endian mono PCM at 24 kHz.
+            use_sd = _HAS_AUDIO and sd is not None
+            if use_sd:
+                try:
+                    import numpy as _np
+                    output_stream = sd.OutputStream(
+                        samplerate=24000, channels=1, dtype="int16",
+                    )
+                    output_stream.start()
+                except Exception as exc:
+                    logger.warning("sounddevice OutputStream failed: %s", exc)
+                    output_stream = None
 
         sentence_buf = ""
         in_think = False  # track ... blocks
@@ -520,12 +531,20 @@ def stream_tts_to_speaker(
         queue_timeout = 0.5
 
         def _speak_sentence(sentence: str):
-            """Generate and play audio for a single sentence."""
+            """Display sentence and optionally generate + play audio."""
             if stop_event.is_set():
                 return
+            # Display the raw sentence before the speakable-text check: text
+            # that strips to nothing for TTS must still reach the screen,
+            # since the caller may not print the response any other way.
+            if display_callback is not None and sentence.strip():
+                display_callback(sentence)
             cleaned = _strip_markdown_for_tts(sentence).strip()
             if not cleaned:
                 return
+            # Skip audio generation if no TTS client available
+            if client is None:
+                return
             # Truncate very long sentences
             if len(cleaned) > MAX_TEXT_LENGTH:
                 cleaned = cleaned[:MAX_TEXT_LENGTH]