fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and openai are never imported at module level. Each is imported only when the feature is explicitly activated, preventing crashes in headless environments (SSH, Docker, WSL, no PortAudio).
2. No core agent loop changes: streaming TTS path extracted from _interruptible_api_call() into separate _streaming_api_call() method. The original method is restored to its upstream form.
3. Configurable key binding: push-to-talk key changed from Ctrl+R (conflicts with readline reverse-search) to Ctrl+B by default. Configurable via voice.push_to_talk_key in config.yaml.
4. Environment detection: new detect_audio_environment() function checks for SSH, Docker, WSL, and missing audio devices before enabling voice mode. Auto-disables with clear warnings in incompatible environments.
5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream, sd.OutputStream) wrapped in try/except with ImportError/OSError handling. Failures produce warnings, not crashes.
2026-03-09 12:48:49 +03:00
parent 143cc68946
commit b859dfab16
5 changed files with 526 additions and 142 deletions
--- a/cli.py
+++ b/cli.py
@@ -3779,7 +3779,15 @@ class HermesCLI:
            _cprint(f"{_DIM}Voice mode is already enabled.{_RST}")
            return

-        from tools.voice_mode import check_voice_requirements
+        from tools.voice_mode import check_voice_requirements, detect_audio_environment
+
+        # Environment detection -- warn and block in incompatible environments
+        env_check = detect_audio_environment()
+        if not env_check["available"]:
+            _cprint(f"\n{_GOLD}Voice mode unavailable in this environment:{_RST}")
+            for warning in env_check["warnings"]:
+                _cprint(f"  {_DIM}{warning}{_RST}")
+            return

        reqs = check_voice_requirements()
        if not reqs["available"]:
@@ -3815,8 +3823,14 @@ class HermesCLI:
        self.system_prompt = (self.system_prompt or "") + voice_instruction

        tts_status = " (TTS enabled)" if self._voice_tts else ""
+        try:
+            from hermes_cli.config import load_config
+            _ptt_key = load_config().get("voice", {}).get("push_to_talk_key", "c-b")
+        except Exception:
+            _ptt_key = "c-b"
+        _ptt_display = _ptt_key.replace("c-", "Ctrl+").upper()
        _cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}")
-        _cprint(f"  {_DIM}Ctrl+R to start/stop recording{_RST}")
+        _cprint(f"  {_DIM}{_ptt_display} to start/stop recording{_RST}")
        _cprint(f"  {_DIM}/voice tts  to toggle speech output{_RST}")
        _cprint(f"  {_DIM}/voice off  to disable voice mode{_RST}")

@@ -4804,6 +4818,51 @@ class HermesCLI:
            self._should_exit = True
            event.app.exit()

+        # Voice push-to-talk key: configurable via config.yaml (voice.push_to_talk_key)
+        # Default: Ctrl+B (avoids conflict with Ctrl+R readline reverse-search)
+        try:
+            from hermes_cli.config import load_config
+            _voice_key = load_config().get("voice", {}).get("push_to_talk_key", "c-b")
+        except Exception:
+            _voice_key = "c-b"
+
+        @kb.add(_voice_key)
+        def handle_voice_record(event):
+            """Toggle voice recording via the push-to-talk key; no-op unless voice mode is on."""
+            if not cli_ref._voice_mode:
+                return
+            # Always allow STOPPING a recording (even when agent is running)
+            if cli_ref._voice_recording:
+                # Manual stop via the push-to-talk key: stop continuous mode
+                with cli_ref._voice_lock:
+                    cli_ref._voice_continuous = False
+                # Flag clearing is handled atomically inside _voice_stop_and_transcribe
+                event.app.invalidate()
+                threading.Thread(  # stop + transcribe runs on a separate daemon thread
+                    target=cli_ref._voice_stop_and_transcribe,
+                    daemon=True,
+                ).start()
+            else:
+                # Guard: don't START recording during agent run or interactive prompts
+                if cli_ref._agent_running:
+                    return
+                if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state:
+                    return
+                try:
+                    # Interrupt TTS if playing, so user can start talking
+                    if not cli_ref._voice_tts_done.is_set():
+                        try:
+                            from tools.voice_mode import stop_playback
+                            stop_playback()
+                            cli_ref._voice_tts_done.set()
+                        except Exception:  # best-effort: a failed TTS interrupt
+                            pass  # must not prevent recording from starting below
+                    with cli_ref._voice_lock:
+                        cli_ref._voice_continuous = True
+                    cli_ref._voice_start_recording()
+                    event.app.invalidate()
+                except Exception as e:
+                    _cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}")
        from prompt_toolkit.keys import Keys

        @kb.add(Keys.BracketedPaste, eager=True)
@@ -4850,44 +4909,6 @@ class HermesCLI:
                # No image found — show a hint
                pass  # silent when no image (avoid noise on accidental press)

-        @kb.add('c-space')
-        def handle_ctrl_space(event):
-            """Toggle voice recording when voice mode is active."""
-            if not cli_ref._voice_mode:
-                return
-            # Always allow STOPPING a recording (even when agent is running)
-            if cli_ref._voice_recording:
-                # Manual stop via Ctrl+R: stop continuous mode
-                with cli_ref._voice_lock:
-                    cli_ref._voice_continuous = False
-                # Flag clearing is handled atomically inside _voice_stop_and_transcribe
-                event.app.invalidate()
-                threading.Thread(
-                    target=cli_ref._voice_stop_and_transcribe,
-                    daemon=True,
-                ).start()
-            else:
-                # Guard: don't START recording during agent run or interactive prompts
-                if cli_ref._agent_running:
-                    return
-                if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state:
-                    return
-                try:
-                    # Interrupt TTS if playing, so user can start talking
-                    if not cli_ref._voice_tts_done.is_set():
-                        try:
-                            from tools.voice_mode import stop_playback
-                            stop_playback()
-                            cli_ref._voice_tts_done.set()
-                        except Exception:
-                            pass
-                    with cli_ref._voice_lock:
-                        cli_ref._voice_continuous = True
-                    cli_ref._voice_start_recording()
-                    event.app.invalidate()
-                except Exception as e:
-                    _cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}")
-
        # Dynamic prompt: shows Hermes symbol when agent is working,
        # or answer prompt when clarify freetext mode is active.
        cli_ref = self