fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and
   openai are never imported at module level. Each is imported only when
   the feature is explicitly activated, preventing crashes in headless
   environments (SSH, Docker, WSL, no PortAudio).

2. No core agent loop changes: streaming TTS path extracted from
   _interruptible_api_call() into separate _streaming_api_call() method.
   The original method is restored to its upstream form.

3. Configurable key binding: push-to-talk key changed from Ctrl+R
   (conflicts with readline reverse-search) to Ctrl+B by default.
   Configurable via voice.push_to_talk_key in config.yaml.

4. Environment detection: new detect_audio_environment() function checks
   for SSH, Docker, WSL, and missing audio devices before enabling voice
   mode. Auto-disables with clear warnings in incompatible environments.

5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream,
   sd.OutputStream) wrapped in try/except with ImportError/OSError
   handling. Failures produce warnings, not crashes.
This commit is contained in:
0xbyt4
2026-03-09 12:48:49 +03:00
parent 143cc68946
commit b859dfab16
5 changed files with 526 additions and 142 deletions

101
cli.py
View File

@@ -3779,7 +3779,15 @@ class HermesCLI:
_cprint(f"{_DIM}Voice mode is already enabled.{_RST}")
return
from tools.voice_mode import check_voice_requirements
from tools.voice_mode import check_voice_requirements, detect_audio_environment
# Environment detection -- warn and block in incompatible environments
env_check = detect_audio_environment()
if not env_check["available"]:
_cprint(f"\n{_GOLD}Voice mode unavailable in this environment:{_RST}")
for warning in env_check["warnings"]:
_cprint(f" {_DIM}{warning}{_RST}")
return
reqs = check_voice_requirements()
if not reqs["available"]:
@@ -3815,8 +3823,14 @@ class HermesCLI:
self.system_prompt = (self.system_prompt or "") + voice_instruction
tts_status = " (TTS enabled)" if self._voice_tts else ""
# Resolve the configured push-to-talk key for the help text below. Any
# failure to import or read the config falls back to the default "c-b".
try:
    from hermes_cli.config import load_config
    _ptt_key = load_config().get("voice", {}).get("push_to_talk_key", "c-b")
except Exception:
    _ptt_key = "c-b"
# Guard against a missing/empty/non-string value so the string methods
# below cannot raise while the banner is being printed.
if not isinstance(_ptt_key, str) or not _ptt_key:
    _ptt_key = "c-b"
# Upper-case the key name first, then expand the control prefix, so that
# "c-b" is shown as "Ctrl+B" rather than "CTRL+B".
_ptt_display = _ptt_key.upper().replace("C-", "Ctrl+")
_cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}")
_cprint(f" {_DIM}Ctrl+R to start/stop recording{_RST}")
_cprint(f" {_DIM}{_ptt_display} to start/stop recording{_RST}")
_cprint(f" {_DIM}/voice tts to toggle speech output{_RST}")
_cprint(f" {_DIM}/voice off to disable voice mode{_RST}")
@@ -4804,6 +4818,51 @@ class HermesCLI:
self._should_exit = True
event.app.exit()
# Voice push-to-talk key: configurable via config.yaml (voice.push_to_talk_key)
# Default: Ctrl+B (avoids conflict with Ctrl+R readline reverse-search)
_default_voice_key = "c-b"
try:
    from hermes_cli.config import load_config
    _voice_key = load_config().get("voice", {}).get("push_to_talk_key", _default_voice_key)
except Exception:
    _voice_key = _default_voice_key

# kb.add() validates the key spec when it is called and raises for values it
# does not recognise (e.g. "ctrl+b" or a non-string). Resolve the binding
# up front so that a bad config entry degrades to the default key instead of
# aborting key-binding setup.
try:
    _voice_binding = kb.add(_voice_key)
except (ValueError, TypeError):
    _cprint(
        f"\n{_DIM}Invalid voice.push_to_talk_key {_voice_key!r}; "
        f"using Ctrl+B instead.{_RST}"
    )
    _voice_key = _default_voice_key
    _voice_binding = kb.add(_voice_key)


@_voice_binding
def handle_voice_record(event):
    """Toggle voice recording when voice mode is active.

    Pressing the push-to-talk key while a recording is in progress stops it
    and hands transcription off to a daemon thread. Pressing it while idle
    starts a new recording, unless the agent is running or a clarify / sudo /
    approval prompt is pending. Does nothing when voice mode is off.

    Args:
        event: The key-press event; only ``event.app.invalidate()`` is used
            here, to request a redraw.
    """
    if not cli_ref._voice_mode:
        return
    # Always allow STOPPING a recording (even when agent is running)
    if cli_ref._voice_recording:
        # Manual stop via the push-to-talk key: stop continuous mode
        with cli_ref._voice_lock:
            cli_ref._voice_continuous = False
        # Flag clearing is handled atomically inside _voice_stop_and_transcribe
        event.app.invalidate()
        threading.Thread(
            target=cli_ref._voice_stop_and_transcribe,
            daemon=True,
        ).start()
    else:
        # Guard: don't START recording during agent run or interactive prompts
        if cli_ref._agent_running:
            return
        if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state:
            return
        try:
            # Interrupt TTS if playing, so user can start talking
            if not cli_ref._voice_tts_done.is_set():
                try:
                    from tools.voice_mode import stop_playback
                    stop_playback()
                    cli_ref._voice_tts_done.set()
                except Exception:
                    # Best effort: failing to stop playback must not
                    # prevent the recording from starting.
                    pass
            with cli_ref._voice_lock:
                cli_ref._voice_continuous = True
            cli_ref._voice_start_recording()
            event.app.invalidate()
        except Exception as e:
            _cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}")
from prompt_toolkit.keys import Keys
@kb.add(Keys.BracketedPaste, eager=True)
@@ -4850,44 +4909,6 @@ class HermesCLI:
# No image found — deliberately show nothing
pass # silent when no image (avoid noise on accidental press)
@kb.add('c-space')
def handle_ctrl_space(event):
    """Start or stop voice recording; a no-op unless voice mode is on.

    While a recording is active, the key press ends it (allowed even when
    the agent is running) and transcription is started on a daemon thread.
    Otherwise a new recording begins, provided the agent is idle and no
    clarify / sudo / approval prompt is pending.
    """
    if not cli_ref._voice_mode:
        return

    if cli_ref._voice_recording:
        # Manual stop: leave continuous mode before handing off. Per the
        # original note, the recording flag itself is cleared inside
        # _voice_stop_and_transcribe.
        with cli_ref._voice_lock:
            cli_ref._voice_continuous = False
        event.app.invalidate()
        stopper = threading.Thread(
            target=cli_ref._voice_stop_and_transcribe,
            daemon=True,
        )
        stopper.start()
        return

    # From here on we would be STARTING a recording; refuse while busy.
    if cli_ref._agent_running:
        return
    prompt_pending = (
        cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state
    )
    if prompt_pending:
        return

    try:
        tts_still_playing = not cli_ref._voice_tts_done.is_set()
        if tts_still_playing:
            # Cut speech output short so the user can talk over it.
            try:
                from tools.voice_mode import stop_playback
                stop_playback()
                cli_ref._voice_tts_done.set()
            except Exception:
                pass
        with cli_ref._voice_lock:
            cli_ref._voice_continuous = True
        cli_ref._voice_start_recording()
        event.app.invalidate()
    except Exception as e:
        _cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}")
# Dynamic prompt: shows Hermes symbol when agent is working,
# or answer prompt when clarify freetext mode is active.
cli_ref = self