diff --git a/cli.py b/cli.py index dd15151e..46d23729 100755 --- a/cli.py +++ b/cli.py @@ -3779,7 +3779,15 @@ class HermesCLI: _cprint(f"{_DIM}Voice mode is already enabled.{_RST}") return - from tools.voice_mode import check_voice_requirements + from tools.voice_mode import check_voice_requirements, detect_audio_environment + + # Environment detection -- warn and block in incompatible environments + env_check = detect_audio_environment() + if not env_check["available"]: + _cprint(f"\n{_GOLD}Voice mode unavailable in this environment:{_RST}") + for warning in env_check["warnings"]: + _cprint(f" {_DIM}{warning}{_RST}") + return reqs = check_voice_requirements() if not reqs["available"]: @@ -3815,8 +3823,14 @@ class HermesCLI: self.system_prompt = (self.system_prompt or "") + voice_instruction tts_status = " (TTS enabled)" if self._voice_tts else "" + try: + from hermes_cli.config import load_config + _ptt_key = load_config().get("voice", {}).get("push_to_talk_key", "c-b") + except Exception: + _ptt_key = "c-b" + _ptt_display = _ptt_key.replace("c-", "Ctrl+").upper() _cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}") - _cprint(f" {_DIM}Ctrl+R to start/stop recording{_RST}") + _cprint(f" {_DIM}{_ptt_display} to start/stop recording{_RST}") _cprint(f" {_DIM}/voice tts to toggle speech output{_RST}") _cprint(f" {_DIM}/voice off to disable voice mode{_RST}") @@ -4804,6 +4818,51 @@ class HermesCLI: self._should_exit = True event.app.exit() + # Voice push-to-talk key: configurable via config.yaml (voice.push_to_talk_key) + # Default: Ctrl+B (avoids conflict with Ctrl+R readline reverse-search) + try: + from hermes_cli.config import load_config + _voice_key = load_config().get("voice", {}).get("push_to_talk_key", "c-b") + except Exception: + _voice_key = "c-b" + + @kb.add(_voice_key) + def handle_voice_record(event): + """Toggle voice recording when voice mode is active.""" + if not cli_ref._voice_mode: + return + # Always allow STOPPING a recording (even when agent is running) + if cli_ref._voice_recording: + # Manual stop via Ctrl+R: stop continuous mode + with cli_ref._voice_lock: + cli_ref._voice_continuous = False + # Flag clearing is handled atomically inside _voice_stop_and_transcribe + event.app.invalidate() + threading.Thread( + target=cli_ref._voice_stop_and_transcribe, + daemon=True, + ).start() + else: + # Guard: don't START recording during agent run or interactive prompts + if cli_ref._agent_running: + return + if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state: + return + try: + # Interrupt TTS if playing, so user can start talking + if not cli_ref._voice_tts_done.is_set(): + try: + from tools.voice_mode import stop_playback + stop_playback() + cli_ref._voice_tts_done.set() + except Exception: + pass + with cli_ref._voice_lock: + cli_ref._voice_continuous = True + cli_ref._voice_start_recording() + event.app.invalidate() + except Exception as e: + _cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}") from prompt_toolkit.keys import Keys @kb.add(Keys.BracketedPaste, eager=True) @@ -4850,44 +4909,6 @@ class HermesCLI: # No image found — show a hint pass # silent when no image (avoid noise on accidental press) - @kb.add('c-space') - def handle_ctrl_space(event): - """Toggle voice recording when voice mode is active.""" - if not cli_ref._voice_mode: - return - # Always allow STOPPING a recording (even when agent is running) - if cli_ref._voice_recording: - # Manual stop via Ctrl+R: stop continuous mode - with cli_ref._voice_lock: - cli_ref._voice_continuous = False - # Flag clearing is handled atomically inside _voice_stop_and_transcribe - event.app.invalidate() - threading.Thread( - target=cli_ref._voice_stop_and_transcribe, - daemon=True, - ).start() - else: - # Guard: don't START recording during agent run or interactive prompts - if cli_ref._agent_running: - return - if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state: - return - try: - # Interrupt TTS if playing, so user can start talking - if not cli_ref._voice_tts_done.is_set(): - try: - from tools.voice_mode import stop_playback - stop_playback() - cli_ref._voice_tts_done.set() - except Exception: - pass - with cli_ref._voice_lock: - cli_ref._voice_continuous = True - cli_ref._voice_start_recording() - event.app.invalidate() - except Exception as e: - _cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}") - # Dynamic prompt: shows Hermes symbol when agent is working, # or answer prompt when clarify freetext mode is active. cli_ref = self diff --git a/run_agent.py b/run_agent.py index 152d6092..6df794e0 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2590,12 +2590,6 @@ class AIAgent: On interrupt, closes the HTTP client to cancel the in-flight request (stops token generation and avoids wasting money), then rebuilds the client for future calls. - - When ``self._stream_callback`` is set (streaming TTS mode), the call - uses ``stream=True`` and iterates over chunks inside the background - thread. Content deltas are forwarded to the callback in real-time - while the full response is accumulated and returned as a - ``SimpleNamespace`` that mimics a normal ``ChatCompletion``. """ result = {"response": None, "error": None} @@ -2603,30 +2597,58 @@ class AIAgent: try: if self.api_mode == "codex_responses": result["response"] = self._run_codex_stream(api_kwargs) - return elif self.api_mode == "anthropic_messages": result["response"] = self._anthropic_client.messages.create(**api_kwargs) - return - - cb = getattr(self, "_stream_callback", None) - if cb is None: - # Non-streaming path (default) + else: result["response"] = self.client.chat.completions.create(**api_kwargs) - return + except Exception as e: + result["error"] = e - # --- Streaming path for TTS pipeline --- + t = threading.Thread(target=_call, daemon=True) + t.start() + while t.is_alive(): + t.join(timeout=0.3) + if self._interrupt_requested: + # Force-close the HTTP connection to stop token generation + try: + self.client.close() + except Exception: + pass + # Rebuild the client for future calls (cheap, no network) + try: + self.client = OpenAI(**self._client_kwargs) + except Exception: + pass + raise InterruptedError("Agent interrupted during API call") + if result["error"] is not None: + raise result["error"] + return result["response"] + + def _streaming_api_call(self, api_kwargs: dict, stream_callback): + """Streaming variant of _interruptible_api_call for voice TTS pipeline. + + Uses ``stream=True`` and forwards content deltas to *stream_callback* + in real-time. Returns a ``SimpleNamespace`` that mimics a normal + ``ChatCompletion`` so the rest of the agent loop works unchanged. + + This method is separate from ``_interruptible_api_call`` to keep the + core agent loop untouched for non-voice users. + """ + result = {"response": None, "error": None} + + def _call(): + try: stream_kwargs = {**api_kwargs, "stream": True} stream = self.client.chat.completions.create(**stream_kwargs) content_parts: list[str] = [] - tool_calls_acc: dict[int, dict] = {} # index -> {id, type, function:{name, arguments}} + tool_calls_acc: dict[int, dict] = {} finish_reason = None model_name = None role = "assistant" for chunk in stream: if not chunk.choices: - # Usage-only or empty chunk if hasattr(chunk, "model") and chunk.model: model_name = chunk.model continue @@ -2635,24 +2657,17 @@ class AIAgent: if hasattr(chunk, "model") and chunk.model: model_name = chunk.model - # Content delta if delta and delta.content: content_parts.append(delta.content) try: - cb(delta.content) + stream_callback(delta.content) except Exception: pass - # Tool call deltas if delta and delta.tool_calls: for tc_delta in delta.tool_calls: idx = tc_delta.index if tc_delta.index is not None else 0 - # Gemini may reuse index 0 for multiple tool calls, - # sending a new id each time. Detect this and assign - # a fresh virtual index so calls don't merge. if idx in tool_calls_acc and tc_delta.id and tc_delta.id != tool_calls_acc[idx]["id"]: - # Look for existing entry with this id first - # (follow-up deltas for an already-created tool call) matched = False for eidx, eentry in tool_calls_acc.items(): if eentry["id"] == tc_delta.id: @@ -2679,7 +2694,6 @@ class AIAgent: if chunk.choices[0].finish_reason: finish_reason = chunk.choices[0].finish_reason - # Build a mock ChatCompletion matching the non-streaming interface full_content = "".join(content_parts) or None mock_tool_calls = None if tool_calls_acc: @@ -2722,7 +2736,6 @@ class AIAgent: while t.is_alive(): t.join(timeout=0.3) if self._interrupt_requested: - # Force-close the HTTP connection to stop token generation try: if self.api_mode == "anthropic_messages": self._anthropic_client.close() @@ -2730,7 +2743,6 @@ class AIAgent: self.client.close() except Exception: pass - # Rebuild the client for future calls (cheap, no network) try: if self.api_mode == "anthropic_messages": from agent.anthropic_adapter import build_anthropic_client @@ -4412,7 +4424,11 @@ class AIAgent: if os.getenv("HERMES_DUMP_REQUESTS", "").strip().lower() in {"1", "true", "yes", "on"}: self._dump_api_request_debug(api_kwargs, reason="preflight") - response = self._interruptible_api_call(api_kwargs) + cb = getattr(self, "_stream_callback", None) + if cb is not None: + response = self._streaming_api_call(api_kwargs, cb) + else: + response = self._interruptible_api_call(api_kwargs) api_duration = time.time() - api_start_time diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py index e6a46def..c9944368 100644 --- a/tests/tools/test_voice_mode.py +++ b/tests/tools/test_voice_mode.py @@ -41,16 +41,18 @@ def temp_voice_dir(tmp_path, monkeypatch): @pytest.fixture def mock_sd(monkeypatch): - """Replace tools.voice_mode.sd with a MagicMock (sounddevice may not be installed).""" + """Mock _import_audio to return (mock_sd, real_np) so lazy imports work.""" mock = MagicMock() - monkeypatch.setattr("tools.voice_mode.sd", mock) - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) - # Also ensure numpy is available (use real numpy if installed, else mock) try: import numpy as real_np - monkeypatch.setattr("tools.voice_mode.np", real_np) except ImportError: - monkeypatch.setattr("tools.voice_mode.np", MagicMock()) + real_np = MagicMock() + + def _fake_import_audio(): + return mock, real_np + + monkeypatch.setattr("tools.voice_mode._import_audio", _fake_import_audio) + monkeypatch.setattr("tools.voice_mode._audio_available", lambda: True) return mock @@ -60,7 +62,9 @@ def mock_sd(monkeypatch): class TestCheckVoiceRequirements: def test_all_requirements_met(self, monkeypatch): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) + monkeypatch.setattr("tools.voice_mode._audio_available", lambda: True) + monkeypatch.setattr("tools.voice_mode.detect_audio_environment", + lambda: {"available": True, "warnings": []}) monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key") from tools.voice_mode import check_voice_requirements @@ -72,7 +76,9 @@ class TestCheckVoiceRequirements: assert result["missing_packages"] == [] def test_missing_audio_packages(self, monkeypatch): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + monkeypatch.setattr("tools.voice_mode._audio_available", lambda: False) + monkeypatch.setattr("tools.voice_mode.detect_audio_environment", + lambda: {"available": False, "warnings": ["Audio libraries not installed"]}) monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key") from tools.voice_mode import check_voice_requirements @@ -84,7 +90,9 @@ class TestCheckVoiceRequirements: assert "numpy" in result["missing_packages"] def test_missing_stt_key(self, monkeypatch): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) + monkeypatch.setattr("tools.voice_mode._audio_available", lambda: True) + monkeypatch.setattr("tools.voice_mode.detect_audio_environment", + lambda: {"available": True, "warnings": []}) monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) monkeypatch.delenv("GROQ_API_KEY", raising=False) @@ -102,7 +110,9 @@ class TestCheckVoiceRequirements: class TestAudioRecorderStart: def test_start_raises_without_audio(self, monkeypatch): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + def _fail_import(): + raise ImportError("no sounddevice") + monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import) from tools.voice_mode import AudioRecorder @@ -334,21 +344,25 @@ class TestPlayAudioFile: def test_play_wav_via_sounddevice(self, monkeypatch, sample_wav): np = pytest.importorskip("numpy") - mock_sd = MagicMock() - monkeypatch.setattr("tools.voice_mode.sd", mock_sd) - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) - monkeypatch.setattr("tools.voice_mode.np", np) + mock_sd_obj = MagicMock() + + def _fake_import(): + return mock_sd_obj, np + + monkeypatch.setattr("tools.voice_mode._import_audio", _fake_import) from tools.voice_mode import play_audio_file result = play_audio_file(sample_wav) assert result is True - mock_sd.play.assert_called_once() - mock_sd.wait.assert_called_once() + mock_sd_obj.play.assert_called_once() + mock_sd_obj.wait.assert_called_once() def test_returns_false_when_no_player(self, monkeypatch, sample_wav): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + def _fail_import(): + raise ImportError("no sounddevice") + monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import) monkeypatch.setattr("shutil.which", lambda _: None) from tools.voice_mode import play_audio_file @@ -446,7 +460,9 @@ class TestPlayBeep: assert len(audio_arg) > single_beep_samples def test_beep_noop_without_audio(self, monkeypatch): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + def _fail_import(): + raise ImportError("no sounddevice") + monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import) from tools.voice_mode import play_beep @@ -607,3 +623,237 @@ class TestSilenceDetection: # No crash, no callback assert recorder._on_silence_stop is None recorder.cancel() + + +# ============================================================================ +# Playback interrupt +# ============================================================================ + +class TestPlaybackInterrupt: + """Verify that TTS playback can be interrupted.""" + + def test_stop_playback_terminates_process(self): + from tools.voice_mode import stop_playback, _playback_lock + import tools.voice_mode as vm + + mock_proc = MagicMock() + mock_proc.poll.return_value = None # process is running + + with _playback_lock: + vm._active_playback = mock_proc + + stop_playback() + + mock_proc.terminate.assert_called_once() + + with _playback_lock: + assert vm._active_playback is None + + def test_stop_playback_noop_when_nothing_playing(self): + import tools.voice_mode as vm + + with vm._playback_lock: + vm._active_playback = None + + vm.stop_playback() + + def test_play_audio_file_sets_active_playback(self, monkeypatch, sample_wav): + import tools.voice_mode as vm + + def _fail_import(): + raise ImportError("no sounddevice") + monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import) + + mock_proc = MagicMock() + mock_proc.wait.return_value = 0 + + mock_popen = MagicMock(return_value=mock_proc) + monkeypatch.setattr("subprocess.Popen", mock_popen) + monkeypatch.setattr("shutil.which", lambda cmd: "/usr/bin/" + cmd) + + vm.play_audio_file(sample_wav) + + assert mock_popen.called + with vm._playback_lock: + assert vm._active_playback is None + + +# ============================================================================ +# Continuous mode flow +# ============================================================================ + +class TestContinuousModeFlow: + """Verify continuous mode: auto-restart after transcription or silence.""" + + def test_continuous_restart_on_no_speech(self, mock_sd, temp_voice_dir): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + + # First recording: only silence -> stop returns None + recorder.start() + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + for _ in range(10): + silence = np.full((1600, 1), 10, dtype="int16") + callback(silence, 1600, None, None) + + wav_path = recorder.stop() + assert wav_path is None + + # Simulate continuous mode restart + recorder.start() + assert recorder.is_recording is True + + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + for _ in range(10): + speech = np.full((1600, 1), 5000, dtype="int16") + callback(speech, 1600, None, None) + + wav_path = recorder.stop() + assert wav_path is not None + + recorder.cancel() + + def test_recorder_reusable_after_stop(self, mock_sd, temp_voice_dir): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + results = [] + + for i in range(3): + recorder.start() + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + loud = np.full((1600, 1), 5000, dtype="int16") + for _ in range(10): + callback(loud, 1600, None, None) + wav_path = recorder.stop() + results.append(wav_path) + + assert all(r is not None for r in results) + assert os.path.isfile(results[-1]) + + +# ============================================================================ +# Audio level indicator +# ============================================================================ + +class TestAudioLevelIndicator: + """Verify current_rms property updates in real-time for UI feedback.""" + + def test_rms_updates_with_audio_chunks(self, mock_sd): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder.start() + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + assert recorder.current_rms == 0 + + loud = np.full((1600, 1), 5000, dtype="int16") + callback(loud, 1600, None, None) + assert recorder.current_rms == 5000 + + quiet = np.full((1600, 1), 100, dtype="int16") + callback(quiet, 1600, None, None) + assert recorder.current_rms == 100 + + recorder.cancel() + + def test_peak_rms_tracks_maximum(self, mock_sd): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder.start() + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + frames = [ + np.full((1600, 1), 100, dtype="int16"), + np.full((1600, 1), 8000, dtype="int16"), + np.full((1600, 1), 500, dtype="int16"), + np.full((1600, 1), 3000, dtype="int16"), + ] + for frame in frames: + callback(frame, 1600, None, None) + + assert recorder._peak_rms == 8000 + assert recorder.current_rms == 3000 + + recorder.cancel() + + +# ============================================================================ +# Configurable silence parameters +# ============================================================================ + +class TestConfigurableSilenceParams: + """Verify that silence detection params can be configured.""" + + def test_custom_threshold_and_duration(self, mock_sd): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + import threading + + recorder = AudioRecorder() + recorder._silence_threshold = 5000 + recorder._silence_duration = 0.05 + recorder._min_speech_duration = 0.05 + + fired = threading.Event() + recorder.start(on_silence_stop=lambda: fired.set()) + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + # Audio at RMS 1000 -- below custom threshold (5000) + moderate = np.full((1600, 1), 1000, dtype="int16") + for _ in range(5): + callback(moderate, 1600, None, None) + time.sleep(0.02) + + assert recorder._has_spoken is False + assert fired.wait(timeout=0.2) is False + + # Now send really loud audio (above 5000 threshold) + very_loud = np.full((1600, 1), 8000, dtype="int16") + callback(very_loud, 1600, None, None) + time.sleep(0.06) + callback(very_loud, 1600, None, None) + assert recorder._has_spoken is True + + recorder.cancel() diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 6c4e5378..1a1642e0 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -37,33 +37,29 @@ from typing import Callable, Dict, Any, Optional logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- -# Optional imports -- providers degrade gracefully if not installed +# Lazy imports -- providers are imported only when actually used to avoid +# crashing in headless environments (SSH, Docker, WSL, no PortAudio). # --------------------------------------------------------------------------- -try: + +def _import_edge_tts(): + """Lazy import edge_tts. Returns the module or raises ImportError.""" import edge_tts - _HAS_EDGE_TTS = True -except ImportError: - _HAS_EDGE_TTS = False + return edge_tts -try: +def _import_elevenlabs(): + """Lazy import ElevenLabs client. Returns the class or raises ImportError.""" from elevenlabs.client import ElevenLabs - _HAS_ELEVENLABS = True -except ImportError: - _HAS_ELEVENLABS = False + return ElevenLabs -# openai is a core dependency, but guard anyway -try: +def _import_openai_client(): + """Lazy import OpenAI client. Returns the class or raises ImportError.""" from openai import OpenAI as OpenAIClient - _HAS_OPENAI = True -except ImportError: - _HAS_OPENAI = False + return OpenAIClient -try: +def _import_sounddevice(): + """Lazy import sounddevice. Returns the module or raises ImportError/OSError.""" import sounddevice as sd - _HAS_AUDIO = True -except (ImportError, OSError): - sd = None # type: ignore[assignment] - _HAS_AUDIO = False + return sd # =========================================================================== @@ -202,6 +198,7 @@ def _generate_elevenlabs(text: str, output_path: str, tts_config: Dict[str, Any] else: output_format = "mp3_44100_128" + ElevenLabs = _import_elevenlabs() client = ElevenLabs(api_key=api_key) audio_generator = client.text_to_speech.convert( text=text, @@ -247,6 +244,7 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any] else: response_format = "mp3" + OpenAIClient = _import_openai_client() client = OpenAIClient(api_key=api_key, base_url="https://api.openai.com/v1") response = client.audio.speech.create( model=model, @@ -322,7 +320,9 @@ def text_to_speech_tool( try: # Generate audio with the configured provider if provider == "elevenlabs": - if not _HAS_ELEVENLABS: + try: + _import_elevenlabs() + except ImportError: return json.dumps({ "success": False, "error": "ElevenLabs provider selected but 'elevenlabs' package not installed. Run: pip install elevenlabs" @@ -331,7 +331,9 @@ def text_to_speech_tool( _generate_elevenlabs(text, file_str, tts_config) elif provider == "openai": - if not _HAS_OPENAI: + try: + _import_openai_client() + except ImportError: return json.dumps({ "success": False, "error": "OpenAI provider selected but 'openai' package not installed." @@ -341,7 +343,9 @@ def text_to_speech_tool( else: # Default: Edge TTS (free) - if not _HAS_EDGE_TTS: + try: + _import_edge_tts() + except ImportError: return json.dumps({ "success": False, "error": "Edge TTS not available. Run: pip install edge-tts" @@ -422,12 +426,23 @@ def check_tts_requirements() -> bool: Returns: bool: True if at least one provider can work. """ - if _HAS_EDGE_TTS: - return True - if _HAS_ELEVENLABS and os.getenv("ELEVENLABS_API_KEY"): - return True - if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): + try: + _import_edge_tts() return True + except ImportError: + pass + try: + _import_elevenlabs() + if os.getenv("ELEVENLABS_API_KEY"): + return True + except ImportError: + pass + try: + _import_openai_client() + if os.getenv("VOICE_TOOLS_OPENAI_KEY"): + return True + except ImportError: + pass return False @@ -500,20 +515,27 @@ def stream_tts_to_speaker( api_key = os.getenv("ELEVENLABS_API_KEY", "") if not api_key: logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled") - elif _HAS_ELEVENLABS: - client = ElevenLabs(api_key=api_key) + else: + try: + ElevenLabs = _import_elevenlabs() + client = ElevenLabs(api_key=api_key) + except ImportError: + logger.warning("elevenlabs package not installed; streaming TTS disabled") # Open a single sounddevice output stream for the lifetime of # this function. ElevenLabs pcm_24000 produces signed 16-bit # little-endian mono PCM at 24 kHz. - use_sd = _HAS_AUDIO and sd is not None - if use_sd: + if client is not None: try: + sd = _import_sounddevice() import numpy as _np output_stream = sd.OutputStream( samplerate=24000, channels=1, dtype="int16", ) output_stream.start() + except (ImportError, OSError) as exc: + logger.debug("sounddevice not available: %s", exc) + output_stream = None except Exception as exc: logger.warning("sounddevice OutputStream failed: %s", exc) output_stream = None @@ -666,12 +688,19 @@ if __name__ == "__main__": print("🔊 Text-to-Speech Tool Module") print("=" * 50) + def _check(importer, label): + try: + importer() + return True + except ImportError: + return False + print(f"\nProvider availability:") - print(f" Edge TTS: {'✅ installed' if _HAS_EDGE_TTS else '❌ not installed (pip install edge-tts)'}") - print(f" ElevenLabs: {'✅ installed' if _HAS_ELEVENLABS else '❌ not installed (pip install elevenlabs)'}") - print(f" API Key: {'✅ set' if os.getenv('ELEVENLABS_API_KEY') else '❌ not set'}") - print(f" OpenAI: {'✅ installed' if _HAS_OPENAI else '❌ not installed'}") - print(f" API Key: {'✅ set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else '❌ not set (VOICE_TOOLS_OPENAI_KEY)'}") + print(f" Edge TTS: {'installed' if _check(_import_edge_tts, 'edge') else 'not installed (pip install edge-tts)'}") + print(f" ElevenLabs: {'installed' if _check(_import_elevenlabs, 'el') else 'not installed (pip install elevenlabs)'}") + print(f" API Key: {'set' if os.getenv('ELEVENLABS_API_KEY') else 'not set'}") + print(f" OpenAI: {'installed' if _check(_import_openai_client, 'oai') else 'not installed'}") + print(f" API Key: {'set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else 'not set (VOICE_TOOLS_OPENAI_KEY)'}") print(f" ffmpeg: {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}") print(f"\n Output dir: {DEFAULT_OUTPUT_DIR}") diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 87b6cad6..27de0fc5 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -25,17 +25,69 @@ from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- -# Optional imports with graceful degradation +# Lazy audio imports -- never imported at module level to avoid crashing +# in headless environments (SSH, Docker, WSL, no PortAudio). # --------------------------------------------------------------------------- -try: + +def _import_audio(): + """Lazy-import sounddevice and numpy. Returns (sd, np). + + Raises ImportError or OSError if the libraries are not available + (e.g. PortAudio missing on headless servers). + """ import sounddevice as sd import numpy as np + return sd, np - _HAS_AUDIO = True -except (ImportError, OSError): - sd = None # type: ignore[assignment] - np = None # type: ignore[assignment] - _HAS_AUDIO = False + +def _audio_available() -> bool: + """Return True if audio libraries can be imported.""" + try: + _import_audio() + return True + except (ImportError, OSError): + return False + + +def detect_audio_environment() -> dict: + """Detect if the current environment supports audio I/O. + + Returns dict with 'available' (bool) and 'warnings' (list of strings). + """ + warnings = [] + + # SSH detection + if any(os.environ.get(v) for v in ('SSH_CLIENT', 'SSH_TTY', 'SSH_CONNECTION')): + warnings.append("Running over SSH -- no audio devices available") + + # Docker detection + if os.path.exists('/.dockerenv'): + warnings.append("Running inside Docker container -- no audio devices") + + # WSL detection + try: + with open('/proc/version', 'r') as f: + if 'microsoft' in f.read().lower(): + warnings.append("Running in WSL -- audio requires PulseAudio bridge to Windows") + except (FileNotFoundError, PermissionError, OSError): + pass + + # Check audio libraries + try: + sd, _ = _import_audio() + try: + devices = sd.query_devices() + if not devices: + warnings.append("No audio input/output devices detected") + except Exception: + warnings.append("Audio subsystem error (PortAudio cannot query devices)") + except (ImportError, OSError): + warnings.append("Audio libraries not installed (pip install sounddevice numpy)") + + return { + "available": len(warnings) == 0, + "warnings": warnings, + } # --------------------------------------------------------------------------- # Recording parameters @@ -65,7 +117,9 @@ def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> N duration: Duration of each beep in seconds. count: Number of beeps to play (with short gap between). """ - if not _HAS_AUDIO: + try: + sd, np = _import_audio() + except (ImportError, OSError): return try: gap = 0.06 # seconds between beeps @@ -161,12 +215,14 @@ class AudioRecorder: Raises ``RuntimeError`` if sounddevice/numpy are not installed or if a recording is already in progress. """ - if not _HAS_AUDIO: + try: + sd, np = _import_audio() + except (ImportError, OSError) as e: raise RuntimeError( "Voice mode requires sounddevice and numpy.\n" "Install with: pip install sounddevice numpy\n" "Or: pip install hermes-agent[voice]" - ) + ) from e with self._lock: if self._recording: @@ -269,6 +325,7 @@ class AudioRecorder: return None # Concatenate frames and write WAV + _, np = _import_audio() audio_data = np.concatenate(self._frames, axis=0) self._frames = [] @@ -434,11 +491,11 @@ def stop_playback() -> None: except Exception: pass # Also stop sounddevice playback if active - if _HAS_AUDIO: - try: - sd.stop() - except Exception: - pass + try: + sd, _ = _import_audio() + sd.stop() + except Exception: + pass def play_audio_file(file_path: str) -> bool: @@ -461,8 +518,9 @@ def play_audio_file(file_path: str) -> bool: return False # Try sounddevice for WAV files - if _HAS_AUDIO and file_path.endswith(".wav"): + if file_path.endswith(".wav"): try: + sd, np = _import_audio() with wave.open(file_path, "rb") as wf: frames = wf.readframes(wf.getnframes()) audio_data = np.frombuffer(frames, dtype=np.int16) @@ -471,6 +529,8 @@ def play_audio_file(file_path: str) -> bool: sd.play(audio_data, samplerate=sample_rate) sd.wait() return True + except (ImportError, OSError): + pass # audio libs not available, fall through to system players except Exception as e: logger.debug("sounddevice playback failed: %s", e) @@ -518,14 +578,18 @@ def check_voice_requirements() -> Dict[str, Any]: groq_key = bool(os.getenv("GROQ_API_KEY")) stt_key_set = openai_key or groq_key missing: List[str] = [] + has_audio = _audio_available() - if not _HAS_AUDIO: + if not has_audio: missing.extend(["sounddevice", "numpy"]) - available = _HAS_AUDIO and stt_key_set + # Environment detection + env_check = detect_audio_environment() + + available = has_audio and stt_key_set and env_check["available"] details_parts = [] - if _HAS_AUDIO: + if has_audio: details_parts.append("Audio capture: OK") else: details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)") @@ -537,12 +601,16 @@ def check_voice_requirements() -> Dict[str, Any]: else: details_parts.append("STT API key: MISSING (set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY)") + for warning in env_check["warnings"]: + details_parts.append(f"Environment: {warning}") + return { "available": available, - "audio_available": _HAS_AUDIO, + "audio_available": has_audio, "stt_key_set": stt_key_set, "missing_packages": missing, "details": "\n".join(details_parts), + "environment": env_check, }