diff --git a/cli.py b/cli.py index 3b3032c40..9fb613c85 100755 --- a/cli.py +++ b/cli.py @@ -3539,10 +3539,27 @@ class HermesCLI: if self._voice_recorder is None: self._voice_recorder = AudioRecorder() - self._voice_recorder.start() + def _on_silence(): + """Called by AudioRecorder when silence is detected after speech.""" + with self._voice_lock: + if not self._voice_recording: + return + _cprint(f"\n{_DIM}Silence detected, auto-stopping...{_RST}") + if hasattr(self, '_app') and self._app: + self._app.invalidate() + self._voice_stop_and_transcribe() + + self._voice_recorder.start(on_silence_stop=_on_silence) with self._voice_lock: self._voice_recording = True - _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+R to stop, Ctrl+C to cancel){_RST}") + + # Audio cue: single beep on recording start + try: + from tools.voice_mode import play_beep + threading.Thread(target=play_beep, kwargs={"frequency": 880, "count": 1}, daemon=True).start() + except Exception: + pass + _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}") def _voice_stop_and_transcribe(self): """Stop recording, transcribe via STT, and queue the transcript as input.""" @@ -3554,6 +3571,13 @@ class HermesCLI: with self._voice_lock: self._voice_recording = False + # Audio cue: double beep on recording stop + try: + from tools.voice_mode import play_beep + threading.Thread(target=play_beep, kwargs={"frequency": 660, "count": 2}, daemon=True).start() + except Exception: + pass + if wav_path is None: _cprint(f"{_DIM}No speech detected (recording too short).{_RST}") return @@ -3603,6 +3627,7 @@ class HermesCLI: """Speak the agent's response aloud using TTS (runs in background thread).""" if not self._voice_tts: return + self._voice_tts_done.clear() try: from tools.tts_tool import text_to_speech_tool from tools.voice_mode import play_audio_file @@ -3649,6 +3674,8 @@ class HermesCLI: except Exception as e: logger.warning("Voice TTS playback failed: 
%s", e) _cprint(f"{_DIM}TTS playback failed: {e}{_RST}") + finally: + self._voice_tts_done.set() def _handle_voice_command(self, command: str): """Handle /voice [on|off|tts|status] command.""" @@ -3714,6 +3741,7 @@ class HermesCLI: self._voice_recording = False self._voice_mode = False self._voice_tts = False + self._voice_continuous = False _cprint(f"\n{_DIM}Voice mode disabled.{_RST}") def _toggle_voice_tts(self): @@ -4331,6 +4359,9 @@ class HermesCLI: self._voice_recorder = None # AudioRecorder instance (lazy init) self._voice_recording = False # Whether currently recording self._voice_processing = False # Whether STT is in progress + self._voice_continuous = False # Whether to auto-restart after agent responds + self._voice_tts_done = threading.Event() # Signals TTS playback finished + self._voice_tts_done.set() # Initially "done" (no TTS pending) # Register callbacks so terminal_tool prompts route through our UI set_sudo_password_callback(self._sudo_password_callback) @@ -4650,7 +4681,10 @@ class HermesCLI: if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state: return if cli_ref._voice_recording: - cli_ref._voice_recording = False + # Manual stop via Ctrl+R: stop continuous mode + with cli_ref._voice_lock: + cli_ref._voice_continuous = False + cli_ref._voice_recording = False event.app.invalidate() threading.Thread( target=cli_ref._voice_stop_and_transcribe, @@ -4658,6 +4692,8 @@ class HermesCLI: ).start() else: try: + with cli_ref._voice_lock: + cli_ref._voice_continuous = True cli_ref._voice_start_recording() event.app.invalidate() except Exception as e: @@ -5267,13 +5303,25 @@ class HermesCLI: # Regular chat - run agent self._agent_running = True app.invalidate() # Refresh status line - + try: self.chat(user_input, images=submit_images or None) finally: self._agent_running = False self._spinner_text = "" app.invalidate() # Refresh status line + + # Continuous voice: auto-restart recording after agent responds + if self._voice_mode and 
class TestPlayBeep:
    """Tests for ``tools.voice_mode.play_beep``.

    ``mock_sd`` is a fixture defined elsewhere in this test module; these
    tests only rely on it exposing ``play`` and ``wait`` as mocks
    (presumably it patches the ``sounddevice`` handle used by
    ``tools.voice_mode`` -- confirm against the fixture).
    """

    def test_beep_calls_sounddevice_play(self, mock_sd):
        """A single beep is played once, waited on once, as int16 samples."""
        np = pytest.importorskip("numpy")

        from tools.voice_mode import play_beep

        play_beep(frequency=880, duration=0.1, count=1)

        mock_sd.play.assert_called_once()
        mock_sd.wait.assert_called_once()
        # The first positional argument to sd.play is the sample buffer.
        audio_arg = mock_sd.play.call_args[0][0]
        assert audio_arg.dtype == np.int16
        assert len(audio_arg) > 0

    def test_beep_double_produces_longer_audio(self, mock_sd):
        """Two beeps yield at least two beeps' worth of samples."""
        pytest.importorskip("numpy")

        from tools.voice_mode import SAMPLE_RATE, play_beep

        duration = 0.1
        play_beep(frequency=660, duration=duration, count=2)

        audio_arg = mock_sd.play.call_args[0][0]
        # Derive the expectation from the module constant instead of
        # repeating the sample rate as a magic number.
        single_beep_samples = int(SAMPLE_RATE * duration)
        # Two tones (plus whatever gap separates them) must be present.
        assert len(audio_arg) >= 2 * single_beep_samples

    def test_beep_noop_without_audio(self, monkeypatch):
        """With audio support flagged off, play_beep returns without raising."""
        monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False)

        from tools.voice_mode import play_beep

        # Should not raise
        play_beep()

    def test_beep_handles_playback_error(self, mock_sd):
        """A failure inside sd.play is swallowed rather than propagated."""
        mock_sd.play.side_effect = Exception("device error")

        from tools.voice_mode import play_beep

        # Should not raise
        play_beep()
class TestSilenceDetection:
    """Tests for the silence auto-stop logic in ``AudioRecorder``.

    Each test starts a recorder against the mocked ``sounddevice`` module,
    pulls out the stream callback the recorder registered, and feeds it
    synthetic int16 frames directly.
    """

    @staticmethod
    def _stream_callback(mock_sd):
        """Return the ``callback`` keyword passed to ``sd.InputStream``.

        ``call_args.kwargs`` and ``call_args[1]`` are the same mapping, so a
        single lookup is enough; a missing key fails the test loudly.
        """
        return mock_sd.InputStream.call_args.kwargs["callback"]

    def test_silence_callback_fires_after_speech_then_silence(self, mock_sd):
        """Speech followed by enough silence triggers ``on_silence_stop``."""
        np = pytest.importorskip("numpy")
        import threading
        # Imported locally (like threading) so the test does not depend on a
        # module-level import that is not visible from this hunk.
        import time

        mock_stream = MagicMock()
        mock_sd.InputStream.return_value = mock_stream

        from tools.voice_mode import AudioRecorder

        recorder = AudioRecorder()
        # Use very short silence duration for testing
        recorder._silence_duration = 0.05

        fired = threading.Event()
        recorder.start(on_silence_stop=fired.set)

        callback = self._stream_callback(mock_sd)

        # Simulate loud audio (speech) -- RMS well above threshold
        loud_frame = np.full((1600, 1), 5000, dtype="int16")
        callback(loud_frame, 1600, None, None)
        assert recorder._has_spoken is True

        # First silent frame starts the silence timer.
        silent_frame = np.zeros((1600, 1), dtype="int16")
        callback(silent_frame, 1600, None, None)

        # Wait a bit past the silence duration, then send another silent frame
        time.sleep(0.06)
        callback(silent_frame, 1600, None, None)

        # The recorder fires the callback on a separate thread, so wait for it.
        assert fired.wait(timeout=1.0) is True

        recorder.cancel()

    def test_silence_without_speech_does_not_fire(self, mock_sd):
        """Silence alone (no prior speech) never triggers the callback."""
        np = pytest.importorskip("numpy")
        import threading
        import time

        mock_stream = MagicMock()
        mock_sd.InputStream.return_value = mock_stream

        from tools.voice_mode import AudioRecorder

        recorder = AudioRecorder()
        recorder._silence_duration = 0.02

        fired = threading.Event()
        recorder.start(on_silence_stop=fired.set)

        callback = self._stream_callback(mock_sd)

        # Only silence -- no speech detected, so callback should NOT fire
        silent_frame = np.zeros((1600, 1), dtype="int16")
        for _ in range(5):
            callback(silent_frame, 1600, None, None)
            time.sleep(0.01)

        assert fired.wait(timeout=0.2) is False

        recorder.cancel()

    def test_no_callback_means_no_silence_detection(self, mock_sd):
        """Without ``on_silence_stop`` the stream callback just records."""
        np = pytest.importorskip("numpy")

        mock_stream = MagicMock()
        mock_sd.InputStream.return_value = mock_stream

        from tools.voice_mode import AudioRecorder

        recorder = AudioRecorder()
        recorder.start()  # no on_silence_stop

        callback = self._stream_callback(mock_sd)

        # Even with speech then silence, nothing should happen
        loud_frame = np.full((1600, 1), 5000, dtype="int16")
        silent_frame = np.zeros((1600, 1), dtype="int16")
        callback(loud_frame, 1600, None, None)
        callback(silent_frame, 1600, None, None)

        # No crash, no callback
        assert recorder._on_silence_stop is None
        recorder.cancel()
def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> None:
    """Play a short sine-wave beep (or several) via numpy + sounddevice.

    This is a best-effort audio cue: it returns silently when audio support
    is unavailable (``_HAS_AUDIO`` is false) and logs, rather than raises,
    any playback failure. The call blocks until playback has finished
    (``sd.wait()``).

    Args:
        frequency: Tone frequency in Hz (default 880 = A5).
        duration: Duration of each beep in seconds.
        count: Number of beeps to play, separated by a short silent gap.
            Values below 1 play nothing.
    """
    if not _HAS_AUDIO:
        return
    try:
        gap = 0.06  # seconds of silence between consecutive beeps
        samples_per_beep = int(SAMPLE_RATE * duration)
        samples_per_gap = int(SAMPLE_RATE * gap)

        # Nothing audible to produce; bail out before np.concatenate is
        # handed an empty list (which raises).
        if count < 1 or samples_per_beep < 1:
            return

        # Every beep is identical, so synthesize the tone once.
        t = np.linspace(0, duration, samples_per_beep, endpoint=False)
        tone = np.sin(2 * np.pi * frequency * t)

        # Apply fade in/out to avoid click artifacts. For very short beeps
        # fade_len can be 0; tone[-0:] would then select the whole array and
        # the multiply by an empty ramp would fail, so skip the fade.
        fade_len = min(int(SAMPLE_RATE * 0.01), samples_per_beep // 4)
        if fade_len > 0:
            tone[:fade_len] *= np.linspace(0, 1, fade_len)
            tone[-fade_len:] *= np.linspace(1, 0, fade_len)

        # Scale to 30% of int16 full scale.
        beep = (tone * 0.3 * 32767).astype(np.int16)
        silence = np.zeros(samples_per_gap, dtype=np.int16)

        # beep, (gap, beep), (gap, beep), ...
        parts = [beep]
        for _ in range(count - 1):
            parts.extend((silence, beep))

        sd.play(np.concatenate(parts), samplerate=SAMPLE_RATE)
        sd.wait()
    except Exception as e:
        # Deliberately best-effort: a missing/busy output device must never
        # break the caller, so log at debug level and carry on.
        logger.debug("Beep playback failed: %s", e)
""" def __init__(self) -> None: @@ -71,6 +115,12 @@ class AudioRecorder: self._frames: List[Any] = [] self._recording = False self._start_time: float = 0.0 + # Silence detection state + self._has_spoken = False + self._silence_start: float = 0.0 + self._on_silence_stop = None + self._silence_threshold: int = SILENCE_RMS_THRESHOLD + self._silence_duration: float = SILENCE_DURATION_SECONDS # -- public properties --------------------------------------------------- @@ -86,9 +136,14 @@ class AudioRecorder: # -- public methods ------------------------------------------------------ - def start(self) -> None: + def start(self, on_silence_stop=None) -> None: """Start capturing audio from the default input device. + Args: + on_silence_stop: Optional callback invoked (in a daemon thread) when + silence is detected after speech. The callback receives no arguments. + Use this to auto-stop recording and trigger transcription. + Raises ``RuntimeError`` if sounddevice/numpy are not installed or if a recording is already in progress. 
""" @@ -105,12 +160,35 @@ class AudioRecorder: self._frames = [] self._start_time = time.monotonic() + self._has_spoken = False + self._silence_start = 0.0 + self._on_silence_stop = on_silence_stop def _callback(indata, frames, time_info, status): # noqa: ARG001 if status: logger.debug("sounddevice status: %s", status) self._frames.append(indata.copy()) + # Silence detection: compute RMS of this chunk + if self._on_silence_stop is not None and self._recording: + rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2))) + now = time.monotonic() + + if rms > self._silence_threshold: + self._has_spoken = True + self._silence_start = 0.0 + elif self._has_spoken: + # User was speaking and now is silent + if self._silence_start == 0.0: + self._silence_start = now + elif now - self._silence_start >= self._silence_duration: + logger.info("Silence detected (%.1fs), auto-stopping", + self._silence_duration) + cb = self._on_silence_stop + self._on_silence_stop = None # fire only once + if cb: + threading.Thread(target=cb, daemon=True).start() + self._stream = sd.InputStream( samplerate=SAMPLE_RATE, channels=CHANNELS,