feat: add Phase 4 low-latency features for voice mode

- Audio cues: beep on record start (880Hz), double beep on stop (660Hz)
- Silence detection: auto-stop recording after 3s of silence (RMS-based)
- Continuous mode: auto-restart recording after agent responds
- Ctrl+R starts continuous mode, Ctrl+R during recording exits it
- Waits for TTS to finish before restarting to avoid recording speaker
- Tests: 7 new tests for beep generation and silence detection
2026-03-03 19:56:00 +03:00
parent a69bd55b5a
commit bfd9c97705
3 changed files with 283 additions and 6 deletions
--- a/cli.py
+++ b/cli.py
@@ -3539,10 +3539,27 @@ class HermesCLI:
        if self._voice_recorder is None:
            self._voice_recorder = AudioRecorder()

-        self._voice_recorder.start()
+        def _on_silence():
+            """Called by AudioRecorder when silence is detected after speech."""
+            with self._voice_lock:
+                if not self._voice_recording:
+                    return
+            _cprint(f"\n{_DIM}Silence detected, auto-stopping...{_RST}")
+            if hasattr(self, '_app') and self._app:
+                self._app.invalidate()
+            self._voice_stop_and_transcribe()
+
+        self._voice_recorder.start(on_silence_stop=_on_silence)
        with self._voice_lock:
            self._voice_recording = True
-        _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+R to stop, Ctrl+C to cancel){_RST}")
+
+        # Audio cue: single beep on recording start
+        try:
+            from tools.voice_mode import play_beep
+            threading.Thread(target=play_beep, kwargs={"frequency": 880, "count": 1}, daemon=True).start()
+        except Exception:
+            pass
+        _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}")

    def _voice_stop_and_transcribe(self):
        """Stop recording, transcribe via STT, and queue the transcript as input."""
@@ -3554,6 +3571,13 @@ class HermesCLI:
            with self._voice_lock:
                self._voice_recording = False

+            # Audio cue: double beep on recording stop
+            try:
+                from tools.voice_mode import play_beep
+                threading.Thread(target=play_beep, kwargs={"frequency": 660, "count": 2}, daemon=True).start()
+            except Exception:
+                pass
+
            if wav_path is None:
                _cprint(f"{_DIM}No speech detected (recording too short).{_RST}")
                return
@@ -3603,6 +3627,7 @@ class HermesCLI:
        """Speak the agent's response aloud using TTS (runs in background thread)."""
        if not self._voice_tts:
            return
+        self._voice_tts_done.clear()
        try:
            from tools.tts_tool import text_to_speech_tool
            from tools.voice_mode import play_audio_file
@@ -3649,6 +3674,8 @@ class HermesCLI:
        except Exception as e:
            logger.warning("Voice TTS playback failed: %s", e)
            _cprint(f"{_DIM}TTS playback failed: {e}{_RST}")
+        finally:
+            self._voice_tts_done.set()

    def _handle_voice_command(self, command: str):
        """Handle /voice [on|off|tts|status] command."""
@@ -3714,6 +3741,7 @@ class HermesCLI:
                self._voice_recording = False
            self._voice_mode = False
            self._voice_tts = False
+            self._voice_continuous = False
        _cprint(f"\n{_DIM}Voice mode disabled.{_RST}")

    def _toggle_voice_tts(self):
@@ -4331,6 +4359,9 @@ class HermesCLI:
        self._voice_recorder = None     # AudioRecorder instance (lazy init)
        self._voice_recording = False   # Whether currently recording
        self._voice_processing = False  # Whether STT is in progress
+        self._voice_continuous = False  # Whether to auto-restart after agent responds
+        self._voice_tts_done = threading.Event()  # Signals TTS playback finished
+        self._voice_tts_done.set()  # Initially "done" (no TTS pending)

        # Register callbacks so terminal_tool prompts route through our UI
        set_sudo_password_callback(self._sudo_password_callback)
@@ -4650,7 +4681,10 @@ class HermesCLI:
            if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state:
                return
            if cli_ref._voice_recording:
-                cli_ref._voice_recording = False
+                # Manual stop via Ctrl+R: stop continuous mode
+                with cli_ref._voice_lock:
+                    cli_ref._voice_continuous = False
+                    cli_ref._voice_recording = False
                event.app.invalidate()
                threading.Thread(
                    target=cli_ref._voice_stop_and_transcribe,
@@ -4658,6 +4692,8 @@ class HermesCLI:
                ).start()
            else:
                try:
+                    with cli_ref._voice_lock:
+                        cli_ref._voice_continuous = True
                    cli_ref._voice_start_recording()
                    event.app.invalidate()
                except Exception as e:
@@ -5267,13 +5303,25 @@ class HermesCLI:
                    # Regular chat - run agent
                    self._agent_running = True
                    app.invalidate()  # Refresh status line
-                    
+
                    try:
                        self.chat(user_input, images=submit_images or None)
                    finally:
                        self._agent_running = False
                        self._spinner_text = ""
                        app.invalidate()  # Refresh status line
+
+                        # Continuous voice: auto-restart recording after agent responds
+                        if self._voice_mode and self._voice_continuous and not self._voice_recording:
+                            try:
+                                # Wait for TTS to finish so we don't record the speaker
+                                if self._voice_tts:
+                                    self._voice_tts_done.wait(timeout=60)
+                                    time.sleep(0.3)  # Brief pause after TTS ends
+                                self._voice_start_recording()
+                                app.invalidate()
+                            except Exception as e:
+                                _cprint(f"{_DIM}Voice auto-restart failed: {e}{_RST}")
                    
                except Exception as e:
                    print(f"Error: {e}")
--- a/tests/tools/test_voice_mode.py
+++ b/tests/tools/test_voice_mode.py
@@ -346,3 +346,154 @@ class TestCleanupTempRecordings:
        deleted = cleanup_temp_recordings(max_age_seconds=3600)
        assert deleted == 0
        assert other_file.exists()
+
+
+# ============================================================================
+# play_beep
+# ============================================================================
+
+class TestPlayBeep:
+    def test_beep_calls_sounddevice_play(self, mock_sd):
+        np = pytest.importorskip("numpy")
+
+        from tools.voice_mode import play_beep
+
+        play_beep(frequency=880, duration=0.1, count=1)
+
+        mock_sd.play.assert_called_once()
+        mock_sd.wait.assert_called_once()
+        # Verify audio data is int16 numpy array
+        audio_arg = mock_sd.play.call_args[0][0]
+        assert audio_arg.dtype == np.int16
+        assert len(audio_arg) > 0
+
+    def test_beep_double_produces_longer_audio(self, mock_sd):
+        np = pytest.importorskip("numpy")
+
+        from tools.voice_mode import play_beep
+
+        play_beep(frequency=660, duration=0.1, count=2)
+
+        audio_arg = mock_sd.play.call_args[0][0]
+        single_beep_samples = int(16000 * 0.1)
+        # Double beep should be longer than a single beep
+        assert len(audio_arg) > single_beep_samples
+
+    def test_beep_noop_without_audio(self, monkeypatch):
+        monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False)
+
+        from tools.voice_mode import play_beep
+
+        # Should not raise
+        play_beep()
+
+    def test_beep_handles_playback_error(self, mock_sd):
+        mock_sd.play.side_effect = Exception("device error")
+
+        from tools.voice_mode import play_beep
+
+        # Should not raise
+        play_beep()
+
+
+# ============================================================================
+# Silence detection
+# ============================================================================
+
+class TestSilenceDetection:
+    def test_silence_callback_fires_after_speech_then_silence(self, mock_sd):
+        np = pytest.importorskip("numpy")
+        import threading
+
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder, SAMPLE_RATE
+
+        recorder = AudioRecorder()
+        # Use very short silence duration for testing
+        recorder._silence_duration = 0.05
+
+        fired = threading.Event()
+
+        def on_silence():
+            fired.set()
+
+        recorder.start(on_silence_stop=on_silence)
+
+        # Get the callback function from InputStream constructor
+        callback = mock_sd.InputStream.call_args.kwargs.get("callback")
+        if callback is None:
+            callback = mock_sd.InputStream.call_args[1]["callback"]
+
+        # Simulate loud audio (speech) -- RMS well above threshold
+        loud_frame = np.full((1600, 1), 5000, dtype="int16")
+        callback(loud_frame, 1600, None, None)
+        assert recorder._has_spoken is True
+
+        # Simulate silence
+        silent_frame = np.zeros((1600, 1), dtype="int16")
+        callback(silent_frame, 1600, None, None)
+
+        # Wait a bit past the silence duration, then send another silent frame
+        time.sleep(0.06)
+        callback(silent_frame, 1600, None, None)
+
+        # The callback should have been fired
+        assert fired.wait(timeout=1.0) is True
+
+        recorder.cancel()
+
+    def test_silence_without_speech_does_not_fire(self, mock_sd):
+        np = pytest.importorskip("numpy")
+        import threading
+
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        recorder._silence_duration = 0.02
+
+        fired = threading.Event()
+        recorder.start(on_silence_stop=lambda: fired.set())
+
+        callback = mock_sd.InputStream.call_args.kwargs.get("callback")
+        if callback is None:
+            callback = mock_sd.InputStream.call_args[1]["callback"]
+
+        # Only silence -- no speech detected, so callback should NOT fire
+        silent_frame = np.zeros((1600, 1), dtype="int16")
+        for _ in range(5):
+            callback(silent_frame, 1600, None, None)
+            time.sleep(0.01)
+
+        assert fired.wait(timeout=0.2) is False
+
+        recorder.cancel()
+
+    def test_no_callback_means_no_silence_detection(self, mock_sd):
+        np = pytest.importorskip("numpy")
+
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        recorder.start()  # no on_silence_stop
+
+        callback = mock_sd.InputStream.call_args.kwargs.get("callback")
+        if callback is None:
+            callback = mock_sd.InputStream.call_args[1]["callback"]
+
+        # Even with speech then silence, nothing should happen
+        loud_frame = np.full((1600, 1), 5000, dtype="int16")
+        silent_frame = np.zeros((1600, 1), dtype="int16")
+        callback(loud_frame, 1600, None, None)
+        callback(silent_frame, 1600, None, None)
+
+        # No crash, no callback
+        assert recorder._on_silence_stop is None
+        recorder.cancel()
--- a/tools/voice_mode.py
+++ b/tools/voice_mode.py
@@ -45,10 +45,51 @@ DTYPE = "int16"  # 16-bit PCM
 SAMPLE_WIDTH = 2  # bytes per sample (int16)
 MAX_RECORDING_SECONDS = 120  # Safety cap

+# Silence detection defaults
+SILENCE_RMS_THRESHOLD = 200  # RMS below this = silence (int16 range 0-32767)
+SILENCE_DURATION_SECONDS = 3.0  # Seconds of continuous silence before auto-stop
+
 # Temp directory for voice recordings
 _TEMP_DIR = os.path.join(tempfile.gettempdir(), "hermes_voice")


+# ============================================================================
+# Audio cues (beep tones)
+# ============================================================================
+def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> None:
+    """Play a short beep tone using numpy + sounddevice.
+
+    Args:
+        frequency: Tone frequency in Hz (default 880 = A5).
+        duration: Duration of each beep in seconds.
+        count: Number of beeps to play (with short gap between).
+    """
+    if not _HAS_AUDIO:
+        return
+    try:
+        gap = 0.06  # seconds between beeps
+        samples_per_beep = int(SAMPLE_RATE * duration)
+        samples_per_gap = int(SAMPLE_RATE * gap)
+
+        parts = []
+        for i in range(count):
+            t = np.linspace(0, duration, samples_per_beep, endpoint=False)
+            # Apply fade in/out to avoid click artifacts
+            tone = np.sin(2 * np.pi * frequency * t)
+            fade_len = min(int(SAMPLE_RATE * 0.01), samples_per_beep // 4)
+            tone[:fade_len] *= np.linspace(0, 1, fade_len)
+            tone[-fade_len:] *= np.linspace(1, 0, fade_len)
+            parts.append((tone * 0.3 * 32767).astype(np.int16))
+            if i < count - 1:
+                parts.append(np.zeros(samples_per_gap, dtype=np.int16))
+
+        audio = np.concatenate(parts)
+        sd.play(audio, samplerate=SAMPLE_RATE)
+        sd.wait()
+    except Exception as e:
+        logger.debug("Beep playback failed: %s", e)
+
+
 # ============================================================================
 # AudioRecorder
 # ============================================================================
@@ -58,11 +99,14 @@ class AudioRecorder:
    Usage::

        recorder = AudioRecorder()
-        recorder.start()
+        recorder.start(on_silence_stop=my_callback)
        # ... user speaks ...
        wav_path = recorder.stop()   # returns path to WAV file
        # or
        recorder.cancel()            # discard without saving
+
+    If ``on_silence_stop`` is provided, it is called once after the user has spoken
+    and then stayed silent for ``_silence_duration`` seconds; the callback stops the recording.
    """

    def __init__(self) -> None:
@@ -71,6 +115,12 @@ class AudioRecorder:
        self._frames: List[Any] = []
        self._recording = False
        self._start_time: float = 0.0
+        # Silence detection state
+        self._has_spoken = False
+        self._silence_start: float = 0.0
+        self._on_silence_stop = None
+        self._silence_threshold: int = SILENCE_RMS_THRESHOLD
+        self._silence_duration: float = SILENCE_DURATION_SECONDS

    # -- public properties ---------------------------------------------------

@@ -86,9 +136,14 @@ class AudioRecorder:

    # -- public methods ------------------------------------------------------

-    def start(self) -> None:
+    def start(self, on_silence_stop=None) -> None:
        """Start capturing audio from the default input device.

+        Args:
+            on_silence_stop: Optional callback invoked (in a daemon thread) when
+                silence is detected after speech. The callback receives no arguments.
+                Use this to auto-stop recording and trigger transcription.
+
        Raises ``RuntimeError`` if sounddevice/numpy are not installed
        or if a recording is already in progress.
        """
@@ -105,12 +160,35 @@ class AudioRecorder:

            self._frames = []
            self._start_time = time.monotonic()
+            self._has_spoken = False
+            self._silence_start = 0.0
+            self._on_silence_stop = on_silence_stop

            def _callback(indata, frames, time_info, status):  # noqa: ARG001
                if status:
                    logger.debug("sounddevice status: %s", status)
                self._frames.append(indata.copy())

+                # Silence detection: compute RMS of this chunk
+                if self._on_silence_stop is not None and self._recording:
+                    rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
+                    now = time.monotonic()
+
+                    if rms > self._silence_threshold:
+                        self._has_spoken = True
+                        self._silence_start = 0.0
+                    elif self._has_spoken:
+                        # User was speaking and now is silent
+                        if self._silence_start == 0.0:
+                            self._silence_start = now
+                        elif now - self._silence_start >= self._silence_duration:
+                            logger.info("Silence detected (%.1fs), auto-stopping",
+                                        self._silence_duration)
+                            cb = self._on_silence_stop
+                            self._on_silence_stop = None  # fire only once
+                            if cb:
+                                threading.Thread(target=cb, daemon=True).start()
+
            self._stream = sd.InputStream(
                samplerate=SAMPLE_RATE,
                channels=CHANNELS,