fix: persistent audio stream and silence detection improvements

- Keep InputStream alive across recordings to avoid a CoreAudio hang on repeated open/close cycles on macOS. New _ensure_stream() creates the stream once; start()/stop()/cancel() only toggle frame collection.
- Add _close_stream_with_timeout(), which uses a daemon thread to prevent stream.stop()/close() from blocking indefinitely.
- Add a generation counter to detect stale stream-open completions after cancel or restart.
- Run recorder.cancel() in a background thread from the Ctrl+C handler to keep the event loop responsive.
- Add a shutdown() method, called on /voice off, to release audio resources.
- Fix silence timer reset during active speech: use dip tolerance for the _resume_start tracker so natural speech pauses (< 0.3s) don't prevent the silence timer from being reset.
- Update tests to match persistent stream behavior.
2026-03-10 20:37:17 +03:00
parent eec04d180a
commit eb79dda04b
4 changed files with 221 additions and 132 deletions
--- a/cli.py
+++ b/cli.py
@@ -3848,14 +3848,26 @@ class HermesCLI:

    def _disable_voice_mode(self):
        """Disable voice mode, cancel any active recording, and stop TTS."""
+        recorder = None
        with self._voice_lock:
            if self._voice_recording and self._voice_recorder:
                self._voice_recorder.cancel()
                self._voice_recording = False
+            recorder = self._voice_recorder
            self._voice_mode = False
            self._voice_tts = False
            self._voice_continuous = False

+        # Shut down the persistent audio stream in background
+        if recorder is not None:
+            def _bg_shutdown(rec=recorder):
+                try:
+                    rec.shutdown()
+                except Exception:
+                    pass
+            threading.Thread(target=_bg_shutdown, daemon=True).start()
+            self._voice_recorder = None
+
        # Stop any active TTS playback
        try:
            from tools.voice_mode import stop_playback
@@ -4799,15 +4811,24 @@ class HermesCLI:
            import time as _time
            now = _time.time()

-            # Cancel active voice recording
+            # Cancel active voice recording.
+            # Run cancel() in a background thread to prevent blocking the
+            # event loop if AudioRecorder._lock or CoreAudio takes time.
+            _should_cancel_voice = False
+            _recorder_ref = None
            with cli_ref._voice_lock:
                if cli_ref._voice_recording and cli_ref._voice_recorder:
-                    cli_ref._voice_recorder.cancel()
+                    _recorder_ref = cli_ref._voice_recorder
                    cli_ref._voice_recording = False
                    cli_ref._voice_continuous = False
-                    _cprint(f"\n{_DIM}Recording cancelled.{_RST}")
-                    event.app.invalidate()
-                    return
+                    _should_cancel_voice = True
+            if _should_cancel_voice:
+                _cprint(f"\n{_DIM}Recording cancelled.{_RST}")
+                threading.Thread(
+                    target=_recorder_ref.cancel, daemon=True
+                ).start()
+                event.app.invalidate()
+                return

            # Cancel sudo prompt
            if self._sudo_state:
--- a/tests/tools/test_voice_cli_integration.py
+++ b/tests/tools/test_voice_cli_integration.py
@@ -603,28 +603,14 @@ class TestDisableVoiceModeStopsTTS:

    def test_disable_voice_mode_calls_stop_playback(self):
        """Source check: _disable_voice_mode must call stop_playback()."""
-        with open("cli.py") as f:
-            source = f.read()
+        import inspect
+        from cli import HermesCLI

-        # Extract _disable_voice_mode method body
-        lines = source.split("\n")
-        in_method = False
-        method_lines = []
-        for line in lines:
-            if "def _disable_voice_mode" in line:
-                in_method = True
-            elif in_method:
-                if line.strip() and not line.startswith(" ") and not line.startswith("\t"):
-                    break
-                if line.strip().startswith("def "):
-                    break
-                method_lines.append(line)
-
-        method_body = "\n".join(method_lines)
-        assert "stop_playback" in method_body, (
+        source = inspect.getsource(HermesCLI._disable_voice_mode)
+        assert "stop_playback" in source, (
            "_disable_voice_mode must call stop_playback()"
        )
-        assert "_voice_tts_done.set()" in method_body, (
+        assert "_voice_tts_done.set()" in source, (
            "_disable_voice_mode must set _voice_tts_done"
        )

--- a/tests/tools/test_voice_mode.py
+++ b/tests/tools/test_voice_mode.py
@@ -235,8 +235,9 @@ class TestAudioRecorderCancel:

        assert recorder.is_recording is False
        assert recorder._frames == []
-        mock_stream.stop.assert_called_once()
-        mock_stream.close.assert_called_once()
+        # Stream is kept alive (persistent) — cancel() does NOT close it.
+        mock_stream.stop.assert_not_called()
+        mock_stream.close.assert_not_called()

    def test_cancel_when_not_recording_is_safe(self):
        from tools.voice_mode import AudioRecorder
--- a/tools/voice_mode.py
+++ b/tools/voice_mode.py
@@ -175,6 +175,9 @@ class AudioRecorder:
        self._frames: List[Any] = []
        self._recording = False
        self._start_time: float = 0.0
+        # Generation counter — incremented on each start/cancel/stop to
+        # detect stale stream-open completions after a cancel or restart.
+        self._generation: int = 0
        # Silence detection state
        self._has_spoken = False
        self._speech_start: float = 0.0  # When speech attempt began
@@ -182,6 +185,8 @@ class AudioRecorder:
        self._min_speech_duration: float = 0.3  # Seconds of speech needed to confirm
        self._max_dip_tolerance: float = 0.3  # Max dip duration before resetting speech
        self._silence_start: float = 0.0
+        self._resume_start: float = 0.0  # Tracks sustained speech after silence starts
+        self._resume_dip_start: float = 0.0  # Dip tolerance tracker for resume detection
        self._on_silence_stop = None
        self._silence_threshold: int = SILENCE_RMS_THRESHOLD
        self._silence_duration: float = SILENCE_DURATION_SECONDS
@@ -210,9 +215,137 @@ class AudioRecorder:

    # -- public methods ------------------------------------------------------

+    def _ensure_stream(self) -> None:
+        """Create the audio InputStream once and keep it alive.
+
+        The stream stays open for the lifetime of the recorder.  Between
+        recordings the callback simply discards audio chunks (``_recording``
+        is ``False``).  This avoids the CoreAudio bug where closing and
+        re-opening an ``InputStream`` hangs indefinitely on macOS.
+        """
+        if self._stream is not None:
+            return  # already alive
+
+        sd, np = _import_audio()
+
+        def _callback(indata, frames, time_info, status):  # noqa: ARG001
+            if status:
+                logger.debug("sounddevice status: %s", status)
+            # When not recording the stream is idle — discard audio.
+            if not self._recording:
+                return
+            self._frames.append(indata.copy())
+
+            # Compute RMS for level display and silence detection
+            rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
+            self._current_rms = rms
+            if rms > self._peak_rms:
+                self._peak_rms = rms
+
+            # Silence detection
+            if self._on_silence_stop is not None:
+                now = time.monotonic()
+                elapsed = now - self._start_time
+
+                if rms > self._silence_threshold:
+                    # Audio is above threshold -- this is speech (or noise).
+                    self._dip_start = 0.0  # Reset dip tracker
+                    if self._speech_start == 0.0:
+                        self._speech_start = now
+                    elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration:
+                        self._has_spoken = True
+                        logger.debug("Speech confirmed (%.2fs above threshold)",
+                                     now - self._speech_start)
+                    # After speech is confirmed, only reset silence timer if
+                    # speech is sustained (>0.3s above threshold).  Brief
+                    # spikes from ambient noise should NOT reset the timer.
+                    if not self._has_spoken:
+                        self._silence_start = 0.0
+                    else:
+                        # Track resumed speech with dip tolerance.
+                        # Brief dips below threshold are normal during speech,
+                        # so we mirror the initial speech detection pattern:
+                        # start tracking, tolerate short dips, confirm after 0.3s.
+                        self._resume_dip_start = 0.0  # Above threshold — no dip
+                        if self._resume_start == 0.0:
+                            self._resume_start = now
+                        elif now - self._resume_start >= self._min_speech_duration:
+                            self._silence_start = 0.0
+                            self._resume_start = 0.0
+                elif self._has_spoken:
+                    # Below threshold after speech confirmed.
+                    # Use dip tolerance before resetting resume tracker —
+                    # natural speech has brief dips below threshold.
+                    if self._resume_start > 0:
+                        if self._resume_dip_start == 0.0:
+                            self._resume_dip_start = now
+                        elif now - self._resume_dip_start >= self._max_dip_tolerance:
+                            # Sustained dip — user actually stopped speaking
+                            self._resume_start = 0.0
+                            self._resume_dip_start = 0.0
+                elif self._speech_start > 0:
+                    # We were in a speech attempt but RMS dipped.
+                    # Tolerate brief dips (micro-pauses between syllables).
+                    if self._dip_start == 0.0:
+                        self._dip_start = now
+                    elif now - self._dip_start >= self._max_dip_tolerance:
+                        # Dip lasted too long -- genuine silence, reset
+                        logger.debug("Speech attempt reset (dip lasted %.2fs)",
+                                     now - self._dip_start)
+                        self._speech_start = 0.0
+                        self._dip_start = 0.0
+
+                # Fire silence callback when:
+                # 1. User spoke then went silent for silence_duration, OR
+                # 2. No speech detected at all for max_wait seconds
+                should_fire = False
+                if self._has_spoken and rms <= self._silence_threshold:
+                    # User was speaking and now is silent
+                    if self._silence_start == 0.0:
+                        self._silence_start = now
+                    elif now - self._silence_start >= self._silence_duration:
+                        logger.info("Silence detected (%.1fs), auto-stopping",
+                                    self._silence_duration)
+                        should_fire = True
+                elif not self._has_spoken and elapsed >= self._max_wait:
+                    logger.info("No speech within %.0fs, auto-stopping",
+                                self._max_wait)
+                    should_fire = True
+
+                if should_fire:
+                    cb = self._on_silence_stop
+                    self._on_silence_stop = None  # fire only once
+                    if cb:
+                        def _safe_cb():
+                            try:
+                                cb()
+                            except Exception as e:
+                                logger.error("Silence callback failed: %s", e, exc_info=True)
+                        threading.Thread(target=_safe_cb, daemon=True).start()
+
+        # Create stream — may block on CoreAudio (first call only).
+        try:
+            stream = sd.InputStream(
+                samplerate=SAMPLE_RATE,
+                channels=CHANNELS,
+                dtype=DTYPE,
+                callback=_callback,
+            )
+            stream.start()
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to open audio input stream: {e}. "
+                "Check that a microphone is connected and accessible."
+            ) from e
+        self._stream = stream
+
    def start(self, on_silence_stop=None) -> None:
        """Start capturing audio from the default input device.

+        The underlying InputStream is created once and kept alive across
+        recordings.  Subsequent calls simply reset detection state and
+        toggle frame collection via ``_recording``.
+
        Args:
            on_silence_stop: Optional callback invoked (in a daemon thread) when
                silence is detected after speech. The callback receives no arguments.
@@ -222,7 +355,7 @@ class AudioRecorder:
        or if a recording is already in progress.
        """
        try:
-            sd, np = _import_audio()
+            _import_audio()
        except (ImportError, OSError) as e:
            raise RuntimeError(
                "Voice mode requires sounddevice and numpy.\n"
@@ -234,107 +367,54 @@ class AudioRecorder:
            if self._recording:
                return  # already recording

+            self._generation += 1
+
            self._frames = []
            self._start_time = time.monotonic()
            self._has_spoken = False
            self._speech_start = 0.0
            self._dip_start = 0.0
            self._silence_start = 0.0
+            self._resume_start = 0.0
+            self._resume_dip_start = 0.0
            self._peak_rms = 0
+            self._current_rms = 0
            self._on_silence_stop = on_silence_stop

-            def _callback(indata, frames, time_info, status):  # noqa: ARG001
-                if status:
-                    logger.debug("sounddevice status: %s", status)
-                self._frames.append(indata.copy())
+        # Ensure the persistent stream is alive (no-op after first call).
+        self._ensure_stream()

-                # Compute RMS for level display and silence detection
-                rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
-                self._current_rms = rms
-                if rms > self._peak_rms:
-                    self._peak_rms = rms
-
-                # Silence detection
-                if self._on_silence_stop is not None and self._recording:
-                    now = time.monotonic()
-
-                    if rms > self._silence_threshold:
-                        # Audio is above threshold -- this is speech (or noise).
-                        self._dip_start = 0.0  # Reset dip tracker
-                        if self._speech_start == 0.0:
-                            self._speech_start = now
-                        elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration:
-                            self._has_spoken = True
-                            logger.debug("Speech confirmed (%.2fs above threshold)",
-                                         now - self._speech_start)
-                        self._silence_start = 0.0
-                    elif self._has_spoken:
-                        # Speech already confirmed, let silence timer run below
-                        pass
-                    elif self._speech_start > 0:
-                        # We were in a speech attempt but RMS dipped.
-                        # Tolerate brief dips (micro-pauses between syllables).
-                        if self._dip_start == 0.0:
-                            self._dip_start = now
-                        elif now - self._dip_start >= self._max_dip_tolerance:
-                            # Dip lasted too long -- genuine silence, reset
-                            logger.debug("Speech attempt reset (dip lasted %.2fs)",
-                                         now - self._dip_start)
-                            self._speech_start = 0.0
-                            self._dip_start = 0.0
-                        # else: brief dip, keep tolerating
-                    # else: no speech attempt, just silence -- nothing to do
-
-                    # Fire silence callback when:
-                    # 1. User spoke then went silent for silence_duration, OR
-                    # 2. No speech detected at all for max_wait seconds
-                    should_fire = False
-                    if self._has_spoken and rms <= self._silence_threshold:
-                        # User was speaking and now is silent
-                        if self._silence_start == 0.0:
-                            self._silence_start = now
-                        elif now - self._silence_start >= self._silence_duration:
-                            logger.info("Silence detected (%.1fs), auto-stopping",
-                                        self._silence_duration)
-                            should_fire = True
-                    elif not self._has_spoken and now - self._start_time >= self._max_wait:
-                        # No speech detected within max_wait — stop to avoid
-                        # infinite recording in quiet environments.
-                        logger.info("No speech within %.0fs, auto-stopping",
-                                    self._max_wait)
-                        should_fire = True
-
-                    if should_fire:
-                        cb = self._on_silence_stop
-                        self._on_silence_stop = None  # fire only once
-                        if cb:
-                            def _safe_cb():
-                                try:
-                                    cb()
-                                except Exception as e:
-                                    logger.error("Silence callback failed: %s", e, exc_info=True)
-                            threading.Thread(target=_safe_cb, daemon=True).start()
-
-            try:
-                self._stream = sd.InputStream(
-                    samplerate=SAMPLE_RATE,
-                    channels=CHANNELS,
-                    dtype=DTYPE,
-                    callback=_callback,
-                )
-                self._stream.start()
-            except Exception as e:
-                self._stream = None
-                raise RuntimeError(
-                    f"Failed to open audio input stream: {e}. "
-                    "Check that a microphone is connected and accessible."
-                ) from e
+        with self._lock:
            self._recording = True
-            logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS)
+        logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS)
+
+    def _close_stream_with_timeout(self, timeout: float = 3.0) -> None:
+        """Close the audio stream with a timeout to prevent CoreAudio hangs."""
+        if self._stream is None:
+            return
+
+        stream = self._stream
+        self._stream = None
+
+        def _do_close():
+            try:
+                stream.stop()
+                stream.close()
+            except Exception:
+                pass
+
+        t = threading.Thread(target=_do_close, daemon=True)
+        t.start()
+        t.join(timeout=timeout)
+        if t.is_alive():
+            logger.warning("Audio stream close timed out after %.1fs — forcing ahead", timeout)

    def stop(self) -> Optional[str]:
        """Stop recording and write captured audio to a WAV file.

+        The underlying stream is kept alive for reuse — only frame
+        collection is stopped.
+
        Returns:
            Path to the WAV file, or ``None`` if no audio was captured.
        """
@@ -343,14 +423,9 @@ class AudioRecorder:
                return None

            self._recording = False
-
-            if self._stream is not None:
-                try:
-                    self._stream.stop()
-                    self._stream.close()
-                except Exception:
-                    pass
-                self._stream = None
+            self._generation += 1  # Invalidate any pending start()
+            self._current_rms = 0
+            # Stream stays alive — no close needed.

            if not self._frames:
                return None
@@ -379,20 +454,26 @@ class AudioRecorder:
            return self._write_wav(audio_data)

    def cancel(self) -> None:
-        """Stop recording and discard all captured audio."""
+        """Stop recording and discard all captured audio.
+
+        The underlying stream is kept alive for reuse.
+        """
+        with self._lock:
+            self._generation += 1  # Invalidate any pending start()
+            self._recording = False
+            self._frames = []
+            self._on_silence_stop = None
+            self._current_rms = 0
+        logger.info("Voice recording cancelled")
+
+    def shutdown(self) -> None:
+        """Release the audio stream.  Call when voice mode is disabled."""
        with self._lock:
            self._recording = False
            self._frames = []
-
-            if self._stream is not None:
-                try:
-                    self._stream.stop()
-                    self._stream.close()
-                except Exception:
-                    pass
-                self._stream = None
-
-            logger.info("Voice recording cancelled")
+            self._on_silence_stop = None
+            self._close_stream_with_timeout()
+        logger.info("AudioRecorder shut down")

    # -- private helpers -----------------------------------------------------