feat: add silence filter, hallucination guard, and continuous mode control

- Skip silent recordings before STT call (RMS check in AudioRecorder.stop) - Filter known Whisper hallucinations ("Thank you.", "Bye." etc.) - Continuous mode: Ctrl+R starts loop, Ctrl+R during recording exits it - Wait for TTS to finish before auto-restart to avoid recording speaker - Silence timeout increased to 3s for natural pauses - Tests: hallucination filter, silent recording skip, real speech passthrough
2026-03-03 19:58:38 +03:00
parent bfd9c97705
commit 32b033c11c
2 changed files with 111 additions and 3 deletions
--- a/tests/tools/test_voice_mode.py
+++ b/tests/tools/test_voice_mode.py
@@ -154,8 +154,8 @@ class TestAudioRecorderStop:
        recorder = AudioRecorder()
        recorder.start()

-        # Simulate captured audio frames (1 second of silence)
-        frame = np.zeros((SAMPLE_RATE, 1), dtype="int16")
+        # Simulate captured audio frames (1 second of loud audio above RMS threshold)
+        frame = np.full((SAMPLE_RATE, 1), 1000, dtype="int16")
        recorder._frames = [frame]

        wav_path = recorder.stop()
@@ -189,6 +189,24 @@ class TestAudioRecorderStop:
        wav_path = recorder.stop()
        assert wav_path is None

+    def test_stop_returns_none_for_silent_recording(self, mock_sd, temp_voice_dir):
+        np = pytest.importorskip("numpy")
+
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder, SAMPLE_RATE
+
+        recorder = AudioRecorder()
+        recorder.start()
+
+        # 1 second of near-silence (RMS well below threshold)
+        frame = np.full((SAMPLE_RATE, 1), 10, dtype="int16")
+        recorder._frames = [frame]
+
+        wav_path = recorder.stop()
+        assert wav_path is None
+

 class TestAudioRecorderCancel:
    def test_cancel_discards_frames(self, mock_sd):
@@ -259,6 +277,52 @@ class TestTranscribeRecording:
        assert result["transcript"] == "hello world"
        mock_transcribe.assert_called_once_with("/tmp/test.wav", model="whisper-1")

+    def test_filters_whisper_hallucination(self):
+        mock_transcribe = MagicMock(return_value={
+            "success": True,
+            "transcript": "Thank you.",
+        })
+
+        with patch("tools.transcription_tools.transcribe_audio", mock_transcribe):
+            from tools.voice_mode import transcribe_recording
+            result = transcribe_recording("/tmp/test.wav")
+
+        assert result["success"] is True
+        assert result["transcript"] == ""
+        assert result["filtered"] is True
+
+    def test_does_not_filter_real_speech(self):
+        mock_transcribe = MagicMock(return_value={
+            "success": True,
+            "transcript": "Thank you for helping me with this code.",
+        })
+
+        with patch("tools.transcription_tools.transcribe_audio", mock_transcribe):
+            from tools.voice_mode import transcribe_recording
+            result = transcribe_recording("/tmp/test.wav")
+
+        assert result["transcript"] == "Thank you for helping me with this code."
+        assert "filtered" not in result
+
+
+class TestWhisperHallucinationFilter:
+    def test_known_hallucinations(self):
+        from tools.voice_mode import is_whisper_hallucination
+
+        assert is_whisper_hallucination("Thank you.") is True
+        assert is_whisper_hallucination("thank you") is True
+        assert is_whisper_hallucination("Thanks for watching.") is True
+        assert is_whisper_hallucination("Bye.") is True
+        assert is_whisper_hallucination("  Thank you.  ") is True  # with whitespace
+        assert is_whisper_hallucination("you") is True
+
+    def test_real_speech_not_filtered(self):
+        from tools.voice_mode import is_whisper_hallucination
+
+        assert is_whisper_hallucination("Hello, how are you?") is False
+        assert is_whisper_hallucination("Thank you for your help with the project.") is False
+        assert is_whisper_hallucination("Can you explain this code?") is False
+

 # ============================================================================
 # play_audio_file
--- a/tools/voice_mode.py
+++ b/tools/voice_mode.py
@@ -235,6 +235,12 @@ class AudioRecorder:
                logger.debug("Recording too short (%d samples), discarding", len(audio_data))
                return None

+            # Skip silent recordings (RMS below threshold = no real speech)
+            rms = int(np.sqrt(np.mean(audio_data.astype(np.float64) ** 2)))
+            if rms < SILENCE_RMS_THRESHOLD:
+                logger.info("Recording too quiet (RMS=%d < %d), discarding", rms, SILENCE_RMS_THRESHOLD)
+                return None
+
            return self._write_wav(audio_data)

    def cancel(self) -> None:
@@ -276,6 +282,36 @@ class AudioRecorder:
        return wav_path


+# ============================================================================
+# Whisper hallucination filter
+# ============================================================================
+# Whisper commonly hallucinates these phrases on silent/near-silent audio.
+WHISPER_HALLUCINATIONS = {
+    "thank you.",
+    "thank you",
+    "thanks for watching.",
+    "thanks for watching",
+    "subscribe to my channel.",
+    "subscribe to my channel",
+    "like and subscribe.",
+    "like and subscribe",
+    "please subscribe.",
+    "please subscribe",
+    "thank you for watching.",
+    "thank you for watching",
+    "bye.",
+    "bye",
+    "you",
+    "the end.",
+    "the end",
+}
+
+
+def is_whisper_hallucination(transcript: str) -> bool:
+    """Check if a transcript is a known Whisper hallucination on silence."""
+    return transcript.strip().lower() in WHISPER_HALLUCINATIONS
+
+
 # ============================================================================
 # STT dispatch
 # ============================================================================
@@ -283,6 +319,7 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str
    """Transcribe a WAV recording using the existing Whisper pipeline.

    Delegates to ``tools.transcription_tools.transcribe_audio()``.
+    Filters out known Whisper hallucinations on silent audio.

    Args:
        wav_path: Path to the WAV file.
@@ -293,7 +330,14 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str
    """
    from tools.transcription_tools import transcribe_audio

-    return transcribe_audio(wav_path, model=model)
+    result = transcribe_audio(wav_path, model=model)
+
+    # Filter out Whisper hallucinations (common on silent/near-silent audio)
+    if result.get("success") and is_whisper_hallucination(result.get("transcript", "")):
+        logger.info("Filtered Whisper hallucination: %r", result["transcript"])
+        return {"success": True, "transcript": "", "filtered": True}
+
+    return result


 # ============================================================================