feat: add silence filter, hallucination guard, and continuous mode control

- Skip silent recordings before STT call (RMS check in AudioRecorder.stop)
- Filter known Whisper hallucinations ("Thank you.", "Bye." etc.)
- Continuous mode: Ctrl+R starts loop, Ctrl+R during recording exits it
- Wait for TTS playback to finish before auto-restarting, so the microphone does not record the speaker output
- Silence timeout increased to 3s for natural pauses
- Tests: hallucination filter, silent recording skip, real speech passthrough
This commit is contained in:
0xbyt4
2026-03-03 19:58:38 +03:00
parent bfd9c97705
commit 32b033c11c
2 changed files with 111 additions and 3 deletions

View File

@@ -154,8 +154,8 @@ class TestAudioRecorderStop:
recorder = AudioRecorder()
recorder.start()
# Simulate captured audio frames (1 second of silence)
frame = np.zeros((SAMPLE_RATE, 1), dtype="int16")
# Simulate captured audio frames (1 second of loud audio above RMS threshold)
frame = np.full((SAMPLE_RATE, 1), 1000, dtype="int16")
recorder._frames = [frame]
wav_path = recorder.stop()
@@ -189,6 +189,24 @@ class TestAudioRecorderStop:
wav_path = recorder.stop()
assert wav_path is None
def test_stop_returns_none_for_silent_recording(self, mock_sd, temp_voice_dir):
    """stop() yields no WAV path when the captured audio is near-silent."""
    numpy = pytest.importorskip("numpy")
    mock_sd.InputStream.return_value = MagicMock()

    from tools.voice_mode import SAMPLE_RATE, AudioRecorder

    rec = AudioRecorder()
    rec.start()

    # Inject one second of constant low-amplitude samples
    # (near-silence: RMS well below the threshold).
    rec._frames = [numpy.full((SAMPLE_RATE, 1), 10, dtype="int16")]

    assert rec.stop() is None
class TestAudioRecorderCancel:
def test_cancel_discards_frames(self, mock_sd):
@@ -259,6 +277,52 @@ class TestTranscribeRecording:
assert result["transcript"] == "hello world"
mock_transcribe.assert_called_once_with("/tmp/test.wav", model="whisper-1")
def test_filters_whisper_hallucination(self):
    """A known hallucinated phrase is blanked out and marked as filtered."""
    stt_response = {"success": True, "transcript": "Thank you."}
    fake_transcribe = MagicMock(return_value=stt_response)

    with patch("tools.transcription_tools.transcribe_audio", fake_transcribe):
        from tools.voice_mode import transcribe_recording

        outcome = transcribe_recording("/tmp/test.wav")

    # The call still counts as successful, but carries no text.
    assert outcome["success"] is True
    assert outcome["transcript"] == ""
    assert outcome["filtered"] is True
def test_does_not_filter_real_speech(self):
    """A sentence that merely starts like a hallucination passes through untouched."""
    spoken = "Thank you for helping me with this code."
    fake_transcribe = MagicMock(
        return_value={"success": True, "transcript": spoken}
    )

    with patch("tools.transcription_tools.transcribe_audio", fake_transcribe):
        from tools.voice_mode import transcribe_recording

        outcome = transcribe_recording("/tmp/test.wav")

    assert outcome["transcript"] == spoken
    # Unfiltered results must not carry the "filtered" marker at all.
    assert "filtered" not in outcome
class TestWhisperHallucinationFilter:
    """Direct checks of ``is_whisper_hallucination``."""

    def test_known_hallucinations(self):
        """Known phrases match regardless of case or surrounding whitespace."""
        from tools.voice_mode import is_whisper_hallucination

        flagged = (
            "Thank you.",
            "thank you",
            "Thanks for watching.",
            "Bye.",
            " Thank you. ",  # with whitespace
            "you",
        )
        for phrase in flagged:
            assert is_whisper_hallucination(phrase) is True

    def test_real_speech_not_filtered(self):
        """Full sentences are never classified as hallucinations."""
        from tools.voice_mode import is_whisper_hallucination

        genuine = (
            "Hello, how are you?",
            "Thank you for your help with the project.",
            "Can you explain this code?",
        )
        for sentence in genuine:
            assert is_whisper_hallucination(sentence) is False
# ============================================================================
# play_audio_file

View File

@@ -235,6 +235,12 @@ class AudioRecorder:
logger.debug("Recording too short (%d samples), discarding", len(audio_data))
return None
# Skip silent recordings (RMS below threshold = no real speech)
rms = int(np.sqrt(np.mean(audio_data.astype(np.float64) ** 2)))
if rms < SILENCE_RMS_THRESHOLD:
logger.info("Recording too quiet (RMS=%d < %d), discarding", rms, SILENCE_RMS_THRESHOLD)
return None
return self._write_wav(audio_data)
def cancel(self) -> None:
@@ -276,6 +282,36 @@ class AudioRecorder:
return wav_path
# ============================================================================
# Whisper hallucination filter
# ============================================================================
# Whisper commonly hallucinates these phrases on silent/near-silent audio.
WHISPER_HALLUCINATIONS = {
    "thank you.",
    "thank you",
    "thanks for watching.",
    "thanks for watching",
    "subscribe to my channel.",
    "subscribe to my channel",
    "like and subscribe.",
    "like and subscribe",
    "please subscribe.",
    "please subscribe",
    "thank you for watching.",
    "thank you for watching",
    "bye.",
    "bye",
    "you",
    "the end.",
    "the end",
}

# Trailing punctuation ignored when matching, so variants such as
# "Thank you!" or "Bye..." are caught without listing each one above.
_HALLUCINATION_TRAILING_CHARS = ".!?,;:\u2026"


def is_whisper_hallucination(transcript: str) -> bool:
    """Check if a transcript is a known Whisper hallucination on silence.

    Matching ignores case, surrounding whitespace, and trailing punctuation.
    Only a transcript that consists *entirely* of a listed phrase matches;
    longer sentences that merely contain one do not.

    Args:
        transcript: Text returned by the STT backend. A non-string value
            (e.g. ``None`` from a result dict) is tolerated.

    Returns:
        ``True`` if the normalized transcript is in
        ``WHISPER_HALLUCINATIONS``, ``False`` otherwise (including for
        non-string or empty input).
    """
    # A result dict may hold None for "transcript"; treat it as "no match"
    # rather than raising AttributeError on .strip().
    if not isinstance(transcript, str):
        return False

    normalized = transcript.strip().lower()
    if normalized in WHISPER_HALLUCINATIONS:
        return True

    # Second chance: drop trailing punctuation ("thank you!" -> "thank you").
    bare = normalized.rstrip(_HALLUCINATION_TRAILING_CHARS).rstrip()
    return bare in WHISPER_HALLUCINATIONS
# ============================================================================
# STT dispatch
# ============================================================================
@@ -283,6 +319,7 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str
"""Transcribe a WAV recording using the existing Whisper pipeline.
Delegates to ``tools.transcription_tools.transcribe_audio()``.
Filters out known Whisper hallucinations on silent audio.
Args:
wav_path: Path to the WAV file.
@@ -293,7 +330,14 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str
"""
from tools.transcription_tools import transcribe_audio
return transcribe_audio(wav_path, model=model)
result = transcribe_audio(wav_path, model=model)
# Filter out Whisper hallucinations (common on silent/near-silent audio)
if result.get("success") and is_whisper_hallucination(result.get("transcript", "")):
logger.info("Filtered Whisper hallucination: %r", result["transcript"])
return {"success": True, "transcript": "", "filtered": True}
return result
# ============================================================================