diff --git a/src/timmy/voice_loop.py b/src/timmy/voice_loop.py index 6058e42..5b31979 100644 --- a/src/timmy/voice_loop.py +++ b/src/timmy/voice_loop.py @@ -78,6 +78,11 @@ DEFAULT_MAX_UTTERANCE = 30.0 # safety cap — don't record forever DEFAULT_SESSION_ID = "voice" +def _rms(block: np.ndarray) -> float: + """Compute root-mean-square energy of an audio block.""" + return float(np.sqrt(np.mean(block.astype(np.float32) ** 2))) + + @dataclass class VoiceConfig: """Configuration for the voice loop.""" @@ -161,13 +166,6 @@ class VoiceLoop: min_blocks = int(self.config.min_utterance / 0.1) max_blocks = int(self.config.max_utterance / 0.1) - audio_chunks: list[np.ndarray] = [] - silent_count = 0 - recording = False - - def _rms(block: np.ndarray) -> float: - return float(np.sqrt(np.mean(block.astype(np.float32) ** 2))) - sys.stdout.write("\n 🎤 Listening... (speak now)\n") sys.stdout.flush() @@ -177,42 +175,70 @@ class VoiceLoop: dtype="float32", blocksize=block_size, ) as stream: - while self._running: - block, overflowed = stream.read(block_size) - if overflowed: - logger.debug("Audio buffer overflowed") + chunks = self._capture_audio_blocks(stream, block_size, silence_blocks, max_blocks) - rms = _rms(block) + return self._finalize_utterance(chunks, min_blocks, sr) - if not recording: - if rms > self.config.silence_threshold: - recording = True - silent_count = 0 - audio_chunks.append(block.copy()) - sys.stdout.write(" 📢 Recording...\r") - sys.stdout.flush() - else: + def _capture_audio_blocks( + self, + stream, + block_size: int, + silence_blocks: int, + max_blocks: int, + ) -> list[np.ndarray]: + """Read audio blocks from *stream* until silence or safety cap. + + Returns the list of captured audio blocks (may be empty if no + speech was detected). + """ + audio_chunks: list[np.ndarray] = [] + silent_count = 0 + recording = False + + while self._running: + block, overflowed = stream.read(block_size) + if overflowed: + logger.debug("Audio buffer overflowed") + + rms = _rms(block) + + if not recording: + if rms > self.config.silence_threshold: + recording = True + silent_count = 0 audio_chunks.append(block.copy()) + sys.stdout.write(" 📢 Recording...\r") + sys.stdout.flush() + else: + audio_chunks.append(block.copy()) - if rms < self.config.silence_threshold: - silent_count += 1 - else: - silent_count = 0 + if rms < self.config.silence_threshold: + silent_count += 1 + else: + silent_count = 0 - # End of utterance - if silent_count >= silence_blocks: - break + if silent_count >= silence_blocks: + break - # Safety cap - if len(audio_chunks) >= max_blocks: - logger.info("Max utterance length reached, stopping.") - break + if len(audio_chunks) >= max_blocks: + logger.info("Max utterance length reached, stopping.") + break - if not audio_chunks or len(audio_chunks) < min_blocks: + return audio_chunks + + @staticmethod + def _finalize_utterance( + chunks: list[np.ndarray], min_blocks: int, sample_rate: int + ) -> np.ndarray | None: + """Concatenate captured chunks and report duration. + + Returns None if the utterance is too short (below *min_blocks*). + """ + if not chunks or len(chunks) < min_blocks: return None - audio = np.concatenate(audio_chunks, axis=0).flatten() - duration = len(audio) / sr + audio = np.concatenate(chunks, axis=0).flatten() + duration = len(audio) / sample_rate sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n") sys.stdout.flush() return audio diff --git a/tests/timmy/test_voice_loop.py b/tests/timmy/test_voice_loop.py index 809bd15..e5d930c 100644 --- a/tests/timmy/test_voice_loop.py +++ b/tests/timmy/test_voice_loop.py @@ -15,7 +15,7 @@ except ImportError: np = None try: - from timmy.voice_loop import VoiceConfig, VoiceLoop, _strip_markdown + from timmy.voice_loop import VoiceConfig, VoiceLoop, _rms, _strip_markdown except ImportError: pass # pytestmark will skip all tests anyway @@ -336,3 +336,28 @@ class TestSpeakSetsFlag: # After speak assert loop._speaking is False + + +class TestRms: + def test_rms_of_silence(self): + block = np.zeros(1600, dtype=np.float32) + assert _rms(block) == 0.0 + + def test_rms_of_signal(self): + block = np.ones(1600, dtype=np.float32) * 0.5 + assert abs(_rms(block) - 0.5) < 1e-5 + + +class TestFinalizeUtterance: + def test_returns_none_for_empty(self): + assert VoiceLoop._finalize_utterance([], min_blocks=5, sample_rate=16000) is None + + def test_returns_none_below_min(self): + chunks = [np.zeros(1600, dtype=np.float32) for _ in range(3)] + assert VoiceLoop._finalize_utterance(chunks, min_blocks=5, sample_rate=16000) is None + + def test_concatenates_chunks(self): + chunks = [np.ones(1600, dtype=np.float32) for _ in range(5)] + result = VoiceLoop._finalize_utterance(chunks, min_blocks=3, sample_rate=16000) + assert result is not None + assert len(result) == 8000