From 9f244ffc704f4ee3a64fdb51e41ccfdc368c49e5 Mon Sep 17 00:00:00 2001 From: Kimi Agent Date: Thu, 19 Mar 2026 21:37:32 -0400 Subject: [PATCH] refactor: break up _record_utterance() into focused helpers (#572) Co-authored-by: Kimi Agent Co-committed-by: Kimi Agent --- src/timmy/voice_loop.py | 93 +++++++++++++++++++++------------- tests/timmy/test_voice_loop.py | 27 +++++++++- 2 files changed, 85 insertions(+), 35 deletions(-) diff --git a/src/timmy/voice_loop.py b/src/timmy/voice_loop.py index 6058e42a..e2fcfb25 100644 --- a/src/timmy/voice_loop.py +++ b/src/timmy/voice_loop.py @@ -78,6 +78,11 @@ DEFAULT_MAX_UTTERANCE = 30.0 # safety cap — don't record forever DEFAULT_SESSION_ID = "voice" +def _rms(block: np.ndarray) -> float: + """Compute root-mean-square energy of an audio block.""" + return float(np.sqrt(np.mean(block.astype(np.float32) ** 2))) + + @dataclass class VoiceConfig: """Configuration for the voice loop.""" @@ -161,13 +166,6 @@ class VoiceLoop: min_blocks = int(self.config.min_utterance / 0.1) max_blocks = int(self.config.max_utterance / 0.1) - audio_chunks: list[np.ndarray] = [] - silent_count = 0 - recording = False - - def _rms(block: np.ndarray) -> float: - return float(np.sqrt(np.mean(block.astype(np.float32) ** 2))) - sys.stdout.write("\n 🎤 Listening... (speak now)\n") sys.stdout.flush() @@ -177,42 +175,69 @@ class VoiceLoop: dtype="float32", blocksize=block_size, ) as stream: - while self._running: - block, overflowed = stream.read(block_size) - if overflowed: - logger.debug("Audio buffer overflowed") + chunks = self._capture_audio_blocks(stream, block_size, silence_blocks, max_blocks) - rms = _rms(block) + return self._finalize_utterance(chunks, min_blocks, sr) - if not recording: - if rms > self.config.silence_threshold: - recording = True - silent_count = 0 - audio_chunks.append(block.copy()) - sys.stdout.write(" 📢 Recording...\r") - sys.stdout.flush() + def _capture_audio_blocks( + self, + stream, + block_size: int, + silence_blocks: int, + max_blocks: int, + ) -> list[np.ndarray]: + """Read audio blocks from *stream* until silence or max length. + + Returns the list of captured audio chunks (may be empty). + """ + chunks: list[np.ndarray] = [] + silent_count = 0 + recording = False + + while self._running: + block, overflowed = stream.read(block_size) + if overflowed: + logger.debug("Audio buffer overflowed") + + rms = _rms(block) + + if not recording: + if rms > self.config.silence_threshold: + recording = True + silent_count = 0 + chunks.append(block.copy()) + sys.stdout.write(" 📢 Recording...\r") + sys.stdout.flush() + else: + chunks.append(block.copy()) + + if rms < self.config.silence_threshold: + silent_count += 1 else: - audio_chunks.append(block.copy()) + silent_count = 0 - if rms < self.config.silence_threshold: - silent_count += 1 - else: - silent_count = 0 + if silent_count >= silence_blocks: + break - # End of utterance - if silent_count >= silence_blocks: - break + if len(chunks) >= max_blocks: + logger.info("Max utterance length reached, stopping.") + break - # Safety cap - if len(audio_chunks) >= max_blocks: - logger.info("Max utterance length reached, stopping.") - break + return chunks - if not audio_chunks or len(audio_chunks) < min_blocks: + @staticmethod + def _finalize_utterance( + chunks: list[np.ndarray], min_blocks: int, sample_rate: int + ) -> np.ndarray | None: + """Concatenate recorded chunks and report duration. + + Returns ``None`` if the utterance is too short to be meaningful. + """ + if not chunks or len(chunks) < min_blocks: return None - audio = np.concatenate(audio_chunks, axis=0).flatten() - duration = len(audio) / sr + audio = np.concatenate(chunks, axis=0).flatten() + duration = len(audio) / sample_rate sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n") sys.stdout.flush() return audio diff --git a/tests/timmy/test_voice_loop.py b/tests/timmy/test_voice_loop.py index 809bd151..74d56e59 100644 --- a/tests/timmy/test_voice_loop.py +++ b/tests/timmy/test_voice_loop.py @@ -15,7 +15,7 @@ except ImportError: np = None try: - from timmy.voice_loop import VoiceConfig, VoiceLoop, _strip_markdown + from timmy.voice_loop import VoiceConfig, VoiceLoop, _rms, _strip_markdown except ImportError: pass # pytestmark will skip all tests anyway @@ -147,6 +147,31 @@ class TestStripMarkdown: assert "*" not in result +class TestRms: + def test_silent_block(self): + block = np.zeros(1600, dtype=np.float32) + assert _rms(block) == pytest.approx(0.0, abs=1e-7) + + def test_loud_block(self): + block = np.ones(1600, dtype=np.float32) + assert _rms(block) == pytest.approx(1.0, abs=1e-5) + + +class TestFinalizeUtterance: + def test_returns_none_for_empty(self): + assert VoiceLoop._finalize_utterance([], min_blocks=5, sample_rate=16000) is None + + def test_returns_none_for_too_short(self): + chunks = [np.zeros(1600, dtype=np.float32) for _ in range(3)] + assert VoiceLoop._finalize_utterance(chunks, min_blocks=5, sample_rate=16000) is None + + def test_returns_audio_for_sufficient_chunks(self): + chunks = [np.ones(1600, dtype=np.float32) for _ in range(6)] + result = VoiceLoop._finalize_utterance(chunks, min_blocks=5, sample_rate=16000) + assert result is not None + assert len(result) == 6 * 1600 + + class TestThink: def test_think_returns_response(self): loop = VoiceLoop()