refactor: break up _record_utterance() into focused helpers (#572)

Co-authored-by: Kimi Agent <kimi@timmy.local>
Co-committed-by: Kimi Agent <kimi@timmy.local>
This commit is contained in:
2026-03-19 21:37:32 -04:00
committed by Timmy Time
parent 0162a604be
commit 9f244ffc70
2 changed files with 85 additions and 35 deletions

View File

@@ -78,6 +78,11 @@ DEFAULT_MAX_UTTERANCE = 30.0 # safety cap — don't record forever
DEFAULT_SESSION_ID = "voice"
def _rms(block: np.ndarray) -> float:
"""Compute root-mean-square energy of an audio block."""
return float(np.sqrt(np.mean(block.astype(np.float32) ** 2)))
@dataclass
class VoiceConfig:
"""Configuration for the voice loop."""
@@ -161,13 +166,6 @@ class VoiceLoop:
min_blocks = int(self.config.min_utterance / 0.1)
max_blocks = int(self.config.max_utterance / 0.1)
audio_chunks: list[np.ndarray] = []
silent_count = 0
recording = False
def _rms(block: np.ndarray) -> float:
return float(np.sqrt(np.mean(block.astype(np.float32) ** 2)))
sys.stdout.write("\n 🎤 Listening... (speak now)\n")
sys.stdout.flush()
@@ -177,42 +175,69 @@ class VoiceLoop:
dtype="float32",
blocksize=block_size,
) as stream:
while self._running:
block, overflowed = stream.read(block_size)
if overflowed:
logger.debug("Audio buffer overflowed")
chunks = self._capture_audio_blocks(stream, block_size, silence_blocks, max_blocks)
rms = _rms(block)
return self._finalize_utterance(chunks, min_blocks, sr)
if not recording:
if rms > self.config.silence_threshold:
recording = True
silent_count = 0
audio_chunks.append(block.copy())
sys.stdout.write(" 📢 Recording...\r")
sys.stdout.flush()
def _capture_audio_blocks(
self,
stream,
block_size: int,
silence_blocks: int,
max_blocks: int,
) -> list[np.ndarray]:
"""Read audio blocks from *stream* until silence or max length.
Returns the list of captured audio chunks (may be empty).
"""
chunks: list[np.ndarray] = []
silent_count = 0
recording = False
while self._running:
block, overflowed = stream.read(block_size)
if overflowed:
logger.debug("Audio buffer overflowed")
rms = _rms(block)
if not recording:
if rms > self.config.silence_threshold:
recording = True
silent_count = 0
chunks.append(block.copy())
sys.stdout.write(" 📢 Recording...\r")
sys.stdout.flush()
else:
chunks.append(block.copy())
if rms < self.config.silence_threshold:
silent_count += 1
else:
audio_chunks.append(block.copy())
silent_count = 0
if rms < self.config.silence_threshold:
silent_count += 1
else:
silent_count = 0
if silent_count >= silence_blocks:
break
# End of utterance
if silent_count >= silence_blocks:
break
if len(chunks) >= max_blocks:
logger.info("Max utterance length reached, stopping.")
break
# Safety cap
if len(audio_chunks) >= max_blocks:
logger.info("Max utterance length reached, stopping.")
break
return chunks
if not audio_chunks or len(audio_chunks) < min_blocks:
@staticmethod
def _finalize_utterance(
chunks: list[np.ndarray], min_blocks: int, sample_rate: int
) -> np.ndarray | None:
"""Concatenate recorded chunks and report duration.
Returns ``None`` if the utterance is too short to be meaningful.
"""
if not chunks or len(chunks) < min_blocks:
return None
audio = np.concatenate(audio_chunks, axis=0).flatten()
duration = len(audio) / sr
audio = np.concatenate(chunks, axis=0).flatten()
duration = len(audio) / sample_rate
sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n")
sys.stdout.flush()
return audio