Compare commits
1 Commits
main
...
kimi/issue
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cd62c61cd6 |
@@ -78,6 +78,11 @@ DEFAULT_MAX_UTTERANCE = 30.0 # safety cap — don't record forever
|
||||
DEFAULT_SESSION_ID = "voice"
|
||||
|
||||
|
||||
def _rms(block: np.ndarray) -> float:
|
||||
"""Compute root-mean-square energy of an audio block."""
|
||||
return float(np.sqrt(np.mean(block.astype(np.float32) ** 2)))
|
||||
|
||||
|
||||
@dataclass
|
||||
class VoiceConfig:
|
||||
"""Configuration for the voice loop."""
|
||||
@@ -161,13 +166,6 @@ class VoiceLoop:
|
||||
min_blocks = int(self.config.min_utterance / 0.1)
|
||||
max_blocks = int(self.config.max_utterance / 0.1)
|
||||
|
||||
audio_chunks: list[np.ndarray] = []
|
||||
silent_count = 0
|
||||
recording = False
|
||||
|
||||
def _rms(block: np.ndarray) -> float:
|
||||
return float(np.sqrt(np.mean(block.astype(np.float32) ** 2)))
|
||||
|
||||
sys.stdout.write("\n 🎤 Listening... (speak now)\n")
|
||||
sys.stdout.flush()
|
||||
|
||||
@@ -177,42 +175,70 @@ class VoiceLoop:
|
||||
dtype="float32",
|
||||
blocksize=block_size,
|
||||
) as stream:
|
||||
while self._running:
|
||||
block, overflowed = stream.read(block_size)
|
||||
if overflowed:
|
||||
logger.debug("Audio buffer overflowed")
|
||||
chunks = self._capture_audio_blocks(stream, block_size, silence_blocks, max_blocks)
|
||||
|
||||
rms = _rms(block)
|
||||
return self._finalize_utterance(chunks, min_blocks, sr)
|
||||
|
||||
if not recording:
|
||||
if rms > self.config.silence_threshold:
|
||||
recording = True
|
||||
silent_count = 0
|
||||
audio_chunks.append(block.copy())
|
||||
sys.stdout.write(" 📢 Recording...\r")
|
||||
sys.stdout.flush()
|
||||
else:
|
||||
def _capture_audio_blocks(
    self,
    stream,
    block_size: int,
    silence_blocks: int,
    max_blocks: int,
) -> list[np.ndarray]:
    """Read audio blocks from *stream* until silence or safety cap.

    Blocks are ignored until one has an RMS above
    ``self.config.silence_threshold``. From that block onwards every block
    is kept, and capture stops when either ``silence_blocks`` consecutive
    below-threshold blocks have been seen or ``max_blocks`` blocks have
    been captured. The loop also ends as soon as ``self._running`` is
    false.

    Args:
        stream: Open input stream whose ``read(n)`` returns a
            ``(block, overflowed)`` pair.
        block_size: Number of frames requested per ``read`` call.
        silence_blocks: Consecutive quiet blocks that end the utterance.
        max_blocks: Upper bound on the number of captured blocks.

    Returns:
        The list of captured audio blocks (may be empty if no speech was
        detected).
    """
    audio_chunks: list[np.ndarray] = []
    silent_count = 0
    recording = False

    while self._running:
        block, overflowed = stream.read(block_size)
        if overflowed:
            logger.debug("Audio buffer overflowed")

        rms = _rms(block)

        if not recording:
            # Still waiting for speech: only a loud block starts capture.
            if rms > self.config.silence_threshold:
                recording = True
                silent_count = 0
                audio_chunks.append(block.copy())
                sys.stdout.write(" 📢 Recording...\r")
                sys.stdout.flush()
        else:
            audio_chunks.append(block.copy())

            if rms < self.config.silence_threshold:
                silent_count += 1
            else:
                silent_count = 0

            # End of utterance
            if silent_count >= silence_blocks:
                break

            # Safety cap
            if len(audio_chunks) >= max_blocks:
                logger.info("Max utterance length reached, stopping.")
                break

    # Always hand back the list; the minimum-length check is the caller's job.
    return audio_chunks
|
||||
|
||||
@staticmethod
|
||||
def _finalize_utterance(
|
||||
chunks: list[np.ndarray], min_blocks: int, sample_rate: int
|
||||
) -> np.ndarray | None:
|
||||
"""Concatenate captured chunks and report duration.
|
||||
|
||||
Returns None if the utterance is too short (below *min_blocks*).
|
||||
"""
|
||||
if not chunks or len(chunks) < min_blocks:
|
||||
return None
|
||||
|
||||
audio = np.concatenate(audio_chunks, axis=0).flatten()
|
||||
duration = len(audio) / sr
|
||||
audio = np.concatenate(chunks, axis=0).flatten()
|
||||
duration = len(audio) / sample_rate
|
||||
sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n")
|
||||
sys.stdout.flush()
|
||||
return audio
|
||||
|
||||
@@ -15,7 +15,7 @@ except ImportError:
|
||||
np = None
|
||||
|
||||
try:
|
||||
from timmy.voice_loop import VoiceConfig, VoiceLoop, _strip_markdown
|
||||
from timmy.voice_loop import VoiceConfig, VoiceLoop, _rms, _strip_markdown
|
||||
except ImportError:
|
||||
pass # pytestmark will skip all tests anyway
|
||||
|
||||
@@ -336,3 +336,28 @@ class TestSpeakSetsFlag:
|
||||
|
||||
# After speak
|
||||
assert loop._speaking is False
|
||||
|
||||
|
||||
class TestRms:
    """Unit tests for the ``_rms`` energy helper."""

    def test_rms_of_silence(self):
        """An all-zero block has exactly zero energy."""
        silence = np.zeros(1600, dtype=np.float32)
        energy = _rms(silence)
        assert energy == 0.0

    def test_rms_of_signal(self):
        """A constant-amplitude block has an RMS equal to that amplitude."""
        constant = np.full(1600, 0.5, dtype=np.float32)
        deviation = abs(_rms(constant) - 0.5)
        assert deviation < 1e-5
|
||||
|
||||
|
||||
class TestFinalizeUtterance:
    """Unit tests for ``VoiceLoop._finalize_utterance``."""

    def test_returns_none_for_empty(self):
        """No captured chunks means no utterance."""
        assert VoiceLoop._finalize_utterance([], min_blocks=5, sample_rate=16000) is None

    def test_returns_none_below_min(self):
        """Fewer chunks than *min_blocks* is rejected as too short."""
        chunks = [np.zeros(1600, dtype=np.float32) for _ in range(3)]
        assert VoiceLoop._finalize_utterance(chunks, min_blocks=5, sample_rate=16000) is None

    def test_accepts_exactly_min_blocks(self):
        """The minimum is inclusive: exactly *min_blocks* chunks is enough."""
        chunks = [np.zeros(1600, dtype=np.float32) for _ in range(5)]
        result = VoiceLoop._finalize_utterance(chunks, min_blocks=5, sample_rate=16000)
        assert result is not None
        assert len(result) == 8000

    def test_concatenates_chunks(self):
        """Chunks are joined end to end into one array."""
        chunks = [np.ones(1600, dtype=np.float32) for _ in range(5)]
        result = VoiceLoop._finalize_utterance(chunks, min_blocks=3, sample_rate=16000)
        assert result is not None
        assert len(result) == 8000

    def test_flattens_column_blocks(self):
        """Two-dimensional (frames, 1) blocks come back as a 1-D array."""
        chunks = [np.ones((1600, 1), dtype=np.float32) for _ in range(5)]
        result = VoiceLoop._finalize_utterance(chunks, min_blocks=3, sample_rate=16000)
        assert result is not None
        assert result.shape == (8000,)
|
||||
|
||||
Reference in New Issue
Block a user