Compare commits
1 Commit
main
...
kimi/issue
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cd62c61cd6 |
@@ -78,6 +78,11 @@ DEFAULT_MAX_UTTERANCE = 30.0 # safety cap — don't record forever
|
|||||||
DEFAULT_SESSION_ID = "voice"
|
DEFAULT_SESSION_ID = "voice"
|
||||||
|
|
||||||
|
|
||||||
|
def _rms(block: np.ndarray) -> float:
    """Compute root-mean-square energy of an audio block.

    The samples are converted to float32 before squaring, so integer
    input is squared in floating point rather than in its own dtype.

    Args:
        block: Audio samples of any shape; all elements are pooled.

    Returns:
        The RMS value as a Python float, or 0.0 for an empty block.
    """
    samples = np.asarray(block, dtype=np.float32)
    # np.mean of an empty array yields NaN (and a RuntimeWarning); an
    # empty block carries no energy, so report 0.0 instead.
    if samples.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(np.square(samples))))
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class VoiceConfig:
|
class VoiceConfig:
|
||||||
"""Configuration for the voice loop."""
|
"""Configuration for the voice loop."""
|
||||||
@@ -161,13 +166,6 @@ class VoiceLoop:
|
|||||||
min_blocks = int(self.config.min_utterance / 0.1)
|
min_blocks = int(self.config.min_utterance / 0.1)
|
||||||
max_blocks = int(self.config.max_utterance / 0.1)
|
max_blocks = int(self.config.max_utterance / 0.1)
|
||||||
|
|
||||||
audio_chunks: list[np.ndarray] = []
|
|
||||||
silent_count = 0
|
|
||||||
recording = False
|
|
||||||
|
|
||||||
def _rms(block: np.ndarray) -> float:
|
|
||||||
return float(np.sqrt(np.mean(block.astype(np.float32) ** 2)))
|
|
||||||
|
|
||||||
sys.stdout.write("\n 🎤 Listening... (speak now)\n")
|
sys.stdout.write("\n 🎤 Listening... (speak now)\n")
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
|
|
||||||
@@ -177,6 +175,26 @@ class VoiceLoop:
|
|||||||
dtype="float32",
|
dtype="float32",
|
||||||
blocksize=block_size,
|
blocksize=block_size,
|
||||||
) as stream:
|
) as stream:
|
||||||
|
chunks = self._capture_audio_blocks(stream, block_size, silence_blocks, max_blocks)
|
||||||
|
|
||||||
|
return self._finalize_utterance(chunks, min_blocks, sr)
|
||||||
|
|
||||||
|
def _capture_audio_blocks(
|
||||||
|
self,
|
||||||
|
stream,
|
||||||
|
block_size: int,
|
||||||
|
silence_blocks: int,
|
||||||
|
max_blocks: int,
|
||||||
|
) -> list[np.ndarray]:
|
||||||
|
"""Read audio blocks from *stream* until silence or safety cap.
|
||||||
|
|
||||||
|
Returns the list of captured audio blocks (may be empty if no
|
||||||
|
speech was detected).
|
||||||
|
"""
|
||||||
|
audio_chunks: list[np.ndarray] = []
|
||||||
|
silent_count = 0
|
||||||
|
recording = False
|
||||||
|
|
||||||
while self._running:
|
while self._running:
|
||||||
block, overflowed = stream.read(block_size)
|
block, overflowed = stream.read(block_size)
|
||||||
if overflowed:
|
if overflowed:
|
||||||
@@ -199,20 +217,28 @@ class VoiceLoop:
|
|||||||
else:
|
else:
|
||||||
silent_count = 0
|
silent_count = 0
|
||||||
|
|
||||||
# End of utterance
|
|
||||||
if silent_count >= silence_blocks:
|
if silent_count >= silence_blocks:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Safety cap
|
|
||||||
if len(audio_chunks) >= max_blocks:
|
if len(audio_chunks) >= max_blocks:
|
||||||
logger.info("Max utterance length reached, stopping.")
|
logger.info("Max utterance length reached, stopping.")
|
||||||
break
|
break
|
||||||
|
|
||||||
if not audio_chunks or len(audio_chunks) < min_blocks:
|
return audio_chunks
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _finalize_utterance(
|
||||||
|
chunks: list[np.ndarray], min_blocks: int, sample_rate: int
|
||||||
|
) -> np.ndarray | None:
|
||||||
|
"""Concatenate captured chunks and report duration.
|
||||||
|
|
||||||
|
Returns None if the utterance is too short (below *min_blocks*).
|
||||||
|
"""
|
||||||
|
if not chunks or len(chunks) < min_blocks:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
audio = np.concatenate(audio_chunks, axis=0).flatten()
|
audio = np.concatenate(chunks, axis=0).flatten()
|
||||||
duration = len(audio) / sr
|
duration = len(audio) / sample_rate
|
||||||
sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n")
|
sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n")
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
return audio
|
return audio
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ except ImportError:
|
|||||||
np = None
|
np = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from timmy.voice_loop import VoiceConfig, VoiceLoop, _strip_markdown
|
from timmy.voice_loop import VoiceConfig, VoiceLoop, _rms, _strip_markdown
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass # pytestmark will skip all tests anyway
|
pass # pytestmark will skip all tests anyway
|
||||||
|
|
||||||
@@ -336,3 +336,28 @@ class TestSpeakSetsFlag:
|
|||||||
|
|
||||||
# After speak
|
# After speak
|
||||||
assert loop._speaking is False
|
assert loop._speaking is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestRms:
    """Unit tests for the ``_rms`` energy helper."""

    def test_rms_of_silence(self):
        """An all-zero block has exactly zero RMS energy."""
        silence = np.zeros(1600, dtype=np.float32)
        assert _rms(silence) == 0.0

    def test_rms_of_signal(self):
        """A constant 0.5 signal has an RMS of 0.5, within float tolerance."""
        constant = np.ones(1600, dtype=np.float32) * 0.5
        error = abs(_rms(constant) - 0.5)
        assert error < 1e-5
|
||||||
|
|
||||||
|
|
||||||
|
class TestFinalizeUtterance:
    """Unit tests for ``VoiceLoop._finalize_utterance``."""

    def test_returns_none_for_empty(self):
        """No captured chunks yields None."""
        outcome = VoiceLoop._finalize_utterance([], min_blocks=5, sample_rate=16000)
        assert outcome is None

    def test_returns_none_below_min(self):
        """Fewer chunks than ``min_blocks`` yields None."""
        too_few = [np.zeros(1600, dtype=np.float32) for _ in range(3)]
        outcome = VoiceLoop._finalize_utterance(too_few, min_blocks=5, sample_rate=16000)
        assert outcome is None

    def test_concatenates_chunks(self):
        """Enough chunks are joined into one array of 5 * 1600 samples."""
        blocks = [np.ones(1600, dtype=np.float32) for _ in range(5)]
        joined = VoiceLoop._finalize_utterance(blocks, min_blocks=3, sample_rate=16000)
        assert joined is not None
        assert len(joined) == 8000
|
||||||
|
|||||||
Reference in New Issue
Block a user