feat: add Phase 4 low-latency features for voice mode
- Audio cues: beep on record start (880Hz), double beep on stop (660Hz)
- Silence detection: auto-stop recording after 3s of silence (RMS-based)
- Continuous mode: auto-restart recording after agent responds
- Ctrl+R starts continuous mode, Ctrl+R during recording exits it
- Waits for TTS to finish before restarting to avoid recording speaker
- Tests: 7 new tests for beep generation and silence detection
This commit is contained in:
56
cli.py
56
cli.py
@@ -3539,10 +3539,27 @@ class HermesCLI:
|
||||
if self._voice_recorder is None:
|
||||
self._voice_recorder = AudioRecorder()
|
||||
|
||||
self._voice_recorder.start()
|
||||
def _on_silence():
|
||||
"""Called by AudioRecorder when silence is detected after speech."""
|
||||
with self._voice_lock:
|
||||
if not self._voice_recording:
|
||||
return
|
||||
_cprint(f"\n{_DIM}Silence detected, auto-stopping...{_RST}")
|
||||
if hasattr(self, '_app') and self._app:
|
||||
self._app.invalidate()
|
||||
self._voice_stop_and_transcribe()
|
||||
|
||||
self._voice_recorder.start(on_silence_stop=_on_silence)
|
||||
with self._voice_lock:
|
||||
self._voice_recording = True
|
||||
_cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+R to stop, Ctrl+C to cancel){_RST}")
|
||||
|
||||
# Audio cue: single beep on recording start
|
||||
try:
|
||||
from tools.voice_mode import play_beep
|
||||
threading.Thread(target=play_beep, kwargs={"frequency": 880, "count": 1}, daemon=True).start()
|
||||
except Exception:
|
||||
pass
|
||||
_cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}")
|
||||
|
||||
def _voice_stop_and_transcribe(self):
|
||||
"""Stop recording, transcribe via STT, and queue the transcript as input."""
|
||||
@@ -3554,6 +3571,13 @@ class HermesCLI:
|
||||
with self._voice_lock:
|
||||
self._voice_recording = False
|
||||
|
||||
# Audio cue: double beep on recording stop
|
||||
try:
|
||||
from tools.voice_mode import play_beep
|
||||
threading.Thread(target=play_beep, kwargs={"frequency": 660, "count": 2}, daemon=True).start()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if wav_path is None:
|
||||
_cprint(f"{_DIM}No speech detected (recording too short).{_RST}")
|
||||
return
|
||||
@@ -3603,6 +3627,7 @@ class HermesCLI:
|
||||
"""Speak the agent's response aloud using TTS (runs in background thread)."""
|
||||
if not self._voice_tts:
|
||||
return
|
||||
self._voice_tts_done.clear()
|
||||
try:
|
||||
from tools.tts_tool import text_to_speech_tool
|
||||
from tools.voice_mode import play_audio_file
|
||||
@@ -3649,6 +3674,8 @@ class HermesCLI:
|
||||
except Exception as e:
|
||||
logger.warning("Voice TTS playback failed: %s", e)
|
||||
_cprint(f"{_DIM}TTS playback failed: {e}{_RST}")
|
||||
finally:
|
||||
self._voice_tts_done.set()
|
||||
|
||||
def _handle_voice_command(self, command: str):
|
||||
"""Handle /voice [on|off|tts|status] command."""
|
||||
@@ -3714,6 +3741,7 @@ class HermesCLI:
|
||||
self._voice_recording = False
|
||||
self._voice_mode = False
|
||||
self._voice_tts = False
|
||||
self._voice_continuous = False
|
||||
_cprint(f"\n{_DIM}Voice mode disabled.{_RST}")
|
||||
|
||||
def _toggle_voice_tts(self):
|
||||
@@ -4331,6 +4359,9 @@ class HermesCLI:
|
||||
self._voice_recorder = None # AudioRecorder instance (lazy init)
|
||||
self._voice_recording = False # Whether currently recording
|
||||
self._voice_processing = False # Whether STT is in progress
|
||||
self._voice_continuous = False # Whether to auto-restart after agent responds
|
||||
self._voice_tts_done = threading.Event() # Signals TTS playback finished
|
||||
self._voice_tts_done.set() # Initially "done" (no TTS pending)
|
||||
|
||||
# Register callbacks so terminal_tool prompts route through our UI
|
||||
set_sudo_password_callback(self._sudo_password_callback)
|
||||
@@ -4650,7 +4681,10 @@ class HermesCLI:
|
||||
if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state:
|
||||
return
|
||||
if cli_ref._voice_recording:
|
||||
cli_ref._voice_recording = False
|
||||
# Manual stop via Ctrl+R: stop continuous mode
|
||||
with cli_ref._voice_lock:
|
||||
cli_ref._voice_continuous = False
|
||||
cli_ref._voice_recording = False
|
||||
event.app.invalidate()
|
||||
threading.Thread(
|
||||
target=cli_ref._voice_stop_and_transcribe,
|
||||
@@ -4658,6 +4692,8 @@ class HermesCLI:
|
||||
).start()
|
||||
else:
|
||||
try:
|
||||
with cli_ref._voice_lock:
|
||||
cli_ref._voice_continuous = True
|
||||
cli_ref._voice_start_recording()
|
||||
event.app.invalidate()
|
||||
except Exception as e:
|
||||
@@ -5267,13 +5303,25 @@ class HermesCLI:
|
||||
# Regular chat - run agent
|
||||
self._agent_running = True
|
||||
app.invalidate() # Refresh status line
|
||||
|
||||
|
||||
try:
|
||||
self.chat(user_input, images=submit_images or None)
|
||||
finally:
|
||||
self._agent_running = False
|
||||
self._spinner_text = ""
|
||||
app.invalidate() # Refresh status line
|
||||
|
||||
# Continuous voice: auto-restart recording after agent responds
|
||||
if self._voice_mode and self._voice_continuous and not self._voice_recording:
|
||||
try:
|
||||
# Wait for TTS to finish so we don't record the speaker
|
||||
if self._voice_tts:
|
||||
self._voice_tts_done.wait(timeout=60)
|
||||
time.sleep(0.3) # Brief pause after TTS ends
|
||||
self._voice_start_recording()
|
||||
app.invalidate()
|
||||
except Exception as e:
|
||||
_cprint(f"{_DIM}Voice auto-restart failed: {e}{_RST}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
|
||||
@@ -346,3 +346,154 @@ class TestCleanupTempRecordings:
|
||||
deleted = cleanup_temp_recordings(max_age_seconds=3600)
|
||||
assert deleted == 0
|
||||
assert other_file.exists()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# play_beep
|
||||
# ============================================================================
|
||||
|
||||
class TestPlayBeep:
    """Tests for ``play_beep``; ``mock_sd`` stands in for the sounddevice module."""

    def test_beep_calls_sounddevice_play(self, mock_sd):
        """A single beep is handed to ``sd.play`` once, as non-empty int16 audio."""
        np = pytest.importorskip("numpy")

        from tools.voice_mode import play_beep

        play_beep(frequency=880, duration=0.1, count=1)

        mock_sd.play.assert_called_once()
        mock_sd.wait.assert_called_once()
        # Verify audio data is int16 numpy array
        audio_arg = mock_sd.play.call_args[0][0]
        assert audio_arg.dtype == np.int16
        assert len(audio_arg) > 0

    def test_beep_double_produces_longer_audio(self, mock_sd):
        """Two beeps must yield at least two full beeps' worth of samples."""
        pytest.importorskip("numpy")

        # Derive the expected length from the module's own sample rate rather
        # than a hard-coded copy that could silently drift out of sync.
        from tools.voice_mode import SAMPLE_RATE, play_beep

        beep_seconds = 0.1
        play_beep(frequency=660, duration=beep_seconds, count=2)

        audio_arg = mock_sd.play.call_args[0][0]
        single_beep_samples = int(SAMPLE_RATE * beep_seconds)
        # Both beeps must be present in full; the gap between them only adds
        # samples, so this lower bound does not depend on the gap length.
        assert len(audio_arg) >= 2 * single_beep_samples

    def test_beep_noop_without_audio(self, monkeypatch):
        """With audio support flagged off, ``play_beep`` returns without raising."""
        monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False)

        from tools.voice_mode import play_beep

        # Should not raise
        play_beep()

    def test_beep_handles_playback_error(self, mock_sd):
        """A failure inside ``sd.play`` is swallowed rather than propagated."""
        mock_sd.play.side_effect = Exception("device error")

        from tools.voice_mode import play_beep

        # Should not raise
        play_beep()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Silence detection
|
||||
# ============================================================================
|
||||
|
||||
class TestSilenceDetection:
    """Tests for the silence auto-stop logic in ``AudioRecorder``.

    ``mock_sd`` replaces sounddevice, so each test drives the recorder by
    calling the stream callback directly with synthetic int16 frames.
    """

    @staticmethod
    def _stream_callback(mock_sd):
        """Return the ``callback`` keyword the recorder passed to ``InputStream``."""
        return mock_sd.InputStream.call_args.kwargs["callback"]

    def test_silence_callback_fires_after_speech_then_silence(self, mock_sd):
        """Speech followed by enough silence triggers ``on_silence_stop``."""
        np = pytest.importorskip("numpy")
        import threading

        mock_stream = MagicMock()
        mock_sd.InputStream.return_value = mock_stream

        from tools.voice_mode import AudioRecorder

        recorder = AudioRecorder()
        # Use very short silence duration for testing
        recorder._silence_duration = 0.05

        fired = threading.Event()

        def on_silence():
            fired.set()

        recorder.start(on_silence_stop=on_silence)
        callback = self._stream_callback(mock_sd)

        # Simulate loud audio (speech) -- RMS well above threshold
        loud_frame = np.full((1600, 1), 5000, dtype="int16")
        callback(loud_frame, 1600, None, None)
        assert recorder._has_spoken is True

        # Simulate silence: the first silent frame starts the silence timer
        silent_frame = np.zeros((1600, 1), dtype="int16")
        callback(silent_frame, 1600, None, None)

        # Wait a bit past the silence duration, then send another silent frame
        time.sleep(0.06)
        callback(silent_frame, 1600, None, None)

        # The callback runs in its own thread, so wait on the event
        assert fired.wait(timeout=1.0) is True

        recorder.cancel()

    def test_silence_without_speech_does_not_fire(self, mock_sd):
        """Silence alone, with no preceding speech, must not trigger the callback."""
        np = pytest.importorskip("numpy")
        import threading

        mock_stream = MagicMock()
        mock_sd.InputStream.return_value = mock_stream

        from tools.voice_mode import AudioRecorder

        recorder = AudioRecorder()
        recorder._silence_duration = 0.02

        fired = threading.Event()
        recorder.start(on_silence_stop=lambda: fired.set())
        callback = self._stream_callback(mock_sd)

        # Only silence -- no speech detected, so callback should NOT fire
        silent_frame = np.zeros((1600, 1), dtype="int16")
        for _ in range(5):
            callback(silent_frame, 1600, None, None)
            time.sleep(0.01)

        assert fired.wait(timeout=0.2) is False

        recorder.cancel()

    def test_no_callback_means_no_silence_detection(self, mock_sd):
        """Without ``on_silence_stop`` the recorder just keeps capturing."""
        np = pytest.importorskip("numpy")

        mock_stream = MagicMock()
        mock_sd.InputStream.return_value = mock_stream

        from tools.voice_mode import AudioRecorder

        recorder = AudioRecorder()
        recorder.start()  # no on_silence_stop
        callback = self._stream_callback(mock_sd)

        # Even with speech then silence, nothing should happen
        loud_frame = np.full((1600, 1), 5000, dtype="int16")
        silent_frame = np.zeros((1600, 1), dtype="int16")
        callback(loud_frame, 1600, None, None)
        callback(silent_frame, 1600, None, None)

        # No crash, no callback
        assert recorder._on_silence_stop is None
        recorder.cancel()
|
||||
|
||||
@@ -45,10 +45,51 @@ DTYPE = "int16" # 16-bit PCM
|
||||
SAMPLE_WIDTH = 2 # bytes per sample (int16)
|
||||
MAX_RECORDING_SECONDS = 120 # Safety cap
|
||||
|
||||
# Silence detection defaults
|
||||
SILENCE_RMS_THRESHOLD = 200 # RMS below this = silence (int16 range 0-32767)
|
||||
SILENCE_DURATION_SECONDS = 3.0 # Seconds of continuous silence before auto-stop
|
||||
|
||||
# Temp directory for voice recordings
|
||||
_TEMP_DIR = os.path.join(tempfile.gettempdir(), "hermes_voice")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Audio cues (beep tones)
|
||||
# ============================================================================
|
||||
def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> None:
    """Play a short beep tone using numpy + sounddevice.

    Best-effort audio cue: returns immediately when audio support is
    unavailable (``_HAS_AUDIO`` is false) or when there is nothing to play,
    and logs playback failures at debug level instead of raising. Blocks
    until playback finishes.

    Args:
        frequency: Tone frequency in Hz (default 880 = A5).
        duration: Duration of each beep in seconds.
        count: Number of beeps to play (with short gap between).
    """
    if not _HAS_AUDIO:
        return
    try:
        gap = 0.06  # seconds between beeps
        samples_per_beep = int(SAMPLE_RATE * duration)
        samples_per_gap = int(SAMPLE_RATE * gap)
        if count < 1 or samples_per_beep < 1:
            # Nothing audible to play; np.concatenate would reject an empty list.
            return

        # Every beep is identical, so synthesize the tone once and reuse it.
        t = np.linspace(0, duration, samples_per_beep, endpoint=False)
        tone = np.sin(2 * np.pi * frequency * t)

        # Apply fade in/out to avoid click artifacts
        fade_len = min(int(SAMPLE_RATE * 0.01), samples_per_beep // 4)
        if fade_len > 0:
            # Guarded: with fade_len == 0, tone[-0:] is the whole array and
            # multiplying it by an empty ramp raises a broadcast error.
            tone[:fade_len] *= np.linspace(0, 1, fade_len)
            tone[-fade_len:] *= np.linspace(1, 0, fade_len)

        # Scale to 30% of int16 full scale.
        beep = (tone * 0.3 * 32767).astype(np.int16)
        silence = np.zeros(samples_per_gap, dtype=np.int16)

        parts = []
        for i in range(count):
            parts.append(beep)
            if i < count - 1:
                parts.append(silence)

        audio = np.concatenate(parts)
        sd.play(audio, samplerate=SAMPLE_RATE)
        sd.wait()
    except Exception as e:
        logger.debug("Beep playback failed: %s", e)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# AudioRecorder
|
||||
# ============================================================================
|
||||
@@ -58,11 +99,14 @@ class AudioRecorder:
|
||||
Usage::
|
||||
|
||||
recorder = AudioRecorder()
|
||||
recorder.start()
|
||||
recorder.start(on_silence_stop=my_callback)
|
||||
# ... user speaks ...
|
||||
wav_path = recorder.stop() # returns path to WAV file
|
||||
# or
|
||||
recorder.cancel() # discard without saving
|
||||
|
||||
If ``on_silence_stop`` is provided, recording automatically stops when
|
||||
the user is silent for ``silence_duration`` seconds and calls the callback.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
@@ -71,6 +115,12 @@ class AudioRecorder:
|
||||
self._frames: List[Any] = []
|
||||
self._recording = False
|
||||
self._start_time: float = 0.0
|
||||
# Silence detection state
|
||||
self._has_spoken = False
|
||||
self._silence_start: float = 0.0
|
||||
self._on_silence_stop = None
|
||||
self._silence_threshold: int = SILENCE_RMS_THRESHOLD
|
||||
self._silence_duration: float = SILENCE_DURATION_SECONDS
|
||||
|
||||
# -- public properties ---------------------------------------------------
|
||||
|
||||
@@ -86,9 +136,14 @@ class AudioRecorder:
|
||||
|
||||
# -- public methods ------------------------------------------------------
|
||||
|
||||
def start(self) -> None:
|
||||
def start(self, on_silence_stop=None) -> None:
|
||||
"""Start capturing audio from the default input device.
|
||||
|
||||
Args:
|
||||
on_silence_stop: Optional callback invoked (in a daemon thread) when
|
||||
silence is detected after speech. The callback receives no arguments.
|
||||
Use this to auto-stop recording and trigger transcription.
|
||||
|
||||
Raises ``RuntimeError`` if sounddevice/numpy are not installed
|
||||
or if a recording is already in progress.
|
||||
"""
|
||||
@@ -105,12 +160,35 @@ class AudioRecorder:
|
||||
|
||||
self._frames = []
|
||||
self._start_time = time.monotonic()
|
||||
self._has_spoken = False
|
||||
self._silence_start = 0.0
|
||||
self._on_silence_stop = on_silence_stop
|
||||
|
||||
def _callback(indata, frames, time_info, status): # noqa: ARG001
|
||||
if status:
|
||||
logger.debug("sounddevice status: %s", status)
|
||||
self._frames.append(indata.copy())
|
||||
|
||||
# Silence detection: compute RMS of this chunk
|
||||
if self._on_silence_stop is not None and self._recording:
|
||||
rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
|
||||
now = time.monotonic()
|
||||
|
||||
if rms > self._silence_threshold:
|
||||
self._has_spoken = True
|
||||
self._silence_start = 0.0
|
||||
elif self._has_spoken:
|
||||
# User was speaking and now is silent
|
||||
if self._silence_start == 0.0:
|
||||
self._silence_start = now
|
||||
elif now - self._silence_start >= self._silence_duration:
|
||||
logger.info("Silence detected (%.1fs), auto-stopping",
|
||||
self._silence_duration)
|
||||
cb = self._on_silence_stop
|
||||
self._on_silence_stop = None # fire only once
|
||||
if cb:
|
||||
threading.Thread(target=cb, daemon=True).start()
|
||||
|
||||
self._stream = sd.InputStream(
|
||||
samplerate=SAMPLE_RATE,
|
||||
channels=CHANNELS,
|
||||
|
||||
Reference in New Issue
Block a user