feat: add voice mode with push-to-talk and TTS output for CLI

Implements Issue #314 Phase 2 & 3: - /voice command to toggle voice mode (on/off/tts/status) - Ctrl+Space push-to-talk recording via sounddevice - Whisper STT transcription via existing transcription_tools - Optional TTS response playback via existing tts_tool - Visual indicators in prompt (recording/transcribing/voice) - 21 unit tests, all mocked (no real mic/API) - Optional deps: sounddevice, numpy (pip install hermes-agent[voice])
2026-03-03 16:17:05 +03:00
parent cf3dceafe1
commit 1a6fbef8a9
6 changed files with 977 additions and 1 deletions
--- a/cli.py
+++ b/cli.py
@@ -3017,6 +3017,8 @@ class HermesCLI:
            self._handle_background_command(cmd_original)
        elif cmd_lower.startswith("/skin"):
            self._handle_skin_command(cmd_original)
+        elif cmd_lower.startswith("/voice"):
+            self._handle_voice_command(cmd_original)
        else:
            # Check for user-defined quick commands (bypass agent loop, no LLM call)
            base_cmd = cmd_lower.split()[0]
@@ -3511,6 +3513,201 @@ class HermesCLI:
        except Exception as e:
            print(f"  ❌ MCP reload failed: {e}")

+    # ====================================================================
+    # Voice mode methods
+    # ====================================================================
+
+    def _voice_start_recording(self):
+        """Start capturing audio from the microphone."""
+        from tools.voice_mode import AudioRecorder, check_voice_requirements
+
+        reqs = check_voice_requirements()
+        if not reqs["audio_available"]:
+            raise RuntimeError(
+                "Voice mode requires sounddevice and numpy.\n"
+                "Install with: pip install sounddevice numpy\n"
+                "Or: pip install hermes-agent[voice]"
+            )
+        if not reqs["stt_key_set"]:
+            raise RuntimeError(
+                "Voice mode requires VOICE_TOOLS_OPENAI_KEY for transcription.\n"
+                "Get one at: https://platform.openai.com/api-keys"
+            )
+
+        if self._voice_recorder is None:
+            self._voice_recorder = AudioRecorder()
+
+        self._voice_recorder.start()
+        self._voice_recording = True
+        _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+Space to stop, Ctrl+C to cancel){_RST}")
+
+    def _voice_stop_and_transcribe(self):
+        """Stop recording, transcribe via STT, and queue the transcript as input."""
+        try:
+            if self._voice_recorder is None:
+                return
+
+            wav_path = self._voice_recorder.stop()
+            self._voice_recording = False
+
+            if wav_path is None:
+                _cprint(f"{_DIM}No speech detected (recording too short).{_RST}")
+                return
+
+            self._voice_processing = True
+            if hasattr(self, '_app') and self._app:
+                self._app.invalidate()
+            _cprint(f"{_DIM}Transcribing...{_RST}")
+
+            # Get STT model from config
+            stt_model = None
+            try:
+                from hermes_cli.config import load_config
+                stt_config = load_config().get("stt", {})
+                stt_model = stt_config.get("model")
+            except Exception:
+                pass
+
+            from tools.voice_mode import transcribe_recording
+            result = transcribe_recording(wav_path, model=stt_model)
+
+            if result.get("success") and result.get("transcript", "").strip():
+                transcript = result["transcript"].strip()
+                _cprint(f"\n{_GOLD}●{_RST} {_BOLD}{transcript}{_RST}")
+                self._pending_input.put(transcript)
+            elif result.get("success"):
+                _cprint(f"{_DIM}No speech detected.{_RST}")
+            else:
+                error = result.get("error", "Unknown error")
+                _cprint(f"\n{_DIM}Transcription failed: {error}{_RST}")
+
+        except Exception as e:
+            _cprint(f"\n{_DIM}Voice processing error: {e}{_RST}")
+        finally:
+            self._voice_processing = False
+            if hasattr(self, '_app') and self._app:
+                self._app.invalidate()
+            # Clean up temp file
+            try:
+                if wav_path and os.path.isfile(wav_path):
+                    os.unlink(wav_path)
+            except Exception:
+                pass
+
+    def _voice_speak_response(self, text: str):
+        """Speak the agent's response aloud using TTS (runs in background thread)."""
+        if not self._voice_tts:
+            return
+        try:
+            from tools.tts_tool import text_to_speech_tool
+            from tools.voice_mode import play_audio_file
+            import json
+
+            # Truncate to TTS limit
+            tts_text = text[:4000] if len(text) > 4000 else text
+            result_json = text_to_speech_tool(text=tts_text)
+            result = json.loads(result_json)
+
+            if result.get("success") and result.get("file_path"):
+                play_audio_file(result["file_path"])
+        except Exception as e:
+            logger.debug("Voice TTS playback failed: %s", e)
+
+    def _handle_voice_command(self, command: str):
+        """Handle /voice [on|off|tts|status] command."""
+        parts = command.strip().split(maxsplit=1)
+        subcommand = parts[1].lower().strip() if len(parts) > 1 else ""
+
+        if subcommand == "on":
+            self._enable_voice_mode()
+        elif subcommand == "off":
+            self._disable_voice_mode()
+        elif subcommand == "tts":
+            self._toggle_voice_tts()
+        elif subcommand == "status":
+            self._show_voice_status()
+        elif subcommand == "":
+            # Toggle
+            if self._voice_mode:
+                self._disable_voice_mode()
+            else:
+                self._enable_voice_mode()
+        else:
+            print(f"Unknown voice subcommand: {subcommand}")
+            print("Usage: /voice [on|off|tts|status]")
+
+    def _enable_voice_mode(self):
+        """Enable voice mode after checking requirements."""
+        from tools.voice_mode import check_voice_requirements
+
+        reqs = check_voice_requirements()
+        if not reqs["available"]:
+            _cprint(f"\n{_GOLD}Voice mode requirements not met:{_RST}")
+            for line in reqs["details"].split("\n"):
+                _cprint(f"  {_DIM}{line}{_RST}")
+            if reqs["missing_packages"]:
+                _cprint(f"\n  {_BOLD}Install: pip install {' '.join(reqs['missing_packages'])}{_RST}")
+                _cprint(f"  {_DIM}Or: pip install hermes-agent[voice]{_RST}")
+            return
+
+        self._voice_mode = True
+
+        # Check config for auto_tts
+        try:
+            from hermes_cli.config import load_config
+            voice_config = load_config().get("voice", {})
+            if voice_config.get("auto_tts", False):
+                self._voice_tts = True
+        except Exception:
+            pass
+
+        tts_status = " (TTS enabled)" if self._voice_tts else ""
+        _cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}")
+        _cprint(f"  {_DIM}Ctrl+Space to start/stop recording{_RST}")
+        _cprint(f"  {_DIM}/voice tts  to toggle speech output{_RST}")
+        _cprint(f"  {_DIM}/voice off  to disable voice mode{_RST}")
+
+    def _disable_voice_mode(self):
+        """Disable voice mode and cancel any active recording."""
+        if self._voice_recording and self._voice_recorder:
+            self._voice_recorder.cancel()
+            self._voice_recording = False
+
+        self._voice_mode = False
+        self._voice_tts = False
+        _cprint(f"\n{_DIM}Voice mode disabled.{_RST}")
+
+    def _toggle_voice_tts(self):
+        """Toggle TTS output for voice mode."""
+        if not self._voice_mode:
+            _cprint(f"{_DIM}Enable voice mode first: /voice on{_RST}")
+            return
+
+        self._voice_tts = not self._voice_tts
+        status = "enabled" if self._voice_tts else "disabled"
+
+        if self._voice_tts:
+            from tools.tts_tool import check_tts_requirements
+            if not check_tts_requirements():
+                _cprint(f"{_DIM}Warning: No TTS provider available. Install edge-tts or set API keys.{_RST}")
+
+        _cprint(f"{_GOLD}Voice TTS {status}.{_RST}")
+
+    def _show_voice_status(self):
+        """Show current voice mode status."""
+        from tools.voice_mode import check_voice_requirements
+
+        reqs = check_voice_requirements()
+
+        _cprint(f"\n{_BOLD}Voice Mode Status{_RST}")
+        _cprint(f"  Mode:      {'ON' if self._voice_mode else 'OFF'}")
+        _cprint(f"  TTS:       {'ON' if self._voice_tts else 'OFF'}")
+        _cprint(f"  Recording: {'YES' if self._voice_recording else 'no'}")
+        _cprint(f"  Record key: Ctrl+Space")
+        _cprint(f"\n  {_BOLD}Requirements:{_RST}")
+        for line in reqs["details"].split("\n"):
+            _cprint(f"    {line}")
+
    def _clarify_callback(self, question, choices):
        """
        Platform callback for the clarify tool. Called from the agent thread.
@@ -3876,12 +4073,23 @@ class HermesCLI:
                    padding=(1, 2),
                ))

+
+
            # Play terminal bell when agent finishes (if enabled).
            # Works over SSH — the bell propagates to the user's terminal.
            if self.bell_on_complete:
                sys.stdout.write("\a")
                sys.stdout.flush()

+            # Speak response aloud if voice TTS is enabled
+            if self._voice_tts and response:
+                threading.Thread(
+                    target=self._voice_speak_response,
+                    args=(response,),
+                    daemon=True,
+                ).start()
+
+
            # Combine all interrupt messages (user may have typed multiple while waiting)
            # and re-queue as one prompt for process_loop
            if pending_message and hasattr(self, '_pending_input'):
@@ -3964,6 +4172,10 @@ class HermesCLI:
    def _get_tui_prompt_fragments(self):
        """Return the prompt_toolkit fragments for the current interactive state."""
        symbol, state_suffix = self._get_tui_prompt_symbols()
+        if self._voice_recording:
+            return [("class:voice-recording", f"● {state_suffix}")]
+        if self._voice_processing:
+            return [("class:voice-processing", f"◉ {state_suffix}")]
        if self._sudo_state:
            return [("class:sudo-prompt", f"🔐 {state_suffix}")]
        if self._secret_state:
@@ -3978,6 +4190,8 @@ class HermesCLI:
            return [("class:prompt-working", f"{self._command_spinner_frame()} {state_suffix}")]
        if self._agent_running:
            return [("class:prompt-working", f"⚕ {state_suffix}")]
+        if self._voice_mode:
+            return [("class:voice-prompt", f"🎤 {state_suffix}")]
        return [("class:prompt", symbol)]

    def _get_tui_prompt_text(self) -> str:
@@ -4070,6 +4284,13 @@ class HermesCLI:
        self._attached_images: list[Path] = []
        self._image_counter = 0

+        # Voice mode state
+        self._voice_mode = False        # Whether voice mode is enabled
+        self._voice_tts = False         # Whether TTS output is enabled
+        self._voice_recorder = None     # AudioRecorder instance (lazy init)
+        self._voice_recording = False   # Whether currently recording
+        self._voice_processing = False  # Whether STT is in progress
+
        # Register callbacks so terminal_tool prompts route through our UI
        set_sudo_password_callback(self._sudo_password_callback)
        set_approval_callback(self._approval_callback)
@@ -4254,6 +4475,7 @@ class HermesCLI:
            """Handle Ctrl+C - cancel interactive prompts, interrupt agent, or exit.
            
            Priority:
+            0. Cancel active voice recording
            1. Cancel active sudo/approval/clarify prompt
            2. Interrupt the running agent (first press)
            3. Force exit (second press within 2s, or when idle)
@@ -4261,6 +4483,14 @@ class HermesCLI:
            import time as _time
            now = _time.time()

+            # Cancel active voice recording
+            if cli_ref._voice_recording and cli_ref._voice_recorder:
+                cli_ref._voice_recorder.cancel()
+                cli_ref._voice_recording = False
+                _cprint(f"\n{_DIM}Recording cancelled.{_RST}")
+                event.app.invalidate()
+                return
+
            # Cancel sudo prompt
            if self._sudo_state:
                self._sudo_state["response_queue"].put("")
@@ -4367,6 +4597,30 @@ class HermesCLI:
                # No image found — show a hint
                pass  # silent when no image (avoid noise on accidental press)

+        @kb.add('c-space')
+        def handle_ctrl_space(event):
+            """Toggle voice recording when voice mode is active."""
+            if not cli_ref._voice_mode:
+                return
+            if cli_ref._agent_running:
+                return
+            # Block recording during interactive prompts
+            if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state:
+                return
+            if cli_ref._voice_recording:
+                cli_ref._voice_recording = False
+                event.app.invalidate()
+                threading.Thread(
+                    target=cli_ref._voice_stop_and_transcribe,
+                    daemon=True,
+                ).start()
+            else:
+                try:
+                    cli_ref._voice_start_recording()
+                    event.app.invalidate()
+                except Exception as e:
+                    _cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}")
+
        # Dynamic prompt: shows Hermes symbol when agent is working,
        # or answer prompt when clarify freetext mode is active.
        cli_ref = self
@@ -4460,6 +4714,10 @@ class HermesCLI:
                return Transformation(fragments=ti.fragments)

        def _get_placeholder():
+            if cli_ref._voice_recording:
+                return "recording... Ctrl+Space to stop, Ctrl+C to cancel"
+            if cli_ref._voice_processing:
+                return "transcribing..."
            if cli_ref._sudo_state:
                return "type password (hidden), Enter to skip"
            if cli_ref._secret_state:
@@ -4476,6 +4734,8 @@ class HermesCLI:
                return f"{frame} {status}"
            if cli_ref._agent_running:
                return "type a message + Enter to interrupt, Ctrl+C to cancel"
+            if cli_ref._voice_mode:
+                return "type or Ctrl+Space to record"
            return ""

        input_area.control.input_processors.append(_PlaceholderProcessor(_get_placeholder))
@@ -4869,6 +5129,10 @@ class HermesCLI:
            'approval-cmd': '#AAAAAA italic',
            'approval-choice': '#AAAAAA',
            'approval-selected': '#FFD700 bold',
+            # Voice mode
+            'voice-prompt': '#87CEEB',
+            'voice-recording': '#FF4444 bold',
+            'voice-processing': '#FFA500 italic',
        }
        style = PTStyle.from_dict(self._build_tui_style_dict())
        
@@ -4993,6 +5257,18 @@ class HermesCLI:
                    self.agent.flush_memories(self.conversation_history)
                except Exception:
                    pass
+            # Cancel active voice recording
+            if hasattr(self, '_voice_recorder') and self._voice_recorder and self._voice_recording:
+                try:
+                    self._voice_recorder.cancel()
+                except Exception:
+                    pass
+            # Clean up old temp voice recordings
+            try:
+                from tools.voice_mode import cleanup_temp_recordings
+                cleanup_temp_recordings()
+            except Exception:
+                pass
            # Unregister callbacks to avoid dangling references
            set_sudo_password_callback(None)
            set_approval_callback(None)
--- a/hermes_cli/commands.py
+++ b/hermes_cli/commands.py
@@ -37,6 +37,7 @@ COMMANDS_BY_CATEGORY = {
        "/verbose": "Cycle tool progress display: off → new → all → verbose",
        "/reasoning": "Manage reasoning effort and display (usage: /reasoning [level|show|hide])",
        "/skin": "Show or change the display skin/theme",
+        "/voice": "Toggle voice mode (Ctrl+B to record). Usage: /voice [on|off|tts|status]",
    },
    "Tools & Skills": {
        "/tools": "List available tools",
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -203,6 +203,12 @@ DEFAULT_CONFIG = {
        },
    },

+    "voice": {
+        "record_key": "ctrl+space",
+        "max_recording_seconds": 120,
+        "auto_tts": False,
+    },
+    
    "human_delay": {
        "mode": "off",
        "min_ms": 800,
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,6 +48,7 @@ cron = ["croniter"]
 slack = ["slack-bolt>=1.18.0", "slack-sdk>=3.27.0"]
 cli = ["simple-term-menu"]
 tts-premium = ["elevenlabs"]
+voice = ["sounddevice>=0.4.6", "numpy>=1.24.0"]
 pty = [
  "ptyprocess>=0.7.0; sys_platform != 'win32'",
  "pywinpty>=2.0.0; sys_platform == 'win32'",
@@ -78,6 +79,7 @@ all = [
  "hermes-agent[mcp]",
  "hermes-agent[homeassistant]",
  "hermes-agent[acp]",
+  "hermes-agent[voice]",
 ]

 [project.scripts]
--- a/tests/tools/test_voice_mode.py
+++ b/tests/tools/test_voice_mode.py
@@ -0,0 +1,347 @@
+"""Tests for tools.voice_mode -- all mocked, no real microphone or API calls."""
+
+import os
+import struct
+import time
+import wave
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+# ============================================================================
+# Fixtures
+# ============================================================================
+
+@pytest.fixture
+def sample_wav(tmp_path):
+    """Create a minimal valid WAV file (1 second of silence at 16kHz)."""
+    wav_path = tmp_path / "test.wav"
+    n_frames = 16000  # 1 second at 16kHz
+    silence = struct.pack(f"<{n_frames}h", *([0] * n_frames))
+
+    with wave.open(str(wav_path), "wb") as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(16000)
+        wf.writeframes(silence)
+
+    return str(wav_path)
+
+
+@pytest.fixture
+def temp_voice_dir(tmp_path, monkeypatch):
+    """Redirect _TEMP_DIR to a temporary path."""
+    voice_dir = tmp_path / "hermes_voice"
+    voice_dir.mkdir()
+    monkeypatch.setattr("tools.voice_mode._TEMP_DIR", str(voice_dir))
+    return voice_dir
+
+
+@pytest.fixture
+def mock_sd(monkeypatch):
+    """Replace tools.voice_mode.sd with a MagicMock (sounddevice may not be installed)."""
+    mock = MagicMock()
+    monkeypatch.setattr("tools.voice_mode.sd", mock)
+    monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True)
+    # Also ensure numpy is available (use real numpy if installed, else mock)
+    try:
+        import numpy as real_np
+        monkeypatch.setattr("tools.voice_mode.np", real_np)
+    except ImportError:
+        monkeypatch.setattr("tools.voice_mode.np", MagicMock())
+    return mock
+
+
+# ============================================================================
+# check_voice_requirements
+# ============================================================================
+
+class TestCheckVoiceRequirements:
+    def test_all_requirements_met(self, monkeypatch):
+        monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True)
+        monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key")
+
+        from tools.voice_mode import check_voice_requirements
+
+        result = check_voice_requirements()
+        assert result["available"] is True
+        assert result["audio_available"] is True
+        assert result["stt_key_set"] is True
+        assert result["missing_packages"] == []
+
+    def test_missing_audio_packages(self, monkeypatch):
+        monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False)
+        monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key")
+
+        from tools.voice_mode import check_voice_requirements
+
+        result = check_voice_requirements()
+        assert result["available"] is False
+        assert result["audio_available"] is False
+        assert "sounddevice" in result["missing_packages"]
+        assert "numpy" in result["missing_packages"]
+
+    def test_missing_stt_key(self, monkeypatch):
+        monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True)
+        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
+
+        from tools.voice_mode import check_voice_requirements
+
+        result = check_voice_requirements()
+        assert result["available"] is False
+        assert result["stt_key_set"] is False
+        assert "STT API key: MISSING" in result["details"]
+
+
+# ============================================================================
+# AudioRecorder
+# ============================================================================
+
+class TestAudioRecorderStart:
+    def test_start_raises_without_audio(self, monkeypatch):
+        monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False)
+
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        with pytest.raises(RuntimeError, match="sounddevice and numpy"):
+            recorder.start()
+
+    def test_start_creates_and_starts_stream(self, mock_sd):
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        recorder.start()
+
+        assert recorder.is_recording is True
+        mock_sd.InputStream.assert_called_once()
+        mock_stream.start.assert_called_once()
+
+    def test_double_start_is_noop(self, mock_sd):
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        recorder.start()
+        recorder.start()  # second call should be noop
+
+        assert mock_sd.InputStream.call_count == 1
+
+
+class TestAudioRecorderStop:
+    def test_stop_returns_none_when_not_recording(self):
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        assert recorder.stop() is None
+
+    def test_stop_writes_wav_file(self, mock_sd, temp_voice_dir):
+        np = pytest.importorskip("numpy")
+
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder, SAMPLE_RATE
+
+        recorder = AudioRecorder()
+        recorder.start()
+
+        # Simulate captured audio frames (1 second of silence)
+        frame = np.zeros((SAMPLE_RATE, 1), dtype="int16")
+        recorder._frames = [frame]
+
+        wav_path = recorder.stop()
+
+        assert wav_path is not None
+        assert os.path.isfile(wav_path)
+        assert wav_path.endswith(".wav")
+        assert recorder.is_recording is False
+
+        # Verify it is a valid WAV
+        with wave.open(wav_path, "rb") as wf:
+            assert wf.getnchannels() == 1
+            assert wf.getsampwidth() == 2
+            assert wf.getframerate() == SAMPLE_RATE
+
+    def test_stop_returns_none_for_very_short_recording(self, mock_sd, temp_voice_dir):
+        np = pytest.importorskip("numpy")
+
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        recorder.start()
+
+        # Very short recording (100 samples = ~6ms at 16kHz)
+        frame = np.zeros((100, 1), dtype="int16")
+        recorder._frames = [frame]
+
+        wav_path = recorder.stop()
+        assert wav_path is None
+
+
+class TestAudioRecorderCancel:
+    def test_cancel_discards_frames(self, mock_sd):
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        recorder.start()
+        recorder._frames = [MagicMock()]  # simulate captured data
+
+        recorder.cancel()
+
+        assert recorder.is_recording is False
+        assert recorder._frames == []
+        mock_stream.stop.assert_called_once()
+        mock_stream.close.assert_called_once()
+
+    def test_cancel_when_not_recording_is_safe(self):
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        recorder.cancel()  # should not raise
+        assert recorder.is_recording is False
+
+
+class TestAudioRecorderProperties:
+    def test_elapsed_seconds_when_not_recording(self):
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        assert recorder.elapsed_seconds == 0.0
+
+    def test_elapsed_seconds_when_recording(self, mock_sd):
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        recorder.start()
+
+        # Force start time to 1 second ago
+        recorder._start_time = time.monotonic() - 1.0
+        elapsed = recorder.elapsed_seconds
+        assert 0.9 < elapsed < 2.0
+
+        recorder.cancel()
+
+
+# ============================================================================
+# transcribe_recording
+# ============================================================================
+
+class TestTranscribeRecording:
+    def test_delegates_to_transcribe_audio(self):
+        mock_transcribe = MagicMock(return_value={
+            "success": True,
+            "transcript": "hello world",
+        })
+
+        with patch("tools.transcription_tools.transcribe_audio", mock_transcribe):
+            from tools.voice_mode import transcribe_recording
+            result = transcribe_recording("/tmp/test.wav", model="whisper-1")
+
+        assert result["success"] is True
+        assert result["transcript"] == "hello world"
+        mock_transcribe.assert_called_once_with("/tmp/test.wav", model="whisper-1")
+
+
+# ============================================================================
+# play_audio_file
+# ============================================================================
+
+class TestPlayAudioFile:
+    def test_play_wav_via_sounddevice(self, monkeypatch, sample_wav):
+        np = pytest.importorskip("numpy")
+
+        mock_sd = MagicMock()
+        monkeypatch.setattr("tools.voice_mode.sd", mock_sd)
+        monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True)
+        monkeypatch.setattr("tools.voice_mode.np", np)
+
+        from tools.voice_mode import play_audio_file
+
+        result = play_audio_file(sample_wav)
+
+        assert result is True
+        mock_sd.play.assert_called_once()
+        mock_sd.wait.assert_called_once()
+
+    def test_returns_false_when_no_player(self, monkeypatch, sample_wav):
+        monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False)
+        monkeypatch.setattr("shutil.which", lambda _: None)
+
+        from tools.voice_mode import play_audio_file
+
+        result = play_audio_file(sample_wav)
+        assert result is False
+
+    def test_returns_false_for_missing_file(self):
+        from tools.voice_mode import play_audio_file
+
+        result = play_audio_file("/nonexistent/file.wav")
+        assert result is False
+
+
+# ============================================================================
+# cleanup_temp_recordings
+# ============================================================================
+
+class TestCleanupTempRecordings:
+    def test_old_files_deleted(self, temp_voice_dir):
+        # Create an "old" file
+        old_file = temp_voice_dir / "recording_20240101_000000.wav"
+        old_file.write_bytes(b"\x00" * 100)
+        # Set mtime to 2 hours ago
+        old_mtime = time.time() - 7200
+        os.utime(str(old_file), (old_mtime, old_mtime))
+
+        from tools.voice_mode import cleanup_temp_recordings
+
+        deleted = cleanup_temp_recordings(max_age_seconds=3600)
+        assert deleted == 1
+        assert not old_file.exists()
+
+    def test_recent_files_preserved(self, temp_voice_dir):
+        # Create a "recent" file
+        recent_file = temp_voice_dir / "recording_20260303_120000.wav"
+        recent_file.write_bytes(b"\x00" * 100)
+
+        from tools.voice_mode import cleanup_temp_recordings
+
+        deleted = cleanup_temp_recordings(max_age_seconds=3600)
+        assert deleted == 0
+        assert recent_file.exists()
+
+    def test_nonexistent_dir_returns_zero(self, monkeypatch):
+        monkeypatch.setattr("tools.voice_mode._TEMP_DIR", "/nonexistent/dir")
+
+        from tools.voice_mode import cleanup_temp_recordings
+
+        assert cleanup_temp_recordings() == 0
+
+    def test_non_recording_files_ignored(self, temp_voice_dir):
+        # Create a file that doesn't match the pattern
+        other_file = temp_voice_dir / "other_file.txt"
+        other_file.write_bytes(b"\x00" * 100)
+        old_mtime = time.time() - 7200
+        os.utime(str(other_file), (old_mtime, old_mtime))
+
+        from tools.voice_mode import cleanup_temp_recordings
+
+        deleted = cleanup_temp_recordings(max_age_seconds=3600)
+        assert deleted == 0
+        assert other_file.exists()
--- a/tools/voice_mode.py
+++ b/tools/voice_mode.py
@@ -0,0 +1,344 @@
+"""Voice Mode -- Push-to-talk audio recording and playback for the CLI.
+
+Provides audio capture via sounddevice, WAV encoding via stdlib wave,
+STT dispatch via tools.transcription_tools, and TTS playback via
+sounddevice or system audio players.
+
+Dependencies (optional):
+    pip install sounddevice numpy
+    or: pip install hermes-agent[voice]
+"""
+
+import logging
+import os
+import platform
+import shutil
+import subprocess
+import tempfile
+import threading
+import time
+import wave
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Optional imports with graceful degradation
+# ---------------------------------------------------------------------------
+try:
+    import sounddevice as sd
+    import numpy as np
+
+    _HAS_AUDIO = True
+except ImportError:
+    sd = None  # type: ignore[assignment]
+    np = None  # type: ignore[assignment]
+    _HAS_AUDIO = False
+
+# ---------------------------------------------------------------------------
+# Recording parameters
+# ---------------------------------------------------------------------------
+SAMPLE_RATE = 16000  # Whisper native rate
+CHANNELS = 1  # Mono
+DTYPE = "int16"  # 16-bit PCM
+SAMPLE_WIDTH = 2  # bytes per sample (int16)
+MAX_RECORDING_SECONDS = 120  # Safety cap
+
+# Temp directory for voice recordings
+_TEMP_DIR = os.path.join(tempfile.gettempdir(), "hermes_voice")
+
+
+# ============================================================================
+# AudioRecorder
+# ============================================================================
+class AudioRecorder:
+    """Thread-safe audio recorder using sounddevice.InputStream.
+
+    Usage::
+
+        recorder = AudioRecorder()
+        recorder.start()
+        # ... user speaks ...
+        wav_path = recorder.stop()   # returns path to WAV file
+        # or
+        recorder.cancel()            # discard without saving
+    """
+
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self._stream: Any = None
+        self._frames: List[Any] = []
+        self._recording = False
+        self._start_time: float = 0.0
+
+    # -- public properties ---------------------------------------------------
+
+    @property
+    def is_recording(self) -> bool:
+        return self._recording
+
+    @property
+    def elapsed_seconds(self) -> float:
+        if not self._recording:
+            return 0.0
+        return time.monotonic() - self._start_time
+
+    # -- public methods ------------------------------------------------------
+
+    def start(self) -> None:
+        """Start capturing audio from the default input device.
+
+        Raises ``RuntimeError`` if sounddevice/numpy are not installed
+        or if a recording is already in progress.
+        """
+        if not _HAS_AUDIO:
+            raise RuntimeError(
+                "Voice mode requires sounddevice and numpy.\n"
+                "Install with: pip install sounddevice numpy\n"
+                "Or: pip install hermes-agent[voice]"
+            )
+
+        with self._lock:
+            if self._recording:
+                return  # already recording
+
+            self._frames = []
+            self._start_time = time.monotonic()
+
+            def _callback(indata, frames, time_info, status):  # noqa: ARG001
+                if status:
+                    logger.debug("sounddevice status: %s", status)
+                self._frames.append(indata.copy())
+
+            self._stream = sd.InputStream(
+                samplerate=SAMPLE_RATE,
+                channels=CHANNELS,
+                dtype=DTYPE,
+                callback=_callback,
+            )
+            self._stream.start()
+            self._recording = True
+            logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS)
+
+    def stop(self) -> Optional[str]:
+        """Stop recording and write captured audio to a WAV file.
+
+        Returns:
+            Path to the WAV file, or ``None`` if no audio was captured.
+        """
+        with self._lock:
+            if not self._recording:
+                return None
+
+            self._recording = False
+
+            if self._stream is not None:
+                try:
+                    self._stream.stop()
+                    self._stream.close()
+                except Exception:
+                    pass
+                self._stream = None
+
+            if not self._frames:
+                return None
+
+            # Concatenate frames and write WAV
+            audio_data = np.concatenate(self._frames, axis=0)
+            self._frames = []
+
+            elapsed = time.monotonic() - self._start_time
+            logger.info("Voice recording stopped (%.1fs, %d samples)", elapsed, len(audio_data))
+
+            # Skip very short recordings (< 0.3s of audio)
+            min_samples = int(SAMPLE_RATE * 0.3)
+            if len(audio_data) < min_samples:
+                logger.debug("Recording too short (%d samples), discarding", len(audio_data))
+                return None
+
+            return self._write_wav(audio_data)
+
+    def cancel(self) -> None:
+        """Stop recording and discard all captured audio."""
+        with self._lock:
+            self._recording = False
+            self._frames = []
+
+            if self._stream is not None:
+                try:
+                    self._stream.stop()
+                    self._stream.close()
+                except Exception:
+                    pass
+                self._stream = None
+
+            logger.info("Voice recording cancelled")
+
+    # -- private helpers -----------------------------------------------------
+
+    @staticmethod
+    def _write_wav(audio_data) -> str:
+        """Write numpy int16 audio data to a WAV file.
+
+        Returns the file path.
+        """
+        os.makedirs(_TEMP_DIR, exist_ok=True)
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        wav_path = os.path.join(_TEMP_DIR, f"recording_{timestamp}.wav")
+
+        with wave.open(wav_path, "wb") as wf:
+            wf.setnchannels(CHANNELS)
+            wf.setsampwidth(SAMPLE_WIDTH)
+            wf.setframerate(SAMPLE_RATE)
+            wf.writeframes(audio_data.tobytes())
+
+        file_size = os.path.getsize(wav_path)
+        logger.info("WAV written: %s (%d bytes)", wav_path, file_size)
+        return wav_path
+
+
+# ============================================================================
+# STT dispatch
+# ============================================================================
+def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str, Any]:
+    """Transcribe a WAV recording using the existing Whisper pipeline.
+
+    Delegates to ``tools.transcription_tools.transcribe_audio()``.
+
+    Args:
+        wav_path: Path to the WAV file.
+        model: Whisper model name (default: from config or ``whisper-1``).
+
+    Returns:
+        Dict with ``success``, ``transcript``, and optionally ``error``.
+    """
+    from tools.transcription_tools import transcribe_audio
+
+    return transcribe_audio(wav_path, model=model)
+
+
+# ============================================================================
+# Audio playback
+# ============================================================================
+def play_audio_file(file_path: str) -> bool:
+    """Play an audio file through the default output device.
+
+    Strategy:
+    1. WAV files via ``sounddevice.play()`` when available.
+    2. System commands: ``afplay`` (macOS), ``ffplay`` (cross-platform),
+       ``aplay`` (Linux ALSA).
+
+    Returns:
+        ``True`` if playback succeeded, ``False`` otherwise.
+    """
+    if not os.path.isfile(file_path):
+        logger.warning("Audio file not found: %s", file_path)
+        return False
+
+    # Try sounddevice for WAV files
+    if _HAS_AUDIO and file_path.endswith(".wav"):
+        try:
+            with wave.open(file_path, "rb") as wf:
+                frames = wf.readframes(wf.getnframes())
+                audio_data = np.frombuffer(frames, dtype=np.int16)
+                sample_rate = wf.getframerate()
+
+            sd.play(audio_data, samplerate=sample_rate)
+            sd.wait()
+            return True
+        except Exception as e:
+            logger.debug("sounddevice playback failed: %s", e)
+
+    # Fall back to system audio players
+    system = platform.system()
+    players = []
+
+    if system == "Darwin":
+        players.append(["afplay", file_path])
+    players.append(["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet", file_path])
+    if system == "Linux":
+        players.append(["aplay", "-q", file_path])
+
+    for cmd in players:
+        exe = shutil.which(cmd[0])
+        if exe:
+            try:
+                subprocess.run(cmd, capture_output=True, timeout=300)
+                return True
+            except Exception as e:
+                logger.debug("System player %s failed: %s", cmd[0], e)
+
+    logger.warning("No audio player available for %s", file_path)
+    return False
+
+
+# ============================================================================
+# Requirements check
+# ============================================================================
+def check_voice_requirements() -> Dict[str, Any]:
+    """Check if all voice mode requirements are met.
+
+    Returns:
+        Dict with ``available``, ``audio_available``, ``stt_key_set``,
+        ``missing_packages``, and ``details``.
+    """
+    stt_key_set = bool(os.getenv("VOICE_TOOLS_OPENAI_KEY"))
+    missing: List[str] = []
+
+    if not _HAS_AUDIO:
+        missing.extend(["sounddevice", "numpy"])
+
+    available = _HAS_AUDIO and stt_key_set
+    details_parts = []
+
+    if _HAS_AUDIO:
+        details_parts.append("Audio capture: OK")
+    else:
+        details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)")
+
+    if stt_key_set:
+        details_parts.append("STT API key: OK")
+    else:
+        details_parts.append("STT API key: MISSING (set VOICE_TOOLS_OPENAI_KEY)")
+
+    return {
+        "available": available,
+        "audio_available": _HAS_AUDIO,
+        "stt_key_set": stt_key_set,
+        "missing_packages": missing,
+        "details": "\n".join(details_parts),
+    }
+
+
+# ============================================================================
+# Temp file cleanup
+# ============================================================================
+def cleanup_temp_recordings(max_age_seconds: int = 3600) -> int:
+    """Remove old temporary voice recording files.
+
+    Args:
+        max_age_seconds: Delete files older than this (default: 1 hour).
+
+    Returns:
+        Number of files deleted.
+    """
+    if not os.path.isdir(_TEMP_DIR):
+        return 0
+
+    deleted = 0
+    now = time.time()
+
+    for entry in os.scandir(_TEMP_DIR):
+        if entry.is_file() and entry.name.startswith("recording_") and entry.name.endswith(".wav"):
+            try:
+                age = now - entry.stat().st_mtime
+                if age > max_age_seconds:
+                    os.unlink(entry.path)
+                    deleted += 1
+            except OSError:
+                pass
+
+    if deleted:
+        logger.debug("Cleaned up %d old voice recordings", deleted)
+    return deleted