Merge pull request #1299 from NousResearch/hermes/hermes-f5fb1d3b
fix: salvage PR #327 voice mode onto current main
This commit is contained in:
@@ -224,24 +224,14 @@ def _emergency_cleanup_all_sessions():
|
||||
logger.error("Emergency cleanup error: %s", e)
|
||||
|
||||
|
||||
def _signal_handler(signum, frame):
|
||||
"""Handle interrupt signals to cleanup sessions before exit."""
|
||||
logger.warning("Received signal %s, cleaning up...", signum)
|
||||
_emergency_cleanup_all_sessions()
|
||||
sys.exit(128 + signum)
|
||||
|
||||
|
||||
# Register cleanup handlers
|
||||
# Register cleanup via atexit only. Previous versions installed SIGINT/SIGTERM
|
||||
# handlers that called sys.exit(), but this conflicts with prompt_toolkit's
|
||||
# async event loop — a SystemExit raised inside a key-binding callback
|
||||
# corrupts the coroutine state and makes the process unkillable. atexit
|
||||
# handlers run on any normal exit (including sys.exit), so browser sessions
|
||||
# are still cleaned up without hijacking signals.
|
||||
atexit.register(_emergency_cleanup_all_sessions)
|
||||
|
||||
# Only register signal handlers in main process (not in multiprocessing workers)
|
||||
try:
|
||||
if os.getpid() == os.getpgrp(): # Main process check
|
||||
signal.signal(signal.SIGINT, _signal_handler)
|
||||
signal.signal(signal.SIGTERM, _signal_handler)
|
||||
except (OSError, AttributeError):
|
||||
pass # Signal handling not available (e.g., Windows or worker process)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Inactivity Cleanup Functions
|
||||
|
||||
@@ -2,11 +2,12 @@
|
||||
"""
|
||||
Transcription Tools Module
|
||||
|
||||
Provides speech-to-text transcription with two providers:
|
||||
Provides speech-to-text transcription with three providers:
|
||||
|
||||
- **local** (default, free) — faster-whisper running locally, no API key needed.
|
||||
Auto-downloads the model (~150 MB for ``base``) on first use.
|
||||
- **openai** — OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``.
|
||||
- **groq** (free tier) — Groq Whisper API, requires ``GROQ_API_KEY``.
|
||||
- **openai** (paid) — OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``.
|
||||
|
||||
Used by the messaging gateway to automatically transcribe voice messages
|
||||
sent by users on Telegram, Discord, WhatsApp, Slack, and Signal.
|
||||
@@ -33,18 +34,9 @@ logger = logging.getLogger(__name__)
|
||||
# Optional imports — graceful degradation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
try:
|
||||
from faster_whisper import WhisperModel
|
||||
_HAS_FASTER_WHISPER = True
|
||||
except ImportError:
|
||||
_HAS_FASTER_WHISPER = False
|
||||
WhisperModel = None # type: ignore[assignment,misc]
|
||||
|
||||
try:
|
||||
from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
|
||||
_HAS_OPENAI = True
|
||||
except ImportError:
|
||||
_HAS_OPENAI = False
|
||||
import importlib.util as _ilu
|
||||
_HAS_FASTER_WHISPER = _ilu.find_spec("faster_whisper") is not None
|
||||
_HAS_OPENAI = _ilu.find_spec("openai") is not None
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
@@ -52,13 +44,21 @@ except ImportError:
|
||||
|
||||
DEFAULT_PROVIDER = "local"
|
||||
DEFAULT_LOCAL_MODEL = "base"
|
||||
DEFAULT_OPENAI_MODEL = "whisper-1"
|
||||
DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1")
|
||||
DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo")
|
||||
|
||||
GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
|
||||
OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
|
||||
|
||||
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"}
|
||||
MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB
|
||||
|
||||
# Known model sets for auto-correction
|
||||
OPENAI_MODELS = {"whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"}
|
||||
GROQ_MODELS = {"whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"}
|
||||
|
||||
# Singleton for the local model — loaded once, reused across calls
|
||||
_local_model: Optional["WhisperModel"] = None
|
||||
_local_model: Optional[object] = None
|
||||
_local_model_name: Optional[str] = None
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -66,6 +66,24 @@ _local_model_name: Optional[str] = None
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_stt_model_from_config() -> Optional[str]:
|
||||
"""Read the STT model name from ~/.hermes/config.yaml.
|
||||
|
||||
Returns the value of ``stt.model`` if present, otherwise ``None``.
|
||||
Silently returns ``None`` on any error (missing file, bad YAML, etc.).
|
||||
"""
|
||||
try:
|
||||
import yaml
|
||||
cfg_path = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "config.yaml"
|
||||
if cfg_path.exists():
|
||||
with open(cfg_path) as f:
|
||||
data = yaml.safe_load(f) or {}
|
||||
return data.get("stt", {}).get("model")
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _load_stt_config() -> dict:
|
||||
"""Load the ``stt`` section from user config, falling back to defaults."""
|
||||
try:
|
||||
@@ -80,7 +98,7 @@ def _get_provider(stt_config: dict) -> str:
|
||||
|
||||
Priority:
|
||||
1. Explicit config value (``stt.provider``)
|
||||
2. Auto-detect: local if faster-whisper available, else openai if key set
|
||||
2. Auto-detect: local > groq (free) > openai (paid)
|
||||
3. Disabled (returns "none")
|
||||
"""
|
||||
provider = stt_config.get("provider", DEFAULT_PROVIDER)
|
||||
@@ -88,19 +106,37 @@ def _get_provider(stt_config: dict) -> str:
|
||||
if provider == "local":
|
||||
if _HAS_FASTER_WHISPER:
|
||||
return "local"
|
||||
# Local requested but not available — fall back to openai if possible
|
||||
# Local requested but not available — fall back to groq, then openai
|
||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||
logger.info("faster-whisper not installed, falling back to Groq Whisper API")
|
||||
return "groq"
|
||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
logger.info("faster-whisper not installed, falling back to OpenAI Whisper API")
|
||||
return "openai"
|
||||
return "none"
|
||||
|
||||
if provider == "groq":
|
||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||
return "groq"
|
||||
# Groq requested but no key — fall back
|
||||
if _HAS_FASTER_WHISPER:
|
||||
logger.info("GROQ_API_KEY not set, falling back to local faster-whisper")
|
||||
return "local"
|
||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
logger.info("GROQ_API_KEY not set, falling back to OpenAI Whisper API")
|
||||
return "openai"
|
||||
return "none"
|
||||
|
||||
if provider == "openai":
|
||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
return "openai"
|
||||
# OpenAI requested but no key — fall back to local if possible
|
||||
# OpenAI requested but no key — fall back
|
||||
if _HAS_FASTER_WHISPER:
|
||||
logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to local faster-whisper")
|
||||
return "local"
|
||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||
logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to Groq Whisper API")
|
||||
return "groq"
|
||||
return "none"
|
||||
|
||||
return provider # Unknown — let it fail downstream
|
||||
@@ -150,6 +186,7 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
return {"success": False, "transcript": "", "error": "faster-whisper not installed"}
|
||||
|
||||
try:
|
||||
from faster_whisper import WhisperModel
|
||||
# Lazy-load the model (downloads on first use, ~150 MB for 'base')
|
||||
if _local_model is None or _local_model_name != model_name:
|
||||
logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name)
|
||||
@@ -164,12 +201,60 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
Path(file_path).name, model_name, info.language, info.duration,
|
||||
)
|
||||
|
||||
return {"success": True, "transcript": transcript}
|
||||
return {"success": True, "transcript": transcript, "provider": "local"}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Local transcription failed: %s", e, exc_info=True)
|
||||
return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider: groq (Whisper API — free tier)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _transcribe_groq(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
"""Transcribe using Groq Whisper API (free tier available)."""
|
||||
api_key = os.getenv("GROQ_API_KEY")
|
||||
if not api_key:
|
||||
return {"success": False, "transcript": "", "error": "GROQ_API_KEY not set"}
|
||||
|
||||
if not _HAS_OPENAI:
|
||||
return {"success": False, "transcript": "", "error": "openai package not installed"}
|
||||
|
||||
# Auto-correct model if caller passed an OpenAI-only model
|
||||
if model_name in OPENAI_MODELS:
|
||||
logger.info("Model %s not available on Groq, using %s", model_name, DEFAULT_GROQ_STT_MODEL)
|
||||
model_name = DEFAULT_GROQ_STT_MODEL
|
||||
|
||||
try:
|
||||
from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
|
||||
client = OpenAI(api_key=api_key, base_url=GROQ_BASE_URL, timeout=30, max_retries=0)
|
||||
|
||||
with open(file_path, "rb") as audio_file:
|
||||
transcription = client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=audio_file,
|
||||
response_format="text",
|
||||
)
|
||||
|
||||
transcript_text = str(transcription).strip()
|
||||
logger.info("Transcribed %s via Groq API (%s, %d chars)",
|
||||
Path(file_path).name, model_name, len(transcript_text))
|
||||
|
||||
return {"success": True, "transcript": transcript_text, "provider": "groq"}
|
||||
|
||||
except PermissionError:
|
||||
return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
|
||||
except APIConnectionError as e:
|
||||
return {"success": False, "transcript": "", "error": f"Connection error: {e}"}
|
||||
except APITimeoutError as e:
|
||||
return {"success": False, "transcript": "", "error": f"Request timeout: {e}"}
|
||||
except APIError as e:
|
||||
return {"success": False, "transcript": "", "error": f"API error: {e}"}
|
||||
except Exception as e:
|
||||
logger.error("Groq transcription failed: %s", e, exc_info=True)
|
||||
return {"success": False, "transcript": "", "error": f"Transcription failed: {e}"}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider: openai (Whisper API)
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -184,8 +269,14 @@ def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
if not _HAS_OPENAI:
|
||||
return {"success": False, "transcript": "", "error": "openai package not installed"}
|
||||
|
||||
# Auto-correct model if caller passed a Groq-only model
|
||||
if model_name in GROQ_MODELS:
|
||||
logger.info("Model %s not available on OpenAI, using %s", model_name, DEFAULT_STT_MODEL)
|
||||
model_name = DEFAULT_STT_MODEL
|
||||
|
||||
try:
|
||||
client = OpenAI(api_key=api_key, base_url="https://api.openai.com/v1")
|
||||
from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
|
||||
client = OpenAI(api_key=api_key, base_url=OPENAI_BASE_URL, timeout=30, max_retries=0)
|
||||
|
||||
with open(file_path, "rb") as audio_file:
|
||||
transcription = client.audio.transcriptions.create(
|
||||
@@ -198,7 +289,7 @@ def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
logger.info("Transcribed %s via OpenAI API (%s, %d chars)",
|
||||
Path(file_path).name, model_name, len(transcript_text))
|
||||
|
||||
return {"success": True, "transcript": transcript_text}
|
||||
return {"success": True, "transcript": transcript_text, "provider": "openai"}
|
||||
|
||||
except PermissionError:
|
||||
return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
|
||||
@@ -223,7 +314,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
||||
|
||||
Provider priority:
|
||||
1. User config (``stt.provider`` in config.yaml)
|
||||
2. Auto-detect: local faster-whisper if available, else OpenAI API
|
||||
2. Auto-detect: local faster-whisper (free) > Groq (free tier) > OpenAI (paid)
|
||||
|
||||
Args:
|
||||
file_path: Absolute path to the audio file to transcribe.
|
||||
@@ -234,6 +325,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
||||
- "success" (bool): Whether transcription succeeded
|
||||
- "transcript" (str): The transcribed text (empty on failure)
|
||||
- "error" (str, optional): Error message if success is False
|
||||
- "provider" (str, optional): Which provider was used
|
||||
"""
|
||||
# Validate input
|
||||
error = _validate_audio_file(file_path)
|
||||
@@ -249,9 +341,13 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
||||
model_name = model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
|
||||
return _transcribe_local(file_path, model_name)
|
||||
|
||||
if provider == "groq":
|
||||
model_name = model or DEFAULT_GROQ_STT_MODEL
|
||||
return _transcribe_groq(file_path, model_name)
|
||||
|
||||
if provider == "openai":
|
||||
openai_cfg = stt_config.get("openai", {})
|
||||
model_name = model or openai_cfg.get("model", DEFAULT_OPENAI_MODEL)
|
||||
model_name = model or openai_cfg.get("model", DEFAULT_STT_MODEL)
|
||||
return _transcribe_openai(file_path, model_name)
|
||||
|
||||
# No provider available
|
||||
@@ -260,6 +356,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
||||
"transcript": "",
|
||||
"error": (
|
||||
"No STT provider available. Install faster-whisper for free local "
|
||||
"transcription, or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API."
|
||||
"transcription, set GROQ_API_KEY for free Groq Whisper, "
|
||||
"or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API."
|
||||
),
|
||||
}
|
||||
|
||||
@@ -25,35 +25,41 @@ import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from typing import Callable, Dict, Any, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional imports -- providers degrade gracefully if not installed
|
||||
# Lazy imports -- providers are imported only when actually used to avoid
|
||||
# crashing in headless environments (SSH, Docker, WSL, no PortAudio).
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
|
||||
def _import_edge_tts():
|
||||
"""Lazy import edge_tts. Returns the module or raises ImportError."""
|
||||
import edge_tts
|
||||
_HAS_EDGE_TTS = True
|
||||
except ImportError:
|
||||
_HAS_EDGE_TTS = False
|
||||
return edge_tts
|
||||
|
||||
try:
|
||||
def _import_elevenlabs():
|
||||
"""Lazy import ElevenLabs client. Returns the class or raises ImportError."""
|
||||
from elevenlabs.client import ElevenLabs
|
||||
_HAS_ELEVENLABS = True
|
||||
except ImportError:
|
||||
_HAS_ELEVENLABS = False
|
||||
return ElevenLabs
|
||||
|
||||
# openai is a core dependency, but guard anyway
|
||||
try:
|
||||
def _import_openai_client():
|
||||
"""Lazy import OpenAI client. Returns the class or raises ImportError."""
|
||||
from openai import OpenAI as OpenAIClient
|
||||
_HAS_OPENAI = True
|
||||
except ImportError:
|
||||
_HAS_OPENAI = False
|
||||
return OpenAIClient
|
||||
|
||||
def _import_sounddevice():
|
||||
"""Lazy import sounddevice. Returns the module or raises ImportError/OSError."""
|
||||
import sounddevice as sd
|
||||
return sd
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
@@ -63,6 +69,7 @@ DEFAULT_PROVIDER = "edge"
|
||||
DEFAULT_EDGE_VOICE = "en-US-AriaNeural"
|
||||
DEFAULT_ELEVENLABS_VOICE_ID = "pNInz6obpgDQGcFmaJgB" # Adam
|
||||
DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
|
||||
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
|
||||
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
|
||||
DEFAULT_OPENAI_VOICE = "alloy"
|
||||
DEFAULT_OUTPUT_DIR = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache")
|
||||
@@ -154,10 +161,11 @@ async def _generate_edge_tts(text: str, output_path: str, tts_config: Dict[str,
|
||||
Returns:
|
||||
Path to the saved audio file.
|
||||
"""
|
||||
_edge_tts = _import_edge_tts()
|
||||
edge_config = tts_config.get("edge", {})
|
||||
voice = edge_config.get("voice", DEFAULT_EDGE_VOICE)
|
||||
|
||||
communicate = edge_tts.Communicate(text, voice)
|
||||
communicate = _edge_tts.Communicate(text, voice)
|
||||
await communicate.save(output_path)
|
||||
return output_path
|
||||
|
||||
@@ -191,6 +199,7 @@ def _generate_elevenlabs(text: str, output_path: str, tts_config: Dict[str, Any]
|
||||
else:
|
||||
output_format = "mp3_44100_128"
|
||||
|
||||
ElevenLabs = _import_elevenlabs()
|
||||
client = ElevenLabs(api_key=api_key)
|
||||
audio_generator = client.text_to_speech.convert(
|
||||
text=text,
|
||||
@@ -236,6 +245,7 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
|
||||
else:
|
||||
response_format = "mp3"
|
||||
|
||||
OpenAIClient = _import_openai_client()
|
||||
client = OpenAIClient(api_key=api_key, base_url="https://api.openai.com/v1")
|
||||
response = client.audio.speech.create(
|
||||
model=model,
|
||||
@@ -311,7 +321,9 @@ def text_to_speech_tool(
|
||||
try:
|
||||
# Generate audio with the configured provider
|
||||
if provider == "elevenlabs":
|
||||
if not _HAS_ELEVENLABS:
|
||||
try:
|
||||
_import_elevenlabs()
|
||||
except ImportError:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": "ElevenLabs provider selected but 'elevenlabs' package not installed. Run: pip install elevenlabs"
|
||||
@@ -320,7 +332,9 @@ def text_to_speech_tool(
|
||||
_generate_elevenlabs(text, file_str, tts_config)
|
||||
|
||||
elif provider == "openai":
|
||||
if not _HAS_OPENAI:
|
||||
try:
|
||||
_import_openai_client()
|
||||
except ImportError:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": "OpenAI provider selected but 'openai' package not installed."
|
||||
@@ -330,7 +344,9 @@ def text_to_speech_tool(
|
||||
|
||||
else:
|
||||
# Default: Edge TTS (free)
|
||||
if not _HAS_EDGE_TTS:
|
||||
try:
|
||||
_import_edge_tts()
|
||||
except ImportError:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": "Edge TTS not available. Run: pip install edge-tts"
|
||||
@@ -411,15 +427,262 @@ def check_tts_requirements() -> bool:
|
||||
Returns:
|
||||
bool: True if at least one provider can work.
|
||||
"""
|
||||
if _HAS_EDGE_TTS:
|
||||
return True
|
||||
if _HAS_ELEVENLABS and os.getenv("ELEVENLABS_API_KEY"):
|
||||
return True
|
||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
try:
|
||||
_import_edge_tts()
|
||||
return True
|
||||
except ImportError:
|
||||
pass
|
||||
try:
|
||||
_import_elevenlabs()
|
||||
if os.getenv("ELEVENLABS_API_KEY"):
|
||||
return True
|
||||
except ImportError:
|
||||
pass
|
||||
try:
|
||||
_import_openai_client()
|
||||
if os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
return True
|
||||
except ImportError:
|
||||
pass
|
||||
return False
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Streaming TTS: sentence-by-sentence pipeline for ElevenLabs
|
||||
# ===========================================================================
|
||||
# Sentence boundary pattern: punctuation followed by space or newline
|
||||
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])(?:\s|\n)|(?:\n\n)')
|
||||
|
||||
# Markdown stripping patterns (same as cli.py _voice_speak_response)
|
||||
_MD_CODE_BLOCK = re.compile(r'```[\s\S]*?```')
|
||||
_MD_LINK = re.compile(r'\[([^\]]+)\]\([^)]+\)')
|
||||
_MD_URL = re.compile(r'https?://\S+')
|
||||
_MD_BOLD = re.compile(r'\*\*(.+?)\*\*')
|
||||
_MD_ITALIC = re.compile(r'\*(.+?)\*')
|
||||
_MD_INLINE_CODE = re.compile(r'`(.+?)`')
|
||||
_MD_HEADER = re.compile(r'^#+\s*', flags=re.MULTILINE)
|
||||
_MD_LIST_ITEM = re.compile(r'^\s*[-*]\s+', flags=re.MULTILINE)
|
||||
_MD_HR = re.compile(r'---+')
|
||||
_MD_EXCESS_NL = re.compile(r'\n{3,}')
|
||||
|
||||
|
||||
def _strip_markdown_for_tts(text: str) -> str:
|
||||
"""Remove markdown formatting that shouldn't be spoken aloud."""
|
||||
text = _MD_CODE_BLOCK.sub(' ', text)
|
||||
text = _MD_LINK.sub(r'\1', text)
|
||||
text = _MD_URL.sub('', text)
|
||||
text = _MD_BOLD.sub(r'\1', text)
|
||||
text = _MD_ITALIC.sub(r'\1', text)
|
||||
text = _MD_INLINE_CODE.sub(r'\1', text)
|
||||
text = _MD_HEADER.sub('', text)
|
||||
text = _MD_LIST_ITEM.sub('', text)
|
||||
text = _MD_HR.sub('', text)
|
||||
text = _MD_EXCESS_NL.sub('\n\n', text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def stream_tts_to_speaker(
|
||||
text_queue: queue.Queue,
|
||||
stop_event: threading.Event,
|
||||
tts_done_event: threading.Event,
|
||||
display_callback: Optional[Callable[[str], None]] = None,
|
||||
):
|
||||
"""Consume text deltas from *text_queue*, buffer them into sentences,
|
||||
and stream each sentence through ElevenLabs TTS to the speaker in
|
||||
real-time.
|
||||
|
||||
Protocol:
|
||||
* The producer puts ``str`` deltas onto *text_queue*.
|
||||
* A ``None`` sentinel signals end-of-text (flush remaining buffer).
|
||||
* *stop_event* can be set to abort early (e.g. user interrupt).
|
||||
* *tts_done_event* is **set** in the ``finally`` block so callers
|
||||
waiting on it (continuous voice mode) know playback is finished.
|
||||
"""
|
||||
tts_done_event.clear()
|
||||
|
||||
try:
|
||||
# --- TTS client setup (optional -- display_callback works without it) ---
|
||||
client = None
|
||||
output_stream = None
|
||||
voice_id = DEFAULT_ELEVENLABS_VOICE_ID
|
||||
model_id = DEFAULT_ELEVENLABS_STREAMING_MODEL_ID
|
||||
|
||||
tts_config = _load_tts_config()
|
||||
el_config = tts_config.get("elevenlabs", {})
|
||||
voice_id = el_config.get("voice_id", voice_id)
|
||||
model_id = el_config.get("streaming_model_id",
|
||||
el_config.get("model_id", model_id))
|
||||
|
||||
api_key = os.getenv("ELEVENLABS_API_KEY", "")
|
||||
if not api_key:
|
||||
logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled")
|
||||
else:
|
||||
try:
|
||||
ElevenLabs = _import_elevenlabs()
|
||||
client = ElevenLabs(api_key=api_key)
|
||||
except ImportError:
|
||||
logger.warning("elevenlabs package not installed; streaming TTS disabled")
|
||||
|
||||
# Open a single sounddevice output stream for the lifetime of
|
||||
# this function. ElevenLabs pcm_24000 produces signed 16-bit
|
||||
# little-endian mono PCM at 24 kHz.
|
||||
if client is not None:
|
||||
try:
|
||||
sd = _import_sounddevice()
|
||||
import numpy as _np
|
||||
output_stream = sd.OutputStream(
|
||||
samplerate=24000, channels=1, dtype="int16",
|
||||
)
|
||||
output_stream.start()
|
||||
except (ImportError, OSError) as exc:
|
||||
logger.debug("sounddevice not available: %s", exc)
|
||||
output_stream = None
|
||||
except Exception as exc:
|
||||
logger.warning("sounddevice OutputStream failed: %s", exc)
|
||||
output_stream = None
|
||||
|
||||
sentence_buf = ""
|
||||
min_sentence_len = 20
|
||||
long_flush_len = 100
|
||||
queue_timeout = 0.5
|
||||
_spoken_sentences: list[str] = [] # track spoken sentences to skip duplicates
|
||||
# Regex to strip complete <think>...</think> blocks from buffer
|
||||
_think_block_re = re.compile(r'<think[\s>].*?</think>', flags=re.DOTALL)
|
||||
|
||||
def _speak_sentence(sentence: str):
|
||||
"""Display sentence and optionally generate + play audio."""
|
||||
if stop_event.is_set():
|
||||
return
|
||||
cleaned = _strip_markdown_for_tts(sentence).strip()
|
||||
if not cleaned:
|
||||
return
|
||||
# Skip duplicate/near-duplicate sentences (LLM repetition)
|
||||
cleaned_lower = cleaned.lower().rstrip(".!,")
|
||||
for prev in _spoken_sentences:
|
||||
if prev.lower().rstrip(".!,") == cleaned_lower:
|
||||
return
|
||||
_spoken_sentences.append(cleaned)
|
||||
# Display raw sentence on screen before TTS processing
|
||||
if display_callback is not None:
|
||||
display_callback(sentence)
|
||||
# Skip audio generation if no TTS client available
|
||||
if client is None:
|
||||
return
|
||||
# Truncate very long sentences
|
||||
if len(cleaned) > MAX_TEXT_LENGTH:
|
||||
cleaned = cleaned[:MAX_TEXT_LENGTH]
|
||||
try:
|
||||
audio_iter = client.text_to_speech.convert(
|
||||
text=cleaned,
|
||||
voice_id=voice_id,
|
||||
model_id=model_id,
|
||||
output_format="pcm_24000",
|
||||
)
|
||||
if output_stream is not None:
|
||||
for chunk in audio_iter:
|
||||
if stop_event.is_set():
|
||||
break
|
||||
import numpy as _np
|
||||
audio_array = _np.frombuffer(chunk, dtype=_np.int16)
|
||||
output_stream.write(audio_array.reshape(-1, 1))
|
||||
else:
|
||||
# Fallback: write chunks to temp file and play via system player
|
||||
_play_via_tempfile(audio_iter, stop_event)
|
||||
except Exception as exc:
|
||||
logger.warning("Streaming TTS sentence failed: %s", exc)
|
||||
|
||||
def _play_via_tempfile(audio_iter, stop_evt):
|
||||
"""Write PCM chunks to a temp WAV file and play it."""
|
||||
tmp_path = None
|
||||
try:
|
||||
import wave
|
||||
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
||||
tmp_path = tmp.name
|
||||
with wave.open(tmp, "wb") as wf:
|
||||
wf.setnchannels(1)
|
||||
wf.setsampwidth(2) # 16-bit
|
||||
wf.setframerate(24000)
|
||||
for chunk in audio_iter:
|
||||
if stop_evt.is_set():
|
||||
break
|
||||
wf.writeframes(chunk)
|
||||
from tools.voice_mode import play_audio_file
|
||||
play_audio_file(tmp_path)
|
||||
except Exception as exc:
|
||||
logger.warning("Temp-file TTS fallback failed: %s", exc)
|
||||
finally:
|
||||
if tmp_path:
|
||||
try:
|
||||
os.unlink(tmp_path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
while not stop_event.is_set():
|
||||
# Read next delta from queue
|
||||
try:
|
||||
delta = text_queue.get(timeout=queue_timeout)
|
||||
except queue.Empty:
|
||||
# Timeout: if we have accumulated a long buffer, flush it
|
||||
if len(sentence_buf) > long_flush_len:
|
||||
_speak_sentence(sentence_buf)
|
||||
sentence_buf = ""
|
||||
continue
|
||||
|
||||
if delta is None:
|
||||
# End-of-text sentinel: strip any remaining think blocks, flush
|
||||
sentence_buf = _think_block_re.sub('', sentence_buf)
|
||||
if sentence_buf.strip():
|
||||
_speak_sentence(sentence_buf)
|
||||
break
|
||||
|
||||
sentence_buf += delta
|
||||
|
||||
# --- Think block filtering ---
|
||||
# Strip complete <think>...</think> blocks from buffer.
|
||||
# Works correctly even when tags span multiple deltas.
|
||||
sentence_buf = _think_block_re.sub('', sentence_buf)
|
||||
|
||||
# If an incomplete <think tag is at the end, wait for more data
|
||||
# before extracting sentences (the closing tag may arrive next).
|
||||
if '<think' in sentence_buf and '</think>' not in sentence_buf:
|
||||
continue
|
||||
|
||||
# Check for sentence boundaries
|
||||
while True:
|
||||
m = _SENTENCE_BOUNDARY_RE.search(sentence_buf)
|
||||
if m is None:
|
||||
break
|
||||
end_pos = m.end()
|
||||
sentence = sentence_buf[:end_pos]
|
||||
sentence_buf = sentence_buf[end_pos:]
|
||||
# Merge short fragments into the next sentence
|
||||
if len(sentence.strip()) < min_sentence_len:
|
||||
sentence_buf = sentence + sentence_buf
|
||||
break
|
||||
_speak_sentence(sentence)
|
||||
|
||||
# Drain any remaining items from the queue
|
||||
while True:
|
||||
try:
|
||||
text_queue.get_nowait()
|
||||
except queue.Empty:
|
||||
break
|
||||
|
||||
# output_stream is closed in the finally block below
|
||||
|
||||
except Exception as exc:
|
||||
logger.warning("Streaming TTS pipeline error: %s", exc)
|
||||
finally:
|
||||
# Always close the audio output stream to avoid locking the device
|
||||
if output_stream is not None:
|
||||
try:
|
||||
output_stream.stop()
|
||||
output_stream.close()
|
||||
except Exception:
|
||||
pass
|
||||
tts_done_event.set()
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Main -- quick diagnostics
|
||||
# ===========================================================================
|
||||
@@ -427,12 +690,19 @@ if __name__ == "__main__":
|
||||
print("🔊 Text-to-Speech Tool Module")
|
||||
print("=" * 50)
|
||||
|
||||
def _check(importer, label):
|
||||
try:
|
||||
importer()
|
||||
return True
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
print(f"\nProvider availability:")
|
||||
print(f" Edge TTS: {'✅ installed' if _HAS_EDGE_TTS else '❌ not installed (pip install edge-tts)'}")
|
||||
print(f" ElevenLabs: {'✅ installed' if _HAS_ELEVENLABS else '❌ not installed (pip install elevenlabs)'}")
|
||||
print(f" API Key: {'✅ set' if os.getenv('ELEVENLABS_API_KEY') else '❌ not set'}")
|
||||
print(f" OpenAI: {'✅ installed' if _HAS_OPENAI else '❌ not installed'}")
|
||||
print(f" API Key: {'✅ set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else '❌ not set (VOICE_TOOLS_OPENAI_KEY)'}")
|
||||
print(f" Edge TTS: {'installed' if _check(_import_edge_tts, 'edge') else 'not installed (pip install edge-tts)'}")
|
||||
print(f" ElevenLabs: {'installed' if _check(_import_elevenlabs, 'el') else 'not installed (pip install elevenlabs)'}")
|
||||
print(f" API Key: {'set' if os.getenv('ELEVENLABS_API_KEY') else 'not set'}")
|
||||
print(f" OpenAI: {'installed' if _check(_import_openai_client, 'oai') else 'not installed'}")
|
||||
print(f" API Key: {'set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else 'not set (VOICE_TOOLS_OPENAI_KEY)'}")
|
||||
print(f" ffmpeg: {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}")
|
||||
print(f"\n Output dir: {DEFAULT_OUTPUT_DIR}")
|
||||
|
||||
|
||||
783
tools/voice_mode.py
Normal file
783
tools/voice_mode.py
Normal file
@@ -0,0 +1,783 @@
|
||||
"""Voice Mode -- Push-to-talk audio recording and playback for the CLI.
|
||||
|
||||
Provides audio capture via sounddevice, WAV encoding via stdlib wave,
|
||||
STT dispatch via tools.transcription_tools, and TTS playback via
|
||||
sounddevice or system audio players.
|
||||
|
||||
Dependencies (optional):
|
||||
pip install sounddevice numpy
|
||||
or: pip install hermes-agent[voice]
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import wave
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lazy audio imports -- never imported at module level to avoid crashing
|
||||
# in headless environments (SSH, Docker, WSL, no PortAudio).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _import_audio():
|
||||
"""Lazy-import sounddevice and numpy. Returns (sd, np).
|
||||
|
||||
Raises ImportError or OSError if the libraries are not available
|
||||
(e.g. PortAudio missing on headless servers).
|
||||
"""
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
return sd, np
|
||||
|
||||
|
||||
def _audio_available() -> bool:
    """Return ``True`` when sounddevice/numpy import cleanly, else ``False``."""
    try:
        _import_audio()
    except (ImportError, OSError):
        return False
    return True
||||
|
||||
def detect_audio_environment() -> dict:
    """Probe whether the current environment supports audio I/O.

    Checks for SSH sessions, Docker containers, WSL, and a working
    sounddevice/PortAudio stack with at least one device.

    Returns:
        dict with ``available`` (bool, True only when no problems were
        found) and ``warnings`` (list of human-readable problem strings).
    """
    problems = []

    # Remote shells have no access to local audio hardware.
    if any(os.environ.get(name) for name in ('SSH_CLIENT', 'SSH_TTY', 'SSH_CONNECTION')):
        problems.append("Running over SSH -- no audio devices available")

    # Containers normally run without audio devices.
    if os.path.exists('/.dockerenv'):
        problems.append("Running inside Docker container -- no audio devices")

    # WSL exposes a Microsoft-branded kernel version string.
    try:
        with open('/proc/version', 'r') as fh:
            kernel_info = fh.read().lower()
        if 'microsoft' in kernel_info:
            problems.append("Running in WSL -- audio requires PulseAudio bridge to Windows")
    except (FileNotFoundError, PermissionError, OSError):
        pass

    # Finally, verify the audio stack itself can enumerate devices.
    try:
        sd, _ = _import_audio()
        try:
            if not sd.query_devices():
                problems.append("No audio input/output devices detected")
        except Exception:
            problems.append("Audio subsystem error (PortAudio cannot query devices)")
    except (ImportError, OSError):
        problems.append("Audio libraries not installed (pip install sounddevice numpy)")

    return {
        "available": not problems,
        "warnings": problems,
    }
|
||||
# ---------------------------------------------------------------------------
# Recording parameters
# ---------------------------------------------------------------------------
SAMPLE_RATE = 16000  # Whisper native rate
CHANNELS = 1  # Mono
DTYPE = "int16"  # 16-bit PCM
SAMPLE_WIDTH = 2  # bytes per sample (int16)
# NOTE(review): MAX_RECORDING_SECONDS is not referenced by code visible in
# this module — presumably enforced by a caller; confirm before removing.
MAX_RECORDING_SECONDS = 120  # Safety cap

# Silence detection defaults
SILENCE_RMS_THRESHOLD = 200  # RMS below this = silence (int16 range 0-32767)
SILENCE_DURATION_SECONDS = 3.0  # Seconds of continuous silence before auto-stop

# Temp directory for voice recordings (cleaned by cleanup_temp_recordings)
_TEMP_DIR = os.path.join(tempfile.gettempdir(), "hermes_voice")
||||
# ============================================================================
# Audio cues (beep tones)
# ============================================================================
def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> None:
    """Play a short beep tone using numpy + sounddevice.

    Best-effort: beeps are cosmetic cues, so every failure path returns
    silently (or logs at debug) instead of raising.

    Args:
        frequency: Tone frequency in Hz (default 880 = A5).
        duration: Duration of each beep in seconds.
        count: Number of beeps to play (with short gap between).
    """
    try:
        sd, np = _import_audio()
    except (ImportError, OSError):
        return
    try:
        gap = 0.06  # seconds between beeps
        samples_per_beep = int(SAMPLE_RATE * duration)
        samples_per_gap = int(SAMPLE_RATE * gap)
        if count <= 0 or samples_per_beep <= 0:
            return  # nothing audible to synthesize

        parts = []
        for i in range(count):
            t = np.linspace(0, duration, samples_per_beep, endpoint=False)
            tone = np.sin(2 * np.pi * frequency * t)
            # Apply fade in/out to avoid click artifacts.
            # Guard fade_len > 0: for very short durations samples_per_beep // 4
            # is 0 and tone[-0:] would address the WHOLE array, raising a
            # broadcast error against the empty linspace ramp.
            fade_len = min(int(SAMPLE_RATE * 0.01), samples_per_beep // 4)
            if fade_len > 0:
                tone[:fade_len] *= np.linspace(0, 1, fade_len)
                tone[-fade_len:] *= np.linspace(1, 0, fade_len)
            parts.append((tone * 0.3 * 32767).astype(np.int16))
            if i < count - 1:
                parts.append(np.zeros(samples_per_gap, dtype=np.int16))

        audio = np.concatenate(parts)
        sd.play(audio, samplerate=SAMPLE_RATE)
        # sd.wait() calls Event.wait() without timeout — hangs forever if the
        # audio device stalls. Poll with a 2s ceiling and force-stop.
        deadline = time.monotonic() + 2.0
        while sd.get_stream() and sd.get_stream().active and time.monotonic() < deadline:
            time.sleep(0.01)
        sd.stop()
    except Exception as e:
        logger.debug("Beep playback failed: %s", e)
||||
|
||||
# ============================================================================
# AudioRecorder
# ============================================================================
class AudioRecorder:
    """Thread-safe audio recorder using sounddevice.InputStream.

    Usage::

        recorder = AudioRecorder()
        recorder.start(on_silence_stop=my_callback)
        # ... user speaks ...
        wav_path = recorder.stop()  # returns path to WAV file
        # or
        recorder.cancel()  # discard without saving

    If ``on_silence_stop`` is provided, recording automatically stops when
    the user is silent for ``silence_duration`` seconds and calls the callback.

    Threading model: the sounddevice callback runs on a PortAudio thread and
    updates the detection state; public methods run on the caller's thread and
    guard state transitions with ``self._lock``.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._stream: Any = None  # persistent InputStream (created lazily)
        self._frames: List[Any] = []  # captured int16 numpy chunks
        self._recording = False
        self._start_time: float = 0.0
        # Silence detection state
        self._has_spoken = False
        self._speech_start: float = 0.0  # When speech attempt began
        self._dip_start: float = 0.0  # When current below-threshold dip began
        self._min_speech_duration: float = 0.3  # Seconds of speech needed to confirm
        self._max_dip_tolerance: float = 0.3  # Max dip duration before resetting speech
        self._silence_start: float = 0.0
        self._resume_start: float = 0.0  # Tracks sustained speech after silence starts
        self._resume_dip_start: float = 0.0  # Dip tolerance tracker for resume detection
        self._on_silence_stop = None
        self._silence_threshold: int = SILENCE_RMS_THRESHOLD
        self._silence_duration: float = SILENCE_DURATION_SECONDS
        self._max_wait: float = 15.0  # Max seconds to wait for speech before auto-stop
        # Peak RMS seen during recording (for speech presence check in stop())
        self._peak_rms: int = 0
        # Live audio level (read by UI for visual feedback)
        self._current_rms: int = 0

    # -- public properties ---------------------------------------------------

    @property
    def is_recording(self) -> bool:
        """True while frames are being collected."""
        return self._recording

    @property
    def elapsed_seconds(self) -> float:
        """Seconds since the current recording started (0.0 when idle)."""
        if not self._recording:
            return 0.0
        return time.monotonic() - self._start_time

    @property
    def current_rms(self) -> int:
        """Current audio input RMS level (0-32767). Updated each audio chunk."""
        return self._current_rms

    # -- public methods ------------------------------------------------------

    def _ensure_stream(self) -> None:
        """Create the audio InputStream once and keep it alive.

        The stream stays open for the lifetime of the recorder. Between
        recordings the callback simply discards audio chunks (``_recording``
        is ``False``). This avoids the CoreAudio bug where closing and
        re-opening an ``InputStream`` hangs indefinitely on macOS.

        Raises:
            RuntimeError: if the input stream cannot be opened/started.
        """
        if self._stream is not None:
            return  # already alive

        sd, np = _import_audio()

        def _callback(indata, frames, time_info, status):  # noqa: ARG001
            # Runs on the PortAudio thread for every captured chunk.
            if status:
                logger.debug("sounddevice status: %s", status)
            # When not recording the stream is idle — discard audio.
            if not self._recording:
                return
            self._frames.append(indata.copy())

            # Compute RMS for level display and silence detection
            rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
            self._current_rms = rms
            if rms > self._peak_rms:
                self._peak_rms = rms

            # Silence detection (only when an auto-stop callback is armed)
            if self._on_silence_stop is not None:
                now = time.monotonic()
                elapsed = now - self._start_time

                if rms > self._silence_threshold:
                    # Audio is above threshold -- this is speech (or noise).
                    self._dip_start = 0.0  # Reset dip tracker
                    if self._speech_start == 0.0:
                        self._speech_start = now
                    elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration:
                        self._has_spoken = True
                        logger.debug("Speech confirmed (%.2fs above threshold)",
                                     now - self._speech_start)
                    # After speech is confirmed, only reset silence timer if
                    # speech is sustained (>0.3s above threshold). Brief
                    # spikes from ambient noise should NOT reset the timer.
                    if not self._has_spoken:
                        self._silence_start = 0.0
                    else:
                        # Track resumed speech with dip tolerance.
                        # Brief dips below threshold are normal during speech,
                        # so we mirror the initial speech detection pattern:
                        # start tracking, tolerate short dips, confirm after 0.3s.
                        self._resume_dip_start = 0.0  # Above threshold — no dip
                        if self._resume_start == 0.0:
                            self._resume_start = now
                        elif now - self._resume_start >= self._min_speech_duration:
                            self._silence_start = 0.0
                            self._resume_start = 0.0
                elif self._has_spoken:
                    # Below threshold after speech confirmed.
                    # Use dip tolerance before resetting resume tracker —
                    # natural speech has brief dips below threshold.
                    if self._resume_start > 0:
                        if self._resume_dip_start == 0.0:
                            self._resume_dip_start = now
                        elif now - self._resume_dip_start >= self._max_dip_tolerance:
                            # Sustained dip — user actually stopped speaking
                            self._resume_start = 0.0
                            self._resume_dip_start = 0.0
                elif self._speech_start > 0:
                    # We were in a speech attempt but RMS dipped.
                    # Tolerate brief dips (micro-pauses between syllables).
                    if self._dip_start == 0.0:
                        self._dip_start = now
                    elif now - self._dip_start >= self._max_dip_tolerance:
                        # Dip lasted too long -- genuine silence, reset
                        logger.debug("Speech attempt reset (dip lasted %.2fs)",
                                     now - self._dip_start)
                        self._speech_start = 0.0
                        self._dip_start = 0.0

                # Fire silence callback when:
                # 1. User spoke then went silent for silence_duration, OR
                # 2. No speech detected at all for max_wait seconds
                should_fire = False
                if self._has_spoken and rms <= self._silence_threshold:
                    # User was speaking and now is silent
                    if self._silence_start == 0.0:
                        self._silence_start = now
                    elif now - self._silence_start >= self._silence_duration:
                        logger.info("Silence detected (%.1fs), auto-stopping",
                                    self._silence_duration)
                        should_fire = True
                elif not self._has_spoken and elapsed >= self._max_wait:
                    logger.info("No speech within %.0fs, auto-stopping",
                                self._max_wait)
                    should_fire = True

                if should_fire:
                    # Swap the callback out under the lock so it fires at most
                    # once even if several chunks race past the deadline.
                    with self._lock:
                        cb = self._on_silence_stop
                        self._on_silence_stop = None  # fire only once
                    if cb:
                        # Run user code on a separate daemon thread — never
                        # block (or crash) the PortAudio callback thread.
                        def _safe_cb():
                            try:
                                cb()
                            except Exception as e:
                                logger.error("Silence callback failed: %s", e, exc_info=True)
                        threading.Thread(target=_safe_cb, daemon=True).start()

        # Create stream — may block on CoreAudio (first call only).
        stream = None
        try:
            stream = sd.InputStream(
                samplerate=SAMPLE_RATE,
                channels=CHANNELS,
                dtype=DTYPE,
                callback=_callback,
            )
            stream.start()
        except Exception as e:
            if stream is not None:
                try:
                    stream.close()
                except Exception:
                    pass
            raise RuntimeError(
                f"Failed to open audio input stream: {e}. "
                "Check that a microphone is connected and accessible."
            ) from e
        self._stream = stream

    def start(self, on_silence_stop=None) -> None:
        """Start capturing audio from the default input device.

        The underlying InputStream is created once and kept alive across
        recordings. Subsequent calls simply reset detection state and
        toggle frame collection via ``_recording``.

        Args:
            on_silence_stop: Optional callback invoked (in a daemon thread) when
                silence is detected after speech. The callback receives no arguments.
                Use this to auto-stop recording and trigger transcription.

        Raises:
            RuntimeError: if sounddevice/numpy are not installed, or if the
                input stream cannot be opened (via ``_ensure_stream``).
        """
        try:
            _import_audio()
        except (ImportError, OSError) as e:
            raise RuntimeError(
                "Voice mode requires sounddevice and numpy.\n"
                "Install with: pip install sounddevice numpy\n"
                "Or: pip install hermes-agent[voice]"
            ) from e

        with self._lock:
            if self._recording:
                return  # already recording

            # Reset capture buffers and the whole detection state machine.
            self._frames = []
            self._start_time = time.monotonic()
            self._has_spoken = False
            self._speech_start = 0.0
            self._dip_start = 0.0
            self._silence_start = 0.0
            self._resume_start = 0.0
            self._resume_dip_start = 0.0
            self._peak_rms = 0
            self._current_rms = 0
            self._on_silence_stop = on_silence_stop

        # Ensure the persistent stream is alive (no-op after first call).
        # Done OUTSIDE the lock — stream creation can block on CoreAudio.
        self._ensure_stream()

        with self._lock:
            self._recording = True
        logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS)

    def _close_stream_with_timeout(self, timeout: float = 3.0) -> None:
        """Close the audio stream with a timeout to prevent CoreAudio hangs."""
        if self._stream is None:
            return

        stream = self._stream
        self._stream = None

        def _do_close():
            try:
                stream.stop()
                stream.close()
            except Exception:
                pass

        t = threading.Thread(target=_do_close, daemon=True)
        t.start()
        # Poll in short intervals so Ctrl+C is not blocked
        # NOTE(review): __import__("time") re-imports a module that is already
        # imported at the top of this file — behaviorally identical to
        # time.monotonic(), just harder to read.
        deadline = __import__("time").monotonic() + timeout
        while t.is_alive() and __import__("time").monotonic() < deadline:
            t.join(timeout=0.1)
        if t.is_alive():
            logger.warning("Audio stream close timed out after %.1fs — forcing ahead", timeout)

    def stop(self) -> Optional[str]:
        """Stop recording and write captured audio to a WAV file.

        The underlying stream is kept alive for reuse — only frame
        collection is stopped. Recordings shorter than 0.3s or whose peak
        RMS never exceeded ``SILENCE_RMS_THRESHOLD`` are discarded.

        Returns:
            Path to the WAV file, or ``None`` if no (usable) audio was captured.
        """
        with self._lock:
            if not self._recording:
                return None

            self._recording = False
            self._current_rms = 0
            # Stream stays alive — no close needed.

        if not self._frames:
            return None

        # Concatenate frames and write WAV
        _, np = _import_audio()
        audio_data = np.concatenate(self._frames, axis=0)
        self._frames = []

        elapsed = time.monotonic() - self._start_time
        logger.info("Voice recording stopped (%.1fs, %d samples)", elapsed, len(audio_data))

        # Skip very short recordings (< 0.3s of audio)
        min_samples = int(SAMPLE_RATE * 0.3)
        if len(audio_data) < min_samples:
            logger.debug("Recording too short (%d samples), discarding", len(audio_data))
            return None

        # Skip silent recordings using peak RMS (not overall average, which
        # gets diluted by silence at the end of the recording).
        if self._peak_rms < SILENCE_RMS_THRESHOLD:
            logger.info("Recording too quiet (peak RMS=%d < %d), discarding",
                        self._peak_rms, SILENCE_RMS_THRESHOLD)
            return None

        return self._write_wav(audio_data)

    def cancel(self) -> None:
        """Stop recording and discard all captured audio.

        The underlying stream is kept alive for reuse.
        """
        with self._lock:
            self._recording = False
            self._frames = []
            self._on_silence_stop = None
            self._current_rms = 0
        logger.info("Voice recording cancelled")

    def shutdown(self) -> None:
        """Release the audio stream. Call when voice mode is disabled."""
        with self._lock:
            self._recording = False
            self._frames = []
            self._on_silence_stop = None
        # Close stream OUTSIDE the lock to avoid deadlock with audio callback
        self._close_stream_with_timeout()
        logger.info("AudioRecorder shut down")

    # -- private helpers -----------------------------------------------------

    @staticmethod
    def _write_wav(audio_data) -> str:
        """Write numpy int16 audio data to a WAV file under ``_TEMP_DIR``.

        Returns:
            The file path of the written WAV (mono, 16 kHz, 16-bit PCM).
        """
        os.makedirs(_TEMP_DIR, exist_ok=True)
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        wav_path = os.path.join(_TEMP_DIR, f"recording_{timestamp}.wav")

        with wave.open(wav_path, "wb") as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(SAMPLE_WIDTH)
            wf.setframerate(SAMPLE_RATE)
            wf.writeframes(audio_data.tobytes())

        file_size = os.path.getsize(wav_path)
        logger.info("WAV written: %s (%d bytes)", wav_path, file_size)
        return wav_path
||||
|
||||
|
||||
# ============================================================================
# Whisper hallucination filter
# ============================================================================
# Whisper commonly hallucinates these phrases on silent/near-silent audio.
WHISPER_HALLUCINATIONS = {
    "thank you.",
    "thank you",
    "thanks for watching.",
    "thanks for watching",
    "subscribe to my channel.",
    "subscribe to my channel",
    "like and subscribe.",
    "like and subscribe",
    "please subscribe.",
    "please subscribe",
    "thank you for watching.",
    "thank you for watching",
    "bye.",
    "bye",
    "you",
    "the end.",
    "the end",
    # Non-English hallucinations (common on silence)
    "продолжение следует",
    "продолжение следует...",
    "sous-titres",
    "sous-titres réalisés par la communauté d'amara.org",
    "sottotitoli creati dalla comunità amara.org",
    "untertitel von stephanie geiges",
    "amara.org",
    "www.mooji.org",
    "ご視聴ありがとうございました",
}

# Regex patterns for repetitive hallucinations (e.g. "Thank you. Thank you. Thank you.")
_HALLUCINATION_REPEAT_RE = re.compile(
    r'^(?:thank you|thanks|bye|you|ok|okay|the end|\.|\s|,|!)+$',
    flags=re.IGNORECASE,
)


def is_whisper_hallucination(transcript: str) -> bool:
    """Return True when *transcript* looks like a Whisper silence hallucination.

    A transcript counts as a hallucination when it is empty, matches a known
    phrase (with or without trailing '.'/'!'), or consists solely of repeated
    filler tokens ("thank you", "bye", punctuation, ...).
    """
    text = transcript.strip().lower()
    if not text:
        return True
    # Known phrase, tolerating trailing '.' / '!' punctuation.
    if text in WHISPER_HALLUCINATIONS or text.rstrip('.!') in WHISPER_HALLUCINATIONS:
        return True
    # Repetitive filler (e.g. "Thank you. Thank you. Thank you. you").
    return bool(_HALLUCINATION_REPEAT_RE.match(text))
||||
|
||||
|
||||
# ============================================================================
# STT dispatch
# ============================================================================
def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str, Any]:
    """Transcribe a WAV recording using the existing Whisper pipeline.

    Thin wrapper around ``tools.transcription_tools.transcribe_audio`` that
    additionally drops known Whisper hallucinations produced on silence.

    Args:
        wav_path: Path to the WAV file.
        model: Whisper model name (default: from config or ``whisper-1``).

    Returns:
        Dict with ``success``, ``transcript``, and optionally ``error``.
        When a hallucination is filtered, ``transcript`` is empty and
        ``filtered`` is set to True.
    """
    from tools.transcription_tools import transcribe_audio

    result = transcribe_audio(wav_path, model=model)

    if not result.get("success"):
        return result

    # Filter out Whisper hallucinations (common on silent/near-silent audio)
    if is_whisper_hallucination(result.get("transcript", "")):
        logger.info("Filtered Whisper hallucination: %r", result["transcript"])
        return {"success": True, "transcript": "", "filtered": True}

    return result
|
||||
|
||||
|
||||
# ============================================================================
# Audio playback (interruptable)
# ============================================================================

# Global reference to the active playback process so it can be interrupted.
_active_playback: Optional[subprocess.Popen] = None
_playback_lock = threading.Lock()


def stop_playback() -> None:
    """Interrupt the currently playing audio (if any).

    Terminates the registered system-player subprocess (when one is alive)
    and stops any in-progress sounddevice playback. Safe to call at any
    time, including when nothing is playing.
    """
    global _active_playback
    with _playback_lock:
        proc, _active_playback = _active_playback, None
    if proc is not None and proc.poll() is None:
        try:
            proc.terminate()
            logger.info("Audio playback interrupted")
        except Exception:
            pass
    # Also stop sounddevice playback if active; best-effort when the
    # audio stack is unavailable.
    try:
        sd, _ = _import_audio()
        sd.stop()
    except Exception:
        pass
||||
|
||||
|
||||
def play_audio_file(file_path: str) -> bool:
    """Play an audio file through the default output device.

    Strategy:
      1. WAV files via ``sounddevice.play()`` when available.
      2. System commands: ``afplay`` (macOS), ``ffplay`` (cross-platform),
         ``aplay`` (Linux ALSA).

    Playback can be interrupted by calling ``stop_playback()``.

    Returns:
        ``True`` if playback succeeded, ``False`` otherwise.
    """
    global _active_playback

    if not os.path.isfile(file_path):
        logger.warning("Audio file not found: %s", file_path)
        return False

    # Preferred path: decode the WAV ourselves and play via sounddevice.
    if file_path.endswith(".wav"):
        try:
            sd, np = _import_audio()
            with wave.open(file_path, "rb") as wf:
                raw = wf.readframes(wf.getnframes())
                pcm = np.frombuffer(raw, dtype=np.int16)
                rate = wf.getframerate()

            sd.play(pcm, samplerate=rate)
            # sd.wait() calls Event.wait() without timeout — hangs forever if
            # the audio device stalls. Poll with a ceiling derived from the
            # clip length and then force-stop.
            deadline = time.monotonic() + (len(pcm) / rate) + 2.0
            while sd.get_stream() and sd.get_stream().active and time.monotonic() < deadline:
                time.sleep(0.01)
            sd.stop()
            return True
        except (ImportError, OSError):
            pass  # audio libs not available, fall through to system players
        except Exception as e:
            logger.debug("sounddevice playback failed: %s", e)

    # Fall back to system audio players (Popen so stop_playback can kill them).
    system = platform.system()
    candidates = []
    if system == "Darwin":
        candidates.append(["afplay", file_path])
    candidates.append(["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet", file_path])
    if system == "Linux":
        candidates.append(["aplay", "-q", file_path])

    for cmd in candidates:
        if not shutil.which(cmd[0]):
            continue
        try:
            proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            with _playback_lock:
                _active_playback = proc
            proc.wait(timeout=300)
            with _playback_lock:
                _active_playback = None
            return True
        except subprocess.TimeoutExpired:
            logger.warning("System player %s timed out, killing process", cmd[0])
            proc.kill()
            proc.wait()
            with _playback_lock:
                _active_playback = None
        except Exception as e:
            logger.debug("System player %s failed: %s", cmd[0], e)
            with _playback_lock:
                _active_playback = None

    logger.warning("No audio player available for %s", file_path)
    return False
|
||||
|
||||
|
||||
# ============================================================================
# Requirements check
# ============================================================================
def check_voice_requirements() -> Dict[str, Any]:
    """Check if all voice mode requirements are met.

    Returns:
        Dict with ``available``, ``audio_available``, ``stt_available``,
        ``missing_packages``, ``details`` (human-readable summary), and
        ``environment`` (raw result of :func:`detect_audio_environment`).
    """
    # Determine STT provider availability from the shared transcription config.
    from tools.transcription_tools import _get_provider, _load_stt_config, _HAS_FASTER_WHISPER

    stt_provider = _get_provider(_load_stt_config())
    stt_available = stt_provider != "none"

    has_audio = _audio_available()
    missing: List[str] = [] if has_audio else ["sounddevice", "numpy"]

    # Environment detection (SSH / Docker / WSL / device enumeration)
    env_check = detect_audio_environment()

    details_parts = [
        "Audio capture: OK" if has_audio
        else "Audio capture: MISSING (pip install sounddevice numpy)"
    ]

    provider_messages = {
        "local": "STT provider: OK (local faster-whisper)",
        "groq": "STT provider: OK (Groq)",
        "openai": "STT provider: OK (OpenAI)",
    }
    details_parts.append(provider_messages.get(
        stt_provider,
        "STT provider: MISSING (pip install faster-whisper, "
        "or set GROQ_API_KEY / VOICE_TOOLS_OPENAI_KEY)",
    ))

    details_parts.extend(f"Environment: {w}" for w in env_check["warnings"])

    return {
        "available": has_audio and stt_available and env_check["available"],
        "audio_available": has_audio,
        "stt_available": stt_available,
        "missing_packages": missing,
        "details": "\n".join(details_parts),
        "environment": env_check,
    }
|
||||
|
||||
|
||||
# ============================================================================
# Temp file cleanup
# ============================================================================
def cleanup_temp_recordings(max_age_seconds: int = 3600) -> int:
    """Remove old temporary voice recording files.

    Only files named ``recording_*.wav`` inside the module's private temp
    directory are considered, so unrelated files are never touched.

    Args:
        max_age_seconds: Delete files older than this (default: 1 hour).

    Returns:
        Number of files deleted.
    """
    if not os.path.isdir(_TEMP_DIR):
        return 0

    deleted = 0
    now = time.time()

    # os.scandir returns an iterator holding an OS directory handle; using it
    # as a context manager closes the handle promptly instead of leaking it
    # until garbage collection.
    with os.scandir(_TEMP_DIR) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.startswith("recording_") and entry.name.endswith(".wav"):
                try:
                    age = now - entry.stat().st_mtime
                    if age > max_age_seconds:
                        os.unlink(entry.path)
                        deleted += 1
                except OSError:
                    # File vanished or is inaccessible — best-effort cleanup.
                    pass

    if deleted:
        logger.debug("Cleaned up %d old voice recordings", deleted)
    return deleted
|
||||
Reference in New Issue
Block a user