Merge pull request #1299 from NousResearch/hermes/hermes-f5fb1d3b
fix: salvage PR #327 voice mode onto current main
This commit is contained in:
@@ -224,24 +224,14 @@ def _emergency_cleanup_all_sessions():
|
||||
logger.error("Emergency cleanup error: %s", e)
|
||||
|
||||
|
||||
def _signal_handler(signum, frame):
|
||||
"""Handle interrupt signals to cleanup sessions before exit."""
|
||||
logger.warning("Received signal %s, cleaning up...", signum)
|
||||
_emergency_cleanup_all_sessions()
|
||||
sys.exit(128 + signum)
|
||||
|
||||
|
||||
# Register cleanup handlers
|
||||
# Register cleanup via atexit only. Previous versions installed SIGINT/SIGTERM
|
||||
# handlers that called sys.exit(), but this conflicts with prompt_toolkit's
|
||||
# async event loop — a SystemExit raised inside a key-binding callback
|
||||
# corrupts the coroutine state and makes the process unkillable. atexit
|
||||
# handlers run on any normal exit (including sys.exit), so browser sessions
|
||||
# are still cleaned up without hijacking signals.
|
||||
atexit.register(_emergency_cleanup_all_sessions)
|
||||
|
||||
# Only register signal handlers in main process (not in multiprocessing workers)
|
||||
try:
|
||||
if os.getpid() == os.getpgrp(): # Main process check
|
||||
signal.signal(signal.SIGINT, _signal_handler)
|
||||
signal.signal(signal.SIGTERM, _signal_handler)
|
||||
except (OSError, AttributeError):
|
||||
pass # Signal handling not available (e.g., Windows or worker process)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Inactivity Cleanup Functions
|
||||
|
||||
@@ -2,11 +2,12 @@
|
||||
"""
|
||||
Transcription Tools Module
|
||||
|
||||
Provides speech-to-text transcription with two providers:
|
||||
Provides speech-to-text transcription with three providers:
|
||||
|
||||
- **local** (default, free) — faster-whisper running locally, no API key needed.
|
||||
Auto-downloads the model (~150 MB for ``base``) on first use.
|
||||
- **openai** — OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``.
|
||||
- **groq** (free tier) — Groq Whisper API, requires ``GROQ_API_KEY``.
|
||||
- **openai** (paid) — OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``.
|
||||
|
||||
Used by the messaging gateway to automatically transcribe voice messages
|
||||
sent by users on Telegram, Discord, WhatsApp, Slack, and Signal.
|
||||
@@ -33,18 +34,9 @@ logger = logging.getLogger(__name__)
|
||||
# Optional imports — graceful degradation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
try:
|
||||
from faster_whisper import WhisperModel
|
||||
_HAS_FASTER_WHISPER = True
|
||||
except ImportError:
|
||||
_HAS_FASTER_WHISPER = False
|
||||
WhisperModel = None # type: ignore[assignment,misc]
|
||||
|
||||
try:
|
||||
from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
|
||||
_HAS_OPENAI = True
|
||||
except ImportError:
|
||||
_HAS_OPENAI = False
|
||||
import importlib.util as _ilu
|
||||
_HAS_FASTER_WHISPER = _ilu.find_spec("faster_whisper") is not None
|
||||
_HAS_OPENAI = _ilu.find_spec("openai") is not None
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
@@ -52,13 +44,21 @@ except ImportError:
|
||||
|
||||
DEFAULT_PROVIDER = "local"
|
||||
DEFAULT_LOCAL_MODEL = "base"
|
||||
DEFAULT_OPENAI_MODEL = "whisper-1"
|
||||
DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1")
|
||||
DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo")
|
||||
|
||||
GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
|
||||
OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
|
||||
|
||||
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"}
|
||||
MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB
|
||||
|
||||
# Known model sets for auto-correction
|
||||
OPENAI_MODELS = {"whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"}
|
||||
GROQ_MODELS = {"whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"}
|
||||
|
||||
# Singleton for the local model — loaded once, reused across calls
|
||||
_local_model: Optional["WhisperModel"] = None
|
||||
_local_model: Optional[object] = None
|
||||
_local_model_name: Optional[str] = None
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -66,6 +66,24 @@ _local_model_name: Optional[str] = None
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_stt_model_from_config() -> Optional[str]:
|
||||
"""Read the STT model name from ~/.hermes/config.yaml.
|
||||
|
||||
Returns the value of ``stt.model`` if present, otherwise ``None``.
|
||||
Silently returns ``None`` on any error (missing file, bad YAML, etc.).
|
||||
"""
|
||||
try:
|
||||
import yaml
|
||||
cfg_path = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "config.yaml"
|
||||
if cfg_path.exists():
|
||||
with open(cfg_path) as f:
|
||||
data = yaml.safe_load(f) or {}
|
||||
return data.get("stt", {}).get("model")
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _load_stt_config() -> dict:
|
||||
"""Load the ``stt`` section from user config, falling back to defaults."""
|
||||
try:
|
||||
@@ -80,7 +98,7 @@ def _get_provider(stt_config: dict) -> str:
|
||||
|
||||
Priority:
|
||||
1. Explicit config value (``stt.provider``)
|
||||
2. Auto-detect: local if faster-whisper available, else openai if key set
|
||||
2. Auto-detect: local > groq (free) > openai (paid)
|
||||
3. Disabled (returns "none")
|
||||
"""
|
||||
provider = stt_config.get("provider", DEFAULT_PROVIDER)
|
||||
@@ -88,19 +106,37 @@ def _get_provider(stt_config: dict) -> str:
|
||||
if provider == "local":
|
||||
if _HAS_FASTER_WHISPER:
|
||||
return "local"
|
||||
# Local requested but not available — fall back to openai if possible
|
||||
# Local requested but not available — fall back to groq, then openai
|
||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||
logger.info("faster-whisper not installed, falling back to Groq Whisper API")
|
||||
return "groq"
|
||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
logger.info("faster-whisper not installed, falling back to OpenAI Whisper API")
|
||||
return "openai"
|
||||
return "none"
|
||||
|
||||
if provider == "groq":
|
||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||
return "groq"
|
||||
# Groq requested but no key — fall back
|
||||
if _HAS_FASTER_WHISPER:
|
||||
logger.info("GROQ_API_KEY not set, falling back to local faster-whisper")
|
||||
return "local"
|
||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
logger.info("GROQ_API_KEY not set, falling back to OpenAI Whisper API")
|
||||
return "openai"
|
||||
return "none"
|
||||
|
||||
if provider == "openai":
|
||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
return "openai"
|
||||
# OpenAI requested but no key — fall back to local if possible
|
||||
# OpenAI requested but no key — fall back
|
||||
if _HAS_FASTER_WHISPER:
|
||||
logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to local faster-whisper")
|
||||
return "local"
|
||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||
logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to Groq Whisper API")
|
||||
return "groq"
|
||||
return "none"
|
||||
|
||||
return provider # Unknown — let it fail downstream
|
||||
@@ -150,6 +186,7 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
return {"success": False, "transcript": "", "error": "faster-whisper not installed"}
|
||||
|
||||
try:
|
||||
from faster_whisper import WhisperModel
|
||||
# Lazy-load the model (downloads on first use, ~150 MB for 'base')
|
||||
if _local_model is None or _local_model_name != model_name:
|
||||
logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name)
|
||||
@@ -164,12 +201,60 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
Path(file_path).name, model_name, info.language, info.duration,
|
||||
)
|
||||
|
||||
return {"success": True, "transcript": transcript}
|
||||
return {"success": True, "transcript": transcript, "provider": "local"}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Local transcription failed: %s", e, exc_info=True)
|
||||
return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider: groq (Whisper API — free tier)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _transcribe_groq(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
"""Transcribe using Groq Whisper API (free tier available)."""
|
||||
api_key = os.getenv("GROQ_API_KEY")
|
||||
if not api_key:
|
||||
return {"success": False, "transcript": "", "error": "GROQ_API_KEY not set"}
|
||||
|
||||
if not _HAS_OPENAI:
|
||||
return {"success": False, "transcript": "", "error": "openai package not installed"}
|
||||
|
||||
# Auto-correct model if caller passed an OpenAI-only model
|
||||
if model_name in OPENAI_MODELS:
|
||||
logger.info("Model %s not available on Groq, using %s", model_name, DEFAULT_GROQ_STT_MODEL)
|
||||
model_name = DEFAULT_GROQ_STT_MODEL
|
||||
|
||||
try:
|
||||
from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
|
||||
client = OpenAI(api_key=api_key, base_url=GROQ_BASE_URL, timeout=30, max_retries=0)
|
||||
|
||||
with open(file_path, "rb") as audio_file:
|
||||
transcription = client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=audio_file,
|
||||
response_format="text",
|
||||
)
|
||||
|
||||
transcript_text = str(transcription).strip()
|
||||
logger.info("Transcribed %s via Groq API (%s, %d chars)",
|
||||
Path(file_path).name, model_name, len(transcript_text))
|
||||
|
||||
return {"success": True, "transcript": transcript_text, "provider": "groq"}
|
||||
|
||||
except PermissionError:
|
||||
return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
|
||||
except APIConnectionError as e:
|
||||
return {"success": False, "transcript": "", "error": f"Connection error: {e}"}
|
||||
except APITimeoutError as e:
|
||||
return {"success": False, "transcript": "", "error": f"Request timeout: {e}"}
|
||||
except APIError as e:
|
||||
return {"success": False, "transcript": "", "error": f"API error: {e}"}
|
||||
except Exception as e:
|
||||
logger.error("Groq transcription failed: %s", e, exc_info=True)
|
||||
return {"success": False, "transcript": "", "error": f"Transcription failed: {e}"}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider: openai (Whisper API)
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -184,8 +269,14 @@ def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
if not _HAS_OPENAI:
|
||||
return {"success": False, "transcript": "", "error": "openai package not installed"}
|
||||
|
||||
# Auto-correct model if caller passed a Groq-only model
|
||||
if model_name in GROQ_MODELS:
|
||||
logger.info("Model %s not available on OpenAI, using %s", model_name, DEFAULT_STT_MODEL)
|
||||
model_name = DEFAULT_STT_MODEL
|
||||
|
||||
try:
|
||||
client = OpenAI(api_key=api_key, base_url="https://api.openai.com/v1")
|
||||
from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
|
||||
client = OpenAI(api_key=api_key, base_url=OPENAI_BASE_URL, timeout=30, max_retries=0)
|
||||
|
||||
with open(file_path, "rb") as audio_file:
|
||||
transcription = client.audio.transcriptions.create(
|
||||
@@ -198,7 +289,7 @@ def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
logger.info("Transcribed %s via OpenAI API (%s, %d chars)",
|
||||
Path(file_path).name, model_name, len(transcript_text))
|
||||
|
||||
return {"success": True, "transcript": transcript_text}
|
||||
return {"success": True, "transcript": transcript_text, "provider": "openai"}
|
||||
|
||||
except PermissionError:
|
||||
return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
|
||||
@@ -223,7 +314,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
||||
|
||||
Provider priority:
|
||||
1. User config (``stt.provider`` in config.yaml)
|
||||
2. Auto-detect: local faster-whisper if available, else OpenAI API
|
||||
2. Auto-detect: local faster-whisper (free) > Groq (free tier) > OpenAI (paid)
|
||||
|
||||
Args:
|
||||
file_path: Absolute path to the audio file to transcribe.
|
||||
@@ -234,6 +325,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
||||
- "success" (bool): Whether transcription succeeded
|
||||
- "transcript" (str): The transcribed text (empty on failure)
|
||||
- "error" (str, optional): Error message if success is False
|
||||
- "provider" (str, optional): Which provider was used
|
||||
"""
|
||||
# Validate input
|
||||
error = _validate_audio_file(file_path)
|
||||
@@ -249,9 +341,13 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
||||
model_name = model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
|
||||
return _transcribe_local(file_path, model_name)
|
||||
|
||||
if provider == "groq":
|
||||
model_name = model or DEFAULT_GROQ_STT_MODEL
|
||||
return _transcribe_groq(file_path, model_name)
|
||||
|
||||
if provider == "openai":
|
||||
openai_cfg = stt_config.get("openai", {})
|
||||
model_name = model or openai_cfg.get("model", DEFAULT_OPENAI_MODEL)
|
||||
model_name = model or openai_cfg.get("model", DEFAULT_STT_MODEL)
|
||||
return _transcribe_openai(file_path, model_name)
|
||||
|
||||
# No provider available
|
||||
@@ -260,6 +356,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
||||
"transcript": "",
|
||||
"error": (
|
||||
"No STT provider available. Install faster-whisper for free local "
|
||||
"transcription, or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API."
|
||||
"transcription, set GROQ_API_KEY for free Groq Whisper, "
|
||||
"or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API."
|
||||
),
|
||||
}
|
||||
|
||||
@@ -25,35 +25,41 @@ import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from typing import Callable, Dict, Any, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional imports -- providers degrade gracefully if not installed
|
||||
# Lazy imports -- providers are imported only when actually used to avoid
|
||||
# crashing in headless environments (SSH, Docker, WSL, no PortAudio).
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
|
||||
def _import_edge_tts():
|
||||
"""Lazy import edge_tts. Returns the module or raises ImportError."""
|
||||
import edge_tts
|
||||
_HAS_EDGE_TTS = True
|
||||
except ImportError:
|
||||
_HAS_EDGE_TTS = False
|
||||
return edge_tts
|
||||
|
||||
try:
|
||||
def _import_elevenlabs():
|
||||
"""Lazy import ElevenLabs client. Returns the class or raises ImportError."""
|
||||
from elevenlabs.client import ElevenLabs
|
||||
_HAS_ELEVENLABS = True
|
||||
except ImportError:
|
||||
_HAS_ELEVENLABS = False
|
||||
return ElevenLabs
|
||||
|
||||
# openai is a core dependency, but guard anyway
|
||||
try:
|
||||
def _import_openai_client():
|
||||
"""Lazy import OpenAI client. Returns the class or raises ImportError."""
|
||||
from openai import OpenAI as OpenAIClient
|
||||
_HAS_OPENAI = True
|
||||
except ImportError:
|
||||
_HAS_OPENAI = False
|
||||
return OpenAIClient
|
||||
|
||||
def _import_sounddevice():
|
||||
"""Lazy import sounddevice. Returns the module or raises ImportError/OSError."""
|
||||
import sounddevice as sd
|
||||
return sd
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
@@ -63,6 +69,7 @@ DEFAULT_PROVIDER = "edge"
|
||||
DEFAULT_EDGE_VOICE = "en-US-AriaNeural"
|
||||
DEFAULT_ELEVENLABS_VOICE_ID = "pNInz6obpgDQGcFmaJgB" # Adam
|
||||
DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
|
||||
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
|
||||
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
|
||||
DEFAULT_OPENAI_VOICE = "alloy"
|
||||
DEFAULT_OUTPUT_DIR = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache")
|
||||
@@ -154,10 +161,11 @@ async def _generate_edge_tts(text: str, output_path: str, tts_config: Dict[str,
|
||||
Returns:
|
||||
Path to the saved audio file.
|
||||
"""
|
||||
_edge_tts = _import_edge_tts()
|
||||
edge_config = tts_config.get("edge", {})
|
||||
voice = edge_config.get("voice", DEFAULT_EDGE_VOICE)
|
||||
|
||||
communicate = edge_tts.Communicate(text, voice)
|
||||
communicate = _edge_tts.Communicate(text, voice)
|
||||
await communicate.save(output_path)
|
||||
return output_path
|
||||
|
||||
@@ -191,6 +199,7 @@ def _generate_elevenlabs(text: str, output_path: str, tts_config: Dict[str, Any]
|
||||
else:
|
||||
output_format = "mp3_44100_128"
|
||||
|
||||
ElevenLabs = _import_elevenlabs()
|
||||
client = ElevenLabs(api_key=api_key)
|
||||
audio_generator = client.text_to_speech.convert(
|
||||
text=text,
|
||||
@@ -236,6 +245,7 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
|
||||
else:
|
||||
response_format = "mp3"
|
||||
|
||||
OpenAIClient = _import_openai_client()
|
||||
client = OpenAIClient(api_key=api_key, base_url="https://api.openai.com/v1")
|
||||
response = client.audio.speech.create(
|
||||
model=model,
|
||||
@@ -311,7 +321,9 @@ def text_to_speech_tool(
|
||||
try:
|
||||
# Generate audio with the configured provider
|
||||
if provider == "elevenlabs":
|
||||
if not _HAS_ELEVENLABS:
|
||||
try:
|
||||
_import_elevenlabs()
|
||||
except ImportError:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": "ElevenLabs provider selected but 'elevenlabs' package not installed. Run: pip install elevenlabs"
|
||||
@@ -320,7 +332,9 @@ def text_to_speech_tool(
|
||||
_generate_elevenlabs(text, file_str, tts_config)
|
||||
|
||||
elif provider == "openai":
|
||||
if not _HAS_OPENAI:
|
||||
try:
|
||||
_import_openai_client()
|
||||
except ImportError:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": "OpenAI provider selected but 'openai' package not installed."
|
||||
@@ -330,7 +344,9 @@ def text_to_speech_tool(
|
||||
|
||||
else:
|
||||
# Default: Edge TTS (free)
|
||||
if not _HAS_EDGE_TTS:
|
||||
try:
|
||||
_import_edge_tts()
|
||||
except ImportError:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": "Edge TTS not available. Run: pip install edge-tts"
|
||||
@@ -411,15 +427,262 @@ def check_tts_requirements() -> bool:
|
||||
Returns:
|
||||
bool: True if at least one provider can work.
|
||||
"""
|
||||
if _HAS_EDGE_TTS:
|
||||
return True
|
||||
if _HAS_ELEVENLABS and os.getenv("ELEVENLABS_API_KEY"):
|
||||
return True
|
||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
try:
|
||||
_import_edge_tts()
|
||||
return True
|
||||
except ImportError:
|
||||
pass
|
||||
try:
|
||||
_import_elevenlabs()
|
||||
if os.getenv("ELEVENLABS_API_KEY"):
|
||||
return True
|
||||
except ImportError:
|
||||
pass
|
||||
try:
|
||||
_import_openai_client()
|
||||
if os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
return True
|
||||
except ImportError:
|
||||
pass
|
||||
return False
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Streaming TTS: sentence-by-sentence pipeline for ElevenLabs
|
||||
# ===========================================================================
|
||||
# Sentence boundary pattern: punctuation followed by space or newline
|
||||
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])(?:\s|\n)|(?:\n\n)')
|
||||
|
||||
# Markdown stripping patterns (same as cli.py _voice_speak_response)
|
||||
_MD_CODE_BLOCK = re.compile(r'```[\s\S]*?```')
|
||||
_MD_LINK = re.compile(r'\[([^\]]+)\]\([^)]+\)')
|
||||
_MD_URL = re.compile(r'https?://\S+')
|
||||
_MD_BOLD = re.compile(r'\*\*(.+?)\*\*')
|
||||
_MD_ITALIC = re.compile(r'\*(.+?)\*')
|
||||
_MD_INLINE_CODE = re.compile(r'`(.+?)`')
|
||||
_MD_HEADER = re.compile(r'^#+\s*', flags=re.MULTILINE)
|
||||
_MD_LIST_ITEM = re.compile(r'^\s*[-*]\s+', flags=re.MULTILINE)
|
||||
_MD_HR = re.compile(r'---+')
|
||||
_MD_EXCESS_NL = re.compile(r'\n{3,}')
|
||||
|
||||
|
||||
def _strip_markdown_for_tts(text: str) -> str:
|
||||
"""Remove markdown formatting that shouldn't be spoken aloud."""
|
||||
text = _MD_CODE_BLOCK.sub(' ', text)
|
||||
text = _MD_LINK.sub(r'\1', text)
|
||||
text = _MD_URL.sub('', text)
|
||||
text = _MD_BOLD.sub(r'\1', text)
|
||||
text = _MD_ITALIC.sub(r'\1', text)
|
||||
text = _MD_INLINE_CODE.sub(r'\1', text)
|
||||
text = _MD_HEADER.sub('', text)
|
||||
text = _MD_LIST_ITEM.sub('', text)
|
||||
text = _MD_HR.sub('', text)
|
||||
text = _MD_EXCESS_NL.sub('\n\n', text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def stream_tts_to_speaker(
|
||||
text_queue: queue.Queue,
|
||||
stop_event: threading.Event,
|
||||
tts_done_event: threading.Event,
|
||||
display_callback: Optional[Callable[[str], None]] = None,
|
||||
):
|
||||
"""Consume text deltas from *text_queue*, buffer them into sentences,
|
||||
and stream each sentence through ElevenLabs TTS to the speaker in
|
||||
real-time.
|
||||
|
||||
Protocol:
|
||||
* The producer puts ``str`` deltas onto *text_queue*.
|
||||
* A ``None`` sentinel signals end-of-text (flush remaining buffer).
|
||||
* *stop_event* can be set to abort early (e.g. user interrupt).
|
||||
* *tts_done_event* is **set** in the ``finally`` block so callers
|
||||
waiting on it (continuous voice mode) know playback is finished.
|
||||
"""
|
||||
tts_done_event.clear()
|
||||
|
||||
try:
|
||||
# --- TTS client setup (optional -- display_callback works without it) ---
|
||||
client = None
|
||||
output_stream = None
|
||||
voice_id = DEFAULT_ELEVENLABS_VOICE_ID
|
||||
model_id = DEFAULT_ELEVENLABS_STREAMING_MODEL_ID
|
||||
|
||||
tts_config = _load_tts_config()
|
||||
el_config = tts_config.get("elevenlabs", {})
|
||||
voice_id = el_config.get("voice_id", voice_id)
|
||||
model_id = el_config.get("streaming_model_id",
|
||||
el_config.get("model_id", model_id))
|
||||
|
||||
api_key = os.getenv("ELEVENLABS_API_KEY", "")
|
||||
if not api_key:
|
||||
logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled")
|
||||
else:
|
||||
try:
|
||||
ElevenLabs = _import_elevenlabs()
|
||||
client = ElevenLabs(api_key=api_key)
|
||||
except ImportError:
|
||||
logger.warning("elevenlabs package not installed; streaming TTS disabled")
|
||||
|
||||
# Open a single sounddevice output stream for the lifetime of
|
||||
# this function. ElevenLabs pcm_24000 produces signed 16-bit
|
||||
# little-endian mono PCM at 24 kHz.
|
||||
if client is not None:
|
||||
try:
|
||||
sd = _import_sounddevice()
|
||||
import numpy as _np
|
||||
output_stream = sd.OutputStream(
|
||||
samplerate=24000, channels=1, dtype="int16",
|
||||
)
|
||||
output_stream.start()
|
||||
except (ImportError, OSError) as exc:
|
||||
logger.debug("sounddevice not available: %s", exc)
|
||||
output_stream = None
|
||||
except Exception as exc:
|
||||
logger.warning("sounddevice OutputStream failed: %s", exc)
|
||||
output_stream = None
|
||||
|
||||
sentence_buf = ""
|
||||
min_sentence_len = 20
|
||||
long_flush_len = 100
|
||||
queue_timeout = 0.5
|
||||
_spoken_sentences: list[str] = [] # track spoken sentences to skip duplicates
|
||||
# Regex to strip complete <think>...</think> blocks from buffer
|
||||
_think_block_re = re.compile(r'<think[\s>].*?</think>', flags=re.DOTALL)
|
||||
|
||||
def _speak_sentence(sentence: str):
|
||||
"""Display sentence and optionally generate + play audio."""
|
||||
if stop_event.is_set():
|
||||
return
|
||||
cleaned = _strip_markdown_for_tts(sentence).strip()
|
||||
if not cleaned:
|
||||
return
|
||||
# Skip duplicate/near-duplicate sentences (LLM repetition)
|
||||
cleaned_lower = cleaned.lower().rstrip(".!,")
|
||||
for prev in _spoken_sentences:
|
||||
if prev.lower().rstrip(".!,") == cleaned_lower:
|
||||
return
|
||||
_spoken_sentences.append(cleaned)
|
||||
# Display raw sentence on screen before TTS processing
|
||||
if display_callback is not None:
|
||||
display_callback(sentence)
|
||||
# Skip audio generation if no TTS client available
|
||||
if client is None:
|
||||
return
|
||||
# Truncate very long sentences
|
||||
if len(cleaned) > MAX_TEXT_LENGTH:
|
||||
cleaned = cleaned[:MAX_TEXT_LENGTH]
|
||||
try:
|
||||
audio_iter = client.text_to_speech.convert(
|
||||
text=cleaned,
|
||||
voice_id=voice_id,
|
||||
model_id=model_id,
|
||||
output_format="pcm_24000",
|
||||
)
|
||||
if output_stream is not None:
|
||||
for chunk in audio_iter:
|
||||
if stop_event.is_set():
|
||||
break
|
||||
import numpy as _np
|
||||
audio_array = _np.frombuffer(chunk, dtype=_np.int16)
|
||||
output_stream.write(audio_array.reshape(-1, 1))
|
||||
else:
|
||||
# Fallback: write chunks to temp file and play via system player
|
||||
_play_via_tempfile(audio_iter, stop_event)
|
||||
except Exception as exc:
|
||||
logger.warning("Streaming TTS sentence failed: %s", exc)
|
||||
|
||||
def _play_via_tempfile(audio_iter, stop_evt):
|
||||
"""Write PCM chunks to a temp WAV file and play it."""
|
||||
tmp_path = None
|
||||
try:
|
||||
import wave
|
||||
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
||||
tmp_path = tmp.name
|
||||
with wave.open(tmp, "wb") as wf:
|
||||
wf.setnchannels(1)
|
||||
wf.setsampwidth(2) # 16-bit
|
||||
wf.setframerate(24000)
|
||||
for chunk in audio_iter:
|
||||
if stop_evt.is_set():
|
||||
break
|
||||
wf.writeframes(chunk)
|
||||
from tools.voice_mode import play_audio_file
|
||||
play_audio_file(tmp_path)
|
||||
except Exception as exc:
|
||||
logger.warning("Temp-file TTS fallback failed: %s", exc)
|
||||
finally:
|
||||
if tmp_path:
|
||||
try:
|
||||
os.unlink(tmp_path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
while not stop_event.is_set():
|
||||
# Read next delta from queue
|
||||
try:
|
||||
delta = text_queue.get(timeout=queue_timeout)
|
||||
except queue.Empty:
|
||||
# Timeout: if we have accumulated a long buffer, flush it
|
||||
if len(sentence_buf) > long_flush_len:
|
||||
_speak_sentence(sentence_buf)
|
||||
sentence_buf = ""
|
||||
continue
|
||||
|
||||
if delta is None:
|
||||
# End-of-text sentinel: strip any remaining think blocks, flush
|
||||
sentence_buf = _think_block_re.sub('', sentence_buf)
|
||||
if sentence_buf.strip():
|
||||
_speak_sentence(sentence_buf)
|
||||
break
|
||||
|
||||
sentence_buf += delta
|
||||
|
||||
# --- Think block filtering ---
|
||||
# Strip complete <think>...</think> blocks from buffer.
|
||||
# Works correctly even when tags span multiple deltas.
|
||||
sentence_buf = _think_block_re.sub('', sentence_buf)
|
||||
|
||||
# If an incomplete <think tag is at the end, wait for more data
|
||||
# before extracting sentences (the closing tag may arrive next).
|
||||
if '<think' in sentence_buf and '</think>' not in sentence_buf:
|
||||
continue
|
||||
|
||||
# Check for sentence boundaries
|
||||
while True:
|
||||
m = _SENTENCE_BOUNDARY_RE.search(sentence_buf)
|
||||
if m is None:
|
||||
break
|
||||
end_pos = m.end()
|
||||
sentence = sentence_buf[:end_pos]
|
||||
sentence_buf = sentence_buf[end_pos:]
|
||||
# Merge short fragments into the next sentence
|
||||
if len(sentence.strip()) < min_sentence_len:
|
||||
sentence_buf = sentence + sentence_buf
|
||||
break
|
||||
_speak_sentence(sentence)
|
||||
|
||||
# Drain any remaining items from the queue
|
||||
while True:
|
||||
try:
|
||||
text_queue.get_nowait()
|
||||
except queue.Empty:
|
||||
break
|
||||
|
||||
# output_stream is closed in the finally block below
|
||||
|
||||
except Exception as exc:
|
||||
logger.warning("Streaming TTS pipeline error: %s", exc)
|
||||
finally:
|
||||
# Always close the audio output stream to avoid locking the device
|
||||
if output_stream is not None:
|
||||
try:
|
||||
output_stream.stop()
|
||||
output_stream.close()
|
||||
except Exception:
|
||||
pass
|
||||
tts_done_event.set()
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Main -- quick diagnostics
|
||||
# ===========================================================================
|
||||
@@ -427,12 +690,19 @@ if __name__ == "__main__":
|
||||
print("🔊 Text-to-Speech Tool Module")
|
||||
print("=" * 50)
|
||||
|
||||
def _check(importer, label):
|
||||
try:
|
||||
importer()
|
||||
return True
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
print(f"\nProvider availability:")
|
||||
print(f" Edge TTS: {'✅ installed' if _HAS_EDGE_TTS else '❌ not installed (pip install edge-tts)'}")
|
||||
print(f" ElevenLabs: {'✅ installed' if _HAS_ELEVENLABS else '❌ not installed (pip install elevenlabs)'}")
|
||||
print(f" API Key: {'✅ set' if os.getenv('ELEVENLABS_API_KEY') else '❌ not set'}")
|
||||
print(f" OpenAI: {'✅ installed' if _HAS_OPENAI else '❌ not installed'}")
|
||||
print(f" API Key: {'✅ set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else '❌ not set (VOICE_TOOLS_OPENAI_KEY)'}")
|
||||
print(f" Edge TTS: {'installed' if _check(_import_edge_tts, 'edge') else 'not installed (pip install edge-tts)'}")
|
||||
print(f" ElevenLabs: {'installed' if _check(_import_elevenlabs, 'el') else 'not installed (pip install elevenlabs)'}")
|
||||
print(f" API Key: {'set' if os.getenv('ELEVENLABS_API_KEY') else 'not set'}")
|
||||
print(f" OpenAI: {'installed' if _check(_import_openai_client, 'oai') else 'not installed'}")
|
||||
print(f" API Key: {'set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else 'not set (VOICE_TOOLS_OPENAI_KEY)'}")
|
||||
print(f" ffmpeg: {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}")
|
||||
print(f"\n Output dir: {DEFAULT_OUTPUT_DIR}")
|
||||
|
||||
|
||||
783
tools/voice_mode.py
Normal file
783
tools/voice_mode.py
Normal file
@@ -0,0 +1,783 @@
|
||||
"""Voice Mode -- Push-to-talk audio recording and playback for the CLI.
|
||||
|
||||
Provides audio capture via sounddevice, WAV encoding via stdlib wave,
|
||||
STT dispatch via tools.transcription_tools, and TTS playback via
|
||||
sounddevice or system audio players.
|
||||
|
||||
Dependencies (optional):
|
||||
pip install sounddevice numpy
|
||||
or: pip install hermes-agent[voice]
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import wave
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lazy audio imports -- never imported at module level to avoid crashing
|
||||
# in headless environments (SSH, Docker, WSL, no PortAudio).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _import_audio():
|
||||
"""Lazy-import sounddevice and numpy. Returns (sd, np).
|
||||
|
||||
Raises ImportError or OSError if the libraries are not available
|
||||
(e.g. PortAudio missing on headless servers).
|
||||
"""
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
return sd, np
|
||||
|
||||
|
||||
def _audio_available() -> bool:
    """Return ``True`` when sounddevice/numpy import cleanly, else ``False``."""
    try:
        _import_audio()
    except (ImportError, OSError):
        return False
    return True
||||
|
||||
def detect_audio_environment() -> dict:
    """Probe whether the current environment supports audio I/O.

    Checks for SSH sessions, Docker containers, WSL, and a working
    sounddevice/PortAudio stack with at least one device.

    Returns:
        dict with ``available`` (bool, True only when no problems were
        found) and ``warnings`` (list of human-readable problem strings).
    """
    problems = []

    # Remote shells have no access to local audio hardware.
    if any(os.environ.get(name) for name in ('SSH_CLIENT', 'SSH_TTY', 'SSH_CONNECTION')):
        problems.append("Running over SSH -- no audio devices available")

    # Containers normally run without audio devices.
    if os.path.exists('/.dockerenv'):
        problems.append("Running inside Docker container -- no audio devices")

    # WSL exposes a Microsoft-branded kernel version string.
    try:
        with open('/proc/version', 'r') as fh:
            kernel_info = fh.read().lower()
        if 'microsoft' in kernel_info:
            problems.append("Running in WSL -- audio requires PulseAudio bridge to Windows")
    except (FileNotFoundError, PermissionError, OSError):
        pass

    # Finally, verify the audio stack itself can enumerate devices.
    try:
        sd, _ = _import_audio()
        try:
            if not sd.query_devices():
                problems.append("No audio input/output devices detected")
        except Exception:
            problems.append("Audio subsystem error (PortAudio cannot query devices)")
    except (ImportError, OSError):
        problems.append("Audio libraries not installed (pip install sounddevice numpy)")

    return {
        "available": not problems,
        "warnings": problems,
    }
|
||||
# ---------------------------------------------------------------------------
# Recording parameters
# ---------------------------------------------------------------------------
SAMPLE_RATE = 16000  # Whisper native rate
CHANNELS = 1  # Mono
DTYPE = "int16"  # 16-bit PCM
SAMPLE_WIDTH = 2  # bytes per sample (int16)
# NOTE(review): MAX_RECORDING_SECONDS is not referenced by code visible in
# this module — presumably enforced by a caller; confirm before removing.
MAX_RECORDING_SECONDS = 120  # Safety cap

# Silence detection defaults
SILENCE_RMS_THRESHOLD = 200  # RMS below this = silence (int16 range 0-32767)
SILENCE_DURATION_SECONDS = 3.0  # Seconds of continuous silence before auto-stop

# Temp directory for voice recordings (cleaned by cleanup_temp_recordings)
_TEMP_DIR = os.path.join(tempfile.gettempdir(), "hermes_voice")
||||
# ============================================================================
# Audio cues (beep tones)
# ============================================================================
def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> None:
    """Play a short beep tone using numpy + sounddevice.

    Best-effort: beeps are cosmetic cues, so every failure path returns
    silently (or logs at debug) instead of raising.

    Args:
        frequency: Tone frequency in Hz (default 880 = A5).
        duration: Duration of each beep in seconds.
        count: Number of beeps to play (with short gap between).
    """
    try:
        sd, np = _import_audio()
    except (ImportError, OSError):
        return
    try:
        gap = 0.06  # seconds between beeps
        samples_per_beep = int(SAMPLE_RATE * duration)
        samples_per_gap = int(SAMPLE_RATE * gap)
        if count <= 0 or samples_per_beep <= 0:
            return  # nothing audible to synthesize

        parts = []
        for i in range(count):
            t = np.linspace(0, duration, samples_per_beep, endpoint=False)
            tone = np.sin(2 * np.pi * frequency * t)
            # Apply fade in/out to avoid click artifacts.
            # Guard fade_len > 0: for very short durations samples_per_beep // 4
            # is 0 and tone[-0:] would address the WHOLE array, raising a
            # broadcast error against the empty linspace ramp.
            fade_len = min(int(SAMPLE_RATE * 0.01), samples_per_beep // 4)
            if fade_len > 0:
                tone[:fade_len] *= np.linspace(0, 1, fade_len)
                tone[-fade_len:] *= np.linspace(1, 0, fade_len)
            parts.append((tone * 0.3 * 32767).astype(np.int16))
            if i < count - 1:
                parts.append(np.zeros(samples_per_gap, dtype=np.int16))

        audio = np.concatenate(parts)
        sd.play(audio, samplerate=SAMPLE_RATE)
        # sd.wait() calls Event.wait() without timeout — hangs forever if the
        # audio device stalls. Poll with a 2s ceiling and force-stop.
        deadline = time.monotonic() + 2.0
        while sd.get_stream() and sd.get_stream().active and time.monotonic() < deadline:
            time.sleep(0.01)
        sd.stop()
    except Exception as e:
        logger.debug("Beep playback failed: %s", e)
||||
|
||||
# ============================================================================
# AudioRecorder
# ============================================================================
class AudioRecorder:
    """Thread-safe audio recorder using sounddevice.InputStream.

    Usage::

        recorder = AudioRecorder()
        recorder.start(on_silence_stop=my_callback)
        # ... user speaks ...
        wav_path = recorder.stop()  # returns path to WAV file
        # or
        recorder.cancel()  # discard without saving

    If ``on_silence_stop`` is provided, recording automatically stops when
    the user is silent for ``silence_duration`` seconds and calls the callback.

    Threading model: the sounddevice callback runs on a PortAudio thread and
    updates the detection state; public methods run on the caller's thread and
    guard state transitions with ``self._lock``.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._stream: Any = None  # persistent InputStream (created lazily)
        self._frames: List[Any] = []  # captured int16 numpy chunks
        self._recording = False
        self._start_time: float = 0.0
        # Silence detection state
        self._has_spoken = False
        self._speech_start: float = 0.0  # When speech attempt began
        self._dip_start: float = 0.0  # When current below-threshold dip began
        self._min_speech_duration: float = 0.3  # Seconds of speech needed to confirm
        self._max_dip_tolerance: float = 0.3  # Max dip duration before resetting speech
        self._silence_start: float = 0.0
        self._resume_start: float = 0.0  # Tracks sustained speech after silence starts
        self._resume_dip_start: float = 0.0  # Dip tolerance tracker for resume detection
        self._on_silence_stop = None
        self._silence_threshold: int = SILENCE_RMS_THRESHOLD
        self._silence_duration: float = SILENCE_DURATION_SECONDS
        self._max_wait: float = 15.0  # Max seconds to wait for speech before auto-stop
        # Peak RMS seen during recording (for speech presence check in stop())
        self._peak_rms: int = 0
        # Live audio level (read by UI for visual feedback)
        self._current_rms: int = 0

    # -- public properties ---------------------------------------------------

    @property
    def is_recording(self) -> bool:
        """True while frames are being collected."""
        return self._recording

    @property
    def elapsed_seconds(self) -> float:
        """Seconds since the current recording started (0.0 when idle)."""
        if not self._recording:
            return 0.0
        return time.monotonic() - self._start_time

    @property
    def current_rms(self) -> int:
        """Current audio input RMS level (0-32767). Updated each audio chunk."""
        return self._current_rms

    # -- public methods ------------------------------------------------------

    def _ensure_stream(self) -> None:
        """Create the audio InputStream once and keep it alive.

        The stream stays open for the lifetime of the recorder. Between
        recordings the callback simply discards audio chunks (``_recording``
        is ``False``). This avoids the CoreAudio bug where closing and
        re-opening an ``InputStream`` hangs indefinitely on macOS.

        Raises:
            RuntimeError: if the input stream cannot be opened/started.
        """
        if self._stream is not None:
            return  # already alive

        sd, np = _import_audio()

        def _callback(indata, frames, time_info, status):  # noqa: ARG001
            # Runs on the PortAudio thread for every captured chunk.
            if status:
                logger.debug("sounddevice status: %s", status)
            # When not recording the stream is idle — discard audio.
            if not self._recording:
                return
            self._frames.append(indata.copy())

            # Compute RMS for level display and silence detection
            rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
            self._current_rms = rms
            if rms > self._peak_rms:
                self._peak_rms = rms

            # Silence detection (only when an auto-stop callback is armed)
            if self._on_silence_stop is not None:
                now = time.monotonic()
                elapsed = now - self._start_time

                if rms > self._silence_threshold:
                    # Audio is above threshold -- this is speech (or noise).
                    self._dip_start = 0.0  # Reset dip tracker
                    if self._speech_start == 0.0:
                        self._speech_start = now
                    elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration:
                        self._has_spoken = True
                        logger.debug("Speech confirmed (%.2fs above threshold)",
                                     now - self._speech_start)
                    # After speech is confirmed, only reset silence timer if
                    # speech is sustained (>0.3s above threshold). Brief
                    # spikes from ambient noise should NOT reset the timer.
                    if not self._has_spoken:
                        self._silence_start = 0.0
                    else:
                        # Track resumed speech with dip tolerance.
                        # Brief dips below threshold are normal during speech,
                        # so we mirror the initial speech detection pattern:
                        # start tracking, tolerate short dips, confirm after 0.3s.
                        self._resume_dip_start = 0.0  # Above threshold — no dip
                        if self._resume_start == 0.0:
                            self._resume_start = now
                        elif now - self._resume_start >= self._min_speech_duration:
                            self._silence_start = 0.0
                            self._resume_start = 0.0
                elif self._has_spoken:
                    # Below threshold after speech confirmed.
                    # Use dip tolerance before resetting resume tracker —
                    # natural speech has brief dips below threshold.
                    if self._resume_start > 0:
                        if self._resume_dip_start == 0.0:
                            self._resume_dip_start = now
                        elif now - self._resume_dip_start >= self._max_dip_tolerance:
                            # Sustained dip — user actually stopped speaking
                            self._resume_start = 0.0
                            self._resume_dip_start = 0.0
                elif self._speech_start > 0:
                    # We were in a speech attempt but RMS dipped.
                    # Tolerate brief dips (micro-pauses between syllables).
                    if self._dip_start == 0.0:
                        self._dip_start = now
                    elif now - self._dip_start >= self._max_dip_tolerance:
                        # Dip lasted too long -- genuine silence, reset
                        logger.debug("Speech attempt reset (dip lasted %.2fs)",
                                     now - self._dip_start)
                        self._speech_start = 0.0
                        self._dip_start = 0.0

                # Fire silence callback when:
                # 1. User spoke then went silent for silence_duration, OR
                # 2. No speech detected at all for max_wait seconds
                should_fire = False
                if self._has_spoken and rms <= self._silence_threshold:
                    # User was speaking and now is silent
                    if self._silence_start == 0.0:
                        self._silence_start = now
                    elif now - self._silence_start >= self._silence_duration:
                        logger.info("Silence detected (%.1fs), auto-stopping",
                                    self._silence_duration)
                        should_fire = True
                elif not self._has_spoken and elapsed >= self._max_wait:
                    logger.info("No speech within %.0fs, auto-stopping",
                                self._max_wait)
                    should_fire = True

                if should_fire:
                    # Swap the callback out under the lock so it fires at most
                    # once even if several chunks race past the deadline.
                    with self._lock:
                        cb = self._on_silence_stop
                        self._on_silence_stop = None  # fire only once
                    if cb:
                        # Run user code on a separate daemon thread — never
                        # block (or crash) the PortAudio callback thread.
                        def _safe_cb():
                            try:
                                cb()
                            except Exception as e:
                                logger.error("Silence callback failed: %s", e, exc_info=True)
                        threading.Thread(target=_safe_cb, daemon=True).start()

        # Create stream — may block on CoreAudio (first call only).
        stream = None
        try:
            stream = sd.InputStream(
                samplerate=SAMPLE_RATE,
                channels=CHANNELS,
                dtype=DTYPE,
                callback=_callback,
            )
            stream.start()
        except Exception as e:
            if stream is not None:
                try:
                    stream.close()
                except Exception:
                    pass
            raise RuntimeError(
                f"Failed to open audio input stream: {e}. "
                "Check that a microphone is connected and accessible."
            ) from e
        self._stream = stream

    def start(self, on_silence_stop=None) -> None:
        """Start capturing audio from the default input device.

        The underlying InputStream is created once and kept alive across
        recordings. Subsequent calls simply reset detection state and
        toggle frame collection via ``_recording``.

        Args:
            on_silence_stop: Optional callback invoked (in a daemon thread) when
                silence is detected after speech. The callback receives no arguments.
                Use this to auto-stop recording and trigger transcription.

        Raises:
            RuntimeError: if sounddevice/numpy are not installed, or if the
                input stream cannot be opened (via ``_ensure_stream``).
        """
        try:
            _import_audio()
        except (ImportError, OSError) as e:
            raise RuntimeError(
                "Voice mode requires sounddevice and numpy.\n"
                "Install with: pip install sounddevice numpy\n"
                "Or: pip install hermes-agent[voice]"
            ) from e

        with self._lock:
            if self._recording:
                return  # already recording

            # Reset capture buffers and the whole detection state machine.
            self._frames = []
            self._start_time = time.monotonic()
            self._has_spoken = False
            self._speech_start = 0.0
            self._dip_start = 0.0
            self._silence_start = 0.0
            self._resume_start = 0.0
            self._resume_dip_start = 0.0
            self._peak_rms = 0
            self._current_rms = 0
            self._on_silence_stop = on_silence_stop

        # Ensure the persistent stream is alive (no-op after first call).
        # Done OUTSIDE the lock — stream creation can block on CoreAudio.
        self._ensure_stream()

        with self._lock:
            self._recording = True
        logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS)

    def _close_stream_with_timeout(self, timeout: float = 3.0) -> None:
        """Close the audio stream with a timeout to prevent CoreAudio hangs."""
        if self._stream is None:
            return

        stream = self._stream
        self._stream = None

        def _do_close():
            try:
                stream.stop()
                stream.close()
            except Exception:
                pass

        t = threading.Thread(target=_do_close, daemon=True)
        t.start()
        # Poll in short intervals so Ctrl+C is not blocked
        # NOTE(review): __import__("time") re-imports a module that is already
        # imported at the top of this file — behaviorally identical to
        # time.monotonic(), just harder to read.
        deadline = __import__("time").monotonic() + timeout
        while t.is_alive() and __import__("time").monotonic() < deadline:
            t.join(timeout=0.1)
        if t.is_alive():
            logger.warning("Audio stream close timed out after %.1fs — forcing ahead", timeout)

    def stop(self) -> Optional[str]:
        """Stop recording and write captured audio to a WAV file.

        The underlying stream is kept alive for reuse — only frame
        collection is stopped. Recordings shorter than 0.3s or whose peak
        RMS never exceeded ``SILENCE_RMS_THRESHOLD`` are discarded.

        Returns:
            Path to the WAV file, or ``None`` if no (usable) audio was captured.
        """
        with self._lock:
            if not self._recording:
                return None

            self._recording = False
            self._current_rms = 0
            # Stream stays alive — no close needed.

        if not self._frames:
            return None

        # Concatenate frames and write WAV
        _, np = _import_audio()
        audio_data = np.concatenate(self._frames, axis=0)
        self._frames = []

        elapsed = time.monotonic() - self._start_time
        logger.info("Voice recording stopped (%.1fs, %d samples)", elapsed, len(audio_data))

        # Skip very short recordings (< 0.3s of audio)
        min_samples = int(SAMPLE_RATE * 0.3)
        if len(audio_data) < min_samples:
            logger.debug("Recording too short (%d samples), discarding", len(audio_data))
            return None

        # Skip silent recordings using peak RMS (not overall average, which
        # gets diluted by silence at the end of the recording).
        if self._peak_rms < SILENCE_RMS_THRESHOLD:
            logger.info("Recording too quiet (peak RMS=%d < %d), discarding",
                        self._peak_rms, SILENCE_RMS_THRESHOLD)
            return None

        return self._write_wav(audio_data)

    def cancel(self) -> None:
        """Stop recording and discard all captured audio.

        The underlying stream is kept alive for reuse.
        """
        with self._lock:
            self._recording = False
            self._frames = []
            self._on_silence_stop = None
            self._current_rms = 0
        logger.info("Voice recording cancelled")

    def shutdown(self) -> None:
        """Release the audio stream. Call when voice mode is disabled."""
        with self._lock:
            self._recording = False
            self._frames = []
            self._on_silence_stop = None
        # Close stream OUTSIDE the lock to avoid deadlock with audio callback
        self._close_stream_with_timeout()
        logger.info("AudioRecorder shut down")

    # -- private helpers -----------------------------------------------------

    @staticmethod
    def _write_wav(audio_data) -> str:
        """Write numpy int16 audio data to a WAV file under ``_TEMP_DIR``.

        Returns:
            The file path of the written WAV (mono, 16 kHz, 16-bit PCM).
        """
        os.makedirs(_TEMP_DIR, exist_ok=True)
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        wav_path = os.path.join(_TEMP_DIR, f"recording_{timestamp}.wav")

        with wave.open(wav_path, "wb") as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(SAMPLE_WIDTH)
            wf.setframerate(SAMPLE_RATE)
            wf.writeframes(audio_data.tobytes())

        file_size = os.path.getsize(wav_path)
        logger.info("WAV written: %s (%d bytes)", wav_path, file_size)
        return wav_path
||||
|
||||
|
||||
# ============================================================================
# Whisper hallucination filter
# ============================================================================
# Whisper commonly hallucinates these phrases on silent/near-silent audio.
WHISPER_HALLUCINATIONS = {
    "thank you.",
    "thank you",
    "thanks for watching.",
    "thanks for watching",
    "subscribe to my channel.",
    "subscribe to my channel",
    "like and subscribe.",
    "like and subscribe",
    "please subscribe.",
    "please subscribe",
    "thank you for watching.",
    "thank you for watching",
    "bye.",
    "bye",
    "you",
    "the end.",
    "the end",
    # Non-English hallucinations (common on silence)
    "продолжение следует",
    "продолжение следует...",
    "sous-titres",
    "sous-titres réalisés par la communauté d'amara.org",
    "sottotitoli creati dalla comunità amara.org",
    "untertitel von stephanie geiges",
    "amara.org",
    "www.mooji.org",
    "ご視聴ありがとうございました",
}

# Regex patterns for repetitive hallucinations (e.g. "Thank you. Thank you. Thank you.")
_HALLUCINATION_REPEAT_RE = re.compile(
    r'^(?:thank you|thanks|bye|you|ok|okay|the end|\.|\s|,|!)+$',
    flags=re.IGNORECASE,
)


def is_whisper_hallucination(transcript: str) -> bool:
    """Return True when *transcript* looks like a Whisper silence hallucination.

    A transcript counts as a hallucination when it is empty, matches a known
    phrase (with or without trailing '.'/'!'), or consists solely of repeated
    filler tokens ("thank you", "bye", punctuation, ...).
    """
    text = transcript.strip().lower()
    if not text:
        return True
    # Known phrase, tolerating trailing '.' / '!' punctuation.
    if text in WHISPER_HALLUCINATIONS or text.rstrip('.!') in WHISPER_HALLUCINATIONS:
        return True
    # Repetitive filler (e.g. "Thank you. Thank you. Thank you. you").
    return bool(_HALLUCINATION_REPEAT_RE.match(text))
||||
|
||||
|
||||
# ============================================================================
# STT dispatch
# ============================================================================
def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str, Any]:
    """Transcribe a WAV recording using the existing Whisper pipeline.

    Thin wrapper around ``tools.transcription_tools.transcribe_audio`` that
    additionally drops known Whisper hallucinations produced on silence.

    Args:
        wav_path: Path to the WAV file.
        model: Whisper model name (default: from config or ``whisper-1``).

    Returns:
        Dict with ``success``, ``transcript``, and optionally ``error``.
        When a hallucination is filtered, ``transcript`` is empty and
        ``filtered`` is set to True.
    """
    from tools.transcription_tools import transcribe_audio

    result = transcribe_audio(wav_path, model=model)

    if not result.get("success"):
        return result

    # Filter out Whisper hallucinations (common on silent/near-silent audio)
    if is_whisper_hallucination(result.get("transcript", "")):
        logger.info("Filtered Whisper hallucination: %r", result["transcript"])
        return {"success": True, "transcript": "", "filtered": True}

    return result
|
||||
|
||||
|
||||
# ============================================================================
# Audio playback (interruptable)
# ============================================================================

# Global reference to the active playback process so it can be interrupted.
_active_playback: Optional[subprocess.Popen] = None
_playback_lock = threading.Lock()


def stop_playback() -> None:
    """Interrupt the currently playing audio (if any).

    Terminates the registered system-player subprocess (when one is alive)
    and stops any in-progress sounddevice playback. Safe to call at any
    time, including when nothing is playing.
    """
    global _active_playback
    with _playback_lock:
        proc, _active_playback = _active_playback, None
    if proc is not None and proc.poll() is None:
        try:
            proc.terminate()
            logger.info("Audio playback interrupted")
        except Exception:
            pass
    # Also stop sounddevice playback if active; best-effort when the
    # audio stack is unavailable.
    try:
        sd, _ = _import_audio()
        sd.stop()
    except Exception:
        pass
||||
|
||||
|
||||
def play_audio_file(file_path: str) -> bool:
    """Play an audio file through the default output device.

    Strategy:
      1. WAV files via ``sounddevice.play()`` when available.
      2. System commands: ``afplay`` (macOS), ``ffplay`` (cross-platform),
         ``aplay`` (Linux ALSA).

    Playback can be interrupted by calling ``stop_playback()``.

    Returns:
        ``True`` if playback succeeded, ``False`` otherwise.
    """
    global _active_playback

    if not os.path.isfile(file_path):
        logger.warning("Audio file not found: %s", file_path)
        return False

    # Preferred path: decode the WAV ourselves and play via sounddevice.
    if file_path.endswith(".wav"):
        try:
            sd, np = _import_audio()
            with wave.open(file_path, "rb") as wf:
                raw = wf.readframes(wf.getnframes())
                pcm = np.frombuffer(raw, dtype=np.int16)
                rate = wf.getframerate()

            sd.play(pcm, samplerate=rate)
            # sd.wait() calls Event.wait() without timeout — hangs forever if
            # the audio device stalls. Poll with a ceiling derived from the
            # clip length and then force-stop.
            deadline = time.monotonic() + (len(pcm) / rate) + 2.0
            while sd.get_stream() and sd.get_stream().active and time.monotonic() < deadline:
                time.sleep(0.01)
            sd.stop()
            return True
        except (ImportError, OSError):
            pass  # audio libs not available, fall through to system players
        except Exception as e:
            logger.debug("sounddevice playback failed: %s", e)

    # Fall back to system audio players (Popen so stop_playback can kill them).
    system = platform.system()
    candidates = []
    if system == "Darwin":
        candidates.append(["afplay", file_path])
    candidates.append(["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet", file_path])
    if system == "Linux":
        candidates.append(["aplay", "-q", file_path])

    for cmd in candidates:
        if not shutil.which(cmd[0]):
            continue
        try:
            proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            with _playback_lock:
                _active_playback = proc
            proc.wait(timeout=300)
            with _playback_lock:
                _active_playback = None
            return True
        except subprocess.TimeoutExpired:
            logger.warning("System player %s timed out, killing process", cmd[0])
            proc.kill()
            proc.wait()
            with _playback_lock:
                _active_playback = None
        except Exception as e:
            logger.debug("System player %s failed: %s", cmd[0], e)
            with _playback_lock:
                _active_playback = None

    logger.warning("No audio player available for %s", file_path)
    return False
|
||||
|
||||
|
||||
# ============================================================================
# Requirements check
# ============================================================================
def check_voice_requirements() -> Dict[str, Any]:
    """Check if all voice mode requirements are met.

    Returns:
        Dict with ``available``, ``audio_available``, ``stt_available``,
        ``missing_packages``, ``details`` (human-readable summary), and
        ``environment`` (raw result of :func:`detect_audio_environment`).
    """
    # Determine STT provider availability from the shared transcription config.
    from tools.transcription_tools import _get_provider, _load_stt_config, _HAS_FASTER_WHISPER

    stt_provider = _get_provider(_load_stt_config())
    stt_available = stt_provider != "none"

    has_audio = _audio_available()
    missing: List[str] = [] if has_audio else ["sounddevice", "numpy"]

    # Environment detection (SSH / Docker / WSL / device enumeration)
    env_check = detect_audio_environment()

    details_parts = [
        "Audio capture: OK" if has_audio
        else "Audio capture: MISSING (pip install sounddevice numpy)"
    ]

    provider_messages = {
        "local": "STT provider: OK (local faster-whisper)",
        "groq": "STT provider: OK (Groq)",
        "openai": "STT provider: OK (OpenAI)",
    }
    details_parts.append(provider_messages.get(
        stt_provider,
        "STT provider: MISSING (pip install faster-whisper, "
        "or set GROQ_API_KEY / VOICE_TOOLS_OPENAI_KEY)",
    ))

    details_parts.extend(f"Environment: {w}" for w in env_check["warnings"])

    return {
        "available": has_audio and stt_available and env_check["available"],
        "audio_available": has_audio,
        "stt_available": stt_available,
        "missing_packages": missing,
        "details": "\n".join(details_parts),
        "environment": env_check,
    }
|
||||
|
||||
|
||||
# ============================================================================
# Temp file cleanup
# ============================================================================
def cleanup_temp_recordings(max_age_seconds: int = 3600) -> int:
    """Remove old temporary voice recording files.

    Only files named ``recording_*.wav`` inside the module's private temp
    directory are considered, so unrelated files are never touched.

    Args:
        max_age_seconds: Delete files older than this (default: 1 hour).

    Returns:
        Number of files deleted.
    """
    if not os.path.isdir(_TEMP_DIR):
        return 0

    deleted = 0
    now = time.time()

    # os.scandir returns an iterator holding an OS directory handle; using it
    # as a context manager closes the handle promptly instead of leaking it
    # until garbage collection.
    with os.scandir(_TEMP_DIR) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.startswith("recording_") and entry.name.endswith(".wav"):
                try:
                    age = now - entry.stat().st_mtime
                    if age > max_age_seconds:
                        os.unlink(entry.path)
                        deleted += 1
                except OSError:
                    # File vanished or is inaccessible — best-effort cleanup.
                    pass

    if deleted:
        logger.debug("Cleaned up %d old voice recordings", deleted)
    return deleted
|
||||
Reference in New Issue
Block a user