From 1f72ce71b7d03e9d7498c4d97d4d746bea46588c Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sun, 15 Mar 2026 21:51:40 -0700 Subject: [PATCH] fix: restore local STT fallback for gateway voice notes Restore local STT command fallback for voice transcription, detect whisper and ffmpeg in common local install paths, and avoid bogus no-provider messaging when only a backend-specific key is missing. --- gateway/run.py | 5 +- tests/gateway/test_stt_config.py | 24 +++ tests/tools/test_transcription_tools.py | 97 +++++++++ tools/transcription_tools.py | 192 +++++++++++++++++- .../docs/reference/environment-variables.md | 4 +- website/docs/user-guide/features/tts.md | 20 +- 6 files changed, 324 insertions(+), 18 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 7b6dec173..7475564d5 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -3635,7 +3635,10 @@ class GatewayRunner: ) else: error = result.get("error", "unknown error") - if "No STT provider" in error or "not set" in error: + if ( + "No STT provider" in error + or error.startswith("Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set") + ): enriched_parts.append( "[The user sent a voice message but I can't listen " "to it right now~ No STT provider is configured " diff --git a/tests/gateway/test_stt_config.py b/tests/gateway/test_stt_config.py index d5a9fc55b..436afd7c1 100644 --- a/tests/gateway/test_stt_config.py +++ b/tests/gateway/test_stt_config.py @@ -51,3 +51,27 @@ async def test_enrich_message_with_transcription_skips_when_stt_disabled(): assert "transcription is disabled" in result.lower() assert "caption" in result + + +@pytest.mark.asyncio +async def test_enrich_message_with_transcription_avoids_bogus_no_provider_message_for_backend_key_errors(): + from gateway.run import GatewayRunner + + runner = GatewayRunner.__new__(GatewayRunner) + runner.config = GatewayConfig(stt_enabled=True) + + with patch( + "tools.transcription_tools.transcribe_audio", + return_value={"success": False, "error": 
"VOICE_TOOLS_OPENAI_KEY not set"}, + ), patch( + "tools.transcription_tools.get_stt_model_from_config", + return_value=None, + ): + result = await runner._enrich_message_with_transcription( + "caption", + ["/tmp/voice.ogg"], + ) + + assert "No STT provider is configured" not in result + assert "trouble transcribing" in result + assert "caption" in result diff --git a/tests/tools/test_transcription_tools.py b/tests/tools/test_transcription_tools.py index 2f5b7cfbe..a74fde049 100644 --- a/tests/tools/test_transcription_tools.py +++ b/tests/tools/test_transcription_tools.py @@ -7,6 +7,7 @@ end-to-end dispatch. All external dependencies are mocked. import os import struct +import subprocess import wave from unittest.mock import MagicMock, patch @@ -45,7 +46,10 @@ def sample_ogg(tmp_path): def clean_env(monkeypatch): """Ensure no real API keys leak into tests.""" monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) monkeypatch.delenv("GROQ_API_KEY", raising=False) + monkeypatch.delenv("HERMES_LOCAL_STT_COMMAND", raising=False) + monkeypatch.delenv("HERMES_LOCAL_STT_LANGUAGE", raising=False) # ============================================================================ @@ -132,6 +136,19 @@ class TestGetProviderFallbackPriority: from tools.transcription_tools import _get_provider assert _get_provider({}) == "local" + def test_openai_fallback_to_local_command(self, monkeypatch): + monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("GROQ_API_KEY", raising=False) + monkeypatch.setenv( + "HERMES_LOCAL_STT_COMMAND", + "whisper {input_path} --output_dir {output_dir} --language {language}", + ) + with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \ + patch("tools.transcription_tools._HAS_OPENAI", True): + from tools.transcription_tools import _get_provider + assert _get_provider({"provider": "openai"}) == 
"local_command" + # ============================================================================ # _transcribe_groq @@ -279,6 +296,63 @@ class TestTranscribeOpenAIExtended: assert "Permission denied" in result["error"] +class TestTranscribeLocalCommand: + def test_auto_detects_local_whisper_binary(self, monkeypatch): + monkeypatch.delenv("HERMES_LOCAL_STT_COMMAND", raising=False) + monkeypatch.setattr("tools.transcription_tools._find_whisper_binary", lambda: "/opt/homebrew/bin/whisper") + + from tools.transcription_tools import _get_local_command_template + + template = _get_local_command_template() + + assert template is not None + assert template.startswith("/opt/homebrew/bin/whisper ") + assert "{model}" in template + assert "{output_dir}" in template + + def test_command_fallback_with_template(self, monkeypatch, sample_ogg, tmp_path): + out_dir = tmp_path / "local-out" + out_dir.mkdir() + + monkeypatch.setenv( + "HERMES_LOCAL_STT_COMMAND", + "whisper {input_path} --model {model} --output_dir {output_dir} --language {language}", + ) + monkeypatch.setenv("HERMES_LOCAL_STT_LANGUAGE", "en") + + def fake_tempdir(prefix=None): + class _TempDir: + def __enter__(self_inner): + return str(out_dir) + + def __exit__(self_inner, exc_type, exc, tb): + return False + + return _TempDir() + + def fake_run(cmd, *args, **kwargs): + if isinstance(cmd, list): + output_path = cmd[-1] + with open(output_path, "wb") as handle: + handle.write(b"RIFF....WAVEfmt ") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + (out_dir / "test.txt").write_text("hello from local command\n", encoding="utf-8") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr("tools.transcription_tools.tempfile.TemporaryDirectory", fake_tempdir) + monkeypatch.setattr("tools.transcription_tools._find_ffmpeg_binary", lambda: "/opt/homebrew/bin/ffmpeg") + monkeypatch.setattr("tools.transcription_tools.subprocess.run", fake_run) + + from 
tools.transcription_tools import _transcribe_local_command + + result = _transcribe_local_command(sample_ogg, "base") + + assert result["success"] is True + assert result["transcript"] == "hello from local command" + assert result["provider"] == "local_command" + + # ============================================================================ # _transcribe_local — additional tests # ============================================================================ @@ -612,6 +686,29 @@ class TestTranscribeAudioDispatch: assert "faster-whisper" in result["error"] assert "GROQ_API_KEY" in result["error"] + def test_openai_provider_falls_back_to_local_command(self, monkeypatch, sample_ogg): + monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.setenv( + "HERMES_LOCAL_STT_COMMAND", + "whisper {input_path} --model {model} --output_dir {output_dir} --language {language}", + ) + + with patch("tools.transcription_tools._load_stt_config", return_value={"provider": "openai"}), \ + patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \ + patch("tools.transcription_tools._HAS_OPENAI", True), \ + patch("tools.transcription_tools._transcribe_local_command", return_value={ + "success": True, + "transcript": "hello from fallback", + "provider": "local_command", + }) as mock_local_command: + from tools.transcription_tools import transcribe_audio + result = transcribe_audio(sample_ogg) + + assert result["success"] is True + assert result["transcript"] == "hello from fallback" + mock_local_command.assert_called_once_with(sample_ogg, "base") + def test_invalid_file_short_circuits(self): from tools.transcription_tools import transcribe_audio result = transcribe_audio("/nonexistent/audio.wav") diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py index 684d0a8d8..d279dbd37 100644 --- a/tools/transcription_tools.py +++ b/tools/transcription_tools.py @@ -25,6 +25,10 @@ Usage:: import logging 
import os +import shlex +import shutil +import subprocess +import tempfile from pathlib import Path from typing import Optional, Dict, Any @@ -44,13 +48,18 @@ _HAS_OPENAI = _ilu.find_spec("openai") is not None DEFAULT_PROVIDER = "local" DEFAULT_LOCAL_MODEL = "base" +DEFAULT_LOCAL_STT_LANGUAGE = "en" DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1") DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo") +LOCAL_STT_COMMAND_ENV = "HERMES_LOCAL_STT_COMMAND" +LOCAL_STT_LANGUAGE_ENV = "HERMES_LOCAL_STT_LANGUAGE" +COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin") GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1") OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1") SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"} +LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"} MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB # Known model sets for auto-correction @@ -105,6 +114,53 @@ def is_stt_enabled(stt_config: Optional[dict] = None) -> bool: return bool(enabled) +def _resolve_openai_api_key() -> str: + """Prefer the voice-tools key, but fall back to the normal OpenAI key.""" + return os.getenv("VOICE_TOOLS_OPENAI_KEY", "") or os.getenv("OPENAI_API_KEY", "") + + +def _find_binary(binary_name: str) -> Optional[str]: + """Find a local binary, checking common Homebrew/local prefixes as well as PATH.""" + for directory in COMMON_LOCAL_BIN_DIRS: + candidate = Path(directory) / binary_name + if candidate.exists() and os.access(candidate, os.X_OK): + return str(candidate) + return shutil.which(binary_name) + + +def _find_ffmpeg_binary() -> Optional[str]: + return _find_binary("ffmpeg") + + +def _find_whisper_binary() -> Optional[str]: + return _find_binary("whisper") + + +def _get_local_command_template() -> Optional[str]: + configured = os.getenv(LOCAL_STT_COMMAND_ENV, "").strip() + if configured: + return configured + + whisper_binary = 
_find_whisper_binary() + if whisper_binary: + quoted_binary = shlex.quote(whisper_binary) + return ( + f"{quoted_binary} {{input_path}} --model {{model}} --output_format txt " + "--output_dir {output_dir} --language {language}" + ) + return None + + +def _has_local_command() -> bool: + return _get_local_command_template() is not None + + +def _normalize_local_command_model(model_name: Optional[str]) -> str: + if not model_name or model_name in OPENAI_MODELS or model_name in GROQ_MODELS: + return DEFAULT_LOCAL_MODEL + return model_name + + def _get_provider(stt_config: dict) -> str: """Determine which STT provider to use. @@ -121,15 +177,32 @@ def _get_provider(stt_config: dict) -> str: if provider == "local": if _HAS_FASTER_WHISPER: return "local" + if _has_local_command(): + logger.info("faster-whisper not installed, falling back to local STT command") + return "local_command" # Local requested but not available — fall back to groq, then openai if _HAS_OPENAI and os.getenv("GROQ_API_KEY"): logger.info("faster-whisper not installed, falling back to Groq Whisper API") return "groq" - if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): + if _HAS_OPENAI and _resolve_openai_api_key(): logger.info("faster-whisper not installed, falling back to OpenAI Whisper API") return "openai" return "none" + if provider == "local_command": + if _has_local_command(): + return "local_command" + if _HAS_FASTER_WHISPER: + logger.info("Local STT command unavailable, falling back to local faster-whisper") + return "local" + if _HAS_OPENAI and os.getenv("GROQ_API_KEY"): + logger.info("Local STT command unavailable, falling back to Groq Whisper API") + return "groq" + if _HAS_OPENAI and _resolve_openai_api_key(): + logger.info("Local STT command unavailable, falling back to OpenAI Whisper API") + return "openai" + return "none" + if provider == "groq": if _HAS_OPENAI and os.getenv("GROQ_API_KEY"): return "groq" @@ -137,20 +210,26 @@ def _get_provider(stt_config: dict) -> str: if 
_HAS_FASTER_WHISPER: logger.info("GROQ_API_KEY not set, falling back to local faster-whisper") return "local" - if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): + if _has_local_command(): + logger.info("GROQ_API_KEY not set, falling back to local STT command") + return "local_command" + if _HAS_OPENAI and _resolve_openai_api_key(): logger.info("GROQ_API_KEY not set, falling back to OpenAI Whisper API") return "openai" return "none" if provider == "openai": - if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): + if _HAS_OPENAI and _resolve_openai_api_key(): return "openai" # OpenAI requested but no key — fall back if _HAS_FASTER_WHISPER: - logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to local faster-whisper") + logger.info("OpenAI STT key not set, falling back to local faster-whisper") return "local" + if _has_local_command(): + logger.info("OpenAI STT key not set, falling back to local STT command") + return "local_command" if _HAS_OPENAI and os.getenv("GROQ_API_KEY"): - logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to Groq Whisper API") + logger.info("OpenAI STT key not set, falling back to Groq Whisper API") return "groq" return "none" @@ -222,6 +301,89 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]: logger.error("Local transcription failed: %s", e, exc_info=True) return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"} + +def _prepare_local_audio(file_path: str, work_dir: str) -> tuple[Optional[str], Optional[str]]: + """Normalize audio for local CLI STT when needed.""" + audio_path = Path(file_path) + if audio_path.suffix.lower() in LOCAL_NATIVE_AUDIO_FORMATS: + return file_path, None + + ffmpeg = _find_ffmpeg_binary() + if not ffmpeg: + return None, "Local STT fallback requires ffmpeg for non-WAV inputs, but ffmpeg was not found" + + converted_path = os.path.join(work_dir, f"{audio_path.stem}.wav") + command = [ffmpeg, "-y", "-i", file_path, converted_path] + + 
try: + subprocess.run(command, check=True, capture_output=True, text=True) + return converted_path, None + except subprocess.CalledProcessError as e: + details = e.stderr.strip() or e.stdout.strip() or str(e) + logger.error("ffmpeg conversion failed for %s: %s", file_path, details) + return None, f"Failed to convert audio for local STT: {details}" + + +def _transcribe_local_command(file_path: str, model_name: str) -> Dict[str, Any]: + """Run the configured local STT command template and read back a .txt transcript.""" + command_template = _get_local_command_template() + if not command_template: + return { + "success": False, + "transcript": "", + "error": ( + f"{LOCAL_STT_COMMAND_ENV} not configured and no local whisper binary was found" + ), + } + + language = os.getenv(LOCAL_STT_LANGUAGE_ENV, DEFAULT_LOCAL_STT_LANGUAGE) + normalized_model = _normalize_local_command_model(model_name) + + try: + with tempfile.TemporaryDirectory(prefix="hermes-local-stt-") as output_dir: + prepared_input, prep_error = _prepare_local_audio(file_path, output_dir) + if prep_error: + return {"success": False, "transcript": "", "error": prep_error} + + command = command_template.format( + input_path=shlex.quote(prepared_input), + output_dir=shlex.quote(output_dir), + language=shlex.quote(language), + model=shlex.quote(normalized_model), + ) + subprocess.run(command, shell=True, check=True, capture_output=True, text=True) + + txt_files = sorted(Path(output_dir).glob("*.txt")) + if not txt_files: + return { + "success": False, + "transcript": "", + "error": "Local STT command completed but did not produce a .txt transcript", + } + + transcript_text = txt_files[0].read_text(encoding="utf-8").strip() + logger.info( + "Transcribed %s via local STT command (%s, %d chars)", + Path(file_path).name, + normalized_model, + len(transcript_text), + ) + return {"success": True, "transcript": transcript_text, "provider": "local_command"} + + except KeyError as e: + return { + "success": False, + 
"transcript": "", + "error": f"Invalid {LOCAL_STT_COMMAND_ENV} template, missing placeholder: {e}", + } + except subprocess.CalledProcessError as e: + details = e.stderr.strip() or e.stdout.strip() or str(e) + logger.error("Local STT command failed for %s: %s", file_path, details) + return {"success": False, "transcript": "", "error": f"Local STT failed: {details}"} + except Exception as e: + logger.error("Unexpected error during local command transcription: %s", e, exc_info=True) + return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"} + # --------------------------------------------------------------------------- # Provider: groq (Whisper API — free tier) # --------------------------------------------------------------------------- @@ -277,9 +439,13 @@ def _transcribe_groq(file_path: str, model_name: str) -> Dict[str, Any]: def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]: """Transcribe using OpenAI Whisper API (paid).""" - api_key = os.getenv("VOICE_TOOLS_OPENAI_KEY") + api_key = _resolve_openai_api_key() if not api_key: - return {"success": False, "transcript": "", "error": "VOICE_TOOLS_OPENAI_KEY not set"} + return { + "success": False, + "transcript": "", + "error": "Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set", + } if not _HAS_OPENAI: return {"success": False, "transcript": "", "error": "openai package not installed"} @@ -363,6 +529,13 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A model_name = model or local_cfg.get("model", DEFAULT_LOCAL_MODEL) return _transcribe_local(file_path, model_name) + if provider == "local_command": + local_cfg = stt_config.get("local", {}) + model_name = _normalize_local_command_model( + model or local_cfg.get("model", DEFAULT_LOCAL_MODEL) + ) + return _transcribe_local_command(file_path, model_name) + if provider == "groq": model_name = model or DEFAULT_GROQ_STT_MODEL return _transcribe_groq(file_path, model_name) @@ 
-378,7 +551,8 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A "transcript": "", "error": ( "No STT provider available. Install faster-whisper for free local " - "transcription, set GROQ_API_KEY for free Groq Whisper, " - "or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API." + f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, " + "set GROQ_API_KEY for free Groq Whisper, or set VOICE_TOOLS_OPENAI_KEY " + "or OPENAI_API_KEY for the OpenAI Whisper API." ), } diff --git a/website/docs/reference/environment-variables.md b/website/docs/reference/environment-variables.md index bd75b2cbe..03e84d93f 100644 --- a/website/docs/reference/environment-variables.md +++ b/website/docs/reference/environment-variables.md @@ -31,7 +31,9 @@ All variables go in `~/.hermes/.env`. You can also set them with `hermes config | `CLAUDE_CODE_OAUTH_TOKEN` | Explicit Claude Code token override if you export one manually | | `HERMES_MODEL` | Preferred model name (checked before `LLM_MODEL`, used by gateway) | | `LLM_MODEL` | Default model name (fallback when not set in config.yaml) | -| `VOICE_TOOLS_OPENAI_KEY` | OpenAI key for OpenAI speech-to-text and text-to-speech providers | +| `VOICE_TOOLS_OPENAI_KEY` | Preferred OpenAI key for OpenAI speech-to-text and text-to-speech providers | +| `HERMES_LOCAL_STT_COMMAND` | Optional local speech-to-text command template. 
Supports `{input_path}`, `{output_dir}`, `{language}`, and `{model}` placeholders | +| `HERMES_LOCAL_STT_LANGUAGE` | Default language passed to `HERMES_LOCAL_STT_COMMAND` or auto-detected local `whisper` CLI fallback (default: `en`) | | `HERMES_HOME` | Override Hermes config directory (default: `~/.hermes`) | ## Provider Auth (OAuth) diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md index 6634ba2ab..c6ba365a1 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -74,10 +74,11 @@ Voice messages sent on Telegram, Discord, WhatsApp, Slack, or Signal are automat | Provider | Quality | Cost | API Key | |----------|---------|------|---------| | **Local Whisper** (default) | Good | Free | None needed | -| **OpenAI Whisper API** | Good–Best | Paid | `VOICE_TOOLS_OPENAI_KEY` | +| **Groq Whisper API** | Good–Best | Free tier | `GROQ_API_KEY` | +| **OpenAI Whisper API** | Good–Best | Paid | `VOICE_TOOLS_OPENAI_KEY` or `OPENAI_API_KEY` | :::info Zero Config -Local transcription works out of the box — no API key needed. The `faster-whisper` model (~150 MB for `base`) is auto-downloaded on first voice message. +Local transcription works out of the box when `faster-whisper` is installed. If that's unavailable, Hermes can also use a local `whisper` CLI from common install locations (like `/opt/homebrew/bin`) or a custom command via `HERMES_LOCAL_STT_COMMAND`. ::: ### Configuration @@ -85,7 +86,7 @@ Local transcription works out of the box — no API key needed. The `faster-whis ```yaml # In ~/.hermes/config.yaml stt: - provider: "local" # "local" (free, faster-whisper) | "openai" (API) + provider: "local" # "local" | "local_command" | "groq" | "openai" local: model: "base" # tiny, base, small, medium, large-v3 openai: @@ -104,11 +105,16 @@ stt: | `medium` | ~1.5 GB | Slower | Great | | `large-v3` | ~3 GB | Slowest | Best | -**OpenAI API** — Requires `VOICE_TOOLS_OPENAI_KEY`. 
Supports `whisper-1`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. +**Groq API** — Requires `GROQ_API_KEY`. Good cloud fallback when you want a free hosted STT option. + +**OpenAI API** — Accepts `VOICE_TOOLS_OPENAI_KEY` first and falls back to `OPENAI_API_KEY`. Supports `whisper-1`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + +**Custom local CLI fallback** — Set `HERMES_LOCAL_STT_COMMAND` if you want Hermes to call a local transcription command directly. The command template supports `{input_path}`, `{output_dir}`, `{language}`, and `{model}` placeholders. The command must write its transcript as a `.txt` file into `{output_dir}`; Hermes reads the transcript back from there. Inputs other than WAV/AIFF are converted to WAV with `ffmpeg` first, so `ffmpeg` must be installed for those formats. ### Fallback Behavior If your configured provider isn't available, Hermes automatically falls back: -- **Local not installed** → Falls back to OpenAI API (if key is set) -- **OpenAI key not set** → Falls back to local Whisper (if installed) -- **Neither available** → Voice messages pass through with a note to the user +- **Local faster-whisper unavailable** → Tries a local `whisper` CLI or `HERMES_LOCAL_STT_COMMAND` before cloud providers (Groq, then OpenAI) +- **Groq key not set** → Falls back to local transcription, then OpenAI +- **OpenAI key not set** → Falls back to local transcription, then Groq +- **Nothing available** → Voice messages pass through with an accurate note to the user