From c36136084a86a37cf6abee7ffe98301d3d780d03 Mon Sep 17 00:00:00 2001 From: teyrebaz33 Date: Sat, 14 Mar 2026 22:09:53 -0700 Subject: [PATCH 1/2] fix(gateway): honor stt.enabled false for voice transcription - bridge stt.enabled from config.yaml into gateway runtime config - preserve the flag in GatewayConfig serialization - skip gateway voice transcription when STT is disabled - add regression tests for config loading and disabled transcription flow --- gateway/config.py | 26 ++++++++++++++++ gateway/run.py | 8 ++++- tests/gateway/test_stt_config.py | 53 ++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 tests/gateway/test_stt_config.py diff --git a/gateway/config.py b/gateway/config.py index 47c739e91..2b187c521 100644 --- a/gateway/config.py +++ b/gateway/config.py @@ -21,6 +21,17 @@ from hermes_cli.config import get_hermes_home logger = logging.getLogger(__name__) +def _coerce_bool(value: Any, default: bool = True) -> bool: + """Coerce bool-ish config values, preserving a caller-provided default.""" + if value is None: + return default + if isinstance(value, bool): + return value + if isinstance(value, str): + return value.strip().lower() in ("true", "1", "yes", "on") + return bool(value) + + class Platform(Enum): """Supported messaging platforms.""" LOCAL = "local" @@ -160,6 +171,9 @@ class GatewayConfig: # Delivery settings always_log_local: bool = True # Always save cron outputs to local files + + # STT settings + stt_enabled: bool = True # Whether to auto-transcribe inbound voice messages def get_connected_platforms(self) -> List[Platform]: """Return list of platforms that are enabled and configured.""" @@ -224,6 +238,7 @@ class GatewayConfig: "quick_commands": self.quick_commands, "sessions_dir": str(self.sessions_dir), "always_log_local": self.always_log_local, + "stt_enabled": self.stt_enabled, } @classmethod @@ -260,6 +275,10 @@ class GatewayConfig: if not isinstance(quick_commands, dict): quick_commands = {} + stt_enabled = data.get("stt_enabled") + if stt_enabled is None: + stt_enabled = data.get("stt", {}).get("enabled") if isinstance(data.get("stt"), dict) else None + return cls( platforms=platforms, default_reset_policy=default_policy, @@ -269,6 +288,7 @@ class GatewayConfig: quick_commands=quick_commands, sessions_dir=sessions_dir, always_log_local=data.get("always_log_local", True), + stt_enabled=_coerce_bool(stt_enabled, True), ) @@ -318,6 +338,12 @@ def load_gateway_config() -> GatewayConfig: else: logger.warning("Ignoring invalid quick_commands in config.yaml (expected mapping, got %s)", type(qc).__name__) + # Bridge STT enable/disable from config.yaml into gateway runtime. + # This keeps the gateway aligned with the user-facing config source. + stt_cfg = yaml_cfg.get("stt") + if isinstance(stt_cfg, dict) and "enabled" in stt_cfg: + config.stt_enabled = _coerce_bool(stt_cfg.get("enabled"), True) + # Bridge discord settings from config.yaml to env vars # (env vars take precedence — only set if not already defined) discord_cfg = yaml_cfg.get("discord", {}) diff --git a/gateway/run.py b/gateway/run.py index e973852b4..f955573c1 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -3512,7 +3512,7 @@ class GatewayRunner: audio_paths: List[str], ) -> str: """ - Auto-transcribe user voice/audio messages using OpenAI Whisper API + Auto-transcribe user voice/audio messages using the configured STT provider and prepend the transcript to the message text. Args: @@ -3522,6 +3522,12 @@ class GatewayRunner: Returns: The enriched message string with transcriptions prepended. """ + if not getattr(self.config, "stt_enabled", True): + disabled_note = "[The user sent voice message(s), but transcription is disabled in config.]" + if user_text: + return f"{disabled_note}\n\n{user_text}" + return disabled_note + from tools.transcription_tools import transcribe_audio, get_stt_model_from_config import asyncio diff --git a/tests/gateway/test_stt_config.py b/tests/gateway/test_stt_config.py new file mode 100644 index 000000000..d5a9fc55b --- /dev/null +++ b/tests/gateway/test_stt_config.py @@ -0,0 +1,53 @@ +"""Gateway STT config tests — honor stt.enabled: false from config.yaml.""" + +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest +import yaml + +from gateway.config import GatewayConfig, load_gateway_config + + +def test_gateway_config_stt_disabled_from_dict_nested(): + config = GatewayConfig.from_dict({"stt": {"enabled": False}}) + assert config.stt_enabled is False + + +def test_load_gateway_config_bridges_stt_enabled_from_config_yaml(tmp_path, monkeypatch): + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + (hermes_home / "config.yaml").write_text( + yaml.dump({"stt": {"enabled": False}}), + encoding="utf-8", + ) + + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + monkeypatch.setattr(Path, "home", lambda: tmp_path) + + config = load_gateway_config() + + assert config.stt_enabled is False + + +@pytest.mark.asyncio +async def test_enrich_message_with_transcription_skips_when_stt_disabled(): + from gateway.run import GatewayRunner + + runner = GatewayRunner.__new__(GatewayRunner) + runner.config = GatewayConfig(stt_enabled=False) + + with patch( + "tools.transcription_tools.transcribe_audio", + side_effect=AssertionError("transcribe_audio should not be called when STT is disabled"), + ), patch( + "tools.transcription_tools.get_stt_model_from_config", + return_value=None, + ): + result = await runner._enrich_message_with_transcription( + "caption", + ["/tmp/voice.ogg"], + ) + + assert "transcription is disabled" in result.lower() + assert "caption" in result From f8ceadbad0c0aaaacbda59ed8293fc806b867f84 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 22:09:59 -0700 Subject: [PATCH 2/2] fix: propagate STT disable through shared transcription config - add stt.enabled to the default user config - make transcription_tools respect the disabled flag globally - surface disabled state cleanly in voice mode diagnostics - add regression coverage for disabled STT provider selection --- hermes_cli/config.py | 5 +++-- tests/tools/test_transcription.py | 16 ++++++++++++++++ tools/transcription_tools.py | 22 ++++++++++++++++++++++ tools/voice_mode.py | 9 ++++++--- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index bdde858d3..44755b195 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -205,7 +205,8 @@ DEFAULT_CONFIG = { }, "stt": { - "provider": "local", # "local" (free, faster-whisper) | "openai" (Whisper API) + "enabled": True, + "provider": "local", # "local" (free, faster-whisper) | "groq" | "openai" (Whisper API) "local": { "model": "base", # tiny, base, small, medium, large-v3 }, @@ -284,7 +285,7 @@ DEFAULT_CONFIG = { }, # Config schema version - bump this when adding new required fields - "_config_version": 7, + "_config_version": 8, } # ============================================================================= diff --git a/tests/tools/test_transcription.py b/tests/tools/test_transcription.py index fe3b24a8d..c8daface0 100644 --- a/tests/tools/test_transcription.py +++ b/tests/tools/test_transcription.py @@ -59,6 +59,10 @@ class TestGetProvider: from tools.transcription_tools import _get_provider assert _get_provider({}) == "local" + def test_disabled_config_returns_none(self): + from tools.transcription_tools import _get_provider + assert _get_provider({"enabled": False, "provider": "openai"}) == "none" + # --------------------------------------------------------------------------- # File validation @@ -217,6 +221,18 @@ class TestTranscribeAudio: assert result["success"] is False assert "No STT provider" in result["error"] + def test_disabled_config_returns_disabled_error(self, tmp_path): + audio_file = tmp_path / "test.ogg" + audio_file.write_bytes(b"fake audio") + + with patch("tools.transcription_tools._load_stt_config", return_value={"enabled": False}), \ + patch("tools.transcription_tools._get_provider", return_value="none"): + from tools.transcription_tools import transcribe_audio + result = transcribe_audio(str(audio_file)) + + assert result["success"] is False + assert "disabled" in result["error"].lower() + def test_invalid_file_returns_error(self): from tools.transcription_tools import transcribe_audio result = transcribe_audio("/nonexistent/file.ogg") diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py index a20ba4134..684d0a8d8 100644 --- a/tools/transcription_tools.py +++ b/tools/transcription_tools.py @@ -93,6 +93,18 @@ def _load_stt_config() -> dict: return {} +def is_stt_enabled(stt_config: Optional[dict] = None) -> bool: + """Return whether STT is enabled in config.""" + if stt_config is None: + stt_config = _load_stt_config() + enabled = stt_config.get("enabled", True) + if isinstance(enabled, str): + return enabled.strip().lower() in ("true", "1", "yes", "on") + if enabled is None: + return True + return bool(enabled) + + def _get_provider(stt_config: dict) -> str: """Determine which STT provider to use. @@ -101,6 +113,9 @@ def _get_provider(stt_config: dict) -> str: 2. Auto-detect: local > groq (free) > openai (paid) 3. Disabled (returns "none") """ + if not is_stt_enabled(stt_config): + return "none" + provider = stt_config.get("provider", DEFAULT_PROVIDER) if provider == "local": @@ -334,6 +349,13 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A # Load config and determine provider stt_config = _load_stt_config() + if not is_stt_enabled(stt_config): + return { + "success": False, + "transcript": "", + "error": "STT is disabled in config.yaml (stt.enabled: false).", + } + provider = _get_provider(stt_config) if provider == "local": diff --git a/tools/voice_mode.py b/tools/voice_mode.py index a2c70ac1b..783584895 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -703,10 +703,11 @@ def check_voice_requirements() -> Dict[str, Any]: ``missing_packages``, and ``details``. """ # Determine STT provider availability - from tools.transcription_tools import _get_provider, _load_stt_config, _HAS_FASTER_WHISPER + from tools.transcription_tools import _get_provider, _load_stt_config, is_stt_enabled, _HAS_FASTER_WHISPER stt_config = _load_stt_config() + stt_enabled = is_stt_enabled(stt_config) stt_provider = _get_provider(stt_config) - stt_available = stt_provider != "none" + stt_available = stt_enabled and stt_provider != "none" missing: List[str] = [] has_audio = _audio_available() @@ -725,7 +726,9 @@ def check_voice_requirements() -> Dict[str, Any]: else: details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)") - if stt_provider == "local": + if not stt_enabled: + details_parts.append("STT provider: DISABLED in config (stt.enabled: false)") + elif stt_provider == "local": details_parts.append("STT provider: OK (local faster-whisper)") elif stt_provider == "groq": details_parts.append("STT provider: OK (Groq)")