diff --git a/cli.py b/cli.py index cc9f522aa..507e2d666 100755 --- a/cli.py +++ b/cli.py @@ -4213,20 +4213,20 @@ class HermesCLI: if text_queue is not None: text_queue.put(delta) - # When voice mode is active, prepend a brief instruction to the - # user message so the model responds concisely. This avoids - # modifying the system prompt (which would invalidate the prompt - # cache). The original message in conversation_history stays clean. - agent_message = message + # When voice mode is active, prepend a brief instruction so the + # model responds concisely. The prefix is API-call-local only — + # we strip it from the returned history so it never persists to + # session DB or resumed sessions. + _voice_prefix = "" if self._voice_mode and isinstance(message, str): - agent_message = ( + _voice_prefix = ( "[Voice input — respond concisely and conversationally, " "2-3 sentences max. No code blocks or markdown.] " - + message ) def run_agent(): nonlocal result + agent_message = _voice_prefix + message if _voice_prefix else message result = self.agent.run_conversation( user_message=agent_message, conversation_history=self.conversation_history[:-1], # Exclude the message we just added @@ -4298,6 +4298,13 @@ class HermesCLI: # Update history with full conversation self.conversation_history = result.get("messages", self.conversation_history) if result else self.conversation_history + # Strip voice prefix from history so it never persists + if _voice_prefix and self.conversation_history: + for msg in self.conversation_history: + if msg.get("role") == "user" and isinstance(msg.get("content"), str): + if msg["content"].startswith(_voice_prefix): + msg["content"] = msg["content"][len(_voice_prefix):] + # Get the final response response = result.get("final_response", "") if result else "" diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index c3abaa696..df4166f41 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -351,6 +351,8 @@ class BasePlatformAdapter(ABC): # Key: session_key (e.g., chat_id), Value: (event, asyncio.Event for interrupt) self._active_sessions: Dict[str, asyncio.Event] = {} self._pending_messages: Dict[str, MessageEvent] = {} + # Chats where auto-TTS on voice input is disabled (set by /voice off) + self._auto_tts_disabled_chats: set = set() @property def name(self) -> str: @@ -733,8 +735,12 @@ class BasePlatformAdapter(ABC): logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response)) # Auto-TTS: if voice message, generate audio FIRST (before sending text) + # Skipped when the chat has voice mode disabled (/voice off) _tts_path = None - if event.message_type == MessageType.VOICE and text_content and not media_files: + if (event.message_type == MessageType.VOICE + and text_content + and not media_files + and event.source.chat_id not in self._auto_tts_disabled_chats): try: from tools.tts_tool import text_to_speech_tool, check_tts_requirements if check_tts_requirements(): diff --git a/gateway/run.py b/gateway/run.py index 75449d629..423a224db 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2119,9 +2119,13 @@ class GatewayRunner: args = event.get_command_args().strip().lower() chat_id = event.source.chat_id + adapter = self.adapters.get(event.source.platform) + if args in ("on", "enable"): self._voice_mode[chat_id] = "voice_only" self._save_voice_modes() + if adapter: + adapter._auto_tts_disabled_chats.discard(chat_id) return ( "Voice mode enabled.\n" "I'll reply with voice when you send voice messages.\n" @@ -2130,10 +2134,14 @@ class GatewayRunner: elif args in ("off", "disable"): self._voice_mode.pop(chat_id, None) self._save_voice_modes() + if adapter: + adapter._auto_tts_disabled_chats.add(chat_id) return "Voice mode disabled. Text-only replies." elif args == "tts": self._voice_mode[chat_id] = "all" self._save_voice_modes() + if adapter: + adapter._auto_tts_disabled_chats.discard(chat_id) return ( "Auto-TTS enabled.\n" "All replies will include a voice message." @@ -2171,10 +2179,14 @@ class GatewayRunner: if current == "off": self._voice_mode[chat_id] = "voice_only" self._save_voice_modes() + if adapter: + adapter._auto_tts_disabled_chats.discard(chat_id) return "Voice mode enabled." else: self._voice_mode.pop(chat_id, None) self._save_voice_modes() + if adapter: + adapter._auto_tts_disabled_chats.add(chat_id) return "Voice mode disabled." async def _handle_voice_channel_join(self, event: MessageEvent) -> str: @@ -2211,6 +2223,7 @@ class GatewayRunner: adapter._voice_text_channels[guild_id] = int(event.source.chat_id) self._voice_mode[event.source.chat_id] = "all" self._save_voice_modes() + adapter._auto_tts_disabled_chats.discard(event.source.chat_id) return ( f"Joined voice channel **{voice_channel.name}**.\n" f"I'll speak my replies and listen to you. Use /voice leave to disconnect." @@ -2265,21 +2278,28 @@ class GatewayRunner: if not text_ch_id: return - # Show transcript in text channel - try: - channel = adapter._client.get_channel(text_ch_id) - if channel: - await channel.send(f"**[Voice]** <@{user_id}>: {transcript}") - except Exception: - pass - - # Build a synthetic MessageEvent and feed through the normal pipeline + # Check authorization before processing voice input source = SessionSource( platform=Platform.DISCORD, chat_id=str(text_ch_id), user_id=str(user_id), user_name=str(user_id), + chat_type="channel", ) + if not self._is_user_authorized(source): + logger.debug("Unauthorized voice input from user %d, ignoring", user_id) + return + + # Show transcript in text channel (after auth, with mention sanitization) + try: + channel = adapter._client.get_channel(text_ch_id) + if channel: + safe_text = transcript[:2000].replace("@everyone", "@\u200beveryone").replace("@here", "@\u200bhere") + await channel.send(f"**[Voice]** <@{user_id}>: {safe_text}") + except Exception: + pass + + # Build a synthetic MessageEvent and feed through the normal pipeline # Use SimpleNamespace as raw_message so _get_guild_id() can extract # guild_id and _send_voice_reply() plays audio in the voice channel. from types import SimpleNamespace diff --git a/run_agent.py b/run_agent.py index 283590fc8..66f5196a3 100644 --- a/run_agent.py +++ b/run_agent.py @@ -508,6 +508,7 @@ class AIAgent: from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token effective_key = api_key or resolve_anthropic_token() or "" self._anthropic_api_key = effective_key + self._anthropic_base_url = base_url self._anthropic_client = build_anthropic_client(effective_key, base_url) # No OpenAI client needed for Anthropic mode self.client = None @@ -2625,7 +2626,7 @@ class AIAgent: try: if self.api_mode == "anthropic_messages": from agent.anthropic_adapter import build_anthropic_client - self._anthropic_client = build_anthropic_client(self._anthropic_api_key) + self._anthropic_client = build_anthropic_client(self._anthropic_api_key, getattr(self, "_anthropic_base_url", None)) else: self.client = OpenAI(**self._client_kwargs) except Exception: @@ -2757,7 +2758,7 @@ class AIAgent: try: if self.api_mode == "anthropic_messages": from agent.anthropic_adapter import build_anthropic_client - self._anthropic_client = build_anthropic_client(self._anthropic_api_key) + self._anthropic_client = build_anthropic_client(self._anthropic_api_key, getattr(self, "_anthropic_base_url", None)) else: self.client = OpenAI(**self._client_kwargs) except Exception: @@ -2823,7 +2824,8 @@ class AIAgent: from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token effective_key = fb_client.api_key or resolve_anthropic_token() or "" self._anthropic_api_key = effective_key - self._anthropic_client = build_anthropic_client(effective_key) + self._anthropic_base_url = getattr(fb_client, "base_url", None) + self._anthropic_client = build_anthropic_client(effective_key, self._anthropic_base_url) self.client = None self._client_kwargs = {} else: @@ -4436,7 +4438,7 @@ class AIAgent: self._dump_api_request_debug(api_kwargs, reason="preflight") cb = getattr(self, "_stream_callback", None) - if cb is not None: + if cb is not None and self.api_mode == "chat_completions": response = self._streaming_api_call(api_kwargs, cb) else: response = self._interruptible_api_call(api_kwargs) @@ -4770,7 +4772,7 @@ class AIAgent: new_token = resolve_anthropic_token() if new_token and new_token != self._anthropic_api_key: self._anthropic_api_key = new_token - self._anthropic_client = build_anthropic_client(new_token) + self._anthropic_client = build_anthropic_client(new_token, getattr(self, "_anthropic_base_url", None)) print(f"{self.log_prefix}🔐 Anthropic credentials refreshed after 401. Retrying request...") continue # Credential refresh didn't help — show diagnostic info diff --git a/tests/gateway/test_voice_command.py b/tests/gateway/test_voice_command.py index 4925f2845..1914688c8 100644 --- a/tests/gateway/test_voice_command.py +++ b/tests/gateway/test_voice_command.py @@ -38,6 +38,7 @@ def _make_runner(tmp_path): runner._VOICE_MODE_PATH = tmp_path / "gateway_voice_mode.json" runner._session_db = None runner.session_store = MagicMock() + runner._is_user_authorized = lambda source: True return runner @@ -731,6 +732,7 @@ class TestVoiceChannelCommands: assert event.text == "Hello from VC" assert event.message_type == MessageType.VOICE assert event.source.chat_id == "123" + assert event.source.chat_type == "channel" @pytest.mark.asyncio async def test_input_posts_transcript_in_text_channel(self, runner):