fix(voice): enable TTS voice reply when streaming is active (#2322)

When streaming is enabled, the base adapter receives None from _handle_message (already_sent=True) and cannot run auto-TTS for voice input. The runner was unconditionally skipping voice input TTS assuming the base adapter would handle it. Now the runner takes over TTS responsibility when streaming has already delivered the text response, so voice channel playback works with both streaming on and off. Streaming off behavior is unchanged (default already_sent=False preserves the original code path exactly). Co-authored-by: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>
2026-03-21 08:08:37 -07:00
parent 06f4df52f1
commit 28bb0e770f
2 changed files with 60 additions and 12 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -2248,7 +2248,8 @@ class GatewayRunner:
            )

            # Auto voice reply: send TTS audio before the text response
-            if self._should_send_voice_reply(event, response, agent_messages):
+            _already_sent = bool(agent_result.get("already_sent"))
+            if self._should_send_voice_reply(event, response, agent_messages, already_sent=_already_sent):
                await self._send_voice_reply(event, response)

            # If streaming already delivered the response, return None so
@@ -3054,6 +3055,7 @@ class GatewayRunner:
        event: MessageEvent,
        response: str,
        agent_messages: list,
+        already_sent: bool = False,
    ) -> bool:
        """Decide whether the runner should send a TTS voice reply.

@@ -3062,8 +3064,9 @@ class GatewayRunner:
        - response is empty or an error
        - agent already called text_to_speech tool (dedup)
        - voice input and base adapter auto-TTS already handled it (skip_double)
-          Exception: Discord voice channel — base play_tts is a no-op there,
-          so the runner must handle VC playback.
+          UNLESS streaming already consumed the response (already_sent=True),
+          in which case the base adapter won't have text for auto-TTS so the
+          runner must handle it.
        """
        if not response or response.startswith("Error:"):
            return False
@@ -3093,7 +3096,10 @@ class GatewayRunner:

        # Dedup: base adapter auto-TTS already handles voice input
        # (play_tts plays in VC when connected, so runner can skip).
-        if is_voice_input:
+        # When streaming already delivered the text (already_sent=True),
+        # the base adapter will receive None and can't run auto-TTS,
+        # so the runner must take over.
+        if is_voice_input and not already_sent:
            return False

        return True
--- a/tests/gateway/test_voice_command.py
+++ b/tests/gateway/test_voice_command.py
@@ -2467,7 +2467,8 @@ class TestVoiceTTSPlayback:
        runner.adapters = {}
        return runner

-    def _call_should_reply(self, runner, voice_mode, msg_type, response="Hello", agent_msgs=None):
+    def _call_should_reply(self, runner, voice_mode, msg_type, response="Hello",
+                           agent_msgs=None, already_sent=False):
        from gateway.platforms.base import MessageType, MessageEvent, SessionSource
        from gateway.config import Platform
        runner._voice_mode["ch1"] = voice_mode
@@ -2476,28 +2477,32 @@ class TestVoiceTTSPlayback:
            user_id="1", user_name="test", chat_type="channel",
        )
        event = MessageEvent(source=source, text="test", message_type=msg_type)
-        return runner._should_send_voice_reply(event, response, agent_msgs or [])
+        return runner._should_send_voice_reply(
+            event, response, agent_msgs or [], already_sent=already_sent,
+        )
+
+    # -- Streaming OFF (existing behavior, must not change) --

    def test_voice_input_runner_skips(self):
-        """Voice input: runner skips — base adapter handles via play_tts."""
+        """Streaming OFF + voice input: runner skips — base adapter handles."""
        from gateway.platforms.base import MessageType
        runner = self._make_runner()
-        assert self._call_should_reply(runner, "all", MessageType.VOICE) is False
+        assert self._call_should_reply(runner, "all", MessageType.VOICE, already_sent=False) is False

    def test_text_input_voice_all_runner_fires(self):
-        """Text input + voice_mode=all: runner generates TTS."""
+        """Streaming OFF + text input + voice_mode=all: runner generates TTS."""
        from gateway.platforms.base import MessageType
        runner = self._make_runner()
-        assert self._call_should_reply(runner, "all", MessageType.TEXT) is True
+        assert self._call_should_reply(runner, "all", MessageType.TEXT, already_sent=False) is True

    def test_text_input_voice_off_no_tts(self):
-        """Text input + voice_mode=off: no TTS."""
+        """Streaming OFF + text input + voice_mode=off: no TTS."""
        from gateway.platforms.base import MessageType
        runner = self._make_runner()
        assert self._call_should_reply(runner, "off", MessageType.TEXT) is False

    def test_text_input_voice_only_no_tts(self):
-        """Text input + voice_mode=voice_only: no TTS for text."""
+        """Streaming OFF + text input + voice_mode=voice_only: no TTS for text."""
        from gateway.platforms.base import MessageType
        runner = self._make_runner()
        assert self._call_should_reply(runner, "voice_only", MessageType.TEXT) is False
@@ -2523,6 +2528,43 @@ class TestVoiceTTSPlayback:
        ]}]
        assert self._call_should_reply(runner, "all", MessageType.TEXT, agent_msgs=agent_msgs) is False

+    # -- Streaming ON (already_sent=True) --
+
+    def test_streaming_on_voice_input_runner_fires(self):
+        """Streaming ON + voice input: runner handles TTS (base adapter has no text)."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        assert self._call_should_reply(runner, "all", MessageType.VOICE, already_sent=True) is True
+
+    def test_streaming_on_text_input_runner_fires(self):
+        """Streaming ON + text input: runner handles TTS (same as before)."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        assert self._call_should_reply(runner, "all", MessageType.TEXT, already_sent=True) is True
+
+    def test_streaming_on_voice_off_no_tts(self):
+        """Streaming ON + voice_mode=off: no TTS regardless of streaming."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        assert self._call_should_reply(runner, "off", MessageType.VOICE, already_sent=True) is False
+
+    def test_streaming_on_empty_response_no_tts(self):
+        """Streaming ON + empty response: no TTS."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        assert self._call_should_reply(runner, "all", MessageType.VOICE, response="", already_sent=True) is False
+
+    def test_streaming_on_agent_tts_dedup(self):
+        """Streaming ON + agent called TTS: runner skips (dedup still works)."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        agent_msgs = [{"role": "assistant", "tool_calls": [
+            {"id": "1", "type": "function", "function": {"name": "text_to_speech", "arguments": "{}"}}
+        ]}]
+        assert self._call_should_reply(
+            runner, "all", MessageType.VOICE, agent_msgs=agent_msgs, already_sent=True,
+        ) is False
+

 class TestUDPKeepalive:
    """UDP keepalive prevents Discord from dropping the voice session."""