fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and openai are never imported at module level. Each is imported only when the feature is explicitly activated, preventing crashes in headless environments (SSH, Docker, WSL, no PortAudio). 2. No core agent loop changes: streaming TTS path extracted from _interruptible_api_call() into separate _streaming_api_call() method. The original method is restored to its upstream form. 3. Configurable key binding: push-to-talk key changed from Ctrl+R (conflicts with readline reverse-search) to Ctrl+B by default. Configurable via voice.push_to_talk_key in config.yaml. 4. Environment detection: new detect_audio_environment() function checks for SSH, Docker, WSL, and missing audio devices before enabling voice mode. Auto-disables with clear warnings in incompatible environments. 5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream, sd.OutputStream) wrapped in try/except with ImportError/OSError handling. Failures produce warnings, not crashes.
2026-03-09 12:48:49 +03:00
parent 143cc68946
commit b859dfab16
5 changed files with 526 additions and 142 deletions
--- a/run_agent.py
+++ b/run_agent.py
@@ -2590,12 +2590,6 @@ class AIAgent:
        On interrupt, closes the HTTP client to cancel the in-flight request
        (stops token generation and avoids wasting money), then rebuilds the
        client for future calls.
-
-        When ``self._stream_callback`` is set (streaming TTS mode), the call
-        uses ``stream=True`` and iterates over chunks inside the background
-        thread.  Content deltas are forwarded to the callback in real-time
-        while the full response is accumulated and returned as a
-        ``SimpleNamespace`` that mimics a normal ``ChatCompletion``.
        """
        result = {"response": None, "error": None}

@@ -2603,30 +2597,58 @@ class AIAgent:
            try:
                if self.api_mode == "codex_responses":
                    result["response"] = self._run_codex_stream(api_kwargs)
-                    return
                elif self.api_mode == "anthropic_messages":
                    result["response"] = self._anthropic_client.messages.create(**api_kwargs)
-                    return
-
-                cb = getattr(self, "_stream_callback", None)
-                if cb is None:
-                    # Non-streaming path (default)
+                else:
                    result["response"] = self.client.chat.completions.create(**api_kwargs)
-                    return
+            except Exception as e:
+                result["error"] = e

-                # --- Streaming path for TTS pipeline ---
+        t = threading.Thread(target=_call, daemon=True)
+        t.start()
+        while t.is_alive():
+            t.join(timeout=0.3)
+            if self._interrupt_requested:
+                # Force-close the HTTP connection to stop token generation
+                try:
+                    self.client.close()
+                except Exception:
+                    pass
+                # Rebuild the client for future calls (cheap, no network)
+                try:
+                    self.client = OpenAI(**self._client_kwargs)
+                except Exception:
+                    pass
+                raise InterruptedError("Agent interrupted during API call")
+        if result["error"] is not None:
+            raise result["error"]
+        return result["response"]
+
+    def _streaming_api_call(self, api_kwargs: dict, stream_callback):
+        """Streaming variant of _interruptible_api_call for voice TTS pipeline.
+
+        Uses ``stream=True`` and forwards content deltas to *stream_callback*
+        in real-time.  Returns a ``SimpleNamespace`` that mimics a normal
+        ``ChatCompletion`` so the rest of the agent loop works unchanged.
+
+        This method is separate from ``_interruptible_api_call`` to keep the
+        core agent loop untouched for non-voice users.
+        """
+        result = {"response": None, "error": None}
+
+        def _call():
+            try:
                stream_kwargs = {**api_kwargs, "stream": True}
                stream = self.client.chat.completions.create(**stream_kwargs)

                content_parts: list[str] = []
-                tool_calls_acc: dict[int, dict] = {}  # index -> {id, type, function:{name, arguments}}
+                tool_calls_acc: dict[int, dict] = {}
                finish_reason = None
                model_name = None
                role = "assistant"

                for chunk in stream:
                    if not chunk.choices:
-                        # Usage-only or empty chunk
                        if hasattr(chunk, "model") and chunk.model:
                            model_name = chunk.model
                        continue
@@ -2635,24 +2657,17 @@ class AIAgent:
                    if hasattr(chunk, "model") and chunk.model:
                        model_name = chunk.model

-                    # Content delta
                    if delta and delta.content:
                        content_parts.append(delta.content)
                        try:
-                            cb(delta.content)
+                            stream_callback(delta.content)
                        except Exception:
                            pass

-                    # Tool call deltas
                    if delta and delta.tool_calls:
                        for tc_delta in delta.tool_calls:
                            idx = tc_delta.index if tc_delta.index is not None else 0
-                            # Gemini may reuse index 0 for multiple tool calls,
-                            # sending a new id each time.  Detect this and assign
-                            # a fresh virtual index so calls don't merge.
                            if idx in tool_calls_acc and tc_delta.id and tc_delta.id != tool_calls_acc[idx]["id"]:
-                                # Look for existing entry with this id first
-                                # (follow-up deltas for an already-created tool call)
                                matched = False
                                for eidx, eentry in tool_calls_acc.items():
                                    if eentry["id"] == tc_delta.id:
@@ -2679,7 +2694,6 @@ class AIAgent:
                    if chunk.choices[0].finish_reason:
                        finish_reason = chunk.choices[0].finish_reason

-                # Build a mock ChatCompletion matching the non-streaming interface
                full_content = "".join(content_parts) or None
                mock_tool_calls = None
                if tool_calls_acc:
@@ -2722,7 +2736,6 @@ class AIAgent:
        while t.is_alive():
            t.join(timeout=0.3)
            if self._interrupt_requested:
-                # Force-close the HTTP connection to stop token generation
                try:
                    if self.api_mode == "anthropic_messages":
                        self._anthropic_client.close()
@@ -2730,7 +2743,6 @@ class AIAgent:
                        self.client.close()
                except Exception:
                    pass
-                # Rebuild the client for future calls (cheap, no network)
                try:
                    if self.api_mode == "anthropic_messages":
                        from agent.anthropic_adapter import build_anthropic_client
@@ -4412,7 +4424,11 @@ class AIAgent:
                    if os.getenv("HERMES_DUMP_REQUESTS", "").strip().lower() in {"1", "true", "yes", "on"}:
                        self._dump_api_request_debug(api_kwargs, reason="preflight")

-                    response = self._interruptible_api_call(api_kwargs)
+                    cb = getattr(self, "_stream_callback", None)
+                    if cb is not None:
+                        response = self._streaming_api_call(api_kwargs, cb)
+                    else:
+                        response = self._interruptible_api_call(api_kwargs)
                    
                    api_duration = time.time() - api_start_time