From 3a1b35ed92340918db9a869073937fe46898ec65 Mon Sep 17 00:00:00 2001
From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>
Date: Fri, 6 Mar 2026 01:32:37 +0300
Subject: [PATCH] fix: voice mode race conditions, temp file leak, think tag
 parsing

- Atomic check-and-set for _voice_recording flag with _voice_lock
- Guard _voice_stop_and_transcribe against concurrent invocation
- Remove premature flag clearing from Ctrl+R handler
- Clean up temp WAV files in finally block (_play_via_tempfile)
- Use buffer-level regex for <think> block filtering (handles chunked tags)
- Prevent prompt accumulation when "/voice on" is called repeatedly
- Include Groq in STT key error message
---
 cli.py            | 39 ++++++++++++++++++++++++++------------
 tools/tts_tool.py | 48 +++++++++++++++++++----------------------------
 2 files changed, 46 insertions(+), 41 deletions(-)
diff --git a/cli.py b/cli.py
index 3221cbb79..d15d43a16 100755
--- a/cli.py
+++ b/cli.py
@@ -3544,10 +3544,6 @@ class HermesCLI:
 
     def _voice_start_recording(self):
         """Start capturing audio from the microphone."""
-        # Prevent double-start from concurrent threads
-        if self._voice_recording:
-            return
-
         from tools.voice_mode import AudioRecorder, check_voice_requirements
 
         reqs = check_voice_requirements()
@@ -3559,10 +3555,18 @@ class HermesCLI:
             )
         if not reqs["stt_key_set"]:
             raise RuntimeError(
-                "Voice mode requires VOICE_TOOLS_OPENAI_KEY for transcription.\n"
-                "Get one at: https://platform.openai.com/api-keys"
+                "Voice mode requires an STT API key for transcription.\n"
+                "Set GROQ_API_KEY (free) or VOICE_TOOLS_OPENAI_KEY.\n"
+                "Groq: https://console.groq.com/keys\n"
+                "OpenAI: https://platform.openai.com/api-keys"
             )
 
+        # Prevent double-start from concurrent threads (atomic check-and-set)
+        with self._voice_lock:
+            if self._voice_recording:
+                return
+            self._voice_recording = True
+
         # Load silence detection params from config
         voice_cfg = {}
         try:
@@ -3595,9 +3599,12 @@ class HermesCLI:
         except Exception:
             pass
 
-        self._voice_recorder.start(on_silence_stop=_on_silence)
-        with self._voice_lock:
-            self._voice_recording = True
+        try:
+            self._voice_recorder.start(on_silence_stop=_on_silence)
+        except Exception:
+            with self._voice_lock:
+                self._voice_recording = False
+            raise
         _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}")
 
         # Periodically refresh prompt to update audio level indicator
@@ -3610,6 +3617,12 @@ class HermesCLI:
 
     def _voice_stop_and_transcribe(self):
         """Stop recording, transcribe via STT, and queue the transcript as input."""
+        # Atomic guard: only one thread can enter stop-and-transcribe
+        with self._voice_lock:
+            if not self._voice_recording:
+                return
+            self._voice_recording = False
+
         submitted = False
         wav_path = None
         try:
@@ -3617,8 +3630,6 @@ class HermesCLI:
                 return
 
             wav_path = self._voice_recorder.stop()
-            with self._voice_lock:
-                self._voice_recording = False
 
             # Audio cue: double beep after stream stopped (no CoreAudio conflict)
             try:
@@ -3764,6 +3775,10 @@ class HermesCLI:
 
     def _enable_voice_mode(self):
         """Enable voice mode after checking requirements."""
+        if self._voice_mode:
+            _cprint(f"{_DIM}Voice mode is already enabled.{_RST}")
+            return
+
         from tools.voice_mode import check_voice_requirements
 
         reqs = check_voice_requirements()
@@ -4838,7 +4853,7 @@ class HermesCLI:
                 # Manual stop via Ctrl+R: stop continuous mode
                 with cli_ref._voice_lock:
                     cli_ref._voice_continuous = False
-                    cli_ref._voice_recording = False
+                # Flag clearing is handled atomically inside _voice_stop_and_transcribe
                 event.app.invalidate()
                 threading.Thread(
                     target=cli_ref._voice_stop_and_transcribe,
diff --git a/tools/tts_tool.py b/tools/tts_tool.py
index 3b8773d49..988fa653a 100644
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@@ -519,10 +519,11 @@ def stream_tts_to_speaker(
                     output_stream = None
 
         sentence_buf = ""
-        in_think = False  # track <think>...</think> blocks
         min_sentence_len = 20
         long_flush_len = 100
         queue_timeout = 0.5
+        # Regex to strip complete <think>...</think> blocks from buffer
+        _think_block_re = re.compile(r'<think[\s>].*?</think>', flags=re.DOTALL)
 
         def _speak_sentence(sentence: str):
             """Display sentence and optionally generate + play audio."""
@@ -562,6 +563,7 @@ def stream_tts_to_speaker(
 
         def _play_via_tempfile(audio_iter, stop_evt):
             """Write PCM chunks to a temp WAV file and play it."""
+            tmp_path = None
             try:
                 import wave
                 tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
@@ -576,9 +578,14 @@ def stream_tts_to_speaker(
                         wf.writeframes(chunk)
                 from tools.voice_mode import play_audio_file
                 play_audio_file(tmp_path)
-                os.unlink(tmp_path)
             except Exception as exc:
                 logger.warning("Temp-file TTS fallback failed: %s", exc)
+            finally:
+                if tmp_path:
+                    try:
+                        os.unlink(tmp_path)
+                    except OSError:
+                        pass
 
         while not stop_event.is_set():
             # Read next delta from queue
@@ -592,41 +599,24 @@ def stream_tts_to_speaker(
                 continue
 
             if delta is None:
-                # End-of-text sentinel: flush remaining buffer
+                # End-of-text sentinel: strip any remaining think blocks, flush
+                sentence_buf = _think_block_re.sub('', sentence_buf)
                 if sentence_buf.strip():
                     _speak_sentence(sentence_buf)
                 break
 
+            sentence_buf += delta
+
             # --- Think block filtering ---
-            # Process delta character by character for think tags
-            i = 0
-            filtered_delta = []
-            while i < len(delta):
-                # Check for opening <think tag
-                if delta[i:].startswith("<think"):
-                    in_think = True
-                    # Skip past the tag
-                    end = delta.find(">", i)
-                    if end != -1:
-                        i = end + 1
-                    else:
-                        i = len(delta)
-                    continue
-                # Check for closing </think> tag
-                if delta[i:].startswith("</think>"):
-                    in_think = False
-                    i += len("</think>")
-                    continue
-                if not in_think:
-                    filtered_delta.append(delta[i])
-                i += 1
+            # Strip complete <think>...</think> blocks from buffer.
+            # Works correctly even when tags span multiple deltas.
+            sentence_buf = _think_block_re.sub('', sentence_buf)
 
-            text = "".join(filtered_delta)
-            if not text:
+            # If an unclosed <think block is still in the buffer, wait for more
+            # data before extracting sentences (the closing tag may arrive next).
+            if '<think' in sentence_buf and '</think>' not in sentence_buf:
                 continue
 
-            sentence_buf += text
-
             # Check for sentence boundaries
             while True:
                 m = _SENTENCE_BOUNDARY_RE.search(sentence_buf)