From 3a1b35ed92340918db9a869073937fe46898ec65 Mon Sep 17 00:00:00 2001
From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>
Date: Fri, 6 Mar 2026 01:32:37 +0300
Subject: [PATCH] fix: voice mode race conditions, temp file leak, think tag
parsing
- Atomic check-and-set for _voice_recording flag with _voice_lock
- Guard _voice_stop_and_transcribe against concurrent invocation
- Remove premature flag clearing from Ctrl+R handler
- Clean up temp WAV files in finally block (_play_via_tempfile)
- Use buffer-level regex for <think> block filtering (handles chunked tags)
- Prevent prompt accumulation when `/voice on` is called repeatedly
- Include Groq in STT key error message
---
cli.py | 39 ++++++++++++++++++++++++++------------
tools/tts_tool.py | 48 +++++++++++++++++++----------------------------
2 files changed, 46 insertions(+), 41 deletions(-)
diff --git a/cli.py b/cli.py
index 3221cbb79..d15d43a16 100755
--- a/cli.py
+++ b/cli.py
@@ -3544,10 +3544,6 @@ class HermesCLI:
def _voice_start_recording(self):
"""Start capturing audio from the microphone."""
- # Prevent double-start from concurrent threads
- if self._voice_recording:
- return
-
from tools.voice_mode import AudioRecorder, check_voice_requirements
reqs = check_voice_requirements()
@@ -3559,10 +3555,18 @@ class HermesCLI:
)
if not reqs["stt_key_set"]:
raise RuntimeError(
- "Voice mode requires VOICE_TOOLS_OPENAI_KEY for transcription.\n"
- "Get one at: https://platform.openai.com/api-keys"
+ "Voice mode requires an STT API key for transcription.\n"
+ "Set GROQ_API_KEY (free) or VOICE_TOOLS_OPENAI_KEY.\n"
+ "Groq: https://console.groq.com/keys\n"
+ "OpenAI: https://platform.openai.com/api-keys"
)
+ # Prevent double-start from concurrent threads (atomic check-and-set)
+ with self._voice_lock:
+ if self._voice_recording:
+ return
+ self._voice_recording = True
+
# Load silence detection params from config
voice_cfg = {}
try:
@@ -3595,9 +3599,12 @@ class HermesCLI:
except Exception:
pass
- self._voice_recorder.start(on_silence_stop=_on_silence)
- with self._voice_lock:
- self._voice_recording = True
+ try:
+ self._voice_recorder.start(on_silence_stop=_on_silence)
+ except Exception:
+ with self._voice_lock:
+ self._voice_recording = False
+ raise
_cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}")
# Periodically refresh prompt to update audio level indicator
@@ -3610,6 +3617,12 @@ class HermesCLI:
def _voice_stop_and_transcribe(self):
"""Stop recording, transcribe via STT, and queue the transcript as input."""
+ # Atomic guard: only one thread can enter stop-and-transcribe
+ with self._voice_lock:
+ if not self._voice_recording:
+ return
+ self._voice_recording = False
+
submitted = False
wav_path = None
try:
@@ -3617,8 +3630,6 @@ class HermesCLI:
return
wav_path = self._voice_recorder.stop()
- with self._voice_lock:
- self._voice_recording = False
# Audio cue: double beep after stream stopped (no CoreAudio conflict)
try:
@@ -3764,6 +3775,10 @@ class HermesCLI:
def _enable_voice_mode(self):
"""Enable voice mode after checking requirements."""
+ if self._voice_mode:
+ _cprint(f"{_DIM}Voice mode is already enabled.{_RST}")
+ return
+
from tools.voice_mode import check_voice_requirements
reqs = check_voice_requirements()
@@ -4838,7 +4853,7 @@ class HermesCLI:
# Manual stop via Ctrl+R: stop continuous mode
with cli_ref._voice_lock:
cli_ref._voice_continuous = False
- cli_ref._voice_recording = False
+ # Flag clearing is handled atomically inside _voice_stop_and_transcribe
event.app.invalidate()
threading.Thread(
target=cli_ref._voice_stop_and_transcribe,
diff --git a/tools/tts_tool.py b/tools/tts_tool.py
index 3b8773d49..988fa653a 100644
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@@ -519,10 +519,11 @@ def stream_tts_to_speaker(
output_stream = None
sentence_buf = ""
- in_think = False # track ... blocks
min_sentence_len = 20
long_flush_len = 100
queue_timeout = 0.5
+ # Regex to strip complete <think>...</think> blocks from buffer
+ _think_block_re = re.compile(r'].*?', flags=re.DOTALL)
def _speak_sentence(sentence: str):
"""Display sentence and optionally generate + play audio."""
@@ -562,6 +563,7 @@ def stream_tts_to_speaker(
def _play_via_tempfile(audio_iter, stop_evt):
"""Write PCM chunks to a temp WAV file and play it."""
+ tmp_path = None
try:
import wave
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
@@ -576,9 +578,14 @@ def stream_tts_to_speaker(
wf.writeframes(chunk)
from tools.voice_mode import play_audio_file
play_audio_file(tmp_path)
- os.unlink(tmp_path)
except Exception as exc:
logger.warning("Temp-file TTS fallback failed: %s", exc)
+ finally:
+ if tmp_path:
+ try:
+ os.unlink(tmp_path)
+ except OSError:
+ pass
while not stop_event.is_set():
# Read next delta from queue
@@ -592,41 +599,24 @@ def stream_tts_to_speaker(
continue
if delta is None:
- # End-of-text sentinel: flush remaining buffer
+ # End-of-text sentinel: strip any remaining think blocks, flush
+ sentence_buf = _think_block_re.sub('', sentence_buf)
if sentence_buf.strip():
_speak_sentence(sentence_buf)
break
+ sentence_buf += delta
+
# --- Think block filtering ---
- # Process delta character by character for think tags
- i = 0
- filtered_delta = []
- while i < len(delta):
- # Check for opening ", i)
- if end != -1:
- i = end + 1
- else:
- i = len(delta)
- continue
- # Check for closing tag
- if delta[i:].startswith(""):
- in_think = False
- i += len("")
- continue
- if not in_think:
- filtered_delta.append(delta[i])
- i += 1
+ # Strip complete <think>...</think> blocks from buffer.
+ # Works correctly even when tags span multiple deltas.
+ sentence_buf = _think_block_re.sub('', sentence_buf)
- text = "".join(filtered_delta)
- if not text:
+ # If an incomplete ' not in sentence_buf:
continue
- sentence_buf += text
-
# Check for sentence boundaries
while True:
m = _SENTENCE_BOUNDARY_RE.search(sentence_buf)