From 3a1b35ed92340918db9a869073937fe46898ec65 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Fri, 6 Mar 2026 01:32:37 +0300 Subject: [PATCH] fix: voice mode race conditions, temp file leak, think tag parsing - Atomic check-and-set for _voice_recording flag with _voice_lock - Guard _voice_stop_and_transcribe against concurrent invocation - Remove premature flag clearing from Ctrl+R handler - Clean up temp WAV files in finally block (_play_via_tempfile) - Use buffer-level regex for think-block filtering (handles chunked tags) - Prevent prompt accumulation on repeated "/voice on" calls - Include Groq in STT key error message --- cli.py | 39 ++++++++++++++++++++++++++------------ tools/tts_tool.py | 48 +++++++++++++++++++---------------------------- 2 files changed, 46 insertions(+), 41 deletions(-) diff --git a/cli.py b/cli.py index 3221cbb79..d15d43a16 100755 --- a/cli.py +++ b/cli.py @@ -3544,10 +3544,6 @@ class HermesCLI: def _voice_start_recording(self): """Start capturing audio from the microphone.""" - # Prevent double-start from concurrent threads - if self._voice_recording: - return - from tools.voice_mode import AudioRecorder, check_voice_requirements reqs = check_voice_requirements() @@ -3559,10 +3555,18 @@ class HermesCLI: ) if not reqs["stt_key_set"]: raise RuntimeError( - "Voice mode requires VOICE_TOOLS_OPENAI_KEY for transcription.\n" - "Get one at: https://platform.openai.com/api-keys" + "Voice mode requires an STT API key for transcription.\n" + "Set GROQ_API_KEY (free) or VOICE_TOOLS_OPENAI_KEY.\n" + "Groq: https://console.groq.com/keys\n" + "OpenAI: https://platform.openai.com/api-keys" ) + # Prevent double-start from concurrent threads (atomic check-and-set) + with self._voice_lock: + if self._voice_recording: + return + self._voice_recording = True + # Load silence detection params from config voice_cfg = {} try: @@ -3595,9 +3599,12 @@ class HermesCLI: except Exception: pass - 
self._voice_recorder.start(on_silence_stop=_on_silence) - with self._voice_lock: - self._voice_recording = True + try: + self._voice_recorder.start(on_silence_stop=_on_silence) + except Exception: + with self._voice_lock: + self._voice_recording = False + raise _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}") # Periodically refresh prompt to update audio level indicator @@ -3610,6 +3617,12 @@ class HermesCLI: def _voice_stop_and_transcribe(self): """Stop recording, transcribe via STT, and queue the transcript as input.""" + # Atomic guard: only one thread can enter stop-and-transcribe + with self._voice_lock: + if not self._voice_recording: + return + self._voice_recording = False + submitted = False wav_path = None try: @@ -3617,8 +3630,6 @@ class HermesCLI: return wav_path = self._voice_recorder.stop() - with self._voice_lock: - self._voice_recording = False # Audio cue: double beep after stream stopped (no CoreAudio conflict) try: @@ -3764,6 +3775,10 @@ class HermesCLI: def _enable_voice_mode(self): """Enable voice mode after checking requirements.""" + if self._voice_mode: + _cprint(f"{_DIM}Voice mode is already enabled.{_RST}") + return + from tools.voice_mode import check_voice_requirements reqs = check_voice_requirements() @@ -4838,7 +4853,7 @@ class HermesCLI: # Manual stop via Ctrl+R: stop continuous mode with cli_ref._voice_lock: cli_ref._voice_continuous = False - cli_ref._voice_recording = False + # Flag clearing is handled atomically inside _voice_stop_and_transcribe event.app.invalidate() threading.Thread( target=cli_ref._voice_stop_and_transcribe, diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 3b8773d49..988fa653a 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -519,10 +519,11 @@ def stream_tts_to_speaker( output_stream = None sentence_buf = "" - in_think = False # track ... 
blocks min_sentence_len = 20 long_flush_len = 100 queue_timeout = 0.5 + # Regex to strip complete ... blocks from buffer + _think_block_re = re.compile(r'].*?', flags=re.DOTALL) def _speak_sentence(sentence: str): """Display sentence and optionally generate + play audio.""" @@ -562,6 +563,7 @@ def stream_tts_to_speaker( def _play_via_tempfile(audio_iter, stop_evt): """Write PCM chunks to a temp WAV file and play it.""" + tmp_path = None try: import wave tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) @@ -576,9 +578,14 @@ def stream_tts_to_speaker( wf.writeframes(chunk) from tools.voice_mode import play_audio_file play_audio_file(tmp_path) - os.unlink(tmp_path) except Exception as exc: logger.warning("Temp-file TTS fallback failed: %s", exc) + finally: + if tmp_path: + try: + os.unlink(tmp_path) + except OSError: + pass while not stop_event.is_set(): # Read next delta from queue @@ -592,41 +599,24 @@ def stream_tts_to_speaker( continue if delta is None: - # End-of-text sentinel: flush remaining buffer + # End-of-text sentinel: strip any remaining think blocks, flush + sentence_buf = _think_block_re.sub('', sentence_buf) if sentence_buf.strip(): _speak_sentence(sentence_buf) break + sentence_buf += delta + # --- Think block filtering --- - # Process delta character by character for think tags - i = 0 - filtered_delta = [] - while i < len(delta): - # Check for opening ", i) - if end != -1: - i = end + 1 - else: - i = len(delta) - continue - # Check for closing tag - if delta[i:].startswith(""): - in_think = False - i += len("") - continue - if not in_think: - filtered_delta.append(delta[i]) - i += 1 + # Strip complete ... blocks from buffer. + # Works correctly even when tags span multiple deltas. 
+ sentence_buf = _think_block_re.sub('', sentence_buf) - text = "".join(filtered_delta) - if not text: + # If an incomplete ' not in sentence_buf: continue - sentence_buf += text - # Check for sentence boundaries while True: m = _SENTENCE_BOUNDARY_RE.search(sentence_buf)