2026-02-12 10:05:08 -08:00
#!/usr/bin/env python3
"""
Text-to-Speech Tool Module

Supports three TTS providers:
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
- OpenAI TTS: Good quality, needs VOICE_TOOLS_OPENAI_KEY

Output formats:
- Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS)
- MP3 (.mp3) for everything else (CLI, Discord, WhatsApp)

Configuration is loaded from ~/.hermes/config.yaml under the 'tts:' key.
The user chooses the provider and voice; the model just sends text.

Usage:
    from tools.tts_tool import text_to_speech_tool, check_tts_requirements
    result = text_to_speech_tool(text="Hello world")
"""
import asyncio
import datetime
import json
2026-02-21 03:11:11 -08:00
import logging
2026-02-12 10:05:08 -08:00
import os
2026-03-03 23:03:42 +03:00
import queue
import re
2026-02-12 10:05:08 -08:00
import shutil
import subprocess
import tempfile
2026-03-03 23:03:42 +03:00
import threading
2026-02-12 10:05:08 -08:00
from pathlib import Path
from typing import Dict , Any , Optional
2026-02-21 03:11:11 -08:00
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Optional imports -- providers degrade gracefully if not installed
# ---------------------------------------------------------------------------
# Each guard records availability in a _HAS_* flag.  On failure the imported
# name is bound to None (as was already done for ``sd``) so that every name is
# always defined at module level, whichever packages are installed.
try:
    import edge_tts
    _HAS_EDGE_TTS = True
except ImportError:
    edge_tts = None  # type: ignore[assignment]
    _HAS_EDGE_TTS = False

try:
    from elevenlabs.client import ElevenLabs
    _HAS_ELEVENLABS = True
except ImportError:
    ElevenLabs = None  # type: ignore[assignment,misc]
    _HAS_ELEVENLABS = False

# openai is a core dependency, but guard anyway
try:
    from openai import OpenAI as OpenAIClient
    _HAS_OPENAI = True
except ImportError:
    OpenAIClient = None  # type: ignore[assignment,misc]
    _HAS_OPENAI = False

# OSError is caught as well as ImportError -- presumably the import can fail
# that way when the native audio backend is missing (TODO confirm).
try:
    import sounddevice as sd
    _HAS_AUDIO = True
except (ImportError, OSError):
    sd = None  # type: ignore[assignment]
    _HAS_AUDIO = False
2026-02-12 10:05:08 -08:00
# ===========================================================================
# Defaults
# ===========================================================================
DEFAULT_PROVIDER = "edge"
DEFAULT_EDGE_VOICE = "en-US-AriaNeural"
DEFAULT_ELEVENLABS_VOICE_ID = "pNInz6obpgDQGcFmaJgB"  # Adam
DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
DEFAULT_OPENAI_VOICE = "alloy"

# The audio cache lives under HERMES_HOME when that variable is set, so several
# Hermes instances with distinct HERMES_HOME directories stay isolated (#892);
# otherwise it falls back to ~/.hermes.  ``or`` (rather than a getenv default)
# also covers an empty HERMES_HOME, which would otherwise resolve to the
# current working directory.
DEFAULT_OUTPUT_DIR = str(
    Path(os.getenv("HERMES_HOME") or Path.home() / ".hermes") / "audio_cache"
)

# Longer input is truncated to this many characters before synthesis.
MAX_TEXT_LENGTH = 4000
# ===========================================================================
# Config loader -- reads tts: section from ~/.hermes/config.yaml
# ===========================================================================
def _load_tts_config() -> Dict[str, Any]:
    """
    Load the ``tts:`` section of the Hermes configuration.

    The section is returned as-is; callers apply their own defaults for
    missing fields via ``dict.get``.

    Returns:
        The ``tts`` mapping from ``hermes_cli.config.load_config()``, or an
        empty dict when the config module cannot be imported, loading fails,
        or the section is absent / not a mapping (e.g. a bare ``tts:`` key,
        which YAML parses as null).
    """
    try:
        from hermes_cli.config import load_config
    except ImportError:
        logger.debug("hermes_cli.config not available, using default TTS config")
        return {}

    try:
        tts_section = load_config().get("tts")
    except Exception as e:
        # Best effort: a broken config must not take the TTS tool down.
        logger.warning("Failed to load TTS config: %s", e, exc_info=True)
        return {}

    # Guard against null / scalar sections so callers can always use .get().
    return tts_section if isinstance(tts_section, dict) else {}
def _get_provider ( tts_config : Dict [ str , Any ] ) - > str :
""" Get the configured TTS provider name. """
return tts_config . get ( " provider " , DEFAULT_PROVIDER ) . lower ( ) . strip ( )
# ===========================================================================
# ffmpeg Opus conversion (Edge TTS MP3 -> OGG Opus for Telegram)
# ===========================================================================
def _has_ffmpeg ( ) - > bool :
""" Check if ffmpeg is available on the system. """
return shutil . which ( " ffmpeg " ) is not None
def _convert_to_opus(mp3_path: str) -> Optional[str]:
    """
    Convert an MP3 file to OGG Opus format for Telegram voice bubbles.

    Args:
        mp3_path: Path to the input MP3 file.

    Returns:
        Path to the .ogg file (same location, extension replaced), or None if
        ffmpeg is unavailable or the conversion fails.
    """
    if not _has_ffmpeg():
        return None

    # splitext only touches the final path component, so a dot in a directory
    # name (e.g. "/tmp/a.b/clip") can no longer truncate the path.
    ogg_path = os.path.splitext(mp3_path)[0] + ".ogg"
    command = [
        "ffmpeg", "-y",  # overwrite without prompting; global options go first
        "-i", mp3_path,
        "-acodec", "libopus", "-ac", "1", "-b:a", "64k", "-vbr", "off",
        ogg_path,
    ]
    try:
        result = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,  # keep ffmpeg from reading the caller's stdin
            capture_output=True,
            timeout=30,
        )
        if result.returncode != 0:
            logger.warning(
                "ffmpeg conversion failed with return code %d: %s",
                result.returncode,
                result.stderr.decode('utf-8', errors='ignore')[:200],
            )
            return None
        if os.path.exists(ogg_path) and os.path.getsize(ogg_path) > 0:
            return ogg_path
    except subprocess.TimeoutExpired:
        logger.warning("ffmpeg OGG conversion timed out after 30s")
    except FileNotFoundError:
        logger.warning("ffmpeg not found in PATH")
    except Exception as e:
        logger.warning("ffmpeg OGG conversion failed: %s", e, exc_info=True)
    return None
# ===========================================================================
# Provider: Edge TTS (free)
# ===========================================================================
async def _generate_edge_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """
    Synthesize *text* with Edge TTS and save it to *output_path*.

    Args:
        text: Text to convert.
        output_path: Where to save the MP3 file.
        tts_config: TTS config dict; the voice is read from ``edge.voice``
            and defaults to ``DEFAULT_EDGE_VOICE``.

    Returns:
        Path to the saved audio file.
    """
    voice_name = tts_config.get("edge", {}).get("voice", DEFAULT_EDGE_VOICE)
    await edge_tts.Communicate(text, voice_name).save(output_path)
    return output_path
# ===========================================================================
# Provider: ElevenLabs (premium)
# ===========================================================================
def _generate_elevenlabs ( text : str , output_path : str , tts_config : Dict [ str , Any ] ) - > str :
"""
Generate audio using ElevenLabs .
Args :
text : Text to convert .
output_path : Where to save the audio file .
tts_config : TTS config dict .
Returns :
Path to the saved audio file .
"""
api_key = os . getenv ( " ELEVENLABS_API_KEY " , " " )
if not api_key :
raise ValueError ( " ELEVENLABS_API_KEY not set. Get one at https://elevenlabs.io/ " )
el_config = tts_config . get ( " elevenlabs " , { } )
voice_id = el_config . get ( " voice_id " , DEFAULT_ELEVENLABS_VOICE_ID )
model_id = el_config . get ( " model_id " , DEFAULT_ELEVENLABS_MODEL_ID )
# Determine output format based on file extension
if output_path . endswith ( " .ogg " ) :
output_format = " opus_48000_64 "
else :
output_format = " mp3_44100_128 "
client = ElevenLabs ( api_key = api_key )
audio_generator = client . text_to_speech . convert (
text = text ,
voice_id = voice_id ,
model_id = model_id ,
output_format = output_format ,
)
# audio_generator yields chunks -- write them all
with open ( output_path , " wb " ) as f :
for chunk in audio_generator :
f . write ( chunk )
return output_path
# ===========================================================================
# Provider: OpenAI TTS
# ===========================================================================
def _generate_openai_tts ( text : str , output_path : str , tts_config : Dict [ str , Any ] ) - > str :
"""
Generate audio using OpenAI TTS .
Args :
text : Text to convert .
output_path : Where to save the audio file .
tts_config : TTS config dict .
Returns :
Path to the saved audio file .
"""
2026-02-26 19:56:42 -08:00
api_key = os . getenv ( " VOICE_TOOLS_OPENAI_KEY " , " " )
2026-02-12 10:05:08 -08:00
if not api_key :
2026-02-23 23:21:33 +00:00
raise ValueError ( " VOICE_TOOLS_OPENAI_KEY not set. Get one at https://platform.openai.com/api-keys " )
2026-02-12 10:05:08 -08:00
oai_config = tts_config . get ( " openai " , { } )
model = oai_config . get ( " model " , DEFAULT_OPENAI_MODEL )
voice = oai_config . get ( " voice " , DEFAULT_OPENAI_VOICE )
# Determine response format from extension
if output_path . endswith ( " .ogg " ) :
response_format = " opus "
else :
response_format = " mp3 "
2026-02-23 20:59:39 -08:00
client = OpenAIClient ( api_key = api_key , base_url = " https://api.openai.com/v1 " )
2026-02-12 10:05:08 -08:00
response = client . audio . speech . create (
model = model ,
voice = voice ,
input = text ,
response_format = response_format ,
)
response . stream_to_file ( output_path )
return output_path
# ===========================================================================
# Main tool function
# ===========================================================================
def _run_edge_tts_blocking(
    text: str,
    output_path: str,
    tts_config: Dict[str, Any],
    timeout: float = 60.0,
) -> None:
    """Run the async Edge TTS generator to completion from synchronous code."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread, so asyncio.run() is safe.
        asyncio.run(_generate_edge_tts(text, output_path, tts_config))
        return

    # A loop is already running here and asyncio.run() would refuse to start,
    # so drive the coroutine on a private loop in a worker thread instead.
    # Only the loop probe above is guarded by ``except RuntimeError``: errors
    # raised by the generation itself propagate unchanged to the caller.
    import concurrent.futures

    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        pool.submit(
            lambda: asyncio.run(_generate_edge_tts(text, output_path, tts_config))
        ).result(timeout=timeout)
    finally:
        # wait=False: a timed-out generation must not block the caller, which
        # a ``with`` block (implicit shutdown(wait=True)) would do.
        pool.shutdown(wait=False)


def text_to_speech_tool(
    text: str,
    output_path: Optional[str] = None,
) -> str:
    """
    Convert text to speech audio.

    Reads provider/voice config from the ``tts:`` section of the Hermes
    config. The model sends text; the user configures voice and provider.

    On messaging platforms, the returned MEDIA:<path> tag is intercepted
    by the send pipeline and delivered as a native voice message.
    Without an explicit *output_path* the file is written to
    ``DEFAULT_OUTPUT_DIR`` as ``tts_<timestamp>.mp3`` (or ``.ogg`` on Telegram
    with a provider that produces Opus natively).

    Args:
        text: The text to convert to speech. Input longer than
            ``MAX_TEXT_LENGTH`` characters is truncated.
        output_path: Optional custom save path (``~`` is expanded).

    Returns:
        str: JSON object. On success: ``success``, ``file_path``,
        ``media_tag``, ``provider`` and ``voice_compatible``. On failure:
        ``success`` (False) and ``error``. Failures are reported in the JSON
        rather than raised.
    """
    if not text or not text.strip():
        return json.dumps({"success": False, "error": "Text is required"}, ensure_ascii=False)

    # Truncate very long text with a warning
    if len(text) > MAX_TEXT_LENGTH:
        logger.warning("TTS text too long (%d chars), truncating to %d", len(text), MAX_TEXT_LENGTH)
        text = text[:MAX_TEXT_LENGTH]

    tts_config = _load_tts_config()
    provider = _get_provider(tts_config)
    if provider not in ("edge", "elevenlabs", "openai"):
        # Unknown names have always been served by Edge TTS; normalizing the
        # name here keeps the Opus-conversion step and the reported provider
        # in agreement with what actually ran.
        logger.warning("Unknown TTS provider %r, falling back to Edge TTS", provider)
        provider = "edge"

    # Detect platform from gateway env var to choose the best output format.
    # Telegram voice bubbles require Opus (.ogg); OpenAI and ElevenLabs can
    # produce Opus natively (no ffmpeg needed). Edge TTS always outputs MP3
    # and needs ffmpeg for conversion.
    platform = os.getenv("HERMES_SESSION_PLATFORM", "").lower()
    want_opus = (platform == "telegram")

    try:
        # Determine output path. This lives inside the try so that filesystem
        # errors (e.g. an unwritable directory) come back as a JSON error.
        if output_path:
            file_path = Path(output_path).expanduser()
        else:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            # Use .ogg for Telegram with providers that support native Opus
            # output, otherwise .mp3 (Edge TTS attempts ffmpeg conversion later).
            if want_opus and provider in ("openai", "elevenlabs"):
                suffix = ".ogg"
            else:
                suffix = ".mp3"
            file_path = Path(DEFAULT_OUTPUT_DIR) / f"tts_{timestamp}{suffix}"

        file_path.parent.mkdir(parents=True, exist_ok=True)
        file_str = str(file_path)

        # Generate audio with the configured provider
        if provider == "elevenlabs":
            if not _HAS_ELEVENLABS:
                return json.dumps({
                    "success": False,
                    "error": "ElevenLabs provider selected but 'elevenlabs' package not installed. Run: pip install elevenlabs"
                }, ensure_ascii=False)
            logger.info("Generating speech with ElevenLabs...")
            _generate_elevenlabs(text, file_str, tts_config)

        elif provider == "openai":
            if not _HAS_OPENAI:
                return json.dumps({
                    "success": False,
                    "error": "OpenAI provider selected but 'openai' package not installed."
                }, ensure_ascii=False)
            logger.info("Generating speech with OpenAI TTS...")
            _generate_openai_tts(text, file_str, tts_config)

        else:
            # Default: Edge TTS (free)
            if not _HAS_EDGE_TTS:
                return json.dumps({
                    "success": False,
                    "error": "Edge TTS not available. Run: pip install edge-tts"
                }, ensure_ascii=False)
            logger.info("Generating speech with Edge TTS...")
            _run_edge_tts_blocking(text, file_str, tts_config)

        # Check the file was actually created
        if not os.path.exists(file_str) or os.path.getsize(file_str) == 0:
            return json.dumps({
                "success": False,
                "error": f"TTS generation produced no output (provider: {provider})"
            }, ensure_ascii=False)

        # Try Opus conversion for Telegram compatibility (Edge TTS only outputs MP3)
        voice_compatible = False
        if provider == "edge":
            if file_str.endswith(".mp3"):
                opus_path = _convert_to_opus(file_str)
                if opus_path:
                    file_str = opus_path
                    voice_compatible = True
        else:
            # ElevenLabs / OpenAI output Opus natively when the path ends in .ogg
            voice_compatible = file_str.endswith(".ogg")

        file_size = os.path.getsize(file_str)
        logger.info("TTS audio saved: %s (%s bytes, provider: %s)", file_str, f"{file_size:,}", provider)

        # Build response with MEDIA tag for platform delivery
        media_tag = f"MEDIA:{file_str}"
        if voice_compatible:
            media_tag = f"[[audio_as_voice]]\n{media_tag}"

        return json.dumps({
            "success": True,
            "file_path": file_str,
            "media_tag": media_tag,
            "provider": provider,
            "voice_compatible": voice_compatible,
        }, ensure_ascii=False)

    except ValueError as e:
        # Configuration errors (missing API keys, etc.)
        error_msg = f"TTS configuration error ({provider}): {e}"
        logger.error("%s", error_msg)
        return json.dumps({"success": False, "error": error_msg}, ensure_ascii=False)
    except FileNotFoundError as e:
        # Missing dependencies or files
        error_msg = f"TTS dependency missing ({provider}): {e}"
        logger.error("%s", error_msg, exc_info=True)
        return json.dumps({"success": False, "error": error_msg}, ensure_ascii=False)
    except Exception as e:
        # Unexpected errors (including a generation timeout)
        error_msg = f"TTS generation failed ({provider}): {e}"
        logger.error("%s", error_msg, exc_info=True)
        return json.dumps({"success": False, "error": error_msg}, ensure_ascii=False)
# ===========================================================================
# Requirements check
# ===========================================================================
def check_tts_requirements() -> bool:
    """
    Tell whether any TTS provider is usable in this environment.

    Edge TTS is the default and needs no API key, so having the package
    installed is enough. ElevenLabs and OpenAI each count only when the
    package is installed and its API-key environment variable is set.

    Returns:
        bool: True if at least one provider can work.
    """
    if _HAS_EDGE_TTS:
        return True

    keyed_providers = (
        (_HAS_ELEVENLABS, "ELEVENLABS_API_KEY"),
        (_HAS_OPENAI, "VOICE_TOOLS_OPENAI_KEY"),
    )
    return any(
        installed and bool(os.getenv(env_name))
        for installed, env_name in keyed_providers
    )
2026-03-03 23:03:42 +03:00
# ===========================================================================
# Streaming TTS: sentence-by-sentence pipeline for ElevenLabs
# ===========================================================================
# Sentence boundary pattern: punctuation followed by space or newline
_SENTENCE_BOUNDARY_RE = re . compile ( r ' (?<=[.!?])(?: \ s| \ n)|(?: \ n \ n) ' )
# Markdown stripping patterns (same as cli.py _voice_speak_response)
_MD_CODE_BLOCK = re . compile ( r ' ```[ \ s \ S]*?``` ' )
_MD_LINK = re . compile ( r ' \ [([^ \ ]]+) \ ] \ ([^)]+ \ ) ' )
_MD_URL = re . compile ( r ' https?:// \ S+ ' )
_MD_BOLD = re . compile ( r ' \ * \ *(.+?) \ * \ * ' )
_MD_ITALIC = re . compile ( r ' \ *(.+?) \ * ' )
_MD_INLINE_CODE = re . compile ( r ' `(.+?)` ' )
_MD_HEADER = re . compile ( r ' ^#+ \ s* ' , flags = re . MULTILINE )
_MD_LIST_ITEM = re . compile ( r ' ^ \ s*[-*] \ s+ ' , flags = re . MULTILINE )
_MD_HR = re . compile ( r ' ---+ ' )
_MD_EXCESS_NL = re . compile ( r ' \ n { 3,} ' )
def _strip_markdown_for_tts ( text : str ) - > str :
""" Remove markdown formatting that shouldn ' t be spoken aloud. """
text = _MD_CODE_BLOCK . sub ( ' ' , text )
text = _MD_LINK . sub ( r ' \ 1 ' , text )
text = _MD_URL . sub ( ' ' , text )
text = _MD_BOLD . sub ( r ' \ 1 ' , text )
text = _MD_ITALIC . sub ( r ' \ 1 ' , text )
text = _MD_INLINE_CODE . sub ( r ' \ 1 ' , text )
text = _MD_HEADER . sub ( ' ' , text )
text = _MD_LIST_ITEM . sub ( ' ' , text )
text = _MD_HR . sub ( ' ' , text )
text = _MD_EXCESS_NL . sub ( ' \n \n ' , text )
return text . strip ( )
_THINK_OPEN = "<think"
_THINK_CLOSE = "</think>"


def _drain_text_queue(text_queue: queue.Queue) -> None:
    """Discard every item currently waiting on *text_queue* without blocking."""
    while True:
        try:
            text_queue.get_nowait()
        except queue.Empty:
            return


def _filter_think_blocks(chunk: str, in_think: bool):
    """Drop ``<think>...</think>`` content from *chunk*.

    Returns a ``(visible_text, in_think, carry)`` tuple. *carry* is a trailing
    fragment that could be the start of a think tag split across two deltas
    (e.g. ``"<thi"``); the caller prepends it to the next delta so the tag is
    still recognized once the rest of it arrives.
    """
    visible = []
    i = 0
    length = len(chunk)
    while i < length:
        if chunk[i] == "<":
            rest = chunk[i:]
            if rest.startswith(_THINK_OPEN):
                in_think = True
                # Skip past the tag; if ">" has not arrived yet, the remainder
                # is swallowed anyway because in_think is now set.
                close_at = chunk.find(">", i)
                i = close_at + 1 if close_at != -1 else length
                continue
            if rest.startswith(_THINK_CLOSE):
                in_think = False
                i += len(_THINK_CLOSE)
                continue
            if _THINK_OPEN.startswith(rest) or _THINK_CLOSE.startswith(rest):
                # The chunk ends in a possible partial tag: hold it back.
                return "".join(visible), in_think, rest
        if not in_think:
            visible.append(chunk[i])
        i += 1
    return "".join(visible), in_think, ""


def stream_tts_to_speaker(
    text_queue: queue.Queue,
    stop_event: threading.Event,
    tts_done_event: threading.Event,
) -> None:
    """Consume text deltas from *text_queue*, buffer them into sentences,
    and stream each sentence through ElevenLabs TTS to the speaker in
    real-time.

    Protocol:
    * The producer puts ``str`` deltas onto *text_queue*.
    * A ``None`` sentinel signals end-of-text (flush remaining buffer).
    * *stop_event* can be set to abort early (e.g. user interrupt).
    * *tts_done_event* is cleared on entry and **set** in the ``finally``
      block so callers waiting on it (continuous voice mode) know playback
      is finished, whatever the outcome.

    Text inside ``<think>...</think>`` is never spoken. Sentences shorter
    than 20 characters are merged into the following sentence. Errors are
    logged, not raised.
    """
    tts_done_event.clear()
    output_stream = None
    try:
        if stop_event.is_set():
            # Already cancelled: skip client/audio setup, just empty the queue.
            _drain_text_queue(text_queue)
            return

        if not _HAS_ELEVENLABS:
            logger.warning("elevenlabs package not installed; streaming TTS disabled")
            return

        tts_config = _load_tts_config()
        el_config = tts_config.get("elevenlabs") or {}
        voice_id = el_config.get("voice_id", DEFAULT_ELEVENLABS_VOICE_ID)
        # Prefer a dedicated streaming model, then the regular model setting.
        model_id = el_config.get(
            "streaming_model_id",
            el_config.get("model_id", DEFAULT_ELEVENLABS_STREAMING_MODEL_ID),
        )

        api_key = os.getenv("ELEVENLABS_API_KEY", "")
        if not api_key:
            logger.warning("ELEVENLABS_API_KEY not set; streaming TTS disabled")
            return

        client = ElevenLabs(api_key=api_key)

        # Open a single sounddevice output stream for the lifetime of
        # this function.  ElevenLabs pcm_24000 produces signed 16-bit
        # little-endian mono PCM at 24 kHz.
        _np = None
        if _HAS_AUDIO and sd is not None:
            try:
                import numpy as _np
                output_stream = sd.OutputStream(
                    samplerate=24000, channels=1, dtype="int16",
                )
                output_stream.start()
            except Exception as exc:
                logger.warning("sounddevice OutputStream failed: %s", exc)
                output_stream = None

        def _play_via_tempfile(audio_iter, stop_evt):
            """Write PCM chunks to a temp WAV file and play it."""
            tmp_path = None
            try:
                import wave
                # wave does not close a file object it did not open itself,
                # so the temp file gets its own context manager.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                    tmp_path = tmp.name
                    with wave.open(tmp, "wb") as wf:
                        wf.setnchannels(1)
                        wf.setsampwidth(2)  # 16-bit
                        wf.setframerate(24000)
                        for chunk in audio_iter:
                            if stop_evt.is_set():
                                break
                            wf.writeframes(chunk)
                if stop_evt.is_set():
                    # Interrupted: do not play the partial clip.
                    return
                from tools.voice_mode import play_audio_file
                play_audio_file(tmp_path)
            except Exception as exc:
                logger.warning("Temp-file TTS fallback failed: %s", exc)
            finally:
                # Remove the temp file even when writing or playback failed.
                if tmp_path is not None:
                    try:
                        os.unlink(tmp_path)
                    except OSError:
                        pass

        def _speak_sentence(sentence: str):
            """Generate and play audio for a single sentence."""
            if stop_event.is_set():
                return
            cleaned = _strip_markdown_for_tts(sentence).strip()
            if not cleaned:
                return
            # Truncate very long sentences
            cleaned = cleaned[:MAX_TEXT_LENGTH]
            try:
                audio_iter = client.text_to_speech.convert(
                    text=cleaned,
                    voice_id=voice_id,
                    model_id=model_id,
                    output_format="pcm_24000",
                )
                if output_stream is None:
                    # Fallback: write chunks to temp file and play via system player
                    _play_via_tempfile(audio_iter, stop_event)
                    return
                # A chunk may end in the middle of a 16-bit sample; carry the
                # odd byte over so frombuffer always sees whole samples.
                leftover = b""
                for chunk in audio_iter:
                    if stop_event.is_set():
                        break
                    data = leftover + chunk
                    usable = len(data) - (len(data) % 2)
                    leftover = data[usable:]
                    if not usable:
                        continue
                    audio_array = _np.frombuffer(data[:usable], dtype=_np.int16)
                    output_stream.write(audio_array.reshape(-1, 1))
            except Exception as exc:
                logger.warning("Streaming TTS sentence failed: %s", exc)

        sentence_buf = ""
        in_think = False  # track <think>...</think> blocks
        tag_carry = ""    # possible partial think tag held back from the last delta
        min_sentence_len = 20
        long_flush_len = 100
        queue_timeout = 0.5

        while not stop_event.is_set():
            # Read next delta from queue
            try:
                delta = text_queue.get(timeout=queue_timeout)
            except queue.Empty:
                # Timeout: if we have accumulated a long buffer, flush it
                if len(sentence_buf) > long_flush_len:
                    _speak_sentence(sentence_buf)
                    sentence_buf = ""
                continue

            if delta is None:
                # End-of-text sentinel: a held-back fragment was ordinary text
                # after all; then flush whatever is buffered.
                if tag_carry and not in_think:
                    sentence_buf += tag_carry
                if sentence_buf.strip():
                    _speak_sentence(sentence_buf)
                break

            # --- Think block filtering ---
            text, in_think, tag_carry = _filter_think_blocks(tag_carry + delta, in_think)
            if not text:
                continue
            sentence_buf += text

            # Speak every complete sentence now in the buffer.  A fragment
            # shorter than min_sentence_len is merged into the next sentence
            # by continuing the search past its boundary; restarting from the
            # buffer head would find the same short fragment again and stall.
            search_from = 0
            while True:
                m = _SENTENCE_BOUNDARY_RE.search(sentence_buf, search_from)
                if m is None:
                    break
                end_pos = m.end()
                if len(sentence_buf[:end_pos].strip()) < min_sentence_len:
                    search_from = end_pos
                    continue
                _speak_sentence(sentence_buf[:end_pos])
                sentence_buf = sentence_buf[end_pos:]
                search_from = 0

        # Drain any remaining items from the queue
        _drain_text_queue(text_queue)

    except Exception as exc:
        logger.warning("Streaming TTS pipeline error: %s", exc)
    finally:
        try:
            # Close the audio output stream on every exit path.
            if output_stream is not None:
                try:
                    output_stream.stop()
                    output_stream.close()
                except Exception as exc:
                    logger.debug("Closing audio output stream failed: %s", exc)
        finally:
            tts_done_event.set()
2026-02-12 10:05:08 -08:00
# ===========================================================================
# Main -- quick diagnostics
# ===========================================================================
if __name__ == "__main__":
    # Quick diagnostics: report which providers and tools are usable here.
    edge_status = '✅ installed' if _HAS_EDGE_TTS else '❌ not installed (pip install edge-tts)'
    eleven_status = '✅ installed' if _HAS_ELEVENLABS else '❌ not installed (pip install elevenlabs)'
    eleven_key_status = '✅ set' if os.getenv('ELEVENLABS_API_KEY') else '❌ not set'
    openai_status = '✅ installed' if _HAS_OPENAI else '❌ not installed'
    openai_key_status = '✅ set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else '❌ not set (VOICE_TOOLS_OPENAI_KEY)'
    ffmpeg_status = '✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'

    print("🔊 Text-to-Speech Tool Module")
    print("=" * 50)
    print("\nProvider availability:")
    print(f"  Edge TTS: {edge_status}")
    print(f"  ElevenLabs: {eleven_status}")
    print(f"    API Key: {eleven_key_status}")
    print(f"  OpenAI: {openai_status}")
    print(f"    API Key: {openai_key_status}")
    print(f"  ffmpeg: {ffmpeg_status}")
    print(f"\n  Output dir: {DEFAULT_OUTPUT_DIR}")

    # Show which provider the current configuration selects.
    config = _load_tts_config()
    provider = _get_provider(config)
    print(f"  Configured provider: {provider}")
2026-02-21 20:22:33 -08:00
# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------
from tools.registry import registry

# Tool schema advertised to the model.  The save location in the descriptions
# matches DEFAULT_OUTPUT_DIR (the audio_cache directory under the Hermes home),
# which is where text_to_speech_tool writes when no output_path is given.
TTS_SCHEMA = {
    "name": "text_to_speech",
    "description": "Convert text to speech audio. Returns a MEDIA: path that the platform delivers as a voice message. On Telegram it plays as a voice bubble, on Discord/WhatsApp as an audio attachment. In CLI mode, saves to ~/.hermes/audio_cache/. Voice and provider are user-configured, not model-selected.",
    "parameters": {
        "type": "object",
        "properties": {
            "text": {
                "type": "string",
                "description": "The text to convert to speech. Keep under 4000 characters."
            },
            "output_path": {
                "type": "string",
                "description": "Optional custom file path to save the audio. Defaults to ~/.hermes/audio_cache/tts_<timestamp>.mp3"
            }
        },
        "required": ["text"]
    }
}

# Register the tool.  The handler unpacks the model-supplied argument dict and
# ignores any extra keyword arguments passed by the registry; check_fn gates
# availability on at least one usable provider.
registry.register(
    name="text_to_speech",
    toolset="tts",
    schema=TTS_SCHEMA,
    handler=lambda args, **kw: text_to_speech_tool(
        text=args.get("text", ""),
        output_path=args.get("output_path")),
    check_fn=check_tts_requirements,
)