From 11f029c311d57dbee37ca94cf45bfb212f04b13e Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 18 Mar 2026 02:55:30 -0700 Subject: [PATCH] fix(tts): document NeuTTS provider and align install guidance (#1903) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: charles-édouard <59705750+ccbbccbb@users.noreply.github.com> --- hermes_cli/setup.py | 2 +- tools/neutts_synth.py | 4 +-- tools/tts_tool.py | 4 +-- .../docs/guides/use-voice-mode-with-hermes.md | 32 +++++++++++++++++ website/docs/user-guide/configuration.md | 7 +++- website/docs/user-guide/features/tts.md | 13 +++++-- .../docs/user-guide/features/voice-mode.md | 34 ++++++++++++++----- 7 files changed, 79 insertions(+), 17 deletions(-) diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 46c7eea96..dd06279f2 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -1710,7 +1710,7 @@ def _install_neutts_deps() -> bool: return True except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: print_error(f"Failed to install neutts: {e}") - print_info("Try manually: pip install neutts[all]") + print_info("Try manually: python -m pip install -U neutts[all]") return False diff --git a/tools/neutts_synth.py b/tools/neutts_synth.py index b1a91451f..ee2c84b23 100644 --- a/tools/neutts_synth.py +++ b/tools/neutts_synth.py @@ -8,7 +8,7 @@ Usage: python -m tools.neutts_synth --text "Hello" --out output.wav \ --ref-audio samples/jo.wav --ref-text samples/jo.txt -Requires: pip install neutts[all] +Requires: python -m pip install -U neutts[all] System: apt install espeak-ng (or brew install espeak-ng) """ @@ -75,7 +75,7 @@ def main(): try: from neutts import NeuTTS except ImportError: - print("Error: neutts not installed. Run: pip install neutts[all]", file=sys.stderr) + print("Error: neutts not installed. 
Run: python -m pip install -U neutts[all]", file=sys.stderr) sys.exit(1) tts = NeuTTS( diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 66911371e..e05d1efef 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -423,8 +423,8 @@ def text_to_speech_tool( if not _check_neutts_available(): return json.dumps({ "success": False, - "error": "NeuTTS provider selected but neutts_cli is not installed. " - "Install the NeuTTS skill and run the bootstrap helper first." + "error": "NeuTTS provider selected but neutts is not installed. " + "Run hermes setup and choose NeuTTS, or install espeak-ng and run python -m pip install -U neutts[all]." }, ensure_ascii=False) logger.info("Generating speech with NeuTTS (local)...") _generate_neutts(text, file_str, tts_config) diff --git a/website/docs/guides/use-voice-mode-with-hermes.md b/website/docs/guides/use-voice-mode-with-hermes.md index dc35dcc65..fe38b837b 100644 --- a/website/docs/guides/use-voice-mode-with-hermes.md +++ b/website/docs/guides/use-voice-mode-with-hermes.md @@ -72,6 +72,12 @@ pip install hermes-agent[messaging] pip install hermes-agent[tts-premium] ``` +### Local NeuTTS (optional) + +```bash +python -m pip install -U neutts[all] +``` + ### Everything ```bash @@ -84,18 +90,21 @@ pip install hermes-agent[all] ```bash brew install portaudio ffmpeg opus +brew install espeak-ng ``` ### Ubuntu / Debian ```bash sudo apt install portaudio19-dev ffmpeg libopus0 +sudo apt install espeak-ng ``` Why these matter: - `portaudio` → microphone input / playback for CLI voice mode - `ffmpeg` → audio conversion for TTS and messaging delivery - `opus` → Discord voice codec support +- `espeak-ng` → phonemizer backend for NeuTTS ## Step 4: choose STT and TTS providers @@ -133,9 +142,20 @@ ELEVENLABS_API_KEY=*** #### Text-to-speech - `edge` → free and good enough for most users +- `neutts` → free local/on-device TTS - `elevenlabs` → best quality - `openai` → good middle ground +### If you use `hermes setup` + +If you choose 
NeuTTS in the setup wizard, Hermes checks whether `neutts` is already installed. If it is missing, the wizard tells you NeuTTS needs the Python package `neutts` and the system package `espeak-ng`, offers to install them for you, installs `espeak-ng` with your platform package manager, and then runs: + +```bash +python -m pip install -U neutts[all] +``` + +If you skip that install or it fails, the wizard falls back to Edge TTS. + ## Step 5: recommended config ```yaml @@ -159,6 +179,18 @@ tts: This is a good conservative default for most people. +If you want local TTS instead, switch the `tts` block to: + +```yaml +tts: + provider: "neutts" + neutts: + ref_audio: '' + ref_text: '' + model: neuphonic/neutts-air-q4-gguf + device: cpu +``` + ## Use case 1: CLI voice mode ## Turn it on diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index 878982b28..aa770c9e8 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -929,7 +929,7 @@ You can also change the reasoning effort at runtime with the `/reasoning` comman ```yaml tts: - provider: "edge" # "edge" | "elevenlabs" | "openai" + provider: "edge" # "edge" | "elevenlabs" | "openai" | "neutts" edge: voice: "en-US-AriaNeural" # 322 voices, 74 languages elevenlabs: @@ -938,6 +938,11 @@ tts: openai: model: "gpt-4o-mini-tts" voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer + neutts: + ref_audio: '' + ref_text: '' + model: neuphonic/neutts-air-q4-gguf + device: cpu ``` This controls both the `text_to_speech` tool and spoken replies in voice mode (`/voice tts` in the CLI or messaging gateway). 
diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md index c6ba365a1..930a1bbfb 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -10,13 +10,14 @@ Hermes Agent supports both text-to-speech output and voice message transcription ## Text-to-Speech -Convert text to speech with three providers: +Convert text to speech with four providers: | Provider | Quality | Cost | API Key | |----------|---------|------|---------| | **Edge TTS** (default) | Good | Free | None needed | | **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` | | **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` | +| **NeuTTS** | Good | Free | None needed | ### Platform Delivery @@ -32,7 +33,7 @@ Convert text to speech with three providers: ```yaml # In ~/.hermes/config.yaml tts: - provider: "edge" # "edge" | "elevenlabs" | "openai" + provider: "edge" # "edge" | "elevenlabs" | "openai" | "neutts" edge: voice: "en-US-AriaNeural" # 322 voices, 74 languages elevenlabs: @@ -41,6 +42,11 @@ tts: openai: model: "gpt-4o-mini-tts" voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer + neutts: + ref_audio: '' + ref_text: '' + model: neuphonic/neutts-air-q4-gguf + device: cpu ``` ### Telegram Voice Bubbles & ffmpeg @@ -49,6 +55,7 @@ Telegram voice bubbles require Opus/OGG audio format: - **OpenAI and ElevenLabs** produce Opus natively — no extra setup - **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert: +- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles ```bash # Ubuntu/Debian @@ -61,7 +68,7 @@ brew install ffmpeg sudo dnf install ffmpeg ``` -Without ffmpeg, Edge TTS audio is sent as a regular audio file (playable, but shows as a rectangular player instead of a voice bubble). +Without ffmpeg, Edge TTS and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble). 
:::tip If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider. diff --git a/website/docs/user-guide/features/voice-mode.md b/website/docs/user-guide/features/voice-mode.md index 3dfe0db46..b0f203556 100644 --- a/website/docs/user-guide/features/voice-mode.md +++ b/website/docs/user-guide/features/voice-mode.md @@ -44,6 +44,9 @@ pip install hermes-agent[messaging] # Premium TTS (ElevenLabs) pip install hermes-agent[tts-premium] +# Local TTS (NeuTTS, optional) +python -m pip install -U neutts[all] + # Everything at once pip install hermes-agent[all] ``` @@ -54,6 +57,8 @@ pip install hermes-agent[all] | `messaging` | `discord.py[voice]`, `python-telegram-bot`, `aiohttp` | Discord & Telegram bots | | `tts-premium` | `elevenlabs` | ElevenLabs TTS provider | +Optional local TTS provider: install `neutts` separately with `python -m pip install -U neutts[all]`. On first use it downloads the model automatically. + :::info `discord.py[voice]` installs **PyNaCl** (for voice encryption) and **opus bindings** automatically. This is required for Discord voice channel support. 
::: @@ -63,9 +68,11 @@ pip install hermes-agent[all] ```bash # macOS brew install portaudio ffmpeg opus +brew install espeak-ng # for NeuTTS # Ubuntu/Debian sudo apt install portaudio19-dev ffmpeg libopus0 +sudo apt install espeak-ng # for NeuTTS ``` | Dependency | Purpose | Required For | @@ -73,6 +80,7 @@ sudo apt install portaudio19-dev ffmpeg libopus0 | **PortAudio** | Microphone input and audio playback | CLI voice mode | | **ffmpeg** | Audio format conversion (MP3 → Opus, PCM → WAV) | All platforms | | **Opus** | Discord voice codec | Discord voice channels | +| **espeak-ng** | Phonemizer backend | Local NeuTTS provider | ### API Keys @@ -84,8 +92,9 @@ Add to `~/.hermes/.env`: GROQ_API_KEY=your-key # Groq Whisper — fast, free tier (cloud) VOICE_TOOLS_OPENAI_KEY=your-key # OpenAI Whisper — paid (cloud) -# Text-to-Speech (optional — Edge TTS works without any key) -ELEVENLABS_API_KEY=your-key # ElevenLabs — premium quality +# Text-to-Speech (optional — Edge TTS and NeuTTS work without any key) +ELEVENLABS_API_KEY=your-key # ElevenLabs — premium quality +# VOICE_TOOLS_OPENAI_KEY above also enables OpenAI TTS ``` :::tip @@ -303,8 +312,9 @@ DISCORD_ALLOWED_USERS=your-user-id # STT — local provider needs no key (pip install faster-whisper) # GROQ_API_KEY=your-key # Alternative: cloud-based, fast, free tier -# TTS — optional, Edge TTS (free) is the default -# ELEVENLABS_API_KEY=your-key # Premium quality +# TTS — optional. Edge TTS and NeuTTS need no key. 
+# ELEVENLABS_API_KEY=your-key # Premium quality +# VOICE_TOOLS_OPENAI_KEY=your-key # OpenAI TTS / Whisper ``` ### Start the Gateway @@ -385,7 +395,7 @@ stt: # Text-to-Speech tts: - provider: "edge" # "edge" (free) | "elevenlabs" | "openai" + provider: "edge" # "edge" (free) | "elevenlabs" | "openai" | "neutts" edge: voice: "en-US-AriaNeural" # 322 voices, 74 languages elevenlabs: @@ -394,6 +404,11 @@ tts: openai: model: "gpt-4o-mini-tts" voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer + neutts: + ref_audio: '' + ref_text: '' + model: neuphonic/neutts-air-q4-gguf + device: cpu ``` ### Environment Variables @@ -410,9 +425,9 @@ STT_OPENAI_MODEL=whisper-1 # Override default OpenAI STT model GROQ_BASE_URL=https://api.groq.com/openai/v1 # Custom Groq endpoint STT_OPENAI_BASE_URL=https://api.openai.com/v1 # Custom OpenAI STT endpoint -# Text-to-Speech providers (Edge TTS needs no key) -ELEVENLABS_API_KEY=... # ElevenLabs (premium quality) -# OpenAI TTS uses VOICE_TOOLS_OPENAI_KEY +# Text-to-Speech providers (Edge TTS and NeuTTS need no key) +ELEVENLABS_API_KEY=... # ElevenLabs (premium quality) +# VOICE_TOOLS_OPENAI_KEY above also enables OpenAI TTS # Discord voice channel DISCORD_BOT_TOKEN=... @@ -440,6 +455,9 @@ Provider priority (automatic fallback): **local** > **groq** > **openai** | **Edge TTS** | Good | Free | ~1s | No | | **ElevenLabs** | Excellent | Paid | ~2s | Yes | | **OpenAI TTS** | Good | Paid | ~1.5s | Yes | +| **NeuTTS** | Good | Free | Depends on CPU/GPU | No | + +NeuTTS uses the `tts.neutts` config block above. ---