From 11f029c311d57dbee37ca94cf45bfb212f04b13e Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Wed, 18 Mar 2026 02:55:30 -0700
Subject: [PATCH] fix(tts): document NeuTTS provider and align install guidance
 (#1903)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: charles-édouard <59705750+ccbbccbb@users.noreply.github.com>
---
 hermes_cli/setup.py                           |  2 +-
 tools/neutts_synth.py                         |  4 +--
 tools/tts_tool.py                             |  4 +--
 .../docs/guides/use-voice-mode-with-hermes.md | 32 +++++++++++++++++
 website/docs/user-guide/configuration.md      |  7 +++-
 website/docs/user-guide/features/tts.md       | 13 +++++--
 .../docs/user-guide/features/voice-mode.md    | 34 ++++++++++++++-----
 7 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py
index 46c7eea96..dd06279f2 100644
--- a/hermes_cli/setup.py
+++ b/hermes_cli/setup.py
@@ -1710,7 +1710,7 @@ def _install_neutts_deps() -> bool:
         return True
     except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
         print_error(f"Failed to install neutts: {e}")
-        print_info("Try manually: pip install neutts[all]")
+        print_info("Try manually: python -m pip install -U neutts[all]")
         return False
 
 
diff --git a/tools/neutts_synth.py b/tools/neutts_synth.py
index b1a91451f..ee2c84b23 100644
--- a/tools/neutts_synth.py
+++ b/tools/neutts_synth.py
@@ -8,7 +8,7 @@ Usage:
     python -m tools.neutts_synth --text "Hello" --out output.wav \
         --ref-audio samples/jo.wav --ref-text samples/jo.txt
 
-Requires: pip install neutts[all]
+Requires: python -m pip install -U neutts[all]
 System:   apt install espeak-ng  (or brew install espeak-ng)
 """
 
@@ -75,7 +75,7 @@ def main():
     try:
         from neutts import NeuTTS
     except ImportError:
-        print("Error: neutts not installed. Run: pip install neutts[all]", file=sys.stderr)
+        print("Error: neutts not installed. Run: python -m pip install -U neutts[all]", file=sys.stderr)
         sys.exit(1)
 
     tts = NeuTTS(
diff --git a/tools/tts_tool.py b/tools/tts_tool.py
index 66911371e..e05d1efef 100644
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@@ -423,8 +423,8 @@ def text_to_speech_tool(
             if not _check_neutts_available():
                 return json.dumps({
                     "success": False,
-                    "error": "NeuTTS provider selected but neutts_cli is not installed. "
-                             "Install the NeuTTS skill and run the bootstrap helper first."
+                    "error": "NeuTTS provider selected but neutts is not installed. "
+                             "Run hermes setup and choose NeuTTS, or install espeak-ng and run python -m pip install -U neutts[all]."
                 }, ensure_ascii=False)
             logger.info("Generating speech with NeuTTS (local)...")
             _generate_neutts(text, file_str, tts_config)
diff --git a/website/docs/guides/use-voice-mode-with-hermes.md b/website/docs/guides/use-voice-mode-with-hermes.md
index dc35dcc65..fe38b837b 100644
--- a/website/docs/guides/use-voice-mode-with-hermes.md
+++ b/website/docs/guides/use-voice-mode-with-hermes.md
@@ -72,6 +72,12 @@ pip install hermes-agent[messaging]
 pip install hermes-agent[tts-premium]
 ```
 
+### Local NeuTTS (optional)
+
+```bash
+python -m pip install -U neutts[all]
+```
+
 ### Everything
 
 ```bash
@@ -84,18 +90,21 @@ pip install hermes-agent[all]
 
 ```bash
 brew install portaudio ffmpeg opus
+brew install espeak-ng
 ```
 
 ### Ubuntu / Debian
 
 ```bash
 sudo apt install portaudio19-dev ffmpeg libopus0
+sudo apt install espeak-ng
 ```
 
 Why these matter:
 - `portaudio` → microphone input / playback for CLI voice mode
 - `ffmpeg` → audio conversion for TTS and messaging delivery
 - `opus` → Discord voice codec support
+- `espeak-ng` → phonemizer backend for NeuTTS
 
 ## Step 4: choose STT and TTS providers
 
@@ -133,9 +142,20 @@ ELEVENLABS_API_KEY=***
 #### Text-to-speech
 
 - `edge` → free and good enough for most users
+- `neutts` → free local/on-device TTS
 - `elevenlabs` → best quality
 - `openai` → good middle ground
 
+### If you use `hermes setup`
+
+If you choose NeuTTS in the setup wizard, Hermes checks whether `neutts` is already installed. If it is missing, the wizard tells you that NeuTTS needs the Python package `neutts` and the system package `espeak-ng`, and offers to install them for you. If you accept, it installs `espeak-ng` with your platform package manager and then runs:
+
+```bash
+python -m pip install -U neutts[all]
+```
+
+If you skip that install or it fails, the wizard falls back to Edge TTS.
+
 ## Step 5: recommended config
 
 ```yaml
@@ -159,6 +179,18 @@ tts:
 
 This is a good conservative default for most people.
 
+If you want local TTS instead, switch the `tts` block to:
+
+```yaml
+tts:
+  provider: "neutts"
+  neutts:
+    ref_audio: ''
+    ref_text: ''
+    model: neuphonic/neutts-air-q4-gguf
+    device: cpu
+```
+
 ## Use case 1: CLI voice mode
 
 ## Turn it on
diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md
index 878982b28..aa770c9e8 100644
--- a/website/docs/user-guide/configuration.md
+++ b/website/docs/user-guide/configuration.md
@@ -929,7 +929,7 @@ You can also change the reasoning effort at runtime with the `/reasoning` comman
 
 ```yaml
 tts:
-  provider: "edge"              # "edge" | "elevenlabs" | "openai"
+  provider: "edge"              # "edge" | "elevenlabs" | "openai" | "neutts"
   edge:
     voice: "en-US-AriaNeural"   # 322 voices, 74 languages
   elevenlabs:
@@ -938,6 +938,11 @@ tts:
   openai:
     model: "gpt-4o-mini-tts"
     voice: "alloy"              # alloy, echo, fable, onyx, nova, shimmer
+  neutts:
+    ref_audio: ''
+    ref_text: ''
+    model: neuphonic/neutts-air-q4-gguf
+    device: cpu
 ```
 
 This controls both the `text_to_speech` tool and spoken replies in voice mode (`/voice tts` in the CLI or messaging gateway).
diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md
index c6ba365a1..930a1bbfb 100644
--- a/website/docs/user-guide/features/tts.md
+++ b/website/docs/user-guide/features/tts.md
@@ -10,13 +10,14 @@ Hermes Agent supports both text-to-speech output and voice message transcription
 
 ## Text-to-Speech
 
-Convert text to speech with three providers:
+Convert text to speech with four providers:
 
 | Provider | Quality | Cost | API Key |
 |----------|---------|------|---------|
 | **Edge TTS** (default) | Good | Free | None needed |
 | **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` |
 | **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` |
+| **NeuTTS** | Good | Free | None needed |
 
 ### Platform Delivery
 
@@ -32,7 +33,7 @@ Convert text to speech with three providers:
 ```yaml
 # In ~/.hermes/config.yaml
 tts:
-  provider: "edge"              # "edge" | "elevenlabs" | "openai"
+  provider: "edge"              # "edge" | "elevenlabs" | "openai" | "neutts"
   edge:
     voice: "en-US-AriaNeural"   # 322 voices, 74 languages
   elevenlabs:
@@ -41,6 +42,11 @@ tts:
   openai:
     model: "gpt-4o-mini-tts"
     voice: "alloy"              # alloy, echo, fable, onyx, nova, shimmer
+  neutts:
+    ref_audio: ''
+    ref_text: ''
+    model: neuphonic/neutts-air-q4-gguf
+    device: cpu
 ```
 
 ### Telegram Voice Bubbles & ffmpeg
@@ -49,6 +55,7 @@ Telegram voice bubbles require Opus/OGG audio format:
 
 - **OpenAI and ElevenLabs** produce Opus natively — no extra setup
 - **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
+- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
 
 ```bash
 # Ubuntu/Debian
@@ -61,7 +68,7 @@ brew install ffmpeg
 sudo dnf install ffmpeg
 ```
 
-Without ffmpeg, Edge TTS audio is sent as a regular audio file (playable, but shows as a rectangular player instead of a voice bubble).
+Without ffmpeg, Edge TTS and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
 
 :::tip
 If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider.
diff --git a/website/docs/user-guide/features/voice-mode.md b/website/docs/user-guide/features/voice-mode.md
index 3dfe0db46..b0f203556 100644
--- a/website/docs/user-guide/features/voice-mode.md
+++ b/website/docs/user-guide/features/voice-mode.md
@@ -44,6 +44,9 @@ pip install hermes-agent[messaging]
 # Premium TTS (ElevenLabs)
 pip install hermes-agent[tts-premium]
 
+# Local TTS (NeuTTS, optional)
+python -m pip install -U neutts[all]
+
 # Everything at once
 pip install hermes-agent[all]
 ```
@@ -54,6 +57,8 @@ pip install hermes-agent[all]
 | `messaging` | `discord.py[voice]`, `python-telegram-bot`, `aiohttp` | Discord & Telegram bots |
 | `tts-premium` | `elevenlabs` | ElevenLabs TTS provider |
 
+Optional local TTS provider: install `neutts` separately with `python -m pip install -U neutts[all]`. On first use it downloads the model automatically.
+
 :::info
 `discord.py[voice]` installs **PyNaCl** (for voice encryption) and **opus bindings** automatically. This is required for Discord voice channel support.
 :::
@@ -63,9 +68,11 @@ pip install hermes-agent[all]
 ```bash
 # macOS
 brew install portaudio ffmpeg opus
+brew install espeak-ng   # for NeuTTS
 
 # Ubuntu/Debian
 sudo apt install portaudio19-dev ffmpeg libopus0
+sudo apt install espeak-ng   # for NeuTTS
 ```
 
 | Dependency | Purpose | Required For |
@@ -73,6 +80,7 @@ sudo apt install portaudio19-dev ffmpeg libopus0
 | **PortAudio** | Microphone input and audio playback | CLI voice mode |
 | **ffmpeg** | Audio format conversion (MP3 → Opus, PCM → WAV) | All platforms |
 | **Opus** | Discord voice codec | Discord voice channels |
+| **espeak-ng** | Phonemizer backend | Local NeuTTS provider |
 
 ### API Keys
 
@@ -84,8 +92,9 @@ Add to `~/.hermes/.env`:
 GROQ_API_KEY=your-key                 # Groq Whisper — fast, free tier (cloud)
 VOICE_TOOLS_OPENAI_KEY=your-key       # OpenAI Whisper — paid (cloud)
 
-# Text-to-Speech (optional — Edge TTS works without any key)
-ELEVENLABS_API_KEY=your-key           # ElevenLabs — premium quality
+# Text-to-Speech (optional — Edge TTS and NeuTTS work without any key)
+ELEVENLABS_API_KEY=your-key           # ElevenLabs — premium quality
+# VOICE_TOOLS_OPENAI_KEY above also enables OpenAI TTS
 ```
 
 :::tip
@@ -303,8 +312,9 @@ DISCORD_ALLOWED_USERS=your-user-id
 # STT — local provider needs no key (pip install faster-whisper)
 # GROQ_API_KEY=your-key            # Alternative: cloud-based, fast, free tier
 
-# TTS — optional, Edge TTS (free) is the default
-# ELEVENLABS_API_KEY=your-key      # Premium quality
+# TTS — optional. Edge TTS and NeuTTS need no key.
+# ELEVENLABS_API_KEY=your-key      # Premium quality
+# VOICE_TOOLS_OPENAI_KEY=your-key  # OpenAI TTS / Whisper
 ```
 
 ### Start the Gateway
@@ -385,7 +395,7 @@ stt:
 
 # Text-to-Speech
 tts:
-  provider: "edge"                 # "edge" (free) | "elevenlabs" | "openai"
+  provider: "edge"                 # "edge" (free) | "elevenlabs" | "openai" | "neutts"
   edge:
     voice: "en-US-AriaNeural"      # 322 voices, 74 languages
   elevenlabs:
@@ -394,6 +404,11 @@ tts:
   openai:
     model: "gpt-4o-mini-tts"
     voice: "alloy"                 # alloy, echo, fable, onyx, nova, shimmer
+  neutts:
+    ref_audio: ''
+    ref_text: ''
+    model: neuphonic/neutts-air-q4-gguf
+    device: cpu
 ```
 
 ### Environment Variables
@@ -410,9 +425,9 @@ STT_OPENAI_MODEL=whisper-1               # Override default OpenAI STT model
 GROQ_BASE_URL=https://api.groq.com/openai/v1     # Custom Groq endpoint
 STT_OPENAI_BASE_URL=https://api.openai.com/v1    # Custom OpenAI STT endpoint
 
-# Text-to-Speech providers (Edge TTS needs no key)
-ELEVENLABS_API_KEY=...             # ElevenLabs (premium quality)
-# OpenAI TTS uses VOICE_TOOLS_OPENAI_KEY
+# Text-to-Speech providers (Edge TTS and NeuTTS need no key)
+ELEVENLABS_API_KEY=...             # ElevenLabs (premium quality)
+# VOICE_TOOLS_OPENAI_KEY above also enables OpenAI TTS
 
 # Discord voice channel
 DISCORD_BOT_TOKEN=...
@@ -440,6 +455,9 @@ Provider priority (automatic fallback): **local** > **groq** > **openai**
 | **Edge TTS** | Good | Free | ~1s | No |
 | **ElevenLabs** | Excellent | Paid | ~2s | Yes |
 | **OpenAI TTS** | Good | Paid | ~1.5s | Yes |
+| **NeuTTS** | Good | Free | Depends on CPU/GPU | No |
+
+NeuTTS uses the `tts.neutts` config block above.
 
 ---