refactor(tts): replace NeuTTS optional skill with built-in provider + setup flow

Remove the optional skill (redundant now that NeuTTS is a built-in TTS
provider). Replace neutts_cli dependency with a standalone synthesis
helper (tools/neutts_synth.py) that calls the neutts Python API directly
in a subprocess.

Add TTS provider selection to hermes setup:
- 'hermes setup' now prompts for TTS provider after model selection
- 'hermes setup tts' available as standalone section
- Selecting NeuTTS checks for deps and offers to install:
  espeak-ng (system) + neutts[all] (pip)
- ElevenLabs/OpenAI selections prompt for API keys
- Tool status display shows NeuTTS install state

Changes:
- Remove optional-skills/mlops/models/neutts/ (skill + CLI scaffold)
- Add tools/neutts_synth.py (standalone synthesis subprocess helper)
- Move jo.wav/jo.txt to tools/neutts_samples/ (bundled default voice)
- Refactor _generate_neutts() — uses neutts API via subprocess, no
  neutts_cli dependency, config-driven ref_audio/ref_text/model/device
- Add TTS setup to hermes_cli/setup.py (SETUP_SECTIONS, tool status)
- Update config.py defaults (ref_audio, ref_text, model, device)
This commit is contained in:
Teknium
2026-03-17 02:33:12 -07:00
committed by GitHub
parent e2e53d497f
commit d50e0711c2
15 changed files with 310 additions and 1192 deletions

View File

@@ -256,7 +256,10 @@ DEFAULT_CONFIG = {
# Voices: alloy, echo, fable, onyx, nova, shimmer
},
"neutts": {
"voice": "", # NeuTTS voice profile name (empty = use default)
"ref_audio": "", # Path to reference voice audio (empty = bundled default)
"ref_text": "", # Path to reference voice transcript (empty = bundled default)
"model": "neuphonic/neutts-air-q4-gguf", # HuggingFace model repo
"device": "cpu", # cpu, cuda, or mps
},
},

View File

@@ -479,6 +479,16 @@ def _print_setup_summary(config: dict, hermes_home):
tool_status.append(("Text-to-Speech (ElevenLabs)", True, None))
elif tts_provider == "openai" and get_env_value("VOICE_TOOLS_OPENAI_KEY"):
tool_status.append(("Text-to-Speech (OpenAI)", True, None))
elif tts_provider == "neutts":
try:
import importlib.util
neutts_ok = importlib.util.find_spec("neutts") is not None
except Exception:
neutts_ok = False
if neutts_ok:
tool_status.append(("Text-to-Speech (NeuTTS local)", True, None))
else:
tool_status.append(("Text-to-Speech (NeuTTS — not installed)", False, "run 'hermes setup tts'"))
else:
tool_status.append(("Text-to-Speech (Edge TTS)", True, None))
@@ -1571,6 +1581,163 @@ def setup_model_provider(config: dict):
save_config(config)
# Offer TTS provider selection at the end of model setup
_setup_tts_provider(config)
# =============================================================================
# Section 1b: TTS Provider Configuration
# =============================================================================
def _check_espeak_ng() -> bool:
"""Check if espeak-ng is installed."""
import shutil
return shutil.which("espeak-ng") is not None or shutil.which("espeak") is not None
def _install_neutts_deps() -> bool:
    """Install NeuTTS dependencies with user approval. Returns True on success."""
    import sys

    # --- Step 1: espeak-ng (system-level phonemizer) ---
    if not _check_espeak_ng():
        # Pick the hint text and install command for this platform; Linux is
        # the fallback (apt-based) case.
        hint = {
            "darwin": "Install with: brew install espeak-ng",
            "win32": "Install with: choco install espeak-ng",
        }.get(sys.platform, "Install with: sudo apt install espeak-ng")
        installer = {
            "darwin": ["brew", "install", "espeak-ng"],
            "win32": ["choco", "install", "espeak-ng", "-y"],
        }.get(sys.platform, ["sudo", "apt", "install", "-y", "espeak-ng"])
        print()
        print_warning("NeuTTS requires espeak-ng for phonemization.")
        print_info(hint)
        print()
        if prompt_yes_no("Install espeak-ng now?", True):
            try:
                subprocess.run(installer, check=True)
                print_success("espeak-ng installed")
            except (subprocess.CalledProcessError, FileNotFoundError) as e:
                print_warning(f"Could not install espeak-ng automatically: {e}")
                print_info("Please install it manually and re-run setup.")
                return False
        else:
            # User declined: warn, but still proceed to the pip install below
            # so espeak-ng can be added manually afterwards.
            print_warning("espeak-ng is required for NeuTTS. Install it manually before using NeuTTS.")

    # --- Step 2: neutts Python package (model weights download lazily) ---
    print()
    print_info("Installing neutts Python package...")
    print_info("This will also download the TTS model (~300MB) on first use.")
    print()
    try:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-U", "neutts[all]", "--quiet"],
            check=True, timeout=300,
        )
        print_success("neutts installed successfully")
        return True
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
        print_error(f"Failed to install neutts: {e}")
        print_info("Try manually: pip install neutts[all]")
        return False
def _prompt_tts_api_key(env_var: str, label: str, success_msg: str) -> bool:
    """Prompt for and persist an API key unless *env_var* is already set.

    Returns True when a key is available (pre-existing or newly saved),
    False when the user declined to provide one.
    """
    if get_env_value(env_var):
        return True
    print()
    api_key = prompt(label, password=True)
    if api_key:
        save_env_value(env_var, api_key)
        print_success(success_msg)
        return True
    print_warning("No API key provided. Falling back to Edge TTS.")
    return False


def _setup_tts_provider(config: dict):
    """Interactive TTS provider selection with install flow for NeuTTS.

    Presents a provider menu, runs the dependency-install flow for NeuTTS
    or the API-key capture flow for ElevenLabs/OpenAI, and persists the
    final choice to config["tts"]["provider"] (falling back to Edge TTS
    whenever setup for the chosen provider is incomplete).
    """
    tts_config = config.get("tts", {})
    current_provider = tts_config.get("provider", "edge")
    provider_labels = {
        "edge": "Edge TTS",
        "elevenlabs": "ElevenLabs",
        "openai": "OpenAI TTS",
        "neutts": "NeuTTS",
    }
    current_label = provider_labels.get(current_provider, current_provider)
    print()
    print_header("Text-to-Speech Provider (optional)")
    print_info(f"Current: {current_label}")
    print()
    choices = [
        "Edge TTS (free, cloud-based, no setup needed)",
        "ElevenLabs (premium quality, needs API key)",
        "OpenAI TTS (good quality, needs API key)",
        "NeuTTS (local on-device, free, ~300MB model download)",
        f"Keep current ({current_label})",
    ]
    idx = prompt_choice("Select TTS provider:", choices, len(choices) - 1)
    # "Keep current" is always the last entry; avoid the hard-coded index 4
    # so the menu can grow without silently breaking this early-return.
    if idx == len(choices) - 1:
        return
    providers = ["edge", "elevenlabs", "openai", "neutts"]
    selected = providers[idx]
    if selected == "neutts":
        # Only offer the install flow when the package is actually missing.
        try:
            import importlib.util
            already_installed = importlib.util.find_spec("neutts") is not None
        except Exception:
            already_installed = False
        if already_installed:
            print_success("NeuTTS is already installed")
        else:
            print()
            print_info("NeuTTS requires:")
            print_info(" • Python package: neutts (~50MB install + ~300MB model on first use)")
            print_info(" • System package: espeak-ng (phonemizer)")
            print()
            if prompt_yes_no("Install NeuTTS dependencies now?", True):
                if not _install_neutts_deps():
                    print_warning("NeuTTS installation incomplete. Falling back to Edge TTS.")
                    selected = "edge"
            else:
                print_info("Skipping install. Set tts.provider to 'neutts' after installing manually.")
                selected = "edge"
    elif selected == "elevenlabs":
        if not _prompt_tts_api_key(
            "ELEVENLABS_API_KEY",
            "ElevenLabs API key",
            "ElevenLabs API key saved",
        ):
            selected = "edge"
    elif selected == "openai":
        if not _prompt_tts_api_key(
            "VOICE_TOOLS_OPENAI_KEY",
            "OpenAI API key for TTS",
            "OpenAI TTS API key saved",
        ):
            selected = "edge"
    # Save the selection
    if "tts" not in config:
        config["tts"] = {}
    config["tts"]["provider"] = selected
    save_config(config)
    print_success(f"TTS provider set to: {provider_labels.get(selected, selected)}")
def setup_tts(config: dict):
    """Standalone TTS setup (for 'hermes setup tts').

    Thin wrapper so the TTS provider flow can be registered as its own
    entry in SETUP_SECTIONS; delegates to _setup_tts_provider.
    """
    _setup_tts_provider(config)
# =============================================================================
# Section 2: Terminal Backend Configuration
@@ -2548,6 +2715,7 @@ def _offer_openclaw_migration(hermes_home: Path) -> bool:
SETUP_SECTIONS = [
("model", "Model & Provider", setup_model_provider),
("tts", "Text-to-Speech", setup_tts),
("terminal", "Terminal Backend", setup_terminal_backend),
("gateway", "Messaging Platforms (Gateway)", setup_gateway),
("tools", "Tools", setup_tools),

View File

@@ -1,435 +0,0 @@
---
name: neutts
description: Use the local NeuTTS CLI to install NeuTTS, manage reusable voice profiles, and synthesize speech fully on-device. Best when the user wants local or offline-capable TTS instead of a hosted API.
version: 1.0.0
author: Hermes Agent + Nous Research
license: MIT
platforms: [linux, macos, windows]
metadata:
hermes:
tags: [TTS, Text-To-Speech, Local-AI, Voice-Cloning, Audio, NeuTTS]
related_skills: [whisper, audiocraft-audio-generation]
requires_toolsets: [terminal]
---
# NeuTTS - Local Text-to-Speech
Use NeuTTS through the standalone `neutts` CLI. This skill is for local speech synthesis, reusable voice profiles, and quick Hermes-driven setup inside or alongside the Hermes repository.
NeuTTS is an on-device TTS model family from Neuphonic. This skill assumes the CLI wrapper exists and Hermes should drive it via terminal commands rather than a dedicated Hermes core tool.
## When to Use
- The user wants local TTS instead of Edge/OpenAI/ElevenLabs
- The user wants voice cloning from a short reference clip
- The user wants Hermes to install or verify the `neutts` CLI scaffold
- The user wants to create or inspect saved voice profiles
- The user wants to synthesize speech to a local WAV file
- The user wants to give the agent a custom voice / persona voice
- Keywords: `neutts`, `local tts`, `voice clone`, `on-device speech`, `offline speech`, `give you a voice`, `what do you sound like`
## Quick Reference
| Command | Purpose |
|---------|---------|
| `neutts doctor` | Check local install health (includes default voice) |
| `neutts install --all` | Install upstream NeuTTS with extras |
| `neutts list-models` | Show known official model repos |
| `neutts add-voice NAME --ref-audio clip.wav --ref-text-file clip.txt` | Save a reusable voice profile |
| `neutts list-voices` | Show saved local voice profiles |
| `neutts config --default-voice NAME` | Lock in a voice as the default for all synthesis |
| `neutts config` | View current settings (model, device, default voice) |
| `neutts synth --text Hello there` | Synthesize using the default voice |
| `neutts synth --voice NAME --text Hello there` | Synthesize using a specific voice |
| `neutts synth --voice NAME --text Hello --out sample.wav` | Generate a specific WAV |
## Procedure
## First-run execution policy
For a fresh NeuTTS setup, do not do broad filesystem exploration or repeated command probing. Keep the startup path short and deterministic.
Do not infer install state from prior conversation context, memory, or the mere presence of this skill. Only say NeuTTS is installed, verified, or ready if you checked it in the current turn with live commands.
Once first-run verification is complete, do not consult memory again for this flow unless the user explicitly asks about past setup, prior voice choices, or saved preferences.
Preferred sequence:
1. Resolve the target Python interpreter first
2. Use the bootstrap helper shipped with this skill to install the bundled NeuTTS CLI scaffold into that interpreter
3. Run `doctor` via `<target-python> -m neutts_cli.cli doctor` as the primary health check
4. If `doctor` reports `neutts_installed: false`, run `install --all`
5. Re-run `doctor`
6. Run `list-voices`
7. Confirm that `jo-demo` exists
8. Only then run one verification synthesis to `~/voice-tests/neutts_verify.wav`
Definitions:
- `<target-python>` means the Python interpreter for the environment where NeuTTS should live
- `<skill-bootstrap-helper>` means the `bootstrap_neutts_cli.py` file shipped with this installed skill, usually `~/.hermes/skills/mlops/models/neutts/scripts/bootstrap_neutts_cli.py`
Bootstrap example:
```bash
<target-python> <skill-bootstrap-helper> --install-cli --sample-profile --execute --json
```
Install NeuTTS runtime:
```bash
<target-python> -m neutts_cli.cli install --all
```
Verification synthesis:
```bash
mkdir -p ~/voice-tests
<target-python> -m neutts_cli.cli synth --voice jo-demo --text "Hello from Hermes" --out ~/voice-tests/neutts_verify.wav
```
First-run gate:
- for first-run verification, `jo-demo` is required
- do not treat NeuTTS as fully ready until `list-voices` includes `jo-demo`
- do not substitute a built-in/default voice, ad hoc reference, or memory-based prior voice for `jo-demo` during first-run verification
- if bootstrap with `--sample-profile` fails, stop and report the exact failure rather than improvising around it
- treat the bootstrap helper as the source of truth for bundled sample assets; do not manually inspect random skill directories looking for replacements before reporting the failure
Short-circuit rules:
- do not use `command -v neutts` or wrapper presence as the primary health gate; prefer `<target-python> -m neutts_cli.cli doctor` first
- if the `neutts` wrapper is missing, that alone does not mean the CLI module is unusable; check the module path before concluding anything
- if `neutts doctor` exits with code `127`, treat that as "CLI missing" and bootstrap immediately
- after a `127` from `neutts doctor`, do not run `neutts list-voices` until bootstrap is complete
- do not talk about memory unless it materially changes what you do next
- keep the Python interpreter consistent for the whole workflow; do not mix bare `python`, `/usr/bin/python`, and a target virtualenv interpreter
- prefer `<target-python> -m neutts_cli.cli ...` until the `neutts` wrapper is confirmed present in that same interpreter
- prefer the bootstrap helper bundled with this skill; treat the bundled scaffold as the only supported bootstrap source
- when the bootstrap helper path is already known, do not probe repo-local scaffolds first; run bootstrap directly
- if `list-voices` does not include `jo-demo` after bootstrap, fix that specific problem before attempting synthesis or voice design
Avoid:
- broad `find *neutts*` scans
- repeated checks for hardcoded executable paths
- wrapper-first health checks when the module path is available
- reading repo files like `pyproject.toml` unless bootstrap or install fails
- multiple failed synthesis attempts before running `neutts install --all`
- telling the user NeuTTS is already installed or verified unless `neutts doctor` succeeded in the current turn
- invoking the bootstrap helper with a different Python than the one you plan to use for `doctor`, `install`, `list-voices`, and `synth`
- extra repo-path probing when the bundled bootstrap helper is already available
- using a built-in/default voice as a substitute for the missing `jo-demo` baseline during first-run verification
- consulting memory or searching elsewhere for old voice profiles during first-run bootstrap
### 1. Locate or install the NeuTTS CLI
The bootstrap helper shipped with this skill is the preferred install path because it carries a bundled NeuTTS CLI scaffold and does not require a specific Hermes repo layout.
The helper installs the bundled CLI scaffold with `pip install --no-build-isolation -e ...` so it can work cleanly in environments without network access during the editable install step.
```bash
<target-python> <skill-bootstrap-helper> --install-cli --sample-profile --execute --json
```
Then verify:
```bash
<target-python> -m neutts_cli.cli doctor
```
If `neutts --help` or `neutts doctor` fails, treat NeuTTS as not yet ready and continue with bootstrap or install instead of summarizing it as already working.
If the skill needs help previewing the bootstrap plan without executing it, use:
```bash
<target-python> <skill-bootstrap-helper> --json
```
To actually perform the bootstrap steps instead of only printing them:
```bash
<target-python> <skill-bootstrap-helper> --install-cli --sample-profile --execute --json
```
The helper uses the bundled skill assets as the source of truth. Use the same `<target-python>` for bootstrap, `doctor`, `install`, `list-voices`, and `synth`.
After bootstrap, explicitly confirm that `list-voices` includes `jo-demo`. If it does not, stop and report that the bundled sample-profile creation failed. Do not continue into synthesis, public-domain sourcing, or memory-based recovery.
### 2. Install NeuTTS itself
```bash
<target-python> -m neutts_cli.cli install --all
```
This installs the upstream `neutts` package into the active Python environment. For quick CPU-only verification, `--all` is acceptable; if the user wants a slimmer setup, use `--onnx` or `--llama` as appropriate.
### Fresh setup fallback
If `neutts doctor` shows `neutts_installed: false`, or an early synthesis attempt fails because the NeuTTS runtime is missing, immediately run:
```bash
<target-python> -m neutts_cli.cli install --all
```
Then re-run:
```bash
<target-python> -m neutts_cli.cli doctor
<target-python> -m neutts_cli.cli list-voices
```
For a first-time verification, prefer a stable user-visible output path:
```bash
mkdir -p ~/voice-tests
<target-python> -m neutts_cli.cli synth --voice jo-demo --text "Hello from Hermes" --out ~/voice-tests/neutts_verify.wav
```
For a fresh setup, `jo-demo` is the expected baseline profile if the sample bootstrap step was run.
If `jo-demo` is missing after bootstrap, that is a setup failure, not a cue to switch to another voice source. Report it clearly and stop there.
### 3. Collaborative voice design (proactive)
**When to trigger:** If the user asks you to speak, use TTS, or send a voice message and you have no custom voice profile yet (check `neutts list-voices`), proactively offer to design a voice together before falling back to the default `jo-demo` profile.
**Interaction style:** Prefer short clarify prompts with concrete options over long open-ended questions. Use multiple-choice first to keep momentum high, then switch to freeform only when you need details the user must supply.
**Clarify timeout policy:** If a clarify prompt times out, do not start new sourcing, downloading, or transformation work unless the timed-out question was explicitly about accepting a recommendation you already made. If you must continue after timeout, say so plainly and pick the safest default:
- default to your recommendation when the timeout happened on `Go with my recommendation` vs `Something else`
- otherwise stop and ask again in plain language rather than silently making a bigger decision
**Preferred first clarify after verification:**
- Keep demo voice
- Create a voice for me
- Set default voice
- Just test synthesis
**If the user chooses to create a voice for the assistant, first give one concise recommendation sentence.** Make it personal and relationship-aware. Prefer wording like:
> "Based on what I know about you, I'd make my voice warm, grounded, and a little nerdy rather than polished narrator-clean."
Then prefer a binary clarify prompt:
- Go with my recommendation
- Something else
**If the user chooses `Something else`, prefer one short follow-up clarify for direction:**
- Warm and grounded
- Bright and energetic
- Calm and precise
- Distinct / separate persona
After the user picks a direction, prefer a second short clarify for how to source the reference:
- Find public-domain clips for me
- I'll give you a clip path and transcript
Default to doing the heavy lifting yourself. The first option should be presented as the default path whenever possible.
If the user chooses `Find public-domain clips for me`, take responsibility for the search and present a small curated set of promising 3-15 second candidates instead of pushing the work back onto the user immediately.
Use a constrained sourcing workflow:
- prefer the built-in web or browser tools for search and page inspection
- prefer a small set of trusted public-domain sources such as LibriVox and Project Gutenberg recordings when available
- do not call unavailable or speculative tools such as `web_search`; use only tools that are actually present in the environment
- do not use ad hoc Python scraping with `requests`, `bs4`, or one-off parsing scripts for clip discovery unless the user explicitly asked for that style of debugging
- do not bounce across many search methods in one turn
- stop at 3 strong candidates maximum
If the first sourcing method fails, use one fallback method only. If that also fails, stop and ask the user whether they want you to keep searching later or provide a clip path directly. Do not continue thrashing through more tools.
If a clarify timed out earlier in the same branch, do not interpret that as permission to begin sourcing or downloading on your own unless the timed-out choice was specifically approval to follow your recommendation.
When presenting sourced candidates in a clarify menu, put the short description directly in each option label instead of listing bare names only. Prefer compact labels like:
- Mark Nelson - friendly nerdy storyteller
- Adrian Praetzellis - warm professor energy
- Peter Yearsley - calm precise British
- Show me more options
Keep the summary above the menu brief. The menu itself should carry most of the distinction between options so the user can decide at a glance.
When sourcing succeeds, present at most 3 candidates and move straight to selection. Do not keep exploring once you already have enough viable options.
That means:
1. present candidates
2. get the user's candidate choice
3. immediately ask `Use this source` or `Show me another`
4. only after `Use this source`, begin download, clipping, transcript lookup, or transcription
Do not download audio, fetch source text, or prepare clips before that confirmation step.
After the user selects a candidate source voice, use one short confirmation prompt before downloading, clipping, or transcribing:
- Use this source
- Show me another
This confirmation is mandatory. Do not start clip extraction or transcription work until the user confirms the source, unless the timed-out clarify was specifically approval to follow your recommendation.
For clip preparation, prefer a temporary workspace such as `/tmp/neutts-voice-reference` rather than writing into `~/.hermes/` or another durable user directory by default.
For transcripts, prefer source text over STT whenever the material comes from LibriVox, Project Gutenberg, or another public-domain reading with matching text available. Use Whisper or other STT only as a fallback when matching source text is not readily available.
If transcript extraction fails once, stop and ask whether to try another clip instead of retrying blindly through multiple transcription attempts.
Before creating the voice profile, verify the final transcript once for obvious shell artifacts, prompt text, or mismatched lines. Fix the transcript file first, then run `add-voice`. Do not create a profile and patch it afterward as the normal path.
In the normal path, create the intended final voice name directly. Do not create duplicate workaround names like `atom2` unless the user explicitly asked for variants or you are preserving two intentionally different voices.
Do not manually edit `voice.json` as part of the standard workflow. Only treat direct metadata edits as a last-resort recovery step after you have clearly explained the problem and simpler CLI-based fixes failed.
If the user chooses `I'll give you a clip path and transcript`, ask only for the required freeform inputs:
- reference audio path
- transcript
Frame this as creating or refining the agent's own voice for the user-facing relationship. Prefer wording like "create a voice for me", "design my voice", or "make me sound like X" over generic phrases like "create a custom voice" unless the user used that wording first.
**How to approach it:** Be conversational and opinionated, not a questionnaire. You know the user — draw on what you know about them, your relationship, the platform you're on, and who you are as an agent. Lead with your own take on what voice would fit, then invite their input.
The value proposition is agent identity, not generic TTS setup. Default to language that treats the voice as the assistant's voice in the relationship with the user.
**Framework:**
1. **Open with your perspective.** Reflect briefly on who you are to the user (cognitive partner, assistant, creative collaborator, etc.) and what kind of voice would match that dynamic. Share a concrete suggestion — don't be generic.
2. **Describe the vibe, not just parameters.** Instead of "select a pitch range," paint a picture: warm and grounded, bright and energetic, calm and steady, playful with an edge. Use language that conveys personality, not spec sheets.
3. **Ask open-ended questions.** Cover these dimensions naturally in conversation (not as a numbered list unless the user seems unsure):
- Register / feel: lower and grounded, higher and bright, something neutral
- Tone: calm, energetic, warm, precise, playful
- Similarity to the user: close to their own voice, or distinctly different
- Any specific voices they like or want to approximate
4. **Take on the sourcing work by default.** NeuTTS voice cloning needs a reference audio clip (3-15 seconds, mono WAV preferred) plus a transcript of what the clip says. By default, offer to go find public-domain reference clips yourself and narrow them down for the user. Only ask the user for a local clip path and transcript if they choose that route or already have one ready.
5. **Iterate if needed.** After the first synthesis, ask if the voice feels right or if they want to try a different reference. Voice design is subjective — treat it as a collaborative process, not a one-shot.
**Example opener** (adapt to your actual persona and relationship with the user):
> "So if I'm going to talk to you, let me think about what I should actually sound like... I'm your [role] — the one who [what you do together]. I'm thinking something [concrete vibe description]. I can go find a few strong public-domain reference clips for us, or if you already have a clip you want me to use, you can point me to it."
**After the user provides a reference clip:**
```bash
neutts add-voice AGENT_NAME --ref-audio /path/to/clip.wav --ref-text-file /path/to/transcript.txt --language en
neutts synth --voice AGENT_NAME --text "Here's what I sound like now — what do you think?" --out ./voice_test.wav
```
Send the test WAV to the user and ask for feedback before considering the voice finalized.
Do not auto-play the generated audio locally as part of the standard flow. Report the output path clearly so the user can choose whether to play it.
**Locking in the voice:**
Once the user approves the voice, set it as the default so all future synthesis uses it automatically — no `--voice` flag needed:
```bash
neutts config --default-voice AGENT_NAME
```
Confirm the lock-in to the user. Let them know:
- This voice will be used automatically whenever you speak from now on
- They can change it anytime (`neutts config --default-voice OTHER_NAME`)
- They can check what's set with `neutts config`
Offer next steps naturally, like Atom's approach: suggest sending a longer voice note, tweaking the style, or just moving on — don't make it feel like a configuration wizard that just completed.
### 4. Add a voice profile manually
If skipping the collaborative flow, or adding a voice from a known reference:
If working from this repo, a sample profile can be bootstrapped automatically:
```bash
python optional-skills/mlops/models/neutts/scripts/bootstrap_neutts_cli.py --repo-root . --install-cli --sample-profile --json
```
Add `--execute` to actually run those commands.
Or add one manually:
```bash
neutts add-voice demo --ref-audio ./samples/voice.wav --ref-text-file ./samples/voice.txt --language en
```
Reference guidelines:
- mono WAV preferred
- 3 to 15 seconds is ideal
- transcript should match the reference audio closely
- use same-language references for best multilingual results
### 5. Synthesize speech
For a quick smoke test:
```bash
neutts synth --voice demo --text Hello from Hermes
```
For a named output file:
```bash
neutts synth --voice demo --text This is a local NeuTTS test --out ./speech.wav
```
### 6. Report results clearly
After running synthesis:
- confirm the output path
- note whether a saved voice profile or ad-hoc reference was used
- mention any warnings from NeuTTS, but do not treat watermark warnings as a hard failure
- after verification, prefer a short clarify prompt with concrete next-step options instead of a long open-ended paragraph
- when offering voice creation, phrase it as creating the assistant's voice for the user, not as a generic custom voice feature
- if verification did not happen in the current turn, explicitly say that instead of implying the environment is already ready
- do not perform risky or noisy cleanup commands in the normal success path; temporary files can simply be left in `/tmp` unless the user asked for cleanup
## Memory
- do not save memory for routine install or verification runs
- only save memory if the user established a durable voice preference, approved a default voice, or a non-trivial workaround/fix was required
- if you save memory for this flow, do it once at the very end after the voice is finalized or set as default
- do not do intermediate memory writes during setup, sourcing, clip prep, or testing
- if memory save fails or memory is full, do not thrash through retries; either skip it or replace a single clearly related prior NeuTTS entry once
## Pitfalls
- `neutts synth` needs either `--voice` or both `--ref-audio` and `--ref-text`
- The first synthesis call can be slow because models need to load
- `llama-cpp-python` acceleration is platform-specific and may require custom build flags
- `doctor` may show `ffmpeg` missing; that does not block WAV synthesis
- The upstream NeuTTS package may emit Perth watermark warnings; these are informational unless the user explicitly needs watermarking
- If the `neutts` command is missing after install, ensure the active virtualenv is the same environment where the editable package was installed
- transcript files can pick up shell artifacts if written carelessly; verify them before `add-voice`
- avoid duplicate profile-name workarounds and direct `voice.json` edits in the normal path
## Verification
Use this sequence:
```bash
neutts doctor
neutts list-voices
neutts synth --voice jo-demo --text Hello from Hermes --out ./verify.wav
```
Success means:
- `doctor` shows `neutts_installed: true`
- `list-voices` includes the expected profile
- synthesis completes and writes a WAV file
## References
- NeuTTS upstream: https://github.com/neuphonic/neutts
- Bundled NeuTTS CLI scaffold: `assets/neutts-cli`
- Skill bootstrap helper: `optional-skills/mlops/models/neutts/scripts/bootstrap_neutts_cli.py`

View File

@@ -1,55 +0,0 @@
# NeuTTS CLI
Small standalone CLI for installing, checking, and running [NeuTTS](https://github.com/neuphonic/neutts) locally.
This scaffold is designed to be a good fit for a future Hermes optional skill:
- predictable commands
- machine-friendly output for inspection
- local voice profile management
- direct local synthesis
## Commands
```bash
neutts install --all
neutts doctor
neutts list-models
neutts add-voice demo --ref-audio ./samples/jo.wav --ref-text-file ./samples/jo.txt
neutts list-voices
neutts synth --voice demo --text Hello from NeuTTS --out ./out.wav
neutts synth --voice demo --text Quick smoke test
```
## Install the bundled scaffold
```bash
cd optional-skills/mlops/models/neutts/assets/neutts-cli
python -m pip install -e .
```
## Add the bundled sample profile
This skill bundles an upstream NeuTTS sample reference in `samples/`.
```bash
cd optional-skills/mlops/models/neutts/assets/neutts-cli
PYTHONPATH=src python -m neutts_cli.cli add-voice jo-demo \
--ref-audio ./samples/jo.wav \
--ref-text-file ./samples/jo.txt \
--language en
```
Then inspect it with:
```bash
PYTHONPATH=src python -m neutts_cli.cli list-voices
```
## Notes
- `install` installs the upstream `neutts` package into the current Python environment.
- `list-voices` shows local voice profiles created with `add-voice`.
- `synth` uses NeuTTS reference cloning. A voice profile is just a saved reference audio/text pair.
- `synth` accepts quoted or unquoted text and defaults to `./out.wav` when `--out` is omitted.
- GGUF / `llama-cpp-python` acceleration can vary by platform, so the CLI prints follow-up guidance instead of forcing one build recipe.

View File

@@ -1,24 +0,0 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "neutts-cli"
version = "0.1.0"
description = "Standalone CLI for installing and running NeuTTS locally"
readme = "README.md"
requires-python = ">=3.10"
license = {text = "MIT"}
authors = [
{name = "Hermes Agent Contributors"}
]
dependencies = []
[project.scripts]
neutts = "neutts_cli.cli:main"
[tool.setuptools]
package-dir = {"" = "src"}
[tool.setuptools.packages.find]
where = ["src"]

View File

@@ -1,3 +0,0 @@
__all__ = ["__version__"]
__version__ = "0.1.0"

View File

@@ -1,26 +0,0 @@
from __future__ import annotations
import wave
from pathlib import Path
def write_wav(path: str | Path, samples, sample_rate: int) -> Path:
    """Encode *samples* (floats in [-1, 1]) as mono 16-bit PCM WAV at *path*.

    Creates the parent directory on demand and returns the resolved
    output path. Values outside [-1, 1] are clamped before scaling.
    """
    target = Path(path).expanduser().resolve()
    target.parent.mkdir(parents=True, exist_ok=True)
    try:
        import numpy as np
    except ImportError as exc:
        raise RuntimeError("numpy is required to write NeuTTS audio output") from exc
    # Flatten, clamp to the valid float range, then scale to int16 frames.
    mono = np.clip(np.asarray(samples, dtype=np.float32).flatten(), -1.0, 1.0)
    frames = (mono * 32767.0).astype(np.int16).tobytes()
    with wave.open(str(target), "wb") as handle:
        handle.setnchannels(1)  # mono
        handle.setsampwidth(2)  # 16-bit
        handle.setframerate(sample_rate)
        handle.writeframes(frames)
    return target

View File

@@ -1,204 +0,0 @@
from __future__ import annotations
import argparse
import json
import sys
from .config import AppConfig
from .core import (
KNOWN_MODELS,
doctor_report,
list_voices,
load_voice,
platform_notes,
run_install,
save_voice,
synthesize,
)
def build_parser() -> argparse.ArgumentParser:
    """Assemble the top-level argument parser and every subcommand."""
    root = argparse.ArgumentParser(
        description="Standalone CLI for local NeuTTS workflows"
    )
    commands = root.add_subparsers(dest="command", required=True)

    install = commands.add_parser(
        "install", help="Install NeuTTS into the current Python environment"
    )
    install.add_argument(
        "--llama",
        action="store_true",
        help="Install llama-cpp-python support via neutts[llama]",
    )
    install.add_argument(
        "--onnx",
        action="store_true",
        help="Install ONNX decoder support via neutts[onnx]",
    )
    install.add_argument(
        "--all", action="store_true", help="Install all upstream NeuTTS extras"
    )
    install.add_argument(
        "--dry-run",
        action="store_true",
        help="Print the install command without running it",
    )

    # Read-only inspection commands take no extra arguments.
    commands.add_parser("doctor", help="Inspect NeuTTS CLI environment")
    commands.add_parser(
        "list-models", help="Show known official NeuTTS model repositories"
    )
    commands.add_parser("list-voices", help="Show local voice profiles")

    add_voice = commands.add_parser(
        "add-voice", help="Save a local voice profile from a reference sample"
    )
    add_voice.add_argument("name", help="Voice profile name")
    add_voice.add_argument("--ref-audio", required=True, help="Reference WAV file")
    add_voice.add_argument(
        "--ref-text", help="Transcript text for the reference audio"
    )
    add_voice.add_argument(
        "--ref-text-file",
        help="Path to a text file containing the reference transcript",
    )
    add_voice.add_argument(
        "--language", default="unknown", help="Optional language tag"
    )

    synth = commands.add_parser("synth", help="Synthesize speech to a WAV file")
    synth.add_argument("--text", nargs="+", required=True, help="Text to synthesize")
    synth.add_argument("--voice", help="Saved voice profile name")
    synth.add_argument(
        "--ref-audio", help="Reference audio path when not using --voice"
    )
    synth.add_argument(
        "--ref-text", help="Reference transcript when not using --voice"
    )
    synth.add_argument("--out", default="out.wav", help="Output WAV file path")

    config_cmd = commands.add_parser(
        "config", help="View or update default synthesis settings"
    )
    config_cmd.add_argument("--backbone-repo")
    config_cmd.add_argument("--backbone-device")
    config_cmd.add_argument("--codec-repo")
    config_cmd.add_argument("--codec-device")
    config_cmd.add_argument("--sample-rate", type=int)
    config_cmd.add_argument(
        "--default-voice",
        help="Voice profile name to use when --voice is omitted from synth",
    )
    return root
def _read_ref_text(args: argparse.Namespace) -> str:
if args.ref_text:
return args.ref_text.strip()
if args.ref_text_file:
with open(args.ref_text_file, "r", encoding="utf-8") as handle:
return handle.read().strip()
raise ValueError("Provide either --ref-text or --ref-text-file")
def _normalize_text_arg(value: str | list[str]) -> str:
if isinstance(value, list):
return " ".join(value).strip()
return value.strip()
def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Parses *argv* (defaults to sys.argv), dispatches to the matching
    command handler, and prints JSON results to stdout.

    Returns a process exit code: 0 on success, 1 for any handled
    exception (message on stderr), 2 for an unknown command.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    try:
        if args.command == "install":
            commands = run_install(args.llama, args.onnx, args.all, args.dry_run)
            print(
                json.dumps(
                    {
                        "commands": commands,
                        "notes": platform_notes(),
                        "dry_run": args.dry_run,
                    },
                    indent=2,
                )
            )
            return 0
        if args.command == "doctor":
            print(json.dumps(doctor_report(), indent=2))
            return 0
        if args.command == "list-models":
            print(json.dumps(KNOWN_MODELS, indent=2))
            return 0
        if args.command == "list-voices":
            profiles = [profile.__dict__ for profile in list_voices()]
            print(json.dumps(profiles, indent=2))
            return 0
        if args.command == "add-voice":
            metadata_path = save_voice(
                name=args.name,
                ref_audio=args.ref_audio,
                ref_text=_read_ref_text(args),
                language=args.language,
            )
            # Re-load the saved profile so the printed paths reflect disk state.
            profile = load_voice(args.name)
            print(
                json.dumps(
                    {"saved": str(metadata_path), "voice": profile.__dict__}, indent=2
                )
            )
            return 0
        if args.command == "synth":
            output = synthesize(
                text=_normalize_text_arg(args.text),
                out=args.out,
                voice=args.voice,
                ref_audio=args.ref_audio,
                ref_text=args.ref_text,
            )
            print(json.dumps({"output": str(output)}, indent=2))
            return 0
        if args.command == "config":
            config = AppConfig.load()
            changed = False
            # Apply any provided overrides; only persist when something changed.
            for field in (
                "backbone_repo",
                "backbone_device",
                "codec_repo",
                "codec_device",
                "sample_rate",
                "default_voice",
            ):
                value = getattr(args, field, None)
                if value is not None:
                    setattr(config, field, value)
                    changed = True
            if changed:
                config.save()
            print(json.dumps(config.__dict__, indent=2))
            return 0
        # Unreachable with required=True subparsers, kept as a safety net.
        parser.error(f"Unknown command: {args.command}")
        return 2
    except Exception as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -1,67 +0,0 @@
from __future__ import annotations
import json
import os
from dataclasses import dataclass
from pathlib import Path
DEFAULT_BACKBONE = "neuphonic/neutts-nano"
DEFAULT_CODEC = "neuphonic/neucodec"
DEFAULT_SAMPLE_RATE = 24000
def app_home() -> Path:
    """Root directory for neutts-cli state (overridable via NEUTTS_CLI_HOME)."""
    custom = os.getenv("NEUTTS_CLI_HOME")
    return Path(custom).expanduser() if custom else Path.home() / ".neutts-cli"


def config_path() -> Path:
    """Location of the persisted JSON config file."""
    return app_home() / "config.json"


def voices_dir() -> Path:
    """Directory that holds saved voice profiles."""
    return app_home() / "voices"
@dataclass
class AppConfig:
    """Persisted defaults for synthesis, stored as JSON at config_path().

    Missing keys in the file fall back to the defaults below; unknown
    keys are ignored on load.
    """

    backbone_repo: str = DEFAULT_BACKBONE  # HF repo for the TTS backbone
    backbone_device: str = "cpu"  # device the backbone runs on
    codec_repo: str = DEFAULT_CODEC  # HF repo for the neural codec
    codec_device: str = "cpu"  # device the codec runs on
    sample_rate: int = DEFAULT_SAMPLE_RATE  # output WAV sample rate (Hz)
    default_voice: str | None = None  # profile used when --voice is omitted

    @classmethod
    def load(cls) -> "AppConfig":
        """Read the config from disk, returning pure defaults when absent."""
        path = config_path()
        if not path.exists():
            return cls()
        data = json.loads(path.read_text(encoding="utf-8"))
        return cls(
            backbone_repo=data.get("backbone_repo", DEFAULT_BACKBONE),
            backbone_device=data.get("backbone_device", "cpu"),
            codec_repo=data.get("codec_repo", DEFAULT_CODEC),
            codec_device=data.get("codec_device", "cpu"),
            sample_rate=int(data.get("sample_rate", DEFAULT_SAMPLE_RATE)),
            # Normalize empty string to None so truthiness checks work.
            default_voice=data.get("default_voice") or None,
        )

    def save(self) -> Path:
        """Write the current settings to config_path(), creating app_home() if needed.

        Returns the path written.
        """
        home = app_home()
        home.mkdir(parents=True, exist_ok=True)
        path = config_path()
        payload = {
            "backbone_repo": self.backbone_repo,
            "backbone_device": self.backbone_device,
            "codec_repo": self.codec_repo,
            "codec_device": self.codec_device,
            "sample_rate": self.sample_rate,
            "default_voice": self.default_voice,
        }
        path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
        return path

View File

@@ -1,197 +0,0 @@
from __future__ import annotations
import importlib
import importlib.util
import json
import platform
import shutil
import subprocess
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
from .audio import write_wav
from .config import AppConfig, app_home, voices_dir
KNOWN_MODELS = [
"neuphonic/neutts-air",
"neuphonic/neutts-air-q8-gguf",
"neuphonic/neutts-air-q4-gguf",
"neuphonic/neutts-nano",
"neuphonic/neutts-nano-q8-gguf",
"neuphonic/neutts-nano-q4-gguf",
"neuphonic/neutts-nano-french",
"neuphonic/neutts-nano-german",
"neuphonic/neutts-nano-spanish",
]
@dataclass
class VoiceProfile:
    """A saved reference-cloning voice: an audio sample plus its transcript."""

    name: str  # profile name (directory name under voices_dir())
    ref_audio: str  # path to the stored reference WAV
    ref_text: str  # transcript of the reference audio
    language: str = "unknown"  # optional language tag
def is_module_available(module_name: str) -> bool:
    """Return True when *module_name* is importable, without importing it."""
    spec = importlib.util.find_spec(module_name)
    return spec is not None
def run_install(
    include_llama: bool, include_onnx: bool, include_all: bool, dry_run: bool
) -> list[str]:
    """Build (and, unless *dry_run*, execute) the pip command installing NeuTTS.

    Extras are chosen from the flags; --all wins over individual extras.
    Returns the rendered command line(s) for display.
    """
    if include_all:
        extras = ["all"]
    else:
        extras = [
            name
            for enabled, name in ((include_llama, "llama"), (include_onnx, "onnx"))
            if enabled
        ]
    requirement = f"neutts[{','.join(extras)}]" if extras else "neutts"
    command = [sys.executable, "-m", "pip", "install", "-U", requirement]
    rendered = " ".join(command)
    if not dry_run:
        subprocess.run(command, check=True)
    return [rendered]
def platform_notes() -> list[str]:
    """Platform-specific follow-up guidance for GGUF acceleration installs."""
    notes_by_system = {
        "Darwin": [
            "For Apple Silicon GGUF acceleration, install the llama extra with BLAS/Accelerate flags.",
            "See the upstream NeuTTS README for the recommended CMAKE_ARGS invocation.",
        ],
        "Linux": [
            "For GGUF acceleration on Linux, install OpenBLAS and then reinstall the llama extra with matching CMAKE_ARGS.",
        ],
        "Windows": [
            "For GGUF acceleration on Windows, install OpenBLAS first and then install the llama extra from PowerShell with CMAKE_ARGS set.",
        ],
    }
    return notes_by_system.get(platform.system(), [])
def doctor_report() -> dict:
    """Snapshot of the local environment for the `doctor` command.

    Reports interpreter/platform info, the loaded config, which optional
    runtime modules are importable, and how many voice profiles exist.
    """
    voices_root = voices_dir()
    profile_count = (
        len(list(voices_root.glob("*/voice.json"))) if voices_root.exists() else 0
    )
    config = AppConfig.load()
    return {
        "python": sys.version.split()[0],
        "platform": platform.platform(),
        "app_home": str(app_home()),
        "config": asdict(config),
        "neutts_installed": is_module_available("neutts"),
        "numpy_installed": is_module_available("numpy"),
        "onnxruntime_installed": is_module_available("onnxruntime"),
        "llama_cpp_installed": is_module_available("llama_cpp"),
        "ffmpeg_in_path": shutil.which("ffmpeg") is not None,
        "voice_profiles": profile_count,
        "default_voice": config.default_voice,
    }
def save_voice(
    name: str, ref_audio: str, ref_text: str, language: str = "unknown"
) -> Path:
    """Persist a voice profile under voices_dir()/<name>.

    Copies the reference audio next to a reference.txt transcript and a
    voice.json metadata file. Returns the path to the metadata file.

    Raises:
        FileNotFoundError: if *ref_audio* does not exist.
    """
    source_audio = Path(ref_audio).expanduser().resolve()
    if not source_audio.exists():
        raise FileNotFoundError(f"Reference audio not found: {source_audio}")
    destination = voices_dir() / name
    destination.mkdir(parents=True, exist_ok=True)
    audio_target = destination / source_audio.name
    text_target = destination / "reference.txt"
    metadata_target = destination / "voice.json"
    # Skip the copy when the source already lives inside the profile dir.
    if audio_target.resolve() != source_audio:
        if audio_target.exists():
            audio_target.unlink()
        audio_target.write_bytes(source_audio.read_bytes())
    if text_target.exists():
        text_target.unlink()
    text_target.write_text(ref_text.strip() + "\n", encoding="utf-8")
    profile = VoiceProfile(
        name=name,
        ref_audio=str(audio_target),
        ref_text=ref_text.strip(),
        language=language,
    )
    metadata_target.write_text(
        json.dumps(asdict(profile), indent=2) + "\n", encoding="utf-8"
    )
    return metadata_target
def load_voice(name: str) -> VoiceProfile:
    """Load the saved profile *name*; raise FileNotFoundError when absent."""
    metadata = voices_dir() / name / "voice.json"
    if not metadata.exists():
        raise FileNotFoundError(f"Voice profile not found: {name}")
    return VoiceProfile(**json.loads(metadata.read_text(encoding="utf-8")))
def list_voices() -> list[VoiceProfile]:
    """Return every saved voice profile, ordered by metadata path."""
    root = voices_dir()
    if not root.exists():
        return []
    return [
        VoiceProfile(**json.loads(meta.read_text(encoding="utf-8")))
        for meta in sorted(root.glob("*/voice.json"))
    ]
def synthesize(
    text: str,
    out: str,
    voice: str | None = None,
    ref_audio: str | None = None,
    ref_text: str | None = None,
) -> Path:
    """Synthesize *text* to a WAV file using NeuTTS reference cloning.

    Either *voice* (a saved profile name) or both *ref_audio* and
    *ref_text* must resolve; when neither is given, the configured
    default voice is used if one is set. Returns the written path.

    Raises:
        ValueError: when text is empty or no reference is resolvable.
        RuntimeError: when the upstream `neutts` package is missing.
    """
    if not text.strip():
        raise ValueError("Input text is required")
    # Fall back to the configured default voice when no voice is specified
    if not voice and not ref_audio:
        config = AppConfig.load()
        if config.default_voice:
            voice = config.default_voice
    if voice:
        profile = load_voice(voice)
        ref_audio = profile.ref_audio
        ref_text = profile.ref_text
    if not ref_audio or not ref_text:
        raise ValueError("Provide either --voice or both --ref-audio and --ref-text")
    if not is_module_available("neutts"):
        raise RuntimeError("NeuTTS is not installed. Run 'neutts install' first.")
    # Import lazily so commands like doctor/config work without neutts.
    neu_module = importlib.import_module("neutts")
    NeuTTS = getattr(neu_module, "NeuTTS")
    config = AppConfig.load()
    tts = NeuTTS(
        backbone_repo=config.backbone_repo,
        backbone_device=config.backbone_device,
        codec_repo=config.codec_repo,
        codec_device=config.codec_device,
    )
    ref_codes = tts.encode_reference(ref_audio)
    wav = tts.infer(text, ref_codes, ref_text)
    return write_wav(out, wav, config.sample_rate)

View File

@@ -1,168 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import shlex
import subprocess
import sys
from pathlib import Path
SCRIPT_DIR = Path(__file__).resolve().parent
SKILL_DIR = SCRIPT_DIR.parent
BUNDLED_CLI_DIR = SKILL_DIR / "assets" / "neutts-cli"
def _quote(path: Path) -> str:
return shlex.quote(str(path))
def _quote_text(value: str) -> str:
return shlex.quote(value)
def find_cli_dir() -> tuple[Path, str]:
    """Locate the NeuTTS CLI scaffold; only the bundled copy is supported."""
    if not BUNDLED_CLI_DIR.exists():
        raise FileNotFoundError(
            "NeuTTS CLI scaffold not found in bundled skill assets."
        )
    return BUNDLED_CLI_DIR, "bundled"
def build_commands(
    cli_dir: Path,
    install_cli: bool,
    sample_profile: bool,
    python_executable: str,
) -> list[str]:
    """Render the shell commands for the requested bootstrap steps.

    Always includes a `doctor` check; optionally prepends an editable
    pip install of the scaffold and appends creation of the bundled
    jo-demo sample profile.

    Raises:
        FileNotFoundError: when the bundled sample assets are missing.
    """
    commands: list[str] = []
    module_runner = f"{_quote_text(python_executable)} -m neutts_cli.cli"
    if install_cli:
        commands.append(
            f"{_quote_text(python_executable)} -m pip install --no-build-isolation -e {_quote(cli_dir)}"
        )
        commands.append(f"{module_runner} doctor")
    else:
        # Without --install-cli, assume the `neutts` entry point is on PATH.
        commands.append("neutts doctor")
    if sample_profile:
        sample_audio = cli_dir / "samples" / "jo.wav"
        sample_text = cli_dir / "samples" / "jo.txt"
        if not sample_audio.exists() or not sample_text.exists():
            raise FileNotFoundError(
                "Sample profile files are missing from bundled skill assets."
            )
        commands.append(
            " ".join(
                [
                    f"{module_runner if install_cli else 'neutts'} add-voice jo-demo",
                    f"--ref-audio {_quote(sample_audio)}",
                    f"--ref-text-file {_quote(sample_text)}",
                    "--language en",
                ]
            )
        )
    return commands
def maybe_run(commands: list[str], workdir: Path, execute: bool) -> list[dict]:
    """Execute *commands* in *workdir*, or only record them when execute=False.

    Each result dict records the command and whether it ran; executed
    entries also carry the return code and stripped stdout/stderr.
    Execution stops after the first failing command.
    """
    outcomes: list[dict] = []
    for command in commands:
        if not execute:
            outcomes.append({"command": command, "executed": False})
            continue
        proc = subprocess.run(
            shlex.split(command),
            cwd=str(workdir),
            text=True,
            capture_output=True,
            check=False,
        )
        outcomes.append(
            {
                "command": command,
                "executed": True,
                "returncode": proc.returncode,
                "stdout": proc.stdout.strip(),
                "stderr": proc.stderr.strip(),
            }
        )
        if proc.returncode != 0:
            break
    return outcomes
def main() -> int:
    """Entry point: plan (and optionally execute) NeuTTS CLI bootstrap steps.

    Prints either a JSON payload (--json) or a human-readable summary;
    both include suggested next steps. Always returns 0 — individual
    command failures are reported in the results, not via exit code.
    """
    parser = argparse.ArgumentParser(
        description="Bootstrap the standalone NeuTTS CLI for Hermes skill usage"
    )
    parser.add_argument(
        "--repo-root",
        default=".",
        help="Working directory used when executing bootstrap commands",
    )
    parser.add_argument(
        "--install-cli",
        action="store_true",
        help="Install the standalone NeuTTS CLI in editable mode",
    )
    parser.add_argument(
        "--sample-profile",
        action="store_true",
        help="Add the bundled jo-demo sample profile",
    )
    parser.add_argument(
        "--execute", action="store_true", help="Actually run the generated commands"
    )
    parser.add_argument(
        "--json", action="store_true", help="Print machine-readable JSON output"
    )
    args = parser.parse_args()
    repo_root = Path(args.repo_root).expanduser().resolve()
    cli_dir, cli_source = find_cli_dir()
    commands = build_commands(
        cli_dir, args.install_cli, args.sample_profile, sys.executable
    )
    # Fall back to the current directory when --repo-root does not exist.
    workdir = repo_root if repo_root.exists() else Path.cwd()
    results = maybe_run(commands, workdir, args.execute)
    payload = {
        "python_executable": sys.executable,
        "repo_root": str(repo_root),
        "workdir": str(workdir),
        "cli_dir": str(cli_dir),
        "cli_source": cli_source,
        "commands": commands,
        "results": results,
        "next_steps": [
            "Re-run with '--execute' to actually perform the bootstrap commands.",
            f"Run '{sys.executable} -m neutts_cli.cli install --all' to install the upstream NeuTTS runtime.",
            f"Run '{sys.executable} -m neutts_cli.cli list-voices' to confirm saved profiles.",
            f"Run '{sys.executable} -m neutts_cli.cli synth --voice jo-demo --text Hello from Hermes' for a smoke test.",
        ],
    }
    if args.json:
        print(json.dumps(payload, indent=2))
    else:
        print(f"Repo root: {repo_root}")
        print(f"Workdir: {workdir}")
        print(f"CLI dir: {cli_dir}")
        print(f"CLI source: {cli_source}")
        for entry in results:
            print(f"- {entry['command']}")
            if entry.get("executed"):
                print(f"  rc={entry['returncode']}")
                if entry.get("stdout"):
                    print(f"  stdout: {entry['stdout']}")
                if entry.get("stderr"):
                    print(f"  stderr: {entry['stderr']}")
        for step in payload["next_steps"]:
            print(f"next: {step}")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

104
tools/neutts_synth.py Normal file
View File

@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""Standalone NeuTTS synthesis helper.
Called by tts_tool.py via subprocess to keep the TTS model (~500MB)
in a separate process that exits after synthesis — no lingering memory.
Usage:
python -m tools.neutts_synth --text "Hello" --out output.wav \
--ref-audio samples/jo.wav --ref-text samples/jo.txt
Requires: pip install neutts[all]
System: apt install espeak-ng (or brew install espeak-ng)
"""
import argparse
import struct
import sys
from pathlib import Path
def _write_wav(path: str, samples, sample_rate: int = 24000) -> None:
"""Write a WAV file from float32 samples (no soundfile dependency)."""
import numpy as np
if not isinstance(samples, np.ndarray):
samples = np.array(samples, dtype=np.float32)
samples = samples.flatten()
# Clamp and convert to int16
samples = np.clip(samples, -1.0, 1.0)
pcm = (samples * 32767).astype(np.int16)
num_channels = 1
bits_per_sample = 16
byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
block_align = num_channels * (bits_per_sample // 8)
data_size = len(pcm) * (bits_per_sample // 8)
with open(path, "wb") as f:
f.write(b"RIFF")
f.write(struct.pack("<I", 36 + data_size))
f.write(b"WAVE")
f.write(b"fmt ")
f.write(struct.pack("<IHHIIHH", 16, 1, num_channels, sample_rate,
byte_rate, block_align, bits_per_sample))
f.write(b"data")
f.write(struct.pack("<I", data_size))
f.write(pcm.tobytes())
def main():
    """Parse CLI args, run NeuTTS synthesis, and write a WAV file.

    Exits non-zero with a message on stderr when reference inputs are
    missing or the `neutts` package is not installed. On success prints
    "OK: <path>" to stderr — stderr so stdout stays clean for callers
    that capture output.
    """
    parser = argparse.ArgumentParser(description="NeuTTS synthesis helper")
    parser.add_argument("--text", required=True, help="Text to synthesize")
    parser.add_argument("--out", required=True, help="Output WAV path")
    parser.add_argument("--ref-audio", required=True, help="Reference voice audio path")
    parser.add_argument("--ref-text", required=True, help="Reference voice transcript path")
    parser.add_argument("--model", default="neuphonic/neutts-air-q4-gguf",
                        help="HuggingFace backbone model repo")
    parser.add_argument("--device", default="cpu", help="Device (cpu/cuda/mps)")
    args = parser.parse_args()
    # Validate inputs
    ref_audio = Path(args.ref_audio).expanduser()
    ref_text_path = Path(args.ref_text).expanduser()
    if not ref_audio.exists():
        print(f"Error: reference audio not found: {ref_audio}", file=sys.stderr)
        sys.exit(1)
    if not ref_text_path.exists():
        print(f"Error: reference text not found: {ref_text_path}", file=sys.stderr)
        sys.exit(1)
    ref_text = ref_text_path.read_text(encoding="utf-8").strip()
    # Import and run NeuTTS
    try:
        from neutts import NeuTTS
    except ImportError:
        print("Error: neutts not installed. Run: pip install neutts[all]", file=sys.stderr)
        sys.exit(1)
    tts = NeuTTS(
        backbone_repo=args.model,
        backbone_device=args.device,
        codec_repo="neuphonic/neucodec",
        codec_device=args.device,
    )
    ref_codes = tts.encode_reference(str(ref_audio))
    wav = tts.infer(args.text, ref_codes, ref_text)
    # Write output
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        import soundfile as sf
        sf.write(str(out_path), wav, 24000)
    except ImportError:
        # Fall back to the dependency-free writer when soundfile is absent.
        _write_wav(str(out_path), wav, 24000)
    print(f"OK: {out_path}", file=sys.stderr)

View File

@@ -73,7 +73,6 @@ DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
DEFAULT_OPENAI_VOICE = "alloy"
DEFAULT_NEUTTS_VOICE = "" # empty = use neutts_cli default voice
DEFAULT_OUTPUT_DIR = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache")
MAX_TEXT_LENGTH = 4000
@@ -265,24 +264,38 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
# ===========================================================================
def _check_neutts_available() -> bool:
"""Check if neutts_cli is importable (installed locally)."""
"""Check if the neutts engine is importable (installed locally)."""
try:
import importlib.util
return importlib.util.find_spec("neutts_cli") is not None
return importlib.util.find_spec("neutts") is not None
except Exception:
return False
def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
"""Generate speech using the local NeuTTS CLI.
def _default_neutts_ref_audio() -> str:
"""Return path to the bundled default voice reference audio."""
return str(Path(__file__).parent / "neutts_samples" / "jo.wav")
Calls neutts_cli.cli synth via subprocess. Outputs WAV by default;
the caller handles conversion to .ogg for Telegram if needed.
def _default_neutts_ref_text() -> str:
"""Return path to the bundled default voice reference transcript."""
return str(Path(__file__).parent / "neutts_samples" / "jo.txt")
def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
"""Generate speech using the local NeuTTS engine.
Runs synthesis in a subprocess via tools/neutts_synth.py to keep the
~500MB model in a separate process that exits after synthesis.
Outputs WAV; the caller handles conversion for Telegram if needed.
"""
import sys
neutts_config = tts_config.get("neutts", {})
voice = neutts_config.get("voice", DEFAULT_NEUTTS_VOICE)
ref_audio = neutts_config.get("ref_audio", "") or _default_neutts_ref_audio()
ref_text = neutts_config.get("ref_text", "") or _default_neutts_ref_text()
model = neutts_config.get("model", "neuphonic/neutts-air-q4-gguf")
device = neutts_config.get("device", "cpu")
# NeuTTS outputs WAV natively — use a .wav path for generation,
# let the caller convert to the final format afterward.
@@ -290,14 +303,23 @@ def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) ->
if not output_path.endswith(".wav"):
wav_path = output_path.rsplit(".", 1)[0] + ".wav"
cmd = [sys.executable, "-m", "neutts_cli.cli", "synth", "--text", text, "--out", wav_path]
if voice:
cmd.extend(["--voice", voice])
synth_script = str(Path(__file__).parent / "neutts_synth.py")
cmd = [
sys.executable, synth_script,
"--text", text,
"--out", wav_path,
"--ref-audio", ref_audio,
"--ref-text", ref_text,
"--model", model,
"--device", device,
]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
if result.returncode != 0:
stderr = result.stderr.strip()
raise RuntimeError(f"NeuTTS synthesis failed: {stderr or 'unknown error'}")
# Filter out the "OK:" line from stderr
error_lines = [l for l in stderr.splitlines() if not l.startswith("OK:")]
raise RuntimeError(f"NeuTTS synthesis failed: {chr(10).join(error_lines) or 'unknown error'}")
# If the caller wanted .mp3 or .ogg, convert from WAV
if wav_path != output_path: