refactor(tts): replace NeuTTS optional skill with built-in provider + setup flow
Remove the optional skill (redundant now that NeuTTS is a built-in TTS provider). Replace the neutts_cli dependency with a standalone synthesis helper (tools/neutts_synth.py) that calls the neutts Python API directly in a subprocess.

Add TTS provider selection to hermes setup:
- 'hermes setup' now prompts for TTS provider after model selection
- 'hermes setup tts' available as standalone section
- Selecting NeuTTS checks for deps and offers to install: espeak-ng (system) + neutts[all] (pip)
- ElevenLabs/OpenAI selections prompt for API keys
- Tool status display shows NeuTTS install state

Changes:
- Remove optional-skills/mlops/models/neutts/ (skill + CLI scaffold)
- Add tools/neutts_synth.py (standalone synthesis subprocess helper)
- Move jo.wav/jo.txt to tools/neutts_samples/ (bundled default voice)
- Refactor _generate_neutts() — uses neutts API via subprocess, no neutts_cli dependency, config-driven ref_audio/ref_text/model/device
- Add TTS setup to hermes_cli/setup.py (SETUP_SECTIONS, tool status)
- Update config.py defaults (ref_audio, ref_text, model, device)
This commit is contained in:
@@ -256,7 +256,10 @@ DEFAULT_CONFIG = {
|
||||
# Voices: alloy, echo, fable, onyx, nova, shimmer
|
||||
},
|
||||
"neutts": {
|
||||
"voice": "", # NeuTTS voice profile name (empty = use default)
|
||||
"ref_audio": "", # Path to reference voice audio (empty = bundled default)
|
||||
"ref_text": "", # Path to reference voice transcript (empty = bundled default)
|
||||
"model": "neuphonic/neutts-air-q4-gguf", # HuggingFace model repo
|
||||
"device": "cpu", # cpu, cuda, or mps
|
||||
},
|
||||
},
|
||||
|
||||
|
||||
@@ -479,6 +479,16 @@ def _print_setup_summary(config: dict, hermes_home):
|
||||
tool_status.append(("Text-to-Speech (ElevenLabs)", True, None))
|
||||
elif tts_provider == "openai" and get_env_value("VOICE_TOOLS_OPENAI_KEY"):
|
||||
tool_status.append(("Text-to-Speech (OpenAI)", True, None))
|
||||
elif tts_provider == "neutts":
|
||||
try:
|
||||
import importlib.util
|
||||
neutts_ok = importlib.util.find_spec("neutts") is not None
|
||||
except Exception:
|
||||
neutts_ok = False
|
||||
if neutts_ok:
|
||||
tool_status.append(("Text-to-Speech (NeuTTS local)", True, None))
|
||||
else:
|
||||
tool_status.append(("Text-to-Speech (NeuTTS — not installed)", False, "run 'hermes setup tts'"))
|
||||
else:
|
||||
tool_status.append(("Text-to-Speech (Edge TTS)", True, None))
|
||||
|
||||
@@ -1571,6 +1581,163 @@ def setup_model_provider(config: dict):
|
||||
|
||||
save_config(config)
|
||||
|
||||
# Offer TTS provider selection at the end of model setup
|
||||
_setup_tts_provider(config)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Section 1b: TTS Provider Configuration
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _check_espeak_ng() -> bool:
    """Report whether an espeak executable can be found on PATH.

    Either ``espeak-ng`` or ``espeak`` counts as present.
    """
    from shutil import which

    # Probe the preferred binary name first; any() stops at the first hit.
    return any(which(binary) is not None for binary in ("espeak-ng", "espeak"))
|
||||
|
||||
|
||||
def _install_neutts_deps() -> bool:
    """Install the NeuTTS prerequisites, asking the user before touching the system.

    Step 1 makes sure espeak-ng is available, offering to install it through
    brew (macOS), choco (Windows) or apt (everything else). Step 2 installs the
    ``neutts[all]`` package with pip into the running interpreter.

    Returns:
        True when the pip install of neutts succeeds. False when the automatic
        espeak-ng install fails, or when pip fails or exceeds its timeout.
        Declining the espeak-ng install only prints a warning; the pip install
        is still attempted.
    """
    import sys

    if not _check_espeak_ng():
        # Pick the hint and the matching install command in one place so the
        # two can never disagree about the platform.
        if sys.platform == "darwin":
            hint = "Install with: brew install espeak-ng"
            espeak_cmd = ["brew", "install", "espeak-ng"]
        elif sys.platform == "win32":
            hint = "Install with: choco install espeak-ng"
            espeak_cmd = ["choco", "install", "espeak-ng", "-y"]
        else:
            hint = "Install with: sudo apt install espeak-ng"
            espeak_cmd = ["sudo", "apt", "install", "-y", "espeak-ng"]

        print()
        print_warning("NeuTTS requires espeak-ng for phonemization.")
        print_info(hint)
        print()

        if not prompt_yes_no("Install espeak-ng now?", True):
            print_warning("espeak-ng is required for NeuTTS. Install it manually before using NeuTTS.")
        else:
            try:
                subprocess.run(espeak_cmd, check=True)
            except (subprocess.CalledProcessError, FileNotFoundError) as e:
                # FileNotFoundError covers a missing package manager binary.
                print_warning(f"Could not install espeak-ng automatically: {e}")
                print_info("Please install it manually and re-run setup.")
                return False
            else:
                print_success("espeak-ng installed")

    # Python package: installed into the interpreter that is running setup.
    pip_cmd = [sys.executable, "-m", "pip", "install", "-U", "neutts[all]", "--quiet"]

    print()
    print_info("Installing neutts Python package...")
    print_info("This will also download the TTS model (~300MB) on first use.")
    print()

    try:
        subprocess.run(pip_cmd, check=True, timeout=300)
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
        print_error(f"Failed to install neutts: {e}")
        print_info("Try manually: pip install neutts[all]")
        return False

    print_success("neutts installed successfully")
    return True
|
||||
|
||||
|
||||
def _is_neutts_installed() -> bool:
    """Return True when the ``neutts`` package can be located by the import system."""
    try:
        import importlib.util

        return importlib.util.find_spec("neutts") is not None
    except Exception:
        # Best-effort probe: any failure while looking is reported as "not installed".
        return False


def _prompt_tts_api_key(env_var: str, prompt_text: str, saved_message: str) -> bool:
    """Make sure an API key is stored under ``env_var``, prompting if it is missing.

    Args:
        env_var: Name of the environment entry that holds the key.
        prompt_text: Text shown when asking the user for the key.
        saved_message: Success message printed after a new key is saved.

    Returns:
        True when a key already existed or was just saved; False when the user
        entered nothing (a fallback warning has been printed in that case).
    """
    if get_env_value(env_var):
        return True

    print()
    api_key = prompt(prompt_text, password=True)
    if api_key:
        save_env_value(env_var, api_key)
        print_success(saved_message)
        return True

    print_warning("No API key provided. Falling back to Edge TTS.")
    return False


def _setup_tts_provider(config: dict):
    """Interactive TTS provider selection with install flow for NeuTTS.

    Shows the current provider, lets the user pick Edge TTS, ElevenLabs,
    OpenAI TTS or NeuTTS (or keep the current one), runs the follow-up step
    the choice needs (dependency install or API key prompt), then writes
    ``config["tts"]["provider"]`` and saves the config.

    Any choice whose follow-up step is declined or fails falls back to
    ``"edge"``. Choosing "Keep current" returns without modifying ``config``.

    Args:
        config: The loaded Hermes configuration; updated in place.
    """
    # Single source of truth for provider key, short label and menu text, so
    # the menu order and the provider list cannot drift apart.
    options = [
        ("edge", "Edge TTS", "Edge TTS (free, cloud-based, no setup needed)"),
        ("elevenlabs", "ElevenLabs", "ElevenLabs (premium quality, needs API key)"),
        ("openai", "OpenAI TTS", "OpenAI TTS (good quality, needs API key)"),
        ("neutts", "NeuTTS", "NeuTTS (local on-device, free, ~300MB model download)"),
    ]
    provider_labels = {key: label for key, label, _ in options}

    # Tolerate a missing or non-dict "tts" section (e.g. an empty YAML key).
    tts_config = config.get("tts")
    if not isinstance(tts_config, dict):
        tts_config = {}
    current_provider = tts_config.get("provider") or "edge"
    current_label = provider_labels.get(current_provider, current_provider)

    print()
    print_header("Text-to-Speech Provider (optional)")
    print_info(f"Current: {current_label}")
    print()

    choices = [menu_text for _, _, menu_text in options]
    choices.append(f"Keep current ({current_label})")
    keep_idx = len(choices) - 1  # "Keep current" is always the last entry

    idx = prompt_choice("Select TTS provider:", choices, keep_idx)
    if idx == keep_idx:
        return

    selected = options[idx][0]

    if selected == "neutts":
        if _is_neutts_installed():
            print_success("NeuTTS is already installed")
            # The Python package being importable says nothing about the
            # system phonemizer, so surface a missing espeak-ng here too.
            if not _check_espeak_ng():
                print_warning("espeak-ng is required for NeuTTS. Install it manually before using NeuTTS.")
        else:
            print()
            print_info("NeuTTS requires:")
            print_info("  • Python package: neutts (~50MB install + ~300MB model on first use)")
            print_info("  • System package: espeak-ng (phonemizer)")
            print()
            if prompt_yes_no("Install NeuTTS dependencies now?", True):
                if not _install_neutts_deps():
                    print_warning("NeuTTS installation incomplete. Falling back to Edge TTS.")
                    selected = "edge"
            else:
                print_info("Skipping install. Set tts.provider to 'neutts' after installing manually.")
                selected = "edge"

    elif selected == "elevenlabs":
        if not _prompt_tts_api_key(
            "ELEVENLABS_API_KEY", "ElevenLabs API key", "ElevenLabs API key saved"
        ):
            selected = "edge"

    elif selected == "openai":
        if not _prompt_tts_api_key(
            "VOICE_TOOLS_OPENAI_KEY", "OpenAI API key for TTS", "OpenAI TTS API key saved"
        ):
            selected = "edge"

    # Save the selection, replacing a missing or malformed "tts" section.
    if not isinstance(config.get("tts"), dict):
        config["tts"] = {}
    config["tts"]["provider"] = selected
    save_config(config)
    print_success(f"TTS provider set to: {provider_labels.get(selected, selected)}")
|
||||
|
||||
|
||||
def setup_tts(config: dict) -> None:
    """Run only the TTS provider step; entry point for ``hermes setup tts``."""
    # Thin section wrapper: all prompting and saving is done by the shared helper.
    _setup_tts_provider(config)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Section 2: Terminal Backend Configuration
|
||||
@@ -2548,6 +2715,7 @@ def _offer_openclaw_migration(hermes_home: Path) -> bool:
|
||||
|
||||
SETUP_SECTIONS = [
|
||||
("model", "Model & Provider", setup_model_provider),
|
||||
("tts", "Text-to-Speech", setup_tts),
|
||||
("terminal", "Terminal Backend", setup_terminal_backend),
|
||||
("gateway", "Messaging Platforms (Gateway)", setup_gateway),
|
||||
("tools", "Tools", setup_tools),
|
||||
|
||||
@@ -1,435 +0,0 @@
|
||||
---
|
||||
name: neutts
|
||||
description: Use the local NeuTTS CLI to install NeuTTS, manage reusable voice profiles, and synthesize speech fully on-device. Best when the user wants local or offline-ish TTS instead of a hosted API.
|
||||
version: 1.0.0
|
||||
author: Hermes Agent + Nous Research
|
||||
license: MIT
|
||||
platforms: [linux, macos, windows]
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [TTS, Text-To-Speech, Local-AI, Voice-Cloning, Audio, NeuTTS]
|
||||
related_skills: [whisper, audiocraft-audio-generation]
|
||||
requires_toolsets: [terminal]
|
||||
---
|
||||
|
||||
# NeuTTS - Local Text-to-Speech
|
||||
|
||||
Use NeuTTS through the standalone `neutts` CLI. This skill is for local speech synthesis, reusable voice profiles, and quick Hermes-driven setup inside or alongside the Hermes repository.
|
||||
|
||||
NeuTTS is an on-device TTS model family from Neuphonic. This skill assumes the CLI wrapper exists and Hermes should drive it via terminal commands rather than a dedicated Hermes core tool.
|
||||
|
||||
## When to Use
|
||||
|
||||
- The user wants local TTS instead of Edge/OpenAI/ElevenLabs
|
||||
- The user wants voice cloning from a short reference clip
|
||||
- The user wants Hermes to install or verify the `neutts` CLI scaffold
|
||||
- The user wants to create or inspect saved voice profiles
|
||||
- The user wants to synthesize speech to a local WAV file
|
||||
- The user wants to give the agent a custom voice / persona voice
|
||||
- Keywords: `neutts`, `local tts`, `voice clone`, `on-device speech`, `offline speech`, `give you a voice`, `what do you sound like`
|
||||
|
||||
## Quick Reference
|
||||
|
||||
| Command | Purpose |
|
||||
|---------|---------|
|
||||
| `neutts doctor` | Check local install health (includes default voice) |
|
||||
| `neutts install --all` | Install upstream NeuTTS with extras |
|
||||
| `neutts list-models` | Show known official model repos |
|
||||
| `neutts add-voice NAME --ref-audio clip.wav --ref-text-file clip.txt` | Save a reusable voice profile |
|
||||
| `neutts list-voices` | Show saved local voice profiles |
|
||||
| `neutts config --default-voice NAME` | Lock in a voice as the default for all synthesis |
|
||||
| `neutts config` | View current settings (model, device, default voice) |
|
||||
| `neutts synth --text Hello there` | Synthesize using the default voice |
|
||||
| `neutts synth --voice NAME --text Hello there` | Synthesize using a specific voice |
|
||||
| `neutts synth --voice NAME --text Hello --out sample.wav` | Generate a specific WAV |
|
||||
|
||||
## Procedure
|
||||
|
||||
### First-run execution policy
|
||||
|
||||
For a fresh NeuTTS setup, do not do broad filesystem exploration or repeated command probing. Keep the startup path short and deterministic.
|
||||
|
||||
Do not infer install state from prior conversation context, memory, or the mere presence of this skill. Only say NeuTTS is installed, verified, or ready if you checked it in the current turn with live commands.
|
||||
|
||||
Once first-run verification is complete, do not consult memory again for this flow unless the user explicitly asks about past setup, prior voice choices, or saved preferences.
|
||||
|
||||
Preferred sequence:
|
||||
|
||||
1. Resolve the target Python interpreter first
|
||||
2. Use the bootstrap helper shipped with this skill to install the bundled NeuTTS CLI scaffold into that interpreter
|
||||
3. Run `doctor` via `<target-python> -m neutts_cli.cli doctor` as the primary health check
|
||||
4. If `doctor` reports `neutts_installed: false`, run `install --all`
|
||||
5. Re-run `doctor`
|
||||
6. Run `list-voices`
|
||||
7. Confirm that `jo-demo` exists
|
||||
8. Only then run one verification synthesis to `~/voice-tests/neutts_verify.wav`
|
||||
|
||||
Definitions:
|
||||
|
||||
- `<target-python>` means the Python interpreter for the environment where NeuTTS should live
|
||||
- `<skill-bootstrap-helper>` means the `bootstrap_neutts_cli.py` file shipped with this installed skill, usually `~/.hermes/skills/mlops/models/neutts/scripts/bootstrap_neutts_cli.py`
|
||||
|
||||
Bootstrap example:
|
||||
|
||||
```bash
|
||||
<target-python> <skill-bootstrap-helper> --install-cli --sample-profile --execute --json
|
||||
```
|
||||
|
||||
Install NeuTTS runtime:
|
||||
|
||||
```bash
|
||||
<target-python> -m neutts_cli.cli install --all
|
||||
```
|
||||
|
||||
Verification synthesis:
|
||||
|
||||
```bash
|
||||
mkdir -p ~/voice-tests
|
||||
<target-python> -m neutts_cli.cli synth --voice jo-demo --text "Hello from Hermes" --out ~/voice-tests/neutts_verify.wav
|
||||
```
|
||||
|
||||
First-run gate:
|
||||
|
||||
- for first-run verification, `jo-demo` is required
|
||||
- do not treat NeuTTS as fully ready until `list-voices` includes `jo-demo`
|
||||
- do not substitute a built-in/default voice, ad hoc reference, or memory-based prior voice for `jo-demo` during first-run verification
|
||||
- if bootstrap with `--sample-profile` fails, stop and report the exact failure rather than improvising around it
|
||||
- treat the bootstrap helper as the source of truth for bundled sample assets; do not manually inspect random skill directories looking for replacements before reporting the failure
|
||||
|
||||
Short-circuit rules:
|
||||
|
||||
- do not use `command -v neutts` or wrapper presence as the primary health gate; prefer `<target-python> -m neutts_cli.cli doctor` first
|
||||
- if the `neutts` wrapper is missing, that alone does not mean the CLI module is unusable; check the module path before concluding anything
|
||||
- if `neutts doctor` exits with code `127`, treat that as "CLI missing" and bootstrap immediately
|
||||
- after a `127` from `neutts doctor`, do not run `neutts list-voices` until bootstrap is complete
|
||||
- do not talk about memory unless it materially changes what you do next
|
||||
- keep the Python interpreter consistent for the whole workflow; do not mix bare `python`, `/usr/bin/python`, and a target virtualenv interpreter
|
||||
- prefer `<target-python> -m neutts_cli.cli ...` until the `neutts` wrapper is confirmed present in that same interpreter
|
||||
- prefer the bootstrap helper bundled with this skill; treat the bundled scaffold as the only supported bootstrap source
|
||||
- when the bootstrap helper path is already known, do not probe repo-local scaffolds first; run bootstrap directly
|
||||
- if `list-voices` does not include `jo-demo` after bootstrap, fix that specific problem before attempting synthesis or voice design
|
||||
|
||||
Avoid:
|
||||
|
||||
- broad `find *neutts*` scans
|
||||
- repeated checks for hardcoded executable paths
|
||||
- wrapper-first health checks when the module path is available
|
||||
- reading repo files like `pyproject.toml` unless bootstrap or install fails
|
||||
- multiple failed synthesis attempts before running `neutts install --all`
|
||||
- telling the user NeuTTS is already installed or verified unless `neutts doctor` succeeded in the current turn
|
||||
- invoking the bootstrap helper with a different Python than the one you plan to use for `doctor`, `install`, `list-voices`, and `synth`
|
||||
- extra repo-path probing when the bundled bootstrap helper is already available
|
||||
- using a built-in/default voice as a substitute for the missing `jo-demo` baseline during first-run verification
|
||||
- consulting memory or searching elsewhere for old voice profiles during first-run bootstrap
|
||||
|
||||
### 1. Locate or install the NeuTTS CLI
|
||||
|
||||
The bootstrap helper shipped with this skill is the preferred install path because it carries a bundled NeuTTS CLI scaffold and does not require a specific Hermes repo layout.
|
||||
|
||||
The helper installs the bundled CLI scaffold with `pip install --no-build-isolation -e ...` so it can work cleanly in environments without network access during the editable install step.
|
||||
|
||||
```bash
|
||||
<target-python> <skill-bootstrap-helper> --install-cli --sample-profile --execute --json
|
||||
```
|
||||
|
||||
Then verify:
|
||||
|
||||
```bash
|
||||
<target-python> -m neutts_cli.cli doctor
|
||||
```
|
||||
|
||||
If `neutts --help` or `neutts doctor` fails, treat NeuTTS as not yet ready and continue with bootstrap or install instead of summarizing it as already working.
|
||||
|
||||
To preview the bootstrap plan without executing it, use:
|
||||
|
||||
```bash
|
||||
<target-python> <skill-bootstrap-helper> --json
|
||||
```
|
||||
|
||||
To actually perform the bootstrap steps instead of only printing them:
|
||||
|
||||
```bash
|
||||
<target-python> <skill-bootstrap-helper> --install-cli --sample-profile --execute --json
|
||||
```
|
||||
|
||||
The helper uses the bundled skill assets as the source of truth. Use the same `<target-python>` for bootstrap, `doctor`, `install`, `list-voices`, and `synth`.
|
||||
|
||||
After bootstrap, explicitly confirm that `list-voices` includes `jo-demo`. If it does not, stop and report that the bundled sample-profile creation failed. Do not continue into synthesis, public-domain sourcing, or memory-based recovery.
|
||||
|
||||
### 2. Install NeuTTS itself
|
||||
|
||||
```bash
|
||||
<target-python> -m neutts_cli.cli install --all
|
||||
```
|
||||
|
||||
This installs the upstream `neutts` package into the active Python environment. For quick CPU-only verification, `--all` is acceptable; if the user wants a slimmer setup, use `--onnx` or `--llama` as appropriate.
|
||||
|
||||
### Fresh setup fallback
|
||||
|
||||
If `neutts doctor` shows `neutts_installed: false`, or an early synthesis attempt fails because the NeuTTS runtime is missing, immediately run:
|
||||
|
||||
```bash
|
||||
<target-python> -m neutts_cli.cli install --all
|
||||
```
|
||||
|
||||
Then re-run:
|
||||
|
||||
```bash
|
||||
<target-python> -m neutts_cli.cli doctor
|
||||
<target-python> -m neutts_cli.cli list-voices
|
||||
```
|
||||
|
||||
For a first-time verification, prefer a stable user-visible output path:
|
||||
|
||||
```bash
|
||||
mkdir -p ~/voice-tests
|
||||
<target-python> -m neutts_cli.cli synth --voice jo-demo --text "Hello from Hermes" --out ~/voice-tests/neutts_verify.wav
|
||||
```
|
||||
|
||||
For a fresh setup, `jo-demo` is the expected baseline profile if the sample bootstrap step was run.
|
||||
|
||||
If `jo-demo` is missing after bootstrap, that is a setup failure, not a cue to switch to another voice source. Report it clearly and stop there.
|
||||
|
||||
### 3. Collaborative voice design (proactive)
|
||||
|
||||
**When to trigger:** If the user asks you to speak, use TTS, or send a voice message and you have no custom voice profile yet (check `neutts list-voices`), proactively offer to design a voice together before falling back to the default `jo-demo` profile.
|
||||
|
||||
**Interaction style:** Prefer short clarify prompts with concrete options over long open-ended questions. Use multiple-choice first to keep momentum high, then switch to freeform only when you need details the user must supply.
|
||||
|
||||
**Clarify timeout policy:** If a clarify prompt times out, do not start new sourcing, downloading, or transformation work unless the timed-out question was explicitly about accepting a recommendation you already made. If you must continue after timeout, say so plainly and pick the safest default:
|
||||
|
||||
- default to your recommendation when the timeout happened on `Go with my recommendation` vs `Something else`
|
||||
- otherwise stop and ask again in plain language rather than silently making a bigger decision
|
||||
|
||||
**Preferred first clarify after verification:**
|
||||
|
||||
- Keep demo voice
|
||||
- Create a voice for me
|
||||
- Set default voice
|
||||
- Just test synthesis
|
||||
|
||||
**If the user chooses to create a voice for the assistant, first give one concise recommendation sentence.** Make it personal and relationship-aware. Prefer wording like:
|
||||
|
||||
> "Based on what I know about you, I'd make my voice warm, grounded, and a little nerdy rather than polished narrator-clean."
|
||||
|
||||
Then prefer a binary clarify prompt:
|
||||
|
||||
- Go with my recommendation
|
||||
- Something else
|
||||
|
||||
**If the user chooses `Something else`, prefer one short follow-up clarify for direction:**
|
||||
|
||||
- Warm and grounded
|
||||
- Bright and energetic
|
||||
- Calm and precise
|
||||
- Distinct / separate persona
|
||||
|
||||
After the user picks a direction, prefer a second short clarify for how to source the reference:
|
||||
|
||||
- Find public-domain clips for me
|
||||
- I'll give you a clip path and transcript
|
||||
|
||||
Default to doing the heavy lifting yourself. The first option should be presented as the default path whenever possible.
|
||||
|
||||
If the user chooses `Find public-domain clips for me`, take responsibility for the search and present a small curated set of promising 3-15 second candidates instead of pushing the work back onto the user immediately.
|
||||
|
||||
Use a constrained sourcing workflow:
|
||||
|
||||
- prefer the built-in web or browser tools for search and page inspection
|
||||
- prefer a small set of trusted public-domain sources such as LibriVox and Project Gutenberg recordings when available
|
||||
- do not call unavailable or speculative tools such as `web_search`; use only tools that are actually present in the environment
|
||||
- do not use ad hoc Python scraping with `requests`, `bs4`, or one-off parsing scripts for clip discovery unless the user explicitly asked for that style of debugging
|
||||
- do not bounce across many search methods in one turn
|
||||
- stop at 3 strong candidates maximum
|
||||
|
||||
If the first sourcing method fails, use one fallback method only. If that also fails, stop and ask the user whether they want you to keep searching later or provide a clip path directly. Do not continue thrashing through more tools.
|
||||
|
||||
If a clarify timed out earlier in the same branch, do not interpret that as permission to begin sourcing or downloading on your own unless the timed-out choice was specifically approval to follow your recommendation.
|
||||
|
||||
When presenting sourced candidates in a clarify menu, put the short description directly in each option label instead of listing bare names only. Prefer compact labels like:
|
||||
|
||||
- Mark Nelson - friendly nerdy storyteller
|
||||
- Adrian Praetzellis - warm professor energy
|
||||
- Peter Yearsley - calm precise British
|
||||
- Show me more options
|
||||
|
||||
Keep the summary above the menu brief. The menu itself should carry most of the distinction between options so the user can decide at a glance.
|
||||
|
||||
When sourcing succeeds, present at most 3 candidates and move straight to selection. Do not keep exploring once you already have enough viable options.
|
||||
|
||||
That means:
|
||||
|
||||
1. present candidates
|
||||
2. get the user's candidate choice
|
||||
3. immediately ask `Use this source` or `Show me another`
|
||||
4. only after `Use this source`, begin download, clipping, transcript lookup, or transcription
|
||||
|
||||
Do not download audio, fetch source text, or prepare clips before that confirmation step.
|
||||
|
||||
After the user selects a candidate source voice, use one short confirmation prompt before downloading, clipping, or transcribing:
|
||||
|
||||
- Use this source
|
||||
- Show me another
|
||||
|
||||
This confirmation is mandatory. Do not start clip extraction or transcription work until the user confirms the source, unless the timed-out clarify was specifically approval to follow your recommendation.
|
||||
|
||||
For clip preparation, prefer a temporary workspace such as `/tmp/neutts-voice-reference` rather than writing into `~/.hermes/` or another durable user directory by default.
|
||||
|
||||
For transcripts, prefer source text over STT whenever the material comes from LibriVox, Project Gutenberg, or another public-domain reading with matching text available. Use Whisper or other STT only as a fallback when matching source text is not readily available.
|
||||
|
||||
If transcript extraction fails once, stop and ask whether to try another clip instead of retrying blindly through multiple transcription attempts.
|
||||
|
||||
Before creating the voice profile, verify the final transcript once for obvious shell artifacts, prompt text, or mismatched lines. Fix the transcript file first, then run `add-voice`. Do not create a profile and patch it afterward as the normal path.
|
||||
|
||||
In the normal path, create the intended final voice name directly. Do not create duplicate workaround names like `atom2` unless the user explicitly asked for variants or you are preserving two intentionally different voices.
|
||||
|
||||
Do not manually edit `voice.json` as part of the standard workflow. Only treat direct metadata edits as a last-resort recovery step after you have clearly explained the problem and simpler CLI-based fixes failed.
|
||||
|
||||
If the user chooses `I'll give you a clip path and transcript`, ask only for the required freeform inputs:
|
||||
|
||||
- reference audio path
|
||||
- transcript
|
||||
|
||||
Frame this as creating or refining the agent's own voice for the user-facing relationship. Prefer wording like "create a voice for me", "design my voice", or "make me sound like X" over generic phrases like "create a custom voice" unless the user used that wording first.
|
||||
|
||||
**How to approach it:** Be conversational and opinionated, not a questionnaire. You know the user — draw on what you know about them, your relationship, the platform you're on, and who you are as an agent. Lead with your own take on what voice would fit, then invite their input.
|
||||
|
||||
The value proposition is agent identity, not generic TTS setup. Default to language that treats the voice as the assistant's voice in the relationship with the user.
|
||||
|
||||
**Framework:**
|
||||
|
||||
1. **Open with your perspective.** Reflect briefly on who you are to the user (cognitive partner, assistant, creative collaborator, etc.) and what kind of voice would match that dynamic. Share a concrete suggestion — don't be generic.
|
||||
|
||||
2. **Describe the vibe, not just parameters.** Instead of "select a pitch range," paint a picture: warm and grounded, bright and energetic, calm and steady, playful with an edge. Use language that conveys personality, not spec sheets.
|
||||
|
||||
3. **Ask open-ended questions.** Cover these dimensions naturally in conversation (not as a numbered list unless the user seems unsure):
|
||||
- Register / feel: lower and grounded, higher and bright, something neutral
|
||||
- Tone: calm, energetic, warm, precise, playful
|
||||
- Similarity to the user: close to their own voice, or distinctly different
|
||||
- Any specific voices they like or want to approximate
|
||||
|
||||
4. **Take on the sourcing work by default.** NeuTTS voice cloning needs a reference audio clip (3-15 seconds, mono WAV preferred) plus a transcript of what the clip says. By default, offer to go find public-domain reference clips yourself and narrow them down for the user. Only ask the user for a local clip path and transcript if they choose that route or already have one ready.
|
||||
|
||||
5. **Iterate if needed.** After the first synthesis, ask if the voice feels right or if they want to try a different reference. Voice design is subjective — treat it as a collaborative process, not a one-shot.
|
||||
|
||||
**Example opener** (adapt to your actual persona and relationship with the user):
|
||||
|
||||
> "So if I'm going to talk to you, let me think about what I should actually sound like... I'm your [role] — the one who [what you do together]. I'm thinking something [concrete vibe description]. I can go find a few strong public-domain reference clips for us, or if you already have a clip you want me to use, you can point me to it."
|
||||
|
||||
**After the user provides a reference clip:**
|
||||
|
||||
```bash
|
||||
neutts add-voice AGENT_NAME --ref-audio /path/to/clip.wav --ref-text-file /path/to/transcript.txt --language en
|
||||
neutts synth --voice AGENT_NAME --text "Here's what I sound like now — what do you think?" --out ./voice_test.wav
|
||||
```
|
||||
|
||||
Send the test WAV to the user and ask for feedback before considering the voice finalized.
|
||||
|
||||
Do not auto-play the generated audio locally as part of the standard flow. Report the output path clearly so the user can choose whether to play it.
|
||||
|
||||
**Locking in the voice:**
|
||||
|
||||
Once the user approves the voice, set it as the default so all future synthesis uses it automatically — no `--voice` flag needed:
|
||||
|
||||
```bash
|
||||
neutts config --default-voice AGENT_NAME
|
||||
```
|
||||
|
||||
Confirm the lock-in to the user. Let them know:
|
||||
- This voice will be used automatically whenever you speak from now on
|
||||
- They can change it anytime (`neutts config --default-voice OTHER_NAME`)
|
||||
- They can check what's set with `neutts config`
|
||||
|
||||
Offer next steps naturally, like Atom's approach: suggest sending a longer voice note, tweaking the style, or just moving on — don't make it feel like a configuration wizard that just completed.
|
||||
|
||||
### 4. Add a voice profile manually
|
||||
|
||||
Use this section when skipping the collaborative flow or when adding a voice from a known reference.
|
||||
|
||||
If working from this repo, a sample profile can be bootstrapped automatically:
|
||||
|
||||
```bash
|
||||
python optional-skills/mlops/models/neutts/scripts/bootstrap_neutts_cli.py --repo-root . --install-cli --sample-profile --json
|
||||
```
|
||||
|
||||
Add `--execute` to actually run those commands.
|
||||
|
||||
Or add one manually:
|
||||
|
||||
```bash
|
||||
neutts add-voice demo --ref-audio ./samples/voice.wav --ref-text-file ./samples/voice.txt --language en
|
||||
```
|
||||
|
||||
Reference guidelines:
|
||||
|
||||
- mono WAV preferred
|
||||
- 3 to 15 seconds is ideal
|
||||
- transcript should match the reference audio closely
|
||||
- use same-language references for best multilingual results
|
||||
|
||||
### 5. Synthesize speech
|
||||
|
||||
For a quick smoke test:
|
||||
|
||||
```bash
|
||||
neutts synth --voice demo --text Hello from Hermes
|
||||
```
|
||||
|
||||
For a named output file:
|
||||
|
||||
```bash
|
||||
neutts synth --voice demo --text This is a local NeuTTS test --out ./speech.wav
|
||||
```
|
||||
|
||||
### 6. Report results clearly
|
||||
|
||||
After running synthesis:
|
||||
|
||||
- confirm the output path
|
||||
- note whether a saved voice profile or ad-hoc reference was used
|
||||
- mention any warnings from NeuTTS, but do not treat watermark warnings as a hard failure
|
||||
- after verification, prefer a short clarify prompt with concrete next-step options instead of a long open-ended paragraph
|
||||
- when offering voice creation, phrase it as creating the assistant's voice for the user, not as a generic custom voice feature
|
||||
- if verification did not happen in the current turn, explicitly say that instead of implying the environment is already ready
|
||||
- do not perform risky or noisy cleanup commands in the normal success path; temporary files can simply be left in `/tmp` unless the user asked for cleanup
|
||||
|
||||
## Memory
|
||||
|
||||
- do not save memory for routine install or verification runs
|
||||
- only save memory if the user established a durable voice preference, approved a default voice, or a non-trivial workaround/fix was required
|
||||
- if you save memory for this flow, do it once at the very end after the voice is finalized or set as default
|
||||
- do not do intermediate memory writes during setup, sourcing, clip prep, or testing
|
||||
- if memory save fails or memory is full, do not thrash through retries; either skip it or replace a single clearly related prior NeuTTS entry once
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- `neutts synth` needs a voice source: `--voice NAME`, a configured default voice (`neutts config --default-voice NAME`), or both `--ref-audio` and `--ref-text`
|
||||
- The first synthesis call can be slow because models need to load
|
||||
- `llama-cpp-python` acceleration is platform-specific and may require custom build flags
|
||||
- `doctor` may show `ffmpeg` missing; that does not block WAV synthesis
|
||||
- The upstream NeuTTS package may emit Perth watermark warnings; these are informational unless the user explicitly needs watermarking
|
||||
- If the `neutts` command is missing after install, ensure the active virtualenv is the same environment where the editable package was installed
|
||||
- transcript files can pick up shell artifacts if written carelessly; verify them before `add-voice`
|
||||
- avoid duplicate profile-name workarounds and direct `voice.json` edits in the normal path
|
||||
|
||||
## Verification
|
||||
|
||||
Use this sequence:
|
||||
|
||||
```bash
|
||||
neutts doctor
|
||||
neutts list-voices
|
||||
neutts synth --voice jo-demo --text Hello from Hermes --out ./verify.wav
|
||||
```
|
||||
|
||||
Success means:
|
||||
|
||||
- `doctor` shows `neutts_installed: true`
|
||||
- `list-voices` includes the expected profile
|
||||
- synthesis completes and writes a WAV file
|
||||
|
||||
## References
|
||||
|
||||
- NeuTTS upstream: https://github.com/neuphonic/neutts
|
||||
- Bundled NeuTTS CLI scaffold: `assets/neutts-cli`
|
||||
- Skill bootstrap helper: `optional-skills/mlops/models/neutts/scripts/bootstrap_neutts_cli.py`
|
||||
@@ -1,55 +0,0 @@
|
||||
# NeuTTS CLI
|
||||
|
||||
Small standalone CLI for installing, checking, and running [NeuTTS](https://github.com/neuphonic/neutts) locally.
|
||||
|
||||
This scaffold is designed to be a good fit for a future Hermes optional skill:
|
||||
|
||||
- predictable commands
|
||||
- machine-friendly output for inspection
|
||||
- local voice profile management
|
||||
- direct local synthesis
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
neutts install --all
|
||||
neutts doctor
|
||||
neutts list-models
|
||||
neutts add-voice demo --ref-audio ./samples/jo.wav --ref-text-file ./samples/jo.txt
|
||||
neutts list-voices
|
||||
neutts synth --voice demo --text Hello from NeuTTS --out ./out.wav
|
||||
neutts synth --voice demo --text Quick smoke test
|
||||
```
|
||||
|
||||
## Install the bundled scaffold
|
||||
|
||||
```bash
|
||||
cd optional-skills/mlops/models/neutts/assets/neutts-cli
|
||||
python -m pip install -e .
|
||||
```
|
||||
|
||||
## Add the bundled sample profile
|
||||
|
||||
This skill bundles an upstream NeuTTS sample reference in `samples/`.
|
||||
|
||||
```bash
|
||||
cd optional-skills/mlops/models/neutts/assets/neutts-cli
|
||||
PYTHONPATH=src python -m neutts_cli.cli add-voice jo-demo \
|
||||
--ref-audio ./samples/jo.wav \
|
||||
--ref-text-file ./samples/jo.txt \
|
||||
--language en
|
||||
```
|
||||
|
||||
Then inspect it with:
|
||||
|
||||
```bash
|
||||
PYTHONPATH=src python -m neutts_cli.cli list-voices
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- `install` installs the upstream `neutts` package into the current Python environment.
|
||||
- `list-voices` shows local voice profiles created with `add-voice`.
|
||||
- `synth` uses NeuTTS reference cloning. A voice profile is just a saved reference audio/text pair.
|
||||
- `synth` accepts quoted or unquoted text and defaults to `./out.wav` when `--out` is omitted.
|
||||
- GGUF / `llama-cpp-python` acceleration can vary by platform, so the CLI prints follow-up guidance instead of forcing one build recipe.
|
||||
@@ -1,24 +0,0 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=68", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "neutts-cli"
|
||||
version = "0.1.0"
|
||||
description = "Standalone CLI for installing and running NeuTTS locally"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = {text = "MIT"}
|
||||
authors = [
|
||||
{name = "Hermes Agent Contributors"}
|
||||
]
|
||||
dependencies = []
|
||||
|
||||
[project.scripts]
|
||||
neutts = "neutts_cli.cli:main"
|
||||
|
||||
[tool.setuptools]
|
||||
package-dir = {"" = "src"}
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["src"]
|
||||
@@ -1,3 +0,0 @@
|
||||
__all__ = ["__version__"]
|
||||
|
||||
__version__ = "0.1.0"
|
||||
@@ -1,26 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import wave
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def write_wav(path: str | Path, samples, sample_rate: int) -> Path:
    """Write *samples* to *path* as a mono 16-bit PCM WAV file.

    Args:
        path: Destination file; ``~`` is expanded and the path is resolved.
            Missing parent directories are created.
        samples: Array-like of float samples; values are clipped to
            ``[-1.0, 1.0]`` before conversion to 16-bit integers.
        sample_rate: Frame rate written to the WAV header.

    Returns:
        The resolved output path.

    Raises:
        RuntimeError: If numpy is not installed.
    """
    target = Path(path).expanduser().resolve()
    target.parent.mkdir(parents=True, exist_ok=True)

    # numpy is imported lazily so the rest of the CLI works without it.
    try:
        import numpy as np
    except ImportError as exc:
        raise RuntimeError("numpy is required to write NeuTTS audio output") from exc

    flat = np.asarray(samples, dtype=np.float32).flatten()
    frames = (np.clip(flat, -1.0, 1.0) * 32767.0).astype(np.int16).tobytes()

    with wave.open(str(target), "wb") as writer:
        writer.setnchannels(1)
        writer.setsampwidth(2)
        writer.setframerate(sample_rate)
        writer.writeframes(frames)

    return target
|
||||
@@ -1,204 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
from .config import AppConfig
|
||||
from .core import (
|
||||
KNOWN_MODELS,
|
||||
doctor_report,
|
||||
list_voices,
|
||||
load_voice,
|
||||
platform_notes,
|
||||
run_install,
|
||||
save_voice,
|
||||
synthesize,
|
||||
)
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the NeuTTS command-line tool.

    Registers the ``install``, ``doctor``, ``list-models``, ``list-voices``,
    ``add-voice``, ``synth`` and ``config`` subcommands. The chosen
    subcommand is stored on the parsed namespace as ``command``.
    """
    root = argparse.ArgumentParser(
        description="Standalone CLI for local NeuTTS workflows"
    )
    commands = root.add_subparsers(dest="command", required=True)

    # install: every option is a plain boolean switch.
    install = commands.add_parser(
        "install", help="Install NeuTTS into the current Python environment"
    )
    install_switches = (
        ("--llama", "Install llama-cpp-python support via neutts[llama]"),
        ("--onnx", "Install ONNX decoder support via neutts[onnx]"),
        ("--all", "Install all upstream NeuTTS extras"),
        ("--dry-run", "Print the install command without running it"),
    )
    for flag, description in install_switches:
        install.add_argument(flag, action="store_true", help=description)

    # Subcommands that take no arguments of their own.
    argless = (
        ("doctor", "Inspect NeuTTS CLI environment"),
        ("list-models", "Show known official NeuTTS model repositories"),
        ("list-voices", "Show local voice profiles"),
    )
    for command_name, description in argless:
        commands.add_parser(command_name, help=description)

    # add-voice
    add_voice = commands.add_parser(
        "add-voice", help="Save a local voice profile from a reference sample"
    )
    add_voice.add_argument("name", help="Voice profile name")
    add_voice.add_argument("--ref-audio", required=True, help="Reference WAV file")
    add_voice.add_argument(
        "--ref-text", help="Transcript text for the reference audio"
    )
    add_voice.add_argument(
        "--ref-text-file",
        help="Path to a text file containing the reference transcript",
    )
    add_voice.add_argument(
        "--language", default="unknown", help="Optional language tag"
    )

    # synth: --text takes one or more words so unquoted text works too.
    synth = commands.add_parser("synth", help="Synthesize speech to a WAV file")
    synth.add_argument("--text", nargs="+", required=True, help="Text to synthesize")
    synth.add_argument("--voice", help="Saved voice profile name")
    synth.add_argument(
        "--ref-audio", help="Reference audio path when not using --voice"
    )
    synth.add_argument(
        "--ref-text", help="Reference transcript when not using --voice"
    )
    synth.add_argument("--out", default="out.wav", help="Output WAV file path")

    # config: every option defaults to None, meaning "leave unchanged".
    config = commands.add_parser(
        "config", help="View or update default synthesis settings"
    )
    for flag in ("--backbone-repo", "--backbone-device", "--codec-repo", "--codec-device"):
        config.add_argument(flag)
    config.add_argument("--sample-rate", type=int)
    config.add_argument(
        "--default-voice",
        help="Voice profile name to use when --voice is omitted from synth",
    )

    return root
|
||||
|
||||
|
||||
def _read_ref_text(args: argparse.Namespace) -> str:
    """Return the reference transcript given on the command line.

    An inline ``--ref-text`` value wins; otherwise the file named by
    ``--ref-text-file`` is read as UTF-8. The result is stripped of
    surrounding whitespace.

    Raises:
        ValueError: If neither option was supplied.
    """
    inline = args.ref_text
    if inline:
        return inline.strip()

    transcript_file = args.ref_text_file
    if not transcript_file:
        raise ValueError("Provide either --ref-text or --ref-text-file")

    with open(transcript_file, "r", encoding="utf-8") as handle:
        return handle.read().strip()
|
||||
|
||||
|
||||
def _normalize_text_arg(value: str | list[str]) -> str:
    """Collapse a ``--text`` argument into a single stripped string.

    A list of words (as produced by ``nargs="+"``) is joined with single
    spaces; a plain string is used as-is.
    """
    combined = " ".join(value) if isinstance(value, list) else value
    return combined.strip()
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Run the NeuTTS CLI and return a process exit code.

    Every subcommand prints its result as indented JSON on stdout and
    returns 0. Any ``Exception`` raised while handling a subcommand is
    reported on stderr as ``error: ...`` and turned into exit code 1.
    Argument-parsing failures exit through argparse itself.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    command = args.command

    def emit(payload) -> int:
        # All subcommands share the same JSON output convention.
        print(json.dumps(payload, indent=2))
        return 0

    try:
        if command == "install":
            executed = run_install(args.llama, args.onnx, args.all, args.dry_run)
            return emit(
                {
                    "commands": executed,
                    "notes": platform_notes(),
                    "dry_run": args.dry_run,
                }
            )

        if command == "doctor":
            return emit(doctor_report())

        if command == "list-models":
            return emit(KNOWN_MODELS)

        if command == "list-voices":
            return emit([profile.__dict__ for profile in list_voices()])

        if command == "add-voice":
            saved_to = save_voice(
                name=args.name,
                ref_audio=args.ref_audio,
                ref_text=_read_ref_text(args),
                language=args.language,
            )
            # Re-load from disk so the output reflects what was persisted.
            stored = load_voice(args.name)
            return emit({"saved": str(saved_to), "voice": stored.__dict__})

        if command == "synth":
            written = synthesize(
                text=_normalize_text_arg(args.text),
                out=args.out,
                voice=args.voice,
                ref_audio=args.ref_audio,
                ref_text=args.ref_text,
            )
            return emit({"output": str(written)})

        if command == "config":
            settings = AppConfig.load()
            option_names = (
                "backbone_repo",
                "backbone_device",
                "codec_repo",
                "codec_device",
                "sample_rate",
                "default_voice",
            )
            # Only options the user actually passed (non-None) are applied.
            overrides = {
                option: getattr(args, option, None) for option in option_names
            }
            overrides = {k: v for k, v in overrides.items() if v is not None}
            for option, new_value in overrides.items():
                setattr(settings, option, new_value)
            if overrides:
                settings.save()
            return emit(settings.__dict__)

        parser.error(f"Unknown command: {args.command}")
        return 2
    except Exception as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -1,67 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
DEFAULT_BACKBONE = "neuphonic/neutts-nano"
|
||||
DEFAULT_CODEC = "neuphonic/neucodec"
|
||||
DEFAULT_SAMPLE_RATE = 24000
|
||||
|
||||
|
||||
def app_home() -> Path:
    """Return the directory holding the CLI's config and voice profiles.

    ``NEUTTS_CLI_HOME`` overrides the location when set to a non-empty
    value (``~`` is expanded); otherwise ``~/.neutts-cli`` is used.
    """
    custom = os.getenv("NEUTTS_CLI_HOME")
    return Path(custom).expanduser() if custom else Path.home() / ".neutts-cli"
|
||||
|
||||
|
||||
def config_path() -> Path:
    """Return the path of the JSON settings file inside the app home."""
    home = app_home()
    return home / "config.json"
|
||||
|
||||
|
||||
def voices_dir() -> Path:
    """Return the directory under the app home that stores voice profiles."""
    home = app_home()
    return home / "voices"
|
||||
|
||||
|
||||
@dataclass
class AppConfig:
    """Persisted default synthesis settings for the CLI.

    Values are stored as JSON at ``config_path()``; any key missing from
    the file falls back to the defaults declared on the fields below.
    """

    backbone_repo: str = DEFAULT_BACKBONE
    backbone_device: str = "cpu"
    codec_repo: str = DEFAULT_CODEC
    codec_device: str = "cpu"
    sample_rate: int = DEFAULT_SAMPLE_RATE
    default_voice: str | None = None

    @classmethod
    def load(cls) -> "AppConfig":
        """Read settings from disk, or return defaults if no file exists."""
        source = config_path()
        if source.exists():
            stored = json.loads(source.read_text(encoding="utf-8"))
            return cls(
                backbone_repo=stored.get("backbone_repo", DEFAULT_BACKBONE),
                backbone_device=stored.get("backbone_device", "cpu"),
                codec_repo=stored.get("codec_repo", DEFAULT_CODEC),
                codec_device=stored.get("codec_device", "cpu"),
                sample_rate=int(stored.get("sample_rate", DEFAULT_SAMPLE_RATE)),
                # An empty string is normalised to "no default voice".
                default_voice=stored.get("default_voice") or None,
            )
        return cls()

    def save(self) -> Path:
        """Write the settings as indented JSON and return the file path."""
        app_home().mkdir(parents=True, exist_ok=True)
        target = config_path()
        persisted_fields = (
            "backbone_repo",
            "backbone_device",
            "codec_repo",
            "codec_device",
            "sample_rate",
            "default_voice",
        )
        serialized = {field: getattr(self, field) for field in persisted_fields}
        target.write_text(json.dumps(serialized, indent=2) + "\n", encoding="utf-8")
        return target
|
||||
@@ -1,197 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import importlib.util
|
||||
import json
|
||||
import platform
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from .audio import write_wav
|
||||
from .config import AppConfig, app_home, voices_dir
|
||||
|
||||
|
||||
KNOWN_MODELS = [
|
||||
"neuphonic/neutts-air",
|
||||
"neuphonic/neutts-air-q8-gguf",
|
||||
"neuphonic/neutts-air-q4-gguf",
|
||||
"neuphonic/neutts-nano",
|
||||
"neuphonic/neutts-nano-q8-gguf",
|
||||
"neuphonic/neutts-nano-q4-gguf",
|
||||
"neuphonic/neutts-nano-french",
|
||||
"neuphonic/neutts-nano-german",
|
||||
"neuphonic/neutts-nano-spanish",
|
||||
]
|
||||
|
||||
|
||||
@dataclass
class VoiceProfile:
    """A saved reference audio/transcript pair used for voice cloning."""

    # Profile name.
    name: str
    # Path to the reference audio file.
    ref_audio: str
    # Transcript of the reference audio.
    ref_text: str
    # Optional language tag; "unknown" when not supplied.
    language: str = "unknown"
|
||||
|
||||
|
||||
def is_module_available(module_name: str) -> bool:
    """Return True if *module_name* can be located without importing it."""
    spec = importlib.util.find_spec(module_name)
    return spec is not None
|
||||
|
||||
|
||||
def run_install(
    include_llama: bool, include_onnx: bool, include_all: bool, dry_run: bool
) -> list[str]:
    """Install the upstream ``neutts`` package with the requested extras.

    Args:
        include_llama: Add the ``llama`` extra.
        include_onnx: Add the ``onnx`` extra.
        include_all: Install the ``all`` extra; overrides the two above.
        dry_run: Only report the command, do not run it.

    Returns:
        A one-element list holding the pip command rendered as a
        shell-safe string that can be copy-pasted into a terminal.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    import shlex

    if include_all:
        extras = ["all"]
    else:
        extras = [
            extra
            for extra, wanted in (("llama", include_llama), ("onnx", include_onnx))
            if wanted
        ]

    requirement = f"neutts[{','.join(extras)}]" if extras else "neutts"
    command = [sys.executable, "-m", "pip", "install", "-U", requirement]

    # shlex.join quotes each argument, so "neutts[all]" is not treated as a
    # glob by shells such as zsh and an interpreter path containing spaces
    # stays a single word.
    rendered = shlex.join(command)

    if not dry_run:
        subprocess.run(command, check=True)
    return [rendered]
|
||||
|
||||
|
||||
def platform_notes() -> list[str]:
    """Return GGUF-acceleration follow-up hints for the current OS.

    The list is empty for systems other than macOS, Linux and Windows.
    """
    hints_by_system = {
        "Darwin": [
            "For Apple Silicon GGUF acceleration, install the llama extra with BLAS/Accelerate flags.",
            "See the upstream NeuTTS README for the recommended CMAKE_ARGS invocation.",
        ],
        "Linux": [
            "For GGUF acceleration on Linux, install OpenBLAS and then reinstall the llama extra with matching CMAKE_ARGS.",
        ],
        "Windows": [
            "For GGUF acceleration on Windows, install OpenBLAS first and then install the llama extra from PowerShell with CMAKE_ARGS set.",
        ],
    }
    return hints_by_system.get(platform.system(), [])
|
||||
|
||||
|
||||
def doctor_report() -> dict:
    """Collect a JSON-serialisable snapshot of the local NeuTTS environment.

    Covers interpreter and platform details, the active configuration,
    which optional modules can be located, whether ``ffmpeg`` is on PATH,
    and how many voice profiles are saved.
    """
    profiles_root = voices_dir()
    if profiles_root.exists():
        voice_count = sum(1 for _ in profiles_root.glob("*/voice.json"))
    else:
        voice_count = 0

    settings = AppConfig.load()
    return {
        "python": sys.version.split()[0],
        "platform": platform.platform(),
        "app_home": str(app_home()),
        "config": asdict(settings),
        "neutts_installed": is_module_available("neutts"),
        "numpy_installed": is_module_available("numpy"),
        "onnxruntime_installed": is_module_available("onnxruntime"),
        "llama_cpp_installed": is_module_available("llama_cpp"),
        "ffmpeg_in_path": shutil.which("ffmpeg") is not None,
        "voice_profiles": voice_count,
        "default_voice": settings.default_voice,
    }
|
||||
|
||||
|
||||
def save_voice(
    name: str, ref_audio: str, ref_text: str, language: str = "unknown"
) -> Path:
    """Save a reference audio/transcript pair as a named voice profile.

    The audio is copied into ``voices_dir()/<name>/`` under its original
    file name, the transcript is written to ``reference.txt`` and the
    profile metadata to ``voice.json``.

    Args:
        name: Profile name; used as a single directory name.
        ref_audio: Path to the reference audio file.
        ref_text: Transcript of the reference audio.
        language: Optional language tag stored with the profile.

    Returns:
        Path of the written ``voice.json`` metadata file.

    Raises:
        ValueError: If *name* is empty, ``.``/``..``, or contains a path
            separator.
        FileNotFoundError: If the reference audio does not exist.
    """
    # The name becomes a directory component, so reject anything that would
    # resolve outside (or onto) the voices directory itself.
    trimmed = name.strip() if isinstance(name, str) else ""
    if not trimmed or trimmed in {".", ".."} or "/" in name or "\\" in name:
        raise ValueError(f"Invalid voice profile name: {name!r}")

    source_audio = Path(ref_audio).expanduser().resolve()
    if not source_audio.exists():
        raise FileNotFoundError(f"Reference audio not found: {source_audio}")

    destination = voices_dir() / name
    destination.mkdir(parents=True, exist_ok=True)
    audio_target = destination / source_audio.name
    text_target = destination / "reference.txt"
    metadata_target = destination / "voice.json"

    # Skip the copy when the source already is the stored file (re-adding a
    # profile from its own directory); otherwise replace any previous copy.
    if audio_target.resolve() != source_audio:
        if audio_target.exists():
            audio_target.unlink()
        audio_target.write_bytes(source_audio.read_bytes())

    transcript = ref_text.strip()
    if text_target.exists():
        text_target.unlink()
    text_target.write_text(transcript + "\n", encoding="utf-8")

    profile = VoiceProfile(
        name=name,
        ref_audio=str(audio_target),
        ref_text=transcript,
        language=language,
    )
    metadata_target.write_text(
        json.dumps(asdict(profile), indent=2) + "\n", encoding="utf-8"
    )
    return metadata_target
|
||||
|
||||
|
||||
def load_voice(name: str) -> VoiceProfile:
    """Load the saved voice profile called *name*.

    Raises:
        FileNotFoundError: If no ``voice.json`` exists for that name.
    """
    profile_file = voices_dir() / name / "voice.json"
    if profile_file.exists():
        stored = json.loads(profile_file.read_text(encoding="utf-8"))
        return VoiceProfile(**stored)
    raise FileNotFoundError(f"Voice profile not found: {name}")
|
||||
|
||||
|
||||
def list_voices() -> list[VoiceProfile]:
    """Return all saved voice profiles, ordered by metadata file path.

    Returns an empty list when the voices directory does not exist yet.
    """
    root = voices_dir()
    if not root.exists():
        return []
    return [
        VoiceProfile(**json.loads(metadata.read_text(encoding="utf-8")))
        for metadata in sorted(root.glob("*/voice.json"))
    ]
|
||||
|
||||
|
||||
def synthesize(
    text: str,
    out: str,
    voice: str | None = None,
    ref_audio: str | None = None,
    ref_text: str | None = None,
) -> Path:
    """Synthesize *text* with NeuTTS reference cloning and write a WAV file.

    The reference voice is resolved in this order: the named *voice*
    profile, the explicit *ref_audio*/*ref_text* pair, then the configured
    default voice (only when neither *voice* nor *ref_audio* is given).

    Args:
        text: Text to speak; must contain non-whitespace characters.
        out: Output WAV path.
        voice: Name of a saved voice profile.
        ref_audio: Reference audio path, used when no profile applies.
        ref_text: Reference transcript, used when no profile applies.

    Returns:
        The path of the written WAV file.

    Raises:
        ValueError: If *text* is blank or no usable reference is available.
        FileNotFoundError: If the requested voice profile does not exist.
        RuntimeError: If the ``neutts`` package is not installed.
    """
    if not text.strip():
        raise ValueError("Input text is required")

    # The config is read at most once per call. It stays lazy so that the
    # validation errors below are still raised before any config I/O when a
    # voice or reference is supplied explicitly.
    config = None

    # Fall back to the configured default voice when no voice is specified
    if not voice and not ref_audio:
        config = AppConfig.load()
        if config.default_voice:
            voice = config.default_voice

    if voice:
        profile = load_voice(voice)
        ref_audio = profile.ref_audio
        ref_text = profile.ref_text

    if not ref_audio or not ref_text:
        raise ValueError("Provide either --voice or both --ref-audio and --ref-text")

    if not is_module_available("neutts"):
        raise RuntimeError("NeuTTS is not installed. Run 'neutts install' first.")

    # Imported lazily: the package is heavy and optional for other commands.
    NeuTTS = importlib.import_module("neutts").NeuTTS

    if config is None:
        config = AppConfig.load()
    tts = NeuTTS(
        backbone_repo=config.backbone_repo,
        backbone_device=config.backbone_device,
        codec_repo=config.codec_repo,
        codec_device=config.codec_device,
    )
    ref_codes = tts.encode_reference(ref_audio)
    wav = tts.infer(text, ref_codes, ref_text)
    return write_wav(out, wav, config.sample_rate)
|
||||
@@ -1,168 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import shlex
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
SKILL_DIR = SCRIPT_DIR.parent
|
||||
BUNDLED_CLI_DIR = SKILL_DIR / "assets" / "neutts-cli"
|
||||
|
||||
|
||||
def _quote(path: Path) -> str:
    """Return *path* as a shell-quoted string."""
    as_text = str(path)
    return shlex.quote(as_text)
|
||||
|
||||
|
||||
def _quote_text(value: str) -> str:
    """Return *value* quoted for safe use as a single shell word."""
    quoted = shlex.quote(value)
    return quoted
|
||||
|
||||
|
||||
def find_cli_dir() -> tuple[Path, str]:
    """Locate the bundled NeuTTS CLI scaffold.

    Returns:
        The scaffold directory and the label ``"bundled"``.

    Raises:
        FileNotFoundError: If the bundled directory does not exist.
    """
    if not BUNDLED_CLI_DIR.exists():
        raise FileNotFoundError(
            "NeuTTS CLI scaffold not found in bundled skill assets."
        )
    return BUNDLED_CLI_DIR, "bundled"
|
||||
|
||||
|
||||
def build_commands(
    cli_dir: Path,
    install_cli: bool,
    sample_profile: bool,
    python_executable: str,
) -> list[str]:
    """Build the shell command strings for bootstrapping the NeuTTS CLI.

    Args:
        cli_dir: Directory of the CLI scaffold.
        install_cli: Install the scaffold in editable mode and invoke it
            through ``python -m``; otherwise the ``neutts`` entry point is
            assumed to be on PATH.
        sample_profile: Also register the bundled ``jo-demo`` voice.
        python_executable: Interpreter used for pip and ``-m`` invocations.

    Returns:
        Shell-quoted command strings in execution order.

    Raises:
        FileNotFoundError: If *sample_profile* is set but the sample audio
            or transcript is missing from *cli_dir*.
    """
    python = shlex.quote(python_executable)
    module_runner = f"{python} -m neutts_cli.cli"
    # A fresh editable install may not expose the console script yet, so
    # the module form is used whenever this run performs the install.
    runner = module_runner if install_cli else "neutts"

    if install_cli:
        quoted_dir = shlex.quote(str(cli_dir))
        planned = [
            f"{python} -m pip install --no-build-isolation -e {quoted_dir}",
            f"{module_runner} doctor",
        ]
    else:
        planned = ["neutts doctor"]

    if sample_profile:
        samples = cli_dir / "samples"
        audio_file = samples / "jo.wav"
        transcript_file = samples / "jo.txt"
        if not (audio_file.exists() and transcript_file.exists()):
            raise FileNotFoundError(
                "Sample profile files are missing from bundled skill assets."
            )
        parts = (
            f"{runner} add-voice jo-demo",
            f"--ref-audio {shlex.quote(str(audio_file))}",
            f"--ref-text-file {shlex.quote(str(transcript_file))}",
            "--language en",
        )
        planned.append(" ".join(parts))

    return planned
|
||||
|
||||
|
||||
def maybe_run(commands: list[str], workdir: Path, execute: bool) -> list[dict]:
    """Run (or merely list) the bootstrap commands.

    Args:
        commands: Shell-style command strings; each is split with
            ``shlex.split`` and run without a shell.
        workdir: Working directory for the commands.
        execute: When False, nothing is run and every command is reported
            with ``"executed": False``.

    Returns:
        One result dict per processed command. Executed entries also carry
        ``returncode``, ``stdout`` and ``stderr``. Execution stops after
        the first command that exits non-zero, so later commands are
        absent from the result.
    """
    outcomes: list[dict] = []
    for command in commands:
        if execute:
            proc = subprocess.run(
                shlex.split(command),
                cwd=str(workdir),
                text=True,
                capture_output=True,
                check=False,
            )
            outcomes.append(
                {
                    "command": command,
                    "executed": True,
                    "returncode": proc.returncode,
                    "stdout": proc.stdout.strip(),
                    "stderr": proc.stderr.strip(),
                }
            )
            if proc.returncode:
                break
        else:
            outcomes.append({"command": command, "executed": False})
    return outcomes
|
||||
|
||||
|
||||
def main() -> int:
    """Plan, and optionally run, the NeuTTS CLI bootstrap steps.

    Parses the command line, builds the bootstrap commands, runs them when
    ``--execute`` is given, and prints a report as JSON (``--json``) or as
    plain text. Always returns 0; per-command results are in the report.
    """
    parser = argparse.ArgumentParser(
        description="Bootstrap the standalone NeuTTS CLI for Hermes skill usage"
    )
    parser.add_argument(
        "--repo-root",
        default=".",
        help="Working directory used when executing bootstrap commands",
    )
    parser.add_argument(
        "--install-cli",
        action="store_true",
        help="Install the standalone NeuTTS CLI in editable mode",
    )
    parser.add_argument(
        "--sample-profile",
        action="store_true",
        help="Add the bundled jo-demo sample profile",
    )
    parser.add_argument(
        "--execute", action="store_true", help="Actually run the generated commands"
    )
    parser.add_argument(
        "--json", action="store_true", help="Print machine-readable JSON output"
    )
    args = parser.parse_args()

    repo_root = Path(args.repo_root).expanduser().resolve()
    cli_dir, cli_source = find_cli_dir()
    commands = build_commands(
        cli_dir, args.install_cli, args.sample_profile, sys.executable
    )
    # Fall back to the current directory if the requested root is missing.
    workdir = repo_root if repo_root.exists() else Path.cwd()
    results = maybe_run(commands, workdir, args.execute)

    next_steps = [
        f"Run '{sys.executable} -m neutts_cli.cli install --all' to install the upstream NeuTTS runtime.",
        f"Run '{sys.executable} -m neutts_cli.cli list-voices' to confirm saved profiles.",
        f"Run '{sys.executable} -m neutts_cli.cli synth --voice jo-demo --text Hello from Hermes' for a smoke test.",
    ]
    # Only suggest re-running with --execute when this run was a dry run.
    if not args.execute:
        next_steps.insert(
            0, "Re-run with '--execute' to actually perform the bootstrap commands."
        )

    payload = {
        "python_executable": sys.executable,
        "repo_root": str(repo_root),
        "workdir": str(workdir),
        "cli_dir": str(cli_dir),
        "cli_source": cli_source,
        "commands": commands,
        "results": results,
        "next_steps": next_steps,
    }

    if args.json:
        print(json.dumps(payload, indent=2))
    else:
        print(f"Repo root: {repo_root}")
        print(f"Workdir: {workdir}")
        print(f"CLI dir: {cli_dir}")
        print(f"CLI source: {cli_source}")
        for entry in results:
            print(f"- {entry['command']}")
            if entry.get("executed"):
                print(f"  rc={entry['returncode']}")
                if entry.get("stdout"):
                    print(f"  stdout: {entry['stdout']}")
                if entry.get("stderr"):
                    print(f"  stderr: {entry['stderr']}")
        for step in payload["next_steps"]:
            print(f"next: {step}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
104
tools/neutts_synth.py
Normal file
104
tools/neutts_synth.py
Normal file
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Standalone NeuTTS synthesis helper.
|
||||
|
||||
Called by tts_tool.py via subprocess to keep the TTS model (~500MB)
|
||||
in a separate process that exits after synthesis — no lingering memory.
|
||||
|
||||
Usage:
|
||||
python -m tools.neutts_synth --text "Hello" --out output.wav \
|
||||
--ref-audio samples/jo.wav --ref-text samples/jo.txt
|
||||
|
||||
Requires: pip install neutts[all]
|
||||
System: apt install espeak-ng (or brew install espeak-ng)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _write_wav(path: str, samples, sample_rate: int = 24000) -> None:
    """Write float samples to *path* as a mono 16-bit PCM WAV file.

    Uses only numpy and the standard-library ``wave`` module, so it works
    when ``soundfile`` is not installed.

    Args:
        path: Output file path.
        samples: numpy array or array-like of float samples; values are
            clipped to ``[-1.0, 1.0]`` before conversion.
        sample_rate: Frame rate written to the header.
    """
    import wave

    import numpy as np

    if not isinstance(samples, np.ndarray):
        samples = np.array(samples, dtype=np.float32)
    samples = samples.flatten()

    # Clamp and convert to int16. WAV data is little-endian by definition,
    # so force "<i2" rather than relying on the host's native byte order.
    samples = np.clip(samples, -1.0, 1.0)
    pcm = (samples * 32767).astype(np.int16).astype("<i2")

    # The wave module writes the standard 44-byte RIFF/fmt/data header.
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm.tobytes())
|
||||
|
||||
|
||||
def main(argv=None):
    """Synthesize speech with NeuTTS and write it to a WAV file.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.

    Exits with status 1 (message on stderr) if a reference file is missing
    or ``neutts`` is not installed. On success prints ``OK: <path>`` to
    stderr.
    """
    parser = argparse.ArgumentParser(description="NeuTTS synthesis helper")
    parser.add_argument("--text", required=True, help="Text to synthesize")
    parser.add_argument("--out", required=True, help="Output WAV path")
    parser.add_argument("--ref-audio", required=True, help="Reference voice audio path")
    parser.add_argument("--ref-text", required=True, help="Reference voice transcript path")
    parser.add_argument("--model", default="neuphonic/neutts-air-q4-gguf",
                        help="HuggingFace backbone model repo")
    parser.add_argument("--device", default="cpu", help="Device (cpu/cuda/mps)")
    args = parser.parse_args(argv)

    # Validate inputs
    ref_audio = Path(args.ref_audio).expanduser()
    ref_text_path = Path(args.ref_text).expanduser()
    if not ref_audio.exists():
        print(f"Error: reference audio not found: {ref_audio}", file=sys.stderr)
        sys.exit(1)
    if not ref_text_path.exists():
        print(f"Error: reference text not found: {ref_text_path}", file=sys.stderr)
        sys.exit(1)

    ref_text = ref_text_path.read_text(encoding="utf-8").strip()

    # Import and run NeuTTS (imported late so argument and path errors are
    # reported without loading the heavy dependency).
    try:
        from neutts import NeuTTS
    except ImportError:
        print("Error: neutts not installed. Run: pip install neutts[all]", file=sys.stderr)
        sys.exit(1)

    tts = NeuTTS(
        backbone_repo=args.model,
        backbone_device=args.device,
        codec_repo="neuphonic/neucodec",
        codec_device=args.device,
    )
    ref_codes = tts.encode_reference(str(ref_audio))
    wav = tts.infer(args.text, ref_codes, ref_text)

    # Write output. Expand "~" here too, consistent with the reference
    # paths above, so a config-supplied "~/..." output path works.
    out_path = Path(args.out).expanduser()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    sample_rate = 24000
    try:
        import soundfile as sf
    except ImportError:
        # soundfile is optional; fall back to the built-in WAV writer.
        _write_wav(str(out_path), wav, sample_rate)
    else:
        sf.write(str(out_path), wav, sample_rate)

    print(f"OK: {out_path}", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -73,7 +73,6 @@ DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
|
||||
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
|
||||
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
|
||||
DEFAULT_OPENAI_VOICE = "alloy"
|
||||
DEFAULT_NEUTTS_VOICE = "" # empty = use neutts_cli default voice
|
||||
DEFAULT_OUTPUT_DIR = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache")
|
||||
MAX_TEXT_LENGTH = 4000
|
||||
|
||||
@@ -265,24 +264,38 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
|
||||
# ===========================================================================
|
||||
|
||||
def _check_neutts_available() -> bool:
    """Check if the neutts engine is importable (installed locally).

    Only looks the module up; it does not import it. Any error during the
    lookup is treated as "not available".
    """
    try:
        import importlib.util
        return importlib.util.find_spec("neutts") is not None
    except Exception:
        return False
|
||||
|
||||
|
||||
def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
|
||||
"""Generate speech using the local NeuTTS CLI.
|
||||
def _default_neutts_ref_audio() -> str:
    """Return the path of the bundled default reference audio (jo.wav)."""
    samples_dir = Path(__file__).parent / "neutts_samples"
    return str(samples_dir / "jo.wav")
|
||||
|
||||
Calls neutts_cli.cli synth via subprocess. Outputs WAV by default;
|
||||
the caller handles conversion to .ogg for Telegram if needed.
|
||||
|
||||
def _default_neutts_ref_text() -> str:
    """Return the path of the bundled default reference transcript (jo.txt)."""
    samples_dir = Path(__file__).parent / "neutts_samples"
    return str(samples_dir / "jo.txt")
|
||||
|
||||
|
||||
def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
|
||||
"""Generate speech using the local NeuTTS engine.
|
||||
|
||||
Runs synthesis in a subprocess via tools/neutts_synth.py to keep the
|
||||
~500MB model in a separate process that exits after synthesis.
|
||||
Outputs WAV; the caller handles conversion for Telegram if needed.
|
||||
"""
|
||||
import sys
|
||||
|
||||
neutts_config = tts_config.get("neutts", {})
|
||||
voice = neutts_config.get("voice", DEFAULT_NEUTTS_VOICE)
|
||||
ref_audio = neutts_config.get("ref_audio", "") or _default_neutts_ref_audio()
|
||||
ref_text = neutts_config.get("ref_text", "") or _default_neutts_ref_text()
|
||||
model = neutts_config.get("model", "neuphonic/neutts-air-q4-gguf")
|
||||
device = neutts_config.get("device", "cpu")
|
||||
|
||||
# NeuTTS outputs WAV natively — use a .wav path for generation,
|
||||
# let the caller convert to the final format afterward.
|
||||
@@ -290,14 +303,23 @@ def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) ->
|
||||
if not output_path.endswith(".wav"):
|
||||
wav_path = output_path.rsplit(".", 1)[0] + ".wav"
|
||||
|
||||
cmd = [sys.executable, "-m", "neutts_cli.cli", "synth", "--text", text, "--out", wav_path]
|
||||
if voice:
|
||||
cmd.extend(["--voice", voice])
|
||||
synth_script = str(Path(__file__).parent / "neutts_synth.py")
|
||||
cmd = [
|
||||
sys.executable, synth_script,
|
||||
"--text", text,
|
||||
"--out", wav_path,
|
||||
"--ref-audio", ref_audio,
|
||||
"--ref-text", ref_text,
|
||||
"--model", model,
|
||||
"--device", device,
|
||||
]
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
|
||||
if result.returncode != 0:
|
||||
stderr = result.stderr.strip()
|
||||
raise RuntimeError(f"NeuTTS synthesis failed: {stderr or 'unknown error'}")
|
||||
# Filter out the "OK:" line from stderr
|
||||
error_lines = [l for l in stderr.splitlines() if not l.startswith("OK:")]
|
||||
raise RuntimeError(f"NeuTTS synthesis failed: {chr(10).join(error_lines) or 'unknown error'}")
|
||||
|
||||
# If the caller wanted .mp3 or .ogg, convert from WAV
|
||||
if wav_path != output_path:
|
||||
|
||||
Reference in New Issue
Block a user