Files
hermes-agent/tools/neutts_synth.py
Teknium d50e0711c2 refactor(tts): replace NeuTTS optional skill with built-in provider + setup flow
Remove the optional skill (redundant now that NeuTTS is a built-in TTS
provider). Replace neutts_cli dependency with a standalone synthesis
helper (tools/neutts_synth.py) that calls the neutts Python API directly
in a subprocess.

Add TTS provider selection to hermes setup:
- 'hermes setup' now prompts for TTS provider after model selection
- 'hermes setup tts' available as standalone section
- Selecting NeuTTS checks for deps and offers to install:
  espeak-ng (system) + neutts[all] (pip)
- ElevenLabs/OpenAI selections prompt for API keys
- Tool status display shows NeuTTS install state

Changes:
- Remove optional-skills/mlops/models/neutts/ (skill + CLI scaffold)
- Add tools/neutts_synth.py (standalone synthesis subprocess helper)
- Move jo.wav/jo.txt to tools/neutts_samples/ (bundled default voice)
- Refactor _generate_neutts() — uses neutts API via subprocess, no
  neutts_cli dependency, config-driven ref_audio/ref_text/model/device
- Add TTS setup to hermes_cli/setup.py (SETUP_SECTIONS, tool status)
- Update config.py defaults (ref_audio, ref_text, model, device)
2026-03-17 02:33:12 -07:00

105 lines
3.4 KiB
Python

#!/usr/bin/env python3
"""Standalone NeuTTS synthesis helper.
Called by tts_tool.py via subprocess to keep the TTS model (~500MB)
in a separate process that exits after synthesis — no lingering memory.
Usage:
python -m tools.neutts_synth --text "Hello" --out output.wav \
--ref-audio samples/jo.wav --ref-text samples/jo.txt
Requires: pip install neutts[all]
System: apt install espeak-ng (or brew install espeak-ng)
"""
import argparse
import struct
import sys
from pathlib import Path
def _write_wav(path: str, samples, sample_rate: int = 24000) -> None:
    """Write a mono 16-bit PCM WAV file from float samples (no soundfile dependency).

    Args:
        path: Destination file path; an existing file is overwritten.
        samples: Audio samples, nominally floats in [-1.0, 1.0]. Input that is
            not already a NumPy array is converted to a float32 array, and
            multi-dimensional input is flattened into a single channel.
        sample_rate: Sampling rate in Hz recorded in the WAV header.

    Values outside [-1.0, 1.0] are clamped; NaN samples are written as silence.
    """
    import wave

    import numpy as np

    if not isinstance(samples, np.ndarray):
        samples = np.array(samples, dtype=np.float32)
    samples = samples.flatten()

    # NaN passes straight through np.clip and has no defined int16 value,
    # so replace it with 0.0 before clamping and converting.
    samples = np.nan_to_num(samples, nan=0.0)
    samples = np.clip(samples, -1.0, 1.0)
    pcm = (samples * 32767).astype(np.int16)

    # Let the stdlib build the RIFF/fmt/data header and handle frame byte
    # order, instead of packing the header by hand and dumping native-endian
    # bytes (which would be wrong on big-endian hosts).
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 16-bit samples
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm.tobytes())
def main():
    """Parse CLI arguments, synthesize speech with NeuTTS, and write a WAV file.

    Exits with status 1 (message on stderr) when either reference file does
    not exist or the ``neutts`` package cannot be imported. On success the
    output path is reported on stderr as ``OK: <path>``.
    """
    parser = argparse.ArgumentParser(description="NeuTTS synthesis helper")
    parser.add_argument("--text", required=True, help="Text to synthesize")
    parser.add_argument("--out", required=True, help="Output WAV path")
    parser.add_argument("--ref-audio", required=True, help="Reference voice audio path")
    parser.add_argument("--ref-text", required=True, help="Reference voice transcript path")
    parser.add_argument("--model", default="neuphonic/neutts-air-q4-gguf",
                        help="HuggingFace backbone model repo")
    parser.add_argument("--device", default="cpu", help="Device (cpu/cuda/mps)")
    args = parser.parse_args()

    # Validate inputs before paying the cost of loading the model.
    ref_audio = Path(args.ref_audio).expanduser()
    ref_text_path = Path(args.ref_text).expanduser()
    if not ref_audio.exists():
        print(f"Error: reference audio not found: {ref_audio}", file=sys.stderr)
        sys.exit(1)
    if not ref_text_path.exists():
        print(f"Error: reference text not found: {ref_text_path}", file=sys.stderr)
        sys.exit(1)
    ref_text = ref_text_path.read_text(encoding="utf-8").strip()

    # Imported lazily so a missing install yields an actionable hint
    # instead of a traceback.
    try:
        from neutts import NeuTTS
    except ImportError:
        print("Error: neutts not installed. Run: pip install neutts[all]", file=sys.stderr)
        sys.exit(1)

    tts = NeuTTS(
        backbone_repo=args.model,
        backbone_device=args.device,
        codec_repo="neuphonic/neucodec",
        codec_device=args.device,
    )
    ref_codes = tts.encode_reference(str(ref_audio))
    wav = tts.infer(args.text, ref_codes, ref_text)

    # Expand "~" exactly like the reference paths: a subprocess caller does
    # not go through a shell, so "~/x.wav" would otherwise create a literal
    # "~" directory under the current working directory.
    out_path = Path(args.out).expanduser()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # NOTE(review): 24 kHz is the rate this script has always written;
    # presumably it matches the NeuTTS output rate — confirm against neutts.
    sample_rate = 24000

    # Only the import is guarded, so genuine write errors are not mistaken
    # for "soundfile unavailable". OSError is included because soundfile
    # raises it at import time when the libsndfile shared library is missing.
    try:
        import soundfile as sf
    except (ImportError, OSError):
        _write_wav(str(out_path), wav, sample_rate)
    else:
        sf.write(str(out_path), wav, sample_rate)
    print(f"OK: {out_path}", file=sys.stderr)
# Run the CLI only when executed as a script (or via ``python -m``),
# never as a side effect of importing this module.
if __name__ == "__main__":
    main()