feat: add MiniMax TTS provider support (speech-2.8)
Add MiniMax as a fifth TTS provider alongside Edge TTS, ElevenLabs,
OpenAI, and NeuTTS. Supports speech-2.8-hd (recommended default) and
speech-2.8-turbo models via the MiniMax T2A HTTP API.
Changes:
- Add _generate_minimax_tts() with hex-encoded audio decoding
- Add MiniMax to provider dispatch, requirements check, and Telegram
Opus compatibility handling
- Add MiniMax to interactive setup wizard with API key prompt
- Update TTS documentation and config example
Configuration:
tts:
provider: "minimax"
minimax:
model: "speech-2.8-hd"
voice_id: "English_Graceful_Lady"
Requires MINIMAX_API_KEY environment variable.
API reference: https://platform.minimax.io/docs/api-reference/speech-t2a-http
This commit is contained in:
@@ -539,7 +539,7 @@ platform_toolsets:
|
||||
# skills_hub - skill_hub (search/install/manage from online registries — user-driven only)
|
||||
# moa - mixture_of_agents (requires OPENROUTER_API_KEY)
|
||||
# todo - todo (in-memory task planning, no deps)
|
||||
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI key)
|
||||
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX key)
|
||||
# cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks)
|
||||
# rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY)
|
||||
#
|
||||
@@ -568,7 +568,7 @@ platform_toolsets:
|
||||
# todo - Task planning and tracking for multi-step work
|
||||
# memory - Persistent memory across sessions (personal notes + user profile)
|
||||
# session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization)
|
||||
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI)
|
||||
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax)
|
||||
# cronjob - Schedule and manage automated tasks (CLI-only)
|
||||
# rl - RL training tools (Tinker-Atropos)
|
||||
#
|
||||
|
||||
@@ -695,6 +695,8 @@ def _print_setup_summary(config: dict, hermes_home):
|
||||
get_env_value("VOICE_TOOLS_OPENAI_KEY") or get_env_value("OPENAI_API_KEY")
|
||||
):
|
||||
tool_status.append(("Text-to-Speech (OpenAI)", True, None))
|
||||
elif tts_provider == "minimax" and get_env_value("MINIMAX_API_KEY"):
|
||||
tool_status.append(("Text-to-Speech (MiniMax)", True, None))
|
||||
elif tts_provider == "neutts":
|
||||
try:
|
||||
import importlib.util
|
||||
@@ -1180,6 +1182,7 @@ def _setup_tts_provider(config: dict):
|
||||
"edge": "Edge TTS",
|
||||
"elevenlabs": "ElevenLabs",
|
||||
"openai": "OpenAI TTS",
|
||||
"minimax": "MiniMax TTS",
|
||||
"neutts": "NeuTTS",
|
||||
}
|
||||
current_label = provider_labels.get(current_provider, current_provider)
|
||||
@@ -1199,10 +1202,11 @@ def _setup_tts_provider(config: dict):
|
||||
"Edge TTS (free, cloud-based, no setup needed)",
|
||||
"ElevenLabs (premium quality, needs API key)",
|
||||
"OpenAI TTS (good quality, needs API key)",
|
||||
"MiniMax TTS (high quality with voice cloning, needs API key)",
|
||||
"NeuTTS (local on-device, free, ~300MB model download)",
|
||||
]
|
||||
)
|
||||
providers.extend(["edge", "elevenlabs", "openai", "neutts"])
|
||||
providers.extend(["edge", "elevenlabs", "openai", "minimax", "neutts"])
|
||||
choices.append(f"Keep current ({current_label})")
|
||||
keep_current_idx = len(choices) - 1
|
||||
idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
|
||||
@@ -1268,6 +1272,18 @@ def _setup_tts_provider(config: dict):
|
||||
print_warning("No API key provided. Falling back to Edge TTS.")
|
||||
selected = "edge"
|
||||
|
||||
elif selected == "minimax":
|
||||
existing = get_env_value("MINIMAX_API_KEY")
|
||||
if not existing:
|
||||
print()
|
||||
api_key = prompt("MiniMax API key for TTS", password=True)
|
||||
if api_key:
|
||||
save_env_value("MINIMAX_API_KEY", api_key)
|
||||
print_success("MiniMax TTS API key saved")
|
||||
else:
|
||||
print_warning("No API key provided. Falling back to Edge TTS.")
|
||||
selected = "edge"
|
||||
|
||||
# Save the selection
|
||||
if "tts" not in config:
|
||||
config["tts"] = {}
|
||||
|
||||
@@ -2,10 +2,11 @@
|
||||
"""
|
||||
Text-to-Speech Tool Module
|
||||
|
||||
Supports four TTS providers:
|
||||
Supports five TTS providers:
|
||||
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
|
||||
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
|
||||
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
|
||||
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
|
||||
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
|
||||
|
||||
Output formats:
|
||||
@@ -78,6 +79,9 @@ DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
|
||||
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
|
||||
DEFAULT_OPENAI_VOICE = "alloy"
|
||||
DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
|
||||
DEFAULT_MINIMAX_MODEL = "speech-2.8-hd"
|
||||
DEFAULT_MINIMAX_VOICE_ID = "English_Graceful_Lady"
|
||||
DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1/t2a_v2"
|
||||
|
||||
def _get_default_output_dir() -> str:
|
||||
from hermes_constants import get_hermes_dir
|
||||
@@ -274,6 +278,93 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
|
||||
close()
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Provider: MiniMax TTS
|
||||
# ===========================================================================
|
||||
def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
|
||||
"""
|
||||
Generate audio using MiniMax TTS API.
|
||||
|
||||
MiniMax returns hex-encoded audio data. Supports streaming (SSE) and
|
||||
non-streaming modes. This implementation uses non-streaming for simplicity.
|
||||
|
||||
Args:
|
||||
text: Text to convert (max 10,000 characters).
|
||||
output_path: Where to save the audio file.
|
||||
tts_config: TTS config dict.
|
||||
|
||||
Returns:
|
||||
Path to the saved audio file.
|
||||
"""
|
||||
import requests
|
||||
|
||||
api_key = os.getenv("MINIMAX_API_KEY", "")
|
||||
if not api_key:
|
||||
raise ValueError("MINIMAX_API_KEY not set. Get one at https://platform.minimax.io/")
|
||||
|
||||
mm_config = tts_config.get("minimax", {})
|
||||
model = mm_config.get("model", DEFAULT_MINIMAX_MODEL)
|
||||
voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID)
|
||||
speed = mm_config.get("speed", 1)
|
||||
vol = mm_config.get("vol", 1)
|
||||
pitch = mm_config.get("pitch", 0)
|
||||
base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL)
|
||||
|
||||
# Determine audio format from output extension
|
||||
if output_path.endswith(".wav"):
|
||||
audio_format = "wav"
|
||||
elif output_path.endswith(".flac"):
|
||||
audio_format = "flac"
|
||||
else:
|
||||
audio_format = "mp3"
|
||||
|
||||
payload = {
|
||||
"model": model,
|
||||
"text": text,
|
||||
"stream": False,
|
||||
"voice_setting": {
|
||||
"voice_id": voice_id,
|
||||
"speed": speed,
|
||||
"vol": vol,
|
||||
"pitch": pitch,
|
||||
},
|
||||
"audio_setting": {
|
||||
"sample_rate": 32000,
|
||||
"bitrate": 128000,
|
||||
"format": audio_format,
|
||||
"channel": 1,
|
||||
},
|
||||
}
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
}
|
||||
|
||||
response = requests.post(base_url, json=payload, headers=headers, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
result = response.json()
|
||||
base_resp = result.get("base_resp", {})
|
||||
status_code = base_resp.get("status_code", -1)
|
||||
|
||||
if status_code != 0:
|
||||
status_msg = base_resp.get("status_msg", "unknown error")
|
||||
raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}")
|
||||
|
||||
hex_audio = result.get("data", {}).get("audio", "")
|
||||
if not hex_audio:
|
||||
raise RuntimeError("MiniMax TTS returned empty audio data")
|
||||
|
||||
# MiniMax returns hex-encoded audio (not base64)
|
||||
audio_bytes = bytes.fromhex(hex_audio)
|
||||
|
||||
with open(output_path, "wb") as f:
|
||||
f.write(audio_bytes)
|
||||
|
||||
return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# NeuTTS (local, on-device TTS via neutts_cli)
|
||||
# ===========================================================================
|
||||
@@ -434,6 +525,10 @@ def text_to_speech_tool(
|
||||
logger.info("Generating speech with OpenAI TTS...")
|
||||
_generate_openai_tts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "minimax":
|
||||
logger.info("Generating speech with MiniMax TTS...")
|
||||
_generate_minimax_tts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "neutts":
|
||||
if not _check_neutts_available():
|
||||
return json.dumps({
|
||||
@@ -484,7 +579,7 @@ def text_to_speech_tool(
|
||||
# Try Opus conversion for Telegram compatibility
|
||||
# Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion
|
||||
voice_compatible = False
|
||||
if provider in ("edge", "neutts") and not file_str.endswith(".ogg"):
|
||||
if provider in ("edge", "neutts", "minimax") and not file_str.endswith(".ogg"):
|
||||
opus_path = _convert_to_opus(file_str)
|
||||
if opus_path:
|
||||
file_str = opus_path
|
||||
@@ -556,6 +651,8 @@ def check_tts_requirements() -> bool:
|
||||
return True
|
||||
except ImportError:
|
||||
pass
|
||||
if os.getenv("MINIMAX_API_KEY"):
|
||||
return True
|
||||
if _check_neutts_available():
|
||||
return True
|
||||
return False
|
||||
@@ -842,6 +939,7 @@ if __name__ == "__main__":
|
||||
" API Key: "
|
||||
f"{'set' if resolve_openai_audio_api_key() else 'not set (VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY)'}"
|
||||
)
|
||||
print(f" MiniMax: {'API key set' if os.getenv('MINIMAX_API_KEY') else 'not set (MINIMAX_API_KEY)'}")
|
||||
print(f" ffmpeg: {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}")
|
||||
print(f"\n Output dir: {DEFAULT_OUTPUT_DIR}")
|
||||
|
||||
|
||||
@@ -10,13 +10,14 @@ Hermes Agent supports both text-to-speech output and voice message transcription
|
||||
|
||||
## Text-to-Speech
|
||||
|
||||
Convert text to speech with four providers:
|
||||
Convert text to speech with five providers:
|
||||
|
||||
| Provider | Quality | Cost | API Key |
|
||||
|----------|---------|------|---------|
|
||||
| **Edge TTS** (default) | Good | Free | None needed |
|
||||
| **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` |
|
||||
| **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` |
|
||||
| **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` |
|
||||
| **NeuTTS** | Good | Free | None needed |
|
||||
|
||||
### Platform Delivery
|
||||
@@ -33,7 +34,7 @@ Convert text to speech with four providers:
|
||||
```yaml
|
||||
# In ~/.hermes/config.yaml
|
||||
tts:
|
||||
provider: "edge" # "edge" | "elevenlabs" | "openai" | "neutts"
|
||||
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "neutts"
|
||||
edge:
|
||||
voice: "en-US-AriaNeural" # 322 voices, 74 languages
|
||||
elevenlabs:
|
||||
@@ -43,6 +44,12 @@ tts:
|
||||
model: "gpt-4o-mini-tts"
|
||||
voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer
|
||||
base_url: "https://api.openai.com/v1" # Override for OpenAI-compatible TTS endpoints
|
||||
minimax:
|
||||
model: "speech-2.8-hd" # speech-2.8-hd (default), speech-2.8-turbo
|
||||
voice_id: "English_Graceful_Lady" # See https://platform.minimax.io/faq/system-voice-id
|
||||
speed: 1 # 0.5 - 2.0
|
||||
vol: 1 # 0 - 10
|
||||
pitch: 0 # -12 - 12
|
||||
neutts:
|
||||
ref_audio: ''
|
||||
ref_text: ''
|
||||
@@ -56,6 +63,7 @@ Telegram voice bubbles require Opus/OGG audio format:
|
||||
|
||||
- **OpenAI and ElevenLabs** produce Opus natively — no extra setup
|
||||
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
|
||||
- **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
|
||||
- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
|
||||
|
||||
```bash
|
||||
@@ -69,7 +77,7 @@ brew install ffmpeg
|
||||
sudo dnf install ffmpeg
|
||||
```
|
||||
|
||||
Without ffmpeg, Edge TTS and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
|
||||
Without ffmpeg, Edge TTS, MiniMax TTS, and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
|
||||
|
||||
:::tip
|
||||
If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider.
|
||||
|
||||
Reference in New Issue
Block a user