diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 922807f17..f43b90838 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -539,7 +539,7 @@ platform_toolsets: # skills_hub - skill_hub (search/install/manage from online registries — user-driven only) # moa - mixture_of_agents (requires OPENROUTER_API_KEY) # todo - todo (in-memory task planning, no deps) -# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI key) +# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX key) # cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks) # rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY) # @@ -568,7 +568,7 @@ platform_toolsets: # todo - Task planning and tracking for multi-step work # memory - Persistent memory across sessions (personal notes + user profile) # session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization) -# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI) +# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax) # cronjob - Schedule and manage automated tasks (CLI-only) # rl - RL training tools (Tinker-Atropos) # diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index a72fd4e2f..98b754152 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -695,6 +695,8 @@ def _print_setup_summary(config: dict, hermes_home): get_env_value("VOICE_TOOLS_OPENAI_KEY") or get_env_value("OPENAI_API_KEY") ): tool_status.append(("Text-to-Speech (OpenAI)", True, None)) + elif tts_provider == "minimax" and get_env_value("MINIMAX_API_KEY"): + tool_status.append(("Text-to-Speech (MiniMax)", True, None)) elif tts_provider == "neutts": try: import importlib.util @@ -1180,6 +1182,7 @@ def _setup_tts_provider(config: dict): "edge": "Edge TTS", "elevenlabs": "ElevenLabs", "openai": "OpenAI TTS", + "minimax": "MiniMax TTS", "neutts": "NeuTTS", } current_label = provider_labels.get(current_provider, current_provider) @@ 
-1199,10 +1202,11 @@ def _setup_tts_provider(config: dict): "Edge TTS (free, cloud-based, no setup needed)", "ElevenLabs (premium quality, needs API key)", "OpenAI TTS (good quality, needs API key)", + "MiniMax TTS (high quality with voice cloning, needs API key)", "NeuTTS (local on-device, free, ~300MB model download)", ] ) - providers.extend(["edge", "elevenlabs", "openai", "neutts"]) + providers.extend(["edge", "elevenlabs", "openai", "minimax", "neutts"]) choices.append(f"Keep current ({current_label})") keep_current_idx = len(choices) - 1 idx = prompt_choice("Select TTS provider:", choices, keep_current_idx) @@ -1268,6 +1272,18 @@ def _setup_tts_provider(config: dict): print_warning("No API key provided. Falling back to Edge TTS.") selected = "edge" + elif selected == "minimax": + existing = get_env_value("MINIMAX_API_KEY") + if not existing: + print() + api_key = prompt("MiniMax API key for TTS", password=True) + if api_key: + save_env_value("MINIMAX_API_KEY", api_key) + print_success("MiniMax TTS API key saved") + else: + print_warning("No API key provided. 
def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """
    Generate audio using the MiniMax TTS API (non-streaming mode).

    MiniMax returns hex-encoded audio data. The API supports streaming (SSE)
    and non-streaming modes; this implementation sends ``"stream": False``
    for simplicity and decodes the single JSON response.

    Args:
        text: Text to convert (max 10,000 characters; not enforced here).
        output_path: Where to save the audio file. A ``.wav`` or ``.flac``
            extension (any letter case) selects that format; anything else
            is requested as MP3.
        tts_config: TTS config dict. The optional ``"minimax"`` section may
            provide ``model``, ``voice_id``, ``speed``, ``vol``, ``pitch``
            and ``base_url``.

    Returns:
        Path to the saved audio file (``output_path``).

    Raises:
        ValueError: If MINIMAX_API_KEY is not set.
        requests.HTTPError: If the HTTP request returns an error status.
        RuntimeError: If the response is not usable: non-JSON body, non-zero
            ``base_resp.status_code``, or empty/malformed audio data.
    """
    api_key = os.getenv("MINIMAX_API_KEY", "")
    if not api_key:
        raise ValueError("MINIMAX_API_KEY not set. Get one at https://platform.minimax.io/")

    # Imported only after the key check so a missing key fails fast with a
    # clear message instead of depending on the HTTP client being importable.
    import requests

    # A bare `minimax:` key in YAML loads as None rather than a missing key,
    # so fall back to an empty dict in both cases.
    mm_config = tts_config.get("minimax") or {}
    model = mm_config.get("model", DEFAULT_MINIMAX_MODEL)
    voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID)
    speed = mm_config.get("speed", 1)
    vol = mm_config.get("vol", 1)
    pitch = mm_config.get("pitch", 0)
    base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL)

    # Determine audio format from the output extension, ignoring case so that
    # "clip.WAV" is not silently filled with MP3 data.
    extension = os.path.splitext(output_path)[1].lower()
    audio_format = extension[1:] if extension in (".wav", ".flac") else "mp3"

    payload = {
        "model": model,
        "text": text,
        "stream": False,
        "voice_setting": {
            "voice_id": voice_id,
            "speed": speed,
            "vol": vol,
            "pitch": pitch,
        },
        "audio_setting": {
            "sample_rate": 32000,
            "bitrate": 128000,
            "format": audio_format,
            "channel": 1,
        },
    }

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    response = requests.post(base_url, json=payload, headers=headers, timeout=60)
    response.raise_for_status()

    # ValueError is reserved in this function for the missing-key case, so
    # response-parsing failures are re-raised as RuntimeError like the other
    # "bad response" conditions below.
    try:
        result = response.json()
    except ValueError as exc:
        raise RuntimeError("MiniMax TTS returned a non-JSON response") from exc

    # `or {}` guards against explicit JSON nulls for these objects.
    base_resp = result.get("base_resp") or {}
    status_code = base_resp.get("status_code", -1)

    if status_code != 0:
        status_msg = base_resp.get("status_msg", "unknown error")
        raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}")

    hex_audio = (result.get("data") or {}).get("audio", "")
    if not hex_audio:
        raise RuntimeError("MiniMax TTS returned empty audio data")

    # MiniMax returns hex-encoded audio (not base64)
    try:
        audio_bytes = bytes.fromhex(hex_audio)
    except (TypeError, ValueError) as exc:
        raise RuntimeError("MiniMax TTS returned malformed hex audio data") from exc

    with open(output_path, "wb") as f:
        f.write(audio_bytes)

    return output_path
=========================================================================== @@ -434,6 +525,10 @@ def text_to_speech_tool( logger.info("Generating speech with OpenAI TTS...") _generate_openai_tts(text, file_str, tts_config) + elif provider == "minimax": + logger.info("Generating speech with MiniMax TTS...") + _generate_minimax_tts(text, file_str, tts_config) + elif provider == "neutts": if not _check_neutts_available(): return json.dumps({ @@ -484,7 +579,7 @@ def text_to_speech_tool( # Try Opus conversion for Telegram compatibility # Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion voice_compatible = False - if provider in ("edge", "neutts") and not file_str.endswith(".ogg"): + if provider in ("edge", "neutts", "minimax") and not file_str.endswith(".ogg"): opus_path = _convert_to_opus(file_str) if opus_path: file_str = opus_path @@ -556,6 +651,8 @@ def check_tts_requirements() -> bool: return True except ImportError: pass + if os.getenv("MINIMAX_API_KEY"): + return True if _check_neutts_available(): return True return False @@ -842,6 +939,7 @@ if __name__ == "__main__": " API Key: " f"{'set' if resolve_openai_audio_api_key() else 'not set (VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY)'}" ) + print(f" MiniMax: {'API key set' if os.getenv('MINIMAX_API_KEY') else 'not set (MINIMAX_API_KEY)'}") print(f" ffmpeg: {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}") print(f"\n Output dir: {DEFAULT_OUTPUT_DIR}") diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md index c1de925d1..ca64170d9 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -10,13 +10,14 @@ Hermes Agent supports both text-to-speech output and voice message transcription ## Text-to-Speech -Convert text to speech with four providers: +Convert text to speech with five providers: | Provider | Quality | Cost | API Key | |----------|---------|------|---------| | **Edge TTS** 
(default) | Good | Free | None needed | | **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` | | **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` | +| **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` | | **NeuTTS** | Good | Free | None needed | ### Platform Delivery @@ -33,7 +34,7 @@ Convert text to speech with four providers: ```yaml # In ~/.hermes/config.yaml tts: - provider: "edge" # "edge" | "elevenlabs" | "openai" | "neutts" + provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "neutts" edge: voice: "en-US-AriaNeural" # 322 voices, 74 languages elevenlabs: @@ -43,6 +44,12 @@ tts: model: "gpt-4o-mini-tts" voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer base_url: "https://api.openai.com/v1" # Override for OpenAI-compatible TTS endpoints + minimax: + model: "speech-2.8-hd" # speech-2.8-hd (default), speech-2.8-turbo + voice_id: "English_Graceful_Lady" # See https://platform.minimax.io/faq/system-voice-id + speed: 1 # 0.5 - 2.0 + vol: 1 # 0 - 10 + pitch: 0 # -12 - 12 neutts: ref_audio: '' ref_text: '' @@ -56,6 +63,7 @@ Telegram voice bubbles require Opus/OGG audio format: - **OpenAI and ElevenLabs** produce Opus natively — no extra setup - **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert: +- **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles - **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles ```bash @@ -69,7 +77,7 @@ brew install ffmpeg sudo dnf install ffmpeg ``` -Without ffmpeg, Edge TTS and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble). +Without ffmpeg, Edge TTS, MiniMax TTS, and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble). :::tip If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider.