diff --git a/README.md b/README.md index 18b016c4b..da9e29579 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,9 @@ All your settings are stored in `~/.hermes/` for easy access: ``` ~/.hermes/ -├── config.yaml # Settings (model, terminal, compression, etc.) +├── config.yaml # Settings (model, terminal, TTS, compression, etc.) ├── .env # API keys and secrets +├── SOUL.md # Optional: global persona (agent embodies this personality) ├── cron/ # Scheduled jobs ├── sessions/ # Gateway sessions └── logs/ # Logs @@ -76,6 +77,8 @@ You need at least one LLM provider: | Web scraping | [Firecrawl](https://firecrawl.dev/) | `FIRECRAWL_API_KEY` | | Browser automation | [Browserbase](https://browserbase.com/) | `BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID` | | Image generation | [FAL](https://fal.ai/) | `FAL_KEY` | +| Premium TTS voices | [ElevenLabs](https://elevenlabs.io/) | `ELEVENLABS_API_KEY` | +| OpenAI TTS voices | [OpenAI](https://platform.openai.com/api-keys) | `OPENAI_API_KEY` | | RL Training | [Tinker](https://tinker-console.thinkingmachines.ai/) + [WandB](https://wandb.ai/) | `TINKER_API_KEY`, `WANDB_API_KEY` | | Messaging | Telegram, Discord | `TELEGRAM_BOT_TOKEN`, `DISCORD_BOT_TOKEN` | @@ -128,7 +131,58 @@ hermes --toolsets "web,terminal" hermes --list-tools ``` -**Available toolsets:** `web`, `terminal`, `browser`, `vision`, `creative`, `reasoning`, `skills`, `cronjob`, and more. +**Available toolsets:** `web`, `terminal`, `browser`, `vision`, `creative`, `reasoning`, `skills`, `tts`, `cronjob`, and more. + +### 🔊 Text-to-Speech + +Convert text to speech with three providers: + +| Provider | Quality | Cost | API Key | +|----------|---------|------|---------| +| **Edge TTS** (default) | Good | Free | None needed | +| **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` | +| **OpenAI TTS** | Good | Paid | `OPENAI_API_KEY` | + +On Telegram, audio plays as native voice bubbles. On Discord/WhatsApp, sent as audio files. In CLI mode, saved to `~/voice-memos/`. 
+ +**Configure in `~/.hermes/config.yaml`:** +```yaml +tts: + provider: "edge" # "edge" | "elevenlabs" | "openai" + edge: + voice: "en-US-AriaNeural" # 322 voices, 74 languages + elevenlabs: + voice_id: "pNInz6obpgDQGcFmaJgB" # Adam + model_id: "eleven_multilingual_v2" + openai: + model: "gpt-4o-mini-tts" + voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer +``` + +> **Note:** Telegram voice bubbles require `ffmpeg` for Opus conversion (Edge TTS only outputs MP3). Install with `apt install ffmpeg` or `brew install ffmpeg`. Without ffmpeg, audio is sent as a file instead of a voice bubble. + +### 📄 Context Files (SOUL.md, AGENTS.md, .cursorrules) + +Drop these files in your project directory and the agent automatically picks them up: + +| File | Purpose | +|------|---------| +| `AGENTS.md` | Project-specific instructions, coding conventions, tool usage guidelines | +| `SOUL.md` | Persona definition -- the agent embodies this personality and tone | +| `.cursorrules` | Cursor IDE rules (also detected) | +| `.cursor/rules/*.mdc` | Cursor rule files (also detected) | + +- **AGENTS.md** is hierarchical: if subdirectories also have `AGENTS.md`, all are combined (like Codex/Cline). +- **SOUL.md** checks cwd first, then `~/.hermes/SOUL.md` as a global fallback. +- All context files are capped at 20,000 characters with smart truncation. + +### 🛡️ Exec Approval (Messaging Platforms) + +When the agent tries to run a potentially dangerous command (rm -rf, chmod 777, etc.) on Telegram/Discord/WhatsApp, instead of blocking it silently, it asks the user for approval: + +> ⚠️ This command is potentially dangerous (recursive delete). Reply "yes" to approve. + +Reply "yes"/"y" to approve or "no"/"n" to deny. In CLI mode, the existing interactive approval prompt (once/session/always/deny) is preserved. 
@staticmethod
def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
    """
    Extract MEDIA: tags and [[audio_as_voice]] directives from response text.

    The TTS tool returns responses like:
        [[audio_as_voice]]
        MEDIA:/path/to/audio.ogg

    A path is everything after ``MEDIA:`` up to the next whitespace
    character, so paths that contain spaces are NOT supported. The
    ``[[audio_as_voice]]`` directive is global: when it appears anywhere
    in the text, every extracted path is flagged as a voice message.

    Args:
        content: The response text to scan.

    Returns:
        Tuple of (list of (path, is_voice) pairs, cleaned content with tags removed).
        When nothing was removed the content is returned unchanged.
    """
    if not content:
        return [], ""

    voice_tag = "[[audio_as_voice]]"
    media_pattern = r'MEDIA:(\S+)'

    has_voice_tag = voice_tag in content
    media = [
        (match.group(1), has_voice_tag)
        for match in re.finditer(media_pattern, content)
    ]

    # Nothing to strip: hand the text back untouched.
    if not media and not has_voice_tag:
        return media, content

    cleaned = content.replace(voice_tag, "")
    cleaned = re.sub(media_pattern, '', cleaned)
    # Removing a tag or directive leaves blank lines behind; tidy them up
    # whenever *anything* was removed (previously this only happened when
    # a MEDIA tag was present, so a lone directive left stray whitespace).
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()

    return media, cleaned
# --- gateway/platforms/discord.py :: DiscordAdapter.send_voice ---
async def send_voice(
    self,
    chat_id: str,
    audio_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Send audio as a Discord file attachment.

    Args:
        chat_id: Discord channel ID (converted with ``int()``).
        audio_path: Local path of the audio file to upload.
        caption: Optional message text sent with the attachment
            (truncated to 2000 characters).
        reply_to: Accepted for interface compatibility; only used by the
            text fallback.

    Returns:
        SendResult with the new message ID on success. If the upload
        raises, falls back to the base implementation, which sends the
        file path as text.
    """
    if not self._client:
        return SendResult(success=False, error="Not connected")

    try:
        # Imported locally (as the Telegram override does) so this method
        # does not rely on the module's top-level imports.
        import os

        channel = self._client.get_channel(int(chat_id))
        if not channel:
            channel = await self._client.fetch_channel(int(chat_id))
        if not channel:
            return SendResult(success=False, error=f"Channel {chat_id} not found")

        if not os.path.isfile(audio_path):
            return SendResult(success=False, error=f"Audio file not found: {audio_path}")

        # discord.File accepts a path directly, so the file does not have
        # to be read into memory first.
        file = discord.File(audio_path, filename=os.path.basename(audio_path))
        msg = await channel.send(
            content=caption[:2000] if caption else None,
            file=file,
        )
        return SendResult(success=True, message_id=str(msg.id))

    except Exception as e:
        print(f"[{self.name}] Failed to send audio: {e}")
        return await super().send_voice(chat_id, audio_path, caption, reply_to)


# --- gateway/platforms/telegram.py :: TelegramAdapter.send_voice ---
async def send_voice(
    self,
    chat_id: str,
    audio_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Send audio as a native Telegram voice message or audio file.

    Files ending in ``.ogg`` / ``.opus`` (case-insensitive) are sent with
    ``send_voice``; everything else is sent with ``send_audio``.

    Args:
        chat_id: Telegram chat ID (converted with ``int()``).
        audio_path: Local path of the audio file to upload.
        caption: Optional caption (truncated to 1024 characters).
        reply_to: Optional message ID to reply to.

    Returns:
        SendResult with the new message ID on success. If the upload
        raises, falls back to the base implementation, which sends the
        file path as text.
    """
    if not self._bot:
        return SendResult(success=False, error="Not connected")

    try:
        import os
        if not os.path.isfile(audio_path):
            return SendResult(success=False, error=f"Audio file not found: {audio_path}")

        # Arguments shared by both send calls, computed once.
        send_kwargs = dict(
            chat_id=int(chat_id),
            caption=caption[:1024] if caption else None,
            reply_to_message_id=int(reply_to) if reply_to else None,
        )
        # Compare case-insensitively so "VOICE.OGG" is also sent as a voice bubble.
        is_voice = audio_path.lower().endswith((".ogg", ".opus"))

        with open(audio_path, "rb") as audio_file:
            if is_voice:
                msg = await self._bot.send_voice(voice=audio_file, **send_kwargs)
            else:
                # .mp3 and others -> send as audio file
                msg = await self._bot.send_audio(audio=audio_file, **send_kwargs)
        return SendResult(success=True, message_id=str(msg.message_id))
    except Exception as e:
        print(f"[{self.name}] Failed to send voice/audio: {e}")
        return await super().send_voice(chat_id, audio_path, caption, reply_to)
self._pending_approvals: Dict[str, Dict[str, str]] = {} async def start(self) -> bool: """ @@ -246,6 +253,25 @@ class GatewayRunner: if command == "stop": return await self._handle_stop_command(event) + # Check for pending exec approval responses + session_key_preview = f"agent:main:{source.platform.value}:{source.chat_type}:{source.chat_id}" if source.chat_type != "dm" else f"agent:main:{source.platform.value}:dm" + if session_key_preview in self._pending_approvals: + user_text = event.text.strip().lower() + if user_text in ("yes", "y", "approve", "ok", "go", "do it"): + approval = self._pending_approvals.pop(session_key_preview) + cmd = approval["command"] + pattern_key = approval.get("pattern_key", "") + print(f"[gateway] ✅ User approved dangerous command: {cmd[:60]}...") + # Approve for session and re-run via terminal_tool with force=True + from tools.terminal_tool import terminal_tool, _session_approved_patterns + _session_approved_patterns.add(pattern_key) + result = terminal_tool(command=cmd, force=True) + return f"✅ Command approved and executed.\n\n```\n{result[:3500]}\n```" + elif user_text in ("no", "n", "deny", "cancel", "nope"): + self._pending_approvals.pop(session_key_preview) + return "❌ Command denied." 
+ # If it's not clearly an approval/denial, fall through to normal processing + # Get or create session session_entry = self.session_store.get_or_create_session(source) session_key = session_entry.session_key @@ -282,6 +308,17 @@ class GatewayRunner: session_key=session_key ) + # Check if the agent encountered a dangerous command needing approval + # The terminal tool stores the last pending approval globally + try: + from tools.terminal_tool import _last_pending_approval + if _last_pending_approval: + self._pending_approvals[session_key] = _last_pending_approval.copy() + # Clear the global so it doesn't leak to other sessions + _last_pending_approval.clear() + except Exception: + pass + # Append to transcript self.session_store.append_to_transcript( session_entry.session_id, @@ -418,23 +455,35 @@ class GatewayRunner: return last_tool[0] = tool_name - # Build progress message + # Build progress message with primary argument preview tool_emojis = { "terminal": "💻", "web_search": "🔍", "web_extract": "📄", "read_file": "📖", "write_file": "✍️", + "patch": "🔧", + "search": "🔎", "list_directory": "📂", "image_generate": "🎨", + "text_to_speech": "🔊", "browser_navigate": "🌐", "browser_click": "👆", + "browser_type": "⌨️", + "browser_snapshot": "📸", "moa_query": "🧠", + "mixture_of_agents": "🧠", + "vision_analyze": "👁️", + "skill_view": "📚", + "skills_list": "📋", } emoji = tool_emojis.get(tool_name, "⚙️") - if tool_name == "terminal" and preview: - msg = f"{emoji} `{preview}`..." + if preview: + # Truncate preview to keep messages clean + if len(preview) > 40: + preview = preview[:37] + "..." + msg = f"{emoji} {tool_name}... \"{preview}\"" else: msg = f"{emoji} {tool_name}..." 
diff --git a/hermes_cli/config.py b/hermes_cli/config.py index f31cc040d..b7917eab5 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -99,6 +99,24 @@ DEFAULT_CONFIG = { "personality": "kawaii", }, + # Text-to-speech configuration + "tts": { + "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" + "edge": { + "voice": "en-US-AriaNeural", + # Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural + }, + "elevenlabs": { + "voice_id": "pNInz6obpgDQGcFmaJgB", # Adam + "model_id": "eleven_multilingual_v2", + }, + "openai": { + "model": "gpt-4o-mini-tts", + "voice": "alloy", + # Voices: alloy, echo, fable, onyx, nova, shimmer + }, + }, + # Permanently allowed dangerous command patterns (added via "always" approval) "command_allowlist": [], @@ -202,6 +220,13 @@ OPTIONAL_ENV_VARS = { "url": None, "password": False, }, + # Text-to-speech (premium providers) + "ELEVENLABS_API_KEY": { + "description": "ElevenLabs API key for premium text-to-speech voices", + "prompt": "ElevenLabs API key", + "url": "https://elevenlabs.io/", + "password": True, + }, # Terminal configuration "MESSAGING_CWD": { "description": "Working directory for terminal commands via messaging (Telegram/Discord/etc). 
CLI always uses current directory.", diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 3b52dd904..adf024279 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -186,6 +186,11 @@ def _print_setup_summary(config: dict, hermes_home): else: tool_status.append(("Image Generation", False, "FAL_KEY")) + # TTS (always available via Edge TTS; ElevenLabs/OpenAI are optional) + tool_status.append(("Text-to-Speech (Edge TTS)", True, None)) + if get_env_value('ELEVENLABS_API_KEY'): + tool_status.append(("Text-to-Speech (ElevenLabs)", True, None)) + # Tinker + WandB (RL training) if get_env_value('TINKER_API_KEY') and get_env_value('WANDB_API_KEY'): tool_status.append(("RL Training (Tinker)", True, None)) @@ -991,6 +996,28 @@ def run_setup_wizard(args): print_success(" Configured ✓") print() + # ElevenLabs - Premium TTS + print_info("─" * 50) + print(color(" Text-to-Speech - ElevenLabs (Premium)", Colors.CYAN)) + print_info(" Enables: Premium TTS voices (Edge TTS is free and works without a key)") + print_info(" Use case: High-quality, customizable voice synthesis") + if get_env_value('ELEVENLABS_API_KEY'): + print_success(" Status: Configured ✓") + if prompt_yes_no(" Update ElevenLabs API key?", False): + api_key = prompt(" API key", password=True) + if api_key: + save_env_value("ELEVENLABS_API_KEY", api_key) + print_success(" Updated") + else: + print_warning(" Status: Not configured (free Edge TTS will be used by default)") + if prompt_yes_no(" Set up ElevenLabs?", False): + print_info(" Get your API key at: https://elevenlabs.io/") + api_key = prompt(" API key", password=True) + if api_key: + save_env_value("ELEVENLABS_API_KEY", api_key) + print_success(" Configured ✓") + print() + # Tinker + WandB - RL Training print_info("─" * 50) print(color(" RL Training (Tinker + WandB)", Colors.CYAN)) diff --git a/hermes_cli/status.py b/hermes_cli/status.py index bbbdc2af5..10064e425 100644 --- a/hermes_cli/status.py +++ b/hermes_cli/status.py @@ -76,6 +76,7 @@ def 
def get_tts_tool_definitions() -> List[Dict[str, Any]]:
    """
    Get tool definitions for text-to-speech tools in OpenAI's expected format.

    Returns:
        List[Dict]: List of TTS tool definitions compatible with OpenAI API
    """
    return [
        {
            "type": "function",
            "function": {
                "name": "text_to_speech",
                "description": "Convert text to speech audio. Returns a MEDIA: path that the platform delivers as a voice message. On Telegram it plays as a voice bubble, on Discord/WhatsApp as an audio attachment. In CLI mode, saves to ~/voice-memos/. Voice and provider are user-configured, not model-selected.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "text": {
                            "type": "string",
                            "description": "The text to convert to speech. Keep under 4000 characters."
                        },
                        "output_path": {
                            "type": "string",
                            "description": "Optional custom file path to save the audio. Defaults to an auto-named .mp3 file in ~/voice-memos/"
                        }
                    },
                    "required": ["text"]
                }
            }
        }
    ]


def handle_tts_function_call(
    function_name: str,
    function_args: Dict[str, Any]
) -> str:
    """
    Handle function calls for text-to-speech tools.

    Args:
        function_name (str): Name of the TTS function to call
        function_args (Dict): Arguments for the function; ``None`` is
            treated like an empty dict

    Returns:
        str: Function result as JSON string. Unknown function names and a
            missing/blank ``text`` argument yield ``{"error": ...}`` without
            invoking the TTS backend.
    """
    if function_name != "text_to_speech":
        return json.dumps({"error": f"Unknown TTS function: {function_name}"}, ensure_ascii=False)

    args = function_args or {}
    text = args.get("text", "")
    # Reject empty input here rather than spending a provider call on it.
    if not isinstance(text, str) or not text.strip():
        return json.dumps(
            {"error": "text_to_speech requires a non-empty 'text' argument"},
            ensure_ascii=False,
        )

    return text_to_speech_tool(text=text, output_path=args.get("output_path"))
# =============================================================================
# Context File Injection (SOUL.md, AGENTS.md, .cursorrules)
# =============================================================================

# Maximum characters per context file before truncation
CONTEXT_FILE_MAX_CHARS = 20_000
# Truncation strategy: keep 70% from the head, 20% from the tail
CONTEXT_TRUNCATE_HEAD_RATIO = 0.7
CONTEXT_TRUNCATE_TAIL_RATIO = 0.2

# Tool name -> the argument that best summarizes a call for display.
_TOOL_PRIMARY_ARGS = {
    "terminal": "command",
    "web_search": "query",
    "web_extract": "urls",
    "read_file": "path",
    "write_file": "path",
    "patch": "path",
    "search": "pattern",
    "browser_navigate": "url",
    "browser_click": "ref",
    "browser_type": "text",
    "image_generate": "prompt",
    "text_to_speech": "text",
    "vision_analyze": "question",
    "mixture_of_agents": "user_prompt",
    "skill_view": "name",
    "skills_list": "category",
    "schedule_cronjob": "name",
}
# Argument names tried, in order, for tools missing from _TOOL_PRIMARY_ARGS.
_PREVIEW_FALLBACK_KEYS = ("query", "text", "command", "path", "name", "prompt")


def _truncate_content(content: str, filename: str, max_chars: int = CONTEXT_FILE_MAX_CHARS) -> str:
    """
    Truncate content if it exceeds max_chars using a head/tail strategy.

    Keeps CONTEXT_TRUNCATE_HEAD_RATIO of max_chars from the start and
    CONTEXT_TRUNCATE_TAIL_RATIO from the end, with a truncation marker in
    the middle so the model knows content was cut.

    Args:
        content: Text to truncate.
        filename: Name shown in the truncation marker.
        max_chars: Budget the head/tail sizes are derived from.

    Returns:
        ``content`` unchanged when it fits, otherwise head + marker + tail.
    """
    if len(content) <= max_chars:
        return content

    head_chars = max(int(max_chars * CONTEXT_TRUNCATE_HEAD_RATIO), 0)
    tail_chars = max(int(max_chars * CONTEXT_TRUNCATE_TAIL_RATIO), 0)
    head = content[:head_chars]
    # content[-0:] is the WHOLE string, so a zero-length tail has to be
    # special-cased or tiny budgets would return more text than they got.
    tail = content[-tail_chars:] if tail_chars > 0 else ""

    marker = f"\n\n[...truncated {filename}: kept {head_chars}+{tail_chars} of {len(content)} chars. Use file tools to read the full file.]\n\n"
    return head + marker + tail


def _read_context_file(path) -> str:
    """Return the stripped UTF-8 text of ``path``, or "" if it cannot be read."""
    import logging

    try:
        return path.read_text(encoding="utf-8").strip()
    except (OSError, UnicodeError) as err:
        # Best effort: an unreadable context file must never break the agent,
        # but leave a trace instead of swallowing the failure silently.
        logging.debug("Skipping unreadable context file %s: %s", path, err)
        return ""


def build_context_files_prompt(cwd: str = None) -> str:
    """
    Discover and load context files (SOUL.md, AGENTS.md, .cursorrules)
    for injection into the system prompt.

    Discovery rules:
      - AGENTS.md: Recursively search from cwd (only if top-level exists).
        Hidden directories, node_modules, __pycache__ and venv are skipped.
        Each file becomes a ## section with its relative path; sections are
        ordered by depth, then path, and truncated as one combined unit.
      - .cursorrules: Check cwd for .cursorrules file and .cursor/rules/*.mdc
      - SOUL.md: Check cwd first, then ~/.hermes/SOUL.md as global fallback

    Args:
        cwd: Working directory to search from. Defaults to os.getcwd().

    Returns:
        str: The context files prompt section, or empty string if none found.
    """
    import os
    from pathlib import Path

    cwd_path = Path(cwd if cwd is not None else os.getcwd()).resolve()
    sections = []

    # ----- AGENTS.md (hierarchical, recursive) -----
    if any((cwd_path / name).exists() for name in ("AGENTS.md", "agents.md")):
        skip_dirs = {'node_modules', '__pycache__', 'venv'}
        agents_files = []
        for root, dirs, files in os.walk(cwd_path):
            # Prune in place so os.walk does not descend into hidden
            # directories or common non-project dirs.
            dirs[:] = [d for d in dirs if not d.startswith('.') and d not in skip_dirs]
            agents_files.extend(Path(root) / f for f in files if f.lower() == "agents.md")

        # Top-level first, then deeper; the path is a tie-breaker so the
        # order does not depend on directory listing order.
        agents_files.sort(key=lambda p: (len(p.parts), str(p)))

        agents_parts = []
        for agents_path in agents_files:
            content = _read_context_file(agents_path)
            if content:
                rel_path = agents_path.relative_to(cwd_path)
                agents_parts.append(f"## {rel_path}\n\n{content}\n\n")

        if agents_parts:
            sections.append(_truncate_content("".join(agents_parts), "AGENTS.md"))

    # ----- .cursorrules -----
    cursor_parts = []

    cursorrules_file = cwd_path / ".cursorrules"
    if cursorrules_file.is_file():
        content = _read_context_file(cursorrules_file)
        if content:
            cursor_parts.append(f"## .cursorrules\n\n{content}\n\n")

    cursor_rules_dir = cwd_path / ".cursor" / "rules"
    if cursor_rules_dir.is_dir():
        for mdc_file in sorted(cursor_rules_dir.glob("*.mdc")):
            content = _read_context_file(mdc_file)
            if content:
                cursor_parts.append(f"## .cursor/rules/{mdc_file.name}\n\n{content}\n\n")

    if cursor_parts:
        sections.append(_truncate_content("".join(cursor_parts), ".cursorrules"))

    # ----- SOUL.md (cwd first, then ~/.hermes/ fallback) -----
    soul_path = next(
        (cwd_path / name for name in ("SOUL.md", "soul.md") if (cwd_path / name).exists()),
        None,
    )
    if soul_path is None:
        global_soul = Path.home() / ".hermes" / "SOUL.md"
        if global_soul.exists():
            soul_path = global_soul

    if soul_path is not None:
        content = _read_context_file(soul_path)
        if content:
            content = _truncate_content(content, "SOUL.md")
            sections.append(
                f"## SOUL.md\n\nIf SOUL.md is present, embody its persona and tone. Avoid stiff, generic replies; follow its guidance unless higher-priority instructions override it.\n\n{content}"
            )

    # ----- Assemble -----
    if not sections:
        return ""

    return "# Project Context\n\nThe following project context files have been loaded and should be followed:\n\n" + "\n".join(sections)


def _build_tool_preview(tool_name: str, args: dict, max_len: int = 40) -> "Optional[str]":
    """
    Build a short, single-line preview of a tool call's primary argument.

    Args:
        tool_name: Name of the tool being called
        args: The tool call arguments dict
        max_len: Maximum preview length, including the "..." suffix

    Returns:
        The whitespace-collapsed, truncated value of the most informative
        argument, or None when there is nothing meaningful to show
        (unknown/missing argument, empty or None value, non-dict ``args``).
    """
    if not isinstance(args, dict) or not args:
        return None

    key = _TOOL_PRIMARY_ARGS.get(tool_name)
    if not key:
        # Unknown tool: try common argument names instead.
        key = next((k for k in _PREVIEW_FALLBACK_KEYS if k in args), None)

    if not key or key not in args:
        return None

    value = args[key]

    # Handle list values (e.g., urls): preview the first entry
    if isinstance(value, (list, tuple)):
        value = value[0] if value else ""
    if value is None:
        return None

    # Collapse newlines/runs of whitespace so multi-line commands or texts
    # stay on one line in progress messages.
    preview = " ".join(str(value).split())
    if not preview:
        return None

    if len(preview) > max_len:
        if max_len > 3:
            preview = preview[:max_len - 3] + "..."
        else:
            # No room for an ellipsis; a negative slice would be wrong here.
            preview = preview[:max(max_len, 0)]

    return preview or None
{time_str}" + return f"{face} 🧠💭 deep thinking \"{prompt}\" {time_str}" - # Default fallback - random generic kawaii + # Default fallback - random generic kawaii with primary arg preview else: face = random.choice(self.KAWAII_GENERIC) + preview = _build_tool_preview(tool_name, args) + if preview: + return f"{face} ⚡ {tool_name}... \"{preview}\" {time_str}" return f"{face} ⚡ {tool_name}... {time_str}" def _has_content_after_think_block(self, content: str) -> bool: @@ -1708,6 +1972,15 @@ class AIAgent: else: active_system_prompt = base_system_prompt + # Auto-include context files (SOUL.md, AGENTS.md, .cursorrules) + # Discovered from cwd and injected as # Project Context sections. + context_files_prompt = build_context_files_prompt() + if context_files_prompt: + if active_system_prompt: + active_system_prompt = f"{active_system_prompt}\n\n{context_files_prompt}" + else: + active_system_prompt = context_files_prompt + # Main conversation loop api_call_count = 0 final_response = None @@ -2314,12 +2587,8 @@ class AIAgent: # Fire progress callback if registered (for messaging platforms) if self.tool_progress_callback: try: - # Build preview for terminal commands - if function_name == "terminal": - cmd = function_args.get("command", "") - preview = cmd[:50] + "..." if len(cmd) > 50 else cmd - else: - preview = None + # Build a short preview of the primary argument + preview = _build_tool_preview(function_name, function_args) self.tool_progress_callback(function_name, preview) except Exception as cb_err: logging.debug(f"Tool progress callback error: {cb_err}") diff --git a/scripts/install.ps1 b/scripts/install.ps1 index 25a4159e2..2ec25f324 100644 --- a/scripts/install.ps1 +++ b/scripts/install.ps1 @@ -262,6 +262,25 @@ function Test-Ripgrep { return $true # Don't fail - ripgrep is optional } +function Test-Ffmpeg { + Write-Info "Checking ffmpeg (optional, for TTS voice messages)..." 
+ + if (Get-Command ffmpeg -ErrorAction SilentlyContinue) { + $version = ffmpeg -version 2>&1 | Select-Object -First 1 + Write-Success "ffmpeg found" + $script:HasFfmpeg = $true + return $true + } + + Write-Warn "ffmpeg not found (TTS voice bubbles on Telegram will send as audio files instead)" + Write-Info " Install with: winget install ffmpeg" + Write-Info " Or: choco install ffmpeg" + Write-Info " Or download from: https://ffmpeg.org/download.html" + + $script:HasFfmpeg = $false + return $true # Don't fail - ffmpeg is optional +} + # ============================================================================ # Installation # ============================================================================ @@ -567,6 +586,7 @@ function Main { if (-not (Test-Git)) { exit 1 } Test-Node # Optional, doesn't fail Test-Ripgrep # Optional, doesn't fail + Test-Ffmpeg # Optional, doesn't fail Install-Repository Install-Venv diff --git a/scripts/install.sh b/scripts/install.sh index 09f93cb76..f913198c6 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -413,6 +413,45 @@ check_ripgrep() { # Don't exit - ripgrep is optional (grep fallback exists) } +check_ffmpeg() { + log_info "Checking ffmpeg (optional, for TTS voice messages)..." 
+ + if command -v ffmpeg &> /dev/null; then + local ffmpeg_version=$(ffmpeg -version 2>/dev/null | head -1 | awk '{print $3}') + log_success "ffmpeg found: $ffmpeg_version" + HAS_FFMPEG=true + return + fi + + log_warn "ffmpeg not found (TTS voice bubbles on Telegram will send as audio files instead)" + log_info "To install ffmpeg (optional):" + + case "$OS" in + linux) + case "$DISTRO" in + ubuntu|debian) + log_info " sudo apt install ffmpeg" + ;; + fedora) + log_info " sudo dnf install ffmpeg" + ;; + arch) + log_info " sudo pacman -S ffmpeg" + ;; + *) + log_info " https://ffmpeg.org/download.html" + ;; + esac + ;; + macos) + log_info " brew install ffmpeg" + ;; + esac + + HAS_FFMPEG=false + # Don't exit - ffmpeg is optional +} + # ============================================================================ # Installation # ============================================================================ @@ -707,6 +746,7 @@ main() { check_git check_node check_ripgrep + check_ffmpeg clone_repo setup_venv diff --git a/tools/__init__.py b/tools/__init__.py index b69dbbce7..f169949ae 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -122,6 +122,12 @@ from .file_tools import ( clear_file_ops_cache, ) +# Text-to-speech tools (Edge TTS / ElevenLabs / OpenAI) +from .tts_tool import ( + text_to_speech_tool, + check_tts_requirements, +) + # File tools have no external requirements - they use the terminal backend def check_file_requirements(): """File tools only require terminal backend to be available.""" @@ -207,5 +213,8 @@ __all__ = [ 'get_file_tools', 'clear_file_ops_cache', 'check_file_requirements', + # Text-to-speech tools + 'text_to_speech_tool', + 'check_tts_requirements', ] diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py index 2de2997ff..bd6504595 100644 --- a/tools/terminal_tool.py +++ b/tools/terminal_tool.py @@ -237,6 +237,10 @@ _cached_sudo_password: str = "" # Session-cached dangerous command approvals (pattern -> approved) 
_session_approved_patterns: set = set() +# Last approval-required command (for gateway to pick up) +# Set by _check_dangerous_command when in ask mode, read by gateway +_last_pending_approval: dict = {} + # Dangerous command patterns (regex, description) DANGEROUS_PATTERNS = [ (r'\brm\s+(-[^\s]*\s+)*/', "delete in root path"), @@ -408,12 +412,22 @@ def _check_dangerous_command(command: str, env_type: str) -> dict: # Programmatic use - allow (user opted into local backend) return {"approved": True, "message": None} - if is_gateway: - # Messaging context - return informative denial, agent should ask user + if is_gateway or os.getenv("HERMES_EXEC_ASK"): + # Messaging context - return approval_required so the gateway can + # prompt the user interactively instead of just blocking + global _last_pending_approval + _last_pending_approval = { + "command": command, + "pattern_key": pattern_key, + "description": description, + } return { "approved": False, "pattern_key": pattern_key, - "message": f"BLOCKED: This command is potentially dangerous ({description}). Tell the user and ask if they want to add this command pattern to their allowlist. They can do this via 'hermes config edit' or by running the command directly on their machine." + "status": "approval_required", + "command": command, + "description": description, + "message": f"⚠️ This command is potentially dangerous ({description}). Asking the user for approval..." 
} # CLI context - prompt user @@ -1586,6 +1600,17 @@ def terminal_tool( if not force: approval = _check_dangerous_command(command, env_type) if not approval["approved"]: + # Check if this is an approval_required (gateway ask mode) + if approval.get("status") == "approval_required": + return json.dumps({ + "output": "", + "exit_code": -1, + "error": approval.get("message", "Waiting for user approval"), + "status": "approval_required", + "command": approval.get("command", command), + "description": approval.get("description", "dangerous command"), + "pattern_key": approval.get("pattern_key", ""), + }, ensure_ascii=False) # Command was blocked - return informative message return json.dumps({ "output": "", diff --git a/tools/tts_tool.py b/tools/tts_tool.py new file mode 100644 index 000000000..5129196a0 --- /dev/null +++ b/tools/tts_tool.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +""" +Text-to-Speech Tool Module + +Supports three TTS providers: +- Edge TTS (default, free, no API key): Microsoft Edge neural voices +- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY +- OpenAI TTS: Good quality, needs OPENAI_API_KEY + +Output formats: +- Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS) +- MP3 (.mp3) for everything else (CLI, Discord, WhatsApp) + +Configuration is loaded from ~/.hermes/config.yaml under the 'tts:' key. +The user chooses the provider and voice; the model just sends text. 
+ +Usage: + from tools.tts_tool import text_to_speech_tool, check_tts_requirements + + result = text_to_speech_tool(text="Hello world") +""" + +import asyncio +import datetime +import json +import os +import shutil +import subprocess +import tempfile +from pathlib import Path +from typing import Dict, Any, Optional + +# --------------------------------------------------------------------------- +# Optional imports -- providers degrade gracefully if not installed +# --------------------------------------------------------------------------- +try: + import edge_tts + _HAS_EDGE_TTS = True +except ImportError: + _HAS_EDGE_TTS = False + +try: + from elevenlabs.client import ElevenLabs + _HAS_ELEVENLABS = True +except ImportError: + _HAS_ELEVENLABS = False + +# openai is a core dependency, but guard anyway +try: + from openai import OpenAI as OpenAIClient + _HAS_OPENAI = True +except ImportError: + _HAS_OPENAI = False + + +# =========================================================================== +# Defaults +# =========================================================================== +DEFAULT_PROVIDER = "edge" +DEFAULT_EDGE_VOICE = "en-US-AriaNeural" +DEFAULT_ELEVENLABS_VOICE_ID = "pNInz6obpgDQGcFmaJgB" # Adam +DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2" +DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts" +DEFAULT_OPENAI_VOICE = "alloy" +DEFAULT_OUTPUT_DIR = os.path.expanduser("~/voice-memos") +MAX_TEXT_LENGTH = 4000 + + +# =========================================================================== +# Config loader -- reads tts: section from ~/.hermes/config.yaml +# =========================================================================== +def _load_tts_config() -> Dict[str, Any]: + """ + Load TTS configuration from ~/.hermes/config.yaml. + + Returns a dict with provider settings. Falls back to defaults + for any missing fields. 
+ """ + try: + from hermes_cli.config import load_config + config = load_config() + return config.get("tts", {}) + except Exception: + return {} + + +def _get_provider(tts_config: Dict[str, Any]) -> str: + """Get the configured TTS provider name.""" + return tts_config.get("provider", DEFAULT_PROVIDER).lower().strip() + + +# =========================================================================== +# ffmpeg Opus conversion (Edge TTS MP3 -> OGG Opus for Telegram) +# =========================================================================== +def _has_ffmpeg() -> bool: + """Check if ffmpeg is available on the system.""" + return shutil.which("ffmpeg") is not None + + +def _convert_to_opus(mp3_path: str) -> Optional[str]: + """ + Convert an MP3 file to OGG Opus format for Telegram voice bubbles. + + Args: + mp3_path: Path to the input MP3 file. + + Returns: + Path to the .ogg file, or None if conversion fails. + """ + if not _has_ffmpeg(): + return None + + ogg_path = mp3_path.rsplit(".", 1)[0] + ".ogg" + try: + subprocess.run( + ["ffmpeg", "-i", mp3_path, "-acodec", "libopus", + "-ac", "1", "-b:a", "64k", "-vbr", "off", ogg_path, "-y"], + capture_output=True, timeout=30, + ) + if os.path.exists(ogg_path) and os.path.getsize(ogg_path) > 0: + return ogg_path + except Exception: + pass + return None + + +# =========================================================================== +# Provider: Edge TTS (free) +# =========================================================================== +async def _generate_edge_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str: + """ + Generate audio using Edge TTS. + + Args: + text: Text to convert. + output_path: Where to save the MP3 file. + tts_config: TTS config dict. + + Returns: + Path to the saved audio file. 
+ """ + edge_config = tts_config.get("edge", {}) + voice = edge_config.get("voice", DEFAULT_EDGE_VOICE) + + communicate = edge_tts.Communicate(text, voice) + await communicate.save(output_path) + return output_path + + +# =========================================================================== +# Provider: ElevenLabs (premium) +# =========================================================================== +def _generate_elevenlabs(text: str, output_path: str, tts_config: Dict[str, Any]) -> str: + """ + Generate audio using ElevenLabs. + + Args: + text: Text to convert. + output_path: Where to save the audio file. + tts_config: TTS config dict. + + Returns: + Path to the saved audio file. + """ + api_key = os.getenv("ELEVENLABS_API_KEY", "") + if not api_key: + raise ValueError("ELEVENLABS_API_KEY not set. Get one at https://elevenlabs.io/") + + el_config = tts_config.get("elevenlabs", {}) + voice_id = el_config.get("voice_id", DEFAULT_ELEVENLABS_VOICE_ID) + model_id = el_config.get("model_id", DEFAULT_ELEVENLABS_MODEL_ID) + + # Determine output format based on file extension + if output_path.endswith(".ogg"): + output_format = "opus_48000_64" + else: + output_format = "mp3_44100_128" + + client = ElevenLabs(api_key=api_key) + audio_generator = client.text_to_speech.convert( + text=text, + voice_id=voice_id, + model_id=model_id, + output_format=output_format, + ) + + # audio_generator yields chunks -- write them all + with open(output_path, "wb") as f: + for chunk in audio_generator: + f.write(chunk) + + return output_path + + +# =========================================================================== +# Provider: OpenAI TTS +# =========================================================================== +def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str: + """ + Generate audio using OpenAI TTS. + + Args: + text: Text to convert. + output_path: Where to save the audio file. + tts_config: TTS config dict. 
+ + Returns: + Path to the saved audio file. + """ + api_key = os.getenv("OPENAI_API_KEY", "") + if not api_key: + raise ValueError("OPENAI_API_KEY not set. Get one at https://platform.openai.com/api-keys") + + oai_config = tts_config.get("openai", {}) + model = oai_config.get("model", DEFAULT_OPENAI_MODEL) + voice = oai_config.get("voice", DEFAULT_OPENAI_VOICE) + + # Determine response format from extension + if output_path.endswith(".ogg"): + response_format = "opus" + else: + response_format = "mp3" + + client = OpenAIClient(api_key=api_key) + response = client.audio.speech.create( + model=model, + voice=voice, + input=text, + response_format=response_format, + ) + + response.stream_to_file(output_path) + return output_path + + +# =========================================================================== +# Main tool function +# =========================================================================== +def text_to_speech_tool( + text: str, + output_path: Optional[str] = None, +) -> str: + """ + Convert text to speech audio. + + Reads provider/voice config from ~/.hermes/config.yaml (tts: section). + The model sends text; the user configures voice and provider. + + On messaging platforms, the returned MEDIA: tag is intercepted + by the send pipeline and delivered as a native voice message. + In CLI mode, the file is saved to ~/voice-memos/. + + Args: + text: The text to convert to speech. + output_path: Optional custom save path. Defaults to ~/voice-memos/.mp3 + + Returns: + str: JSON result with success, file_path, and optionally MEDIA tag. 
+ """ + if not text or not text.strip(): + return json.dumps({"success": False, "error": "Text is required"}, ensure_ascii=False) + + # Truncate very long text with a warning + if len(text) > MAX_TEXT_LENGTH: + print(f"⚠️ TTS text too long ({len(text)} chars), truncating to {MAX_TEXT_LENGTH}") + text = text[:MAX_TEXT_LENGTH] + + tts_config = _load_tts_config() + provider = _get_provider(tts_config) + + # Determine output path + if output_path: + file_path = Path(output_path).expanduser() + else: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + out_dir = Path(DEFAULT_OUTPUT_DIR) + out_dir.mkdir(parents=True, exist_ok=True) + file_path = out_dir / f"tts_{timestamp}.mp3" + + # Ensure parent directory exists + file_path.parent.mkdir(parents=True, exist_ok=True) + file_str = str(file_path) + + try: + # Generate audio with the configured provider + if provider == "elevenlabs": + if not _HAS_ELEVENLABS: + return json.dumps({ + "success": False, + "error": "ElevenLabs provider selected but 'elevenlabs' package not installed. Run: pip install elevenlabs" + }, ensure_ascii=False) + print(f"🔊 Generating speech with ElevenLabs...") + _generate_elevenlabs(text, file_str, tts_config) + + elif provider == "openai": + if not _HAS_OPENAI: + return json.dumps({ + "success": False, + "error": "OpenAI provider selected but 'openai' package not installed." + }, ensure_ascii=False) + print(f"🔊 Generating speech with OpenAI TTS...") + _generate_openai_tts(text, file_str, tts_config) + + else: + # Default: Edge TTS (free) + if not _HAS_EDGE_TTS: + return json.dumps({ + "success": False, + "error": "Edge TTS not available. 
Run: pip install edge-tts" + }, ensure_ascii=False) + print(f"🔊 Generating speech with Edge TTS...") + # Edge TTS is async, run it + try: + loop = asyncio.get_running_loop() + import concurrent.futures + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: + pool.submit( + lambda: asyncio.run(_generate_edge_tts(text, file_str, tts_config)) + ).result(timeout=60) + except RuntimeError: + asyncio.run(_generate_edge_tts(text, file_str, tts_config)) + + # Check the file was actually created + if not os.path.exists(file_str) or os.path.getsize(file_str) == 0: + return json.dumps({ + "success": False, + "error": f"TTS generation produced no output (provider: {provider})" + }, ensure_ascii=False) + + # Try Opus conversion for Telegram compatibility (Edge TTS only outputs MP3) + voice_compatible = False + if provider == "edge" and file_str.endswith(".mp3"): + opus_path = _convert_to_opus(file_str) + if opus_path: + file_str = opus_path + voice_compatible = True + elif provider in ("elevenlabs", "openai"): + # These providers can output Opus natively if the path ends in .ogg + voice_compatible = file_str.endswith(".ogg") + + file_size = os.path.getsize(file_str) + print(f"✅ TTS audio saved: {file_str} ({file_size:,} bytes, provider: {provider})") + + # Build response with MEDIA tag for platform delivery + media_tag = f"MEDIA:{file_str}" + if voice_compatible: + media_tag = f"[[audio_as_voice]]\n{media_tag}" + + return json.dumps({ + "success": True, + "file_path": file_str, + "media_tag": media_tag, + "provider": provider, + "voice_compatible": voice_compatible, + }, ensure_ascii=False) + + except Exception as e: + error_msg = f"TTS generation failed ({provider}): {e}" + print(f"❌ {error_msg}") + return json.dumps({"success": False, "error": error_msg}, ensure_ascii=False) + + +# =========================================================================== +# Requirements check +# =========================================================================== +def 
check_tts_requirements() -> bool: + """ + Check if at least one TTS provider is available. + + Edge TTS needs no API key and is the default, so if the package + is installed, TTS is available. + + Returns: + bool: True if at least one provider can work. + """ + if _HAS_EDGE_TTS: + return True + if _HAS_ELEVENLABS and os.getenv("ELEVENLABS_API_KEY"): + return True + if _HAS_OPENAI and os.getenv("OPENAI_API_KEY"): + return True + return False + + +# =========================================================================== +# Main -- quick diagnostics +# =========================================================================== +if __name__ == "__main__": + print("🔊 Text-to-Speech Tool Module") + print("=" * 50) + + print(f"\nProvider availability:") + print(f" Edge TTS: {'✅ installed' if _HAS_EDGE_TTS else '❌ not installed (pip install edge-tts)'}") + print(f" ElevenLabs: {'✅ installed' if _HAS_ELEVENLABS else '❌ not installed (pip install elevenlabs)'}") + print(f" API Key: {'✅ set' if os.getenv('ELEVENLABS_API_KEY') else '❌ not set'}") + print(f" OpenAI: {'✅ installed' if _HAS_OPENAI else '❌ not installed'}") + print(f" API Key: {'✅ set' if os.getenv('OPENAI_API_KEY') else '❌ not set'}") + print(f" ffmpeg: {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}") + print(f"\n Output dir: {DEFAULT_OUTPUT_DIR}") + + config = _load_tts_config() + provider = _get_provider(config) + print(f" Configured provider: {provider}") diff --git a/toolsets.py b/toolsets.py index 7896d1ecd..c90628314 100644 --- a/toolsets.py +++ b/toolsets.py @@ -108,6 +108,12 @@ TOOLSETS = { "includes": [] }, + "tts": { + "description": "Text-to-speech: convert text to audio with Edge TTS (free), ElevenLabs, or OpenAI", + "tools": ["text_to_speech"], + "includes": [] + }, + # Scenario-specific toolsets "debugging": { @@ -148,6 +154,8 @@ TOOLSETS = { "browser_type", "browser_scroll", "browser_back", "browser_press", "browser_close", "browser_get_images", "browser_vision", + # 
Text-to-speech + "text_to_speech", # Cronjob management (CLI-only) "schedule_cronjob", "list_cronjobs", "remove_cronjob" ], @@ -171,6 +179,8 @@ TOOLSETS = { "vision_analyze", # Image generation "image_generate", + # Text-to-speech + "text_to_speech", # Skills - access knowledge base "skills_list", "skill_view", # Cronjob management - let users schedule tasks @@ -192,6 +202,8 @@ TOOLSETS = { "vision_analyze", # Image generation "image_generate", + # Text-to-speech + "text_to_speech", # Skills - access knowledge base "skills_list", "skill_view", # Cronjob management - let users schedule tasks @@ -213,6 +225,8 @@ TOOLSETS = { "vision_analyze", # Image generation "image_generate", + # Text-to-speech + "text_to_speech", # Skills "skills_list", "skill_view", # Cronjob management