diff --git a/docs/messaging.md b/docs/messaging.md index fa3d447dd..fb52ed4d1 100644 --- a/docs/messaging.md +++ b/docs/messaging.md @@ -307,6 +307,28 @@ This is intentional: CLI users are in a terminal and expect the agent to work in If the agent hits the max iteration limit while working, instead of a generic error, it asks the model to summarize what it found so far. This gives you a useful response even when the task couldn't be fully completed. +## Voice Messages (TTS) + +The `text_to_speech` tool generates audio that the gateway delivers as native voice messages on each platform: + +| Platform | Delivery | Format | +|----------|----------|--------| +| Telegram | Voice bubble (plays inline) | Opus `.ogg` (converted from MP3 via ffmpeg) | +| Discord | Audio file attachment | MP3 | +| WhatsApp | Audio file attachment | MP3 | +| CLI | Saved to `~/voice-memos/` | MP3 (or Opus if ffmpeg available) | + +**Providers:** +- **Edge TTS** (default) — Free, no API key, 322 voices in 74 languages +- **ElevenLabs** — Premium quality, requires `ELEVENLABS_API_KEY` +- **OpenAI TTS** — Good quality, requires `OPENAI_API_KEY` + +Voice and provider are configured by the user in `~/.hermes/config.yaml` under the `tts:` key. The model only sends text; it does not choose the voice. + +The tool returns a `MEDIA:` tag that the gateway send pipeline intercepts and delivers as a native audio message. If `[[audio_as_voice]]` is present (Opus format available), Telegram sends it as a voice bubble instead of an audio file. + +> **Note:** Telegram voice bubbles require `ffmpeg` for Opus conversion (Edge TTS outputs MP3). Install with `apt install ffmpeg` or `brew install ffmpeg`. Without ffmpeg, audio is sent as a regular file. + ## Cron Job Delivery When scheduling cron jobs, you can specify where the output should be delivered: diff --git a/docs/tools.md b/docs/tools.md index 4c84c83c1..3fc60e14e 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -40,11 +40,15 @@ async def web_search(query: str) -> dict: |----------|--------|-------| | **Web** | `web_tools.py` | `web_search`, `web_extract`, `web_crawl` | | **Terminal** | `terminal_tool.py` | `terminal` (local/docker/singularity/modal/ssh backends) | +| **File** | `file_tools.py` | `read_file`, `write_file`, `patch`, `search` | | **Browser** | `browser_tool.py` | `browser_navigate`, `browser_click`, `browser_type`, etc. | | **Vision** | `vision_tools.py` | `vision_analyze` | | **Image Gen** | `image_generation_tool.py` | `image_generate` | +| **TTS** | `tts_tool.py` | `text_to_speech` (Edge TTS free / ElevenLabs / OpenAI) | | **Reasoning** | `mixture_of_agents_tool.py` | `mixture_of_agents` | -| **Skills** | `skills_tool.py` | `skills_categories`, `skills_list`, `skill_view` | +| **Skills** | `skills_tool.py` | `skills_list`, `skill_view` | +| **Cronjob** | `cronjob_tools.py` | `schedule_cronjob`, `list_cronjobs`, `remove_cronjob` | +| **RL Training** | `rl_training_tool.py` | `rl_list_environments`, `rl_start_training`, `rl_check_status`, etc. | ## Tool Registration diff --git a/gateway/run.py b/gateway/run.py index e3ea0f03b..901f581cf 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -15,6 +15,7 @@ Usage: import asyncio import os +import re import sys import signal from pathlib import Path @@ -583,13 +584,37 @@ class GatewayRunner: # Return final response, or a message if something went wrong final_response = result.get("final_response") - if final_response: - return final_response - elif result.get("error"): - # Agent couldn't recover - show the error - return f"⚠️ {result['error']}" - else: + if not final_response: + if result.get("error"): + return f"⚠️ {result['error']}" return "(No response generated)" + + # Scan tool results in the conversation for MEDIA: tags. + # The TTS tool (and potentially other media-producing tools) embed + # MEDIA: tags in their JSON responses, but the model's final reply + # typically doesn't include them -- it just says "here you go". + # We collect those tags and append them to the final response so + # the adapter's extract_media() can find and deliver the files. + media_tags = [] + for msg in result.get("messages", []): + if msg.get("role") == "tool" or (msg.get("role") == "function"): + content = msg.get("content", "") + if "MEDIA:" in content: + # Extract MEDIA: tags from tool result (may be inside JSON). + # Strip trailing JSON artifacts like quotes and commas that + # get caught by the \S+ when the tag is inside a JSON string. + for match in re.finditer(r'MEDIA:(\S+)', content): + path = match.group(1).strip().rstrip('",}') + if path: + media_tags.append(f"MEDIA:{path}") + # Also capture the [[audio_as_voice]] directive + if "[[audio_as_voice]]" in content: + media_tags.insert(0, "[[audio_as_voice]]") + + if media_tags: + final_response = final_response + "\n" + "\n".join(media_tags) + + return final_response # Start progress message sender if enabled progress_task = None diff --git a/pyproject.toml b/pyproject.toml index 30565e088..30bf3563d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,8 @@ dependencies = [ # Tools "firecrawl-py", "fal-client", + # Text-to-speech (Edge TTS is free, no API key needed) + "edge-tts", # mini-swe-agent deps (terminal tool) "litellm>=1.75.5", "typer", @@ -39,12 +41,14 @@ dev = ["pytest", "pytest-asyncio"] messaging = ["python-telegram-bot>=20.0", "discord.py>=2.0", "aiohttp>=3.9.0"] cron = ["croniter"] cli = ["simple-term-menu"] +tts-premium = ["elevenlabs"] all = [ "hermes-agent[modal]", "hermes-agent[messaging]", "hermes-agent[cron]", "hermes-agent[cli]", "hermes-agent[dev]", + "hermes-agent[tts-premium]", ] [project.scripts]