diff --git a/README.md b/README.md index da9e29579..4494cd4a8 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,7 @@ Convert text to speech with three providers: | **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` | | **OpenAI TTS** | Good | Paid | `OPENAI_API_KEY` | -On Telegram, audio plays as native voice bubbles. On Discord/WhatsApp, sent as audio files. In CLI mode, saved to `~/voice-memos/`. +On Telegram, audio plays as native voice bubbles (the round, inline-playable kind). On Discord/WhatsApp, it is sent as an audio file attachment. In CLI mode, it is saved to `~/voice-memos/`. **Configure in `~/.hermes/config.yaml`:** ```yaml @@ -159,7 +159,22 @@ tts: voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer ``` -> **Note:** Telegram voice bubbles require `ffmpeg` for Opus conversion (Edge TTS only outputs MP3). Install with `apt install ffmpeg` or `brew install ffmpeg`. Without ffmpeg, audio is sent as a file instead of a voice bubble. +**Telegram voice bubbles & ffmpeg:** + +Telegram voice bubbles require Opus/OGG audio format. OpenAI and ElevenLabs produce Opus natively — no extra dependencies needed. Edge TTS (the default free provider) outputs MP3 and needs **ffmpeg** to convert to Opus: + +```bash +# Ubuntu/Debian +sudo apt install ffmpeg + +# macOS +brew install ffmpeg + +# Fedora +sudo dnf install ffmpeg +``` + +Without ffmpeg, Edge TTS audio is sent as a regular audio file (playable, but shows as a rectangular player instead of a voice bubble). If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider. 
### 📄 Context Files (SOUL.md, AGENTS.md, .cursorrules) diff --git a/docs/messaging.md b/docs/messaging.md index fb52ed4d1..aa67cbe11 100644 --- a/docs/messaging.md +++ b/docs/messaging.md @@ -313,10 +313,10 @@ The `text_to_speech` tool generates audio that the gateway delivers as native vo | Platform | Delivery | Format | |----------|----------|--------| -| Telegram | Voice bubble (plays inline) | Opus `.ogg` (converted from MP3 via ffmpeg) | +| Telegram | Voice bubble (plays inline) | Opus `.ogg` — native from OpenAI/ElevenLabs, converted via ffmpeg for Edge TTS | | Discord | Audio file attachment | MP3 | | WhatsApp | Audio file attachment | MP3 | -| CLI | Saved to `~/voice-memos/` | MP3 (or Opus if ffmpeg available) | +| CLI | Saved to `~/voice-memos/` | MP3 | **Providers:** - **Edge TTS** (default) — Free, no API key, 322 voices in 74 languages @@ -327,7 +327,17 @@ Voice and provider are configured by the user in `~/.hermes/config.yaml` under t The tool returns a `MEDIA:` tag that the gateway send pipeline intercepts and delivers as a native audio message. If `[[audio_as_voice]]` is present (Opus format available), Telegram sends it as a voice bubble instead of an audio file. -> **Note:** Telegram voice bubbles require `ffmpeg` for Opus conversion (Edge TTS outputs MP3). Install with `apt install ffmpeg` or `brew install ffmpeg`. Without ffmpeg, audio is sent as a regular file. +**Telegram voice bubbles & ffmpeg:** + +Telegram requires Opus/OGG format for native voice bubbles (the round, inline-playable kind). **OpenAI and ElevenLabs** produce Opus natively when on Telegram — no extra setup needed. **Edge TTS** (the default free provider) outputs MP3 and needs `ffmpeg` to convert: + +```bash +sudo apt install ffmpeg # Ubuntu/Debian +brew install ffmpeg # macOS +sudo dnf install ffmpeg # Fedora +``` + +Without ffmpeg, Edge TTS audio is sent as a regular audio file (still playable, but shows as a rectangular music player instead of a voice bubble). 
## Cron Job Delivery diff --git a/gateway/run.py b/gateway/run.py index 901f581cf..c66ce19ca 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -589,30 +589,37 @@ class GatewayRunner: return f"⚠️ {result['error']}" return "(No response generated)" - # Scan tool results in the conversation for MEDIA: tags. - # The TTS tool (and potentially other media-producing tools) embed - # MEDIA: tags in their JSON responses, but the model's final reply - # typically doesn't include them -- it just says "here you go". - # We collect those tags and append them to the final response so - # the adapter's extract_media() can find and deliver the files. - media_tags = [] - for msg in result.get("messages", []): - if msg.get("role") == "tool" or (msg.get("role") == "function"): - content = msg.get("content", "") - if "MEDIA:" in content: - # Extract MEDIA: tags from tool result (may be inside JSON). - # Strip trailing JSON artifacts like quotes and commas that - # get caught by the \S+ when the tag is inside a JSON string. - for match in re.finditer(r'MEDIA:(\S+)', content): - path = match.group(1).strip().rstrip('",}') - if path: - media_tags.append(f"MEDIA:{path}") - # Also capture the [[audio_as_voice]] directive - if "[[audio_as_voice]]" in content: - media_tags.insert(0, "[[audio_as_voice]]") - - if media_tags: - final_response = final_response + "\n" + "\n".join(media_tags) + # Scan tool results for MEDIA: tags that need to be delivered + # as native audio/file attachments. The TTS tool embeds MEDIA: tags + # in its JSON response, but the model's final text reply usually + # doesn't include them. If the final response has no MEDIA: tag of its + # own, we collect the unique tags from tool results and append them, so the + # adapter's extract_media() can find and deliver the files exactly once. 
+ if "MEDIA:" not in final_response: + media_tags = [] + has_voice_directive = False + for msg in result.get("messages", []): + if msg.get("role") == "tool" or msg.get("role") == "function": + content = msg.get("content", "") + if "MEDIA:" in content: + for match in re.finditer(r'MEDIA:(\S+)', content): + path = match.group(1).strip().rstrip('",}') + if path: + media_tags.append(f"MEDIA:{path}") + if "[[audio_as_voice]]" in content: + has_voice_directive = True + + if media_tags: + # Deduplicate while preserving order + seen = set() + unique_tags = [] + for tag in media_tags: + if tag not in seen: + seen.add(tag) + unique_tags.append(tag) + if has_voice_directive: + unique_tags.insert(0, "[[audio_as_voice]]") + final_response = final_response + "\n" + "\n".join(unique_tags) return final_response diff --git a/scripts/install.sh b/scripts/install.sh index f913198c6..d6c9609f1 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -423,8 +423,12 @@ check_ffmpeg() { return fi - log_warn "ffmpeg not found (TTS voice bubbles on Telegram will send as audio files instead)" - log_info "To install ffmpeg (optional):" + log_warn "ffmpeg not found" + log_info "ffmpeg is needed for Telegram voice bubbles when using the default Edge TTS provider." + log_info "Without it, Edge TTS audio is sent as a file instead of a voice bubble." + log_info "(OpenAI and ElevenLabs TTS produce Opus natively and don't need ffmpeg.)" + log_info "" + log_info "To install ffmpeg:" case "$OS" in linux)