diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index 2679facb7..1fda14229 100644 --- a/gateway/platforms/discord.py +++ b/gateway/platforms/discord.py @@ -881,7 +881,18 @@ class DiscordAdapter(BasePlatformAdapter): await asyncio.to_thread(VoiceReceiver.pcm_to_wav, pcm_data, wav_path) from tools.transcription_tools import transcribe_audio - result = await asyncio.to_thread(transcribe_audio, wav_path) + # Read STT model from config.yaml + stt_model = None + try: + import yaml as _y + from pathlib import Path as _P + _cfg = _P(os.getenv("HERMES_HOME", _P.home() / ".hermes")) / "config.yaml" + if _cfg.exists(): + with open(_cfg) as _f: + stt_model = (_y.safe_load(_f) or {}).get("stt", {}).get("model") + except Exception: + pass + result = await asyncio.to_thread(transcribe_audio, wav_path, model=stt_model) if not result.get("success"): return diff --git a/gateway/run.py b/gateway/run.py index ae5852ed1..157ed9d99 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -3326,11 +3326,23 @@ class GatewayRunner: from tools.transcription_tools import transcribe_audio import asyncio + # Read STT model from config.yaml (same key the CLI uses) + stt_model = None + try: + import yaml as _y + _cfg = _hermes_home / "config.yaml" + if _cfg.exists(): + with open(_cfg) as _f: + _data = _y.safe_load(_f) or {} + stt_model = _data.get("stt", {}).get("model") + except Exception: + pass + enriched_parts = [] for path in audio_paths: try: logger.debug("Transcribing user voice: %s", path) - result = await asyncio.to_thread(transcribe_audio, path) + result = await asyncio.to_thread(transcribe_audio, path, model=stt_model) if result["success"]: transcript = result["transcript"] enriched_parts.append( diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py index 6b9c4b5f6..c962f77c3 100644 --- a/tools/transcription_tools.py +++ b/tools/transcription_tools.py @@ -32,13 +32,13 @@ from typing import Optional, Dict, Any, Tuple logger = logging.getLogger(__name__) -# Default STT models per provider -DEFAULT_STT_MODEL = "whisper-1" -DEFAULT_GROQ_STT_MODEL = "whisper-large-v3-turbo" +# Default STT models per provider (overridable via env) +DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1") +DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo") -# Provider endpoints -GROQ_BASE_URL = "https://api.groq.com/openai/v1" -OPENAI_BASE_URL = "https://api.openai.com/v1" +# Provider endpoints (overridable via env for proxies / self-hosted) +GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1") +OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1") def _resolve_stt_provider() -> Tuple[Optional[str], Optional[str], str]: diff --git a/website/docs/user-guide/features/voice-mode.md b/website/docs/user-guide/features/voice-mode.md index 857efcfaf..87bd72140 100644 --- a/website/docs/user-guide/features/voice-mode.md +++ b/website/docs/user-guide/features/voice-mode.md @@ -78,8 +78,8 @@ Add to `~/.hermes/.env`: ```bash # Speech-to-Text (at least one required) -GROQ_API_KEY=your-key # Groq Whisper — fast, free tier available (recommended) -VOICE_TOOLS_OPENAI_KEY=your-key # OpenAI Whisper — alternative +GROQ_API_KEY=your-key # Groq Whisper — fast, free tier (recommended for most users) +VOICE_TOOLS_OPENAI_KEY=your-key # OpenAI Whisper — used first if both keys are set # Text-to-Speech (optional — Edge TTS works without any key) ELEVENLABS_API_KEY=your-key # ElevenLabs — premium quality @@ -327,16 +327,11 @@ You must be in a voice channel before running `/voice join`. The bot joins the s When the bot joins a voice channel, it: -1. **Captures audio** via Discord's UDP socket (RTP packets) -2. **Decrypts** using NaCl transport encryption (aead_xchacha20_poly1305_rtpsize) -3. **Decrypts** DAVE end-to-end encryption (Discord Audio/Video Encryption) -4. **Decodes** Opus audio to raw PCM (48kHz stereo, per-user decoder) -5. **Detects silence** — 1.5s of silence after at least 0.5s of speech triggers processing -6. **Converts** PCM to 16kHz mono WAV via ffmpeg -7. **Transcribes** via Whisper STT (Groq or OpenAI) -8. **Processes** through the full agent pipeline (session, tools, memory) -9. **Generates TTS** reply audio -10. **Plays** the reply in the voice channel +1. **Listens** to each user's audio stream independently +2. **Detects silence** — 1.5s of silence after at least 0.5s of speech triggers processing +3. **Transcribes** the audio via Whisper STT (Groq or OpenAI) +4. **Processes** through the full agent pipeline (session, tools, memory) +5. **Speaks** the reply back in the voice channel via TTS ### Text Channel Integration @@ -397,7 +392,13 @@ tts: ```bash # Speech-to-Text providers GROQ_API_KEY=... # Groq Whisper (recommended — fast, free tier) -VOICE_TOOLS_OPENAI_KEY=... # OpenAI Whisper (alternative) +VOICE_TOOLS_OPENAI_KEY=... # OpenAI Whisper (used first if both set) + +# STT advanced overrides (optional) +STT_GROQ_MODEL=whisper-large-v3-turbo # Override default Groq STT model +STT_OPENAI_MODEL=whisper-1 # Override default OpenAI STT model +GROQ_BASE_URL=https://api.groq.com/openai/v1 # Custom Groq endpoint +STT_OPENAI_BASE_URL=https://api.openai.com/v1 # Custom OpenAI STT endpoint # Text-to-Speech providers (Edge TTS needs no key) ELEVENLABS_API_KEY=... # ElevenLabs (premium quality) @@ -464,63 +465,9 @@ The bot requires an @mention by default in server channels. Make sure you: - Edge TTS (free, no key) is the default fallback - Check logs for TTS errors -### Web UI not accessible from other devices on the network +### Web UI issues (firewall, mobile mic) -The macOS firewall may block incoming connections. Allow the gateway through: - -1. **System Settings** → **Network** → **Firewall** → **Options** -2. Add `/usr/local/bin/python3` (or your Python path) to the allowed list -3. Or temporarily disable the firewall for testing - -On Linux, allow the port through `ufw`: - -```bash -sudo ufw allow 8765/tcp -``` - -### Web UI microphone not working on mobile - -Mobile browsers require **HTTPS** for microphone access (`navigator.mediaDevices` API). When accessing the Web UI over HTTP on a LAN IP (e.g. `http://192.168.1.x:8765`), the mic button will appear dimmed. - -**Workarounds:** - -**Android Chrome** — flag the LAN IP as secure: -1. Open `chrome://flags/#unsafely-treat-insecure-origin-as-secure` -2. Add your Web UI URL (e.g. `http://192.168.1.106:8765`) -3. Set to **Enabled** and relaunch Chrome - -**iOS Safari / Chrome** — no flag bypass available. Use one of these instead: - -1. **Self-signed HTTPS** with mkcert (recommended): - ```bash - # Install mkcert - brew install mkcert - mkcert -install - - # Generate cert for your LAN IP - mkcert 192.168.1.106 - - # Run a simple HTTPS reverse proxy (requires Node.js) - npx local-ssl-proxy --source 8443 --target 8765 \ - --cert 192.168.1.106.pem --key 192.168.1.106-key.pem - ``` - Then access `https://192.168.1.106:8443` on your iPhone. You'll need to trust the mkcert root CA on iOS: **Settings → General → About → Certificate Trust Settings**. - -2. **Caddy reverse proxy** (auto-HTTPS for local networks): - ```bash - brew install caddy - caddy reverse-proxy --from https://192.168.1.106:8443 --to http://127.0.0.1:8765 - ``` - -3. **SSH tunnel from mobile** (if you have an SSH client like Termius): - ```bash - ssh -L 8765:127.0.0.1:8765 user@your-mac-ip - ``` - Then access `http://localhost:8765` on the mobile browser — localhost is exempt from HTTPS requirement. - -:::tip -Text chat works on mobile over HTTP without any workaround — only the microphone feature requires HTTPS. -::: +See the [Web UI Troubleshooting](../messaging/web.md#troubleshooting) guide for firewall, HTTPS, and mobile microphone issues. ### Whisper returns garbage text diff --git a/website/docs/user-guide/messaging/web.md b/website/docs/user-guide/messaging/web.md index 59c308f47..52fc74b14 100644 --- a/website/docs/user-guide/messaging/web.md +++ b/website/docs/user-guide/messaging/web.md @@ -61,46 +61,6 @@ The web UI starts automatically alongside your other platforms. --- -## Step 1: Configure - -Add to `~/.hermes/.env`: - -```bash -# Enable Web UI -WEB_UI_ENABLED=true - -# Port to listen on (default: 8765) -WEB_UI_PORT=8765 - -# Bind address (default: 0.0.0.0 = all interfaces, for LAN access) -# Set to 127.0.0.1 for localhost-only access -WEB_UI_HOST=0.0.0.0 - -# Access token (leave empty to auto-generate on each startup) -WEB_UI_TOKEN=your-secret-token -``` - -## Step 2: Start the Gateway - -```bash -hermes gateway -``` - -You'll see output like: - -``` -[Web] Web UI: http://192.168.1.106:8765 -[Web] Access token: your-secret-token -``` - -## Step 3: Open in Browser - -1. Open the URL shown in the console on any device on the same network -2. Enter the access token -3. Start chatting - ---- - ## Features ### Markdown & Code Highlighting @@ -111,7 +71,7 @@ Bot responses render full GitHub-flavored Markdown with syntax-highlighted code Click the microphone button to record a voice message. The audio is transcribed via Whisper STT (using OpenAI or Groq as fallback) and sent to the agent. The bot automatically replies with audio playback — voice first, then the text response appears. No extra configuration needed. -STT priority: `VOICE_TOOLS_OPENAI_KEY` (OpenAI Whisper) > `GROQ_API_KEY` (Groq Whisper). TTS uses Edge TTS (free, no key) by default, or ElevenLabs/OpenAI if configured in `~/.hermes/config.yaml`. +STT uses `VOICE_TOOLS_OPENAI_KEY` (OpenAI Whisper) if set, otherwise falls back to `GROQ_API_KEY` (Groq Whisper, free tier). If you only need STT, setting `GROQ_API_KEY` is the simplest option. TTS uses Edge TTS (free, no key) by default, or ElevenLabs/OpenAI if configured in `~/.hermes/config.yaml`. ### Images & Files @@ -211,6 +171,36 @@ WEB_UI_PORT=9000 - HTTPS is required for microphone access on non-localhost origins - On localhost (`127.0.0.1`), HTTP works fine for microphone +### Microphone not working on mobile + +Mobile browsers require **HTTPS** for microphone access (`navigator.mediaDevices` API). When accessing the Web UI over HTTP on a LAN IP (e.g. `http://192.168.1.x:8765`), the mic button will appear dimmed. + +**Android Chrome** — flag the LAN IP as secure: +1. Open `chrome://flags/#unsafely-treat-insecure-origin-as-secure` +2. Add your Web UI URL (e.g. `http://192.168.1.106:8765`) +3. Set to **Enabled** and relaunch Chrome + +**iOS Safari / Chrome** — no flag bypass available. Use one of these instead: + +1. **Self-signed HTTPS** with mkcert (recommended): + ```bash + brew install mkcert && mkcert -install + mkcert 192.168.1.106 + npx local-ssl-proxy --source 8443 --target 8765 \ + --cert 192.168.1.106.pem --key 192.168.1.106-key.pem + ``` + Then access `https://192.168.1.106:8443`. Trust the mkcert root CA on iOS: **Settings > General > About > Certificate Trust Settings**. + +2. **SSH tunnel from mobile** (if you have Termius or similar): + ```bash + ssh -L 8765:127.0.0.1:8765 user@your-mac-ip + ``` + Then access `http://localhost:8765` — localhost is exempt from the HTTPS requirement. + +:::tip +Text chat works on mobile over HTTP without any workaround — only the microphone feature requires HTTPS. +::: + ### CDN resources not loading The UI loads `marked.js` and `highlight.js` from CDN. If you're offline or behind a restrictive proxy, markdown rendering and code highlighting won't work but basic chat still functions.