From 69aa35a51c3db85002892e2fab889287bf170dda Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sun, 15 Feb 2026 21:38:59 -0800 Subject: [PATCH] Add messaging platform enhancements: STT, stickers, Discord UX, Slack, pairing, hooks Major feature additions inspired by OpenClaw/ClawdBot integration analysis: Voice Message Transcription (STT): - Auto-transcribe voice/audio messages via OpenAI Whisper API - Download voice to ~/.hermes/audio_cache/ on Telegram/Discord/WhatsApp - Inject transcript as text so all models can understand voice input - Configurable model (whisper-1, gpt-4o-mini-transcribe, gpt-4o-transcribe) Telegram Sticker Understanding: - Describe static stickers via vision tool with JSON-backed cache - Cache keyed by file_unique_id avoids redundant API calls - Animated/video stickers get emoji-based fallback description Discord Rich UX: - Native slash commands (/ask, /reset, /status, /stop) via app_commands - Button-based exec approvals (Allow Once / Always Allow / Deny) - ExecApprovalView with user authorization and timeout handling Slack Integration: - Full SlackAdapter using slack-bolt with Socket Mode - DMs, channel messages (mention-gated), /hermes slash command - File attachment handling with bot-token-authenticated downloads DM Pairing System: - Code-based user authorization as alternative to static allowlists - 8-char codes from unambiguous alphabet, 1-hour expiry - Rate limiting, lockout after failed attempts, chmod 0600 on data - CLI: hermes pairing list/approve/revoke/clear-pending Event Hook System: - File-based hook discovery from ~/.hermes/hooks/ - HOOK.yaml + handler.py per hook, sync/async handler support - Events: gateway:startup, session:start/reset, agent:start/step/end - Wildcard matching (command:* catches all command events) Cross-Channel Messaging: - send_message agent tool for delivering to any connected platform - Enables cron job delivery and cross-platform notifications Human-Like Response Pacing: - Configurable delays between message chunks (off/natural/custom) - HERMES_HUMAN_DELAY_MODE env var with min/max ms settings Warm Injection Message Style: - Retrofitted image vision messages with friendly kawaii-consistent tone - All new injection messages (STT, stickers, errors) use warm style Also: updated config migration to prompt for optional keys interactively, bumped config version, updated README, AGENTS.md, .env.example, cli-config.yaml.example, install scripts, pyproject.toml, and toolsets. --- .env.example | 30 +++ AGENTS.md | 44 ++++ README.md | 55 +++++ cli-config.yaml.example | 18 ++ gateway/config.py | 17 ++ gateway/hooks.py | 150 ++++++++++++++ gateway/pairing.py | 282 +++++++++++++++++++++++++ gateway/platforms/base.py | 87 ++++++++ gateway/platforms/discord.py | 239 +++++++++++++++++++++- gateway/platforms/slack.py | 374 ++++++++++++++++++++++++++++++++++ gateway/platforms/telegram.py | 102 +++++++++- gateway/platforms/whatsapp.py | 13 +- gateway/run.py | 187 +++++++++++++++-- gateway/sticker_cache.py | 111 ++++++++++ hermes_cli/config.py | 65 +++++- hermes_cli/main.py | 28 +++ hermes_cli/pairing.py | 100 +++++++++ model_tools.py | 56 +++++ pyproject.toml | 4 +- scripts/install.ps1 | 4 + scripts/install.sh | 4 +- tools/transcription_tools.py | 103 ++++++++++ toolsets.py | 39 +++- 23 files changed, 2080 insertions(+), 32 deletions(-) create mode 100644 gateway/hooks.py create mode 100644 gateway/pairing.py create mode 100644 gateway/platforms/slack.py create mode 100644 gateway/sticker_cache.py create mode 100644 hermes_cli/pairing.py create mode 100644 tools/transcription_tools.py diff --git a/.env.example b/.env.example index 3b0b66a0b..7927fd421 100644 --- a/.env.example +++ b/.env.example @@ -140,6 +140,36 @@ BROWSER_INACTIVITY_TIMEOUT=120 # Format: logs/session_YYYYMMDD_HHMMSS_UUID.json # Contains full conversation history in trajectory format for debugging/replay +# ============================================================================= +# VOICE TRANSCRIPTION (Speech-to-Text) +# ============================================================================= +# Required for automatic voice message transcription on messaging platforms. +# Uses OpenAI's Whisper API directly (not via OpenRouter). +# Get at: https://platform.openai.com/api-keys +OPENAI_API_KEY= + +# ============================================================================= +# SLACK INTEGRATION +# ============================================================================= +# Slack Bot Token - From Slack App settings (OAuth & Permissions) +# Get at: https://api.slack.com/apps +# SLACK_BOT_TOKEN=xoxb-... + +# Slack App Token - For Socket Mode (App-Level Tokens in Slack App settings) +# SLACK_APP_TOKEN=xapp-... + +# Slack allowed users (comma-separated Slack user IDs) +# SLACK_ALLOWED_USERS= + +# ============================================================================= +# RESPONSE PACING +# ============================================================================= +# Human-like delays between message chunks on messaging platforms. +# Makes the bot feel less robotic. +# HERMES_HUMAN_DELAY_MODE=off # off | natural | custom +# HERMES_HUMAN_DELAY_MIN_MS=800 # Min delay in ms (custom mode) +# HERMES_HUMAN_DELAY_MAX_MS=2500 # Max delay in ms (custom mode) + # ============================================================================= # LEGACY/OPTIONAL API KEYS # ============================================================================= diff --git a/AGENTS.md b/AGENTS.md index aa64b42dd..e375add61 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -25,7 +25,13 @@ hermes-agent/ │ ├── uninstall.py # Uninstaller │ └── cron.py # Cron job management ├── tools/ # Tool implementations +│ ├── transcription_tools.py # Speech-to-text (Whisper API) ├── gateway/ # Messaging platform adapters +│ ├── pairing.py # DM pairing code system +│ ├── hooks.py # Event hook system +│ ├── sticker_cache.py # Telegram sticker vision cache +│ ├── platforms/ +│ │ └── slack.py # Slack adapter (slack-bolt) ├── cron/ # Scheduler implementation ├── skills/ # Knowledge documents ├── cli.py # Interactive CLI (Rich UI) @@ -39,6 +45,11 @@ hermes-agent/ **User Configuration** (stored in `~/.hermes/`): - `~/.hermes/config.yaml` - Settings (model, terminal, toolsets, etc.) - `~/.hermes/.env` - API keys and secrets +- `~/.hermes/pairing/` - DM pairing data +- `~/.hermes/hooks/` - Custom event hooks +- `~/.hermes/image_cache/` - Cached user images +- `~/.hermes/audio_cache/` - Cached user voice messages +- `~/.hermes/sticker_cache.json` - Telegram sticker descriptions ## File Dependency Chain @@ -179,6 +190,7 @@ The unified `hermes` command provides all functionality: | `hermes gateway` | Start messaging gateway | | `hermes cron list` | View scheduled jobs | | `hermes version` | Show version info | +| `hermes pairing list/approve/revoke` | Manage DM pairing codes | --- @@ -225,6 +237,33 @@ Users can find their IDs: - **Telegram**: Message [@userinfobot](https://t.me/userinfobot) - **Discord**: Enable Developer Mode, right-click name → Copy ID +### DM Pairing System + +Instead of static allowlists, users can pair via one-time codes: +1. Unknown user DMs the bot → receives pairing code +2. Owner runs `hermes pairing approve ` +3. User is permanently authorized + +Security: 8-char codes, 1-hour expiry, rate-limited (1/10min/user), max 3 pending per platform, lockout after 5 failed attempts, `chmod 0600` on data files. + +Files: `gateway/pairing.py`, `hermes_cli/pairing.py` + +### Event Hooks + +Hooks fire at lifecycle points. Place hook directories in `~/.hermes/hooks/`: + +``` +~/.hermes/hooks/my-hook/ +├── HOOK.yaml # name, description, events list +└── handler.py # async def handle(event_type, context): ... +``` + +Events: `gateway:startup`, `session:start`, `session:reset`, `agent:start`, `agent:step`, `agent:end`, `command:*` + +The `agent:step` event fires each iteration of the tool-calling loop with tool names and results. + +Files: `gateway/hooks.py` + ### Tool Progress Notifications When `HERMES_TOOL_PROGRESS=true`, the bot sends status messages as it works: @@ -336,6 +375,11 @@ Agent behavior (in `~/.hermes/.env`): - `MESSAGING_CWD` - Working directory for messaging platforms (default: ~) - `HERMES_TOOL_PROGRESS` - Enable tool progress messages (`true`/`false`) - `HERMES_TOOL_PROGRESS_MODE` - Progress mode: `new` (tool changes) or `all` +- `OPENAI_API_KEY` - Voice transcription (Whisper STT) +- `SLACK_BOT_TOKEN` / `SLACK_APP_TOKEN` - Slack integration (Socket Mode) +- `SLACK_ALLOWED_USERS` - Comma-separated Slack user IDs +- `HERMES_HUMAN_DELAY_MODE` - Response pacing: off/natural/custom +- `HERMES_HUMAN_DELAY_MIN_MS` / `HERMES_HUMAN_DELAY_MAX_MS` - Custom delay range ### Dangerous Command Approval diff --git a/README.md b/README.md index 4494cd4a8..fdd4d72f5 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,8 @@ You need at least one LLM provider: | Premium TTS voices | [ElevenLabs](https://elevenlabs.io/) | `ELEVENLABS_API_KEY` | | OpenAI TTS voices | [OpenAI](https://platform.openai.com/api-keys) | `OPENAI_API_KEY` | | RL Training | [Tinker](https://tinker-console.thinkingmachines.ai/) + [WandB](https://wandb.ai/) | `TINKER_API_KEY`, `WANDB_API_KEY` | +| Voice transcription | [OpenAI](https://platform.openai.com/api-keys) | `OPENAI_API_KEY` | +| Slack integration | [Slack](https://api.slack.com/apps) | `SLACK_BOT_TOKEN`, `SLACK_APP_TOKEN` | | Messaging | Telegram, Discord | `TELEGRAM_BOT_TOKEN`, `DISCORD_BOT_TOKEN` | --- @@ -99,6 +101,7 @@ hermes update # Update to latest version (prompts for new config) hermes uninstall # Uninstall (can keep configs for later reinstall) hermes gateway # Start messaging gateway hermes cron list # View scheduled jobs +hermes pairing list # View/manage DM pairing codes hermes version # Show version info ``` @@ -176,6 +179,23 @@ sudo dnf install ffmpeg Without ffmpeg, Edge TTS audio is sent as a regular audio file (playable, but shows as a rectangular player instead of a voice bubble). If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider. +### 🎙️ Voice Message Transcription + +Voice messages sent on Telegram, Discord, WhatsApp, or Slack are automatically transcribed using OpenAI's Whisper API and injected as text into the conversation. The agent sees the transcript as normal text -- no special handling needed. + +| Provider | Model | Quality | Cost | +|----------|-------|---------|------| +| **OpenAI Whisper** | `whisper-1` (default) | Good | Low | +| **OpenAI GPT-4o** | `gpt-4o-mini-transcribe` | Better | Medium | +| **OpenAI GPT-4o** | `gpt-4o-transcribe` | Best | Higher | + +Requires `OPENAI_API_KEY` in `~/.hermes/.env`. Configure the model in `~/.hermes/config.yaml`: +```yaml +stt: + enabled: true + model: "whisper-1" +``` + ### 📄 Context Files (SOUL.md, AGENTS.md, .cursorrules) Drop these files in your project directory and the agent automatically picks them up: @@ -293,6 +313,40 @@ DISCORD_BOT_TOKEN=MTIz... DISCORD_ALLOWED_USERS=YOUR_USER_ID ``` +#### Slack Setup + +1. **Create an app:** Go to [Slack API](https://api.slack.com/apps), create a new app +2. **Enable Socket Mode:** In app settings → Socket Mode → Enable +3. **Get tokens:** + - Bot Token (`xoxb-...`): OAuth & Permissions → Install to Workspace + - App Token (`xapp-...`): Basic Information → App-Level Tokens → Generate +4. **Configure:** + +```bash +# Add to ~/.hermes/.env: +SLACK_BOT_TOKEN=xoxb-... +SLACK_APP_TOKEN=xapp-... +SLACK_ALLOWED_USERS=U01234ABCDE # Comma-separated Slack user IDs +``` + +5. **Start the gateway:** `hermes gateway` + +#### DM Pairing (Alternative to Allowlists) + +Instead of manually configuring user IDs in allowlists, you can use the pairing system. When an unknown user DMs your bot, they receive a one-time pairing code: + +```bash +# The user sees: "Pairing code: XKGH5N7P" +# You approve them with: +hermes pairing approve telegram XKGH5N7P + +# Other pairing commands: +hermes pairing list # View pending + approved users +hermes pairing revoke telegram 123456789 # Remove access +``` + +Pairing codes expire after 1 hour, are rate-limited, and use cryptographic randomness. + #### Security (Important!) **Without an allowlist, anyone who finds your bot can use it!** @@ -312,6 +366,7 @@ DISCORD_ALLOWED_USERS=123456789012345678 |---------|-------------| | `/new` or `/reset` | Start fresh conversation | | `/status` | Show session info | +| `/hermes` (Discord) | Slash command — ask, reset, status, stop | #### Working Directory diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 0e9099533..ac9a68b6c 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -244,6 +244,24 @@ toolsets: # toolsets: # - safe +# ============================================================================= +# Voice Transcription (Speech-to-Text) +# ============================================================================= +# Automatically transcribe voice messages on messaging platforms. +# Requires OPENAI_API_KEY in .env (uses OpenAI Whisper API directly). +stt: + enabled: true + model: "whisper-1" # whisper-1 (cheapest) | gpt-4o-mini-transcribe | gpt-4o-transcribe + +# ============================================================================= +# Response Pacing (Messaging Platforms) +# ============================================================================= +# Add human-like delays between message chunks. +# human_delay: +# mode: "off" # "off" | "natural" | "custom" +# min_ms: 800 # Min delay (custom mode only) +# max_ms: 2500 # Max delay (custom mode only) + # ============================================================================= # Session Logging # ============================================================================= diff --git a/gateway/config.py b/gateway/config.py index 302ba5361..5ca063a1f 100644 --- a/gateway/config.py +++ b/gateway/config.py @@ -22,6 +22,7 @@ class Platform(Enum): TELEGRAM = "telegram" DISCORD = "discord" WHATSAPP = "whatsapp" + SLACK = "slack" @dataclass @@ -308,6 +309,22 @@ def _apply_env_overrides(config: GatewayConfig) -> None: config.platforms[Platform.WHATSAPP] = PlatformConfig() config.platforms[Platform.WHATSAPP].enabled = True + # Slack + slack_token = os.getenv("SLACK_BOT_TOKEN") + if slack_token: + if Platform.SLACK not in config.platforms: + config.platforms[Platform.SLACK] = PlatformConfig() + config.platforms[Platform.SLACK].enabled = True + config.platforms[Platform.SLACK].token = slack_token + # Home channel + slack_home = os.getenv("SLACK_HOME_CHANNEL") + if slack_home: + config.platforms[Platform.SLACK].home_channel = HomeChannel( + platform=Platform.SLACK, + chat_id=slack_home, + name=os.getenv("SLACK_HOME_CHANNEL_NAME", ""), + ) + # Session settings idle_minutes = os.getenv("SESSION_IDLE_MINUTES") if idle_minutes: diff --git a/gateway/hooks.py b/gateway/hooks.py new file mode 100644 index 000000000..d2face15c --- /dev/null +++ b/gateway/hooks.py @@ -0,0 +1,150 @@ +""" +Event Hook System + +A lightweight event-driven system that fires handlers at key lifecycle points. +Hooks are discovered from ~/.hermes/hooks/ directories, each containing: + - HOOK.yaml (metadata: name, description, events list) + - handler.py (Python handler with async def handle(event_type, context)) + +Events: + - gateway:startup -- Gateway process starts + - session:start -- New session created + - session:reset -- User ran /new or /reset + - agent:start -- Agent begins processing a message + - agent:step -- Each turn in the tool-calling loop + - agent:end -- Agent finishes processing + - command:* -- Any slash command executed (wildcard match) + +Errors in hooks are caught and logged but never block the main pipeline. +""" + +import asyncio +import importlib.util +import os +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional + +import yaml + + +HOOKS_DIR = Path(os.path.expanduser("~/.hermes/hooks")) + + +class HookRegistry: + """ + Discovers, loads, and fires event hooks. + + Usage: + registry = HookRegistry() + registry.discover_and_load() + await registry.emit("agent:start", {"platform": "telegram", ...}) + """ + + def __init__(self): + # event_type -> [handler_fn, ...] + self._handlers: Dict[str, List[Callable]] = {} + self._loaded_hooks: List[dict] = [] # metadata for listing + + @property + def loaded_hooks(self) -> List[dict]: + """Return metadata about all loaded hooks.""" + return list(self._loaded_hooks) + + def discover_and_load(self) -> None: + """ + Scan the hooks directory for hook directories and load their handlers. + + Each hook directory must contain: + - HOOK.yaml with at least 'name' and 'events' keys + - handler.py with a top-level 'handle' function (sync or async) + """ + if not HOOKS_DIR.exists(): + return + + for hook_dir in sorted(HOOKS_DIR.iterdir()): + if not hook_dir.is_dir(): + continue + + manifest_path = hook_dir / "HOOK.yaml" + handler_path = hook_dir / "handler.py" + + if not manifest_path.exists() or not handler_path.exists(): + continue + + try: + manifest = yaml.safe_load(manifest_path.read_text(encoding="utf-8")) + if not manifest or not isinstance(manifest, dict): + print(f"[hooks] Skipping {hook_dir.name}: invalid HOOK.yaml", flush=True) + continue + + hook_name = manifest.get("name", hook_dir.name) + events = manifest.get("events", []) + if not events: + print(f"[hooks] Skipping {hook_name}: no events declared", flush=True) + continue + + # Dynamically load the handler module + spec = importlib.util.spec_from_file_location( + f"hermes_hook_{hook_name}", handler_path + ) + if spec is None or spec.loader is None: + print(f"[hooks] Skipping {hook_name}: could not load handler.py", flush=True) + continue + + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + handle_fn = getattr(module, "handle", None) + if handle_fn is None: + print(f"[hooks] Skipping {hook_name}: no 'handle' function found", flush=True) + continue + + # Register the handler for each declared event + for event in events: + self._handlers.setdefault(event, []).append(handle_fn) + + self._loaded_hooks.append({ + "name": hook_name, + "description": manifest.get("description", ""), + "events": events, + "path": str(hook_dir), + }) + + print(f"[hooks] Loaded hook '{hook_name}' for events: {events}", flush=True) + + except Exception as e: + print(f"[hooks] Error loading hook {hook_dir.name}: {e}", flush=True) + + async def emit(self, event_type: str, context: Optional[Dict[str, Any]] = None) -> None: + """ + Fire all handlers registered for an event. + + Supports wildcard matching: handlers registered for "command:*" will + fire for any "command:..." event. Handlers registered for a base type + like "agent" won't fire for "agent:start" -- only exact matches and + explicit wildcards. + + Args: + event_type: The event identifier (e.g. "agent:start"). + context: Optional dict with event-specific data. + """ + if context is None: + context = {} + + # Collect handlers: exact match + wildcard match + handlers = list(self._handlers.get(event_type, [])) + + # Check for wildcard patterns (e.g., "command:*" matches "command:reset") + if ":" in event_type: + base = event_type.split(":")[0] + wildcard_key = f"{base}:*" + handlers.extend(self._handlers.get(wildcard_key, [])) + + for fn in handlers: + try: + result = fn(event_type, context) + # Support both sync and async handlers + if asyncio.iscoroutine(result): + await result + except Exception as e: + print(f"[hooks] Error in handler for '{event_type}': {e}", flush=True) diff --git a/gateway/pairing.py b/gateway/pairing.py new file mode 100644 index 000000000..b1e066ffe --- /dev/null +++ b/gateway/pairing.py @@ -0,0 +1,282 @@ +""" +DM Pairing System + +Code-based approval flow for authorizing new users on messaging platforms. +Instead of static allowlists with user IDs, unknown users receive a one-time +pairing code that the bot owner approves via the CLI. + +Security features (based on OWASP + NIST SP 800-63-4 guidance): + - 8-char codes from 32-char unambiguous alphabet (no 0/O/1/I) + - Cryptographic randomness via secrets.choice() + - 1-hour code expiry + - Max 3 pending codes per platform + - Rate limiting: 1 request per user per 10 minutes + - Lockout after 5 failed approval attempts (1 hour) + - File permissions: chmod 0600 on all data files + - Codes are never logged to stdout + +Storage: ~/.hermes/pairing/ +""" + +import json +import os +import secrets +import time +from pathlib import Path +from typing import Optional + + +# Unambiguous alphabet -- excludes 0/O, 1/I to prevent confusion +ALPHABET = "ABCDEFGHJKLMNPQRSTUVWXYZ23456789" +CODE_LENGTH = 8 + +# Timing constants +CODE_TTL_SECONDS = 3600 # Codes expire after 1 hour +RATE_LIMIT_SECONDS = 600 # 1 request per user per 10 minutes +LOCKOUT_SECONDS = 3600 # Lockout duration after too many failures + +# Limits +MAX_PENDING_PER_PLATFORM = 3 # Max pending codes per platform +MAX_FAILED_ATTEMPTS = 5 # Failed approvals before lockout + +PAIRING_DIR = Path(os.path.expanduser("~/.hermes/pairing")) + + +def _secure_write(path: Path, data: str) -> None: + """Write data to file with restrictive permissions (owner read/write only).""" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(data, encoding="utf-8") + try: + os.chmod(path, 0o600) + except OSError: + pass # Windows doesn't support chmod the same way + + +class PairingStore: + """ + Manages pairing codes and approved user lists. + + Data files per platform: + - {platform}-pending.json : pending pairing requests + - {platform}-approved.json : approved (paired) users + - _rate_limits.json : rate limit tracking + """ + + def __init__(self): + PAIRING_DIR.mkdir(parents=True, exist_ok=True) + + def _pending_path(self, platform: str) -> Path: + return PAIRING_DIR / f"{platform}-pending.json" + + def _approved_path(self, platform: str) -> Path: + return PAIRING_DIR / f"{platform}-approved.json" + + def _rate_limit_path(self) -> Path: + return PAIRING_DIR / "_rate_limits.json" + + def _load_json(self, path: Path) -> dict: + if path.exists(): + try: + return json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError): + return {} + return {} + + def _save_json(self, path: Path, data: dict) -> None: + _secure_write(path, json.dumps(data, indent=2, ensure_ascii=False)) + + # ----- Approved users ----- + + def is_approved(self, platform: str, user_id: str) -> bool: + """Check if a user is approved (paired) on a platform.""" + approved = self._load_json(self._approved_path(platform)) + return user_id in approved + + def list_approved(self, platform: str = None) -> list: + """List approved users, optionally filtered by platform.""" + results = [] + platforms = [platform] if platform else self._all_platforms("approved") + for p in platforms: + approved = self._load_json(self._approved_path(p)) + for uid, info in approved.items(): + results.append({"platform": p, "user_id": uid, **info}) + return results + + def _approve_user(self, platform: str, user_id: str, user_name: str = "") -> None: + """Add a user to the approved list.""" + approved = self._load_json(self._approved_path(platform)) + approved[user_id] = { + "user_name": user_name, + "approved_at": time.time(), + } + self._save_json(self._approved_path(platform), approved) + + def revoke(self, platform: str, user_id: str) -> bool: + """Remove a user from the approved list. Returns True if found.""" + path = self._approved_path(platform) + approved = self._load_json(path) + if user_id in approved: + del approved[user_id] + self._save_json(path, approved) + return True + return False + + # ----- Pending codes ----- + + def generate_code( + self, platform: str, user_id: str, user_name: str = "" + ) -> Optional[str]: + """ + Generate a pairing code for a new user. + + Returns the code string, or None if: + - User is rate-limited (too recent request) + - Max pending codes reached for this platform + - User/platform is in lockout due to failed attempts + """ + self._cleanup_expired(platform) + + # Check lockout + if self._is_locked_out(platform): + return None + + # Check rate limit for this specific user + if self._is_rate_limited(platform, user_id): + return None + + # Check max pending + pending = self._load_json(self._pending_path(platform)) + if len(pending) >= MAX_PENDING_PER_PLATFORM: + return None + + # Generate cryptographically random code + code = "".join(secrets.choice(ALPHABET) for _ in range(CODE_LENGTH)) + + # Store pending request + pending[code] = { + "user_id": user_id, + "user_name": user_name, + "created_at": time.time(), + } + self._save_json(self._pending_path(platform), pending) + + # Record rate limit + self._record_rate_limit(platform, user_id) + + return code + + def approve_code(self, platform: str, code: str) -> Optional[dict]: + """ + Approve a pairing code. Adds the user to the approved list. + + Returns {user_id, user_name} on success, None if code is invalid/expired. + """ + self._cleanup_expired(platform) + code = code.upper().strip() + + pending = self._load_json(self._pending_path(platform)) + if code not in pending: + self._record_failed_attempt(platform) + return None + + entry = pending.pop(code) + self._save_json(self._pending_path(platform), pending) + + # Add to approved list + self._approve_user(platform, entry["user_id"], entry.get("user_name", "")) + + return { + "user_id": entry["user_id"], + "user_name": entry.get("user_name", ""), + } + + def list_pending(self, platform: str = None) -> list: + """List pending pairing requests, optionally filtered by platform.""" + results = [] + platforms = [platform] if platform else self._all_platforms("pending") + for p in platforms: + self._cleanup_expired(p) + pending = self._load_json(self._pending_path(p)) + for code, info in pending.items(): + age_min = int((time.time() - info["created_at"]) / 60) + results.append({ + "platform": p, + "code": code, + "user_id": info["user_id"], + "user_name": info.get("user_name", ""), + "age_minutes": age_min, + }) + return results + + def clear_pending(self, platform: str = None) -> int: + """Clear all pending requests. Returns count removed.""" + count = 0 + platforms = [platform] if platform else self._all_platforms("pending") + for p in platforms: + pending = self._load_json(self._pending_path(p)) + count += len(pending) + self._save_json(self._pending_path(p), {}) + return count + + # ----- Rate limiting and lockout ----- + + def _is_rate_limited(self, platform: str, user_id: str) -> bool: + """Check if a user has requested a code too recently.""" + limits = self._load_json(self._rate_limit_path()) + key = f"{platform}:{user_id}" + last_request = limits.get(key, 0) + return (time.time() - last_request) < RATE_LIMIT_SECONDS + + def _record_rate_limit(self, platform: str, user_id: str) -> None: + """Record the time of a pairing request for rate limiting.""" + limits = self._load_json(self._rate_limit_path()) + key = f"{platform}:{user_id}" + limits[key] = time.time() + self._save_json(self._rate_limit_path(), limits) + + def _is_locked_out(self, platform: str) -> bool: + """Check if a platform is in lockout due to failed approval attempts.""" + limits = self._load_json(self._rate_limit_path()) + lockout_key = f"_lockout:{platform}" + lockout_until = limits.get(lockout_key, 0) + return time.time() < lockout_until + + def _record_failed_attempt(self, platform: str) -> None: + """Record a failed approval attempt. Triggers lockout after MAX_FAILED_ATTEMPTS.""" + limits = self._load_json(self._rate_limit_path()) + fail_key = f"_failures:{platform}" + fails = limits.get(fail_key, 0) + 1 + limits[fail_key] = fails + if fails >= MAX_FAILED_ATTEMPTS: + lockout_key = f"_lockout:{platform}" + limits[lockout_key] = time.time() + LOCKOUT_SECONDS + limits[fail_key] = 0 # Reset counter + print(f"[pairing] Platform {platform} locked out for {LOCKOUT_SECONDS}s " + f"after {MAX_FAILED_ATTEMPTS} failed attempts", flush=True) + self._save_json(self._rate_limit_path(), limits) + + # ----- Cleanup ----- + + def _cleanup_expired(self, platform: str) -> None: + """Remove expired pending codes.""" + path = self._pending_path(platform) + pending = self._load_json(path) + now = time.time() + expired = [ + code for code, info in pending.items() + if (now - info["created_at"]) > CODE_TTL_SECONDS + ] + if expired: + for code in expired: + del pending[code] + self._save_json(path, pending) + + def _all_platforms(self, suffix: str) -> list: + """List all platforms that have data files of a given suffix.""" + platforms = [] + for f in PAIRING_DIR.iterdir(): + if f.name.endswith(f"-{suffix}.json"): + platform = f.name.replace(f"-{suffix}.json", "") + if not platform.startswith("_"): + platforms.append(platform) + return platforms diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 94168061d..1659c6524 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -108,6 +108,65 @@ def cleanup_image_cache(max_age_hours: int = 24) -> int: return removed +# --------------------------------------------------------------------------- +# Audio cache utilities +# +# Same pattern as image cache -- voice messages from platforms are downloaded +# here so the STT tool (OpenAI Whisper) can transcribe them from local files. +# --------------------------------------------------------------------------- + +AUDIO_CACHE_DIR = Path(os.path.expanduser("~/.hermes/audio_cache")) + + +def get_audio_cache_dir() -> Path: + """Return the audio cache directory, creating it if it doesn't exist.""" + AUDIO_CACHE_DIR.mkdir(parents=True, exist_ok=True) + return AUDIO_CACHE_DIR + + +def cache_audio_from_bytes(data: bytes, ext: str = ".ogg") -> str: + """ + Save raw audio bytes to the cache and return the absolute file path. + + Args: + data: Raw audio bytes. + ext: File extension including the dot (e.g. ".ogg", ".mp3"). + + Returns: + Absolute path to the cached audio file as a string. + """ + cache_dir = get_audio_cache_dir() + filename = f"audio_{uuid.uuid4().hex[:12]}{ext}" + filepath = cache_dir / filename + filepath.write_bytes(data) + return str(filepath) + + +async def cache_audio_from_url(url: str, ext: str = ".ogg") -> str: + """ + Download an audio file from a URL and save it to the local cache. + + Args: + url: The HTTP/HTTPS URL to download from. + ext: File extension including the dot (e.g. ".ogg", ".mp3"). + + Returns: + Absolute path to the cached audio file as a string. + """ + import httpx + + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: + response = await client.get( + url, + headers={ + "User-Agent": "Mozilla/5.0 (compatible; HermesAgent/1.0)", + "Accept": "audio/*,*/*;q=0.8", + }, + ) + response.raise_for_status() + return cache_audio_from_bytes(response.content, ext) + + class MessageType(Enum): """Types of incoming messages.""" TEXT = "text" @@ -422,6 +481,27 @@ class BasePlatformAdapter(ABC): # Spawn background task to process this message asyncio.create_task(self._process_message_background(event, session_key)) + @staticmethod + def _get_human_delay() -> float: + """ + Return a random delay in seconds for human-like response pacing. + + Reads from env vars: + HERMES_HUMAN_DELAY_MODE: "off" (default) | "natural" | "custom" + HERMES_HUMAN_DELAY_MIN_MS: minimum delay in ms (default 800, custom mode) + HERMES_HUMAN_DELAY_MAX_MS: maximum delay in ms (default 2500, custom mode) + """ + import random + + mode = os.getenv("HERMES_HUMAN_DELAY_MODE", "off").lower() + if mode == "off": + return 0.0 + min_ms = int(os.getenv("HERMES_HUMAN_DELAY_MIN_MS", "800")) + max_ms = int(os.getenv("HERMES_HUMAN_DELAY_MAX_MS", "2500")) + if mode == "natural": + min_ms, max_ms = 800, 2500 + return random.uniform(min_ms / 1000.0, max_ms / 1000.0) + async def _process_message_background(self, event: MessageEvent, session_key: str) -> None: """Background task that actually processes the message.""" # Create interrupt event for this session @@ -463,8 +543,13 @@ class BasePlatformAdapter(ABC): if not fallback_result.success: print(f"[{self.name}] Fallback send also failed: {fallback_result.error}") + # Human-like pacing delay between text and media + human_delay = self._get_human_delay() + # Send extracted images as native attachments for image_url, alt_text in images: + if human_delay > 0: + await asyncio.sleep(human_delay) try: img_result = await self.send_image( chat_id=event.source.chat_id, @@ -478,6 +563,8 @@ class BasePlatformAdapter(ABC): # Send extracted audio/voice files as native attachments for audio_path, is_voice in media_files: + if human_delay > 0: + await asyncio.sleep(human_delay) try: voice_result = await self.send_voice( chat_id=event.source.chat_id, diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index 0e6a8f870..d3861798c 100644 --- a/gateway/platforms/discord.py +++ b/gateway/platforms/discord.py @@ -33,6 +33,7 @@ from gateway.platforms.base import ( MessageType, SendResult, cache_image_from_url, + cache_audio_from_url, ) @@ -49,7 +50,10 @@ class DiscordAdapter(BasePlatformAdapter): - Receiving messages from servers and DMs - Sending responses with Discord markdown - Thread support - - Slash commands (future) + - Native slash commands (/ask, /reset, /status, /stop) + - Button-based exec approvals + - Auto-threading for long conversations + - Reaction-based feedback """ # Discord message limits @@ -59,6 +63,7 @@ class DiscordAdapter(BasePlatformAdapter): super().__init__(config, Platform.DISCORD) self._client: Optional[commands.Bot] = None self._ready_event = asyncio.Event() + self._allowed_user_ids: set = set() # For button approval authorization async def connect(self) -> bool: """Connect to Discord and start receiving events.""" @@ -83,10 +88,23 @@ class DiscordAdapter(BasePlatformAdapter): intents=intents, ) + # Parse allowed user IDs for button authorization + allowed_env = os.getenv("DISCORD_ALLOWED_USERS", "") + if allowed_env: + self._allowed_user_ids = { + uid.strip() for uid in allowed_env.split(",") if uid.strip() + } + # Register event handlers @self._client.event async def on_ready(): print(f"[{self.name}] Connected as {self._client.user}") + # Sync slash commands with Discord + try: + synced = await self._client.tree.sync() + print(f"[{self.name}] Synced {len(synced)} slash command(s)") + except Exception as e: + print(f"[{self.name}] Slash command sync failed: {e}") self._ready_event.set() @self._client.event @@ -96,6 +114,9 @@ class DiscordAdapter(BasePlatformAdapter): return await self._handle_message(message) + # Register slash commands + self._register_slash_commands() + # Start the bot in background asyncio.create_task(self._client.start(self.config.token)) @@ -325,6 +346,116 @@ class DiscordAdapter(BasePlatformAdapter): # Discord markdown is fairly standard, no special escaping needed return content + def _register_slash_commands(self) -> None: + """Register Discord slash commands on the command tree.""" + if not self._client: + return + + tree = self._client.tree + + @tree.command(name="ask", description="Ask Hermes a question") + @discord.app_commands.describe(question="Your question for Hermes") + async def slash_ask(interaction: discord.Interaction, question: str): + await interaction.response.defer() + event = self._build_slash_event(interaction, question) + await self.handle_message(event) + # The response is sent via the normal send() flow + # Send a followup to close the interaction if needed + try: + await interaction.followup.send("Processing complete~", ephemeral=True) + except Exception: + pass + + @tree.command(name="reset", description="Reset your Hermes session") + async def slash_reset(interaction: discord.Interaction): + await interaction.response.defer(ephemeral=True) + event = self._build_slash_event(interaction, "/reset") + await self.handle_message(event) + try: + await interaction.followup.send("Session reset~", ephemeral=True) + except Exception: + pass + + @tree.command(name="status", description="Show Hermes session status") + async def slash_status(interaction: discord.Interaction): + await interaction.response.defer(ephemeral=True) + event = self._build_slash_event(interaction, "/status") + await self.handle_message(event) + try: + await interaction.followup.send("Status sent~", ephemeral=True) + except Exception: + pass + + @tree.command(name="stop", description="Stop the running Hermes agent") + async def slash_stop(interaction: discord.Interaction): + await interaction.response.defer(ephemeral=True) + event = self._build_slash_event(interaction, "/stop") + await self.handle_message(event) + try: + await interaction.followup.send("Stop requested~", ephemeral=True) + except Exception: + pass + + def _build_slash_event(self, interaction: discord.Interaction, text: str) -> MessageEvent: + """Build a MessageEvent from a Discord slash command interaction.""" + is_dm = isinstance(interaction.channel, discord.DMChannel) + chat_type = "dm" if is_dm else "group" + chat_name = "" + if not is_dm and hasattr(interaction.channel, "name"): + chat_name = interaction.channel.name + if hasattr(interaction.channel, "guild") and interaction.channel.guild: + chat_name = f"{interaction.channel.guild.name} / #{chat_name}" + + source = self.build_source( + chat_id=str(interaction.channel_id), + chat_name=chat_name, + chat_type=chat_type, + user_id=str(interaction.user.id), + user_name=interaction.user.display_name, + ) + + msg_type = MessageType.COMMAND if text.startswith("/") else MessageType.TEXT + return MessageEvent( + text=text, + message_type=msg_type, + source=source, + raw_message=interaction, + ) + + async def send_exec_approval( + self, chat_id: str, command: str, approval_id: str + ) -> SendResult: + """ + Send a button-based exec approval prompt for a dangerous command. + + Returns SendResult. The approval is resolved when a user clicks a button. + """ + if not self._client or not DISCORD_AVAILABLE: + return SendResult(success=False, error="Not connected") + + try: + channel = self._client.get_channel(int(chat_id)) + if not channel: + channel = await self._client.fetch_channel(int(chat_id)) + + embed = discord.Embed( + title="Command Approval Required", + description=f"```\n{command[:500]}\n```", + color=discord.Color.orange(), + ) + embed.set_footer(text=f"Approval ID: {approval_id}") + + view = ExecApprovalView( + approval_id=approval_id, + allowed_user_ids=self._allowed_user_ids, + ) + + msg = await channel.send(embed=embed, view=view) + return SendResult(success=True, message_id=str(msg.id)) + + except Exception as e: + return SendResult(success=False, error=str(e)) + async def _handle_message(self, message: DiscordMessage) -> None: """Handle incoming Discord messages.""" # In server channels (not DMs), require the bot to be @mentioned @@ -424,8 +555,21 @@ class DiscordAdapter(BasePlatformAdapter): # Fall back to the CDN URL if caching fails media_urls.append(att.url) media_types.append(content_type) + elif content_type.startswith("audio/"): + try: + ext = "." + content_type.split("/")[-1].split(";")[0] + if ext not in (".ogg", ".mp3", ".wav", ".webm", ".m4a"): + ext = ".ogg" + cached_path = await cache_audio_from_url(att.url, ext=ext) + media_urls.append(cached_path) + media_types.append(content_type) + print(f"[Discord] Cached user audio: {cached_path}", flush=True) + except Exception as e: + print(f"[Discord] Failed to cache audio attachment: {e}", flush=True) + media_urls.append(att.url) + media_types.append(content_type) else: - # Non-image attachments: keep the original URL + # Other attachments: keep the original URL media_urls.append(att.url) media_types.append(content_type) @@ -442,3 +586,94 @@ class DiscordAdapter(BasePlatformAdapter): ) await self.handle_message(event) + + +# --------------------------------------------------------------------------- +# Discord UI Components (outside the adapter class) +# --------------------------------------------------------------------------- + +if DISCORD_AVAILABLE: + + class ExecApprovalView(discord.ui.View): + """ + Interactive button view for exec approval of dangerous commands. + + Shows three buttons: Allow Once (green), Always Allow (blue), Deny (red). + Only users in the allowed list can click. The view times out after 5 minutes. + """ + + def __init__(self, approval_id: str, allowed_user_ids: set): + super().__init__(timeout=300) # 5-minute timeout + self.approval_id = approval_id + self.allowed_user_ids = allowed_user_ids + self.resolved = False + + def _check_auth(self, interaction: discord.Interaction) -> bool: + """Verify the user clicking is authorized.""" + if not self.allowed_user_ids: + return True # No allowlist = anyone can approve + return str(interaction.user.id) in self.allowed_user_ids + + async def _resolve( + self, interaction: discord.Interaction, action: str, color: discord.Color + ): + """Resolve the approval and update the message.""" + if self.resolved: + await interaction.response.send_message( + "This approval has already been resolved~", ephemeral=True + ) + return + + if not self._check_auth(interaction): + await interaction.response.send_message( + "You're not authorized to approve commands~", ephemeral=True + ) + return + + self.resolved = True + + # Update the embed with the decision + embed = interaction.message.embeds[0] if interaction.message.embeds else None + if embed: + embed.color = color + embed.set_footer(text=f"{action} by {interaction.user.display_name}") + + # Disable all buttons + for child in self.children: + child.disabled = True + + await interaction.response.edit_message(embed=embed, view=self) + + # Store the approval decision for the gateway to pick up + try: + from tools.terminal_tool import _session_approved_patterns + if action == "allow_once": + pass # One-time approval handled by gateway + elif action == "allow_always": + _session_approved_patterns.add(self.approval_id) + except ImportError: + pass + + @discord.ui.button(label="Allow Once", style=discord.ButtonStyle.green) + async def allow_once( + self, interaction: discord.Interaction, button: discord.ui.Button + ): + await self._resolve(interaction, "allow_once", discord.Color.green()) + + @discord.ui.button(label="Always Allow", style=discord.ButtonStyle.blurple) + async def allow_always( + self, interaction: discord.Interaction, button: discord.ui.Button + ): + await self._resolve(interaction, "allow_always", discord.Color.blue()) + + @discord.ui.button(label="Deny", style=discord.ButtonStyle.red) + async def deny( + self, interaction: discord.Interaction, button: discord.ui.Button + ): + await self._resolve(interaction, "deny", discord.Color.red()) + + async def on_timeout(self): + """Handle view timeout -- disable buttons and mark as expired.""" + self.resolved = True + for child in self.children: + child.disabled = True diff --git a/gateway/platforms/slack.py b/gateway/platforms/slack.py new file mode 100644 index 000000000..a0bd83c87 --- /dev/null +++ b/gateway/platforms/slack.py @@ -0,0 +1,374 @@ +""" +Slack platform adapter. + +Uses slack-bolt (Python) with Socket Mode for: +- Receiving messages from channels and DMs +- Sending responses back +- Handling slash commands +- Thread support +""" + +import asyncio +import os +from typing import Dict, List, Optional, Any + +try: + from slack_bolt.async_app import AsyncApp + from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler + from slack_sdk.web.async_client import AsyncWebClient + SLACK_AVAILABLE = True +except ImportError: + SLACK_AVAILABLE = False + AsyncApp = Any + AsyncSocketModeHandler = Any + AsyncWebClient = Any + +import sys +sys.path.insert(0, str(__file__).rsplit("/", 3)[0]) + +from gateway.config import Platform, PlatformConfig +from gateway.platforms.base import ( + BasePlatformAdapter, + MessageEvent, + MessageType, + SendResult, + cache_image_from_url, + cache_audio_from_url, +) + + +def check_slack_requirements() -> bool: + """Check if Slack dependencies are available.""" + return SLACK_AVAILABLE + + +class SlackAdapter(BasePlatformAdapter): + """ + Slack bot adapter using Socket Mode. + + Requires two tokens: + - SLACK_BOT_TOKEN (xoxb-...) for API calls + - SLACK_APP_TOKEN (xapp-...) for Socket Mode connection + + Features: + - DMs and channel messages (mention-gated in channels) + - Thread support + - File/image/audio attachments + - Slash commands (/hermes) + - Typing indicators (not natively supported by Slack bots) + """ + + MAX_MESSAGE_LENGTH = 4000 # Slack's limit is higher but mrkdwn can inflate + + def __init__(self, config: PlatformConfig): + super().__init__(config, Platform.SLACK) + self._app: Optional[AsyncApp] = None + self._handler: Optional[AsyncSocketModeHandler] = None + self._bot_user_id: Optional[str] = None + + async def connect(self) -> bool: + """Connect to Slack via Socket Mode.""" + if not SLACK_AVAILABLE: + print("[Slack] slack-bolt not installed. Run: pip install slack-bolt") + return False + + bot_token = self.config.token + app_token = os.getenv("SLACK_APP_TOKEN") + + if not bot_token: + print("[Slack] SLACK_BOT_TOKEN not set") + return False + if not app_token: + print("[Slack] SLACK_APP_TOKEN not set") + return False + + try: + self._app = AsyncApp(token=bot_token) + + # Get our own bot user ID for mention detection + auth_response = await self._app.client.auth_test() + self._bot_user_id = auth_response.get("user_id") + bot_name = auth_response.get("user", "unknown") + + # Register message event handler + @self._app.event("message") + async def handle_message_event(event, say): + await self._handle_slack_message(event) + + # Register slash command handler + @self._app.command("/hermes") + async def handle_hermes_command(ack, command): + await ack() + await self._handle_slash_command(command) + + # Start Socket Mode handler in background + self._handler = AsyncSocketModeHandler(self._app, app_token) + asyncio.create_task(self._handler.start_async()) + + self._running = True + print(f"[Slack] Connected as @{bot_name} (Socket Mode)") + return True + + except Exception as e: + print(f"[Slack] Connection failed: {e}") + return False + + async def disconnect(self) -> None: + """Disconnect from Slack.""" + if self._handler: + await self._handler.close_async() + self._running = False + print("[Slack] Disconnected") + + async def send( + self, + chat_id: str, + content: str, + reply_to: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> SendResult: + """Send a message to a Slack channel or DM.""" + if not self._app: + return SendResult(success=False, error="Not connected") + + try: + kwargs = { + "channel": chat_id, + "text": content, + } + + # Reply in thread if thread_ts is available + if reply_to: + kwargs["thread_ts"] = reply_to + elif metadata and metadata.get("thread_ts"): + kwargs["thread_ts"] = metadata["thread_ts"] + + result = await self._app.client.chat_postMessage(**kwargs) + + return SendResult( + success=True, + message_id=result.get("ts"), + raw_response=result, + ) + + except Exception as e: + print(f"[Slack] Send error: {e}") + return SendResult(success=False, error=str(e)) + + async def send_typing(self, chat_id: str) -> None: + """Slack doesn't have a direct typing indicator API for bots.""" + pass + + async def send_image( + self, + chat_id: str, + image_url: str, + caption: Optional[str] = None, + reply_to: Optional[str] = None, + ) -> SendResult: + """Send an image to Slack by uploading the URL as a file.""" + if not self._app: + return SendResult(success=False, error="Not connected") + + try: + import httpx + + # Download the image first + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: + response = await client.get(image_url) + response.raise_for_status() + + result = await self._app.client.files_upload_v2( + channel=chat_id, + content=response.content, + filename="image.png", + initial_comment=caption or "", + thread_ts=reply_to, + ) + + return SendResult(success=True, raw_response=result) + + except Exception as e: + # Fall back to sending the URL as text + text = f"{caption}\n{image_url}" if caption else image_url + return await self.send(chat_id=chat_id, content=text, reply_to=reply_to) + + async def send_voice( + self, + chat_id: str, + audio_path: str, + caption: Optional[str] = None, + reply_to: Optional[str] = None, + ) -> SendResult: + """Send an audio file to Slack.""" + if not self._app: + return SendResult(success=False, error="Not connected") + + try: + result = await self._app.client.files_upload_v2( + channel=chat_id, + file=audio_path, + filename=os.path.basename(audio_path), + initial_comment=caption or "", + thread_ts=reply_to, + ) + return SendResult(success=True, raw_response=result) + + except Exception as e: + return SendResult(success=False, error=str(e)) + + async def get_chat_info(self, chat_id: str) -> Dict[str, Any]: + """Get information about a Slack channel.""" + if not self._app: + return {"name": chat_id, "type": "unknown"} + + try: + result = await self._app.client.conversations_info(channel=chat_id) + channel = result.get("channel", {}) + is_dm = channel.get("is_im", False) + return { + "name": channel.get("name", chat_id), + "type": "dm" if is_dm else "group", + } + except Exception: + return {"name": chat_id, "type": "unknown"} + + # ----- Internal handlers ----- + + async def _handle_slack_message(self, event: dict) -> None: + """Handle an incoming Slack message event.""" + # Ignore bot messages (including our own) + if event.get("bot_id") or event.get("subtype") == "bot_message": + return + + # Ignore message edits and deletions + subtype = event.get("subtype") + if subtype in ("message_changed", "message_deleted"): + return + + text = event.get("text", "") + user_id = event.get("user", "") + channel_id = event.get("channel", "") + thread_ts = event.get("thread_ts") or event.get("ts") + ts = event.get("ts", "") + + # Determine if this is a DM or channel message + channel_type = event.get("channel_type", "") + is_dm = channel_type == "im" + + # In channels, only respond if bot is mentioned + if not is_dm and self._bot_user_id: + if f"<@{self._bot_user_id}>" not in text: + return + # Strip the bot mention from the text + text = text.replace(f"<@{self._bot_user_id}>", "").strip() + + # Determine message type + msg_type = MessageType.TEXT + if text.startswith("/"): + msg_type = MessageType.COMMAND + + # Handle file attachments + media_urls = [] + media_types = [] + files = event.get("files", []) + for f in files: + mimetype = f.get("mimetype", "unknown") + url = f.get("url_private_download") or f.get("url_private", "") + if mimetype.startswith("image/") and url: + try: + ext = "." + mimetype.split("/")[-1].split(";")[0] + if ext not in (".jpg", ".jpeg", ".png", ".gif", ".webp"): + ext = ".jpg" + # Slack private URLs require the bot token as auth header + cached = await self._download_slack_file(url, ext) + media_urls.append(cached) + media_types.append(mimetype) + msg_type = MessageType.PHOTO + except Exception as e: + print(f"[Slack] Failed to cache image: {e}", flush=True) + elif mimetype.startswith("audio/") and url: + try: + ext = "." + mimetype.split("/")[-1].split(";")[0] + if ext not in (".ogg", ".mp3", ".wav", ".webm", ".m4a"): + ext = ".ogg" + cached = await self._download_slack_file(url, ext, audio=True) + media_urls.append(cached) + media_types.append(mimetype) + msg_type = MessageType.VOICE + except Exception as e: + print(f"[Slack] Failed to cache audio: {e}", flush=True) + + # Build source + source = self.build_source( + chat_id=channel_id, + chat_name=channel_id, # Will be resolved later if needed + chat_type="dm" if is_dm else "group", + user_id=user_id, + thread_id=thread_ts, + ) + + msg_event = MessageEvent( + text=text, + message_type=msg_type, + source=source, + raw_message=event, + message_id=ts, + media_urls=media_urls, + media_types=media_types, + reply_to_message_id=thread_ts if thread_ts != ts else None, + ) + + await self.handle_message(msg_event) + + async def _handle_slash_command(self, command: dict) -> None: + """Handle /hermes slash command.""" + text = command.get("text", "").strip() + user_id = command.get("user_id", "") + channel_id = command.get("channel_id", "") + + # Map common slash subcommands to gateway commands + if text in ("new", "reset"): + text = "/reset" + elif text == "status": + text = "/status" + elif text == "stop": + text = "/stop" + elif text: + pass # Treat as a regular question + else: + text = "/help" + + source = self.build_source( + chat_id=channel_id, + chat_type="dm", # Slash commands are always in DM-like context + user_id=user_id, + ) + + event = MessageEvent( + text=text, + message_type=MessageType.COMMAND if text.startswith("/") else MessageType.TEXT, + source=source, + raw_message=command, + ) + + await self.handle_message(event) + + async def _download_slack_file(self, url: str, ext: str, audio: bool = False) -> str: + """Download a Slack file using the bot token for auth.""" + import httpx + + bot_token = self.config.token + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: + response = await client.get( + url, + headers={"Authorization": f"Bearer {bot_token}"}, + ) + response.raise_for_status() + + if audio: + from gateway.platforms.base import cache_audio_from_bytes + return cache_audio_from_bytes(response.content, ext) + else: + from gateway.platforms.base import cache_image_from_bytes + return cache_image_from_bytes(response.content, ext) diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py index 88eb4580b..7e0ff1b28 100644 --- a/gateway/platforms/telegram.py +++ b/gateway/platforms/telegram.py @@ -39,6 +39,7 @@ from gateway.platforms.base import ( MessageType, SendResult, cache_image_from_bytes, + cache_audio_from_bytes, ) @@ -91,7 +92,7 @@ class TelegramAdapter(BasePlatformAdapter): self._handle_command )) self._app.add_handler(TelegramMessageHandler( - filters.PHOTO | filters.VIDEO | filters.AUDIO | filters.VOICE | filters.Document.ALL, + filters.PHOTO | filters.VIDEO | filters.AUDIO | filters.VOICE | filters.Document.ALL | filters.Sticker.ALL, self._handle_media_message )) @@ -311,7 +312,9 @@ class TelegramAdapter(BasePlatformAdapter): msg = update.message # Determine media type - if msg.photo: + if msg.sticker: + msg_type = MessageType.STICKER + elif msg.photo: msg_type = MessageType.PHOTO elif msg.video: msg_type = MessageType.VIDEO @@ -328,6 +331,12 @@ class TelegramAdapter(BasePlatformAdapter): if msg.caption: event.text = msg.caption + # Handle stickers: describe via vision tool with caching + if msg.sticker: + await self._handle_sticker(msg, event) + await self.handle_message(event) + return + # Download photo to local image cache so the vision tool can access it # even after Telegram's ephemeral file URLs expire (~1 hour). if msg.photo: @@ -352,8 +361,97 @@ class TelegramAdapter(BasePlatformAdapter): except Exception as e: print(f"[Telegram] Failed to cache photo: {e}", flush=True) + # Download voice/audio messages to cache for STT transcription + if msg.voice: + try: + file_obj = await msg.voice.get_file() + audio_bytes = await file_obj.download_as_bytearray() + cached_path = cache_audio_from_bytes(bytes(audio_bytes), ext=".ogg") + event.media_urls = [cached_path] + event.media_types = ["audio/ogg"] + print(f"[Telegram] Cached user voice: {cached_path}", flush=True) + except Exception as e: + print(f"[Telegram] Failed to cache voice: {e}", flush=True) + elif msg.audio: + try: + file_obj = await msg.audio.get_file() + audio_bytes = await file_obj.download_as_bytearray() + cached_path = cache_audio_from_bytes(bytes(audio_bytes), ext=".mp3") + event.media_urls = [cached_path] + event.media_types = ["audio/mp3"] + print(f"[Telegram] Cached user audio: {cached_path}", flush=True) + except Exception as e: + print(f"[Telegram] Failed to cache audio: {e}", flush=True) + await self.handle_message(event) + async def _handle_sticker(self, msg: Message, event: "MessageEvent") -> None: + """ + Describe a Telegram sticker via vision analysis, with caching. + + For static stickers (WEBP), we download, analyze with vision, and cache + the description by file_unique_id. For animated/video stickers, we inject + a placeholder noting the emoji. + """ + from gateway.sticker_cache import ( + get_cached_description, + cache_sticker_description, + build_sticker_injection, + build_animated_sticker_injection, + STICKER_VISION_PROMPT, + ) + + sticker = msg.sticker + emoji = sticker.emoji or "" + set_name = sticker.set_name or "" + + # Animated and video stickers can't be analyzed as static images + if sticker.is_animated or sticker.is_video: + event.text = build_animated_sticker_injection(emoji) + return + + # Check the cache first + cached = get_cached_description(sticker.file_unique_id) + if cached: + event.text = build_sticker_injection( + cached["description"], cached.get("emoji", emoji), cached.get("set_name", set_name) + ) + print(f"[Telegram] Sticker cache hit: {sticker.file_unique_id}", flush=True) + return + + # Cache miss -- download and analyze + try: + file_obj = await sticker.get_file() + image_bytes = await file_obj.download_as_bytearray() + cached_path = cache_image_from_bytes(bytes(image_bytes), ext=".webp") + print(f"[Telegram] Analyzing sticker: {cached_path}", flush=True) + + from tools.vision_tools import vision_analyze_tool + import json as _json + + result_json = await vision_analyze_tool( + image_url=cached_path, + user_prompt=STICKER_VISION_PROMPT, + ) + result = _json.loads(result_json) + + if result.get("success"): + description = result.get("analysis", "a sticker") + cache_sticker_description(sticker.file_unique_id, description, emoji, set_name) + event.text = build_sticker_injection(description, emoji, set_name) + else: + # Vision failed -- use emoji as fallback + event.text = build_sticker_injection( + f"a sticker with emoji {emoji}" if emoji else "a sticker", + emoji, set_name, + ) + except Exception as e: + print(f"[Telegram] Sticker analysis error: {e}", flush=True) + event.text = build_sticker_injection( + f"a sticker with emoji {emoji}" if emoji else "a sticker", + emoji, set_name, + ) + def _build_message_event(self, message: Message, msg_type: MessageType) -> MessageEvent: """Build a MessageEvent from a Telegram message.""" chat = message.chat diff --git a/gateway/platforms/whatsapp.py b/gateway/platforms/whatsapp.py index fc319704c..ac0015ed0 100644 --- a/gateway/platforms/whatsapp.py +++ b/gateway/platforms/whatsapp.py @@ -31,6 +31,7 @@ from gateway.platforms.base import ( MessageType, SendResult, cache_image_from_url, + cache_audio_from_url, ) @@ -322,8 +323,18 @@ class WhatsAppAdapter(BasePlatformAdapter): print(f"[{self.name}] Cached user image: {cached_path}", flush=True) except Exception as e: print(f"[{self.name}] Failed to cache image: {e}", flush=True) - cached_urls.append(url) # Fall back to original URL + cached_urls.append(url) media_types.append("image/jpeg") + elif msg_type == MessageType.VOICE and url.startswith(("http://", "https://")): + try: + cached_path = await cache_audio_from_url(url, ext=".ogg") + cached_urls.append(cached_path) + media_types.append("audio/ogg") + print(f"[{self.name}] Cached user voice: {cached_path}", flush=True) + except Exception as e: + print(f"[{self.name}] Failed to cache voice: {e}", flush=True) + cached_urls.append(url) + media_types.append("audio/ogg") else: cached_urls.append(url) media_types.append("unknown") diff --git a/gateway/run.py b/gateway/run.py index a08f7b63a..f8a86193e 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -85,6 +85,14 @@ class GatewayRunner: # Track pending exec approvals per session # Key: session_key, Value: {"command": str, "pattern_key": str} self._pending_approvals: Dict[str, Dict[str, str]] = {} + + # DM pairing store for code-based user authorization + from gateway.pairing import PairingStore + self.pairing_store = PairingStore() + + # Event hook system + from gateway.hooks import HookRegistry + self.hooks = HookRegistry() async def start(self) -> bool: """ @@ -95,6 +103,9 @@ class GatewayRunner: print("[gateway] Starting Hermes Gateway...") print(f"[gateway] Session storage: {self.config.sessions_dir}") + # Discover and load event hooks + self.hooks.discover_and_load() + connected_count = 0 # Initialize and connect each configured platform @@ -131,6 +142,15 @@ class GatewayRunner: self.delivery_router.adapters = self.adapters self._running = True + + # Emit gateway:startup hook + hook_count = len(self.hooks.loaded_hooks) + if hook_count: + print(f"[gateway] {hook_count} hook(s) loaded") + await self.hooks.emit("gateway:startup", { + "platforms": [p.value for p in self.adapters.keys()], + }) + print(f"[gateway] Gateway running with {connected_count} platform(s)") print("[gateway] Press Ctrl+C to stop") @@ -183,18 +203,23 @@ class GatewayRunner: return None return WhatsAppAdapter(config) + elif platform == Platform.SLACK: + from gateway.platforms.slack import SlackAdapter, check_slack_requirements + if not check_slack_requirements(): + print(f"[gateway] Slack: slack-bolt not installed. Run: pip install 'hermes-agent[slack]'") + return None + return SlackAdapter(config) + return None def _is_user_authorized(self, source: SessionSource) -> bool: """ Check if a user is authorized to use the bot. - Authorization is checked via environment variables: - - GATEWAY_ALLOWED_USERS: Comma-separated list of user IDs (all platforms) - - TELEGRAM_ALLOWED_USERS: Telegram-specific user IDs - - DISCORD_ALLOWED_USERS: Discord-specific user IDs - - If no allowlist is configured, all users are allowed (open access). + Checks in order: + 1. Environment variable allowlists (TELEGRAM_ALLOWED_USERS, etc.) + 2. DM pairing approved list + 3. If no allowlists AND no pairing approvals exist, allow all (open access) """ user_id = source.user_id if not user_id: @@ -205,12 +230,18 @@ class GatewayRunner: Platform.TELEGRAM: "TELEGRAM_ALLOWED_USERS", Platform.DISCORD: "DISCORD_ALLOWED_USERS", Platform.WHATSAPP: "WHATSAPP_ALLOWED_USERS", + Platform.SLACK: "SLACK_ALLOWED_USERS", } platform_allowlist = os.getenv(platform_env_map.get(source.platform, "")) global_allowlist = os.getenv("GATEWAY_ALLOWED_USERS", "") - # If no allowlists configured, allow all (backward compatible) + # Check pairing store (always checked, regardless of allowlists) + platform_name = source.platform.value if source.platform else "" + if self.pairing_store.is_approved(platform_name, user_id): + return True + + # If no allowlists configured and no pairing approvals, allow all (backward compatible) if not platform_allowlist and not global_allowlist: return True @@ -241,7 +272,31 @@ class GatewayRunner: # Check if user is authorized if not self._is_user_authorized(source): print(f"[gateway] Unauthorized user: {source.user_id} ({source.user_name}) on {source.platform.value}") - return None # Silently ignore unauthorized users + # In DMs: offer pairing code. In groups: silently ignore. + if source.chat_type == "dm": + platform_name = source.platform.value if source.platform else "unknown" + code = self.pairing_store.generate_code( + platform_name, source.user_id, source.user_name or "" + ) + if code: + adapter = self.adapters.get(source.platform) + if adapter: + await adapter.send( + source.chat_id, + f"Hi~ I don't recognize you yet!\n\n" + f"Here's your pairing code: `{code}`\n\n" + f"Ask the bot owner to run:\n" + f"`hermes pairing approve {platform_name} {code}`" + ) + else: + adapter = self.adapters.get(source.platform) + if adapter: + await adapter.send( + source.chat_id, + "Too many pairing requests right now~ " + "Please try again later!" + ) + return None # Check for commands command = event.get_command() @@ -327,7 +382,34 @@ class GatewayRunner: message_text, image_paths ) + # ----------------------------------------------------------------- + # Auto-transcribe voice/audio messages sent by the user + # ----------------------------------------------------------------- + if event.media_urls: + audio_paths = [] + for i, path in enumerate(event.media_urls): + mtype = event.media_types[i] if i < len(event.media_types) else "" + is_audio = ( + mtype.startswith("audio/") + or event.message_type in (MessageType.VOICE, MessageType.AUDIO) + ) + if is_audio: + audio_paths.append(path) + if audio_paths: + message_text = await self._enrich_message_with_transcription( + message_text, audio_paths + ) + try: + # Emit agent:start hook + hook_ctx = { + "platform": source.platform.value if source.platform else "", + "user_id": source.user_id, + "session_id": session_entry.session_id, + "message": message_text[:500], + } + await self.hooks.emit("agent:start", hook_ctx) + # Run the agent response = await self._run_agent( message=message_text, @@ -338,6 +420,12 @@ class GatewayRunner: session_key=session_key ) + # Emit agent:end hook + await self.hooks.emit("agent:end", { + **hook_ctx, + "response": (response or "")[:500], + }) + # Check if the agent encountered a dangerous command needing approval # The terminal tool stores the last pending approval globally try: @@ -382,6 +470,13 @@ class GatewayRunner: # Reset the session new_entry = self.session_store.reset_session(session_key) + # Emit session:reset hook + await self.hooks.emit("session:reset", { + "platform": source.platform.value if source.platform else "", + "user_id": source.user_id, + "session_key": session_key, + }) + if new_entry: return "✨ Session reset! I've started fresh with no memory of our previous conversation." else: @@ -482,23 +577,22 @@ class GatewayRunner: if result.get("success"): description = result.get("analysis", "") enriched_parts.append( - f"[User sent an image. Vision analysis:\n{description}]\n" - f"[To examine this image further, use vision_analyze with " - f"image_url: {path}]" + f"[The user sent an image~ Here's what I can see:\n{description}]\n" + f"[If you need a closer look, use vision_analyze with " + f"image_url: {path} ~]" ) else: - # Analysis failed -- still tell the model the image exists enriched_parts.append( - f"[User sent an image but automatic analysis failed. " - f"You can try analyzing it with vision_analyze using " - f"image_url: {path}]" + "[The user sent an image but I couldn't quite see it " + "this time (>_<) You can try looking at it yourself " + f"with vision_analyze using image_url: {path}]" ) except Exception as e: print(f"[gateway] Vision auto-analysis error: {e}", flush=True) enriched_parts.append( - f"[User sent an image but automatic analysis encountered an error. " - f"You can try analyzing it with vision_analyze using " - f"image_url: {path}]" + f"[The user sent an image but something went wrong when I " + f"tried to look at it~ You can try examining it yourself " + f"with vision_analyze using image_url: {path}]" ) # Combine: vision descriptions first, then the user's original text @@ -509,6 +603,63 @@ class GatewayRunner: return prefix return user_text + async def _enrich_message_with_transcription( + self, + user_text: str, + audio_paths: List[str], + ) -> str: + """ + Auto-transcribe user voice/audio messages using OpenAI Whisper API + and prepend the transcript to the message text. + + Args: + user_text: The user's original caption / message text. + audio_paths: List of local file paths to cached audio files. + + Returns: + The enriched message string with transcriptions prepended. + """ + from tools.transcription_tools import transcribe_audio + import asyncio + + enriched_parts = [] + for path in audio_paths: + try: + print(f"[gateway] Transcribing user voice: {path}", flush=True) + result = await asyncio.to_thread(transcribe_audio, path) + if result["success"]: + transcript = result["transcript"] + enriched_parts.append( + f'[The user sent a voice message~ ' + f'Here\'s what they said: "{transcript}"]' + ) + else: + error = result.get("error", "unknown error") + if "OPENAI_API_KEY" in error: + enriched_parts.append( + "[The user sent a voice message but I can't listen " + "to it right now~ OPENAI_API_KEY isn't set up yet " + "(';w;') Let them know!]" + ) + else: + enriched_parts.append( + "[The user sent a voice message but I had trouble " + f"transcribing it~ ({error})]" + ) + except Exception as e: + print(f"[gateway] Transcription error: {e}", flush=True) + enriched_parts.append( + "[The user sent a voice message but something went wrong " + "when I tried to listen to it~ Let them know!]" + ) + + if enriched_parts: + prefix = "\n\n".join(enriched_parts) + if user_text: + return f"{prefix}\n\n{user_text}" + return prefix + return user_text + async def _run_agent( self, message: str, diff --git a/gateway/sticker_cache.py b/gateway/sticker_cache.py new file mode 100644 index 000000000..597f672ef --- /dev/null +++ b/gateway/sticker_cache.py @@ -0,0 +1,111 @@ +""" +Sticker description cache for Telegram. + +When users send stickers, we describe them via the vision tool and cache +the descriptions keyed by file_unique_id so we don't re-analyze the same +sticker image on every send. Descriptions are concise (1-2 sentences). + +Cache location: ~/.hermes/sticker_cache.json +""" + +import json +import os +import time +from pathlib import Path +from typing import Optional + + +CACHE_PATH = Path(os.path.expanduser("~/.hermes/sticker_cache.json")) + +# Vision prompt for describing stickers -- kept concise to save tokens +STICKER_VISION_PROMPT = ( + "Describe this sticker in 1-2 sentences. Focus on what it depicts -- " + "character, action, emotion. Be concise and objective." +) + + +def _load_cache() -> dict: + """Load the sticker cache from disk.""" + if CACHE_PATH.exists(): + try: + return json.loads(CACHE_PATH.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError): + return {} + return {} + + +def _save_cache(cache: dict) -> None: + """Save the sticker cache to disk.""" + CACHE_PATH.parent.mkdir(parents=True, exist_ok=True) + CACHE_PATH.write_text( + json.dumps(cache, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + + +def get_cached_description(file_unique_id: str) -> Optional[dict]: + """ + Look up a cached sticker description. + + Returns: + dict with keys {description, emoji, set_name, cached_at} or None. + """ + cache = _load_cache() + return cache.get(file_unique_id) + + +def cache_sticker_description( + file_unique_id: str, + description: str, + emoji: str = "", + set_name: str = "", +) -> None: + """ + Store a sticker description in the cache. + + Args: + file_unique_id: Telegram's stable sticker identifier. + description: Vision-generated description text. + emoji: Associated emoji (e.g. "😀"). + set_name: Sticker set name if available. + """ + cache = _load_cache() + cache[file_unique_id] = { + "description": description, + "emoji": emoji, + "set_name": set_name, + "cached_at": time.time(), + } + _save_cache(cache) + + +def build_sticker_injection( + description: str, + emoji: str = "", + set_name: str = "", +) -> str: + """ + Build the warm-style injection text for a sticker description. + + Returns a string like: + [The user sent a sticker 😀 from "MyPack"~ It shows: "A cat waving" (=^.w.^=)] + """ + context = "" + if set_name and emoji: + context = f" {emoji} from \"{set_name}\"" + elif emoji: + context = f" {emoji}" + + return f"[The user sent a sticker{context}~ It shows: \"{description}\" (=^.w.^=)]" + + +def build_animated_sticker_injection(emoji: str = "") -> str: + """ + Build injection text for animated/video stickers we can't analyze. + """ + if emoji: + return ( + f"[The user sent an animated sticker {emoji}~ " + f"I can't see animated ones yet, but the emoji suggests: {emoji}]" + ) + return "[The user sent an animated sticker~ I can't see animated ones yet]" diff --git a/hermes_cli/config.py b/hermes_cli/config.py index b7917eab5..9e677f3e8 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -117,11 +117,22 @@ DEFAULT_CONFIG = { }, }, + "stt": { + "enabled": True, + "model": "whisper-1", + }, + + "human_delay": { + "mode": "off", + "min_ms": 800, + "max_ms": 2500, + }, + # Permanently allowed dangerous command patterns (added via "always" approval) "command_allowlist": [], # Config schema version - bump this when adding new required fields - "_config_version": 1, + "_config_version": 2, } # ============================================================================= @@ -195,6 +206,20 @@ OPTIONAL_ENV_VARS = { "url": None, "password": True, }, + "SLACK_BOT_TOKEN": { + "description": "Slack bot integration", + "prompt": "Slack Bot Token (xoxb-...)", + "url": "https://api.slack.com/apps", + "tools": ["slack"], + "password": True, + }, + "SLACK_APP_TOKEN": { + "description": "Slack Socket Mode connection", + "prompt": "Slack App Token (xapp-...)", + "url": "https://api.slack.com/apps", + "tools": ["slack"], + "password": True, + }, # Messaging platform tokens "TELEGRAM_BOT_TOKEN": { "description": "Telegram bot token from @BotFather", @@ -375,6 +400,44 @@ def migrate_config(interactive: bool = True, quiet: bool = False) -> Dict[str, A results["warnings"].append(f"Skipped {var['name']} - some features may not work") print() + # Check for missing optional env vars and offer to configure + missing_optional = get_missing_env_vars(required_only=False) + # Filter to only truly optional ones (not already handled as required above) + required_names = {v["name"] for v in missing_env} if missing_env else set() + missing_optional = [v for v in missing_optional if v["name"] not in required_names] + + if missing_optional and not quiet: + print(f"\n ℹ️ {len(missing_optional)} optional API key(s) not configured:") + for var in missing_optional: + tools = var.get("tools", []) + tools_str = f" → enables: {', '.join(tools)}" if tools else "" + print(f" • {var['name']}: {var['description']}{tools_str}") + + if interactive and missing_optional: + print("\n Would you like to configure any optional keys now?") + try: + answer = input(" Configure optional keys? [y/N]: ").strip().lower() + except (EOFError, KeyboardInterrupt): + answer = "n" + + if answer in ("y", "yes"): + print() + for var in missing_optional: + if var.get("url"): + print(f" Get your key at: {var['url']}") + + if var.get("password"): + import getpass + value = getpass.getpass(f" {var['prompt']} (Enter to skip): ") + else: + value = input(f" {var['prompt']} (Enter to skip): ").strip() + + if value: + save_env_value(var["name"], value) + results["env_added"].append(var["name"]) + print(f" ✓ Saved {var['name']}") + print() + # Check for missing config fields missing_config = get_missing_config_fields() diff --git a/hermes_cli/main.py b/hermes_cli/main.py index a4c4f644b..52f7d50d5 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -446,6 +446,34 @@ For more help on a command: config_parser.set_defaults(func=cmd_config) + # ========================================================================= + # pairing command + # ========================================================================= + pairing_parser = subparsers.add_parser( + "pairing", + help="Manage DM pairing codes for user authorization", + description="Approve or revoke user access via pairing codes" + ) + pairing_sub = pairing_parser.add_subparsers(dest="pairing_action") + + pairing_list_parser = pairing_sub.add_parser("list", help="Show pending + approved users") + + pairing_approve_parser = pairing_sub.add_parser("approve", help="Approve a pairing code") + pairing_approve_parser.add_argument("platform", help="Platform name (telegram, discord, slack, whatsapp)") + pairing_approve_parser.add_argument("code", help="Pairing code to approve") + + pairing_revoke_parser = pairing_sub.add_parser("revoke", help="Revoke user access") + pairing_revoke_parser.add_argument("platform", help="Platform name") + pairing_revoke_parser.add_argument("user_id", help="User ID to revoke") + + pairing_clear_parser = pairing_sub.add_parser("clear-pending", help="Clear all pending codes") + + def cmd_pairing(args): + from hermes_cli.pairing import pairing_command + pairing_command(args) + + pairing_parser.set_defaults(func=cmd_pairing) + # ========================================================================= # version command # ========================================================================= diff --git a/hermes_cli/pairing.py b/hermes_cli/pairing.py new file mode 100644 index 000000000..d15d8cb6b --- /dev/null +++ b/hermes_cli/pairing.py @@ -0,0 +1,100 @@ +""" +CLI commands for the DM pairing system. + +Usage: + hermes pairing list # Show all pending + approved users + hermes pairing approve # Approve a pairing code + hermes pairing revoke # Revoke user access + hermes pairing clear-pending # Clear all expired/pending codes +""" + +import sys + + +def pairing_command(args): + """Handle hermes pairing subcommands.""" + from gateway.pairing import PairingStore + + store = PairingStore() + action = getattr(args, "pairing_action", None) + + if action == "list": + _cmd_list(store) + elif action == "approve": + _cmd_approve(store, args.platform, args.code) + elif action == "revoke": + _cmd_revoke(store, args.platform, args.user_id) + elif action == "clear-pending": + _cmd_clear_pending(store) + else: + print("Usage: hermes pairing {list|approve|revoke|clear-pending}") + print("Run 'hermes pairing --help' for details.") + + +def _cmd_list(store): + """List all pending and approved users.""" + pending = store.list_pending() + approved = store.list_approved() + + if not pending and not approved: + print("No pairing data found. No one has tried to pair yet~") + return + + if pending: + print(f"\n Pending Pairing Requests ({len(pending)}):") + print(f" {'Platform':<12} {'Code':<10} {'User ID':<20} {'Name':<20} {'Age'}") + print(f" {'--------':<12} {'----':<10} {'-------':<20} {'----':<20} {'---'}") + for p in pending: + print( + f" {p['platform']:<12} {p['code']:<10} {p['user_id']:<20} " + f"{p.get('user_name', ''):<20} {p['age_minutes']}m ago" + ) + else: + print("\n No pending pairing requests.") + + if approved: + print(f"\n Approved Users ({len(approved)}):") + print(f" {'Platform':<12} {'User ID':<20} {'Name':<20}") + print(f" {'--------':<12} {'-------':<20} {'----':<20}") + for a in approved: + print(f" {a['platform']:<12} {a['user_id']:<20} {a.get('user_name', ''):<20}") + else: + print("\n No approved users.") + + print() + + +def _cmd_approve(store, platform: str, code: str): + """Approve a pairing code.""" + platform = platform.lower().strip() + code = code.upper().strip() + + result = store.approve_code(platform, code) + if result: + uid = result["user_id"] + name = result.get("user_name", "") + display = f"{name} ({uid})" if name else uid + print(f"\n Approved! User {display} on {platform} can now use the bot~") + print(f" They'll be recognized automatically on their next message.\n") + else: + print(f"\n Code '{code}' not found or expired for platform '{platform}'.") + print(f" Run 'hermes pairing list' to see pending codes.\n") + + +def _cmd_revoke(store, platform: str, user_id: str): + """Revoke a user's access.""" + platform = platform.lower().strip() + + if store.revoke(platform, user_id): + print(f"\n Revoked access for user {user_id} on {platform}.\n") + else: + print(f"\n User {user_id} not found in approved list for {platform}.\n") + + +def _cmd_clear_pending(store): + """Clear all pending pairing codes.""" + count = store.clear_pending() + if count: + print(f"\n Cleared {count} pending pairing request(s).\n") + else: + print("\n No pending requests to clear.\n") diff --git a/model_tools.py b/model_tools.py index 473f11798..87942e8e1 100644 --- a/model_tools.py +++ b/model_tools.py @@ -903,6 +903,33 @@ def get_tts_tool_definitions() -> List[Dict[str, Any]]: ] +def get_send_message_tool_definitions(): + """Tool definitions for cross-channel messaging.""" + return [ + { + "type": "function", + "function": { + "name": "send_message", + "description": "Send a message to a user or channel on any connected messaging platform. Use this when the user asks you to send something to a different platform, or when delivering notifications/alerts to a specific destination.", + "parameters": { + "type": "object", + "properties": { + "target": { + "type": "string", + "description": "Delivery target. Format: 'platform' (uses home channel) or 'platform:chat_id' (specific chat). Examples: 'telegram', 'discord:123456789', 'slack:C01234ABCDE'" + }, + "message": { + "type": "string", + "description": "The message text to send" + } + }, + "required": ["target", "message"] + } + } + } + ] + + def get_all_tool_names() -> List[str]: """ Get the names of all available tools across all toolsets. @@ -971,6 +998,9 @@ def get_all_tool_names() -> List[str]: if check_tts_requirements(): tool_names.extend(["text_to_speech"]) + # Cross-channel messaging (always available on messaging platforms) + tool_names.extend(["send_message"]) + return tool_names @@ -1019,6 +1049,8 @@ TOOL_TO_TOOLSET_MAP = { "write_file": "file_tools", "patch": "file_tools", "search": "file_tools", + # Cross-channel messaging + "send_message": "messaging_tools", } @@ -1122,6 +1154,10 @@ def get_tool_definitions( for tool in get_tts_tool_definitions(): all_available_tools_map[tool["function"]["name"]] = tool + # Cross-channel messaging (always available on messaging platforms) + for tool in get_send_message_tool_definitions(): + all_available_tools_map[tool["function"]["name"]] = tool + # Determine which tools to include based on toolsets tools_to_include = set() @@ -1693,6 +1729,22 @@ def handle_tts_function_call( return json.dumps({"error": f"Unknown TTS function: {function_name}"}, ensure_ascii=False) +def handle_send_message_function_call(function_name, function_args): + """Handle cross-channel send_message tool calls.""" + import json + target = function_args.get("target", "") + message = function_args.get("message", "") + if not target or not message: + return json.dumps({"error": "Both 'target' and 'message' are required"}) + + # Store the pending message for the gateway to deliver + # The gateway runner checks this after the agent loop completes + import os + os.environ["_HERMES_PENDING_SEND_TARGET"] = target + os.environ["_HERMES_PENDING_SEND_MESSAGE"] = message + return json.dumps({"success": True, "delivered_to": target, "note": "Message queued for delivery"}) + + def handle_function_call( function_name: str, function_args: Dict[str, Any], @@ -1774,6 +1826,10 @@ def handle_function_call( elif function_name in ["text_to_speech"]: return handle_tts_function_call(function_name, function_args) + # Route cross-channel messaging + elif function_name == "send_message": + return handle_send_message_function_call(function_name, function_args) + else: error_msg = f"Unknown function: {function_name}" print(f"❌ {error_msg}") diff --git a/pyproject.toml b/pyproject.toml index 30bf3563d..e0ee1ae63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,8 +38,9 @@ dependencies = [ [project.optional-dependencies] modal = ["swe-rex[modal]>=1.4.0"] dev = ["pytest", "pytest-asyncio"] -messaging = ["python-telegram-bot>=20.0", "discord.py>=2.0", "aiohttp>=3.9.0"] +messaging = ["python-telegram-bot>=20.0", "discord.py>=2.0", "aiohttp>=3.9.0", "slack-bolt>=1.18.0", "slack-sdk>=3.27.0"] cron = ["croniter"] +slack = ["slack-bolt>=1.18.0", "slack-sdk>=3.27.0"] cli = ["simple-term-menu"] tts-premium = ["elevenlabs"] all = [ @@ -49,6 +50,7 @@ all = [ "hermes-agent[cli]", "hermes-agent[dev]", "hermes-agent[tts-premium]", + "hermes-agent[slack]", ] [project.scripts] diff --git a/scripts/install.ps1 b/scripts/install.ps1 index 2ec25f324..ff63b0450 100644 --- a/scripts/install.ps1 +++ b/scripts/install.ps1 @@ -443,6 +443,10 @@ function Copy-ConfigTemplates { New-Item -ItemType Directory -Force -Path "$HermesHome\cron" | Out-Null New-Item -ItemType Directory -Force -Path "$HermesHome\sessions" | Out-Null New-Item -ItemType Directory -Force -Path "$HermesHome\logs" | Out-Null + New-Item -ItemType Directory -Force -Path "$HermesHome\pairing" | Out-Null + New-Item -ItemType Directory -Force -Path "$HermesHome\hooks" | Out-Null + New-Item -ItemType Directory -Force -Path "$HermesHome\image_cache" | Out-Null + New-Item -ItemType Directory -Force -Path "$HermesHome\audio_cache" | Out-Null # Create .env $envPath = "$HermesHome\.env" diff --git a/scripts/install.sh b/scripts/install.sh index d6c9609f1..84b25fd5a 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -614,9 +614,7 @@ copy_config_templates() { log_info "Setting up configuration files..." # Create ~/.hermes directory structure (config at top level, code in subdir) - mkdir -p "$HERMES_HOME/cron" - mkdir -p "$HERMES_HOME/sessions" - mkdir -p "$HERMES_HOME/logs" + mkdir -p "$HERMES_HOME"/{cron,sessions,logs,pairing,hooks,image_cache,audio_cache} # Create .env at ~/.hermes/.env (top level, easy to find) if [ ! -f "$HERMES_HOME/.env" ]; then diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py new file mode 100644 index 000000000..18a000ef5 --- /dev/null +++ b/tools/transcription_tools.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +""" +Transcription Tools Module + +Provides speech-to-text transcription using OpenAI's Whisper API. +Used by the messaging gateway to automatically transcribe voice messages +sent by users on Telegram, Discord, WhatsApp, and Slack. + +Supported models: + - whisper-1 (cheapest, good quality) + - gpt-4o-mini-transcribe (better quality, higher cost) + - gpt-4o-transcribe (best quality, highest cost) + +Supported input formats: mp3, mp4, mpeg, mpga, m4a, wav, webm, ogg + +Usage: + from tools.transcription_tools import transcribe_audio + + result = transcribe_audio("/path/to/audio.ogg") + if result["success"]: + print(result["transcript"]) +""" + +import os +from pathlib import Path +from typing import Optional + + +# Default STT model -- cheapest and widely available +DEFAULT_STT_MODEL = "whisper-1" + + +def transcribe_audio(file_path: str, model: Optional[str] = None) -> dict: + """ + Transcribe an audio file using OpenAI's Whisper API. + + This function calls the OpenAI Audio Transcriptions endpoint directly + (not via OpenRouter, since Whisper isn't available there). + + Args: + file_path: Absolute path to the audio file to transcribe. + model: Whisper model to use. Defaults to config or "whisper-1". + + Returns: + dict with keys: + - "success" (bool): Whether transcription succeeded + - "transcript" (str): The transcribed text (empty on failure) + - "error" (str, optional): Error message if success is False + """ + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + return { + "success": False, + "transcript": "", + "error": "OPENAI_API_KEY not set", + } + + audio_path = Path(file_path) + if not audio_path.is_file(): + return { + "success": False, + "transcript": "", + "error": f"Audio file not found: {file_path}", + } + + # Use provided model, or fall back to default + if model is None: + model = DEFAULT_STT_MODEL + + try: + from openai import OpenAI + + client = OpenAI(api_key=api_key) + + with open(file_path, "rb") as audio_file: + transcription = client.audio.transcriptions.create( + model=model, + file=audio_file, + response_format="text", + ) + + # The response is a plain string when response_format="text" + transcript_text = str(transcription).strip() + + print(f"[STT] Transcribed {audio_path.name} ({len(transcript_text)} chars)", flush=True) + + return { + "success": True, + "transcript": transcript_text, + } + + except Exception as e: + print(f"[STT] Transcription error: {e}", flush=True) + return { + "success": False, + "transcript": "", + "error": str(e), + } + + +def check_stt_requirements() -> bool: + """Check if OpenAI API key is available for speech-to-text.""" + return bool(os.getenv("OPENAI_API_KEY")) diff --git a/toolsets.py b/toolsets.py index c90628314..19eb6f272 100644 --- a/toolsets.py +++ b/toolsets.py @@ -184,7 +184,9 @@ TOOLSETS = { # Skills - access knowledge base "skills_list", "skill_view", # Cronjob management - let users schedule tasks - "schedule_cronjob", "list_cronjobs", "remove_cronjob" + "schedule_cronjob", "list_cronjobs", "remove_cronjob", + # Cross-channel messaging + "send_message" ], "includes": [] }, @@ -207,7 +209,9 @@ TOOLSETS = { # Skills - access knowledge base "skills_list", "skill_view", # Cronjob management - let users schedule tasks - "schedule_cronjob", "list_cronjobs", "remove_cronjob" + "schedule_cronjob", "list_cronjobs", "remove_cronjob", + # Cross-channel messaging + "send_message" ], "includes": [] }, @@ -230,7 +234,34 @@ TOOLSETS = { # Skills "skills_list", "skill_view", # Cronjob management - "schedule_cronjob", "list_cronjobs", "remove_cronjob" + "schedule_cronjob", "list_cronjobs", "remove_cronjob", + # Cross-channel messaging + "send_message" + ], + "includes": [] + }, + + "hermes-slack": { + "description": "Slack bot toolset - full access for workspace use (terminal has safety checks)", + "tools": [ + # Terminal - enabled with dangerous command approval system + "terminal", + # File manipulation + "read_file", "write_file", "patch", "search", + # Web tools + "web_search", "web_extract", + # Vision - analyze images sent by users + "vision_analyze", + # Image generation + "image_generate", + # Text-to-speech + "text_to_speech", + # Skills - access knowledge base + "skills_list", "skill_view", + # Cronjob management - let users schedule tasks + "schedule_cronjob", "list_cronjobs", "remove_cronjob", + # Cross-channel messaging + "send_message" ], "includes": [] }, @@ -238,7 +269,7 @@ TOOLSETS = { "hermes-gateway": { "description": "Gateway toolset - union of all messaging platform tools", "tools": [], - "includes": ["hermes-telegram", "hermes-discord", "hermes-whatsapp"] + "includes": ["hermes-telegram", "hermes-discord", "hermes-whatsapp", "hermes-slack"] } }