From a182d127787341825bfaf3f55c4215ef1e6bb4d2 Mon Sep 17 00:00:00 2001 From: JackTheGit Date: Wed, 11 Mar 2026 15:49:00 +0000 Subject: [PATCH 01/34] Fix several documentation typos across training references --- skills/mlops/training/axolotl/references/api.md | 4 ++-- skills/mlops/training/pytorch-fsdp/references/other.md | 6 +++--- skills/mlops/training/unsloth/references/llms-full.md | 8 ++++---- skills/mlops/training/unsloth/references/llms-txt.md | 8 ++++---- skills/mlops/training/unsloth/references/llms.md | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/skills/mlops/training/axolotl/references/api.md b/skills/mlops/training/axolotl/references/api.md index f00b6eb6a..2f94b5394 100644 --- a/skills/mlops/training/axolotl/references/api.md +++ b/skills/mlops/training/axolotl/references/api.md @@ -3240,7 +3240,7 @@ Prompt Strategy for finetuning Llama2 chat models see also https://github.com/fa This implementation is based on the Vicuna PR and the fastchat repo, see also: https://github.com/lm-sys/FastChat/blob/cdd7730686cb1bf9ae2b768ee171bdf7d1ff04f3/fastchat/conversation.py#L847 -Use dataset type: “llama2_chat” in conig.yml to use this prompt style. +Use dataset type: “llama2_chat” in config.yml to use this prompt style. E.g. in the config.yml: @@ -4991,7 +4991,7 @@ prompt_strategies.orcamini Prompt Strategy for finetuning Orca Mini (v2) models see also https://huggingface.co/psmathur/orca_mini_v2_7b for more information -Use dataset type: orcamini in conig.yml to use this prompt style. +Use dataset type: orcamini in config.yml to use this prompt style. Compared to the alpaca_w_system.open_orca dataset type, this one specifies the system prompt with “### System:”. 
diff --git a/skills/mlops/training/pytorch-fsdp/references/other.md b/skills/mlops/training/pytorch-fsdp/references/other.md index d5b6cae6f..8af7bb518 100644 --- a/skills/mlops/training/pytorch-fsdp/references/other.md +++ b/skills/mlops/training/pytorch-fsdp/references/other.md @@ -2290,7 +2290,7 @@ This call gives the AsyncStager the opportunity to ‘stage’ the state_dict. T for serializing the state_dict and writing it to storage. -the serialization thread starts and before returning from dcp.async_save. If this is set to False, the assumption is the user has defined a custom synchronization point for the the purpose of further optimizing save latency in the training loop (for example, by overlapping staging with the forward/backward pass), and it is the respondsibility of the user to call AsyncStager.synchronize_staging at the appropriate time. +the serialization thread starts and before returning from dcp.async_save. If this is set to False, the assumption is the user has defined a custom synchronization point for the purpose of further optimizing save latency in the training loop (for example, by overlapping staging with the forward/backward pass), and it is the respondsibility of the user to call AsyncStager.synchronize_staging at the appropriate time. Clean up all resources used by the stager. @@ -3441,7 +3441,7 @@ The target module does not have to be an FSDP module. A StateDictSettings containing the state_dict_type and state_dict / optim_state_dict configs that are currently set. -AssertionError` if the StateDictSettings for differen – +AssertionError` if the StateDictSettings for different – FSDP submodules differ. – @@ -3766,7 +3766,7 @@ The sharing is done as described by ZeRO. The local optimizer instance in each rank is only responsible for updating approximately 1 / world_size parameters and hence only needs to keep 1 / world_size optimizer states. 
After parameters are updated locally, each rank will broadcast its parameters to all other peers to keep all model replicas in the same state. ZeroRedundancyOptimizer can be used in conjunction with torch.nn.parallel.DistributedDataParallel to reduce per-rank peak memory consumption. -ZeroRedundancyOptimizer uses a sorted-greedy algorithm to pack a number of parameters at each rank. Each parameter belongs to a single rank and is not divided among ranks. The partition is arbitrary and might not match the the parameter registration or usage order. +ZeroRedundancyOptimizer uses a sorted-greedy algorithm to pack a number of parameters at each rank. Each parameter belongs to a single rank and is not divided among ranks. The partition is arbitrary and might not match the parameter registration or usage order. params (Iterable) – an Iterable of torch.Tensor s or dict s giving all parameters, which will be sharded across ranks. diff --git a/skills/mlops/training/unsloth/references/llms-full.md b/skills/mlops/training/unsloth/references/llms-full.md index b0b6b24d9..df3d2eebb 100644 --- a/skills/mlops/training/unsloth/references/llms-full.md +++ b/skills/mlops/training/unsloth/references/llms-full.md @@ -6348,7 +6348,7 @@ Our chat templates for the GGUF, our BnB and BF16 uploads and all versions are f ### :1234: Precision issues -We found multiple precision issues in Tesla T4 and float16 machines primarily since the model was trained using BF16, and so outliers and overflows existed. MXFP4 is not actually supported on Ampere and older GPUs, so Triton provides `tl.dot_scaled` for MXFP4 matrix multiplication. It upcasts the matrices to BF16 internaly on the fly. +We found multiple precision issues in Tesla T4 and float16 machines primarily since the model was trained using BF16, and so outliers and overflows existed. MXFP4 is not actually supported on Ampere and older GPUs, so Triton provides `tl.dot_scaled` for MXFP4 matrix multiplication. 
It upcasts the matrices to BF16 internally on the fly. We made a [MXFP4 inference notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/GPT_OSS_MXFP4_\(20B\)-Inference.ipynb) as well in Tesla T4 Colab! @@ -14877,7 +14877,7 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \ # Text-to-Speech (TTS) Fine-tuning -Learn how to to fine-tune TTS & STT voice models with Unsloth. +Learn how to fine-tune TTS & STT voice models with Unsloth. Fine-tuning TTS models allows them to adapt to your specific dataset, use case, or desired style and tone. The goal is to customize these models to clone voices, adapt speaking styles and tones, support new languages, handle specific tasks and more. We also support **Speech-to-Text (STT)** models like OpenAI's Whisper. @@ -15306,7 +15306,7 @@ snapshot_download( ) ``` -And and let's do inference! +And let's do inference! {% code overflow="wrap" %} @@ -16036,7 +16036,7 @@ Then train the model as usual via `trainer.train() .` Tips to solve issues, and frequently asked questions. -If you're still encountering any issues with versions or depencies, please use our [Docker image](https://docs.unsloth.ai/get-started/install-and-update/docker) which will have everything pre-installed. +If you're still encountering any issues with versions or dependencies, please use our [Docker image](https://docs.unsloth.ai/get-started/install-and-update/docker) which will have everything pre-installed. 
{% hint style="success" %} **Try always to update Unsloth if you find any issues.** diff --git a/skills/mlops/training/unsloth/references/llms-txt.md b/skills/mlops/training/unsloth/references/llms-txt.md index c5895c7cd..22f651e41 100644 --- a/skills/mlops/training/unsloth/references/llms-txt.md +++ b/skills/mlops/training/unsloth/references/llms-txt.md @@ -40,7 +40,7 @@ Read more on running Llama 4 here: Date: Thu, 12 Mar 2026 09:59:17 +0000 Subject: [PATCH 02/34] Fix checkpoint_id typos and add StorageMeta example in checkpoint storage docs --- .../training/pytorch-fsdp/references/other.md | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/skills/mlops/training/pytorch-fsdp/references/other.md b/skills/mlops/training/pytorch-fsdp/references/other.md index 8af7bb518..2b544dc98 100644 --- a/skills/mlops/training/pytorch-fsdp/references/other.md +++ b/skills/mlops/training/pytorch-fsdp/references/other.md @@ -2430,7 +2430,7 @@ Read the checkpoint metadata. The metadata object associated with the checkpoint being loaded. -Calls to indicates a brand new checkpoint read is going to happen. A checkpoint_id may be present if users set the checkpoint_id for this checkpoint read. The meaning of the checkpiont_id is storage-dependent. It can be a path to a folder/file or a key for a key-value storage. +Calls to indicates a brand new checkpoint read is going to happen. A checkpoint_id may be present if users set the checkpoint_id for this checkpoint read. The meaning of the checkpoint_id is storage-dependent. It can be a path to a folder/file or a key for a key-value storage. checkpoint_id (Union[str, os.PathLike, None]) – The ID of this checkpoint instance. The meaning of the checkpoint_id depends on the storage. It can be a path to a folder or to a file. It can also be a key if the storage is more like a key-value store. (Default: None) @@ -2488,7 +2488,7 @@ plan (SavePlan) – The local plan from the SavePlanner in use. 
A transformed SavePlan after storage local planning -Calls to indicates a brand new checkpoint write is going to happen. A checkpoint_id may be present if users set the checkpoint_id for this checkpoint write. The meaning of the checkpiont_id is storage-dependent. It can be a path to a folder/file or a key for a key-value storage. +Calls to indicates a brand new checkpoint write is going to happen. A checkpoint_id may be present if users set the checkpoint_id for this checkpoint write. The meaning of the checkpoint_id is storage-dependent. It can be a path to a folder/file or a key for a key-value storage. checkpoint_id (Union[str, os.PathLike, None]) – The ID of this checkpoint instance. The meaning of the checkpoint_id depends on the storage. It can be a path to a folder or to a file. It can also be a key if the storage is a key-value store. (Default: None) @@ -2498,7 +2498,19 @@ is_coordinator (bool) – Whether this instance is responsible for coordinating Return the storage-specific metadata. This is used to store additional information in a checkpoint that can be useful for providing request-level observability. StorageMeta is passed to the SavePlanner during save calls. Returns None by default. -TODO: provide an example +Example: + +```python +from torch.distributed.checkpoint.storage import StorageMeta + +class CustomStorageBackend: + def get_storage_metadata(self): + # Return storage-specific metadata that will be stored with the checkpoint + return StorageMeta() +``` + +This example shows how a storage backend can return `StorageMeta` +to attach additional metadata to a checkpoint. 
Optional[StorageMeta] From f43c078f9e07aadc4a2e20fe09e134925dbbe2e1 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 09:50:45 -0700 Subject: [PATCH 03/34] docs(voice): add comprehensive voice mode guide Add a hands-on guide for using voice mode with Hermes, fix and expand the main voice-mode docs, surface /voice in messaging docs, and improve discoverability from the homepage and learning path. --- website/docs/getting-started/learning-path.md | 4 +- .../docs/guides/use-voice-mode-with-hermes.md | 422 ++++++++++++++++++ website/docs/index.md | 2 + .../docs/user-guide/features/voice-mode.md | 4 +- website/docs/user-guide/messaging/discord.md | 5 + website/docs/user-guide/messaging/index.md | 3 + website/sidebars.ts | 1 + 7 files changed, 439 insertions(+), 2 deletions(-) create mode 100644 website/docs/guides/use-voice-mode-with-hermes.md diff --git a/website/docs/getting-started/learning-path.md b/website/docs/getting-started/learning-path.md index 2c08f077e..bcdbb44d4 100644 --- a/website/docs/getting-started/learning-path.md +++ b/website/docs/getting-started/learning-path.md @@ -54,7 +54,9 @@ Deploy Hermes Agent as a bot on your favorite messaging platform. 3. [Messaging Overview](/docs/user-guide/messaging) 4. [Telegram Setup](/docs/user-guide/messaging/telegram) 5. [Discord Setup](/docs/user-guide/messaging/discord) -6. [Security](/docs/user-guide/security) +6. [Voice Mode](/docs/user-guide/features/voice-mode) +7. [Use Voice Mode with Hermes](/docs/guides/use-voice-mode-with-hermes) +8. 
[Security](/docs/user-guide/security) For full project examples, see: - [Daily Briefing Bot](/docs/guides/daily-briefing-bot) diff --git a/website/docs/guides/use-voice-mode-with-hermes.md b/website/docs/guides/use-voice-mode-with-hermes.md new file mode 100644 index 000000000..dc35dcc65 --- /dev/null +++ b/website/docs/guides/use-voice-mode-with-hermes.md @@ -0,0 +1,422 @@ +--- +sidebar_position: 7 +title: "Use Voice Mode with Hermes" +description: "A practical guide to setting up and using Hermes voice mode across CLI, Telegram, Discord, and Discord voice channels" +--- + +# Use Voice Mode with Hermes + +This guide is the practical companion to the [Voice Mode feature reference](/docs/user-guide/features/voice-mode). + +If the feature page explains what voice mode can do, this guide shows how to actually use it well. + +## What voice mode is good for + +Voice mode is especially useful when: +- you want a hands-free CLI workflow +- you want spoken responses in Telegram or Discord +- you want Hermes sitting in a Discord voice channel for live conversation +- you want quick idea capture, debugging, or back-and-forth while walking around instead of typing + +## Choose your voice mode setup + +There are really three different voice experiences in Hermes. + +| Mode | Best for | Platform | +|---|---|---| +| Interactive microphone loop | Personal hands-free use while coding or researching | CLI | +| Voice replies in chat | Spoken responses alongside normal messaging | Telegram, Discord | +| Live voice channel bot | Group or personal live conversation in a VC | Discord voice channels | + +A good path is: +1. get text working first +2. enable voice replies second +3. 
move to Discord voice channels last if you want the full experience + +## Step 1: make sure normal Hermes works first + +Before touching voice mode, verify that: +- Hermes starts +- your provider is configured +- the agent can answer text prompts normally + +```bash +hermes +``` + +Ask something simple: + +```text +What tools do you have available? +``` + +If that is not solid yet, fix text mode first. + +## Step 2: install the right extras + +### CLI microphone + playback + +```bash +pip install hermes-agent[voice] +``` + +### Messaging platforms + +```bash +pip install hermes-agent[messaging] +``` + +### Premium ElevenLabs TTS + +```bash +pip install hermes-agent[tts-premium] +``` + +### Everything + +```bash +pip install hermes-agent[all] +``` + +## Step 3: install system dependencies + +### macOS + +```bash +brew install portaudio ffmpeg opus +``` + +### Ubuntu / Debian + +```bash +sudo apt install portaudio19-dev ffmpeg libopus0 +``` + +Why these matter: +- `portaudio` → microphone input / playback for CLI voice mode +- `ffmpeg` → audio conversion for TTS and messaging delivery +- `opus` → Discord voice codec support + +## Step 4: choose STT and TTS providers + +Hermes supports both local and cloud speech stacks. + +### Easiest / cheapest setup + +Use local STT and free Edge TTS: +- STT provider: `local` +- TTS provider: `edge` + +This is usually the best place to start. 
+ +### Environment file example + +Add to `~/.hermes/.env`: + +```bash +# Cloud STT options (local needs no key) +GROQ_API_KEY=*** +VOICE_TOOLS_OPENAI_KEY=*** + +# Premium TTS (optional) +ELEVENLABS_API_KEY=*** +``` + +### Provider recommendations + +#### Speech-to-text + +- `local` → best default for privacy and zero-cost use +- `groq` → very fast cloud transcription +- `openai` → good paid fallback + +#### Text-to-speech + +- `edge` → free and good enough for most users +- `elevenlabs` → best quality +- `openai` → good middle ground + +## Step 5: recommended config + +```yaml +voice: + record_key: "ctrl+b" + max_recording_seconds: 120 + auto_tts: false + silence_threshold: 200 + silence_duration: 3.0 + +stt: + provider: "local" + local: + model: "base" + +tts: + provider: "edge" + edge: + voice: "en-US-AriaNeural" +``` + +This is a good conservative default for most people. + +## Use case 1: CLI voice mode + +## Turn it on + +Start Hermes: + +```bash +hermes +``` + +Inside the CLI: + +```text +/voice on +``` + +### Recording flow + +Default key: +- `Ctrl+B` + +Workflow: +1. press `Ctrl+B` +2. speak +3. wait for silence detection to stop recording automatically +4. Hermes transcribes and responds +5. if TTS is on, it speaks the answer +6. the loop can automatically restart for continuous use + +### Useful commands + +```text +/voice +/voice on +/voice off +/voice tts +/voice status +``` + +### Good CLI workflows + +#### Walk-up debugging + +Say: + +```text +I keep getting a docker permission error. Help me debug it. +``` + +Then continue hands-free: +- "Read the last error again" +- "Explain the root cause in simpler terms" +- "Now give me the exact fix" + +#### Research / brainstorming + +Great for: +- walking around while thinking +- dictating half-formed ideas +- asking Hermes to structure your thoughts in real time + +#### Accessibility / low-typing sessions + +If typing is inconvenient, voice mode is one of the fastest ways to stay in the full Hermes loop. 
+ +## Tuning CLI behavior + +### Silence threshold + +If Hermes starts/stops too aggressively, tune: + +```yaml +voice: + silence_threshold: 250 +``` + +Higher threshold = less sensitive. + +### Silence duration + +If you pause a lot between sentences, increase: + +```yaml +voice: + silence_duration: 4.0 +``` + +### Record key + +If `Ctrl+B` conflicts with your terminal or tmux habits: + +```yaml +voice: + record_key: "ctrl+space" +``` + +## Use case 2: voice replies in Telegram or Discord + +This mode is simpler than full voice channels. + +Hermes stays a normal chat bot, but can speak replies. + +### Start the gateway + +```bash +hermes gateway +``` + +### Turn on voice replies + +Inside Telegram or Discord: + +```text +/voice on +``` + +or + +```text +/voice tts +``` + +### Modes + +| Mode | Meaning | +|---|---| +| `off` | text only | +| `voice_only` | speak only when the user sent voice | +| `all` | speak every reply | + +### When to use which mode + +- `/voice on` if you want spoken replies only for voice-originating messages +- `/voice tts` if you want a full spoken assistant all the time + +### Good messaging workflows + +#### Telegram assistant on your phone + +Use when: +- you are away from your machine +- you want to send voice notes and get quick spoken replies +- you want Hermes to function like a portable research or ops assistant + +#### Discord DMs with spoken output + +Useful when you want private interaction without server-channel mention behavior. + +## Use case 3: Discord voice channels + +This is the most advanced mode. + +Hermes joins a Discord VC, listens to user speech, transcribes it, runs the normal agent pipeline, and speaks replies back into the channel. 
+ +## Required Discord permissions + +In addition to the normal text-bot setup, make sure the bot has: +- Connect +- Speak +- preferably Use Voice Activity + +Also enable privileged intents in the Developer Portal: +- Presence Intent +- Server Members Intent +- Message Content Intent + +## Join and leave + +In a Discord text channel where the bot is present: + +```text +/voice join +/voice leave +/voice status +``` + +### What happens when joined + +- users speak in the VC +- Hermes detects speech boundaries +- transcripts are posted in the associated text channel +- Hermes responds in text and audio +- the text channel is the one where `/voice join` was issued + +### Best practices for Discord VC use + +- keep `DISCORD_ALLOWED_USERS` tight +- use a dedicated bot/testing channel at first +- verify STT and TTS work in ordinary text-chat voice mode before trying VC mode + +## Voice quality recommendations + +### Best quality setup + +- STT: local `large-v3` or Groq `whisper-large-v3` +- TTS: ElevenLabs + +### Best speed / convenience setup + +- STT: local `base` or Groq +- TTS: Edge + +### Best zero-cost setup + +- STT: local +- TTS: Edge + +## Common failure modes + +### "No audio device found" + +Install `portaudio`. + +### "Bot joins but hears nothing" + +Check: +- your Discord user ID is in `DISCORD_ALLOWED_USERS` +- you are not muted +- privileged intents are enabled +- the bot has Connect/Speak permissions + +### "It transcribes but does not speak" + +Check: +- TTS provider config +- API key / quota for ElevenLabs or OpenAI +- `ffmpeg` install for Edge conversion paths + +### "Whisper outputs garbage" + +Try: +- quieter environment +- higher `silence_threshold` +- different STT provider/model +- shorter, clearer utterances + +### "It works in DMs but not in server channels" + +That is often mention policy. + +By default, the bot needs an `@mention` in Discord server text channels unless configured otherwise. 
+ +## Suggested first-week setup + +If you want the shortest path to success: + +1. get text Hermes working +2. install `hermes-agent[voice]` +3. use CLI voice mode with local STT + Edge TTS +4. then enable `/voice on` in Telegram or Discord +5. only after that, try Discord VC mode + +That progression keeps the debugging surface small. + +## Where to read next + +- [Voice Mode feature reference](/docs/user-guide/features/voice-mode) +- [Messaging Gateway](/docs/user-guide/messaging) +- [Discord setup](/docs/user-guide/messaging/discord) +- [Telegram setup](/docs/user-guide/messaging/telegram) +- [Configuration](/docs/user-guide/configuration) diff --git a/website/docs/index.md b/website/docs/index.md index 3dbfcaf71..470c8d2ed 100644 --- a/website/docs/index.md +++ b/website/docs/index.md @@ -33,6 +33,8 @@ It's not a coding copilot tethered to an IDE or a chatbot wrapper around a singl | 📚 **[Skills System](/docs/user-guide/features/skills)** | Procedural memory the agent creates and reuses | | 🔌 **[MCP Integration](/docs/user-guide/features/mcp)** | Connect to MCP servers, filter their tools, and extend Hermes safely | | 🧭 **[Use MCP with Hermes](/docs/guides/use-mcp-with-hermes)** | Practical MCP setup patterns, examples, and tutorials | +| 🎙️ **[Voice Mode](/docs/user-guide/features/voice-mode)** | Real-time voice interaction in CLI, Telegram, Discord, and Discord VC | +| 🗣️ **[Use Voice Mode with Hermes](/docs/guides/use-voice-mode-with-hermes)** | Hands-on setup and usage patterns for Hermes voice workflows | | 🎭 **[Personality & SOUL.md](/docs/user-guide/features/personality)** | Define Hermes' default voice with a global SOUL.md | | 📄 **[Context Files](/docs/user-guide/features/context-files)** | Project context files that shape every conversation | | 🔒 **[Security](/docs/user-guide/security)** | Command approval, authorization, container isolation | diff --git a/website/docs/user-guide/features/voice-mode.md b/website/docs/user-guide/features/voice-mode.md 
index ce151643a..3c94062f7 100644 --- a/website/docs/user-guide/features/voice-mode.md +++ b/website/docs/user-guide/features/voice-mode.md @@ -8,11 +8,13 @@ description: "Real-time voice conversations with Hermes Agent — CLI, Telegram, Hermes Agent supports full voice interaction across CLI and messaging platforms. Talk to the agent using your microphone, hear spoken replies, and have live voice conversations in Discord voice channels. +If you want a practical setup walkthrough with recommended configurations and real usage patterns, see [Use Voice Mode with Hermes](/docs/guides/use-voice-mode-with-hermes). + ## Prerequisites Before using voice features, make sure you have: -1. **Hermes Agent installed** — `pip install hermes-agent` (see [Getting Started](../../getting-started.md)) +1. **Hermes Agent installed** — `pip install hermes-agent` (see [Installation](/docs/getting-started/installation)) 2. **An LLM provider configured** — set `OPENAI_API_KEY`, `OPENAI_BASE_URL`, and `LLM_MODEL` in `~/.hermes/.env` 3. **A working base setup** — run `hermes` to verify the agent responds to text before enabling voice diff --git a/website/docs/user-guide/messaging/discord.md b/website/docs/user-guide/messaging/discord.md index 0fc7f8cbc..b5f060596 100644 --- a/website/docs/user-guide/messaging/discord.md +++ b/website/docs/user-guide/messaging/discord.md @@ -212,6 +212,11 @@ Hermes Agent supports Discord voice messages: - **Incoming voice messages** are automatically transcribed using Whisper (requires `GROQ_API_KEY` or `VOICE_TOOLS_OPENAI_KEY` to be set in your environment). - **Text-to-speech**: Use `/voice tts` to have the bot send spoken audio responses alongside text replies. +- **Discord voice channels**: Hermes can also join a voice channel, listen to users speaking, and talk back in the channel. 
+ +For the full setup and operational guide, see: +- [Voice Mode](/docs/user-guide/features/voice-mode) +- [Use Voice Mode with Hermes](/docs/guides/use-voice-mode-with-hermes) ## Troubleshooting diff --git a/website/docs/user-guide/messaging/index.md b/website/docs/user-guide/messaging/index.md index debc841b8..2530248ee 100644 --- a/website/docs/user-guide/messaging/index.md +++ b/website/docs/user-guide/messaging/index.md @@ -8,6 +8,8 @@ description: "Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, Email, Home Assistant, or your browser. The gateway is a single background process that connects to all your configured platforms, handles sessions, runs cron jobs, and delivers voice messages. +For the full voice feature set — including CLI microphone mode, spoken replies in messaging, and Discord voice-channel conversations — see [Voice Mode](/docs/user-guide/features/voice-mode) and [Use Voice Mode with Hermes](/docs/guides/use-voice-mode-with-hermes). 
+ ## Architecture ```text @@ -77,6 +79,7 @@ hermes gateway status # Check service status | `/usage` | Show token usage for this session | | `/insights [days]` | Show usage insights and analytics | | `/reasoning [level\|show\|hide]` | Change reasoning effort or toggle reasoning display | +| `/voice [on\|off\|tts\|join\|leave\|status]` | Control messaging voice replies and Discord voice-channel behavior | | `/rollback [number]` | List or restore filesystem checkpoints | | `/background ` | Run a prompt in a separate background session | | `/reload-mcp` | Reload MCP servers from config | diff --git a/website/sidebars.ts b/website/sidebars.ts index ff91c4de5..828b4472f 100644 --- a/website/sidebars.ts +++ b/website/sidebars.ts @@ -24,6 +24,7 @@ const sidebars: SidebarsConfig = { 'guides/python-library', 'guides/use-mcp-with-hermes', 'guides/use-soul-with-hermes', + 'guides/use-voice-mode-with-hermes', ], }, { From fbd752b92b0b2f90c412f7a68f56ffff2a2e5ee1 Mon Sep 17 00:00:00 2001 From: Nyk <0xNyk@users.noreply.github.com> Date: Sat, 14 Mar 2026 10:33:32 -0700 Subject: [PATCH 04/34] test(cron): add cross-timezone naive timestamp regression Cherry-picked from PR #1308 by 0xNyk. Adds an end-to-end regression test covering a Hermes timezone far behind system local time (Pacific/Midway, UTC-11) to ensure legacy naive cron timestamps are still recognized as due under large timezone mismatches. 
--- tests/test_timezone.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_timezone.py b/tests/test_timezone.py index 9902817d8..728d47dd9 100644 --- a/tests/test_timezone.py +++ b/tests/test_timezone.py @@ -328,6 +328,34 @@ class TestCronTimezone: "Overdue job was skipped — _ensure_aware likely shifted absolute time" ) + def test_get_due_jobs_naive_cross_timezone(self, tmp_path, monkeypatch): + """Naive past timestamps must be detected as due even when Hermes tz + is behind system local tz — the scenario that triggered #806.""" + import cron.jobs as jobs_module + monkeypatch.setattr(jobs_module, "CRON_DIR", tmp_path / "cron") + monkeypatch.setattr(jobs_module, "JOBS_FILE", tmp_path / "cron" / "jobs.json") + monkeypatch.setattr(jobs_module, "OUTPUT_DIR", tmp_path / "cron" / "output") + + # Use a Hermes timezone far behind UTC so that the numeric wall time + # of the naive timestamp exceeds _hermes_now's wall time — this would + # have caused a false "not due" with the old replace(tzinfo=...) approach. 
+ os.environ["HERMES_TIMEZONE"] = "Pacific/Midway" # UTC-11 + hermes_time.reset_cache() + + from cron.jobs import create_job, load_jobs, save_jobs, get_due_jobs + create_job(prompt="Cross-tz job", schedule="every 1h") + jobs = load_jobs() + + # Force a naive past timestamp (system-local wall time, 10 min ago) + naive_past = (datetime.now() - timedelta(minutes=10)).isoformat() + jobs[0]["next_run_at"] = naive_past + save_jobs(jobs) + + due = get_due_jobs() + assert len(due) == 1, ( + "Naive past timestamp should be due regardless of Hermes timezone" + ) + def test_create_job_stores_tz_aware_timestamps(self, tmp_path, monkeypatch): """New jobs store timezone-aware created_at and next_run_at.""" import cron.jobs as jobs_module From a50550fdb442b2dced799332a2f9b63a23a80888 Mon Sep 17 00:00:00 2001 From: teyrebaz33 Date: Wed, 11 Mar 2026 21:11:04 +0300 Subject: [PATCH 05/34] fix: add prefix matching to slash command dispatcher Slash commands previously required exact full names. Typing /con returned 'Unknown command' even though /config was the only match. Add unambiguous prefix matching in process_command(): - Unique prefix (e.g. /con -> /config): dispatch immediately - Ambiguous prefix (e.g. /re -> /reset, /retry, /reasoning...): show 'Did you mean' suggestions - No match: existing 'Unknown command' error Prefix matching uses the COMMANDS dict from hermes_cli/commands.py (same source as SlashCommandCompleter) so it stays in sync with any new commands added there. 
Closes #928 --- cli.py | 17 +++++++-- tests/test_cli_prefix_matching.py | 60 +++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 tests/test_cli_prefix_matching.py diff --git a/cli.py b/cli.py index 094be22e9..84cf22767 100755 --- a/cli.py +++ b/cli.py @@ -3094,8 +3094,21 @@ class HermesCLI: else: self.console.print(f"[bold red]Failed to load skill for {base_cmd}[/]") else: - self.console.print(f"[bold red]Unknown command: {cmd_lower}[/]") - self.console.print("[dim #B8860B]Type /help for available commands[/]") + # Prefix matching: if input uniquely identifies one command, execute it + from hermes_cli.commands import COMMANDS + typed_base = cmd_lower.split()[0] + matches = [c for c in COMMANDS if c.startswith(typed_base)] + if len(matches) == 1: + # Re-dispatch with the full command name, preserving any arguments + remainder = cmd_original.strip()[len(typed_base):] + full_cmd = matches[0] + remainder + return self.process_command(full_cmd) + elif len(matches) > 1: + self.console.print(f"[bold yellow]Ambiguous command: {cmd_lower}[/]") + self.console.print(f"[dim]Did you mean: {', '.join(sorted(matches))}?[/]") + else: + self.console.print(f"[bold red]Unknown command: {cmd_lower}[/]") + self.console.print("[dim #B8860B]Type /help for available commands[/]") return True diff --git a/tests/test_cli_prefix_matching.py b/tests/test_cli_prefix_matching.py new file mode 100644 index 000000000..b7419a8aa --- /dev/null +++ b/tests/test_cli_prefix_matching.py @@ -0,0 +1,60 @@ +"""Tests for slash command prefix matching in HermesCLI.process_command.""" +from unittest.mock import MagicMock, patch +from cli import HermesCLI + + +def _make_cli(): + cli_obj = HermesCLI.__new__(HermesCLI) + cli_obj.config = {} + cli_obj.console = MagicMock() + cli_obj.agent = None + cli_obj.conversation_history = [] + return cli_obj + + +class TestSlashCommandPrefixMatching: + def test_unique_prefix_dispatches_command(self): + """/con should 
dispatch to /config when it uniquely matches.""" + cli_obj = _make_cli() + with patch.object(cli_obj, 'show_config') as mock_config: + cli_obj.process_command("/con") + mock_config.assert_called_once() + + def test_unique_prefix_with_args_dispatches_command(self): + """/mo with argument should dispatch to /model.""" + cli_obj = _make_cli() + with patch.object(cli_obj, 'process_command', wraps=cli_obj.process_command): + with patch("hermes_cli.models.fetch_api_models", return_value=None), \ + patch("cli.save_config_value"): + cli_obj.model = "current-model" + cli_obj.provider = "openrouter" + cli_obj.base_url = "https://openrouter.ai/api/v1" + cli_obj.api_key = "test" + cli_obj._explicit_api_key = None + cli_obj._explicit_base_url = None + cli_obj.requested_provider = "openrouter" + # /mod uniquely matches /model + result = cli_obj.process_command("/mod") + assert result is True + + def test_ambiguous_prefix_shows_suggestions(self): + """/re matches /reset, /retry, /reload-mcp, /reasoning, /rollback — should show suggestions.""" + cli_obj = _make_cli() + cli_obj.process_command("/re") + # Should print ambiguous message, not unknown command + printed = " ".join(str(c) for c in cli_obj.console.print.call_args_list) + assert "Ambiguous" in printed or "Did you mean" in printed + + def test_unknown_command_shows_error(self): + """/xyz should show unknown command error.""" + cli_obj = _make_cli() + cli_obj.process_command("/xyz") + printed = " ".join(str(c) for c in cli_obj.console.print.call_args_list) + assert "Unknown command" in printed + + def test_exact_command_still_works(self): + """/help should still work as exact match.""" + cli_obj = _make_cli() + with patch.object(cli_obj, 'show_help') as mock_help: + cli_obj.process_command("/help") + mock_help.assert_called_once() From fbdce27b9a1c6378366e22c2161e7eda558da788 Mon Sep 17 00:00:00 2001 From: teyrebaz33 Date: Sat, 14 Mar 2026 14:11:34 +0300 Subject: [PATCH 06/34] fix: address prefix matching recursion and skill 
command coverage Per teknium1 review on PR #968: 1. Guard against infinite recursion: if expanded name equals the typed token (already exact), fall through to Unknown command instead of redispatching the same string forever. 2. Include skill slash commands in prefix resolution so execution-time matching agrees with tab-completion (set(COMMANDS) | set(_skill_commands)). 3. Add missing test cases: - unambiguous prefix with extra args does not recurse - exact command with args does not loop - skill command prefix matches correctly - exact builtin takes priority over skill prefix ambiguity 8 tests passing. --- cli.py | 24 ++++++--- tests/test_cli_prefix_matching.py | 89 +++++++++++++++++++++++++------ 2 files changed, 90 insertions(+), 23 deletions(-) diff --git a/cli.py b/cli.py index 84cf22767..6df693229 100755 --- a/cli.py +++ b/cli.py @@ -3094,15 +3094,27 @@ class HermesCLI: else: self.console.print(f"[bold red]Failed to load skill for {base_cmd}[/]") else: - # Prefix matching: if input uniquely identifies one command, execute it + # Prefix matching: if input uniquely identifies one command, execute it. + # Matches against both built-in COMMANDS and installed skill commands so + # that execution-time resolution agrees with tab-completion. from hermes_cli.commands import COMMANDS typed_base = cmd_lower.split()[0] - matches = [c for c in COMMANDS if c.startswith(typed_base)] + all_known = set(COMMANDS) | set(_skill_commands) + matches = [c for c in all_known if c.startswith(typed_base)] if len(matches) == 1: - # Re-dispatch with the full command name, preserving any arguments - remainder = cmd_original.strip()[len(typed_base):] - full_cmd = matches[0] + remainder - return self.process_command(full_cmd) + # Expand the prefix to the full command name, preserving arguments. + # Guard against redispatching the same token to avoid infinite + # recursion when the expanded name still doesn't hit an exact branch + # (e.g. 
/config with extra args that are not yet handled above). + full_name = matches[0] + if full_name == typed_base: + # Already an exact token — no expansion possible; fall through + self.console.print(f"[bold red]Unknown command: {cmd_lower}[/]") + self.console.print("[dim #B8860B]Type /help for available commands[/]") + else: + remainder = cmd_original.strip()[len(typed_base):] + full_cmd = full_name + remainder + return self.process_command(full_cmd) elif len(matches) > 1: self.console.print(f"[bold yellow]Ambiguous command: {cmd_lower}[/]") self.console.print(f"[dim]Did you mean: {', '.join(sorted(matches))}?[/]") diff --git a/tests/test_cli_prefix_matching.py b/tests/test_cli_prefix_matching.py index b7419a8aa..617cee9ae 100644 --- a/tests/test_cli_prefix_matching.py +++ b/tests/test_cli_prefix_matching.py @@ -20,28 +20,53 @@ class TestSlashCommandPrefixMatching: cli_obj.process_command("/con") mock_config.assert_called_once() - def test_unique_prefix_with_args_dispatches_command(self): - """/mo with argument should dispatch to /model.""" + def test_unique_prefix_with_args_does_not_recurse(self): + """/con set key value should expand to /config set key value without infinite recursion.""" cli_obj = _make_cli() - with patch.object(cli_obj, 'process_command', wraps=cli_obj.process_command): - with patch("hermes_cli.models.fetch_api_models", return_value=None), \ - patch("cli.save_config_value"): - cli_obj.model = "current-model" - cli_obj.provider = "openrouter" - cli_obj.base_url = "https://openrouter.ai/api/v1" - cli_obj.api_key = "test" - cli_obj._explicit_api_key = None - cli_obj._explicit_base_url = None - cli_obj.requested_provider = "openrouter" - # /mod uniquely matches /model - result = cli_obj.process_command("/mod") - assert result is True + dispatched = [] + + original = cli_obj.process_command.__func__ + + def counting_process_command(self_inner, cmd): + dispatched.append(cmd) + if len(dispatched) > 5: + raise RecursionError("process_command called too 
many times") + return original(self_inner, cmd) + + with patch.object(type(cli_obj), 'process_command', counting_process_command): + try: + cli_obj.process_command("/con set key value") + except RecursionError: + assert False, "process_command recursed infinitely" + + # Should have been called at most twice: once for /con set..., once for /config set... + assert len(dispatched) <= 2 + + def test_exact_command_with_args_does_not_recurse(self): + """/config set key value hits exact branch and does not loop back to prefix.""" + cli_obj = _make_cli() + call_count = [0] + + original_pc = HermesCLI.process_command + + def guarded(self_inner, cmd): + call_count[0] += 1 + if call_count[0] > 10: + raise RecursionError("Infinite recursion detected") + return original_pc(self_inner, cmd) + + with patch.object(HermesCLI, 'process_command', guarded): + try: + cli_obj.process_command("/config set key value") + except RecursionError: + assert False, "Recursed infinitely on /config set key value" + + assert call_count[0] <= 3 def test_ambiguous_prefix_shows_suggestions(self): - """/re matches /reset, /retry, /reload-mcp, /reasoning, /rollback — should show suggestions.""" + """/re matches multiple commands — should show ambiguous message.""" cli_obj = _make_cli() cli_obj.process_command("/re") - # Should print ambiguous message, not unknown command printed = " ".join(str(c) for c in cli_obj.console.print.call_args_list) assert "Ambiguous" in printed or "Did you mean" in printed @@ -58,3 +83,33 @@ class TestSlashCommandPrefixMatching: with patch.object(cli_obj, 'show_help') as mock_help: cli_obj.process_command("/help") mock_help.assert_called_once() + + def test_skill_command_prefix_matches(self): + """A prefix that uniquely matches a skill command should dispatch it.""" + cli_obj = _make_cli() + fake_skill = {"/test-skill-xyz": {"name": "Test Skill", "description": "test"}} + printed = [] + cli_obj.console.print = lambda *a, **kw: printed.append(str(a)) + + import cli as cli_mod 
+ with patch.object(cli_mod, '_skill_commands', fake_skill): + cli_obj.process_command("/test-skill-xy") + + # Should NOT show "Unknown command" — should have dispatched or attempted skill + unknown = any("Unknown command" in p for p in printed) + assert not unknown, f"Expected skill prefix to match, got: {printed}" + + def test_ambiguous_between_builtin_and_skill(self): + """Ambiguous prefix spanning builtin + skill commands shows suggestions.""" + cli_obj = _make_cli() + # /help-extra is a fake skill that shares /hel prefix with /help + fake_skill = {"/help-extra": {"name": "Help Extra", "description": "test"}} + + import cli as cli_mod + with patch.object(cli_mod, '_skill_commands', fake_skill), patch.object(cli_obj, 'show_help') as mock_help: + cli_obj.process_command("/help") + + # /help is an exact match so should work normally, not show ambiguous + mock_help.assert_called_once() + printed = " ".join(str(c) for c in cli_obj.console.print.call_args_list) + assert "Ambiguous" not in printed From 577b477a784be109c7b08a57acda5ab0ced8f232 Mon Sep 17 00:00:00 2001 From: teyrebaz33 Date: Sat, 14 Mar 2026 14:19:23 +0300 Subject: [PATCH 07/34] fix(test): add missing session_id and _pending_input to _make_cli fixture CI failure: test_skill_command_prefix_matches raised AttributeError because HermesCLI.__new__ skips __init__, leaving session_id and _pending_input unset. These are accessed when skill command dispatch runs in the CI environment. 
--- tests/test_cli_prefix_matching.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_cli_prefix_matching.py b/tests/test_cli_prefix_matching.py index 617cee9ae..ffec91957 100644 --- a/tests/test_cli_prefix_matching.py +++ b/tests/test_cli_prefix_matching.py @@ -9,6 +9,8 @@ def _make_cli(): cli_obj.console = MagicMock() cli_obj.agent = None cli_obj.conversation_history = [] + cli_obj.session_id = None + cli_obj._pending_input = MagicMock() return cli_obj From 9ec3a7a21bcfa973d35cdb715c2740db3daf8b36 Mon Sep 17 00:00:00 2001 From: stablegenius49 <16443023+stablegenius49@users.noreply.github.com> Date: Wed, 11 Mar 2026 09:07:30 -0700 Subject: [PATCH 08/34] fix: mark config set arguments as placeholders --- hermes_cli/config.py | 6 ++--- hermes_cli/setup.py | 2 +- tests/hermes_cli/test_placeholder_usage.py | 29 ++++++++++++++++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) create mode 100644 tests/hermes_cli/test_placeholder_usage.py diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 492d00aec..249ae52b4 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -821,7 +821,7 @@ def migrate_config(interactive: bool = True, quiet: bool = False) -> Dict[str, A print(f" ✓ Saved {name}") print() else: - print(" Set later with: hermes config set KEY VALUE") + print(" Set later with: hermes config set <KEY> <VALUE>") # Check for missing config fields missing_config = get_missing_config_fields() @@ -1265,7 +1265,7 @@ def show_config(): print() print(color("─" * 60, Colors.DIM)) print(color(" hermes config edit # Edit config file", Colors.DIM)) - print(color(" hermes config set KEY VALUE", Colors.DIM)) + print(color(" hermes config set <KEY> <VALUE>", Colors.DIM)) print(color(" hermes setup # Run setup wizard", Colors.DIM)) print() @@ -1391,7 +1391,7 @@ def config_command(args): key = getattr(args, 'key', None) value = getattr(args, 'value', None) if not key or not value: - print("Usage: hermes config set KEY VALUE") + print("Usage: hermes config set <KEY> <VALUE>")
print() print("Examples:") print(" hermes config set model anthropic/claude-sonnet-4") diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 4a27339ce..7e0a99983 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -602,7 +602,7 @@ def _print_setup_summary(config: dict, hermes_home): print( f" {color('hermes config edit', Colors.GREEN)} Open config in your editor" ) - print(f" {color('hermes config set KEY VALUE', Colors.GREEN)}") + print(f" {color('hermes config set <KEY> <VALUE>', Colors.GREEN)}") print(f" Set a specific value") print() print(f" Or edit the files directly:") diff --git a/tests/hermes_cli/test_placeholder_usage.py b/tests/hermes_cli/test_placeholder_usage.py new file mode 100644 index 000000000..ab5234800 --- /dev/null +++ b/tests/hermes_cli/test_placeholder_usage.py @@ -0,0 +1,29 @@ +"""Tests for CLI placeholder text in config/setup output.""" + +import os +from argparse import Namespace +from unittest.mock import patch + +import pytest + +from hermes_cli.config import config_command +from hermes_cli.setup import _print_setup_summary + + +def test_config_set_usage_marks_placeholders(capsys): + args = Namespace(config_command="set", key=None, value=None) + + with pytest.raises(SystemExit) as exc: + config_command(args) + + assert exc.value.code == 1 + out = capsys.readouterr().out + assert "Usage: hermes config set <KEY> <VALUE>" in out + + +def test_setup_summary_marks_placeholders(tmp_path, capsys): + with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path)}): + _print_setup_summary({"tts": {"provider": "edge"}}, tmp_path) + + out = capsys.readouterr().out + assert "hermes config set <KEY> <VALUE>" in out From 0d23ad7a152751a1176289f7d4ed3a5f94ae49e3 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 10:35:14 -0700 Subject: [PATCH 09/34] fix: cover remaining config placeholder help text Update the unknown-subcommand config help output to use placeholder syntax too, and extend the placeholder regression tests to cover show_config() and that fallback help
path. --- hermes_cli/config.py | 2 +- tests/hermes_cli/test_placeholder_usage.py | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 249ae52b4..7a932d9e4 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1506,7 +1506,7 @@ def config_command(args): print("Available commands:") print(" hermes config Show current configuration") print(" hermes config edit Open config in editor") - print(" hermes config set K V Set a config value") + print(" hermes config set <KEY> <VALUE> Set a config value") print(" hermes config check Check for missing/outdated config") print(" hermes config migrate Update config with new options") print(" hermes config path Show config file path") diff --git a/tests/hermes_cli/test_placeholder_usage.py b/tests/hermes_cli/test_placeholder_usage.py index ab5234800..3479d8f57 100644 --- a/tests/hermes_cli/test_placeholder_usage.py +++ b/tests/hermes_cli/test_placeholder_usage.py @@ -6,7 +6,7 @@ from unittest.mock import patch import pytest -from hermes_cli.config import config_command +from hermes_cli.config import config_command, show_config from hermes_cli.setup import _print_setup_summary @@ -21,6 +21,25 @@ def test_config_set_usage_marks_placeholders(capsys): assert "Usage: hermes config set <KEY> <VALUE>" in out +def test_config_unknown_command_help_marks_placeholders(capsys): + args = Namespace(config_command="wat") + + with pytest.raises(SystemExit) as exc: + config_command(args) + + assert exc.value.code == 1 + out = capsys.readouterr().out + assert "hermes config set <KEY> <VALUE> Set a config value" in out + + +def test_show_config_marks_placeholders(tmp_path, capsys): + with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path)}): + show_config() + + out = capsys.readouterr().out + assert "hermes config set <KEY> <VALUE>" in out + + def test_setup_summary_marks_placeholders(tmp_path, capsys): with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path)}): _print_setup_summary({"tts": {"provider":
"edge"}}, tmp_path) From 2054ffdaebdf63804ff9bf82cd5b9a897799eb49 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 07:48:44 -0700 Subject: [PATCH 10/34] fix: smart vision setup that respects the user's chosen provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old flow blindly asked for an OpenRouter API key after ANY non-OR provider selection, even for Nous Portal and Codex which already support vision natively. This was confusing and annoying. New behavior: - OpenRouter: skip — vision uses Gemini via their OR key - Nous Portal OAuth: skip — vision uses Gemini via Nous - OpenAI Codex: skip — gpt-5.3-codex supports vision - Custom endpoint (api.openai.com): show OpenAI vision model picker (gpt-4o, gpt-4o-mini, gpt-4.1, etc.), saves AUXILIARY_VISION_MODEL - Custom (other) / z.ai / kimi / minimax / nous-api: - First checks if existing OR/Nous creds already cover vision - If not, offers friendly choice: OpenRouter / OpenAI / Skip - No more 'enter OpenRouter key' thrown in your face Also fixes the setup summary to check actual vision availability across all providers instead of hardcoding 'requires OPENROUTER_API_KEY'. MoA still correctly requires OpenRouter (calls multiple frontier models). 
--- hermes_cli/setup.py | 158 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 129 insertions(+), 29 deletions(-) diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 4a27339ce..474b45780 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -460,12 +460,41 @@ def _print_setup_summary(config: dict, hermes_home): tool_status = [] - # OpenRouter (required for vision, moa) + # Vision — works with OpenRouter, Nous OAuth, Codex OAuth, or OpenAI endpoint + _has_vision = False if get_env_value("OPENROUTER_API_KEY"): + _has_vision = True + else: + try: + _vauth_path = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) / "auth.json" + if _vauth_path.is_file(): + import json as _vjson + + _vauth = _vjson.loads(_vauth_path.read_text()) + if _vauth.get("active_provider") == "nous": + _np = _vauth.get("providers", {}).get("nous", {}) + if _np.get("agent_key") or _np.get("access_token"): + _has_vision = True + elif _vauth.get("active_provider") == "openai-codex": + _cp = _vauth.get("providers", {}).get("openai-codex", {}) + if _cp.get("tokens", {}).get("access_token"): + _has_vision = True + except Exception: + pass + if not _has_vision: + _oai_base = get_env_value("OPENAI_BASE_URL") or "" + if get_env_value("OPENAI_API_KEY") and "api.openai.com" in _oai_base.lower(): + _has_vision = True + + if _has_vision: tool_status.append(("Vision (image analysis)", True, None)) + else: + tool_status.append(("Vision (image analysis)", False, "run 'hermes setup' to configure")) + + # Mixture of Agents — requires OpenRouter specifically (calls multiple models) + if get_env_value("OPENROUTER_API_KEY"): tool_status.append(("Mixture of Agents", True, None)) else: - tool_status.append(("Vision (image analysis)", False, "OPENROUTER_API_KEY")) tool_status.append(("Mixture of Agents", False, "OPENROUTER_API_KEY")) # Firecrawl (web tools) @@ -1246,35 +1275,106 @@ def setup_model_provider(config: dict): elif existing_or: selected_provider = "openrouter" - # ── 
OpenRouter API Key for tools (if not already set) ── - # Tools (vision, web, MoA) use OpenRouter independently of the main provider. - # Prompt for OpenRouter key if not set and a non-OpenRouter provider was chosen. - if selected_provider in ( - "nous", - "openai-codex", - "custom", - "zai", - "kimi-coding", - "minimax", - "minimax-cn", - "anthropic", - ) and not get_env_value("OPENROUTER_API_KEY"): - print() - print_header("OpenRouter API Key (for tools)") - print_info("Tools like vision analysis, web search, and MoA use OpenRouter") - print_info("independently of your main inference provider.") - print_info("Get your API key at: https://openrouter.ai/keys") + # ── Vision & Image Analysis Setup ── + # Vision requires a multimodal-capable provider. Check whether the user's + # chosen provider already covers it — if so, skip the prompt entirely. + _vision_needs_setup = True - api_key = prompt( - " OpenRouter API key (optional, press Enter to skip)", password=True - ) - if api_key: - save_env_value("OPENROUTER_API_KEY", api_key) - print_success("OpenRouter API key saved (for tools)") + if selected_provider == "openrouter": + # OpenRouter → Gemini for vision, already configured + _vision_needs_setup = False + elif selected_provider == "nous": + # Nous Portal OAuth → Gemini via Nous, already configured + _vision_needs_setup = False + elif selected_provider == "openai-codex": + # Codex OAuth → gpt-5.3-codex supports vision + _vision_needs_setup = False + elif selected_provider == "custom": + _custom_base = (get_env_value("OPENAI_BASE_URL") or "").lower() + if "api.openai.com" in _custom_base: + # Direct OpenAI endpoint — show vision model picker + print() + print_header("Vision Model") + print_info("Your OpenAI endpoint supports vision. 
Pick a model for image analysis:") + _oai_vision_models = ["gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano"] + _vm_choices = _oai_vision_models + ["Keep default (gpt-4o-mini)"] + _vm_idx = prompt_choice("Select vision model:", _vm_choices, len(_vm_choices) - 1) + if _vm_idx < len(_oai_vision_models): + save_env_value("AUXILIARY_VISION_MODEL", _oai_vision_models[_vm_idx]) + print_success(f"Vision model set to {_oai_vision_models[_vm_idx]}") + _vision_needs_setup = False + + # Even for providers without native vision, check if existing credentials + # from a previous setup already cover it (e.g. user had OpenRouter before + # switching to z.ai) + if _vision_needs_setup: + if get_env_value("OPENROUTER_API_KEY"): + _vision_needs_setup = False else: - print_info( - "Skipped - some tools (vision, web scraping) won't work without this" - ) + # Check for Nous Portal OAuth in auth.json + try: + _auth_path = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) / "auth.json" + if _auth_path.is_file(): + import json as _json + + _auth_data = _json.loads(_auth_path.read_text()) + if _auth_data.get("active_provider") == "nous": + _nous_p = _auth_data.get("providers", {}).get("nous", {}) + if _nous_p.get("agent_key") or _nous_p.get("access_token"): + _vision_needs_setup = False + except Exception: + pass + + if _vision_needs_setup: + _prov_names = { + "nous-api": "Nous Portal API key", + "zai": "Z.AI / GLM", + "kimi-coding": "Kimi / Moonshot", + "minimax": "MiniMax", + "minimax-cn": "MiniMax CN", + "anthropic": "Anthropic", + "custom": "your custom endpoint", + } + _prov_display = _prov_names.get(selected_provider, selected_provider or "your provider") + + print() + print_header("Vision & Image Analysis (optional)") + print_info(f"Vision requires a multimodal-capable provider. {_prov_display}") + print_info("doesn't natively support it. 
Choose how to enable vision,") + print_info("or skip to configure later.") + print() + + _vision_choices = [ + "OpenRouter — uses Gemini (free tier at openrouter.ai/keys)", + "OpenAI — enter API key & choose a vision model", + "Skip for now", + ] + _vision_idx = prompt_choice("Configure vision:", _vision_choices, 2) + + if _vision_idx == 0: # OpenRouter + _or_key = prompt(" OpenRouter API key", password=True) + if _or_key: + save_env_value("OPENROUTER_API_KEY", _or_key) + print_success("OpenRouter key saved — vision will use Gemini") + else: + print_info("Skipped — vision won't be available") + elif _vision_idx == 1: # OpenAI + _oai_key = prompt(" OpenAI API key", password=True) + if _oai_key: + save_env_value("OPENAI_API_KEY", _oai_key) + save_env_value("OPENAI_BASE_URL", "https://api.openai.com/v1") + _oai_vision_models = ["gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano"] + _vm_choices = _oai_vision_models + ["Use default (gpt-4o-mini)"] + _vm_idx = prompt_choice("Select vision model:", _vm_choices, 0) + if _vm_idx < len(_oai_vision_models): + save_env_value("AUXILIARY_VISION_MODEL", _oai_vision_models[_vm_idx]) + print_success(f"Vision configured with OpenAI ({_oai_vision_models[_vm_idx]})") + else: + print_success("Vision configured with OpenAI (gpt-4o-mini)") + else: + print_info("Skipped — vision won't be available") + else: + print_info("Skipped — add later with 'hermes config set OPENROUTER_API_KEY ...'") # ── Model Selection (adapts based on provider) ── if selected_provider != "custom": # Custom already prompted for model name From ee73b6bf27eb56daac6601e70d614f8a372dccdc Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 10:37:45 -0700 Subject: [PATCH 11/34] fix: persist default openai vision model in setup wizard Add regression coverage for the new provider-aware vision setup flow and make the default OpenAI choice write AUXILIARY_VISION_MODEL so auxiliary vision requests don't fall back to the main model slug. 
--- hermes_cli/setup.py | 22 ++++-- tests/hermes_cli/test_setup_model_provider.py | 75 +++++++++++++++++-- 2 files changed, 83 insertions(+), 14 deletions(-) diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 474b45780..6924c64ec 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -1299,9 +1299,13 @@ def setup_model_provider(config: dict): _oai_vision_models = ["gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano"] _vm_choices = _oai_vision_models + ["Keep default (gpt-4o-mini)"] _vm_idx = prompt_choice("Select vision model:", _vm_choices, len(_vm_choices) - 1) - if _vm_idx < len(_oai_vision_models): - save_env_value("AUXILIARY_VISION_MODEL", _oai_vision_models[_vm_idx]) - print_success(f"Vision model set to {_oai_vision_models[_vm_idx]}") + _selected_vision_model = ( + _oai_vision_models[_vm_idx] + if _vm_idx < len(_oai_vision_models) + else "gpt-4o-mini" + ) + save_env_value("AUXILIARY_VISION_MODEL", _selected_vision_model) + print_success(f"Vision model set to {_selected_vision_model}") _vision_needs_setup = False # Even for providers without native vision, check if existing credentials @@ -1366,11 +1370,13 @@ def setup_model_provider(config: dict): _oai_vision_models = ["gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano"] _vm_choices = _oai_vision_models + ["Use default (gpt-4o-mini)"] _vm_idx = prompt_choice("Select vision model:", _vm_choices, 0) - if _vm_idx < len(_oai_vision_models): - save_env_value("AUXILIARY_VISION_MODEL", _oai_vision_models[_vm_idx]) - print_success(f"Vision configured with OpenAI ({_oai_vision_models[_vm_idx]})") - else: - print_success("Vision configured with OpenAI (gpt-4o-mini)") + _selected_vision_model = ( + _oai_vision_models[_vm_idx] + if _vm_idx < len(_oai_vision_models) + else "gpt-4o-mini" + ) + save_env_value("AUXILIARY_VISION_MODEL", _selected_vision_model) + print_success(f"Vision configured with OpenAI ({_selected_vision_model})") else: print_info("Skipped — vision won't be 
available") else: diff --git a/tests/hermes_cli/test_setup_model_provider.py b/tests/hermes_cli/test_setup_model_provider.py index f7c3ce385..ee5d7d263 100644 --- a/tests/hermes_cli/test_setup_model_provider.py +++ b/tests/hermes_cli/test_setup_model_provider.py @@ -3,7 +3,7 @@ from __future__ import annotations from hermes_cli.config import load_config, save_config, save_env_value -from hermes_cli.setup import setup_model_provider +from hermes_cli.setup import _print_setup_summary, setup_model_provider def _read_env(home): @@ -50,11 +50,15 @@ def test_setup_keep_current_custom_from_config_does_not_fall_through(tmp_path, m calls = {"count": 0} - def fake_prompt_choice(_question, choices, default=0): + def fake_prompt_choice(question, choices, default=0): calls["count"] += 1 if calls["count"] == 1: assert choices[-1] == "Keep current (Custom: https://example.invalid/v1)" return len(choices) - 1 + if calls["count"] == 2: + assert question == "Configure vision:" + assert choices[-1] == "Skip for now" + return len(choices) - 1 raise AssertionError("Model menu should not appear for keep-current custom") monkeypatch.setattr("hermes_cli.setup.prompt_choice", fake_prompt_choice) @@ -70,7 +74,7 @@ def test_setup_keep_current_custom_from_config_does_not_fall_through(tmp_path, m assert reloaded["model"]["provider"] == "custom" assert reloaded["model"]["default"] == "custom/model" assert reloaded["model"]["base_url"] == "https://example.invalid/v1" - assert calls["count"] == 1 + assert calls["count"] == 2 def test_setup_keep_current_config_provider_uses_provider_specific_model_menu(tmp_path, monkeypatch): @@ -88,13 +92,17 @@ def test_setup_keep_current_config_provider_uses_provider_specific_model_menu(tm captured = {"provider_choices": None, "model_choices": None} calls = {"count": 0} - def fake_prompt_choice(_question, choices, default=0): + def fake_prompt_choice(question, choices, default=0): calls["count"] += 1 if calls["count"] == 1: captured["provider_choices"] = 
list(choices) assert choices[-1] == "Keep current (Anthropic)" return len(choices) - 1 if calls["count"] == 2: + assert question == "Configure vision:" + assert choices[-1] == "Skip for now" + return len(choices) - 1 + if calls["count"] == 3: captured["model_choices"] = list(choices) return len(choices) - 1 # keep current model raise AssertionError("Unexpected extra prompt_choice call") @@ -113,7 +121,43 @@ def test_setup_keep_current_config_provider_uses_provider_specific_model_menu(tm assert captured["model_choices"] is not None assert captured["model_choices"][0] == "claude-opus-4-6" assert "anthropic/claude-opus-4.6 (recommended)" not in captured["model_choices"] - assert calls["count"] == 2 + assert calls["count"] == 3 + + +def test_setup_keep_current_anthropic_can_configure_openai_vision_default(tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + _clear_provider_env(monkeypatch) + + config = load_config() + config["model"] = { + "default": "claude-opus-4-6", + "provider": "anthropic", + } + save_config(config) + + picks = iter([ + 9, # keep current provider + 1, # configure vision with OpenAI + 5, # use default gpt-4o-mini vision model + 4, # keep current Anthropic model + ]) + + monkeypatch.setattr("hermes_cli.setup.prompt_choice", lambda *args, **kwargs: next(picks)) + monkeypatch.setattr( + "hermes_cli.setup.prompt", + lambda message, *args, **kwargs: "sk-openai" if "OpenAI API key" in message else "", + ) + monkeypatch.setattr("hermes_cli.setup.prompt_yes_no", lambda *args, **kwargs: False) + monkeypatch.setattr("hermes_cli.auth.get_active_provider", lambda: None) + monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: []) + monkeypatch.setattr("hermes_cli.models.provider_model_ids", lambda provider: []) + + setup_model_provider(config) + env = _read_env(tmp_path) + + assert env.get("OPENAI_API_KEY") == "sk-openai" + assert env.get("OPENAI_BASE_URL") == "https://api.openai.com/v1" + assert 
env.get("AUXILIARY_VISION_MODEL") == "gpt-4o-mini" def test_setup_switch_custom_to_codex_clears_custom_endpoint_and_updates_config(tmp_path, monkeypatch): @@ -144,7 +188,7 @@ def test_setup_switch_custom_to_codex_clears_custom_endpoint_and_updates_config( "hermes_cli.auth.resolve_codex_runtime_credentials", lambda *args, **kwargs: { "base_url": "https://chatgpt.com/backend-api/codex", - "api_key": "codex-access-token", + "api_key": "codex-...oken", }, ) monkeypatch.setattr( @@ -163,3 +207,22 @@ def test_setup_switch_custom_to_codex_clears_custom_endpoint_and_updates_config( assert reloaded["model"]["provider"] == "openai-codex" assert reloaded["model"]["default"] == "openai/gpt-5.3-codex" assert reloaded["model"]["base_url"] == "https://chatgpt.com/backend-api/codex" + + +def test_setup_summary_marks_codex_auth_as_vision_available(tmp_path, monkeypatch, capsys): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + _clear_provider_env(monkeypatch) + + (tmp_path / "auth.json").write_text( + '{"active_provider":"openai-codex","providers":{"openai-codex":{"tokens":{"access_token":"tok"}}}}' + ) + + monkeypatch.setattr("shutil.which", lambda _name: None) + + _print_setup_summary(load_config(), tmp_path) + output = capsys.readouterr().out + + assert "Vision (image analysis)" in output + assert "missing run 'hermes setup' to configure" not in output + assert "Mixture of Agents" in output + assert "missing OPENROUTER_API_KEY" in output From e5dc569daac34ba0a5f82069ce78c6fb7a25917c Mon Sep 17 00:00:00 2001 From: Himess Date: Sat, 14 Mar 2026 11:03:20 -0700 Subject: [PATCH 12/34] fix: salvage gateway dedup and executor cleanup from PR #993 Salvages the two still-relevant fixes from PR #993 onto current main: - use a 3-tuple LOCAL delivery key so explicit/local-origin targets are not duplicated - shut down the previous agent-loop ThreadPoolExecutor when resizing the global pool Adds regression tests for both behaviors. 
--- environments/agent_loop.py | 2 ++ gateway/delivery.py | 2 +- tests/gateway/test_delivery.py | 11 ++++++++++- tests/test_agent_loop.py | 19 +++++++++++++++++++ 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/environments/agent_loop.py b/environments/agent_loop.py index ab8c0236e..dec3bc4ec 100644 --- a/environments/agent_loop.py +++ b/environments/agent_loop.py @@ -39,7 +39,9 @@ def resize_tool_pool(max_workers: int): Safe to call before any tasks are submitted. """ global _tool_executor + old_executor = _tool_executor _tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) + old_executor.shutdown(wait=False) logger.info("Tool thread pool resized to %d workers", max_workers) logger = logging.getLogger(__name__) diff --git a/gateway/delivery.py b/gateway/delivery.py index 630ab638e..69ec6376c 100644 --- a/gateway/delivery.py +++ b/gateway/delivery.py @@ -161,7 +161,7 @@ class DeliveryRouter: # Always include local if configured if self.config.always_log_local: - local_key = (Platform.LOCAL, None) + local_key = (Platform.LOCAL, None, None) if local_key not in seen_platforms: targets.append(DeliveryTarget(platform=Platform.LOCAL)) diff --git a/tests/gateway/test_delivery.py b/tests/gateway/test_delivery.py index 42eba781e..3894897f4 100644 --- a/tests/gateway/test_delivery.py +++ b/tests/gateway/test_delivery.py @@ -1,7 +1,7 @@ """Tests for the delivery routing module.""" from gateway.config import Platform, GatewayConfig, PlatformConfig, HomeChannel -from gateway.delivery import DeliveryTarget, parse_deliver_spec +from gateway.delivery import DeliveryRouter, DeliveryTarget, parse_deliver_spec from gateway.session import SessionSource @@ -85,3 +85,12 @@ class TestTargetToStringRoundtrip: reparsed = DeliveryTarget.parse(s) assert reparsed.platform == Platform.TELEGRAM assert reparsed.chat_id == "999" + + +class TestDeliveryRouter: + def test_resolve_targets_does_not_duplicate_local_when_explicit(self): + router = 
DeliveryRouter(GatewayConfig(always_log_local=True)) + + targets = router.resolve_targets(["local"]) + + assert [target.platform for target in targets] == [Platform.LOCAL] diff --git a/tests/test_agent_loop.py b/tests/test_agent_loop.py index bb0ccd069..b95ff7808 100644 --- a/tests/test_agent_loop.py +++ b/tests/test_agent_loop.py @@ -484,3 +484,22 @@ class TestResizeToolPool: """resize_tool_pool should not raise.""" resize_tool_pool(16) # Small pool for testing resize_tool_pool(128) # Restore default + + def test_resize_shuts_down_previous_executor(self, monkeypatch): + """Replacing the global tool executor should shut down the old pool.""" + import environments.agent_loop as agent_loop_module + + old_executor = MagicMock() + new_executor = MagicMock() + + monkeypatch.setattr(agent_loop_module, "_tool_executor", old_executor) + monkeypatch.setattr( + agent_loop_module.concurrent.futures, + "ThreadPoolExecutor", + MagicMock(return_value=new_executor), + ) + + resize_tool_pool(16) + + old_executor.shutdown.assert_called_once_with(wait=False) + assert agent_loop_module._tool_executor is new_executor From 94af51f621de55c1f8ebbe0dbc6c2a54ad4fd0ed Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 11:03:25 -0700 Subject: [PATCH 13/34] fix: harden trajectory compressor summary content handling Normalize summary-model content before stripping so empty or non-string responses do not trigger retry/fallback paths. Adds sync and async regression tests for None content. 
--- tests/test_trajectory_compressor.py | 34 ++++++++++++++++++++++++++++- trajectory_compressor.py | 33 ++++++++++++++++------------ 2 files changed, 52 insertions(+), 15 deletions(-) diff --git a/tests/test_trajectory_compressor.py b/tests/test_trajectory_compressor.py index 75fbd5a29..c95a3af94 100644 --- a/tests/test_trajectory_compressor.py +++ b/tests/test_trajectory_compressor.py @@ -1,7 +1,10 @@ """Tests for trajectory_compressor.py — config, metrics, and compression logic.""" import json -from unittest.mock import patch, MagicMock +from types import SimpleNamespace +from unittest.mock import AsyncMock, patch, MagicMock + +import pytest from trajectory_compressor import ( CompressionConfig, @@ -384,3 +387,32 @@ class TestTokenCounting: tc.tokenizer.encode = MagicMock(side_effect=Exception("fail")) # Should fallback to len(text) // 4 assert tc.count_tokens("12345678") == 2 + + +class TestGenerateSummary: + def test_generate_summary_handles_none_content(self): + tc = _make_compressor() + tc.client = MagicMock() + tc.client.chat.completions.create.return_value = SimpleNamespace( + choices=[SimpleNamespace(message=SimpleNamespace(content=None))] + ) + metrics = TrajectoryMetrics() + + summary = tc._generate_summary("Turn content", metrics) + + assert summary == "[CONTEXT SUMMARY]:" + + @pytest.mark.asyncio + async def test_generate_summary_async_handles_none_content(self): + tc = _make_compressor() + tc.async_client = MagicMock() + tc.async_client.chat.completions.create = AsyncMock( + return_value=SimpleNamespace( + choices=[SimpleNamespace(message=SimpleNamespace(content=None))] + ) + ) + metrics = TrajectoryMetrics() + + summary = await tc._generate_summary_async("Turn content", metrics) + + assert summary == "[CONTEXT SUMMARY]:" diff --git a/trajectory_compressor.py b/trajectory_compressor.py index ef81d6e27..1bfed6bfc 100644 --- a/trajectory_compressor.py +++ b/trajectory_compressor.py @@ -495,6 +495,21 @@ class TrajectoryCompressor: parts.append(f"[Turn 
{i} - {role.upper()}]:\n{value}") return "\n\n".join(parts) + + @staticmethod + def _coerce_summary_content(content: Any) -> str: + """Normalize summary-model output to a safe string.""" + if not isinstance(content, str): + content = str(content) if content else "" + return content.strip() + + @staticmethod + def _ensure_summary_prefix(summary: str) -> str: + """Normalize summary text to include the expected prefix exactly once.""" + text = (summary or "").strip() + if text.startswith("[CONTEXT SUMMARY]:"): + return text + return "[CONTEXT SUMMARY]:" if not text else f"[CONTEXT SUMMARY]: {text}" def _generate_summary(self, content: str, metrics: TrajectoryMetrics) -> str: """ @@ -545,13 +560,8 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" max_tokens=self.config.summary_target_tokens * 2, ) - summary = response.choices[0].message.content.strip() - - # Ensure it starts with the prefix - if not summary.startswith("[CONTEXT SUMMARY]:"): - summary = "[CONTEXT SUMMARY]: " + summary - - return summary + summary = self._coerce_summary_content(response.choices[0].message.content) + return self._ensure_summary_prefix(summary) except Exception as e: metrics.summarization_errors += 1 @@ -612,13 +622,8 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" max_tokens=self.config.summary_target_tokens * 2, ) - summary = response.choices[0].message.content.strip() - - # Ensure it starts with the prefix - if not summary.startswith("[CONTEXT SUMMARY]:"): - summary = "[CONTEXT SUMMARY]: " + summary - - return summary + summary = self._coerce_summary_content(response.choices[0].message.content) + return self._ensure_summary_prefix(summary) except Exception as e: metrics.summarization_errors += 1 From 4422637e7a3b0731cb161edc2459119918da84c3 Mon Sep 17 00:00:00 2001 From: stablegenius49 <16443023+stablegenius49@users.noreply.github.com> Date: Wed, 11 Mar 2026 22:52:16 -0700 Subject: [PATCH 14/34] fix: resolve named custom delegation 
providers --- hermes_cli/runtime_provider.py | 76 ++++++++++++++++++ tests/test_runtime_provider_resolution.py | 93 ++++++++++++++++++++++- 2 files changed, 168 insertions(+), 1 deletion(-) diff --git a/hermes_cli/runtime_provider.py b/hermes_cli/runtime_provider.py index 6cd57f95d..fb487f450 100644 --- a/hermes_cli/runtime_provider.py +++ b/hermes_cli/runtime_provider.py @@ -18,6 +18,10 @@ from hermes_cli.config import load_config from hermes_constants import OPENROUTER_BASE_URL +def _normalize_custom_provider_name(value: str) -> str: + return value.strip().lower().replace(" ", "-") + + def _get_model_config() -> Dict[str, Any]: config = load_config() model_cfg = config.get("model") @@ -47,6 +51,69 @@ def resolve_requested_provider(requested: Optional[str] = None) -> str: return "auto" +def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, Any]]: + requested_norm = _normalize_custom_provider_name(requested_provider or "") + if not requested_norm or requested_norm == "custom": + return None + + config = load_config() + custom_providers = config.get("custom_providers") + if not isinstance(custom_providers, list): + return None + + for entry in custom_providers: + if not isinstance(entry, dict): + continue + name = entry.get("name") + base_url = entry.get("base_url") + if not isinstance(name, str) or not isinstance(base_url, str): + continue + name_norm = _normalize_custom_provider_name(name) + menu_key = f"custom:{name_norm}" + if requested_norm not in {name_norm, menu_key}: + continue + return { + "name": name.strip(), + "base_url": base_url.strip(), + "api_key": str(entry.get("api_key", "") or "").strip(), + } + + return None + + +def _resolve_named_custom_runtime( + *, + requested_provider: str, + explicit_api_key: Optional[str] = None, + explicit_base_url: Optional[str] = None, +) -> Optional[Dict[str, Any]]: + custom_provider = _get_named_custom_provider(requested_provider) + if not custom_provider: + return None + + base_url = ( + 
(explicit_base_url or "").strip() + or custom_provider.get("base_url", "") + ).rstrip("/") + if not base_url: + return None + + api_key = ( + (explicit_api_key or "").strip() + or custom_provider.get("api_key", "") + or os.getenv("OPENAI_API_KEY", "").strip() + or os.getenv("OPENROUTER_API_KEY", "").strip() + ) + + return { + "provider": "openrouter", + "api_mode": "chat_completions", + "base_url": base_url, + "api_key": api_key, + "source": f"custom_provider:{custom_provider.get('name', requested_provider)}", + } + + def _resolve_openrouter_runtime( *, requested_provider: str, @@ -122,6 +189,15 @@ def resolve_runtime_provider( """Resolve runtime provider credentials for agent execution.""" requested_provider = resolve_requested_provider(requested) + custom_runtime = _resolve_named_custom_runtime( + requested_provider=requested_provider, + explicit_api_key=explicit_api_key, + explicit_base_url=explicit_base_url, + ) + if custom_runtime: + custom_runtime["requested_provider"] = requested_provider + return custom_runtime + provider = resolve_provider( requested_provider, explicit_api_key=explicit_api_key, diff --git a/tests/test_runtime_provider_resolution.py b/tests/test_runtime_provider_resolution.py index 520205df0..3ff1066cd 100644 --- a/tests/test_runtime_provider_resolution.py +++ b/tests/test_runtime_provider_resolution.py @@ -150,7 +150,7 @@ def test_custom_endpoint_auto_provider_prefers_openai_key(monkeypatch): monkeypatch.setenv("OPENAI_BASE_URL", "https://my-vllm-server.example.com/v1") monkeypatch.delenv("OPENROUTER_BASE_URL", raising=False) monkeypatch.setenv("OPENAI_API_KEY", "sk-vllm-key") - monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-should-not-leak") + monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-...leak") resolved = rp.resolve_runtime_provider(requested="auto") @@ -158,6 +158,97 @@ def test_custom_endpoint_auto_provider_prefers_openai_key(monkeypatch): assert resolved["api_key"] == "sk-vllm-key" +def 
test_named_custom_provider_uses_saved_credentials(monkeypatch): + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("OPENROUTER_API_KEY", raising=False) + monkeypatch.setattr( + rp, + "load_config", + lambda: { + "custom_providers": [ + { + "name": "Local", + "base_url": "http://1.2.3.4:1234/v1", + "api_key": "local-provider-key", + } + ] + }, + ) + monkeypatch.setattr( + rp, + "resolve_provider", + lambda *a, **k: (_ for _ in ()).throw( + AssertionError( + "resolve_provider should not be called for named custom providers" + ) + ), + ) + + resolved = rp.resolve_runtime_provider(requested="local") + + assert resolved["provider"] == "openrouter" + assert resolved["api_mode"] == "chat_completions" + assert resolved["base_url"] == "http://1.2.3.4:1234/v1" + assert resolved["api_key"] == "local-provider-key" + assert resolved["requested_provider"] == "local" + assert resolved["source"] == "custom_provider:Local" + + +def test_named_custom_provider_falls_back_to_openai_api_key(monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "env-openai-key") + monkeypatch.delenv("OPENROUTER_API_KEY", raising=False) + monkeypatch.setattr( + rp, + "load_config", + lambda: { + "custom_providers": [ + { + "name": "Local LLM", + "base_url": "http://localhost:1234/v1", + } + ] + }, + ) + monkeypatch.setattr( + rp, + "resolve_provider", + lambda *a, **k: (_ for _ in ()).throw( + AssertionError( + "resolve_provider should not be called for named custom providers" + ) + ), + ) + + resolved = rp.resolve_runtime_provider(requested="custom:local-llm") + + assert resolved["base_url"] == "http://localhost:1234/v1" + assert resolved["api_key"] == "env-openai-key" + assert resolved["requested_provider"] == "custom:local-llm" + + +def test_resolve_runtime_provider_nous_api(monkeypatch): + """Nous Portal API key provider resolves via the api_key path.""" + monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "nous-api") + monkeypatch.setattr( + rp, + 
"resolve_api_key_provider_credentials", + lambda pid: { + "provider": "nous-api", + "api_key": "nous-test-key", + "base_url": "https://inference-api.nousresearch.com/v1", + "source": "NOUS_API_KEY", + }, + ) + + resolved = rp.resolve_runtime_provider(requested="nous-api") + + assert resolved["provider"] == "nous-api" + assert resolved["api_mode"] == "chat_completions" + assert resolved["base_url"] == "https://inference-api.nousresearch.com/v1" + assert resolved["api_key"] == "nous-test-key" + assert resolved["requested_provider"] == "nous-api" + + def test_explicit_openrouter_skips_openai_base_url(monkeypatch): """When the user explicitly requests openrouter, OPENAI_BASE_URL (which may point to a custom endpoint) must not override the From 88951215d36882c8df0cd98bb6302c0636ef7790 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 11:24:29 -0700 Subject: [PATCH 15/34] fix: avoid custom provider shadowing built-in providers Follow up on salvaged PR #1012. Prevents raw custom-provider names from intercepting built-in provider ids, and keeps the regression coverage focused on current-main behavior. --- hermes_cli/runtime_provider.py | 14 +++++++++ tests/test_runtime_provider_resolution.py | 36 +++++++++++++++-------- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/hermes_cli/runtime_provider.py b/hermes_cli/runtime_provider.py index fb487f450..fead68000 100644 --- a/hermes_cli/runtime_provider.py +++ b/hermes_cli/runtime_provider.py @@ -5,6 +5,7 @@ from __future__ import annotations import os from typing import Any, Dict, Optional +from hermes_cli import auth as auth_mod from hermes_cli.auth import ( AuthError, PROVIDER_REGISTRY, @@ -56,6 +57,19 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An if not requested_norm or requested_norm == "custom": return None + # Raw names should only map to custom providers when they are not already + # valid built-in providers or aliases. 
Explicit menu keys like + # ``custom:local`` always target the saved custom provider. + if requested_norm == "auto": + return None + if not requested_norm.startswith("custom:"): + try: + auth_mod.resolve_provider(requested_norm) + except AuthError: + pass + else: + return None + config = load_config() custom_providers = config.get("custom_providers") if not isinstance(custom_providers, list): diff --git a/tests/test_runtime_provider_resolution.py b/tests/test_runtime_provider_resolution.py index 3ff1066cd..a53c716a3 100644 --- a/tests/test_runtime_provider_resolution.py +++ b/tests/test_runtime_provider_resolution.py @@ -226,27 +226,37 @@ def test_named_custom_provider_falls_back_to_openai_api_key(monkeypatch): assert resolved["requested_provider"] == "custom:local-llm" -def test_resolve_runtime_provider_nous_api(monkeypatch): - """Nous Portal API key provider resolves via the api_key path.""" - monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "nous-api") +def test_named_custom_provider_does_not_shadow_builtin_provider(monkeypatch): monkeypatch.setattr( rp, - "resolve_api_key_provider_credentials", - lambda pid: { - "provider": "nous-api", - "api_key": "nous-test-key", + "load_config", + lambda: { + "custom_providers": [ + { + "name": "nous", + "base_url": "http://localhost:1234/v1", + "api_key": "shadow-key", + } + ] + }, + ) + monkeypatch.setattr( + rp, + "resolve_nous_runtime_credentials", + lambda **kwargs: { "base_url": "https://inference-api.nousresearch.com/v1", - "source": "NOUS_API_KEY", + "api_key": "nous-runtime-key", + "source": "portal", + "expires_at": None, }, ) - resolved = rp.resolve_runtime_provider(requested="nous-api") + resolved = rp.resolve_runtime_provider(requested="nous") - assert resolved["provider"] == "nous-api" - assert resolved["api_mode"] == "chat_completions" + assert resolved["provider"] == "nous" assert resolved["base_url"] == "https://inference-api.nousresearch.com/v1" - assert resolved["api_key"] == "nous-test-key" - 
assert resolved["requested_provider"] == "nous-api" + assert resolved["api_key"] == "nous-runtime-key" + assert resolved["requested_provider"] == "nous" def test_explicit_openrouter_skips_openai_base_url(monkeypatch): From 3325e51e530b42712f8828dfef843b7bda942e6f Mon Sep 17 00:00:00 2001 From: Stable Genius <259448942+stablegenius49@users.noreply.github.com> Date: Sat, 14 Mar 2026 11:27:02 -0700 Subject: [PATCH 16/34] fix(skills): honor policy table for dangerous verdicts Salvaged from PR #1007 by stablegenius49. - let INSTALL_POLICY decide dangerous verdict handling for builtin skills - allow --force to override blocked dangerous decisions for trusted and community sources - accept --yes / -y as aliases for --force in /skills install - update regression tests to match the intended policy precedence --- hermes_cli/skills_hub.py | 4 +- tests/hermes_cli/test_skills_hub.py | 2 +- tests/tools/test_force_dangerous_override.py | 44 +++++++------------- tests/tools/test_skills_guard.py | 29 +++++++------ tools/skills_guard.py | 10 ++--- 5 files changed, 40 insertions(+), 49 deletions(-) diff --git a/hermes_cli/skills_hub.py b/hermes_cli/skills_hub.py index 60cfaf6be..e2d17557a 100644 --- a/hermes_cli/skills_hub.py +++ b/hermes_cli/skills_hub.py @@ -1050,11 +1050,11 @@ def handle_skills_slash(cmd: str, console: Optional[Console] = None) -> None: elif action == "install": if not args: - c.print("[bold red]Usage:[/] /skills install [--category ] [--force]\n") + c.print("[bold red]Usage:[/] /skills install [--category ] [--force|--yes]\n") return identifier = args[0] category = "" - force = "--force" in args + force = any(flag in args for flag in ("--force", "--yes", "-y")) for i, a in enumerate(args): if a == "--category" and i + 1 < len(args): category = args[i + 1] diff --git a/tests/hermes_cli/test_skills_hub.py b/tests/hermes_cli/test_skills_hub.py index 4e3af6c7d..d1169120b 100644 --- a/tests/hermes_cli/test_skills_hub.py +++ b/tests/hermes_cli/test_skills_hub.py @@ 
-3,7 +3,7 @@ from io import StringIO import pytest from rich.console import Console -from hermes_cli.skills_hub import do_check, do_list, do_update +from hermes_cli.skills_hub import do_check, do_list, do_update, handle_skills_slash class _DummyLockFile: diff --git a/tests/tools/test_force_dangerous_override.py b/tests/tools/test_force_dangerous_override.py index ab9600f20..3a727bf1c 100644 --- a/tests/tools/test_force_dangerous_override.py +++ b/tests/tools/test_force_dangerous_override.py @@ -1,11 +1,8 @@ -"""Tests for the --force flag dangerous verdict bypass fix in skills_guard.py. +"""Regression tests for skills guard policy precedence. -Regression test: the old code had `if result.verdict == "dangerous" and not force:` -which meant force=True would skip the early return, fall through the policy -lookup, and hit `if force: return True` - allowing installation of skills -flagged as dangerous (reverse shells, data exfiltration, etc). - -The docstring explicitly states: "never overrides dangerous". +Official/builtin skills should follow the INSTALL_POLICY table even when their +scan verdict is dangerous, and --force should override blocked verdicts for +non-builtin sources. 
""" @@ -44,10 +41,6 @@ def _new_should_allow(verdict, trust_level, force): } VERDICT_INDEX = {"safe": 0, "caution": 1, "dangerous": 2} - # Fixed: no `and not force` - dangerous is always blocked - if verdict == "dangerous": - return False - policy = INSTALL_POLICY.get(trust_level, INSTALL_POLICY["community"]) vi = VERDICT_INDEX.get(verdict, 2) decision = policy[vi] @@ -61,35 +54,28 @@ def _new_should_allow(verdict, trust_level, force): return False -class TestForceNeverOverridesDangerous: - """The core bug: --force bypassed the dangerous verdict block.""" +class TestPolicyPrecedenceForDangerousVerdicts: + def test_builtin_dangerous_is_allowed_by_policy(self): + assert _new_should_allow("dangerous", "builtin", force=False) is True - def test_old_code_allows_dangerous_with_force(self): - """Old code: force=True lets dangerous skills through.""" - assert _old_should_allow("dangerous", "community", force=True) is True + def test_trusted_dangerous_is_blocked_without_force(self): + assert _new_should_allow("dangerous", "trusted", force=False) is False - def test_new_code_blocks_dangerous_with_force(self): - """Fixed code: force=True still blocks dangerous skills.""" - assert _new_should_allow("dangerous", "community", force=True) is False + def test_force_overrides_dangerous_for_community(self): + assert _new_should_allow("dangerous", "community", force=True) is True - def test_new_code_blocks_dangerous_trusted_with_force(self): - """Fixed code: even trusted + force cannot install dangerous.""" - assert _new_should_allow("dangerous", "trusted", force=True) is False + def test_force_overrides_dangerous_for_trusted(self): + assert _new_should_allow("dangerous", "trusted", force=True) is True def test_force_still_overrides_caution(self): - """force=True should still work for caution verdicts.""" assert _new_should_allow("caution", "community", force=True) is True def test_caution_community_blocked_without_force(self): - """Caution + community is blocked without force 
(unchanged).""" assert _new_should_allow("caution", "community", force=False) is False def test_safe_always_allowed(self): - """Safe verdict is always allowed regardless of force.""" assert _new_should_allow("safe", "community", force=False) is True assert _new_should_allow("safe", "community", force=True) is True - def test_dangerous_blocked_without_force(self): - """Dangerous is blocked without force (both old and new agree).""" - assert _old_should_allow("dangerous", "community", force=False) is False - assert _new_should_allow("dangerous", "community", force=False) is False + def test_old_code_happened_to_allow_forced_dangerous_community(self): + assert _old_should_allow("dangerous", "community", force=True) is True diff --git a/tests/tools/test_skills_guard.py b/tests/tools/test_skills_guard.py index 70eb9fc69..7bcf55e81 100644 --- a/tests/tools/test_skills_guard.py +++ b/tests/tools/test_skills_guard.py @@ -46,9 +46,9 @@ from tools.skills_guard import ( class TestResolveTrustLevel: - def test_builtin_not_exposed(self): - # builtin is only used internally, not resolved from source string - assert _resolve_trust_level("openai/skills") == "trusted" + def test_official_sources_resolve_to_builtin(self): + assert _resolve_trust_level("official") == "builtin" + assert _resolve_trust_level("official/email/agentmail") == "builtin" def test_trusted_repos(self): assert _resolve_trust_level("openai/skills") == "trusted" @@ -116,11 +116,17 @@ class TestShouldAllowInstall: allowed, _ = should_allow_install(self._result("trusted", "caution", f)) assert allowed is True - def test_dangerous_blocked_even_trusted(self): + def test_trusted_dangerous_blocked_without_force(self): f = [Finding("x", "critical", "c", "f", 1, "m", "d")] allowed, _ = should_allow_install(self._result("trusted", "dangerous", f)) assert allowed is False + def test_builtin_dangerous_allowed_without_force(self): + f = [Finding("x", "critical", "c", "f", 1, "m", "d")] + allowed, reason = 
should_allow_install(self._result("builtin", "dangerous", f)) + assert allowed is True + assert "builtin source" in reason + def test_force_overrides_caution(self): f = [Finding("x", "high", "c", "f", 1, "m", "d")] allowed, reason = should_allow_install(self._result("community", "caution", f), force=True) @@ -132,22 +138,21 @@ class TestShouldAllowInstall: allowed, _ = should_allow_install(self._result("community", "dangerous", f), force=False) assert allowed is False - def test_force_never_overrides_dangerous(self): - """--force must not bypass dangerous verdict (regression test).""" + def test_force_overrides_dangerous_for_community(self): f = [Finding("x", "critical", "c", "f", 1, "m", "d")] allowed, reason = should_allow_install( self._result("community", "dangerous", f), force=True ) - assert allowed is False - assert "DANGEROUS" in reason + assert allowed is True + assert "Force-installed" in reason - def test_force_never_overrides_dangerous_trusted(self): - """--force must not bypass dangerous even for trusted sources.""" + def test_force_overrides_dangerous_for_trusted(self): f = [Finding("x", "critical", "c", "f", 1, "m", "d")] - allowed, _ = should_allow_install( + allowed, reason = should_allow_install( self._result("trusted", "dangerous", f), force=True ) - assert allowed is False + assert allowed is True + assert "Force-installed" in reason # --------------------------------------------------------------------------- diff --git a/tools/skills_guard.py b/tools/skills_guard.py index c354d6548..df62edbe6 100644 --- a/tools/skills_guard.py +++ b/tools/skills_guard.py @@ -645,14 +645,11 @@ def should_allow_install(result: ScanResult, force: bool = False) -> Tuple[bool, Args: result: Scan result from scan_skill() - force: If True, override blocks for caution verdicts (never overrides dangerous) + force: If True, override blocked policy decisions for this scan result Returns: (allowed, reason) tuple """ - if result.verdict == "dangerous": - return False, 
f"Scan verdict is DANGEROUS ({len(result.findings)} findings). Blocked." - policy = INSTALL_POLICY.get(result.trust_level, INSTALL_POLICY["community"]) vi = VERDICT_INDEX.get(result.verdict, 2) decision = policy[vi] @@ -661,7 +658,10 @@ def should_allow_install(result: ScanResult, force: bool = False) -> Tuple[bool, return True, f"Allowed ({result.trust_level} source, {result.verdict} verdict)" if force: - return True, f"Force-installed despite {result.verdict} verdict ({len(result.findings)} findings)" + return True, ( + f"Force-installed despite blocked {result.verdict} verdict " + f"({len(result.findings)} findings)" + ) return False, ( f"Blocked ({result.trust_level} source + {result.verdict} verdict, " From 21ad98b74ce2223f443ba628acd8ad9b47149e7c Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 11:27:08 -0700 Subject: [PATCH 17/34] fix(cli): add --yes alias for skills install Keep the argparse CLI aligned with the slash command so --yes and -y behave the same as --force for hermes skills install. Add a parser-level regression test. --- hermes_cli/main.py | 2 +- tests/hermes_cli/test_skills_install_flags.py | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 tests/hermes_cli/test_skills_install_flags.py diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 6adf4ff70..9609f3998 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -2701,7 +2701,7 @@ For more help on a command: skills_install = skills_subparsers.add_parser("install", help="Install a skill") skills_install.add_argument("identifier", help="Skill identifier (e.g. 
openai/skills/skill-creator)") skills_install.add_argument("--category", default="", help="Category folder to install into") - skills_install.add_argument("--force", action="store_true", help="Install despite caution verdict") + skills_install.add_argument("--force", "--yes", "-y", dest="force", action="store_true", help="Install despite blocked scan verdict") skills_inspect = skills_subparsers.add_parser("inspect", help="Preview a skill without installing") skills_inspect.add_argument("identifier", help="Skill identifier") diff --git a/tests/hermes_cli/test_skills_install_flags.py b/tests/hermes_cli/test_skills_install_flags.py new file mode 100644 index 000000000..bca0404d0 --- /dev/null +++ b/tests/hermes_cli/test_skills_install_flags.py @@ -0,0 +1,26 @@ +import sys +from types import SimpleNamespace + + +def test_cli_skills_install_accepts_yes_alias(monkeypatch): + from hermes_cli.main import main + + captured = {} + + def fake_skills_command(args): + captured["identifier"] = args.identifier + captured["force"] = args.force + + monkeypatch.setattr("hermes_cli.skills_hub.skills_command", fake_skills_command) + monkeypatch.setattr( + sys, + "argv", + ["hermes", "skills", "install", "official/email/agentmail", "--yes"], + ) + + main() + + assert captured == { + "identifier": "official/email/agentmail", + "force": True, + } From 895fe5a5d3454d8ffcdd39d236056b6bb56d353f Mon Sep 17 00:00:00 2001 From: Dave Tist <109555139+davetist@users.noreply.github.com> Date: Thu, 12 Mar 2026 02:49:24 +0100 Subject: [PATCH 18/34] Fix browser cleanup consistency and screenshot recovery Unify browser session teardown so manual close, inactivity cleanup, and emergency shutdown all follow the same cleanup path instead of partially duplicating logic. This changes browser_close() to delegate to cleanup_browser(), which means recording shutdown, Browserbase release, activity bookkeeping cleanup, and local socket-directory removal now happen consistently. 
It also updates emergency cleanup to route through cleanup_all_browsers() and explicitly clear in-memory tracking state after teardown so stale active-session, last-activity, and recording entries are not left behind on exit. The screenshot fallback path has also been fixed. _extract_screenshot_path_from_text() now matches real absolute PNG paths, including quoted output, so browser_vision() can recover screenshots when agent-browser emits human-readable text instead of JSON. Regression coverage was added in tests/tools/test_browser_cleanup.py for screenshot path extraction, cleanup_browser() state removal, browser_close() delegation, and emergency cleanup state clearing. Verified with: - python -m pytest tests/tools/test_browser_cleanup.py -q - python -m pytest tests/tools/test_browser_console.py tests/gateway/test_send_image_file.py -q --- tests/tools/test_browser_cleanup.py | 96 ++++ tools/browser_tool.py | 698 +++++++++++++++------------- 2 files changed, 469 insertions(+), 325 deletions(-) create mode 100644 tests/tools/test_browser_cleanup.py diff --git a/tests/tools/test_browser_cleanup.py b/tests/tools/test_browser_cleanup.py new file mode 100644 index 000000000..9dfabe640 --- /dev/null +++ b/tests/tools/test_browser_cleanup.py @@ -0,0 +1,96 @@ +"""Regression tests for browser session cleanup and screenshot recovery.""" + +from unittest.mock import patch + + +class TestScreenshotPathRecovery: + def test_extracts_standard_absolute_path(self): + from tools.browser_tool import _extract_screenshot_path_from_text + + assert ( + _extract_screenshot_path_from_text("Screenshot saved to /tmp/foo.png") + == "/tmp/foo.png" + ) + + def test_extracts_quoted_absolute_path(self): + from tools.browser_tool import _extract_screenshot_path_from_text + + assert ( + _extract_screenshot_path_from_text( + "Screenshot saved to '/Users/david/.hermes/browser_screenshots/shot.png'" + ) + == "/Users/david/.hermes/browser_screenshots/shot.png" + ) + + +class TestBrowserCleanup: + def 
setup_method(self): + from tools import browser_tool + + self.browser_tool = browser_tool + self.orig_active_sessions = browser_tool._active_sessions.copy() + self.orig_session_last_activity = browser_tool._session_last_activity.copy() + self.orig_recording_sessions = browser_tool._recording_sessions.copy() + self.orig_cleanup_done = browser_tool._cleanup_done + + def teardown_method(self): + self.browser_tool._active_sessions.clear() + self.browser_tool._active_sessions.update(self.orig_active_sessions) + self.browser_tool._session_last_activity.clear() + self.browser_tool._session_last_activity.update(self.orig_session_last_activity) + self.browser_tool._recording_sessions.clear() + self.browser_tool._recording_sessions.update(self.orig_recording_sessions) + self.browser_tool._cleanup_done = self.orig_cleanup_done + + def test_cleanup_browser_clears_tracking_state(self): + browser_tool = self.browser_tool + browser_tool._active_sessions["task-1"] = { + "session_name": "sess-1", + "bb_session_id": None, + } + browser_tool._session_last_activity["task-1"] = 123.0 + + with ( + patch("tools.browser_tool._maybe_stop_recording") as mock_stop, + patch( + "tools.browser_tool._run_browser_command", + return_value={"success": True}, + ) as mock_run, + patch("tools.browser_tool.os.path.exists", return_value=False), + ): + browser_tool.cleanup_browser("task-1") + + assert "task-1" not in browser_tool._active_sessions + assert "task-1" not in browser_tool._session_last_activity + mock_stop.assert_called_once_with("task-1") + mock_run.assert_called_once_with("task-1", "close", [], timeout=10) + + def test_browser_close_delegates_to_cleanup_browser(self): + import json + + browser_tool = self.browser_tool + browser_tool._active_sessions["task-2"] = {"session_name": "sess-2"} + + with patch("tools.browser_tool.cleanup_browser") as mock_cleanup: + result = json.loads(browser_tool.browser_close("task-2")) + + assert result == {"success": True, "closed": True} + 
mock_cleanup.assert_called_once_with("task-2") + + def test_emergency_cleanup_clears_all_tracking_state(self): + browser_tool = self.browser_tool + browser_tool._cleanup_done = False + browser_tool._active_sessions["task-1"] = {"session_name": "sess-1"} + browser_tool._active_sessions["task-2"] = {"session_name": "sess-2"} + browser_tool._session_last_activity["task-1"] = 1.0 + browser_tool._session_last_activity["task-2"] = 2.0 + browser_tool._recording_sessions.update({"task-1", "task-2"}) + + with patch("tools.browser_tool.cleanup_all_browsers") as mock_cleanup_all: + browser_tool._emergency_cleanup_all_sessions() + + mock_cleanup_all.assert_called_once_with() + assert browser_tool._active_sessions == {} + assert browser_tool._session_last_activity == {} + assert browser_tool._recording_sessions == set() + assert browser_tool._cleanup_done is True diff --git a/tools/browser_tool.py b/tools/browser_tool.py index b3516c4f2..9a2c9de4c 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -49,10 +49,12 @@ Usage: browser_click("@e5", task_id="task_123") """ +from tools.registry import registry import atexit import json import logging import os +import re import signal import subprocess import shutil @@ -126,7 +128,8 @@ def _socket_safe_tmpdir() -> str: # Track active sessions per task # Stores: session_name (always), bb_session_id + cdp_url (cloud mode only) -_active_sessions: Dict[str, Dict[str, str]] = {} # task_id -> {session_name, ...} +# task_id -> {session_name, ...} +_active_sessions: Dict[str, Dict[str, str]] = {} _recording_sessions: set = set() # task_ids with active recordings # Flag to track if cleanup has been done @@ -139,7 +142,8 @@ _cleanup_done = False # Session inactivity timeout (seconds) - cleanup if no activity for this long # Default: 5 minutes. Needs headroom for LLM reasoning between browser commands, # especially when subagents are doing multi-step browser tasks. 
-BROWSER_SESSION_INACTIVITY_TIMEOUT = int(os.environ.get("BROWSER_INACTIVITY_TIMEOUT", "300")) +BROWSER_SESSION_INACTIVITY_TIMEOUT = int( + os.environ.get("BROWSER_INACTIVITY_TIMEOUT", "300")) # Track last activity time per session _session_last_activity: Dict[str, float] = {} @@ -161,67 +165,22 @@ def _emergency_cleanup_all_sessions(): if _cleanup_done: return _cleanup_done = True - + if not _active_sessions: return - - logger.info("Emergency cleanup: closing %s active session(s)...", len(_active_sessions)) - + + logger.info("Emergency cleanup: closing %s active session(s)...", + len(_active_sessions)) + try: - if _is_local_mode(): - # Local mode: just close agent-browser sessions via CLI - for task_id, session_info in list(_active_sessions.items()): - session_name = session_info.get("session_name") - if session_name: - try: - browser_cmd = _find_agent_browser() - task_socket_dir = os.path.join( - _socket_safe_tmpdir(), - f"agent-browser-{session_name}" - ) - env = {**os.environ, "AGENT_BROWSER_SOCKET_DIR": task_socket_dir} - subprocess.run( - browser_cmd.split() + ["--session", session_name, "--json", "close"], - capture_output=True, timeout=5, env=env, - ) - logger.info("Closed local session %s", session_name) - except Exception as e: - logger.debug("Error closing local session %s: %s", session_name, e) - else: - # Cloud mode: release Browserbase sessions via API - api_key = os.environ.get("BROWSERBASE_API_KEY") - project_id = os.environ.get("BROWSERBASE_PROJECT_ID") - - if not api_key or not project_id: - logger.warning("Cannot cleanup - missing BROWSERBASE credentials") - return - - for task_id, session_info in list(_active_sessions.items()): - bb_session_id = session_info.get("bb_session_id") - if bb_session_id: - try: - response = requests.post( - f"https://api.browserbase.com/v1/sessions/{bb_session_id}", - headers={ - "X-BB-API-Key": api_key, - "Content-Type": "application/json" - }, - json={ - "projectId": project_id, - "status": "REQUEST_RELEASE" - }, - 
timeout=5 # Short timeout for cleanup - ) - if response.status_code in (200, 201, 204): - logger.info("Closed session %s", bb_session_id) - else: - logger.warning("Failed to close session %s: HTTP %s", bb_session_id, response.status_code) - except Exception as e: - logger.error("Error closing session %s: %s", bb_session_id, e) - - _active_sessions.clear() + cleanup_all_browsers() except Exception as e: logger.error("Emergency cleanup error: %s", e) + finally: + with _cleanup_lock: + _active_sessions.clear() + _session_last_activity.clear() + _recording_sessions.clear() # Register cleanup via atexit only. Previous versions installed SIGINT/SIGTERM @@ -240,46 +199,49 @@ atexit.register(_emergency_cleanup_all_sessions) def _cleanup_inactive_browser_sessions(): """ Clean up browser sessions that have been inactive for longer than the timeout. - + This function is called periodically by the background cleanup thread to automatically close sessions that haven't been used recently, preventing orphaned sessions (local or Browserbase) from accumulating. 
""" current_time = time.time() sessions_to_cleanup = [] - + with _cleanup_lock: for task_id, last_time in list(_session_last_activity.items()): if current_time - last_time > BROWSER_SESSION_INACTIVITY_TIMEOUT: sessions_to_cleanup.append(task_id) - + for task_id in sessions_to_cleanup: try: - elapsed = int(current_time - _session_last_activity.get(task_id, current_time)) - logger.info("Cleaning up inactive session for task: %s (inactive for %ss)", task_id, elapsed) + elapsed = int( + current_time - _session_last_activity.get(task_id, current_time)) + logger.info( + "Cleaning up inactive session for task: %s (inactive for %ss)", task_id, elapsed) cleanup_browser(task_id) with _cleanup_lock: if task_id in _session_last_activity: del _session_last_activity[task_id] except Exception as e: - logger.warning("Error cleaning up inactive session %s: %s", task_id, e) + logger.warning( + "Error cleaning up inactive session %s: %s", task_id, e) def _browser_cleanup_thread_worker(): """ Background thread that periodically cleans up inactive browser sessions. - + Runs every 30 seconds and checks for sessions that haven't been used within the BROWSER_SESSION_INACTIVITY_TIMEOUT period. 
""" global _cleanup_running - + while _cleanup_running: try: _cleanup_inactive_browser_sessions() except Exception as e: logger.warning("Cleanup thread error: %s", e) - + # Sleep in 1-second intervals so we can stop quickly if needed for _ in range(30): if not _cleanup_running: @@ -290,7 +252,7 @@ def _browser_cleanup_thread_worker(): def _start_browser_cleanup_thread(): """Start the background cleanup thread if not already running.""" global _cleanup_thread, _cleanup_running - + with _cleanup_lock: if _cleanup_thread is None or not _cleanup_thread.is_alive(): _cleanup_running = True @@ -300,7 +262,8 @@ def _start_browser_cleanup_thread(): name="browser-cleanup" ) _cleanup_thread.start() - logger.info("Started inactivity cleanup thread (timeout: %ss)", BROWSER_SESSION_INACTIVITY_TIMEOUT) + logger.info("Started inactivity cleanup thread (timeout: %ss)", + BROWSER_SESSION_INACTIVITY_TIMEOUT) def _stop_browser_cleanup_thread(): @@ -487,38 +450,41 @@ BROWSER_TOOL_SCHEMAS = [ def _create_browserbase_session(task_id: str) -> Dict[str, str]: """ Create a Browserbase session with stealth features. - + Browserbase Stealth Modes: - Basic Stealth: ALWAYS enabled automatically. Generates random fingerprints, viewports, and solves visual CAPTCHAs. No configuration needed. - Advanced Stealth: Uses custom Chromium build for better bot detection avoidance. Requires Scale Plan. Enable via BROWSERBASE_ADVANCED_STEALTH=true. - + Proxies are enabled by default to route traffic through residential IPs, which significantly improves CAPTCHA solving rates. Can be disabled via BROWSERBASE_PROXIES=false if needed. 
- + Args: task_id: Unique identifier for the task - + Returns: Dict with session_name, bb_session_id, cdp_url, and feature flags """ import uuid import sys - + config = _get_browserbase_config() - + # Check for optional settings from environment # Proxies: enabled by default for better CAPTCHA solving - enable_proxies = os.environ.get("BROWSERBASE_PROXIES", "true").lower() != "false" + enable_proxies = os.environ.get( + "BROWSERBASE_PROXIES", "true").lower() != "false" # Advanced Stealth: requires Scale Plan, disabled by default - enable_advanced_stealth = os.environ.get("BROWSERBASE_ADVANCED_STEALTH", "false").lower() == "true" + enable_advanced_stealth = os.environ.get( + "BROWSERBASE_ADVANCED_STEALTH", "false").lower() == "true" # keepAlive: enabled by default (requires paid plan) - allows reconnection after disconnects - enable_keep_alive = os.environ.get("BROWSERBASE_KEEP_ALIVE", "true").lower() != "false" + enable_keep_alive = os.environ.get( + "BROWSERBASE_KEEP_ALIVE", "true").lower() != "false" # Custom session timeout in milliseconds (optional) - extends session beyond project default custom_timeout_ms = os.environ.get("BROWSERBASE_SESSION_TIMEOUT") - + # Track which features are actually enabled for logging/debugging features_enabled = { "basic_stealth": True, # Always on @@ -527,18 +493,18 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: "keep_alive": False, "custom_timeout": False, } - + # Build session configuration # Note: Basic stealth mode is ALWAYS active - no configuration needed session_config = { "projectId": config["project_id"], } - + # Enable keepAlive for session reconnection (default: true, requires paid plan) # Allows reconnecting to the same session after network hiccups if enable_keep_alive: session_config["keepAlive"] = True - + # Add custom timeout if specified (in milliseconds) # This extends session duration beyond project's default timeout if custom_timeout_ms: @@ -547,20 +513,21 @@ def 
_create_browserbase_session(task_id: str) -> Dict[str, str]: if timeout_val > 0: session_config["timeout"] = timeout_val except ValueError: - logger.warning("Invalid BROWSERBASE_SESSION_TIMEOUT value: %s", custom_timeout_ms) - + logger.warning( + "Invalid BROWSERBASE_SESSION_TIMEOUT value: %s", custom_timeout_ms) + # Enable proxies for better CAPTCHA solving (default: true) # Routes traffic through residential IPs for more reliable access if enable_proxies: session_config["proxies"] = True - + # Add advanced stealth if enabled (requires Scale Plan) # Uses custom Chromium build to avoid bot detection altogether if enable_advanced_stealth: session_config["browserSettings"] = { "advancedStealth": True, } - + # Create session via Browserbase API response = requests.post( "https://api.browserbase.com/v1/sessions", @@ -571,11 +538,11 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: json=session_config, timeout=30 ) - + # Track if we fell back from paid features proxies_fallback = False keepalive_fallback = False - + # Handle 402 Payment Required - likely paid features not available # Try to identify which feature caused the issue and retry without it if response.status_code == 402: @@ -583,7 +550,7 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: if enable_keep_alive: keepalive_fallback = True logger.warning("keepAlive may require paid plan (402), retrying without it. " - "Sessions may timeout during long operations.") + "Sessions may timeout during long operations.") session_config.pop("keepAlive", None) response = requests.post( "https://api.browserbase.com/v1/sessions", @@ -594,12 +561,12 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: json=session_config, timeout=30 ) - + # If still 402, try without proxies too if response.status_code == 402 and enable_proxies: proxies_fallback = True logger.warning("Proxies unavailable (402), retrying without proxies. 
" - "Bot detection may be less effective.") + "Bot detection may be less effective.") session_config.pop("proxies", None) response = requests.post( "https://api.browserbase.com/v1/sessions", @@ -610,13 +577,14 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: json=session_config, timeout=30 ) - + if not response.ok: - raise RuntimeError(f"Failed to create Browserbase session: {response.status_code} {response.text}") - + raise RuntimeError( + f"Failed to create Browserbase session: {response.status_code} {response.text}") + session_data = response.json() session_name = f"hermes_{task_id}_{uuid.uuid4().hex[:8]}" - + # Update features based on what actually succeeded if enable_proxies and not proxies_fallback: features_enabled["proxies"] = True @@ -626,11 +594,12 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: features_enabled["keep_alive"] = True if custom_timeout_ms and "timeout" in session_config: features_enabled["custom_timeout"] = True - + # Log session info for debugging feature_str = ", ".join(k for k, v in features_enabled.items() if v) - logger.info("Created session %s with features: %s", session_name, feature_str) - + logger.info("Created session %s with features: %s", + session_name, feature_str) + return { "session_name": session_name, "bb_session_id": session_data["id"], @@ -640,18 +609,14 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: def _create_local_session(task_id: str) -> Dict[str, str]: - """Create a lightweight local browser session (no cloud API call). - - Returns the same dict shape as ``_create_browserbase_session`` so the rest - of the code can treat both modes uniformly. 
- """ import uuid - session_name = f"hermes_{task_id}_{uuid.uuid4().hex[:8]}" - logger.info("Created local browser session %s", session_name) + session_name = f"h_{uuid.uuid4().hex[:10]}" + logger.info("Created local browser session %s for task %s", + session_name, task_id) return { "session_name": session_name, - "bb_session_id": None, # Not applicable in local mode - "cdp_url": None, # Not applicable in local mode + "bb_session_id": None, + "cdp_url": None, "features": {"local": True}, } @@ -659,51 +624,51 @@ def _create_local_session(task_id: str) -> Dict[str, str]: def _get_session_info(task_id: Optional[str] = None) -> Dict[str, str]: """ Get or create session info for the given task. - + In cloud mode, creates a Browserbase session with proxies enabled. In local mode, generates a session name for agent-browser --session. Also starts the inactivity cleanup thread and updates activity tracking. Thread-safe: multiple subagents can call this concurrently. - + Args: task_id: Unique identifier for the task - + Returns: Dict with session_name (always), bb_session_id + cdp_url (cloud only) """ if task_id is None: task_id = "default" - + # Start the cleanup thread if not running (handles inactivity timeouts) _start_browser_cleanup_thread() - + # Update activity timestamp for this session _update_session_activity(task_id) - + with _cleanup_lock: # Check if we already have a session for this task if task_id in _active_sessions: return _active_sessions[task_id] - + # Create session outside the lock (network call in cloud mode) if _is_local_mode(): session_info = _create_local_session(task_id) else: session_info = _create_browserbase_session(task_id) - + with _cleanup_lock: _active_sessions[task_id] = session_info - + return session_info def _get_session_name(task_id: Optional[str] = None) -> str: """ Get the session name for agent-browser CLI. 
- + Args: task_id: Unique identifier for the task - + Returns: Session name for agent-browser """ @@ -714,22 +679,22 @@ def _get_session_name(task_id: Optional[str] = None) -> str: def _get_browserbase_config() -> Dict[str, str]: """ Get Browserbase configuration from environment. - + Returns: Dict with api_key and project_id - + Raises: ValueError: If required env vars are not set """ api_key = os.environ.get("BROWSERBASE_API_KEY") project_id = os.environ.get("BROWSERBASE_PROJECT_ID") - + if not api_key or not project_id: raise ValueError( "BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID environment variables are required. " "Get your credentials at https://browserbase.com" ) - + return { "api_key": api_key, "project_id": project_id @@ -739,12 +704,12 @@ def _get_browserbase_config() -> Dict[str, str]: def _find_agent_browser() -> str: """ Find the agent-browser CLI executable. - + Checks in order: PATH, local node_modules/.bin/, npx fallback. - + Returns: Path to agent-browser executable - + Raises: FileNotFoundError: If agent-browser is not installed """ @@ -753,18 +718,18 @@ def _find_agent_browser() -> str: which_result = shutil.which("agent-browser") if which_result: return which_result - + # Check local node_modules/.bin/ (npm install in repo root) repo_root = Path(__file__).parent.parent local_bin = repo_root / "node_modules" / ".bin" / "agent-browser" if local_bin.exists(): return str(local_bin) - + # Check common npx locations npx_path = shutil.which("npx") if npx_path: return "npx agent-browser" - + raise FileNotFoundError( "agent-browser CLI not found. 
Install it with: npm install -g agent-browser\n" "Or run 'npm install' in the repo root to install locally.\n" @@ -772,6 +737,27 @@ def _find_agent_browser() -> str: ) +def _extract_screenshot_path_from_text(text: str) -> Optional[str]: + """Extract a screenshot file path from agent-browser human-readable output.""" + if not text: + return None + + patterns = [ + r"Screenshot saved to ['\"](?P/[^'\"]+?\.png)['\"]", + r"Screenshot saved to (?P/\S+?\.png)(?:\s|$)", + r"(?P/\S+?\.png)(?:\s|$)", + ] + + for pattern in patterns: + match = re.search(pattern, text) + if match: + path = match.group("path").strip().strip("'\"") + if path: + return path + + return None + + def _run_browser_command( task_id: str, command: str, @@ -780,25 +766,25 @@ def _run_browser_command( ) -> Dict[str, Any]: """ Run an agent-browser CLI command using our pre-created Browserbase session. - + Args: task_id: Task identifier to get the right session command: The command to run (e.g., "open", "click") args: Additional arguments for the command timeout: Command timeout in seconds - + Returns: Parsed JSON response from agent-browser """ args = args or [] - + # Build the command try: browser_cmd = _find_agent_browser() except FileNotFoundError as e: logger.warning("agent-browser CLI not found: %s", e) return {"success": False, "error": str(e)} - + from tools.interrupt import is_interrupted if is_interrupted(): return {"success": False, "error": "Interrupted"} @@ -807,9 +793,10 @@ def _run_browser_command( try: session_info = _get_session_info(task_id) except Exception as e: - logger.warning("Failed to create browser session for task=%s: %s", task_id, e) + logger.warning( + "Failed to create browser session for task=%s: %s", task_id, e) return {"success": False, "error": f"Failed to create browser session: {str(e)}"} - + # Build the command with the appropriate backend flag. # Cloud mode: --cdp connects to Browserbase. # Local mode: --session launches a local headless Chromium. 
@@ -827,7 +814,7 @@ def _run_browser_command( "--json", command ] + args - + try: # Give each task its own socket directory to prevent concurrency conflicts. # Without this, parallel workers fight over the same default socket path, @@ -839,13 +826,31 @@ def _run_browser_command( os.makedirs(task_socket_dir, mode=0o700, exist_ok=True) logger.debug("browser cmd=%s task=%s socket_dir=%s (%d chars)", command, task_id, task_socket_dir, len(task_socket_dir)) - + browser_env = {**os.environ} - # Ensure PATH includes standard dirs (systemd services may have minimal PATH) - if "/usr/bin" not in browser_env.get("PATH", "").split(":"): - browser_env["PATH"] = f"{browser_env.get('PATH', '')}:{_SANE_PATH}" + + # Ensure PATH includes Hermes-managed Node first, then standard system dirs. + hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) + hermes_node_bin = str(hermes_home / "node" / "bin") + + existing_path = browser_env.get("PATH", "") + path_parts = [p for p in existing_path.split(":") if p] + candidate_dirs = [hermes_node_bin] + [p for p in _SANE_PATH.split(":") if p] + + for part in reversed(candidate_dirs): + if os.path.isdir(part) and part not in path_parts: + path_parts.insert(0, part) + + browser_env["PATH"] = ":".join(path_parts) browser_env["AGENT_BROWSER_SOCKET_DIR"] = task_socket_dir - + + node_path = shutil.which("node", path=browser_env["PATH"]) + if node_path: + logger.debug("browser subprocess using node at: %s", node_path) + else: + logger.warning("node not found in browser PATH: %s", + browser_env["PATH"]) + result = subprocess.run( cmd_parts, capture_output=True, @@ -853,12 +858,13 @@ def _run_browser_command( timeout=timeout, env=browser_env, ) - + # Log stderr for diagnostics — use warning level on failure so it's visible if result.stderr and result.stderr.strip(): level = logging.WARNING if result.returncode != 0 else logging.DEBUG - logger.log(level, "browser '%s' stderr: %s", command, result.stderr.strip()[:500]) - + 
logger.log(level, "browser '%s' stderr: %s", + command, result.stderr.strip()[:500]) + # Log empty output as warning — common sign of broken agent-browser if not result.stdout.strip() and result.returncode == 0: logger.warning("browser '%s' returned empty stdout with rc=0. " @@ -866,11 +872,11 @@ def _run_browser_command( command, " ".join(cmd_parts[:4]) + "...", (result.stderr or "")[:200]) - # Parse JSON output - if result.stdout.strip(): + stdout_text = result.stdout.strip() + + if stdout_text: try: - parsed = json.loads(result.stdout.strip()) - # Warn if snapshot came back empty (common sign of daemon/CDP issues) + parsed = json.loads(stdout_text) if command == "snapshot" and parsed.get("success"): snap_data = parsed.get("data", {}) if not snap_data.get("snapshot") and not snap_data.get("refs"): @@ -879,23 +885,46 @@ def _run_browser_command( "returncode=%s", result.returncode) return parsed except json.JSONDecodeError: - # Non-JSON output indicates agent-browser crash or version mismatch - raw = result.stdout.strip()[:500] + raw = stdout_text[:2000] logger.warning("browser '%s' returned non-JSON output (rc=%s): %s", - command, result.returncode, raw[:200]) + command, result.returncode, raw[:500]) + + if command == "screenshot": + stderr_text = (result.stderr or "").strip() + combined_text = "\n".join( + part for part in [stdout_text, stderr_text] if part + ) + recovered_path = _extract_screenshot_path_from_text( + combined_text) + + if recovered_path and Path(recovered_path).exists(): + logger.info( + "browser 'screenshot' recovered file from non-JSON output: %s", + recovered_path, + ) + return { + "success": True, + "data": { + "path": recovered_path, + "raw": raw, + }, + } + return { - "success": True, - "data": {"raw": raw} + "success": False, + "error": f"Non-JSON output from agent-browser for '{command}': {raw}" } - + # Check for errors if result.returncode != 0: - error_msg = result.stderr.strip() if result.stderr else f"Command failed with code 
{result.returncode}" - logger.warning("browser '%s' failed (rc=%s): %s", command, result.returncode, error_msg[:300]) + error_msg = result.stderr.strip( + ) if result.stderr else f"Command failed with code {result.returncode}" + logger.warning("browser '%s' failed (rc=%s): %s", + command, result.returncode, error_msg[:300]) return {"success": False, "error": error_msg} - + return {"success": True, "data": {}} - + except subprocess.TimeoutExpired: logger.warning("browser '%s' timed out after %ds (task=%s, socket_dir=%s)", command, timeout, task_id, task_socket_dir) @@ -955,17 +984,17 @@ def _extract_relevant_content( def _truncate_snapshot(snapshot_text: str, max_chars: int = 8000) -> str: """ Simple truncation fallback for snapshots. - + Args: snapshot_text: The snapshot text to truncate max_chars: Maximum characters to keep - + Returns: Truncated text with indicator if truncated """ if len(snapshot_text) <= max_chars: return snapshot_text - + return snapshot_text[:max_chars] + "\n\n[... content truncated ...]" @@ -976,39 +1005,39 @@ def _truncate_snapshot(snapshot_text: str, max_chars: int = 8000) -> str: def browser_navigate(url: str, task_id: Optional[str] = None) -> str: """ Navigate to a URL in the browser. 
- + Args: url: The URL to navigate to task_id: Task identifier for session isolation - + Returns: JSON string with navigation result (includes stealth features info on first nav) """ effective_task_id = task_id or "default" - + # Get session info to check if this is a new session # (will create one with features logged if not exists) session_info = _get_session_info(effective_task_id) is_first_nav = session_info.get("_first_nav", True) - + # Auto-start recording if configured and this is first navigation if is_first_nav: session_info["_first_nav"] = False _maybe_start_recording(effective_task_id) - + result = _run_browser_command(effective_task_id, "open", [url], timeout=60) - + if result.get("success"): data = result.get("data", {}) title = data.get("title", "") final_url = data.get("url", url) - + response = { "success": True, "url": final_url, "title": title } - + # Detect common "blocked" page patterns from title/url blocked_patterns = [ "access denied", "access to this page has been denied", @@ -1018,7 +1047,7 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: "just a moment", "attention required" ] title_lower = title.lower() - + if any(pattern in title_lower for pattern in blocked_patterns): response["bot_detection_warning"] = ( f"Page title '{title}' suggests bot detection. The site may have blocked this request. " @@ -1026,7 +1055,7 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: "3) Enable advanced stealth (BROWSERBASE_ADVANCED_STEALTH=true, requires Scale plan), " "4) Some sites have very aggressive bot detection that may be unavoidable." ) - + # Include feature info on first navigation so model knows what's active if is_first_nav and "features" in session_info: features = session_info["features"] @@ -1037,7 +1066,7 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: "Consider upgrading Browserbase plan for proxy support." 
) response["stealth_features"] = active_features - + return json.dumps(response, ensure_ascii=False) else: return json.dumps({ @@ -1053,41 +1082,41 @@ def browser_snapshot( ) -> str: """ Get a text-based snapshot of the current page's accessibility tree. - + Args: full: If True, return complete snapshot. If False, return compact view. task_id: Task identifier for session isolation user_task: The user's current task (for task-aware extraction) - + Returns: JSON string with page snapshot """ effective_task_id = task_id or "default" - + # Build command args based on full flag args = [] if not full: args.extend(["-c"]) # Compact mode - + result = _run_browser_command(effective_task_id, "snapshot", args) - + if result.get("success"): data = result.get("data", {}) snapshot_text = data.get("snapshot", "") refs = data.get("refs", {}) - + # Check if snapshot needs summarization if len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD and user_task: snapshot_text = _extract_relevant_content(snapshot_text, user_task) elif len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD: snapshot_text = _truncate_snapshot(snapshot_text) - + response = { "success": True, "snapshot": snapshot_text, "element_count": len(refs) if refs else 0 } - + return json.dumps(response, ensure_ascii=False) else: return json.dumps({ @@ -1099,22 +1128,22 @@ def browser_snapshot( def browser_click(ref: str, task_id: Optional[str] = None) -> str: """ Click on an element. 
- + Args: ref: Element reference (e.g., "@e5") task_id: Task identifier for session isolation - + Returns: JSON string with click result """ effective_task_id = task_id or "default" - + # Ensure ref starts with @ if not ref.startswith("@"): ref = f"@{ref}" - + result = _run_browser_command(effective_task_id, "click", [ref]) - + if result.get("success"): return json.dumps({ "success": True, @@ -1130,24 +1159,24 @@ def browser_click(ref: str, task_id: Optional[str] = None) -> str: def browser_type(ref: str, text: str, task_id: Optional[str] = None) -> str: """ Type text into an input field. - + Args: ref: Element reference (e.g., "@e3") text: Text to type task_id: Task identifier for session isolation - + Returns: JSON string with type result """ effective_task_id = task_id or "default" - + # Ensure ref starts with @ if not ref.startswith("@"): ref = f"@{ref}" - + # Use fill command (clears then types) result = _run_browser_command(effective_task_id, "fill", [ref, text]) - + if result.get("success"): return json.dumps({ "success": True, @@ -1164,25 +1193,25 @@ def browser_type(ref: str, text: str, task_id: Optional[str] = None) -> str: def browser_scroll(direction: str, task_id: Optional[str] = None) -> str: """ Scroll the page. - + Args: direction: "up" or "down" task_id: Task identifier for session isolation - + Returns: JSON string with scroll result """ effective_task_id = task_id or "default" - + # Validate direction if direction not in ["up", "down"]: return json.dumps({ "success": False, "error": f"Invalid direction '{direction}'. Use 'up' or 'down'." }, ensure_ascii=False) - + result = _run_browser_command(effective_task_id, "scroll", [direction]) - + if result.get("success"): return json.dumps({ "success": True, @@ -1198,16 +1227,16 @@ def browser_scroll(direction: str, task_id: Optional[str] = None) -> str: def browser_back(task_id: Optional[str] = None) -> str: """ Navigate back in browser history. 
- + Args: task_id: Task identifier for session isolation - + Returns: JSON string with navigation result """ effective_task_id = task_id or "default" result = _run_browser_command(effective_task_id, "back", []) - + if result.get("success"): data = result.get("data", {}) return json.dumps({ @@ -1224,17 +1253,17 @@ def browser_back(task_id: Optional[str] = None) -> str: def browser_press(key: str, task_id: Optional[str] = None) -> str: """ Press a keyboard key. - + Args: key: Key to press (e.g., "Enter", "Tab") task_id: Task identifier for session isolation - + Returns: JSON string with key press result """ effective_task_id = task_id or "default" result = _run_browser_command(effective_task_id, "press", [key]) - + if result.get("success"): return json.dumps({ "success": True, @@ -1250,69 +1279,51 @@ def browser_press(key: str, task_id: Optional[str] = None) -> str: def browser_close(task_id: Optional[str] = None) -> str: """ Close the browser session. - + Args: task_id: Task identifier for session isolation - + Returns: JSON string with close result """ effective_task_id = task_id or "default" - - # Stop auto-recording before closing - _maybe_stop_recording(effective_task_id) - - result = _run_browser_command(effective_task_id, "close", []) - - # Close the backend session (Browserbase API in cloud mode, nothing extra in local mode) - session_key = task_id if task_id and task_id in _active_sessions else "default" - if session_key in _active_sessions: - session_info = _active_sessions[session_key] - bb_session_id = session_info.get("bb_session_id") - if bb_session_id: - # Cloud mode: release the Browserbase session via API - try: - config = _get_browserbase_config() - _close_browserbase_session(bb_session_id, config["api_key"], config["project_id"]) - except Exception as e: - logger.warning("Could not close BrowserBase session: %s", e) - del _active_sessions[session_key] - - if result.get("success"): - return json.dumps({ - "success": True, - "closed": True - }, 
ensure_ascii=False) - else: - # Even if close fails, session was released - return json.dumps({ - "success": True, - "closed": True, - "warning": result.get("error", "Session may not have been active") - }, ensure_ascii=False) + with _cleanup_lock: + had_session = effective_task_id in _active_sessions + + cleanup_browser(effective_task_id) + + response = { + "success": True, + "closed": True, + } + if not had_session: + response["warning"] = "Session may not have been active" + return json.dumps(response, ensure_ascii=False) def browser_console(clear: bool = False, task_id: Optional[str] = None) -> str: """Get browser console messages and JavaScript errors. - + Returns both console output (log/warn/error/info from the page's JS) and uncaught exceptions (crashes, unhandled promise rejections). - + Args: clear: If True, clear the message/error buffers after reading task_id: Task identifier for session isolation - + Returns: JSON string with console messages and JS errors """ effective_task_id = task_id or "default" - + console_args = ["--clear"] if clear else [] error_args = ["--clear"] if clear else [] - - console_result = _run_browser_command(effective_task_id, "console", console_args) - errors_result = _run_browser_command(effective_task_id, "errors", error_args) - + + console_result = _run_browser_command( + effective_task_id, "console", console_args) + errors_result = _run_browser_command( + effective_task_id, "errors", error_args) + messages = [] if console_result.get("success"): for msg in console_result.get("data", {}).get("messages", []): @@ -1321,7 +1332,7 @@ def browser_console(clear: bool = False, task_id: Optional[str] = None) -> str: "text": msg.get("text", ""), "source": "console", }) - + errors = [] if errors_result.get("success"): for err in errors_result.get("data", {}).get("errors", []): @@ -1329,7 +1340,7 @@ def browser_console(clear: bool = False, task_id: Optional[str] = None) -> str: "message": err.get("message", ""), "source": "exception", }) 
- + return json.dumps({ "success": True, "console_messages": messages, @@ -1344,32 +1355,38 @@ def _maybe_start_recording(task_id: str): if task_id in _recording_sessions: return try: - hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) + hermes_home = Path(os.environ.get( + "HERMES_HOME", Path.home() / ".hermes")) config_path = hermes_home / "config.yaml" record_enabled = False if config_path.exists(): import yaml with open(config_path) as f: cfg = yaml.safe_load(f) or {} - record_enabled = cfg.get("browser", {}).get("record_sessions", False) - + record_enabled = cfg.get("browser", {}).get( + "record_sessions", False) + if not record_enabled: return - + recordings_dir = hermes_home / "browser_recordings" recordings_dir.mkdir(parents=True, exist_ok=True) _cleanup_old_recordings(max_age_hours=72) - + import time timestamp = time.strftime("%Y%m%d_%H%M%S") - recording_path = recordings_dir / f"session_{timestamp}_{task_id[:16]}.webm" - - result = _run_browser_command(task_id, "record", ["start", str(recording_path)]) + recording_path = recordings_dir / \ + f"session_{timestamp}_{task_id[:16]}.webm" + + result = _run_browser_command( + task_id, "record", ["start", str(recording_path)]) if result.get("success"): _recording_sessions.add(task_id) - logger.info("Auto-recording browser session %s to %s", task_id, recording_path) + logger.info("Auto-recording browser session %s to %s", + task_id, recording_path) else: - logger.debug("Could not start auto-recording: %s", result.get("error")) + logger.debug("Could not start auto-recording: %s", + result.get("error")) except Exception as e: logger.debug("Auto-recording setup failed: %s", e) @@ -1382,7 +1399,8 @@ def _maybe_stop_recording(task_id: str): result = _run_browser_command(task_id, "record", ["stop"]) if result.get("success"): path = result.get("data", {}).get("path", "") - logger.info("Saved browser recording for session %s: %s", task_id, path) + logger.info( + "Saved browser recording for 
session %s: %s", task_id, path) except Exception as e: logger.debug("Could not stop recording for %s: %s", task_id, e) finally: @@ -1392,15 +1410,15 @@ def _maybe_stop_recording(task_id: str): def browser_get_images(task_id: Optional[str] = None) -> str: """ Get all images on the current page. - + Args: task_id: Task identifier for session isolation - + Returns: JSON string with list of images (src and alt) """ effective_task_id = task_id or "default" - + # Use eval to run JavaScript that extracts images js_code = """JSON.stringify( [...document.images].map(img => ({ @@ -1410,20 +1428,20 @@ def browser_get_images(task_id: Optional[str] = None) -> str: height: img.naturalHeight })).filter(img => img.src && !img.src.startsWith('data:')) )""" - + result = _run_browser_command(effective_task_id, "eval", [js_code]) - + if result.get("success"): data = result.get("data", {}) raw_result = data.get("result", "[]") - + try: # Parse the JSON string returned by JavaScript if isinstance(raw_result, str): images = json.loads(raw_result) else: images = raw_result - + return json.dumps({ "success": True, "images": images, @@ -1446,51 +1464,53 @@ def browser_get_images(task_id: Optional[str] = None) -> str: def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str: """ Take a screenshot of the current page and analyze it with vision AI. - + This tool captures what's visually displayed in the browser and sends it to Gemini for analysis. Useful for understanding visual content that the text-based snapshot may not capture (CAPTCHAs, verification challenges, images, complex layouts, etc.). - + The screenshot is saved persistently and its file path is returned alongside the analysis, so it can be shared with users via MEDIA: in the response. 
- + Args: question: What you want to know about the page visually annotate: If True, overlay numbered [N] labels on interactive elements task_id: Task identifier for session isolation - + Returns: JSON string with vision analysis results and screenshot_path """ import base64 import uuid as uuid_mod from pathlib import Path - + effective_task_id = task_id or "default" - # Save screenshot to persistent location so it can be shared with users hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) screenshots_dir = hermes_home / "browser_screenshots" - screenshot_path = screenshots_dir / f"browser_screenshot_{uuid_mod.uuid4().hex}.png" - + screenshot_path = screenshots_dir / \ + f"browser_screenshot_{uuid_mod.uuid4().hex}.png" + try: screenshots_dir.mkdir(parents=True, exist_ok=True) - + # Prune old screenshots (older than 24 hours) to prevent unbounded disk growth _cleanup_old_screenshots(screenshots_dir, max_age_hours=24) - + # Take screenshot using agent-browser - screenshot_args = [str(screenshot_path)] + screenshot_args = [] if annotate: - screenshot_args.insert(0, "--annotate") + screenshot_args.append("--annotate") + screenshot_args.append("--full") + screenshot_args.append(str(screenshot_path)) result = _run_browser_command( - effective_task_id, - "screenshot", + effective_task_id, + "screenshot", screenshot_args, timeout=30 ) - + if not result.get("success"): error_detail = result.get("error", "Unknown error") mode = "local" if _is_local_mode() else "cloud" @@ -1498,7 +1518,11 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] "success": False, "error": f"Failed to take screenshot ({mode} mode): {error_detail}" }, ensure_ascii=False) - + + actual_screenshot_path = result.get("data", {}).get("path") + if actual_screenshot_path: + screenshot_path = Path(actual_screenshot_path) + # Check if screenshot file was created if not screenshot_path.exists(): mode = "local" if _is_local_mode() else "cloud" @@ -1511,12 
+1535,12 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] f"or a stale daemon process." ), }, ensure_ascii=False) - + # Read and convert to base64 image_data = screenshot_path.read_bytes() image_base64 = base64.b64encode(image_data).decode("ascii") data_url = f"data:image/png;base64,{image_base64}" - + vision_prompt = ( f"You are analyzing a screenshot of a web browser.\n\n" f"User's question: {question}\n\n" @@ -1547,7 +1571,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] if vision_model: call_kwargs["model"] = vision_model response = call_llm(**call_kwargs) - + analysis = response.choices[0].message.content response_data = { "success": True, @@ -1558,14 +1582,15 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] if annotate and result.get("data", {}).get("annotations"): response_data["annotations"] = result["data"]["annotations"] return json.dumps(response_data, ensure_ascii=False) - + except Exception as e: # Keep the screenshot if it was captured successfully — the failure is # in the LLM vision analysis, not the capture. Deleting a valid # screenshot loses evidence the user might need. The 24-hour cleanup # in _cleanup_old_screenshots prevents unbounded disk growth. logger.warning("browser_vision failed: %s", e, exc_info=True) - error_info = {"success": False, "error": f"Error during vision analysis: {str(e)}"} + error_info = {"success": False, + "error": f"Error during vision analysis: {str(e)}"} if screenshot_path.exists(): error_info["screenshot_path"] = str(screenshot_path) error_info["note"] = "Screenshot was captured but vision analysis failed. You can still share it via MEDIA:." 
@@ -1600,7 +1625,8 @@ def _cleanup_old_recordings(max_age_hours=72): """Remove browser recordings older than max_age_hours to prevent disk bloat.""" import time try: - hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) + hermes_home = Path(os.environ.get( + "HERMES_HOME", Path.home() / ".hermes")) recordings_dir = hermes_home / "browser_recordings" if not recordings_dir.exists(): return @@ -1622,15 +1648,15 @@ def _cleanup_old_recordings(max_age_hours=72): def _close_browserbase_session(session_id: str, api_key: str, project_id: str) -> bool: """ Close a Browserbase session immediately via the API. - + Uses POST /v1/sessions/{id} with status=REQUEST_RELEASE to immediately terminate the session without waiting for keepAlive timeout. - + Args: session_id: The Browserbase session ID api_key: Browserbase API key project_id: Browserbase project ID - + Returns: True if session was successfully closed, False otherwise """ @@ -1648,14 +1674,16 @@ def _close_browserbase_session(session_id: str, api_key: str, project_id: str) - }, timeout=10 ) - + if response.status_code in (200, 201, 204): - logger.debug("Successfully closed BrowserBase session %s", session_id) + logger.debug( + "Successfully closed BrowserBase session %s", session_id) return True else: - logger.warning("Failed to close session %s: HTTP %s - %s", session_id, response.status_code, response.text[:200]) + logger.warning("Failed to close session %s: HTTP %s - %s", + session_id, response.status_code, response.text[:200]) return False - + except Exception as e: logger.error("Exception closing session %s: %s", session_id, e) return False @@ -1664,57 +1692,64 @@ def _close_browserbase_session(session_id: str, api_key: str, project_id: str) - def cleanup_browser(task_id: Optional[str] = None) -> None: """ Clean up browser session for a task. - + Called automatically when a task completes or when inactivity timeout is reached. Closes both the agent-browser session and the Browserbase session. 
- + Args: task_id: Task identifier to clean up """ if task_id is None: task_id = "default" - + logger.debug("cleanup_browser called for task_id: %s", task_id) logger.debug("Active sessions: %s", list(_active_sessions.keys())) - + # Check if session exists (under lock), but don't remove yet - # _run_browser_command needs it to build the close command. with _cleanup_lock: session_info = _active_sessions.get(task_id) - + if session_info: bb_session_id = session_info.get("bb_session_id", "unknown") - logger.debug("Found session for task %s: bb_session_id=%s", task_id, bb_session_id) - + logger.debug("Found session for task %s: bb_session_id=%s", + task_id, bb_session_id) + # Stop auto-recording before closing (saves the file) _maybe_stop_recording(task_id) - + # Try to close via agent-browser first (needs session in _active_sessions) try: _run_browser_command(task_id, "close", [], timeout=10) - logger.debug("agent-browser close command completed for task %s", task_id) + logger.debug( + "agent-browser close command completed for task %s", task_id) except Exception as e: - logger.warning("agent-browser close failed for task %s: %s", task_id, e) - + logger.warning( + "agent-browser close failed for task %s: %s", task_id, e) + # Now remove from tracking under lock with _cleanup_lock: _active_sessions.pop(task_id, None) _session_last_activity.pop(task_id, None) - + # Cloud mode: close the Browserbase session via API if bb_session_id and not _is_local_mode(): try: config = _get_browserbase_config() - success = _close_browserbase_session(bb_session_id, config["api_key"], config["project_id"]) + success = _close_browserbase_session( + bb_session_id, config["api_key"], config["project_id"]) if not success: - logger.warning("Could not close BrowserBase session %s", bb_session_id) + logger.warning( + "Could not close BrowserBase session %s", bb_session_id) except Exception as e: - logger.error("Exception during BrowserBase session close: %s", e) - + logger.error( + "Exception 
during BrowserBase session close: %s", e) + # Kill the daemon process and clean up socket directory session_name = session_info.get("session_name", "") if session_name: - socket_dir = os.path.join(_socket_safe_tmpdir(), f"agent-browser-{session_name}") + socket_dir = os.path.join( + _socket_safe_tmpdir(), f"agent-browser-{session_name}") if os.path.exists(socket_dir): # agent-browser writes {session}.pid in the socket dir pid_file = os.path.join(socket_dir, f"{session_name}.pid") @@ -1722,11 +1757,13 @@ def cleanup_browser(task_id: Optional[str] = None) -> None: try: daemon_pid = int(Path(pid_file).read_text().strip()) os.kill(daemon_pid, signal.SIGTERM) - logger.debug("Killed daemon pid %s for %s", daemon_pid, session_name) + logger.debug("Killed daemon pid %s for %s", + daemon_pid, session_name) except (ProcessLookupError, ValueError, PermissionError, OSError): - logger.debug("Could not kill daemon pid for %s (already dead or inaccessible)", session_name) + logger.debug( + "Could not kill daemon pid for %s (already dead or inaccessible)", session_name) shutil.rmtree(socket_dir, ignore_errors=True) - + logger.debug("Removed task %s from active sessions", task_id) else: logger.debug("No active session found for task_id: %s", task_id) @@ -1735,7 +1772,7 @@ def cleanup_browser(task_id: Optional[str] = None) -> None: def cleanup_all_browsers() -> None: """ Clean up all active browser sessions. - + Useful for cleanup on shutdown. """ with _cleanup_lock: @@ -1747,7 +1784,7 @@ def cleanup_all_browsers() -> None: def get_active_browser_sessions() -> Dict[str, Dict[str, str]]: """ Get information about active browser sessions. - + Returns: Dict mapping task_id to session info (session_name, bb_session_id, cdp_url) """ @@ -1768,7 +1805,7 @@ def check_browser_requirements() -> bool: In **cloud mode** (BROWSERBASE_API_KEY set): the CLI *and* both ``BROWSERBASE_API_KEY`` / ``BROWSERBASE_PROJECT_ID`` must be present. 
- + Returns: True if all requirements are met, False otherwise """ @@ -1801,7 +1838,7 @@ if __name__ == "__main__": mode = "local" if _is_local_mode() else "cloud (Browserbase)" print(f" Mode: {mode}") - + # Check requirements if check_browser_requirements(): print("✅ All requirements met") @@ -1811,18 +1848,19 @@ if __name__ == "__main__": _find_agent_browser() except FileNotFoundError: print(" - agent-browser CLI not found") - print(" Install: npm install -g agent-browser && agent-browser install --with-deps") + print( + " Install: npm install -g agent-browser && agent-browser install --with-deps") if not _is_local_mode(): if not os.environ.get("BROWSERBASE_API_KEY"): print(" - BROWSERBASE_API_KEY not set (required for cloud mode)") if not os.environ.get("BROWSERBASE_PROJECT_ID"): print(" - BROWSERBASE_PROJECT_ID not set (required for cloud mode)") print(" Tip: unset BROWSERBASE_API_KEY to use free local mode instead") - + print("\n📋 Available Browser Tools:") for schema in BROWSER_TOOL_SCHEMAS: print(f" 🔹 {schema['name']}: {schema['description'][:60]}...") - + print("\n💡 Usage:") print(" from tools.browser_tool import browser_navigate, browser_snapshot") print(" result = browser_navigate('https://example.com', task_id='my_task')") @@ -1832,7 +1870,6 @@ if __name__ == "__main__": # --------------------------------------------------------------------------- # Registry # --------------------------------------------------------------------------- -from tools.registry import registry _BROWSER_SCHEMA_MAP = {s["name"]: s for s in BROWSER_TOOL_SCHEMAS} @@ -1840,7 +1877,8 @@ registry.register( name="browser_navigate", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_navigate"], - handler=lambda args, **kw: browser_navigate(url=args.get("url", ""), task_id=kw.get("task_id")), + handler=lambda args, **kw: browser_navigate( + url=args.get("url", ""), task_id=kw.get("task_id")), check_fn=check_browser_requirements, ) registry.register( @@ -1855,7 +1893,8 @@ 
registry.register( name="browser_click", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_click"], - handler=lambda args, **kw: browser_click(**args, task_id=kw.get("task_id")), + handler=lambda args, **kw: browser_click(** + args, task_id=kw.get("task_id")), check_fn=check_browser_requirements, ) registry.register( @@ -1869,7 +1908,8 @@ registry.register( name="browser_scroll", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_scroll"], - handler=lambda args, **kw: browser_scroll(**args, task_id=kw.get("task_id")), + handler=lambda args, **kw: browser_scroll(** + args, task_id=kw.get("task_id")), check_fn=check_browser_requirements, ) registry.register( @@ -1883,7 +1923,8 @@ registry.register( name="browser_press", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_press"], - handler=lambda args, **kw: browser_press(key=args.get("key", ""), task_id=kw.get("task_id")), + handler=lambda args, **kw: browser_press( + key=args.get("key", ""), task_id=kw.get("task_id")), check_fn=check_browser_requirements, ) registry.register( @@ -1904,13 +1945,20 @@ registry.register( name="browser_vision", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_vision"], - handler=lambda args, **kw: browser_vision(question=args.get("question", ""), annotate=args.get("annotate", False), task_id=kw.get("task_id")), + handler=lambda args, **kw: browser_vision( + question=args.get("question", ""), + annotate=args.get("annotate", False), + task_id=kw.get("task_id"), + ), check_fn=check_browser_requirements, ) registry.register( name="browser_console", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_console"], - handler=lambda args, **kw: browser_console(clear=args.get("clear", False), task_id=kw.get("task_id")), + handler=lambda args, **kw: browser_console( + clear=args.get("clear", False), + task_id=kw.get("task_id"), + ), check_fn=check_browser_requirements, ) From c1d1699a64c7bee391d5af895ba76648714cb214 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 
2026 11:34:31 -0700 Subject: [PATCH 19/34] fix: align salvaged browser cleanup patch with current main Resolve the cherry-pick against current browser_tool structure without carrying unrelated formatting churn, while preserving the intended cleanup, PATH, and screenshot recovery changes from PR #1001. --- tools/browser_tool.py | 487 +++++++++++++++++++----------------------- 1 file changed, 215 insertions(+), 272 deletions(-) diff --git a/tools/browser_tool.py b/tools/browser_tool.py index 9a2c9de4c..ecdff7530 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -49,7 +49,6 @@ Usage: browser_click("@e5", task_id="task_123") """ -from tools.registry import registry import atexit import json import logging @@ -128,8 +127,7 @@ def _socket_safe_tmpdir() -> str: # Track active sessions per task # Stores: session_name (always), bb_session_id + cdp_url (cloud mode only) -# task_id -> {session_name, ...} -_active_sessions: Dict[str, Dict[str, str]] = {} +_active_sessions: Dict[str, Dict[str, str]] = {} # task_id -> {session_name, ...} _recording_sessions: set = set() # task_ids with active recordings # Flag to track if cleanup has been done @@ -142,8 +140,7 @@ _cleanup_done = False # Session inactivity timeout (seconds) - cleanup if no activity for this long # Default: 5 minutes. Needs headroom for LLM reasoning between browser commands, # especially when subagents are doing multi-step browser tasks. 
-BROWSER_SESSION_INACTIVITY_TIMEOUT = int( - os.environ.get("BROWSER_INACTIVITY_TIMEOUT", "300")) +BROWSER_SESSION_INACTIVITY_TIMEOUT = int(os.environ.get("BROWSER_INACTIVITY_TIMEOUT", "300")) # Track last activity time per session _session_last_activity: Dict[str, float] = {} @@ -165,10 +162,10 @@ def _emergency_cleanup_all_sessions(): if _cleanup_done: return _cleanup_done = True - + if not _active_sessions: return - + logger.info("Emergency cleanup: closing %s active session(s)...", len(_active_sessions)) @@ -199,49 +196,46 @@ atexit.register(_emergency_cleanup_all_sessions) def _cleanup_inactive_browser_sessions(): """ Clean up browser sessions that have been inactive for longer than the timeout. - + This function is called periodically by the background cleanup thread to automatically close sessions that haven't been used recently, preventing orphaned sessions (local or Browserbase) from accumulating. """ current_time = time.time() sessions_to_cleanup = [] - + with _cleanup_lock: for task_id, last_time in list(_session_last_activity.items()): if current_time - last_time > BROWSER_SESSION_INACTIVITY_TIMEOUT: sessions_to_cleanup.append(task_id) - + for task_id in sessions_to_cleanup: try: - elapsed = int( - current_time - _session_last_activity.get(task_id, current_time)) - logger.info( - "Cleaning up inactive session for task: %s (inactive for %ss)", task_id, elapsed) + elapsed = int(current_time - _session_last_activity.get(task_id, current_time)) + logger.info("Cleaning up inactive session for task: %s (inactive for %ss)", task_id, elapsed) cleanup_browser(task_id) with _cleanup_lock: if task_id in _session_last_activity: del _session_last_activity[task_id] except Exception as e: - logger.warning( - "Error cleaning up inactive session %s: %s", task_id, e) + logger.warning("Error cleaning up inactive session %s: %s", task_id, e) def _browser_cleanup_thread_worker(): """ Background thread that periodically cleans up inactive browser sessions. 
- + Runs every 30 seconds and checks for sessions that haven't been used within the BROWSER_SESSION_INACTIVITY_TIMEOUT period. """ global _cleanup_running - + while _cleanup_running: try: _cleanup_inactive_browser_sessions() except Exception as e: logger.warning("Cleanup thread error: %s", e) - + # Sleep in 1-second intervals so we can stop quickly if needed for _ in range(30): if not _cleanup_running: @@ -252,7 +246,7 @@ def _browser_cleanup_thread_worker(): def _start_browser_cleanup_thread(): """Start the background cleanup thread if not already running.""" global _cleanup_thread, _cleanup_running - + with _cleanup_lock: if _cleanup_thread is None or not _cleanup_thread.is_alive(): _cleanup_running = True @@ -262,8 +256,7 @@ def _start_browser_cleanup_thread(): name="browser-cleanup" ) _cleanup_thread.start() - logger.info("Started inactivity cleanup thread (timeout: %ss)", - BROWSER_SESSION_INACTIVITY_TIMEOUT) + logger.info("Started inactivity cleanup thread (timeout: %ss)", BROWSER_SESSION_INACTIVITY_TIMEOUT) def _stop_browser_cleanup_thread(): @@ -450,41 +443,38 @@ BROWSER_TOOL_SCHEMAS = [ def _create_browserbase_session(task_id: str) -> Dict[str, str]: """ Create a Browserbase session with stealth features. - + Browserbase Stealth Modes: - Basic Stealth: ALWAYS enabled automatically. Generates random fingerprints, viewports, and solves visual CAPTCHAs. No configuration needed. - Advanced Stealth: Uses custom Chromium build for better bot detection avoidance. Requires Scale Plan. Enable via BROWSERBASE_ADVANCED_STEALTH=true. - + Proxies are enabled by default to route traffic through residential IPs, which significantly improves CAPTCHA solving rates. Can be disabled via BROWSERBASE_PROXIES=false if needed. 
- + Args: task_id: Unique identifier for the task - + Returns: Dict with session_name, bb_session_id, cdp_url, and feature flags """ import uuid import sys - + config = _get_browserbase_config() - + # Check for optional settings from environment # Proxies: enabled by default for better CAPTCHA solving - enable_proxies = os.environ.get( - "BROWSERBASE_PROXIES", "true").lower() != "false" + enable_proxies = os.environ.get("BROWSERBASE_PROXIES", "true").lower() != "false" # Advanced Stealth: requires Scale Plan, disabled by default - enable_advanced_stealth = os.environ.get( - "BROWSERBASE_ADVANCED_STEALTH", "false").lower() == "true" + enable_advanced_stealth = os.environ.get("BROWSERBASE_ADVANCED_STEALTH", "false").lower() == "true" # keepAlive: enabled by default (requires paid plan) - allows reconnection after disconnects - enable_keep_alive = os.environ.get( - "BROWSERBASE_KEEP_ALIVE", "true").lower() != "false" + enable_keep_alive = os.environ.get("BROWSERBASE_KEEP_ALIVE", "true").lower() != "false" # Custom session timeout in milliseconds (optional) - extends session beyond project default custom_timeout_ms = os.environ.get("BROWSERBASE_SESSION_TIMEOUT") - + # Track which features are actually enabled for logging/debugging features_enabled = { "basic_stealth": True, # Always on @@ -493,18 +483,18 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: "keep_alive": False, "custom_timeout": False, } - + # Build session configuration # Note: Basic stealth mode is ALWAYS active - no configuration needed session_config = { "projectId": config["project_id"], } - + # Enable keepAlive for session reconnection (default: true, requires paid plan) # Allows reconnecting to the same session after network hiccups if enable_keep_alive: session_config["keepAlive"] = True - + # Add custom timeout if specified (in milliseconds) # This extends session duration beyond project's default timeout if custom_timeout_ms: @@ -513,21 +503,20 @@ def 
_create_browserbase_session(task_id: str) -> Dict[str, str]: if timeout_val > 0: session_config["timeout"] = timeout_val except ValueError: - logger.warning( - "Invalid BROWSERBASE_SESSION_TIMEOUT value: %s", custom_timeout_ms) - + logger.warning("Invalid BROWSERBASE_SESSION_TIMEOUT value: %s", custom_timeout_ms) + # Enable proxies for better CAPTCHA solving (default: true) # Routes traffic through residential IPs for more reliable access if enable_proxies: session_config["proxies"] = True - + # Add advanced stealth if enabled (requires Scale Plan) # Uses custom Chromium build to avoid bot detection altogether if enable_advanced_stealth: session_config["browserSettings"] = { "advancedStealth": True, } - + # Create session via Browserbase API response = requests.post( "https://api.browserbase.com/v1/sessions", @@ -538,11 +527,11 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: json=session_config, timeout=30 ) - + # Track if we fell back from paid features proxies_fallback = False keepalive_fallback = False - + # Handle 402 Payment Required - likely paid features not available # Try to identify which feature caused the issue and retry without it if response.status_code == 402: @@ -550,7 +539,7 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: if enable_keep_alive: keepalive_fallback = True logger.warning("keepAlive may require paid plan (402), retrying without it. " - "Sessions may timeout during long operations.") + "Sessions may timeout during long operations.") session_config.pop("keepAlive", None) response = requests.post( "https://api.browserbase.com/v1/sessions", @@ -561,12 +550,12 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: json=session_config, timeout=30 ) - + # If still 402, try without proxies too if response.status_code == 402 and enable_proxies: proxies_fallback = True logger.warning("Proxies unavailable (402), retrying without proxies. 
" - "Bot detection may be less effective.") + "Bot detection may be less effective.") session_config.pop("proxies", None) response = requests.post( "https://api.browserbase.com/v1/sessions", @@ -577,14 +566,13 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: json=session_config, timeout=30 ) - + if not response.ok: - raise RuntimeError( - f"Failed to create Browserbase session: {response.status_code} {response.text}") - + raise RuntimeError(f"Failed to create Browserbase session: {response.status_code} {response.text}") + session_data = response.json() session_name = f"hermes_{task_id}_{uuid.uuid4().hex[:8]}" - + # Update features based on what actually succeeded if enable_proxies and not proxies_fallback: features_enabled["proxies"] = True @@ -594,12 +582,11 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]: features_enabled["keep_alive"] = True if custom_timeout_ms and "timeout" in session_config: features_enabled["custom_timeout"] = True - + # Log session info for debugging feature_str = ", ".join(k for k, v in features_enabled.items() if v) - logger.info("Created session %s with features: %s", - session_name, feature_str) - + logger.info("Created session %s with features: %s", session_name, feature_str) + return { "session_name": session_name, "bb_session_id": session_data["id"], @@ -624,51 +611,51 @@ def _create_local_session(task_id: str) -> Dict[str, str]: def _get_session_info(task_id: Optional[str] = None) -> Dict[str, str]: """ Get or create session info for the given task. - + In cloud mode, creates a Browserbase session with proxies enabled. In local mode, generates a session name for agent-browser --session. Also starts the inactivity cleanup thread and updates activity tracking. Thread-safe: multiple subagents can call this concurrently. 
- + Args: task_id: Unique identifier for the task - + Returns: Dict with session_name (always), bb_session_id + cdp_url (cloud only) """ if task_id is None: task_id = "default" - + # Start the cleanup thread if not running (handles inactivity timeouts) _start_browser_cleanup_thread() - + # Update activity timestamp for this session _update_session_activity(task_id) - + with _cleanup_lock: # Check if we already have a session for this task if task_id in _active_sessions: return _active_sessions[task_id] - + # Create session outside the lock (network call in cloud mode) if _is_local_mode(): session_info = _create_local_session(task_id) else: session_info = _create_browserbase_session(task_id) - + with _cleanup_lock: _active_sessions[task_id] = session_info - + return session_info def _get_session_name(task_id: Optional[str] = None) -> str: """ Get the session name for agent-browser CLI. - + Args: task_id: Unique identifier for the task - + Returns: Session name for agent-browser """ @@ -679,22 +666,22 @@ def _get_session_name(task_id: Optional[str] = None) -> str: def _get_browserbase_config() -> Dict[str, str]: """ Get Browserbase configuration from environment. - + Returns: Dict with api_key and project_id - + Raises: ValueError: If required env vars are not set """ api_key = os.environ.get("BROWSERBASE_API_KEY") project_id = os.environ.get("BROWSERBASE_PROJECT_ID") - + if not api_key or not project_id: raise ValueError( "BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID environment variables are required. " "Get your credentials at https://browserbase.com" ) - + return { "api_key": api_key, "project_id": project_id @@ -704,12 +691,12 @@ def _get_browserbase_config() -> Dict[str, str]: def _find_agent_browser() -> str: """ Find the agent-browser CLI executable. - + Checks in order: PATH, local node_modules/.bin/, npx fallback. 
- + Returns: Path to agent-browser executable - + Raises: FileNotFoundError: If agent-browser is not installed """ @@ -718,18 +705,18 @@ def _find_agent_browser() -> str: which_result = shutil.which("agent-browser") if which_result: return which_result - + # Check local node_modules/.bin/ (npm install in repo root) repo_root = Path(__file__).parent.parent local_bin = repo_root / "node_modules" / ".bin" / "agent-browser" if local_bin.exists(): return str(local_bin) - + # Check common npx locations npx_path = shutil.which("npx") if npx_path: return "npx agent-browser" - + raise FileNotFoundError( "agent-browser CLI not found. Install it with: npm install -g agent-browser\n" "Or run 'npm install' in the repo root to install locally.\n" @@ -766,25 +753,25 @@ def _run_browser_command( ) -> Dict[str, Any]: """ Run an agent-browser CLI command using our pre-created Browserbase session. - + Args: task_id: Task identifier to get the right session command: The command to run (e.g., "open", "click") args: Additional arguments for the command timeout: Command timeout in seconds - + Returns: Parsed JSON response from agent-browser """ args = args or [] - + # Build the command try: browser_cmd = _find_agent_browser() except FileNotFoundError as e: logger.warning("agent-browser CLI not found: %s", e) return {"success": False, "error": str(e)} - + from tools.interrupt import is_interrupted if is_interrupted(): return {"success": False, "error": "Interrupted"} @@ -793,10 +780,9 @@ def _run_browser_command( try: session_info = _get_session_info(task_id) except Exception as e: - logger.warning( - "Failed to create browser session for task=%s: %s", task_id, e) + logger.warning("Failed to create browser session for task=%s: %s", task_id, e) return {"success": False, "error": f"Failed to create browser session: {str(e)}"} - + # Build the command with the appropriate backend flag. # Cloud mode: --cdp connects to Browserbase. # Local mode: --session launches a local headless Chromium. 
@@ -814,7 +800,7 @@ def _run_browser_command( "--json", command ] + args - + try: # Give each task its own socket directory to prevent concurrency conflicts. # Without this, parallel workers fight over the same default socket path, @@ -826,7 +812,7 @@ def _run_browser_command( os.makedirs(task_socket_dir, mode=0o700, exist_ok=True) logger.debug("browser cmd=%s task=%s socket_dir=%s (%d chars)", command, task_id, task_socket_dir, len(task_socket_dir)) - + browser_env = {**os.environ} # Ensure PATH includes Hermes-managed Node first, then standard system dirs. @@ -843,14 +829,7 @@ def _run_browser_command( browser_env["PATH"] = ":".join(path_parts) browser_env["AGENT_BROWSER_SOCKET_DIR"] = task_socket_dir - - node_path = shutil.which("node", path=browser_env["PATH"]) - if node_path: - logger.debug("browser subprocess using node at: %s", node_path) - else: - logger.warning("node not found in browser PATH: %s", - browser_env["PATH"]) - + result = subprocess.run( cmd_parts, capture_output=True, @@ -858,13 +837,12 @@ def _run_browser_command( timeout=timeout, env=browser_env, ) - + # Log stderr for diagnostics — use warning level on failure so it's visible if result.stderr and result.stderr.strip(): level = logging.WARNING if result.returncode != 0 else logging.DEBUG - logger.log(level, "browser '%s' stderr: %s", - command, result.stderr.strip()[:500]) - + logger.log(level, "browser '%s' stderr: %s", command, result.stderr.strip()[:500]) + # Log empty output as warning — common sign of broken agent-browser if not result.stdout.strip() and result.returncode == 0: logger.warning("browser '%s' returned empty stdout with rc=0. 
" @@ -877,6 +855,7 @@ def _run_browser_command( if stdout_text: try: parsed = json.loads(stdout_text) + # Warn if snapshot came back empty (common sign of daemon/CDP issues) if command == "snapshot" and parsed.get("success"): snap_data = parsed.get("data", {}) if not snap_data.get("snapshot") and not snap_data.get("refs"): @@ -894,8 +873,7 @@ def _run_browser_command( combined_text = "\n".join( part for part in [stdout_text, stderr_text] if part ) - recovered_path = _extract_screenshot_path_from_text( - combined_text) + recovered_path = _extract_screenshot_path_from_text(combined_text) if recovered_path and Path(recovered_path).exists(): logger.info( @@ -914,17 +892,15 @@ def _run_browser_command( "success": False, "error": f"Non-JSON output from agent-browser for '{command}': {raw}" } - + # Check for errors if result.returncode != 0: - error_msg = result.stderr.strip( - ) if result.stderr else f"Command failed with code {result.returncode}" - logger.warning("browser '%s' failed (rc=%s): %s", - command, result.returncode, error_msg[:300]) + error_msg = result.stderr.strip() if result.stderr else f"Command failed with code {result.returncode}" + logger.warning("browser '%s' failed (rc=%s): %s", command, result.returncode, error_msg[:300]) return {"success": False, "error": error_msg} - + return {"success": True, "data": {}} - + except subprocess.TimeoutExpired: logger.warning("browser '%s' timed out after %ds (task=%s, socket_dir=%s)", command, timeout, task_id, task_socket_dir) @@ -984,17 +960,17 @@ def _extract_relevant_content( def _truncate_snapshot(snapshot_text: str, max_chars: int = 8000) -> str: """ Simple truncation fallback for snapshots. - + Args: snapshot_text: The snapshot text to truncate max_chars: Maximum characters to keep - + Returns: Truncated text with indicator if truncated """ if len(snapshot_text) <= max_chars: return snapshot_text - + return snapshot_text[:max_chars] + "\n\n[... 
content truncated ...]" @@ -1005,39 +981,39 @@ def _truncate_snapshot(snapshot_text: str, max_chars: int = 8000) -> str: def browser_navigate(url: str, task_id: Optional[str] = None) -> str: """ Navigate to a URL in the browser. - + Args: url: The URL to navigate to task_id: Task identifier for session isolation - + Returns: JSON string with navigation result (includes stealth features info on first nav) """ effective_task_id = task_id or "default" - + # Get session info to check if this is a new session # (will create one with features logged if not exists) session_info = _get_session_info(effective_task_id) is_first_nav = session_info.get("_first_nav", True) - + # Auto-start recording if configured and this is first navigation if is_first_nav: session_info["_first_nav"] = False _maybe_start_recording(effective_task_id) - + result = _run_browser_command(effective_task_id, "open", [url], timeout=60) - + if result.get("success"): data = result.get("data", {}) title = data.get("title", "") final_url = data.get("url", url) - + response = { "success": True, "url": final_url, "title": title } - + # Detect common "blocked" page patterns from title/url blocked_patterns = [ "access denied", "access to this page has been denied", @@ -1047,7 +1023,7 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: "just a moment", "attention required" ] title_lower = title.lower() - + if any(pattern in title_lower for pattern in blocked_patterns): response["bot_detection_warning"] = ( f"Page title '{title}' suggests bot detection. The site may have blocked this request. " @@ -1055,7 +1031,7 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: "3) Enable advanced stealth (BROWSERBASE_ADVANCED_STEALTH=true, requires Scale plan), " "4) Some sites have very aggressive bot detection that may be unavoidable." 
) - + # Include feature info on first navigation so model knows what's active if is_first_nav and "features" in session_info: features = session_info["features"] @@ -1066,7 +1042,7 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: "Consider upgrading Browserbase plan for proxy support." ) response["stealth_features"] = active_features - + return json.dumps(response, ensure_ascii=False) else: return json.dumps({ @@ -1082,41 +1058,41 @@ def browser_snapshot( ) -> str: """ Get a text-based snapshot of the current page's accessibility tree. - + Args: full: If True, return complete snapshot. If False, return compact view. task_id: Task identifier for session isolation user_task: The user's current task (for task-aware extraction) - + Returns: JSON string with page snapshot """ effective_task_id = task_id or "default" - + # Build command args based on full flag args = [] if not full: args.extend(["-c"]) # Compact mode - + result = _run_browser_command(effective_task_id, "snapshot", args) - + if result.get("success"): data = result.get("data", {}) snapshot_text = data.get("snapshot", "") refs = data.get("refs", {}) - + # Check if snapshot needs summarization if len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD and user_task: snapshot_text = _extract_relevant_content(snapshot_text, user_task) elif len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD: snapshot_text = _truncate_snapshot(snapshot_text) - + response = { "success": True, "snapshot": snapshot_text, "element_count": len(refs) if refs else 0 } - + return json.dumps(response, ensure_ascii=False) else: return json.dumps({ @@ -1128,22 +1104,22 @@ def browser_snapshot( def browser_click(ref: str, task_id: Optional[str] = None) -> str: """ Click on an element. 
- + Args: ref: Element reference (e.g., "@e5") task_id: Task identifier for session isolation - + Returns: JSON string with click result """ effective_task_id = task_id or "default" - + # Ensure ref starts with @ if not ref.startswith("@"): ref = f"@{ref}" - + result = _run_browser_command(effective_task_id, "click", [ref]) - + if result.get("success"): return json.dumps({ "success": True, @@ -1159,24 +1135,24 @@ def browser_click(ref: str, task_id: Optional[str] = None) -> str: def browser_type(ref: str, text: str, task_id: Optional[str] = None) -> str: """ Type text into an input field. - + Args: ref: Element reference (e.g., "@e3") text: Text to type task_id: Task identifier for session isolation - + Returns: JSON string with type result """ effective_task_id = task_id or "default" - + # Ensure ref starts with @ if not ref.startswith("@"): ref = f"@{ref}" - + # Use fill command (clears then types) result = _run_browser_command(effective_task_id, "fill", [ref, text]) - + if result.get("success"): return json.dumps({ "success": True, @@ -1193,25 +1169,25 @@ def browser_type(ref: str, text: str, task_id: Optional[str] = None) -> str: def browser_scroll(direction: str, task_id: Optional[str] = None) -> str: """ Scroll the page. - + Args: direction: "up" or "down" task_id: Task identifier for session isolation - + Returns: JSON string with scroll result """ effective_task_id = task_id or "default" - + # Validate direction if direction not in ["up", "down"]: return json.dumps({ "success": False, "error": f"Invalid direction '{direction}'. Use 'up' or 'down'." }, ensure_ascii=False) - + result = _run_browser_command(effective_task_id, "scroll", [direction]) - + if result.get("success"): return json.dumps({ "success": True, @@ -1227,16 +1203,16 @@ def browser_scroll(direction: str, task_id: Optional[str] = None) -> str: def browser_back(task_id: Optional[str] = None) -> str: """ Navigate back in browser history. 
- + Args: task_id: Task identifier for session isolation - + Returns: JSON string with navigation result """ effective_task_id = task_id or "default" result = _run_browser_command(effective_task_id, "back", []) - + if result.get("success"): data = result.get("data", {}) return json.dumps({ @@ -1253,17 +1229,17 @@ def browser_back(task_id: Optional[str] = None) -> str: def browser_press(key: str, task_id: Optional[str] = None) -> str: """ Press a keyboard key. - + Args: key: Key to press (e.g., "Enter", "Tab") task_id: Task identifier for session isolation - + Returns: JSON string with key press result """ effective_task_id = task_id or "default" result = _run_browser_command(effective_task_id, "press", [key]) - + if result.get("success"): return json.dumps({ "success": True, @@ -1303,27 +1279,25 @@ def browser_close(task_id: Optional[str] = None) -> str: def browser_console(clear: bool = False, task_id: Optional[str] = None) -> str: """Get browser console messages and JavaScript errors. - + Returns both console output (log/warn/error/info from the page's JS) and uncaught exceptions (crashes, unhandled promise rejections). 
- + Args: clear: If True, clear the message/error buffers after reading task_id: Task identifier for session isolation - + Returns: JSON string with console messages and JS errors """ effective_task_id = task_id or "default" - + console_args = ["--clear"] if clear else [] error_args = ["--clear"] if clear else [] - - console_result = _run_browser_command( - effective_task_id, "console", console_args) - errors_result = _run_browser_command( - effective_task_id, "errors", error_args) - + + console_result = _run_browser_command(effective_task_id, "console", console_args) + errors_result = _run_browser_command(effective_task_id, "errors", error_args) + messages = [] if console_result.get("success"): for msg in console_result.get("data", {}).get("messages", []): @@ -1332,7 +1306,7 @@ def browser_console(clear: bool = False, task_id: Optional[str] = None) -> str: "text": msg.get("text", ""), "source": "console", }) - + errors = [] if errors_result.get("success"): for err in errors_result.get("data", {}).get("errors", []): @@ -1340,7 +1314,7 @@ def browser_console(clear: bool = False, task_id: Optional[str] = None) -> str: "message": err.get("message", ""), "source": "exception", }) - + return json.dumps({ "success": True, "console_messages": messages, @@ -1355,38 +1329,32 @@ def _maybe_start_recording(task_id: str): if task_id in _recording_sessions: return try: - hermes_home = Path(os.environ.get( - "HERMES_HOME", Path.home() / ".hermes")) + hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) config_path = hermes_home / "config.yaml" record_enabled = False if config_path.exists(): import yaml with open(config_path) as f: cfg = yaml.safe_load(f) or {} - record_enabled = cfg.get("browser", {}).get( - "record_sessions", False) - + record_enabled = cfg.get("browser", {}).get("record_sessions", False) + if not record_enabled: return - + recordings_dir = hermes_home / "browser_recordings" recordings_dir.mkdir(parents=True, exist_ok=True) 
_cleanup_old_recordings(max_age_hours=72) - + import time timestamp = time.strftime("%Y%m%d_%H%M%S") - recording_path = recordings_dir / \ - f"session_{timestamp}_{task_id[:16]}.webm" - - result = _run_browser_command( - task_id, "record", ["start", str(recording_path)]) + recording_path = recordings_dir / f"session_{timestamp}_{task_id[:16]}.webm" + + result = _run_browser_command(task_id, "record", ["start", str(recording_path)]) if result.get("success"): _recording_sessions.add(task_id) - logger.info("Auto-recording browser session %s to %s", - task_id, recording_path) + logger.info("Auto-recording browser session %s to %s", task_id, recording_path) else: - logger.debug("Could not start auto-recording: %s", - result.get("error")) + logger.debug("Could not start auto-recording: %s", result.get("error")) except Exception as e: logger.debug("Auto-recording setup failed: %s", e) @@ -1399,8 +1367,7 @@ def _maybe_stop_recording(task_id: str): result = _run_browser_command(task_id, "record", ["stop"]) if result.get("success"): path = result.get("data", {}).get("path", "") - logger.info( - "Saved browser recording for session %s: %s", task_id, path) + logger.info("Saved browser recording for session %s: %s", task_id, path) except Exception as e: logger.debug("Could not stop recording for %s: %s", task_id, e) finally: @@ -1410,15 +1377,15 @@ def _maybe_stop_recording(task_id: str): def browser_get_images(task_id: Optional[str] = None) -> str: """ Get all images on the current page. 
- + Args: task_id: Task identifier for session isolation - + Returns: JSON string with list of images (src and alt) """ effective_task_id = task_id or "default" - + # Use eval to run JavaScript that extracts images js_code = """JSON.stringify( [...document.images].map(img => ({ @@ -1428,20 +1395,20 @@ def browser_get_images(task_id: Optional[str] = None) -> str: height: img.naturalHeight })).filter(img => img.src && !img.src.startsWith('data:')) )""" - + result = _run_browser_command(effective_task_id, "eval", [js_code]) - + if result.get("success"): data = result.get("data", {}) raw_result = data.get("result", "[]") - + try: # Parse the JSON string returned by JavaScript if isinstance(raw_result, str): images = json.loads(raw_result) else: images = raw_result - + return json.dumps({ "success": True, "images": images, @@ -1464,40 +1431,40 @@ def browser_get_images(task_id: Optional[str] = None) -> str: def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str: """ Take a screenshot of the current page and analyze it with vision AI. - + This tool captures what's visually displayed in the browser and sends it to Gemini for analysis. Useful for understanding visual content that the text-based snapshot may not capture (CAPTCHAs, verification challenges, images, complex layouts, etc.). - + The screenshot is saved persistently and its file path is returned alongside the analysis, so it can be shared with users via MEDIA: in the response. 
- + Args: question: What you want to know about the page visually annotate: If True, overlay numbered [N] labels on interactive elements task_id: Task identifier for session isolation - + Returns: JSON string with vision analysis results and screenshot_path """ import base64 import uuid as uuid_mod from pathlib import Path - + effective_task_id = task_id or "default" + # Save screenshot to persistent location so it can be shared with users hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) screenshots_dir = hermes_home / "browser_screenshots" - screenshot_path = screenshots_dir / \ - f"browser_screenshot_{uuid_mod.uuid4().hex}.png" - + screenshot_path = screenshots_dir / f"browser_screenshot_{uuid_mod.uuid4().hex}.png" + try: screenshots_dir.mkdir(parents=True, exist_ok=True) - + # Prune old screenshots (older than 24 hours) to prevent unbounded disk growth _cleanup_old_screenshots(screenshots_dir, max_age_hours=24) - + # Take screenshot using agent-browser screenshot_args = [] if annotate: @@ -1505,12 +1472,12 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] screenshot_args.append("--full") screenshot_args.append(str(screenshot_path)) result = _run_browser_command( - effective_task_id, - "screenshot", + effective_task_id, + "screenshot", screenshot_args, timeout=30 ) - + if not result.get("success"): error_detail = result.get("error", "Unknown error") mode = "local" if _is_local_mode() else "cloud" @@ -1535,12 +1502,12 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] f"or a stale daemon process." 
), }, ensure_ascii=False) - + # Read and convert to base64 image_data = screenshot_path.read_bytes() image_base64 = base64.b64encode(image_data).decode("ascii") data_url = f"data:image/png;base64,{image_base64}" - + vision_prompt = ( f"You are analyzing a screenshot of a web browser.\n\n" f"User's question: {question}\n\n" @@ -1571,7 +1538,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] if vision_model: call_kwargs["model"] = vision_model response = call_llm(**call_kwargs) - + analysis = response.choices[0].message.content response_data = { "success": True, @@ -1582,15 +1549,14 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] if annotate and result.get("data", {}).get("annotations"): response_data["annotations"] = result["data"]["annotations"] return json.dumps(response_data, ensure_ascii=False) - + except Exception as e: # Keep the screenshot if it was captured successfully — the failure is # in the LLM vision analysis, not the capture. Deleting a valid # screenshot loses evidence the user might need. The 24-hour cleanup # in _cleanup_old_screenshots prevents unbounded disk growth. logger.warning("browser_vision failed: %s", e, exc_info=True) - error_info = {"success": False, - "error": f"Error during vision analysis: {str(e)}"} + error_info = {"success": False, "error": f"Error during vision analysis: {str(e)}"} if screenshot_path.exists(): error_info["screenshot_path"] = str(screenshot_path) error_info["note"] = "Screenshot was captured but vision analysis failed. You can still share it via MEDIA:." 
@@ -1625,8 +1591,7 @@ def _cleanup_old_recordings(max_age_hours=72): """Remove browser recordings older than max_age_hours to prevent disk bloat.""" import time try: - hermes_home = Path(os.environ.get( - "HERMES_HOME", Path.home() / ".hermes")) + hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) recordings_dir = hermes_home / "browser_recordings" if not recordings_dir.exists(): return @@ -1648,15 +1613,15 @@ def _cleanup_old_recordings(max_age_hours=72): def _close_browserbase_session(session_id: str, api_key: str, project_id: str) -> bool: """ Close a Browserbase session immediately via the API. - + Uses POST /v1/sessions/{id} with status=REQUEST_RELEASE to immediately terminate the session without waiting for keepAlive timeout. - + Args: session_id: The Browserbase session ID api_key: Browserbase API key project_id: Browserbase project ID - + Returns: True if session was successfully closed, False otherwise """ @@ -1674,16 +1639,14 @@ def _close_browserbase_session(session_id: str, api_key: str, project_id: str) - }, timeout=10 ) - + if response.status_code in (200, 201, 204): - logger.debug( - "Successfully closed BrowserBase session %s", session_id) + logger.debug("Successfully closed BrowserBase session %s", session_id) return True else: - logger.warning("Failed to close session %s: HTTP %s - %s", - session_id, response.status_code, response.text[:200]) + logger.warning("Failed to close session %s: HTTP %s - %s", session_id, response.status_code, response.text[:200]) return False - + except Exception as e: logger.error("Exception closing session %s: %s", session_id, e) return False @@ -1692,64 +1655,57 @@ def _close_browserbase_session(session_id: str, api_key: str, project_id: str) - def cleanup_browser(task_id: Optional[str] = None) -> None: """ Clean up browser session for a task. - + Called automatically when a task completes or when inactivity timeout is reached. Closes both the agent-browser session and the Browserbase session. 
- + Args: task_id: Task identifier to clean up """ if task_id is None: task_id = "default" - + logger.debug("cleanup_browser called for task_id: %s", task_id) logger.debug("Active sessions: %s", list(_active_sessions.keys())) - + # Check if session exists (under lock), but don't remove yet - # _run_browser_command needs it to build the close command. with _cleanup_lock: session_info = _active_sessions.get(task_id) - + if session_info: bb_session_id = session_info.get("bb_session_id", "unknown") - logger.debug("Found session for task %s: bb_session_id=%s", - task_id, bb_session_id) - + logger.debug("Found session for task %s: bb_session_id=%s", task_id, bb_session_id) + # Stop auto-recording before closing (saves the file) _maybe_stop_recording(task_id) - + # Try to close via agent-browser first (needs session in _active_sessions) try: _run_browser_command(task_id, "close", [], timeout=10) - logger.debug( - "agent-browser close command completed for task %s", task_id) + logger.debug("agent-browser close command completed for task %s", task_id) except Exception as e: - logger.warning( - "agent-browser close failed for task %s: %s", task_id, e) - + logger.warning("agent-browser close failed for task %s: %s", task_id, e) + # Now remove from tracking under lock with _cleanup_lock: _active_sessions.pop(task_id, None) _session_last_activity.pop(task_id, None) - + # Cloud mode: close the Browserbase session via API if bb_session_id and not _is_local_mode(): try: config = _get_browserbase_config() - success = _close_browserbase_session( - bb_session_id, config["api_key"], config["project_id"]) + success = _close_browserbase_session(bb_session_id, config["api_key"], config["project_id"]) if not success: - logger.warning( - "Could not close BrowserBase session %s", bb_session_id) + logger.warning("Could not close BrowserBase session %s", bb_session_id) except Exception as e: - logger.error( - "Exception during BrowserBase session close: %s", e) - + logger.error("Exception 
during BrowserBase session close: %s", e) + # Kill the daemon process and clean up socket directory session_name = session_info.get("session_name", "") if session_name: - socket_dir = os.path.join( - _socket_safe_tmpdir(), f"agent-browser-{session_name}") + socket_dir = os.path.join(_socket_safe_tmpdir(), f"agent-browser-{session_name}") if os.path.exists(socket_dir): # agent-browser writes {session}.pid in the socket dir pid_file = os.path.join(socket_dir, f"{session_name}.pid") @@ -1757,13 +1713,11 @@ def cleanup_browser(task_id: Optional[str] = None) -> None: try: daemon_pid = int(Path(pid_file).read_text().strip()) os.kill(daemon_pid, signal.SIGTERM) - logger.debug("Killed daemon pid %s for %s", - daemon_pid, session_name) + logger.debug("Killed daemon pid %s for %s", daemon_pid, session_name) except (ProcessLookupError, ValueError, PermissionError, OSError): - logger.debug( - "Could not kill daemon pid for %s (already dead or inaccessible)", session_name) + logger.debug("Could not kill daemon pid for %s (already dead or inaccessible)", session_name) shutil.rmtree(socket_dir, ignore_errors=True) - + logger.debug("Removed task %s from active sessions", task_id) else: logger.debug("No active session found for task_id: %s", task_id) @@ -1772,7 +1726,7 @@ def cleanup_browser(task_id: Optional[str] = None) -> None: def cleanup_all_browsers() -> None: """ Clean up all active browser sessions. - + Useful for cleanup on shutdown. """ with _cleanup_lock: @@ -1784,7 +1738,7 @@ def cleanup_all_browsers() -> None: def get_active_browser_sessions() -> Dict[str, Dict[str, str]]: """ Get information about active browser sessions. - + Returns: Dict mapping task_id to session info (session_name, bb_session_id, cdp_url) """ @@ -1805,7 +1759,7 @@ def check_browser_requirements() -> bool: In **cloud mode** (BROWSERBASE_API_KEY set): the CLI *and* both ``BROWSERBASE_API_KEY`` / ``BROWSERBASE_PROJECT_ID`` must be present. 
- + Returns: True if all requirements are met, False otherwise """ @@ -1838,7 +1792,7 @@ if __name__ == "__main__": mode = "local" if _is_local_mode() else "cloud (Browserbase)" print(f" Mode: {mode}") - + # Check requirements if check_browser_requirements(): print("✅ All requirements met") @@ -1848,19 +1802,18 @@ if __name__ == "__main__": _find_agent_browser() except FileNotFoundError: print(" - agent-browser CLI not found") - print( - " Install: npm install -g agent-browser && agent-browser install --with-deps") + print(" Install: npm install -g agent-browser && agent-browser install --with-deps") if not _is_local_mode(): if not os.environ.get("BROWSERBASE_API_KEY"): print(" - BROWSERBASE_API_KEY not set (required for cloud mode)") if not os.environ.get("BROWSERBASE_PROJECT_ID"): print(" - BROWSERBASE_PROJECT_ID not set (required for cloud mode)") print(" Tip: unset BROWSERBASE_API_KEY to use free local mode instead") - + print("\n📋 Available Browser Tools:") for schema in BROWSER_TOOL_SCHEMAS: print(f" 🔹 {schema['name']}: {schema['description'][:60]}...") - + print("\n💡 Usage:") print(" from tools.browser_tool import browser_navigate, browser_snapshot") print(" result = browser_navigate('https://example.com', task_id='my_task')") @@ -1870,6 +1823,7 @@ if __name__ == "__main__": # --------------------------------------------------------------------------- # Registry # --------------------------------------------------------------------------- +from tools.registry import registry _BROWSER_SCHEMA_MAP = {s["name"]: s for s in BROWSER_TOOL_SCHEMAS} @@ -1877,8 +1831,7 @@ registry.register( name="browser_navigate", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_navigate"], - handler=lambda args, **kw: browser_navigate( - url=args.get("url", ""), task_id=kw.get("task_id")), + handler=lambda args, **kw: browser_navigate(url=args.get("url", ""), task_id=kw.get("task_id")), check_fn=check_browser_requirements, ) registry.register( @@ -1893,8 +1846,7 @@ 
registry.register( name="browser_click", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_click"], - handler=lambda args, **kw: browser_click(** - args, task_id=kw.get("task_id")), + handler=lambda args, **kw: browser_click(**args, task_id=kw.get("task_id")), check_fn=check_browser_requirements, ) registry.register( @@ -1908,8 +1860,7 @@ registry.register( name="browser_scroll", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_scroll"], - handler=lambda args, **kw: browser_scroll(** - args, task_id=kw.get("task_id")), + handler=lambda args, **kw: browser_scroll(**args, task_id=kw.get("task_id")), check_fn=check_browser_requirements, ) registry.register( @@ -1923,8 +1874,7 @@ registry.register( name="browser_press", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_press"], - handler=lambda args, **kw: browser_press( - key=args.get("key", ""), task_id=kw.get("task_id")), + handler=lambda args, **kw: browser_press(key=args.get("key", ""), task_id=kw.get("task_id")), check_fn=check_browser_requirements, ) registry.register( @@ -1945,20 +1895,13 @@ registry.register( name="browser_vision", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_vision"], - handler=lambda args, **kw: browser_vision( - question=args.get("question", ""), - annotate=args.get("annotate", False), - task_id=kw.get("task_id"), - ), + handler=lambda args, **kw: browser_vision(question=args.get("question", ""), annotate=args.get("annotate", False), task_id=kw.get("task_id")), check_fn=check_browser_requirements, ) registry.register( name="browser_console", toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_console"], - handler=lambda args, **kw: browser_console( - clear=args.get("clear", False), - task_id=kw.get("task_id"), - ), + handler=lambda args, **kw: browser_console(clear=args.get("clear", False), task_id=kw.get("task_id")), check_fn=check_browser_requirements, ) From 5319bb6ac4b8280cdf8c257b2fd97f746ada31a7 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 
2026 11:26:18 -0700 Subject: [PATCH 20/34] fix: tighten memory and session recall guidance Remove diary-style memory framing from the system prompt and memory tool schema, explicitly steer task/session logs to session_search, and clarify that session_search is for cross-session recall after checking the current conversation first. Add regression tests for the updated guidance text. --- agent/prompt_builder.py | 12 +++++++----- tests/agent/test_prompt_builder.py | 20 ++++++++++++++++++++ tests/tools/test_memory_tool.py | 15 +++++++++++++++ tests/tools/test_session_search.py | 12 ++++++++++++ tools/memory_tool.py | 19 ++++++++++--------- tools/session_search_tool.py | 4 ++-- 6 files changed, 66 insertions(+), 16 deletions(-) diff --git a/agent/prompt_builder.py b/agent/prompt_builder.py index b8371f79e..f1dbcf758 100644 --- a/agent/prompt_builder.py +++ b/agent/prompt_builder.py @@ -71,15 +71,17 @@ DEFAULT_AGENT_IDENTITY = ( ) MEMORY_GUIDANCE = ( - "You have persistent memory across sessions. Proactively save important things " - "you learn (user preferences, environment details, useful approaches) and do " - "(like a diary!) using the memory tool -- don't wait to be asked." + "You have persistent memory across sessions. Save durable facts using the memory " + "tool: user preferences, environment details, tool quirks, and stable conventions. " + "Memory is injected into every turn, so keep it compact. Do NOT save task progress, " + "session outcomes, or completed-work logs to memory; use session_search to recall " + "those from past transcripts." ) SESSION_SEARCH_GUIDANCE = ( "When the user references something from a past conversation or you suspect " - "relevant prior context exists, use session_search to recall it before asking " - "them to repeat themselves." + "relevant cross-session context exists, use session_search to recall it before " + "asking them to repeat themselves." 
) SKILLS_GUIDANCE = ( diff --git a/tests/agent/test_prompt_builder.py b/tests/agent/test_prompt_builder.py index 4a09b4a9f..b5c10bee6 100644 --- a/tests/agent/test_prompt_builder.py +++ b/tests/agent/test_prompt_builder.py @@ -15,10 +15,30 @@ from agent.prompt_builder import ( build_context_files_prompt, CONTEXT_FILE_MAX_CHARS, DEFAULT_AGENT_IDENTITY, + MEMORY_GUIDANCE, + SESSION_SEARCH_GUIDANCE, PLATFORM_HINTS, ) +# ========================================================================= +# Guidance constants +# ========================================================================= + + +class TestGuidanceConstants: + def test_memory_guidance_discourages_task_logs(self): + assert "durable facts" in MEMORY_GUIDANCE + assert "Do NOT save task progress" in MEMORY_GUIDANCE + assert "session_search" in MEMORY_GUIDANCE + assert "like a diary" not in MEMORY_GUIDANCE + assert ">80%" not in MEMORY_GUIDANCE + + def test_session_search_guidance_is_simple_cross_session_recall(self): + assert "relevant cross-session context exists" in SESSION_SEARCH_GUIDANCE + assert "recent turns of the current session" not in SESSION_SEARCH_GUIDANCE + + # ========================================================================= # Context injection scanning # ========================================================================= diff --git a/tests/tools/test_memory_tool.py b/tests/tools/test_memory_tool.py index 0ed3b12e6..48cb6a83c 100644 --- a/tests/tools/test_memory_tool.py +++ b/tests/tools/test_memory_tool.py @@ -9,9 +9,24 @@ from tools.memory_tool import ( memory_tool, _scan_memory_content, ENTRY_DELIMITER, + MEMORY_SCHEMA, ) +# ========================================================================= +# Tool schema guidance +# ========================================================================= + +class TestMemorySchema: + def test_discourages_diary_style_task_logs(self): + description = MEMORY_SCHEMA["description"] + assert "Do NOT save task progress" in description + 
assert "session_search" in description + assert "like a diary" not in description + assert "temporary task state" in description + assert ">80%" not in description + + # ========================================================================= # Security scanning # ========================================================================= diff --git a/tests/tools/test_session_search.py b/tests/tools/test_session_search.py index c36247148..0d7414764 100644 --- a/tests/tools/test_session_search.py +++ b/tests/tools/test_session_search.py @@ -9,9 +9,21 @@ from tools.session_search_tool import ( _format_conversation, _truncate_around_matches, MAX_SESSION_CHARS, + SESSION_SEARCH_SCHEMA, ) +# ========================================================================= +# Tool schema guidance +# ========================================================================= + +class TestSessionSearchSchema: + def test_keeps_cross_session_recall_guidance_without_current_session_nudge(self): + description = SESSION_SEARCH_SCHEMA["description"] + assert "past conversations" in description + assert "recent turns of the current session" not in description + + # ========================================================================= # _format_timestamp # ========================================================================= diff --git a/tools/memory_tool.py b/tools/memory_tool.py index 2ce763124..b921a84f7 100644 --- a/tools/memory_tool.py +++ b/tools/memory_tool.py @@ -435,24 +435,25 @@ def check_memory_requirements() -> bool: MEMORY_SCHEMA = { "name": "memory", "description": ( - "Save important information to persistent memory that survives across sessions. " - "Your memory appears in your system prompt at session start -- it's how you " - "remember things about the user and your environment between conversations.\n\n" + "Save durable information to persistent memory that survives across sessions. 
" + "Memory is injected into future turns, so keep it compact and focused on facts " + "that will still matter later.\n\n" "WHEN TO SAVE (do this proactively, don't wait to be asked):\n" "- User shares a preference, habit, or personal detail (name, role, timezone, coding style)\n" "- You discover something about the environment (OS, installed tools, project structure)\n" "- User corrects you or says 'remember this' / 'don't do that again'\n" "- You learn a convention, API quirk, or workflow specific to this user's setup\n" - "- You completed something - log it like a diary entry\n" - "- After completing a complex task, save a brief note about what was done\n\n" - "- If you've discovered a new way to do something, solved a problem that could be necessary later, save it as a skill with the skill tool\n\n" + "- You identify a stable fact that will be useful again in future sessions\n\n" + "Do NOT save task progress, session outcomes, completed-work logs, or temporary TODO " + "state to memory; use session_search to recall those from past transcripts.\n" + "If you've discovered a new way to do something, solved a problem that could be " + "necessary later, save it as a skill with the skill tool.\n\n" "TWO TARGETS:\n" "- 'user': who the user is -- name, role, preferences, communication style, pet peeves\n" "- 'memory': your notes -- environment facts, project conventions, tool quirks, lessons learned\n\n" "ACTIONS: add (new entry), replace (update existing -- old_text identifies it), " - "remove (delete -- old_text identifies it).\n" - "Capacity shown in system prompt. When >80%, consolidate entries before adding new ones.\n\n" - "SKIP: trivial/obvious info, things easily re-discovered, raw data dumps." + "remove (delete -- old_text identifies it).\n\n" + "SKIP: trivial/obvious info, things easily re-discovered, raw data dumps, and temporary task state." 
), "parameters": { "type": "object", diff --git a/tools/session_search_tool.py b/tools/session_search_tool.py index cd1b98fd5..f4143fa12 100644 --- a/tools/session_search_tool.py +++ b/tools/session_search_tool.py @@ -341,8 +341,8 @@ SESSION_SEARCH_SCHEMA = { "- The user references a project, person, or concept that seems familiar but isn't in memory\n" "- You want to check if you've solved a similar problem before\n" "- The user asks 'what did we do about X?' or 'how did we fix Y?'\n\n" - "Don't hesitate to search -- it's fast and cheap. Better to search and confirm " - "than to guess or ask the user to repeat themselves.\n\n" + "Don't hesitate to search when it is actually cross-session -- it's fast and cheap. " + "Better to search and confirm than to guess or ask the user to repeat themselves.\n\n" "Search syntax: keywords joined with OR for broad recall (elevenlabs OR baseten OR funding), " "phrases for exact match (\"docker networking\"), boolean (python NOT java), prefix (deploy*). " "IMPORTANT: Use OR between keywords for best results — FTS5 defaults to AND which misses " From f10e26f731ece83e750ccfec86f46753e918e827 Mon Sep 17 00:00:00 2001 From: teyrebaz33 Date: Thu, 12 Mar 2026 12:35:43 +0300 Subject: [PATCH 21/34] fix: auto-enable systemd linger during gateway install on headless servers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #1005 Without linger, user-level systemd services stop when the SSH session ends — even though systemctl --user status shows active (running). 
Changes to systemd_install(): - Try loginctl enable-linger automatically (succeeds when the process has the required privileges) - If loginctl fails (no privileges), print a clear, copy-pasteable warning with the exact command the user must run New helper: _ensure_linger_enabled() - Fast path: checks /var/lib/systemd/linger/ (no subprocess) - Auto-enable: loginctl enable-linger - Fallback: actionable warning with sudo command + restart instructions Tests: 4 new tests in TestEnsureLingerEnabled, 205 passed total --- hermes_cli/gateway.py | 62 +++++++++++- tests/hermes_cli/test_gateway.py | 6 +- tests/hermes_cli/test_gateway_linger.py | 120 ++++++++++++++++++++++++ 3 files changed, 183 insertions(+), 5 deletions(-) create mode 100644 tests/hermes_cli/test_gateway_linger.py diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 4d3ed8845..661104f07 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -251,7 +251,6 @@ StandardError=journal WantedBy=default.target """ - def _normalize_service_definition(text: str) -> str: return "\n".join(line.rstrip() for line in text.strip().splitlines()) @@ -279,6 +278,65 @@ def refresh_systemd_unit_if_needed() -> bool: return True + +def _print_linger_enable_warning(username: str, detail: str | None = None) -> None: + print() + print("⚠ Linger not enabled — gateway may stop when you close this terminal.") + if detail: + print(f" Auto-enable failed: {detail}") + print() + print(" On headless servers (VPS, cloud instances) run:") + print(f" sudo loginctl enable-linger {username}") + print() + print(" Then restart the gateway:") + print(f" systemctl --user restart {SERVICE_NAME}.service") + print() + + + +def _ensure_linger_enabled() -> None: + """Enable linger when possible so the user gateway survives logout.""" + if not is_linux(): + return + + import getpass + import shutil + + username = getpass.getuser() + linger_file = Path(f"/var/lib/systemd/linger/{username}") + if linger_file.exists(): + print("✓ 
Systemd linger is enabled (service survives logout)") + return + + linger_enabled, linger_detail = get_systemd_linger_status() + if linger_enabled is True: + print("✓ Systemd linger is enabled (service survives logout)") + return + + if not shutil.which("loginctl"): + _print_linger_enable_warning(username, linger_detail or "loginctl not found") + return + + print("Enabling linger so the gateway survives SSH logout...") + try: + result = subprocess.run( + ["loginctl", "enable-linger", username], + capture_output=True, + text=True, + check=False, + ) + except Exception as e: + _print_linger_enable_warning(username, str(e)) + return + + if result.returncode == 0: + print("✓ Linger enabled — gateway will persist after logout") + return + + detail = (result.stderr or result.stdout or f"exit {result.returncode}").strip() + _print_linger_enable_warning(username, detail or linger_detail) + + def systemd_install(force: bool = False): unit_path = get_systemd_unit_path() @@ -302,7 +360,7 @@ def systemd_install(force: bool = False): print(f" hermes gateway status # Check status") print(f" journalctl --user -u {SERVICE_NAME} -f # View logs") print() - print_systemd_linger_guidance() + _ensure_linger_enabled() def systemd_uninstall(): subprocess.run(["systemctl", "--user", "stop", SERVICE_NAME], check=False) diff --git a/tests/hermes_cli/test_gateway.py b/tests/hermes_cli/test_gateway.py index a39b0c641..ad987d575 100644 --- a/tests/hermes_cli/test_gateway.py +++ b/tests/hermes_cli/test_gateway.py @@ -59,15 +59,16 @@ def test_systemd_install_checks_linger_status(monkeypatch, tmp_path, capsys): unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service" monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda: unit_path) - monkeypatch.setattr(gateway, "get_systemd_linger_status", lambda: (False, "")) calls = [] + helper_calls = [] def fake_run(cmd, check=False, **kwargs): calls.append((cmd, check)) return SimpleNamespace(returncode=0, stdout="", stderr="") 
monkeypatch.setattr(gateway.subprocess, "run", fake_run) + monkeypatch.setattr(gateway, "_ensure_linger_enabled", lambda: helper_calls.append(True)) gateway.systemd_install(force=False) @@ -77,6 +78,5 @@ def test_systemd_install_checks_linger_status(monkeypatch, tmp_path, capsys): ["systemctl", "--user", "daemon-reload"], ["systemctl", "--user", "enable", gateway.SERVICE_NAME], ] + assert helper_calls == [True] assert "Service installed and enabled" in out - assert "Systemd linger is disabled" in out - assert "loginctl enable-linger" in out diff --git a/tests/hermes_cli/test_gateway_linger.py b/tests/hermes_cli/test_gateway_linger.py new file mode 100644 index 000000000..f1341d068 --- /dev/null +++ b/tests/hermes_cli/test_gateway_linger.py @@ -0,0 +1,120 @@ +"""Tests for gateway linger auto-enable behavior on headless Linux installs.""" + +from types import SimpleNamespace + +import hermes_cli.gateway as gateway + + +class TestEnsureLingerEnabled: + def test_linger_already_enabled_via_file(self, monkeypatch, capsys): + monkeypatch.setattr(gateway, "is_linux", lambda: True) + monkeypatch.setattr("getpass.getuser", lambda: "testuser") + monkeypatch.setattr(gateway, "Path", lambda _path: SimpleNamespace(exists=lambda: True)) + + calls = [] + monkeypatch.setattr(gateway.subprocess, "run", lambda *args, **kwargs: calls.append((args, kwargs))) + + gateway._ensure_linger_enabled() + + out = capsys.readouterr().out + assert "Systemd linger is enabled" in out + assert calls == [] + + def test_status_enabled_skips_enable(self, monkeypatch, capsys): + monkeypatch.setattr(gateway, "is_linux", lambda: True) + monkeypatch.setattr("getpass.getuser", lambda: "testuser") + monkeypatch.setattr(gateway, "Path", lambda _path: SimpleNamespace(exists=lambda: False)) + monkeypatch.setattr(gateway, "get_systemd_linger_status", lambda: (True, "")) + + calls = [] + monkeypatch.setattr(gateway.subprocess, "run", lambda *args, **kwargs: calls.append((args, kwargs))) + + 
gateway._ensure_linger_enabled() + + out = capsys.readouterr().out + assert "Systemd linger is enabled" in out + assert calls == [] + + def test_loginctl_success_enables_linger(self, monkeypatch, capsys): + monkeypatch.setattr(gateway, "is_linux", lambda: True) + monkeypatch.setattr("getpass.getuser", lambda: "testuser") + monkeypatch.setattr(gateway, "Path", lambda _path: SimpleNamespace(exists=lambda: False)) + monkeypatch.setattr(gateway, "get_systemd_linger_status", lambda: (False, "")) + monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/loginctl") + + run_calls = [] + + def fake_run(cmd, capture_output=False, text=False, check=False): + run_calls.append((cmd, capture_output, text, check)) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr(gateway.subprocess, "run", fake_run) + + gateway._ensure_linger_enabled() + + out = capsys.readouterr().out + assert "Enabling linger" in out + assert "Linger enabled" in out + assert run_calls == [(["loginctl", "enable-linger", "testuser"], True, True, False)] + + def test_missing_loginctl_shows_manual_guidance(self, monkeypatch, capsys): + monkeypatch.setattr(gateway, "is_linux", lambda: True) + monkeypatch.setattr("getpass.getuser", lambda: "testuser") + monkeypatch.setattr(gateway, "Path", lambda _path: SimpleNamespace(exists=lambda: False)) + monkeypatch.setattr(gateway, "get_systemd_linger_status", lambda: (None, "loginctl not found")) + monkeypatch.setattr("shutil.which", lambda name: None) + + calls = [] + monkeypatch.setattr(gateway.subprocess, "run", lambda *args, **kwargs: calls.append((args, kwargs))) + + gateway._ensure_linger_enabled() + + out = capsys.readouterr().out + assert "sudo loginctl enable-linger testuser" in out + assert "loginctl not found" in out + assert calls == [] + + def test_loginctl_failure_shows_manual_guidance(self, monkeypatch, capsys): + monkeypatch.setattr(gateway, "is_linux", lambda: True) + monkeypatch.setattr("getpass.getuser", lambda: 
"testuser") + monkeypatch.setattr(gateway, "Path", lambda _path: SimpleNamespace(exists=lambda: False)) + monkeypatch.setattr(gateway, "get_systemd_linger_status", lambda: (False, "")) + monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/loginctl") + monkeypatch.setattr( + gateway.subprocess, + "run", + lambda *args, **kwargs: SimpleNamespace(returncode=1, stdout="", stderr="Permission denied"), + ) + + gateway._ensure_linger_enabled() + + out = capsys.readouterr().out + assert "sudo loginctl enable-linger testuser" in out + assert "Permission denied" in out + + +def test_systemd_install_calls_linger_helper(monkeypatch, tmp_path, capsys): + unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service" + + monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda: unit_path) + + calls = [] + + def fake_run(cmd, check=False, **kwargs): + calls.append((cmd, check)) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + helper_calls = [] + monkeypatch.setattr(gateway.subprocess, "run", fake_run) + monkeypatch.setattr(gateway, "_ensure_linger_enabled", lambda: helper_calls.append(True)) + + gateway.systemd_install(force=False) + + out = capsys.readouterr().out + assert unit_path.exists() + assert [cmd for cmd, _ in calls] == [ + ["systemctl", "--user", "daemon-reload"], + ["systemctl", "--user", "enable", gateway.SERVICE_NAME], + ] + assert helper_calls == [True] + assert "Service installed and enabled" in out From f8e4233e67916e7524e9757df312bf46a4d57164 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Thu, 12 Mar 2026 15:19:31 +0300 Subject: [PATCH 22/34] fix(test): isolate codex provider tests from local env leaking API keys --- tests/test_cli_provider_resolution.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_cli_provider_resolution.py b/tests/test_cli_provider_resolution.py index 8c22dd7ac..a88307777 100644 --- a/tests/test_cli_provider_resolution.py +++ 
b/tests/test_cli_provider_resolution.py @@ -186,6 +186,8 @@ def test_codex_provider_replaces_incompatible_default_model(monkeypatch): monkeypatch.delenv("LLM_MODEL", raising=False) monkeypatch.delenv("OPENAI_MODEL", raising=False) + # Ensure local user config does not leak a model into the test + monkeypatch.setitem(cli.CLI_CONFIG, "model", {}) def _runtime_resolve(**kwargs): return { @@ -240,6 +242,11 @@ def test_codex_provider_uses_config_model(monkeypatch): monkeypatch.setattr("hermes_cli.runtime_provider.resolve_runtime_provider", _runtime_resolve) monkeypatch.setattr("hermes_cli.runtime_provider.format_runtime_provider_error", lambda exc: str(exc)) + # Prevent live API call from overriding the config model + monkeypatch.setattr( + "hermes_cli.codex_models.get_codex_model_ids", + lambda access_token=None: ["gpt-5.2-codex"], + ) shell = cli.HermesCLI(compact=True, max_turns=1) From 7f485f588e10a202001f07e2fcc5fa4db88b4d0b Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Thu, 12 Mar 2026 15:31:00 +0300 Subject: [PATCH 23/34] fix(test): provide required model config keys to prevent KeyError on base_url --- tests/test_cli_provider_resolution.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_cli_provider_resolution.py b/tests/test_cli_provider_resolution.py index a88307777..ffc5752ff 100644 --- a/tests/test_cli_provider_resolution.py +++ b/tests/test_cli_provider_resolution.py @@ -187,7 +187,10 @@ def test_codex_provider_replaces_incompatible_default_model(monkeypatch): monkeypatch.delenv("LLM_MODEL", raising=False) monkeypatch.delenv("OPENAI_MODEL", raising=False) # Ensure local user config does not leak a model into the test - monkeypatch.setitem(cli.CLI_CONFIG, "model", {}) + monkeypatch.setitem(cli.CLI_CONFIG, "model", { + "default": "", + "base_url": "https://openrouter.ai/api/v1", + }) def _runtime_resolve(**kwargs): return { From eb8226daabc56cc3f89fcc9eab2d135758e91d91 Mon Sep 17 
00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 11:57:44 -0700 Subject: [PATCH 24/34] fix(cli): repair dangerous command approval UI Move the dangerous-command header onto its own line inside the approval box so the panel border no longer cuts through it, and restore the long-command expand path in the active prompt_toolkit approval callback. The CLI already had a merged 'view full command' feature in fallback/gateway paths, but the live TUI callback was still using an older choice set and never exposed it. Add regression tests for long-command view state, in-place expansion, and panel rendering. --- cli.py | 180 ++++++++++++++++++++++------------ tests/test_cli_approval_ui.py | 100 +++++++++++++++++++ 2 files changed, 215 insertions(+), 65 deletions(-) create mode 100644 tests/test_cli_approval_ui.py diff --git a/cli.py b/cli.py index 6df693229..13bf4736b 100755 --- a/cli.py +++ b/cli.py @@ -4090,6 +4090,8 @@ class HermesCLI: Called from the agent thread. Shows a selection UI similar to clarify with choices: once / session / always / deny. When allow_permanent is False (tirith warnings present), the 'always' option is hidden. + Long commands also get a 'view' option so the full command can be + expanded before deciding. Uses _approval_lock to serialize concurrent requests (e.g. 
from parallel delegation subtasks) so each prompt gets its own turn @@ -4100,12 +4102,11 @@ class HermesCLI: with self._approval_lock: timeout = 60 response_queue = queue.Queue() - choices = ["once", "session", "always", "deny"] if allow_permanent else ["once", "session", "deny"] self._approval_state = { "command": command, "description": description, - "choices": choices, + "choices": self._approval_choices(command, allow_permanent=allow_permanent), "selected": 0, "response_queue": response_queue, } @@ -4136,6 +4137,116 @@ class HermesCLI: _cprint(f"\n{_DIM} ⏱ Timeout — denying command{_RST}") return "deny" + def _approval_choices(self, command: str, *, allow_permanent: bool = True) -> list[str]: + """Return approval choices for a dangerous command prompt.""" + choices = ["once", "session", "always", "deny"] if allow_permanent else ["once", "session", "deny"] + if len(command) > 70: + choices.append("view") + return choices + + def _handle_approval_selection(self) -> None: + """Process the currently selected dangerous-command approval choice.""" + state = self._approval_state + if not state: + return + + selected = state.get("selected", 0) + choices = state.get("choices") or [] + if not (0 <= selected < len(choices)): + return + + chosen = choices[selected] + if chosen == "view": + state["show_full"] = True + state["choices"] = [choice for choice in choices if choice != "view"] + if state["selected"] >= len(state["choices"]): + state["selected"] = max(0, len(state["choices"]) - 1) + self._invalidate() + return + + state["response_queue"].put(chosen) + self._approval_state = None + self._invalidate() + + def _get_approval_display_fragments(self): + """Render the dangerous-command approval panel for the prompt_toolkit UI.""" + state = self._approval_state + if not state: + return [] + + def _panel_box_width(title_text: str, content_lines: list[str], min_width: int = 46, max_width: int = 76) -> int: + term_cols = shutil.get_terminal_size((100, 20)).columns + longest 
= max([len(title_text)] + [len(line) for line in content_lines] + [min_width - 4]) + inner = min(max(longest + 4, min_width - 2), max_width - 2, max(24, term_cols - 6)) + return inner + 2 + + def _wrap_panel_text(text: str, width: int, subsequent_indent: str = "") -> list[str]: + wrapped = textwrap.wrap( + text, + width=max(8, width), + replace_whitespace=False, + drop_whitespace=False, + subsequent_indent=subsequent_indent, + ) + return wrapped or [""] + + def _append_panel_line(lines, border_style: str, content_style: str, text: str, box_width: int) -> None: + inner_width = max(0, box_width - 2) + lines.append((border_style, "│ ")) + lines.append((content_style, text.ljust(inner_width))) + lines.append((border_style, " │\n")) + + def _append_blank_panel_line(lines, border_style: str, box_width: int) -> None: + lines.append((border_style, "│" + (" " * box_width) + "│\n")) + + command = state["command"] + description = state["description"] + choices = state["choices"] + selected = state.get("selected", 0) + show_full = state.get("show_full", False) + + title = "⚠️ Dangerous Command" + cmd_display = command if show_full or len(command) <= 70 else command[:70] + '...' 
+ choice_labels = { + "once": "Allow once", + "session": "Allow for this session", + "always": "Add to permanent allowlist", + "deny": "Deny", + "view": "Show full command", + } + + preview_lines = _wrap_panel_text(description, 60) + preview_lines.extend(_wrap_panel_text(cmd_display, 60)) + for i, choice in enumerate(choices): + prefix = '❯ ' if i == selected else ' ' + preview_lines.extend(_wrap_panel_text( + f"{prefix}{choice_labels.get(choice, choice)}", + 60, + subsequent_indent=" ", + )) + + box_width = _panel_box_width(title, preview_lines) + inner_text_width = max(8, box_width - 2) + + lines = [] + lines.append(('class:approval-border', '╭' + ('─' * box_width) + '╮\n')) + _append_panel_line(lines, 'class:approval-border', 'class:approval-title', title, box_width) + _append_blank_panel_line(lines, 'class:approval-border', box_width) + for wrapped in _wrap_panel_text(description, inner_text_width): + _append_panel_line(lines, 'class:approval-border', 'class:approval-desc', wrapped, box_width) + for wrapped in _wrap_panel_text(cmd_display, inner_text_width): + _append_panel_line(lines, 'class:approval-border', 'class:approval-cmd', wrapped, box_width) + _append_blank_panel_line(lines, 'class:approval-border', box_width) + for i, choice in enumerate(choices): + label = choice_labels.get(choice, choice) + style = 'class:approval-selected' if i == selected else 'class:approval-choice' + prefix = '❯ ' if i == selected else ' ' + for wrapped in _wrap_panel_text(f"{prefix}{label}", inner_text_width, subsequent_indent=" "): + _append_panel_line(lines, 'class:approval-border', style, wrapped, box_width) + _append_blank_panel_line(lines, 'class:approval-border', box_width) + lines.append(('class:approval-border', '╰' + ('─' * box_width) + '╯\n')) + return lines + def _secret_capture_callback(self, var_name: str, prompt: str, metadata=None) -> dict: return prompt_for_secret(self, var_name, prompt, metadata) @@ -4727,22 +4838,7 @@ class HermesCLI: # --- Approval 
selection: confirm the highlighted choice --- if self._approval_state: - state = self._approval_state - selected = state["selected"] - choices = state["choices"] - if 0 <= selected < len(choices): - chosen = choices[selected] - if chosen == "view": - # Toggle full command display without closing the prompt - state["show_full"] = True - # Remove the "view" option since it's been used - state["choices"] = [c for c in choices if c != "view"] - if state["selected"] >= len(state["choices"]): - state["selected"] = len(state["choices"]) - 1 - event.app.invalidate() - return - state["response_queue"].put(chosen) - self._approval_state = None + self._handle_approval_selection() event.app.invalidate() return @@ -5428,53 +5524,7 @@ class HermesCLI: # --- Dangerous command approval: display widget --- def _get_approval_display(): - state = cli_ref._approval_state - if not state: - return [] - command = state["command"] - description = state["description"] - choices = state["choices"] - selected = state.get("selected", 0) - show_full = state.get("show_full", False) - - if show_full or len(command) <= 70: - cmd_display = command - else: - cmd_display = command[:70] + '...' 
- choice_labels = { - "once": "Allow once", - "session": "Allow for this session", - "always": "Add to permanent allowlist", - "deny": "Deny", - "view": "Show full command", - } - preview_lines = _wrap_panel_text(description, 60) - preview_lines.extend(_wrap_panel_text(cmd_display, 60)) - for i, choice in enumerate(choices): - prefix = '❯ ' if i == selected else ' ' - preview_lines.extend(_wrap_panel_text(f"{prefix}{choice_labels.get(choice, choice)}", 60, subsequent_indent=" ")) - box_width = _panel_box_width("⚠️ Dangerous Command", preview_lines) - inner_text_width = max(8, box_width - 2) - - lines = [] - lines.append(('class:approval-border', '╭─ ')) - lines.append(('class:approval-title', '⚠️ Dangerous Command')) - lines.append(('class:approval-border', ' ' + ('─' * max(0, box_width - len("⚠️ Dangerous Command") - 3)) + '╮\n')) - _append_blank_panel_line(lines, 'class:approval-border', box_width) - for wrapped in _wrap_panel_text(description, inner_text_width): - _append_panel_line(lines, 'class:approval-border', 'class:approval-desc', wrapped, box_width) - for wrapped in _wrap_panel_text(cmd_display, inner_text_width): - _append_panel_line(lines, 'class:approval-border', 'class:approval-cmd', wrapped, box_width) - _append_blank_panel_line(lines, 'class:approval-border', box_width) - for i, choice in enumerate(choices): - label = choice_labels.get(choice, choice) - style = 'class:approval-selected' if i == selected else 'class:approval-choice' - prefix = '❯ ' if i == selected else ' ' - for wrapped in _wrap_panel_text(f"{prefix}{label}", inner_text_width, subsequent_indent=" "): - _append_panel_line(lines, 'class:approval-border', style, wrapped, box_width) - _append_blank_panel_line(lines, 'class:approval-border', box_width) - lines.append(('class:approval-border', '╰' + ('─' * box_width) + '╯\n')) - return lines + return cli_ref._get_approval_display_fragments() approval_widget = ConditionalContainer( Window( diff --git a/tests/test_cli_approval_ui.py 
b/tests/test_cli_approval_ui.py new file mode 100644 index 000000000..9b2e0bbb2 --- /dev/null +++ b/tests/test_cli_approval_ui.py @@ -0,0 +1,100 @@ +import queue +import threading +import time +from types import SimpleNamespace +from unittest.mock import MagicMock + +from cli import HermesCLI + + +def _make_cli_stub(): + cli = HermesCLI.__new__(HermesCLI) + cli._approval_state = None + cli._approval_deadline = 0 + cli._approval_lock = threading.Lock() + cli._invalidate = MagicMock() + cli._app = SimpleNamespace(invalidate=MagicMock()) + return cli + + +class TestCliApprovalUi: + def test_approval_callback_includes_view_for_long_commands(self): + cli = _make_cli_stub() + command = "sudo dd if=/tmp/githubcli-keyring.gpg of=/usr/share/keyrings/githubcli-archive-keyring.gpg bs=4M status=progress" + result = {} + + def _run_callback(): + result["value"] = cli._approval_callback(command, "disk copy") + + thread = threading.Thread(target=_run_callback, daemon=True) + thread.start() + + deadline = time.time() + 2 + while cli._approval_state is None and time.time() < deadline: + time.sleep(0.01) + + assert cli._approval_state is not None + assert "view" in cli._approval_state["choices"] + + cli._approval_state["response_queue"].put("deny") + thread.join(timeout=2) + assert result["value"] == "deny" + + def test_handle_approval_selection_view_expands_in_place(self): + cli = _make_cli_stub() + cli._approval_state = { + "command": "sudo dd if=/tmp/in of=/usr/share/keyrings/githubcli-archive-keyring.gpg bs=4M status=progress", + "description": "disk copy", + "choices": ["once", "session", "always", "deny", "view"], + "selected": 4, + "response_queue": queue.Queue(), + } + + cli._handle_approval_selection() + + assert cli._approval_state is not None + assert cli._approval_state["show_full"] is True + assert "view" not in cli._approval_state["choices"] + assert cli._approval_state["selected"] == 3 + assert cli._approval_state["response_queue"].empty() + + def 
test_approval_display_places_title_inside_box_not_border(self): + cli = _make_cli_stub() + cli._approval_state = { + "command": "sudo dd if=/tmp/in of=/usr/share/keyrings/githubcli-archive-keyring.gpg bs=4M status=progress", + "description": "disk copy", + "choices": ["once", "session", "always", "deny", "view"], + "selected": 0, + "response_queue": queue.Queue(), + } + + fragments = cli._get_approval_display_fragments() + rendered = "".join(text for _style, text in fragments) + lines = rendered.splitlines() + + assert lines[0].startswith("╭") + assert "Dangerous Command" not in lines[0] + assert any("Dangerous Command" in line for line in lines[1:3]) + assert "Show full command" in rendered + assert "githubcli-archive-keyring.gpg" not in rendered + + def test_approval_display_shows_full_command_after_view(self): + cli = _make_cli_stub() + full_command = "sudo dd if=/tmp/in of=/usr/share/keyrings/githubcli-archive-keyring.gpg bs=4M status=progress" + cli._approval_state = { + "command": full_command, + "description": "disk copy", + "choices": ["once", "session", "always", "deny"], + "selected": 0, + "show_full": True, + "response_queue": queue.Queue(), + } + + fragments = cli._get_approval_display_fragments() + rendered = "".join(text for _style, text in fragments) + + assert "..." not in rendered + assert "githubcli-" in rendered + assert "archive-" in rendered + assert "keyring.gpg" in rendered + assert "status=progress" in rendered From 7dc9281f056203bfdfe3e9e0e28674be6d08225f Mon Sep 17 00:00:00 2001 From: ygd58 Date: Thu, 12 Mar 2026 13:25:09 +0100 Subject: [PATCH 25/34] fix(vision): surface actual error reason instead of generic message When vision_analyze_tool fails, the except block was returning a generic 'could not be analyzed' message that gave the agent no actionable information about the failure cause. 
Replace the generic message with the actual exception string so the agent can distinguish between backend errors, missing dependencies, network failures, and unsupported image paths. Also add an 'error' field to the failure response for structured error handling by callers. Fixes #1034 --- tools/vision_tools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/vision_tools.py b/tools/vision_tools.py index c1b09a22d..264532e9a 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -354,6 +354,7 @@ async def vision_analyze_tool( # Prepare error response result = { "success": False, + "error": error_msg, "analysis": analysis, } From 5a2fcaab39a8f2765c724b4ae01d2c0afd0a6b1d Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 12:11:23 -0700 Subject: [PATCH 26/34] fix(gateway): harden Telegram polling conflict handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever - add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once - persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled) Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago Invocation: 8879379b25994201b98381f4bd80c2af Main PID: 1147926 (python) Tasks: 16 (limit: 76757) Memory: 151.4M (peak: 168.1M) CPU: 47.883s CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat Mar 14 
09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data] Mar 14 09:27:04 teknium-dev python[1147926]: Content: '' Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded. Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data] Mar 14 09:27:07 teknium-dev python[1147926]: Content: '' Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)... Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data] Mar 14 09:27:12 teknium-dev python[1147926]: Content: '' Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... ⚠ Installed gateway service definition is outdated Run: hermes gateway restart # auto-refreshes the unit ✓ Gateway service is running ✓ Systemd linger is enabled (service survives logout) - cleanly exit non-retryable startup conflicts without triggering service restart loops Tests: - gateway status runtime-state helpers - Telegram token-lock and polling-conflict behavior - GatewayRunner clean exit on non-retryable startup conflict - CLI runtime health summary --- gateway/platforms/base.py | 68 +++++++ gateway/platforms/telegram.py | 77 ++++++- gateway/run.py | 75 ++++++- gateway/status.py | 191 +++++++++++++++++- hermes_cli/gateway.py | 48 +++++ tests/gateway/test_runner_fatal_adapter.py | 46 +++++ tests/gateway/test_status.py | 74 +++++++ tests/gateway/test_telegram_conflict.py | 100 +++++++++ .../hermes_cli/test_gateway_runtime_health.py | 22 ++ 9 files changed, 692 insertions(+), 9 deletions(-) create mode 100644 tests/gateway/test_runner_fatal_adapter.py create mode 100644 tests/gateway/test_telegram_conflict.py create mode 100644 tests/hermes_cli/test_gateway_runtime_health.py diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 67a8323a7..e523d9390 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -346,6 +346,10 @@ class BasePlatformAdapter(ABC): self.platform = 
platform self._message_handler: Optional[MessageHandler] = None self._running = False + self._fatal_error_code: Optional[str] = None + self._fatal_error_message: Optional[str] = None + self._fatal_error_retryable = True + self._fatal_error_handler: Optional[Callable[["BasePlatformAdapter"], Awaitable[None] | None]] = None # Track active message handlers per session for interrupt support # Key: session_key (e.g., chat_id), Value: (event, asyncio.Event for interrupt) @@ -353,6 +357,70 @@ class BasePlatformAdapter(ABC): self._pending_messages: Dict[str, MessageEvent] = {} # Chats where auto-TTS on voice input is disabled (set by /voice off) self._auto_tts_disabled_chats: set = set() + + @property + def has_fatal_error(self) -> bool: + return self._fatal_error_message is not None + + @property + def fatal_error_message(self) -> Optional[str]: + return self._fatal_error_message + + @property + def fatal_error_code(self) -> Optional[str]: + return self._fatal_error_code + + @property + def fatal_error_retryable(self) -> bool: + return self._fatal_error_retryable + + def set_fatal_error_handler(self, handler: Callable[["BasePlatformAdapter"], Awaitable[None] | None]) -> None: + self._fatal_error_handler = handler + + def _mark_connected(self) -> None: + self._running = True + self._fatal_error_code = None + self._fatal_error_message = None + self._fatal_error_retryable = True + try: + from gateway.status import write_runtime_status + write_runtime_status(platform=self.platform.value, platform_state="connected", error_code=None, error_message=None) + except Exception: + pass + + def _mark_disconnected(self) -> None: + self._running = False + if self.has_fatal_error: + return + try: + from gateway.status import write_runtime_status + write_runtime_status(platform=self.platform.value, platform_state="disconnected", error_code=None, error_message=None) + except Exception: + pass + + def _set_fatal_error(self, code: str, message: str, *, retryable: bool) -> None: + 
self._running = False + self._fatal_error_code = code + self._fatal_error_message = message + self._fatal_error_retryable = retryable + try: + from gateway.status import write_runtime_status + write_runtime_status( + platform=self.platform.value, + platform_state="fatal", + error_code=code, + error_message=message, + ) + except Exception: + pass + + async def _notify_fatal_error(self) -> None: + handler = self._fatal_error_handler + if not handler: + return + result = handler(self) + if asyncio.iscoroutine(result): + await result @property def name(self) -> str: diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py index df44733e3..8ad3e00b4 100644 --- a/gateway/platforms/telegram.py +++ b/gateway/platforms/telegram.py @@ -110,7 +110,35 @@ class TelegramAdapter(BasePlatformAdapter): super().__init__(config, Platform.TELEGRAM) self._app: Optional[Application] = None self._bot: Optional[Bot] = None - + self._token_lock_identity: Optional[str] = None + self._polling_error_task: Optional[asyncio.Task] = None + + @staticmethod + def _looks_like_polling_conflict(error: Exception) -> bool: + text = str(error).lower() + return ( + error.__class__.__name__.lower() == "conflict" + or "terminated by other getupdates request" in text + or "another bot instance is running" in text + ) + + async def _handle_polling_conflict(self, error: Exception) -> None: + if self.has_fatal_error and self.fatal_error_code == "telegram_polling_conflict": + return + message = ( + "Another Telegram bot poller is already using this token. " + "Hermes stopped Telegram polling to avoid endless retry spam. " + "Make sure only one gateway instance is running for this bot token." 
+ ) + logger.error("[%s] %s Original error: %s", self.name, message, error) + self._set_fatal_error("telegram_polling_conflict", message, retryable=False) + try: + if self._app and self._app.updater: + await self._app.updater.stop() + except Exception as stop_error: + logger.warning("[%s] Failed stopping Telegram polling after conflict: %s", self.name, stop_error, exc_info=True) + await self._notify_fatal_error() + async def connect(self) -> bool: """Connect to Telegram and start polling for updates.""" if not TELEGRAM_AVAILABLE: @@ -125,6 +153,25 @@ class TelegramAdapter(BasePlatformAdapter): return False try: + from gateway.status import acquire_scoped_lock + + self._token_lock_identity = self.config.token + acquired, existing = acquire_scoped_lock( + "telegram-bot-token", + self._token_lock_identity, + metadata={"platform": self.platform.value}, + ) + if not acquired: + owner_pid = existing.get("pid") if isinstance(existing, dict) else None + message = ( + "Another local Hermes gateway is already using this Telegram bot token" + + (f" (PID {owner_pid})." if owner_pid else ".") + + " Stop the other gateway before starting a second Telegram poller." 
+ ) + logger.error("[%s] %s", self.name, message) + self._set_fatal_error("telegram_token_lock", message, retryable=False) + return False + # Build the application self._app = Application.builder().token(self.config.token).build() self._bot = self._app.bot @@ -150,9 +197,20 @@ class TelegramAdapter(BasePlatformAdapter): # Start polling in background await self._app.initialize() await self._app.start() + loop = asyncio.get_running_loop() + + def _polling_error_callback(error: Exception) -> None: + if not self._looks_like_polling_conflict(error): + logger.error("[%s] Telegram polling error: %s", self.name, error, exc_info=True) + return + if self._polling_error_task and not self._polling_error_task.done(): + return + self._polling_error_task = loop.create_task(self._handle_polling_conflict(error)) + await self._app.updater.start_polling( allowed_updates=Update.ALL_TYPES, drop_pending_updates=True, + error_callback=_polling_error_callback, ) # Register bot commands so Telegram shows a hint menu when users type / @@ -188,11 +246,17 @@ class TelegramAdapter(BasePlatformAdapter): exc_info=True, ) - self._running = True + self._mark_connected() logger.info("[%s] Connected and polling for Telegram updates", self.name) return True except Exception as e: + if self._token_lock_identity: + try: + from gateway.status import release_scoped_lock + release_scoped_lock("telegram-bot-token", self._token_lock_identity) + except Exception: + pass logger.error("[%s] Failed to connect to Telegram: %s", self.name, e, exc_info=True) return False @@ -205,10 +269,17 @@ class TelegramAdapter(BasePlatformAdapter): await self._app.shutdown() except Exception as e: logger.warning("[%s] Error during Telegram disconnect: %s", self.name, e, exc_info=True) + if self._token_lock_identity: + try: + from gateway.status import release_scoped_lock + release_scoped_lock("telegram-bot-token", self._token_lock_identity) + except Exception as e: + logger.warning("[%s] Error releasing Telegram token lock: 
%s", self.name, e, exc_info=True) - self._running = False + self._mark_disconnected() self._app = None self._bot = None + self._token_lock_identity = None logger.info("[%s] Disconnected from Telegram", self.name) async def send( diff --git a/gateway/run.py b/gateway/run.py index 5ab74972a..8b58d2eb3 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -245,6 +245,8 @@ class GatewayRunner: self.delivery_router = DeliveryRouter(self.config) self._running = False self._shutdown_event = asyncio.Event() + self._exit_cleanly = False + self._exit_reason: Optional[str] = None # Track running agents per session for interrupt support # Key: session_key, Value: AIAgent instance @@ -463,6 +465,41 @@ class GatewayRunner: """Run the sync memory flush in a thread pool so it won't block the event loop.""" loop = asyncio.get_event_loop() await loop.run_in_executor(None, self._flush_memories_for_session, old_session_id) + + @property + def should_exit_cleanly(self) -> bool: + return self._exit_cleanly + + @property + def exit_reason(self) -> Optional[str]: + return self._exit_reason + + async def _handle_adapter_fatal_error(self, adapter: BasePlatformAdapter) -> None: + """React to a non-retryable adapter failure after startup.""" + logger.error( + "Fatal %s adapter error (%s): %s", + adapter.platform.value, + adapter.fatal_error_code or "unknown", + adapter.fatal_error_message or "unknown error", + ) + + existing = self.adapters.get(adapter.platform) + if existing is adapter: + try: + await adapter.disconnect() + finally: + self.adapters.pop(adapter.platform, None) + self.delivery_router.adapters = self.adapters + + if not self.adapters: + self._exit_reason = adapter.fatal_error_message or "All messaging adapters disconnected" + logger.error("No connected messaging platforms remain. 
Shutting down gateway cleanly.") + await self.stop() + + def _request_clean_exit(self, reason: str) -> None: + self._exit_cleanly = True + self._exit_reason = reason + self._shutdown_event.set() @staticmethod def _load_prefill_messages() -> List[Dict[str, Any]]: @@ -647,6 +684,11 @@ class GatewayRunner: """ logger.info("Starting Hermes Gateway...") logger.info("Session storage: %s", self.config.sessions_dir) + try: + from gateway.status import write_runtime_status + write_runtime_status(gateway_state="starting", exit_reason=None) + except Exception: + pass # Warn if no user allowlists are configured and open access is not opted in _any_allowlist = any( @@ -676,6 +718,7 @@ class GatewayRunner: logger.warning("Process checkpoint recovery: %s", e) connected_count = 0 + startup_nonretryable_errors: list[str] = [] # Initialize and connect each configured platform for platform, platform_config in self.config.platforms.items(): @@ -687,8 +730,9 @@ class GatewayRunner: logger.warning("No adapter available for %s", platform.value) continue - # Set up message handler + # Set up message + fatal error handlers adapter.set_message_handler(self._handle_message) + adapter.set_fatal_error_handler(self._handle_adapter_fatal_error) # Try to connect logger.info("Connecting to %s...", platform.value) @@ -701,10 +745,24 @@ class GatewayRunner: logger.info("✓ %s connected", platform.value) else: logger.warning("✗ %s failed to connect", platform.value) + if adapter.has_fatal_error and not adapter.fatal_error_retryable: + startup_nonretryable_errors.append( + f"{platform.value}: {adapter.fatal_error_message}" + ) except Exception as e: logger.error("✗ %s error: %s", platform.value, e) if connected_count == 0: + if startup_nonretryable_errors: + reason = "; ".join(startup_nonretryable_errors) + logger.error("Gateway hit a non-retryable startup conflict: %s", reason) + try: + from gateway.status import write_runtime_status + write_runtime_status(gateway_state="startup_failed", 
exit_reason=reason) + except Exception: + pass + self._request_clean_exit(reason) + return True logger.warning("No messaging platforms connected.") logger.info("Gateway will continue running for cron job execution.") @@ -712,6 +770,11 @@ class GatewayRunner: self.delivery_router.adapters = self.adapters self._running = True + try: + from gateway.status import write_runtime_status + write_runtime_status(gateway_state="running", exit_reason=None) + except Exception: + pass # Emit gateway:startup hook hook_count = len(self.hooks.loaded_hooks) @@ -806,8 +869,12 @@ class GatewayRunner: self._shutdown_all_gateway_honcho() self._shutdown_event.set() - from gateway.status import remove_pid_file + from gateway.status import remove_pid_file, write_runtime_status remove_pid_file() + try: + write_runtime_status(gateway_state="stopped", exit_reason=self._exit_reason) + except Exception: + pass logger.info("Gateway stopped") @@ -4340,6 +4407,10 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = success = await runner.start() if not success: return False + if runner.should_exit_cleanly: + if runner.exit_reason: + logger.error("Gateway exiting cleanly: %s", runner.exit_reason) + return True # Write PID file so CLI can detect gateway is running import atexit diff --git a/gateway/status.py b/gateway/status.py index db72f1fed..3362a7786 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -11,13 +11,17 @@ that will be useful when we add named profiles (multiple agents running concurrently under distinct configurations). 
""" +import hashlib import json import os import sys +from datetime import datetime, timezone from pathlib import Path -from typing import Optional +from typing import Any, Optional _GATEWAY_KIND = "hermes-gateway" +_RUNTIME_STATUS_FILE = "gateway_state.json" +_LOCKS_DIRNAME = "gateway-locks" def _get_pid_path() -> Path: @@ -26,6 +30,32 @@ def _get_pid_path() -> Path: return home / "gateway.pid" +def _get_runtime_status_path() -> Path: + """Return the persisted runtime health/status file path.""" + return _get_pid_path().with_name(_RUNTIME_STATUS_FILE) + + +def _get_lock_dir() -> Path: + """Return the machine-local directory for token-scoped gateway locks.""" + override = os.getenv("HERMES_GATEWAY_LOCK_DIR") + if override: + return Path(override) + state_home = Path(os.getenv("XDG_STATE_HOME", Path.home() / ".local" / "state")) + return state_home / "hermes" / _LOCKS_DIRNAME + + +def _utc_now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def _scope_hash(identity: str) -> str: + return hashlib.sha256(identity.encode("utf-8")).hexdigest()[:16] + + +def _get_scope_lock_path(scope: str, identity: str) -> Path: + return _get_lock_dir() / f"{scope}-{_scope_hash(identity)}.lock" + + def _get_process_start_time(pid: int) -> Optional[int]: """Return the kernel start time for a process when available.""" stat_path = Path(f"/proc/{pid}/stat") @@ -73,6 +103,38 @@ def _build_pid_record() -> dict: } +def _build_runtime_status_record() -> dict[str, Any]: + payload = _build_pid_record() + payload.update({ + "gateway_state": "starting", + "exit_reason": None, + "platforms": {}, + "updated_at": _utc_now_iso(), + }) + return payload + + +def _read_json_file(path: Path) -> Optional[dict[str, Any]]: + if not path.exists(): + return None + try: + raw = path.read_text().strip() + except OSError: + return None + if not raw: + return None + try: + payload = json.loads(raw) + except json.JSONDecodeError: + return None + return payload if isinstance(payload, dict) else 
None + + +def _write_json_file(path: Path, payload: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload)) + + def _read_pid_record() -> Optional[dict]: pid_path = _get_pid_path() if not pid_path.exists(): @@ -99,9 +161,49 @@ def _read_pid_record() -> Optional[dict]: def write_pid_file() -> None: """Write the current process PID and metadata to the gateway PID file.""" - pid_path = _get_pid_path() - pid_path.parent.mkdir(parents=True, exist_ok=True) - pid_path.write_text(json.dumps(_build_pid_record())) + _write_json_file(_get_pid_path(), _build_pid_record()) + + +def write_runtime_status( + *, + gateway_state: Optional[str] = None, + exit_reason: Optional[str] = None, + platform: Optional[str] = None, + platform_state: Optional[str] = None, + error_code: Optional[str] = None, + error_message: Optional[str] = None, +) -> None: + """Persist gateway runtime health information for diagnostics/status.""" + path = _get_runtime_status_path() + payload = _read_json_file(path) or _build_runtime_status_record() + payload.setdefault("platforms", {}) + payload.setdefault("kind", _GATEWAY_KIND) + payload.setdefault("pid", os.getpid()) + payload.setdefault("start_time", _get_process_start_time(os.getpid())) + payload["updated_at"] = _utc_now_iso() + + if gateway_state is not None: + payload["gateway_state"] = gateway_state + if exit_reason is not None: + payload["exit_reason"] = exit_reason + + if platform is not None: + platform_payload = payload["platforms"].get(platform, {}) + if platform_state is not None: + platform_payload["state"] = platform_state + if error_code is not None: + platform_payload["error_code"] = error_code + if error_message is not None: + platform_payload["error_message"] = error_message + platform_payload["updated_at"] = _utc_now_iso() + payload["platforms"][platform] = platform_payload + + _write_json_file(path, payload) + + +def read_runtime_status() -> Optional[dict[str, Any]]: + """Read the 
persisted gateway runtime health/status information.""" + return _read_json_file(_get_runtime_status_path()) def remove_pid_file() -> None: @@ -112,6 +214,87 @@ def remove_pid_file() -> None: pass +def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, Any]] = None) -> tuple[bool, Optional[dict[str, Any]]]: + """Acquire a machine-local lock keyed by scope + identity. + + Used to prevent multiple local gateways from using the same external identity + at once (e.g. the same Telegram bot token across different HERMES_HOME dirs). + """ + lock_path = _get_scope_lock_path(scope, identity) + lock_path.parent.mkdir(parents=True, exist_ok=True) + record = { + **_build_pid_record(), + "scope": scope, + "identity_hash": _scope_hash(identity), + "metadata": metadata or {}, + "updated_at": _utc_now_iso(), + } + + existing = _read_json_file(lock_path) + if existing: + try: + existing_pid = int(existing["pid"]) + except (KeyError, TypeError, ValueError): + existing_pid = None + + if existing_pid == os.getpid() and existing.get("start_time") == record.get("start_time"): + _write_json_file(lock_path, record) + return True, existing + + stale = existing_pid is None + if not stale: + try: + os.kill(existing_pid, 0) + except (ProcessLookupError, PermissionError): + stale = True + else: + current_start = _get_process_start_time(existing_pid) + if ( + existing.get("start_time") is not None + and current_start is not None + and current_start != existing.get("start_time") + ): + stale = True + if stale: + try: + lock_path.unlink(missing_ok=True) + except OSError: + pass + else: + return False, existing + + try: + fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY) + except FileExistsError: + return False, _read_json_file(lock_path) + try: + with os.fdopen(fd, "w", encoding="utf-8") as handle: + json.dump(record, handle) + except Exception: + try: + lock_path.unlink(missing_ok=True) + except OSError: + pass + raise + return True, None + + +def 
release_scoped_lock(scope: str, identity: str) -> None: + """Release a previously-acquired scope lock when owned by this process.""" + lock_path = _get_scope_lock_path(scope, identity) + existing = _read_json_file(lock_path) + if not existing: + return + if existing.get("pid") != os.getpid(): + return + if existing.get("start_time") != _get_process_start_time(os.getpid()): + return + try: + lock_path.unlink(missing_ok=True) + except OSError: + pass + + def get_running_pid() -> Optional[int]: """Return the PID of a running gateway instance, or ``None``. diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 4d3ed8845..ea9496052 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -367,6 +367,13 @@ def systemd_status(deep: bool = False): print("✗ Gateway service is stopped") print(" Run: hermes gateway start") + runtime_lines = _runtime_health_lines() + if runtime_lines: + print() + print("Recent gateway health:") + for line in runtime_lines: + print(f" {line}") + if deep: print_systemd_linger_guidance() else: @@ -693,6 +700,35 @@ def _platform_status(platform: dict) -> str: return "not configured" +def _runtime_health_lines() -> list[str]: + """Summarize the latest persisted gateway runtime health state.""" + try: + from gateway.status import read_runtime_status + except Exception: + return [] + + state = read_runtime_status() + if not state: + return [] + + lines: list[str] = [] + gateway_state = state.get("gateway_state") + exit_reason = state.get("exit_reason") + platforms = state.get("platforms", {}) or {} + + for platform, pdata in platforms.items(): + if pdata.get("state") == "fatal": + message = pdata.get("error_message") or "unknown error" + lines.append(f"⚠ {platform}: {message}") + + if gateway_state == "startup_failed" and exit_reason: + lines.append(f"⚠ Last startup issue: {exit_reason}") + elif gateway_state == "stopped" and exit_reason: + lines.append(f"⚠ Last shutdown reason: {exit_reason}") + + return lines + + def 
_setup_standard_platform(platform: dict): """Interactive setup for Telegram, Discord, or Slack.""" emoji = platform["emoji"] @@ -1186,11 +1222,23 @@ def gateway_command(args): if pids: print(f"✓ Gateway is running (PID: {', '.join(map(str, pids))})") print(" (Running manually, not as a system service)") + runtime_lines = _runtime_health_lines() + if runtime_lines: + print() + print("Recent gateway health:") + for line in runtime_lines: + print(f" {line}") print() print("To install as a service:") print(" hermes gateway install") else: print("✗ Gateway is not running") + runtime_lines = _runtime_health_lines() + if runtime_lines: + print() + print("Recent gateway health:") + for line in runtime_lines: + print(f" {line}") print() print("To start:") print(" hermes gateway # Run in foreground") diff --git a/tests/gateway/test_runner_fatal_adapter.py b/tests/gateway/test_runner_fatal_adapter.py new file mode 100644 index 000000000..aa414d72f --- /dev/null +++ b/tests/gateway/test_runner_fatal_adapter.py @@ -0,0 +1,46 @@ +import pytest + +from gateway.config import GatewayConfig, Platform, PlatformConfig +from gateway.platforms.base import BasePlatformAdapter +from gateway.run import GatewayRunner + + +class _FatalAdapter(BasePlatformAdapter): + def __init__(self): + super().__init__(PlatformConfig(enabled=True, token="token"), Platform.TELEGRAM) + + async def connect(self) -> bool: + self._set_fatal_error( + "telegram_token_lock", + "Another local Hermes gateway is already using this Telegram bot token.", + retryable=False, + ) + return False + + async def disconnect(self) -> None: + self._mark_disconnected() + + async def send(self, chat_id, content, reply_to=None, metadata=None): + raise NotImplementedError + + async def get_chat_info(self, chat_id): + return {"id": chat_id} + + +@pytest.mark.asyncio +async def test_runner_requests_clean_exit_for_nonretryable_startup_conflict(monkeypatch, tmp_path): + config = GatewayConfig( + platforms={ + Platform.TELEGRAM: 
PlatformConfig(enabled=True, token="token") + }, + sessions_dir=tmp_path / "sessions", + ) + runner = GatewayRunner(config) + + monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _FatalAdapter()) + + ok = await runner.start() + + assert ok is True + assert runner.should_exit_cleanly is True + assert "already using this Telegram bot token" in runner.exit_reason diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index 025708a53..fdf1b57c5 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -25,3 +25,77 @@ class TestGatewayPidState: assert status.get_running_pid() is None assert not pid_path.exists() + + +class TestGatewayRuntimeStatus: + def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + + status.write_runtime_status( + gateway_state="startup_failed", + exit_reason="telegram conflict", + platform="telegram", + platform_state="fatal", + error_code="telegram_polling_conflict", + error_message="another poller is active", + ) + + payload = status.read_runtime_status() + assert payload["gateway_state"] == "startup_failed" + assert payload["exit_reason"] == "telegram conflict" + assert payload["platforms"]["telegram"]["state"] == "fatal" + assert payload["platforms"]["telegram"]["error_code"] == "telegram_polling_conflict" + assert payload["platforms"]["telegram"]["error_message"] == "another poller is active" + + +class TestScopedLocks: + def test_acquire_scoped_lock_rejects_live_other_process(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text(json.dumps({ + "pid": 99999, + "start_time": 123, + "kind": "hermes-gateway", + })) + + monkeypatch.setattr(status.os, "kill", lambda pid, sig: None) + 
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123) + + acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"}) + + assert acquired is False + assert existing["pid"] == 99999 + + def test_acquire_scoped_lock_replaces_stale_record(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text(json.dumps({ + "pid": 99999, + "start_time": 123, + "kind": "hermes-gateway", + })) + + def fake_kill(pid, sig): + raise ProcessLookupError + + monkeypatch.setattr(status.os, "kill", fake_kill) + + acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"}) + + assert acquired is True + payload = json.loads(lock_path.read_text()) + assert payload["pid"] == os.getpid() + assert payload["metadata"]["platform"] == "telegram" + + def test_release_scoped_lock_only_removes_current_owner(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + + acquired, _ = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"}) + assert acquired is True + lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock" + assert lock_path.exists() + + status.release_scoped_lock("telegram-bot-token", "secret") + assert not lock_path.exists() diff --git a/tests/gateway/test_telegram_conflict.py b/tests/gateway/test_telegram_conflict.py new file mode 100644 index 000000000..f2e212812 --- /dev/null +++ b/tests/gateway/test_telegram_conflict.py @@ -0,0 +1,100 @@ +import asyncio +import sys +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from gateway.config import PlatformConfig + + +def _ensure_telegram_mock(): + if "telegram" in 
sys.modules and hasattr(sys.modules["telegram"], "__file__"): + return + + telegram_mod = MagicMock() + telegram_mod.ext.ContextTypes.DEFAULT_TYPE = type(None) + telegram_mod.constants.ParseMode.MARKDOWN_V2 = "MarkdownV2" + telegram_mod.constants.ChatType.GROUP = "group" + telegram_mod.constants.ChatType.SUPERGROUP = "supergroup" + telegram_mod.constants.ChatType.CHANNEL = "channel" + telegram_mod.constants.ChatType.PRIVATE = "private" + + for name in ("telegram", "telegram.ext", "telegram.constants"): + sys.modules.setdefault(name, telegram_mod) + + +_ensure_telegram_mock() + +from gateway.platforms.telegram import TelegramAdapter # noqa: E402 + + +@pytest.mark.asyncio +async def test_connect_rejects_same_host_token_lock(monkeypatch): + adapter = TelegramAdapter(PlatformConfig(enabled=True, token="secret-token")) + + monkeypatch.setattr( + "gateway.status.acquire_scoped_lock", + lambda scope, identity, metadata=None: (False, {"pid": 4242}), + ) + + ok = await adapter.connect() + + assert ok is False + assert adapter.fatal_error_code == "telegram_token_lock" + assert adapter.has_fatal_error is True + assert "already using this Telegram bot token" in adapter.fatal_error_message + + +@pytest.mark.asyncio +async def test_polling_conflict_stops_polling_and_notifies_handler(monkeypatch): + adapter = TelegramAdapter(PlatformConfig(enabled=True, token="secret-token")) + fatal_handler = AsyncMock() + adapter.set_fatal_error_handler(fatal_handler) + + monkeypatch.setattr( + "gateway.status.acquire_scoped_lock", + lambda scope, identity, metadata=None: (True, None), + ) + monkeypatch.setattr( + "gateway.status.release_scoped_lock", + lambda scope, identity: None, + ) + + captured = {} + + async def fake_start_polling(**kwargs): + captured["error_callback"] = kwargs["error_callback"] + + updater = SimpleNamespace( + start_polling=AsyncMock(side_effect=fake_start_polling), + stop=AsyncMock(), + ) + bot = SimpleNamespace(set_my_commands=AsyncMock()) + app = SimpleNamespace( + 
bot=bot, + updater=updater, + add_handler=MagicMock(), + initialize=AsyncMock(), + start=AsyncMock(), + ) + builder = MagicMock() + builder.token.return_value = builder + builder.build.return_value = app + monkeypatch.setattr("gateway.platforms.telegram.Application", SimpleNamespace(builder=MagicMock(return_value=builder))) + + ok = await adapter.connect() + + assert ok is True + assert callable(captured["error_callback"]) + + conflict = type("Conflict", (Exception,), {}) + captured["error_callback"](conflict("Conflict: terminated by other getUpdates request; make sure that only one bot instance is running")) + + await asyncio.sleep(0) + await asyncio.sleep(0) + + assert adapter.fatal_error_code == "telegram_polling_conflict" + assert adapter.has_fatal_error is True + updater.stop.assert_awaited() + fatal_handler.assert_awaited_once() diff --git a/tests/hermes_cli/test_gateway_runtime_health.py b/tests/hermes_cli/test_gateway_runtime_health.py new file mode 100644 index 000000000..15c0705cf --- /dev/null +++ b/tests/hermes_cli/test_gateway_runtime_health.py @@ -0,0 +1,22 @@ +from hermes_cli.gateway import _runtime_health_lines + + +def test_runtime_health_lines_include_fatal_platform_and_startup_reason(monkeypatch): + monkeypatch.setattr( + "gateway.status.read_runtime_status", + lambda: { + "gateway_state": "startup_failed", + "exit_reason": "telegram conflict", + "platforms": { + "telegram": { + "state": "fatal", + "error_message": "another poller is active", + } + }, + }, + ) + + lines = _runtime_health_lines() + + assert "⚠ telegram: another poller is active" in lines + assert "⚠ Last startup issue: telegram conflict" in lines From 8fb618234f3edfff57b5511b13082158bbccdf4a Mon Sep 17 00:00:00 2001 From: capybaraonchain Date: Thu, 12 Mar 2026 14:33:03 +0100 Subject: [PATCH 27/34] fix(gateway): buffer Telegram media groups to prevent self-interruption Telegram albums arrive as multiple updates with a shared media_group_id. 
Previously each image triggered a separate MessageEvent, causing the agent to interrupt itself when describing the first image. - Add 0.8s debounce window for media group items - Merge attachments into single MessageEvent - Add regression test for photo album buffering --- gateway/platforms/telegram.py | 48 ++++++++++++++++++++++++ tests/gateway/test_telegram_documents.py | 41 ++++++++++++++++++-- 2 files changed, 85 insertions(+), 4 deletions(-) diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py index df44733e3..aae0cce7f 100644 --- a/gateway/platforms/telegram.py +++ b/gateway/platforms/telegram.py @@ -105,11 +105,14 @@ class TelegramAdapter(BasePlatformAdapter): # Telegram message limits MAX_MESSAGE_LENGTH = 4096 + MEDIA_GROUP_WAIT_SECONDS = 0.8 def __init__(self, config: PlatformConfig): super().__init__(config, Platform.TELEGRAM) self._app: Optional[Application] = None self._bot: Optional[Bot] = None + self._media_group_events: Dict[str, MessageEvent] = {} + self._media_group_tasks: Dict[str, asyncio.Task] = {} async def connect(self) -> bool: """Connect to Telegram and start polling for updates.""" @@ -872,8 +875,53 @@ class TelegramAdapter(BasePlatformAdapter): except Exception as e: logger.warning("[Telegram] Failed to cache document: %s", e, exc_info=True) + media_group_id = getattr(msg, "media_group_id", None) + if media_group_id: + await self._queue_media_group_event(str(media_group_id), event) + return + await self.handle_message(event) + async def _queue_media_group_event(self, media_group_id: str, event: MessageEvent) -> None: + """Buffer Telegram media-group items so albums arrive as one logical event. + + Telegram delivers albums as multiple updates with a shared media_group_id. + If we forward each item immediately, the gateway thinks the second image is a + new user message and interrupts the first. We debounce briefly and merge the + attachments into a single MessageEvent. 
+ """ + existing = self._media_group_events.get(media_group_id) + if existing is None: + self._media_group_events[media_group_id] = event + else: + existing.media_urls.extend(event.media_urls) + existing.media_types.extend(event.media_types) + if event.text: + if existing.text: + if event.text not in existing.text.split("\n\n"): + existing.text = f"{existing.text}\n\n{event.text}" + else: + existing.text = event.text + + prior_task = self._media_group_tasks.get(media_group_id) + if prior_task: + prior_task.cancel() + + self._media_group_tasks[media_group_id] = asyncio.create_task( + self._flush_media_group_event(media_group_id) + ) + + async def _flush_media_group_event(self, media_group_id: str) -> None: + try: + await asyncio.sleep(self.MEDIA_GROUP_WAIT_SECONDS) + event = self._media_group_events.pop(media_group_id, None) + if event is not None: + await self.handle_message(event) + except asyncio.CancelledError: + return + finally: + self._media_group_tasks.pop(media_group_id, None) + async def _handle_sticker(self, msg: Message, event: "MessageEvent") -> None: """ Describe a Telegram sticker via vision analysis, with caching. 
diff --git a/tests/gateway/test_telegram_documents.py b/tests/gateway/test_telegram_documents.py index 7a76625fe..25cfc2492 100644 --- a/tests/gateway/test_telegram_documents.py +++ b/tests/gateway/test_telegram_documents.py @@ -81,20 +81,21 @@ def _make_document( return doc -def _make_message(document=None, caption=None): - """Build a mock Telegram Message with the given document.""" +def _make_message(document=None, caption=None, media_group_id=None, photo=None): + """Build a mock Telegram Message with the given document/photo.""" msg = MagicMock() msg.message_id = 42 msg.text = caption or "" msg.caption = caption msg.date = None - # Media flags — all None except document - msg.photo = None + # Media flags — all None except explicit payload + msg.photo = photo msg.video = None msg.audio = None msg.voice = None msg.sticker = None msg.document = document + msg.media_group_id = media_group_id # Chat / user msg.chat = MagicMock() msg.chat.id = 100 @@ -165,6 +166,12 @@ class TestDocumentTypeDetection: # TestDocumentDownloadBlock # --------------------------------------------------------------------------- +def _make_photo(file_obj=None): + photo = MagicMock() + photo.get_file = AsyncMock(return_value=file_obj or _make_file_obj(b"photo-bytes")) + return photo + + class TestDocumentDownloadBlock: @pytest.mark.asyncio async def test_supported_pdf_is_cached(self, adapter): @@ -339,6 +346,32 @@ class TestDocumentDownloadBlock: adapter.handle_message.assert_called_once() +# --------------------------------------------------------------------------- +# TestMediaGroups — media group (album) buffering +# --------------------------------------------------------------------------- + +class TestMediaGroups: + @pytest.mark.asyncio + async def test_photo_album_is_buffered_and_combined(self, adapter): + first_photo = _make_photo(_make_file_obj(b"first")) + second_photo = _make_photo(_make_file_obj(b"second")) + + msg1 = _make_message(caption="two images", media_group_id="album-1", 
photo=[first_photo]) + msg2 = _make_message(media_group_id="album-1", photo=[second_photo]) + + with patch("gateway.platforms.telegram.cache_image_from_bytes", side_effect=["/tmp/one.jpg", "/tmp/two.jpg"]): + await adapter._handle_media_message(_make_update(msg1), MagicMock()) + await adapter._handle_media_message(_make_update(msg2), MagicMock()) + assert adapter.handle_message.await_count == 0 + await asyncio.sleep(adapter.MEDIA_GROUP_WAIT_SECONDS + 0.05) + + adapter.handle_message.assert_awaited_once() + event = adapter.handle_message.call_args[0][0] + assert event.text == "two images" + assert event.media_urls == ["/tmp/one.jpg", "/tmp/two.jpg"] + assert len(event.media_types) == 2 + + # --------------------------------------------------------------------------- # TestSendDocument — outbound file attachment delivery # --------------------------------------------------------------------------- From f3a38c90fc64028956fe30902f934ece75424bfd Mon Sep 17 00:00:00 2001 From: teyrebaz33 Date: Thu, 12 Mar 2026 18:02:21 +0300 Subject: [PATCH 28/34] fix(gateway): fall back to sys.executable -m hermes_cli.main when hermes not on PATH When shutil.which('hermes') returns None, _resolve_hermes_bin() now tries sys.executable -m hermes_cli.main as a fallback. This handles setups where Hermes is launched via a venv or module invocation and the hermes symlink is not on PATH for the gateway process. Fixes #1049 --- gateway/run.py | 41 ++++++++++++++-- tests/gateway/test_update_command.py | 73 ++++++++++++++++++++++++++-- 2 files changed, 107 insertions(+), 7 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 5ab74972a..d52092c49 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -215,6 +215,33 @@ def _resolve_gateway_model() -> str: return model +def _resolve_hermes_bin() -> Optional[list[str]]: + """Resolve the Hermes update command as argv parts. + + Tries in order: + 1. ``shutil.which("hermes")`` — standard PATH lookup + 2. 
``sys.executable -m hermes_cli.main`` — fallback when Hermes is running + from a venv/module invocation and the ``hermes`` shim is not on PATH + + Returns argv parts ready for quoting/joining, or ``None`` if neither works. + """ + import shutil + + hermes_bin = shutil.which("hermes") + if hermes_bin: + return [hermes_bin] + + try: + import importlib.util + + if importlib.util.find_spec("hermes_cli") is not None: + return [sys.executable, "-m", "hermes_cli.main"] + except Exception: + pass + + return None + + class GatewayRunner: """ Main gateway controller. @@ -3155,9 +3182,14 @@ class GatewayRunner: if not git_dir.exists(): return "✗ Not a git repository — cannot update." - hermes_bin = shutil.which("hermes") - if not hermes_bin: - return "✗ `hermes` command not found on PATH." + hermes_cmd = _resolve_hermes_bin() + if not hermes_cmd: + return ( + "✗ Could not locate the `hermes` command. " + "Hermes is running, but the update command could not find the " + "executable on PATH or via the current Python interpreter. " + "Try running `hermes update` manually in your terminal." + ) pending_path = _hermes_home / ".update_pending.json" output_path = _hermes_home / ".update_output.txt" @@ -3173,8 +3205,9 @@ class GatewayRunner: # Spawn `hermes update` in a separate cgroup so it survives gateway # restart. systemd-run --user --scope creates a transient scope unit. 
+ hermes_cmd_str = " ".join(shlex.quote(part) for part in hermes_cmd) update_cmd = ( - f"{shlex.quote(hermes_bin)} update > {shlex.quote(str(output_path))} 2>&1; " + f"{hermes_cmd_str} update > {shlex.quote(str(output_path))} 2>&1; " f"status=$?; printf '%s' \"$status\" > {shlex.quote(str(exit_code_path))}" ) try: diff --git a/tests/gateway/test_update_command.py b/tests/gateway/test_update_command.py index 124745635..ac9beac1b 100644 --- a/tests/gateway/test_update_command.py +++ b/tests/gateway/test_update_command.py @@ -88,7 +88,7 @@ class TestHandleUpdateCommand: @pytest.mark.asyncio async def test_no_hermes_binary(self, tmp_path): - """Returns error when hermes is not on PATH.""" + """Returns error when hermes is not on PATH and hermes_cli is not importable.""" runner = _make_runner() event = _make_event() @@ -102,10 +102,77 @@ class TestHandleUpdateCommand: with patch("gateway.run._hermes_home", tmp_path), \ patch("gateway.run.__file__", fake_file), \ - patch("shutil.which", return_value=None): + patch("shutil.which", return_value=None), \ + patch("importlib.util.find_spec", return_value=None): result = await runner._handle_update_command(event) - assert "not found on PATH" in result + assert "Could not locate" in result + assert "hermes update" in result + + @pytest.mark.asyncio + async def test_fallback_to_sys_executable(self, tmp_path): + """Falls back to sys.executable -m hermes_cli.main when hermes not on PATH.""" + runner = _make_runner() + event = _make_event() + + fake_root = tmp_path / "project" + fake_root.mkdir() + (fake_root / ".git").mkdir() + (fake_root / "gateway").mkdir() + (fake_root / "gateway" / "run.py").touch() + fake_file = str(fake_root / "gateway" / "run.py") + hermes_home = tmp_path / "hermes" + hermes_home.mkdir() + + mock_popen = MagicMock() + fake_spec = MagicMock() + + with patch("gateway.run._hermes_home", hermes_home), \ + patch("gateway.run.__file__", fake_file), \ + patch("shutil.which", return_value=None), \ + 
patch("importlib.util.find_spec", return_value=fake_spec), \ + patch("subprocess.Popen", mock_popen): + result = await runner._handle_update_command(event) + + assert "Starting Hermes update" in result + call_args = mock_popen.call_args[0][0] + # The update_cmd uses sys.executable -m hermes_cli.main + joined = " ".join(call_args) if isinstance(call_args, list) else call_args + assert "hermes_cli.main" in joined or "bash" in call_args[0] + + @pytest.mark.asyncio + async def test_resolve_hermes_bin_prefers_which(self, tmp_path): + """_resolve_hermes_bin returns argv parts from shutil.which when available.""" + from gateway.run import _resolve_hermes_bin + + with patch("shutil.which", return_value="/custom/path/hermes"): + result = _resolve_hermes_bin() + + assert result == ["/custom/path/hermes"] + + @pytest.mark.asyncio + async def test_resolve_hermes_bin_fallback(self): + """_resolve_hermes_bin falls back to sys.executable argv when which fails.""" + import sys + from gateway.run import _resolve_hermes_bin + + fake_spec = MagicMock() + with patch("shutil.which", return_value=None), \ + patch("importlib.util.find_spec", return_value=fake_spec): + result = _resolve_hermes_bin() + + assert result == [sys.executable, "-m", "hermes_cli.main"] + + @pytest.mark.asyncio + async def test_resolve_hermes_bin_returns_none_when_both_fail(self): + """_resolve_hermes_bin returns None when both strategies fail.""" + from gateway.run import _resolve_hermes_bin + + with patch("shutil.which", return_value=None), \ + patch("importlib.util.find_spec", return_value=None): + result = _resolve_hermes_bin() + + assert result is None @pytest.mark.asyncio async def test_writes_pending_marker(self, tmp_path): From e1824ef8a6c22fe0e97d15c6c9fca631e44168ee Mon Sep 17 00:00:00 2001 From: stoicneko Date: Thu, 12 Mar 2026 06:20:47 -0700 Subject: [PATCH 29/34] fix(cli): fall back to main when current branch has no remote counterpart `hermes update` crashed with CalledProcessError when run on a 
local-only branch (e.g. fix/stoicneko) because `git rev-list HEAD..origin/{branch}` fails when origin/{branch} doesn't exist. Now verifies the remote branch exists first and falls back to origin/main. --- hermes_cli/main.py | 10 ++- tests/hermes_cli/test_cmd_update.py | 107 ++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 tests/hermes_cli/test_cmd_update.py diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 9609f3998..3d910907d 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -2056,7 +2056,15 @@ def cmd_update(args): check=True ) branch = result.stdout.strip() - + + # Fall back to main if the current branch doesn't exist on the remote + verify = subprocess.run( + git_cmd + ["rev-parse", "--verify", f"origin/{branch}"], + cwd=PROJECT_ROOT, capture_output=True, text=True, + ) + if verify.returncode != 0: + branch = "main" + # Check if there are updates result = subprocess.run( git_cmd + ["rev-list", f"HEAD..origin/{branch}", "--count"], diff --git a/tests/hermes_cli/test_cmd_update.py b/tests/hermes_cli/test_cmd_update.py new file mode 100644 index 000000000..0ccb7af81 --- /dev/null +++ b/tests/hermes_cli/test_cmd_update.py @@ -0,0 +1,107 @@ +"""Tests for cmd_update — branch fallback when remote branch doesn't exist.""" + +import subprocess +from types import SimpleNamespace +from unittest.mock import patch + +import pytest + +from hermes_cli.main import cmd_update, PROJECT_ROOT + + +def _make_run_side_effect(branch="main", verify_ok=True, commit_count="0"): + """Build a side_effect function for subprocess.run that simulates git commands.""" + + def side_effect(cmd, **kwargs): + joined = " ".join(str(c) for c in cmd) + + # git rev-parse --abbrev-ref HEAD (get current branch) + if "rev-parse" in joined and "--abbrev-ref" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout=f"{branch}\n", stderr="") + + # git rev-parse --verify origin/{branch} (check remote branch exists) + if "rev-parse" in 
joined and "--verify" in joined: + rc = 0 if verify_ok else 128 + return subprocess.CompletedProcess(cmd, rc, stdout="", stderr="") + + # git rev-list HEAD..origin/{branch} --count + if "rev-list" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout=f"{commit_count}\n", stderr="") + + # Fallback: return a successful CompletedProcess with empty stdout + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + return side_effect + + +@pytest.fixture +def mock_args(): + return SimpleNamespace() + + +class TestCmdUpdateBranchFallback: + """cmd_update falls back to main when current branch has no remote counterpart.""" + + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_update_falls_back_to_main_when_branch_not_on_remote( + self, mock_run, _mock_which, mock_args, capsys + ): + mock_run.side_effect = _make_run_side_effect( + branch="fix/stoicneko", verify_ok=False, commit_count="3" + ) + + cmd_update(mock_args) + + commands = [" ".join(str(a) for a in c.args[0]) for c in mock_run.call_args_list] + + # rev-list should use origin/main, not origin/fix/stoicneko + rev_list_cmds = [c for c in commands if "rev-list" in c] + assert len(rev_list_cmds) == 1 + assert "origin/main" in rev_list_cmds[0] + assert "origin/fix/stoicneko" not in rev_list_cmds[0] + + # pull should use main, not fix/stoicneko + pull_cmds = [c for c in commands if "pull" in c] + assert len(pull_cmds) == 1 + assert "main" in pull_cmds[0] + + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_update_uses_current_branch_when_on_remote( + self, mock_run, _mock_which, mock_args, capsys + ): + mock_run.side_effect = _make_run_side_effect( + branch="main", verify_ok=True, commit_count="2" + ) + + cmd_update(mock_args) + + commands = [" ".join(str(a) for a in c.args[0]) for c in mock_run.call_args_list] + + rev_list_cmds = [c for c in commands if "rev-list" in c] + assert len(rev_list_cmds) == 1 + assert "origin/main" in 
rev_list_cmds[0] + + pull_cmds = [c for c in commands if "pull" in c] + assert len(pull_cmds) == 1 + assert "main" in pull_cmds[0] + + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_update_already_up_to_date( + self, mock_run, _mock_which, mock_args, capsys + ): + mock_run.side_effect = _make_run_side_effect( + branch="main", verify_ok=True, commit_count="0" + ) + + cmd_update(mock_args) + + captured = capsys.readouterr() + assert "Already up to date!" in captured.out + + # Should NOT have called pull + commands = [" ".join(str(a) for a in c.args[0]) for c in mock_run.call_args_list] + pull_cmds = [c for c in commands if "pull" in c] + assert len(pull_cmds) == 0 From 3fab72f1e17f33bc7328219fde4c39b054051e17 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 12:18:24 -0700 Subject: [PATCH 30/34] fix(gateway): clean up pending Telegram media groups on disconnect Cancel any queued media-group flush tasks during Telegram adapter disconnect and clear the buffered events map so shutdown can't leave a pending album flush behind. Add a regression test covering disconnect before the debounce window expires. 
--- gateway/platforms/telegram.py | 10 +++++++++- tests/gateway/test_telegram_documents.py | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py index aae0cce7f..7496a0714 100644 --- a/gateway/platforms/telegram.py +++ b/gateway/platforms/telegram.py @@ -200,7 +200,15 @@ class TelegramAdapter(BasePlatformAdapter): return False async def disconnect(self) -> None: - """Stop polling and disconnect.""" + """Stop polling, cancel pending album flushes, and disconnect.""" + pending_media_group_tasks = list(self._media_group_tasks.values()) + for task in pending_media_group_tasks: + task.cancel() + if pending_media_group_tasks: + await asyncio.gather(*pending_media_group_tasks, return_exceptions=True) + self._media_group_tasks.clear() + self._media_group_events.clear() + if self._app: try: await self._app.updater.stop() diff --git a/tests/gateway/test_telegram_documents.py b/tests/gateway/test_telegram_documents.py index 25cfc2492..5e3e6f94d 100644 --- a/tests/gateway/test_telegram_documents.py +++ b/tests/gateway/test_telegram_documents.py @@ -371,6 +371,24 @@ class TestMediaGroups: assert event.media_urls == ["/tmp/one.jpg", "/tmp/two.jpg"] assert len(event.media_types) == 2 + @pytest.mark.asyncio + async def test_disconnect_cancels_pending_media_group_flush(self, adapter): + first_photo = _make_photo(_make_file_obj(b"first")) + msg = _make_message(caption="two images", media_group_id="album-2", photo=[first_photo]) + + with patch("gateway.platforms.telegram.cache_image_from_bytes", return_value="/tmp/one.jpg"): + await adapter._handle_media_message(_make_update(msg), MagicMock()) + + assert "album-2" in adapter._media_group_events + assert "album-2" in adapter._media_group_tasks + + await adapter.disconnect() + await asyncio.sleep(adapter.MEDIA_GROUP_WAIT_SECONDS + 0.05) + + assert adapter._media_group_events == {} + assert adapter._media_group_tasks == {} + 
adapter.handle_message.assert_not_awaited() + # --------------------------------------------------------------------------- # TestSendDocument — outbound file attachment delivery From ed0c7194ed64b716f8ad5aab6e860505591af4d6 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 18:03:50 -0700 Subject: [PATCH 31/34] fix: preserve current gateway update and startup behavior Follow up on salvaged PR #1052. Restore current-main gateway lifecycle handling after conflict resolution and adapt the update fallback to use shell-quoted argv parts safely. --- gateway/run.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index d52092c49..e97db0255 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -272,6 +272,8 @@ class GatewayRunner: self.delivery_router = DeliveryRouter(self.config) self._running = False self._shutdown_event = asyncio.Event() + self._exit_cleanly = False + self._exit_reason: Optional[str] = None # Track running agents per session for interrupt support # Key: session_key, Value: AIAgent instance @@ -490,6 +492,41 @@ class GatewayRunner: """Run the sync memory flush in a thread pool so it won't block the event loop.""" loop = asyncio.get_event_loop() await loop.run_in_executor(None, self._flush_memories_for_session, old_session_id) + + @property + def should_exit_cleanly(self) -> bool: + return self._exit_cleanly + + @property + def exit_reason(self) -> Optional[str]: + return self._exit_reason + + async def _handle_adapter_fatal_error(self, adapter: BasePlatformAdapter) -> None: + """React to a non-retryable adapter failure after startup.""" + logger.error( + "Fatal %s adapter error (%s): %s", + adapter.platform.value, + adapter.fatal_error_code or "unknown", + adapter.fatal_error_message or "unknown error", + ) + + existing = self.adapters.get(adapter.platform) + if existing is adapter: + try: + await adapter.disconnect() + finally: + 
self.adapters.pop(adapter.platform, None) + self.delivery_router.adapters = self.adapters + + if not self.adapters: + self._exit_reason = adapter.fatal_error_message or "All messaging adapters disconnected" + logger.error("No connected messaging platforms remain. Shutting down gateway cleanly.") + await self.stop() + + def _request_clean_exit(self, reason: str) -> None: + self._exit_cleanly = True + self._exit_reason = reason + self._shutdown_event.set() @staticmethod def _load_prefill_messages() -> List[Dict[str, Any]]: @@ -674,6 +711,11 @@ class GatewayRunner: """ logger.info("Starting Hermes Gateway...") logger.info("Session storage: %s", self.config.sessions_dir) + try: + from gateway.status import write_runtime_status + write_runtime_status(gateway_state="starting", exit_reason=None) + except Exception: + pass # Warn if no user allowlists are configured and open access is not opted in _any_allowlist = any( @@ -703,6 +745,7 @@ class GatewayRunner: logger.warning("Process checkpoint recovery: %s", e) connected_count = 0 + startup_nonretryable_errors: list[str] = [] # Initialize and connect each configured platform for platform, platform_config in self.config.platforms.items(): @@ -714,8 +757,9 @@ class GatewayRunner: logger.warning("No adapter available for %s", platform.value) continue - # Set up message handler + # Set up message + fatal error handlers adapter.set_message_handler(self._handle_message) + adapter.set_fatal_error_handler(self._handle_adapter_fatal_error) # Try to connect logger.info("Connecting to %s...", platform.value) @@ -728,10 +772,24 @@ class GatewayRunner: logger.info("✓ %s connected", platform.value) else: logger.warning("✗ %s failed to connect", platform.value) + if adapter.has_fatal_error and not adapter.fatal_error_retryable: + startup_nonretryable_errors.append( + f"{platform.value}: {adapter.fatal_error_message}" + ) except Exception as e: logger.error("✗ %s error: %s", platform.value, e) if connected_count == 0: + if 
startup_nonretryable_errors: + reason = "; ".join(startup_nonretryable_errors) + logger.error("Gateway hit a non-retryable startup conflict: %s", reason) + try: + from gateway.status import write_runtime_status + write_runtime_status(gateway_state="startup_failed", exit_reason=reason) + except Exception: + pass + self._request_clean_exit(reason) + return True logger.warning("No messaging platforms connected.") logger.info("Gateway will continue running for cron job execution.") @@ -739,6 +797,11 @@ class GatewayRunner: self.delivery_router.adapters = self.adapters self._running = True + try: + from gateway.status import write_runtime_status + write_runtime_status(gateway_state="running", exit_reason=None) + except Exception: + pass # Emit gateway:startup hook hook_count = len(self.hooks.loaded_hooks) @@ -833,8 +896,12 @@ class GatewayRunner: self._shutdown_all_gateway_honcho() self._shutdown_event.set() - from gateway.status import remove_pid_file + from gateway.status import remove_pid_file, write_runtime_status remove_pid_file() + try: + write_runtime_status(gateway_state="stopped", exit_reason=self._exit_reason) + except Exception: + pass logger.info("Gateway stopped") @@ -4373,6 +4440,10 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = success = await runner.start() if not success: return False + if runner.should_exit_cleanly: + if runner.exit_reason: + logger.error("Gateway exiting cleanly: %s", runner.exit_reason) + return True # Write PID file so CLI can detect gateway is running import atexit From 7b140b31e679cfd4e9cdf419814a4e344ed66c01 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 19:07:50 -0700 Subject: [PATCH 32/34] fix: suppress duplicate cron sends to auto-delivery targets Allow cron runs to keep using send_message for additional destinations, but skip same-target sends when the scheduler will already auto-deliver the final response there. 
Add prompt/tool guidance, docs, and regression coverage for origin/home-channel resolution and thread-aware comparisons. --- agent/prompt_builder.py | 7 ++ cron/scheduler.py | 98 ++++++++++++++------ tests/agent/test_prompt_builder.py | 1 + tests/cron/test_scheduler.py | 52 ++++++++++- tests/tools/test_send_message_tool.py | 113 +++++++++++++++++++++++ tools/cronjob_tools.py | 5 +- tools/send_message_tool.py | 49 ++++++++++ website/docs/user-guide/features/cron.md | 2 +- 8 files changed, 295 insertions(+), 32 deletions(-) diff --git a/agent/prompt_builder.py b/agent/prompt_builder.py index f1dbcf758..06d636320 100644 --- a/agent/prompt_builder.py +++ b/agent/prompt_builder.py @@ -141,6 +141,13 @@ PLATFORM_HINTS = { "is preserved for threading. Do not include greetings or sign-offs unless " "contextually appropriate." ), + "cron": ( + "You are running as a scheduled cron job. Your final response is automatically " + "delivered to the job's configured destination, so do not use send_message to " + "send to that same target again. If you want the user to receive something in " + "the scheduled destination, put it directly in your final response. Use " + "send_message only for additional or different targets." + ), "cli": ( "You are a CLI AI Agent. Try not to use markdown but simple text " "renderable inside a terminal." 
diff --git a/cron/scheduler.py b/cron/scheduler.py index 12d355cd1..4f85677d8 100644 --- a/cron/scheduler.py +++ b/cron/scheduler.py @@ -56,6 +56,50 @@ def _resolve_origin(job: dict) -> Optional[dict]: return None +def _resolve_delivery_target(job: dict) -> Optional[dict]: + """Resolve the concrete auto-delivery target for a cron job, if any.""" + deliver = job.get("deliver", "local") + origin = _resolve_origin(job) + + if deliver == "local": + return None + + if deliver == "origin": + if not origin: + return None + return { + "platform": origin["platform"], + "chat_id": str(origin["chat_id"]), + "thread_id": origin.get("thread_id"), + } + + if ":" in deliver: + platform_name, chat_id = deliver.split(":", 1) + return { + "platform": platform_name, + "chat_id": chat_id, + "thread_id": None, + } + + platform_name = deliver + if origin and origin.get("platform") == platform_name: + return { + "platform": platform_name, + "chat_id": str(origin["chat_id"]), + "thread_id": origin.get("thread_id"), + } + + chat_id = os.getenv(f"{platform_name.upper()}_HOME_CHANNEL", "") + if not chat_id: + return None + + return { + "platform": platform_name, + "chat_id": chat_id, + "thread_id": None, + } + + def _deliver_result(job: dict, content: str) -> None: """ Deliver job output to the configured target (origin chat, specific platform, etc.). @@ -63,36 +107,19 @@ def _deliver_result(job: dict, content: str) -> None: Uses the standalone platform send functions from send_message_tool so delivery works whether or not the gateway is running. 
""" - deliver = job.get("deliver", "local") - origin = _resolve_origin(job) - - if deliver == "local": + target = _resolve_delivery_target(job) + if not target: + if job.get("deliver", "local") != "local": + logger.warning( + "Job '%s' deliver=%s but no concrete delivery target could be resolved", + job["id"], + job.get("deliver", "local"), + ) return - thread_id = None - - # Resolve target platform + chat_id - if deliver == "origin": - if not origin: - logger.warning("Job '%s' deliver=origin but no origin stored, skipping delivery", job["id"]) - return - platform_name = origin["platform"] - chat_id = origin["chat_id"] - thread_id = origin.get("thread_id") - elif ":" in deliver: - platform_name, chat_id = deliver.split(":", 1) - else: - # Bare platform name like "telegram" — need to resolve to origin or home channel - platform_name = deliver - if origin and origin.get("platform") == platform_name: - chat_id = origin["chat_id"] - thread_id = origin.get("thread_id") - else: - # Fall back to home channel - chat_id = os.getenv(f"{platform_name.upper()}_HOME_CHANNEL", "") - if not chat_id: - logger.warning("Job '%s' deliver=%s but no chat_id or home channel. 
Set via: hermes config set %s_HOME_CHANNEL ", job["id"], deliver, platform_name.upper()) - return + platform_name = target["platform"] + chat_id = target["chat_id"] + thread_id = target.get("thread_id") from tools.send_message_tool import _send_to_platform from gateway.config import load_gateway_config, Platform @@ -169,6 +196,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: job_name = job["name"] prompt = job["prompt"] origin = _resolve_origin(job) + delivery_target = _resolve_delivery_target(job) logger.info("Running job '%s' (ID: %s)", job_name, job_id) logger.info("Prompt: %s", prompt[:100]) @@ -179,6 +207,11 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: os.environ["HERMES_SESSION_CHAT_ID"] = str(origin["chat_id"]) if origin.get("chat_name"): os.environ["HERMES_SESSION_CHAT_NAME"] = origin["chat_name"] + if delivery_target: + os.environ["HERMES_CRON_AUTO_DELIVER_PLATFORM"] = delivery_target["platform"] + os.environ["HERMES_CRON_AUTO_DELIVER_CHAT_ID"] = str(delivery_target["chat_id"]) + if delivery_target.get("thread_id") is not None: + os.environ["HERMES_CRON_AUTO_DELIVER_THREAD_ID"] = str(delivery_target["thread_id"]) try: # Re-read .env and config.yaml fresh every run so provider/key @@ -324,7 +357,14 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: finally: # Clean up injected env vars so they don't leak to other jobs - for key in ("HERMES_SESSION_PLATFORM", "HERMES_SESSION_CHAT_ID", "HERMES_SESSION_CHAT_NAME"): + for key in ( + "HERMES_SESSION_PLATFORM", + "HERMES_SESSION_CHAT_ID", + "HERMES_SESSION_CHAT_NAME", + "HERMES_CRON_AUTO_DELIVER_PLATFORM", + "HERMES_CRON_AUTO_DELIVER_CHAT_ID", + "HERMES_CRON_AUTO_DELIVER_THREAD_ID", + ): os.environ.pop(key, None) if _session_db: try: diff --git a/tests/agent/test_prompt_builder.py b/tests/agent/test_prompt_builder.py index b5c10bee6..cfcc40173 100644 --- a/tests/agent/test_prompt_builder.py +++ b/tests/agent/test_prompt_builder.py @@ -455,6 +455,7 @@ class 
TestPromptBuilderConstants: assert "whatsapp" in PLATFORM_HINTS assert "telegram" in PLATFORM_HINTS assert "discord" in PLATFORM_HINTS + assert "cron" in PLATFORM_HINTS assert "cli" in PLATFORM_HINTS diff --git a/tests/cron/test_scheduler.py b/tests/cron/test_scheduler.py index 4314b5ac0..6af83f1e1 100644 --- a/tests/cron/test_scheduler.py +++ b/tests/cron/test_scheduler.py @@ -6,7 +6,7 @@ from unittest.mock import patch, MagicMock import pytest -from cron.scheduler import _resolve_origin, _deliver_result, run_job +from cron.scheduler import _resolve_origin, _resolve_delivery_target, _deliver_result, run_job class TestResolveOrigin: @@ -44,6 +44,56 @@ class TestResolveOrigin: assert _resolve_origin(job) is None +class TestResolveDeliveryTarget: + def test_origin_delivery_preserves_thread_id(self): + job = { + "deliver": "origin", + "origin": { + "platform": "telegram", + "chat_id": "-1001", + "thread_id": "17585", + }, + } + + assert _resolve_delivery_target(job) == { + "platform": "telegram", + "chat_id": "-1001", + "thread_id": "17585", + } + + def test_bare_platform_uses_matching_origin_chat(self): + job = { + "deliver": "telegram", + "origin": { + "platform": "telegram", + "chat_id": "-1001", + "thread_id": "17585", + }, + } + + assert _resolve_delivery_target(job) == { + "platform": "telegram", + "chat_id": "-1001", + "thread_id": "17585", + } + + def test_bare_platform_falls_back_to_home_channel(self, monkeypatch): + monkeypatch.setenv("TELEGRAM_HOME_CHANNEL", "-2002") + job = { + "deliver": "telegram", + "origin": { + "platform": "discord", + "chat_id": "abc", + }, + } + + assert _resolve_delivery_target(job) == { + "platform": "telegram", + "chat_id": "-2002", + "thread_id": None, + } + + class TestDeliverResultMirrorLogging: """Verify that mirror_to_session failures are logged, not silently swallowed.""" diff --git a/tests/tools/test_send_message_tool.py b/tests/tools/test_send_message_tool.py index 3ad44f0f7..d55998942 100644 --- 
a/tests/tools/test_send_message_tool.py +++ b/tests/tools/test_send_message_tool.py @@ -2,6 +2,7 @@ import asyncio import json +import os import sys from pathlib import Path from types import SimpleNamespace @@ -29,6 +30,118 @@ def _install_telegram_mock(monkeypatch, bot): class TestSendMessageTool: + def test_cron_duplicate_target_is_skipped_and_explained(self): + home = SimpleNamespace(chat_id="-1001") + config, _telegram_cfg = _make_config() + config.get_home_channel = lambda _platform: home + + with patch.dict( + os.environ, + { + "HERMES_CRON_AUTO_DELIVER_PLATFORM": "telegram", + "HERMES_CRON_AUTO_DELIVER_CHAT_ID": "-1001", + }, + clear=False, + ), \ + patch("gateway.config.load_gateway_config", return_value=config), \ + patch("tools.interrupt.is_interrupted", return_value=False), \ + patch("model_tools._run_async", side_effect=_run_async_immediately), \ + patch("tools.send_message_tool._send_to_platform", new=AsyncMock(return_value={"success": True})) as send_mock, \ + patch("gateway.mirror.mirror_to_session", return_value=True) as mirror_mock: + result = json.loads( + send_message_tool( + { + "action": "send", + "target": "telegram", + "message": "hello", + } + ) + ) + + assert result["success"] is True + assert result["skipped"] is True + assert result["reason"] == "cron_auto_delivery_duplicate_target" + assert "final response" in result["note"] + send_mock.assert_not_awaited() + mirror_mock.assert_not_called() + + def test_cron_different_target_still_sends(self): + config, telegram_cfg = _make_config() + + with patch.dict( + os.environ, + { + "HERMES_CRON_AUTO_DELIVER_PLATFORM": "telegram", + "HERMES_CRON_AUTO_DELIVER_CHAT_ID": "-1001", + }, + clear=False, + ), \ + patch("gateway.config.load_gateway_config", return_value=config), \ + patch("tools.interrupt.is_interrupted", return_value=False), \ + patch("model_tools._run_async", side_effect=_run_async_immediately), \ + patch("tools.send_message_tool._send_to_platform", 
new=AsyncMock(return_value={"success": True})) as send_mock, \ + patch("gateway.mirror.mirror_to_session", return_value=True) as mirror_mock: + result = json.loads( + send_message_tool( + { + "action": "send", + "target": "telegram:-1002", + "message": "hello", + } + ) + ) + + assert result["success"] is True + assert result.get("skipped") is not True + send_mock.assert_awaited_once_with( + Platform.TELEGRAM, + telegram_cfg, + "-1002", + "hello", + thread_id=None, + media_files=[], + ) + mirror_mock.assert_called_once_with("telegram", "-1002", "hello", source_label="cli", thread_id=None) + + def test_cron_same_chat_different_thread_still_sends(self): + config, telegram_cfg = _make_config() + + with patch.dict( + os.environ, + { + "HERMES_CRON_AUTO_DELIVER_PLATFORM": "telegram", + "HERMES_CRON_AUTO_DELIVER_CHAT_ID": "-1001", + "HERMES_CRON_AUTO_DELIVER_THREAD_ID": "17585", + }, + clear=False, + ), \ + patch("gateway.config.load_gateway_config", return_value=config), \ + patch("tools.interrupt.is_interrupted", return_value=False), \ + patch("model_tools._run_async", side_effect=_run_async_immediately), \ + patch("tools.send_message_tool._send_to_platform", new=AsyncMock(return_value={"success": True})) as send_mock, \ + patch("gateway.mirror.mirror_to_session", return_value=True) as mirror_mock: + result = json.loads( + send_message_tool( + { + "action": "send", + "target": "telegram:-1001:99999", + "message": "hello", + } + ) + ) + + assert result["success"] is True + assert result.get("skipped") is not True + send_mock.assert_awaited_once_with( + Platform.TELEGRAM, + telegram_cfg, + "-1001", + "hello", + thread_id="99999", + media_files=[], + ) + mirror_mock.assert_called_once_with("telegram", "-1001", "hello", source_label="cli", thread_id="99999") + def test_sends_to_explicit_telegram_topic_target(self): config, telegram_cfg = _make_config() diff --git a/tools/cronjob_tools.py b/tools/cronjob_tools.py index bdfa58d63..bad2e22af 100644 --- a/tools/cronjob_tools.py 
+++ b/tools/cronjob_tools.py @@ -194,7 +194,10 @@ DELIVERY OPTIONS (where output goes): - "telegram:123456": Send to specific chat (if user provides ID) NOTE: The agent's final response is auto-delivered to the target — do NOT use -send_message in the prompt. Just have the agent compose its response normally. +send_message in the prompt for that same destination. Same-target send_message +calls are skipped so the cron doesn't double-message the user. Put the main +user-facing content in the final response, and use send_message only for +additional or different targets. Use for: reminders, periodic checks, scheduled reports, automated maintenance.""", "parameters": { diff --git a/tools/send_message_tool.py b/tools/send_message_tool.py index 537f6335b..6a7260fd3 100644 --- a/tools/send_message_tool.py +++ b/tools/send_message_tool.py @@ -153,6 +153,10 @@ def _handle_send(args): f"or set a home channel via: hermes config set {platform_name.upper()}_HOME_CHANNEL " }) + duplicate_skip = _maybe_skip_cron_duplicate_send(platform_name, chat_id, thread_id) + if duplicate_skip: + return json.dumps(duplicate_skip) + try: from model_tools import _run_async result = _run_async( @@ -213,6 +217,51 @@ def _describe_media_for_mirror(media_files): return f"[Sent {len(media_files)} media attachments]" +def _get_cron_auto_delivery_target(): + """Return the cron scheduler's auto-delivery target for the current run, if any.""" + platform = os.getenv("HERMES_CRON_AUTO_DELIVER_PLATFORM", "").strip().lower() + chat_id = os.getenv("HERMES_CRON_AUTO_DELIVER_CHAT_ID", "").strip() + if not platform or not chat_id: + return None + thread_id = os.getenv("HERMES_CRON_AUTO_DELIVER_THREAD_ID", "").strip() or None + return { + "platform": platform, + "chat_id": chat_id, + "thread_id": thread_id, + } + + +def _maybe_skip_cron_duplicate_send(platform_name: str, chat_id: str, thread_id: str | None): + """Skip redundant cron send_message calls when the scheduler will auto-deliver there.""" + auto_target 
= _get_cron_auto_delivery_target() + if not auto_target: + return None + + same_target = ( + auto_target["platform"] == platform_name + and str(auto_target["chat_id"]) == str(chat_id) + and auto_target.get("thread_id") == thread_id + ) + if not same_target: + return None + + target_label = f"{platform_name}:{chat_id}" + if thread_id is not None: + target_label += f":{thread_id}" + + return { + "success": True, + "skipped": True, + "reason": "cron_auto_delivery_duplicate_target", + "target": target_label, + "note": ( + f"Skipped send_message to {target_label}. This cron job will already auto-deliver " + "its final response to that same target. Put the intended user-facing content in " + "your final response instead, or use a different target if you want an additional message." + ), + } + + async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, media_files=None): """Route a message to the appropriate platform sender.""" from gateway.config import Platform diff --git a/website/docs/user-guide/features/cron.md b/website/docs/user-guide/features/cron.md index b044eb0da..03bf4bfbc 100644 --- a/website/docs/user-guide/features/cron.md +++ b/website/docs/user-guide/features/cron.md @@ -79,7 +79,7 @@ When scheduling jobs, you specify where the output goes: **How platform names work:** When you specify a bare platform name like `"telegram"`, Hermes first checks if the job's origin matches that platform and uses the origin chat ID. Otherwise, it falls back to the platform's home channel configured via environment variable (e.g., `TELEGRAM_HOME_CHANNEL`). -The agent's final response is automatically delivered — you do **not** need to include `send_message` in the cron prompt. +The agent's final response is automatically delivered — you do **not** need to include `send_message` in the cron prompt for that same destination. 
If a cron run calls `send_message` to the exact target the scheduler will already deliver to, Hermes skips that duplicate send and tells the model to put the user-facing content in the final response instead. Use `send_message` only for additional or different targets. The agent knows your connected platforms and home channels — it'll choose sensible defaults. From ea053e8afd8daa73acd3b55fa55b1364c00c3392 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 19:22:47 -0700 Subject: [PATCH 33/34] docs: add provider contribution guide --- .../docs/developer-guide/adding-providers.md | 424 ++++++++++++++++++ website/docs/developer-guide/architecture.md | 13 +- website/docs/developer-guide/contributing.md | 6 + .../docs/developer-guide/provider-runtime.md | 2 + website/sidebars.ts | 1 + 5 files changed, 440 insertions(+), 6 deletions(-) create mode 100644 website/docs/developer-guide/adding-providers.md diff --git a/website/docs/developer-guide/adding-providers.md b/website/docs/developer-guide/adding-providers.md new file mode 100644 index 000000000..7b4695dcb --- /dev/null +++ b/website/docs/developer-guide/adding-providers.md @@ -0,0 +1,424 @@ +--- +sidebar_position: 5 +title: "Adding Providers" +description: "How to add a new inference provider to Hermes Agent — auth, runtime resolution, CLI flows, adapters, tests, and docs" +--- + +# Adding Providers + +Hermes can already talk to any OpenAI-compatible endpoint through the custom provider path. Do not add a built-in provider unless you want first-class UX for that service: + +- provider-specific auth or token refresh +- a curated model catalog +- setup / `hermes model` menu entries +- provider aliases for `provider:model` syntax +- a non-OpenAI API shape that needs an adapter + +If the provider is just "another OpenAI-compatible base URL and API key", a named custom provider may be enough. + +## The mental model + +A built-in provider has to line up across a few layers: + +1. 
`hermes_cli/auth.py` decides how credentials are found. +2. `hermes_cli/runtime_provider.py` turns that into runtime data: + - `provider` + - `api_mode` + - `base_url` + - `api_key` + - `source` +3. `run_agent.py` uses `api_mode` to decide how requests are built and sent. +4. `hermes_cli/models.py`, `hermes_cli/main.py`, and `hermes_cli/setup.py` make the provider show up in the CLI. +5. `agent/auxiliary_client.py` and `agent/model_metadata.py` keep side tasks and token budgeting working. + +The important abstraction is `api_mode`. + +- Most providers use `chat_completions`. +- Codex uses `codex_responses`. +- Anthropic uses `anthropic_messages`. +- A new non-OpenAI protocol usually means adding a new adapter and a new `api_mode` branch. + +## Choose the implementation path first + +### Path A — OpenAI-compatible provider + +Use this when the provider accepts standard chat-completions style requests. + +Typical work: + +- add auth metadata +- add model catalog / aliases +- add runtime resolution +- add CLI menu wiring +- add aux-model defaults +- add tests and user docs + +You usually do not need a new adapter or a new `api_mode`. + +### Path B — Native provider + +Use this when the provider does not behave like OpenAI chat completions. + +Examples in-tree today: + +- `codex_responses` +- `anthropic_messages` + +This path includes everything from Path A plus: + +- a provider adapter in `agent/` +- `run_agent.py` branches for request building, dispatch, usage extraction, interrupt handling, and response normalization +- adapter tests + +## File checklist + +### Required for every built-in provider + +1. `hermes_cli/auth.py` +2. `hermes_cli/models.py` +3. `hermes_cli/runtime_provider.py` +4. `hermes_cli/main.py` +5. `hermes_cli/setup.py` +6. `agent/auxiliary_client.py` +7. `agent/model_metadata.py` +8. tests +9. user-facing docs under `website/docs/` + +### Additional for native / non-OpenAI providers + +10. `agent/_adapter.py` +11. `run_agent.py` +12. 
`pyproject.toml` if a provider SDK is required + +## Step 1: Pick one canonical provider id + +Choose a single provider id and use it everywhere. + +Examples from the repo: + +- `openai-codex` +- `kimi-coding` +- `minimax-cn` + +That same id should appear in: + +- `PROVIDER_REGISTRY` in `hermes_cli/auth.py` +- `_PROVIDER_LABELS` in `hermes_cli/models.py` +- `_PROVIDER_ALIASES` in both `hermes_cli/auth.py` and `hermes_cli/models.py` +- CLI `--provider` choices in `hermes_cli/main.py` +- setup / model selection branches +- auxiliary-model defaults +- tests + +If the id differs between those files, the provider will feel half-wired: auth may work while `/model`, setup, or runtime resolution silently misses it. + +## Step 2: Add auth metadata in `hermes_cli/auth.py` + +For API-key providers, add a `ProviderConfig` entry to `PROVIDER_REGISTRY` with: + +- `id` +- `name` +- `auth_type="api_key"` +- `inference_base_url` +- `api_key_env_vars` +- optional `base_url_env_var` + +Also add aliases to `_PROVIDER_ALIASES`. + +Use the existing providers as templates: + +- simple API-key path: Z.AI, MiniMax +- API-key path with endpoint detection: Kimi, Z.AI +- native token resolution: Anthropic +- OAuth / auth-store path: Nous, OpenAI Codex + +Questions to answer here: + +- What env vars should Hermes check, and in what priority order? +- Does the provider need base-URL overrides? +- Does it need endpoint probing or token refresh? +- What should the auth error say when credentials are missing? + +If the provider needs something more than "look up an API key", add a dedicated credential resolver instead of shoving logic into unrelated branches. + +## Step 3: Add model catalog and aliases in `hermes_cli/models.py` + +Update the provider catalog so the provider works in menus and in `provider:model` syntax. 
+ +Typical edits: + +- `_PROVIDER_MODELS` +- `_PROVIDER_LABELS` +- `_PROVIDER_ALIASES` +- provider display order inside `list_available_providers()` +- `provider_model_ids()` if the provider supports a live `/models` fetch + +If the provider exposes a live model list, prefer that first and keep `_PROVIDER_MODELS` as the static fallback. + +This file is also what makes inputs like these work: + +```text +anthropic:claude-sonnet-4-6 +kimi:model-name +``` + +If aliases are missing here, the provider may authenticate correctly but still fail in `/model` parsing. + +## Step 4: Resolve runtime data in `hermes_cli/runtime_provider.py` + +`resolve_runtime_provider()` is the shared path used by CLI, gateway, cron, ACP, and helper clients. + +Add a branch that returns a dict with at least: + +```python +{ + "provider": "your-provider", + "api_mode": "chat_completions", # or your native mode + "base_url": "https://...", + "api_key": "...", + "source": "env|portal|auth-store|explicit", + "requested_provider": requested_provider, +} +``` + +If the provider is OpenAI-compatible, `api_mode` should usually stay `chat_completions`. + +Be careful with API-key precedence. Hermes already contains logic to avoid leaking an OpenRouter key to unrelated endpoints. A new provider should be equally explicit about which key goes to which base URL. + +## Step 5: Wire the CLI in `hermes_cli/main.py` and `hermes_cli/setup.py` + +A provider is not discoverable until it shows up in the interactive flows. 
+ +Update: + +### `hermes_cli/main.py` + +- `provider_labels` +- provider dispatch inside the `model` command +- `--provider` argument choices +- login/logout choices if the provider supports those flows +- a `_model_flow_()` function, or reuse `_model_flow_api_key_provider()` if it fits + +### `hermes_cli/setup.py` + +- `provider_choices` +- auth branch for the provider +- model-selection branch +- any provider-specific explanatory text +- any place where a provider should be excluded from OpenRouter-only prompts or routing settings + +If you only update one of these files, `hermes model` and `hermes setup` will drift. + +## Step 6: Keep auxiliary calls working + +Two files matter here: + +### `agent/auxiliary_client.py` + +Add a cheap / fast default aux model to `_API_KEY_PROVIDER_AUX_MODELS` if this is a direct API-key provider. + +Auxiliary tasks include things like: + +- vision summarization +- web extraction summarization +- context compression summaries +- session-search summaries +- memory flushes + +If the provider has no sensible aux default, side tasks may fall back badly or use an expensive main model unexpectedly. + +### `agent/model_metadata.py` + +Add context lengths for the provider's models so token budgeting, compression thresholds, and limits stay sane. + +## Step 7: If the provider is native, add an adapter and `run_agent.py` support + +If the provider is not plain chat completions, isolate the provider-specific logic in `agent/_adapter.py`. + +Keep `run_agent.py` focused on orchestration. It should call adapter helpers, not hand-build provider payloads inline all over the file. 
+ +A native provider usually needs work in these places: + +### New adapter file + +Typical responsibilities: + +- build the SDK / HTTP client +- resolve tokens +- convert OpenAI-style conversation messages to the provider's request format +- convert tool schemas if needed +- normalize provider responses back into what `run_agent.py` expects +- extract usage and finish-reason data + +### `run_agent.py` + +Search for `api_mode` and audit every switch point. At minimum, verify: + +- `__init__` chooses the new `api_mode` +- client construction works for the provider +- `_build_api_kwargs()` knows how to format requests +- `_api_call_with_interrupt()` dispatches to the right client call +- interrupt / client rebuild paths work +- response validation accepts the provider's shape +- finish-reason extraction is correct +- token-usage extraction is correct +- fallback-model activation can switch into the new provider cleanly +- summary-generation and memory-flush paths still work + +Also search `run_agent.py` for `self.client.`. Any code path that assumes the standard OpenAI client exists can break when a native provider uses a different client object or `self.client = None`. + +### Prompt caching and provider-specific request fields + +Prompt caching and provider-specific knobs are easy to regress. + +Examples already in-tree: + +- Anthropic has a native prompt-caching path +- OpenRouter gets provider-routing fields +- not every provider should receive every request-side option + +When you add a native provider, double-check that Hermes is only sending fields that provider actually understands. + +## Step 8: Tests + +At minimum, touch the tests that guard provider wiring. 
+ +Common places: + +- `tests/test_runtime_provider_resolution.py` +- `tests/test_cli_provider_resolution.py` +- `tests/test_cli_model_command.py` +- `tests/test_setup_model_selection.py` +- `tests/test_provider_parity.py` +- `tests/test_run_agent.py` +- `tests/test__adapter.py` for a native provider + +For docs-only examples, the exact file set may differ. The point is to cover: + +- auth resolution +- CLI menu / provider selection +- runtime provider resolution +- agent execution path +- provider:model parsing +- any adapter-specific message conversion + +Run tests with xdist disabled: + +```bash +source .venv/bin/activate +python -m pytest tests/test_runtime_provider_resolution.py tests/test_cli_provider_resolution.py tests/test_cli_model_command.py tests/test_setup_model_selection.py -n0 -q +``` + +For deeper changes, run the full suite before pushing: + +```bash +source .venv/bin/activate +python -m pytest tests/ -n0 -q +``` + +## Step 9: Live verification + +After tests, run a real smoke test. + +```bash +source .venv/bin/activate +python -m hermes_cli.main chat -q "Say hello" --provider your-provider --model your-model +``` + +Also test the interactive flows if you changed menus: + +```bash +source .venv/bin/activate +python -m hermes_cli.main model +python -m hermes_cli.main setup +``` + +For native providers, verify at least one tool call too, not just a plain text response. + +## Step 10: Update user-facing docs + +If the provider is meant to ship as a first-class option, update the user docs too: + +- `website/docs/getting-started/quickstart.md` +- `website/docs/user-guide/configuration.md` +- `website/docs/reference/environment-variables.md` + +A developer can wire the provider perfectly and still leave users unable to discover the required env vars or setup flow. + +## OpenAI-compatible provider checklist + +Use this if the provider is standard chat completions. 
+ +- [ ] `ProviderConfig` added in `hermes_cli/auth.py` +- [ ] aliases added in `hermes_cli/auth.py` and `hermes_cli/models.py` +- [ ] model catalog added in `hermes_cli/models.py` +- [ ] runtime branch added in `hermes_cli/runtime_provider.py` +- [ ] CLI wiring added in `hermes_cli/main.py` +- [ ] setup wiring added in `hermes_cli/setup.py` +- [ ] aux model added in `agent/auxiliary_client.py` +- [ ] context lengths added in `agent/model_metadata.py` +- [ ] runtime / CLI tests updated +- [ ] user docs updated + +## Native provider checklist + +Use this when the provider needs a new protocol path. + +- [ ] everything in the OpenAI-compatible checklist +- [ ] adapter added in `agent/_adapter.py` +- [ ] new `api_mode` supported in `run_agent.py` +- [ ] interrupt / rebuild path works +- [ ] usage and finish-reason extraction works +- [ ] fallback path works +- [ ] adapter tests added +- [ ] live smoke test passes + +## Common pitfalls + +### 1. Adding the provider to auth but not to model parsing + +That makes credentials resolve correctly while `/model` and `provider:model` inputs fail. + +### 2. Forgetting that `config["model"]` can be a string or a dict + +A lot of provider-selection code has to normalize both forms. + +### 3. Assuming a built-in provider is required + +If the service is just OpenAI-compatible, a custom provider may already solve the user problem with less maintenance. + +### 4. Forgetting auxiliary paths + +The main chat path can work while summarization, memory flushes, or vision helpers fail because aux routing was never updated. + +### 5. Native-provider branches hiding in `run_agent.py` + +Search for `api_mode` and `self.client.`. Do not assume the obvious request path is the only one. + +### 6. Sending OpenRouter-only knobs to other providers + +Fields like provider routing belong only on the providers that support them. + +### 7. Updating `hermes model` but not `hermes setup` + +Both flows need to know about the provider. 
+ +## Good search targets while implementing + +If you are hunting for all the places a provider touches, search these symbols: + +- `PROVIDER_REGISTRY` +- `_PROVIDER_ALIASES` +- `_PROVIDER_MODELS` +- `resolve_runtime_provider` +- `_model_flow_` +- `provider_choices` +- `api_mode` +- `_API_KEY_PROVIDER_AUX_MODELS` +- `self.client.` + +## Related docs + +- [Provider Runtime Resolution](./provider-runtime.md) +- [Architecture](./architecture.md) +- [Contributing](./contributing.md) diff --git a/website/docs/developer-guide/architecture.md b/website/docs/developer-guide/architecture.md index 2ff148174..1fb9ff419 100644 --- a/website/docs/developer-guide/architecture.md +++ b/website/docs/developer-guide/architecture.md @@ -41,12 +41,13 @@ If you are new to the codebase, read in this order: 2. [Agent Loop Internals](./agent-loop.md) 3. [Prompt Assembly](./prompt-assembly.md) 4. [Provider Runtime Resolution](./provider-runtime.md) -5. [Tools Runtime](./tools-runtime.md) -6. [Session Storage](./session-storage.md) -7. [Gateway Internals](./gateway-internals.md) -8. [Context Compression & Prompt Caching](./context-compression-and-caching.md) -9. [ACP Internals](./acp-internals.md) -10. [Environments, Benchmarks & Data Generation](./environments.md) +5. [Adding Providers](./adding-providers.md) +6. [Tools Runtime](./tools-runtime.md) +7. [Session Storage](./session-storage.md) +8. [Gateway Internals](./gateway-internals.md) +9. [Context Compression & Prompt Caching](./context-compression-and-caching.md) +10. [ACP Internals](./acp-internals.md) +11. [Environments, Benchmarks & Data Generation](./environments.md) ## Major subsystems diff --git a/website/docs/developer-guide/contributing.md b/website/docs/developer-guide/contributing.md index f14ab9b40..5f653eae9 100644 --- a/website/docs/developer-guide/contributing.md +++ b/website/docs/developer-guide/contributing.md @@ -20,6 +20,12 @@ We value contributions in this order: 6. 
**New tools** — rarely needed; most capabilities should be skills 7. **Documentation** — fixes, clarifications, new examples +## Common contribution paths + +- Building a new tool? Start with [Adding Tools](./adding-tools.md) +- Building a new skill? Start with [Creating Skills](./creating-skills.md) +- Building a new inference provider? Start with [Adding Providers](./adding-providers.md) + ## Development Setup ### Prerequisites diff --git a/website/docs/developer-guide/provider-runtime.md b/website/docs/developer-guide/provider-runtime.md index 9bfd48c28..68fe537c4 100644 --- a/website/docs/developer-guide/provider-runtime.md +++ b/website/docs/developer-guide/provider-runtime.md @@ -20,6 +20,8 @@ Primary implementation: - `hermes_cli/auth.py` - `agent/auxiliary_client.py` +If you are trying to add a new first-class inference provider, read [Adding Providers](./adding-providers.md) alongside this page. + ## Resolution precedence At a high level, provider resolution uses: diff --git a/website/sidebars.ts b/website/sidebars.ts index 828b4472f..94a28aac8 100644 --- a/website/sidebars.ts +++ b/website/sidebars.ts @@ -109,6 +109,7 @@ const sidebars: SidebarsConfig = { 'developer-guide/architecture', 'developer-guide/agent-loop', 'developer-guide/provider-runtime', + 'developer-guide/adding-providers', 'developer-guide/prompt-assembly', 'developer-guide/context-compression-and-caching', 'developer-guide/gateway-internals', From e099117a3be9cdbd65e9fb930db0109da4e2efcc Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 19:29:01 -0700 Subject: [PATCH 34/34] docs: complete voice mode docs --- website/docs/getting-started/installation.md | 1 + website/docs/getting-started/quickstart.md | 19 +++++++++ .../docs/reference/environment-variables.md | 9 ++++- website/docs/reference/slash-commands.md | 5 ++- website/docs/user-guide/cli.md | 5 +++ website/docs/user-guide/configuration.md | 39 ++++++++++++++++++- .../docs/user-guide/features/voice-mode.md | 2 +- 
website/docs/user-guide/messaging/discord.md | 2 +- website/docs/user-guide/messaging/slack.md | 2 +- website/docs/user-guide/messaging/telegram.md | 8 +++- website/docs/user-guide/messaging/whatsapp.md | 2 +- website/sidebars.ts | 1 + 12 files changed, 84 insertions(+), 11 deletions(-) diff --git a/website/docs/getting-started/installation.md b/website/docs/getting-started/installation.md index e273f6da2..a43d7370b 100644 --- a/website/docs/getting-started/installation.md +++ b/website/docs/getting-started/installation.md @@ -119,6 +119,7 @@ uv pip install -e "." | `cli` | Terminal menu UI for setup wizard | `uv pip install -e ".[cli]"` | | `modal` | Modal cloud execution backend | `uv pip install -e ".[modal]"` | | `tts-premium` | ElevenLabs premium voices | `uv pip install -e ".[tts-premium]"` | +| `voice` | CLI microphone input + audio playback | `uv pip install -e ".[voice]"` | | `pty` | PTY terminal support | `uv pip install -e ".[pty]"` | | `honcho` | AI-native memory (Honcho integration) | `uv pip install -e ".[honcho]"` | | `mcp` | Model Context Protocol support | `uv pip install -e ".[mcp]"` | diff --git a/website/docs/getting-started/quickstart.md b/website/docs/getting-started/quickstart.md index e743baf6a..7fed47a21 100644 --- a/website/docs/getting-started/quickstart.md +++ b/website/docs/getting-started/quickstart.md @@ -129,6 +129,25 @@ Chat with Hermes from your phone or other surfaces via Telegram, Discord, Slack, hermes gateway setup # Interactive platform configuration ``` +### Add voice mode + +Want microphone input in the CLI or spoken replies in messaging? + +```bash +pip install hermes-agent[voice] + +# Optional but recommended for free local speech-to-text +pip install faster-whisper +``` + +Then start Hermes and enable it inside the CLI: + +```text +/voice on +``` + +Press `Ctrl+B` to record, or use `/voice tts` to have Hermes speak its replies. 
See [Voice Mode](../user-guide/features/voice-mode.md) for the full setup across CLI, Telegram, Discord, and Discord voice channels. + ### Schedule automated tasks ``` diff --git a/website/docs/reference/environment-variables.md b/website/docs/reference/environment-variables.md index f179437a2..6fcc96a2b 100644 --- a/website/docs/reference/environment-variables.md +++ b/website/docs/reference/environment-variables.md @@ -31,7 +31,7 @@ All variables go in `~/.hermes/.env`. You can also set them with `hermes config | `CLAUDE_CODE_OAUTH_TOKEN` | Claude Code setup-token (same as `ANTHROPIC_TOKEN`) | | `HERMES_MODEL` | Preferred model name (checked before `LLM_MODEL`, used by gateway) | | `LLM_MODEL` | Default model name (fallback when not set in config.yaml) | -| `VOICE_TOOLS_OPENAI_KEY` | OpenAI key for TTS and voice transcription (separate from custom endpoint) | +| `VOICE_TOOLS_OPENAI_KEY` | OpenAI key for OpenAI speech-to-text and text-to-speech providers | | `HERMES_HOME` | Override Hermes config directory (default: `~/.hermes`) | ## Provider Auth (OAuth) @@ -57,7 +57,12 @@ All variables go in `~/.hermes/.env`. 
You can also set them with `hermes config | `BROWSERBASE_PROJECT_ID` | Browserbase project ID | | `BROWSER_INACTIVITY_TIMEOUT` | Browser session inactivity timeout in seconds | | `FAL_KEY` | Image generation ([fal.ai](https://fal.ai/)) | -| `ELEVENLABS_API_KEY` | Premium TTS voices ([elevenlabs.io](https://elevenlabs.io/)) | +| `GROQ_API_KEY` | Groq Whisper STT API key ([groq.com](https://groq.com/)) | +| `ELEVENLABS_API_KEY` | ElevenLabs premium TTS voices ([elevenlabs.io](https://elevenlabs.io/)) | +| `STT_GROQ_MODEL` | Override the Groq STT model (default: `whisper-large-v3-turbo`) | +| `GROQ_BASE_URL` | Override the Groq OpenAI-compatible STT endpoint | +| `STT_OPENAI_MODEL` | Override the OpenAI STT model (default: `whisper-1`) | +| `STT_OPENAI_BASE_URL` | Override the OpenAI-compatible STT endpoint | | `HONCHO_API_KEY` | Cross-session user modeling ([honcho.dev](https://honcho.dev/)) | | `TINKER_API_KEY` | RL training ([tinker-console.thinkingmachines.ai](https://tinker-console.thinkingmachines.ai/)) | | `WANDB_API_KEY` | RL training metrics ([wandb.ai](https://wandb.ai/)) | diff --git a/website/docs/reference/slash-commands.md b/website/docs/reference/slash-commands.md index b67578261..302e9e518 100644 --- a/website/docs/reference/slash-commands.md +++ b/website/docs/reference/slash-commands.md @@ -45,6 +45,7 @@ Type `/` in the CLI to open the autocomplete menu. Built-in commands are case-in | `/verbose` | Cycle tool progress display: off → new → all → verbose | | `/reasoning` | Manage reasoning effort and display (usage: /reasoning [level\|show\|hide]) | | `/skin` | Show or change the display skin/theme | +| `/voice [on\|off\|tts\|status]` | Toggle CLI voice mode and spoken playback. Recording uses `voice.record_key` (default: `Ctrl+B`). | ### Tools & Skills @@ -105,6 +106,7 @@ The messaging gateway supports the following built-in commands inside Telegram, | `/usage` | Show token usage for the current session. | | `/insights [days]` | Show usage analytics. 
| | `/reasoning [level\|show\|hide]` | Change reasoning effort or toggle reasoning display. | +| `/voice [on\|off\|tts\|join\|channel\|leave\|status]` | Control spoken replies in chat. `join`/`channel`/`leave` manage Discord voice-channel mode. | | `/rollback [number]` | List or restore filesystem checkpoints. | | `/background <prompt>` | Run a prompt in a separate background session. | | `/reload-mcp` | Reload MCP servers from config. | @@ -116,4 +118,5 @@ The messaging gateway supports the following built-in commands inside Telegram, - `/skin`, `/tools`, `/toolsets`, `/config`, `/prompt`, `/cron`, `/skills`, `/platforms`, `/paste`, and `/verbose` are **CLI-only** commands. - `/status`, `/stop`, `/sethome`, `/resume`, `/background`, and `/update` are **messaging-only** commands. -- `/reload-mcp` and `/rollback` work in **both** the CLI and the messaging gateway. \ No newline at end of file +- `/voice`, `/reload-mcp`, and `/rollback` work in **both** the CLI and the messaging gateway. +- `/voice join`, `/voice channel`, and `/voice leave` are only meaningful on Discord. 
diff --git a/website/docs/user-guide/cli.md b/website/docs/user-guide/cli.md index 6c8d558da..fb3c83837 100644 --- a/website/docs/user-guide/cli.md +++ b/website/docs/user-guide/cli.md @@ -77,6 +77,7 @@ When resuming a previous session (`hermes -c` or `hermes --resume `), a "Pre | `Alt+Enter` or `Ctrl+J` | New line (multi-line input) | | `Alt+V` | Paste an image from the clipboard when supported by the terminal | | `Ctrl+V` | Paste text and opportunistically attach clipboard images | +| `Ctrl+B` | Start/stop voice recording when voice mode is enabled (`voice.record_key`, default: `ctrl+b`) | | `Ctrl+C` | Interrupt agent (double-press within 2s to force exit) | | `Ctrl+D` | Exit | | `Tab` | Autocomplete slash commands | @@ -95,11 +96,15 @@ Common examples: | `/skills browse` | Browse the skills hub and official optional skills | | `/background ` | Run a prompt in a separate background session | | `/skin` | Show or switch the active CLI skin | +| `/voice on` | Enable CLI voice mode (press `Ctrl+B` to record) | +| `/voice tts` | Toggle spoken playback for Hermes replies | | `/reasoning high` | Increase reasoning effort | | `/title My Session` | Name the current session | For the full built-in CLI and messaging lists, see [Slash Commands Reference](../reference/slash-commands.md). +For setup, providers, silence tuning, and messaging/Discord voice usage, see [Voice Mode](features/voice-mode.md). + :::tip Commands are case-insensitive — `/HELP` works the same as `/help`. Installed skills also become slash commands automatically. ::: diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index 13da3fe4e..4615ff06a 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -695,6 +695,8 @@ tts: voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer ``` +This controls both the `text_to_speech` tool and spoken replies in voice mode (`/voice tts` in the CLI or messaging gateway). 
+ ## Display Settings ```yaml @@ -719,10 +721,43 @@ display: ```yaml stt: - provider: "openai" # STT provider + provider: "local" # "local" | "groq" | "openai" + local: + model: "base" # tiny, base, small, medium, large-v3 + openai: + model: "whisper-1" # whisper-1 | gpt-4o-mini-transcribe | gpt-4o-transcribe + # model: "whisper-1" # Legacy fallback key still respected ``` -Requires `VOICE_TOOLS_OPENAI_KEY` in `.env` for OpenAI STT. +Provider behavior: + +- `local` uses `faster-whisper` running on your machine. Install it separately with `pip install faster-whisper`. +- `groq` uses Groq's Whisper-compatible endpoint and reads `GROQ_API_KEY`. +- `openai` uses the OpenAI speech API and reads `VOICE_TOOLS_OPENAI_KEY`. + +If the requested provider is unavailable, Hermes falls back automatically in this order: `local` → `groq` → `openai`. + +Groq and OpenAI model overrides are environment-driven: + +```bash +STT_GROQ_MODEL=whisper-large-v3-turbo +STT_OPENAI_MODEL=whisper-1 +GROQ_BASE_URL=https://api.groq.com/openai/v1 +STT_OPENAI_BASE_URL=https://api.openai.com/v1 +``` + +## Voice Mode (CLI) + +```yaml +voice: + record_key: "ctrl+b" # Push-to-talk key inside the CLI + max_recording_seconds: 120 # Hard stop for long recordings + auto_tts: false # Enable spoken replies automatically when /voice on + silence_threshold: 200 # RMS threshold for speech detection + silence_duration: 3.0 # Seconds of silence before auto-stop +``` + +Use `/voice on` in the CLI to enable microphone mode, `record_key` to start/stop recording, and `/voice tts` to toggle spoken replies. See [Voice Mode](/docs/user-guide/features/voice-mode) for end-to-end setup and platform-specific behavior. 
## Quick Commands diff --git a/website/docs/user-guide/features/voice-mode.md b/website/docs/user-guide/features/voice-mode.md index 3c94062f7..3dfe0db46 100644 --- a/website/docs/user-guide/features/voice-mode.md +++ b/website/docs/user-guide/features/voice-mode.md @@ -15,7 +15,7 @@ If you want a practical setup walkthrough with recommended configurations and re Before using voice features, make sure you have: 1. **Hermes Agent installed** — `pip install hermes-agent` (see [Installation](/docs/getting-started/installation)) -2. **An LLM provider configured** — set `OPENAI_API_KEY`, `OPENAI_BASE_URL`, and `LLM_MODEL` in `~/.hermes/.env` +2. **An LLM provider configured** — run `hermes model` or set your preferred provider credentials in `~/.hermes/.env` 3. **A working base setup** — run `hermes` to verify the agent responds to text before enabling voice :::tip diff --git a/website/docs/user-guide/messaging/discord.md b/website/docs/user-guide/messaging/discord.md index b5f060596..2fd9a3a1d 100644 --- a/website/docs/user-guide/messaging/discord.md +++ b/website/docs/user-guide/messaging/discord.md @@ -210,7 +210,7 @@ Replace the ID with the actual channel ID (right-click → Copy Channel ID with Hermes Agent supports Discord voice messages: -- **Incoming voice messages** are automatically transcribed using Whisper (requires `GROQ_API_KEY` or `VOICE_TOOLS_OPENAI_KEY` to be set in your environment). +- **Incoming voice messages** are automatically transcribed using the configured STT provider: local `faster-whisper` (no key), Groq Whisper (`GROQ_API_KEY`), or OpenAI Whisper (`VOICE_TOOLS_OPENAI_KEY`). - **Text-to-speech**: Use `/voice tts` to have the bot send spoken audio responses alongside text replies. - **Discord voice channels**: Hermes can also join a voice channel, listen to users speaking, and talk back in the channel. 
diff --git a/website/docs/user-guide/messaging/slack.md b/website/docs/user-guide/messaging/slack.md index 5ba6c7dde..2ff79f351 100644 --- a/website/docs/user-guide/messaging/slack.md +++ b/website/docs/user-guide/messaging/slack.md @@ -224,7 +224,7 @@ Make sure the bot has been **invited to the channel** (`/invite @Hermes Agent`). Hermes supports voice on Slack: -- **Incoming:** Voice/audio messages are automatically transcribed using Whisper (requires `VOICE_TOOLS_OPENAI_KEY`) +- **Incoming:** Voice/audio messages are automatically transcribed using the configured STT provider: local `faster-whisper`, Groq Whisper (`GROQ_API_KEY`), or OpenAI Whisper (`VOICE_TOOLS_OPENAI_KEY`) - **Outgoing:** TTS responses are sent as audio file attachments --- diff --git a/website/docs/user-guide/messaging/telegram.md b/website/docs/user-guide/messaging/telegram.md index 123b81397..179f46b6e 100644 --- a/website/docs/user-guide/messaging/telegram.md +++ b/website/docs/user-guide/messaging/telegram.md @@ -131,7 +131,11 @@ Group chat IDs are negative numbers (e.g., `-1001234567890`). Your personal DM c ### Incoming Voice (Speech-to-Text) -Voice messages you send on Telegram are automatically transcribed using OpenAI's Whisper API and injected as text into the conversation. This requires `VOICE_TOOLS_OPENAI_KEY` in `~/.hermes/.env`. +Voice messages you send on Telegram are automatically transcribed by Hermes's configured STT provider and injected as text into the conversation. + +- `local` uses `faster-whisper` on the machine running Hermes — no API key required +- `groq` uses Groq Whisper and requires `GROQ_API_KEY` +- `openai` uses OpenAI Whisper and requires `VOICE_TOOLS_OPENAI_KEY` ### Outgoing Voice (Text-to-Speech) @@ -173,7 +177,7 @@ Hermes Agent works in Telegram group chats with a few considerations: | Bot not responding at all | Verify `TELEGRAM_BOT_TOKEN` is correct. Check `hermes gateway` logs for errors. 
| | Bot responds with "unauthorized" | Your user ID is not in `TELEGRAM_ALLOWED_USERS`. Double-check with @userinfobot. | | Bot ignores group messages | Privacy mode is likely on. Disable it (Step 3) or make the bot a group admin. **Remember to remove and re-add the bot after changing privacy.** | -| Voice messages not transcribed | Check that `VOICE_TOOLS_OPENAI_KEY` is set and valid in `~/.hermes/.env`. | +| Voice messages not transcribed | Verify STT is available: install `faster-whisper` for local transcription, or set `GROQ_API_KEY` / `VOICE_TOOLS_OPENAI_KEY` in `~/.hermes/.env`. | | Voice replies are files, not bubbles | Install `ffmpeg` (needed for Edge TTS Opus conversion). | | Bot token revoked/invalid | Generate a new token via `/revoke` then `/newbot` or `/token` in BotFather. Update your `.env` file. | diff --git a/website/docs/user-guide/messaging/whatsapp.md b/website/docs/user-guide/messaging/whatsapp.md index 8bdf28dd5..af432fb85 100644 --- a/website/docs/user-guide/messaging/whatsapp.md +++ b/website/docs/user-guide/messaging/whatsapp.md @@ -137,7 +137,7 @@ with reconnection logic. 
Hermes supports voice on WhatsApp: -- **Incoming:** Voice messages (`.ogg` opus) are automatically transcribed using Whisper (requires `VOICE_TOOLS_OPENAI_KEY`) +- **Incoming:** Voice messages (`.ogg` opus) are automatically transcribed using the configured STT provider: local `faster-whisper`, Groq Whisper (`GROQ_API_KEY`), or OpenAI Whisper (`VOICE_TOOLS_OPENAI_KEY`) - **Outgoing:** TTS responses are sent as MP3 audio file attachments - Agent responses are prefixed with "⚕ **Hermes Agent**" for easy identification diff --git a/website/sidebars.ts b/website/sidebars.ts index 828b4472f..9c3257769 100644 --- a/website/sidebars.ts +++ b/website/sidebars.ts @@ -76,6 +76,7 @@ const sidebars: SidebarsConfig = { type: 'category', label: 'Web & Media', items: [ + 'user-guide/features/voice-mode', 'user-guide/features/browser', 'user-guide/features/vision', 'user-guide/features/image-generation',