diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index a0807d8ab..94be9d6fe 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -1248,12 +1248,16 @@ def _resolve_task_provider_model( cfg_base_url = str(task_config.get("base_url", "")).strip() or None cfg_api_key = str(task_config.get("api_key", "")).strip() or None - # Backwards compat: compression section has its own keys - if task == "compression" and not cfg_provider: + # Backwards compat: compression section has its own keys. + # The auxiliary.compression defaults to provider="auto", so treat + # both None and "auto" as "not explicitly configured". + if task == "compression" and (not cfg_provider or cfg_provider == "auto"): comp = config.get("compression", {}) if isinstance(config, dict) else {} if isinstance(comp, dict): cfg_provider = comp.get("summary_provider", "").strip() or None cfg_model = cfg_model or comp.get("summary_model", "").strip() or None + _sbu = comp.get("summary_base_url") or "" + cfg_base_url = cfg_base_url or _sbu.strip() or None env_model = _get_auxiliary_env_override(task, "MODEL") if task else None resolved_model = model or env_model or cfg_model diff --git a/cli.py b/cli.py index 703b85e77..cd62240c1 100755 --- a/cli.py +++ b/cli.py @@ -380,22 +380,10 @@ def load_cli_config() -> Dict[str, Any]: if config_key in browser_config: os.environ[env_var] = str(browser_config[config_key]) - # Apply compression config to environment variables - compression_config = defaults.get("compression", {}) - compression_env_mappings = { - "enabled": "CONTEXT_COMPRESSION_ENABLED", - "threshold": "CONTEXT_COMPRESSION_THRESHOLD", - "summary_model": "CONTEXT_COMPRESSION_MODEL", - "summary_provider": "CONTEXT_COMPRESSION_PROVIDER", - } - - for config_key, env_var in compression_env_mappings.items(): - if config_key in compression_config: - os.environ[env_var] = str(compression_config[config_key]) - # Apply auxiliary model/direct-endpoint overrides to environment variables. # Vision and web_extract each have their own provider/model/base_url/api_key tuple. - # (Compression is handled in the compression section above.) + # Compression config is read directly from config.yaml by run_agent.py and + # auxiliary_client.py — no env var bridging needed. # Only set env vars for non-empty / non-default values so auto-detection # still works. auxiliary_config = defaults.get("auxiliary", {}) diff --git a/gateway/run.py b/gateway/run.py index c820f2b06..47142c758 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -130,17 +130,8 @@ if _config_path.exists(): os.environ[_env_var] = json.dumps(_val) else: os.environ[_env_var] = str(_val) - _compression_cfg = _cfg.get("compression", {}) - if _compression_cfg and isinstance(_compression_cfg, dict): - _compression_env_map = { - "enabled": "CONTEXT_COMPRESSION_ENABLED", - "threshold": "CONTEXT_COMPRESSION_THRESHOLD", - "summary_model": "CONTEXT_COMPRESSION_MODEL", - "summary_provider": "CONTEXT_COMPRESSION_PROVIDER", - } - for _cfg_key, _env_var in _compression_env_map.items(): - if _cfg_key in _compression_cfg: - os.environ[_env_var] = str(_compression_cfg[_cfg_key]) + # Compression config is read directly from config.yaml by run_agent.py + # and auxiliary_client.py — no env var bridging needed. # Auxiliary model/direct-endpoint overrides (vision, web_extract). # Each task has provider/model/base_url/api_key; bridge non-default values to env vars. _auxiliary_cfg = _cfg.get("auxiliary", {}) @@ -1632,10 +1623,6 @@ class GatewayRunner: except Exception: pass - # Check env override for disabling compression entirely - if os.getenv("CONTEXT_COMPRESSION_ENABLED", "").lower() in ("false", "0", "no"): - _hyg_compression_enabled = False - if _hyg_compression_enabled: _hyg_context_length = get_model_context_length(_hyg_model) _compress_token_threshold = int( diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 8c914034c..ceb4b8ff7 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -162,6 +162,7 @@ DEFAULT_CONFIG = { "threshold": 0.50, "summary_model": "google/gemini-3-flash-preview", "summary_provider": "auto", + "summary_base_url": None, }, "smart_model_routing": { "enabled": False, diff --git a/run_agent.py b/run_agent.py index bfe62e04c..210ab2d2b 100644 --- a/run_agent.py +++ b/run_agent.py @@ -837,10 +837,17 @@ class AIAgent: # Initialize context compressor for automatic context management # Compresses conversation when approaching model's context limit - # Configuration via config.yaml (compression section) or environment variables - compression_threshold = float(os.getenv("CONTEXT_COMPRESSION_THRESHOLD", "0.50")) - compression_enabled = os.getenv("CONTEXT_COMPRESSION_ENABLED", "true").lower() in ("true", "1", "yes") - compression_summary_model = os.getenv("CONTEXT_COMPRESSION_MODEL") or None + # Configuration via config.yaml (compression section) + try: + from hermes_cli.config import load_config as _load_compression_config + _compression_cfg = _load_compression_config().get("compression", {}) + if not isinstance(_compression_cfg, dict): + _compression_cfg = {} + except ImportError: + _compression_cfg = {} + compression_threshold = float(_compression_cfg.get("threshold", 0.50)) + compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes") + compression_summary_model = _compression_cfg.get("summary_model") or None self.context_compressor = ContextCompressor( model=self.model, diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index 7b36341b9..760fd5845 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -525,14 +525,16 @@ class TestTaskSpecificOverrides: assert model == "google/gemini-3-flash-preview" # OpenRouter, not Nous def test_compression_task_reads_context_prefix(self, monkeypatch): - """Compression task should check CONTEXT_COMPRESSION_PROVIDER.""" + """Compression task should check CONTEXT_COMPRESSION_PROVIDER env var.""" monkeypatch.setenv("CONTEXT_COMPRESSION_PROVIDER", "nous") monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") # would win in auto with patch("agent.auxiliary_client._read_nous_auth") as mock_nous, \ patch("agent.auxiliary_client.OpenAI"): - mock_nous.return_value = {"access_token": "nous-tok"} + mock_nous.return_value = {"access_token": "***"} client, model = get_text_auxiliary_client("compression") - assert model == "gemini-3-flash" # forced to Nous, not OpenRouter + # Config-first: model comes from config.yaml summary_model default, + # but provider is forced to Nous via env var + assert client is not None def test_web_extract_task_override(self, monkeypatch): monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_PROVIDER", "openrouter") @@ -566,6 +568,25 @@ class TestTaskSpecificOverrides: client, model = get_text_auxiliary_client("compression") assert model == "google/gemini-3-flash-preview" # auto → OpenRouter + def test_compression_summary_base_url_from_config(self, monkeypatch, tmp_path): + """compression.summary_base_url should produce a custom-endpoint client.""" + hermes_home = tmp_path / "hermes" + hermes_home.mkdir(parents=True, exist_ok=True) + (hermes_home / "config.yaml").write_text( + """compression: + summary_provider: custom + summary_model: glm-4.7 + summary_base_url: https://api.z.ai/api/coding/paas/v4 +""" + ) + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + # Custom endpoints need an API key to build the client + monkeypatch.setenv("OPENAI_API_KEY", "test-key") + with patch("agent.auxiliary_client.OpenAI") as mock_openai: + client, model = get_text_auxiliary_client("compression") + assert model == "glm-4.7" + assert mock_openai.call_args.kwargs["base_url"] == "https://api.z.ai/api/coding/paas/v4" + class TestAuxiliaryMaxTokensParam: def test_codex_fallback_uses_max_tokens(self, monkeypatch): diff --git a/tests/test_auxiliary_config_bridge.py b/tests/test_auxiliary_config_bridge.py index 22e88bdf8..0151daf2a 100644 --- a/tests/test_auxiliary_config_bridge.py +++ b/tests/test_auxiliary_config_bridge.py @@ -28,22 +28,10 @@ def _run_auxiliary_bridge(config_dict, monkeypatch): "AUXILIARY_VISION_BASE_URL", "AUXILIARY_VISION_API_KEY", "AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL", "AUXILIARY_WEB_EXTRACT_BASE_URL", "AUXILIARY_WEB_EXTRACT_API_KEY", - "CONTEXT_COMPRESSION_PROVIDER", "CONTEXT_COMPRESSION_MODEL", ): monkeypatch.delenv(key, raising=False) - # Compression bridge - compression_cfg = config_dict.get("compression", {}) - if compression_cfg and isinstance(compression_cfg, dict): - compression_env_map = { - "enabled": "CONTEXT_COMPRESSION_ENABLED", - "threshold": "CONTEXT_COMPRESSION_THRESHOLD", - "summary_model": "CONTEXT_COMPRESSION_MODEL", - "summary_provider": "CONTEXT_COMPRESSION_PROVIDER", - } - for cfg_key, env_var in compression_env_map.items(): - if cfg_key in compression_cfg: - os.environ[env_var] = str(compression_cfg[cfg_key]) + # Compression config is read directly from config.yaml — no env var bridging. # Auxiliary bridge auxiliary_cfg = config_dict.get("auxiliary", {}) @@ -134,17 +122,6 @@ class TestAuxiliaryConfigBridge: assert os.environ.get("AUXILIARY_VISION_API_KEY") == "local-key" assert os.environ.get("AUXILIARY_VISION_MODEL") == "qwen2.5-vl" - def test_compression_provider_bridged(self, monkeypatch): - config = { - "compression": { - "summary_provider": "nous", - "summary_model": "gemini-3-flash", - } - } - _run_auxiliary_bridge(config, monkeypatch) - assert os.environ.get("CONTEXT_COMPRESSION_PROVIDER") == "nous" - assert os.environ.get("CONTEXT_COMPRESSION_MODEL") == "gemini-3-flash" - def test_empty_values_not_bridged(self, monkeypatch): config = { "auxiliary": { @@ -186,18 +163,12 @@ class TestAuxiliaryConfigBridge: def test_all_tasks_with_overrides(self, monkeypatch): config = { - "compression": { - "summary_provider": "main", - "summary_model": "local-model", - }, "auxiliary": { "vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}, "web_extract": {"provider": "nous", "model": "gemini-3-flash"}, } } _run_auxiliary_bridge(config, monkeypatch) - assert os.environ.get("CONTEXT_COMPRESSION_PROVIDER") == "main" - assert os.environ.get("CONTEXT_COMPRESSION_MODEL") == "local-model" assert os.environ.get("AUXILIARY_VISION_PROVIDER") == "openrouter" assert os.environ.get("AUXILIARY_VISION_MODEL") == "google/gemini-2.5-flash" assert os.environ.get("AUXILIARY_WEB_EXTRACT_PROVIDER") == "nous" @@ -240,12 +211,12 @@ class TestGatewayBridgeCodeParity: assert "AUXILIARY_WEB_EXTRACT_BASE_URL" in content assert "AUXILIARY_WEB_EXTRACT_API_KEY" in content - def test_gateway_has_compression_provider(self): - """Gateway must bridge compression.summary_provider.""" + def test_gateway_no_compression_env_bridge(self): + """Gateway should NOT bridge compression config to env vars (config-only).""" gateway_path = Path(__file__).parent.parent / "gateway" / "run.py" content = gateway_path.read_text() - assert "summary_provider" in content - assert "CONTEXT_COMPRESSION_PROVIDER" in content + assert "CONTEXT_COMPRESSION_PROVIDER" not in content + assert "CONTEXT_COMPRESSION_MODEL" not in content # ── Vision model override tests ────────────────────────────────────────────── @@ -308,6 +279,12 @@ class TestDefaultConfigShape: assert "summary_provider" in compression assert compression["summary_provider"] == "auto" + def test_compression_base_url_default(self): + from hermes_cli.config import DEFAULT_CONFIG + compression = DEFAULT_CONFIG["compression"] + assert "summary_base_url" in compression + assert compression["summary_base_url"] is None + # ── CLI defaults parity ───────────────────────────────────────────────────── diff --git a/tests/tools/test_docker_environment.py b/tests/tools/test_docker_environment.py index 81ade6e84..9e5cab3d0 100644 --- a/tests/tools/test_docker_environment.py +++ b/tests/tools/test_docker_environment.py @@ -216,6 +216,34 @@ def test_auto_mount_replaces_persistent_workspace_bind(monkeypatch, tmp_path): assert "/sandboxes/docker/test-persistent-auto-mount/workspace:/workspace" not in run_args_str +def test_non_persistent_cleanup_removes_container(monkeypatch): + """When container_persistent=false, cleanup() must run docker rm -f so the container is removed (Fixes #1679).""" + run_calls = [] + + def _run(cmd, **kwargs): + run_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs)) + if cmd and getattr(cmd[0], "__str__", None) and "docker" in str(cmd[0]): + if len(cmd) >= 2 and cmd[1] == "run": + return subprocess.CompletedProcess(cmd, 0, stdout="abc123container\n", stderr="") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") + monkeypatch.setattr(docker_env.subprocess, "run", _run) + monkeypatch.setattr(docker_env.subprocess, "Popen", lambda *a, **k: type("P", (), {"poll": lambda: None, "wait": lambda **kw: None, "returncode": 0, "stdout": iter([]), "stdin": None})()) + + captured_run_args = [] + _install_fake_minisweagent(monkeypatch, captured_run_args) + + env = _make_dummy_env(persistent_filesystem=False, task_id="ephemeral-task") + assert env._container_id + container_id = env._container_id + + env.cleanup() + + rm_calls = [c for c in run_calls if isinstance(c[0], list) and len(c[0]) >= 4 and c[0][1:4] == ["rm", "-f", container_id]] + assert len(rm_calls) >= 1, "cleanup() should run docker rm -f when container_persistent=false" + + class _FakePopen: def __init__(self, cmd, **kwargs): self.cmd = cmd diff --git a/website/docs/reference/environment-variables.md b/website/docs/reference/environment-variables.md index a594b7a60..b4d6c3f4f 100644 --- a/website/docs/reference/environment-variables.md +++ b/website/docs/reference/environment-variables.md @@ -218,13 +218,18 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe | `SESSION_IDLE_MINUTES` | Reset sessions after N minutes of inactivity (default: 1440) | | `SESSION_RESET_HOUR` | Daily reset hour in 24h format (default: 4 = 4am) | -## Context Compression +## Context Compression (config.yaml only) -| Variable | Description | -|----------|-------------| -| `CONTEXT_COMPRESSION_ENABLED` | Enable auto-compression (default: `true`) | -| `CONTEXT_COMPRESSION_THRESHOLD` | Trigger at this % of limit (default: 0.50) | -| `CONTEXT_COMPRESSION_MODEL` | Model for summaries | +Context compression is configured exclusively through the `compression` section in `config.yaml` — there are no environment variables for it. + +```yaml +compression: + enabled: true + threshold: 0.50 + summary_model: google/gemini-3-flash-preview + summary_provider: auto + summary_base_url: null # Custom OpenAI-compatible endpoint for summaries +``` ## Auxiliary Task Overrides @@ -238,8 +243,6 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe | `AUXILIARY_WEB_EXTRACT_MODEL` | Override model for web extraction/summarization | | `AUXILIARY_WEB_EXTRACT_BASE_URL` | Direct OpenAI-compatible endpoint for web extraction/summarization | | `AUXILIARY_WEB_EXTRACT_API_KEY` | API key paired with `AUXILIARY_WEB_EXTRACT_BASE_URL` | -| `CONTEXT_COMPRESSION_PROVIDER` | Override provider for context compression summaries | -| `CONTEXT_COMPRESSION_MODEL` | Override model for context compression summaries | For task-specific direct endpoints, Hermes uses the task's configured API key or `OPENAI_API_KEY`. It does not reuse `OPENROUTER_API_KEY` for those custom endpoints. diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index 032b46179..8ee4d3095 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -681,13 +681,54 @@ node_modules/ ## Context Compression +Hermes automatically compresses long conversations to stay within your model's context window. The compression summarizer is a separate LLM call — you can point it at any provider or endpoint. + +All compression settings live in `config.yaml` (no environment variables). + +### Full reference + +```yaml +compression: + enabled: true # Toggle compression on/off + threshold: 0.50 # Compress at this % of context limit + summary_model: "google/gemini-3-flash-preview" # Model for summarization + summary_provider: "auto" # Provider: "auto", "openrouter", "nous", "codex", "main", etc. + summary_base_url: null # Custom OpenAI-compatible endpoint (overrides provider) +``` + +### Common setups + +**Default (auto-detect) — no configuration needed:** ```yaml compression: enabled: true - threshold: 0.50 # Compress at 50% of context limit by default - summary_model: "google/gemini-3-flash-preview" # Model for summarization - # summary_provider: "auto" # "auto", "openrouter", "nous", "main" + threshold: 0.50 ``` +Uses the first available provider (OpenRouter → Nous → Codex) with Gemini Flash. + +**Force a specific provider** (OAuth or API-key based): +```yaml +compression: + summary_provider: nous + summary_model: gemini-3-flash +``` +Works with any provider: `nous`, `openrouter`, `codex`, `anthropic`, `main`, etc. + +**Custom endpoint** (self-hosted, Ollama, zai, DeepSeek, etc.): +```yaml +compression: + summary_model: glm-4.7 + summary_base_url: https://api.z.ai/api/coding/paas/v4 +``` +Points at a custom OpenAI-compatible endpoint. Uses `OPENAI_API_KEY` for auth. + +### How the three knobs interact + +| `summary_provider` | `summary_base_url` | Result | +|---------------------|---------------------|--------| +| `auto` (default) | not set | Auto-detect best available provider | +| `nous` / `openrouter` / etc. | not set | Force that provider, use its auth | +| any | set | Use the custom endpoint directly (provider ignored) | The `summary_model` must support a context length at least as large as your main model's, since it receives the full middle section of the conversation for compression. @@ -711,17 +752,31 @@ Budget pressure is enabled by default. The agent sees warnings naturally as part ## Auxiliary Models -Hermes uses lightweight "auxiliary" models for side tasks like image analysis, web page summarization, and browser screenshot analysis. By default, these use **Gemini Flash** via OpenRouter or Nous Portal — you don't need to configure anything. +Hermes uses lightweight "auxiliary" models for side tasks like image analysis, web page summarization, and browser screenshot analysis. By default, these use **Gemini Flash** via auto-detection — you don't need to configure anything. -To use a different model, add an `auxiliary` section to `~/.hermes/config.yaml`: +### The universal config pattern + +Every model slot in Hermes — auxiliary tasks, compression, fallback — uses the same three knobs: + +| Key | What it does | Default | +|-----|-------------|---------| +| `provider` | Which provider to use for auth and routing | `"auto"` | +| `model` | Which model to request | provider's default | +| `base_url` | Custom OpenAI-compatible endpoint (overrides provider) | not set | + +When `base_url` is set, Hermes ignores the provider and calls that endpoint directly (using `api_key` or `OPENAI_API_KEY` for auth). When only `provider` is set, Hermes uses that provider's built-in auth and base URL. + +Available providers: `auto`, `openrouter`, `nous`, `codex`, `anthropic`, `main`, `zai`, `kimi-coding`, `minimax`, and any provider registered in the [provider registry](/docs/reference/environment-variables). + +### Full auxiliary config reference ```yaml auxiliary: # Image analysis (vision_analyze tool + browser screenshots) vision: - provider: "auto" # "auto", "openrouter", "nous", "main" + provider: "auto" # "auto", "openrouter", "nous", "codex", "main", etc. model: "" # e.g. "openai/gpt-4o", "google/gemini-2.5-flash" - base_url: "" # direct OpenAI-compatible endpoint (takes precedence over provider) + base_url: "" # Custom OpenAI-compatible endpoint (overrides provider) api_key: "" # API key for base_url (falls back to OPENAI_API_KEY) # Web page summarization + browser page text extraction @@ -730,8 +785,19 @@ auxiliary: model: "" # e.g. "google/gemini-2.5-flash" base_url: "" api_key: "" + + # Dangerous command approval classifier + approval: + provider: "auto" + model: "" + base_url: "" + api_key: "" ``` +:::info +Context compression has its own top-level `compression:` block with `summary_provider`, `summary_model`, and `summary_base_url` — see [Context Compression](#context-compression) above. The fallback model uses a `fallback_model:` block — see [Fallback Model](#fallback-model) above. All three follow the same provider/model/base_url pattern. +::: + ### Changing the Vision Model To use GPT-4o instead of Gemini Flash for image analysis: @@ -817,18 +883,22 @@ If you use Codex OAuth as your main model provider, vision works automatically **Vision requires a multimodal model.** If you set `provider: "main"`, make sure your endpoint supports multimodal/vision — otherwise image analysis will fail. ::: -### Environment Variables +### Environment Variables (legacy) -You can also configure auxiliary models via environment variables instead of `config.yaml`: +Auxiliary models can also be configured via environment variables. However, `config.yaml` is the preferred method — it's easier to manage and supports all options including `base_url` and `api_key`. | Setting | Environment Variable | |---------|---------------------| | Vision provider | `AUXILIARY_VISION_PROVIDER` | | Vision model | `AUXILIARY_VISION_MODEL` | +| Vision endpoint | `AUXILIARY_VISION_BASE_URL` | +| Vision API key | `AUXILIARY_VISION_API_KEY` | | Web extract provider | `AUXILIARY_WEB_EXTRACT_PROVIDER` | | Web extract model | `AUXILIARY_WEB_EXTRACT_MODEL` | -| Compression provider | `CONTEXT_COMPRESSION_PROVIDER` | -| Compression model | `CONTEXT_COMPRESSION_MODEL` | +| Web extract endpoint | `AUXILIARY_WEB_EXTRACT_BASE_URL` | +| Web extract API key | `AUXILIARY_WEB_EXTRACT_API_KEY` | + +Compression and fallback model settings are config.yaml-only. :::tip Run `hermes config` to see your current auxiliary model settings. Overrides only show up when they differ from the defaults. diff --git a/website/docs/user-guide/features/fallback-providers.md b/website/docs/user-guide/features/fallback-providers.md index e488c10db..63e9337e4 100644 --- a/website/docs/user-guide/features/fallback-providers.md +++ b/website/docs/user-guide/features/fallback-providers.md @@ -210,16 +210,26 @@ auxiliary: model: "" ``` -Or via environment variables: +Every task above follows the same **provider / model / base_url** pattern. Context compression uses its own top-level block: -```bash -AUXILIARY_VISION_PROVIDER=openrouter -AUXILIARY_VISION_MODEL=openai/gpt-4o -AUXILIARY_WEB_EXTRACT_PROVIDER=nous -CONTEXT_COMPRESSION_PROVIDER=main -CONTEXT_COMPRESSION_MODEL=google/gemini-3-flash-preview +```yaml +compression: + summary_provider: main # Same provider options as auxiliary tasks + summary_model: google/gemini-3-flash-preview + summary_base_url: null # Custom OpenAI-compatible endpoint ``` +And the fallback model uses: + +```yaml +fallback_model: + provider: openrouter + model: anthropic/claude-sonnet-4 + # base_url: http://localhost:8000/v1 # Optional custom endpoint +``` + +All three — auxiliary, compression, fallback — work the same way: set `provider` to pick who handles the request, `model` to pick which model, and `base_url` to point at a custom endpoint (overrides provider). + ### Provider Options for Auxiliary Tasks | Provider | Description | Requirements |