diff --git a/AGENTS.md b/AGENTS.md index fbdbc9530..db4aaa918 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -744,10 +744,10 @@ compression: -| `"auto"` | Best available (default). For vision, only tries OpenRouter + Nous. | +| `"auto"` | Best available (default). For vision, tries OpenRouter → Nous → Codex. | | `"openrouter"` | Force OpenRouter (requires `OPENROUTER_API_KEY`) | | `"nous"` | Force Nous Portal (requires `hermes login`) | -| `"openai"` | Force OpenAI direct API at `api.openai.com` (requires `OPENAI_API_KEY`). Supports vision via GPT-4o. | -| `"main"` | Use the same provider as your main chat model. Skips OpenRouter/Nous. Useful for local models. | +| `"codex"` | Force Codex OAuth (ChatGPT account). Supports vision via gpt-5.3-codex. | +| `"main"` | Use your custom endpoint (`OPENAI_BASE_URL` + `OPENAI_API_KEY`). Works with OpenAI API, local models, etc. | -**Important:** Vision tasks require a multimodal-capable model. In `auto` mode, only OpenRouter and Nous Portal are tried (they route to Gemini, which supports images). The `"openai"` provider also works for vision since GPT-4o supports image input. Setting `provider: "main"` for vision will work only if your main endpoint supports multimodal input. +**Important:** Vision tasks require a multimodal-capable model. In `auto` mode, OpenRouter, Nous Portal, and Codex OAuth are tried (they all support vision). Setting `provider: "main"` for vision will work only if your endpoint supports multimodal input (e.g. OpenAI with GPT-4o, or a local model with vision). **Key files:** `agent/auxiliary_client.py` (resolution chain), `tools/vision_tools.py`, `tools/browser_tool.py`, `tools/web_tools.py` diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 5bd523b64..a32e3a293 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -21,7 +21,7 @@ Resolution order for vision/multimodal tasks (auto mode): Per-task provider overrides (e.g.
AUXILIARY_VISION_PROVIDER, CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task: -"openrouter", "nous", "openai", or "main" (= steps 3-5). +"openrouter", "nous", "codex", or "main" (= steps 3-5). Default "auto" follows the chains above. Per-task model overrides (e.g. AUXILIARY_VISION_MODEL, @@ -71,11 +71,6 @@ _NOUS_MODEL = "gemini-3-flash" _NOUS_DEFAULT_BASE_URL = "https://inference-api.nousresearch.com/v1" _AUTH_JSON_PATH = Path.home() / ".hermes" / "auth.json" -# OpenAI direct: uses OPENAI_API_KEY with the official API endpoint. -# gpt-4o-mini is cheap/fast and supports vision — good default for auxiliary tasks. -_OPENAI_AUX_MODEL = "gpt-4o-mini" -_OPENAI_BASE_URL = "https://api.openai.com/v1" - # Codex fallback: uses the Responses API (the only endpoint the Codex # OAuth token can access) with a fast model for auxiliary tasks. _CODEX_AUX_MODEL = "gpt-5.3-codex" @@ -440,15 +435,6 @@ def _try_nous() -> Tuple[Optional[OpenAI], Optional[str]]: ) -def _try_openai() -> Tuple[Optional[OpenAI], Optional[str]]: - """Try OpenAI direct API (api.openai.com) using OPENAI_API_KEY.""" - api_key = os.getenv("OPENAI_API_KEY", "").strip() - if not api_key: - return None, None - logger.debug("Auxiliary client: OpenAI direct (%s)", _OPENAI_AUX_MODEL) - return OpenAI(api_key=api_key, base_url=_OPENAI_BASE_URL), _OPENAI_AUX_MODEL - - def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]: custom_base = os.getenv("OPENAI_BASE_URL") custom_key = os.getenv("OPENAI_API_KEY") @@ -482,12 +468,6 @@ def _resolve_forced_provider(forced: str) -> Tuple[Optional[OpenAI], Optional[st logger.warning("auxiliary.provider=nous but Nous Portal not configured (run: hermes login)") return client, model - if forced == "openai": - client, model = _try_openai() - if client is None: - logger.warning("auxiliary.provider=openai but OPENAI_API_KEY not set") - return client, model - if forced == "codex": client, model = _try_codex() if client is None: @@ -606,10 +586,6 @@ def 
auxiliary_max_tokens_param(value: int) -> dict: The Codex adapter translates max_tokens internally, so we use max_tokens for it as well. """ - # Check if any auxiliary task is explicitly forced to "openai" - for task in ("vision", "web_extract", "compression"): - if _get_auxiliary_provider(task) == "openai": - return {"max_completion_tokens": value} custom_base = os.getenv("OPENAI_BASE_URL", "") or_key = os.getenv("OPENROUTER_API_KEY") # Only use max_completion_tokens for direct OpenAI custom endpoints diff --git a/cli-config.yaml.example b/cli-config.yaml.example index caabba436..6b1cf97c0 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -241,14 +241,11 @@ compression: # "auto" - Best available: OpenRouter → Nous Portal → main endpoint (default) # "openrouter" - Force OpenRouter (requires OPENROUTER_API_KEY) # "nous" - Force Nous Portal (requires: hermes login) -# "openai" - Force OpenAI direct API (requires OPENAI_API_KEY). -# Uses api.openai.com/v1 with models like gpt-4o, gpt-4o-mini. -# Great for vision since GPT-4o supports image input. -# "main" - Use the same provider & credentials as your main chat model. -# Skips OpenRouter/Nous and uses your custom endpoint -# (OPENAI_BASE_URL), Codex OAuth, or API-key provider directly. -# Useful if you run a local model and want auxiliary tasks to -# use it too. +# "codex" - Force Codex OAuth (requires: hermes model → Codex). +# Uses gpt-5.3-codex which supports vision. +# "main" - Use your custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY). +# Works with OpenAI API, local models, or any OpenAI-compatible +# endpoint. Also falls back to Codex OAuth and API-key providers. # # Model: leave empty to use the provider's default. When empty, OpenRouter # uses "google/gemini-3-flash-preview" and Nous uses "gemini-3-flash". 
diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index 8454bbea4..66187d055 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -220,14 +220,15 @@ class TestVisionClientFallback: assert client is None assert model is None - def test_vision_forced_openai(self, monkeypatch): - """When forced to 'openai', vision uses OpenAI direct API.""" - monkeypatch.setenv("AUXILIARY_VISION_PROVIDER", "openai") - monkeypatch.setenv("OPENAI_API_KEY", "sk-test") - with patch("agent.auxiliary_client.OpenAI") as mock_openai: + def test_vision_forced_codex(self, monkeypatch, codex_auth_dir): + """When forced to 'codex', vision uses Codex OAuth.""" + monkeypatch.setenv("AUXILIARY_VISION_PROVIDER", "codex") + with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \ + patch("agent.auxiliary_client.OpenAI"): client, model = get_vision_auxiliary_client() - assert client is not None - assert model == "gpt-4o-mini" + from agent.auxiliary_client import CodexAuxiliaryClient + assert isinstance(client, CodexAuxiliaryClient) + assert model == "gpt-5.3-codex" class TestGetAuxiliaryProvider: @@ -324,19 +325,17 @@ class TestResolveForcedProvider: assert isinstance(client, CodexAuxiliaryClient) assert model == "gpt-5.3-codex" - def test_forced_openai_with_key(self, monkeypatch): - monkeypatch.setenv("OPENAI_API_KEY", "sk-test-key") - with patch("agent.auxiliary_client.OpenAI") as mock_openai: - client, model = _resolve_forced_provider("openai") - assert model == "gpt-4o-mini" - assert client is not None - call_kwargs = mock_openai.call_args - assert call_kwargs.kwargs["base_url"] == "https://api.openai.com/v1" - assert call_kwargs.kwargs["api_key"] == "sk-test-key" + def test_forced_codex(self, codex_auth_dir, monkeypatch): + with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \ + patch("agent.auxiliary_client.OpenAI"): + client, model = _resolve_forced_provider("codex") + from 
agent.auxiliary_client import CodexAuxiliaryClient + assert isinstance(client, CodexAuxiliaryClient) + assert model == "gpt-5.3-codex" - def test_forced_openai_no_key(self, monkeypatch): - monkeypatch.delenv("OPENAI_API_KEY", raising=False) - client, model = _resolve_forced_provider("openai") + def test_forced_codex_no_token(self, monkeypatch): + with patch("agent.auxiliary_client._read_codex_access_token", return_value=None): + client, model = _resolve_forced_provider("codex") assert client is None assert model is None diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index 3f7214e2f..f2abd16ca 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -481,17 +481,20 @@ AUXILIARY_VISION_MODEL=openai/gpt-4o | `"auto"` | Best available (default). Vision tries OpenRouter → Nous → Codex. | — | | `"openrouter"` | Force OpenRouter — routes to any model (Gemini, GPT-4o, Claude, etc.) | `OPENROUTER_API_KEY` | | `"nous"` | Force Nous Portal | `hermes login` | -| `"openai"` | Force OpenAI direct API (`api.openai.com`). Supports vision (GPT-4o). | `OPENAI_API_KEY` | | `"codex"` | Force Codex OAuth (ChatGPT account). Supports vision (gpt-5.3-codex). | `hermes model` → Codex | -| `"main"` | Use your main chat model's provider. For local/self-hosted models. | Depends on your setup | +| `"main"` | Use your custom endpoint (`OPENAI_BASE_URL` + `OPENAI_API_KEY`). Works with OpenAI, local models, or any OpenAI-compatible API. | `OPENAI_BASE_URL` + `OPENAI_API_KEY` | ### Common Setups -**Using OpenAI for vision** (if you have an OpenAI API key): +**Using OpenAI API key for vision:** ```yaml +# In ~/.hermes/.env: +# OPENAI_BASE_URL=https://api.openai.com/v1 +# OPENAI_API_KEY=sk-... + auxiliary: vision: - provider: "openai" + provider: "main" model: "gpt-4o" # or "gpt-4o-mini" for cheaper ```