2026-02-22 02:16:11 -08:00
|
|
|
"""Shared auxiliary OpenAI client for cheap/fast side tasks.
|
|
|
|
|
|
|
|
|
|
Provides a single resolution chain so every consumer (context compression,
|
|
|
|
|
session search, web extraction, vision analysis, browser vision) picks up
|
|
|
|
|
the best available backend without duplicating fallback logic.
|
|
|
|
|
|
2026-03-08 18:06:40 -07:00
|
|
|
Resolution order for text tasks (auto mode):
|
2026-02-22 02:16:11 -08:00
|
|
|
1. OpenRouter (OPENROUTER_API_KEY)
|
|
|
|
|
2. Nous Portal (~/.hermes/auth.json active provider)
|
|
|
|
|
3. Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY)
|
2026-02-28 21:47:51 -08:00
|
|
|
4. Codex OAuth (Responses API via chatgpt.com with gpt-5.3-codex,
|
|
|
|
|
wrapped to look like a chat.completions client)
|
2026-03-06 19:08:54 -08:00
|
|
|
5. Direct API-key providers (z.ai/GLM, Kimi/Moonshot, MiniMax, MiniMax-CN)
|
|
|
|
|
— checked via PROVIDER_REGISTRY entries with auth_type='api_key'
|
|
|
|
|
6. None
|
2026-02-22 02:16:11 -08:00
|
|
|
|
2026-03-08 18:06:40 -07:00
|
|
|
Resolution order for vision/multimodal tasks (auto mode):
|
|
|
|
|
1. OpenRouter
|
|
|
|
|
2. Nous Portal
|
2026-03-11 20:02:36 -07:00
|
|
|
3. Codex OAuth (gpt-5.3-codex supports vision via Responses API)
|
|
|
|
|
4. Custom endpoint (for local vision models: Qwen-VL, LLaVA, Pixtral, etc.)
|
|
|
|
|
5. None (API-key providers like z.ai/Kimi/MiniMax are skipped —
|
|
|
|
|
they may not support multimodal)
|
2026-03-08 18:06:40 -07:00
|
|
|
|
2026-03-07 08:52:06 -08:00
|
|
|
Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER,
|
|
|
|
|
CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task:
|
2026-03-08 18:50:26 -07:00
|
|
|
"openrouter", "nous", "codex", or "main" (= steps 3-5).
|
2026-03-08 18:06:40 -07:00
|
|
|
Default "auto" follows the chains above.
|
|
|
|
|
|
|
|
|
|
Per-task model overrides (e.g. AUXILIARY_VISION_MODEL,
|
|
|
|
|
AUXILIARY_WEB_EXTRACT_MODEL) let callers use a different model slug
|
|
|
|
|
than the provider's default.
|
2026-02-22 02:16:11 -08:00
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
import logging
|
|
|
|
|
import os
|
|
|
|
|
from pathlib import Path
|
2026-02-28 21:47:51 -08:00
|
|
|
from types import SimpleNamespace
|
|
|
|
|
from typing import Any, Dict, List, Optional, Tuple
|
2026-02-22 02:16:11 -08:00
|
|
|
|
|
|
|
|
from openai import OpenAI
|
|
|
|
|
|
fix(cli): respect HERMES_HOME in all remaining hardcoded ~/.hermes paths
Several files resolved paths via Path.home() / ".hermes" or
os.path.expanduser("~/.hermes/..."), bypassing the HERMES_HOME
environment variable. This broke isolation when running multiple
Hermes instances with distinct HERMES_HOME directories.
Replace all hardcoded paths with calls to get_hermes_home() from
hermes_cli.config, consistent with the rest of the codebase.
Files fixed:
- tools/process_registry.py (processes.json)
- gateway/pairing.py (pairing/)
- gateway/sticker_cache.py (sticker_cache.json)
- gateway/channel_directory.py (channel_directory.json, sessions.json)
- gateway/config.py (gateway.json, config.yaml, sessions_dir)
- gateway/mirror.py (sessions/)
- gateway/hooks.py (hooks/)
- gateway/platforms/base.py (image_cache/, audio_cache/, document_cache/)
- gateway/platforms/whatsapp.py (whatsapp/session)
- gateway/delivery.py (cron/output)
- agent/auxiliary_client.py (auth.json)
- agent/prompt_builder.py (SOUL.md)
- cli.py (config.yaml, images/, pastes/, history)
- run_agent.py (logs/)
- tools/environments/base.py (sandboxes/)
- tools/environments/modal.py (modal_snapshots.json)
- tools/environments/singularity.py (singularity_snapshots.json)
- tools/tts_tool.py (audio_cache)
- hermes_cli/status.py (cron/jobs.json, sessions.json)
- hermes_cli/gateway.py (logs/, whatsapp session)
- hermes_cli/main.py (whatsapp/session)
Tests updated to use HERMES_HOME env var instead of patching Path.home().
Closes #892
(cherry picked from commit 78ac1bba43b8b74a934c6172f2c29bb4d03164b9)
2026-03-11 07:31:41 +01:00
|
|
|
from hermes_cli.config import get_hermes_home
|
2026-02-22 02:16:11 -08:00
|
|
|
from hermes_constants import OPENROUTER_BASE_URL
|
|
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)


# Default auxiliary models for direct API-key providers (cheap/fast for side tasks).
# Keys are provider ids as used by hermes_cli.auth.PROVIDER_REGISTRY (looked up in
# _resolve_api_key_provider); values are the model slugs sent to that provider.
_API_KEY_PROVIDER_AUX_MODELS: Dict[str, str] = {
    "zai": "glm-4.5-flash",
    "kimi-coding": "kimi-k2-turbo-preview",
    "minimax": "MiniMax-M2.5-highspeed",
    "minimax-cn": "MiniMax-M2.5-highspeed",
    "anthropic": "claude-haiku-4-5-20251001",
}


# OpenRouter app attribution headers.
# Passed as default_headers on every OpenRouter-backed client (_try_openrouter).
_OR_HEADERS = {
    "HTTP-Referer": "https://hermes-agent.nousresearch.com",
    "X-OpenRouter-Title": "Hermes Agent",
    "X-OpenRouter-Categories": "productivity,cli-agent",
}


# Nous Portal extra_body for product attribution.
# Callers should pass this as extra_body in chat.completions.create()
# when the auxiliary client is backed by Nous Portal.
NOUS_EXTRA_BODY = {"tags": ["product=hermes-agent"]}


# Set at resolve time — True if the auxiliary client points to Nous Portal
# (flipped by _try_nous; lets callers decide whether to send NOUS_EXTRA_BODY).
auxiliary_is_nous: bool = False


# Default auxiliary models per provider
_OPENROUTER_MODEL = "google/gemini-3-flash-preview"
_NOUS_MODEL = "gemini-3-flash"
_NOUS_DEFAULT_BASE_URL = "https://inference-api.nousresearch.com/v1"
# Hermes auth store; get_hermes_home() honors the HERMES_HOME env var.
_AUTH_JSON_PATH = get_hermes_home() / "auth.json"

# Codex fallback: uses the Responses API (the only endpoint the Codex
# OAuth token can access) with a fast model for auxiliary tasks.
_CODEX_AUX_MODEL = "gpt-5.3-codex"
_CODEX_AUX_BASE_URL = "https://chatgpt.com/backend-api/codex"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── Codex Responses → chat.completions adapter ─────────────────────────────
|
|
|
|
|
# All auxiliary consumers call client.chat.completions.create(**kwargs) and
|
|
|
|
|
# read response.choices[0].message.content. This adapter translates those
|
|
|
|
|
# calls to the Codex Responses API so callers don't need any changes.
|
|
|
|
|
|
2026-03-08 18:44:25 -07:00
|
|
|
|
|
|
|
|
def _convert_content_for_responses(content: Any) -> Any:
|
|
|
|
|
"""Convert chat.completions content to Responses API format.
|
|
|
|
|
|
|
|
|
|
chat.completions uses:
|
|
|
|
|
{"type": "text", "text": "..."}
|
|
|
|
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
|
|
|
|
|
|
|
|
|
|
Responses API uses:
|
|
|
|
|
{"type": "input_text", "text": "..."}
|
|
|
|
|
{"type": "input_image", "image_url": "data:image/png;base64,..."}
|
|
|
|
|
|
|
|
|
|
If content is a plain string, it's returned as-is (the Responses API
|
|
|
|
|
accepts strings directly for text-only messages).
|
|
|
|
|
"""
|
|
|
|
|
if isinstance(content, str):
|
|
|
|
|
return content
|
|
|
|
|
if not isinstance(content, list):
|
|
|
|
|
return str(content) if content else ""
|
|
|
|
|
|
|
|
|
|
converted: List[Dict[str, Any]] = []
|
|
|
|
|
for part in content:
|
|
|
|
|
if not isinstance(part, dict):
|
|
|
|
|
continue
|
|
|
|
|
ptype = part.get("type", "")
|
|
|
|
|
if ptype == "text":
|
|
|
|
|
converted.append({"type": "input_text", "text": part.get("text", "")})
|
|
|
|
|
elif ptype == "image_url":
|
|
|
|
|
# chat.completions nests the URL: {"image_url": {"url": "..."}}
|
|
|
|
|
image_data = part.get("image_url", {})
|
|
|
|
|
url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data)
|
|
|
|
|
entry: Dict[str, Any] = {"type": "input_image", "image_url": url}
|
|
|
|
|
# Preserve detail if specified
|
|
|
|
|
detail = image_data.get("detail") if isinstance(image_data, dict) else None
|
|
|
|
|
if detail:
|
|
|
|
|
entry["detail"] = detail
|
|
|
|
|
converted.append(entry)
|
|
|
|
|
elif ptype in ("input_text", "input_image"):
|
|
|
|
|
# Already in Responses format — pass through
|
|
|
|
|
converted.append(part)
|
|
|
|
|
else:
|
|
|
|
|
# Unknown content type — try to preserve as text
|
|
|
|
|
text = part.get("text", "")
|
|
|
|
|
if text:
|
|
|
|
|
converted.append({"type": "input_text", "text": text})
|
|
|
|
|
|
|
|
|
|
return converted or ""
|
|
|
|
|
|
|
|
|
|
|
2026-02-28 21:47:51 -08:00
|
|
|
class _CodexCompletionsAdapter:
    """Drop-in shim that accepts chat.completions.create() kwargs and
    routes them through the Codex Responses streaming API.

    Consumers keep calling client.chat.completions.create(**kwargs) and
    reading response.choices[0].message.content; this class translates
    both the request and the response so they need no changes.
    """

    def __init__(self, real_client: OpenAI, model: str):
        self._client = real_client  # OpenAI client pointed at the Codex base URL
        self._model = model  # default model when the caller does not pass one

    def create(self, **kwargs) -> Any:
        """Translate a chat.completions call into a Codex Responses call.

        Returns a SimpleNamespace shaped like a chat.completions response
        (choices[0].message.content / .tool_calls, .usage, .model).
        Re-raises whatever the underlying Responses stream raises.
        """
        messages = kwargs.get("messages", [])
        model = kwargs.get("model", self._model)

        # Separate system/instructions from conversation messages.
        # Convert chat.completions multimodal content blocks to Responses
        # API format (input_text / input_image instead of text / image_url).
        instructions = "You are a helpful assistant."
        input_msgs: List[Dict[str, Any]] = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content") or ""
            if role == "system":
                instructions = content if isinstance(content, str) else str(content)
            else:
                input_msgs.append({
                    "role": role,
                    "content": _convert_content_for_responses(content),
                })

        resp_kwargs: Dict[str, Any] = {
            "model": model,
            "instructions": instructions,
            "input": input_msgs or [{"role": "user", "content": ""}],
            "store": False,
        }

        # Note: the Codex endpoint (chatgpt.com/backend-api/codex) does NOT
        # support max_output_tokens or temperature — omit to avoid 400 errors.
        # (Any temperature the caller passed is deliberately dropped; a dead
        # local that used to read it has been removed.)

        # Tools support for flush_memories and similar callers.
        tools = kwargs.get("tools")
        if tools:
            converted = []
            for t in tools:
                fn = t.get("function", {}) if isinstance(t, dict) else {}
                name = fn.get("name")
                if not name:
                    continue
                # Responses API tools are flat — no nested "function" object.
                converted.append({
                    "type": "function",
                    "name": name,
                    "description": fn.get("description", ""),
                    "parameters": fn.get("parameters", {}),
                })
            if converted:
                resp_kwargs["tools"] = converted

        # Stream and collect the response.
        text_parts: List[str] = []
        tool_calls_raw: List[Any] = []
        usage = None

        try:
            with self._client.responses.stream(**resp_kwargs) as stream:
                # Drain the event stream; only the final response matters here.
                for _event in stream:
                    pass
                final = stream.get_final_response()

            # Extract text and tool calls from the Responses output.
            for item in getattr(final, "output", []):
                item_type = getattr(item, "type", None)
                if item_type == "message":
                    for part in getattr(item, "content", []):
                        ptype = getattr(part, "type", None)
                        if ptype in ("output_text", "text"):
                            text_parts.append(getattr(part, "text", ""))
                elif item_type == "function_call":
                    # Re-shape into the chat.completions tool_call structure.
                    tool_calls_raw.append(SimpleNamespace(
                        id=getattr(item, "call_id", ""),
                        type="function",
                        function=SimpleNamespace(
                            name=getattr(item, "name", ""),
                            arguments=getattr(item, "arguments", "{}"),
                        ),
                    ))

            resp_usage = getattr(final, "usage", None)
            if resp_usage:
                # Map Responses usage names onto chat.completions ones.
                usage = SimpleNamespace(
                    prompt_tokens=getattr(resp_usage, "input_tokens", 0),
                    completion_tokens=getattr(resp_usage, "output_tokens", 0),
                    total_tokens=getattr(resp_usage, "total_tokens", 0),
                )
        except Exception as exc:
            logger.debug("Codex auxiliary Responses API call failed: %s", exc)
            raise

        content = "".join(text_parts).strip() or None

        # Build a response that looks like chat.completions.
        message = SimpleNamespace(
            role="assistant",
            content=content,
            tool_calls=tool_calls_raw or None,
        )
        choice = SimpleNamespace(
            index=0,
            message=message,
            finish_reason="stop" if not tool_calls_raw else "tool_calls",
        )
        return SimpleNamespace(
            choices=[choice],
            model=model,
            usage=usage,
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class _CodexChatShim:
    """Wraps the adapter to provide client.chat.completions.create()."""

    def __init__(self, adapter: _CodexCompletionsAdapter):
        # Expose the adapter as .completions so callers can invoke
        # client.chat.completions.create(...) exactly as on an OpenAI client.
        self.completions = adapter
|
|
|
|
|
|
|
|
|
|
class CodexAuxiliaryClient:
    """OpenAI-client-compatible wrapper that routes through Codex Responses API.

    Consumers can call client.chat.completions.create(**kwargs) as normal.
    Also exposes .api_key and .base_url for introspection by async wrappers.
    """

    def __init__(self, real_client: OpenAI, model: str):
        self._real_client = real_client
        # Mirror the OpenAI client surface: .chat.completions.create(...)
        self.chat = _CodexChatShim(_CodexCompletionsAdapter(real_client, model))
        # Surface credentials/endpoint so wrappers can introspect them.
        self.api_key = real_client.api_key
        self.base_url = real_client.base_url

    def close(self):
        """Close the underlying OpenAI HTTP client."""
        self._real_client.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class _AsyncCodexCompletionsAdapter:
    """Async version of the Codex Responses adapter.

    Delegates to the sync adapter via asyncio.to_thread() so async
    consumers (web_tools, session_search) can await it as normal.
    """

    def __init__(self, sync_adapter: _CodexCompletionsAdapter):
        self._sync_adapter = sync_adapter

    async def create(self, **kwargs) -> Any:
        # Run the blocking sync call on a worker thread so the event
        # loop stays responsive while the request streams.
        import asyncio

        return await asyncio.to_thread(self._sync_adapter.create, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class _AsyncCodexChatShim:
    """Async counterpart of _CodexChatShim: exposes .completions.create()."""

    def __init__(self, adapter: _AsyncCodexCompletionsAdapter):
        # Mirror AsyncOpenAI's layout: client.chat.completions.create(...)
        self.completions = adapter
|
|
|
|
|
|
|
|
|
|
class AsyncCodexAuxiliaryClient:
    """Async-compatible wrapper matching AsyncOpenAI.chat.completions.create()."""

    def __init__(self, sync_wrapper: "CodexAuxiliaryClient"):
        # Reuse the sync wrapper's adapter, lifted onto a worker thread.
        bridged = _AsyncCodexCompletionsAdapter(sync_wrapper.chat.completions)
        self.chat = _AsyncCodexChatShim(bridged)
        # Surface the same introspection attributes as the sync wrapper.
        self.api_key = sync_wrapper.api_key
        self.base_url = sync_wrapper.base_url
|
|
|
|
|
|
2026-02-22 02:16:11 -08:00
|
|
|
|
|
|
|
|
def _read_nous_auth() -> Optional[dict]:
    """Read and validate the Hermes auth store for an active Nous provider.

    Returns the Nous provider state dict when Nous is the active provider
    and carries at least one credential; otherwise None.
    """
    try:
        if not _AUTH_JSON_PATH.is_file():
            return None
        auth = json.loads(_AUTH_JSON_PATH.read_text())
        if auth.get("active_provider") != "nous":
            return None
        state = auth.get("providers", {}).get("nous", {})
        # Usable only with at least an agent_key or an access_token.
        if state.get("agent_key") or state.get("access_token"):
            return state
        return None
    except Exception as exc:
        # Best-effort: a missing/corrupt auth store just means "not configured".
        logger.debug("Could not read Nous auth: %s", exc)
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _nous_api_key(provider: dict) -> str:
|
|
|
|
|
"""Extract the best API key from a Nous provider state dict."""
|
|
|
|
|
return provider.get("agent_key") or provider.get("access_token", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _nous_base_url() -> str:
    """Resolve the Nous inference base URL: env override or the default."""
    # os.getenv returns the default only when the variable is unset;
    # an explicitly empty NOUS_INFERENCE_BASE_URL is passed through as-is.
    return os.getenv("NOUS_INFERENCE_BASE_URL", default=_NOUS_DEFAULT_BASE_URL)
|
|
|
|
|
|
|
|
|
|
|
2026-02-28 21:47:51 -08:00
|
|
|
def _read_codex_access_token() -> Optional[str]:
    """Read a valid Codex OAuth access token from Hermes auth store (~/.hermes/auth.json)."""
    try:
        from hermes_cli.auth import _read_codex_tokens

        token = _read_codex_tokens().get("tokens", {}).get("access_token")
        # Only a non-empty string counts as a usable token.
        if isinstance(token, str) and token.strip():
            return token.strip()
    except Exception as exc:
        logger.debug("Could not read Codex auth for auxiliary client: %s", exc)
    return None
|
|
|
|
|
|
|
|
|
|
|
2026-03-06 19:08:54 -08:00
|
|
|
def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Try each API-key provider in PROVIDER_REGISTRY order.

    Returns (client, model) for the first provider whose env var is set,
    or (None, None) if none are configured.
    """
    try:
        from hermes_cli.auth import PROVIDER_REGISTRY
    except ImportError:
        logger.debug("Could not import PROVIDER_REGISTRY for API-key fallback")
        return None, None

    for provider_id, pconfig in PROVIDER_REGISTRY.items():
        if pconfig.auth_type != "api_key":
            continue

        # First non-empty credential among the provider's known env vars.
        api_key = next(
            (v for v in (os.getenv(name, "").strip() for name in pconfig.api_key_env_vars) if v),
            "",
        )
        if not api_key:
            continue

        # Base URL: explicit env override wins; Kimi Code keys (sk-kimi-)
        # need api.kimi.com/coding/v1; otherwise use the registry default.
        env_url = ""
        if pconfig.base_url_env_var:
            env_url = os.getenv(pconfig.base_url_env_var, "").strip()
        if env_url:
            base_url = env_url.rstrip("/")
        elif provider_id == "kimi-coding" and api_key.startswith("sk-kimi-"):
            base_url = "https://api.kimi.com/coding/v1"
        else:
            base_url = pconfig.inference_base_url

        model = _API_KEY_PROVIDER_AUX_MODELS.get(provider_id, "default")
        logger.debug("Auxiliary text client: %s (%s)", pconfig.name, model)

        client_kwargs = {}
        if "api.kimi.com" in base_url.lower():
            # Kimi's endpoint expects a CLI-style User-Agent header.
            client_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.0"}
        return OpenAI(api_key=api_key, base_url=base_url, **client_kwargs), model

    return None, None
|
|
|
|
|
|
|
|
|
|
|
2026-03-07 08:52:06 -08:00
|
|
|
# ── Provider resolution helpers ─────────────────────────────────────────────
|
2026-02-22 02:16:11 -08:00
|
|
|
|
2026-03-07 08:52:06 -08:00
|
|
|
def _get_auxiliary_provider(task: str = "") -> str:
|
|
|
|
|
"""Read the provider override for a specific auxiliary task.
|
2026-02-22 02:16:11 -08:00
|
|
|
|
2026-03-07 08:52:06 -08:00
|
|
|
Checks AUXILIARY_{TASK}_PROVIDER first (e.g. AUXILIARY_VISION_PROVIDER),
|
|
|
|
|
then CONTEXT_{TASK}_PROVIDER (for the compression section's summary_provider),
|
|
|
|
|
then falls back to "auto". Returns one of: "auto", "openrouter", "nous", "main".
|
2026-02-22 02:16:11 -08:00
|
|
|
"""
|
2026-03-07 08:52:06 -08:00
|
|
|
if task:
|
|
|
|
|
for prefix in ("AUXILIARY_", "CONTEXT_"):
|
|
|
|
|
val = os.getenv(f"{prefix}{task.upper()}_PROVIDER", "").strip().lower()
|
|
|
|
|
if val and val != "auto":
|
|
|
|
|
return val
|
|
|
|
|
return "auto"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Build an OpenRouter-backed client, or (None, None) without a key."""
    key = os.getenv("OPENROUTER_API_KEY")
    if not key:
        return None, None
    logger.debug("Auxiliary client: OpenRouter")
    client = OpenAI(
        api_key=key,
        base_url=OPENROUTER_BASE_URL,
        default_headers=_OR_HEADERS,
    )
    return client, _OPENROUTER_MODEL
|
2026-02-22 02:16:11 -08:00
|
|
|
|
2026-03-07 08:52:06 -08:00
|
|
|
|
|
|
|
|
def _try_nous() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Build a Nous Portal client from the auth store, or (None, None)."""
    state = _read_nous_auth()
    if not state:
        return None, None
    # Record that the auxiliary backend is Nous so callers can attach
    # NOUS_EXTRA_BODY to their requests.
    global auxiliary_is_nous
    auxiliary_is_nous = True
    logger.debug("Auxiliary client: Nous Portal")
    client = OpenAI(api_key=_nous_api_key(state), base_url=_nous_base_url())
    return client, _NOUS_MODEL
|
|
|
|
|
|
2026-02-22 02:16:11 -08:00
|
|
|
|
fix: auxiliary client uses main model for custom/local endpoints instead of gpt-4o-mini (#1189)
* fix: prevent model/provider mismatch when switching providers during active gateway
When _update_config_for_provider() writes the new provider and base_url
to config.yaml, the gateway (which re-reads config per-message) can pick
up the change before model selection completes. This causes the old model
name (e.g. 'anthropic/claude-opus-4.6') to be sent to the new provider's
API (e.g. MiniMax), which fails.
Changes:
- _update_config_for_provider() now accepts an optional default_model
parameter. When provided and the current model.default is empty or
uses OpenRouter format (contains '/'), it sets a safe default model
for the new provider.
- All setup.py callers for direct-API providers (zai, kimi, minimax,
minimax-cn, anthropic) now pass a provider-appropriate default model.
- _setup_provider_model_selection() now validates the 'Keep current'
choice: if the current model uses OpenRouter format and wouldn't work
with the new provider, it warns and switches to the provider's first
default model instead of silently keeping the incompatible name.
Reported by a user on Home Assistant whose gateway started sending
'anthropic/claude-opus-4.6' to MiniMax's API after running hermes setup.
* fix: auxiliary client uses main model for custom/local endpoints instead of gpt-4o-mini
When a user runs a local server (e.g. Qwen3.5-9B via OPENAI_BASE_URL),
the auxiliary client (context compression, vision, session search) would
send requests for 'gpt-4o-mini' or 'google/gemini-3-flash-preview' to
the local server, which only serves one model — causing 404 errors
mid-task.
Changes:
- _try_custom_endpoint() now reads the user's configured main model via
_read_main_model() (checks OPENAI_MODEL → HERMES_MODEL → LLM_MODEL →
config.yaml model.default) instead of hardcoding 'gpt-4o-mini'.
- resolve_provider_client() auto mode now detects when an OpenRouter-
formatted model override (containing '/') would be sent to a non-
OpenRouter provider (like a local server) and drops it in favor of
the provider's default model.
- Test isolation fixes: properly clear env vars in 'nothing available'
tests to prevent host environment leakage.
2026-03-13 10:02:16 -07:00
|
|
|
def _read_main_model() -> str:
|
|
|
|
|
"""Read the user's configured main model from config/env.
|
|
|
|
|
|
|
|
|
|
Falls back through HERMES_MODEL → LLM_MODEL → config.yaml model.default
|
|
|
|
|
so the auxiliary client can use the same model as the main agent when no
|
|
|
|
|
dedicated auxiliary model is available.
|
|
|
|
|
"""
|
|
|
|
|
from_env = os.getenv("OPENAI_MODEL") or os.getenv("HERMES_MODEL") or os.getenv("LLM_MODEL")
|
|
|
|
|
if from_env:
|
|
|
|
|
return from_env.strip()
|
|
|
|
|
try:
|
|
|
|
|
from hermes_cli.config import load_config
|
|
|
|
|
cfg = load_config()
|
|
|
|
|
model_cfg = cfg.get("model", {})
|
|
|
|
|
if isinstance(model_cfg, str) and model_cfg.strip():
|
|
|
|
|
return model_cfg.strip()
|
|
|
|
|
if isinstance(model_cfg, dict):
|
|
|
|
|
default = model_cfg.get("default", "")
|
|
|
|
|
if isinstance(default, str) and default.strip():
|
|
|
|
|
return default.strip()
|
|
|
|
|
except Exception:
|
|
|
|
|
pass
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
2026-03-07 08:52:06 -08:00
|
|
|
def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Build a client for a user-supplied OPENAI_BASE_URL endpoint."""
    base = os.getenv("OPENAI_BASE_URL")
    key = os.getenv("OPENAI_API_KEY")
    if not (base and key):
        return None, None
    # Prefer the user's configured main model: a local server typically
    # serves exactly one model, so a hardcoded slug would 404.
    model = _read_main_model() or "gpt-4o-mini"
    logger.debug("Auxiliary client: custom endpoint (%s)", model)
    return OpenAI(api_key=key, base_url=base), model
|
|
|
|
|
|
2026-02-22 02:16:11 -08:00
|
|
|
|
2026-03-07 08:52:06 -08:00
|
|
|
def _try_codex() -> Tuple[Optional[Any], Optional[str]]:
    """Build an auxiliary client backed by Codex OAuth, if a token exists.

    Returns ``(CodexAuxiliaryClient, model)`` — the wrapper translates
    ``chat.completions`` calls into the Responses API — or ``(None, None)``
    when no Codex OAuth access token is available.
    """
    token = _read_codex_access_token()
    if token:
        logger.debug("Auxiliary client: Codex OAuth (%s via Responses API)", _CODEX_AUX_MODEL)
        inner = OpenAI(api_key=token, base_url=_CODEX_AUX_BASE_URL)
        return CodexAuxiliaryClient(inner, _CODEX_AUX_MODEL), _CODEX_AUX_MODEL
    return None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _resolve_forced_provider(forced: str) -> Tuple[Optional[OpenAI], Optional[str]]:
    """Resolve a specific forced provider. Returns (None, None) if creds missing."""
    # The single-backend providers all share one shape: attempt, warn when
    # credentials are missing, and hand back whatever the attempt produced.
    single_backend = {
        "openrouter": (
            _try_openrouter,
            "auxiliary.provider=openrouter but OPENROUTER_API_KEY not set",
        ),
        "nous": (
            _try_nous,
            "auxiliary.provider=nous but Nous Portal not configured (run: hermes login)",
        ),
        "codex": (
            _try_codex,
            "auxiliary.provider=codex but no Codex OAuth token found (run: hermes model)",
        ),
    }

    if forced in single_backend:
        attempt, missing_creds_msg = single_backend[forced]
        client, model = attempt()
        if client is None:
            logger.warning(missing_creds_msg)
        return client, model

    if forced == "main":
        # "main" = skip OpenRouter/Nous, use the main chat model's credentials.
        for attempt in (_try_custom_endpoint, _try_codex, _resolve_api_key_provider):
            client, model = attempt()
            if client is not None:
                return client, model
        logger.warning("auxiliary.provider=main but no main endpoint credentials found")
        return None, None

    # Unknown provider name — fall through to auto
    logger.warning("Unknown auxiliary.provider=%r, falling back to auto", forced)
    return None, None
|
|
|
|
|
|
|
|
|
|
|
2026-03-07 08:52:06 -08:00
|
|
|
def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Full auto-detection chain: OpenRouter → Nous → custom → Codex → API-key → None."""
    detection_chain = (
        _try_openrouter,
        _try_nous,
        _try_custom_endpoint,
        _try_codex,
        _resolve_api_key_provider,
    )
    # First backend with working credentials wins.
    for attempt in detection_chain:
        client, model = attempt()
        if client is not None:
            return client, model
    logger.debug("Auxiliary client: none available")
    return None, None
|
|
|
|
|
|
|
|
|
|
|
feat: centralized provider router + fix Codex vision bypass + vision error handling
Three interconnected fixes for auxiliary client infrastructure:
1. CENTRALIZED PROVIDER ROUTER (auxiliary_client.py)
Add resolve_provider_client(provider, model, async_mode) — a single
entry point for creating properly configured clients. Given a provider
name and optional model, it handles auth lookup (env vars, OAuth
tokens, auth.json), base URL resolution, provider-specific headers,
and API format differences (Chat Completions vs Responses API for
Codex). All auxiliary consumers should route through this instead of
ad-hoc env var lookups.
Refactored get_text_auxiliary_client, get_async_text_auxiliary_client,
and get_vision_auxiliary_client to use the router internally.
2. FIX CODEX VISION BYPASS (vision_tools.py)
vision_tools.py was constructing a raw AsyncOpenAI client from the
sync vision client's api_key/base_url, completely bypassing the Codex
Responses API adapter. When the vision provider resolved to Codex,
the raw client would hit chatgpt.com/backend-api/codex with
chat.completions.create() which only supports the Responses API.
Fix: Added get_async_vision_auxiliary_client() which properly wraps
Codex into AsyncCodexAuxiliaryClient. vision_tools.py now uses this
instead of manual client construction.
3. FIX COMPRESSION FALLBACK + VISION ERROR HANDLING
- context_compressor.py: Removed _get_fallback_client() which blindly
looked for OPENAI_API_KEY + OPENAI_BASE_URL (fails for Codex OAuth,
API-key providers, users without OPENAI_BASE_URL set). Replaced
with fallback loop through resolve_provider_client() for each
known provider, with same-provider dedup.
- vision_tools.py: Added error detection for vision capability
failures. Returns clear message to the model when the configured
model doesn't support vision, instead of a generic error.
Addresses #886
2026-03-11 19:46:47 -07:00
|
|
|
# ── Centralized Provider Router ─────────────────────────────────────────────
|
|
|
|
|
#
|
|
|
|
|
# resolve_provider_client() is the single entry point for creating a properly
|
|
|
|
|
# configured client given a (provider, model) pair. It handles auth lookup,
|
|
|
|
|
# base URL resolution, provider-specific headers, and API format differences
|
|
|
|
|
# (Chat Completions vs Responses API for Codex).
|
|
|
|
|
#
|
|
|
|
|
# All auxiliary consumer code should go through this or the public helpers
|
|
|
|
|
# below — never look up auth env vars ad-hoc.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _to_async_client(sync_client, model: str):
    """Convert a sync client to its async counterpart, preserving Codex routing.

    Codex adapters are re-wrapped in their async twin; plain clients are
    rebuilt as ``AsyncOpenAI`` with the same credentials, base URL, and any
    provider-specific default headers.
    """
    from openai import AsyncOpenAI

    # Codex must keep the Responses-API adapter rather than a raw client.
    if isinstance(sync_client, CodexAuxiliaryClient):
        return AsyncCodexAuxiliaryClient(sync_client), model

    base_url = str(sync_client.base_url)
    kwargs = {"api_key": sync_client.api_key, "base_url": base_url}

    lowered = base_url.lower()
    if "openrouter" in lowered:
        kwargs["default_headers"] = dict(_OR_HEADERS)
    elif "api.kimi.com" in lowered:
        kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.0"}

    return AsyncOpenAI(**kwargs), model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def resolve_provider_client(
    provider: str,
    model: Optional[str] = None,
    async_mode: bool = False,
    raw_codex: bool = False,
) -> Tuple[Optional[Any], Optional[str]]:
    """Central router: given a provider name and optional model, return a
    configured client with the correct auth, base URL, and API format.

    The returned client always exposes ``.chat.completions.create()`` — for
    Codex/Responses API providers, an adapter handles the translation
    transparently.

    Args:
        provider: Provider identifier. One of:
            "openrouter", "nous", "openai-codex" (or "codex"),
            "zai", "kimi-coding", "minimax", "minimax-cn",
            "custom" (OPENAI_BASE_URL + OPENAI_API_KEY),
            "auto" (full auto-detection chain).
        model: Model slug override. If None, uses the provider's default
            auxiliary model.
        async_mode: If True, return an async-compatible client.
        raw_codex: If True, return a raw OpenAI client for Codex providers
            instead of wrapping in CodexAuxiliaryClient. Use this when
            the caller needs direct access to responses.stream() (e.g.,
            the main agent loop).

    Returns:
        (client, resolved_model) or (None, None) if auth is unavailable.
    """
    # Normalise aliases: None/"" → "auto"; "codex" and "main" are accepted
    # as shorthands for "openai-codex" and "custom" respectively.
    provider = (provider or "auto").strip().lower()
    if provider == "codex":
        provider = "openai-codex"
    if provider == "main":
        provider = "custom"

    # ── Auto: try all providers in priority order ────────────────────
    if provider == "auto":
        client, resolved = _resolve_auto()
        if client is None:
            return None, None
        # When auto-detection lands on a non-OpenRouter provider (e.g. a
        # local server), an OpenRouter-formatted model override like
        # "google/gemini-3-flash-preview" won't work. Drop it and use
        # the provider's own default model instead.
        if model and "/" in model and resolved and "/" not in resolved:
            logger.debug(
                "Dropping OpenRouter-format model %r for non-OpenRouter "
                "auxiliary provider (using %r instead)", model, resolved)
            model = None
        final_model = model or resolved
        return (_to_async_client(client, final_model) if async_mode
                else (client, final_model))

    # ── OpenRouter ───────────────────────────────────────────────────
    if provider == "openrouter":
        client, default = _try_openrouter()
        if client is None:
            logger.warning("resolve_provider_client: openrouter requested "
                           "but OPENROUTER_API_KEY not set")
            return None, None
        final_model = model or default
        return (_to_async_client(client, final_model) if async_mode
                else (client, final_model))

    # ── Nous Portal (OAuth) ──────────────────────────────────────────
    if provider == "nous":
        client, default = _try_nous()
        if client is None:
            logger.warning("resolve_provider_client: nous requested "
                           "but Nous Portal not configured (run: hermes login)")
            return None, None
        final_model = model or default
        return (_to_async_client(client, final_model) if async_mode
                else (client, final_model))

    # ── OpenAI Codex (OAuth → Responses API) ─────────────────────────
    if provider == "openai-codex":
        if raw_codex:
            # Return the raw OpenAI client for callers that need direct
            # access to responses.stream() (e.g., the main agent loop).
            codex_token = _read_codex_access_token()
            if not codex_token:
                logger.warning("resolve_provider_client: openai-codex requested "
                               "but no Codex OAuth token found (run: hermes model)")
                return None, None
            final_model = model or _CODEX_AUX_MODEL
            raw_client = OpenAI(api_key=codex_token, base_url=_CODEX_AUX_BASE_URL)
            # NOTE: raw_codex ignores async_mode — the raw client is returned as-is.
            return (raw_client, final_model)
        # Standard path: wrap in CodexAuxiliaryClient adapter
        client, default = _try_codex()
        if client is None:
            logger.warning("resolve_provider_client: openai-codex requested "
                           "but no Codex OAuth token found (run: hermes model)")
            return None, None
        final_model = model or default
        return (_to_async_client(client, final_model) if async_mode
                else (client, final_model))

    # ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ───────────
    if provider == "custom":
        # Try custom first, then codex, then API-key providers
        for try_fn in (_try_custom_endpoint, _try_codex,
                       _resolve_api_key_provider):
            client, default = try_fn()
            if client is not None:
                final_model = model or default
                return (_to_async_client(client, final_model) if async_mode
                        else (client, final_model))
        logger.warning("resolve_provider_client: custom/main requested "
                       "but no endpoint credentials found")
        return None, None

    # ── API-key providers from PROVIDER_REGISTRY ─────────────────────
    # Anything not handled above is looked up in the registry (zai,
    # kimi-coding, minimax, minimax-cn, ...).
    try:
        from hermes_cli.auth import PROVIDER_REGISTRY, _resolve_kimi_base_url
    except ImportError:
        logger.debug("hermes_cli.auth not available for provider %s", provider)
        return None, None

    pconfig = PROVIDER_REGISTRY.get(provider)
    if pconfig is None:
        logger.warning("resolve_provider_client: unknown provider %r", provider)
        return None, None

    if pconfig.auth_type == "api_key":
        # Find the first configured API key
        api_key = ""
        for env_var in pconfig.api_key_env_vars:
            api_key = os.getenv(env_var, "").strip()
            if api_key:
                break
        if not api_key:
            logger.warning("resolve_provider_client: provider %s has no API "
                           "key configured (tried: %s)",
                           provider, ", ".join(pconfig.api_key_env_vars))
            return None, None

        # Resolve base URL (env override → provider-specific logic → default)
        base_url_override = os.getenv(pconfig.base_url_env_var, "").strip() if pconfig.base_url_env_var else ""
        if provider == "kimi-coding":
            # Kimi picks its base URL from the key itself (helper from auth).
            base_url = _resolve_kimi_base_url(api_key, pconfig.inference_base_url, base_url_override)
        elif base_url_override:
            base_url = base_url_override
        else:
            base_url = pconfig.inference_base_url

        default_model = _API_KEY_PROVIDER_AUX_MODELS.get(provider, "")
        final_model = model or default_model

        # Provider-specific headers
        headers = {}
        if "api.kimi.com" in base_url.lower():
            headers["User-Agent"] = "KimiCLI/1.0"

        client = OpenAI(api_key=api_key, base_url=base_url,
                        **({"default_headers": headers} if headers else {}))
        logger.debug("resolve_provider_client: %s (%s)", provider, final_model)
        return (_to_async_client(client, final_model) if async_mode
                else (client, final_model))

    elif pconfig.auth_type in ("oauth_device_code", "oauth_external"):
        # OAuth providers — route through their specific try functions
        # NOTE(review): these two recursive calls look unreachable — "nous"
        # and "openai-codex" both return unconditionally in their dedicated
        # branches above before the registry lookup; confirm before removing.
        if provider == "nous":
            return resolve_provider_client("nous", model, async_mode)
        if provider == "openai-codex":
            return resolve_provider_client("openai-codex", model, async_mode)
        # Other OAuth providers not directly supported
        logger.warning("resolve_provider_client: OAuth provider %s not "
                       "directly supported, try 'auto'", provider)
        return None, None

    logger.warning("resolve_provider_client: unhandled auth_type %s for %s",
                   pconfig.auth_type, provider)
    return None, None
|
|
|
|
|
|
|
|
|
|
|
2026-03-07 08:52:06 -08:00
|
|
|
# ── Public API ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optional[str]]:
    """Return (client, default_model_slug) for text-only auxiliary tasks.

    Args:
        task: Optional task name ("compression", "web_extract") to check
            for a task-specific provider override.

    Callers may override the returned model with a per-task env var
    (e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL).
    """
    # The router accepts "auto" as well as forced provider names, so the
    # task-level override can be passed straight through.
    return resolve_provider_client(_get_auxiliary_provider(task))
|
2026-03-07 08:52:06 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_async_text_auxiliary_client(task: str = ""):
    """Return (async_client, model_slug) for async consumers.

    For standard providers returns (AsyncOpenAI, model). For Codex returns
    (AsyncCodexAuxiliaryClient, model) which wraps the Responses API.
    Returns (None, None) when no provider is available.
    """
    # A forced per-task provider and the "auto" sentinel both go through the
    # same router call, so a single call covers both cases.
    selected = _get_auxiliary_provider(task)
    return resolve_provider_client(selected, async_mode=True)
|
2026-02-28 21:47:51 -08:00
|
|
|
|
|
|
|
|
|
2026-02-22 02:16:11 -08:00
|
|
|
def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Return (client, default_model_slug) for vision/multimodal auxiliary tasks.

    Checks AUXILIARY_VISION_PROVIDER for a forced provider, otherwise
    auto-detects. Callers may override the returned model with
    AUXILIARY_VISION_MODEL.

    In auto mode, providers known to support multimodal are tried first
    (OpenRouter, Nous Portal, and Codex OAuth — gpt-5.3-codex supports
    vision via the Responses API), then the user's custom endpoint, which
    may be a local vision model (Qwen-VL, LLaVA, Pixtral, etc.). Direct
    API-key providers (z.ai / Kimi / MiniMax) are still skipped — they may
    not handle vision input; set AUXILIARY_VISION_PROVIDER explicitly to
    use them.

    Returns (None, None) when no provider is available.
    """
    forced = _get_auxiliary_provider("vision")
    if forced != "auto":
        return resolve_provider_client(forced)

    # Auto: try providers known to support multimodal first, then fall
    # back to the user's custom endpoint. Many local models (Qwen-VL,
    # LLaVA, Pixtral, etc.) support vision — skipping them entirely
    # caused silent failures for local-only users.
    for try_fn in (_try_openrouter, _try_nous, _try_codex,
                   _try_custom_endpoint):
        client, model = try_fn()
        if client is not None:
            return client, model

    logger.debug("Auxiliary vision client: none available")
    return None, None
|
2026-02-25 18:39:36 -08:00
|
|
|
|
|
|
|
|
|
feat: centralized provider router + fix Codex vision bypass + vision error handling
Three interconnected fixes for auxiliary client infrastructure:
1. CENTRALIZED PROVIDER ROUTER (auxiliary_client.py)
Add resolve_provider_client(provider, model, async_mode) — a single
entry point for creating properly configured clients. Given a provider
name and optional model, it handles auth lookup (env vars, OAuth
tokens, auth.json), base URL resolution, provider-specific headers,
and API format differences (Chat Completions vs Responses API for
Codex). All auxiliary consumers should route through this instead of
ad-hoc env var lookups.
Refactored get_text_auxiliary_client, get_async_text_auxiliary_client,
and get_vision_auxiliary_client to use the router internally.
2. FIX CODEX VISION BYPASS (vision_tools.py)
vision_tools.py was constructing a raw AsyncOpenAI client from the
sync vision client's api_key/base_url, completely bypassing the Codex
Responses API adapter. When the vision provider resolved to Codex,
the raw client would hit chatgpt.com/backend-api/codex with
chat.completions.create() which only supports the Responses API.
Fix: Added get_async_vision_auxiliary_client() which properly wraps
Codex into AsyncCodexAuxiliaryClient. vision_tools.py now uses this
instead of manual client construction.
3. FIX COMPRESSION FALLBACK + VISION ERROR HANDLING
- context_compressor.py: Removed _get_fallback_client() which blindly
looked for OPENAI_API_KEY + OPENAI_BASE_URL (fails for Codex OAuth,
API-key providers, users without OPENAI_BASE_URL set). Replaced
with fallback loop through resolve_provider_client() for each
known provider, with same-provider dedup.
- vision_tools.py: Added error detection for vision capability
failures. Returns clear message to the model when the configured
model doesn't support vision, instead of a generic error.
Addresses #886
2026-03-11 19:46:47 -07:00
|
|
|
def get_async_vision_auxiliary_client():
    """Return (async_client, model_slug) for async vision consumers.

    Properly handles Codex routing — unlike manually constructing
    AsyncOpenAI from a sync client, this preserves the Responses API
    adapter for Codex providers.

    Returns (None, None) when no provider is available.
    """
    client, slug = get_vision_auxiliary_client()
    if client is not None:
        return _to_async_client(client, slug)
    return None, None
|
|
|
|
|
|
|
|
|
|
|
2026-02-25 18:39:36 -08:00
|
|
|
def get_auxiliary_extra_body() -> dict:
    """Return extra_body kwargs for auxiliary API calls.

    Includes Nous Portal product tags when the auxiliary client is backed
    by Nous Portal. Returns empty dict otherwise.
    """
    # Guard clause: nothing extra to send unless we're on Nous Portal.
    if not auxiliary_is_nous:
        return {}
    # Copy so callers can mutate the returned dict freely.
    return dict(NOUS_EXTRA_BODY)
|
2026-02-26 20:23:56 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def auxiliary_max_tokens_param(value: int) -> dict:
    """Return the correct max tokens kwarg for the auxiliary client's provider.

    OpenRouter and local models use 'max_tokens'. Direct OpenAI with newer
    models (gpt-4o, o-series, gpt-5+) requires 'max_completion_tokens'.
    The Codex adapter translates max_tokens internally, so we use max_tokens
    for it as well.
    """
    base_url = os.getenv("OPENAI_BASE_URL", "")
    has_openrouter_key = bool(os.getenv("OPENROUTER_API_KEY"))
    # Only a direct api.openai.com custom endpoint needs the newer kwarg,
    # and only when neither OpenRouter nor Nous Portal wins resolution.
    # (Keep _read_nous_auth() after the key check so it short-circuits.)
    if (not has_openrouter_key
            and _read_nous_auth() is None
            and "api.openai.com" in base_url.lower()):
        return {"max_completion_tokens": value}
    return {"max_tokens": value}
|
2026-03-11 20:52:19 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── Centralized LLM Call API ────────────────────────────────────────────────
|
|
|
|
|
#
|
|
|
|
|
# call_llm() and async_call_llm() own the full request lifecycle:
|
|
|
|
|
# 1. Resolve provider + model from task config (or explicit args)
|
|
|
|
|
# 2. Get or create a cached client for that provider
|
|
|
|
|
# 3. Format request args for the provider + model (max_tokens handling, etc.)
|
|
|
|
|
# 4. Make the API call
|
|
|
|
|
# 5. Return the response
|
|
|
|
|
#
|
|
|
|
|
# Every auxiliary LLM consumer should use these instead of manually
|
|
|
|
|
# constructing clients and calling .chat.completions.create().
|
|
|
|
|
|
|
|
|
|
# Client cache: (provider, async_mode) -> (client, default_model)
# NOTE: keyed by provider + sync/async variant only, NOT by model — one
# client per provider is reused and the model slug is chosen per call.
_client_cache: Dict[tuple, tuple] = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_cached_client(
    provider: str, model: str = None, async_mode: bool = False,
) -> Tuple[Optional[Any], Optional[str]]:
    """Get or create a cached client for the given provider.

    Args:
        provider: Provider name understood by resolve_provider_client().
        model: Optional model slug; when given it is returned instead of
            the provider's cached default.
        async_mode: Whether to resolve the async client variant.

    Returns:
        (client, model_slug). client is None when the provider cannot be
        resolved; failed resolutions are NOT cached, so the next call
        retries (e.g. after the user adds an API key).

    NOTE: the cache key is (provider, async_mode) — the model is not part
    of the key, so all models of a provider share one client.
    """
    cache_key = (provider, async_mode)
    if cache_key in _client_cache:
        cached_client, cached_default = _client_cache[cache_key]
        # The caller's explicit model wins over the cached default slug.
        return cached_client, model or cached_default
    client, default_model = resolve_provider_client(provider, model, async_mode)
    if client is not None:
        # Cache only successful resolutions so transient failures recover.
        _client_cache[cache_key] = (client, default_model)
    return client, model or default_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _resolve_task_provider_model(
    task: str = None,
    provider: str = None,
    model: str = None,
) -> Tuple[str, Optional[str]]:
    """Determine provider + model for a call.

    Priority:
    1. Explicit provider/model args (always win)
    2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.)
    3. Config file (auxiliary.{task}.provider/model or compression.*)
    4. "auto" (full auto-detection chain)

    Returns (provider, model) where model may be None (use provider default).
    """
    # 1. Explicit provider always wins; explicit model rides along as-is.
    if provider:
        return provider, model

    if task:
        # 2. Env var overrides first.
        env_provider = _get_auxiliary_provider(task)
        if env_provider != "auto":
            # A forced env provider may carry an env model override too.
            # Both AUXILIARY_<TASK>_MODEL and CONTEXT_<TASK>_MODEL
            # spellings are accepted (e.g. CONTEXT_COMPRESSION_MODEL);
            # first match wins.
            env_model = None
            for prefix in ("AUXILIARY_", "CONTEXT_"):
                val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip()
                if val:
                    env_model = val
                    break
            return env_provider, model or env_model

        # 3. Config file. Imported lazily; if the CLI config package is
        # not importable we fall straight through to auto-detection.
        try:
            from hermes_cli.config import load_config
            config = load_config()
        except ImportError:
            return "auto", model

        # Check auxiliary.{task} section.
        aux = config.get("auxiliary", {})
        task_config = aux.get(task, {})
        cfg_provider = task_config.get("provider", "").strip() or None
        cfg_model = task_config.get("model", "").strip() or None

        # Backwards compat: compression historically kept its own keys
        # under compression.summary_provider / summary_model.
        if task == "compression" and not cfg_provider:
            comp = config.get("compression", {})
            cfg_provider = comp.get("summary_provider", "").strip() or None
            cfg_model = cfg_model or comp.get("summary_model", "").strip() or None

        if cfg_provider and cfg_provider != "auto":
            return cfg_provider, model or cfg_model
        # No (non-auto) provider configured; keep any configured model as
        # a hint for the auto-detection chain.
        return "auto", model or cfg_model

    # 4. No task context at all: full auto-detection, explicit model only.
    return "auto", model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_call_kwargs(
|
|
|
|
|
provider: str,
|
|
|
|
|
model: str,
|
|
|
|
|
messages: list,
|
|
|
|
|
temperature: Optional[float] = None,
|
|
|
|
|
max_tokens: Optional[int] = None,
|
|
|
|
|
tools: Optional[list] = None,
|
|
|
|
|
timeout: float = 30.0,
|
|
|
|
|
extra_body: Optional[dict] = None,
|
|
|
|
|
) -> dict:
|
|
|
|
|
"""Build kwargs for .chat.completions.create() with model/provider adjustments."""
|
|
|
|
|
kwargs: Dict[str, Any] = {
|
|
|
|
|
"model": model,
|
|
|
|
|
"messages": messages,
|
|
|
|
|
"timeout": timeout,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if temperature is not None:
|
|
|
|
|
kwargs["temperature"] = temperature
|
|
|
|
|
|
|
|
|
|
if max_tokens is not None:
|
|
|
|
|
# Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
|
|
|
|
|
# Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
|
|
|
|
|
if provider == "custom":
|
|
|
|
|
custom_base = os.getenv("OPENAI_BASE_URL", "")
|
|
|
|
|
if "api.openai.com" in custom_base.lower():
|
|
|
|
|
kwargs["max_completion_tokens"] = max_tokens
|
|
|
|
|
else:
|
|
|
|
|
kwargs["max_tokens"] = max_tokens
|
|
|
|
|
else:
|
|
|
|
|
kwargs["max_tokens"] = max_tokens
|
|
|
|
|
|
|
|
|
|
if tools:
|
|
|
|
|
kwargs["tools"] = tools
|
|
|
|
|
|
|
|
|
|
# Provider-specific extra_body
|
|
|
|
|
merged_extra = dict(extra_body or {})
|
|
|
|
|
if provider == "nous" or auxiliary_is_nous:
|
|
|
|
|
merged_extra.setdefault("tags", []).extend(["product=hermes-agent"])
|
|
|
|
|
if merged_extra:
|
|
|
|
|
kwargs["extra_body"] = merged_extra
|
|
|
|
|
|
|
|
|
|
return kwargs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def call_llm(
    task: str = None,
    *,
    provider: str = None,
    model: str = None,
    messages: list,
    temperature: float = None,
    max_tokens: int = None,
    tools: list = None,
    timeout: float = 30.0,
    extra_body: dict = None,
) -> Any:
    """Centralized synchronous LLM call.

    Resolves provider + model (from task config, explicit args, or auto-detect),
    handles auth, request formatting, and model-specific arg adjustments.

    Args:
        task: Auxiliary task name ("compression", "vision", "web_extract",
            "session_search", "skills_hub", "mcp", "flush_memories").
            Reads provider:model from config/env. Ignored if provider is set.
        provider: Explicit provider override.
        model: Explicit model override.
        messages: Chat messages list.
        temperature: Sampling temperature (None = provider default).
        max_tokens: Max output tokens (handles max_tokens vs max_completion_tokens).
        tools: Tool definitions (for function calling).
        timeout: Request timeout in seconds.
        extra_body: Additional request body fields.

    Returns:
        Response object with .choices[0].message.content

    Raises:
        RuntimeError: If no provider is configured.
    """
    resolved_provider, resolved_model = _resolve_task_provider_model(
        task, provider, model)

    client, final_model = _get_cached_client(resolved_provider, resolved_model)
    if client is None:
        # Fallback: try openrouter
        if resolved_provider != "openrouter":
            logger.warning("Provider %s unavailable, falling back to openrouter",
                           resolved_provider)
            client, final_model = _get_cached_client(
                "openrouter", resolved_model or _OPENROUTER_MODEL)
            if client is not None:
                # Track the provider actually serving the call so kwargs
                # below are formatted for it (previously they were built for
                # the unavailable provider — e.g. a "custom" api.openai.com
                # endpoint would force max_completion_tokens onto an
                # OpenRouter request).
                resolved_provider = "openrouter"
        if client is None:
            raise RuntimeError(
                f"No LLM provider configured for task={task} provider={resolved_provider}. "
                f"Run: hermes setup")

    kwargs = _build_call_kwargs(
        resolved_provider, final_model, messages,
        temperature=temperature, max_tokens=max_tokens,
        tools=tools, timeout=timeout, extra_body=extra_body)

    # Some endpoints reject max_tokens and require max_completion_tokens;
    # retry once with the renamed parameter. Only retry when max_tokens was
    # actually sent — previously the retry could send
    # max_completion_tokens=None, or fire on an unrelated
    # unsupported_parameter error.
    try:
        return client.chat.completions.create(**kwargs)
    except Exception as first_err:
        err_str = str(first_err)
        looks_like_token_param = (
            "max_tokens" in err_str or "unsupported_parameter" in err_str)
        if (looks_like_token_param
                and max_tokens is not None
                and "max_tokens" in kwargs):
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = max_tokens
            return client.chat.completions.create(**kwargs)
        raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def async_call_llm(
    task: str = None,
    *,
    provider: str = None,
    model: str = None,
    messages: list,
    temperature: float = None,
    max_tokens: int = None,
    tools: list = None,
    timeout: float = 30.0,
    extra_body: dict = None,
) -> Any:
    """Centralized asynchronous LLM call.

    Same as call_llm() but async. See call_llm() for full documentation.

    Raises:
        RuntimeError: If no provider is configured.
    """
    resolved_provider, resolved_model = _resolve_task_provider_model(
        task, provider, model)

    client, final_model = _get_cached_client(
        resolved_provider, resolved_model, async_mode=True)
    if client is None:
        if resolved_provider != "openrouter":
            logger.warning("Provider %s unavailable, falling back to openrouter",
                           resolved_provider)
            client, final_model = _get_cached_client(
                "openrouter", resolved_model or _OPENROUTER_MODEL,
                async_mode=True)
            if client is not None:
                # Format the request for the provider actually serving the
                # call, not the unavailable one (keeps max_tokens handling
                # and extra_body tags consistent with the fallback).
                resolved_provider = "openrouter"
        if client is None:
            raise RuntimeError(
                f"No LLM provider configured for task={task} provider={resolved_provider}. "
                f"Run: hermes setup")

    kwargs = _build_call_kwargs(
        resolved_provider, final_model, messages,
        temperature=temperature, max_tokens=max_tokens,
        tools=tools, timeout=timeout, extra_body=extra_body)

    # Retry once with max_completion_tokens only when max_tokens was
    # actually sent and the error plausibly refers to it — avoids sending
    # max_completion_tokens=None or retrying on unrelated
    # unsupported_parameter errors.
    try:
        return await client.chat.completions.create(**kwargs)
    except Exception as first_err:
        err_str = str(first_err)
        looks_like_token_param = (
            "max_tokens" in err_str or "unsupported_parameter" in err_str)
        if (looks_like_token_param
                and max_tokens is not None
                and "max_tokens" in kwargs):
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = max_tokens
            return await client.chat.completions.create(**kwargs)
        raise
|