diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index dd8f22bb7..957452fc3 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -30,6 +30,10 @@ Default "auto" follows the chains above. Per-task model overrides (e.g. AUXILIARY_VISION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL) let callers use a different model slug than the provider's default. + +Per-task direct endpoint overrides (e.g. AUXILIARY_VISION_BASE_URL, +AUXILIARY_VISION_API_KEY) let callers route a specific auxiliary task to a +custom OpenAI-compatible endpoint without touching the main model settings. """ import json @@ -418,6 +422,17 @@ def _get_auxiliary_provider(task: str = "") -> str: return "auto" +def _get_auxiliary_env_override(task: str, suffix: str) -> Optional[str]: + """Read an auxiliary env override from AUXILIARY_* or CONTEXT_* prefixes.""" + if not task: + return None + for prefix in ("AUXILIARY_", "CONTEXT_"): + val = os.getenv(f"{prefix}{task.upper()}_{suffix}", "").strip() + if val: + return val + return None + + def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]: or_key = os.getenv("OPENROUTER_API_KEY") if not or_key: @@ -564,6 +579,8 @@ def resolve_provider_client( model: str = None, async_mode: bool = False, raw_codex: bool = False, + explicit_base_url: str = None, + explicit_api_key: str = None, ) -> Tuple[Optional[Any], Optional[str]]: """Central router: given a provider name and optional model, return a configured client with the correct auth, base URL, and API format. @@ -585,6 +602,8 @@ def resolve_provider_client( instead of wrapping in CodexAuxiliaryClient. Use this when the caller needs direct access to responses.stream() (e.g., the main agent loop). + explicit_base_url: Optional direct OpenAI-compatible endpoint. + explicit_api_key: Optional API key paired with explicit_base_url. Returns: (client, resolved_model) or (None, None) if auth is unavailable. @@ -661,6 +680,22 @@ def resolve_provider_client( # ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ─────────── if provider == "custom": + if explicit_base_url: + custom_base = explicit_base_url.strip() + custom_key = ( + (explicit_api_key or "").strip() + or os.getenv("OPENAI_API_KEY", "").strip() + ) + if not custom_base or not custom_key: + logger.warning( + "resolve_provider_client: explicit custom endpoint requested " + "but no API key was found (set explicit_api_key or OPENAI_API_KEY)" + ) + return None, None + final_model = model or _read_main_model() or "gpt-4o-mini" + client = OpenAI(api_key=custom_key, base_url=custom_base) + return (_to_async_client(client, final_model) if async_mode + else (client, final_model)) # Try custom first, then codex, then API-key providers for try_fn in (_try_custom_endpoint, _try_codex, _resolve_api_key_provider): @@ -749,10 +784,13 @@ def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optiona Callers may override the returned model with a per-task env var (e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL). """ - forced = _get_auxiliary_provider(task) - if forced != "auto": - return resolve_provider_client(forced) - return resolve_provider_client("auto") + provider, model, base_url, api_key = _resolve_task_provider_model(task or None) + return resolve_provider_client( + provider, + model=model, + explicit_base_url=base_url, + explicit_api_key=api_key, + ) def get_async_text_auxiliary_client(task: str = ""): @@ -762,10 +800,14 @@ def get_async_text_auxiliary_client(task: str = ""): (AsyncCodexAuxiliaryClient, model) which wraps the Responses API. Returns (None, None) when no provider is available. """ - forced = _get_auxiliary_provider(task) - if forced != "auto": - return resolve_provider_client(forced, async_mode=True) - return resolve_provider_client("auto", async_mode=True) + provider, model, base_url, api_key = _resolve_task_provider_model(task or None) + return resolve_provider_client( + provider, + model=model, + async_mode=True, + explicit_base_url=base_url, + explicit_api_key=api_key, + ) _VISION_AUTO_PROVIDER_ORDER = ( @@ -821,26 +863,43 @@ def resolve_vision_provider_client( provider: Optional[str] = None, model: Optional[str] = None, *, + base_url: Optional[str] = None, + api_key: Optional[str] = None, async_mode: bool = False, ) -> Tuple[Optional[str], Optional[Any], Optional[str]]: """Resolve the client actually used for vision tasks. - Explicit provider overrides still use the generic provider router for - non-standard backends, so users can intentionally force experimental - providers. Auto mode stays conservative and only tries vision backends - known to work today. + Direct endpoint overrides take precedence over provider selection. Explicit + provider overrides still use the generic provider router for non-standard + backends, so users can intentionally force experimental providers. Auto mode + stays conservative and only tries vision backends known to work today. """ - requested = _normalize_vision_provider(provider or _get_auxiliary_provider("vision")) + requested, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model( + "vision", provider, model, base_url, api_key + ) + requested = _normalize_vision_provider(requested) def _finalize(resolved_provider: str, sync_client: Any, default_model: Optional[str]): if sync_client is None: return resolved_provider, None, None - final_model = model or default_model + final_model = resolved_model or default_model if async_mode: async_client, async_model = _to_async_client(sync_client, final_model) return resolved_provider, async_client, async_model return resolved_provider, sync_client, final_model + if resolved_base_url: + client, final_model = resolve_provider_client( + "custom", + model=resolved_model, + async_mode=async_mode, + explicit_base_url=resolved_base_url, + explicit_api_key=resolved_api_key, + ) + if client is None: + return "custom", None, None + return "custom", client, final_model + if requested == "auto": for candidate in get_available_vision_backends(): sync_client, default_model = _resolve_strict_vision_backend(candidate) @@ -853,7 +912,7 @@ def resolve_vision_provider_client( sync_client, default_model = _resolve_strict_vision_backend(requested) return _finalize(requested, sync_client, default_model) - client, final_model = _get_cached_client(requested, model, async_mode) + client, final_model = _get_cached_client(requested, resolved_model, async_mode) if client is None: return requested, None, None return requested, client, final_model @@ -910,19 +969,29 @@ def auxiliary_max_tokens_param(value: int) -> dict: # Every auxiliary LLM consumer should use these instead of manually # constructing clients and calling .chat.completions.create(). -# Client cache: (provider, async_mode) -> (client, default_model) +# Client cache: (provider, async_mode, base_url, api_key) -> (client, default_model) _client_cache: Dict[tuple, tuple] = {} def _get_cached_client( - provider: str, model: str = None, async_mode: bool = False, + provider: str, + model: str = None, + async_mode: bool = False, + base_url: str = None, + api_key: str = None, ) -> Tuple[Optional[Any], Optional[str]]: """Get or create a cached client for the given provider.""" - cache_key = (provider, async_mode) + cache_key = (provider, async_mode, base_url or "", api_key or "") if cache_key in _client_cache: cached_client, cached_default = _client_cache[cache_key] return cached_client, model or cached_default - client, default_model = resolve_provider_client(provider, model, async_mode) + client, default_model = resolve_provider_client( + provider, + model, + async_mode, + explicit_base_url=base_url, + explicit_api_key=api_key, + ) if client is not None: _client_cache[cache_key] = (client, default_model) return client, model or default_model @@ -932,57 +1001,75 @@ def _resolve_task_provider_model( task: str = None, provider: str = None, model: str = None, -) -> Tuple[str, Optional[str]]: + base_url: str = None, + api_key: str = None, +) -> Tuple[str, Optional[str], Optional[str], Optional[str]]: """Determine provider + model for a call. Priority: - 1. Explicit provider/model args (always win) - 2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.) - 3. Config file (auxiliary.{task}.provider/model or compression.*) + 1. Explicit provider/model/base_url/api_key args (always win) + 2. Env var overrides (AUXILIARY_{TASK}_*, CONTEXT_{TASK}_*) + 3. Config file (auxiliary.{task}.* or compression.*) 4. "auto" (full auto-detection chain) - Returns (provider, model) where model may be None (use provider default). + Returns (provider, model, base_url, api_key) where model may be None + (use provider default). When base_url is set, provider is forced to + "custom" and the task uses that direct endpoint. """ - if provider: - return provider, model + config = {} + cfg_provider = None + cfg_model = None + cfg_base_url = None + cfg_api_key = None if task: - # Check env var overrides first - env_provider = _get_auxiliary_provider(task) - if env_provider != "auto": - # Check for env var model override too - env_model = None - for prefix in ("AUXILIARY_", "CONTEXT_"): - val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip() - if val: - env_model = val - break - return env_provider, model or env_model - - # Read from config file try: from hermes_cli.config import load_config config = load_config() except ImportError: - return "auto", model + config = {} - # Check auxiliary.{task} section - aux = config.get("auxiliary", {}) - task_config = aux.get(task, {}) - cfg_provider = task_config.get("provider", "").strip() or None - cfg_model = task_config.get("model", "").strip() or None + aux = config.get("auxiliary", {}) if isinstance(config, dict) else {} + task_config = aux.get(task, {}) if isinstance(aux, dict) else {} + if not isinstance(task_config, dict): + task_config = {} + cfg_provider = str(task_config.get("provider", "")).strip() or None + cfg_model = str(task_config.get("model", "")).strip() or None + cfg_base_url = str(task_config.get("base_url", "")).strip() or None + cfg_api_key = str(task_config.get("api_key", "")).strip() or None # Backwards compat: compression section has its own keys if task == "compression" and not cfg_provider: - comp = config.get("compression", {}) - cfg_provider = comp.get("summary_provider", "").strip() or None - cfg_model = cfg_model or comp.get("summary_model", "").strip() or None + comp = config.get("compression", {}) if isinstance(config, dict) else {} + if isinstance(comp, dict): + cfg_provider = comp.get("summary_provider", "").strip() or None + cfg_model = cfg_model or comp.get("summary_model", "").strip() or None + env_model = _get_auxiliary_env_override(task, "MODEL") if task else None + resolved_model = model or env_model or cfg_model + + if base_url: + return "custom", resolved_model, base_url, api_key + if provider: + return provider, resolved_model, base_url, api_key + + if task: + env_base_url = _get_auxiliary_env_override(task, "BASE_URL") + env_api_key = _get_auxiliary_env_override(task, "API_KEY") + if env_base_url: + return "custom", resolved_model, env_base_url, env_api_key or cfg_api_key + + env_provider = _get_auxiliary_provider(task) + if env_provider != "auto": + return env_provider, resolved_model, None, None + + if cfg_base_url: + return "custom", resolved_model, cfg_base_url, cfg_api_key if cfg_provider and cfg_provider != "auto": - return cfg_provider, model or cfg_model - return "auto", model or cfg_model + return cfg_provider, resolved_model, None, None + return "auto", resolved_model, None, None - return "auto", model + return "auto", resolved_model, None, None def _build_call_kwargs( @@ -994,6 +1081,7 @@ def _build_call_kwargs( tools: Optional[list] = None, timeout: float = 30.0, extra_body: Optional[dict] = None, + base_url: Optional[str] = None, ) -> dict: """Build kwargs for .chat.completions.create() with model/provider adjustments.""" kwargs: Dict[str, Any] = { @@ -1009,7 +1097,7 @@ def _build_call_kwargs( # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens. # Direct OpenAI api.openai.com with newer models needs max_completion_tokens. if provider == "custom": - custom_base = os.getenv("OPENAI_BASE_URL", "") + custom_base = base_url or os.getenv("OPENAI_BASE_URL", "") if "api.openai.com" in custom_base.lower(): kwargs["max_completion_tokens"] = max_tokens else: @@ -1035,6 +1123,8 @@ def call_llm( *, provider: str = None, model: str = None, + base_url: str = None, + api_key: str = None, messages: list, temperature: float = None, max_tokens: int = None, @@ -1066,16 +1156,18 @@ def call_llm( Raises: RuntimeError: If no provider is configured. """ - resolved_provider, resolved_model = _resolve_task_provider_model( - task, provider, model) + resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model( + task, provider, model, base_url, api_key) if task == "vision": effective_provider, client, final_model = resolve_vision_provider_client( - provider=resolved_provider, - model=resolved_model, + provider=provider, + model=model, + base_url=base_url, + api_key=api_key, async_mode=False, ) - if client is None and resolved_provider != "auto": + if client is None and resolved_provider != "auto" and not resolved_base_url: logger.warning( "Vision provider %s unavailable, falling back to auto vision backends", resolved_provider, @@ -1092,10 +1184,15 @@ def call_llm( ) resolved_provider = effective_provider or resolved_provider else: - client, final_model = _get_cached_client(resolved_provider, resolved_model) + client, final_model = _get_cached_client( + resolved_provider, + resolved_model, + base_url=resolved_base_url, + api_key=resolved_api_key, + ) if client is None: # Fallback: try openrouter - if resolved_provider != "openrouter": + if resolved_provider != "openrouter" and not resolved_base_url: logger.warning("Provider %s unavailable, falling back to openrouter", resolved_provider) client, final_model = _get_cached_client( @@ -1108,7 +1205,8 @@ def call_llm( kwargs = _build_call_kwargs( resolved_provider, final_model, messages, temperature=temperature, max_tokens=max_tokens, - tools=tools, timeout=timeout, extra_body=extra_body) + tools=tools, timeout=timeout, extra_body=extra_body, + base_url=resolved_base_url) # Handle max_tokens vs max_completion_tokens retry try: @@ -1127,6 +1225,8 @@ async def async_call_llm( *, provider: str = None, model: str = None, + base_url: str = None, + api_key: str = None, messages: list, temperature: float = None, max_tokens: int = None, @@ -1138,16 +1238,18 @@ async def async_call_llm( Same as call_llm() but async. See call_llm() for full documentation. """ - resolved_provider, resolved_model = _resolve_task_provider_model( - task, provider, model) + resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model( + task, provider, model, base_url, api_key) if task == "vision": effective_provider, client, final_model = resolve_vision_provider_client( - provider=resolved_provider, - model=resolved_model, + provider=provider, + model=model, + base_url=base_url, + api_key=api_key, async_mode=True, ) - if client is None and resolved_provider != "auto": + if client is None and resolved_provider != "auto" and not resolved_base_url: logger.warning( "Vision provider %s unavailable, falling back to auto vision backends", resolved_provider, @@ -1165,9 +1267,14 @@ async def async_call_llm( resolved_provider = effective_provider or resolved_provider else: client, final_model = _get_cached_client( - resolved_provider, resolved_model, async_mode=True) + resolved_provider, + resolved_model, + async_mode=True, + base_url=resolved_base_url, + api_key=resolved_api_key, + ) if client is None: - if resolved_provider != "openrouter": + if resolved_provider != "openrouter" and not resolved_base_url: logger.warning("Provider %s unavailable, falling back to openrouter", resolved_provider) client, final_model = _get_cached_client( @@ -1181,7 +1288,8 @@ async def async_call_llm( kwargs = _build_call_kwargs( resolved_provider, final_model, messages, temperature=temperature, max_tokens=max_tokens, - tools=tools, timeout=timeout, extra_body=extra_body) + tools=tools, timeout=timeout, extra_body=extra_body, + base_url=resolved_base_url) try: return await client.chat.completions.create(**kwargs) diff --git a/cli.py b/cli.py index 44c7889cd..1bebbf4fa 100755 --- a/cli.py +++ b/cli.py @@ -218,11 +218,27 @@ def load_cli_config() -> Dict[str, Any]: "timeout": 300, # Max seconds a sandbox script can run before being killed (5 min) "max_tool_calls": 50, # Max RPC tool calls per execution }, + "auxiliary": { + "vision": { + "provider": "auto", + "model": "", + "base_url": "", + "api_key": "", + }, + "web_extract": { + "provider": "auto", + "model": "", + "base_url": "", + "api_key": "", + }, + }, "delegation": { "max_iterations": 45, # Max tool-calling turns per child agent "default_toolsets": ["terminal", "file", "web"], # Default toolsets for subagents "model": "", # Subagent model override (empty = inherit parent model) "provider": "", # Subagent provider override (empty = inherit parent provider) + "base_url": "", # Direct OpenAI-compatible endpoint for subagents + "api_key": "", # API key for delegation.base_url (falls back to OPENAI_API_KEY) }, } @@ -363,28 +379,44 @@ def load_cli_config() -> Dict[str, Any]: if config_key in compression_config: os.environ[env_var] = str(compression_config[config_key]) - # Apply auxiliary model overrides to environment variables. - # Vision and web_extract each have their own provider + model pair. + # Apply auxiliary model/direct-endpoint overrides to environment variables. + # Vision and web_extract each have their own provider/model/base_url/api_key tuple. # (Compression is handled in the compression section above.) # Only set env vars for non-empty / non-default values so auto-detection # still works. auxiliary_config = defaults.get("auxiliary", {}) auxiliary_task_env = { - # config key → (provider env var, model env var) - "vision": ("AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL"), - "web_extract": ("AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL"), + # config key → env var mapping + "vision": { + "provider": "AUXILIARY_VISION_PROVIDER", + "model": "AUXILIARY_VISION_MODEL", + "base_url": "AUXILIARY_VISION_BASE_URL", + "api_key": "AUXILIARY_VISION_API_KEY", + }, + "web_extract": { + "provider": "AUXILIARY_WEB_EXTRACT_PROVIDER", + "model": "AUXILIARY_WEB_EXTRACT_MODEL", + "base_url": "AUXILIARY_WEB_EXTRACT_BASE_URL", + "api_key": "AUXILIARY_WEB_EXTRACT_API_KEY", + }, } - for task_key, (prov_env, model_env) in auxiliary_task_env.items(): + for task_key, env_map in auxiliary_task_env.items(): task_cfg = auxiliary_config.get(task_key, {}) if not isinstance(task_cfg, dict): continue prov = str(task_cfg.get("provider", "")).strip() model = str(task_cfg.get("model", "")).strip() + base_url = str(task_cfg.get("base_url", "")).strip() + api_key = str(task_cfg.get("api_key", "")).strip() if prov and prov != "auto": - os.environ[prov_env] = prov + os.environ[env_map["provider"]] = prov if model: - os.environ[model_env] = model + os.environ[env_map["model"]] = model + if base_url: + os.environ[env_map["base_url"]] = base_url + if api_key: + os.environ[env_map["api_key"]] = api_key # Security settings security_config = defaults.get("security", {}) diff --git a/gateway/run.py b/gateway/run.py index e973852b4..8941fcec5 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -100,24 +100,40 @@ if _config_path.exists(): for _cfg_key, _env_var in _compression_env_map.items(): if _cfg_key in _compression_cfg: os.environ[_env_var] = str(_compression_cfg[_cfg_key]) - # Auxiliary model overrides (vision, web_extract). - # Each task has provider + model; bridge non-default values to env vars. + # Auxiliary model/direct-endpoint overrides (vision, web_extract). + # Each task has provider/model/base_url/api_key; bridge non-default values to env vars. _auxiliary_cfg = _cfg.get("auxiliary", {}) if _auxiliary_cfg and isinstance(_auxiliary_cfg, dict): _aux_task_env = { - "vision": ("AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL"), - "web_extract": ("AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL"), + "vision": { + "provider": "AUXILIARY_VISION_PROVIDER", + "model": "AUXILIARY_VISION_MODEL", + "base_url": "AUXILIARY_VISION_BASE_URL", + "api_key": "AUXILIARY_VISION_API_KEY", + }, + "web_extract": { + "provider": "AUXILIARY_WEB_EXTRACT_PROVIDER", + "model": "AUXILIARY_WEB_EXTRACT_MODEL", + "base_url": "AUXILIARY_WEB_EXTRACT_BASE_URL", + "api_key": "AUXILIARY_WEB_EXTRACT_API_KEY", + }, } - for _task_key, (_prov_env, _model_env) in _aux_task_env.items(): + for _task_key, _env_map in _aux_task_env.items(): _task_cfg = _auxiliary_cfg.get(_task_key, {}) if not isinstance(_task_cfg, dict): continue _prov = str(_task_cfg.get("provider", "")).strip() _model = str(_task_cfg.get("model", "")).strip() + _base_url = str(_task_cfg.get("base_url", "")).strip() + _api_key = str(_task_cfg.get("api_key", "")).strip() if _prov and _prov != "auto": - os.environ[_prov_env] = _prov + os.environ[_env_map["provider"]] = _prov if _model: - os.environ[_model_env] = _model + os.environ[_env_map["model"]] = _model + if _base_url: + os.environ[_env_map["base_url"]] = _base_url + if _api_key: + os.environ[_env_map["api_key"]] = _api_key _agent_cfg = _cfg.get("agent", {}) if _agent_cfg and isinstance(_agent_cfg, dict): if "max_turns" in _agent_cfg: diff --git a/hermes_cli/config.py b/hermes_cli/config.py index bdde858d3..b67405a09 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -150,30 +150,44 @@ DEFAULT_CONFIG = { "vision": { "provider": "auto", # auto | openrouter | nous | codex | custom "model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o" + "base_url": "", # direct OpenAI-compatible endpoint (takes precedence over provider) + "api_key": "", # API key for base_url (falls back to OPENAI_API_KEY) }, "web_extract": { "provider": "auto", "model": "", + "base_url": "", + "api_key": "", }, "compression": { "provider": "auto", "model": "", + "base_url": "", + "api_key": "", }, "session_search": { "provider": "auto", "model": "", + "base_url": "", + "api_key": "", }, "skills_hub": { "provider": "auto", "model": "", + "base_url": "", + "api_key": "", }, "mcp": { "provider": "auto", "model": "", + "base_url": "", + "api_key": "", }, "flush_memories": { "provider": "auto", "model": "", + "base_url": "", + "api_key": "", }, }, @@ -243,6 +257,8 @@ DEFAULT_CONFIG = { "delegation": { "model": "", # e.g. "google/gemini-3-flash-preview" (empty = inherit parent model) "provider": "", # e.g. "openrouter" (empty = inherit parent provider + credentials) + "base_url": "", # direct OpenAI-compatible endpoint for subagents + "api_key": "", # API key for delegation.base_url (falls back to OPENAI_API_KEY) }, # Ephemeral prefill messages file — JSON list of {role, content} dicts diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index 57c73eb8b..d60e3c813 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -24,9 +24,11 @@ def _clean_env(monkeypatch): for key in ( "OPENROUTER_API_KEY", "OPENAI_BASE_URL", "OPENAI_API_KEY", "OPENAI_MODEL", "LLM_MODEL", "NOUS_INFERENCE_BASE_URL", - # Per-task provider/model overrides + # Per-task provider/model/direct-endpoint overrides "AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL", + "AUXILIARY_VISION_BASE_URL", "AUXILIARY_VISION_API_KEY", "AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL", + "AUXILIARY_WEB_EXTRACT_BASE_URL", "AUXILIARY_WEB_EXTRACT_API_KEY", "CONTEXT_COMPRESSION_PROVIDER", "CONTEXT_COMPRESSION_MODEL", ): monkeypatch.delenv(key, raising=False) @@ -142,6 +144,27 @@ class TestGetTextAuxiliaryClient: call_kwargs = mock_openai.call_args assert call_kwargs.kwargs["base_url"] == "http://localhost:1234/v1" + def test_task_direct_endpoint_override(self, monkeypatch): + monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") + monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_BASE_URL", "http://localhost:2345/v1") + monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_API_KEY", "task-key") + monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_MODEL", "task-model") + with patch("agent.auxiliary_client.OpenAI") as mock_openai: + client, model = get_text_auxiliary_client("web_extract") + assert model == "task-model" + assert mock_openai.call_args.kwargs["base_url"] == "http://localhost:2345/v1" + assert mock_openai.call_args.kwargs["api_key"] == "task-key" + + def test_task_direct_endpoint_without_openai_key_does_not_fall_back(self, monkeypatch): + monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") + monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_BASE_URL", "http://localhost:2345/v1") + monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_MODEL", "task-model") + with patch("agent.auxiliary_client.OpenAI") as mock_openai: + client, model = get_text_auxiliary_client("web_extract") + assert client is None + assert model is None + mock_openai.assert_not_called() + def test_codex_fallback_when_nothing_else(self, codex_auth_dir): with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \ patch("agent.auxiliary_client.OpenAI") as mock_openai: @@ -194,6 +217,27 @@ class TestVisionClientFallback: client, model = get_vision_auxiliary_client() assert client is not None # Custom endpoint picked up as fallback + def test_vision_direct_endpoint_override(self, monkeypatch): + monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") + monkeypatch.setenv("AUXILIARY_VISION_BASE_URL", "http://localhost:4567/v1") + monkeypatch.setenv("AUXILIARY_VISION_API_KEY", "vision-key") + monkeypatch.setenv("AUXILIARY_VISION_MODEL", "vision-model") + with patch("agent.auxiliary_client.OpenAI") as mock_openai: + client, model = get_vision_auxiliary_client() + assert model == "vision-model" + assert mock_openai.call_args.kwargs["base_url"] == "http://localhost:4567/v1" + assert mock_openai.call_args.kwargs["api_key"] == "vision-key" + + def test_vision_direct_endpoint_requires_openai_api_key(self, monkeypatch): + monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") + monkeypatch.setenv("AUXILIARY_VISION_BASE_URL", "http://localhost:4567/v1") + monkeypatch.setenv("AUXILIARY_VISION_MODEL", "vision-model") + with patch("agent.auxiliary_client.OpenAI") as mock_openai: + client, model = get_vision_auxiliary_client() + assert client is None + assert model is None + mock_openai.assert_not_called() + def test_vision_uses_openrouter_when_available(self, monkeypatch): monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") with patch("agent.auxiliary_client.OpenAI") as mock_openai: @@ -390,6 +434,24 @@ class TestTaskSpecificOverrides: client, model = get_text_auxiliary_client("web_extract") assert model == "google/gemini-3-flash-preview" + def test_task_direct_endpoint_from_config(self, monkeypatch, tmp_path): + hermes_home = tmp_path / "hermes" + hermes_home.mkdir(parents=True, exist_ok=True) + (hermes_home / "config.yaml").write_text( + """auxiliary: + web_extract: + base_url: http://localhost:3456/v1 + api_key: config-key + model: config-model +""" + ) + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + with patch("agent.auxiliary_client.OpenAI") as mock_openai: + client, model = get_text_auxiliary_client("web_extract") + assert model == "config-model" + assert mock_openai.call_args.kwargs["base_url"] == "http://localhost:3456/v1" + assert mock_openai.call_args.kwargs["api_key"] == "config-key" + def test_task_without_override_uses_auto(self, monkeypatch): """A task with no provider env var falls through to auto chain.""" monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") diff --git a/tests/conftest.py b/tests/conftest.py index 9c9f9a44e..67fad819b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,6 +26,12 @@ def _isolate_hermes_home(tmp_path, monkeypatch): (fake_home / "memories").mkdir() (fake_home / "skills").mkdir() monkeypatch.setenv("HERMES_HOME", str(fake_home)) + # Tests should not inherit the agent's current gateway/messaging surface. + # Individual tests that need gateway behavior set these explicitly. + monkeypatch.delenv("HERMES_SESSION_PLATFORM", raising=False) + monkeypatch.delenv("HERMES_SESSION_CHAT_ID", raising=False) + monkeypatch.delenv("HERMES_SESSION_CHAT_NAME", raising=False) + monkeypatch.delenv("HERMES_GATEWAY_SESSION", raising=False) @pytest.fixture() diff --git a/tests/test_auxiliary_config_bridge.py b/tests/test_auxiliary_config_bridge.py index a4d65c2af..22e88bdf8 100644 --- a/tests/test_auxiliary_config_bridge.py +++ b/tests/test_auxiliary_config_bridge.py @@ -25,7 +25,9 @@ def _run_auxiliary_bridge(config_dict, monkeypatch): # Clear env vars for key in ( "AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL", + "AUXILIARY_VISION_BASE_URL", "AUXILIARY_VISION_API_KEY", "AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL", + "AUXILIARY_WEB_EXTRACT_BASE_URL", "AUXILIARY_WEB_EXTRACT_API_KEY", "CONTEXT_COMPRESSION_PROVIDER", "CONTEXT_COMPRESSION_MODEL", ): monkeypatch.delenv(key, raising=False) @@ -47,19 +49,35 @@ def _run_auxiliary_bridge(config_dict, monkeypatch): auxiliary_cfg = config_dict.get("auxiliary", {}) if auxiliary_cfg and isinstance(auxiliary_cfg, dict): aux_task_env = { - "vision": ("AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL"), - "web_extract": ("AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL"), + "vision": { + "provider": "AUXILIARY_VISION_PROVIDER", + "model": "AUXILIARY_VISION_MODEL", + "base_url": "AUXILIARY_VISION_BASE_URL", + "api_key": "AUXILIARY_VISION_API_KEY", + }, + "web_extract": { + "provider": "AUXILIARY_WEB_EXTRACT_PROVIDER", + "model": "AUXILIARY_WEB_EXTRACT_MODEL", + "base_url": "AUXILIARY_WEB_EXTRACT_BASE_URL", + "api_key": "AUXILIARY_WEB_EXTRACT_API_KEY", + }, } - for task_key, (prov_env, model_env) in aux_task_env.items(): + for task_key, env_map in aux_task_env.items(): task_cfg = auxiliary_cfg.get(task_key, {}) if not isinstance(task_cfg, dict): continue prov = str(task_cfg.get("provider", "")).strip() model = str(task_cfg.get("model", "")).strip() + base_url = str(task_cfg.get("base_url", "")).strip() + api_key = str(task_cfg.get("api_key", "")).strip() if prov and prov != "auto": - os.environ[prov_env] = prov + os.environ[env_map["provider"]] = prov if model: - os.environ[model_env] = model + os.environ[env_map["model"]] = model + if base_url: + os.environ[env_map["base_url"]] = base_url + if api_key: + os.environ[env_map["api_key"]] = api_key # ── Config bridging tests ──────────────────────────────────────────────────── @@ -101,6 +119,21 @@ class TestAuxiliaryConfigBridge: assert os.environ.get("AUXILIARY_WEB_EXTRACT_PROVIDER") == "nous" assert os.environ.get("AUXILIARY_WEB_EXTRACT_MODEL") == "gemini-2.5-flash" + def test_direct_endpoint_bridged(self, monkeypatch): + config = { + "auxiliary": { + "vision": { + "base_url": "http://localhost:1234/v1", + "api_key": "local-key", + "model": "qwen2.5-vl", + } + } + } + _run_auxiliary_bridge(config, monkeypatch) + assert os.environ.get("AUXILIARY_VISION_BASE_URL") == "http://localhost:1234/v1" + assert os.environ.get("AUXILIARY_VISION_API_KEY") == "local-key" + assert os.environ.get("AUXILIARY_VISION_MODEL") == "qwen2.5-vl" + def test_compression_provider_bridged(self, monkeypatch): config = { "compression": { @@ -200,8 +233,12 @@ class TestGatewayBridgeCodeParity: # Check for key patterns that indicate the bridge is present assert "AUXILIARY_VISION_PROVIDER" in content assert "AUXILIARY_VISION_MODEL" in content + assert "AUXILIARY_VISION_BASE_URL" in content + assert "AUXILIARY_VISION_API_KEY" in content assert "AUXILIARY_WEB_EXTRACT_PROVIDER" in content assert "AUXILIARY_WEB_EXTRACT_MODEL" in content + assert "AUXILIARY_WEB_EXTRACT_BASE_URL" in content + assert "AUXILIARY_WEB_EXTRACT_API_KEY" in content def test_gateway_has_compression_provider(self): """Gateway must bridge compression.summary_provider.""" diff --git a/tests/tools/test_delegate.py b/tests/tools/test_delegate.py index 680233b0f..a29560b2c 100644 --- a/tests/tools/test_delegate.py +++ b/tests/tools/test_delegate.py @@ -10,6 +10,7 @@ Run with: python -m pytest tests/test_delegate.py -v """ import json +import os import sys import unittest from unittest.mock import MagicMock, patch @@ -462,6 +463,43 @@ class TestDelegationCredentialResolution(unittest.TestCase): self.assertEqual(creds["api_mode"], "chat_completions") mock_resolve.assert_called_once_with(requested="openrouter") + def test_direct_endpoint_uses_configured_base_url_and_api_key(self): + parent = _make_mock_parent(depth=0) + cfg = { + "model": "qwen2.5-coder", + "provider": "openrouter", + "base_url": "http://localhost:1234/v1", + "api_key": "local-key", + } + creds = _resolve_delegation_credentials(cfg, parent) + self.assertEqual(creds["model"], "qwen2.5-coder") + self.assertEqual(creds["provider"], "custom") + self.assertEqual(creds["base_url"], "http://localhost:1234/v1") + self.assertEqual(creds["api_key"], "local-key") + self.assertEqual(creds["api_mode"], "chat_completions") + + def test_direct_endpoint_falls_back_to_openai_api_key_env(self): + parent = _make_mock_parent(depth=0) + cfg = { + "model": "qwen2.5-coder", + "base_url": "http://localhost:1234/v1", + } + with patch.dict(os.environ, {"OPENAI_API_KEY": "env-openai-key"}, clear=False): + creds = _resolve_delegation_credentials(cfg, parent) + self.assertEqual(creds["api_key"], "env-openai-key") + self.assertEqual(creds["provider"], "custom") + + def test_direct_endpoint_does_not_fall_back_to_openrouter_api_key_env(self): + parent = _make_mock_parent(depth=0) + cfg = { + "model": "qwen2.5-coder", + "base_url": "http://localhost:1234/v1", + } + with patch.dict(os.environ, {"OPENROUTER_API_KEY": "env-openrouter-key"}, clear=False): + with self.assertRaises(ValueError) as ctx: + _resolve_delegation_credentials(cfg, parent) + self.assertIn("OPENAI_API_KEY", str(ctx.exception)) + @patch("hermes_cli.runtime_provider.resolve_runtime_provider") def test_nous_provider_resolves_nous_credentials(self, mock_resolve): """Nous provider resolves Nous Portal base_url and api_key.""" @@ -589,6 +627,40 @@ class TestDelegationProviderIntegration(unittest.TestCase): self.assertNotEqual(kwargs["base_url"], parent.base_url) self.assertNotEqual(kwargs["api_key"], parent.api_key) + @patch("tools.delegate_tool._load_config") + @patch("tools.delegate_tool._resolve_delegation_credentials") + def test_direct_endpoint_credentials_reach_child_agent(self, mock_creds, mock_cfg): + mock_cfg.return_value = { + "max_iterations": 45, + "model": "qwen2.5-coder", + "base_url": "http://localhost:1234/v1", + "api_key": "local-key", + } + mock_creds.return_value = { + "model": "qwen2.5-coder", + "provider": "custom", + "base_url": "http://localhost:1234/v1", + "api_key": "local-key", + "api_mode": "chat_completions", + } + parent = _make_mock_parent(depth=0) + + with patch("run_agent.AIAgent") as MockAgent: + mock_child = MagicMock() + mock_child.run_conversation.return_value = { + "final_response": "done", "completed": True, "api_calls": 1 + } + MockAgent.return_value = mock_child + + delegate_task(goal="Direct endpoint test", parent_agent=parent) + + _, kwargs = MockAgent.call_args + self.assertEqual(kwargs["model"], "qwen2.5-coder") + self.assertEqual(kwargs["provider"], "custom") + self.assertEqual(kwargs["base_url"], "http://localhost:1234/v1") + self.assertEqual(kwargs["api_key"], "local-key") + self.assertEqual(kwargs["api_mode"], "chat_completions") + @patch("tools.delegate_tool._load_config") @patch("tools.delegate_tool._resolve_delegation_credentials") def test_empty_config_inherits_parent(self, mock_creds, mock_cfg): diff --git a/tools/delegate_tool.py b/tools/delegate_tool.py index 76026be59..0d5908ab5 100644 --- a/tools/delegate_tool.py +++ b/tools/delegate_tool.py @@ -540,18 +540,51 @@ def delegate_task( def _resolve_delegation_credentials(cfg: dict, parent_agent) -> dict: """Resolve credentials for subagent delegation. - If ``delegation.provider`` is configured, resolves the full credential - bundle (base_url, api_key, api_mode, provider) via the runtime provider - system — the same path used by CLI/gateway startup. This lets subagents - run on a completely different provider:model pair. + If ``delegation.base_url`` is configured, subagents use that direct + OpenAI-compatible endpoint. Otherwise, if ``delegation.provider`` is + configured, the full credential bundle (base_url, api_key, api_mode, + provider) is resolved via the runtime provider system — the same path used + by CLI/gateway startup. This lets subagents run on a completely different + provider:model pair. - If no provider is configured, returns None values so the child inherits - everything from the parent agent. + If neither base_url nor provider is configured, returns None values so the + child inherits everything from the parent agent. Raises ValueError with a user-friendly message on credential failure. """ - configured_model = cfg.get("model") or None - configured_provider = cfg.get("provider") or None + configured_model = str(cfg.get("model") or "").strip() or None + configured_provider = str(cfg.get("provider") or "").strip() or None + configured_base_url = str(cfg.get("base_url") or "").strip() or None + configured_api_key = str(cfg.get("api_key") or "").strip() or None + + if configured_base_url: + api_key = ( + configured_api_key + or os.getenv("OPENAI_API_KEY", "").strip() + ) + if not api_key: + raise ValueError( + "Delegation base_url is configured but no API key was found. " + "Set delegation.api_key or OPENAI_API_KEY." + ) + + base_lower = configured_base_url.lower() + provider = "custom" + api_mode = "chat_completions" + if "chatgpt.com/backend-api/codex" in base_lower: + provider = "openai-codex" + api_mode = "codex_responses" + elif "api.anthropic.com" in base_lower: + provider = "anthropic" + api_mode = "anthropic_messages" + + return { + "model": configured_model, + "provider": provider, + "base_url": configured_base_url, + "api_key": api_key, + "api_mode": api_mode, + } if not configured_provider: # No provider override — child inherits everything from parent @@ -570,7 +603,8 @@ def _resolve_delegation_credentials(cfg: dict, parent_agent) -> dict: except Exception as exc: raise ValueError( f"Cannot resolve delegation provider '{configured_provider}': {exc}. " - f"Check that the provider is configured (API key set, valid provider name). " + f"Check that the provider is configured (API key set, valid provider name), " + f"or set delegation.base_url/delegation.api_key for a direct endpoint. " f"Available providers: openrouter, nous, zai, kimi-coding, minimax." ) from exc diff --git a/website/docs/reference/environment-variables.md b/website/docs/reference/environment-variables.md index d4f633ee0..36a54d26c 100644 --- a/website/docs/reference/environment-variables.md +++ b/website/docs/reference/environment-variables.md @@ -180,6 +180,23 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe | `CONTEXT_COMPRESSION_THRESHOLD` | Trigger at this % of limit (default: 0.50) | | `CONTEXT_COMPRESSION_MODEL` | Model for summaries | +## Auxiliary Task Overrides + +| Variable | Description | +|----------|-------------| +| `AUXILIARY_VISION_PROVIDER` | Override provider for vision tasks | +| `AUXILIARY_VISION_MODEL` | Override model for vision tasks | +| `AUXILIARY_VISION_BASE_URL` | Direct OpenAI-compatible endpoint for vision tasks | +| `AUXILIARY_VISION_API_KEY` | API key paired with `AUXILIARY_VISION_BASE_URL` | +| `AUXILIARY_WEB_EXTRACT_PROVIDER` | Override provider for web extraction/summarization | +| `AUXILIARY_WEB_EXTRACT_MODEL` | Override model for web extraction/summarization | +| `AUXILIARY_WEB_EXTRACT_BASE_URL` | Direct OpenAI-compatible endpoint for web extraction/summarization | +| `AUXILIARY_WEB_EXTRACT_API_KEY` | API key paired with `AUXILIARY_WEB_EXTRACT_BASE_URL` | +| `CONTEXT_COMPRESSION_PROVIDER` | Override provider for context compression summaries | +| `CONTEXT_COMPRESSION_MODEL` | Override model for context compression summaries | + +For task-specific direct endpoints, Hermes uses the task's configured API key or `OPENAI_API_KEY`. It does not reuse `OPENROUTER_API_KEY` for those custom endpoints. + ## Provider Routing (config.yaml only) These go in `~/.hermes/config.yaml` under the `provider_routing` section: diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index 71525764e..0a1c50cb0 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -569,11 +569,15 @@ auxiliary: vision: provider: "auto" # "auto", "openrouter", "nous", "main" model: "" # e.g. "openai/gpt-4o", "google/gemini-2.5-flash" + base_url: "" # direct OpenAI-compatible endpoint (takes precedence over provider) + api_key: "" # API key for base_url (falls back to OPENAI_API_KEY) # Web page summarization + browser page text extraction web_extract: provider: "auto" model: "" # e.g. "google/gemini-2.5-flash" + base_url: "" + api_key: "" ``` ### Changing the Vision Model @@ -604,6 +608,17 @@ AUXILIARY_VISION_MODEL=openai/gpt-4o ### Common Setups +**Using a direct custom endpoint** (clearer than `provider: "main"` for local/self-hosted APIs): +```yaml +auxiliary: + vision: + base_url: "http://localhost:1234/v1" + api_key: "local-key" + model: "qwen2.5-vl" +``` + +`base_url` takes precedence over `provider`, so this is the most explicit way to route an auxiliary task to a specific endpoint. For direct endpoint overrides, Hermes uses the configured `api_key` or falls back to `OPENAI_API_KEY`; it does not reuse `OPENROUTER_API_KEY` for that custom endpoint. + **Using OpenAI API key for vision:** ```yaml # In ~/.hermes/.env: @@ -848,13 +863,17 @@ delegation: - web # model: "google/gemini-3-flash-preview" # Override model (empty = inherit parent) # provider: "openrouter" # Override provider (empty = inherit parent) + # base_url: "http://localhost:1234/v1" # Direct OpenAI-compatible endpoint (takes precedence over provider) + # api_key: "local-key" # API key for base_url (falls back to OPENAI_API_KEY) ``` **Subagent provider:model override:** By default, subagents inherit the parent agent's provider and model. Set `delegation.provider` and `delegation.model` to route subagents to a different provider:model pair — e.g., use a cheap/fast model for narrowly-scoped subtasks while your primary agent runs an expensive reasoning model. +**Direct endpoint override:** If you want the obvious custom-endpoint path, set `delegation.base_url`, `delegation.api_key`, and `delegation.model`. That sends subagents directly to that OpenAI-compatible endpoint and takes precedence over `delegation.provider`. If `delegation.api_key` is omitted, Hermes falls back to `OPENAI_API_KEY` only. + The delegation provider uses the same credential resolution as CLI/gateway startup. All configured providers are supported: `openrouter`, `nous`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`. When a provider is set, the system automatically resolves the correct base URL, API key, and API mode — no manual credential wiring needed. -**Precedence:** `delegation.provider` in config → parent provider (inherited). `delegation.model` in config → parent model (inherited). Setting just `model` without `provider` changes only the model name while keeping the parent's credentials (useful for switching models within the same provider like OpenRouter). +**Precedence:** `delegation.base_url` in config → `delegation.provider` in config → parent provider (inherited). `delegation.model` in config → parent model (inherited). Setting just `model` without `provider` changes only the model name while keeping the parent's credentials (useful for switching models within the same provider like OpenRouter). ## Clarify diff --git a/website/docs/user-guide/features/delegation.md b/website/docs/user-guide/features/delegation.md index 782371677..f3193d9a2 100644 --- a/website/docs/user-guide/features/delegation.md +++ b/website/docs/user-guide/features/delegation.md @@ -209,6 +209,14 @@ Delegation has a **depth limit of 2** — a parent (depth 0) can spawn children delegation: max_iterations: 50 # Max turns per child (default: 50) default_toolsets: ["terminal", "file", "web"] # Default toolsets + model: "google/gemini-3-flash-preview" # Optional provider/model override + provider: "openrouter" # Optional built-in provider + +# Or use a direct custom endpoint instead of provider: +delegation: + model: "qwen2.5-coder" + base_url: "http://localhost:1234/v1" + api_key: "local-key" ``` :::tip