feat: enhance auxiliary model configuration and environment variable handling

- Added support for auxiliary model overrides in the configuration, allowing users to specify providers and models for vision and web extraction tasks.
- Updated the CLI configuration example to include new auxiliary model settings.
- Enhanced the environment variable mapping in the CLI to accommodate auxiliary model configurations.
- Improved the resolution logic for auxiliary clients to support task-specific provider overrides.
- Updated relevant documentation and comments for clarity on the new features and their usage.
2026-03-07 08:52:06 -08:00
parent 0efbb137e8
commit d9f373654b
9 changed files with 271 additions and 81 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -4,7 +4,7 @@ Provides a single resolution chain so every consumer (context compression,
 session search, web extraction, vision analysis, browser vision) picks up
 the best available backend without duplicating fallback logic.

-Resolution order for text tasks:
+Resolution order (same for text and vision tasks):
  1. OpenRouter  (OPENROUTER_API_KEY)
  2. Nous Portal (~/.hermes/auth.json active provider)
  3. Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY)
@@ -14,10 +14,10 @@ Resolution order for text tasks:
     — checked via PROVIDER_REGISTRY entries with auth_type='api_key'
  6. None

-Resolution order for vision/multimodal tasks:
-  1. OpenRouter
-  2. Nous Portal
-  3. None  (custom endpoints can't substitute for Gemini multimodal)
+Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER,
+CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task:
+"openrouter", "nous", or "main" (= steps 3-5).
+Default "auto" follows the full chain above.
 """

 import json
@@ -337,59 +337,122 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
    return None, None


-# ── Public API ──────────────────────────────────────────────────────────────
+# ── Provider resolution helpers ─────────────────────────────────────────────

-def get_text_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
-    """Return (client, model_slug) for text-only auxiliary tasks.
+def _get_auxiliary_provider(task: str = "") -> str:
+    """Read the provider override for a specific auxiliary task.

-    Falls through OpenRouter -> Nous Portal -> custom endpoint -> Codex OAuth
-    -> direct API-key providers -> (None, None).
+    Checks AUXILIARY_{TASK}_PROVIDER first (e.g. AUXILIARY_VISION_PROVIDER),
+    then CONTEXT_{TASK}_PROVIDER (for the compression section's summary_provider),
+    then falls back to "auto".  Returns one of: "auto", "openrouter", "nous", "main".
    """
-    # 1. OpenRouter
+    if task:
+        for prefix in ("AUXILIARY_", "CONTEXT_"):
+            val = os.getenv(f"{prefix}{task.upper()}_PROVIDER", "").strip().lower()
+            if val and val != "auto":
+                return val
+    return "auto"
+
+
+def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]:
    or_key = os.getenv("OPENROUTER_API_KEY")
-    if or_key:
-        logger.debug("Auxiliary text client: OpenRouter")
-        return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL,
-                       default_headers=_OR_HEADERS), _OPENROUTER_MODEL
+    if not or_key:
+        return None, None
+    logger.debug("Auxiliary client: OpenRouter")
+    return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL,
+                   default_headers=_OR_HEADERS), _OPENROUTER_MODEL

-    # 2. Nous Portal
+
+def _try_nous() -> Tuple[Optional[OpenAI], Optional[str]]:
    nous = _read_nous_auth()
-    if nous:
-        global auxiliary_is_nous
-        auxiliary_is_nous = True
-        logger.debug("Auxiliary text client: Nous Portal")
-        return (
-            OpenAI(api_key=_nous_api_key(nous), base_url=_nous_base_url()),
-            _NOUS_MODEL,
-        )
+    if not nous:
+        return None, None
+    global auxiliary_is_nous
+    auxiliary_is_nous = True
+    logger.debug("Auxiliary client: Nous Portal")
+    return (
+        OpenAI(api_key=_nous_api_key(nous), base_url=_nous_base_url()),
+        _NOUS_MODEL,
+    )

-    # 3. Custom endpoint (both base URL and key must be set)
+
+def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
    custom_base = os.getenv("OPENAI_BASE_URL")
    custom_key = os.getenv("OPENAI_API_KEY")
-    if custom_base and custom_key:
-        model = os.getenv("OPENAI_MODEL") or os.getenv("LLM_MODEL") or "gpt-4o-mini"
-        logger.debug("Auxiliary text client: custom endpoint (%s)", model)
-        return OpenAI(api_key=custom_key, base_url=custom_base), model
+    if not custom_base or not custom_key:
+        return None, None
+    model = os.getenv("OPENAI_MODEL") or os.getenv("LLM_MODEL") or "gpt-4o-mini"
+    logger.debug("Auxiliary client: custom endpoint (%s)", model)
+    return OpenAI(api_key=custom_key, base_url=custom_base), model

-    # 4. Codex OAuth -- uses the Responses API (only endpoint the token
-    # can access), wrapped to look like a chat.completions client.
+
+def _try_codex() -> Tuple[Optional[Any], Optional[str]]:
    codex_token = _read_codex_access_token()
-    if codex_token:
-        logger.debug("Auxiliary text client: Codex OAuth (%s via Responses API)", _CODEX_AUX_MODEL)
-        real_client = OpenAI(api_key=codex_token, base_url=_CODEX_AUX_BASE_URL)
-        return CodexAuxiliaryClient(real_client, _CODEX_AUX_MODEL), _CODEX_AUX_MODEL
+    if not codex_token:
+        return None, None
+    logger.debug("Auxiliary client: Codex OAuth (%s via Responses API)", _CODEX_AUX_MODEL)
+    real_client = OpenAI(api_key=codex_token, base_url=_CODEX_AUX_BASE_URL)
+    return CodexAuxiliaryClient(real_client, _CODEX_AUX_MODEL), _CODEX_AUX_MODEL

-    # 5. Direct API-key providers (z.ai/GLM, Kimi/Moonshot, MiniMax, etc.)
-    api_client, api_model = _resolve_api_key_provider()
-    if api_client is not None:
-        return api_client, api_model

-    # 6. Nothing available
-    logger.debug("Auxiliary text client: none available")
+def _resolve_forced_provider(forced: str) -> Tuple[Optional[OpenAI], Optional[str]]:
+    """Resolve a forced provider; (None, None) if its creds are missing, auto chain if the name is unknown."""
+    if forced == "openrouter":
+        client, model = _try_openrouter()
+        if client is None:
+            logger.warning("auxiliary.provider=openrouter but OPENROUTER_API_KEY not set")
+        return client, model
+
+    if forced == "nous":
+        client, model = _try_nous()
+        if client is None:
+            logger.warning("auxiliary.provider=nous but Nous Portal not configured (run: hermes login)")
+        return client, model
+
+    if forced == "main":
+        # "main" = skip OpenRouter/Nous, use the main chat model's credentials.
+        for try_fn in (_try_custom_endpoint, _try_codex, _resolve_api_key_provider):
+            client, model = try_fn()
+            if client is not None:
+                return client, model
+        logger.warning("auxiliary.provider=main but no main endpoint credentials found")
+        return None, None
+
+    # Unknown provider name — run the auto chain so the warning below is actually true.
+    logger.warning("Unknown auxiliary.provider=%r, falling back to auto", forced)
-    return None, None
+    return _resolve_auto()


-def get_async_text_auxiliary_client():
+def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
+    """Full auto-detection chain: OpenRouter → Nous → custom → Codex → API-key → None."""
+    for try_fn in (_try_openrouter, _try_nous, _try_custom_endpoint,
+                   _try_codex, _resolve_api_key_provider):
+        client, model = try_fn()
+        if client is not None:
+            return client, model
+    logger.debug("Auxiliary client: none available")
+    return None, None
+
+
+# ── Public API ──────────────────────────────────────────────────────────────
+
+def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optional[str]]:
+    """Return (client, default_model_slug) for text-only auxiliary tasks.
+
+    Args:
+        task: Optional task name ("compression", "web_extract") to check
+              for a task-specific provider override.
+
+    Callers may override the returned model with a per-task env var
+    (e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL).
+    """
+    forced = _get_auxiliary_provider(task)
+    if forced != "auto":
+        return _resolve_forced_provider(forced)
+    return _resolve_auto()
+
+
+def get_async_text_auxiliary_client(task: str = ""):
    """Return (async_client, model_slug) for async consumers.

    For standard providers returns (AsyncOpenAI, model). For Codex returns
@@ -398,7 +461,7 @@ def get_async_text_auxiliary_client():
    """
    from openai import AsyncOpenAI

-    sync_client, model = get_text_auxiliary_client()
+    sync_client, model = get_text_auxiliary_client(task)
    if sync_client is None:
        return None, None

@@ -417,30 +480,16 @@ def get_async_text_auxiliary_client():


 def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
-    """Return (client, model_slug) for vision/multimodal auxiliary tasks.
+    """Return (client, default_model_slug) for vision/multimodal auxiliary tasks.

-    Only OpenRouter and Nous Portal qualify — custom endpoints cannot
-    substitute for Gemini multimodal.
+    Checks AUXILIARY_VISION_PROVIDER for a forced provider, otherwise
+    auto-detects.  Callers may override the returned model with
+    AUXILIARY_VISION_MODEL.
    """
-    # 1. OpenRouter
-    or_key = os.getenv("OPENROUTER_API_KEY")
-    if or_key:
-        logger.debug("Auxiliary vision client: OpenRouter")
-        return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL,
-                       default_headers=_OR_HEADERS), _OPENROUTER_MODEL
-
-    # 2. Nous Portal
-    nous = _read_nous_auth()
-    if nous:
-        logger.debug("Auxiliary vision client: Nous Portal")
-        return (
-            OpenAI(api_key=_nous_api_key(nous), base_url=_nous_base_url()),
-            _NOUS_MODEL,
-        )
-
-    # 3. Nothing suitable
-    logger.debug("Auxiliary vision client: none available")
-    return None, None
+    forced = _get_auxiliary_provider("vision")
+    if forced != "auto":
+        return _resolve_forced_provider(forced)
+    return _resolve_auto()


 def get_auxiliary_extra_body() -> dict:
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -53,7 +53,7 @@ class ContextCompressor:
        self.last_completion_tokens = 0
        self.last_total_tokens = 0

-        self.client, default_model = get_text_auxiliary_client()
+        self.client, default_model = get_text_auxiliary_client("compression")
        self.summary_model = summary_model_override or default_model

    def update_from_response(self, usage: Dict[str, Any]):
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@@ -209,8 +209,58 @@ compression:
  threshold: 0.85
  
  # Model to use for generating summaries (fast/cheap recommended)
-  # This model compresses the middle turns into a concise summary
+  # This model compresses the middle turns into a concise summary.
+  # IMPORTANT: it receives the full middle section of the conversation, so it
+  # MUST support a context length at least as large as your main model's.
  summary_model: "google/gemini-3-flash-preview"
+  
+  # Provider for the summary model (default: "auto")
+  # Options: "auto", "openrouter", "nous", "main"
+  # summary_provider: "auto"
+
+# =============================================================================
+# Auxiliary Models (Advanced — Experimental)
+# =============================================================================
+# Hermes uses lightweight "auxiliary" models for side tasks: image analysis,
+# browser screenshot analysis, web page summarization, and context compression.
+#
+# By default these use Gemini Flash via OpenRouter or Nous Portal and are
+# auto-detected from your credentials.  You do NOT need to change anything
+# here for normal usage.
+#
+# WARNING: Overriding these with providers other than OpenRouter or Nous Portal
+# is EXPERIMENTAL and may not work.  Not all models/providers support vision,
+# produce usable summaries, or accept the same API format.  Change at your own
+# risk — if things break, reset to "auto" / empty values.
+#
+# Each task has its own provider + model pair so you can mix providers.
+# For example: OpenRouter for vision (needs multimodal), but your main
+# local endpoint for compression (just needs text).
+#
+# Provider options:
+#   "auto"       - Best available: OpenRouter → Nous Portal → main endpoint (default)
+#   "openrouter" - Force OpenRouter (requires OPENROUTER_API_KEY)
+#   "nous"       - Force Nous Portal (requires: hermes login)
+#   "main"       - Use the same provider & credentials as your main chat model.
+#                  Skips OpenRouter/Nous and uses your custom endpoint
+#                  (OPENAI_BASE_URL), Codex OAuth, or API-key provider directly.
+#                  Useful if you run a local model and want auxiliary tasks to
+#                  use it too.
+#
+# Model: leave empty to use the provider's default.  When empty, OpenRouter
+# uses "google/gemini-3-flash-preview" and Nous uses "gemini-3-flash".
+# Other providers pick a sensible default automatically.
+#
+# auxiliary:
+#   # Image analysis: vision_analyze tool + browser screenshots
+#   vision:
+#     provider: "auto"
+#     model: ""              # e.g. "google/gemini-2.5-flash", "openai/gpt-4o"
+#
+#   # Web page scraping / summarization + browser page text extraction
+#   web_extract:
+#     provider: "auto"
+#     model: ""

 # =============================================================================
 # Persistent Memory
--- a/cli.py
+++ b/cli.py
@@ -333,12 +333,36 @@ def load_cli_config() -> Dict[str, Any]:
        "enabled": "CONTEXT_COMPRESSION_ENABLED",
        "threshold": "CONTEXT_COMPRESSION_THRESHOLD",
        "summary_model": "CONTEXT_COMPRESSION_MODEL",
+        "summary_provider": "CONTEXT_COMPRESSION_PROVIDER",
    }
    
    for config_key, env_var in compression_env_mappings.items():
        if config_key in compression_config:
            os.environ[env_var] = str(compression_config[config_key])
    
+    # Apply auxiliary model overrides to environment variables.
+    # Vision and web_extract each have their own provider + model pair.
+    # (Compression is handled in the compression section above.)
+    # Only set env vars for non-empty / non-default values so auto-detection
+    # still works.
+    auxiliary_config = defaults.get("auxiliary", {})
+    auxiliary_task_env = {
+        # config key → (provider env var, model env var)
+        "vision":      ("AUXILIARY_VISION_PROVIDER",      "AUXILIARY_VISION_MODEL"),
+        "web_extract": ("AUXILIARY_WEB_EXTRACT_PROVIDER",  "AUXILIARY_WEB_EXTRACT_MODEL"),
+    }
+    
+    for task_key, (prov_env, model_env) in auxiliary_task_env.items():
+        task_cfg = auxiliary_config.get(task_key, {})
+        if not isinstance(task_cfg, dict):
+            continue
+        prov = str(task_cfg.get("provider") or "").strip()  # `or ""`: YAML null must not become "None"
+        model = str(task_cfg.get("model") or "").strip()  # same guard for a bare `model:` key
+        if prov and prov != "auto":
+            os.environ[prov_env] = prov
+        if model:
+            os.environ[model_env] = model
+    
    return defaults

 # Load configuration at module startup
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -87,6 +87,20 @@ DEFAULT_CONFIG = {
        "enabled": True,
        "threshold": 0.85,
        "summary_model": "google/gemini-3-flash-preview",
+        "summary_provider": "auto",
+    },
+    
+    # Auxiliary model overrides (advanced).  By default Hermes auto-selects
+    # the provider and model for each side task.  Set these to override.
+    "auxiliary": {
+        "vision": {
+            "provider": "auto",    # auto | openrouter | nous | main
+            "model": "",           # e.g. "google/gemini-2.5-flash", "gpt-4o"
+        },
+        "web_extract": {
+            "provider": "auto",
+            "model": "",
+        },
    },
    
    "display": {
@@ -913,6 +927,31 @@ def show_config():
    if enabled:
        print(f"  Threshold:    {compression.get('threshold', 0.85) * 100:.0f}%")
        print(f"  Model:        {compression.get('summary_model', 'google/gemini-3-flash-preview')}")
+        comp_provider = compression.get('summary_provider', 'auto')
+        if comp_provider != 'auto':
+            print(f"  Provider:     {comp_provider}")
+    
+    # Auxiliary models
+    auxiliary = config.get('auxiliary', {})
+    aux_tasks = {
+        "Vision":      auxiliary.get('vision', {}),
+        "Web extract": auxiliary.get('web_extract', {}),
+    }
+    has_overrides = any(
+        t.get('provider', 'auto') != 'auto' or t.get('model', '')
+        for t in aux_tasks.values()
+    )
+    if has_overrides:
+        print()
+        print(color("◆ Auxiliary Models (overrides)", Colors.CYAN, Colors.BOLD))
+        for label, task_cfg in aux_tasks.items():
+            prov = task_cfg.get('provider', 'auto')
+            mdl = task_cfg.get('model', '')
+            if prov != 'auto' or mdl:
+                parts = [f"provider={prov}"]
+                if mdl:
+                    parts.append(f"model={mdl}")
+                print(f"  {label:12s}  {', '.join(parts)}")
    
    # Messaging
    print()
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@@ -151,10 +151,10 @@ class TestGetTextAuxiliaryClient:
        assert model is None


-class TestCodexNotInVisionClient:
-    """Codex fallback should NOT apply to vision tasks."""
+class TestVisionClientFallback:
+    """Vision client uses the same full fallback chain as text."""

-    def test_vision_returns_none_without_openrouter_nous(self):
+    def test_vision_returns_none_without_any_credentials(self):
        with patch("agent.auxiliary_client._read_nous_auth", return_value=None):
            client, model = get_vision_auxiliary_client()
        assert client is None
--- a/tools/browser_tool.py
+++ b/tools/browser_tool.py
@@ -63,7 +63,7 @@ import time
 import requests
 from typing import Dict, Any, Optional, List
 from pathlib import Path
-from agent.auxiliary_client import get_vision_auxiliary_client
+from agent.auxiliary_client import get_vision_auxiliary_client, get_text_auxiliary_client

 logger = logging.getLogger(__name__)

@@ -80,8 +80,28 @@ DEFAULT_SESSION_TIMEOUT = 300
 # Max tokens for snapshot content before summarization
 SNAPSHOT_SUMMARIZE_THRESHOLD = 8000

-# Resolve vision auxiliary client for extraction/vision tasks
-_aux_vision_client, EXTRACTION_MODEL = get_vision_auxiliary_client()
+# Vision client — for browser_vision (screenshot analysis)
+_aux_vision_client, _DEFAULT_VISION_MODEL = get_vision_auxiliary_client()
+
+# Text client — for page snapshot summarization (same config as web_extract)
+_aux_text_client, _DEFAULT_TEXT_MODEL = get_text_auxiliary_client("web_extract")
+
+# Module-level alias for availability checks
+EXTRACTION_MODEL = _DEFAULT_TEXT_MODEL or _DEFAULT_VISION_MODEL
+
+
+def _get_vision_model() -> str:
+    """Model for browser_vision (screenshot analysis — multimodal)."""
+    return (os.getenv("AUXILIARY_VISION_MODEL", "").strip()
+            or _DEFAULT_VISION_MODEL
+            or "google/gemini-3-flash-preview")
+
+
+def _get_extraction_model() -> str:
+    """Model for page snapshot text summarization — same as web_extract."""
+    return (os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
+            or _DEFAULT_TEXT_MODEL
+            or "google/gemini-3-flash-preview")


 def _is_local_mode() -> bool:
@@ -860,9 +880,9 @@ def _extract_relevant_content(
 ) -> str:
    """Use LLM to extract relevant content from a snapshot based on the user's task.

-    Falls back to simple truncation when no auxiliary vision model is configured.
+    Falls back to simple truncation when no auxiliary text model is configured.
    """
-    if _aux_vision_client is None or EXTRACTION_MODEL is None:
+    if _aux_text_client is None:
        return _truncate_snapshot(snapshot_text)

    if user_task:
@@ -890,8 +910,8 @@ def _extract_relevant_content(

    try:
        from agent.auxiliary_client import auxiliary_max_tokens_param
-        response = _aux_vision_client.chat.completions.create(
-            model=EXTRACTION_MODEL,
+        response = _aux_text_client.chat.completions.create(
+            model=_get_extraction_model(),
            messages=[{"role": "user", "content": extraction_prompt}],
            **auxiliary_max_tokens_param(4000),
            temperature=0.1,
@@ -1316,7 +1336,7 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
    effective_task_id = task_id or "default"
    
    # Check auxiliary vision client
-    if _aux_vision_client is None or EXTRACTION_MODEL is None:
+    if _aux_vision_client is None or _DEFAULT_VISION_MODEL is None:
        return json.dumps({
            "success": False,
            "error": "Browser vision unavailable: no auxiliary vision model configured. "
@@ -1372,7 +1392,7 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
        # Use the sync auxiliary vision client directly
        from agent.auxiliary_client import auxiliary_max_tokens_param
        response = _aux_vision_client.chat.completions.create(
-            model=EXTRACTION_MODEL,
+            model=_get_vision_model(),
            messages=[
                {
                    "role": "user",
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@@ -468,7 +468,9 @@ def _handle_vision_analyze(args, **kw):
    image_url = args.get("image_url", "")
    question = args.get("question", "")
    full_prompt = f"Fully describe and explain everything about this image, then answer the following question:\n\n{question}"
-    model = DEFAULT_VISION_MODEL or "google/gemini-3-flash-preview"
+    model = (os.getenv("AUXILIARY_VISION_MODEL", "").strip()
+             or DEFAULT_VISION_MODEL
+             or "google/gemini-3-flash-preview")
    return vision_analyze_tool(image_url, full_prompt, model)


--- a/tools/web_tools.py
+++ b/tools/web_tools.py
@@ -85,7 +85,13 @@ DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000

 # Resolve async auxiliary client at module level.
 # Handles Codex Responses API adapter transparently.
-_aux_async_client, DEFAULT_SUMMARIZER_MODEL = get_async_text_auxiliary_client()
+_aux_async_client, _DEFAULT_SUMMARIZER_MODEL = get_async_text_auxiliary_client("web_extract")
+
+# Allow per-task override via config.yaml auxiliary.web_extract.model (exported as AUXILIARY_WEB_EXTRACT_MODEL)
+DEFAULT_SUMMARIZER_MODEL = (
+    os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
+    or _DEFAULT_SUMMARIZER_MODEL
+)

 _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")