diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 264bab3f4..04afe4c78 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -784,3 +784,253 @@ def auxiliary_max_tokens_param(value: int) -> dict: and "api.openai.com" in custom_base.lower()): return {"max_completion_tokens": value} return {"max_tokens": value} + + +# ── Centralized LLM Call API ──────────────────────────────────────────────── +# +# call_llm() and async_call_llm() own the full request lifecycle: +# 1. Resolve provider + model from task config (or explicit args) +# 2. Get or create a cached client for that provider +# 3. Format request args for the provider + model (max_tokens handling, etc.) +# 4. Make the API call +# 5. Return the response +# +# Every auxiliary LLM consumer should use these instead of manually +# constructing clients and calling .chat.completions.create(). + +# Client cache: (provider, async_mode) -> (client, default_model) +_client_cache: Dict[tuple, tuple] = {} + + +def _get_cached_client( + provider: str, model: str = None, async_mode: bool = False, +) -> Tuple[Optional[Any], Optional[str]]: + """Get or create a cached client for the given provider.""" + cache_key = (provider, async_mode) + if cache_key in _client_cache: + cached_client, cached_default = _client_cache[cache_key] + return cached_client, model or cached_default + client, default_model = resolve_provider_client(provider, model, async_mode) + if client is not None: + _client_cache[cache_key] = (client, default_model) + return client, model or default_model + + +def _resolve_task_provider_model( + task: str = None, + provider: str = None, + model: str = None, +) -> Tuple[str, Optional[str]]: + """Determine provider + model for a call. + + Priority: + 1. Explicit provider/model args (always win) + 2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.) + 3. Config file (auxiliary.{task}.provider/model or compression.*) + 4. "auto" (full auto-detection chain) + + Returns (provider, model) where model may be None (use provider default). + """ + if provider: + return provider, model + + if task: + # Check env var overrides first + env_provider = _get_auxiliary_provider(task) + if env_provider != "auto": + # Check for env var model override too + env_model = None + for prefix in ("AUXILIARY_", "CONTEXT_"): + val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip() + if val: + env_model = val + break + return env_provider, model or env_model + + # Read from config file + try: + from hermes_cli.config import load_config + config = load_config() + except ImportError: + return "auto", model + + # Check auxiliary.{task} section + aux = config.get("auxiliary", {}) + task_config = aux.get(task, {}) + cfg_provider = task_config.get("provider", "").strip() or None + cfg_model = task_config.get("model", "").strip() or None + + # Backwards compat: compression section has its own keys + if task == "compression" and not cfg_provider: + comp = config.get("compression", {}) + cfg_provider = comp.get("summary_provider", "").strip() or None + cfg_model = cfg_model or comp.get("summary_model", "").strip() or None + + if cfg_provider and cfg_provider != "auto": + return cfg_provider, model or cfg_model + return "auto", model or cfg_model + + return "auto", model + + +def _build_call_kwargs( + provider: str, + model: str, + messages: list, + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + tools: Optional[list] = None, + timeout: float = 30.0, + extra_body: Optional[dict] = None, +) -> dict: + """Build kwargs for .chat.completions.create() with model/provider adjustments.""" + kwargs: Dict[str, Any] = { + "model": model, + "messages": messages, + "timeout": timeout, + } + + if temperature is not None: + kwargs["temperature"] = temperature + + if max_tokens is not None: + # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens. + # Direct OpenAI api.openai.com with newer models needs max_completion_tokens. + if provider == "custom": + custom_base = os.getenv("OPENAI_BASE_URL", "") + if "api.openai.com" in custom_base.lower(): + kwargs["max_completion_tokens"] = max_tokens + else: + kwargs["max_tokens"] = max_tokens + else: + kwargs["max_tokens"] = max_tokens + + if tools: + kwargs["tools"] = tools + + # Provider-specific extra_body + merged_extra = dict(extra_body or {}) + if provider == "nous" or auxiliary_is_nous: + merged_extra.setdefault("tags", []).extend(["product=hermes-agent"]) + if merged_extra: + kwargs["extra_body"] = merged_extra + + return kwargs + + +def call_llm( + task: str = None, + *, + provider: str = None, + model: str = None, + messages: list, + temperature: float = None, + max_tokens: int = None, + tools: list = None, + timeout: float = 30.0, + extra_body: dict = None, +) -> Any: + """Centralized synchronous LLM call. + + Resolves provider + model (from task config, explicit args, or auto-detect), + handles auth, request formatting, and model-specific arg adjustments. + + Args: + task: Auxiliary task name ("compression", "vision", "web_extract", + "session_search", "skills_hub", "mcp", "flush_memories"). + Reads provider:model from config/env. Ignored if provider is set. + provider: Explicit provider override. + model: Explicit model override. + messages: Chat messages list. + temperature: Sampling temperature (None = provider default). + max_tokens: Max output tokens (handles max_tokens vs max_completion_tokens). + tools: Tool definitions (for function calling). + timeout: Request timeout in seconds. + extra_body: Additional request body fields. + + Returns: + Response object with .choices[0].message.content + + Raises: + RuntimeError: If no provider is configured. + """ + resolved_provider, resolved_model = _resolve_task_provider_model( + task, provider, model) + + client, final_model = _get_cached_client(resolved_provider, resolved_model) + if client is None: + # Fallback: try openrouter + if resolved_provider != "openrouter": + logger.warning("Provider %s unavailable, falling back to openrouter", + resolved_provider) + client, final_model = _get_cached_client( + "openrouter", resolved_model or _OPENROUTER_MODEL) + if client is None: + raise RuntimeError( + f"No LLM provider configured for task={task} provider={resolved_provider}. " + f"Run: hermes setup") + + kwargs = _build_call_kwargs( + resolved_provider, final_model, messages, + temperature=temperature, max_tokens=max_tokens, + tools=tools, timeout=timeout, extra_body=extra_body) + + # Handle max_tokens vs max_completion_tokens retry + try: + return client.chat.completions.create(**kwargs) + except Exception as first_err: + err_str = str(first_err) + if "max_tokens" in err_str or "unsupported_parameter" in err_str: + kwargs.pop("max_tokens", None) + kwargs["max_completion_tokens"] = max_tokens + return client.chat.completions.create(**kwargs) + raise + + +async def async_call_llm( + task: str = None, + *, + provider: str = None, + model: str = None, + messages: list, + temperature: float = None, + max_tokens: int = None, + tools: list = None, + timeout: float = 30.0, + extra_body: dict = None, +) -> Any: + """Centralized asynchronous LLM call. + + Same as call_llm() but async. See call_llm() for full documentation. + """ + resolved_provider, resolved_model = _resolve_task_provider_model( + task, provider, model) + + client, final_model = _get_cached_client( + resolved_provider, resolved_model, async_mode=True) + if client is None: + if resolved_provider != "openrouter": + logger.warning("Provider %s unavailable, falling back to openrouter", + resolved_provider) + client, final_model = _get_cached_client( + "openrouter", resolved_model or _OPENROUTER_MODEL, + async_mode=True) + if client is None: + raise RuntimeError( + f"No LLM provider configured for task={task} provider={resolved_provider}. " + f"Run: hermes setup") + + kwargs = _build_call_kwargs( + resolved_provider, final_model, messages, + temperature=temperature, max_tokens=max_tokens, + tools=tools, timeout=timeout, extra_body=extra_body) + + try: + return await client.chat.completions.create(**kwargs) + except Exception as first_err: + err_str = str(first_err) + if "max_tokens" in err_str or "unsupported_parameter" in err_str: + kwargs.pop("max_tokens", None) + kwargs["max_completion_tokens"] = max_tokens + return await client.chat.completions.create(**kwargs) + raise diff --git a/agent/context_compressor.py b/agent/context_compressor.py index fae483fd8..a0ca0c991 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -9,7 +9,7 @@ import logging import os from typing import Any, Dict, List, Optional -from agent.auxiliary_client import get_text_auxiliary_client +from agent.auxiliary_client import call_llm from agent.model_metadata import ( get_model_context_length, estimate_messages_tokens_rough, @@ -53,8 +53,7 @@ class ContextCompressor: self.last_completion_tokens = 0 self.last_total_tokens = 0 - self.client, default_model = get_text_auxiliary_client("compression") - self.summary_model = summary_model_override or default_model + self.summary_model = summary_model_override or "" def update_from_response(self, usage: Dict[str, Any]): """Update tracked token usage from API response.""" @@ -120,73 +119,30 @@ TURNS TO SUMMARIZE: Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" - # 1. Try the auxiliary model (cheap/fast) - if self.client: - try: - return self._call_summary_model(self.client, self.summary_model, prompt) - except Exception as e: - logging.warning(f"Failed to generate context summary with auxiliary model: {e}") - - # 2. Fallback: re-try via the centralized provider router. - # This covers all configured providers (Codex OAuth, API-key - # providers, etc.) without ad-hoc env var lookups. - from agent.auxiliary_client import resolve_provider_client - fallback_providers = ["custom", "openrouter", "nous", "codex"] - for fb_provider in fallback_providers: - try: - fb_client, fb_model = resolve_provider_client( - fb_provider, model=self.model) - if fb_client is None: - continue - # Don't retry the same client that just failed - if (self.client is not None - and hasattr(fb_client, "base_url") - and hasattr(self.client, "base_url") - and str(fb_client.base_url) == str(self.client.base_url)): - continue - logger.info("Retrying context summary with fallback provider " - "%s (%s)", fb_provider, fb_model) - summary = self._call_summary_model(fb_client, fb_model, prompt) - # Promote successful fallback for future compressions - self.client = fb_client - self.summary_model = fb_model - return summary - except Exception as fallback_err: - logging.warning("Fallback provider %s failed: %s", - fb_provider, fallback_err) - - # 3. All providers failed — return None so the caller drops turns - # without a summary. - logging.warning("Context compression: no provider available for " - "summary. Middle turns will be dropped without summary.") - return None - - def _call_summary_model(self, client, model: str, prompt: str) -> str: - """Make the actual LLM call to generate a summary. Raises on failure.""" - kwargs = { - "model": model, - "messages": [{"role": "user", "content": prompt}], - "temperature": 0.3, - "timeout": 30.0, - } - # Most providers (OpenRouter, local models) use max_tokens. - # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+) - # requires max_completion_tokens instead. + # Use the centralized LLM router — handles provider resolution, + # auth, and fallback internally. try: - kwargs["max_tokens"] = self.summary_target_tokens * 2 - response = client.chat.completions.create(**kwargs) - except Exception as first_err: - if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err): - kwargs.pop("max_tokens", None) - kwargs["max_completion_tokens"] = self.summary_target_tokens * 2 - response = client.chat.completions.create(**kwargs) - else: - raise - - summary = response.choices[0].message.content.strip() - if not summary.startswith("[CONTEXT SUMMARY]:"): - summary = "[CONTEXT SUMMARY]: " + summary - return summary + call_kwargs = { + "task": "compression", + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.3, + "max_tokens": self.summary_target_tokens * 2, + "timeout": 30.0, + } + if self.summary_model: + call_kwargs["model"] = self.summary_model + response = call_llm(**call_kwargs) + summary = response.choices[0].message.content.strip() + if not summary.startswith("[CONTEXT SUMMARY]:"): + summary = "[CONTEXT SUMMARY]: " + summary + return summary + except RuntimeError: + logging.warning("Context compression: no provider available for " + "summary. Middle turns will be dropped without summary.") + return None + except Exception as e: + logging.warning("Failed to generate context summary: %s", e) + return None # ------------------------------------------------------------------ # Tool-call / tool-result pair integrity helpers diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 677de678c..990089781 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -125,17 +125,41 @@ DEFAULT_CONFIG = { "summary_provider": "auto", }, - # Auxiliary model overrides (advanced). By default Hermes auto-selects - # the provider and model for each side task. Set these to override. + # Auxiliary model config — provider:model for each side task. + # Format: provider is the provider name, model is the model slug. + # "auto" for provider = auto-detect best available provider. + # Empty model = use provider's default auxiliary model. + # All tasks fall back to openrouter:google/gemini-3-flash-preview if + # the configured provider is unavailable. "auxiliary": { "vision": { - "provider": "auto", # auto | openrouter | nous | main + "provider": "auto", # auto | openrouter | nous | codex | custom "model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o" }, "web_extract": { "provider": "auto", "model": "", }, + "compression": { + "provider": "auto", + "model": "", + }, + "session_search": { + "provider": "auto", + "model": "", + }, + "skills_hub": { + "provider": "auto", + "model": "", + }, + "mcp": { + "provider": "auto", + "model": "", + }, + "flush_memories": { + "provider": "auto", + "model": "", + }, }, "display": { @@ -217,7 +241,7 @@ DEFAULT_CONFIG = { "personalities": {}, # Config schema version - bump this when adding new required fields - "_config_version": 6, + "_config_version": 7, } # ============================================================================= diff --git a/run_agent.py b/run_agent.py index db35d85fd..8849d25c3 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2623,19 +2623,22 @@ class AIAgent: # Use auxiliary client for the flush call when available -- # it's cheaper and avoids Codex Responses API incompatibility. - from agent.auxiliary_client import get_text_auxiliary_client - aux_client, aux_model = get_text_auxiliary_client() + from agent.auxiliary_client import call_llm as _call_llm + _aux_available = True + try: + response = _call_llm( + task="flush_memories", + messages=api_messages, + tools=[memory_tool_def], + temperature=0.3, + max_tokens=5120, + timeout=30.0, + ) + except RuntimeError: + _aux_available = False + response = None - if aux_client: - api_kwargs = { - "model": aux_model, - "messages": api_messages, - "tools": [memory_tool_def], - "temperature": 0.3, - "max_tokens": 5120, - } - response = aux_client.chat.completions.create(**api_kwargs, timeout=30.0) - elif self.api_mode == "codex_responses": + if not _aux_available and self.api_mode == "codex_responses": # No auxiliary client -- use the Codex Responses path directly codex_kwargs = self._build_api_kwargs(api_messages) codex_kwargs["tools"] = self._responses_tools([memory_tool_def]) @@ -2643,7 +2646,7 @@ class AIAgent: if "max_output_tokens" in codex_kwargs: codex_kwargs["max_output_tokens"] = 5120 response = self._run_codex_stream(codex_kwargs) - else: + elif not _aux_available: api_kwargs = { "model": self.model, "messages": api_messages, @@ -2655,7 +2658,7 @@ class AIAgent: # Extract tool calls from the response, handling both API formats tool_calls = [] - if self.api_mode == "codex_responses" and not aux_client: + if self.api_mode == "codex_responses" and not _aux_available: assistant_msg, _ = self._normalize_codex_response(response) if assistant_msg and assistant_msg.tool_calls: tool_calls = assistant_msg.tool_calls diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py index 12fa374c8..82ee93503 100644 --- a/tests/agent/test_context_compressor.py +++ b/tests/agent/test_context_compressor.py @@ -9,8 +9,7 @@ from agent.context_compressor import ContextCompressor @pytest.fixture() def compressor(): """Create a ContextCompressor with mocked dependencies.""" - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor( model="test/model", threshold_percent=0.85, @@ -119,14 +118,11 @@ class TestGenerateSummaryNoneContent: """Regression: content=None (from tool-call-only assistant messages) must not crash.""" def test_none_content_does_not_crash(self): - mock_client = MagicMock() mock_response = MagicMock() mock_response.choices = [MagicMock()] mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: tool calls happened" - mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor(model="test", quiet_mode=True) messages = [ @@ -139,14 +135,14 @@ class TestGenerateSummaryNoneContent: {"role": "user", "content": "thanks"}, ] - summary = c._generate_summary(messages) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + summary = c._generate_summary(messages) assert isinstance(summary, str) assert "CONTEXT SUMMARY" in summary def test_none_content_in_system_message_compress(self): """System message with content=None should not crash during compress.""" - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) msgs = [{"role": "system", "content": None}] + [ @@ -165,12 +161,12 @@ class TestCompressWithClient: mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor(model="test", quiet_mode=True) msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)] - result = c.compress(msgs) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + result = c.compress(msgs) # Should have summary message in the middle contents = [m.get("content", "") for m in result] @@ -184,8 +180,7 @@ class TestCompressWithClient: mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle" mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor( model="test", quiet_mode=True, @@ -212,7 +207,8 @@ class TestCompressWithClient: {"role": "user", "content": "later 4"}, ] - result = c.compress(msgs) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + result = c.compress(msgs) answered_ids = { msg.get("tool_call_id") @@ -232,8 +228,7 @@ class TestCompressWithClient: mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) # Last head message (index 1) is "assistant" → summary should be "user" @@ -245,7 +240,8 @@ class TestCompressWithClient: {"role": "user", "content": "msg 4"}, {"role": "assistant", "content": "msg 5"}, ] - result = c.compress(msgs) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + result = c.compress(msgs) summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")] assert len(summary_msg) == 1 assert summary_msg[0]["role"] == "user" @@ -258,8 +254,7 @@ class TestCompressWithClient: mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=2) # Last head message (index 2) is "user" → summary should be "assistant" @@ -273,20 +268,18 @@ class TestCompressWithClient: {"role": "user", "content": "msg 6"}, {"role": "assistant", "content": "msg 7"}, ] - result = c.compress(msgs) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + result = c.compress(msgs) summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")] assert len(summary_msg) == 1 assert summary_msg[0]["role"] == "assistant" def test_summarization_does_not_start_tail_with_tool_outputs(self): - mock_client = MagicMock() mock_response = MagicMock() mock_response.choices = [MagicMock()] mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle" - mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor( model="test", quiet_mode=True, @@ -309,7 +302,8 @@ class TestCompressWithClient: {"role": "user", "content": "latest user"}, ] - result = c.compress(msgs) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + result = c.compress(msgs) called_ids = { tc["id"] diff --git a/tests/tools/test_mcp_tool.py b/tests/tools/test_mcp_tool.py index 446f80d3e..0d527e95d 100644 --- a/tests/tools/test_mcp_tool.py +++ b/tests/tools/test_mcp_tool.py @@ -1828,8 +1828,8 @@ class TestSamplingCallbackText: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): params = _make_sampling_params() result = asyncio.run(self.handler(None, params)) @@ -1847,13 +1847,13 @@ class TestSamplingCallbackText: fake_client.chat.completions.create.return_value = _make_llm_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), - ): + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, + ) as mock_call: params = _make_sampling_params(system_prompt="Be helpful") asyncio.run(self.handler(None, params)) - call_args = fake_client.chat.completions.create.call_args + call_args = mock_call.call_args messages = call_args.kwargs["messages"] assert messages[0] == {"role": "system", "content": "Be helpful"} @@ -1865,8 +1865,8 @@ class TestSamplingCallbackText: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): params = _make_sampling_params() result = asyncio.run(self.handler(None, params)) @@ -1889,8 +1889,8 @@ class TestSamplingCallbackToolUse: fake_client.chat.completions.create.return_value = _make_llm_tool_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): params = _make_sampling_params() result = asyncio.run(self.handler(None, params)) @@ -1916,8 +1916,8 @@ class TestSamplingCallbackToolUse: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(self.handler(None, _make_sampling_params())) @@ -1939,8 +1939,8 @@ class TestToolLoopGovernance: fake_client.chat.completions.create.return_value = _make_llm_tool_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): params = _make_sampling_params() # Round 1, 2: allowed @@ -1959,8 +1959,8 @@ class TestToolLoopGovernance: fake_client = MagicMock() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): # Tool response (round 1 of 1 allowed) fake_client.chat.completions.create.return_value = _make_llm_tool_response() @@ -1984,8 +1984,8 @@ class TestToolLoopGovernance: fake_client.chat.completions.create.return_value = _make_llm_tool_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, ErrorData) @@ -2003,8 +2003,8 @@ class TestSamplingErrors: fake_client.chat.completions.create.return_value = _make_llm_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): # First call succeeds r1 = asyncio.run(handler(None, _make_sampling_params())) @@ -2017,20 +2017,16 @@ class TestSamplingErrors: def test_timeout_error(self): handler = SamplingHandler("to", {"timeout": 0.05}) - fake_client = MagicMock() def slow_call(**kwargs): import threading - # Use an event to ensure the thread truly blocks long enough evt = threading.Event() evt.wait(5) # blocks for up to 5 seconds (cancelled by timeout) return _make_llm_response() - fake_client.chat.completions.create.side_effect = slow_call - with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + side_effect=slow_call, ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, ErrorData) @@ -2041,12 +2037,11 @@ class TestSamplingErrors: handler = SamplingHandler("np", {}) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(None, None), + "agent.auxiliary_client.call_llm", + side_effect=RuntimeError("No LLM provider configured"), ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, ErrorData) - assert "No LLM provider" in result.message assert handler.metrics["errors"] == 1 def test_empty_choices_returns_error(self): @@ -2060,8 +2055,8 @@ class TestSamplingErrors: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) @@ -2080,8 +2075,8 @@ class TestSamplingErrors: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) @@ -2099,8 +2094,8 @@ class TestSamplingErrors: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) @@ -2120,8 +2115,8 @@ class TestModelWhitelist: fake_client.chat.completions.create.return_value = _make_llm_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "test-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, CreateMessageResult) @@ -2131,8 +2126,8 @@ class TestModelWhitelist: fake_client = MagicMock() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "gpt-3.5-turbo"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, ErrorData) @@ -2145,8 +2140,8 @@ class TestModelWhitelist: fake_client.chat.completions.create.return_value = _make_llm_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "any-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, CreateMessageResult) @@ -2166,8 +2161,8 @@ class TestMalformedToolCallArgs: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) @@ -2194,8 +2189,8 @@ class TestMalformedToolCallArgs: fake_client.chat.completions.create.return_value = response with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) @@ -2214,8 +2209,8 @@ class TestMetricsTracking: fake_client.chat.completions.create.return_value = _make_llm_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): asyncio.run(handler(None, _make_sampling_params())) @@ -2229,8 +2224,8 @@ class TestMetricsTracking: fake_client.chat.completions.create.return_value = _make_llm_tool_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): asyncio.run(handler(None, _make_sampling_params())) @@ -2241,8 +2236,8 @@ class TestMetricsTracking: handler = SamplingHandler("met3", {}) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(None, None), + "agent.auxiliary_client.call_llm", + side_effect=RuntimeError("No LLM provider configured"), ): asyncio.run(handler(None, _make_sampling_params())) diff --git a/tools/browser_tool.py b/tools/browser_tool.py index dd44549b9..ae9515748 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -63,7 +63,7 @@ import time import requests from typing import Dict, Any, Optional, List from pathlib import Path -from agent.auxiliary_client import get_vision_auxiliary_client, get_text_auxiliary_client +from agent.auxiliary_client import call_llm logger = logging.getLogger(__name__) @@ -80,38 +80,15 @@ DEFAULT_SESSION_TIMEOUT = 300 # Max tokens for snapshot content before summarization SNAPSHOT_SUMMARIZE_THRESHOLD = 8000 -# Vision client — for browser_vision (screenshot analysis) -# Wrapped in try/except so a broken auxiliary config doesn't prevent the entire -# browser_tool module from importing (which would disable all 10 browser tools). -try: - _aux_vision_client, _DEFAULT_VISION_MODEL = get_vision_auxiliary_client() -except Exception as _init_err: - logger.debug("Could not initialise vision auxiliary client: %s", _init_err) - _aux_vision_client, _DEFAULT_VISION_MODEL = None, None -# Text client — for page snapshot summarization (same config as web_extract) -try: - _aux_text_client, _DEFAULT_TEXT_MODEL = get_text_auxiliary_client("web_extract") -except Exception as _init_err: - logger.debug("Could not initialise text auxiliary client: %s", _init_err) - _aux_text_client, _DEFAULT_TEXT_MODEL = None, None - -# Module-level alias for availability checks -EXTRACTION_MODEL = _DEFAULT_TEXT_MODEL or _DEFAULT_VISION_MODEL - - -def _get_vision_model() -> str: +def _get_vision_model() -> Optional[str]: """Model for browser_vision (screenshot analysis — multimodal).""" - return (os.getenv("AUXILIARY_VISION_MODEL", "").strip() - or _DEFAULT_VISION_MODEL - or "google/gemini-3-flash-preview") + return os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None -def _get_extraction_model() -> str: +def _get_extraction_model() -> Optional[str]: """Model for page snapshot text summarization — same as web_extract.""" - return (os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() - or _DEFAULT_TEXT_MODEL - or "google/gemini-3-flash-preview") + return os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None def _is_local_mode() -> bool: @@ -941,9 +918,6 @@ def _extract_relevant_content( Falls back to simple truncation when no auxiliary text model is configured. """ - if _aux_text_client is None: - return _truncate_snapshot(snapshot_text) - if user_task: extraction_prompt = ( f"You are a content extractor for a browser automation agent.\n\n" @@ -968,13 +942,16 @@ def _extract_relevant_content( ) try: - from agent.auxiliary_client import auxiliary_max_tokens_param - response = _aux_text_client.chat.completions.create( - model=_get_extraction_model(), - messages=[{"role": "user", "content": extraction_prompt}], - **auxiliary_max_tokens_param(4000), - temperature=0.1, - ) + call_kwargs = { + "task": "web_extract", + "messages": [{"role": "user", "content": extraction_prompt}], + "max_tokens": 4000, + "temperature": 0.1, + } + model = _get_extraction_model() + if model: + call_kwargs["model"] = model + response = call_llm(**call_kwargs) return response.choices[0].message.content except Exception: return _truncate_snapshot(snapshot_text) @@ -1497,14 +1474,6 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] effective_task_id = task_id or "default" - # Check auxiliary vision client - if _aux_vision_client is None or _DEFAULT_VISION_MODEL is None: - return json.dumps({ - "success": False, - "error": "Browser vision unavailable: no auxiliary vision model configured. " - "Set OPENROUTER_API_KEY or configure Nous Portal to enable browser vision." - }, ensure_ascii=False) - # Save screenshot to persistent location so it can be shared with users hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) screenshots_dir = hermes_home / "browser_screenshots" @@ -1562,14 +1531,13 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] f"Focus on answering the user's specific question." ) - # Use the sync auxiliary vision client directly - from agent.auxiliary_client import auxiliary_max_tokens_param + # Use the centralized LLM router vision_model = _get_vision_model() - logger.debug("browser_vision: analysing screenshot (%d bytes) with model=%s", - len(image_data), vision_model) - response = _aux_vision_client.chat.completions.create( - model=vision_model, - messages=[ + logger.debug("browser_vision: analysing screenshot (%d bytes)", + len(image_data)) + call_kwargs = { + "task": "vision", + "messages": [ { "role": "user", "content": [ @@ -1578,9 +1546,12 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] ], } ], - **auxiliary_max_tokens_param(2000), - temperature=0.1, - ) + "max_tokens": 2000, + "temperature": 0.1, + } + if vision_model: + call_kwargs["model"] = vision_model + response = call_llm(**call_kwargs) analysis = response.choices[0].message.content response_data = { diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index b0fc35f7f..e1137909e 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -456,17 +456,13 @@ class SamplingHandler: # Resolve model model = self._resolve_model(getattr(params, "modelPreferences", None)) - # Get auxiliary LLM client - from agent.auxiliary_client import get_text_auxiliary_client - client, default_model = get_text_auxiliary_client() - if client is None: - self.metrics["errors"] += 1 - return self._error("No LLM provider available for sampling") + # Get auxiliary LLM client via centralized router + from agent.auxiliary_client import call_llm - resolved_model = model or default_model + # Model whitelist check (we need to resolve model before calling) + resolved_model = model or self.model_override or "" - # Model whitelist check - if self.allowed_models and resolved_model not in self.allowed_models: + if self.allowed_models and resolved_model and resolved_model not in self.allowed_models: logger.warning( "MCP server '%s' requested model '%s' not in allowed_models", self.server_name, resolved_model, @@ -484,20 +480,15 @@ class SamplingHandler: # Build LLM call kwargs max_tokens = min(params.maxTokens, self.max_tokens_cap) - call_kwargs: dict = { - "model": resolved_model, - "messages": messages, - "max_tokens": max_tokens, - } + call_temperature = None if hasattr(params, "temperature") and params.temperature is not None: - call_kwargs["temperature"] = params.temperature - if stop := getattr(params, "stopSequences", None): - call_kwargs["stop"] = stop + call_temperature = params.temperature # Forward server-provided tools + call_tools = None server_tools = getattr(params, "tools", None) if server_tools: - call_kwargs["tools"] = [ + call_tools = [ { "type": "function", "function": { @@ -508,9 +499,6 @@ class SamplingHandler: } for t in server_tools ] - if tool_choice := getattr(params, "toolChoice", None): - mode = getattr(tool_choice, "mode", "auto") - call_kwargs["tool_choice"] = {"auto": "auto", "required": "required", "none": "none"}.get(mode, "auto") logger.log( self.audit_level, @@ -520,7 +508,15 @@ class SamplingHandler: # Offload sync LLM call to thread (non-blocking) def _sync_call(): - return client.chat.completions.create(**call_kwargs) + return call_llm( + task="mcp", + model=resolved_model or None, + messages=messages, + temperature=call_temperature, + max_tokens=max_tokens, + tools=call_tools, + timeout=self.timeout, + ) try: response = await asyncio.wait_for( diff --git a/tools/session_search_tool.py b/tools/session_search_tool.py index 4bf88cbf0..cd1b98fd5 100644 --- a/tools/session_search_tool.py +++ b/tools/session_search_tool.py @@ -22,13 +22,7 @@ import os import logging from typing import Dict, Any, List, Optional, Union -from openai import AsyncOpenAI, OpenAI - -from agent.auxiliary_client import get_async_text_auxiliary_client - -# Resolve the async auxiliary client at import time so we have the model slug. -# Handles Codex Responses API adapter transparently. -_async_aux_client, _SUMMARIZER_MODEL = get_async_text_auxiliary_client() +from agent.auxiliary_client import async_call_llm MAX_SESSION_CHARS = 100_000 MAX_SUMMARY_TOKENS = 10000 @@ -156,26 +150,22 @@ async def _summarize_session( f"Summarize this conversation with focus on: {query}" ) - if _async_aux_client is None or _SUMMARIZER_MODEL is None: - logging.warning("No auxiliary model available for session summarization") - return None - max_retries = 3 for attempt in range(max_retries): try: - from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param - _extra = get_auxiliary_extra_body() - response = await _async_aux_client.chat.completions.create( - model=_SUMMARIZER_MODEL, + response = await async_call_llm( + task="session_search", messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}, ], - **({} if not _extra else {"extra_body": _extra}), temperature=0.1, - **auxiliary_max_tokens_param(MAX_SUMMARY_TOKENS), + max_tokens=MAX_SUMMARY_TOKENS, ) return response.choices[0].message.content.strip() + except RuntimeError: + logging.warning("No auxiliary model available for session summarization") + return None except Exception as e: if attempt < max_retries - 1: await asyncio.sleep(1 * (attempt + 1)) @@ -333,8 +323,6 @@ def session_search( def check_session_search_requirements() -> bool: """Requires SQLite state database and an auxiliary text model.""" - if _async_aux_client is None: - return False try: from hermes_state import DEFAULT_DB_PATH return DEFAULT_DB_PATH.parent.exists() diff --git a/tools/skills_guard.py b/tools/skills_guard.py index 8234b0a20..c354d6548 100644 --- a/tools/skills_guard.py +++ b/tools/skills_guard.py @@ -936,13 +936,10 @@ def llm_audit_skill(skill_path: Path, static_result: ScanResult, # Call the LLM via the centralized provider router try: - from agent.auxiliary_client import resolve_provider_client + from agent.auxiliary_client import call_llm - client, _default_model = resolve_provider_client("openrouter") - if client is None: - return static_result - - response = client.chat.completions.create( + response = call_llm( + provider="openrouter", model=model, messages=[{ "role": "user", diff --git a/tools/vision_tools.py b/tools/vision_tools.py index ee89b58a4..c1b09a22d 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -37,16 +37,11 @@ from pathlib import Path from typing import Any, Awaitable, Dict, Optional from urllib.parse import urlparse import httpx -from agent.auxiliary_client import get_async_vision_auxiliary_client +from agent.auxiliary_client import async_call_llm from tools.debug_helpers import DebugSession logger = logging.getLogger(__name__) -# Resolve vision auxiliary client at module level. -# Uses get_async_vision_auxiliary_client() which properly handles Codex -# routing (Responses API adapter) instead of raw AsyncOpenAI construction. -_aux_async_client, DEFAULT_VISION_MODEL = get_async_vision_auxiliary_client() - _debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG") @@ -185,7 +180,7 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None) async def vision_analyze_tool( image_url: str, user_prompt: str, - model: str = DEFAULT_VISION_MODEL, + model: str = None, ) -> str: """ Analyze an image from a URL or local file path using vision AI. @@ -245,15 +240,6 @@ async def vision_analyze_tool( logger.info("Analyzing image: %s", image_url[:60]) logger.info("User prompt: %s", user_prompt[:100]) - # Check auxiliary vision client availability - if _aux_async_client is None or DEFAULT_VISION_MODEL is None: - logger.error("Vision analysis unavailable: no auxiliary vision model configured") - return json.dumps({ - "success": False, - "analysis": "Vision analysis unavailable: no auxiliary vision model configured. " - "Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools." - }, indent=2, ensure_ascii=False) - # Determine if this is a local file path or a remote URL local_path = Path(image_url) if local_path.is_file(): @@ -309,18 +295,18 @@ async def vision_analyze_tool( } ] - logger.info("Processing image with %s...", model) + logger.info("Processing image with vision model...") - # Call the vision API - from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param - _extra = get_auxiliary_extra_body() - response = await _aux_async_client.chat.completions.create( - model=model, - messages=messages, - temperature=0.1, - **auxiliary_max_tokens_param(2000), - **({} if not _extra else {"extra_body": _extra}), - ) + # Call the vision API via centralized router + call_kwargs = { + "task": "vision", + "messages": messages, + "temperature": 0.1, + "max_tokens": 2000, + } + if model: + call_kwargs["model"] = model + response = await async_call_llm(**call_kwargs) # Extract the analysis analysis = response.choices[0].message.content.strip() @@ -391,7 +377,18 @@ async def vision_analyze_tool( def check_vision_requirements() -> bool: """Check if an auxiliary vision model is available.""" - return _aux_async_client is not None + try: + from agent.auxiliary_client import resolve_provider_client + client, _ = resolve_provider_client("openrouter") + if client is not None: + return True + client, _ = resolve_provider_client("nous") + if client is not None: + return True + client, _ = resolve_provider_client("custom") + return client is not None + except Exception: + return False def get_debug_session_info() -> Dict[str, Any]: @@ -419,10 +416,9 @@ if __name__ == "__main__": print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.") exit(1) else: - print(f"✅ Vision model available: {DEFAULT_VISION_MODEL}") + print("✅ Vision model available") print("🛠️ Vision tools ready for use!") - print(f"🧠 Using model: {DEFAULT_VISION_MODEL}") # Show debug mode status if _debug.active: @@ -489,9 +485,7 @@ def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]: "Fully describe and explain everything about this image, then answer the " f"following question:\n\n{question}" ) - model = (os.getenv("AUXILIARY_VISION_MODEL", "").strip() - or DEFAULT_VISION_MODEL - or "google/gemini-3-flash-preview") + model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None return vision_analyze_tool(image_url, full_prompt, model) diff --git a/tools/web_tools.py b/tools/web_tools.py index e99d94fb0..71a882a5e 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -47,8 +47,7 @@ import re import asyncio from typing import List, Dict, Any, Optional from firecrawl import Firecrawl -from openai import AsyncOpenAI -from agent.auxiliary_client import get_async_text_auxiliary_client +from agent.auxiliary_client import async_call_llm from tools.debug_helpers import DebugSession logger = logging.getLogger(__name__) @@ -83,15 +82,8 @@ def _get_firecrawl_client(): DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000 -# Resolve async auxiliary client at module level. -# Handles Codex Responses API adapter transparently. -_aux_async_client, _DEFAULT_SUMMARIZER_MODEL = get_async_text_auxiliary_client("web_extract") - -# Allow per-task override via config.yaml auxiliary.web_extract_model -DEFAULT_SUMMARIZER_MODEL = ( - os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() - or _DEFAULT_SUMMARIZER_MODEL -) +# Allow per-task override via env var +DEFAULT_SUMMARIZER_MODEL = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG") @@ -249,22 +241,22 @@ Create a markdown summary that captures all key information in a well-organized, for attempt in range(max_retries): try: - if _aux_async_client is None: - logger.warning("No auxiliary model available for web content processing") - return None - from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param - _extra = get_auxiliary_extra_body() - response = await _aux_async_client.chat.completions.create( - model=model, - messages=[ + call_kwargs = { + "task": "web_extract", + "messages": [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt} ], - temperature=0.1, - **auxiliary_max_tokens_param(max_tokens), - **({} if not _extra else {"extra_body": _extra}), - ) + "temperature": 0.1, + "max_tokens": max_tokens, + } + if model: + call_kwargs["model"] = model + response = await async_call_llm(**call_kwargs) return response.choices[0].message.content.strip() + except RuntimeError: + logger.warning("No auxiliary model available for web content processing") + return None except Exception as api_error: last_error = api_error if attempt < max_retries - 1: @@ -368,25 +360,18 @@ Synthesize these into ONE cohesive, comprehensive summary that: Create a single, unified markdown summary.""" try: - if _aux_async_client is None: - logger.warning("No auxiliary model for synthesis, concatenating summaries") - fallback = "\n\n".join(summaries) - if len(fallback) > max_output_size: - fallback = fallback[:max_output_size] + "\n\n[... truncated ...]" - return fallback - - from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param - _extra = get_auxiliary_extra_body() - response = await _aux_async_client.chat.completions.create( - model=model, - messages=[ + call_kwargs = { + "task": "web_extract", + "messages": [ {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."}, {"role": "user", "content": synthesis_prompt} ], - temperature=0.1, - **auxiliary_max_tokens_param(20000), - **({} if not _extra else {"extra_body": _extra}), - ) + "temperature": 0.1, + "max_tokens": 20000, + } + if model: + call_kwargs["model"] = model + response = await async_call_llm(**call_kwargs) final_summary = response.choices[0].message.content.strip() # Enforce hard cap @@ -713,8 +698,8 @@ async def web_extract_tool( debug_call_data["pages_extracted"] = pages_extracted debug_call_data["original_response_size"] = len(json.dumps(response)) - # Process each result with LLM if enabled and auxiliary client is available - if use_llm_processing and _aux_async_client is not None: + # Process each result with LLM if enabled + if use_llm_processing: logger.info("Processing extracted content with LLM (parallel)...") debug_call_data["processing_applied"].append("llm_processing") @@ -780,10 +765,6 @@ async def web_extract_tool( else: logger.warning("%s (no content to process)", url) else: - if use_llm_processing and _aux_async_client is None: - logger.warning("LLM processing requested but no auxiliary model available, returning raw content") - debug_call_data["processing_applied"].append("llm_processing_unavailable") - # Print summary of extracted pages for debugging (original behavior) for result in response.get('results', []): url = result.get('url', 'Unknown URL') @@ -1013,8 +994,8 @@ async def web_crawl_tool( debug_call_data["pages_crawled"] = pages_crawled debug_call_data["original_response_size"] = len(json.dumps(response)) - # Process each result with LLM if enabled and auxiliary client is available - if use_llm_processing and _aux_async_client is not None: + # Process each result with LLM if enabled + if use_llm_processing: logger.info("Processing crawled content with LLM (parallel)...") debug_call_data["processing_applied"].append("llm_processing") @@ -1080,10 +1061,6 @@ async def web_crawl_tool( else: logger.warning("%s (no content to process)", page_url) else: - if use_llm_processing and _aux_async_client is None: - logger.warning("LLM processing requested but no auxiliary model available, returning raw content") - debug_call_data["processing_applied"].append("llm_processing_unavailable") - # Print summary of crawled pages for debugging (original behavior) for result in response.get('results', []): page_url = result.get('url', 'Unknown URL') @@ -1138,7 +1115,15 @@ def check_firecrawl_api_key() -> bool: def check_auxiliary_model() -> bool: """Check if an auxiliary text model is available for LLM content processing.""" - return _aux_async_client is not None + try: + from agent.auxiliary_client import resolve_provider_client + for p in ("openrouter", "nous", "custom", "codex"): + client, _ = resolve_provider_client(p) + if client is not None: + return True + return False + except Exception: + return False def get_debug_session_info() -> Dict[str, Any]: diff --git a/trajectory_compressor.py b/trajectory_compressor.py index 5f1c84c6a..ef81d6e27 100644 --- a/trajectory_compressor.py +++ b/trajectory_compressor.py @@ -344,28 +344,32 @@ class TrajectoryCompressor: raise RuntimeError(f"Failed to load tokenizer '{self.config.tokenizer_name}': {e}") def _init_summarizer(self): - """Initialize LLM client for summarization (sync and async). + """Initialize LLM routing for summarization (sync and async). - Routes through the centralized provider router for known providers - (OpenRouter, Nous, Codex, etc.) so auth and headers are handled - consistently. Falls back to raw construction for custom endpoints. + Uses call_llm/async_call_llm from the centralized provider router + which handles auth, headers, and provider detection internally. + For custom endpoints, falls back to raw client construction. """ - from agent.auxiliary_client import resolve_provider_client + from agent.auxiliary_client import call_llm, async_call_llm provider = self._detect_provider() if provider: - # Use centralized router — handles auth, headers, Codex adapter - self.client, _ = resolve_provider_client( + # Store provider for use in _generate_summary calls + self._llm_provider = provider + self._use_call_llm = True + # Verify the provider is available + from agent.auxiliary_client import resolve_provider_client + client, _ = resolve_provider_client( provider, model=self.config.summarization_model) - self.async_client, _ = resolve_provider_client( - provider, model=self.config.summarization_model, - async_mode=True) - if self.client is None: + if client is None: raise RuntimeError( f"Provider '{provider}' is not configured. " f"Check your API key or run: hermes setup") + self.client = None # Not used directly + self.async_client = None # Not used directly else: # Custom endpoint — use config's raw base_url + api_key_env + self._use_call_llm = False api_key = os.getenv(self.config.api_key_env) if not api_key: raise RuntimeError( @@ -524,12 +528,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" try: metrics.summarization_api_calls += 1 - response = self.client.chat.completions.create( - model=self.config.summarization_model, - messages=[{"role": "user", "content": prompt}], - temperature=self.config.temperature, - max_tokens=self.config.summary_target_tokens * 2, - ) + if getattr(self, '_use_call_llm', False): + from agent.auxiliary_client import call_llm + response = call_llm( + provider=self._llm_provider, + model=self.config.summarization_model, + messages=[{"role": "user", "content": prompt}], + temperature=self.config.temperature, + max_tokens=self.config.summary_target_tokens * 2, + ) + else: + response = self.client.chat.completions.create( + model=self.config.summarization_model, + messages=[{"role": "user", "content": prompt}], + temperature=self.config.temperature, + max_tokens=self.config.summary_target_tokens * 2, + ) summary = response.choices[0].message.content.strip() @@ -581,12 +595,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" try: metrics.summarization_api_calls += 1 - response = await self.async_client.chat.completions.create( - model=self.config.summarization_model, - messages=[{"role": "user", "content": prompt}], - temperature=self.config.temperature, - max_tokens=self.config.summary_target_tokens * 2, - ) + if getattr(self, '_use_call_llm', False): + from agent.auxiliary_client import async_call_llm + response = await async_call_llm( + provider=self._llm_provider, + model=self.config.summarization_model, + messages=[{"role": "user", "content": prompt}], + temperature=self.config.temperature, + max_tokens=self.config.summary_target_tokens * 2, + ) + else: + response = await self.async_client.chat.completions.create( + model=self.config.summarization_model, + messages=[{"role": "user", "content": prompt}], + temperature=self.config.temperature, + max_tokens=self.config.summary_target_tokens * 2, + ) summary = response.choices[0].message.content.strip()