From 3e2ed18ad0ddc9c6b191eef1409c428111fbc372 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 4 Mar 2026 17:58:09 -0800
Subject: [PATCH] fix: fall back to main model endpoint when auxiliary summary
 client fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the auxiliary client (used for context compression summaries) fails
— e.g. due to a stale OpenRouter API key after switching to a local LLM
— fall back to the user's active endpoint (OPENAI_BASE_URL) instead of
returning a useless static summary string.

This handles the common scenario where a user switches providers via
'hermes model' but the old provider's API key remains in .env. The
auxiliary client picks up the stale key and fails (402/auth error);
previously, compression then returned only a generic static summary.
Now it retries with the working endpoint.

On successful fallback, the working client is cached for future
compressions in the same session so the fallback cost is paid only once.

Ref: #348
---
 agent/context_compressor.py | 100 +++++++++++++++++++++++++++---------
 1 file changed, 75 insertions(+), 25 deletions(-)

diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index f6cfa5b9f..1e8129f20 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -115,34 +115,84 @@ TURNS TO SUMMARIZE:
 Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
 
         try:
-            kwargs = {
-                "model": self.summary_model,
-                "messages": [{"role": "user", "content": prompt}],
-                "temperature": 0.3,
-                "timeout": 30.0,
-            }
-            # Most providers (OpenRouter, local models) use max_tokens.
-            # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+)
-            # requires max_completion_tokens instead.
-            try:
-                kwargs["max_tokens"] = self.summary_target_tokens * 2
-                response = self.client.chat.completions.create(**kwargs)
-            except Exception as first_err:
-                if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err):
-                    kwargs.pop("max_tokens", None)
-                    kwargs["max_completion_tokens"] = self.summary_target_tokens * 2
-                    response = self.client.chat.completions.create(**kwargs)
-                else:
-                    raise
-
-            summary = response.choices[0].message.content.strip()
-            if not summary.startswith("[CONTEXT SUMMARY]:"):
-                summary = "[CONTEXT SUMMARY]: " + summary
-            return summary
+            return self._call_summary_model(self.client, self.summary_model, prompt)
         except Exception as e:
-            logging.warning(f"Failed to generate context summary: {e}")
+            logging.warning(f"Failed to generate context summary with auxiliary model: {e}")
+
+            # Fallback: try the main model's endpoint.  This handles the common
+            # case where the user switched providers (e.g. OpenRouter → local LLM)
+            # but a stale API key causes the auxiliary client to pick the old
+            # provider which then fails (402, auth error, etc.).
+            fallback_client, fallback_model = self._get_fallback_client()
+            if fallback_client is not None:
+                try:
+                    logger.info("Retrying context summary with fallback client (%s)", fallback_model)
+                    summary = self._call_summary_model(fallback_client, fallback_model, prompt)
+                    # Success — swap in the working client for future compressions
+                    self.client = fallback_client
+                    self.summary_model = fallback_model
+                    return summary
+                except Exception as fallback_err:
+                    logging.warning(f"Fallback summary model also failed: {fallback_err}")
+
             return "[CONTEXT SUMMARY]: Previous conversation turns have been compressed. The assistant performed tool calls and received responses."
 
+    def _call_summary_model(self, client, model: str, prompt: str) -> str:
+        """Ask `model` via `client` for a summary; return it with the "[CONTEXT SUMMARY]:" prefix. Raises on failure."""
+        kwargs = {
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": 0.3,
+            "timeout": 30.0,
+        }
+        # Try max_tokens first: most providers (OpenRouter, local models) use it.
+        # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+) requires
+        # max_completion_tokens, so retry once if the error text points at that.
+        try:
+            kwargs["max_tokens"] = self.summary_target_tokens * 2
+            response = client.chat.completions.create(**kwargs)
+        except Exception as first_err:
+            if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err):
+                kwargs.pop("max_tokens", None)
+                kwargs["max_completion_tokens"] = self.summary_target_tokens * 2
+                response = client.chat.completions.create(**kwargs)
+            else:
+                raise
+
+        summary = response.choices[0].message.content.strip()
+        if not summary.startswith("[CONTEXT SUMMARY]:"):
+            summary = "[CONTEXT SUMMARY]: " + summary
+        return summary
+
+    def _get_fallback_client(self):
+        """Build a client for the user's custom endpoint to retry a failed summary.
+
+        Reads OPENAI_BASE_URL and OPENAI_API_KEY; gives up when either is unset,
+        when the base URL is OpenRouter's, or when constructing the client raises.
+        The model name is LLM_MODEL, else OPENAI_MODEL, else self.model.
+
+        Returns (client, model) or (None, None).
+        """
+        custom_base = os.getenv("OPENAI_BASE_URL")
+        custom_key = os.getenv("OPENAI_API_KEY")
+        if not custom_base or not custom_key:
+            return None, None
+
+        # Skip an OpenRouter endpoint: presumably the provider that just failed.
+        from hermes_constants import OPENROUTER_BASE_URL
+        if custom_base.rstrip("/") == OPENROUTER_BASE_URL.rstrip("/"):
+            return None, None
+
+        model = os.getenv("LLM_MODEL") or os.getenv("OPENAI_MODEL") or self.model
+        try:
+            from openai import OpenAI as _OpenAI
+            client = _OpenAI(api_key=custom_key, base_url=custom_base)
+            logger.debug("Built fallback auxiliary client: %s via %s", model, custom_base)
+            return client, model
+        except Exception as exc:
+            logger.debug("Could not build fallback auxiliary client: %s", exc)
+            return None, None
+
     def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None) -> List[Dict[str, Any]]:
         """Compress conversation messages by summarizing middle turns.