diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index f6cfa5b9f..1e8129f20 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -115,34 +115,84 @@ TURNS TO SUMMARIZE:
 Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
 
         try:
-            kwargs = {
-                "model": self.summary_model,
-                "messages": [{"role": "user", "content": prompt}],
-                "temperature": 0.3,
-                "timeout": 30.0,
-            }
-            # Most providers (OpenRouter, local models) use max_tokens.
-            # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+)
-            # requires max_completion_tokens instead.
-            try:
-                kwargs["max_tokens"] = self.summary_target_tokens * 2
-                response = self.client.chat.completions.create(**kwargs)
-            except Exception as first_err:
-                if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err):
-                    kwargs.pop("max_tokens", None)
-                    kwargs["max_completion_tokens"] = self.summary_target_tokens * 2
-                    response = self.client.chat.completions.create(**kwargs)
-                else:
-                    raise
-
-            summary = response.choices[0].message.content.strip()
-            if not summary.startswith("[CONTEXT SUMMARY]:"):
-                summary = "[CONTEXT SUMMARY]: " + summary
-            return summary
+            return self._call_summary_model(self.client, self.summary_model, prompt)
         except Exception as e:
-            logging.warning(f"Failed to generate context summary: {e}")
+            logging.warning(f"Failed to generate context summary with auxiliary model: {e}")
+
+            # Fallback: try the main model's endpoint.  This handles the common
+            # case where the user switched providers (e.g. OpenRouter → local LLM)
+            # but a stale API key causes the auxiliary client to pick the old
+            # provider which then fails (402, auth error, etc.).
+            fallback_client, fallback_model = self._get_fallback_client()
+            if fallback_client is not None:
+                try:
+                    logger.info("Retrying context summary with fallback client (%s)", fallback_model)
+                    summary = self._call_summary_model(fallback_client, fallback_model, prompt)
+                    # Success — swap in the working client for future compressions
+                    self.client = fallback_client
+                    self.summary_model = fallback_model
+                    return summary
+                except Exception as fallback_err:
+                    logging.warning(f"Fallback summary model also failed: {fallback_err}")
+
             return "[CONTEXT SUMMARY]: Previous conversation turns have been compressed. The assistant performed tool calls and received responses."
 
+    def _call_summary_model(self, client, model: str, prompt: str) -> str:
+        """Return a "[CONTEXT SUMMARY]:"-prefixed summary of ``prompt`` from ``model``.
+
+        Raises on API errors and on an empty reply, so the caller can fall back.
+        """
+        budget = self.summary_target_tokens * 2
+        kwargs = {
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": 0.3,
+            "timeout": 30.0,
+        }
+        # Most providers (OpenRouter, local models) accept max_tokens; direct OpenAI
+        # with newer models (gpt-4o, o-series, gpt-5+) wants max_completion_tokens.
+        try:
+            response = client.chat.completions.create(**kwargs, max_tokens=budget)
+        except Exception as first_err:
+            detail = str(first_err)
+            if "max_tokens" not in detail and "unsupported_parameter" not in detail:
+                raise
+            response = client.chat.completions.create(**kwargs, max_completion_tokens=budget)
+
+        summary = (response.choices[0].message.content or "").strip()
+        if not summary:
+            raise ValueError("summary model returned an empty response")
+        return summary if summary.startswith("[CONTEXT SUMMARY]:") else "[CONTEXT SUMMARY]: " + summary
+
+    def _get_fallback_client(self):
+        """Build a fallback summary client from the main model's endpoint config.
+
+        Used when the auxiliary client fails (e.g. a stale OpenRouter key): it
+        targets OPENAI_BASE_URL with OPENAI_API_KEY so compression can still
+        produce a real summary instead of a static string.  It is called from
+        an exception handler, so every failure here is reported as a miss.
+
+        Returns (client, model) or (None, None).
+        """
+        custom_base = os.getenv("OPENAI_BASE_URL")
+        custom_key = os.getenv("OPENAI_API_KEY")
+        if not custom_base or not custom_key:
+            return None, None
+
+        try:
+            from hermes_constants import OPENROUTER_BASE_URL
+            from openai import OpenAI as _OpenAI
+            # Don't fall back to the same provider that just failed.
+            if custom_base.rstrip("/") == OPENROUTER_BASE_URL.rstrip("/"):
+                return None, None
+            model = os.getenv("LLM_MODEL") or os.getenv("OPENAI_MODEL") or self.model
+            client = _OpenAI(api_key=custom_key, base_url=custom_base)
+            logger.debug("Built fallback auxiliary client: %s via %s", model, custom_base)
+            return client, model
+        except Exception as exc:
+            logger.debug("Could not build fallback auxiliary client: %s", exc)
+            return None, None
+
     def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None) -> List[Dict[str, Any]]:
         """Compress conversation messages by summarizing middle turns.