diff --git a/agent/context_compressor.py b/agent/context_compressor.py index f6cfa5b9f..1e8129f20 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -115,34 +115,84 @@ TURNS TO SUMMARIZE: Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" try: - kwargs = { - "model": self.summary_model, - "messages": [{"role": "user", "content": prompt}], - "temperature": 0.3, - "timeout": 30.0, - } - # Most providers (OpenRouter, local models) use max_tokens. - # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+) - # requires max_completion_tokens instead. - try: - kwargs["max_tokens"] = self.summary_target_tokens * 2 - response = self.client.chat.completions.create(**kwargs) - except Exception as first_err: - if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err): - kwargs.pop("max_tokens", None) - kwargs["max_completion_tokens"] = self.summary_target_tokens * 2 - response = self.client.chat.completions.create(**kwargs) - else: - raise - - summary = response.choices[0].message.content.strip() - if not summary.startswith("[CONTEXT SUMMARY]:"): - summary = "[CONTEXT SUMMARY]: " + summary - return summary + return self._call_summary_model(self.client, self.summary_model, prompt) except Exception as e: - logging.warning(f"Failed to generate context summary: {e}") + logging.warning(f"Failed to generate context summary with auxiliary model: {e}") + + # Fallback: try the main model's endpoint. This handles the common + # case where the user switched providers (e.g. OpenRouter → local LLM) + # but a stale API key causes the auxiliary client to pick the old + # provider which then fails (402, auth error, etc.). + fallback_client, fallback_model = self._get_fallback_client() + if fallback_client is not None: + try: + logger.info("Retrying context summary with fallback client (%s)", fallback_model) + summary = self._call_summary_model(fallback_client, fallback_model, prompt) + # Success — swap in the working client for future compressions + self.client = fallback_client + self.summary_model = fallback_model + return summary + except Exception as fallback_err: + logging.warning(f"Fallback summary model also failed: {fallback_err}") + return "[CONTEXT SUMMARY]: Previous conversation turns have been compressed. The assistant performed tool calls and received responses." + def _call_summary_model(self, client, model: str, prompt: str) -> str: + """Make the actual LLM call to generate a summary. Raises on failure.""" + kwargs = { + "model": model, + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.3, + "timeout": 30.0, + } + # Most providers (OpenRouter, local models) use max_tokens. + # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+) + # requires max_completion_tokens instead. + try: + kwargs["max_tokens"] = self.summary_target_tokens * 2 + response = client.chat.completions.create(**kwargs) + except Exception as first_err: + if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err): + kwargs.pop("max_tokens", None) + kwargs["max_completion_tokens"] = self.summary_target_tokens * 2 + response = client.chat.completions.create(**kwargs) + else: + raise + + summary = response.choices[0].message.content.strip() + if not summary.startswith("[CONTEXT SUMMARY]:"): + summary = "[CONTEXT SUMMARY]: " + summary + return summary + + def _get_fallback_client(self): + """Try to build a fallback client from the main model's endpoint config. + + When the primary auxiliary client fails (e.g. stale OpenRouter key), this + creates a client using the user's active custom endpoint (OPENAI_BASE_URL) + so compression can still produce a real summary instead of a static string. + + Returns (client, model) or (None, None). + """ + custom_base = os.getenv("OPENAI_BASE_URL") + custom_key = os.getenv("OPENAI_API_KEY") + if not custom_base or not custom_key: + return None, None + + # Don't fallback to the same provider that just failed + from hermes_constants import OPENROUTER_BASE_URL + if custom_base.rstrip("/") == OPENROUTER_BASE_URL.rstrip("/"): + return None, None + + model = os.getenv("LLM_MODEL") or os.getenv("OPENAI_MODEL") or self.model + try: + from openai import OpenAI as _OpenAI + client = _OpenAI(api_key=custom_key, base_url=custom_base) + logger.debug("Built fallback auxiliary client: %s via %s", model, custom_base) + return client, model + except Exception as exc: + logger.debug("Could not build fallback auxiliary client: %s", exc) + return None, None + def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None) -> List[Dict[str, Any]]: """Compress conversation messages by summarizing middle turns.