diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 798536fb..35897ccc 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -7,7 +7,7 @@ protecting head and tail context. import logging import os -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from agent.auxiliary_client import get_text_auxiliary_client from agent.model_metadata import ( @@ -82,11 +82,14 @@ class ContextCompressor: "compression_count": self.compression_count, } - def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> str: - """Generate a concise summary of conversation turns using a fast model.""" - if not self.client: - return "[CONTEXT SUMMARY]: Previous conversation turns have been compressed to save space. The assistant performed various actions and received responses." + def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]: + """Generate a concise summary of conversation turns. + Tries the auxiliary model first, then falls back to the user's main + model. Returns None if all attempts fail — the caller should drop + the middle turns without a summary rather than inject a useless + placeholder. + """ parts = [] for msg in turns_to_summarize: role = msg.get("role", "unknown") @@ -117,28 +120,28 @@ TURNS TO SUMMARIZE: Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" - try: - return self._call_summary_model(self.client, self.summary_model, prompt) - except Exception as e: - logging.warning(f"Failed to generate context summary with auxiliary model: {e}") + # 1. Try the auxiliary model (cheap/fast) + if self.client: + try: + return self._call_summary_model(self.client, self.summary_model, prompt) + except Exception as e: + logging.warning(f"Failed to generate context summary with auxiliary model: {e}") - # Fallback: try the main model's endpoint. This handles the common - # case where the user switched providers (e.g. OpenRouter → local LLM) - # but a stale API key causes the auxiliary client to pick the old - # provider which then fails (402, auth error, etc.). - fallback_client, fallback_model = self._get_fallback_client() - if fallback_client is not None: - try: - logger.info("Retrying context summary with fallback client (%s)", fallback_model) - summary = self._call_summary_model(fallback_client, fallback_model, prompt) - # Success — swap in the working client for future compressions - self.client = fallback_client - self.summary_model = fallback_model - return summary - except Exception as fallback_err: - logging.warning(f"Fallback summary model also failed: {fallback_err}") + # 2. Fallback: try the user's main model endpoint + fallback_client, fallback_model = self._get_fallback_client() + if fallback_client is not None: + try: + logger.info("Retrying context summary with main model (%s)", fallback_model) + summary = self._call_summary_model(fallback_client, fallback_model, prompt) + self.client = fallback_client + self.summary_model = fallback_model + return summary + except Exception as fallback_err: + logging.warning(f"Main model summary also failed: {fallback_err}") - return "[CONTEXT SUMMARY]: Previous conversation turns have been compressed. The assistant performed tool calls and received responses." + # 3. All models failed — return None so the caller drops turns without a summary + logging.warning("Context compression: no model available for summary. Middle turns will be dropped without summary.") + return None def _call_summary_model(self, client, model: str, prompt: str) -> str: """Make the actual LLM call to generate a summary. Raises on failure.""" @@ -326,25 +329,6 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" print(f"\nšŸ“¦ Context compression triggered ({display_tokens:,} tokens ≄ {self.threshold_tokens:,} threshold)") print(f" šŸ“Š Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})") - # Truncation fallback when no auxiliary model is available - if self.client is None: - print("āš ļø Context compression: no auxiliary model available. Falling back to message truncation.") - # Keep system message(s) at the front and the protected tail; - # simply drop the oldest non-system messages until under threshold. - kept = [] - for msg in messages: - if msg.get("role") == "system": - kept.append(msg.copy()) - else: - break - tail = messages[-self.protect_last_n:] - kept.extend(m.copy() for m in tail) - self.compression_count += 1 - kept = self._sanitize_tool_pairs(kept) - if not self.quiet_mode: - print(f" āœ‚ļø Truncated: {len(messages)} → {len(kept)} messages (dropped middle turns)") - return kept - if not self.quiet_mode: print(f" šŸ—œļø Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)") @@ -357,7 +341,11 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" msg["content"] = (msg.get("content") or "") + "\n\n[Note: Some earlier conversation turns may be summarized to preserve context space.]" compressed.append(msg) - compressed.append({"role": "user", "content": summary}) + if summary: + compressed.append({"role": "user", "content": summary}) + else: + if not self.quiet_mode: + print(" āš ļø No summary model available — middle turns dropped without summary") for i in range(compress_end, n_messages): compressed.append(messages[i].copy())