From 9231a335d4bb3da55e7dab4ca49d2de719763735 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 24 Mar 2026 17:45:49 -0700 Subject: [PATCH] fix(compression): replace dead summary_target_tokens with ratio-based scaling (#2554) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The summary_target_tokens parameter was accepted in the constructor, stored on the instance, and never used — the summary budget was always computed from hardcoded module constants (_SUMMARY_RATIO=0.20, _MAX_SUMMARY_TOKENS=8000). This caused two compounding problems: 1. The config value was silently ignored, giving users no control over post-compression size. 2. Fixed budgets (20K tail, 8K summary cap) didn't scale with context window size. Switching from a 1M-context model to a 200K model would trigger compression that nuked 350K tokens of conversation history down to ~30K. Changes: - Replace summary_target_tokens with summary_target_ratio (default 0.40) which sets the post-compression target as a fraction of context_length. 
Tail token budget and summary cap now scale proportionally: MiniMax 200K → ~80K post-compression GPT-5 1M → ~400K post-compression - Change threshold_percent default: 0.50 → 0.80 (don't fire until 80% of context is consumed) - Change protect_last_n default: 4 → 20 (preserve ~10 full turns) - Summary token cap scales to 5% of context (was fixed 8K), capped at 32K ceiling - Read target_ratio and protect_last_n from config.yaml compression section (both are now configurable) - Remove hardcoded summary_target_tokens=500 from run_agent.py - Add 5 new tests for ratio scaling, clamping, and new defaults --- agent/context_compressor.py | 47 ++++++++++++++++-------- cli-config.yaml.example | 20 +++++++++-- run_agent.py | 8 +++-- tests/agent/test_context_compressor.py | 49 +++++++++++++++++++++++++- 4 files changed, 103 insertions(+), 21 deletions(-) diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 5f4ea4a3c..52428d954 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -35,14 +35,12 @@ SUMMARY_PREFIX = ( ) LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:" -# Minimum / maximum tokens for the summary output +# Minimum tokens for the summary output _MIN_SUMMARY_TOKENS = 2000 -_MAX_SUMMARY_TOKENS = 8000 # Proportion of compressed content to allocate for summary _SUMMARY_RATIO = 0.20 - -# Token budget for tail protection (keep most-recent context) -_DEFAULT_TAIL_TOKEN_BUDGET = 20_000 +# Absolute ceiling for summary tokens (even on very large context windows) +_SUMMARY_TOKENS_CEILING = 32_000 # Placeholder used when pruning old tool results _PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]" @@ -65,10 +63,10 @@ class ContextCompressor: def __init__( self, model: str, - threshold_percent: float = 0.50, + threshold_percent: float = 0.80, protect_first_n: int = 3, - protect_last_n: int = 4, - summary_target_tokens: int = 2500, + protect_last_n: int = 20, + summary_target_ratio: float = 0.40, quiet_mode: bool = 
False, summary_model_override: str = None, base_url: str = "", @@ -83,7 +81,7 @@ class ContextCompressor: self.threshold_percent = threshold_percent self.protect_first_n = protect_first_n self.protect_last_n = protect_last_n - self.summary_target_tokens = summary_target_tokens + self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80)) self.quiet_mode = quiet_mode self.context_length = get_model_context_length( @@ -94,12 +92,22 @@ class ContextCompressor: self.threshold_tokens = int(self.context_length * threshold_percent) self.compression_count = 0 + # Derive token budgets from the target ratio and context length + target_tokens = int(self.context_length * self.summary_target_ratio) + self.tail_token_budget = target_tokens + self.max_summary_tokens = min( + int(self.context_length * 0.05), _SUMMARY_TOKENS_CEILING, + ) + if not quiet_mode: logger.info( "Context compressor initialized: model=%s context_length=%d " - "threshold=%d (%.0f%%) provider=%s base_url=%s", + "threshold=%d (%.0f%%) target_ratio=%.0f%% tail_budget=%d " + "provider=%s base_url=%s", model, self.context_length, self.threshold_tokens, - threshold_percent * 100, provider or "none", base_url or "none", + threshold_percent * 100, self.summary_target_ratio * 100, + self.tail_token_budget, + provider or "none", base_url or "none", ) self._context_probed = False # True after a step-down from context error @@ -179,10 +187,15 @@ class ContextCompressor: # ------------------------------------------------------------------ def _compute_summary_budget(self, turns_to_summarize: List[Dict[str, Any]]) -> int: - """Scale summary token budget with the amount of content being compressed.""" + """Scale summary token budget with the amount of content being compressed. + + The maximum scales with the model's context window (5% of context, + capped at ``_SUMMARY_TOKENS_CEILING``) so large-context models get + richer summaries instead of being hard-capped at 8K tokens. 
+ """ content_tokens = estimate_messages_tokens_rough(turns_to_summarize) budget = int(content_tokens * _SUMMARY_RATIO) - return max(_MIN_SUMMARY_TOKENS, min(budget, _MAX_SUMMARY_TOKENS)) + return max(_MIN_SUMMARY_TOKENS, min(budget, self.max_summary_tokens)) def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str: """Serialize conversation turns into labeled text for the summarizer. @@ -477,14 +490,20 @@ Write only the summary body. Do not include any preamble or prefix.""" def _find_tail_cut_by_tokens( self, messages: List[Dict[str, Any]], head_end: int, - token_budget: int = _DEFAULT_TAIL_TOKEN_BUDGET, + token_budget: int | None = None, ) -> int: """Walk backward from the end of messages, accumulating tokens until the budget is reached. Returns the index where the tail starts. + ``token_budget`` defaults to ``self.tail_token_budget`` which is + derived from ``summary_target_ratio * context_length``, so it + scales automatically with the model's context window. + Never cuts inside a tool_call/result group. Falls back to the old ``protect_last_n`` if the budget would protect fewer messages. """ + if token_budget is None: + token_budget = self.tail_token_budget n = len(messages) min_tail = self.protect_last_n accumulated = 0 diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 89d6b9f80..4469fa54b 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -232,19 +232,33 @@ browser: # 1. Tracks actual token usage from API responses (not estimates) # 2. When prompt_tokens >= threshold% of model's context_length, triggers compression # 3. Protects first 3 turns (system prompt, initial request, first response) -# 4. Protects last 4 turns (recent context is most relevant) +# 4. Protects last N turns (default 20 messages = ~10 full turns of recent context) # 5. Summarizes middle turns using a fast/cheap model # 6. 
Inserts summary as a user message, continues conversation seamlessly # +# Post-compression size scales with the model's context window via target_ratio: +# MiniMax 200K context → ~80K post-compression (at 0.40 ratio) +# GPT-5 1M context → ~400K post-compression (at 0.40 ratio) +# compression: # Enable automatic context compression (default: true) # Set to false if you prefer to manage context manually or want errors on overflow enabled: true - # Trigger compression at this % of model's context limit (default: 0.85 = 85%) + # Trigger compression at this % of model's context limit (default: 0.80 = 80%) # Lower values = more aggressive compression, higher values = compress later - threshold: 0.85 + threshold: 0.80 + # Target post-compression size as a fraction of context window (default: 0.40 = 40%) + # Controls how much context survives compression. The tail token budget scales + # with this value; the summary cap is 5% of context (max 32K). Range: 0.10 - 0.80 + target_ratio: 0.40 + + # Number of most-recent messages to always preserve (default: 20 ≈ 10 full turns) + # Higher values keep more recent conversation intact at the cost of more aggressive + # compression of older turns. + protect_last_n: 20 + # Model to use for generating summaries (fast/cheap recommended) # This model compresses the middle turns into a concise summary. 
# IMPORTANT: it receives the full middle section of the conversation, so it diff --git a/run_agent.py b/run_agent.py index 220921160..11d728e83 100644 --- a/run_agent.py +++ b/run_agent.py @@ -1009,9 +1009,11 @@ class AIAgent: _compression_cfg = _agent_cfg.get("compression", {}) if not isinstance(_compression_cfg, dict): _compression_cfg = {} - compression_threshold = float(_compression_cfg.get("threshold", 0.50)) + compression_threshold = float(_compression_cfg.get("threshold", 0.80)) compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes") compression_summary_model = _compression_cfg.get("summary_model") or None + compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.40)) + compression_protect_last = int(_compression_cfg.get("protect_last_n", 20)) # Read explicit context_length override from model config _model_cfg = _agent_cfg.get("model", {}) @@ -1050,8 +1052,8 @@ class AIAgent: model=self.model, threshold_percent=compression_threshold, protect_first_n=3, - protect_last_n=4, - summary_target_tokens=500, + protect_last_n=compression_protect_last, + summary_target_ratio=compression_target_ratio, summary_model_override=compression_summary_model, quiet_mode=self.quiet_mode, base_url=self.base_url, diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py index 0fbcf4021..380dc8a42 100644 --- a/tests/agent/test_context_compressor.py +++ b/tests/agent/test_context_compressor.py @@ -217,7 +217,7 @@ class TestCompressWithClient: mock_client.chat.completions.create.return_value = mock_response with patch("agent.context_compressor.get_model_context_length", return_value=100000): - c = ContextCompressor(model="test", quiet_mode=True) + c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)] with patch("agent.context_compressor.call_llm", 
return_value=mock_response): @@ -513,3 +513,50 @@ class TestCompressWithClient: for msg in result: if msg.get("role") == "tool" and msg.get("tool_call_id"): assert msg["tool_call_id"] in called_ids + + +class TestSummaryTargetRatio: + """Verify that summary_target_ratio properly scales budgets with context window.""" + + def test_tail_budget_scales_with_context(self): + """Tail token budget should be context_length * summary_target_ratio.""" + with patch("agent.context_compressor.get_model_context_length", return_value=200_000): + c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40) + assert c.tail_token_budget == 80_000 + + with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000): + c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40) + assert c.tail_token_budget == 400_000 + + def test_summary_cap_scales_with_context(self): + """Max summary tokens should be 5% of context, capped at 32K.""" + with patch("agent.context_compressor.get_model_context_length", return_value=200_000): + c = ContextCompressor(model="test", quiet_mode=True) + assert c.max_summary_tokens == 10_000 # 200K * 0.05 + + with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000): + c = ContextCompressor(model="test", quiet_mode=True) + assert c.max_summary_tokens == 32_000 # capped at ceiling + + def test_ratio_clamped(self): + """Ratio should be clamped to [0.10, 0.80].""" + with patch("agent.context_compressor.get_model_context_length", return_value=100_000): + c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.05) + assert c.summary_target_ratio == 0.10 + + with patch("agent.context_compressor.get_model_context_length", return_value=100_000): + c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.95) + assert c.summary_target_ratio == 0.80 + + def test_default_threshold_is_80_percent(self): + """Default compression threshold should be 
80%.""" + with patch("agent.context_compressor.get_model_context_length", return_value=100_000): + c = ContextCompressor(model="test", quiet_mode=True) + assert c.threshold_percent == 0.80 + assert c.threshold_tokens == 80_000 + + def test_default_protect_last_n_is_20(self): + """Default protect_last_n should be 20.""" + with patch("agent.context_compressor.get_model_context_length", return_value=100_000): + c = ContextCompressor(model="test", quiet_mode=True) + assert c.protect_last_n == 20