From 9231a335d4bb3da55e7dab4ca49d2de719763735 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Tue, 24 Mar 2026 17:45:49 -0700
Subject: [PATCH] fix(compression): replace dead summary_target_tokens with
 ratio-based scaling (#2554)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The summary_target_tokens parameter was accepted in the constructor,
stored on the instance, and never used — the summary budget was always
computed from hardcoded module constants (_SUMMARY_RATIO=0.20,
_MAX_SUMMARY_TOKENS=8000). This caused two compounding problems:

1. The config value was silently ignored, giving users no control
   over post-compression size.
2. Fixed budgets (20K tail, 8K summary cap) didn't scale with
   context window size. Switching from a 1M-context model to a
   200K model would trigger compression that nuked 350K tokens
   of conversation history down to ~30K.

Changes:
- Replace summary_target_tokens with summary_target_ratio (default 0.40,
  clamped to 0.10-0.80), which sets the post-compression target as a
  fraction of context_length. The tail token budget is derived from this
  ratio (the summary cap scales with context_length separately):
    MiniMax 200K → ~80K post-compression
    GPT-5   1M   → ~400K post-compression
- Change threshold_percent default: 0.50 → 0.80 (don't fire until
  80% of context is consumed)
- Change protect_last_n default: 4 → 20 (preserve ~10 full turns)
- Summary token cap scales to 5% of context (was fixed 8K), capped
  at 32K ceiling
- Read target_ratio and protect_last_n from config.yaml compression
  section (both are now configurable)
- Remove hardcoded summary_target_tokens=500 from run_agent.py
- Add 5 new tests for ratio scaling, clamping, and new defaults
---
 agent/context_compressor.py            | 47 ++++++++++++++++--------
 cli-config.yaml.example                | 20 +++++++++--
 run_agent.py                           |  8 +++--
 tests/agent/test_context_compressor.py | 49 +++++++++++++++++++++++++-
 4 files changed, 103 insertions(+), 21 deletions(-)

diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index 5f4ea4a3c..52428d954 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -35,14 +35,12 @@ SUMMARY_PREFIX = (
 )
 LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:"
 
-# Minimum / maximum tokens for the summary output
+# Minimum tokens for the summary output
 _MIN_SUMMARY_TOKENS = 2000
-_MAX_SUMMARY_TOKENS = 8000
 # Proportion of compressed content to allocate for summary
 _SUMMARY_RATIO = 0.20
-
-# Token budget for tail protection (keep most-recent context)
-_DEFAULT_TAIL_TOKEN_BUDGET = 20_000
+# Absolute ceiling for summary tokens (even on very large context windows)
+_SUMMARY_TOKENS_CEILING = 32_000
 
 # Placeholder used when pruning old tool results
 _PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
@@ -65,10 +63,10 @@ class ContextCompressor:
     def __init__(
         self,
         model: str,
-        threshold_percent: float = 0.50,
+        threshold_percent: float = 0.80,
         protect_first_n: int = 3,
-        protect_last_n: int = 4,
-        summary_target_tokens: int = 2500,
+        protect_last_n: int = 20,
+        summary_target_ratio: float = 0.40,
         quiet_mode: bool = False,
         summary_model_override: str = None,
         base_url: str = "",
@@ -83,7 +81,7 @@ class ContextCompressor:
         self.threshold_percent = threshold_percent
         self.protect_first_n = protect_first_n
         self.protect_last_n = protect_last_n
-        self.summary_target_tokens = summary_target_tokens
+        self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80))
         self.quiet_mode = quiet_mode
 
         self.context_length = get_model_context_length(
@@ -94,12 +92,22 @@ class ContextCompressor:
         self.threshold_tokens = int(self.context_length * threshold_percent)
         self.compression_count = 0
 
+        # Derive token budgets from the target ratio and context length
+        target_tokens = int(self.context_length * self.summary_target_ratio)
+        self.tail_token_budget = target_tokens
+        self.max_summary_tokens = min(
+            int(self.context_length * 0.05), _SUMMARY_TOKENS_CEILING,
+        )
+
         if not quiet_mode:
             logger.info(
                 "Context compressor initialized: model=%s context_length=%d "
-                "threshold=%d (%.0f%%) provider=%s base_url=%s",
+                "threshold=%d (%.0f%%) target_ratio=%.0f%% tail_budget=%d "
+                "provider=%s base_url=%s",
                 model, self.context_length, self.threshold_tokens,
-                threshold_percent * 100, provider or "none", base_url or "none",
+                threshold_percent * 100, self.summary_target_ratio * 100,
+                self.tail_token_budget,
+                provider or "none", base_url or "none",
             )
         self._context_probed = False  # True after a step-down from context error
 
@@ -179,10 +187,15 @@ class ContextCompressor:
     # ------------------------------------------------------------------
 
     def _compute_summary_budget(self, turns_to_summarize: List[Dict[str, Any]]) -> int:
-        """Scale summary token budget with the amount of content being compressed."""
+        """Scale summary token budget with the amount of content being compressed.
+
+        The maximum scales with the model's context window (5% of context,
+        capped at ``_SUMMARY_TOKENS_CEILING``) so large-context models get
+        richer summaries instead of being hard-capped at 8K tokens.
+        """
         content_tokens = estimate_messages_tokens_rough(turns_to_summarize)
         budget = int(content_tokens * _SUMMARY_RATIO)
-        return max(_MIN_SUMMARY_TOKENS, min(budget, _MAX_SUMMARY_TOKENS))
+        return max(_MIN_SUMMARY_TOKENS, min(budget, self.max_summary_tokens))
 
     def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str:
         """Serialize conversation turns into labeled text for the summarizer.
@@ -477,14 +490,20 @@ Write only the summary body. Do not include any preamble or prefix."""
 
     def _find_tail_cut_by_tokens(
         self, messages: List[Dict[str, Any]], head_end: int,
-        token_budget: int = _DEFAULT_TAIL_TOKEN_BUDGET,
+        token_budget: int | None = None,
     ) -> int:
         """Walk backward from the end of messages, accumulating tokens until
         the budget is reached. Returns the index where the tail starts.
 
+        ``token_budget`` defaults to ``self.tail_token_budget`` which is
+        derived from ``summary_target_ratio * context_length``, so it
+        scales automatically with the model's context window.
+
         Never cuts inside a tool_call/result group. Falls back to the old
         ``protect_last_n`` if the budget would protect fewer messages.
         """
+        if token_budget is None:
+            token_budget = self.tail_token_budget
         n = len(messages)
         min_tail = self.protect_last_n
         accumulated = 0
diff --git a/cli-config.yaml.example b/cli-config.yaml.example
index 89d6b9f80..4469fa54b 100644
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@@ -232,19 +232,33 @@ browser:
 # 1. Tracks actual token usage from API responses (not estimates)
 # 2. When prompt_tokens >= threshold% of model's context_length, triggers compression
 # 3. Protects first 3 turns (system prompt, initial request, first response)
-# 4. Protects last 4 turns (recent context is most relevant)
+# 4. Protects the last N messages (default 20 ≈ 10 full turns of recent context)
 # 5. Summarizes middle turns using a fast/cheap model
 # 6. Inserts summary as a user message, continues conversation seamlessly
 #
+# Post-compression size scales with the model's context window via target_ratio:
+#   MiniMax 200K context → ~80K post-compression (at 0.40 ratio)
+#   GPT-5   1M   context → ~400K post-compression (at 0.40 ratio)
+#
 compression:
   # Enable automatic context compression (default: true)
   # Set to false if you prefer to manage context manually or want errors on overflow
   enabled: true
   
-  # Trigger compression at this % of model's context limit (default: 0.85 = 85%)
+  # Trigger compression at this % of model's context limit (default: 0.80 = 80%)
   # Lower values = more aggressive compression, higher values = compress later
-  threshold: 0.85
+  threshold: 0.80
   
+  # Target post-compression size as a fraction of context window (default: 0.40 = 40%)
+  # Controls how much recent context survives compression: the tail token budget is
+  # target_ratio * context_length. Values outside 0.10 - 0.80 are clamped to that range.
+  target_ratio: 0.40
+
+  # Number of most-recent messages to always preserve (default: 20 ≈ 10 full turns)
+  # Higher values keep more recent conversation intact at the cost of more aggressive
+  # compression of older turns.
+  protect_last_n: 20
+
   # Model to use for generating summaries (fast/cheap recommended)
   # This model compresses the middle turns into a concise summary.
   # IMPORTANT: it receives the full middle section of the conversation, so it
diff --git a/run_agent.py b/run_agent.py
index 220921160..11d728e83 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -1009,9 +1009,11 @@ class AIAgent:
         _compression_cfg = _agent_cfg.get("compression", {})
         if not isinstance(_compression_cfg, dict):
             _compression_cfg = {}
-        compression_threshold = float(_compression_cfg.get("threshold", 0.50))
+        compression_threshold = float(_compression_cfg.get("threshold", 0.80))
         compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
         compression_summary_model = _compression_cfg.get("summary_model") or None
+        compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.40))
+        compression_protect_last = int(_compression_cfg.get("protect_last_n", 20))
 
         # Read explicit context_length override from model config
         _model_cfg = _agent_cfg.get("model", {})
@@ -1050,8 +1052,8 @@ class AIAgent:
             model=self.model,
             threshold_percent=compression_threshold,
             protect_first_n=3,
-            protect_last_n=4,
-            summary_target_tokens=500,
+            protect_last_n=compression_protect_last,
+            summary_target_ratio=compression_target_ratio,
             summary_model_override=compression_summary_model,
             quiet_mode=self.quiet_mode,
             base_url=self.base_url,
diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py
index 0fbcf4021..380dc8a42 100644
--- a/tests/agent/test_context_compressor.py
+++ b/tests/agent/test_context_compressor.py
@@ -217,7 +217,7 @@ class TestCompressWithClient:
         mock_client.chat.completions.create.return_value = mock_response
 
         with patch("agent.context_compressor.get_model_context_length", return_value=100000):
-            c = ContextCompressor(model="test", quiet_mode=True)
+            c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
 
         msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)]
         with patch("agent.context_compressor.call_llm", return_value=mock_response):
@@ -513,3 +513,50 @@ class TestCompressWithClient:
         for msg in result:
             if msg.get("role") == "tool" and msg.get("tool_call_id"):
                 assert msg["tool_call_id"] in called_ids
+
+
+class TestSummaryTargetRatio:
+    """Verify that summary_target_ratio properly scales budgets with context window."""
+
+    def test_tail_budget_scales_with_context(self):
+        """Tail token budget should be context_length * summary_target_ratio."""
+        with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
+            c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
+        assert c.tail_token_budget == 80_000
+
+        with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
+            c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
+        assert c.tail_token_budget == 400_000
+
+    def test_summary_cap_scales_with_context(self):
+        """Max summary tokens should be 5% of context, capped at 32K."""
+        with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
+            c = ContextCompressor(model="test", quiet_mode=True)
+        assert c.max_summary_tokens == 10_000  # 200K * 0.05
+
+        with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
+            c = ContextCompressor(model="test", quiet_mode=True)
+        assert c.max_summary_tokens == 32_000  # capped at ceiling
+
+    def test_ratio_clamped(self):
+        """Ratio should be clamped to [0.10, 0.80]."""
+        with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
+            c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.05)
+        assert c.summary_target_ratio == 0.10
+
+        with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
+            c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.95)
+        assert c.summary_target_ratio == 0.80
+
+    def test_default_threshold_is_80_percent(self):
+        """Default compression threshold should be 80%."""
+        with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
+            c = ContextCompressor(model="test", quiet_mode=True)
+        assert c.threshold_percent == 0.80
+        assert c.threshold_tokens == 80_000
+
+    def test_default_protect_last_n_is_20(self):
+        """Default protect_last_n should be 20."""
+        with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
+            c = ContextCompressor(model="test", quiet_mode=True)
+        assert c.protect_last_n == 20