fix(compression): restore sane defaults and cap summary at 12K tokens
- threshold: 0.80 → 0.50 (compress at 50%, not 80%)
- target_ratio: 0.40 → 0.20, now relative to threshold not total context
  (20% of 50% = 10% of context as tail budget)
- summary ceiling: 32K → 12K (Gemini can't output more than ~12K)
- Updated DEFAULT_CONFIG, config display, example config, and tests
This commit is contained in:
@@ -40,7 +40,7 @@ _MIN_SUMMARY_TOKENS = 2000
|
||||
# Proportion of compressed content to allocate for summary
|
||||
_SUMMARY_RATIO = 0.20
|
||||
# Absolute ceiling for summary tokens (even on very large context windows)
|
||||
_SUMMARY_TOKENS_CEILING = 32_000
|
||||
_SUMMARY_TOKENS_CEILING = 12_000
|
||||
|
||||
# Placeholder used when pruning old tool results
|
||||
_PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
|
||||
@@ -63,10 +63,10 @@ class ContextCompressor:
|
||||
def __init__(
|
||||
self,
|
||||
model: str,
|
||||
threshold_percent: float = 0.80,
|
||||
threshold_percent: float = 0.50,
|
||||
protect_first_n: int = 3,
|
||||
protect_last_n: int = 20,
|
||||
summary_target_ratio: float = 0.40,
|
||||
summary_target_ratio: float = 0.20,
|
||||
quiet_mode: bool = False,
|
||||
summary_model_override: str = None,
|
||||
base_url: str = "",
|
||||
@@ -92,8 +92,8 @@ class ContextCompressor:
|
||||
self.threshold_tokens = int(self.context_length * threshold_percent)
|
||||
self.compression_count = 0
|
||||
|
||||
# Derive token budgets from the target ratio and context length
|
||||
target_tokens = int(self.context_length * self.summary_target_ratio)
|
||||
# Derive token budgets: ratio is relative to the threshold, not total context
|
||||
target_tokens = int(self.threshold_tokens * self.summary_target_ratio)
|
||||
self.tail_token_budget = target_tokens
|
||||
self.max_summary_tokens = min(
|
||||
int(self.context_length * 0.05), _SUMMARY_TOKENS_CEILING,
|
||||
|
||||
@@ -236,23 +236,24 @@ browser:
|
||||
# 5. Summarizes middle turns using a fast/cheap model
|
||||
# 6. Inserts summary as a user message, continues conversation seamlessly
|
||||
#
|
||||
# Post-compression size scales with the model's context window via target_ratio:
|
||||
# MiniMax 200K context → ~80K post-compression (at 0.40 ratio)
|
||||
# GPT-5 1M context → ~400K post-compression (at 0.40 ratio)
|
||||
# Post-compression tail budget is target_ratio × threshold × context_length:
|
||||
# 200K context, threshold 0.50, ratio 0.20 → 20K tokens of recent tail preserved
|
||||
# 1M context, threshold 0.50, ratio 0.20 → 100K tokens of recent tail preserved
|
||||
#
|
||||
compression:
|
||||
# Enable automatic context compression (default: true)
|
||||
# Set to false if you prefer to manage context manually or want errors on overflow
|
||||
enabled: true
|
||||
|
||||
# Trigger compression at this % of model's context limit (default: 0.80 = 80%)
|
||||
# Trigger compression at this % of model's context limit (default: 0.50 = 50%)
|
||||
# Lower values = more aggressive compression, higher values = compress later
|
||||
threshold: 0.80
|
||||
threshold: 0.50
|
||||
|
||||
# Target post-compression size as a fraction of context window (default: 0.40 = 40%)
|
||||
# Controls how much context survives compression. Tail token budget and summary
|
||||
# cap scale with this value. Range: 0.10 - 0.80
|
||||
target_ratio: 0.40
|
||||
# Fraction of the threshold to preserve as recent tail (default: 0.20 = 20%)
|
||||
# e.g. 20% of 50% threshold = 10% of total context kept as recent messages.
|
||||
# Summary output is separately capped at 12K tokens (Gemini output limit).
|
||||
# Range: 0.10 - 0.80
|
||||
target_ratio: 0.20
|
||||
|
||||
# Number of most-recent messages to always preserve (default: 20 ≈ 10 full turns)
|
||||
# Higher values keep more recent conversation intact at the cost of more aggressive
|
||||
|
||||
@@ -163,8 +163,8 @@ DEFAULT_CONFIG = {
|
||||
|
||||
"compression": {
|
||||
"enabled": True,
|
||||
"threshold": 0.80, # compress when context usage exceeds this ratio
|
||||
"target_ratio": 0.40, # fraction of context to preserve as recent tail
|
||||
"threshold": 0.50, # compress when context usage exceeds this ratio
|
||||
"target_ratio": 0.20, # fraction of threshold to preserve as recent tail
|
||||
"protect_last_n": 20, # minimum recent messages to keep uncompressed
|
||||
"summary_model": "", # empty = use main configured model
|
||||
"summary_provider": "auto",
|
||||
@@ -1686,8 +1686,8 @@ def show_config():
|
||||
enabled = compression.get('enabled', True)
|
||||
print(f" Enabled: {'yes' if enabled else 'no'}")
|
||||
if enabled:
|
||||
print(f" Threshold: {compression.get('threshold', 0.80) * 100:.0f}%")
|
||||
print(f" Target ratio: {compression.get('target_ratio', 0.40) * 100:.0f}% of context preserved")
|
||||
print(f" Threshold: {compression.get('threshold', 0.50) * 100:.0f}%")
|
||||
print(f" Target ratio: {compression.get('target_ratio', 0.20) * 100:.0f}% of threshold preserved")
|
||||
print(f" Protect last: {compression.get('protect_last_n', 20)} messages")
|
||||
_sm = compression.get('summary_model', '') or '(main model)'
|
||||
print(f" Model: {_sm}")
|
||||
|
||||
@@ -1009,10 +1009,10 @@ class AIAgent:
|
||||
_compression_cfg = _agent_cfg.get("compression", {})
|
||||
if not isinstance(_compression_cfg, dict):
|
||||
_compression_cfg = {}
|
||||
compression_threshold = float(_compression_cfg.get("threshold", 0.80))
|
||||
compression_threshold = float(_compression_cfg.get("threshold", 0.50))
|
||||
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
|
||||
compression_summary_model = _compression_cfg.get("summary_model") or None
|
||||
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.40))
|
||||
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.20))
|
||||
compression_protect_last = int(_compression_cfg.get("protect_last_n", 20))
|
||||
|
||||
# Read explicit context_length override from model config
|
||||
|
||||
@@ -519,24 +519,26 @@ class TestSummaryTargetRatio:
|
||||
"""Verify that summary_target_ratio properly scales budgets with context window."""
|
||||
|
||||
def test_tail_budget_scales_with_context(self):
|
||||
"""Tail token budget should be context_length * summary_target_ratio."""
|
||||
"""Tail token budget should be threshold_tokens * summary_target_ratio."""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
|
||||
assert c.tail_token_budget == 80_000
|
||||
# 200K * 0.50 threshold * 0.40 ratio = 40K
|
||||
assert c.tail_token_budget == 40_000
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
|
||||
assert c.tail_token_budget == 400_000
|
||||
# 1M * 0.50 threshold * 0.40 ratio = 200K
|
||||
assert c.tail_token_budget == 200_000
|
||||
|
||||
def test_summary_cap_scales_with_context(self):
|
||||
"""Max summary tokens should be 5% of context, capped at 32K."""
|
||||
"""Max summary tokens should be 5% of context, capped at 12K."""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
assert c.max_summary_tokens == 10_000 # 200K * 0.05
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
assert c.max_summary_tokens == 32_000 # capped at ceiling
|
||||
assert c.max_summary_tokens == 12_000 # capped at 12K ceiling
|
||||
|
||||
def test_ratio_clamped(self):
|
||||
"""Ratio should be clamped to [0.10, 0.80]."""
|
||||
@@ -548,12 +550,12 @@ class TestSummaryTargetRatio:
|
||||
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.95)
|
||||
assert c.summary_target_ratio == 0.80
|
||||
|
||||
def test_default_threshold_is_80_percent(self):
|
||||
"""Default compression threshold should be 80%."""
|
||||
def test_default_threshold_is_50_percent(self):
|
||||
"""Default compression threshold should be 50%."""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
assert c.threshold_percent == 0.80
|
||||
assert c.threshold_tokens == 80_000
|
||||
assert c.threshold_percent == 0.50
|
||||
assert c.threshold_tokens == 50_000
|
||||
|
||||
def test_default_protect_last_n_is_20(self):
|
||||
"""Default protect_last_n should be 20."""
|
||||
|
||||
Reference in New Issue
Block a user