diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 52428d954..4ec8b3cf3 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -40,7 +40,7 @@ _MIN_SUMMARY_TOKENS = 2000 # Proportion of compressed content to allocate for summary _SUMMARY_RATIO = 0.20 # Absolute ceiling for summary tokens (even on very large context windows) -_SUMMARY_TOKENS_CEILING = 32_000 +_SUMMARY_TOKENS_CEILING = 12_000 # Placeholder used when pruning old tool results _PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]" @@ -63,10 +63,10 @@ class ContextCompressor: def __init__( self, model: str, - threshold_percent: float = 0.80, + threshold_percent: float = 0.50, protect_first_n: int = 3, protect_last_n: int = 20, - summary_target_ratio: float = 0.40, + summary_target_ratio: float = 0.20, quiet_mode: bool = False, summary_model_override: str = None, base_url: str = "", @@ -92,8 +92,8 @@ class ContextCompressor: self.threshold_tokens = int(self.context_length * threshold_percent) self.compression_count = 0 - # Derive token budgets from the target ratio and context length - target_tokens = int(self.context_length * self.summary_target_ratio) + # Derive token budgets: ratio is relative to the threshold, not total context + target_tokens = int(self.threshold_tokens * self.summary_target_ratio) self.tail_token_budget = target_tokens self.max_summary_tokens = min( int(self.context_length * 0.05), _SUMMARY_TOKENS_CEILING, diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 4469fa54b..dc1a33199 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -236,23 +236,24 @@ browser: # 5. Summarizes middle turns using a fast/cheap model # 6. 
Inserts summary as a user message, continues conversation seamlessly # -# Post-compression size scales with the model's context window via target_ratio: -# MiniMax 200K context → ~80K post-compression (at 0.40 ratio) -# GPT-5 1M context → ~400K post-compression (at 0.40 ratio) +# Post-compression tail budget is target_ratio × threshold × context_length: +# 200K context, threshold 0.50, ratio 0.20 → 20K tokens of recent tail preserved +# 1M context, threshold 0.50, ratio 0.20 → 100K tokens of recent tail preserved # compression: # Enable automatic context compression (default: true) # Set to false if you prefer to manage context manually or want errors on overflow enabled: true - # Trigger compression at this % of model's context limit (default: 0.80 = 80%) + # Trigger compression at this % of model's context limit (default: 0.50 = 50%) # Lower values = more aggressive compression, higher values = compress later - threshold: 0.80 + threshold: 0.50 - # Target post-compression size as a fraction of context window (default: 0.40 = 40%) - # Controls how much context survives compression. Tail token budget and summary - # cap scale with this value. Range: 0.10 - 0.80 - target_ratio: 0.40 + # Fraction of the threshold to preserve as recent tail (default: 0.20 = 20%) + # e.g. 20% of 50% threshold = 10% of total context kept as recent messages. + # Summary output is capped separately at 5% of context, never above 12K tokens (Gemini output limit). 
+ # Range: 0.10 - 0.80 + target_ratio: 0.20 # Number of most-recent messages to always preserve (default: 20 ≈ 10 full turns) # Higher values keep more recent conversation intact at the cost of more aggressive diff --git a/hermes_cli/config.py b/hermes_cli/config.py index f96324fd8..000d8626b 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -163,8 +163,8 @@ DEFAULT_CONFIG = { "compression": { "enabled": True, - "threshold": 0.80, # compress when context usage exceeds this ratio - "target_ratio": 0.40, # fraction of context to preserve as recent tail + "threshold": 0.50, # compress when context usage exceeds this ratio + "target_ratio": 0.20, # fraction of threshold to preserve as recent tail "protect_last_n": 20, # minimum recent messages to keep uncompressed "summary_model": "", # empty = use main configured model "summary_provider": "auto", @@ -1686,8 +1686,8 @@ def show_config(): enabled = compression.get('enabled', True) print(f" Enabled: {'yes' if enabled else 'no'}") if enabled: - print(f" Threshold: {compression.get('threshold', 0.80) * 100:.0f}%") - print(f" Target ratio: {compression.get('target_ratio', 0.40) * 100:.0f}% of context preserved") + print(f" Threshold: {compression.get('threshold', 0.50) * 100:.0f}%") + print(f" Target ratio: {compression.get('target_ratio', 0.20) * 100:.0f}% of threshold preserved") print(f" Protect last: {compression.get('protect_last_n', 20)} messages") _sm = compression.get('summary_model', '') or '(main model)' print(f" Model: {_sm}") diff --git a/run_agent.py b/run_agent.py index 11d728e83..25cfd3674 100644 --- a/run_agent.py +++ b/run_agent.py @@ -1009,10 +1009,10 @@ class AIAgent: _compression_cfg = _agent_cfg.get("compression", {}) if not isinstance(_compression_cfg, dict): _compression_cfg = {} - compression_threshold = float(_compression_cfg.get("threshold", 0.80)) + compression_threshold = float(_compression_cfg.get("threshold", 0.50)) compression_enabled = str(_compression_cfg.get("enabled", 
True)).lower() in ("true", "1", "yes") compression_summary_model = _compression_cfg.get("summary_model") or None - compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.40)) + compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.20)) compression_protect_last = int(_compression_cfg.get("protect_last_n", 20)) # Read explicit context_length override from model config diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py index 380dc8a42..45c832dfc 100644 --- a/tests/agent/test_context_compressor.py +++ b/tests/agent/test_context_compressor.py @@ -519,24 +519,26 @@ class TestSummaryTargetRatio: """Verify that summary_target_ratio properly scales budgets with context window.""" def test_tail_budget_scales_with_context(self): - """Tail token budget should be context_length * summary_target_ratio.""" + """Tail token budget should be threshold_tokens * summary_target_ratio.""" with patch("agent.context_compressor.get_model_context_length", return_value=200_000): c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40) - assert c.tail_token_budget == 80_000 + # 200K * 0.50 threshold * 0.40 ratio = 40K + assert c.tail_token_budget == 40_000 with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000): c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40) - assert c.tail_token_budget == 400_000 + # 1M * 0.50 threshold * 0.40 ratio = 200K + assert c.tail_token_budget == 200_000 def test_summary_cap_scales_with_context(self): - """Max summary tokens should be 5% of context, capped at 32K.""" + """Max summary tokens should be 5% of context, capped at 12K.""" with patch("agent.context_compressor.get_model_context_length", return_value=200_000): c = ContextCompressor(model="test", quiet_mode=True) assert c.max_summary_tokens == 10_000 # 200K * 0.05 with patch("agent.context_compressor.get_model_context_length", 
return_value=1_000_000): c = ContextCompressor(model="test", quiet_mode=True) - assert c.max_summary_tokens == 32_000 # capped at ceiling + assert c.max_summary_tokens == 12_000 # capped at 12K ceiling def test_ratio_clamped(self): """Ratio should be clamped to [0.10, 0.80].""" @@ -548,12 +550,12 @@ class TestSummaryTargetRatio: c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.95) assert c.summary_target_ratio == 0.80 - def test_default_threshold_is_80_percent(self): - """Default compression threshold should be 80%.""" + def test_default_threshold_is_50_percent(self): + """Default compression threshold should be 50%.""" with patch("agent.context_compressor.get_model_context_length", return_value=100_000): c = ContextCompressor(model="test", quiet_mode=True) - assert c.threshold_percent == 0.80 - assert c.threshold_tokens == 80_000 + assert c.threshold_percent == 0.50 + assert c.threshold_tokens == 50_000 def test_default_protect_last_n_is_20(self): """Default protect_last_n should be 20."""