feat: smart context length probing with persistent caching + banner display
Replaces the unsafe 128K fallback for unknown models with a descending probe
strategy (2M → 1M → 512K → 200K → 128K → 64K → 32K). When a context-length
error occurs, the agent steps down tiers and retries. The discovered limit is
cached per model+provider combo in ~/.hermes/context_length_cache.yaml so
subsequent sessions skip probing. Also parses API error messages to extract
the actual context limit (e.g. 'maximum context length is 32768 tokens') for
instant resolution. The CLI banner now displays the context window size next
to the model name (e.g. 'claude-opus-4 · 200K context · Nous Research').

Changes:
- agent/model_metadata.py: CONTEXT_PROBE_TIERS, persistent cache
  (save/load/get), parse_context_limit_from_error(), get_next_probe_tier()
- agent/context_compressor.py: accepts base_url, passes to metadata
- run_agent.py: step-down logic in context error handler, caches on success
- cli.py + hermes_cli/banner.py: context length in welcome banner
- tests: 22 new tests for probing, parsing, and caching

Addresses #132. PR #319's approach (8K default) rejected — too conservative.
This commit is contained in:

Files changed (1): run_agent.py (39 lines changed)
@@ -82,6 +82,8 @@ from agent.prompt_builder import (
 from agent.model_metadata import (
     fetch_model_metadata, get_model_context_length,
     estimate_tokens_rough, estimate_messages_tokens_rough,
+    get_next_probe_tier, parse_context_limit_from_error,
+    save_context_length,
 )
 from agent.context_compressor import ContextCompressor
 from agent.prompt_caching import apply_anthropic_cache_control
@@ -536,6 +538,7 @@ class AIAgent:
             summary_target_tokens=500,
             summary_model_override=compression_summary_model,
             quiet_mode=self.quiet_mode,
+            base_url=self.base_url,
         )
         self.compression_enabled = compression_enabled
         self._user_turn_count = 0
@@ -3236,6 +3239,13 @@ class AIAgent:
                 }
                 self.context_compressor.update_from_response(usage_dict)

+                # Cache discovered context length after successful call
+                if self.context_compressor._context_probed:
+                    ctx = self.context_compressor.context_length
+                    save_context_length(self.model, self.base_url, ctx)
+                    print(f"{self.log_prefix}💾 Cached context length: {ctx:,} tokens for {self.model}")
+                    self.context_compressor._context_probed = False
+
                 self.session_prompt_tokens += prompt_tokens
                 self.session_completion_tokens += completion_tokens
                 self.session_total_tokens += total_tokens
@@ -3355,18 +3365,37 @@ class AIAgent:
                 ])

                 if is_context_length_error:
-                    print(f"{self.log_prefix}⚠️ Context length exceeded - attempting compression...")
+                    compressor = self.context_compressor
+                    old_ctx = compressor.context_length
+
+                    # Try to parse the actual limit from the error message
+                    parsed_limit = parse_context_limit_from_error(error_msg)
+                    if parsed_limit and parsed_limit < old_ctx:
+                        new_ctx = parsed_limit
+                        print(f"{self.log_prefix}⚠️ Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})")
+                    else:
+                        # Step down to the next probe tier
+                        new_ctx = get_next_probe_tier(old_ctx)
+
+                    if new_ctx and new_ctx < old_ctx:
+                        compressor.context_length = new_ctx
+                        compressor.threshold_tokens = int(new_ctx * compressor.threshold_percent)
+                        compressor._context_probed = True
+                        print(f"{self.log_prefix}⚠️ Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens")
+                    else:
+                        print(f"{self.log_prefix}⚠️ Context length exceeded at minimum tier — attempting compression...")
+
                     original_len = len(messages)
                     messages, active_system_prompt = self._compress_context(
                         messages, system_message, approx_tokens=approx_tokens
                     )

-                    if len(messages) < original_len:
-                        print(f"{self.log_prefix} 🗜️ Compressed {original_len} → {len(messages)} messages, retrying...")
-                        continue  # Retry with compressed messages
+                    if len(messages) < original_len or new_ctx and new_ctx < old_ctx:
+                        if len(messages) < original_len:
+                            print(f"{self.log_prefix} 🗜️ Compressed {original_len} → {len(messages)} messages, retrying...")
+                        continue  # Retry with compressed messages or new tier
                     else:
-                        # Can't compress further
+                        # Can't compress further and already at minimum tier
                         print(f"{self.log_prefix}❌ Context length exceeded and cannot compress further.")
                         print(f"{self.log_prefix} 💡 The conversation has accumulated too much content.")
                         logging.error(f"{self.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.")
Reference in New Issue
Block a user