Replaces the unsafe 128K fallback for unknown models with a descending probe strategy (2M → 1M → 512K → 200K → 128K → 64K → 32K). When a context-length error occurs, the agent steps down tiers and retries. The discovered limit is cached per model+provider combo in ~/.hermes/context_length_cache.yaml so subsequent sessions skip probing. Also parses API error messages to extract the actual context limit (e.g. 'maximum context length is 32768 tokens') for instant resolution. The CLI banner now displays the context window size next to the model name (e.g. 'claude-opus-4 · 200K context · Nous Research'). Changes: - agent/model_metadata.py: CONTEXT_PROBE_TIERS, persistent cache (save/load/get), parse_context_limit_from_error(), get_next_probe_tier() - agent/context_compressor.py: accepts base_url, passes to metadata - run_agent.py: step-down logic in context error handler, caches on success - cli.py + hermes_cli/banner.py: context length in welcome banner - tests: 22 new tests for probing, parsing, and caching Addresses #132. PR #319's approach (8K default) rejected — too conservative.
212 lines
7.1 KiB
Python
"""Model metadata, context lengths, and token estimation utilities.
|
|
|
|
Pure utility functions with no AIAgent dependency. Used by ContextCompressor
|
|
and run_agent.py for pre-flight context checks.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import requests
|
|
import yaml
|
|
|
|
from hermes_constants import OPENROUTER_MODELS_URL
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# In-process cache of OpenRouter model metadata, keyed by model id
# (and by canonical slug when it differs — see fetch_model_metadata).
_model_metadata_cache: Dict[str, Dict[str, Any]] = {}

# time.time() of the last successful metadata fetch; 0 means never fetched.
_model_metadata_cache_time: float = 0

# Seconds the in-process metadata cache stays fresh (1 hour).
_MODEL_CACHE_TTL = 3600
|
|
|
|
# Descending tiers for context length probing when the model is unknown.
# We start high and step down on context-length errors until one works.
# NOTE: get_next_probe_tier() returns the first entry strictly below the
# current length, so this list must stay sorted in descending order.
CONTEXT_PROBE_TIERS: List[int] = [
    2_000_000,
    1_000_000,
    512_000,
    200_000,
    128_000,
    64_000,
    32_000,
]
|
|
|
|
# Hardcoded fallback context lengths for well-known models, used when neither
# the persistent probe cache nor the OpenRouter metadata has an entry.
# Matched fuzzily (substring either way) in get_model_context_length().
DEFAULT_CONTEXT_LENGTHS: Dict[str, int] = {
    "anthropic/claude-opus-4": 200000,
    "anthropic/claude-opus-4.5": 200000,
    "anthropic/claude-opus-4.6": 200000,
    "anthropic/claude-sonnet-4": 200000,
    "anthropic/claude-sonnet-4-20250514": 200000,
    "anthropic/claude-haiku-4.5": 200000,
    "openai/gpt-4o": 128000,
    "openai/gpt-4-turbo": 128000,
    "openai/gpt-4o-mini": 128000,
    "google/gemini-2.0-flash": 1048576,
    "google/gemini-2.5-pro": 1048576,
    "meta-llama/llama-3.3-70b-instruct": 131072,
    "deepseek/deepseek-chat-v3": 65536,
    "qwen/qwen-2.5-72b-instruct": 32768,
}
|
|
|
|
|
|
def fetch_model_metadata(force_refresh: bool = False) -> Dict[str, Dict[str, Any]]:
    """Fetch model metadata from OpenRouter (cached in-process for 1 hour).

    Args:
        force_refresh: Bypass the in-memory cache and re-fetch from the API.

    Returns:
        Mapping of model id (and canonical slug, when different) to a dict
        with ``context_length``, ``max_completion_tokens``, ``name`` and
        ``pricing``. On fetch failure, the previous cache (possibly empty)
        is returned instead of raising — this is a best-effort lookup.
    """
    global _model_metadata_cache, _model_metadata_cache_time

    cache_is_fresh = (time.time() - _model_metadata_cache_time) < _MODEL_CACHE_TTL
    if not force_refresh and _model_metadata_cache and cache_is_fresh:
        return _model_metadata_cache

    try:
        response = requests.get(OPENROUTER_MODELS_URL, timeout=10)
        response.raise_for_status()
        data = response.json()

        cache = {}
        for model in data.get("data", []):
            model_id = model.get("id", "")
            cache[model_id] = {
                "context_length": model.get("context_length", 128000),
                "max_completion_tokens": model.get("top_provider", {}).get("max_completion_tokens", 4096),
                "name": model.get("name", model_id),
                "pricing": model.get("pricing", {}),
            }
            # Alias the canonical slug to the same entry so lookups by either
            # name succeed.
            canonical = model.get("canonical_slug", "")
            if canonical and canonical != model_id:
                cache[canonical] = cache[model_id]

        _model_metadata_cache = cache
        _model_metadata_cache_time = time.time()
        logger.debug("Fetched metadata for %s models from OpenRouter", len(cache))
        return cache

    except Exception as e:
        # Fix: use the module logger with lazy %-formatting (the original
        # called logging.warning with an eager f-string, bypassing this
        # module's logger and its handlers/level configuration).
        logger.warning("Failed to fetch model metadata from OpenRouter: %s", e)
        return _model_metadata_cache or {}
|
|
|
|
|
|
def _get_context_cache_path() -> Path:
|
|
"""Return path to the persistent context length cache file."""
|
|
hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
|
|
return hermes_home / "context_length_cache.yaml"
|
|
|
|
|
|
def _load_context_cache() -> Dict[str, int]:
    """Read the persisted model+provider → context_length mapping from disk.

    Returns an empty dict when the file is missing or unreadable.
    """
    cache_file = _get_context_cache_path()
    if not cache_file.exists():
        return {}
    try:
        with open(cache_file) as fh:
            loaded = yaml.safe_load(fh) or {}
        return loaded.get("context_lengths", {})
    except Exception as exc:
        # Corrupt or unreadable cache is non-fatal; just start fresh.
        logger.debug("Failed to load context length cache: %s", exc)
        return {}
|
|
|
|
|
|
def save_context_length(model: str, base_url: str, length: int) -> None:
    """Persist a discovered context length for a model+provider combo.

    The cache key is ``model@base_url`` so the same model name served by
    different providers can carry different limits.
    """
    key = f"{model}@{base_url}"
    stored = _load_context_cache()
    if stored.get(key) == length:
        # Value unchanged — skip the disk write entirely.
        return
    stored[key] = length
    cache_file = _get_context_cache_path()
    try:
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_file, "w") as fh:
            yaml.dump({"context_lengths": stored}, fh, default_flow_style=False)
        logger.info("Cached context length %s → %s tokens", key, f"{length:,}")
    except Exception as exc:
        # Persisting is best-effort; a failed write only costs a re-probe.
        logger.debug("Failed to save context length cache: %s", exc)
|
|
|
|
|
|
def get_cached_context_length(model: str, base_url: str) -> Optional[int]:
    """Return the previously discovered context length for model+provider, or None."""
    return _load_context_cache().get(f"{model}@{base_url}")
|
|
|
|
|
|
def get_next_probe_tier(current_length: int, tiers: Optional[List[int]] = None) -> Optional[int]:
    """Return the next lower probe tier, or None if already at minimum.

    Args:
        current_length: The context length that just failed (or is in use).
        tiers: Candidate tiers sorted in descending order. Defaults to
            CONTEXT_PROBE_TIERS; exposed as a parameter so callers (and
            tests) can supply custom ladders.

    Returns:
        The first tier strictly below ``current_length``, or None when
        ``current_length`` is already at or below the smallest tier.
    """
    if tiers is None:
        tiers = CONTEXT_PROBE_TIERS
    for tier in tiers:
        if tier < current_length:
            return tier
    return None
|
|
|
|
|
|
def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
    """Try to extract the actual context limit from an API error message.

    Many providers include the limit in their error text, e.g.:
    - "maximum context length is 32768 tokens"
    - "context_length_exceeded: 131072"
    - "Maximum context size 32768 exceeded"
    - "model's max context length is 65536"

    Args:
        error_msg: Raw error text from the provider (any casing).

    Returns:
        The parsed limit in tokens, or None if no plausible number is found.
    """
    error_lower = error_msg.lower()
    # Pattern: look for numbers near context-related keywords
    patterns = [
        r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
        # Fix: allow '_' as a separator and an optional 'exceeded' so forms
        # like "context_length_exceeded: 131072" match (the original
        # \s*-only pattern could not cross the underscores).
        r'context[_\s]*(?:length|size|window)(?:[_\s]*exceeded)?\s*(?:is|of|:)?\s*(\d{4,})',
        r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
    ]
    for pattern in patterns:
        match = re.search(pattern, error_lower)
        if match:
            limit = int(match.group(1))
            # Sanity check: must be a reasonable context length
            if 1024 <= limit <= 10_000_000:
                return limit
    return None
|
|
|
|
|
|
def get_model_context_length(model: str, base_url: str = "") -> int:
    """Get the context length for a model.

    Resolution order:
    1. Persistent cache (previously discovered via probing)
    2. OpenRouter API metadata
    3. Hardcoded DEFAULT_CONTEXT_LENGTHS (fuzzy match)
    4. First probe tier (2M) — will be narrowed on first context error
    """
    # 1. Provider-specific limit discovered in an earlier session.
    if base_url:
        known = get_cached_context_length(model, base_url)
        if known is not None:
            return known

    # 2. Live OpenRouter metadata.
    entry = fetch_model_metadata().get(model)
    if entry is not None:
        return entry.get("context_length", 128000)

    # 3. Fuzzy match against hardcoded defaults (substring in either direction).
    fuzzy = next(
        (
            length
            for known_model, length in DEFAULT_CONTEXT_LENGTHS.items()
            if known_model in model or model in known_model
        ),
        None,
    )
    if fuzzy is not None:
        return fuzzy

    # 4. Unknown model — optimistically assume the largest probe tier.
    return CONTEXT_PROBE_TIERS[0]
|
|
|
|
|
|
def estimate_tokens_rough(text: str) -> int:
    """Rough token estimate (~4 chars/token) for pre-flight checks."""
    # Empty/None input estimates to zero tokens.
    return len(text) // 4 if text else 0
|
|
|
|
|
|
def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int:
    """Rough token estimate for a message list (pre-flight only).

    Sums the stringified length of every message, then applies the
    ~4 chars/token heuristic once (avoiding per-message rounding loss).
    """
    return sum(len(str(message)) for message in messages) // 4
|