Add Anthropic as a first-class inference provider, bypassing OpenRouter for direct API access. Uses the native Anthropic SDK with a full format adapter (same pattern as the codex_responses api_mode). ## Auth (three methods, priority order) 1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*) 2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*) 3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription) - Reads Claude Code's OAuth credentials - Checks token expiry with 60s buffer - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header - Regular API keys use standard x-api-key header ## Changes by file ### New files - agent/anthropic_adapter.py — Client builder, message/tool/response format conversion, Claude Code credential reader, token resolver. Handles system prompt extraction, tool_use/tool_result blocks, thinking/reasoning, orphaned tool_use cleanup, cache_control. - tests/test_anthropic_adapter.py — 36 tests covering all adapter logic ### Modified files - pyproject.toml — Add anthropic>=0.39.0 dependency - hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with three env vars, plus 'claude'/'claude-code' aliases - hermes_cli/models.py — Add model catalog, labels, aliases, provider order - hermes_cli/main.py — Add 'anthropic' to --provider CLI choices - hermes_cli/runtime_provider.py — Add Anthropic branch returning api_mode='anthropic_messages' (before generic api_key fallthrough) - hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code credential auto-discovery, model selection, OpenRouter tools prompt - agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model - agent/model_metadata.py — Add bare Claude model context lengths - run_agent.py — Add anthropic_messages api_mode: * Client init (Anthropic SDK instead of OpenAI) * API call dispatch (_anthropic_client.messages.create) * Response validation (content blocks) * finish_reason mapping (stop_reason -> 
finish_reason) * Token usage (input_tokens/output_tokens) * Response normalization (normalize_anthropic_response) * Client interrupt/rebuild * Prompt caching auto-enabled for native Anthropic - tests/test_run_agent.py — Update test_anthropic_base_url_accepted to expect native routing, add test_prompt_caching_native_anthropic
231 lines
7.8 KiB
Python
"""Model metadata, context lengths, and token estimation utilities.
|
|
|
|
Pure utility functions with no AIAgent dependency. Used by ContextCompressor
|
|
and run_agent.py for pre-flight context checks.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import requests
|
|
import yaml
|
|
|
|
from hermes_constants import OPENROUTER_MODELS_URL
|
|
|
|
# Module-level logger, configured by the host application.
logger = logging.getLogger(__name__)

# In-process cache of OpenRouter model metadata plus the time it was last
# refreshed; entries are reused for up to _MODEL_CACHE_TTL seconds
# (see fetch_model_metadata).
_model_metadata_cache: Dict[str, Dict[str, Any]] = {}
_model_metadata_cache_time: float = 0
_MODEL_CACHE_TTL = 3600  # seconds (1 hour)

# Descending tiers for context length probing when the model is unknown.
# We start high and step down on context-length errors until one works.
CONTEXT_PROBE_TIERS = [
    2_000_000,
    1_000_000,
    512_000,
    200_000,
    128_000,
    64_000,
    32_000,
]

# Fallback context lengths used when neither the persistent cache nor the
# OpenRouter metadata knows the model (see get_model_context_length).
# Keys are either OpenRouter-style "vendor/model" slugs or bare provider
# model ids; lookup is a fuzzy substring match in both directions.
DEFAULT_CONTEXT_LENGTHS = {
    "anthropic/claude-opus-4": 200000,
    "anthropic/claude-opus-4.5": 200000,
    "anthropic/claude-opus-4.6": 200000,
    "anthropic/claude-sonnet-4": 200000,
    "anthropic/claude-sonnet-4-20250514": 200000,
    "anthropic/claude-haiku-4.5": 200000,
    # Bare Anthropic model IDs (for native API provider)
    "claude-opus-4-20250514": 200000,
    "claude-sonnet-4-20250514": 200000,
    "claude-haiku-4-5-20251001": 200000,
    "openai/gpt-4o": 128000,
    "openai/gpt-4-turbo": 128000,
    "openai/gpt-4o-mini": 128000,
    "google/gemini-2.0-flash": 1048576,
    "google/gemini-2.5-pro": 1048576,
    "meta-llama/llama-3.3-70b-instruct": 131072,
    "deepseek/deepseek-chat-v3": 65536,
    "qwen/qwen-2.5-72b-instruct": 32768,
    "glm-4.7": 202752,
    "glm-5": 202752,
    "glm-4.5": 131072,
    "glm-4.5-flash": 131072,
    "kimi-for-coding": 262144,
    "kimi-k2.5": 262144,
    "kimi-k2-thinking": 262144,
    "kimi-k2-thinking-turbo": 262144,
    "kimi-k2-turbo-preview": 262144,
    "kimi-k2-0905-preview": 131072,
    "MiniMax-M2.5": 204800,
    "MiniMax-M2.5-highspeed": 204800,
    "MiniMax-M2.1": 204800,
}
|
|
|
|
|
|
def fetch_model_metadata(force_refresh: bool = False) -> Dict[str, Dict[str, Any]]:
    """Fetch model metadata from OpenRouter (cached in-process for 1 hour).

    Args:
        force_refresh: When True, bypass the in-process cache and re-fetch.

    Returns:
        Mapping of model id (and canonical slug, when it differs) to a dict
        with ``context_length``, ``max_completion_tokens``, ``name`` and
        ``pricing``. On any fetch/parse failure the last good cache is
        returned instead of raising (empty dict if nothing was ever fetched).
    """
    global _model_metadata_cache, _model_metadata_cache_time

    cache_is_fresh = (time.time() - _model_metadata_cache_time) < _MODEL_CACHE_TTL
    if not force_refresh and _model_metadata_cache and cache_is_fresh:
        return _model_metadata_cache

    try:
        response = requests.get(OPENROUTER_MODELS_URL, timeout=10)
        response.raise_for_status()
        data = response.json()

        cache: Dict[str, Dict[str, Any]] = {}
        for model in data.get("data", []):
            model_id = model.get("id", "")
            cache[model_id] = {
                "context_length": model.get("context_length", 128000),
                "max_completion_tokens": model.get("top_provider", {}).get("max_completion_tokens", 4096),
                "name": model.get("name", model_id),
                "pricing": model.get("pricing", {}),
            }
            # OpenRouter also publishes a canonical slug; alias it to the
            # same entry so lookups by either name succeed.
            canonical = model.get("canonical_slug", "")
            if canonical and canonical != model_id:
                cache[canonical] = cache[model_id]

        _model_metadata_cache = cache
        _model_metadata_cache_time = time.time()
        logger.debug("Fetched metadata for %s models from OpenRouter", len(cache))
        return cache

    except Exception as e:
        # Fix: use the module logger with lazy %-formatting (the rest of
        # this module does), not the root logger with an eager f-string.
        logger.warning("Failed to fetch model metadata from OpenRouter: %s", e)
        return _model_metadata_cache or {}
|
|
|
|
|
|
def _get_context_cache_path() -> Path:
    """Return path to the persistent context length cache file.

    Honors the ``HERMES_HOME`` environment variable; defaults to
    ``~/.hermes`` when it is unset.
    """
    default_home = Path.home() / ".hermes"
    base_dir = Path(os.environ.get("HERMES_HOME", default_home))
    return base_dir / "context_length_cache.yaml"
|
|
|
|
|
|
def _load_context_cache() -> Dict[str, int]:
    """Load the model+provider → context_length cache from disk.

    Returns an empty dict when the cache file is missing or unreadable.
    """
    cache_file = _get_context_cache_path()
    if not cache_file.exists():
        return {}
    try:
        with open(cache_file) as fh:
            loaded = yaml.safe_load(fh) or {}
    except Exception as exc:
        # Best-effort cache: a corrupt/unreadable file just means no hits.
        logger.debug("Failed to load context length cache: %s", exc)
        return {}
    return loaded.get("context_lengths", {})
|
|
|
|
|
|
def save_context_length(model: str, base_url: str, length: int) -> None:
    """Persist a discovered context length for a model+provider combo.

    Cache key is ``model@base_url`` so the same model name served from
    different providers can have different limits.
    """
    cache_key = f"{model}@{base_url}"
    known = _load_context_cache()
    if known.get(cache_key) == length:
        return  # value unchanged; skip the disk write
    known[cache_key] = length
    cache_file = _get_context_cache_path()
    try:
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_file, "w") as fh:
            yaml.dump({"context_lengths": known}, fh, default_flow_style=False)
        logger.info("Cached context length %s → %s tokens", cache_key, f"{length:,}")
    except Exception as exc:
        # Persisting is best-effort; losing the cache only costs a re-probe.
        logger.debug("Failed to save context length cache: %s", exc)
|
|
|
|
|
|
def get_cached_context_length(model: str, base_url: str) -> Optional[int]:
    """Look up a previously discovered context length for model+provider.

    Returns None when no entry exists for the ``model@base_url`` key.
    """
    return _load_context_cache().get(f"{model}@{base_url}")
|
|
|
|
|
|
def get_next_probe_tier(current_length: int) -> Optional[int]:
    """Return the next lower probe tier, or None if already at minimum.

    CONTEXT_PROBE_TIERS is sorted descending, so the first strictly-lower
    tier is the correct step-down target.
    """
    lower_tiers = (tier for tier in CONTEXT_PROBE_TIERS if tier < current_length)
    return next(lower_tiers, None)
|
|
|
|
|
|
def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
    """Try to extract the actual context limit from an API error message.

    Many providers include the limit in their error text, e.g.:
    - "maximum context length is 32768 tokens"
    - "context_length_exceeded: 131072"
    - "Maximum context size 32768 exceeded"
    - "model's max context length is 65536"

    Returns the parsed limit, or None when no plausible number is found.
    """
    error_lower = error_msg.lower()
    # Pattern: look for numbers near context-related keywords.
    # Fix: separators accept underscores ([\s_]) as well as whitespace so
    # API-style tokens like "context_length_exceeded: 131072" match — the
    # old \s*-only patterns missed that documented case.
    patterns = [
        r'(?:max(?:imum)?|limit)[\s_]*(?:context[\s_]*)?(?:length|size|window)?[\s_]*(?:is|of|:)?\s*(\d{4,})',
        r'context[\s_]*(?:length|size|window)[\s_]*(?:exceeded)?[\s_]*(?:is|of|:)?\s*(\d{4,})',
        r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
        r'>\s*(\d{4,})\s*(?:max|limit|token)',  # "250000 tokens > 200000 maximum"
        r'(\d{4,})\s*(?:max(?:imum)?)\b',  # "200000 maximum"
    ]
    for pattern in patterns:
        match = re.search(pattern, error_lower)
        if match:
            limit = int(match.group(1))
            # Sanity check: must be a reasonable context length
            if 1024 <= limit <= 10_000_000:
                return limit
    return None
|
|
|
|
|
|
def get_model_context_length(model: str, base_url: str = "") -> int:
    """Get the context length for a model.

    Resolution order:
    1. Persistent cache (previously discovered via probing)
    2. OpenRouter API metadata
    3. Hardcoded DEFAULT_CONTEXT_LENGTHS (fuzzy match)
    4. First probe tier (2M) — will be narrowed on first context error
    """
    # 1. Persistent per-provider cache; the key includes base_url, so only
    #    consult it when one was supplied.
    if base_url:
        known = get_cached_context_length(model, base_url)
        if known is not None:
            return known

    # 2. Live OpenRouter metadata.
    metadata = fetch_model_metadata()
    if model in metadata:
        return metadata[model].get("context_length", 128000)

    # 3. Fuzzy match against hardcoded defaults (substring in either
    #    direction, first hit wins).
    matched = next(
        (
            length
            for default_model, length in DEFAULT_CONTEXT_LENGTHS.items()
            if default_model in model or model in default_model
        ),
        None,
    )
    if matched is not None:
        return matched

    # 4. Unknown model — optimistically assume the highest probe tier.
    return CONTEXT_PROBE_TIERS[0]
|
|
|
|
|
|
def estimate_tokens_rough(text: str) -> int:
    """Rough token estimate (~4 chars/token) for pre-flight checks.

    Returns 0 for empty/None-ish input.
    """
    return len(text) // 4 if text else 0
|
|
|
|
|
|
def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int:
    """Rough token estimate for a message list (pre-flight only).

    Stringifies each message dict and applies the ~4 chars/token heuristic.
    """
    return sum(len(str(message)) for message in messages) // 4
|