diff --git a/.env.example b/.env.example index 78549212f..2693931e0 100644 --- a/.env.example +++ b/.env.example @@ -10,7 +10,7 @@ OPENROUTER_API_KEY= # Default model to use (OpenRouter format: provider/model) -# Examples: anthropic/claude-opus-4.6, openai/gpt-4o, google/gemini-2.0-flash, zhipuai/glm-4-plus +# Examples: anthropic/claude-opus-4.6, openai/gpt-4o, google/gemini-3-flash-preview, zhipuai/glm-4-plus LLM_MODEL=anthropic/claude-opus-4.6 # ============================================================================= @@ -200,9 +200,10 @@ IMAGE_TOOLS_DEBUG=false # When conversation approaches model's context limit, middle turns are # automatically summarized to free up space. # +# Context compression is configured in ~/.hermes/config.yaml under compression: # CONTEXT_COMPRESSION_ENABLED=true # Enable auto-compression (default: true) # CONTEXT_COMPRESSION_THRESHOLD=0.85 # Compress at 85% of context limit -# CONTEXT_COMPRESSION_MODEL=google/gemini-2.0-flash-001 # Fast model for summaries +# Model is set via compression.summary_model in config.yaml (default: google/gemini-3-flash-preview) # ============================================================================= # RL TRAINING (Tinker + Atropos) diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 329fd9680..6f9ce3c01 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -33,6 +33,7 @@ class ContextCompressor: protect_last_n: int = 4, summary_target_tokens: int = 500, quiet_mode: bool = False, + summary_model_override: "str | None" = None, ): self.model = model self.threshold_percent = threshold_percent @@ -49,7 +50,8 @@ class ContextCompressor: self.last_completion_tokens = 0 self.last_total_tokens = 0 - self.client, self.summary_model = get_text_auxiliary_client() + self.client, default_model = get_text_auxiliary_client() + self.summary_model = summary_model_override or default_model def update_from_response(self, usage: Dict[str, Any]): """Update tracked token usage 
from API response.""" diff --git a/gateway/run.py b/gateway/run.py index bcd2457b9..4f4a81bad 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -78,6 +78,16 @@ if _config_path.exists(): for _cfg_key, _env_var in _terminal_env_map.items(): if _cfg_key in _terminal_cfg: os.environ[_env_var] = str(_terminal_cfg[_cfg_key]) + _compression_cfg = _cfg.get("compression", {}) + if _compression_cfg and isinstance(_compression_cfg, dict): + _compression_env_map = { + "enabled": "CONTEXT_COMPRESSION_ENABLED", + "threshold": "CONTEXT_COMPRESSION_THRESHOLD", + "summary_model": "CONTEXT_COMPRESSION_MODEL", + } + for _cfg_key, _env_var in _compression_env_map.items(): + if _cfg_key in _compression_cfg: + os.environ[_env_var] = str(_compression_cfg[_cfg_key]) except Exception: pass # Non-fatal; gateway can still run with .env values diff --git a/run_agent.py b/run_agent.py index 59a547f0d..91db7cc2a 100644 --- a/run_agent.py +++ b/run_agent.py @@ -479,9 +479,10 @@ class AIAgent: # Initialize context compressor for automatic context management # Compresses conversation when approaching model's context limit - # Configuration via environment variables (can be set in .env or cli-config.yaml) + # Configuration via config.yaml (compression section) or environment variables compression_threshold = float(os.getenv("CONTEXT_COMPRESSION_THRESHOLD", "0.85")) compression_enabled = os.getenv("CONTEXT_COMPRESSION_ENABLED", "true").lower() in ("true", "1", "yes") + compression_summary_model = os.getenv("CONTEXT_COMPRESSION_MODEL") or None self.context_compressor = ContextCompressor( model=self.model, @@ -489,6 +490,7 @@ class AIAgent: protect_first_n=3, protect_last_n=4, summary_target_tokens=500, + summary_model_override=compression_summary_model, quiet_mode=self.quiet_mode, ) self.compression_enabled = compression_enabled