feat: default reasoning effort from xhigh to medium

Reduces token usage and latency for most tasks by defaulting to medium reasoning effort instead of xhigh. Users can still override via config or CLI flag. Updates code, tests, example config, and docs.
2026-03-07 10:14:19 -08:00
parent 23e84de830
commit b84f9e410c
9 changed files with 25 additions and 24 deletions
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@@ -66,7 +66,8 @@ DEFAULT_AGENT_IDENTITY = (
    "range of tasks including answering questions, writing and editing code, "
    "analyzing information, creative work, and executing actions via your tools. "
    "You communicate clearly, admit uncertainty when appropriate, and prioritize "
-    "being genuinely useful over being verbose unless otherwise directed below."
+    "being genuinely useful over being verbose unless otherwise directed below. "
+    "Be targeted and efficient in your exploration and investigations."
 )

 MEMORY_GUIDANCE = (
--- a/batch_runner.py
+++ b/batch_runner.py
@@ -1155,7 +1155,7 @@ def main(
        providers_order (str): Comma-separated list of OpenRouter providers to try in order (e.g. "anthropic,openai,google")
        provider_sort (str): Sort providers by "price", "throughput", or "latency" (OpenRouter only)
        max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
-        reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "xhigh")
+        reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "medium")
        reasoning_disabled (bool): Completely disable reasoning/thinking tokens (default: False)
        prefill_messages_file (str): Path to JSON file containing prefill messages (list of {role, content} dicts)
        max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
@@ -1216,7 +1216,7 @@ def main(
    providers_order_list = [p.strip() for p in providers_order.split(",")] if providers_order else None
    
    # Build reasoning_config from CLI flags
-    # --reasoning_disabled takes priority, then --reasoning_effort, then default (xhigh)
+    # --reasoning_disabled takes priority, then --reasoning_effort, then default (medium)
    reasoning_config = None
    if reasoning_disabled:
        # Completely disable reasoning/thinking tokens
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@@ -285,7 +285,7 @@ agent:
  # Reasoning effort level (OpenRouter and Nous Portal)
  # Controls how much "thinking" the model does before responding.
  # Options: "xhigh" (max), "high", "medium", "low", "minimal", "none" (disable)
-  reasoning_effort: "xhigh"
+  reasoning_effort: "medium"
  
  # Predefined personalities (use with /personality command)
  personalities:
--- a/cli.py
+++ b/cli.py
@@ -108,7 +108,7 @@ def _parse_reasoning_config(effort: str) -> dict | None:
    """Parse a reasoning effort level into an OpenRouter reasoning config dict.
    
    Valid levels: "xhigh", "high", "medium", "low", "minimal", "none".
-    Returns None to use the default (xhigh), or a config dict to override.
+    Returns None to use the default (medium), or a config dict to override.
    """
    if not effort or not effort.strip():
        return None
@@ -118,7 +118,7 @@ def _parse_reasoning_config(effort: str) -> dict | None:
    valid = ("xhigh", "high", "medium", "low", "minimal")
    if effort in valid:
        return {"enabled": True, "effort": effort}
-    logger.warning("Unknown reasoning_effort '%s', using default (xhigh)", effort)
+    logger.warning("Unknown reasoning_effort '%s', using default (medium)", effort)
    return None


--- a/gateway/run.py
+++ b/gateway/run.py
@@ -330,7 +330,7 @@ class GatewayRunner:
        
        Checks HERMES_REASONING_EFFORT env var first, then agent.reasoning_effort
        in config.yaml. Valid: "xhigh", "high", "medium", "low", "minimal", "none".
-        Returns None to use default (xhigh).
+        Returns None to use default (medium).
        """
        effort = os.getenv("HERMES_REASONING_EFFORT", "")
        if not effort:
@@ -351,7 +351,7 @@ class GatewayRunner:
        valid = ("xhigh", "high", "medium", "low", "minimal")
        if effort in valid:
            return {"enabled": True, "effort": effort}
-        logger.warning("Unknown reasoning_effort '%s', using default (xhigh)", effort)
+        logger.warning("Unknown reasoning_effort '%s', using default (medium)", effort)
        return None

    @staticmethod
--- a/run_agent.py
+++ b/run_agent.py
@@ -213,7 +213,7 @@ class AIAgent:
                Provided by the platform layer (CLI or gateway). If None, the clarify tool returns an error.
            max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
            reasoning_config (Dict): OpenRouter reasoning configuration override (e.g. {"effort": "none"} to disable thinking).
-                If None, defaults to {"enabled": True, "effort": "xhigh"} for OpenRouter. Set to disable/customize reasoning.
+                If None, defaults to {"enabled": True, "effort": "medium"} for OpenRouter. Set to disable/customize reasoning.
            prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context.
                Useful for injecting a few-shot example or priming the model's response style.
                Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}]
@@ -287,7 +287,7 @@ class AIAgent:
        
        # Model response configuration
        self.max_tokens = max_tokens  # None = use model default
-        self.reasoning_config = reasoning_config  # None = use default (xhigh for OpenRouter)
+        self.reasoning_config = reasoning_config  # None = use default (medium for OpenRouter)
        self.prefill_messages = prefill_messages or []  # Prefilled conversation turns
        
        # Anthropic prompt caching: auto-enabled for Claude models via OpenRouter.
@@ -2157,8 +2157,8 @@ class AIAgent:
            if not instructions:
                instructions = DEFAULT_AGENT_IDENTITY

-            # Resolve reasoning effort: config > default (xhigh)
-            reasoning_effort = "xhigh"
+            # Resolve reasoning effort: config > default (medium)
+            reasoning_effort = "medium"
            reasoning_enabled = True
            if self.reasoning_config and isinstance(self.reasoning_config, dict):
                if self.reasoning_config.get("enabled") is False:
@@ -2224,7 +2224,7 @@ class AIAgent:
            else:
                extra_body["reasoning"] = {
                    "enabled": True,
-                    "effort": "xhigh"
+                    "effort": "medium"
                }

        # Nous Portal product attribution
@@ -2767,7 +2767,7 @@ class AIAgent:
                else:
                    summary_extra_body["reasoning"] = {
                        "enabled": True,
-                        "effort": "xhigh"
+                        "effort": "medium"
                    }
            if _is_nous:
                summary_extra_body["tags"] = ["product=hermes-agent"]
--- a/tests/test_provider_parity.py
+++ b/tests/test_provider_parity.py
@@ -145,7 +145,7 @@ class TestBuildApiKwargsCodex:
        messages = [{"role": "user", "content": "hi"}]
        kwargs = agent._build_api_kwargs(messages)
        assert "reasoning" in kwargs
-        assert kwargs["reasoning"]["effort"] == "xhigh"
+        assert kwargs["reasoning"]["effort"] == "medium"

    def test_includes_encrypted_content_in_include(self, monkeypatch):
        agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses",
@@ -596,19 +596,19 @@ class TestCodexReasoningPreflight:
 # ── Reasoning effort consistency tests ───────────────────────────────────────

 class TestReasoningEffortDefaults:
-    """Verify reasoning effort defaults to xhigh across all provider paths."""
+    """Verify reasoning effort defaults to medium across all provider paths."""

-    def test_openrouter_default_xhigh(self, monkeypatch):
+    def test_openrouter_default_medium(self, monkeypatch):
        agent = _make_agent(monkeypatch, "openrouter")
        kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}])
        reasoning = kwargs["extra_body"]["reasoning"]
-        assert reasoning["effort"] == "xhigh"
+        assert reasoning["effort"] == "medium"

-    def test_codex_default_xhigh(self, monkeypatch):
+    def test_codex_default_medium(self, monkeypatch):
        agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses",
                            base_url="https://chatgpt.com/backend-api/codex")
        kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}])
-        assert kwargs["reasoning"]["effort"] == "xhigh"
+        assert kwargs["reasoning"]["effort"] == "medium"

    def test_codex_reasoning_disabled(self, monkeypatch):
        agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses",
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -498,12 +498,12 @@ class TestBuildApiKwargs:
        assert kwargs["extra_body"]["provider"]["only"] == ["Anthropic"]

    def test_reasoning_config_default_openrouter(self, agent):
-        """Default reasoning config for OpenRouter should be xhigh."""
+        """Default reasoning config for OpenRouter should be medium."""
        messages = [{"role": "user", "content": "hi"}]
        kwargs = agent._build_api_kwargs(messages)
        reasoning = kwargs["extra_body"]["reasoning"]
        assert reasoning["enabled"] is True
-        assert reasoning["effort"] == "xhigh"
+        assert reasoning["effort"] == "medium"

    def test_reasoning_config_custom(self, agent):
        agent.reasoning_config = {"enabled": False}
--- a/website/docs/user-guide/configuration.md
+++ b/website/docs/user-guide/configuration.md
@@ -421,10 +421,10 @@ Control how much "thinking" the model does before responding:

 ```yaml
 agent:
-  reasoning_effort: ""   # empty = use model default. Options: xhigh (max), high, medium, low, minimal, none
+  reasoning_effort: ""   # empty = medium (default). Options: xhigh (max), high, medium, low, minimal, none
 ```

-When unset (default), the model's own default reasoning level is used. Setting a value overrides it — higher reasoning effort gives better results on complex tasks at the cost of more tokens and latency.
+When unset (default), reasoning effort defaults to "medium" — a balanced level that works well for most tasks. Setting a value overrides it — higher reasoning effort gives better results on complex tasks at the cost of more tokens and latency.

 ## TTS Configuration