feat: default reasoning effort from xhigh to medium

Reduces token usage and latency for most tasks by defaulting to
medium reasoning effort instead of xhigh. Users can still override
via config or CLI flag. Updates code, tests, example config, and docs.
This commit is contained in:
teknium1
2026-03-07 10:14:19 -08:00
parent 23e84de830
commit b84f9e410c
9 changed files with 25 additions and 24 deletions

View File

@@ -66,7 +66,8 @@ DEFAULT_AGENT_IDENTITY = (
"range of tasks including answering questions, writing and editing code, "
"analyzing information, creative work, and executing actions via your tools. "
"You communicate clearly, admit uncertainty when appropriate, and prioritize "
"being genuinely useful over being verbose unless otherwise directed below."
"being genuinely useful over being verbose unless otherwise directed below. "
"Be targeted and efficient in your exploration and investigations."
)
MEMORY_GUIDANCE = (

View File

@@ -1155,7 +1155,7 @@ def main(
providers_order (str): Comma-separated list of OpenRouter providers to try in order (e.g. "anthropic,openai,google")
provider_sort (str): Sort providers by "price", "throughput", or "latency" (OpenRouter only)
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "xhigh")
reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "medium")
reasoning_disabled (bool): Completely disable reasoning/thinking tokens (default: False)
prefill_messages_file (str): Path to JSON file containing prefill messages (list of {role, content} dicts)
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
@@ -1216,7 +1216,7 @@ def main(
providers_order_list = [p.strip() for p in providers_order.split(",")] if providers_order else None
# Build reasoning_config from CLI flags
# --reasoning_disabled takes priority, then --reasoning_effort, then default (xhigh)
# --reasoning_disabled takes priority, then --reasoning_effort, then default (medium)
reasoning_config = None
if reasoning_disabled:
# Completely disable reasoning/thinking tokens

View File

@@ -285,7 +285,7 @@ agent:
# Reasoning effort level (OpenRouter and Nous Portal)
# Controls how much "thinking" the model does before responding.
# Options: "xhigh" (max), "high", "medium", "low", "minimal", "none" (disable)
reasoning_effort: "xhigh"
reasoning_effort: "medium"
# Predefined personalities (use with /personality command)
personalities:

4
cli.py
View File

@@ -108,7 +108,7 @@ def _parse_reasoning_config(effort: str) -> dict | None:
"""Parse a reasoning effort level into an OpenRouter reasoning config dict.
Valid levels: "xhigh", "high", "medium", "low", "minimal", "none".
Returns None to use the default (xhigh), or a config dict to override.
Returns None to use the default (medium), or a config dict to override.
"""
if not effort or not effort.strip():
return None
@@ -118,7 +118,7 @@ def _parse_reasoning_config(effort: str) -> dict | None:
valid = ("xhigh", "high", "medium", "low", "minimal")
if effort in valid:
return {"enabled": True, "effort": effort}
logger.warning("Unknown reasoning_effort '%s', using default (xhigh)", effort)
logger.warning("Unknown reasoning_effort '%s', using default (medium)", effort)
return None

View File

@@ -330,7 +330,7 @@ class GatewayRunner:
Checks HERMES_REASONING_EFFORT env var first, then agent.reasoning_effort
in config.yaml. Valid: "xhigh", "high", "medium", "low", "minimal", "none".
Returns None to use default (xhigh).
Returns None to use default (medium).
"""
effort = os.getenv("HERMES_REASONING_EFFORT", "")
if not effort:
@@ -351,7 +351,7 @@ class GatewayRunner:
valid = ("xhigh", "high", "medium", "low", "minimal")
if effort in valid:
return {"enabled": True, "effort": effort}
logger.warning("Unknown reasoning_effort '%s', using default (xhigh)", effort)
logger.warning("Unknown reasoning_effort '%s', using default (medium)", effort)
return None
@staticmethod

View File

@@ -213,7 +213,7 @@ class AIAgent:
Provided by the platform layer (CLI or gateway). If None, the clarify tool returns an error.
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
reasoning_config (Dict): OpenRouter reasoning configuration override (e.g. {"effort": "none"} to disable thinking).
If None, defaults to {"enabled": True, "effort": "xhigh"} for OpenRouter. Set to disable/customize reasoning.
If None, defaults to {"enabled": True, "effort": "medium"} for OpenRouter. Set to disable/customize reasoning.
prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context.
Useful for injecting a few-shot example or priming the model's response style.
Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}]
@@ -287,7 +287,7 @@ class AIAgent:
# Model response configuration
self.max_tokens = max_tokens # None = use model default
self.reasoning_config = reasoning_config # None = use default (xhigh for OpenRouter)
self.reasoning_config = reasoning_config # None = use default (medium for OpenRouter)
self.prefill_messages = prefill_messages or [] # Prefilled conversation turns
# Anthropic prompt caching: auto-enabled for Claude models via OpenRouter.
@@ -2157,8 +2157,8 @@ class AIAgent:
if not instructions:
instructions = DEFAULT_AGENT_IDENTITY
# Resolve reasoning effort: config > default (xhigh)
reasoning_effort = "xhigh"
# Resolve reasoning effort: config > default (medium)
reasoning_effort = "medium"
reasoning_enabled = True
if self.reasoning_config and isinstance(self.reasoning_config, dict):
if self.reasoning_config.get("enabled") is False:
@@ -2224,7 +2224,7 @@ class AIAgent:
else:
extra_body["reasoning"] = {
"enabled": True,
"effort": "xhigh"
"effort": "medium"
}
# Nous Portal product attribution
@@ -2767,7 +2767,7 @@ class AIAgent:
else:
summary_extra_body["reasoning"] = {
"enabled": True,
"effort": "xhigh"
"effort": "medium"
}
if _is_nous:
summary_extra_body["tags"] = ["product=hermes-agent"]

View File

@@ -145,7 +145,7 @@ class TestBuildApiKwargsCodex:
messages = [{"role": "user", "content": "hi"}]
kwargs = agent._build_api_kwargs(messages)
assert "reasoning" in kwargs
assert kwargs["reasoning"]["effort"] == "xhigh"
assert kwargs["reasoning"]["effort"] == "medium"
def test_includes_encrypted_content_in_include(self, monkeypatch):
agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses",
@@ -596,19 +596,19 @@ class TestCodexReasoningPreflight:
# ── Reasoning effort consistency tests ───────────────────────────────────────
class TestReasoningEffortDefaults:
"""Verify reasoning effort defaults to xhigh across all provider paths."""
"""Verify reasoning effort defaults to medium across all provider paths."""
def test_openrouter_default_xhigh(self, monkeypatch):
def test_openrouter_default_medium(self, monkeypatch):
agent = _make_agent(monkeypatch, "openrouter")
kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}])
reasoning = kwargs["extra_body"]["reasoning"]
assert reasoning["effort"] == "xhigh"
assert reasoning["effort"] == "medium"
def test_codex_default_xhigh(self, monkeypatch):
def test_codex_default_medium(self, monkeypatch):
agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses",
base_url="https://chatgpt.com/backend-api/codex")
kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}])
assert kwargs["reasoning"]["effort"] == "xhigh"
assert kwargs["reasoning"]["effort"] == "medium"
def test_codex_reasoning_disabled(self, monkeypatch):
agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses",

View File

@@ -498,12 +498,12 @@ class TestBuildApiKwargs:
assert kwargs["extra_body"]["provider"]["only"] == ["Anthropic"]
def test_reasoning_config_default_openrouter(self, agent):
"""Default reasoning config for OpenRouter should be xhigh."""
"""Default reasoning config for OpenRouter should be medium."""
messages = [{"role": "user", "content": "hi"}]
kwargs = agent._build_api_kwargs(messages)
reasoning = kwargs["extra_body"]["reasoning"]
assert reasoning["enabled"] is True
assert reasoning["effort"] == "xhigh"
assert reasoning["effort"] == "medium"
def test_reasoning_config_custom(self, agent):
agent.reasoning_config = {"enabled": False}

View File

@@ -421,10 +421,10 @@ Control how much "thinking" the model does before responding:
```yaml
agent:
reasoning_effort: "" # empty = use model default. Options: xhigh (max), high, medium, low, minimal, none
reasoning_effort: "" # empty = medium (default). Options: xhigh (max), high, medium, low, minimal, none
```
When unset (default), the model's own default reasoning level is used. Setting a value overrides it — higher reasoning effort gives better results on complex tasks at the cost of more tokens and latency.
When unset (default), reasoning effort defaults to "medium" — a balanced level that works well for most tasks. Setting a value overrides it — higher reasoning effort gives better results on complex tasks at the cost of more tokens and latency.
## TTS Configuration