feat(agent): configurable timeouts for auxiliary LLM calls via config.yaml (#3597)
Add per-task timeout settings under auxiliary.{task}.timeout in config.yaml
instead of hardcoded values. Users with slow local models (Ollama, llama.cpp)
can now increase timeouts for compression, vision, session search, etc.
Defaults:
- auxiliary.compression.timeout: 120s (was hardcoded 45s)
- auxiliary.vision.timeout: 30s (unchanged)
- all other aux tasks: 30s (was hardcoded 30s)
- title_generator: 30s (was hardcoded 15s)
call_llm/async_call_llm now auto-resolve timeout from config when not
explicitly passed. Callers can still override with an explicit timeout arg.
Based on PR #3406 by alanfwilliams. Converted from env vars to config.yaml
per project conventions.
Co-authored-by: alanfwilliams <alanfwilliams@users.noreply.github.com>
This commit is contained in:
@@ -1458,6 +1458,29 @@ def _resolve_task_provider_model(
|
||||
return "auto", resolved_model, None, None
|
||||
|
||||
|
||||
_DEFAULT_AUX_TIMEOUT = 30.0
|
||||
|
||||
|
||||
def _get_task_timeout(task: str, default: float = _DEFAULT_AUX_TIMEOUT) -> float:
|
||||
"""Read timeout from auxiliary.{task}.timeout in config, falling back to *default*."""
|
||||
if not task:
|
||||
return default
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
config = load_config()
|
||||
except ImportError:
|
||||
return default
|
||||
aux = config.get("auxiliary", {}) if isinstance(config, dict) else {}
|
||||
task_config = aux.get(task, {}) if isinstance(aux, dict) else {}
|
||||
raw = task_config.get("timeout")
|
||||
if raw is not None:
|
||||
try:
|
||||
return float(raw)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
return default
|
||||
|
||||
|
||||
def _build_call_kwargs(
|
||||
provider: str,
|
||||
model: str,
|
||||
@@ -1515,7 +1538,7 @@ def call_llm(
|
||||
temperature: float = None,
|
||||
max_tokens: int = None,
|
||||
tools: list = None,
|
||||
timeout: float = 30.0,
|
||||
timeout: float = None,
|
||||
extra_body: dict = None,
|
||||
) -> Any:
|
||||
"""Centralized synchronous LLM call.
|
||||
@@ -1533,7 +1556,7 @@ def call_llm(
|
||||
temperature: Sampling temperature (None = provider default).
|
||||
max_tokens: Max output tokens (handles max_tokens vs max_completion_tokens).
|
||||
tools: Tool definitions (for function calling).
|
||||
timeout: Request timeout in seconds.
|
||||
timeout: Request timeout in seconds (None = read from auxiliary.{task}.timeout config).
|
||||
extra_body: Additional request body fields.
|
||||
|
||||
Returns:
|
||||
@@ -1598,10 +1621,12 @@ def call_llm(
|
||||
f"No LLM provider configured for task={task} provider={resolved_provider}. "
|
||||
f"Run: hermes setup")
|
||||
|
||||
effective_timeout = timeout if timeout is not None else _get_task_timeout(task)
|
||||
|
||||
kwargs = _build_call_kwargs(
|
||||
resolved_provider, final_model, messages,
|
||||
temperature=temperature, max_tokens=max_tokens,
|
||||
tools=tools, timeout=timeout, extra_body=extra_body,
|
||||
tools=tools, timeout=effective_timeout, extra_body=extra_body,
|
||||
base_url=resolved_base_url)
|
||||
|
||||
# Handle max_tokens vs max_completion_tokens retry
|
||||
@@ -1683,7 +1708,7 @@ async def async_call_llm(
|
||||
temperature: float = None,
|
||||
max_tokens: int = None,
|
||||
tools: list = None,
|
||||
timeout: float = 30.0,
|
||||
timeout: float = None,
|
||||
extra_body: dict = None,
|
||||
) -> Any:
|
||||
"""Centralized asynchronous LLM call.
|
||||
@@ -1744,10 +1769,12 @@ async def async_call_llm(
|
||||
f"No LLM provider configured for task={task} provider={resolved_provider}. "
|
||||
f"Run: hermes setup")
|
||||
|
||||
effective_timeout = timeout if timeout is not None else _get_task_timeout(task)
|
||||
|
||||
kwargs = _build_call_kwargs(
|
||||
resolved_provider, final_model, messages,
|
||||
temperature=temperature, max_tokens=max_tokens,
|
||||
tools=tools, timeout=timeout, extra_body=extra_body,
|
||||
tools=tools, timeout=effective_timeout, extra_body=extra_body,
|
||||
base_url=resolved_base_url)
|
||||
|
||||
try:
|
||||
|
||||
@@ -347,7 +347,7 @@ Write only the summary body. Do not include any preamble or prefix."""
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"temperature": 0.3,
|
||||
"max_tokens": summary_budget * 2,
|
||||
"timeout": 45.0,
|
||||
# timeout resolved from auxiliary.compression.timeout config by call_llm
|
||||
}
|
||||
if self.summary_model:
|
||||
call_kwargs["model"] = self.summary_model
|
||||
|
||||
@@ -19,7 +19,7 @@ _TITLE_PROMPT = (
|
||||
)
|
||||
|
||||
|
||||
def generate_title(user_message: str, assistant_response: str, timeout: float = 15.0) -> Optional[str]:
|
||||
def generate_title(user_message: str, assistant_response: str, timeout: float = 30.0) -> Optional[str]:
|
||||
"""Generate a session title from the first exchange.
|
||||
|
||||
Uses the auxiliary LLM client (cheapest/fastest available model).
|
||||
|
||||
Reference in New Issue
Block a user