Compare commits
3 Commits
burn/conte
...
fix/gatewa
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
992498463e | ||
|
|
ec3cd2081b | ||
|
|
110642d86a |
@@ -412,6 +412,52 @@ class GatewayConfig:
|
||||
return self.unauthorized_dm_behavior
|
||||
|
||||
|
||||
def _validate_fallback_providers() -> None:
    """Validate ``fallback_providers`` from config.yaml at gateway startup.

    Reads ``config.yaml`` from the Hermes home directory and checks that
    ``fallback_providers`` is a list whose entries are dicts carrying
    non-empty ``provider`` and ``model`` fields. Every malformed entry is
    reported through ``logger.warning`` so a broken fallback chain is
    visible before it silently degrades into no-fallback mode.

    The check is advisory only: it returns ``None``, never raises, and does
    not modify the configuration. If the file is missing, the key is absent
    or empty, or the file cannot be read or parsed, validation is skipped
    (the latter case is recorded at debug level).
    """
    try:
        _config_path = get_hermes_home() / "config.yaml"
        if not _config_path.exists():
            return

        # Imported lazily: only needed once a config file is known to exist.
        import yaml

        with open(_config_path, encoding="utf-8") as _f:
            _cfg = yaml.safe_load(_f) or {}

        # A YAML document whose top level is not a mapping has no root keys
        # to inspect, so there is nothing to validate here.
        if not isinstance(_cfg, dict):
            return

        fbp = _cfg.get("fallback_providers")
        if not fbp:
            return

        if not isinstance(fbp, list):
            logger.warning(
                "fallback_providers should be a YAML list, got %s. "
                "Fallback chain will be disabled.",
                type(fbp).__name__,
            )
            return

        for i, entry in enumerate(fbp):
            if not isinstance(entry, dict):
                logger.warning(
                    "fallback_providers[%d] is not a dict (got %s). Skipping entry.",
                    i, type(entry).__name__,
                )
                continue
            # Both fields are checked independently so an entry missing both
            # produces two warnings rather than hiding the second problem.
            if not entry.get("provider"):
                logger.warning(
                    "fallback_providers[%d] missing 'provider' field. Skipping entry.",
                    i,
                )
            if not entry.get("model"):
                logger.warning(
                    "fallback_providers[%d] missing 'model' field. Skipping entry.",
                    i,
                )
    except Exception:
        # Non-fatal; validation is advisory. Keep a trace of why it was
        # skipped instead of swallowing the failure without any record.
        logger.debug(
            "Skipping fallback_providers validation: could not inspect config.yaml",
            exc_info=True,
        )
|
||||
|
||||
|
||||
def load_gateway_config() -> GatewayConfig:
|
||||
"""
|
||||
Load gateway configuration from multiple sources.
|
||||
@@ -645,6 +691,19 @@ def load_gateway_config() -> GatewayConfig:
|
||||
platform.value, env_name,
|
||||
)
|
||||
|
||||
# Warn about API Server enabled without a key (unauthenticated endpoint)
|
||||
if Platform.API_SERVER in config.platforms:
|
||||
api_cfg = config.platforms[Platform.API_SERVER]
|
||||
if api_cfg.enabled and not api_cfg.extra.get("key"):
|
||||
logger.warning(
|
||||
"api_server is enabled but API_SERVER_KEY is not set. "
|
||||
"The API endpoint will run unauthenticated. "
|
||||
"Set API_SERVER_KEY in ~/.hermes/.env to secure it.",
|
||||
)
|
||||
|
||||
# Validate fallback_providers structure from config.yaml
|
||||
_validate_fallback_providers()
|
||||
|
||||
return config
|
||||
|
||||
|
||||
|
||||
@@ -1338,6 +1338,11 @@ _KNOWN_ROOT_KEYS = {
|
||||
"fallback_providers", "credential_pool_strategies", "toolsets",
|
||||
"agent", "terminal", "display", "compression", "delegation",
|
||||
"auxiliary", "custom_providers", "memory", "gateway",
|
||||
"session_reset", "browser", "checkpoints", "smart_model_routing",
|
||||
"voice", "stt", "tts", "human_delay", "security", "privacy",
|
||||
"cron", "logging", "approvals", "command_allowlist", "quick_commands",
|
||||
"personalities", "skills", "honcho", "timezone", "discord",
|
||||
"whatsapp", "prefill_messages_file", "file_read_max_chars",
|
||||
}
|
||||
|
||||
# Valid fields inside a custom_providers list entry
|
||||
@@ -1478,6 +1483,72 @@ def validate_config_structure(config: Optional[Dict[str, Any]] = None) -> List["
|
||||
f"Move '{key}' under the appropriate section",
|
||||
))
|
||||
|
||||
# ── fallback_providers must be a list of dicts with provider + model ─
|
||||
fbp = config.get("fallback_providers")
|
||||
if fbp is not None:
|
||||
if not isinstance(fbp, list):
|
||||
issues.append(ConfigIssue(
|
||||
"error",
|
||||
f"fallback_providers should be a YAML list, got {type(fbp).__name__}",
|
||||
"Change to:\n"
|
||||
" fallback_providers:\n"
|
||||
" - provider: openrouter\n"
|
||||
" model: google/gemini-3-flash-preview",
|
||||
))
|
||||
elif fbp:
|
||||
for i, entry in enumerate(fbp):
|
||||
if not isinstance(entry, dict):
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
f"fallback_providers[{i}] is not a dict (got {type(entry).__name__})",
|
||||
"Each entry needs at minimum: provider, model",
|
||||
))
|
||||
continue
|
||||
if not entry.get("provider"):
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
f"fallback_providers[{i}] is missing 'provider' field — this fallback will be skipped",
|
||||
"Add: provider: openrouter (or another provider name)",
|
||||
))
|
||||
if not entry.get("model"):
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
f"fallback_providers[{i}] is missing 'model' field — this fallback will be skipped",
|
||||
"Add: model: google/gemini-3-flash-preview (or another model slug)",
|
||||
))
|
||||
|
||||
# ── session_reset validation ─────────────────────────────────────────
|
||||
session_reset = config.get("session_reset", {})
|
||||
if isinstance(session_reset, dict):
|
||||
idle_minutes = session_reset.get("idle_minutes")
|
||||
if idle_minutes is not None:
|
||||
if not isinstance(idle_minutes, (int, float)) or idle_minutes <= 0:
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
f"session_reset.idle_minutes={idle_minutes} is invalid (must be a positive number)",
|
||||
"Set to a positive integer, e.g. 1440 (24 hours). Using 0 causes immediate resets.",
|
||||
))
|
||||
at_hour = session_reset.get("at_hour")
|
||||
if at_hour is not None:
|
||||
if not isinstance(at_hour, (int, float)) or not (0 <= at_hour <= 23):
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
f"session_reset.at_hour={at_hour} is invalid (must be 0-23)",
|
||||
"Set to an hour between 0 and 23, e.g. 4 for 4am",
|
||||
))
|
||||
|
||||
# ── API Server key check ─────────────────────────────────────────────
|
||||
# If api_server is enabled via env, but no key is set, warn.
|
||||
# This catches the "API_SERVER_KEY not configured" error from gateway logs.
|
||||
api_server_enabled = os.getenv("API_SERVER_ENABLED", "").lower() in ("true", "1", "yes")
|
||||
api_server_key = os.getenv("API_SERVER_KEY", "").strip()
|
||||
if api_server_enabled and not api_server_key:
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
"API_SERVER is enabled but API_SERVER_KEY is not set — the API server will run unauthenticated",
|
||||
"Set API_SERVER_KEY in ~/.hermes/.env to secure the API endpoint",
|
||||
))
|
||||
|
||||
return issues
|
||||
|
||||
|
||||
|
||||
109
run_agent.py
109
run_agent.py
@@ -721,6 +721,19 @@ class AIAgent:
|
||||
self._current_tool: str | None = None
|
||||
self._api_call_count: int = 0
|
||||
|
||||
# Poka-yoke #309: Circuit breaker for error cascading
|
||||
# P(error | prev was error) = 58.6% vs P(error | prev was success) = 25.2%
|
||||
# After 3+ consecutive errors, inject guidance to break the cascade.
|
||||
self._consecutive_tool_errors: int = 0
|
||||
self._error_streak_tool_names: list = [] # track which tools are in the streak
|
||||
|
||||
# Poka-yoke #310: Tool fixation detection
|
||||
# Marathon sessions show tool fixation - same tool called 8-25 times in a row.
|
||||
# After 5 consecutive calls to the same tool, nudge the agent to diversify.
|
||||
self._last_tool_name: str | None = None
|
||||
self._same_tool_streak: int = 0
|
||||
self._tool_fixation_threshold: int = 5
|
||||
|
||||
# Centralized logging — agent.log (INFO+) and errors.log (WARNING+)
|
||||
# both live under ~/.hermes/logs/. Idempotent, so gateway mode
|
||||
# (which creates a new AIAgent per message) won't duplicate handlers.
|
||||
@@ -6238,6 +6251,12 @@ class AIAgent:
|
||||
def _run_tool(index, tool_call, function_name, function_args):
|
||||
"""Worker function executed in a thread."""
|
||||
start = time.time()
|
||||
# Poka-yoke #310: Tool fixation detection (concurrent path)
|
||||
if function_name == self._last_tool_name:
|
||||
self._same_tool_streak += 1
|
||||
else:
|
||||
self._last_tool_name = function_name
|
||||
self._same_tool_streak = 1
|
||||
try:
|
||||
result = self._invoke_tool(function_name, function_args, effective_task_id, tool_call.id)
|
||||
except Exception as tool_error:
|
||||
@@ -6288,6 +6307,13 @@ class AIAgent:
|
||||
if is_error:
|
||||
result_preview = function_result[:200] if len(function_result) > 200 else function_result
|
||||
logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
|
||||
# Circuit breaker: track consecutive errors
|
||||
self._consecutive_tool_errors += 1
|
||||
self._error_streak_tool_names.append(function_name)
|
||||
else:
|
||||
# Reset circuit breaker on success
|
||||
self._consecutive_tool_errors = 0
|
||||
self._error_streak_tool_names = []
|
||||
|
||||
if self.tool_progress_callback:
|
||||
try:
|
||||
@@ -6331,6 +6357,41 @@ class AIAgent:
|
||||
if subdir_hints:
|
||||
function_result += subdir_hints
|
||||
|
||||
# Circuit breaker: inject warning after 3+ consecutive errors
|
||||
if self._consecutive_tool_errors >= 3:
|
||||
streak_info = self._error_streak_tool_names[-self._consecutive_tool_errors:]
|
||||
unique_tools = list(dict.fromkeys(streak_info))
|
||||
if self._consecutive_tool_errors == 3:
|
||||
cb_msg = (
|
||||
f"\n\n⚠️ CIRCUIT BREAKER: You have had {self._consecutive_tool_errors} consecutive tool errors "
|
||||
f"({', '.join(unique_tools)}). Errors cascade — P(error|error) is 2.33x higher than normal. "
|
||||
f"Consider: (1) trying a different tool type, (2) using terminal to debug, "
|
||||
f"(3) simplifying your approach, or (4) asking the user for guidance."
|
||||
)
|
||||
function_result += cb_msg
|
||||
elif self._consecutive_tool_errors == 6:
|
||||
cb_msg = (
|
||||
f"\n\n🛑 CIRCUIT BREAKER: {self._consecutive_tool_errors} consecutive errors. "
|
||||
f"The error cascade is severe. STOP retrying the same approach. "
|
||||
f"Use terminal to investigate, or switch strategies entirely."
|
||||
)
|
||||
function_result += cb_msg
|
||||
elif self._consecutive_tool_errors >= 9 and self._consecutive_tool_errors % 3 == 0:
|
||||
cb_msg = (
|
||||
f"\n\n🔴 CIRCUIT BREAKER: {self._consecutive_tool_errors} consecutive errors. "
|
||||
f"Terminal is your only reliable recovery path. Use it now."
|
||||
)
|
||||
function_result += cb_msg
|
||||
|
||||
# Poka-yoke #310: Tool fixation nudge
|
||||
if self._same_tool_streak >= self._tool_fixation_threshold and self._same_tool_streak % self._tool_fixation_threshold == 0:
|
||||
fixation_msg = (
|
||||
f"\n\n🔄 TOOL FIXATION: You have called `{function_name}` {self._same_tool_streak} times consecutively. "
|
||||
f"Consider: (1) trying a different tool, (2) using `terminal` to verify your approach, "
|
||||
f"(3) stepping back to reassess the task."
|
||||
)
|
||||
function_result += fixation_msg
|
||||
|
||||
# Append tool result message in order
|
||||
tool_msg = {
|
||||
"role": "tool",
|
||||
@@ -6416,6 +6477,13 @@ class AIAgent:
|
||||
self._current_tool = function_name
|
||||
self._touch_activity(f"executing tool: {function_name}")
|
||||
|
||||
# Poka-yoke #310: Tool fixation detection
|
||||
if function_name == self._last_tool_name:
|
||||
self._same_tool_streak += 1
|
||||
else:
|
||||
self._last_tool_name = function_name
|
||||
self._same_tool_streak = 1
|
||||
|
||||
if self.tool_progress_callback:
|
||||
try:
|
||||
preview = _build_tool_preview(function_name, function_args)
|
||||
@@ -6609,8 +6677,14 @@ class AIAgent:
|
||||
_is_error_result, _ = _detect_tool_failure(function_name, function_result)
|
||||
if _is_error_result:
|
||||
logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
|
||||
# Circuit breaker: track consecutive errors
|
||||
self._consecutive_tool_errors += 1
|
||||
self._error_streak_tool_names.append(function_name)
|
||||
else:
|
||||
logger.info("tool %s completed (%.2fs, %d chars)", function_name, tool_duration, len(function_result))
|
||||
# Reset circuit breaker on success
|
||||
self._consecutive_tool_errors = 0
|
||||
self._error_streak_tool_names = []
|
||||
|
||||
if self.tool_progress_callback:
|
||||
try:
|
||||
@@ -6642,6 +6716,41 @@ class AIAgent:
|
||||
if subdir_hints:
|
||||
function_result += subdir_hints
|
||||
|
||||
# Circuit breaker: inject warning after 3+ consecutive errors
|
||||
if self._consecutive_tool_errors >= 3:
|
||||
streak_info = self._error_streak_tool_names[-self._consecutive_tool_errors:]
|
||||
unique_tools = list(dict.fromkeys(streak_info)) # preserve order, deduplicate
|
||||
if self._consecutive_tool_errors == 3:
|
||||
cb_msg = (
|
||||
f"\n\n⚠️ CIRCUIT BREAKER: You have had {self._consecutive_tool_errors} consecutive tool errors "
|
||||
f"({', '.join(unique_tools)}). Errors cascade — P(error|error) is 2.33x higher than normal. "
|
||||
f"Consider: (1) trying a different tool type, (2) using terminal to debug, "
|
||||
f"(3) simplifying your approach, or (4) asking the user for guidance."
|
||||
)
|
||||
function_result += cb_msg
|
||||
elif self._consecutive_tool_errors == 6:
|
||||
cb_msg = (
|
||||
f"\n\n🛑 CIRCUIT BREAKER: {self._consecutive_tool_errors} consecutive errors. "
|
||||
f"The error cascade is severe. STOP retrying the same approach. "
|
||||
f"Use terminal to investigate, or switch strategies entirely."
|
||||
)
|
||||
function_result += cb_msg
|
||||
elif self._consecutive_tool_errors >= 9 and self._consecutive_tool_errors % 3 == 0:
|
||||
cb_msg = (
|
||||
f"\n\n🔴 CIRCUIT BREAKER: {self._consecutive_tool_errors} consecutive errors. "
|
||||
f"Terminal is your only reliable recovery path. Use it now."
|
||||
)
|
||||
function_result += cb_msg
|
||||
|
||||
# Poka-yoke #310: Tool fixation nudge
|
||||
if self._same_tool_streak >= self._tool_fixation_threshold and self._same_tool_streak % self._tool_fixation_threshold == 0:
|
||||
fixation_msg = (
|
||||
f"\n\n🔄 TOOL FIXATION: You have called `{function_name}` {self._same_tool_streak} times consecutively. "
|
||||
f"Consider: (1) trying a different tool, (2) using `terminal` to verify your approach, "
|
||||
f"(3) stepping back to reassess the task."
|
||||
)
|
||||
function_result += fixation_msg
|
||||
|
||||
tool_msg = {
|
||||
"role": "tool",
|
||||
"content": function_result,
|
||||
|
||||
@@ -172,3 +172,111 @@ class TestConfigIssueDataclass:
|
||||
a = ConfigIssue("error", "msg", "hint")
|
||||
b = ConfigIssue("error", "msg", "hint")
|
||||
assert a == b
|
||||
|
||||
|
||||
class TestFallbackProvidersValidation:
    """Structure checks applied to the ``fallback_providers`` config key."""

    def test_non_list(self):
        """A plain string value is reported as an error mentioning 'list'."""
        config = {
            "fallback_providers": "openrouter:google/gemini-3-flash-preview",
        }
        error_messages = [
            issue.message
            for issue in validate_config_structure(config)
            if issue.severity == "error"
        ]
        assert any(
            "fallback_providers" in text and "list" in text
            for text in error_messages
        )

    def test_dict_instead_of_list(self):
        """A single mapping value is reported as an error mentioning 'dict'."""
        config = {
            "fallback_providers": {"provider": "openrouter", "model": "test"},
        }
        error_messages = [
            issue.message
            for issue in validate_config_structure(config)
            if issue.severity == "error"
        ]
        assert any(
            "fallback_providers" in text and "dict" in text
            for text in error_messages
        )

    def test_entry_missing_provider(self):
        """An entry lacking ``provider`` yields a "missing 'provider'" issue."""
        config = {
            "fallback_providers": [{"model": "google/gemini-3-flash-preview"}],
        }
        messages = [issue.message for issue in validate_config_structure(config)]
        assert any("missing 'provider'" in text for text in messages)

    def test_entry_missing_model(self):
        """An entry lacking ``model`` yields a "missing 'model'" issue."""
        config = {
            "fallback_providers": [{"provider": "openrouter"}],
        }
        messages = [issue.message for issue in validate_config_structure(config)]
        assert any("missing 'model'" in text for text in messages)

    def test_entry_not_dict(self):
        """A list element that is not a mapping yields a "not a dict" issue."""
        config = {
            "fallback_providers": ["not-a-dict"],
        }
        messages = [issue.message for issue in validate_config_structure(config)]
        assert any("not a dict" in text for text in messages)

    def test_valid_entries(self):
        """Well-formed entries produce no issue mentioning fallback_providers."""
        config = {
            "fallback_providers": [
                {"provider": "openrouter", "model": "google/gemini-3-flash-preview"},
                {"provider": "gemini", "model": "gemini-2.5-flash"},
            ],
        }
        reported = validate_config_structure(config)
        assert not any("fallback_providers" in issue.message for issue in reported)

    def test_empty_list_no_issues(self):
        """An empty list (fallback disabled) produces no fallback issue."""
        config = {
            "fallback_providers": [],
        }
        reported = validate_config_structure(config)
        assert not any("fallback_providers" in issue.message for issue in reported)
|
||||
|
||||
|
||||
class TestSessionResetValidation:
    """Range checks applied to ``session_reset.idle_minutes`` and ``at_hour``."""

    def test_zero_idle_minutes(self):
        """``idle_minutes=0`` is reported."""
        config = {
            "session_reset": {"idle_minutes": 0},
        }
        messages = [issue.message for issue in validate_config_structure(config)]
        assert any("idle_minutes=0" in text for text in messages)

    def test_negative_idle_minutes(self):
        """``idle_minutes=-5`` is reported."""
        config = {
            "session_reset": {"idle_minutes": -5},
        }
        messages = [issue.message for issue in validate_config_structure(config)]
        assert any("idle_minutes=-5" in text for text in messages)

    def test_string_idle_minutes(self):
        """A non-numeric ``idle_minutes`` value is reported."""
        config = {
            "session_reset": {"idle_minutes": "abc"},
        }
        messages = [issue.message for issue in validate_config_structure(config)]
        assert any("idle_minutes=" in text for text in messages)

    def test_valid_idle_minutes(self):
        """A positive ``idle_minutes`` value produces no idle_minutes issue."""
        config = {
            "session_reset": {"idle_minutes": 1440},
        }
        reported = validate_config_structure(config)
        assert not any("idle_minutes" in issue.message for issue in reported)

    def test_invalid_at_hour(self):
        """``at_hour=25`` is reported."""
        config = {
            "session_reset": {"at_hour": 25},
        }
        messages = [issue.message for issue in validate_config_structure(config)]
        assert any("at_hour=25" in text for text in messages)

    def test_valid_at_hour(self):
        """An in-range ``at_hour`` value produces no at_hour issue."""
        config = {
            "session_reset": {"at_hour": 4},
        }
        reported = validate_config_structure(config)
        assert not any("at_hour" in issue.message for issue in reported)
|
||||
|
||||
Reference in New Issue
Block a user