fix: gateway config debt - validation, defaults, fallback chain checks (#328 )

- Expand validate_config_structure() to catch: - fallback_providers format errors (non-list, missing provider/model) - session_reset.idle_minutes <= 0 (causes immediate resets) - session_reset.at_hour out of 0-23 range - API_SERVER enabled without API_SERVER_KEY - Unknown root-level keys that look like misplaced custom_providers fields - Add _validate_fallback_providers() in gateway/config.py to validate fallback chain at gateway startup (logs warnings for malformed entries) - Add API_SERVER_KEY check in gateway config loader (warns on unauthenticated endpoint) - Expand _KNOWN_ROOT_KEYS to include all valid top-level config sections (session_reset, browser, checkpoints, voice, stt, tts, etc.) - Add 13 new tests for fallback_providers and session_reset validation - All existing tests pass (47/47) Closes #328
fix(poka-yoke): add tool fixation detection (#310 )
2026-04-13 17:29:20 -04:00 · 2026-04-13 10:16:11 -04:00 · 2026-04-13 10:12:24 -04:00
4 changed files with 347 additions and 0 deletions
--- a/gateway/config.py
+++ b/gateway/config.py
@@ -412,6 +412,52 @@ class GatewayConfig:
        return self.unauthorized_dm_behavior


+def _validate_fallback_providers() -> None:
+    """Validate fallback_providers from config.yaml at gateway startup.
+
+    Checks that each entry has 'provider' and 'model' fields and logs
+    warnings for malformed entries.  This catches broken fallback chains
+    before they silently degrade into no-fallback mode.
+    """
+    try:
+        _home = get_hermes_home()
+        _config_path = _home / "config.yaml"
+        if not _config_path.exists():
+            return
+        import yaml
+        with open(_config_path, encoding="utf-8") as _f:
+            _cfg = yaml.safe_load(_f) or {}
+        fbp = _cfg.get("fallback_providers")
+        if not fbp:
+            return
+        if not isinstance(fbp, list):
+            logger.warning(
+                "fallback_providers should be a YAML list, got %s. "
+                "Fallback chain will be disabled.",
+                type(fbp).__name__,
+            )
+            return
+        for i, entry in enumerate(fbp):
+            if not isinstance(entry, dict):
+                logger.warning(
+                    "fallback_providers[%d] is not a dict (got %s). Skipping entry.",
+                    i, type(entry).__name__,
+                )
+                continue
+            if not entry.get("provider"):
+                logger.warning(
+                    "fallback_providers[%d] missing 'provider' field. Skipping entry.",
+                    i,
+                )
+            if not entry.get("model"):
+                logger.warning(
+                    "fallback_providers[%d] missing 'model' field. Skipping entry.",
+                    i,
+                )
+    except Exception:
+        pass  # Non-fatal; validation is advisory
+
+
 def load_gateway_config() -> GatewayConfig:
    """
    Load gateway configuration from multiple sources.
@@ -645,6 +691,19 @@ def load_gateway_config() -> GatewayConfig:
                platform.value, env_name,
            )

+    # Warn about API Server enabled without a key (unauthenticated endpoint)
+    if Platform.API_SERVER in config.platforms:
+        api_cfg = config.platforms[Platform.API_SERVER]
+        if api_cfg.enabled and not api_cfg.extra.get("key"):
+            logger.warning(
+                "api_server is enabled but API_SERVER_KEY is not set. "
+                "The API endpoint will run unauthenticated. "
+                "Set API_SERVER_KEY in ~/.hermes/.env to secure it.",
+            )
+
+    # Validate fallback_providers structure from config.yaml
+    _validate_fallback_providers()
+
    return config


--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -1338,6 +1338,11 @@ _KNOWN_ROOT_KEYS = {
    "fallback_providers", "credential_pool_strategies", "toolsets",
    "agent", "terminal", "display", "compression", "delegation",
    "auxiliary", "custom_providers", "memory", "gateway",
+    "session_reset", "browser", "checkpoints", "smart_model_routing",
+    "voice", "stt", "tts", "human_delay", "security", "privacy",
+    "cron", "logging", "approvals", "command_allowlist", "quick_commands",
+    "personalities", "skills", "honcho", "timezone", "discord",
+    "whatsapp", "prefill_messages_file", "file_read_max_chars",
 }

 # Valid fields inside a custom_providers list entry
@@ -1478,6 +1483,72 @@ def validate_config_structure(config: Optional[Dict[str, Any]] = None) -> List["
                f"Move '{key}' under the appropriate section",
            ))

+    # ── fallback_providers must be a list of dicts with provider + model ─
+    fbp = config.get("fallback_providers")
+    if fbp is not None:
+        if not isinstance(fbp, list):
+            issues.append(ConfigIssue(
+                "error",
+                f"fallback_providers should be a YAML list, got {type(fbp).__name__}",
+                "Change to:\n"
+                "  fallback_providers:\n"
+                "    - provider: openrouter\n"
+                "      model: google/gemini-3-flash-preview",
+            ))
+        elif fbp:
+            for i, entry in enumerate(fbp):
+                if not isinstance(entry, dict):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"fallback_providers[{i}] is not a dict (got {type(entry).__name__})",
+                        "Each entry needs at minimum: provider, model",
+                    ))
+                    continue
+                if not entry.get("provider"):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"fallback_providers[{i}] is missing 'provider' field — this fallback will be skipped",
+                        "Add: provider: openrouter (or another provider name)",
+                    ))
+                if not entry.get("model"):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"fallback_providers[{i}] is missing 'model' field — this fallback will be skipped",
+                        "Add: model: google/gemini-3-flash-preview (or another model slug)",
+                    ))
+
+    # ── session_reset validation ─────────────────────────────────────────
+    session_reset = config.get("session_reset", {})
+    if isinstance(session_reset, dict):
+        idle_minutes = session_reset.get("idle_minutes")
+        if idle_minutes is not None:
+            if not isinstance(idle_minutes, (int, float)) or idle_minutes <= 0:
+                issues.append(ConfigIssue(
+                    "warning",
+                    f"session_reset.idle_minutes={idle_minutes} is invalid (must be a positive number)",
+                    "Set to a positive integer, e.g. 1440 (24 hours). Using 0 causes immediate resets.",
+                ))
+        at_hour = session_reset.get("at_hour")
+        if at_hour is not None:
+            if not isinstance(at_hour, (int, float)) or not (0 <= at_hour <= 23):
+                issues.append(ConfigIssue(
+                    "warning",
+                    f"session_reset.at_hour={at_hour} is invalid (must be 0-23)",
+                    "Set to an hour between 0 and 23, e.g. 4 for 4am",
+                ))
+
+    # ── API Server key check ─────────────────────────────────────────────
+    # If api_server is enabled via env, but no key is set, warn.
+    # This catches the "API_SERVER_KEY not configured" error from gateway logs.
+    api_server_enabled = os.getenv("API_SERVER_ENABLED", "").lower() in ("true", "1", "yes")
+    api_server_key = os.getenv("API_SERVER_KEY", "").strip()
+    if api_server_enabled and not api_server_key:
+        issues.append(ConfigIssue(
+            "warning",
+            "API_SERVER is enabled but API_SERVER_KEY is not set — the API server will run unauthenticated",
+            "Set API_SERVER_KEY in ~/.hermes/.env to secure the API endpoint",
+        ))
+
    return issues


--- a/run_agent.py
+++ b/run_agent.py
@@ -721,6 +721,19 @@ class AIAgent:
        self._current_tool: str | None = None
        self._api_call_count: int = 0

+        # Poka-yoke #309: Circuit breaker for error cascading
+        # P(error | prev was error) = 58.6% vs P(error | prev was success) = 25.2%
+        # After 3+ consecutive errors, inject guidance to break the cascade.
+        self._consecutive_tool_errors: int = 0
+        self._error_streak_tool_names: list = []  # track which tools are in the streak
+
+        # Poka-yoke #310: Tool fixation detection
+        # Marathon sessions show tool fixation - same tool called 8-25 times in a row.
+        # After 5 consecutive calls to the same tool, nudge the agent to diversify.
+        self._last_tool_name: str | None = None
+        self._same_tool_streak: int = 0
+        self._tool_fixation_threshold: int = 5
+
        # Centralized logging — agent.log (INFO+) and errors.log (WARNING+)
        # both live under ~/.hermes/logs/.  Idempotent, so gateway mode
        # (which creates a new AIAgent per message) won't duplicate handlers.
@@ -6238,6 +6251,12 @@ class AIAgent:
        def _run_tool(index, tool_call, function_name, function_args):
            """Worker function executed in a thread."""
            start = time.time()
+            # Poka-yoke #310: Tool fixation detection (concurrent path)
+            if function_name == self._last_tool_name:
+                self._same_tool_streak += 1
+            else:
+                self._last_tool_name = function_name
+                self._same_tool_streak = 1
            try:
                result = self._invoke_tool(function_name, function_args, effective_task_id, tool_call.id)
            except Exception as tool_error:
@@ -6288,6 +6307,13 @@ class AIAgent:
                if is_error:
                    result_preview = function_result[:200] if len(function_result) > 200 else function_result
                    logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
+                    # Circuit breaker: track consecutive errors
+                    self._consecutive_tool_errors += 1
+                    self._error_streak_tool_names.append(function_name)
+                else:
+                    # Reset circuit breaker on success
+                    self._consecutive_tool_errors = 0
+                    self._error_streak_tool_names = []

                if self.tool_progress_callback:
                    try:
@@ -6331,6 +6357,41 @@ class AIAgent:
            if subdir_hints:
                function_result += subdir_hints

+            # Circuit breaker: inject warning after 3+ consecutive errors
+            if self._consecutive_tool_errors >= 3:
+                streak_info = self._error_streak_tool_names[-self._consecutive_tool_errors:]
+                unique_tools = list(dict.fromkeys(streak_info))
+                if self._consecutive_tool_errors == 3:
+                    cb_msg = (
+                        f"\n\n⚠️ CIRCUIT BREAKER: You have had {self._consecutive_tool_errors} consecutive tool errors "
+                        f"({', '.join(unique_tools)}). Errors cascade — P(error|error) is 2.33x higher than normal. "
+                        f"Consider: (1) trying a different tool type, (2) using terminal to debug, "
+                        f"(3) simplifying your approach, or (4) asking the user for guidance."
+                    )
+                    function_result += cb_msg
+                elif self._consecutive_tool_errors == 6:
+                    cb_msg = (
+                        f"\n\n🛑 CIRCUIT BREAKER: {self._consecutive_tool_errors} consecutive errors. "
+                        f"The error cascade is severe. STOP retrying the same approach. "
+                        f"Use terminal to investigate, or switch strategies entirely."
+                    )
+                    function_result += cb_msg
+                elif self._consecutive_tool_errors >= 9 and self._consecutive_tool_errors % 3 == 0:
+                    cb_msg = (
+                        f"\n\n🔴 CIRCUIT BREAKER: {self._consecutive_tool_errors} consecutive errors. "
+                        f"Terminal is your only reliable recovery path. Use it now."
+                    )
+                    function_result += cb_msg
+
+            # Poka-yoke #310: Tool fixation nudge
+            if self._same_tool_streak >= self._tool_fixation_threshold and self._same_tool_streak % self._tool_fixation_threshold == 0:
+                fixation_msg = (
+                    f"\n\n🔄 TOOL FIXATION: You have called `{function_name}` {self._same_tool_streak} times consecutively. "
+                    f"Consider: (1) trying a different tool, (2) using `terminal` to verify your approach, "
+                    f"(3) stepping back to reassess the task."
+                )
+                function_result += fixation_msg
+
            # Append tool result message in order
            tool_msg = {
                "role": "tool",
@@ -6416,6 +6477,13 @@ class AIAgent:
            self._current_tool = function_name
            self._touch_activity(f"executing tool: {function_name}")

+            # Poka-yoke #310: Tool fixation detection
+            if function_name == self._last_tool_name:
+                self._same_tool_streak += 1
+            else:
+                self._last_tool_name = function_name
+                self._same_tool_streak = 1
+
            if self.tool_progress_callback:
                try:
                    preview = _build_tool_preview(function_name, function_args)
@@ -6609,8 +6677,14 @@ class AIAgent:
            _is_error_result, _ = _detect_tool_failure(function_name, function_result)
            if _is_error_result:
                logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
+                # Circuit breaker: track consecutive errors
+                self._consecutive_tool_errors += 1
+                self._error_streak_tool_names.append(function_name)
            else:
                logger.info("tool %s completed (%.2fs, %d chars)", function_name, tool_duration, len(function_result))
+                # Reset circuit breaker on success
+                self._consecutive_tool_errors = 0
+                self._error_streak_tool_names = []

            if self.tool_progress_callback:
                try:
@@ -6642,6 +6716,41 @@ class AIAgent:
            if subdir_hints:
                function_result += subdir_hints

+            # Circuit breaker: inject warning after 3+ consecutive errors
+            if self._consecutive_tool_errors >= 3:
+                streak_info = self._error_streak_tool_names[-self._consecutive_tool_errors:]
+                unique_tools = list(dict.fromkeys(streak_info))  # preserve order, deduplicate
+                if self._consecutive_tool_errors == 3:
+                    cb_msg = (
+                        f"\n\n⚠️ CIRCUIT BREAKER: You have had {self._consecutive_tool_errors} consecutive tool errors "
+                        f"({', '.join(unique_tools)}). Errors cascade — P(error|error) is 2.33x higher than normal. "
+                        f"Consider: (1) trying a different tool type, (2) using terminal to debug, "
+                        f"(3) simplifying your approach, or (4) asking the user for guidance."
+                    )
+                    function_result += cb_msg
+                elif self._consecutive_tool_errors == 6:
+                    cb_msg = (
+                        f"\n\n🛑 CIRCUIT BREAKER: {self._consecutive_tool_errors} consecutive errors. "
+                        f"The error cascade is severe. STOP retrying the same approach. "
+                        f"Use terminal to investigate, or switch strategies entirely."
+                    )
+                    function_result += cb_msg
+                elif self._consecutive_tool_errors >= 9 and self._consecutive_tool_errors % 3 == 0:
+                    cb_msg = (
+                        f"\n\n🔴 CIRCUIT BREAKER: {self._consecutive_tool_errors} consecutive errors. "
+                        f"Terminal is your only reliable recovery path. Use it now."
+                    )
+                    function_result += cb_msg
+
+            # Poka-yoke #310: Tool fixation nudge
+            if self._same_tool_streak >= self._tool_fixation_threshold and self._same_tool_streak % self._tool_fixation_threshold == 0:
+                fixation_msg = (
+                    f"\n\n🔄 TOOL FIXATION: You have called `{function_name}` {self._same_tool_streak} times consecutively. "
+                    f"Consider: (1) trying a different tool, (2) using `terminal` to verify your approach, "
+                    f"(3) stepping back to reassess the task."
+                )
+                function_result += fixation_msg
+
            tool_msg = {
                "role": "tool",
                "content": function_result,
--- a/tests/hermes_cli/test_config_validation.py
+++ b/tests/hermes_cli/test_config_validation.py
@@ -172,3 +172,111 @@ class TestConfigIssueDataclass:
        a = ConfigIssue("error", "msg", "hint")
        b = ConfigIssue("error", "msg", "hint")
        assert a == b
+
+
+class TestFallbackProvidersValidation:
+    """fallback_providers must be a list of dicts with provider + model."""
+
+    def test_non_list(self):
+        """fallback_providers as string should error."""
+        issues = validate_config_structure({
+            "fallback_providers": "openrouter:google/gemini-3-flash-preview",
+        })
+        errors = [i for i in issues if i.severity == "error"]
+        assert any("fallback_providers" in i.message and "list" in i.message for i in errors)
+
+    def test_dict_instead_of_list(self):
+        """fallback_providers as dict should error."""
+        issues = validate_config_structure({
+            "fallback_providers": {"provider": "openrouter", "model": "test"},
+        })
+        errors = [i for i in issues if i.severity == "error"]
+        assert any("fallback_providers" in i.message and "dict" in i.message for i in errors)
+
+    def test_entry_missing_provider(self):
+        """Entry without provider should warn."""
+        issues = validate_config_structure({
+            "fallback_providers": [{"model": "google/gemini-3-flash-preview"}],
+        })
+        assert any("missing 'provider'" in i.message for i in issues)
+
+    def test_entry_missing_model(self):
+        """Entry without model should warn."""
+        issues = validate_config_structure({
+            "fallback_providers": [{"provider": "openrouter"}],
+        })
+        assert any("missing 'model'" in i.message for i in issues)
+
+    def test_entry_not_dict(self):
+        """Non-dict entries should warn."""
+        issues = validate_config_structure({
+            "fallback_providers": ["not-a-dict"],
+        })
+        assert any("not a dict" in i.message for i in issues)
+
+    def test_valid_entries(self):
+        """Valid fallback_providers should produce no fallback-related issues."""
+        issues = validate_config_structure({
+            "fallback_providers": [
+                {"provider": "openrouter", "model": "google/gemini-3-flash-preview"},
+                {"provider": "gemini", "model": "gemini-2.5-flash"},
+            ],
+        })
+        fb_issues = [i for i in issues if "fallback_providers" in i.message]
+        assert len(fb_issues) == 0
+
+    def test_empty_list_no_issues(self):
+        """Empty list is valid (fallback disabled)."""
+        issues = validate_config_structure({
+            "fallback_providers": [],
+        })
+        fb_issues = [i for i in issues if "fallback_providers" in i.message]
+        assert len(fb_issues) == 0
+
+
+class TestSessionResetValidation:
+    """session_reset.idle_minutes must be positive."""
+
+    def test_zero_idle_minutes(self):
+        """idle_minutes=0 should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": 0},
+        })
+        assert any("idle_minutes=0" in i.message for i in issues)
+
+    def test_negative_idle_minutes(self):
+        """idle_minutes=-5 should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": -5},
+        })
+        assert any("idle_minutes=-5" in i.message for i in issues)
+
+    def test_string_idle_minutes(self):
+        """idle_minutes as string should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": "abc"},
+        })
+        assert any("idle_minutes=" in i.message for i in issues)
+
+    def test_valid_idle_minutes(self):
+        """Valid idle_minutes should not warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": 1440},
+        })
+        idle_issues = [i for i in issues if "idle_minutes" in i.message]
+        assert len(idle_issues) == 0
+
+    def test_invalid_at_hour(self):
+        """at_hour=25 should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"at_hour": 25},
+        })
+        assert any("at_hour=25" in i.message for i in issues)
+
+    def test_valid_at_hour(self):
+        """Valid at_hour should not warn."""
+        issues = validate_config_structure({
+            "session_reset": {"at_hour": 4},
+        })
+        hour_issues = [i for i in issues if "at_hour" in i.message]
+        assert len(hour_issues) == 0