feat: config structure validation — detect malformed YAML at startup (#5426)

Add validate_config_structure() that catches common config.yaml mistakes: - custom_providers as dict instead of list (missing '-' in YAML) - fallback_model accidentally nested inside another section - custom_providers entries missing required fields (name, base_url) - Missing model section when custom_providers is configured - Root-level keys that look like misplaced custom_providers fields Surface these diagnostics at three levels: 1. Startup: print_config_warnings() runs at CLI and gateway module load, so users see issues before hitting cryptic errors 2. Error time: 'Unknown provider' errors in auth.py and model_switch.py now include config diagnostics with fix suggestions 3. Doctor: 'hermes doctor' shows a Config Structure section with all issues and fix hints Also adds a warning log in runtime_provider.py when custom_providers is a dict (previously returned None silently). Motivated by a Discord user who had malformed custom_providers YAML and got only 'Unknown Provider' with no guidance on what was wrong. 17 new tests covering all validation paths.
2026-04-05 23:31:20 -07:00
parent 9ca954a274
commit dce5f51c7c
8 changed files with 443 additions and 9 deletions
--- a/cli.py
+++ b/cli.py
@@ -453,6 +453,13 @@ def load_cli_config() -> Dict[str, Any]:
 # Load configuration at module startup
 CLI_CONFIG = load_cli_config()

+# Validate config structure early — print warnings before user hits cryptic errors
+try:
+    from hermes_cli.config import print_config_warnings
+    print_config_warnings()
+except Exception:
+    pass
+
 # Initialize the skin engine from config
 try:
    from hermes_cli.skin_engine import init_skin_from_config
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -200,6 +200,13 @@ if _config_path.exists():
    except Exception:
        pass  # Non-fatal; gateway can still run with .env values

+# Validate config structure early — log warnings so gateway operators see problems
+try:
+    from hermes_cli.config import print_config_warnings
+    print_config_warnings()
+except Exception:
+    pass
+
 # Gateway runs in quiet mode - suppress debug output and use cwd directly (no temp dirs)
 os.environ["HERMES_QUIET"] = "1"

--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@@ -711,6 +711,32 @@ def deactivate_provider() -> None:
 # Provider Resolution — picks which provider to use
 # =============================================================================

+
+def _get_config_hint_for_unknown_provider(provider_name: str) -> str:
+    """Return a helpful hint string when provider resolution fails.
+
+    Checks for common config.yaml mistakes (malformed custom_providers, etc.)
+    and returns a human-readable diagnostic, or empty string if nothing found.
+    """
+    try:
+        from hermes_cli.config import validate_config_structure
+        issues = validate_config_structure()
+        if not issues:
+            return ""
+
+        lines = ["Config issue detected — run 'hermes doctor' for full diagnostics:"]
+        for ci in issues:
+            prefix = "ERROR" if ci.severity == "error" else "WARNING"
+            lines.append(f"  [{prefix}] {ci.message}")
+            # Show first line of hint
+            first_hint = ci.hint.splitlines()[0] if ci.hint else ""
+            if first_hint:
+                lines.append(f"    → {first_hint}")
+        return "\n".join(lines)
+    except Exception:
+        return ""
+
+
 def resolve_provider(
    requested: Optional[str] = None,
    *,
@@ -757,10 +783,14 @@ def resolve_provider(
    if normalized in PROVIDER_REGISTRY:
        return normalized
    if normalized != "auto":
-        raise AuthError(
-            f"Unknown provider '{normalized}'.",
-            code="invalid_provider",
-        )
+        # Check for common config.yaml issues that cause this error
+        _config_hint = _get_config_hint_for_unknown_provider(normalized)
+        msg = f"Unknown provider '{normalized}'."
+        if _config_hint:
+            msg += f"\n\n{_config_hint}"
+        else:
+            msg += " Check 'hermes model' for available providers, or run 'hermes doctor' to diagnose config issues."
+        raise AuthError(msg, code="invalid_provider")

    # Explicit one-off CLI creds always mean openrouter/custom
    if explicit_api_key or explicit_base_url:
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -19,6 +19,7 @@ import stat
 import subprocess
 import sys
 import tempfile
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Any, Optional, List, Tuple

@@ -1243,6 +1244,182 @@ def check_config_version() -> Tuple[int, int]:
    return current, latest


+# =============================================================================
+# Config structure validation
+# =============================================================================
+
+# Fields that are valid at root level of config.yaml
+_KNOWN_ROOT_KEYS = {
+    "_config_version", "model", "providers", "fallback_model",
+    "fallback_providers", "credential_pool_strategies", "toolsets",
+    "agent", "terminal", "display", "compression", "delegation",
+    "auxiliary", "custom_providers", "memory", "gateway",
+}
+
+# Valid fields inside a custom_providers list entry
+_VALID_CUSTOM_PROVIDER_FIELDS = {
+    "name", "base_url", "api_key", "api_mode", "models",
+    "context_length", "rate_limit_delay",
+}
+
+# Fields that look like they should be inside custom_providers, not at root
+_CUSTOM_PROVIDER_LIKE_FIELDS = {"base_url", "api_key", "rate_limit_delay", "api_mode"}
+
+
+@dataclass
+class ConfigIssue:
+    """A detected config structure problem."""
+
+    severity: str  # "error", "warning"
+    message: str
+    hint: str
+
+
+def validate_config_structure(config: Optional[Dict[str, Any]] = None) -> List["ConfigIssue"]:
+    """Validate config.yaml structure and return a list of detected issues.
+
+    Catches common YAML formatting mistakes that produce confusing runtime
+    errors (like "Unknown provider") instead of clear diagnostics.
+
+    Can be called with a pre-loaded config dict, or will load from disk.
+    """
+    if config is None:
+        try:
+            config = load_config()
+        except Exception:
+            return [ConfigIssue("error", "Could not load config.yaml", "Run 'hermes setup' to create a valid config")]
+
+    issues: List[ConfigIssue] = []
+
+    # ── custom_providers must be a list, not a dict ──────────────────────
+    cp = config.get("custom_providers")
+    if cp is not None:
+        if isinstance(cp, dict):
+            issues.append(ConfigIssue(
+                "error",
+                "custom_providers is a dict — it must be a YAML list (items prefixed with '-')",
+                "Change to:\n"
+                "  custom_providers:\n"
+                "    - name: my-provider\n"
+                "      base_url: https://...\n"
+                "      api_key: ...",
+            ))
+            # Check if dict keys look like they should be list-entry fields
+            cp_keys = set(cp.keys()) if isinstance(cp, dict) else set()
+            suspicious = cp_keys & _CUSTOM_PROVIDER_LIKE_FIELDS
+            if suspicious:
+                issues.append(ConfigIssue(
+                    "warning",
+                    f"Root-level keys {sorted(suspicious)} look like custom_providers entry fields",
+                    "These should be indented under a '- name: ...' list entry, not at root level",
+                ))
+        elif isinstance(cp, list):
+            # Validate each entry in the list
+            for i, entry in enumerate(cp):
+                if not isinstance(entry, dict):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"custom_providers[{i}] is not a dict (got {type(entry).__name__})",
+                        "Each entry should have at minimum: name, base_url",
+                    ))
+                    continue
+                if not entry.get("name"):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"custom_providers[{i}] is missing 'name' field",
+                        "Add a name, e.g.: name: my-provider",
+                    ))
+                if not entry.get("base_url"):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"custom_providers[{i}] is missing 'base_url' field",
+                        "Add the API endpoint URL, e.g.: base_url: https://api.example.com/v1",
+                    ))
+
+    # ── fallback_model must be a top-level dict with provider + model ────
+    fb = config.get("fallback_model")
+    if fb is not None:
+        if not isinstance(fb, dict):
+            issues.append(ConfigIssue(
+                "error",
+                f"fallback_model should be a dict with 'provider' and 'model', got {type(fb).__name__}",
+                "Change to:\n"
+                "  fallback_model:\n"
+                "    provider: openrouter\n"
+                "    model: anthropic/claude-sonnet-4",
+            ))
+        elif fb:
+            if not fb.get("provider"):
+                issues.append(ConfigIssue(
+                    "warning",
+                    "fallback_model is missing 'provider' field — fallback will be disabled",
+                    "Add: provider: openrouter (or another provider)",
+                ))
+            if not fb.get("model"):
+                issues.append(ConfigIssue(
+                    "warning",
+                    "fallback_model is missing 'model' field — fallback will be disabled",
+                    "Add: model: anthropic/claude-sonnet-4 (or another model)",
+                ))
+
+    # ── Check for fallback_model accidentally nested inside custom_providers ──
+    if isinstance(cp, dict) and "fallback_model" not in config and "fallback_model" in (cp or {}):
+        issues.append(ConfigIssue(
+            "error",
+            "fallback_model appears inside custom_providers instead of at root level",
+            "Move fallback_model to the top level of config.yaml (no indentation)",
+        ))
+
+    # ── model section: should exist when custom_providers is configured ──
+    model_cfg = config.get("model")
+    if cp and not model_cfg:
+        issues.append(ConfigIssue(
+            "warning",
+            "custom_providers defined but no 'model' section — Hermes won't know which provider to use",
+            "Add a model section:\n"
+            "  model:\n"
+            "    provider: custom\n"
+            "    default: your-model-name\n"
+            "    base_url: https://...",
+        ))
+
+    # ── Root-level keys that look misplaced ──────────────────────────────
+    for key in config:
+        if key.startswith("_"):
+            continue
+        if key not in _KNOWN_ROOT_KEYS and key in _CUSTOM_PROVIDER_LIKE_FIELDS:
+            issues.append(ConfigIssue(
+                "warning",
+                f"Root-level key '{key}' looks misplaced — should it be under 'model:' or inside a 'custom_providers' entry?",
+                f"Move '{key}' under the appropriate section",
+            ))
+
+    return issues
+
+
+def print_config_warnings(config: Optional[Dict[str, Any]] = None) -> None:
+    """Print config structure warnings to stderr at startup.
+
+    Called early in CLI and gateway init so users see problems before
+    they hit cryptic "Unknown provider" errors.  Prints nothing if
+    config is healthy.
+    """
+    try:
+        issues = validate_config_structure(config)
+    except Exception:
+        return
+    if not issues:
+        return
+
+    import sys
+    lines = ["\033[33m⚠ Config issues detected in config.yaml:\033[0m"]
+    for ci in issues:
+        marker = "\033[31m✗\033[0m" if ci.severity == "error" else "\033[33m⚠\033[0m"
+        lines.append(f"  {marker} {ci.message}")
+    lines.append("  \033[2mRun 'hermes doctor' for fix suggestions.\033[0m")
+    sys.stderr.write("\n".join(lines) + "\n\n")
+
+
 def migrate_config(interactive: bool = True, quiet: bool = False) -> Dict[str, Any]:
    """
    Migrate config to latest version, prompting for new required fields.
--- a/hermes_cli/doctor.py
+++ b/hermes_cli/doctor.py
@@ -318,6 +318,25 @@ def run_doctor(args):
        except Exception:
            pass

+        # Validate config structure (catches malformed custom_providers, etc.)
+        try:
+            from hermes_cli.config import validate_config_structure
+            config_issues = validate_config_structure()
+            if config_issues:
+                print()
+                print(color("◆ Config Structure", Colors.CYAN, Colors.BOLD))
+                for ci in config_issues:
+                    if ci.severity == "error":
+                        check_fail(ci.message)
+                    else:
+                        check_warn(ci.message)
+                    # Show the hint indented
+                    for hint_line in ci.hint.splitlines():
+                        check_info(hint_line)
+                    issues.append(ci.message)
+        except Exception:
+            pass
+
    # =========================================================================
    # Check: Auth providers
    # =========================================================================
--- a/hermes_cli/model_switch.py
+++ b/hermes_cli/model_switch.py
@@ -419,14 +419,25 @@ def switch_model(
        # Resolve the provider
        pdef = resolve_provider_full(explicit_provider, user_providers)
        if pdef is None:
+            _switch_err = (
+                f"Unknown provider '{explicit_provider}'. "
+                f"Check 'hermes model' for available providers, or define it "
+                f"in config.yaml under 'providers:'."
+            )
+            # Check for common config issues that cause provider resolution failures
+            try:
+                from hermes_cli.config import validate_config_structure
+                _cfg_issues = validate_config_structure()
+                if _cfg_issues:
+                    _switch_err += "\n\nRun 'hermes doctor' — config issues detected:"
+                    for _ci in _cfg_issues[:3]:
+                        _switch_err += f"\n  • {_ci.message}"
+            except Exception:
+                pass
            return ModelSwitchResult(
                success=False,
                is_global=is_global,
-                error_message=(
-                    f"Unknown provider '{explicit_provider}'. "
-                    f"Check 'hermes model' for available providers, or define it "
-                    f"in config.yaml under 'providers:'."
-                ),
+                error_message=_switch_err,
            )

        target_provider = pdef.id
--- a/hermes_cli/runtime_provider.py
+++ b/hermes_cli/runtime_provider.py
@@ -2,10 +2,13 @@

 from __future__ import annotations

+import logging
 import os
 import re
 from typing import Any, Dict, Optional

+logger = logging.getLogger(__name__)
+
 from hermes_cli import auth as auth_mod
 from agent.credential_pool import CredentialPool, PooledCredential, get_custom_provider_pool_key, load_pool
 from hermes_cli.auth import (
@@ -258,6 +261,12 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An
    config = load_config()
    custom_providers = config.get("custom_providers")
    if not isinstance(custom_providers, list):
+        if isinstance(custom_providers, dict):
+            logger.warning(
+                "custom_providers in config.yaml is a dict, not a list. "
+                "Each entry must be prefixed with '-' in YAML. "
+                "Run 'hermes doctor' for details."
+            )
        return None

    for entry in custom_providers:
--- a/tests/hermes_cli/test_config_validation.py
+++ b/tests/hermes_cli/test_config_validation.py
@@ -0,0 +1,174 @@
+"""Tests for config.yaml structure validation (validate_config_structure)."""
+
+import pytest
+
+from hermes_cli.config import validate_config_structure, ConfigIssue
+
+
+class TestCustomProvidersValidation:
+    """custom_providers must be a YAML list, not a dict."""
+
+    def test_dict_instead_of_list(self):
+        """The exact Discord user scenario — custom_providers as flat dict."""
+        issues = validate_config_structure({
+            "custom_providers": {
+                "name": "Generativelanguage.googleapis.com",
+                "base_url": "https://generativelanguage.googleapis.com/v1beta/openai",
+                "api_key": "xxx",
+                "model": "models/gemini-2.5-flash",
+                "rate_limit_delay": 2.0,
+                "fallback_model": {
+                    "provider": "openrouter",
+                    "model": "qwen/qwen3.6-plus:free",
+                },
+            },
+            "fallback_providers": [],
+        })
+        errors = [i for i in issues if i.severity == "error"]
+        assert any("dict" in i.message and "list" in i.message for i in errors), (
+            "Should detect custom_providers as dict instead of list"
+        )
+
+    def test_dict_detects_misplaced_fields(self):
+        """When custom_providers is a dict, detect fields that look misplaced."""
+        issues = validate_config_structure({
+            "custom_providers": {
+                "name": "test",
+                "base_url": "https://example.com",
+                "api_key": "xxx",
+            },
+        })
+        warnings = [i for i in issues if i.severity == "warning"]
+        # Should flag base_url, api_key as looking like custom_providers entry fields
+        misplaced = [i for i in warnings if "custom_providers entry fields" in i.message]
+        assert len(misplaced) == 1
+
+    def test_dict_detects_nested_fallback(self):
+        """When fallback_model gets swallowed into custom_providers dict."""
+        issues = validate_config_structure({
+            "custom_providers": {
+                "name": "test",
+                "fallback_model": {"provider": "openrouter", "model": "test"},
+            },
+        })
+        errors = [i for i in issues if i.severity == "error"]
+        assert any("fallback_model" in i.message and "inside" in i.message for i in errors)
+
+    def test_valid_list_no_issues(self):
+        """Properly formatted custom_providers should produce no issues."""
+        issues = validate_config_structure({
+            "custom_providers": [
+                {"name": "gemini", "base_url": "https://example.com/v1"},
+            ],
+            "model": {"provider": "custom", "default": "test"},
+        })
+        assert len(issues) == 0
+
+    def test_list_entry_missing_name(self):
+        """List entry without name should warn."""
+        issues = validate_config_structure({
+            "custom_providers": [{"base_url": "https://example.com/v1"}],
+            "model": {"provider": "custom"},
+        })
+        assert any("missing 'name'" in i.message for i in issues)
+
+    def test_list_entry_missing_base_url(self):
+        """List entry without base_url should warn."""
+        issues = validate_config_structure({
+            "custom_providers": [{"name": "test"}],
+            "model": {"provider": "custom"},
+        })
+        assert any("missing 'base_url'" in i.message for i in issues)
+
+    def test_list_entry_not_dict(self):
+        """Non-dict list entries should warn."""
+        issues = validate_config_structure({
+            "custom_providers": ["not-a-dict"],
+            "model": {"provider": "custom"},
+        })
+        assert any("not a dict" in i.message for i in issues)
+
+    def test_none_custom_providers_no_issues(self):
+        """No custom_providers at all should be fine."""
+        issues = validate_config_structure({
+            "model": {"provider": "openrouter"},
+        })
+        assert len(issues) == 0
+
+
+class TestFallbackModelValidation:
+    """fallback_model should be a top-level dict with provider + model."""
+
+    def test_missing_provider(self):
+        issues = validate_config_structure({
+            "fallback_model": {"model": "anthropic/claude-sonnet-4"},
+        })
+        assert any("missing 'provider'" in i.message for i in issues)
+
+    def test_missing_model(self):
+        issues = validate_config_structure({
+            "fallback_model": {"provider": "openrouter"},
+        })
+        assert any("missing 'model'" in i.message for i in issues)
+
+    def test_valid_fallback(self):
+        issues = validate_config_structure({
+            "fallback_model": {
+                "provider": "openrouter",
+                "model": "anthropic/claude-sonnet-4",
+            },
+        })
+        # Only fallback-related issues should be absent
+        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
+        assert len(fb_issues) == 0
+
+    def test_non_dict_fallback(self):
+        issues = validate_config_structure({
+            "fallback_model": "openrouter:anthropic/claude-sonnet-4",
+        })
+        assert any("should be a dict" in i.message for i in issues)
+
+    def test_empty_fallback_dict_no_issues(self):
+        """Empty fallback_model dict means disabled — no warnings needed."""
+        issues = validate_config_structure({
+            "fallback_model": {},
+        })
+        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
+        assert len(fb_issues) == 0
+
+
+class TestMissingModelSection:
+    """Warn when custom_providers exists but model section is missing."""
+
+    def test_custom_providers_without_model(self):
+        issues = validate_config_structure({
+            "custom_providers": [
+                {"name": "test", "base_url": "https://example.com/v1"},
+            ],
+        })
+        assert any("no 'model' section" in i.message for i in issues)
+
+    def test_custom_providers_with_model(self):
+        issues = validate_config_structure({
+            "custom_providers": [
+                {"name": "test", "base_url": "https://example.com/v1"},
+            ],
+            "model": {"provider": "custom", "default": "test-model"},
+        })
+        # Should not warn about missing model section
+        assert not any("no 'model' section" in i.message for i in issues)
+
+
+class TestConfigIssueDataclass:
+    """ConfigIssue should be a proper dataclass."""
+
+    def test_fields(self):
+        issue = ConfigIssue(severity="error", message="test msg", hint="test hint")
+        assert issue.severity == "error"
+        assert issue.message == "test msg"
+        assert issue.hint == "test hint"
+
+    def test_equality(self):
+        a = ConfigIssue("error", "msg", "hint")
+        b = ConfigIssue("error", "msg", "hint")
+        assert a == b