feat: add Hugging Face as a first-class inference provider (#3419)

Salvage of PR #1747 (original PR #1171 by @davanstrien) onto current main. Registers Hugging Face Inference Providers (router.huggingface.co/v1) as a named provider: - hermes chat --provider huggingface (or --provider hf) - 18 curated open models via hermes model picker - HF_TOKEN in ~/.hermes/.env - OpenAI-compatible endpoint with automatic failover (Groq, Together, SambaNova, etc.) Files: auth.py, models.py, main.py, setup.py, config.py, model_metadata.py, .env.example, 5 docs pages, 17 new tests. Co-authored-by: Daniel van Strien <davanstrien@gmail.com>
2026-03-27 12:41:59 -07:00
parent f57ebf52e9
commit fd8c465e42
13 changed files with 240 additions and 8 deletions
--- a/.env.example
+++ b/.env.example
@@ -59,6 +59,15 @@ OPENCODE_ZEN_API_KEY=
 # OpenCode Go provides access to open models (GLM-5, Kimi K2.5, MiniMax M2.5)
 # $10/month subscription. Get your key at: https://opencode.ai/auth
 OPENCODE_GO_API_KEY=
+
+# =============================================================================
+# LLM PROVIDER (Hugging Face Inference Providers)
+# =============================================================================
+# Hugging Face routes to 20+ open models via unified OpenAI-compatible endpoint.
+# Free tier included ($0.10/month), no markup on provider rates.
+# Get your token at: https://huggingface.co/settings/tokens
+# Required permission: "Make calls to Inference Providers"
+HF_TOKEN=
 # OPENCODE_GO_BASE_URL=https://opencode.ai/zen/go/v1  # Override default base URL

 # =============================================================================
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -113,6 +113,25 @@ DEFAULT_CONTEXT_LENGTHS = {
    "glm": 202752,
    # Kimi
    "kimi": 262144,
+    # Hugging Face Inference Providers — model IDs use org/name format
+    "Qwen/Qwen3.5-397B-A17B": 131072,
+    "Qwen/Qwen3-235B-A22B-Thinking-2507": 131072,
+    "Qwen/Qwen3-Coder-480B-A35B-Instruct": 131072,
+    "Qwen/Qwen3-Coder-Next": 131072,
+    "Qwen/Qwen3-Next-80B-A3B-Instruct": 131072,
+    "Qwen/Qwen3-Next-80B-A3B-Thinking": 131072,
+    "deepseek-ai/DeepSeek-R1-0528": 65536,
+    "deepseek-ai/DeepSeek-V3.2": 65536,
+    "moonshotai/Kimi-K2-Instruct": 262144,
+    "moonshotai/Kimi-K2-Instruct-0905": 262144,
+    "moonshotai/Kimi-K2.5": 262144,
+    "moonshotai/Kimi-K2-Thinking": 262144,
+    "MiniMaxAI/MiniMax-M2.5": 204800,
+    "MiniMaxAI/MiniMax-M2.1": 204800,
+    "XiaomiMiMo/MiMo-V2-Flash": 32768,
+    "zai-org/GLM-5": 202752,
+    "zai-org/GLM-4.7": 202752,
+    "zai-org/GLM-4.7-Flash": 202752,
 }

 _CONTEXT_LENGTH_KEYS = (
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@@ -212,6 +212,14 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
        api_key_env_vars=("KILOCODE_API_KEY",),
        base_url_env_var="KILOCODE_BASE_URL",
    ),
+    "huggingface": ProviderConfig(
+        id="huggingface",
+        name="Hugging Face",
+        auth_type="api_key",
+        inference_base_url="https://router.huggingface.co/v1",
+        api_key_env_vars=("HF_TOKEN",),
+        base_url_env_var="HF_BASE_URL",
+    ),
 }


@@ -685,6 +693,7 @@ def resolve_provider(
        "github-copilot-acp": "copilot-acp", "copilot-acp-agent": "copilot-acp",
        "aigateway": "ai-gateway", "vercel": "ai-gateway", "vercel-ai-gateway": "ai-gateway",
        "opencode": "opencode-zen", "zen": "opencode-zen",
+        "hf": "huggingface", "hugging-face": "huggingface", "huggingface-hub": "huggingface",
        "go": "opencode-go", "opencode-go-sub": "opencode-go",
        "kilo": "kilocode", "kilo-code": "kilocode", "kilo-gateway": "kilocode",
    }
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -593,6 +593,21 @@ OPTIONAL_ENV_VARS = {
        "category": "provider",
        "advanced": True,
    },
+    "HF_TOKEN": {
+        "description": "Hugging Face token for Inference Providers (20+ open models via router.huggingface.co)",
+        "prompt": "Hugging Face Token",
+        "url": "https://huggingface.co/settings/tokens",
+        "password": True,
+        "category": "provider",
+    },
+    "HF_BASE_URL": {
+        "description": "Hugging Face Inference Providers base URL override",
+        "prompt": "HF base URL (leave empty for default)",
+        "url": None,
+        "password": False,
+        "category": "provider",
+        "advanced": True,
+    },

    # ── Tool API keys ──
    "PARALLEL_API_KEY": {
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -795,6 +795,7 @@ def cmd_model(args):
        "ai-gateway": "AI Gateway",
        "kilocode": "Kilo Code",
        "alibaba": "Alibaba Cloud (DashScope)",
+        "huggingface": "Hugging Face",
        "custom": "Custom endpoint",
    }
    active_label = provider_labels.get(active, active)
@@ -821,6 +822,7 @@ def cmd_model(args):
        ("opencode-go", "OpenCode Go (open models, $10/month subscription)"),
        ("ai-gateway", "AI Gateway (Vercel — 200+ models, pay-per-use)"),
        ("alibaba", "Alibaba Cloud / DashScope (Qwen models, Anthropic-compatible)"),
+        ("huggingface", "Hugging Face Inference Providers (20+ open models)"),
    ]

    # Add user-defined custom providers from config.yaml
@@ -893,7 +895,7 @@ def cmd_model(args):
        _model_flow_anthropic(config, current_model)
    elif selected_provider == "kimi-coding":
        _model_flow_kimi(config, current_model)
-    elif selected_provider in ("zai", "minimax", "minimax-cn", "kilocode", "opencode-zen", "opencode-go", "ai-gateway", "alibaba"):
+    elif selected_provider in ("zai", "minimax", "minimax-cn", "kilocode", "opencode-zen", "opencode-go", "ai-gateway", "alibaba", "huggingface"):
        _model_flow_api_key_provider(config, selected_provider, current_model)


@@ -1502,6 +1504,27 @@ _PROVIDER_MODELS = {
        "google/gemini-3-pro-preview",
        "google/gemini-3-flash-preview",
    ],
+    # Curated model list sourced from https://models.dev (huggingface provider)
+    "huggingface": [
+        "Qwen/Qwen3.5-397B-A17B",
+        "Qwen/Qwen3-235B-A22B-Thinking-2507",
+        "Qwen/Qwen3-Coder-480B-A35B-Instruct",
+        "Qwen/Qwen3-Coder-Next",
+        "Qwen/Qwen3-Next-80B-A3B-Instruct",
+        "Qwen/Qwen3-Next-80B-A3B-Thinking",
+        "deepseek-ai/DeepSeek-R1-0528",
+        "deepseek-ai/DeepSeek-V3.2",
+        "moonshotai/Kimi-K2-Instruct",
+        "moonshotai/Kimi-K2-Instruct-0905",
+        "moonshotai/Kimi-K2.5",
+        "moonshotai/Kimi-K2-Thinking",
+        "MiniMaxAI/MiniMax-M2.5",
+        "MiniMaxAI/MiniMax-M2.1",
+        "XiaomiMiMo/MiMo-V2-Flash",
+        "zai-org/GLM-5",
+        "zai-org/GLM-4.7",
+        "zai-org/GLM-4.7-Flash",
+    ],
 }


@@ -3122,7 +3145,7 @@ For more help on a command:
    )
    chat_parser.add_argument(
        "--provider",
-        choices=["auto", "openrouter", "nous", "openai-codex", "copilot-acp", "copilot", "anthropic", "zai", "kimi-coding", "minimax", "minimax-cn", "kilocode"],
+        choices=["auto", "openrouter", "nous", "openai-codex", "copilot-acp", "copilot", "anthropic", "huggingface", "zai", "kimi-coding", "minimax", "minimax-cn", "kilocode"],
        default=None,
        help="Inference provider (default: auto)"
    )
--- a/hermes_cli/models.py
+++ b/hermes_cli/models.py
@@ -217,6 +217,28 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "qwen3.5-flash",
        "qwen-vl-max",
    ],
+    # Curated model list for Hugging Face Inference Providers
+    # sourced from https://models.dev (huggingface provider)
+    "huggingface": [
+        "Qwen/Qwen3.5-397B-A17B",
+        "Qwen/Qwen3-235B-A22B-Thinking-2507",
+        "Qwen/Qwen3-Coder-480B-A35B-Instruct",
+        "Qwen/Qwen3-Coder-Next",
+        "Qwen/Qwen3-Next-80B-A3B-Instruct",
+        "Qwen/Qwen3-Next-80B-A3B-Thinking",
+        "deepseek-ai/DeepSeek-R1-0528",
+        "deepseek-ai/DeepSeek-V3.2",
+        "moonshotai/Kimi-K2-Instruct",
+        "moonshotai/Kimi-K2-Instruct-0905",
+        "moonshotai/Kimi-K2.5",
+        "moonshotai/Kimi-K2-Thinking",
+        "MiniMaxAI/MiniMax-M2.5",
+        "MiniMaxAI/MiniMax-M2.1",
+        "XiaomiMiMo/MiMo-V2-Flash",
+        "zai-org/GLM-5",
+        "zai-org/GLM-4.7",
+        "zai-org/GLM-4.7-Flash",
+    ],
 }

 _PROVIDER_LABELS = {
@@ -236,6 +258,7 @@ _PROVIDER_LABELS = {
    "ai-gateway": "AI Gateway",
    "kilocode": "Kilo Code",
    "alibaba": "Alibaba Cloud (DashScope)",
+    "huggingface": "Hugging Face",
    "custom": "Custom endpoint",
 }

@@ -271,6 +294,9 @@ _PROVIDER_ALIASES = {
    "aliyun": "alibaba",
    "qwen": "alibaba",
    "alibaba-cloud": "alibaba",
+    "hf": "huggingface",
+    "hugging-face": "huggingface",
+    "huggingface-hub": "huggingface",
 }


@@ -304,7 +330,7 @@ def list_available_providers() -> list[dict[str, str]]:
    # Canonical providers in display order
    _PROVIDER_ORDER = [
        "openrouter", "nous", "openai-codex", "copilot", "copilot-acp",
-        "zai", "kimi-coding", "minimax", "minimax-cn", "kilocode", "anthropic", "alibaba",
+        "huggingface", "zai", "kimi-coding", "minimax", "minimax-cn", "kilocode", "anthropic", "alibaba",
        "opencode-zen", "opencode-go",
        "ai-gateway", "deepseek", "custom",
    ]
--- a/hermes_cli/setup.py
+++ b/hermes_cli/setup.py
@@ -80,6 +80,11 @@ _DEFAULT_PROVIDER_MODELS = {
    "minimax-cn": ["MiniMax-M2.7", "MiniMax-M2.7-highspeed", "MiniMax-M2.5", "MiniMax-M2.5-highspeed", "MiniMax-M2.1"],
    "ai-gateway": ["anthropic/claude-opus-4.6", "anthropic/claude-sonnet-4.6", "openai/gpt-5", "google/gemini-3-flash"],
    "kilocode": ["anthropic/claude-opus-4.6", "anthropic/claude-sonnet-4.6", "openai/gpt-5.4", "google/gemini-3-pro-preview", "google/gemini-3-flash-preview"],
+    "huggingface": [
+        "Qwen/Qwen3.5-397B-A17B", "Qwen/Qwen3-235B-A22B-Thinking-2507",
+        "Qwen/Qwen3-Coder-480B-A35B-Instruct", "deepseek-ai/DeepSeek-R1-0528",
+        "deepseek-ai/DeepSeek-V3.2", "moonshotai/Kimi-K2.5",
+    ],
 }


@@ -884,6 +889,7 @@ def setup_model_provider(config: dict):
        "OpenCode Go (open models, $10/month subscription)",
        "GitHub Copilot (uses GITHUB_TOKEN or gh auth token)",
        "GitHub Copilot ACP (spawns `copilot --acp --stdio`)",
+        "Hugging Face Inference Providers (20+ open models)",
    ]
    if keep_label:
        provider_choices.append(keep_label)
@@ -1528,7 +1534,26 @@ def setup_model_provider(config: dict):
        _set_model_provider(config, "copilot-acp", pconfig.inference_base_url)
        selected_base_url = pconfig.inference_base_url

-    # else: provider_idx == 16 (Keep current) — only shown when a provider already exists
+    elif provider_idx == 16:  # Hugging Face Inference Providers
+        selected_provider = "huggingface"
+        print()
+        print_header("Hugging Face API Token")
+        pconfig = PROVIDER_REGISTRY["huggingface"]
+        print_info(f"Provider: {pconfig.name}")
+        print_info("Get your token at: https://huggingface.co/settings/tokens")
+        print_info("Required permission: 'Make calls to Inference Providers'")
+        print()
+
+        api_key = prompt("  HF Token", password=True)
+        if api_key:
+            save_env_value("HF_TOKEN", api_key)
+            # Clear OpenRouter env vars to prevent routing confusion
+            save_env_value("OPENAI_BASE_URL", "")
+            save_env_value("OPENAI_API_KEY", "")
+        _set_model_provider(config, "huggingface", pconfig.inference_base_url)
+        selected_base_url = pconfig.inference_base_url
+
+    # else: provider_idx == 17 (Keep current) — only shown when a provider already exists
    # Normalize "keep current" to an explicit provider so downstream logic
    # doesn't fall back to the generic OpenRouter/static-model path.
    if selected_provider is None:
--- a/tests/test_api_key_providers.py
+++ b/tests/test_api_key_providers.py
@@ -38,6 +38,7 @@ class TestProviderRegistry:
    @pytest.mark.parametrize("provider_id,name,auth_type", [
        ("copilot-acp", "GitHub Copilot ACP", "external_process"),
        ("copilot", "GitHub Copilot", "api_key"),
+        ("huggingface", "Hugging Face", "api_key"),
        ("zai", "Z.AI / GLM", "api_key"),
        ("kimi-coding", "Kimi / Moonshot", "api_key"),
        ("minimax", "MiniMax", "api_key"),
@@ -87,6 +88,11 @@ class TestProviderRegistry:
        assert pconfig.api_key_env_vars == ("KILOCODE_API_KEY",)
        assert pconfig.base_url_env_var == "KILOCODE_BASE_URL"

+    def test_huggingface_env_vars(self):
+        pconfig = PROVIDER_REGISTRY["huggingface"]
+        assert pconfig.api_key_env_vars == ("HF_TOKEN",)
+        assert pconfig.base_url_env_var == "HF_BASE_URL"
+
    def test_base_urls(self):
        assert PROVIDER_REGISTRY["copilot"].inference_base_url == "https://api.githubcopilot.com"
        assert PROVIDER_REGISTRY["copilot-acp"].inference_base_url == "acp://copilot"
@@ -96,6 +102,7 @@ class TestProviderRegistry:
        assert PROVIDER_REGISTRY["minimax-cn"].inference_base_url == "https://api.minimaxi.com/anthropic"
        assert PROVIDER_REGISTRY["ai-gateway"].inference_base_url == "https://ai-gateway.vercel.sh/v1"
        assert PROVIDER_REGISTRY["kilocode"].inference_base_url == "https://api.kilo.ai/api/gateway"
+        assert PROVIDER_REGISTRY["huggingface"].inference_base_url == "https://router.huggingface.co/v1"

    def test_oauth_providers_unchanged(self):
        """Ensure we didn't break the existing OAuth providers."""
@@ -199,6 +206,18 @@ class TestResolveProvider:
        assert resolve_provider("github-copilot-acp") == "copilot-acp"
        assert resolve_provider("copilot-acp-agent") == "copilot-acp"

+    def test_explicit_huggingface(self):
+        assert resolve_provider("huggingface") == "huggingface"
+
+    def test_alias_hf(self):
+        assert resolve_provider("hf") == "huggingface"
+
+    def test_alias_hugging_face(self):
+        assert resolve_provider("hugging-face") == "huggingface"
+
+    def test_alias_huggingface_hub(self):
+        assert resolve_provider("huggingface-hub") == "huggingface"
+
    def test_unknown_provider_raises(self):
        with pytest.raises(AuthError):
            resolve_provider("nonexistent-provider-xyz")
@@ -235,6 +254,10 @@ class TestResolveProvider:
        monkeypatch.setenv("KILOCODE_API_KEY", "test-kilo-key")
        assert resolve_provider("auto") == "kilocode"

+    def test_auto_detects_hf_token(self, monkeypatch):
+        monkeypatch.setenv("HF_TOKEN", "hf_test_token")
+        assert resolve_provider("auto") == "huggingface"
+
    def test_openrouter_takes_priority_over_glm(self, monkeypatch):
        """OpenRouter API key should win over GLM in auto-detection."""
        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
@@ -708,3 +731,55 @@ class TestKimiMoonshotModelListIsolation:
        coding_models = _PROVIDER_MODELS["kimi-coding"]
        assert "kimi-for-coding" in coding_models
        assert "kimi-k2-thinking-turbo" in coding_models
+
+
+# =============================================================================
+# Hugging Face provider model list tests
+# =============================================================================
+
+class TestHuggingFaceModels:
+    """Verify Hugging Face model lists are consistent across all locations."""
+
+    def test_main_provider_models_has_huggingface(self):
+        from hermes_cli.main import _PROVIDER_MODELS
+        assert "huggingface" in _PROVIDER_MODELS
+        models = _PROVIDER_MODELS["huggingface"]
+        assert len(models) >= 10, "Expected at least 10 curated HF models"
+
+    def test_models_py_has_huggingface(self):
+        from hermes_cli.models import _PROVIDER_MODELS
+        assert "huggingface" in _PROVIDER_MODELS
+        models = _PROVIDER_MODELS["huggingface"]
+        assert len(models) >= 10
+
+    def test_model_lists_match(self):
+        """Model lists in main.py and models.py should be identical."""
+        from hermes_cli.main import _PROVIDER_MODELS as main_models
+        from hermes_cli.models import _PROVIDER_MODELS as models_models
+        assert main_models["huggingface"] == models_models["huggingface"]
+
+    def test_model_metadata_has_context_lengths(self):
+        """Every HF model should have a context length entry."""
+        from hermes_cli.models import _PROVIDER_MODELS
+        from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS
+        hf_models = _PROVIDER_MODELS["huggingface"]
+        for model in hf_models:
+            assert model in DEFAULT_CONTEXT_LENGTHS, (
+                f"HF model {model!r} missing from DEFAULT_CONTEXT_LENGTHS"
+            )
+
+    def test_models_use_org_name_format(self):
+        """HF models should use org/name format (e.g. Qwen/Qwen3-235B)."""
+        from hermes_cli.models import _PROVIDER_MODELS
+        for model in _PROVIDER_MODELS["huggingface"]:
+            assert "/" in model, f"HF model {model!r} missing org/ prefix"
+
+    def test_provider_aliases_in_models_py(self):
+        from hermes_cli.models import _PROVIDER_ALIASES
+        assert _PROVIDER_ALIASES.get("hf") == "huggingface"
+        assert _PROVIDER_ALIASES.get("hugging-face") == "huggingface"
+
+    def test_provider_label(self):
+        from hermes_cli.models import _PROVIDER_LABELS
+        assert "huggingface" in _PROVIDER_LABELS
+        assert _PROVIDER_LABELS["huggingface"] == "Hugging Face"
--- a/website/docs/getting-started/quickstart.md
+++ b/website/docs/getting-started/quickstart.md
@@ -50,6 +50,7 @@ hermes setup       # Or configure everything at once
 | **MiniMax** | International MiniMax endpoint | Set `MINIMAX_API_KEY` |
 | **MiniMax China** | China-region MiniMax endpoint | Set `MINIMAX_CN_API_KEY` |
 | **Alibaba Cloud** | Qwen models via DashScope | Set `DASHSCOPE_API_KEY` |
+| **Hugging Face** | 20+ open models via unified router (Qwen, DeepSeek, Kimi, etc.) | Set `HF_TOKEN` |
 | **Kilo Code** | KiloCode-hosted models | Set `KILOCODE_API_KEY` |
 | **OpenCode Zen** | Pay-as-you-go access to curated models | Set `OPENCODE_ZEN_API_KEY` |
 | **OpenCode Go** | $10/month subscription for open models | Set `OPENCODE_GO_API_KEY` |
--- a/website/docs/reference/cli-commands.md
+++ b/website/docs/reference/cli-commands.md
@@ -66,7 +66,7 @@ Common options:
 | `-q`, `--query "..."` | One-shot, non-interactive prompt. |
 | `-m`, `--model <model>` | Override the model for this run. |
 | `-t`, `--toolsets <csv>` | Enable a comma-separated set of toolsets. |
-| `--provider <provider>` | Force a provider: `auto`, `openrouter`, `nous`, `openai-codex`, `copilot`, `copilot-acp`, `anthropic`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`, `kilocode`. |
+| `--provider <provider>` | Force a provider: `auto`, `openrouter`, `nous`, `openai-codex`, `copilot`, `copilot-acp`, `anthropic`, `huggingface`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`, `kilocode`. |
 | `-s`, `--skills <name>` | Preload one or more skills for the session (can be repeated or comma-separated). |
 | `-v`, `--verbose` | Verbose output. |
 | `-Q`, `--quiet` | Programmatic mode: suppress banner/spinner/tool previews. |
--- a/website/docs/reference/environment-variables.md
+++ b/website/docs/reference/environment-variables.md
@@ -37,6 +37,8 @@ All variables go in `~/.hermes/.env`. You can also set them with `hermes config
 | `MINIMAX_CN_BASE_URL` | Override MiniMax China base URL (default: `https://api.minimaxi.com/v1`) |
 | `KILOCODE_API_KEY` | Kilo Code API key ([kilo.ai](https://kilo.ai)) |
 | `KILOCODE_BASE_URL` | Override Kilo Code base URL (default: `https://api.kilo.ai/api/gateway`) |
+| `HF_TOKEN` | Hugging Face token for Inference Providers ([huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)) |
+| `HF_BASE_URL` | Override Hugging Face base URL (default: `https://router.huggingface.co/v1`) |
 | `ANTHROPIC_API_KEY` | Anthropic Console API key ([console.anthropic.com](https://console.anthropic.com/)) |
 | `ANTHROPIC_TOKEN` | Manual or legacy Anthropic OAuth/setup-token override |
 | `DASHSCOPE_API_KEY` | Alibaba Cloud DashScope API key for Qwen models ([modelstudio.console.alibabacloud.com](https://modelstudio.console.alibabacloud.com/)) |
@@ -61,7 +63,7 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe

 | Variable | Description |
 |----------|-------------|
-| `HERMES_INFERENCE_PROVIDER` | Override provider selection: `auto`, `openrouter`, `nous`, `openai-codex`, `copilot`, `copilot-acp`, `anthropic`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`, `kilocode` (default: `auto`) |
+| `HERMES_INFERENCE_PROVIDER` | Override provider selection: `auto`, `openrouter`, `nous`, `openai-codex`, `copilot`, `copilot-acp`, `anthropic`, `huggingface`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`, `kilocode`, `alibaba` (default: `auto`) |
 | `HERMES_PORTAL_BASE_URL` | Override Nous Portal URL (for development/testing) |
 | `NOUS_INFERENCE_BASE_URL` | Override Nous inference API URL |
 | `HERMES_NOUS_MIN_KEY_TTL_SECONDS` | Min agent key TTL before re-mint (default: 1800 = 30min) |
--- a/website/docs/user-guide/configuration.md
+++ b/website/docs/user-guide/configuration.md
@@ -92,6 +92,7 @@ You need at least one way to connect to an LLM. Use `hermes model` to switch pro
 | **Kilo Code** | `KILOCODE_API_KEY` in `~/.hermes/.env` (provider: `kilocode`) |
 | **OpenCode Zen** | `OPENCODE_ZEN_API_KEY` in `~/.hermes/.env` (provider: `opencode-zen`) |
 | **OpenCode Go** | `OPENCODE_GO_API_KEY` in `~/.hermes/.env` (provider: `opencode-go`) |
+| **Hugging Face** | `HF_TOKEN` in `~/.hermes/.env` (provider: `huggingface`, aliases: `hf`) |
 | **Custom Endpoint** | `hermes model` (saved in `config.yaml`) or `OPENAI_BASE_URL` + `OPENAI_API_KEY` in `~/.hermes/.env` |

 :::info Codex Note
@@ -224,6 +225,32 @@ model:

 Base URLs can be overridden with `GLM_BASE_URL`, `KIMI_BASE_URL`, `MINIMAX_BASE_URL`, `MINIMAX_CN_BASE_URL`, or `DASHSCOPE_BASE_URL` environment variables.

+### Hugging Face Inference Providers
+
+[Hugging Face Inference Providers](https://huggingface.co/docs/inference-providers) routes to 20+ open models through a unified OpenAI-compatible endpoint (`router.huggingface.co/v1`). Requests are automatically routed to the fastest available backend (Groq, Together, SambaNova, etc.) with automatic failover.
+
+```bash
+# Use any available model
+hermes chat --provider huggingface --model Qwen/Qwen3-235B-A22B-Thinking-2507
+# Requires: HF_TOKEN in ~/.hermes/.env
+
+# Short alias
+hermes chat --provider hf --model deepseek-ai/DeepSeek-V3.2
+```
+
+Or set it permanently in `config.yaml`:
+```yaml
+model:
+  provider: "huggingface"
+  default: "Qwen/Qwen3-235B-A22B-Thinking-2507"
+```
+
+Get your token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) — make sure to enable the "Make calls to Inference Providers" permission. Free tier included ($0.10/month credit, no markup on provider rates).
+
+You can append routing suffixes to model names: `:fastest` (default), `:cheapest`, or `:provider_name` to force a specific backend.
+
+The base URL can be overridden with `HF_BASE_URL`.
+
 ## Custom & Self-Hosted LLM Providers

 Hermes Agent works with **any OpenAI-compatible API endpoint**. If a server implements `/v1/chat/completions`, you can point Hermes at it. This means you can use local models, GPU inference servers, multi-provider routers, or any third-party API.
@@ -627,7 +654,7 @@ fallback_model:

 When activated, the fallback swaps the model and provider mid-session without losing your conversation. It fires **at most once** per session.

-Supported providers: `openrouter`, `nous`, `openai-codex`, `copilot`, `anthropic`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`, `custom`.
+Supported providers: `openrouter`, `nous`, `openai-codex`, `copilot`, `anthropic`, `huggingface`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`, `custom`.

 :::tip
 Fallback is configured exclusively through `config.yaml` — there are no environment variables for it. For full details on when it triggers, supported providers, and how it interacts with auxiliary tasks and delegation, see [Fallback Providers](/docs/user-guide/features/fallback-providers).
--- a/website/docs/user-guide/features/fallback-providers.md
+++ b/website/docs/user-guide/features/fallback-providers.md
@@ -44,6 +44,7 @@ Both `provider` and `model` are **required**. If either is missing, the fallback
 | MiniMax | `minimax` | `MINIMAX_API_KEY` |
 | MiniMax (China) | `minimax-cn` | `MINIMAX_CN_API_KEY` |
 | Kilo Code | `kilocode` | `KILOCODE_API_KEY` |
+| Hugging Face | `huggingface` | `HF_TOKEN` |
 | Custom endpoint | `custom` | `base_url` + `api_key_env` (see below) |

 ### Custom Endpoint Fallback
@@ -161,7 +162,7 @@ When a task's provider is set to `"auto"` (the default), Hermes tries providers

 ```text
 OpenRouter → Nous Portal → Custom endpoint → Codex OAuth →
-API-key providers (z.ai, Kimi, MiniMax, Anthropic) → give up
+API-key providers (z.ai, Kimi, MiniMax, Hugging Face, Anthropic) → give up
 ```

 **For vision tasks:**