Files
hermes-agent/tools/mcp_oauth.py
Teknium cc54818d26 fix(mcp): stability fix pack — reload timeout, shutdown cleanup, event loop handler, OAuth non-blocking (#4757)
Four fixes for MCP server stability issues reported by community member
(terminal lockup, zombie processes, escape sequence pollution, startup hang):

1. MCP reload timeout guard (cli.py): _check_config_mcp_changes now runs
   _reload_mcp in a separate daemon thread with a 30s hard timeout. Previously,
   a hung MCP server could block the process_loop thread indefinitely, freezing
   the entire TUI (user can type but nothing happens, only Ctrl+D/Ctrl+\ work).

2. MCP stdio subprocess PID tracking (mcp_tool.py): Tracks child PIDs spawned
   by stdio_client via before/after snapshots of /proc children. On shutdown,
   _stop_mcp_loop force-kills any tracked PIDs that survived the SDK's graceful
   SIGTERM→SIGKILL cleanup. Prevents zombie MCP server processes from
   accumulating across sessions.

3. MCP event loop exception handler (mcp_tool.py): Installs
   _mcp_loop_exception_handler on the MCP background event loop — same pattern
   as the existing _suppress_closed_loop_errors on prompt_toolkit's loop.
   Suppresses benign 'Event loop is closed' RuntimeError from httpx transport
   __del__ during MCP shutdown. Salvaged from PR #2538 (acsezen).

4. MCP OAuth non-blocking (mcp_oauth.py): Replaces blocking input() call in
   _wait_for_callback with OAuthNonInteractiveError raise. Adds _is_interactive()
   TTY detection. In non-interactive environments, build_oauth_auth() still
   returns a provider (cached tokens + refresh work), but the callback handler
   raises immediately instead of blocking the MCP event loop for 120s. Re-raises
   OAuth setup failures in _run_http so failed servers are reported cleanly
   without blocking others. Salvaged from PRs #4521 (voidborne-d) and #4465
   (heathley).

Closes #2537, closes #4462
Related: #4128, #3436
2026-04-03 02:29:20 -07:00

327 lines
11 KiB
Python

"""Thin OAuth adapter for MCP HTTP servers.
Wraps the MCP SDK's built-in ``OAuthClientProvider`` (which implements
``httpx.Auth``) with Hermes-specific token storage and browser-based
authorization. The SDK handles all of the heavy lifting: PKCE generation,
metadata discovery, dynamic client registration, token exchange, and refresh.
Startup safety:
The callback handler never calls blocking ``input()`` on the event loop.
In non-interactive environments (no TTY, SSH, headless), the OAuth flow
raises ``OAuthNonInteractiveError`` instead of blocking, so that the
server degrades gracefully and other MCP servers are not affected.
Usage in mcp_tool.py::
from tools.mcp_oauth import build_oauth_auth
auth = build_oauth_auth(server_name, server_url)
# pass ``auth`` as the httpx auth parameter
"""
from __future__ import annotations
import asyncio
import json
import logging
import os
import socket
import sys
import threading
import webbrowser
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
from typing import Any
from urllib.parse import parse_qs, urlparse
logger = logging.getLogger(__name__)
class OAuthNonInteractiveError(RuntimeError):
"""Raised when OAuth requires user interaction but the environment is non-interactive."""
pass
_TOKEN_DIR_NAME = "mcp-tokens"
# ---------------------------------------------------------------------------
# Token storage — persists tokens + client info to ~/.hermes/mcp-tokens/
# ---------------------------------------------------------------------------
def _sanitize_server_name(name: str) -> str:
"""Sanitize server name for safe use as a filename."""
import re
clean = re.sub(r"[^\w\-]", "-", name.strip().lower())
clean = re.sub(r"-+", "-", clean).strip("-")
return clean[:60] or "unnamed"
class HermesTokenStorage:
"""File-backed token storage implementing the MCP SDK's TokenStorage protocol."""
def __init__(self, server_name: str):
self._server_name = _sanitize_server_name(server_name)
def _base_dir(self) -> Path:
home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
d = home / _TOKEN_DIR_NAME
d.mkdir(parents=True, exist_ok=True)
return d
def _tokens_path(self) -> Path:
return self._base_dir() / f"{self._server_name}.json"
def _client_path(self) -> Path:
return self._base_dir() / f"{self._server_name}.client.json"
# -- TokenStorage protocol (async) --
async def get_tokens(self):
data = self._read_json(self._tokens_path())
if not data:
return None
try:
from mcp.shared.auth import OAuthToken
return OAuthToken(**data)
except Exception:
return None
async def set_tokens(self, tokens) -> None:
self._write_json(self._tokens_path(), tokens.model_dump(exclude_none=True))
async def get_client_info(self):
data = self._read_json(self._client_path())
if not data:
return None
try:
from mcp.shared.auth import OAuthClientInformationFull
return OAuthClientInformationFull(**data)
except Exception:
return None
async def set_client_info(self, client_info) -> None:
self._write_json(self._client_path(), client_info.model_dump(exclude_none=True))
# -- helpers --
@staticmethod
def _read_json(path: Path) -> dict | None:
if not path.exists():
return None
try:
return json.loads(path.read_text(encoding="utf-8"))
except Exception:
return None
@staticmethod
def _write_json(path: Path, data: dict) -> None:
path.write_text(json.dumps(data, indent=2), encoding="utf-8")
try:
path.chmod(0o600)
except OSError:
pass
def remove(self) -> None:
"""Delete stored tokens and client info for this server."""
for p in (self._tokens_path(), self._client_path()):
try:
p.unlink(missing_ok=True)
except OSError:
pass
# ---------------------------------------------------------------------------
# Browser-based callback handler
# ---------------------------------------------------------------------------
def _find_free_port() -> int:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("127.0.0.1", 0))
return s.getsockname()[1]
def _make_callback_handler():
"""Create a callback handler class with instance-scoped result storage."""
result = {"auth_code": None, "state": None}
class Handler(BaseHTTPRequestHandler):
def do_GET(self):
qs = parse_qs(urlparse(self.path).query)
result["auth_code"] = (qs.get("code") or [None])[0]
result["state"] = (qs.get("state") or [None])[0]
self.send_response(200)
self.send_header("Content-Type", "text/html")
self.end_headers()
self.wfile.write(b"<html><body><h3>Authorization complete. You can close this tab.</h3></body></html>")
def log_message(self, *_args: Any) -> None:
pass
return Handler, result
# Port chosen at build time and shared with the callback handler via closure.
_oauth_port: int | None = None
async def _redirect_to_browser(auth_url: str) -> None:
"""Open the authorization URL in the user's browser."""
try:
if _can_open_browser():
webbrowser.open(auth_url)
print(" Opened browser for authorization...")
else:
print(f"\n Open this URL to authorize:\n {auth_url}\n")
except Exception:
print(f"\n Open this URL to authorize:\n {auth_url}\n")
async def _wait_for_callback() -> tuple[str, str | None]:
"""Start a local HTTP server on the pre-registered port and wait for the OAuth redirect.
If the callback times out, raises ``OAuthNonInteractiveError`` instead of
calling blocking ``input()`` — the old ``input()`` call would block the
entire MCP asyncio event loop, preventing all other MCP servers from
connecting and potentially hanging Hermes startup indefinitely.
"""
global _oauth_port
port = _oauth_port or _find_free_port()
HandlerClass, result = _make_callback_handler()
server = HTTPServer(("127.0.0.1", port), HandlerClass)
def _serve():
server.timeout = 120
server.handle_request()
thread = threading.Thread(target=_serve, daemon=True)
thread.start()
for _ in range(1200): # 120 seconds
await asyncio.sleep(0.1)
if result["auth_code"] is not None:
break
server.server_close()
code = result["auth_code"] or ""
state = result["state"]
if not code:
raise OAuthNonInteractiveError(
"OAuth browser callback timed out after 120 seconds. "
"Run 'hermes mcp auth <server-name>' to authorize interactively."
)
return code, state
def _can_open_browser() -> bool:
if os.environ.get("SSH_CLIENT") or os.environ.get("SSH_TTY"):
return False
if not os.environ.get("DISPLAY") and os.name != "nt" and "darwin" not in os.uname().sysname.lower():
return False
return True
def _is_interactive() -> bool:
"""Check if the current environment can support interactive OAuth flows.
Returns False in headless/daemon/container environments where no user
can interact with a browser or paste an auth code.
"""
if not hasattr(sys.stdin, "isatty") or not sys.stdin.isatty():
return False
return True
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def build_oauth_auth(server_name: str, server_url: str):
"""Build an ``httpx.Auth`` handler for the given MCP server using OAuth 2.1 PKCE.
Uses the MCP SDK's ``OAuthClientProvider`` which handles discovery,
registration, PKCE, token exchange, and refresh automatically.
In non-interactive environments (no TTY), this still returns a provider
so that **cached tokens and refresh flows work**. Only the interactive
authorization-code grant will fail fast with a clear error instead of
blocking the event loop.
Returns an ``OAuthClientProvider`` instance (implements ``httpx.Auth``),
or ``None`` if the MCP SDK auth module is not available.
"""
try:
from mcp.client.auth import OAuthClientProvider
from mcp.shared.auth import OAuthClientMetadata
except ImportError:
logger.warning("MCP SDK auth module not available — OAuth disabled")
return None
storage = HermesTokenStorage(server_name)
interactive = _is_interactive()
if not interactive:
# Check whether cached tokens exist. If they do, the SDK can still
# use them (and refresh them) without any user interaction. If not,
# we still build the provider — the callback_handler will raise
# OAuthNonInteractiveError if a fresh authorization is actually
# needed, which surfaces as a clean connection failure for this
# server only (other MCP servers are unaffected).
has_cached = storage._read_json(storage._tokens_path()) is not None
if not has_cached:
logger.warning(
"MCP server '%s' requires OAuth but no cached tokens found "
"and environment is non-interactive. The server will fail to "
"connect. Run 'hermes mcp auth %s' to authorize interactively.",
server_name, server_name,
)
global _oauth_port
_oauth_port = _find_free_port()
redirect_uri = f"http://127.0.0.1:{_oauth_port}/callback"
client_metadata = OAuthClientMetadata(
client_name="Hermes Agent",
redirect_uris=[redirect_uri],
grant_types=["authorization_code", "refresh_token"],
response_types=["code"],
scope="openid profile email offline_access",
token_endpoint_auth_method="none",
)
# In non-interactive mode, the redirect handler logs the URL and the
# callback handler raises immediately — no blocking, no input().
redirect_handler = _redirect_to_browser
callback_handler = _wait_for_callback
if not interactive:
async def _noninteractive_redirect(auth_url: str) -> None:
logger.warning(
"MCP server '%s' needs OAuth authorization (non-interactive, "
"cannot open browser). URL: %s",
server_name, auth_url,
)
async def _noninteractive_callback() -> tuple[str, str | None]:
raise OAuthNonInteractiveError(
f"MCP server '{server_name}' requires interactive OAuth "
f"authorization but the environment is non-interactive "
f"(no TTY). Run 'hermes mcp auth {server_name}' to "
f"authorize, then restart."
)
redirect_handler = _noninteractive_redirect
callback_handler = _noninteractive_callback
return OAuthClientProvider(
server_url=server_url,
client_metadata=client_metadata,
storage=storage,
redirect_handler=redirect_handler,
callback_handler=callback_handler,
timeout=120.0,
)
def remove_oauth_tokens(server_name: str) -> None:
"""Delete stored OAuth tokens and client info for a server."""
HermesTokenStorage(server_name).remove()