When a messaging platform fails to connect at startup (e.g. transient DNS failure) or disconnects at runtime with a retryable error, the gateway now queues it for background reconnection instead of giving up permanently.

- New _platform_reconnect_watcher background task runs alongside the existing session expiry watcher
- Exponential backoff: 30s, 60s, 120s, 240s, 300s cap
- Max 20 retry attempts before giving up on a platform
- Non-retryable errors (bad auth token, etc.) are not retried
- Runtime disconnections via _handle_adapter_fatal_error now queue retryable failures instead of triggering gateway shutdown
- On successful reconnect, adapter is wired up and channel directory is rebuilt automatically

Fixes the case where a DNS blip during gateway startup caused Telegram and Discord to be permanently unavailable until manual restart.
96 lines
3.0 KiB
Python
96 lines
3.0 KiB
Python
from unittest.mock import AsyncMock
|
|
|
|
import pytest
|
|
|
|
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
|
from gateway.platforms.base import BasePlatformAdapter
|
|
from gateway.run import GatewayRunner
|
|
|
|
|
|
class _FatalAdapter(BasePlatformAdapter):
    """Telegram test adapter whose ``connect`` always fails non-retryably."""

    def __init__(self):
        """Initialise the base adapter with an enabled Telegram config."""
        platform_config = PlatformConfig(enabled=True, token="token")
        super().__init__(platform_config, Platform.TELEGRAM)

    async def connect(self) -> bool:
        """Record a non-retryable fatal error and report failure.

        Returns:
            Always ``False``.
        """
        error_code = "telegram_token_lock"
        error_message = (
            "Another local Hermes gateway is already using this Telegram bot token."
        )
        self._set_fatal_error(error_code, error_message, retryable=False)
        return False

    async def disconnect(self) -> None:
        """Mark the adapter as disconnected."""
        self._mark_disconnected()

    async def send(self, chat_id, content, reply_to=None, metadata=None):
        """Sending is not supported by this stub.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    async def get_chat_info(self, chat_id):
        """Return a minimal chat-info mapping holding only the given id."""
        chat_info = {"id": chat_id}
        return chat_info
|
|
|
|
|
|
class _RuntimeRetryableAdapter(BasePlatformAdapter):
    """WhatsApp test adapter whose ``connect`` always succeeds."""

    def __init__(self):
        """Initialise the base adapter with an enabled WhatsApp config."""
        platform_config = PlatformConfig(enabled=True, token="token")
        super().__init__(platform_config, Platform.WHATSAPP)

    async def connect(self) -> bool:
        """Report a successful connection.

        Returns:
            Always ``True``.
        """
        return True

    async def disconnect(self) -> None:
        """Mark the adapter as disconnected."""
        self._mark_disconnected()

    async def send(self, chat_id, content, reply_to=None, metadata=None):
        """Sending is not supported by this stub.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    async def get_chat_info(self, chat_id):
        """Return a minimal chat-info mapping holding only the given id."""
        chat_info = {"id": chat_id}
        return chat_info
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_runner_requests_clean_exit_for_nonretryable_startup_conflict(monkeypatch, tmp_path):
    """A non-retryable fatal error at startup makes the runner request a clean exit."""
    enabled_platforms = {
        Platform.TELEGRAM: PlatformConfig(enabled=True, token="token"),
    }
    gateway_config = GatewayConfig(
        platforms=enabled_platforms,
        sessions_dir=tmp_path / "sessions",
    )
    runner = GatewayRunner(gateway_config)

    def _make_fatal_adapter(platform, platform_config):
        # Every requested platform gets the always-failing adapter stub.
        return _FatalAdapter()

    monkeypatch.setattr(runner, "_create_adapter", _make_fatal_adapter)

    started = await runner.start()

    assert started is True
    assert runner.should_exit_cleanly is True
    assert "already using this Telegram bot token" in runner.exit_reason
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatch, tmp_path):
    """A retryable runtime fatal error queues the platform for reconnection.

    The gateway must keep running: ``stop`` is never awaited and the platform
    shows up in ``_failed_platforms`` with zero attempts recorded.
    """
    enabled_platforms = {
        Platform.WHATSAPP: PlatformConfig(enabled=True, token="token"),
    }
    gateway_config = GatewayConfig(
        platforms=enabled_platforms,
        sessions_dir=tmp_path / "sessions",
    )
    runner = GatewayRunner(gateway_config)

    whatsapp_adapter = _RuntimeRetryableAdapter()
    whatsapp_adapter._set_fatal_error(
        "whatsapp_bridge_exited",
        "WhatsApp bridge process exited unexpectedly (code 1).",
        retryable=True,
    )

    # Wire the adapter into the runner by hand and stub out shutdown so the
    # test can observe whether it was requested.
    runner.adapters = {Platform.WHATSAPP: whatsapp_adapter}
    runner.delivery_router.adapters = runner.adapters
    runner.stop = AsyncMock()

    await runner._handle_adapter_fatal_error(whatsapp_adapter)

    # Should NOT shut down — platform is queued for reconnection
    runner.stop.assert_not_awaited()
    failed_platforms = runner._failed_platforms
    assert Platform.WHATSAPP in failed_platforms
    assert failed_platforms[Platform.WHATSAPP]["attempts"] == 0
|