When a messaging platform fails to connect at startup (e.g. transient DNS failure) or disconnects at runtime with a retryable error, the gateway now queues it for background reconnection instead of giving up permanently. - New _platform_reconnect_watcher background task runs alongside the existing session expiry watcher - Exponential backoff: 30s, 60s, 120s, 240s, 300s cap - Max 20 retry attempts before giving up on a platform - Non-retryable errors (bad auth token, etc.) are not retried - Runtime disconnections via _handle_adapter_fatal_error now queue retryable failures instead of triggering gateway shutdown - On successful reconnect, adapter is wired up and channel directory is rebuilt automatically Fixes the case where a DNS blip during gateway startup caused Telegram and Discord to be permanently unavailable until manual restart.
402 lines
14 KiB
Python
402 lines
14 KiB
Python
"""Tests for the gateway platform reconnection watcher."""
|
|
|
|
import asyncio
|
|
import time
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
|
from gateway.platforms.base import BasePlatformAdapter, MessageEvent, SendResult
|
|
from gateway.run import GatewayRunner
|
|
|
|
|
|
class StubAdapter(BasePlatformAdapter):
|
|
"""Adapter whose connect() result can be controlled."""
|
|
|
|
def __init__(self, *, succeed=True, fatal_error=None, fatal_retryable=True):
|
|
super().__init__(PlatformConfig(enabled=True, token="test"), Platform.TELEGRAM)
|
|
self._succeed = succeed
|
|
self._fatal_error = fatal_error
|
|
self._fatal_retryable = fatal_retryable
|
|
|
|
async def connect(self):
|
|
if self._fatal_error:
|
|
self._set_fatal_error("test_error", self._fatal_error, retryable=self._fatal_retryable)
|
|
return False
|
|
return self._succeed
|
|
|
|
async def disconnect(self):
|
|
return None
|
|
|
|
async def send(self, chat_id, content, reply_to=None, metadata=None):
|
|
return SendResult(success=True, message_id="1")
|
|
|
|
async def send_typing(self, chat_id, metadata=None):
|
|
return None
|
|
|
|
async def get_chat_info(self, chat_id):
|
|
return {"id": chat_id}
|
|
|
|
|
|
def _make_runner():
|
|
"""Create a minimal GatewayRunner via object.__new__ to skip __init__."""
|
|
runner = object.__new__(GatewayRunner)
|
|
runner.config = GatewayConfig(
|
|
platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="test")}
|
|
)
|
|
runner._running = True
|
|
runner._shutdown_event = asyncio.Event()
|
|
runner._exit_reason = None
|
|
runner._exit_with_failure = False
|
|
runner._exit_cleanly = False
|
|
runner._failed_platforms = {}
|
|
runner.adapters = {}
|
|
runner.delivery_router = MagicMock()
|
|
runner._running_agents = {}
|
|
runner._pending_messages = {}
|
|
runner._pending_approvals = {}
|
|
runner._honcho_managers = {}
|
|
runner._honcho_configs = {}
|
|
runner._shutdown_all_gateway_honcho = lambda: None
|
|
return runner
|
|
|
|
|
|
# --- Startup queueing ---
|
|
|
|
class TestStartupFailureQueuing:
|
|
"""Verify that failed platforms are queued during startup."""
|
|
|
|
def test_failed_platform_queued_on_connect_failure(self):
|
|
"""When adapter.connect() returns False without fatal error, queue for retry."""
|
|
runner = _make_runner()
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() + 30,
|
|
}
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 1
|
|
|
|
def test_failed_platform_not_queued_for_nonretryable(self):
|
|
"""Non-retryable errors should not be in the retry queue."""
|
|
runner = _make_runner()
|
|
# Simulate: adapter had a non-retryable error, wasn't queued
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
|
|
|
|
# --- Reconnect watcher ---
|
|
|
|
class TestPlatformReconnectWatcher:
|
|
"""Test the _platform_reconnect_watcher background task."""
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_reconnect_succeeds_on_retry(self):
|
|
"""Watcher should reconnect a failed platform when connect() succeeds."""
|
|
runner = _make_runner()
|
|
runner._sync_voice_mode_state_to_adapter = MagicMock()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() - 1, # Already past retry time
|
|
}
|
|
|
|
succeed_adapter = StubAdapter(succeed=True)
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter", return_value=succeed_adapter):
|
|
with patch("gateway.run.build_channel_directory", create=True):
|
|
# Run one iteration of the watcher then stop
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
# Patch the sleep to exit after first check
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
assert Platform.TELEGRAM in runner.adapters
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_reconnect_nonretryable_removed_from_queue(self):
|
|
"""Non-retryable errors should remove the platform from the retry queue."""
|
|
runner = _make_runner()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() - 1,
|
|
}
|
|
|
|
fail_adapter = StubAdapter(
|
|
succeed=False, fatal_error="bad token", fatal_retryable=False
|
|
)
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter", return_value=fail_adapter):
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
assert Platform.TELEGRAM not in runner.adapters
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_reconnect_retryable_stays_in_queue(self):
|
|
"""Retryable failures should remain in the queue with incremented attempts."""
|
|
runner = _make_runner()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() - 1,
|
|
}
|
|
|
|
fail_adapter = StubAdapter(
|
|
succeed=False, fatal_error="DNS failure", fatal_retryable=True
|
|
)
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter", return_value=fail_adapter):
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_reconnect_gives_up_after_max_attempts(self):
|
|
"""After max attempts, platform should be removed from retry queue."""
|
|
runner = _make_runner()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 20, # At max
|
|
"next_retry": time.monotonic() - 1,
|
|
}
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter") as mock_create:
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
mock_create.assert_not_called() # Should give up without trying
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_reconnect_skips_when_not_time_yet(self):
|
|
"""Watcher should skip platforms whose next_retry is in the future."""
|
|
runner = _make_runner()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() + 9999, # Far in the future
|
|
}
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter") as mock_create:
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
mock_create.assert_not_called()
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_no_failed_platforms_watcher_idles(self):
|
|
"""When no platforms are failed, watcher should just idle."""
|
|
runner = _make_runner()
|
|
# No failed platforms
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter") as mock_create:
|
|
async def run_briefly():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 2:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_briefly()
|
|
|
|
mock_create.assert_not_called()
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_adapter_create_returns_none(self):
|
|
"""If _create_adapter returns None, remove from queue (missing deps)."""
|
|
runner = _make_runner()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() - 1,
|
|
}
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter", return_value=None):
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
|
|
|
|
# --- Runtime disconnection queueing ---
|
|
|
|
class TestRuntimeDisconnectQueuing:
|
|
"""Test that _handle_adapter_fatal_error queues retryable disconnections."""
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_retryable_runtime_error_queued_for_reconnect(self):
|
|
"""Retryable runtime errors should add the platform to _failed_platforms."""
|
|
runner = _make_runner()
|
|
|
|
adapter = StubAdapter(succeed=True)
|
|
adapter._set_fatal_error("network_error", "DNS failure", retryable=True)
|
|
runner.adapters[Platform.TELEGRAM] = adapter
|
|
|
|
await runner._handle_adapter_fatal_error(adapter)
|
|
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 0
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_nonretryable_runtime_error_not_queued(self):
|
|
"""Non-retryable runtime errors should not be queued for reconnection."""
|
|
runner = _make_runner()
|
|
|
|
adapter = StubAdapter(succeed=True)
|
|
adapter._set_fatal_error("auth_error", "bad token", retryable=False)
|
|
runner.adapters[Platform.TELEGRAM] = adapter
|
|
|
|
# Need to prevent stop() from running fully
|
|
runner.stop = AsyncMock()
|
|
|
|
await runner._handle_adapter_fatal_error(adapter)
|
|
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_retryable_error_prevents_shutdown_when_queued(self):
|
|
"""Gateway should not shut down if failed platforms are queued for reconnection."""
|
|
runner = _make_runner()
|
|
runner.stop = AsyncMock()
|
|
|
|
adapter = StubAdapter(succeed=True)
|
|
adapter._set_fatal_error("network_error", "DNS failure", retryable=True)
|
|
runner.adapters[Platform.TELEGRAM] = adapter
|
|
|
|
await runner._handle_adapter_fatal_error(adapter)
|
|
|
|
# stop() should NOT have been called since we have platforms queued
|
|
runner.stop.assert_not_called()
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_nonretryable_error_triggers_shutdown(self):
|
|
"""Gateway should shut down when no adapters remain and nothing is queued."""
|
|
runner = _make_runner()
|
|
runner.stop = AsyncMock()
|
|
|
|
adapter = StubAdapter(succeed=True)
|
|
adapter._set_fatal_error("auth_error", "bad token", retryable=False)
|
|
runner.adapters[Platform.TELEGRAM] = adapter
|
|
|
|
await runner._handle_adapter_fatal_error(adapter)
|
|
|
|
runner.stop.assert_called_once()
|