Gateway: report a failed start when every enabled platform fails to connect.

Summary of the hunks below:
  * gateway/platforms/telegram.py: when connect() fails, record a retryable
    fatal error ("telegram_connect_error") before logging and returning False.
  * gateway/run.py: count enabled platforms and collect retryable startup
    errors. If nothing connected and at least one platform was enabled, write
    runtime status "startup_failed" and return False. Cron-only mode is kept
    only when no platform is enabled.
  * tests: cover the retryable-failure path, cron-only mode, and the Telegram
    adapter's retryable fatal error on a startup network failure.

NOTE(review): this patch was recovered from a copy whose newlines and
indentation had been collapsed. Added/removed/context markers and hunk line
counts were checked against the @@ headers. Leading whitespace on context
lines was reconstructed from Python syntax (the nesting depth in the
telegram.py hunk is the least certain). Confirm against the target tree, or
apply with whitespace tolerance, before merging.

diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py
index 08750faed..7aa7b5278 100644
--- a/gateway/platforms/telegram.py
+++ b/gateway/platforms/telegram.py
@@ -265,6 +265,8 @@ class TelegramAdapter(BasePlatformAdapter):
                     release_scoped_lock("telegram-bot-token", self._token_lock_identity)
                 except Exception:
                     pass
+            message = f"Telegram startup failed: {e}"
+            self._set_fatal_error("telegram_connect_error", message, retryable=True)
             logger.error("[%s] Failed to connect to Telegram: %s", self.name, e, exc_info=True)
             return False
 
diff --git a/gateway/run.py b/gateway/run.py
index ec293693e..81d00f73b 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -831,12 +831,15 @@ class GatewayRunner:
                 logger.warning("Process checkpoint recovery: %s", e)
 
         connected_count = 0
+        enabled_platform_count = 0
         startup_nonretryable_errors: list[str] = []
+        startup_retryable_errors: list[str] = []
 
         # Initialize and connect each configured platform
         for platform, platform_config in self.config.platforms.items():
             if not platform_config.enabled:
                 continue
+            enabled_platform_count += 1
 
             adapter = self._create_adapter(platform, platform_config)
             if not adapter:
@@ -858,12 +861,22 @@ class GatewayRunner:
                     logger.info("✓ %s connected", platform.value)
                 else:
                     logger.warning("✗ %s failed to connect", platform.value)
-                    if adapter.has_fatal_error and not adapter.fatal_error_retryable:
-                        startup_nonretryable_errors.append(
+                    if adapter.has_fatal_error:
+                        target = (
+                            startup_retryable_errors
+                            if adapter.fatal_error_retryable
+                            else startup_nonretryable_errors
+                        )
+                        target.append(
                             f"{platform.value}: {adapter.fatal_error_message}"
                         )
+                    else:
+                        startup_retryable_errors.append(
+                            f"{platform.value}: failed to connect"
+                        )
             except Exception as e:
                 logger.error("✗ %s error: %s", platform.value, e)
+                startup_retryable_errors.append(f"{platform.value}: {e}")
 
         if connected_count == 0:
             if startup_nonretryable_errors:
@@ -876,7 +889,16 @@ class GatewayRunner:
                     pass
                 self._request_clean_exit(reason)
                 return True
-            logger.warning("No messaging platforms connected.")
+            if enabled_platform_count > 0:
+                reason = "; ".join(startup_retryable_errors) or "all configured messaging platforms failed to connect"
+                logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
+                try:
+                    from gateway.status import write_runtime_status
+                    write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
+                except Exception:
+                    pass
+                return False
+            logger.warning("No messaging platforms enabled.")
             logger.info("Gateway will continue running for cron job execution.")
 
         # Update delivery router with adapters
diff --git a/tests/gateway/test_runner_startup_failures.py b/tests/gateway/test_runner_startup_failures.py
new file mode 100644
index 000000000..315f26568
--- /dev/null
+++ b/tests/gateway/test_runner_startup_failures.py
@@ -0,0 +1,89 @@
+import pytest
+
+from gateway.config import GatewayConfig, Platform, PlatformConfig
+from gateway.platforms.base import BasePlatformAdapter
+from gateway.run import GatewayRunner
+from gateway.status import read_runtime_status
+
+
+class _RetryableFailureAdapter(BasePlatformAdapter):
+    def __init__(self):
+        super().__init__(PlatformConfig(enabled=True, token="***"), Platform.TELEGRAM)
+
+    async def connect(self) -> bool:
+        self._set_fatal_error(
+            "telegram_connect_error",
+            "Telegram startup failed: temporary DNS resolution failure.",
+            retryable=True,
+        )
+        return False
+
+    async def disconnect(self) -> None:
+        self._mark_disconnected()
+
+    async def send(self, chat_id, content, reply_to=None, metadata=None):
+        raise NotImplementedError
+
+    async def get_chat_info(self, chat_id):
+        return {"id": chat_id}
+
+
+class _DisabledAdapter(BasePlatformAdapter):
+    def __init__(self):
+        super().__init__(PlatformConfig(enabled=False, token="***"), Platform.TELEGRAM)
+
+    async def connect(self) -> bool:
+        raise AssertionError("connect should not be called for disabled platforms")
+
+    async def disconnect(self) -> None:
+        self._mark_disconnected()
+
+    async def send(self, chat_id, content, reply_to=None, metadata=None):
+        raise NotImplementedError
+
+    async def get_chat_info(self, chat_id):
+        return {"id": chat_id}
+
+
+@pytest.mark.asyncio
+async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, tmp_path):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    config = GatewayConfig(
+        platforms={
+            Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")
+        },
+        sessions_dir=tmp_path / "sessions",
+    )
+    runner = GatewayRunner(config)
+
+    monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _RetryableFailureAdapter())
+
+    ok = await runner.start()
+
+    assert ok is False
+    assert runner.should_exit_cleanly is False
+    state = read_runtime_status()
+    assert state["gateway_state"] == "startup_failed"
+    assert "temporary DNS resolution failure" in state["exit_reason"]
+    assert state["platforms"]["telegram"]["state"] == "fatal"
+    assert state["platforms"]["telegram"]["error_code"] == "telegram_connect_error"
+
+
+@pytest.mark.asyncio
+async def test_runner_allows_cron_only_mode_when_no_platforms_are_enabled(monkeypatch, tmp_path):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    config = GatewayConfig(
+        platforms={
+            Platform.TELEGRAM: PlatformConfig(enabled=False, token="***")
+        },
+        sessions_dir=tmp_path / "sessions",
+    )
+    runner = GatewayRunner(config)
+
+    ok = await runner.start()
+
+    assert ok is True
+    assert runner.should_exit_cleanly is False
+    assert runner.adapters == {}
+    state = read_runtime_status()
+    assert state["gateway_state"] == "running"
diff --git a/tests/gateway/test_telegram_conflict.py b/tests/gateway/test_telegram_conflict.py
index 86dc509d5..440aa99d8 100644
--- a/tests/gateway/test_telegram_conflict.py
+++ b/tests/gateway/test_telegram_conflict.py
@@ -100,6 +100,39 @@ async def test_polling_conflict_stops_polling_and_notifies_handler(monkeypatch):
     fatal_handler.assert_awaited_once()
 
 
+@pytest.mark.asyncio
+async def test_connect_marks_retryable_fatal_error_for_startup_network_failure(monkeypatch):
+    adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))
+
+    monkeypatch.setattr(
+        "gateway.status.acquire_scoped_lock",
+        lambda scope, identity, metadata=None: (True, None),
+    )
+    monkeypatch.setattr(
+        "gateway.status.release_scoped_lock",
+        lambda scope, identity: None,
+    )
+
+    builder = MagicMock()
+    builder.token.return_value = builder
+    app = SimpleNamespace(
+        bot=SimpleNamespace(),
+        updater=SimpleNamespace(),
+        add_handler=MagicMock(),
+        initialize=AsyncMock(side_effect=RuntimeError("Temporary failure in name resolution")),
+        start=AsyncMock(),
+    )
+    builder.build.return_value = app
+    monkeypatch.setattr("gateway.platforms.telegram.Application", SimpleNamespace(builder=MagicMock(return_value=builder)))
+
+    ok = await adapter.connect()
+
+    assert ok is False
+    assert adapter.fatal_error_code == "telegram_connect_error"
+    assert adapter.fatal_error_retryable is True
+    assert "Temporary failure in name resolution" in adapter.fatal_error_message
+
+
 @pytest.mark.asyncio
 async def test_disconnect_skips_inactive_updater_and_app(monkeypatch):
     adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))