Timmy-time-dashboard/src/dashboard/startup.py

"""Application lifecycle management — startup, shutdown, and background task orchestration."""

import asyncio
import logging
import signal
from contextlib import asynccontextmanager
from pathlib import Path

from fastapi import FastAPI

from config import settings
from dashboard.schedulers import (
    _briefing_scheduler,
    _hermes_scheduler,
    _loop_qa_scheduler,
    _presence_watcher,
    _start_chat_integrations_background,
    _thinking_scheduler,
)

logger = logging.getLogger(__name__)

# Global event to signal shutdown request
_shutdown_event = asyncio.Event()


def _startup_init() -> None:
    """Validate config and enable event persistence."""
    from config import validate_startup

    validate_startup()

    from infrastructure.events.bus import init_event_bus_persistence

    init_event_bus_persistence()

    from spark.engine import get_spark_engine

    if get_spark_engine().enabled:
        logger.info("Spark Intelligence active — event capture enabled")


def _startup_background_tasks() -> list[asyncio.Task]:
    """Spawn all recurring background tasks (non-blocking)."""
    bg_tasks = [
        asyncio.create_task(_briefing_scheduler()),
        asyncio.create_task(_thinking_scheduler()),
        asyncio.create_task(_loop_qa_scheduler()),
        asyncio.create_task(_presence_watcher()),
        asyncio.create_task(_start_chat_integrations_background()),
        asyncio.create_task(_hermes_scheduler()),
    ]
    try:
        from timmy.paperclip import start_paperclip_poller

        bg_tasks.append(asyncio.create_task(start_paperclip_poller()))
        logger.info("Paperclip poller started")
    except ImportError:
        logger.debug("Paperclip module not found, skipping poller")

    return bg_tasks


def _try_prune(label: str, prune_fn, days: int) -> None:
    """Run a prune function, log results, swallow errors."""
    try:
        pruned = prune_fn()
        if pruned:
            logger.info(
                "%s auto-prune: removed %d entries older than %d days",
                label,
                pruned,
                days,
            )
    except Exception as exc:
        logger.debug("%s auto-prune skipped: %s", label, exc)


def _check_vault_size() -> None:
    """Warn if the memory vault exceeds the configured size limit."""
    try:
        vault_path = Path(settings.repo_root) / "memory" / "notes"
        if vault_path.exists():
            total_bytes = sum(f.stat().st_size for f in vault_path.rglob("*") if f.is_file())
            total_mb = total_bytes / (1024 * 1024)
            if total_mb > settings.memory_vault_max_mb:
                logger.warning(
                    "Memory vault (%.1f MB) exceeds limit (%d MB) — consider archiving old notes",
                    total_mb,
                    settings.memory_vault_max_mb,
                )
    except Exception as exc:
        logger.debug("Vault size check skipped: %s", exc)


def _startup_pruning() -> None:
    """Auto-prune old memories, thoughts, and events on startup."""
    if settings.memory_prune_days > 0:
        from timmy.memory_system import prune_memories

        _try_prune(
            "Memory",
            lambda: prune_memories(
                older_than_days=settings.memory_prune_days,
                keep_facts=settings.memory_prune_keep_facts,
            ),
            settings.memory_prune_days,
        )

    if settings.thoughts_prune_days > 0:
        from timmy.thinking import thinking_engine

        _try_prune(
            "Thought",
            lambda: thinking_engine.prune_old_thoughts(
                keep_days=settings.thoughts_prune_days,
                keep_min=settings.thoughts_prune_keep_min,
            ),
            settings.thoughts_prune_days,
        )

    if settings.events_prune_days > 0:
        from swarm.event_log import prune_old_events

        _try_prune(
            "Event",
            lambda: prune_old_events(
                keep_days=settings.events_prune_days,
                keep_min=settings.events_prune_keep_min,
            ),
            settings.events_prune_days,
        )

    if settings.memory_vault_max_mb > 0:
        _check_vault_size()


def _setup_signal_handlers() -> None:
    """Setup signal handlers for graceful shutdown.

    Handles SIGTERM (Docker stop, Kubernetes delete) and SIGINT (Ctrl+C)
    by setting the shutdown event and notifying health checks.

    Note: Signal handlers can only be registered in the main thread.
    In test environments (running in separate threads), this is skipped.
    """
    import threading

    # Signal handlers can only be set in the main thread
    if threading.current_thread() is not threading.main_thread():
        logger.debug("Skipping signal handler setup: not in main thread")
        return

    loop = asyncio.get_running_loop()

    def _signal_handler(sig: signal.Signals) -> None:
        sig_name = sig.name if hasattr(sig, "name") else str(sig)
        logger.info("Received signal %s, initiating graceful shutdown...", sig_name)

        # Notify health module about shutdown
        try:
            from dashboard.routes.health import request_shutdown

            request_shutdown(reason=f"signal:{sig_name}")
        except Exception as exc:
            logger.debug("Failed to set shutdown state: %s", exc)

        # Set the shutdown event to unblock lifespan
        _shutdown_event.set()

    # Register handlers for common shutdown signals
    for sig in (signal.SIGTERM, signal.SIGINT):
        try:
            loop.add_signal_handler(sig, lambda s=sig: _signal_handler(s))
            logger.debug("Registered handler for %s", sig.name if hasattr(sig, "name") else sig)
        except (NotImplementedError, ValueError) as exc:
            # Windows or non-main thread - signal handlers not available
            logger.debug("Could not register signal handler for %s: %s", sig, exc)


async def _wait_for_shutdown(timeout: float | None = None) -> bool:
    """Wait for shutdown signal or timeout.

    Returns True if shutdown was requested, False if timeout expired.
    """
    if timeout:
        try:
            await asyncio.wait_for(_shutdown_event.wait(), timeout=timeout)
            return True
        except TimeoutError:
            return False
    else:
        await _shutdown_event.wait()
        return True


async def _shutdown_cleanup(
    bg_tasks: list[asyncio.Task],
    workshop_heartbeat,
) -> None:
    """Stop chat bots, MCP sessions, heartbeat, and cancel background tasks."""
    from integrations.chat_bridge.vendors.discord import discord_bot
    from integrations.telegram_bot.bot import telegram_bot

    await discord_bot.stop()
    await telegram_bot.stop()

    try:
        from timmy.mcp_tools import close_mcp_sessions

        await close_mcp_sessions()
    except Exception as exc:
        logger.debug("MCP shutdown: %s", exc)

    await workshop_heartbeat.stop()

    for task in bg_tasks:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan manager with non-blocking startup and graceful shutdown.

    Handles SIGTERM/SIGINT signals for graceful shutdown in container environments.
    When a shutdown signal is received:
    1. Health checks are notified (readiness returns 503)
    2. Active requests are allowed to complete (with timeout)
    3. Background tasks are cancelled
    4. Cleanup operations run
    """
    # Reset shutdown state for fresh start
    _shutdown_event.clear()

    _startup_init()
    bg_tasks = _startup_background_tasks()
    _startup_pruning()

    # Setup signal handlers for graceful shutdown
    _setup_signal_handlers()

    # Start Workshop presence heartbeat with WS relay
    from dashboard.routes.world import broadcast_world_state
    from timmy.workshop_state import WorkshopHeartbeat

    workshop_heartbeat = WorkshopHeartbeat(on_change=broadcast_world_state)
    await workshop_heartbeat.start()

    # Register session logger with error capture
    try:
        from infrastructure.error_capture import register_error_recorder
        from timmy.session_logger import get_session_logger

        register_error_recorder(get_session_logger().record_error)
    except Exception:
        logger.debug("Failed to register error recorder")

    # Mark session start for sovereignty duration tracking
    try:
        from timmy.sovereignty import mark_session_start

        mark_session_start()
    except Exception:
        logger.debug("Failed to mark sovereignty session start")

    logger.info("✓ Dashboard ready for requests")
    logger.info("  Graceful shutdown enabled (SIGTERM/SIGINT)")

    # Wait for shutdown signal or continue until cancelled
    # The yield allows FastAPI to serve requests
    try:
        yield
    except asyncio.CancelledError:
        # FastAPI cancelled the lifespan (normal during shutdown)
        logger.debug("Lifespan cancelled, beginning cleanup...")
    finally:
        # Cleanup phase - this runs during shutdown
        logger.info("Beginning graceful shutdown...")

        # Notify health checks that we're shutting down
        try:
            from dashboard.routes.health import request_shutdown

            request_shutdown(reason="lifespan_cleanup")
        except Exception as exc:
            logger.debug("Failed to set shutdown state: %s", exc)

        await _shutdown_cleanup(bg_tasks, workshop_heartbeat)

        # Generate and commit sovereignty session report
        try:
            from timmy.sovereignty import generate_and_commit_report

            await generate_and_commit_report()
        except Exception as exc:
            logger.warning("Sovereignty report generation failed at shutdown: %s", exc)

        logger.info("✓ Graceful shutdown complete")