This commit was merged in pull request #1457.
This commit is contained in:
@@ -6,6 +6,7 @@ for the Mission Control dashboard.
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import time
|
import time
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
@@ -14,7 +15,7 @@ from pathlib import Path
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from fastapi import APIRouter, Request
|
from fastapi import APIRouter, Request
|
||||||
from fastapi.responses import HTMLResponse
|
from fastapi.responses import HTMLResponse, JSONResponse
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from config import APP_START_TIME as _START_TIME
|
from config import APP_START_TIME as _START_TIME
|
||||||
@@ -24,6 +25,47 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
router = APIRouter(tags=["health"])
|
router = APIRouter(tags=["health"])
|
||||||
|
|
||||||
|
# Shutdown state tracking for graceful shutdown
|
||||||
|
_shutdown_requested = False
|
||||||
|
_shutdown_reason: str | None = None
|
||||||
|
_shutdown_start_time: float | None = None
|
||||||
|
|
||||||
|
# Default graceful shutdown timeout (seconds)
|
||||||
|
GRACEFUL_SHUTDOWN_TIMEOUT = float(os.getenv("GRACEFUL_SHUTDOWN_TIMEOUT", "30"))
|
||||||
|
|
||||||
|
|
||||||
|
def request_shutdown(reason: str = "unknown") -> None:
|
||||||
|
"""Signal that a graceful shutdown has been requested.
|
||||||
|
|
||||||
|
This is called by signal handlers to inform health checks
|
||||||
|
that the service is shutting down.
|
||||||
|
"""
|
||||||
|
global _shutdown_requested, _shutdown_reason, _shutdown_start_time # noqa: PLW0603
|
||||||
|
_shutdown_requested = True
|
||||||
|
_shutdown_reason = reason
|
||||||
|
_shutdown_start_time = time.monotonic()
|
||||||
|
logger.info("Shutdown requested: %s", reason)
|
||||||
|
|
||||||
|
|
||||||
|
def is_shutting_down() -> bool:
|
||||||
|
"""Check if the service is in the process of shutting down."""
|
||||||
|
return _shutdown_requested
|
||||||
|
|
||||||
|
|
||||||
|
def get_shutdown_info() -> dict[str, Any] | None:
|
||||||
|
"""Get information about the shutdown state, if active."""
|
||||||
|
if not _shutdown_requested:
|
||||||
|
return None
|
||||||
|
elapsed = None
|
||||||
|
if _shutdown_start_time:
|
||||||
|
elapsed = time.monotonic() - _shutdown_start_time
|
||||||
|
return {
|
||||||
|
"requested": _shutdown_requested,
|
||||||
|
"reason": _shutdown_reason,
|
||||||
|
"elapsed_seconds": elapsed,
|
||||||
|
"timeout_seconds": GRACEFUL_SHUTDOWN_TIMEOUT,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class DependencyStatus(BaseModel):
|
class DependencyStatus(BaseModel):
|
||||||
"""Status of a single dependency."""
|
"""Status of a single dependency."""
|
||||||
@@ -52,6 +94,36 @@ class HealthStatus(BaseModel):
|
|||||||
uptime_seconds: float
|
uptime_seconds: float
|
||||||
|
|
||||||
|
|
||||||
|
class DetailedHealthStatus(BaseModel):
|
||||||
|
"""Detailed health status with all service checks."""
|
||||||
|
|
||||||
|
status: str # "healthy", "degraded", "unhealthy"
|
||||||
|
timestamp: str
|
||||||
|
version: str
|
||||||
|
uptime_seconds: float
|
||||||
|
services: dict[str, dict[str, Any]]
|
||||||
|
system: dict[str, Any]
|
||||||
|
shutdown: dict[str, Any] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class ReadinessStatus(BaseModel):
|
||||||
|
"""Readiness probe response."""
|
||||||
|
|
||||||
|
ready: bool
|
||||||
|
timestamp: str
|
||||||
|
checks: dict[str, bool]
|
||||||
|
reason: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class LivenessStatus(BaseModel):
|
||||||
|
"""Liveness probe response."""
|
||||||
|
|
||||||
|
alive: bool
|
||||||
|
timestamp: str
|
||||||
|
uptime_seconds: float
|
||||||
|
shutdown_requested: bool = False
|
||||||
|
|
||||||
|
|
||||||
# Simple uptime tracking
|
# Simple uptime tracking
|
||||||
|
|
||||||
# Ollama health cache (30-second TTL)
|
# Ollama health cache (30-second TTL)
|
||||||
@@ -326,3 +398,178 @@ async def health_snapshot():
|
|||||||
},
|
},
|
||||||
"tokens": {"status": "unknown", "message": "Snapshot failed"},
|
"tokens": {"status": "unknown", "message": "Snapshot failed"},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Production Health Check Endpoints (Readiness & Liveness Probes)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/health/detailed")
|
||||||
|
async def health_detailed() -> JSONResponse:
|
||||||
|
"""Comprehensive health check with all service statuses.
|
||||||
|
|
||||||
|
Returns 200 if healthy, 503 if degraded/unhealthy.
|
||||||
|
Includes shutdown state for graceful shutdown awareness.
|
||||||
|
"""
|
||||||
|
uptime = (datetime.now(UTC) - _START_TIME).total_seconds()
|
||||||
|
|
||||||
|
# Check all services in parallel
|
||||||
|
ollama_dep, sqlite_dep = await asyncio.gather(
|
||||||
|
_check_ollama(),
|
||||||
|
asyncio.to_thread(_check_sqlite),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build service status map
|
||||||
|
services = {
|
||||||
|
"ollama": {
|
||||||
|
"status": ollama_dep.status,
|
||||||
|
"healthy": ollama_dep.status == "healthy",
|
||||||
|
"details": ollama_dep.details,
|
||||||
|
},
|
||||||
|
"sqlite": {
|
||||||
|
"status": sqlite_dep.status,
|
||||||
|
"healthy": sqlite_dep.status == "healthy",
|
||||||
|
"details": sqlite_dep.details,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# Determine overall status
|
||||||
|
all_healthy = all(s["healthy"] for s in services.values())
|
||||||
|
any_unhealthy = any(s["status"] == "unavailable" for s in services.values())
|
||||||
|
|
||||||
|
if all_healthy:
|
||||||
|
status = "healthy"
|
||||||
|
status_code = 200
|
||||||
|
elif any_unhealthy:
|
||||||
|
status = "unhealthy"
|
||||||
|
status_code = 503
|
||||||
|
else:
|
||||||
|
status = "degraded"
|
||||||
|
status_code = 503
|
||||||
|
|
||||||
|
# Add shutdown state if shutting down
|
||||||
|
shutdown_info = get_shutdown_info()
|
||||||
|
|
||||||
|
# System info
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
try:
|
||||||
|
process = psutil.Process()
|
||||||
|
memory_info = process.memory_info()
|
||||||
|
system = {
|
||||||
|
"memory_mb": round(memory_info.rss / (1024 * 1024), 2),
|
||||||
|
"cpu_percent": process.cpu_percent(interval=0.1),
|
||||||
|
"threads": process.num_threads(),
|
||||||
|
}
|
||||||
|
except Exception as exc:
|
||||||
|
logger.debug("Could not get system info: %s", exc)
|
||||||
|
system = {"error": "unavailable"}
|
||||||
|
|
||||||
|
response_data = {
|
||||||
|
"status": status,
|
||||||
|
"timestamp": datetime.now(UTC).isoformat(),
|
||||||
|
"version": "2.0.0",
|
||||||
|
"uptime_seconds": uptime,
|
||||||
|
"services": services,
|
||||||
|
"system": system,
|
||||||
|
}
|
||||||
|
|
||||||
|
if shutdown_info:
|
||||||
|
response_data["shutdown"] = shutdown_info
|
||||||
|
# Force 503 if shutting down
|
||||||
|
status_code = 503
|
||||||
|
|
||||||
|
return JSONResponse(content=response_data, status_code=status_code)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/ready")
|
||||||
|
async def readiness_probe() -> JSONResponse:
|
||||||
|
"""Readiness probe for Kubernetes/Docker.
|
||||||
|
|
||||||
|
Returns 200 when the service is ready to receive traffic.
|
||||||
|
Returns 503 during startup or shutdown.
|
||||||
|
"""
|
||||||
|
uptime = (datetime.now(UTC) - _START_TIME).total_seconds()
|
||||||
|
|
||||||
|
# Minimum uptime before ready (allow startup to complete)
|
||||||
|
MIN_READY_UPTIME = 5.0
|
||||||
|
|
||||||
|
checks = {
|
||||||
|
"startup_complete": uptime >= MIN_READY_UPTIME,
|
||||||
|
"database": False,
|
||||||
|
"not_shutting_down": not is_shutting_down(),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Check database connectivity
|
||||||
|
try:
|
||||||
|
db_path = Path(settings.repo_root) / "data" / "timmy.db"
|
||||||
|
if db_path.exists():
|
||||||
|
with closing(sqlite3.connect(str(db_path))) as conn:
|
||||||
|
conn.execute("SELECT 1")
|
||||||
|
checks["database"] = True
|
||||||
|
except Exception as exc:
|
||||||
|
logger.debug("Readiness DB check failed: %s", exc)
|
||||||
|
|
||||||
|
ready = all(checks.values())
|
||||||
|
|
||||||
|
response_data = {
|
||||||
|
"ready": ready,
|
||||||
|
"timestamp": datetime.now(UTC).isoformat(),
|
||||||
|
"checks": checks,
|
||||||
|
}
|
||||||
|
|
||||||
|
if not ready and is_shutting_down():
|
||||||
|
response_data["reason"] = f"Service shutting down: {_shutdown_reason}"
|
||||||
|
|
||||||
|
status_code = 200 if ready else 503
|
||||||
|
return JSONResponse(content=response_data, status_code=status_code)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/live")
|
||||||
|
async def liveness_probe() -> JSONResponse:
|
||||||
|
"""Liveness probe for Kubernetes/Docker.
|
||||||
|
|
||||||
|
Returns 200 if the service is alive and functioning.
|
||||||
|
Returns 503 if the service is deadlocked or should be restarted.
|
||||||
|
"""
|
||||||
|
uptime = (datetime.now(UTC) - _START_TIME).total_seconds()
|
||||||
|
|
||||||
|
# Basic liveness: we respond, so we're alive
|
||||||
|
alive = True
|
||||||
|
|
||||||
|
# If shutting down and past timeout, report not alive to force restart
|
||||||
|
if is_shutting_down() and _shutdown_start_time:
|
||||||
|
elapsed = time.monotonic() - _shutdown_start_time
|
||||||
|
if elapsed > GRACEFUL_SHUTDOWN_TIMEOUT:
|
||||||
|
alive = False
|
||||||
|
logger.warning("Liveness probe failed: shutdown timeout exceeded")
|
||||||
|
|
||||||
|
response_data = {
|
||||||
|
"alive": alive,
|
||||||
|
"timestamp": datetime.now(UTC).isoformat(),
|
||||||
|
"uptime_seconds": uptime,
|
||||||
|
"shutdown_requested": is_shutting_down(),
|
||||||
|
}
|
||||||
|
|
||||||
|
status_code = 200 if alive else 503
|
||||||
|
return JSONResponse(content=response_data, status_code=status_code)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/health/shutdown", include_in_schema=False)
|
||||||
|
async def shutdown_status() -> JSONResponse:
|
||||||
|
"""Get shutdown status (internal/debug endpoint).
|
||||||
|
|
||||||
|
Returns shutdown state information for debugging graceful shutdown.
|
||||||
|
"""
|
||||||
|
shutdown_info = get_shutdown_info()
|
||||||
|
|
||||||
|
response_data = {
|
||||||
|
"shutting_down": is_shutting_down(),
|
||||||
|
"timestamp": datetime.now(UTC).isoformat(),
|
||||||
|
}
|
||||||
|
|
||||||
|
if shutdown_info:
|
||||||
|
response_data.update(shutdown_info)
|
||||||
|
|
||||||
|
return JSONResponse(content=response_data)
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
|
import signal
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -19,6 +20,9 @@ from dashboard.schedulers import (
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Global event to signal shutdown request
|
||||||
|
_shutdown_event = asyncio.Event()
|
||||||
|
|
||||||
|
|
||||||
def _startup_init() -> None:
|
def _startup_init() -> None:
|
||||||
"""Validate config and enable event persistence."""
|
"""Validate config and enable event persistence."""
|
||||||
@@ -131,6 +135,65 @@ def _startup_pruning() -> None:
|
|||||||
_check_vault_size()
|
_check_vault_size()
|
||||||
|
|
||||||
|
|
||||||
|
def _setup_signal_handlers() -> None:
|
||||||
|
"""Setup signal handlers for graceful shutdown.
|
||||||
|
|
||||||
|
Handles SIGTERM (Docker stop, Kubernetes delete) and SIGINT (Ctrl+C)
|
||||||
|
by setting the shutdown event and notifying health checks.
|
||||||
|
|
||||||
|
Note: Signal handlers can only be registered in the main thread.
|
||||||
|
In test environments (running in separate threads), this is skipped.
|
||||||
|
"""
|
||||||
|
import threading
|
||||||
|
|
||||||
|
# Signal handlers can only be set in the main thread
|
||||||
|
if threading.current_thread() is not threading.main_thread():
|
||||||
|
logger.debug("Skipping signal handler setup: not in main thread")
|
||||||
|
return
|
||||||
|
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
|
||||||
|
def _signal_handler(sig: signal.Signals) -> None:
|
||||||
|
sig_name = sig.name if hasattr(sig, "name") else str(sig)
|
||||||
|
logger.info("Received signal %s, initiating graceful shutdown...", sig_name)
|
||||||
|
|
||||||
|
# Notify health module about shutdown
|
||||||
|
try:
|
||||||
|
from dashboard.routes.health import request_shutdown
|
||||||
|
|
||||||
|
request_shutdown(reason=f"signal:{sig_name}")
|
||||||
|
except Exception as exc:
|
||||||
|
logger.debug("Failed to set shutdown state: %s", exc)
|
||||||
|
|
||||||
|
# Set the shutdown event to unblock lifespan
|
||||||
|
_shutdown_event.set()
|
||||||
|
|
||||||
|
# Register handlers for common shutdown signals
|
||||||
|
for sig in (signal.SIGTERM, signal.SIGINT):
|
||||||
|
try:
|
||||||
|
loop.add_signal_handler(sig, lambda s=sig: _signal_handler(s))
|
||||||
|
logger.debug("Registered handler for %s", sig.name if hasattr(sig, "name") else sig)
|
||||||
|
except (NotImplementedError, ValueError) as exc:
|
||||||
|
# Windows or non-main thread - signal handlers not available
|
||||||
|
logger.debug("Could not register signal handler for %s: %s", sig, exc)
|
||||||
|
|
||||||
|
|
||||||
|
async def _wait_for_shutdown(timeout: float | None = None) -> bool:
|
||||||
|
"""Wait for shutdown signal or timeout.
|
||||||
|
|
||||||
|
Returns True if shutdown was requested, False if timeout expired.
|
||||||
|
"""
|
||||||
|
if timeout:
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(_shutdown_event.wait(), timeout=timeout)
|
||||||
|
return True
|
||||||
|
except TimeoutError:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
await _shutdown_event.wait()
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
async def _shutdown_cleanup(
|
async def _shutdown_cleanup(
|
||||||
bg_tasks: list[asyncio.Task],
|
bg_tasks: list[asyncio.Task],
|
||||||
workshop_heartbeat,
|
workshop_heartbeat,
|
||||||
@@ -161,11 +224,25 @@ async def _shutdown_cleanup(
|
|||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
"""Application lifespan manager with non-blocking startup."""
|
"""Application lifespan manager with non-blocking startup and graceful shutdown.
|
||||||
|
|
||||||
|
Handles SIGTERM/SIGINT signals for graceful shutdown in container environments.
|
||||||
|
When a shutdown signal is received:
|
||||||
|
1. Health checks are notified (readiness returns 503)
|
||||||
|
2. Active requests are allowed to complete (with timeout)
|
||||||
|
3. Background tasks are cancelled
|
||||||
|
4. Cleanup operations run
|
||||||
|
"""
|
||||||
|
# Reset shutdown state for fresh start
|
||||||
|
_shutdown_event.clear()
|
||||||
|
|
||||||
_startup_init()
|
_startup_init()
|
||||||
bg_tasks = _startup_background_tasks()
|
bg_tasks = _startup_background_tasks()
|
||||||
_startup_pruning()
|
_startup_pruning()
|
||||||
|
|
||||||
|
# Setup signal handlers for graceful shutdown
|
||||||
|
_setup_signal_handlers()
|
||||||
|
|
||||||
# Start Workshop presence heartbeat with WS relay
|
# Start Workshop presence heartbeat with WS relay
|
||||||
from dashboard.routes.world import broadcast_world_state
|
from dashboard.routes.world import broadcast_world_state
|
||||||
from timmy.workshop_state import WorkshopHeartbeat
|
from timmy.workshop_state import WorkshopHeartbeat
|
||||||
@@ -191,15 +268,35 @@ async def lifespan(app: FastAPI):
|
|||||||
logger.debug("Failed to mark sovereignty session start")
|
logger.debug("Failed to mark sovereignty session start")
|
||||||
|
|
||||||
logger.info("✓ Dashboard ready for requests")
|
logger.info("✓ Dashboard ready for requests")
|
||||||
|
logger.info(" Graceful shutdown enabled (SIGTERM/SIGINT)")
|
||||||
|
|
||||||
yield
|
# Wait for shutdown signal or continue until cancelled
|
||||||
|
# The yield allows FastAPI to serve requests
|
||||||
await _shutdown_cleanup(bg_tasks, workshop_heartbeat)
|
|
||||||
|
|
||||||
# Generate and commit sovereignty session report
|
|
||||||
try:
|
try:
|
||||||
from timmy.sovereignty import generate_and_commit_report
|
yield
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
# FastAPI cancelled the lifespan (normal during shutdown)
|
||||||
|
logger.debug("Lifespan cancelled, beginning cleanup...")
|
||||||
|
finally:
|
||||||
|
# Cleanup phase - this runs during shutdown
|
||||||
|
logger.info("Beginning graceful shutdown...")
|
||||||
|
|
||||||
await generate_and_commit_report()
|
# Notify health checks that we're shutting down
|
||||||
except Exception as exc:
|
try:
|
||||||
logger.warning("Sovereignty report generation failed at shutdown: %s", exc)
|
from dashboard.routes.health import request_shutdown
|
||||||
|
|
||||||
|
request_shutdown(reason="lifespan_cleanup")
|
||||||
|
except Exception as exc:
|
||||||
|
logger.debug("Failed to set shutdown state: %s", exc)
|
||||||
|
|
||||||
|
await _shutdown_cleanup(bg_tasks, workshop_heartbeat)
|
||||||
|
|
||||||
|
# Generate and commit sovereignty session report
|
||||||
|
try:
|
||||||
|
from timmy.sovereignty import generate_and_commit_report
|
||||||
|
|
||||||
|
await generate_and_commit_report()
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("Sovereignty report generation failed at shutdown: %s", exc)
|
||||||
|
|
||||||
|
logger.info("✓ Graceful shutdown complete")
|
||||||
|
|||||||
@@ -15,13 +15,19 @@ import pytest
|
|||||||
|
|
||||||
from dashboard.routes.health import (
|
from dashboard.routes.health import (
|
||||||
DependencyStatus,
|
DependencyStatus,
|
||||||
|
DetailedHealthStatus,
|
||||||
HealthStatus,
|
HealthStatus,
|
||||||
|
LivenessStatus,
|
||||||
|
ReadinessStatus,
|
||||||
SovereigntyReport,
|
SovereigntyReport,
|
||||||
_calculate_overall_score,
|
_calculate_overall_score,
|
||||||
_check_lightning,
|
_check_lightning,
|
||||||
_check_ollama_sync,
|
_check_ollama_sync,
|
||||||
_check_sqlite,
|
_check_sqlite,
|
||||||
_generate_recommendations,
|
_generate_recommendations,
|
||||||
|
get_shutdown_info,
|
||||||
|
is_shutting_down,
|
||||||
|
request_shutdown,
|
||||||
)
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -497,3 +503,283 @@ class TestSnapshotEndpoint:
|
|||||||
data = client.get("/health/snapshot").json()
|
data = client.get("/health/snapshot").json()
|
||||||
|
|
||||||
assert data["overall_status"] == "unknown"
|
assert data["overall_status"] == "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Shutdown State Tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestShutdownState:
|
||||||
|
"""Tests for shutdown state tracking."""
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def _reset_shutdown_state(self):
|
||||||
|
"""Reset shutdown state before each test."""
|
||||||
|
import dashboard.routes.health as mod
|
||||||
|
|
||||||
|
mod._shutdown_requested = False
|
||||||
|
mod._shutdown_reason = None
|
||||||
|
mod._shutdown_start_time = None
|
||||||
|
yield
|
||||||
|
mod._shutdown_requested = False
|
||||||
|
mod._shutdown_reason = None
|
||||||
|
mod._shutdown_start_time = None
|
||||||
|
|
||||||
|
def test_is_shutting_down_initial(self):
|
||||||
|
assert is_shutting_down() is False
|
||||||
|
|
||||||
|
def test_request_shutdown_sets_state(self):
|
||||||
|
request_shutdown(reason="test")
|
||||||
|
assert is_shutting_down() is True
|
||||||
|
|
||||||
|
def test_get_shutdown_info_when_not_shutting_down(self):
|
||||||
|
info = get_shutdown_info()
|
||||||
|
assert info is None
|
||||||
|
|
||||||
|
def test_get_shutdown_info_when_shutting_down(self):
|
||||||
|
request_shutdown(reason="test_reason")
|
||||||
|
info = get_shutdown_info()
|
||||||
|
assert info is not None
|
||||||
|
assert info["requested"] is True
|
||||||
|
assert info["reason"] == "test_reason"
|
||||||
|
assert "elapsed_seconds" in info
|
||||||
|
assert "timeout_seconds" in info
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Detailed Health Endpoint Tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestDetailedHealthEndpoint:
|
||||||
|
"""Tests for GET /health/detailed."""
|
||||||
|
|
||||||
|
def test_returns_200_when_healthy(self, client):
|
||||||
|
with patch(
|
||||||
|
"dashboard.routes.health._check_ollama_sync",
|
||||||
|
return_value=DependencyStatus(
|
||||||
|
name="Ollama AI", status="healthy", sovereignty_score=10, details={}
|
||||||
|
),
|
||||||
|
):
|
||||||
|
response = client.get("/health/detailed")
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
assert data["status"] in ["healthy", "degraded", "unhealthy"]
|
||||||
|
assert "timestamp" in data
|
||||||
|
assert "version" in data
|
||||||
|
assert "uptime_seconds" in data
|
||||||
|
assert "services" in data
|
||||||
|
assert "system" in data
|
||||||
|
|
||||||
|
def test_returns_503_when_service_unhealthy(self, client):
|
||||||
|
with patch(
|
||||||
|
"dashboard.routes.health._check_ollama_sync",
|
||||||
|
return_value=DependencyStatus(
|
||||||
|
name="Ollama AI",
|
||||||
|
status="unavailable",
|
||||||
|
sovereignty_score=10,
|
||||||
|
details={"error": "down"},
|
||||||
|
),
|
||||||
|
):
|
||||||
|
response = client.get("/health/detailed")
|
||||||
|
|
||||||
|
assert response.status_code == 503
|
||||||
|
data = response.json()
|
||||||
|
assert data["status"] == "unhealthy"
|
||||||
|
|
||||||
|
def test_includes_shutdown_info_when_shutting_down(self, client):
|
||||||
|
with patch(
|
||||||
|
"dashboard.routes.health._check_ollama_sync",
|
||||||
|
return_value=DependencyStatus(
|
||||||
|
name="Ollama AI", status="healthy", sovereignty_score=10, details={}
|
||||||
|
),
|
||||||
|
):
|
||||||
|
with patch("dashboard.routes.health.is_shutting_down", return_value=True):
|
||||||
|
with patch(
|
||||||
|
"dashboard.routes.health.get_shutdown_info",
|
||||||
|
return_value={
|
||||||
|
"requested": True,
|
||||||
|
"reason": "test",
|
||||||
|
"elapsed_seconds": 1.5,
|
||||||
|
"timeout_seconds": 30.0,
|
||||||
|
},
|
||||||
|
):
|
||||||
|
response = client.get("/health/detailed")
|
||||||
|
|
||||||
|
assert response.status_code == 503
|
||||||
|
data = response.json()
|
||||||
|
assert "shutdown" in data
|
||||||
|
assert data["shutdown"]["requested"] is True
|
||||||
|
|
||||||
|
def test_services_structure(self, client):
|
||||||
|
with patch(
|
||||||
|
"dashboard.routes.health._check_ollama_sync",
|
||||||
|
return_value=DependencyStatus(
|
||||||
|
name="Ollama AI", status="healthy", sovereignty_score=10, details={"model": "test"}
|
||||||
|
),
|
||||||
|
):
|
||||||
|
response = client.get("/health/detailed")
|
||||||
|
|
||||||
|
data = response.json()
|
||||||
|
assert "services" in data
|
||||||
|
assert "ollama" in data["services"]
|
||||||
|
assert "sqlite" in data["services"]
|
||||||
|
# Each service should have status, healthy flag, and details
|
||||||
|
for _svc_name, svc_data in data["services"].items():
|
||||||
|
assert "status" in svc_data
|
||||||
|
assert "healthy" in svc_data
|
||||||
|
assert isinstance(svc_data["healthy"], bool)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Readiness Probe Tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestReadinessProbe:
|
||||||
|
"""Tests for GET /ready."""
|
||||||
|
|
||||||
|
def test_returns_200_when_ready(self, client):
|
||||||
|
# Wait for startup to complete
|
||||||
|
response = client.get("/ready")
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
# Should return either 200 (ready) or 503 (not ready)
|
||||||
|
assert response.status_code in [200, 503]
|
||||||
|
assert "ready" in data
|
||||||
|
assert isinstance(data["ready"], bool)
|
||||||
|
assert "timestamp" in data
|
||||||
|
assert "checks" in data
|
||||||
|
|
||||||
|
def test_checks_structure(self, client):
|
||||||
|
response = client.get("/ready")
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
assert "checks" in data
|
||||||
|
checks = data["checks"]
|
||||||
|
# Core checks that should be present
|
||||||
|
assert "startup_complete" in checks
|
||||||
|
assert "database" in checks
|
||||||
|
assert "not_shutting_down" in checks
|
||||||
|
|
||||||
|
def test_not_ready_during_shutdown(self, client):
|
||||||
|
with patch("dashboard.routes.health.is_shutting_down", return_value=True):
|
||||||
|
with patch(
|
||||||
|
"dashboard.routes.health._shutdown_reason",
|
||||||
|
"test shutdown",
|
||||||
|
):
|
||||||
|
response = client.get("/ready")
|
||||||
|
|
||||||
|
assert response.status_code == 503
|
||||||
|
data = response.json()
|
||||||
|
assert data["ready"] is False
|
||||||
|
assert data["checks"]["not_shutting_down"] is False
|
||||||
|
assert "reason" in data
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Liveness Probe Tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestLivenessProbe:
|
||||||
|
"""Tests for GET /live."""
|
||||||
|
|
||||||
|
def test_returns_200_when_alive(self, client):
|
||||||
|
response = client.get("/live")
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
assert data["alive"] is True
|
||||||
|
assert "timestamp" in data
|
||||||
|
assert "uptime_seconds" in data
|
||||||
|
assert "shutdown_requested" in data
|
||||||
|
|
||||||
|
def test_shutdown_requested_field(self, client):
|
||||||
|
with patch("dashboard.routes.health.is_shutting_down", return_value=False):
|
||||||
|
response = client.get("/live")
|
||||||
|
|
||||||
|
data = response.json()
|
||||||
|
assert data["shutdown_requested"] is False
|
||||||
|
|
||||||
|
def test_alive_false_after_shutdown_timeout(self, client):
|
||||||
|
import dashboard.routes.health as mod
|
||||||
|
|
||||||
|
with patch.object(mod, "_shutdown_requested", True):
|
||||||
|
with patch.object(mod, "_shutdown_start_time", time.monotonic() - 999):
|
||||||
|
with patch.object(mod, "GRACEFUL_SHUTDOWN_TIMEOUT", 30.0):
|
||||||
|
response = client.get("/live")
|
||||||
|
|
||||||
|
assert response.status_code == 503
|
||||||
|
data = response.json()
|
||||||
|
assert data["alive"] is False
|
||||||
|
assert data["shutdown_requested"] is True
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# New Pydantic Model Tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestDetailedHealthStatusModel:
|
||||||
|
"""Validate DetailedHealthStatus model."""
|
||||||
|
|
||||||
|
def test_fields(self):
|
||||||
|
hs = DetailedHealthStatus(
|
||||||
|
status="healthy",
|
||||||
|
timestamp="2026-01-01T00:00:00+00:00",
|
||||||
|
version="2.0.0",
|
||||||
|
uptime_seconds=42.5,
|
||||||
|
services={"db": {"status": "up", "healthy": True, "details": {}}},
|
||||||
|
system={"memory_mb": 100.5},
|
||||||
|
)
|
||||||
|
assert hs.status == "healthy"
|
||||||
|
assert hs.services["db"]["healthy"] is True
|
||||||
|
|
||||||
|
|
||||||
|
class TestReadinessStatusModel:
|
||||||
|
"""Validate ReadinessStatus model."""
|
||||||
|
|
||||||
|
def test_fields(self):
|
||||||
|
rs = ReadinessStatus(
|
||||||
|
ready=True,
|
||||||
|
timestamp="2026-01-01T00:00:00+00:00",
|
||||||
|
checks={"db": True, "cache": True},
|
||||||
|
)
|
||||||
|
assert rs.ready is True
|
||||||
|
assert rs.checks["db"] is True
|
||||||
|
|
||||||
|
def test_with_reason(self):
|
||||||
|
rs = ReadinessStatus(
|
||||||
|
ready=False,
|
||||||
|
timestamp="2026-01-01T00:00:00+00:00",
|
||||||
|
checks={"db": False},
|
||||||
|
reason="Database unavailable",
|
||||||
|
)
|
||||||
|
assert rs.ready is False
|
||||||
|
assert rs.reason == "Database unavailable"
|
||||||
|
|
||||||
|
|
||||||
|
class TestLivenessStatusModel:
|
||||||
|
"""Validate LivenessStatus model."""
|
||||||
|
|
||||||
|
def test_fields(self):
|
||||||
|
ls = LivenessStatus(
|
||||||
|
alive=True,
|
||||||
|
timestamp="2026-01-01T00:00:00+00:00",
|
||||||
|
uptime_seconds=3600.0,
|
||||||
|
shutdown_requested=False,
|
||||||
|
)
|
||||||
|
assert ls.alive is True
|
||||||
|
assert ls.uptime_seconds == 3600.0
|
||||||
|
assert ls.shutdown_requested is False
|
||||||
|
|
||||||
|
def test_defaults(self):
|
||||||
|
ls = LivenessStatus(
|
||||||
|
alive=True,
|
||||||
|
timestamp="2026-01-01T00:00:00+00:00",
|
||||||
|
uptime_seconds=0.0,
|
||||||
|
)
|
||||||
|
assert ls.shutdown_requested is False
|
||||||
|
|||||||
Reference in New Issue
Block a user