Timmy-time-dashboard/src/dashboard/routes/monitoring.py

"""Real-time monitoring dashboard routes.

Provides a unified operational view of all agent systems:
  - Agent status and vitals
  - System resources (CPU, RAM, disk, network)
  - Economy (sats earned/spent, injection count)
  - Stream health (viewer count, bitrate, uptime)
  - Content pipeline (episodes, highlights, clips)
  - Alerts (agent offline, stream down, low balance)

Refs: #862
"""

from __future__ import annotations

import asyncio
import logging
from datetime import UTC, datetime

from fastapi import APIRouter, Request
from fastapi.responses import HTMLResponse, JSONResponse

from config import APP_START_TIME as _START_TIME
from config import settings
from dashboard.templating import templates

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/monitoring", tags=["monitoring"])


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


async def _get_agent_status() -> list[dict]:
    """Return a list of agent status entries."""
    try:
        from config import settings as cfg

        agents_yaml = cfg.agents_config
        agents_raw = agents_yaml.get("agents", {})
        result = []
        for name, info in agents_raw.items():
            result.append(
                {
                    "name": name,
                    "model": info.get("model", "default"),
                    "status": "running",
                    "last_action": "idle",
                    "cell": info.get("cell", "—"),
                }
            )
        if not result:
            result.append(
                {
                    "name": settings.agent_name,
                    "model": settings.ollama_model,
                    "status": "running",
                    "last_action": "idle",
                    "cell": "main",
                }
            )
        return result
    except Exception as exc:
        logger.warning("agent status fetch failed: %s", exc)
        return []


async def _get_system_resources() -> dict:
    """Return CPU, RAM, disk snapshot (non-blocking)."""
    try:
        from timmy.vassal.house_health import get_system_snapshot

        snap = await get_system_snapshot()
        cpu_pct: float | None = None
        try:
            import psutil  # optional

            cpu_pct = await asyncio.to_thread(psutil.cpu_percent, 0.1)
        except Exception:
            pass

        return {
            "cpu_percent": cpu_pct,
            "ram_percent": snap.memory.percent_used,
            "ram_total_gb": snap.memory.total_gb,
            "ram_available_gb": snap.memory.available_gb,
            "disk_percent": snap.disk.percent_used,
            "disk_total_gb": snap.disk.total_gb,
            "disk_free_gb": snap.disk.free_gb,
            "ollama_reachable": snap.ollama.reachable,
            "loaded_models": snap.ollama.loaded_models,
            "warnings": snap.warnings,
        }
    except Exception as exc:
        logger.warning("system resources fetch failed: %s", exc)
        return {
            "cpu_percent": None,
            "ram_percent": None,
            "ram_total_gb": None,
            "ram_available_gb": None,
            "disk_percent": None,
            "disk_total_gb": None,
            "disk_free_gb": None,
            "ollama_reachable": False,
            "loaded_models": [],
            "warnings": [str(exc)],
        }


async def _get_economy() -> dict:
    """Return economy stats — sats earned/spent, injection count."""
    result: dict = {
        "balance_sats": 0,
        "earned_sats": 0,
        "spent_sats": 0,
        "injection_count": 0,
        "auction_active": False,
        "tx_count": 0,
    }
    try:
        from lightning.ledger import get_balance, get_transactions

        result["balance_sats"] = get_balance()
        txns = get_transactions()
        result["tx_count"] = len(txns)
        for tx in txns:
            if tx.get("direction") == "incoming":
                result["earned_sats"] += tx.get("amount_sats", 0)
            elif tx.get("direction") == "outgoing":
                result["spent_sats"] += tx.get("amount_sats", 0)
    except Exception as exc:
        logger.debug("economy fetch failed: %s", exc)
    return result


async def _get_stream_health() -> dict:
    """Return stream health stats.

    Graceful fallback when no streaming backend is configured.
    """
    return {
        "live": False,
        "viewer_count": 0,
        "bitrate_kbps": 0,
        "uptime_seconds": 0,
        "title": "No active stream",
        "source": "unavailable",
    }


async def _get_content_pipeline() -> dict:
    """Return content pipeline stats — last episode, highlight/clip counts."""
    result: dict = {
        "last_episode": None,
        "highlight_count": 0,
        "clip_count": 0,
        "pipeline_healthy": True,
    }
    try:
        from pathlib import Path

        repo_root = Path(settings.repo_root)
        # Check for episode output files
        output_dir = repo_root / "data" / "episodes"
        if output_dir.exists():
            episodes = sorted(output_dir.glob("*.json"), key=lambda p: p.stat().st_mtime, reverse=True)
            if episodes:
                result["last_episode"] = episodes[0].stem
                result["highlight_count"] = len(list(output_dir.glob("highlights_*.json")))
                result["clip_count"] = len(list(output_dir.glob("clips_*.json")))
    except Exception as exc:
        logger.debug("content pipeline fetch failed: %s", exc)
    return result


def _build_alerts(
    resources: dict,
    agents: list[dict],
    economy: dict,
    stream: dict,
) -> list[dict]:
    """Derive operational alerts from aggregated status data."""
    alerts: list[dict] = []

    # Resource alerts
    if resources.get("ram_percent") and resources["ram_percent"] > 90:
        alerts.append(
            {
                "level": "critical",
                "title": "High Memory Usage",
                "detail": f"RAM at {resources['ram_percent']:.0f}%",
            }
        )
    elif resources.get("ram_percent") and resources["ram_percent"] > 80:
        alerts.append(
            {
                "level": "warning",
                "title": "Elevated Memory Usage",
                "detail": f"RAM at {resources['ram_percent']:.0f}%",
            }
        )

    if resources.get("disk_percent") and resources["disk_percent"] > 90:
        alerts.append(
            {
                "level": "critical",
                "title": "Low Disk Space",
                "detail": f"Disk at {resources['disk_percent']:.0f}% used",
            }
        )
    elif resources.get("disk_percent") and resources["disk_percent"] > 80:
        alerts.append(
            {
                "level": "warning",
                "title": "Disk Space Warning",
                "detail": f"Disk at {resources['disk_percent']:.0f}% used",
            }
        )

    if resources.get("cpu_percent") and resources["cpu_percent"] > 95:
        alerts.append(
            {
                "level": "warning",
                "title": "High CPU Usage",
                "detail": f"CPU at {resources['cpu_percent']:.0f}%",
            }
        )

    # Ollama alert
    if not resources.get("ollama_reachable", True):
        alerts.append(
            {
                "level": "critical",
                "title": "LLM Backend Offline",
                "detail": "Ollama is unreachable — agent responses will fail",
            }
        )

    # Agent alerts
    offline_agents = [a["name"] for a in agents if a.get("status") == "offline"]
    if offline_agents:
        alerts.append(
            {
                "level": "critical",
                "title": "Agent Offline",
                "detail": f"Offline: {', '.join(offline_agents)}",
            }
        )

    # Economy alerts
    balance = economy.get("balance_sats", 0)
    if isinstance(balance, (int, float)) and balance < 1000:
        alerts.append(
            {
                "level": "warning",
                "title": "Low Wallet Balance",
                "detail": f"Balance: {balance} sats",
            }
        )

    # Pass-through resource warnings
    for warn in resources.get("warnings", []):
        alerts.append({"level": "warning", "title": "System Warning", "detail": warn})

    return alerts


# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------


@router.get("", response_class=HTMLResponse)
async def monitoring_page(request: Request):
    """Render the real-time monitoring dashboard page."""
    return templates.TemplateResponse(request, "monitoring.html", {})


@router.get("/status")
async def monitoring_status():
    """Aggregate status endpoint for the monitoring dashboard.

    Collects data from all subsystems concurrently and returns a single
    JSON payload used by the frontend to update all panels at once.
    """
    uptime = (datetime.now(UTC) - _START_TIME).total_seconds()

    agents, resources, economy, stream, pipeline = await asyncio.gather(
        _get_agent_status(),
        _get_system_resources(),
        _get_economy(),
        _get_stream_health(),
        _get_content_pipeline(),
    )

    alerts = _build_alerts(resources, agents, economy, stream)

    return {
        "timestamp": datetime.now(UTC).isoformat(),
        "uptime_seconds": uptime,
        "agents": agents,
        "resources": resources,
        "economy": economy,
        "stream": stream,
        "pipeline": pipeline,
        "alerts": alerts,
    }


@router.get("/alerts")
async def monitoring_alerts():
    """Return current alerts only."""
    agents, resources, economy, stream = await asyncio.gather(
        _get_agent_status(),
        _get_system_resources(),
        _get_economy(),
        _get_stream_health(),
    )
    alerts = _build_alerts(resources, agents, economy, stream)
    return {"alerts": alerts, "count": len(alerts)}