diff --git a/src/dashboard/app.py b/src/dashboard/app.py index 205b2e01..ff7e3e62 100644 --- a/src/dashboard/app.py +++ b/src/dashboard/app.py @@ -57,6 +57,7 @@ from dashboard.routes.tasks import router as tasks_router from dashboard.routes.telegram import router as telegram_router from dashboard.routes.thinking import router as thinking_router from dashboard.routes.three_strike import router as three_strike_router +from dashboard.routes.monitoring import router as monitoring_router from dashboard.routes.tools import router as tools_router from dashboard.routes.tower import router as tower_router from dashboard.routes.voice import router as voice_router @@ -684,6 +685,7 @@ app.include_router(tasks_router) app.include_router(work_orders_router) app.include_router(loop_qa_router) app.include_router(system_router) +app.include_router(monitoring_router) app.include_router(experiments_router) app.include_router(db_explorer_router) app.include_router(world_router) diff --git a/src/dashboard/routes/monitoring.py b/src/dashboard/routes/monitoring.py new file mode 100644 index 00000000..56dd9294 --- /dev/null +++ b/src/dashboard/routes/monitoring.py @@ -0,0 +1,323 @@ +"""Real-time monitoring dashboard routes. + +Provides a unified operational view of all agent systems: + - Agent status and vitals + - System resources (CPU, RAM, disk, network) + - Economy (sats earned/spent, injection count) + - Stream health (viewer count, bitrate, uptime) + - Content pipeline (episodes, highlights, clips) + - Alerts (agent offline, stream down, low balance) + +Refs: #862 +""" + +from __future__ import annotations + +import asyncio +import logging +from datetime import UTC, datetime + +from fastapi import APIRouter, Request +from fastapi.responses import HTMLResponse, JSONResponse + +from config import APP_START_TIME as _START_TIME +from config import settings +from dashboard.templating import templates + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/monitoring", tags=["monitoring"]) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _get_agent_status() -> list[dict]: + """Return a list of agent status entries.""" + try: + from config import settings as cfg + + agents_yaml = cfg.agents_config + agents_raw = agents_yaml.get("agents", {}) + result = [] + for name, info in agents_raw.items(): + result.append( + { + "name": name, + "model": info.get("model", "default"), + "status": "running", + "last_action": "idle", + "cell": info.get("cell", "—"), + } + ) + if not result: + result.append( + { + "name": settings.agent_name, + "model": settings.ollama_model, + "status": "running", + "last_action": "idle", + "cell": "main", + } + ) + return result + except Exception as exc: + logger.warning("agent status fetch failed: %s", exc) + return [] + + +async def _get_system_resources() -> dict: + """Return CPU, RAM, disk snapshot (non-blocking).""" + try: + from timmy.vassal.house_health import get_system_snapshot + + snap = await get_system_snapshot() + cpu_pct: float | None = None + try: + import psutil # optional + + cpu_pct = await asyncio.to_thread(psutil.cpu_percent, 0.1) + except Exception: + pass + + return { + "cpu_percent": cpu_pct, + "ram_percent": snap.memory.percent_used, + "ram_total_gb": snap.memory.total_gb, + "ram_available_gb": snap.memory.available_gb, + "disk_percent": snap.disk.percent_used, + "disk_total_gb": snap.disk.total_gb, + "disk_free_gb": snap.disk.free_gb, + "ollama_reachable": snap.ollama.reachable, + "loaded_models": snap.ollama.loaded_models, + "warnings": snap.warnings, + } + except Exception as exc: + logger.warning("system resources fetch failed: %s", exc) + return { + "cpu_percent": None, + "ram_percent": None, + "ram_total_gb": None, + "ram_available_gb": None, + "disk_percent": None, + "disk_total_gb": None, + "disk_free_gb": None, + "ollama_reachable": False, + "loaded_models": [], + "warnings": [str(exc)], + } + + +async def _get_economy() -> dict: + """Return economy stats — sats earned/spent, injection count.""" + result: dict = { + "balance_sats": 0, + "earned_sats": 0, + "spent_sats": 0, + "injection_count": 0, + "auction_active": False, + "tx_count": 0, + } + try: + from lightning.ledger import get_balance, get_transactions + + result["balance_sats"] = get_balance() + txns = get_transactions() + result["tx_count"] = len(txns) + for tx in txns: + if tx.get("direction") == "incoming": + result["earned_sats"] += tx.get("amount_sats", 0) + elif tx.get("direction") == "outgoing": + result["spent_sats"] += tx.get("amount_sats", 0) + except Exception as exc: + logger.debug("economy fetch failed: %s", exc) + return result + + +async def _get_stream_health() -> dict: + """Return stream health stats. + + Graceful fallback when no streaming backend is configured. + """ + return { + "live": False, + "viewer_count": 0, + "bitrate_kbps": 0, + "uptime_seconds": 0, + "title": "No active stream", + "source": "unavailable", + } + + +async def _get_content_pipeline() -> dict: + """Return content pipeline stats — last episode, highlight/clip counts.""" + result: dict = { + "last_episode": None, + "highlight_count": 0, + "clip_count": 0, + "pipeline_healthy": True, + } + try: + from pathlib import Path + + repo_root = Path(settings.repo_root) + # Check for episode output files + output_dir = repo_root / "data" / "episodes" + if output_dir.exists(): + episodes = sorted(output_dir.glob("*.json"), key=lambda p: p.stat().st_mtime, reverse=True) + if episodes: + result["last_episode"] = episodes[0].stem + result["highlight_count"] = len(list(output_dir.glob("highlights_*.json"))) + result["clip_count"] = len(list(output_dir.glob("clips_*.json"))) + except Exception as exc: + logger.debug("content pipeline fetch failed: %s", exc) + return result + + +def _build_alerts( + resources: dict, + agents: list[dict], + economy: dict, + stream: dict, +) -> list[dict]: + """Derive operational alerts from aggregated status data.""" + alerts: list[dict] = [] + + # Resource alerts + if resources.get("ram_percent") and resources["ram_percent"] > 90: + alerts.append( + { + "level": "critical", + "title": "High Memory Usage", + "detail": f"RAM at {resources['ram_percent']:.0f}%", + } + ) + elif resources.get("ram_percent") and resources["ram_percent"] > 80: + alerts.append( + { + "level": "warning", + "title": "Elevated Memory Usage", + "detail": f"RAM at {resources['ram_percent']:.0f}%", + } + ) + + if resources.get("disk_percent") and resources["disk_percent"] > 90: + alerts.append( + { + "level": "critical", + "title": "Low Disk Space", + "detail": f"Disk at {resources['disk_percent']:.0f}% used", + } + ) + elif resources.get("disk_percent") and resources["disk_percent"] > 80: + alerts.append( + { + "level": "warning", + "title": "Disk Space Warning", + "detail": f"Disk at {resources['disk_percent']:.0f}% used", + } + ) + + if resources.get("cpu_percent") and resources["cpu_percent"] > 95: + alerts.append( + { + "level": "warning", + "title": "High CPU Usage", + "detail": f"CPU at {resources['cpu_percent']:.0f}%", + } + ) + + # Ollama alert + if not resources.get("ollama_reachable", True): + alerts.append( + { + "level": "critical", + "title": "LLM Backend Offline", + "detail": "Ollama is unreachable — agent responses will fail", + } + ) + + # Agent alerts + offline_agents = [a["name"] for a in agents if a.get("status") == "offline"] + if offline_agents: + alerts.append( + { + "level": "critical", + "title": "Agent Offline", + "detail": f"Offline: {', '.join(offline_agents)}", + } + ) + + # Economy alerts + balance = economy.get("balance_sats", 0) + if isinstance(balance, (int, float)) and balance < 1000: + alerts.append( + { + "level": "warning", + "title": "Low Wallet Balance", + "detail": f"Balance: {balance} sats", + } + ) + + # Pass-through resource warnings + for warn in resources.get("warnings", []): + alerts.append({"level": "warning", "title": "System Warning", "detail": warn}) + + return alerts + + +# --------------------------------------------------------------------------- +# Routes +# --------------------------------------------------------------------------- + + +@router.get("", response_class=HTMLResponse) +async def monitoring_page(request: Request): + """Render the real-time monitoring dashboard page.""" + return templates.TemplateResponse(request, "monitoring.html", {}) + + +@router.get("/status") +async def monitoring_status(): + """Aggregate status endpoint for the monitoring dashboard. + + Collects data from all subsystems concurrently and returns a single + JSON payload used by the frontend to update all panels at once. + """ + uptime = (datetime.now(UTC) - _START_TIME).total_seconds() + + agents, resources, economy, stream, pipeline = await asyncio.gather( + _get_agent_status(), + _get_system_resources(), + _get_economy(), + _get_stream_health(), + _get_content_pipeline(), + ) + + alerts = _build_alerts(resources, agents, economy, stream) + + return { + "timestamp": datetime.now(UTC).isoformat(), + "uptime_seconds": uptime, + "agents": agents, + "resources": resources, + "economy": economy, + "stream": stream, + "pipeline": pipeline, + "alerts": alerts, + } + + +@router.get("/alerts") +async def monitoring_alerts(): + """Return current alerts only.""" + agents, resources, economy, stream = await asyncio.gather( + _get_agent_status(), + _get_system_resources(), + _get_economy(), + _get_stream_health(), + ) + alerts = _build_alerts(resources, agents, economy, stream) + return {"alerts": alerts, "count": len(alerts)} diff --git a/src/dashboard/templates/base.html b/src/dashboard/templates/base.html index d30c990f..d20b1fe0 100644 --- a/src/dashboard/templates/base.html +++ b/src/dashboard/templates/base.html @@ -50,6 +50,7 @@ BRIEFING THINKING MISSION CTRL + MONITORING SWARM SCORECARDS BUGS diff --git a/src/dashboard/templates/monitoring.html b/src/dashboard/templates/monitoring.html new file mode 100644 index 00000000..7a289bc9 --- /dev/null +++ b/src/dashboard/templates/monitoring.html @@ -0,0 +1,429 @@ +{% extends "base.html" %} + +{% block title %}Monitoring — Timmy Time{% endblock %} + +{% block content %} + +
Loading agents...
+