diff --git a/src/dashboard/app.py b/src/dashboard/app.py index 205b2e01..ff7e3e62 100644 --- a/src/dashboard/app.py +++ b/src/dashboard/app.py @@ -57,6 +57,7 @@ from dashboard.routes.tasks import router as tasks_router from dashboard.routes.telegram import router as telegram_router from dashboard.routes.thinking import router as thinking_router from dashboard.routes.three_strike import router as three_strike_router +from dashboard.routes.monitoring import router as monitoring_router from dashboard.routes.tools import router as tools_router from dashboard.routes.tower import router as tower_router from dashboard.routes.voice import router as voice_router @@ -684,6 +685,7 @@ app.include_router(tasks_router) app.include_router(work_orders_router) app.include_router(loop_qa_router) app.include_router(system_router) +app.include_router(monitoring_router) app.include_router(experiments_router) app.include_router(db_explorer_router) app.include_router(world_router) diff --git a/src/dashboard/routes/monitoring.py b/src/dashboard/routes/monitoring.py new file mode 100644 index 00000000..56dd9294 --- /dev/null +++ b/src/dashboard/routes/monitoring.py @@ -0,0 +1,323 @@ +"""Real-time monitoring dashboard routes. + +Provides a unified operational view of all agent systems: + - Agent status and vitals + - System resources (CPU, RAM, disk, network) + - Economy (sats earned/spent, injection count) + - Stream health (viewer count, bitrate, uptime) + - Content pipeline (episodes, highlights, clips) + - Alerts (agent offline, stream down, low balance) + +Refs: #862 +""" + +from __future__ import annotations + +import asyncio +import logging +from datetime import UTC, datetime + +from fastapi import APIRouter, Request +from fastapi.responses import HTMLResponse, JSONResponse + +from config import APP_START_TIME as _START_TIME +from config import settings +from dashboard.templating import templates + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/monitoring", tags=["monitoring"]) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _get_agent_status() -> list[dict]: + """Return a list of agent status entries.""" + try: + from config import settings as cfg + + agents_yaml = cfg.agents_config + agents_raw = agents_yaml.get("agents", {}) + result = [] + for name, info in agents_raw.items(): + result.append( + { + "name": name, + "model": info.get("model", "default"), + "status": "running", + "last_action": "idle", + "cell": info.get("cell", "—"), + } + ) + if not result: + result.append( + { + "name": settings.agent_name, + "model": settings.ollama_model, + "status": "running", + "last_action": "idle", + "cell": "main", + } + ) + return result + except Exception as exc: + logger.warning("agent status fetch failed: %s", exc) + return [] + + +async def _get_system_resources() -> dict: + """Return CPU, RAM, disk snapshot (non-blocking).""" + try: + from timmy.vassal.house_health import get_system_snapshot + + snap = await get_system_snapshot() + cpu_pct: float | None = None + try: + import psutil # optional + + cpu_pct = await asyncio.to_thread(psutil.cpu_percent, 0.1) + except Exception: + pass + + return { + "cpu_percent": cpu_pct, + "ram_percent": snap.memory.percent_used, + "ram_total_gb": snap.memory.total_gb, + "ram_available_gb": snap.memory.available_gb, + "disk_percent": snap.disk.percent_used, + "disk_total_gb": snap.disk.total_gb, + "disk_free_gb": snap.disk.free_gb, + "ollama_reachable": snap.ollama.reachable, + "loaded_models": snap.ollama.loaded_models, + "warnings": snap.warnings, + } + except Exception as exc: + logger.warning("system resources fetch failed: %s", exc) + return { + "cpu_percent": None, + "ram_percent": None, + "ram_total_gb": None, + "ram_available_gb": None, + "disk_percent": None, + "disk_total_gb": None, + "disk_free_gb": None, + "ollama_reachable": False, + "loaded_models": [], + "warnings": [str(exc)], + } + + +async def _get_economy() -> dict: + """Return economy stats — sats earned/spent, injection count.""" + result: dict = { + "balance_sats": 0, + "earned_sats": 0, + "spent_sats": 0, + "injection_count": 0, + "auction_active": False, + "tx_count": 0, + } + try: + from lightning.ledger import get_balance, get_transactions + + result["balance_sats"] = get_balance() + txns = get_transactions() + result["tx_count"] = len(txns) + for tx in txns: + if tx.get("direction") == "incoming": + result["earned_sats"] += tx.get("amount_sats", 0) + elif tx.get("direction") == "outgoing": + result["spent_sats"] += tx.get("amount_sats", 0) + except Exception as exc: + logger.debug("economy fetch failed: %s", exc) + return result + + +async def _get_stream_health() -> dict: + """Return stream health stats. + + Graceful fallback when no streaming backend is configured. + """ + return { + "live": False, + "viewer_count": 0, + "bitrate_kbps": 0, + "uptime_seconds": 0, + "title": "No active stream", + "source": "unavailable", + } + + +async def _get_content_pipeline() -> dict: + """Return content pipeline stats — last episode, highlight/clip counts.""" + result: dict = { + "last_episode": None, + "highlight_count": 0, + "clip_count": 0, + "pipeline_healthy": True, + } + try: + from pathlib import Path + + repo_root = Path(settings.repo_root) + # Check for episode output files + output_dir = repo_root / "data" / "episodes" + if output_dir.exists(): + episodes = sorted(output_dir.glob("*.json"), key=lambda p: p.stat().st_mtime, reverse=True) + if episodes: + result["last_episode"] = episodes[0].stem + result["highlight_count"] = len(list(output_dir.glob("highlights_*.json"))) + result["clip_count"] = len(list(output_dir.glob("clips_*.json"))) + except Exception as exc: + logger.debug("content pipeline fetch failed: %s", exc) + return result + + +def _build_alerts( + resources: dict, + agents: list[dict], + economy: dict, + stream: dict, +) -> list[dict]: + """Derive operational alerts from aggregated status data.""" + alerts: list[dict] = [] + + # Resource alerts + if resources.get("ram_percent") and resources["ram_percent"] > 90: + alerts.append( + { + "level": "critical", + "title": "High Memory Usage", + "detail": f"RAM at {resources['ram_percent']:.0f}%", + } + ) + elif resources.get("ram_percent") and resources["ram_percent"] > 80: + alerts.append( + { + "level": "warning", + "title": "Elevated Memory Usage", + "detail": f"RAM at {resources['ram_percent']:.0f}%", + } + ) + + if resources.get("disk_percent") and resources["disk_percent"] > 90: + alerts.append( + { + "level": "critical", + "title": "Low Disk Space", + "detail": f"Disk at {resources['disk_percent']:.0f}% used", + } + ) + elif resources.get("disk_percent") and resources["disk_percent"] > 80: + alerts.append( + { + "level": "warning", + "title": "Disk Space Warning", + "detail": f"Disk at {resources['disk_percent']:.0f}% used", + } + ) + + if resources.get("cpu_percent") and resources["cpu_percent"] > 95: + alerts.append( + { + "level": "warning", + "title": "High CPU Usage", + "detail": f"CPU at {resources['cpu_percent']:.0f}%", + } + ) + + # Ollama alert + if not resources.get("ollama_reachable", True): + alerts.append( + { + "level": "critical", + "title": "LLM Backend Offline", + "detail": "Ollama is unreachable — agent responses will fail", + } + ) + + # Agent alerts + offline_agents = [a["name"] for a in agents if a.get("status") == "offline"] + if offline_agents: + alerts.append( + { + "level": "critical", + "title": "Agent Offline", + "detail": f"Offline: {', '.join(offline_agents)}", + } + ) + + # Economy alerts + balance = economy.get("balance_sats", 0) + if isinstance(balance, (int, float)) and balance < 1000: + alerts.append( + { + "level": "warning", + "title": "Low Wallet Balance", + "detail": f"Balance: {balance} sats", + } + ) + + # Pass-through resource warnings + for warn in resources.get("warnings", []): + alerts.append({"level": "warning", "title": "System Warning", "detail": warn}) + + return alerts + + +# --------------------------------------------------------------------------- +# Routes +# --------------------------------------------------------------------------- + + +@router.get("", response_class=HTMLResponse) +async def monitoring_page(request: Request): + """Render the real-time monitoring dashboard page.""" + return templates.TemplateResponse(request, "monitoring.html", {}) + + +@router.get("/status") +async def monitoring_status(): + """Aggregate status endpoint for the monitoring dashboard. + + Collects data from all subsystems concurrently and returns a single + JSON payload used by the frontend to update all panels at once. + """ + uptime = (datetime.now(UTC) - _START_TIME).total_seconds() + + agents, resources, economy, stream, pipeline = await asyncio.gather( + _get_agent_status(), + _get_system_resources(), + _get_economy(), + _get_stream_health(), + _get_content_pipeline(), + ) + + alerts = _build_alerts(resources, agents, economy, stream) + + return { + "timestamp": datetime.now(UTC).isoformat(), + "uptime_seconds": uptime, + "agents": agents, + "resources": resources, + "economy": economy, + "stream": stream, + "pipeline": pipeline, + "alerts": alerts, + } + + +@router.get("/alerts") +async def monitoring_alerts(): + """Return current alerts only.""" + agents, resources, economy, stream = await asyncio.gather( + _get_agent_status(), + _get_system_resources(), + _get_economy(), + _get_stream_health(), + ) + alerts = _build_alerts(resources, agents, economy, stream) + return {"alerts": alerts, "count": len(alerts)} diff --git a/src/dashboard/templates/base.html b/src/dashboard/templates/base.html index d30c990f..d20b1fe0 100644 --- a/src/dashboard/templates/base.html +++ b/src/dashboard/templates/base.html @@ -50,6 +50,7 @@ BRIEFING THINKING MISSION CTRL + MONITORING SWARM SCORECARDS BUGS diff --git a/src/dashboard/templates/monitoring.html b/src/dashboard/templates/monitoring.html new file mode 100644 index 00000000..7a289bc9 --- /dev/null +++ b/src/dashboard/templates/monitoring.html @@ -0,0 +1,429 @@ +{% extends "base.html" %} + +{% block title %}Monitoring — Timmy Time{% endblock %} + +{% block content %} + +
+
+

Real-Time Monitoring

+
+ Loading... + +
+
+ + +
+
+
+
Uptime
+
+
+
+
Agents
+
+
+
0
+
Alerts
+
+
+
+
LLM Backend
+
+
+
+ + + + + +
+
+

Agent Status

+
+
+

Loading agents...

+
+
+ + +
+ + +
+
+

System Resources

+
+
+
+
+
CPU
+
+
+
+
RAM
+
+
+
+
Disk
+
+
+
+
Models Loaded
+
+
+ +
+
+ RAM +
+
+
+ +
+
+ Disk +
+
+
+ +
+
+ CPU +
+
+
+ +
+
+
+ + +
+
+

Economy

+
+
+
+
+
Balance (sats)
+
+
+
+
Earned
+
+
+
+
Spent
+
+
+
+
Injections
+
+
+
+
+
+
Transactions
+
+
+
+
Auction
+
+
+
+
+ + +
+ + +
+
+

Stream Health

+ Offline +
+
+
+
+
Viewers
+
+
+
+
Bitrate (kbps)
+
+
+
+
Stream Uptime
+
+
+
+
Title
+
+
+
+ + +
+
+

Content Pipeline

+ +
+
+
+
+
Highlights
+
+
+
+
Clips
+
+
+ +
+
+ + +{% endblock %} diff --git a/static/css/mission-control.css b/static/css/mission-control.css index 28afa8ac..7c7adadd 100644 --- a/static/css/mission-control.css +++ b/static/css/mission-control.css @@ -2785,3 +2785,120 @@ color: var(--text-bright); word-break: break-all; } + +/* ========================================================= + Monitoring Dashboard — #862 + ========================================================= */ + +.mon-last-updated { + font-size: 0.7rem; + color: var(--text-dim); + letter-spacing: 0.04em; +} + +/* Agent rows */ +.mon-agent-row { + display: flex; + align-items: center; + gap: 0.75rem; + padding: 0.5rem 0.25rem; + border-bottom: 1px solid var(--border); + font-size: 0.82rem; +} +.mon-agent-row:last-child { border-bottom: none; } + +.mon-agent-dot { + width: 8px; + height: 8px; + border-radius: 50%; + flex-shrink: 0; +} +.mon-agent-name { font-weight: 700; color: var(--text-bright); min-width: 7rem; } +.mon-agent-model { color: var(--text-dim); min-width: 8rem; } +.mon-agent-status { + font-size: 0.72rem; + font-weight: 700; + letter-spacing: 0.06em; + color: var(--green); + min-width: 4rem; +} +.mon-agent-action { color: var(--text-dim); font-style: italic; } + +/* Resource progress bars */ +.mon-resource-bars { + margin-top: 0.75rem; + display: flex; + flex-direction: column; + gap: 0.5rem; +} +.mon-bar-row { + display: flex; + align-items: center; + gap: 0.5rem; + font-size: 0.75rem; +} +.mon-bar-label { + min-width: 2.8rem; + font-size: 0.68rem; + font-weight: 700; + letter-spacing: 0.06em; + color: var(--text-dim); + text-transform: uppercase; +} +.mon-bar-track { + flex: 1; + height: 6px; + background: var(--bg-card); + border-radius: 3px; + overflow: hidden; + border: 1px solid var(--border); +} +.mon-bar-fill { + height: 100%; + background: var(--green); + border-radius: 3px; + transition: width 0.4s ease, background 0.4s ease; +} +.mon-bar-pct { + min-width: 2.5rem; + text-align: right; + color: var(--text-dim); + font-size: 0.7rem; +} + +/* Alert items */ +.mon-alert-item { + padding: 0.5rem 0.75rem; + border-left: 3px solid var(--amber); + background: rgba(255,179,0,0.06); + margin-bottom: 0.4rem; + border-radius: 0 3px 3px 0; + font-size: 0.82rem; +} +.mon-alert-item.mon-alert-critical { + border-left-color: var(--red); + background: rgba(255,59,59,0.06); +} +.mon-alert-item.mon-alert-info { + border-left-color: var(--green); + background: rgba(0,255,136,0.05); +} +.mon-alert-detail { color: var(--text-dim); } + +/* Stream title truncation */ +.mon-stream-title { + font-size: 0.75rem; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + max-width: 10rem; +} + +/* Last episode label */ +.mon-last-episode { + margin-top: 0.75rem; + font-size: 0.78rem; + color: var(--text-dim); + padding-top: 0.5rem; + border-top: 1px solid var(--border); +} diff --git a/tests/dashboard/test_monitoring.py b/tests/dashboard/test_monitoring.py new file mode 100644 index 00000000..8445891d --- /dev/null +++ b/tests/dashboard/test_monitoring.py @@ -0,0 +1,95 @@ +"""Tests for the real-time monitoring dashboard routes. Refs: #862""" + + +class TestMonitoringPage: + """Tests for the monitoring dashboard HTML page.""" + + def test_monitoring_page_returns_200(self, client): + response = client.get("/monitoring") + assert response.status_code == 200 + + def test_monitoring_page_contains_key_headings(self, client): + response = client.get("/monitoring") + assert response.status_code == 200 + body = response.text + assert "Real-Time Monitoring" in body + assert "Agent Status" in body + assert "System Resources" in body + assert "Economy" in body + assert "Stream Health" in body + assert "Content Pipeline" in body + + +class TestMonitoringStatusEndpoint: + """Tests for /monitoring/status JSON endpoint.""" + + def test_status_returns_200(self, client): + response = client.get("/monitoring/status") + assert response.status_code == 200 + + def test_status_has_required_keys(self, client): + response = client.get("/monitoring/status") + assert response.status_code == 200 + data = response.json() + for key in ("timestamp", "uptime_seconds", "agents", "resources", "economy", "stream", "pipeline", "alerts"): + assert key in data, f"Missing key: {key}" + + def test_agents_is_list(self, client): + response = client.get("/monitoring/status") + data = response.json() + assert isinstance(data["agents"], list) + + def test_alerts_is_list(self, client): + response = client.get("/monitoring/status") + data = response.json() + assert isinstance(data["alerts"], list) + + def test_resources_has_expected_fields(self, client): + response = client.get("/monitoring/status") + data = response.json() + resources = data["resources"] + for field in ("disk_percent", "disk_free_gb", "ollama_reachable", "loaded_models", "warnings"): + assert field in resources, f"Missing resource field: {field}" + + def test_economy_has_expected_fields(self, client): + response = client.get("/monitoring/status") + data = response.json() + economy = data["economy"] + for field in ("balance_sats", "earned_sats", "spent_sats", "tx_count"): + assert field in economy, f"Missing economy field: {field}" + + def test_stream_has_expected_fields(self, client): + response = client.get("/monitoring/status") + data = response.json() + stream = data["stream"] + for field in ("live", "viewer_count", "bitrate_kbps", "uptime_seconds"): + assert field in stream, f"Missing stream field: {field}" + + def test_uptime_is_non_negative(self, client): + response = client.get("/monitoring/status") + data = response.json() + assert data["uptime_seconds"] >= 0 + + +class TestMonitoringAlertsEndpoint: + """Tests for /monitoring/alerts JSON endpoint.""" + + def test_alerts_returns_200(self, client): + response = client.get("/monitoring/alerts") + assert response.status_code == 200 + + def test_alerts_has_alerts_and_count(self, client): + response = client.get("/monitoring/alerts") + data = response.json() + assert "alerts" in data + assert "count" in data + assert isinstance(data["alerts"], list) + assert data["count"] == len(data["alerts"]) + + def test_alert_items_have_level_and_title(self, client): + response = client.get("/monitoring/alerts") + data = response.json() + for alert in data["alerts"]: + assert "level" in alert + assert "title" in alert + assert alert["level"] in ("info", "warning", "critical")