Implements a comprehensive operational monitoring UI at /monitoring covering all subsystems described in issue #862: - Agent Status: lists configured agents with name, model, status, last action - System Resources: RAM/disk/CPU usage with live progress bars via psutil - Economy: sats balance, earned/spent, transaction count (Lightning ledger) - Stream Health: viewer count, bitrate, uptime (graceful fallback when offline) - Content Pipeline: episode/highlight/clip counts from data/episodes/ - Alerts: auto-derived from resource thresholds, Ollama reachability, wallet balance Implementation details: - New route: GET /monitoring (HTML page), GET /monitoring/status (JSON), GET /monitoring/alerts (JSON) - /monitoring/status aggregates all subsystems concurrently with asyncio.gather - Frontend polls every 10 seconds with vanilla JS (no blocking) - All optional services degrade gracefully per project convention - CSS appended to mission-control.css (no inline styles) - "MONITORING" link added to desktop nav in base.html - 13 unit tests covering page render and all API endpoints Fixes #862 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
324 lines
9.9 KiB
Python
324 lines
9.9 KiB
Python
"""Real-time monitoring dashboard routes.
|
|
|
|
Provides a unified operational view of all agent systems:
|
|
- Agent status and vitals
|
|
- System resources (CPU, RAM, disk, network)
|
|
- Economy (sats earned/spent, injection count)
|
|
- Stream health (viewer count, bitrate, uptime)
|
|
- Content pipeline (episodes, highlights, clips)
|
|
- Alerts (agent offline, stream down, low balance)
|
|
|
|
Refs: #862
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
from datetime import UTC, datetime
|
|
|
|
from fastapi import APIRouter, Request
|
|
from fastapi.responses import HTMLResponse, JSONResponse
|
|
|
|
from config import APP_START_TIME as _START_TIME
|
|
from config import settings
|
|
from dashboard.templating import templates
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter(prefix="/monitoring", tags=["monitoring"])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def _get_agent_status() -> list[dict]:
|
|
"""Return a list of agent status entries."""
|
|
try:
|
|
from config import settings as cfg
|
|
|
|
agents_yaml = cfg.agents_config
|
|
agents_raw = agents_yaml.get("agents", {})
|
|
result = []
|
|
for name, info in agents_raw.items():
|
|
result.append(
|
|
{
|
|
"name": name,
|
|
"model": info.get("model", "default"),
|
|
"status": "running",
|
|
"last_action": "idle",
|
|
"cell": info.get("cell", "—"),
|
|
}
|
|
)
|
|
if not result:
|
|
result.append(
|
|
{
|
|
"name": settings.agent_name,
|
|
"model": settings.ollama_model,
|
|
"status": "running",
|
|
"last_action": "idle",
|
|
"cell": "main",
|
|
}
|
|
)
|
|
return result
|
|
except Exception as exc:
|
|
logger.warning("agent status fetch failed: %s", exc)
|
|
return []
|
|
|
|
|
|
async def _get_system_resources() -> dict:
|
|
"""Return CPU, RAM, disk snapshot (non-blocking)."""
|
|
try:
|
|
from timmy.vassal.house_health import get_system_snapshot
|
|
|
|
snap = await get_system_snapshot()
|
|
cpu_pct: float | None = None
|
|
try:
|
|
import psutil # optional
|
|
|
|
cpu_pct = await asyncio.to_thread(psutil.cpu_percent, 0.1)
|
|
except Exception:
|
|
pass
|
|
|
|
return {
|
|
"cpu_percent": cpu_pct,
|
|
"ram_percent": snap.memory.percent_used,
|
|
"ram_total_gb": snap.memory.total_gb,
|
|
"ram_available_gb": snap.memory.available_gb,
|
|
"disk_percent": snap.disk.percent_used,
|
|
"disk_total_gb": snap.disk.total_gb,
|
|
"disk_free_gb": snap.disk.free_gb,
|
|
"ollama_reachable": snap.ollama.reachable,
|
|
"loaded_models": snap.ollama.loaded_models,
|
|
"warnings": snap.warnings,
|
|
}
|
|
except Exception as exc:
|
|
logger.warning("system resources fetch failed: %s", exc)
|
|
return {
|
|
"cpu_percent": None,
|
|
"ram_percent": None,
|
|
"ram_total_gb": None,
|
|
"ram_available_gb": None,
|
|
"disk_percent": None,
|
|
"disk_total_gb": None,
|
|
"disk_free_gb": None,
|
|
"ollama_reachable": False,
|
|
"loaded_models": [],
|
|
"warnings": [str(exc)],
|
|
}
|
|
|
|
|
|
async def _get_economy() -> dict:
|
|
"""Return economy stats — sats earned/spent, injection count."""
|
|
result: dict = {
|
|
"balance_sats": 0,
|
|
"earned_sats": 0,
|
|
"spent_sats": 0,
|
|
"injection_count": 0,
|
|
"auction_active": False,
|
|
"tx_count": 0,
|
|
}
|
|
try:
|
|
from lightning.ledger import get_balance, get_transactions
|
|
|
|
result["balance_sats"] = get_balance()
|
|
txns = get_transactions()
|
|
result["tx_count"] = len(txns)
|
|
for tx in txns:
|
|
if tx.get("direction") == "incoming":
|
|
result["earned_sats"] += tx.get("amount_sats", 0)
|
|
elif tx.get("direction") == "outgoing":
|
|
result["spent_sats"] += tx.get("amount_sats", 0)
|
|
except Exception as exc:
|
|
logger.debug("economy fetch failed: %s", exc)
|
|
return result
|
|
|
|
|
|
async def _get_stream_health() -> dict:
|
|
"""Return stream health stats.
|
|
|
|
Graceful fallback when no streaming backend is configured.
|
|
"""
|
|
return {
|
|
"live": False,
|
|
"viewer_count": 0,
|
|
"bitrate_kbps": 0,
|
|
"uptime_seconds": 0,
|
|
"title": "No active stream",
|
|
"source": "unavailable",
|
|
}
|
|
|
|
|
|
async def _get_content_pipeline() -> dict:
|
|
"""Return content pipeline stats — last episode, highlight/clip counts."""
|
|
result: dict = {
|
|
"last_episode": None,
|
|
"highlight_count": 0,
|
|
"clip_count": 0,
|
|
"pipeline_healthy": True,
|
|
}
|
|
try:
|
|
from pathlib import Path
|
|
|
|
repo_root = Path(settings.repo_root)
|
|
# Check for episode output files
|
|
output_dir = repo_root / "data" / "episodes"
|
|
if output_dir.exists():
|
|
episodes = sorted(output_dir.glob("*.json"), key=lambda p: p.stat().st_mtime, reverse=True)
|
|
if episodes:
|
|
result["last_episode"] = episodes[0].stem
|
|
result["highlight_count"] = len(list(output_dir.glob("highlights_*.json")))
|
|
result["clip_count"] = len(list(output_dir.glob("clips_*.json")))
|
|
except Exception as exc:
|
|
logger.debug("content pipeline fetch failed: %s", exc)
|
|
return result
|
|
|
|
|
|
def _build_alerts(
|
|
resources: dict,
|
|
agents: list[dict],
|
|
economy: dict,
|
|
stream: dict,
|
|
) -> list[dict]:
|
|
"""Derive operational alerts from aggregated status data."""
|
|
alerts: list[dict] = []
|
|
|
|
# Resource alerts
|
|
if resources.get("ram_percent") and resources["ram_percent"] > 90:
|
|
alerts.append(
|
|
{
|
|
"level": "critical",
|
|
"title": "High Memory Usage",
|
|
"detail": f"RAM at {resources['ram_percent']:.0f}%",
|
|
}
|
|
)
|
|
elif resources.get("ram_percent") and resources["ram_percent"] > 80:
|
|
alerts.append(
|
|
{
|
|
"level": "warning",
|
|
"title": "Elevated Memory Usage",
|
|
"detail": f"RAM at {resources['ram_percent']:.0f}%",
|
|
}
|
|
)
|
|
|
|
if resources.get("disk_percent") and resources["disk_percent"] > 90:
|
|
alerts.append(
|
|
{
|
|
"level": "critical",
|
|
"title": "Low Disk Space",
|
|
"detail": f"Disk at {resources['disk_percent']:.0f}% used",
|
|
}
|
|
)
|
|
elif resources.get("disk_percent") and resources["disk_percent"] > 80:
|
|
alerts.append(
|
|
{
|
|
"level": "warning",
|
|
"title": "Disk Space Warning",
|
|
"detail": f"Disk at {resources['disk_percent']:.0f}% used",
|
|
}
|
|
)
|
|
|
|
if resources.get("cpu_percent") and resources["cpu_percent"] > 95:
|
|
alerts.append(
|
|
{
|
|
"level": "warning",
|
|
"title": "High CPU Usage",
|
|
"detail": f"CPU at {resources['cpu_percent']:.0f}%",
|
|
}
|
|
)
|
|
|
|
# Ollama alert
|
|
if not resources.get("ollama_reachable", True):
|
|
alerts.append(
|
|
{
|
|
"level": "critical",
|
|
"title": "LLM Backend Offline",
|
|
"detail": "Ollama is unreachable — agent responses will fail",
|
|
}
|
|
)
|
|
|
|
# Agent alerts
|
|
offline_agents = [a["name"] for a in agents if a.get("status") == "offline"]
|
|
if offline_agents:
|
|
alerts.append(
|
|
{
|
|
"level": "critical",
|
|
"title": "Agent Offline",
|
|
"detail": f"Offline: {', '.join(offline_agents)}",
|
|
}
|
|
)
|
|
|
|
# Economy alerts
|
|
balance = economy.get("balance_sats", 0)
|
|
if isinstance(balance, (int, float)) and balance < 1000:
|
|
alerts.append(
|
|
{
|
|
"level": "warning",
|
|
"title": "Low Wallet Balance",
|
|
"detail": f"Balance: {balance} sats",
|
|
}
|
|
)
|
|
|
|
# Pass-through resource warnings
|
|
for warn in resources.get("warnings", []):
|
|
alerts.append({"level": "warning", "title": "System Warning", "detail": warn})
|
|
|
|
return alerts
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Routes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@router.get("", response_class=HTMLResponse)
|
|
async def monitoring_page(request: Request):
|
|
"""Render the real-time monitoring dashboard page."""
|
|
return templates.TemplateResponse(request, "monitoring.html", {})
|
|
|
|
|
|
@router.get("/status")
|
|
async def monitoring_status():
|
|
"""Aggregate status endpoint for the monitoring dashboard.
|
|
|
|
Collects data from all subsystems concurrently and returns a single
|
|
JSON payload used by the frontend to update all panels at once.
|
|
"""
|
|
uptime = (datetime.now(UTC) - _START_TIME).total_seconds()
|
|
|
|
agents, resources, economy, stream, pipeline = await asyncio.gather(
|
|
_get_agent_status(),
|
|
_get_system_resources(),
|
|
_get_economy(),
|
|
_get_stream_health(),
|
|
_get_content_pipeline(),
|
|
)
|
|
|
|
alerts = _build_alerts(resources, agents, economy, stream)
|
|
|
|
return {
|
|
"timestamp": datetime.now(UTC).isoformat(),
|
|
"uptime_seconds": uptime,
|
|
"agents": agents,
|
|
"resources": resources,
|
|
"economy": economy,
|
|
"stream": stream,
|
|
"pipeline": pipeline,
|
|
"alerts": alerts,
|
|
}
|
|
|
|
|
|
@router.get("/alerts")
|
|
async def monitoring_alerts():
|
|
"""Return current alerts only."""
|
|
agents, resources, economy, stream = await asyncio.gather(
|
|
_get_agent_status(),
|
|
_get_system_resources(),
|
|
_get_economy(),
|
|
_get_stream_health(),
|
|
)
|
|
alerts = _build_alerts(resources, agents, economy, stream)
|
|
return {"alerts": alerts, "count": len(alerts)}
|