217 lines
6.4 KiB
Python
217 lines
6.4 KiB
Python
"""Vassal Protocol — Hermes house health monitoring.
|
|
|
|
Monitors system resources on the M3 Max (Hermes) and Ollama model state.
|
|
Reports warnings when resources are tight and provides cleanup utilities.
|
|
|
|
All I/O is wrapped in asyncio.to_thread() per CLAUDE.md convention.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import shutil
|
|
from dataclasses import dataclass, field
|
|
from datetime import UTC, datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Thresholds
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Each threshold is a percentage. The health checks in this module compare
# readings with ``>=``, so a value exactly at the limit already warns.
_WARN_DISK_PCT: float = 85.0  # disk: warn at or above 85% full
_WARN_MEM_PCT: float = 90.0  # memory: warn at or above 90% used
# CPU: limit for sustained load, in percent.
# NOTE(review): nothing in the visible part of this file reads this value —
# confirm it is consumed elsewhere before relying on it.
_WARN_CPU_PCT: float = 95.0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data models
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass
class DiskUsage:
    """Capacity figures for the filesystem that contains one path.

    Every field has a zero-like default, so ``DiskUsage()`` (or
    ``DiskUsage(path=...)``) is a valid "no data" placeholder.
    """

    # Path that was measured.
    path: str = "/"
    # Sizes in gigabytes (the disk probe in this module divides bytes by 1e9).
    total_gb: float = 0.0
    used_gb: float = 0.0
    free_gb: float = 0.0
    # Used share of the total, expressed as a percentage.
    percent_used: float = 0.0
|
|
|
|
|
|
@dataclass
class MemoryUsage:
    """Memory figures for the host.

    All fields default to zero, so ``MemoryUsage()`` doubles as the
    "no data" value returned when memory cannot be measured.
    """

    # Sizes in gigabytes (the memory probe in this module divides bytes by 1e9).
    total_gb: float = 0.0
    available_gb: float = 0.0
    # Share of memory in use, expressed as a percentage.
    percent_used: float = 0.0
|
|
|
|
|
|
@dataclass
class OllamaHealth:
    """Outcome of one Ollama health probe.

    The default instance describes an unreachable server with no models
    and no error text.
    """

    # True when the probe received and parsed a response from the server.
    reachable: bool = False
    # Model names reported by the server; each instance gets its own list.
    loaded_models: list[str] = field(default_factory=list)
    # Short description of the failure; empty when the probe succeeded.
    error: str = ""
|
|
|
|
|
|
@dataclass
class SystemSnapshot:
    """Point-in-time snapshot of Hermes resource usage.

    Bundles the disk, memory and Ollama readings with the warnings derived
    from them and the moment the snapshot object was created.
    """

    disk: DiskUsage = field(default_factory=DiskUsage)
    memory: MemoryUsage = field(default_factory=MemoryUsage)
    ollama: OllamaHealth = field(default_factory=OllamaHealth)
    # Human-readable warning messages; empty when nothing is wrong.
    warnings: list[str] = field(default_factory=list)
    # ISO-8601 UTC timestamp, captured when the instance is constructed.
    taken_at: str = field(default_factory=lambda: datetime.now(tz=UTC).isoformat())

    @property
    def healthy(self) -> bool:
        """Return True when the snapshot carries no warnings."""
        return not self.warnings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Resource probes (sync, run in threads)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _probe_disk(path: str = "/") -> DiskUsage:
    """Measure disk usage for the filesystem containing *path*.

    Blocking; intended to be run in a worker thread.

    Args:
        path: Any path on the filesystem to measure.

    Returns:
        DiskUsage with sizes in gigabytes (bytes / 1e9, rounded to two
        decimals) and the used percentage rounded to one decimal. If the
        measurement fails for any reason, the failure is logged at debug
        level and a zeroed ``DiskUsage`` carrying only *path* is returned.
    """
    bytes_per_gb = 1e9
    try:
        total, used, free = shutil.disk_usage(path)
        # Guard against a zero-sized filesystem to avoid dividing by zero.
        percent = (used / total * 100) if total > 0 else 0.0
        return DiskUsage(
            path=path,
            total_gb=round(total / bytes_per_gb, 2),
            used_gb=round(used / bytes_per_gb, 2),
            free_gb=round(free / bytes_per_gb, 2),
            percent_used=round(percent, 1),
        )
    except Exception as exc:
        # Best-effort probe: never let a disk error break the health check.
        logger.debug("_probe_disk: %s", exc)
        return DiskUsage(path=path)
|
|
|
|
|
|
def _probe_memory() -> MemoryUsage:
    """Measure host memory usage via ``psutil``.

    Blocking; intended to be run in a worker thread. ``psutil`` is imported
    inside the function so the module still loads when it is not installed.

    Returns:
        MemoryUsage with sizes in gigabytes (bytes / 1e9, rounded to two
        decimals) and the used percentage rounded to one decimal. When
        ``psutil`` is missing or the measurement fails, the reason is logged
        at debug level and a zeroed ``MemoryUsage`` is returned.
    """
    bytes_per_gb = 1e9
    try:
        import psutil  # optional dependency; absence is handled below

        stats = psutil.virtual_memory()
        total = round(stats.total / bytes_per_gb, 2)
        available = round(stats.available / bytes_per_gb, 2)
        percent = round(stats.percent, 1)
        return MemoryUsage(total_gb=total, available_gb=available, percent_used=percent)
    except ImportError:
        logger.debug("_probe_memory: psutil not installed — skipping")
        return MemoryUsage()
    except Exception as exc:
        # Best-effort probe: never let a memory error break the health check.
        logger.debug("_probe_memory: %s", exc)
        return MemoryUsage()
|
|
|
|
|
|
def _probe_ollama_sync(ollama_url: str) -> OllamaHealth:
    """Synchronous Ollama health probe — run in a thread.

    Issues a GET to ``<ollama_url>/api/tags`` with a 5-second timeout and
    collects the ``name`` of every entry under the response's ``models`` key.

    Args:
        ollama_url: Base URL of the Ollama server; a trailing slash is ignored.

    Returns:
        ``OllamaHealth(reachable=True, loaded_models=[...])`` when the server
        answers with parseable JSON, otherwise ``OllamaHealth(reachable=False)``
        with ``error`` set to the first 120 characters of the exception text.
        This function never raises.

    NOTE(review): the names are stored in ``loaded_models``, but the Ollama
    API documents ``/api/tags`` as listing locally available models (running
    models are listed by ``/api/ps``) — confirm which meaning callers expect.
    """
    try:
        import json
        import urllib.request

        url = ollama_url.rstrip("/") + "/api/tags"
        with urllib.request.urlopen(url, timeout=5) as resp:  # noqa: S310
            data = json.loads(resp.read())

        # A server with no models may send ``"models": null``; treat that as
        # an empty list instead of letting the iteration raise, which would
        # misreport a reachable server as unreachable.
        entries = data.get("models") or []
        models = [entry.get("name", "") for entry in entries]
        return OllamaHealth(reachable=True, loaded_models=models)
    except Exception as exc:
        # Any failure (connection, timeout, bad JSON) counts as unreachable.
        return OllamaHealth(reachable=False, error=str(exc)[:120])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public API
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def get_system_snapshot() -> SystemSnapshot:
    """Collect a non-blocking snapshot of system resources.

    The disk, memory and Ollama probes are blocking, so each one is pushed
    to a worker thread with asyncio.to_thread() and the three are awaited
    together. Warnings are then derived from the readings: disk and memory
    warn when usage is at or above their thresholds, and Ollama warns when
    it is unreachable. Any warnings are also logged at warning level.

    Returns:
        SystemSnapshot with disk, memory, and Ollama status plus the list
        of warning messages (empty when everything is within limits).
    """
    from config import settings

    probes = (
        asyncio.to_thread(_probe_disk, "/"),
        asyncio.to_thread(_probe_memory),
        asyncio.to_thread(_probe_ollama_sync, settings.normalized_ollama_url),
    )
    disk, memory, ollama = await asyncio.gather(*probes)

    # (condition, message) pairs, in the order the warnings are reported.
    checks = (
        (
            disk.percent_used >= _WARN_DISK_PCT,
            f"Disk {disk.path}: {disk.percent_used:.0f}% used ({disk.free_gb:.1f} GB free)",
        ),
        (
            memory.percent_used >= _WARN_MEM_PCT,
            f"Memory: {memory.percent_used:.0f}% used ({memory.available_gb:.1f} GB available)",
        ),
        (
            not ollama.reachable,
            f"Ollama unreachable: {ollama.error}",
        ),
    )
    alerts: list[str] = [message for tripped, message in checks if tripped]

    if alerts:
        logger.warning("House health warnings: %s", "; ".join(alerts))

    return SystemSnapshot(disk=disk, memory=memory, ollama=ollama, warnings=alerts)
|
|
|
|
|
|
async def cleanup_stale_files(
    temp_dirs: list[str] | None = None,
    max_age_days: int = 7,
) -> dict[str, Any]:
    """Remove files older than *max_age_days* from temp directories.

    Each directory is walked recursively in a worker thread and every
    regular file whose modification time is older than the cutoff is
    deleted. Directories themselves are left in place, and directories that
    do not exist are skipped.

    No path validation is performed here: whatever is listed in *temp_dirs*
    is scanned, so callers must pass only disposable temp locations.

    Args:
        temp_dirs: Directories to scan. Defaults to ``["/tmp/timmy"]``; an
            empty list also falls back to that default.
        max_age_days: Age threshold in days.

    Returns:
        Dict with ``deleted_count`` (number of files removed) and ``errors``
        (one message per file that could not be inspected or removed).
    """
    import time

    roots = temp_dirs or ["/tmp/timmy"]  # noqa: S108
    threshold = time.time() - max_age_days * 86400

    def _sweep() -> tuple[int, list[str]]:
        """Delete stale files under every root; return (removed, failures)."""
        removed = 0
        failures: list[str] = []
        for root in map(Path, roots):
            if not root.exists():
                continue
            for entry in root.rglob("*"):
                if not entry.is_file():
                    continue
                try:
                    if entry.stat().st_mtime < threshold:
                        entry.unlink()
                        removed += 1
                except Exception as exc:
                    # Record the failure and keep going with the other files.
                    failures.append(str(exc))
        return removed, failures

    deleted, errors = await asyncio.to_thread(_sweep)
    logger.info("cleanup_stale_files: deleted %d files, %d errors", deleted, len(errors))
    return {"deleted_count": deleted, "errors": errors}
|