feat: Add queue health check script

2026-04-17 01:26:07 -04:00
3 changed files with 147 additions and 174 deletions
--- a/agent/provider_preflight.py
+++ b/agent/provider_preflight.py
@@ -1,146 +0,0 @@
-"""Provider Preflight — Poka-yoke validation of provider/model config.
-
-Validates provider and model configuration before session start.
-Prevents wasted context on misconfigured providers.
-
-Usage:
-    from agent.provider_preflight import preflight_check
-    result = preflight_check(provider="openrouter", model="xiaomi/mimo-v2-pro")
-    if not result["valid"]:
-        print(result["error"])
-"""
-
-from __future__ import annotations
-
-import logging
-import os
-from typing import Any, Dict, Optional
-
-logger = logging.getLogger(__name__)
-
-
-# Provider -> required env var
-PROVIDER_KEYS = {
-    "openrouter": "OPENROUTER_API_KEY",
-    "anthropic": "ANTHROPIC_API_KEY",
-    "openai": "OPENAI_API_KEY",
-    "nous": "NOUS_API_KEY",
-    "ollama": None,  # Local, no key needed
-    "local": None,
-}
-
-
-def check_provider_key(provider: str) -> Dict[str, Any]:
-    """Check if provider has a valid API key configured."""
-    provider_lower = provider.lower().strip()
-
-    env_var = None
-    for known, key in PROVIDER_KEYS.items():
-        if known in provider_lower:
-            env_var = key
-            break
-
-    if env_var is None:
-        # Unknown provider — assume OK (custom/local)
-        return {"valid": True, "provider": provider, "key_status": "unknown"}
-
-    if env_var is None:
-        # Local provider, no key needed
-        return {"valid": True, "provider": provider, "key_status": "not_required"}
-
-    key_value = os.getenv(env_var, "").strip()
-    if not key_value:
-        return {
-            "valid": False,
-            "provider": provider,
-            "key_status": "missing",
-            "error": f"{env_var} is not set. Provider '{provider}' will fail.",
-            "fix": f"Set {env_var} in ~/.hermes/.env",
-        }
-
-    if len(key_value) < 10:
-        return {
-            "valid": False,
-            "provider": provider,
-            "key_status": "too_short",
-            "error": f"{env_var} is suspiciously short ({len(key_value)} chars). May be invalid.",
-            "fix": f"Verify {env_var} value in ~/.hermes/.env",
-        }
-
-    return {"valid": True, "provider": provider, "key_status": "set"}
-
-
-def check_model_availability(model: str, provider: str) -> Dict[str, Any]:
-    """Check if model is likely available for provider."""
-    if not model:
-        return {"valid": False, "error": "No model specified"}
-
-    # Basic sanity checks
-    model_lower = model.lower()
-
-    # Anthropic models should use anthropic provider
-    if "claude" in model_lower and "anthropic" not in provider.lower():
-        return {
-            "valid": True,  # Allow but warn
-            "warning": f"Model '{model}' usually runs on Anthropic provider, not '{provider}'",
-        }
-
-    # Ollama models
-    ollama_indicators = ["llama", "mistral", "qwen", "gemma", "phi", "hermes"]
-    if any(x in model_lower for x in ollama_indicators) and ":" not in model:
-        return {
-            "valid": True,
-            "warning": f"Model '{model}' may need a version tag for Ollama (e.g., {model}:latest)",
-        }
-
-    return {"valid": True}
-
-
-def preflight_check(
-    provider: str = "",
-    model: str = "",
-    fallback_provider: str = "",
-    fallback_model: str = "",
-) -> Dict[str, Any]:
-    """Full pre-flight check for provider/model configuration.
-
-    Returns:
-        Dict with valid (bool), errors (list), warnings (list).
-    """
-    errors = []
-    warnings = []
-
-    # Check primary provider
-    if provider:
-        result = check_provider_key(provider)
-        if not result["valid"]:
-            errors.append(result.get("error", f"Provider {provider} invalid"))
-
-    # Check primary model
-    if model:
-        result = check_model_availability(model, provider)
-        if not result["valid"]:
-            errors.append(result.get("error", f"Model {model} invalid"))
-        elif result.get("warning"):
-            warnings.append(result["warning"])
-
-    # Check fallback
-    if fallback_provider:
-        result = check_provider_key(fallback_provider)
-        if not result["valid"]:
-            warnings.append(f"Fallback provider {fallback_provider} also invalid: {result.get('error','')}")
-
-    if fallback_model:
-        result = check_model_availability(fallback_model, fallback_provider)
-        if not result["valid"]:
-            warnings.append(f"Fallback model {fallback_model} invalid")
-        elif result.get("warning"):
-            warnings.append(result["warning"])
-
-    return {
-        "valid": len(errors) == 0,
-        "errors": errors,
-        "warnings": warnings,
-        "provider": provider,
-        "model": model,
-    }
--- a/scripts/queue_health_check.py
+++ b/scripts/queue_health_check.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""
+Queue Health Check — Verify dispatch queue is operational.
+
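+Checks:
+1. Queue file exists and is readable JSON with a dict at the top level
+2. Queue has pending or processing items (warns when both are empty)
+3. Queue is not stuck (warns on items processing for over an hour)
+4. Queue age (warns when the oldest pending item is over 24 hours old)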
+
+Usage:
+    python scripts/queue_health_check.py
+    python scripts/queue_health_check.py --json
+"""
+
+import json
+import sys
+from datetime import datetime, timedelta
+from pathlib import Path
+
+
+def check_queue_health(queue_path: str = "~/.hermes/queue.json") -> dict:
+    """Inspect the dispatch queue file and report its health.
+
+    Args:
+        queue_path: Path to the queue JSON file; a leading ``~`` is expanded.
+
+    Returns:
+        Dict with ``healthy`` (bool), ``checks`` (dict), ``warnings`` and
+        ``errors`` (lists of str). Only errors clear ``healthy``.
+    """
+    from datetime import timezone  # local: the file header imports only datetime/timedelta
+
+    path = Path(queue_path).expanduser()
+    result = {"healthy": True, "checks": {}, "warnings": [], "errors": []}
+
+    def fail(check, message):
+        """Record a fatal finding under *check* and return the result."""
+        result["healthy"] = False
+        result["errors"].append(message)
+        result["checks"][check] = False
+        return result
+
+    def parse_ts(value):
+        """Return *value* as an aware datetime, or None if it is unusable."""
+        if not isinstance(value, str):
+            return None
+        try:
+            parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
+            # Naive stamps are read as local time; aware ones keep their offset.
+            return parsed if parsed.tzinfo else parsed.astimezone()
+        except (ValueError, OverflowError, OSError):
+            return None
+
+    # Check 1: File exists
+    if not path.exists():
+        return fail("file_exists", f"Queue file not found: {path}")
+    result["checks"]["file_exists"] = True
+
+    # Check 2: File is readable, well-formed JSON
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+    except (OSError, ValueError) as e:
+        return fail("readable", f"Cannot read queue: {e}")
+    result["checks"]["readable"] = True
+
+    # Check 3: Queue structure (top level is a dict, each section a list)
+    if not isinstance(data, dict):
+        return fail("valid_structure", "Queue is not a dict")
+    sections = {}
+    for name in ("pending", "processing", "completed"):
+        sections[name] = data.get(name, [])
+        if not isinstance(sections[name], list):
+            return fail("valid_structure", f"Queue section '{name}' is not a list")
+    result["checks"]["valid_structure"] = True
+
+    # Check 4: Pending items
+    for name, items in sections.items():
+        result["checks"][f"{name}_count"] = len(items)
+    pending, processing = sections["pending"], sections["processing"]
+    if not pending and not processing:
+        result["warnings"].append("Queue is empty")
+
+    # Check 5: Stale processing items. "now" must be timezone-aware: subtracting
+    # an aware timestamp from a naive one raises TypeError.
+    now = datetime.now(timezone.utc)
+    for item in processing:
+        started = item.get("started_at") if isinstance(item, dict) else None
+        started_time = parse_ts(started)
+        if started_time is not None and now - started_time > timedelta(hours=1):
+            result["warnings"].append(f"Stale item: {item.get('id', 'unknown')} (started {started})")
+
+    # Check 6: Queue age, judged by the oldest *parseable* added_at so that one
+    # entry without a timestamp cannot hide genuinely old items.
+    dated = [(parse_ts(i.get("added_at")), i) for i in pending if isinstance(i, dict)]
+    dated = [(ts, i) for ts, i in dated if ts is not None]
+    if dated:
+        added_time, oldest = min(dated, key=lambda pair: pair[0])
+        if now - added_time > timedelta(hours=24):
+            result["warnings"].append(f"Old item in queue: {oldest.get('id', 'unknown')} (added {oldest['added_at']})")
+    return result
+
+
+def main():
+    """CLI entry point: print the health report (text or --json); exit 0 if healthy, else 1."""
+    import argparse
+
+    cli = argparse.ArgumentParser(description="Queue health check")
+    cli.add_argument("--queue", default="~/.hermes/queue.json", help="Queue file path")
+    cli.add_argument("--json", action="store_true", help="Output as JSON")
+    options = cli.parse_args()
+
+    report = check_queue_health(options.queue)
+    exit_code = 0 if report["healthy"] else 1
+
+    if options.json:
+        print(json.dumps(report, indent=2))
+        sys.exit(exit_code)
+
+    def mark(flag):
+        """Render a boolean as a check mark or a cross."""
+        return "✓" if flag else "✗"
+
+    print("Queue Health Check")
+    print("=" * 50)
+    print(f"Healthy: {mark(report['healthy'])}")
+    print()
+    print("Checks:")
+    for label, outcome in report["checks"].items():
+        shown = mark(outcome) if isinstance(outcome, bool) else outcome
+        print(f"  {label}: {shown}")
+
+    # Warnings and errors share one layout: blank line, heading, bulleted entries.
+    sections = (("Warnings:", "⚠", report["warnings"]), ("Errors:", "✗", report["errors"]))
+    for heading, bullet, entries in sections:
+        if entries:
+            print()
+            print(heading)
+            for entry in entries:
+                print(f"  {bullet} {entry}")
+
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/skill_manager_tool.py
+++ b/tools/skill_manager_tool.py
@@ -44,34 +44,6 @@ from typing import Dict, Any, Optional, Tuple

 logger = logging.getLogger(__name__)

-
-def _format_error(
-    message: str,
-    skill_name: str = None,
-    file_path: str = None,
-    suggestion: str = None,
-    context: dict = None,
-) -> Dict[str, Any]:
-    """Format an error with rich context for better debugging."""
-    parts = [message]
-    if skill_name:
-        parts.append(f"Skill: {skill_name}")
-    if file_path:
-        parts.append(f"File: {file_path}")
-    if suggestion:
-        parts.append(f"Suggestion: {suggestion}")
-    if context:
-        for key, value in context.items():
-            parts.append(f"{key}: {value}")
-    return {
-        "success": False,
-        "error": " | ".join(parts),
-        "skill_name": skill_name,
-        "file_path": file_path,
-        "suggestion": suggestion,
-    }
-
-
 # Import security scanner — agent-created skills get the same scrutiny as
 # community hub installs.
 try: