feat: Add queue health check script

2026-04-17 01:26:07 -04:00
2 changed files with 147 additions and 224 deletions
--- a/gateway/config_validator.py
+++ b/gateway/config_validator.py
@@ -1,224 +0,0 @@
-"""
-Gateway Config Validator & Fallback Fix — #892.
-
-Validates gateway configuration and provides sensible defaults
-for missing keys to prevent fallback chain breaks.
-"""
-
-import logging
-import os
-from typing import Dict, Any, List, Optional
-from dataclasses import dataclass, field
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class ConfigIssue:
-    """A configuration issue found during validation."""
-    key: str
-    severity: str  # error, warning, info
-    message: str
-    fix: str
-
-
-@dataclass
-class ConfigValidation:
-    """Result of config validation."""
-    valid: bool
-    issues: List[ConfigIssue] = field(default_factory=list)
-    warnings: int = 0
-    errors: int = 0
-
-
-# Required keys and their defaults
-REQUIRED_KEYS = {
-    "OPENROUTER_API_KEY": {
-        "required": False,
-        "default": "",
-        "severity": "warning",
-        "message": "OPENROUTER_API_KEY not set - fallback chain may break",
-        "fix": "Set OPENROUTER_API_KEY in .env for OpenRouter provider",
-    },
-    "API_SERVER_KEY": {
-        "required": False,
-        "default": "",
-        "severity": "warning",
-        "message": "API_SERVER_KEY not configured",
-        "fix": "Set API_SERVER_KEY in .env for API server auth",
-    },
-    "GITEA_TOKEN": {
-        "required": False,
-        "default": "",
-        "severity": "info",
-        "message": "GITEA_TOKEN not set - Gitea features disabled",
-        "fix": "Set GITEA_TOKEN in .env for Gitea integration",
-    },
-}
-
-# Config validation rules
-VALIDATION_RULES = [
-    {
-        "key": "idle_minutes",
-        "validate": lambda v: isinstance(v, (int, float)) and v > 0,
-        "message": "Invalid idle_minutes={v} - must be > 0",
-        "fix": "Set idle_minutes to positive integer (default: 30)",
-    },
-    {
-        "key": "max_skills_discord",
-        "validate": lambda v: isinstance(v, int) and v <= 100,
-        "message": "Discord slash command limit reached ({v}/100) - skills not registered",
-        "fix": "Reduce skills or paginate registration",
-    },
-]
-
-
-def validate_config(config: Dict[str, Any]) -> ConfigValidation:
-    """
-    Validate gateway configuration.
-    
-    Args:
-        config: Configuration dictionary
-        
-    Returns:
-        ConfigValidation with issues found
-    """
-    issues = []
-    
-    # Check required keys
-    for key, spec in REQUIRED_KEYS.items():
-        value = config.get(key) or os.environ.get(key) or spec["default"]
-        if spec["required"] and not value:
-            issues.append(ConfigIssue(
-                key=key,
-                severity=spec["severity"],
-                message=spec["message"],
-                fix=spec["fix"],
-            ))
-        elif not value and spec["severity"] != "error":
-            issues.append(ConfigIssue(
-                key=key,
-                severity=spec["severity"],
-                message=spec["message"],
-                fix=spec["fix"],
-            ))
-    
-    # Check validation rules
-    for rule in VALIDATION_RULES:
-        value = config.get(rule["key"])
-        if value is not None:
-            if not rule["validate"](value):
-                issues.append(ConfigIssue(
-                    key=rule["key"],
-                    severity="error",
-                    message=rule["message"].format(v=value),
-                    fix=rule["fix"],
-                ))
-    
-    errors = sum(1 for i in issues if i.severity == "error")
-    warnings = sum(1 for i in issues if i.severity == "warning")
-    
-    return ConfigValidation(
-        valid=errors == 0,
-        issues=issues,
-        warnings=warnings,
-        errors=errors,
-    )
-
-
-def apply_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    Apply default values for missing config keys.
-    
-    Args:
-        config: Configuration dictionary
-        
-    Returns:
-        Config with defaults applied
-    """
-    result = dict(config)
-    
-    for key, spec in REQUIRED_KEYS.items():
-        if key not in result or not result[key]:
-            default = os.environ.get(key) or spec["default"]
-            if default:
-                result[key] = default
-                logger.debug("Applied default for %s", key)
-    
-    # Apply validation defaults
-    if "idle_minutes" not in result or not result["idle_minutes"] or result["idle_minutes"] <= 0:
-        result["idle_minutes"] = 30
-        logger.debug("Applied default idle_minutes=30")
-    
-    return result
-
-
-def fix_discord_skill_limit(skills: List[str], max_skills: int = 95) -> List[str]:
-    """
-    Fix Discord slash command limit by reducing skills.
-    
-    Args:
-        skills: List of skill names
-        max_skills: Maximum skills to register (default 95, leaving room for built-ins)
-        
-    Returns:
-        Reduced skill list
-    """
-    if len(skills) <= max_skills:
-        return skills
-    
-    logger.warning(
-        "Discord skill limit: %d skills exceeds %d limit, truncating",
-        len(skills), max_skills
-    )
-    
-    # Keep first max_skills (alphabetical priority)
-    return sorted(skills)[:max_skills]
-
-
-def validate_provider_config(provider: str, config: Dict[str, Any]) -> ConfigIssue:
-    """
-    Validate provider-specific configuration.
-    
-    Args:
-        provider: Provider name
-        config: Provider config
-        
-    Returns:
-        ConfigIssue if invalid, None if valid
-    """
-    if provider == "local-llama.cpp":
-        # Check if llama.cpp is configured
-        if not config.get("model_path") and not config.get("base_url"):
-            return ConfigIssue(
-                key=f"provider.{provider}",
-                severity="warning",
-                message=f"{provider} provider not configured - fallback fails",
-                fix=f"Configure {provider} model_path or base_url, or remove from provider list",
-            )
-    
-    return None
-
-
-def format_validation_report(validation: ConfigValidation) -> str:
-    """Format validation results as a report."""
-    lines = [
-        "=" * 50,
-        "GATEWAY CONFIG VALIDATION",
-        "=" * 50,
-        "",
-        f"Status: {'VALID' if validation.valid else 'INVALID'}",
-        f"Errors: {validation.errors}",
-        f"Warnings: {validation.warnings}",
-        "",
-    ]
-    
-    if validation.issues:
-        lines.append("Issues:")
-        for issue in validation.issues:
-            icon = "❌" if issue.severity == "error" else "⚠️" if issue.severity == "warning" else "ℹ️"
-            lines.append(f"  {icon} [{issue.key}] {issue.message}")
-            lines.append(f"     Fix: {issue.fix}")
-            lines.append("")
-    
-    return "\n".join(lines)
--- a/scripts/queue_health_check.py
+++ b/scripts/queue_health_check.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""
+Queue Health Check — Verify dispatch queue is operational.
+
+Checks:
+1. Queue file exists and is readable
+2. Queue has pending items
+3. Queue is not stuck (items processing)
+4. Queue age (stale items)
+
+Usage:
+    python scripts/queue_health_check.py
+    python scripts/queue_health_check.py --json
+"""
+
+import json
+import sys
+from datetime import datetime, timedelta
+from pathlib import Path
+
+
+def _parse_timestamp(value):
+    """Parse an ISO-8601 string into an aware datetime; None if unusable."""
+    if not isinstance(value, str):
+        return None
+    try:
+        # datetime.fromisoformat() rejects a trailing "Z" before Python 3.11.
+        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
+        # Naive values are taken to be local time, as astimezone() assumes.
+        return parsed if parsed.tzinfo is not None else parsed.astimezone()
+    except (ValueError, OverflowError, OSError):
+        return None
+
+
+def check_queue_health(queue_path: str = "~/.hermes/queue.json") -> dict:
+    """Check queue health status.
+
+    Args:
+        queue_path: Path to the queue JSON file; "~" is expanded.
+
+    Returns:
+        Dict with "healthy" (bool), "checks" (name -> bool or count), "warnings"
+        and "errors" (lists of str). Only a missing, unreadable or malformed
+        file clears "healthy"; empty, stale and old-item findings are warnings.
+    """
+    path = Path(queue_path).expanduser()
+    result = {"healthy": True, "checks": {}, "warnings": [], "errors": []}
+
+    def fail(check, message):
+        """Record a fatal failure for `check` and hand back the result."""
+        result["healthy"] = False
+        result["errors"].append(message)
+        result["checks"][check] = False
+        return result
+
+    # Check 1: File exists
+    if not path.exists():
+        return fail("file_exists", f"Queue file not found: {path}")
+    result["checks"]["file_exists"] = True
+
+    # Check 2: File is readable
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+    except (OSError, ValueError) as e:  # JSON/Unicode decode errors are ValueErrors
+        return fail("readable", f"Cannot read queue: {e}")
+    result["checks"]["readable"] = True
+
+    # Check 3: Queue structure
+    if not isinstance(data, dict):
+        return fail("valid_structure", "Queue is not a dict")
+    sections = {name: data.get(name, []) for name in ("pending", "processing", "completed")}
+    for name, items in sections.items():
+        if not isinstance(items, list):
+            return fail("valid_structure", f"Queue section '{name}' is not a list")
+    result["checks"]["valid_structure"] = True
+
+    # Check 4: Pending items
+    for name, items in sections.items():
+        result["checks"][f"{name}_count"] = len(items)
+    if not sections["pending"] and not sections["processing"]:
+        result["warnings"].append("Queue is empty")
+
+    # "now" must be aware: naive minus aware ("Z"/offset) raises TypeError, which
+    # the old bare except swallowed, so such items were never reported.
+    now = datetime.now().astimezone()
+
+    # Check 5: Stale processing items
+    for item in sections["processing"]:
+        started = item.get("started_at") if isinstance(item, dict) else None
+        started_time = _parse_timestamp(started)
+        if started_time is not None and now - started_time > timedelta(hours=1):
+            result["warnings"].append(f"Stale item: {item.get('id', 'unknown')} (started {started})")
+
+    # Check 6: Queue age - oldest by parsed time; undated items are ignored
+    dated = [(_parse_timestamp(i.get("added_at")), i) for i in sections["pending"] if isinstance(i, dict)]
+    dated = [(when, i) for when, i in dated if when is not None]
+    if dated:
+        added_time, oldest = min(dated, key=lambda pair: pair[0])
+        if now - added_time > timedelta(hours=24):
+            result["warnings"].append(f"Old item in queue: {oldest.get('id', 'unknown')} (added {oldest['added_at']})")
+    return result
+
+
+def main():
+    """Run the queue health check from the command line and exit 0/1 by health."""
+    import argparse
+
+    cli = argparse.ArgumentParser(description="Queue health check")
+    cli.add_argument("--queue", default="~/.hermes/queue.json", help="Queue file path")
+    cli.add_argument("--json", action="store_true", help="Output as JSON")
+    options = cli.parse_args()
+
+    report = check_queue_health(options.queue)
+    exit_code = 0 if report["healthy"] else 1
+
+    # Machine-readable mode: dump the raw result and stop.
+    if options.json:
+        print(json.dumps(report, indent=2))
+        sys.exit(exit_code)
+
+    print("Queue Health Check")
+    print("=" * 50)
+    print(f"Healthy: {'✓' if report['healthy'] else '✗'}")
+    print()
+    print("Checks:")
+    for name, outcome in report["checks"].items():
+        # Booleans render as marks; counts are printed as-is.
+        shown = ("✓" if outcome else "✗") if isinstance(outcome, bool) else outcome
+        print(f"  {name}: {shown}")
+
+    # Each non-empty message list gets a blank line, a heading, then its items.
+    for heading, bullet, messages in (
+        ("Warnings:", "⚠", report["warnings"]),
+        ("Errors:", "✗", report["errors"]),
+    ):
+        if messages:
+            print()
+            print(heading)
+            for message in messages:
+                print(f"  {bullet} {message}")
+
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()