feat: Add queue health check script

2026-04-17 01:26:07 -04:00
3 changed files with 147 additions and 70 deletions
--- a/docs/holographic-vector-hybrid.md
+++ b/docs/holographic-vector-hybrid.md
@@ -1,42 +0,0 @@
-# Holographic + Vector Hybrid Memory Architecture
-
-Research issue #879. Combining HRR (holographic) and vector (Qdrant) memory.
-
-## Architecture
-
-Three memory backends, each with unique strengths:
-
-| Backend | Strength | Weakness | Use Case |
-|---------|----------|----------|----------|
-| FTS5 | Exact keyword match | No semantic understanding | Precise recall |
-| Vector (Qdrant) | Semantic similarity | No compositional queries | Topic search |
-| HRR (Holographic) | Compositional queries | Limited scale | Complex reasoning |
-
-## Why Hybrid
-
- FTS5 alone: misses ~30-40% of semantically relevant content
- Vector alone: can't do compositional queries ("what did I discuss about X after doing Y?")
- HRR alone: unique capability but no semantic fallback
- Hybrid: best of all three, RRF fusion for ranking
-
-## Implementation: Reciprocal Rank Fusion
-
-Results from each backend are merged using RRF:
- score = sum(weight / (k + rank)) for each backend
- k=60 (standard RRF constant)
- Weights: FTS5=0.6, Vector=0.4 (configurable)
-
-## Status
-
- FTS5: EXISTS (hermes_state.py)
- Vector (Qdrant): implemented (tools/hybrid_search.py)
- HRR: EXISTS (plugins/memory/holographic.py)
- RRF fusion: implemented (tools/hybrid_search.py)
- Ingestion pipeline: partial
-
-## Next Steps
-
-1. Wire HRR into hybrid_search.py
-2. Session-level vector ingestion
-3. Benchmark: measure R@5 improvement
-4. Cross-session memory persistence
--- a/scripts/queue_health_check.py
+++ b/scripts/queue_health_check.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""
+Queue Health Check — Verify dispatch queue is operational.
+
+Checks:
+1. Queue file exists and is readable
+2. Queue has pending items
+3. Queue is not stuck (no item has been processing for over an hour)
+4. Queue age (no pending item has waited more than 24 hours)
+
+Usage:
+    python scripts/queue_health_check.py
+    python scripts/queue_health_check.py --json
+"""
+
+import json
+import sys
+from datetime import datetime, timedelta
+from pathlib import Path
+
+
+def check_queue_health(queue_path: str = "~/.hermes/queue.json") -> dict:
+    """Check the dispatch queue file and report its health.
+
+    Args:
+        queue_path: Path to the queue JSON file; ``~`` is expanded.
+
+    Returns:
+        A dict with ``healthy``, ``checks``, ``warnings`` and ``errors``.
+        Only a missing, unreadable or malformed file clears ``healthy``;
+        an empty queue and stale or old items are only warnings.
+    """
+    from datetime import timezone  # not in the module-level datetime import
+
+    result = {"healthy": True, "checks": {}, "warnings": [], "errors": []}
+
+    def fail(check: str, message: str) -> dict:
+        """Record a fatal failure of *check* and return the result."""
+        result["healthy"] = False
+        result["errors"].append(message)
+        result["checks"][check] = False
+        return result
+
+    def parse_time(value):
+        """Return *value* as an aware UTC datetime, or None if unparseable."""
+        if not isinstance(value, str):
+            return None
+        try:
+            parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
+            # Naive stamps are read as local time, matching datetime.now().
+            return parsed.astimezone(timezone.utc)
+        except (ValueError, OverflowError, OSError):
+            return None
+
+    # Check 1: File exists
+    path = Path(queue_path).expanduser()
+    if not path.exists():
+        return fail("file_exists", f"Queue file not found: {path}")
+    result["checks"]["file_exists"] = True
+
+    # Check 2: File is readable JSON
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+    except Exception as e:  # any read/parse failure is a health result
+        return fail("readable", f"Cannot read queue: {e}")
+    result["checks"]["readable"] = True
+
+    # Check 3: Queue structure (a dict whose sections are lists)
+    if not isinstance(data, dict):
+        return fail("valid_structure", "Queue is not a dict")
+    sections = {}
+    for name in ("pending", "processing", "completed"):
+        sections[name] = data.get(name, [])
+        if not isinstance(sections[name], list):
+            return fail("valid_structure", f"Queue '{name}' is not a list")
+    result["checks"]["valid_structure"] = True
+
+    # Check 4: Item counts
+    for name, items in sections.items():
+        result["checks"][f"{name}_count"] = len(items)
+    if not sections["pending"] and not sections["processing"]:
+        result["warnings"].append("Queue is empty")
+
+    # Check 5: Stale processing items. "now" is timezone-aware because naive
+    # minus aware raises TypeError, so staleness would go undetected.
+    now = datetime.now(timezone.utc)
+    for item in sections["processing"]:
+        started = item.get("started_at") if isinstance(item, dict) else None
+        started_time = parse_time(started)
+        if started_time is not None and now - started_time > timedelta(hours=1):
+            result["warnings"].append(f"Stale item: {item.get('id', 'unknown')} (started {started})")
+
+    # Check 6: Queue age, judged on parsed times so undated items are skipped
+    dated = [(parse_time(i.get("added_at")), i) for i in sections["pending"] if isinstance(i, dict)]
+    dated = [(t, i) for t, i in dated if t is not None]
+    if dated:
+        added_time, oldest = min(dated, key=lambda pair: pair[0])
+        if now - added_time > timedelta(hours=24):
+            result["warnings"].append(f"Old item in queue: {oldest.get('id', 'unknown')} (added {oldest['added_at']})")
+
+    return result
+
+
+def main():
+    """Run the health check from the CLI; exit 0 if healthy, 1 otherwise."""
+    import argparse
+
+    cli = argparse.ArgumentParser(description="Queue health check")
+    cli.add_argument("--queue", default="~/.hermes/queue.json", help="Queue file path")
+    cli.add_argument("--json", action="store_true", help="Output as JSON")
+    options = cli.parse_args()
+
+    report = check_queue_health(options.queue)
+    exit_code = 0 if report["healthy"] else 1
+
+    if options.json:
+        print(json.dumps(report, indent=2))
+        sys.exit(exit_code)
+
+    def mark(flag):
+        """Render a boolean as a check mark or a cross."""
+        return "✓" if flag else "✗"
+
+    print("Queue Health Check")
+    print("=" * 50)
+    print(f"Healthy: {mark(report['healthy'])}")
+    print()
+
+    print("Checks:")
+    for name, outcome in report["checks"].items():
+        shown = mark(outcome) if isinstance(outcome, bool) else outcome
+        print(f"  {name}: {shown}")
+
+    # Warnings and errors share one layout; empty sections are omitted.
+    for title, symbol, messages in (("Warnings:", "⚠", report["warnings"]), ("Errors:", "✗", report["errors"])):
+        if messages:
+            print()
+            print(title)
+            for message in messages:
+                print(f"  {symbol} {message}")
+
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/skill_manager_tool.py
+++ b/tools/skill_manager_tool.py
@@ -44,34 +44,6 @@ from typing import Dict, Any, Optional, Tuple

 logger = logging.getLogger(__name__)

-
-def _format_error(
-    message: str,
-    skill_name: str = None,
-    file_path: str = None,
-    suggestion: str = None,
-    context: dict = None,
-) -> Dict[str, Any]:
-    """Format an error with rich context for better debugging."""
-    parts = [message]
-    if skill_name:
-        parts.append(f"Skill: {skill_name}")
-    if file_path:
-        parts.append(f"File: {file_path}")
-    if suggestion:
-        parts.append(f"Suggestion: {suggestion}")
-    if context:
-        for key, value in context.items():
-            parts.append(f"{key}: {value}")
-    return {
-        "success": False,
-        "error": " | ".join(parts),
-        "skill_name": skill_name,
-        "file_path": file_path,
-        "suggestion": suggestion,
-    }
-
-
 # Import security scanner — agent-created skills get the same scrutiny as
 # community hub installs.
 try: