Compare commits

..

1 Commit

Author SHA1 Message Date
Alexander Whitestone
73984ca72f feat: Add queue health check script
Some checks failed
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Contributor Attribution Check / check-attribution (pull_request) Failing after 29s
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 31s
Tests / e2e (pull_request) Successful in 2m13s
Tests / test (pull_request) Failing after 28m10s
2026-04-17 01:26:07 -04:00
2 changed files with 147 additions and 224 deletions

View File

@@ -1,224 +0,0 @@
"""
Gateway Config Validator & Fallback Fix — #892.
Validates gateway configuration and provides sensible defaults
for missing keys to prevent fallback chain breaks.
"""
import logging
import os
from typing import Dict, Any, List, Optional
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
@dataclass
class ConfigIssue:
    """One problem detected while checking the gateway configuration."""

    # Name of the configuration key the issue refers to.
    key: str
    # Severity label: "error", "warning" or "info".
    severity: str
    # Human-readable description of what is wrong.
    message: str
    # Suggested remediation for the problem.
    fix: str
@dataclass
class ConfigValidation:
    """Aggregate outcome of a configuration validation run."""

    # Overall verdict for the validated configuration.
    valid: bool
    # Every issue that was collected; a fresh list per instance.
    issues: List[ConfigIssue] = field(default_factory=list)
    # Number of issues whose severity is "warning".
    warnings: int = 0
    # Number of issues whose severity is "error".
    errors: int = 0
# Known gateway keys. Each entry carries the fallback value plus the severity,
# message and fix text reported when the key resolves to nothing. Every entry
# is currently optional ("required" is False).
REQUIRED_KEYS = {
    "OPENROUTER_API_KEY": dict(
        required=False,
        default="",
        severity="warning",
        message="OPENROUTER_API_KEY not set - fallback chain may break",
        fix="Set OPENROUTER_API_KEY in .env for OpenRouter provider",
    ),
    "API_SERVER_KEY": dict(
        required=False,
        default="",
        severity="warning",
        message="API_SERVER_KEY not configured",
        fix="Set API_SERVER_KEY in .env for API server auth",
    ),
    "GITEA_TOKEN": dict(
        required=False,
        default="",
        severity="info",
        message="GITEA_TOKEN not set - Gitea features disabled",
        fix="Set GITEA_TOKEN in .env for Gitea integration",
    ),
}
# Value-level validation rules. Each rule names a config key, a predicate that
# returns True for acceptable values, a message template (formatted with
# ``v=<value>``) and a suggested fix.
#
# ``bool`` is a subclass of ``int`` in Python, so the predicates reject it
# explicitly; otherwise a stray ``true`` in the config would be accepted as
# the number 1.
VALIDATION_RULES = [
    {
        "key": "idle_minutes",
        "validate": lambda v: (
            isinstance(v, (int, float))
            and not isinstance(v, bool)
            and v > 0
        ),
        "message": "Invalid idle_minutes={v} - must be > 0",
        "fix": "Set idle_minutes to positive integer (default: 30)",
    },
    {
        "key": "max_skills_discord",
        "validate": lambda v: (
            isinstance(v, int)
            and not isinstance(v, bool)
            and v <= 100
        ),
        "message": "Discord slash command limit reached ({v}/100) - skills not registered",
        "fix": "Reduce skills or paginate registration",
    },
]
def validate_config(config: Dict[str, Any]) -> ConfigValidation:
    """
    Check a gateway configuration and collect every issue found.

    Args:
        config: Configuration dictionary

    Returns:
        ConfigValidation listing the issues plus error/warning counts;
        ``valid`` is True when no error-severity issue was recorded.
    """
    found: List[ConfigIssue] = []

    # Pass 1: known keys. A key counts as present when the config, the
    # process environment, or the spec default supplies a truthy value.
    for name, spec in REQUIRED_KEYS.items():
        resolved = config.get(name) or os.environ.get(name) or spec["default"]
        if resolved:
            continue
        # Report a missing key when it is required, or when it is optional
        # and its severity is anything other than "error".
        if spec["required"] or spec["severity"] != "error":
            found.append(
                ConfigIssue(
                    key=name,
                    severity=spec["severity"],
                    message=spec["message"],
                    fix=spec["fix"],
                )
            )

    # Pass 2: value rules. Only keys present in the config with a non-None
    # value are checked; a failing rule is always an error.
    for rule in VALIDATION_RULES:
        candidate = config.get(rule["key"])
        if candidate is None or rule["validate"](candidate):
            continue
        found.append(
            ConfigIssue(
                key=rule["key"],
                severity="error",
                message=rule["message"].format(v=candidate),
                fix=rule["fix"],
            )
        )

    severities = [issue.severity for issue in found]
    error_count = severities.count("error")
    warning_count = severities.count("warning")

    return ConfigValidation(
        valid=error_count == 0,
        issues=found,
        warnings=warning_count,
        errors=error_count,
    )
def apply_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply default values for missing config keys.

    The input mapping is not modified; a shallow copy is returned.

    Args:
        config: Configuration dictionary

    Returns:
        Config with defaults applied. ``idle_minutes`` is always present in
        the result; it falls back to 30 when missing, falsy, non-positive,
        or not comparable to a number.
    """
    result = dict(config)

    for key, spec in REQUIRED_KEYS.items():
        if result.get(key):
            continue
        # The environment wins over the static default; an empty default
        # leaves the key untouched.
        default = os.environ.get(key) or spec["default"]
        if default:
            result[key] = default
            logger.debug("Applied default for %s", key)

    # Apply validation defaults
    idle = result.get("idle_minutes")
    try:
        needs_default = not idle or idle <= 0
    except TypeError:
        # A truthy non-numeric value (e.g. the string "45") cannot be
        # ordered against 0; fall back to the default instead of crashing.
        logger.warning("Ignoring non-numeric idle_minutes=%r", idle)
        needs_default = True

    if needs_default:
        result["idle_minutes"] = 30
        logger.debug("Applied default idle_minutes=30")

    return result
def fix_discord_skill_limit(skills: List[str], max_skills: int = 95) -> List[str]:
    """
    Fix Discord slash command limit by reducing skills.

    Args:
        skills: List of skill names
        max_skills: Maximum skills to register (default 95, leaving room
            for built-ins). Negative values are treated as 0.

    Returns:
        ``skills`` itself (original order) when it already fits; otherwise
        a new list holding the first ``max_skills`` names in sorted order.
    """
    # A negative limit would turn the slice below into "[:-n]" and keep
    # most of the list, so clamp it to zero first.
    limit = max(max_skills, 0)

    if len(skills) <= limit:
        return skills

    logger.warning(
        "Discord skill limit: %d skills exceeds %d limit, truncating",
        len(skills), limit
    )
    # Keep first `limit` names (alphabetical priority)
    return sorted(skills)[:limit]
def validate_provider_config(provider: str, config: Dict[str, Any]) -> Optional["ConfigIssue"]:
    """
    Validate provider-specific configuration.

    Only the "local-llama.cpp" provider has a rule: it needs either
    ``model_path`` or ``base_url``. Every other provider is accepted.

    Args:
        provider: Provider name
        config: Provider config; ``None`` is treated as an empty config

    Returns:
        ConfigIssue if invalid, None if valid
    """
    # A provider listed without any settings may arrive as None; treat that
    # as "nothing configured" rather than failing on .get().
    settings = config or {}

    if provider == "local-llama.cpp":
        # Check if llama.cpp is configured
        if not settings.get("model_path") and not settings.get("base_url"):
            return ConfigIssue(
                key=f"provider.{provider}",
                severity="warning",
                message=f"{provider} provider not configured - fallback fails",
                fix=f"Configure {provider} model_path or base_url, or remove from provider list",
            )

    return None
def format_validation_report(validation: "ConfigValidation") -> str:
    """
    Format validation results as a report.

    Args:
        validation: Result object exposing ``valid``, ``errors``,
            ``warnings`` and ``issues``

    Returns:
        Multi-line report: a banner, the status and counts, then one
        entry (marker, key, message and fix) per issue.
    """
    # Every severity gets its own visible marker; previously only warnings
    # had one, so errors and info lines were indistinguishable.
    markers = {"error": "❌", "warning": "⚠️"}
    fallback_marker = "ℹ️"

    banner = "=" * 50
    lines = [
        banner,
        "GATEWAY CONFIG VALIDATION",
        banner,
        "",
        f"Status: {'VALID' if validation.valid else 'INVALID'}",
        f"Errors: {validation.errors}",
        f"Warnings: {validation.warnings}",
        "",
    ]

    if validation.issues:
        lines.append("Issues:")
        for issue in validation.issues:
            icon = markers.get(issue.severity, fallback_marker)
            lines.append(f" {icon} [{issue.key}] {issue.message}")
            lines.append(f" Fix: {issue.fix}")
            lines.append("")

    return "\n".join(lines)

147
scripts/queue_health_check.py Executable file
View File

@@ -0,0 +1,147 @@
#!/usr/bin/env python3
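"""
Queue Health Check — Verify dispatch queue is operational.

Checks:
1. Queue file exists
2. Queue file is readable, valid JSON holding a dict
3. Item counts per section (warns when nothing is pending or processing)
4. Queue is not stuck (processing items started more than 1 hour ago)
5. Queue age (oldest pending item added more than 24 hours ago)

Exit status is 0 when the queue is healthy and 1 otherwise.

Usage:
    python scripts/queue_health_check.py
    python scripts/queue_health_check.py --json
    python scripts/queue_health_check.py --queue /path/to/queue.json
"""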
import json
import sys
from datetime import datetime, timedelta
from pathlib import Path
def _timestamp_age(raw, now_naive, now_aware):
    """Return the time elapsed since ISO-8601 string *raw*, or None if unusable.

    None is returned when *raw* is missing, not a string, or cannot be
    parsed. Naive timestamps are measured against the naive local clock and
    offset-aware ones against the aware clock, because Python refuses to
    subtract a naive datetime from an aware one (TypeError).
    """
    if not isinstance(raw, str) or not raw:
        return None
    try:
        # Map a trailing "Z" to an explicit UTC offset, which
        # datetime.fromisoformat accepts.
        parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
    except ValueError:
        return None
    reference = now_naive if parsed.tzinfo is None else now_aware
    return reference - parsed


def check_queue_health(queue_path: str = "~/.hermes/queue.json") -> dict:
    """Check queue health status.

    Args:
        queue_path: Path to the queue JSON file; ``~`` is expanded.

    Returns:
        Dict with keys ``healthy`` (bool), ``checks`` (per-check results and
        item counts), ``warnings`` and ``errors`` (lists of messages).
        ``healthy`` turns False only when the file is missing, unreadable,
        or does not hold a JSON object; stale or old items are warnings.
    """
    path = Path(queue_path).expanduser()
    result = {
        "healthy": True,
        "checks": {},
        "warnings": [],
        "errors": []
    }

    # Check 1: File exists
    if not path.exists():
        result["healthy"] = False
        result["errors"].append(f"Queue file not found: {path}")
        result["checks"]["file_exists"] = False
        return result
    result["checks"]["file_exists"] = True

    # Check 2: File is readable (OSError) and holds valid JSON (ValueError
    # covers both JSON and text decoding failures).
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError, RecursionError) as e:
        result["healthy"] = False
        result["errors"].append(f"Cannot read queue: {e}")
        result["checks"]["readable"] = False
        return result
    result["checks"]["readable"] = True

    # Check 3: Queue structure
    if not isinstance(data, dict):
        result["healthy"] = False
        result["errors"].append("Queue is not a dict")
        result["checks"]["valid_structure"] = False
        return result
    result["checks"]["valid_structure"] = True

    # Check 4: Section counts. A section that is not a list (e.g. null) is
    # reported and treated as empty instead of crashing the check.
    sections = {}
    for name in ("pending", "processing", "completed"):
        items = data.get(name, [])
        if not isinstance(items, list):
            result["warnings"].append(f"Queue section '{name}' is not a list")
            items = []
        sections[name] = items
        result["checks"][f"{name}_count"] = len(items)
    pending = sections["pending"]
    processing = sections["processing"]

    if not pending and not processing:
        result["warnings"].append("Queue is empty")

    # Two views of "now": naive local time and the same instant with the
    # local offset attached, so both timestamp flavours can be compared.
    now_naive = datetime.now()
    now_aware = now_naive.astimezone()

    # Check 5: Stale processing items
    stale_threshold = timedelta(hours=1)
    for item in processing:
        if not isinstance(item, dict):
            continue
        started = item.get("started_at")
        age = _timestamp_age(started, now_naive, now_aware)
        if age is not None and age > stale_threshold:
            result["warnings"].append(f"Stale item: {item.get('id', 'unknown')} (started {started})")

    # Check 6: Queue age. Only items with a parseable timestamp compete for
    # "oldest", so one entry without added_at cannot hide older ones.
    dated = []
    for item in pending:
        if not isinstance(item, dict):
            continue
        age = _timestamp_age(item.get("added_at"), now_naive, now_aware)
        if age is not None:
            dated.append((age, item))
    if dated:
        oldest_age, oldest = max(dated, key=lambda pair: pair[0])
        if oldest_age > timedelta(hours=24):
            added = oldest.get("added_at")
            result["warnings"].append(f"Old item in queue: {oldest.get('id', 'unknown')} (added {added})")

    return result
def main():
    """Parse CLI arguments, run the health check and print the outcome.

    Prints either the raw result as JSON (``--json``) or a plain-text
    summary, then exits with status 0 when the queue is healthy and 1
    otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Queue health check")
    parser.add_argument("--queue", default="~/.hermes/queue.json", help="Queue file path")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    result = check_queue_health(args.queue)

    if args.json:
        print(json.dumps(result, indent=2))
    else:
        def mark(ok):
            # Both outcomes used to print the same empty string, so the
            # summary never showed the actual state. ASCII labels keep the
            # output safe on terminals without UTF-8 support.
            return "OK" if ok else "FAIL"

        print("Queue Health Check")
        print("=" * 50)
        print(f"Healthy: {mark(result['healthy'])}")
        print()
        print("Checks:")
        for check, value in result["checks"].items():
            # Boolean checks get a pass/fail label; counts print as-is.
            shown = mark(value) if isinstance(value, bool) else value
            print(f" {check}: {shown}")

        if result["warnings"]:
            print()
            print("Warnings:")
            for warning in result["warnings"]:
                print(f"{warning}")

        if result["errors"]:
            print()
            print("Errors:")
            for error in result["errors"]:
                print(f"{error}")

    sys.exit(0 if result["healthy"] else 1)


if __name__ == "__main__":
    main()