- Add nexus_alert() function to send alerts to Nexus Watchdog - Alerts are written as JSON files to $NEXUS_ALERT_DIR (default: /tmp/nexus-alerts) - Alert includes: alert_id, timestamp, source, host, alert_type, severity, message, data - Send 'stale_lock_reclaimed' warning alert when stale lock detected (age > 600s) - Send 'heartbeat_resumed' info alert after successful recovery - Include lock age, lockfile path, action taken, and stat info in alert data - Add configurable NEXUS_ALERT_DIR and NEXUS_ALERT_ENABLED settings - Add test script for validating alert functionality
147 lines
3.8 KiB
Bash
Executable File
147 lines
3.8 KiB
Bash
Executable File
#!/bin/bash
# Test script for Nexus Watchdog alerting functionality
#
# Sends sample alerts through nexus_alert and checks the JSON files that end
# up in the alert directory. The later validation steps rely on python3.

set -euo pipefail

# Use a private, unpredictable scratch directory rather than a guessable
# /tmp/<name>-<pid> path that another local user could pre-create.
TEST_DIR=$(mktemp -d "${TMPDIR:-/tmp}/test-nexus-alerts.XXXXXX")
export NEXUS_ALERT_DIR="$TEST_DIR"
export NEXUS_ALERT_ENABLED=true

# Remove the scratch directory (and the helper file at
# /tmp/test_alert_func.sh, if present) on every exit path, including early
# failures. ${TEST_DIR:?} refuses to run rm -rf on an empty value.
cleanup() {
  rm -rf -- "${TEST_DIR:?}" /tmp/test_alert_func.sh
}
trap cleanup EXIT

echo "=== Nexus Watchdog Alert Test ==="
echo "Test alert directory: $TEST_DIR"
|
|
|
|
# Define nexus_alert for this test run.
# The function is written to a temporary file and sourced from there; it is a
# standalone copy embedded in this script, not read from the heartbeat script.
# NOTE(review): keep this copy in sync with the heartbeat script's version,
# which is not visible from here.
alert_func_file=$(mktemp "${TMPDIR:-/tmp}/test_alert_func.XXXXXX") || {
  echo "Could not create temporary file for nexus_alert" >&2
  exit 1
}

cat > "$alert_func_file" << 'ALEOF'
#!/bin/bash
NEXUS_ALERT_DIR="${NEXUS_ALERT_DIR:-/tmp/nexus-alerts}"
NEXUS_ALERT_ENABLED=true
HOSTNAME=$(hostname -s 2>/dev/null || echo "unknown")
SCRIPT_NAME="kimi-heartbeat-test"

#######################################
# Escape a string so it can be embedded inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
_nexus_json_escape() {
  local text="$1"
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

#######################################
# Write one alert as a JSON file into $NEXUS_ALERT_DIR.
# Globals:
#   NEXUS_ALERT_ENABLED (read) - alerts are skipped unless this is "true"
#   NEXUS_ALERT_DIR     (read) - directory the alert file is written to
#   SCRIPT_NAME, HOSTNAME (read) - recorded as "source" and "host"
# Arguments:
#   $1 - alert type
#   $2 - human-readable message
#   $3 - severity (default: info)
#   $4 - extra data; must already be valid JSON, inserted verbatim
#        (default: {})
# Outputs:
#   One status line on stdout (NEXUS_ALERT: ... or NEXUS_ALERT_FAILED: ...).
# Returns:
#   0 when the alert was written, when alerting is disabled, or when the
#   alert directory cannot be created; 1 when the file could not be written.
#######################################
nexus_alert() {
  local alert_type="$1"
  local message="$2"
  local severity="${3:-info}"
  # "${4:-{}}" is parsed as "${4:-{}" followed by a literal "}", which
  # appends a stray brace to every caller-supplied payload. Apply the
  # default in a separate step instead.
  local extra_data="${4:-}"
  if [[ -z "$extra_data" ]]; then
    extra_data='{}'
  fi

  if [[ "$NEXUS_ALERT_ENABLED" != "true" ]]; then
    return 0
  fi

  # Alerting is best-effort: an unusable alert directory is not an error.
  mkdir -p "$NEXUS_ALERT_DIR" 2>/dev/null || return 0

  # Declarations are kept apart from the command substitutions so that a
  # failing command is not masked by the exit status of 'local'.
  local timestamp epoch nanoseconds
  timestamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
  epoch=$(date +%s)
  nanoseconds=$(date +%N 2>/dev/null) || nanoseconds=""
  # Some date implementations do not support %N and print it literally;
  # fall back to a random number so alerts in the same second stay distinct.
  [[ "$nanoseconds" =~ ^[0-9]+$ ]] || nanoseconds="$RANDOM"

  local alert_id="${SCRIPT_NAME}_${epoch}_${nanoseconds}_$$"
  local alert_file="$NEXUS_ALERT_DIR/${alert_id}.json"

  # If the file cannot be opened the group is not run and the failure
  # branch reports it.
  {
    printf '{\n'
    printf '  "alert_id": "%s",\n' "$(_nexus_json_escape "$alert_id")"
    printf '  "timestamp": "%s",\n' "$(_nexus_json_escape "$timestamp")"
    printf '  "source": "%s",\n' "$(_nexus_json_escape "$SCRIPT_NAME")"
    printf '  "host": "%s",\n' "$(_nexus_json_escape "$HOSTNAME")"
    printf '  "alert_type": "%s",\n' "$(_nexus_json_escape "$alert_type")"
    printf '  "severity": "%s",\n' "$(_nexus_json_escape "$severity")"
    printf '  "message": "%s",\n' "$(_nexus_json_escape "$message")"
    printf '  "data": %s\n' "$extra_data"
    printf '}\n'
  } > "$alert_file" 2>/dev/null || {
    echo "NEXUS_ALERT_FAILED: Could not write alert"
    return 1
  }

  echo "NEXUS_ALERT: $alert_type [$severity] - $message"
  return 0
}
ALEOF

# shellcheck source=/dev/null
source "$alert_func_file"
# The function now lives in this shell; the file is no longer needed.
rm -f -- "$alert_func_file"
|
|
|
|
# Test 1: a plain info alert with a small JSON payload.
printf '\n%s\n' "[TEST 1] Sending basic info alert..."
nexus_alert "test_alert" "Test message from heartbeat" "info" '{"test": true}'

# Test 2: the warning raised when a stale lock has been reclaimed.
printf '\n%s\n' "[TEST 2] Sending stale lock alert..."
stale_lock_data='{"lock_age_seconds": 650, "lockfile": "/tmp/kimi-heartbeat.lock", "action": "removed"}'
nexus_alert "stale_lock_reclaimed" "Stale lockfile deadlock cleared after 650s" "warning" "$stale_lock_data"

# Test 3: the info alert sent once the heartbeat is running again.
printf '\n%s\n' "[TEST 3] Sending heartbeat resumed alert..."
resumed_data='{"recovery": "successful", "continuing": true}'
nexus_alert "heartbeat_resumed" "Kimi heartbeat resumed after clearing stale lock" "info" "$resumed_data"
|
|
|
|
# Count the alert files that were written; exactly three are expected,
# one per nexus_alert call made by this script.
printf '\n%s\n' "=== Alert Files Created ==="
alert_count=$(find "$TEST_DIR" -name "*.json" 2>/dev/null | wc -l)
printf '%s\n' "Total alert files: $alert_count"

# Guard clause: stop right away when the count is off.
if [ "$alert_count" -ne 3 ]; then
  echo "❌ Expected 3 alerts, found $alert_count"
  exit 1
fi
echo "✅ All 3 alerts were created successfully"
|
|
|
|
# Print every alert file, pretty-printed when python3 can parse it.
printf '\n%s\n' "=== Alert Contents ==="
for alert_json in "$TEST_DIR"/*.json; do
  printf '\n%s\n' "--- $(basename "$alert_json") ---"
  # Fall back to the raw file when pretty-printing fails.
  if ! python3 -m json.tool < "$alert_json" 2>/dev/null; then
    cat "$alert_json"
  fi
done
|
|
|
|
# Validate JSON structure

#######################################
# Check whether a file parses as JSON.
# The path is handed to python3 as an argument rather than pasted into the
# Python source, so paths containing quotes or backslashes are handled.
# Arguments: $1 - path of the file to check
# Returns:   0 if python3 parses the file as JSON, non-zero otherwise
#            (including when python3 cannot be run).
#######################################
is_valid_json() {
  python3 -c 'import json, sys; json.load(open(sys.argv[1]))' "$1" 2>/dev/null
}

echo -e "\n=== JSON Validation ==="
all_valid=true
for f in "$TEST_DIR"/*.json; do
  # An unmatched glob stays as the literal pattern; skip it.
  [[ -e "$f" ]] || continue
  if is_valid_json "$f"; then
    echo "✅ $(basename "$f") - Valid JSON"
  else
    echo "❌ $(basename "$f") - Invalid JSON"
    all_valid=false
  fi
done
|
|
|
|
# Check for required fields

#######################################
# List which of the given top-level keys are absent from a JSON file.
# Runs a single python3 process per file; the path and the field names are
# passed as arguments instead of being pasted into the Python source.
# Arguments: $1  - path of the JSON file
#            $2+ - field names to look for
# Outputs:   the missing field names on one line, space-separated
#            (an empty line when nothing is missing)
# Returns:   0 when the file was parsed, non-zero otherwise
#######################################
missing_alert_fields() {
  local json_file="$1"
  shift
  python3 -c '
import json, sys
with open(sys.argv[1]) as fh:
    doc = json.load(fh)
print(" ".join(name for name in sys.argv[2:] if name not in doc))
' "$json_file" "$@" 2>/dev/null
}

echo -e "\n=== Required Fields Check ==="
required_fields=(alert_id timestamp source host alert_type severity message data)
for f in "$TEST_DIR"/*.json; do
  # An unmatched glob stays as the literal pattern; skip it.
  [[ -e "$f" ]] || continue
  file_name=$(basename "$f")

  # Files that cannot be parsed are skipped here, not reported a second time.
  missing=$(missing_alert_fields "$f" "${required_fields[@]}") || continue

  if [[ -z "$missing" ]]; then
    echo "✅ $file_name - All required fields present"
  else
    echo "❌ $file_name - Missing fields: $missing"
    all_valid=false
  fi
done
|
|
|
|
# Remove the scratch alert directory and the temporary function file.
rm -rf "$TEST_DIR" /tmp/test_alert_func.sh

# Report the overall result; the exit status mirrors it.
printf '\n%s\n' "=== Test Summary ==="
if [ "$all_valid" != true ]; then
  echo "❌ Some tests failed"
  exit 1
fi
echo "✅ All tests passed!"
exit 0
|