295 lines
9.3 KiB
Python
295 lines
9.3 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Agent Tick Monitor

Monthly monitoring to verify all agents have submitted ticks.
Creates Gitea issues with RCA templates for missing or failed ticks.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
import json
|
||
|
|
import subprocess
|
||
|
|
import argparse
|
||
|
|
from datetime import datetime, timedelta
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Locations of the tick files and the agent registry, all under REPO_DIR.
REPO_DIR = Path("/root/wizards/household-snapshots")
TICKS_DIR = REPO_DIR.joinpath("ticks")
REGISTRY_FILE = REPO_DIR.joinpath("config", "agent-registry.json")

# Gitea connection settings; the URL and token are read from the environment,
# falling back to the defaults given here.
GITEA_URL = os.getenv("CLAW_CODE_GITEA_URL", "http://143.198.27.163:3000")
GITEA_TOKEN = os.getenv("GITEA_TOKEN", "")
REPO_NAME = "allegro/household-snapshots"
|
||
|
|
|
||
|
|
def run_cmd(cmd, cwd=None):
    """Run a command and return its captured output.

    Args:
        cmd: Either a shell command line (``str``), which is run through the
            shell, or a sequence of arguments, which is executed directly
            without a shell.
        cwd: Optional working directory for the child process.

    Returns:
        A ``(stdout, stderr, returncode)`` tuple. Both streams are captured
        as text and stripped of leading/trailing whitespace.
    """
    # Only strings go through the shell: with shell=True on POSIX, the items
    # after the first element of a sequence become arguments to the shell
    # itself rather than to the command.
    use_shell = isinstance(cmd, str)
    result = subprocess.run(
        cmd, shell=use_shell, cwd=cwd, capture_output=True, text=True
    )
    return result.stdout.strip(), result.stderr.strip(), result.returncode
|
||
|
|
|
||
|
|
def load_registry():
    """Read REGISTRY_FILE and return its parsed JSON content."""
    with REGISTRY_FILE.open() as handle:
        registry = json.load(handle)
    return registry
|
||
|
|
|
||
|
|
def get_tick_month(now=None):
    """Return the month whose ticks should be checked, as ``YYYY-MM``.

    On the 1st through 3rd of a month (the grace period) the previous month
    is returned; from the 4th onward the current month is returned.

    Args:
        now: Optional ``datetime`` to evaluate. Defaults to the current UTC
            time, which keeps existing no-argument callers unchanged.

    Returns:
        The selected month formatted as ``YYYY-MM``.
    """
    if now is None:
        now = datetime.utcnow()

    if now.day > 3:
        return now.strftime("%Y-%m")

    # Still in grace period: step back one month, rolling January over to
    # December of the previous year.
    if now.month == 1:
        year, month = now.year - 1, 12
    else:
        year, month = now.year, now.month - 1
    return f"{year}-{month:02d}"
|
||
|
|
|
||
|
|
def load_tick(agent_id, tick_month):
    """Load the tick file for one agent and month.

    The file is looked up at ``TICKS_DIR/<tick_month>/<agent_id>.json``.

    Args:
        agent_id: Agent identifier used as the file stem.
        tick_month: Month directory name (``YYYY-MM``).

    Returns:
        The parsed JSON content, or ``None`` when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    tick_file = TICKS_DIR / tick_month / f"{agent_id}.json"
    try:
        # Open directly instead of exists()+open(): the file could disappear
        # between the two calls. JSON is read as UTF-8 rather than with the
        # locale's default encoding.
        with open(tick_file, encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        return None
|
||
|
|
|
||
|
|
def create_gitea_issue(title, body):
    """Create an issue in the configured Gitea repository.

    Args:
        title: Issue title.
        body: Issue body text.

    Returns:
        ``(number, html_url)`` read from the API response on success, or
        ``(None, None)`` when the token is missing or the request fails.
        A pair is returned in every case so callers can always unpack it.
    """
    if not GITEA_TOKEN:
        print("Error: GITEA_TOKEN not set")
        return None, None

    import urllib.request
    import urllib.error

    url = f"{GITEA_URL}/api/v1/repos/{REPO_NAME}/issues"

    # NOTE(review): Gitea's issue-creation API documents "labels" as a list of
    # numeric label IDs; confirm the server accepts label names here.
    data = json.dumps({
        "title": title,
        "body": body,
        "labels": ["agent-health", "rca-required", "auto-generated"],
    }).encode("utf-8")

    req = urllib.request.Request(
        url,
        data=data,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"token {GITEA_TOKEN}",
        },
        method="POST",
    )

    try:
        # The timeout keeps an unresponsive server from blocking the whole
        # monitoring run indefinitely.
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
            return result.get("number"), result.get("html_url")
    except urllib.error.HTTPError as e:
        print(f"Error creating issue: {e.code} {e.reason}")
        print(e.read().decode(errors="replace"))
        return None, None
    except Exception as e:
        # Best-effort boundary: report any other failure and signal it via
        # the return value instead of aborting the run.
        print(f"Error: {e}")
        return None, None
|
||
|
|
|
||
|
|
def _days_overdue(tick_month, today):
    """Return whole days elapsed since the 3rd of *tick_month*, floored at 0."""
    try:
        deadline = datetime.strptime(f"{tick_month}-03", "%Y-%m-%d").date()
    except (TypeError, ValueError):
        # Unparseable month label: fall back to the day-of-month estimate.
        return max(0, today.day - 3)
    return max(0, (today - deadline).days)


def generate_rca_body(agent, tick, tick_month, failure_type):
    """Generate the Markdown body of an RCA issue for one agent failure.

    Args:
        agent: Registry entry for the agent; the keys ``id``, ``name``,
            ``home_dir`` and ``config_path`` are read.
        tick: Parsed tick dict, or ``None`` when no tick was found. For
            critical failures its ``vitals`` and ``capabilities`` entries
            are reported when present.
        tick_month: Month being checked, as ``YYYY-MM``.
        failure_type: ``"MISSING_TICK"``, ``"CRITICAL_STATUS"`` (the
            upper-cased tick status ``"CRITICAL"`` is accepted as well) or
            ``"OFFLINE"``. Any other value yields a generic
            "Unknown failure" section.

    Returns:
        The issue body as a string.
    """
    now_dt = datetime.utcnow()
    now = now_dt.isoformat() + "Z"

    if failure_type == "MISSING_TICK":
        status_section = """### Actual Behavior
- **Tick Status:** NOT SUBMITTED
- **Expected By:** End of 72-hour window (3rd of month)
- **Current Date:** {now}
- **Days Overdue:** {days_overdue}
""".format(
            now=now[:10],
            # Counted from the 3rd of the checked month so that past months
            # (grace-period runs, --month) are not reported as 0 days late.
            # NOTE(review): assumes the deadline is the 3rd of tick_month.
            days_overdue=_days_overdue(tick_month, now_dt.date()),
        )
        failure_desc = "Agent failed to submit monthly tick within grace period"
    elif failure_type in ("CRITICAL_STATUS", "CRITICAL"):
        # Tick content is agent-submitted, so missing sections are rendered
        # as "unknown" instead of aborting issue creation with a KeyError.
        tick_data = tick or {}
        vitals = tick_data.get("vitals") or {}
        capabilities = tick_data.get("capabilities") or {}
        status_section = """### Actual Behavior
- **Tick Status:** CRITICAL
- **Vitals Check:**
  - Gateway: {gateway}
  - Home Dir: {home}
  - Config: {config}
- **Capabilities Lost:** {lost_caps}
""".format(
            gateway=vitals.get("gateway_running", "unknown"),
            home=vitals.get("home_directory_accessible", "unknown"),
            config=vitals.get("config_valid", "unknown"),
            lost_caps=", ".join(k for k, v in capabilities.items() if not v) or "None",
        )
        failure_desc = "Agent reported critical status in monthly tick"
    elif failure_type == "OFFLINE":
        status_section = """### Actual Behavior
- **Tick Status:** OFFLINE
- **Agent unreachable** during tick window
"""
        failure_desc = "Agent marked as offline in monthly tick"
    else:
        status_section = "### Actual Behavior\n- Unknown failure\n"
        failure_desc = "Unknown failure type"

    body = """## Agent Health Failure: {agent_id}

**Detected:** {detected}
**Agent:** {agent_name}
**Failure Type:** {failure_type}
**Auto-Generated By:** Agent Tick Monitor

### Expected Behavior
Agent should emit monthly tick within 72-hour window (by 3rd of month) with healthy status.

{status_section}

### Root Cause Analysis Required

Please investigate and document:

#### 1. What happened?
- [ ] Last successful operation identified
- [ ] Error logs reviewed
- [ ] System state at failure documented

#### 2. Why did it happen?
- [ ] Configuration drift assessed
- [ ] Resource exhaustion checked
- [ ] External dependencies verified
- [ ] Code regression analyzed

#### 3. How do we prevent recurrence?
- [ ] Monitoring improvements identified
- [ ] Automated recovery considered
- [ ] Alert tuning reviewed

#### 4. Recovery steps taken
- [ ] Actions performed
- [ ] Current status verified
- [ ] Validation completed

### Timeline

- [x] T+0: Issue created (auto)
- [ ] T+1h: Initial response from agent owner
- [ ] T+24h: RCA submitted
- [ ] T+7d: Resolution verified

### Related Resources

- Agent home: `{home_dir}`
- Agent config: `{config_path}`
- Last tick: {last_tick_link}
- Tick file: `{tick_file}`

### Checklist for Resolution

- [ ] Root cause identified
- [ ] Fix implemented
- [ ] Agent recovered to healthy status
- [ ] New tick submitted (if missing)
- [ ] Monitoring improved
- [ ] Issue closed with summary

---
*Auto-generated by Agent Tick Monitor v1.0*
*Evenia binds us. Failures are learned from.*
""".format(
        agent_id=agent["id"],
        agent_name=agent["name"],
        detected=now,
        failure_type=failure_type,
        status_section=status_section,
        home_dir=agent["home_dir"],
        config_path=agent["config_path"],
        last_tick_link=f"{GITEA_URL}/{REPO_NAME}/tree/main/ticks/{tick_month}/{agent['id']}.json" if tick else "N/A - No tick found",
        tick_file=str(TICKS_DIR / tick_month / f"{agent['id']}.json"),
    )

    return body
|
||
|
|
|
||
|
|
# Failing tick statuses mapped to the failure type passed to generate_rca_body.
_STATUS_FAILURE_TYPES = {"critical": "CRITICAL_STATUS", "offline": "OFFLINE"}


def _file_rca_issue(agent, tick, tick_month, failure_type, title, results):
    """Create the RCA issue for one failure and record the outcome in *results*."""
    body = generate_rca_body(agent, tick, tick_month, failure_type)
    created = create_gitea_issue(title, body)
    # A failed creation may come back as None or as a (None, None) pair;
    # normalise both so the unpacking below cannot raise.
    issue_num, issue_url = created if created else (None, None)
    if issue_num:
        print(f" → Issue #{issue_num} created: {issue_url}")
        results["issue_created"] = True
        results["issue_number"] = issue_num
    else:
        print(" ✗ Failed to create issue")


def check_agent(agent, tick_month, create_issues=True):
    """Check one agent's tick for the given month and report its status.

    A missing tick or a tick with status ``critical`` / ``offline`` is
    treated as a failure and, when *create_issues* is true, an RCA issue is
    filed. ``degraded`` only prints a warning; any other status is reported
    as healthy.

    Args:
        agent: Registry entry for the agent; ``id`` is read here.
        tick_month: Month to check, as ``YYYY-MM``.
        create_issues: When false, only print the status (dry run).

    Returns:
        A dict with the keys ``agent_id``, ``tick_present``, ``status``,
        ``issue_created`` and ``issue_number``.
    """
    agent_id = agent["id"]
    tick = load_tick(agent_id, tick_month)
    status = tick["status"] if tick else "NO_TICK"

    results = {
        "agent_id": agent_id,
        "tick_present": tick is not None,
        "status": status,
        "issue_created": False,
        "issue_number": None,
    }

    if not tick:
        # Missing tick
        print(f" ✗ {agent_id}: MISSING TICK")
        if create_issues:
            title = f"RCA-AGENT: {agent_id} failed to submit monthly tick ({tick_month})"
            _file_rca_issue(agent, None, tick_month, "MISSING_TICK", title, results)
    elif status in _STATUS_FAILURE_TYPES:
        # Failed status
        print(f" ✗ {agent_id}: STATUS {status.upper()}")
        if create_issues:
            title = f"RCA-AGENT: {agent_id} reported {status} status ({tick_month})"
            # Pass the failure type the body generator recognises; the plain
            # upper-cased status "CRITICAL" is not one of its branches.
            _file_rca_issue(
                agent, tick, tick_month, _STATUS_FAILURE_TYPES[status], title, results
            )
    elif status == "degraded":
        # Degraded - warning but no RCA
        print(f" ⚠ {agent_id}: DEGRADED")
        print(" (No RCA required, but monitor)")
    else:
        # Healthy
        print(f" ✓ {agent_id}: HEALTHY")

    return results
|
||
|
|
|
||
|
|
def main():
    """Command-line entry point: check all active agents and print a summary.

    Returns:
        0 when every checked agent is healthy or degraded, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Agent Tick Monitor")
    parser.add_argument(
        "--check-and-report",
        action="store_true",
        help="Run monthly check and create issues",
    )
    parser.add_argument(
        "--month",
        help="Specific month to check (YYYY-MM format)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Check but don't create issues",
    )
    args = parser.parse_args()

    registry = load_registry()
    tick_month = args.month if args.month else get_tick_month()

    print("=== Agent Tick Monitor ===")
    print(f"Checking month: {tick_month}")
    print(f"Time: {datetime.utcnow().isoformat()}Z")
    print()

    # Issues are only created when this is not a dry run.
    file_issues = not args.dry_run
    results = []
    for entry in registry["agents"]:
        if not entry["active"]:
            continue
        results.append(check_agent(entry, tick_month, create_issues=file_issues))

    print()
    print("=== Summary ===")
    statuses = [outcome["status"] for outcome in results]
    total = len(results)
    healthy = statuses.count("healthy")
    degraded = statuses.count("degraded")
    # Everything that is neither healthy nor degraded counts as failed.
    failed = total - healthy - degraded
    issues = len([outcome for outcome in results if outcome["issue_created"]])

    print(f"Total agents: {total}")
    print(f" Healthy: {healthy}")
    print(f" Degraded: {degraded}")
    print(f" Failed: {failed}")
    print(f" Issues created: {issues}")

    if failed == 0:
        return 0
    return 1
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|