#!/usr/bin/env python3
"""
Agent Tick Monitor

Monthly monitoring to verify all agents have submitted ticks.
Creates Gitea issues with RCA templates for missing or failed ticks.
"""

import os
import sys
import json
import subprocess
import argparse
import urllib.error
import urllib.request
from datetime import date, datetime, timedelta, timezone
from pathlib import Path

REPO_DIR = Path("/root/wizards/household-snapshots")
TICKS_DIR = REPO_DIR / "ticks"
REGISTRY_FILE = REPO_DIR / "config" / "agent-registry.json"
GITEA_URL = os.environ.get("CLAW_CODE_GITEA_URL", "http://143.198.27.163:3000")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
REPO_NAME = "allegro/household-snapshots"

# Ticks for a month are due by the end of this day of that month.
_GRACE_DAYS = 3
# Upper bound on a single Gitea API call so a stalled server cannot hang the run.
_HTTP_TIMEOUT_SECONDS = 30


def _utcnow():
    """Return the current UTC time as a naive datetime.

    Equivalent to the deprecated ``datetime.utcnow()``; the value is kept
    naive so ``isoformat() + "Z"`` renders without a ``+00:00`` offset.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def run_cmd(cmd, cwd=None):
    """Run a shell command and return ``(stdout, stderr, returncode)``.

    Args:
        cmd: Command line, interpreted by the shell.
        cwd: Optional working directory for the command.

    Returns:
        Tuple of stripped stdout, stripped stderr and the exit code.
    """
    # NOTE: shell=True is part of this helper's contract (cmd is one string);
    # never pass untrusted input here.
    result = subprocess.run(
        cmd,
        shell=True,
        cwd=cwd,
        capture_output=True,
        text=True
    )
    return result.stdout.strip(), result.stderr.strip(), result.returncode


def load_registry():
    """Load and return the parsed agent registry from REGISTRY_FILE."""
    with open(REGISTRY_FILE, encoding="utf-8") as f:
        return json.load(f)


def get_tick_month(now=None):
    """Return the month to check as ``YYYY-MM``.

    On the 1st-3rd the current month is still inside its grace period, so
    the previous month is checked instead.

    Args:
        now: Reference datetime; defaults to the current UTC time.
    """
    if now is None:
        now = _utcnow()
    if now.day <= _GRACE_DAYS:
        # Still in grace period: step back to the last day of the previous
        # month, which also handles the January -> December rollover.
        previous = now.replace(day=1) - timedelta(days=1)
        return previous.strftime("%Y-%m")
    return now.strftime("%Y-%m")


def load_tick(agent_id, tick_month):
    """Load the tick file for an agent, or return None if it does not exist."""
    tick_file = TICKS_DIR / tick_month / f"{agent_id}.json"
    if tick_file.exists():
        with open(tick_file, encoding="utf-8") as f:
            return json.load(f)
    return None


def create_gitea_issue(title, body):
    """Create a Gitea issue in REPO_NAME.

    Args:
        title: Issue title.
        body: Issue body (Markdown).

    Returns:
        ``(issue_number, html_url)`` on success, or ``None`` when the token
        is missing or the request fails (the error is printed).
    """
    if not GITEA_TOKEN:
        print("Error: GITEA_TOKEN not set")
        return None

    url = f"{GITEA_URL}/api/v1/repos/{REPO_NAME}/issues"
    # NOTE(review): Gitea's CreateIssueOption appears to expect numeric label
    # IDs in "labels"; sending names may be rejected - confirm against the
    # server's API version.
    data = json.dumps({
        "title": title,
        "body": body,
        "labels": ["agent-health", "rca-required", "auto-generated"]
    }).encode('utf-8')

    req = urllib.request.Request(
        url,
        data=data,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"token {GITEA_TOKEN}"
        },
        method="POST"
    )

    try:
        with urllib.request.urlopen(req, timeout=_HTTP_TIMEOUT_SECONDS) as response:
            result = json.loads(response.read().decode())
            return result.get("number"), result.get("html_url")
    except urllib.error.HTTPError as e:
        print(f"Error creating issue: {e.code} {e.reason}")
        print(e.read().decode(errors="replace"))
        return None
    except Exception as e:
        # Best-effort boundary: a failed issue must not abort the whole run.
        print(f"Error: {e}")
        return None


def _days_overdue(tick_month, now):
    """Return whole days elapsed since the deadline of ``tick_month``.

    The deadline is the end of day ``_GRACE_DAYS`` of the tick month. If
    ``tick_month`` cannot be parsed, fall back to the day-of-month formula.
    """
    try:
        year, month = (int(part) for part in tick_month.split("-"))
        deadline = date(year, month, _GRACE_DAYS)
    except (ValueError, AttributeError):
        return max(0, now.day - _GRACE_DAYS)
    return max(0, (now.date() - deadline).days)


def generate_rca_body(agent, tick, tick_month, failure_type, now=None):
    """Generate the Markdown body of an RCA issue.

    Args:
        agent: Registry entry; reads "id", "name", "home_dir", "config_path".
        tick: Parsed tick dict, or None when no tick was found.
        tick_month: Month being checked, ``YYYY-MM``.
        failure_type: "MISSING_TICK", "CRITICAL_STATUS" (or "CRITICAL"),
            or "OFFLINE"; anything else renders an "Unknown failure" section.
        now: Reference datetime; defaults to the current UTC time.

    Returns:
        The issue body as a string.
    """
    if now is None:
        now = _utcnow()
    detected = now.isoformat() + "Z"

    if failure_type == "MISSING_TICK":
        status_section = """### Actual Behavior
- **Tick Status:** NOT SUBMITTED
- **Expected By:** End of 72-hour window (3rd of month)
- **Current Date:** {now}
- **Days Overdue:** {days_overdue}
""".format(now=detected[:10], days_overdue=_days_overdue(tick_month, now))
    elif failure_type in ("CRITICAL_STATUS", "CRITICAL"):
        # check_agent passes the upper-cased tick status ("CRITICAL"), so both
        # spellings must select this section.
        vitals = (tick or {}).get("vitals") or {}
        capabilities = (tick or {}).get("capabilities") or {}
        status_section = """### Actual Behavior
- **Tick Status:** CRITICAL
- **Vitals Check:**
  - Gateway: {gateway}
  - Home Dir: {home}
  - Config: {config}
- **Capabilities Lost:** {lost_caps}
""".format(
            gateway=vitals.get("gateway_running", "unknown"),
            home=vitals.get("home_directory_accessible", "unknown"),
            config=vitals.get("config_valid", "unknown"),
            lost_caps=", ".join(k for k, v in capabilities.items() if not v) or "None"
        )
    elif failure_type == "OFFLINE":
        status_section = """### Actual Behavior
- **Tick Status:** OFFLINE
- **Agent unreachable** during tick window
"""
    else:
        status_section = "### Actual Behavior\n- Unknown failure\n"

    body = """## Agent Health Failure: {agent_id}

**Detected:** {detected}
**Agent:** {agent_name}
**Failure Type:** {failure_type}
**Auto-Generated By:** Agent Tick Monitor

### Expected Behavior
Agent should emit monthly tick within 72-hour window (by 3rd of month) with healthy status.

{status_section}
### Root Cause Analysis Required

Please investigate and document:

#### 1. What happened?
- [ ] Last successful operation identified
- [ ] Error logs reviewed
- [ ] System state at failure documented

#### 2. Why did it happen?
- [ ] Configuration drift assessed
- [ ] Resource exhaustion checked
- [ ] External dependencies verified
- [ ] Code regression analyzed

#### 3. How do we prevent recurrence?
- [ ] Monitoring improvements identified
- [ ] Automated recovery considered
- [ ] Alert tuning reviewed

#### 4. Recovery steps taken
- [ ] Actions performed
- [ ] Current status verified
- [ ] Validation completed

### Timeline
- [x] T+0: Issue created (auto)
- [ ] T+1h: Initial response from agent owner
- [ ] T+24h: RCA submitted
- [ ] T+7d: Resolution verified

### Related Resources
- Agent home: `{home_dir}`
- Agent config: `{config_path}`
- Last tick: {last_tick_link}
- Tick file: `{tick_file}`

### Checklist for Resolution
- [ ] Root cause identified
- [ ] Fix implemented
- [ ] Agent recovered to healthy status
- [ ] New tick submitted (if missing)
- [ ] Monitoring improved
- [ ] Issue closed with summary

---
*Auto-generated by Agent Tick Monitor v1.0*
*Evenia binds us. Failures are learned from.*
""".format(
        agent_id=agent["id"],
        agent_name=agent["name"],
        detected=detected,
        failure_type=failure_type,
        status_section=status_section,
        home_dir=agent["home_dir"],
        config_path=agent["config_path"],
        last_tick_link=f"{GITEA_URL}/{REPO_NAME}/tree/main/ticks/{tick_month}/{agent['id']}.json" if tick else "N/A - No tick found",
        tick_file=str(TICKS_DIR / tick_month / f"{agent['id']}.json")
    )

    return body


def _file_rca_issue(agent, tick, tick_month, title, failure_type, results):
    """Create the RCA issue for one agent and record the outcome in ``results``."""
    body = generate_rca_body(agent, tick, tick_month, failure_type)
    created = create_gitea_issue(title, body)
    # create_gitea_issue returns None on failure; never unpack it blindly.
    issue_num, issue_url = created if created else (None, None)
    if issue_num:
        print(f" → Issue #{issue_num} created: {issue_url}")
        results["issue_created"] = True
        results["issue_number"] = issue_num
    else:
        print(f" ✗ Failed to create issue")


def check_agent(agent, tick_month, create_issues=True):
    """Check one agent's tick for ``tick_month`` and optionally file an RCA issue.

    Args:
        agent: Registry entry for the agent.
        tick_month: Month to check, ``YYYY-MM``.
        create_issues: When False, only report; do not call Gitea.

    Returns:
        Dict with "agent_id", "tick_present", "status", "issue_created"
        and "issue_number".
    """
    agent_id = agent["id"]
    tick = load_tick(agent_id, tick_month)
    # A tick without a "status" field is reported as "unknown" instead of
    # aborting the whole run with a KeyError.
    status = tick.get("status", "unknown") if tick else "NO_TICK"

    results = {
        "agent_id": agent_id,
        "tick_present": tick is not None,
        "status": status,
        "issue_created": False,
        "issue_number": None
    }

    if not tick:
        # Missing tick
        print(f" ✗ {agent_id}: MISSING TICK")
        if create_issues:
            title = f"RCA-AGENT: {agent_id} failed to submit monthly tick ({tick_month})"
            _file_rca_issue(agent, None, tick_month, title, "MISSING_TICK", results)
    elif status in ("critical", "offline"):
        # Failed status
        print(f" ✗ {agent_id}: STATUS {status.upper()}")
        if create_issues:
            title = f"RCA-AGENT: {agent_id} reported {status} status ({tick_month})"
            _file_rca_issue(agent, tick, tick_month, title, status.upper(), results)
    elif status == "degraded":
        # Degraded - warning but no RCA
        print(f" ⚠ {agent_id}: DEGRADED")
        print(f" (No RCA required, but monitor)")
    elif status == "healthy":
        print(f" ✓ {agent_id}: HEALTHY")
    else:
        # The summary counts anything other than healthy/degraded as failed,
        # so do not label an unrecognised status as healthy here.
        print(f" ? {agent_id}: UNKNOWN STATUS {status!r}")

    return results


def main():
    """Check every active agent in the registry and print a summary.

    Returns:
        0 when no agent failed, 1 otherwise (used as the process exit code).
    """
    parser = argparse.ArgumentParser(description="Agent Tick Monitor")
    # NOTE: --check-and-report is accepted for CLI compatibility but is not
    # read below; the check always runs.
    parser.add_argument("--check-and-report", action="store_true",
                        help="Run monthly check and create issues")
    parser.add_argument("--month", help="Specific month to check (YYYY-MM format)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Check but don't create issues")
    args = parser.parse_args()

    registry = load_registry()
    tick_month = args.month or get_tick_month()

    print(f"=== Agent Tick Monitor ===")
    print(f"Checking month: {tick_month}")
    print(f"Time: {_utcnow().isoformat()}Z")
    print()

    results = [
        check_agent(agent, tick_month, create_issues=not args.dry_run)
        for agent in registry["agents"]
        if agent["active"]
    ]

    print()
    print("=== Summary ===")
    total = len(results)
    healthy = sum(1 for r in results if r["status"] == "healthy")
    degraded = sum(1 for r in results if r["status"] == "degraded")
    failed = total - healthy - degraded
    issues = sum(1 for r in results if r["issue_created"])

    print(f"Total agents: {total}")
    print(f" Healthy: {healthy}")
    print(f" Degraded: {degraded}")
    print(f" Failed: {failed}")
    print(f" Issues created: {issues}")

    return 0 if failed == 0 else 1


if __name__ == "__main__":
    sys.exit(main())