#!/usr/bin/env python3
"""
Agent Tick Monitor

Monthly monitoring to verify all agents have submitted ticks.
Creates Gitea issues with RCA templates for missing or failed ticks.

Part of the Agent Tick Protocol v1.0. Agents submit their tick on the 1st of
the month and have a 72-hour grace period; this monitor is meant to run from
a monthly cron job on the 4th. An issue is opened for:

- missing ticks (after the grace period), and
- ticks reporting ``critical`` or ``offline`` status.
"""

import argparse
import json
import os
import subprocess
import sys
import urllib.error
import urllib.request
from datetime import date, datetime, timedelta, timezone
from pathlib import Path

REPO_DIR = Path("/root/wizards/household-snapshots")
TICKS_DIR = REPO_DIR / "ticks"
REGISTRY_FILE = REPO_DIR / "config" / "agent-registry.json"
GITEA_URL = os.environ.get("CLAW_CODE_GITEA_URL", "http://143.198.27.163:3000")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
REPO_NAME = "allegro/household-snapshots"

# Ticks are due by the end of this day of the month (72-hour window).
GRACE_DAYS = 3
# Upper bound for a single Gitea API call so a cron run can never hang forever.
HTTP_TIMEOUT_SECONDS = 30
# NOTE(review): Gitea's CreateIssueOption documents `labels` as a list of
# integer label IDs; names are sent here exactly as before -- confirm against
# the target Gitea version and resolve names to IDs if required.
ISSUE_LABELS = ["agent-health", "rca-required", "auto-generated"]

_MISSING_TICK_SECTION = """### Actual Behavior
- **Tick Status:** NOT SUBMITTED
- **Expected By:** End of 72-hour window (3rd of month)
- **Current Date:** {now}
- **Days Overdue:** {days_overdue}
"""

_CRITICAL_SECTION = """### Actual Behavior
- **Tick Status:** CRITICAL
- **Vitals Check:**
  - Gateway: {gateway}
  - Home Dir: {home}
  - Config: {config}
- **Capabilities Lost:** {lost_caps}
"""

_OFFLINE_SECTION = """### Actual Behavior
- **Tick Status:** OFFLINE
- **Agent unreachable** during tick window
"""

_UNKNOWN_SECTION = "### Actual Behavior\n- Unknown failure\n"

_RCA_TEMPLATE = """## Agent Health Failure: {agent_id}

**Detected:** {detected}
**Agent:** {agent_name}
**Failure Type:** {failure_type}
**Auto-Generated By:** Agent Tick Monitor

### Expected Behavior
Agent should emit monthly tick within 72-hour window (by 3rd of month) with healthy status.

{status_section}

### Root Cause Analysis Required

Please investigate and document:

#### 1. What happened?
- [ ] Last successful operation identified
- [ ] Error logs reviewed
- [ ] System state at failure documented

#### 2. Why did it happen?
- [ ] Configuration drift assessed
- [ ] Resource exhaustion checked
- [ ] External dependencies verified
- [ ] Code regression analyzed

#### 3. How do we prevent recurrence?
- [ ] Monitoring improvements identified
- [ ] Automated recovery considered
- [ ] Alert tuning reviewed

#### 4. Recovery steps taken
- [ ] Actions performed
- [ ] Current status verified
- [ ] Validation completed

### Timeline

- [x] T+0: Issue created (auto)
- [ ] T+1h: Initial response from agent owner
- [ ] T+24h: RCA submitted
- [ ] T+7d: Resolution verified

### Related Resources

- Agent home: `{home_dir}`
- Agent config: `{config_path}`
- Last tick: {last_tick_link}
- Tick file: `{tick_file}`

### Checklist for Resolution

- [ ] Root cause identified
- [ ] Fix implemented
- [ ] Agent recovered to healthy status
- [ ] New tick submitted (if missing)
- [ ] Monitoring improved
- [ ] Issue closed with summary

---
*Auto-generated by Agent Tick Monitor v1.0*
*Evenia binds us. Failures are learned from.*
"""


def _naive_utc(moment=None):
    """Return *moment* as a naive UTC datetime; default to the current time.

    ``datetime.utcnow()`` is deprecated, so the current time is taken as an
    aware UTC value and stripped of its tzinfo to keep the original
    ``isoformat() + "Z"`` output format.
    """
    if moment is None:
        return datetime.now(timezone.utc).replace(tzinfo=None)
    if moment.tzinfo is not None:
        return moment.astimezone(timezone.utc).replace(tzinfo=None)
    return moment


def _days_overdue(tick_month, today):
    """Return whole days elapsed since the grace deadline of *tick_month*.

    Assumes the tick for ``YYYY-MM`` was due by the 3rd of that same month
    (this matches the issue template text and ``get_tick_month``). If
    *tick_month* is not a parseable ``YYYY-MM`` string, fall back to the
    day-of-month arithmetic against *today*.
    """
    try:
        year, month = (int(part) for part in tick_month.split("-"))
        deadline = date(year, month, GRACE_DAYS)
    except (AttributeError, TypeError, ValueError):
        return max(0, today.day - GRACE_DAYS)
    return max(0, (today - deadline).days)


def run_cmd(cmd, cwd=None):
    """Run shell command and return ``(stdout, stderr, returncode)``.

    The command string is executed with ``shell=True``; only pass trusted,
    internally constructed commands.
    """
    result = subprocess.run(
        cmd, shell=True, cwd=cwd, capture_output=True, text=True
    )
    return result.stdout.strip(), result.stderr.strip(), result.returncode


def load_registry():
    """Load and return the agent registry parsed from ``REGISTRY_FILE``.

    Raises:
        OSError: if the registry file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(REGISTRY_FILE, encoding="utf-8") as f:
        return json.load(f)


def get_tick_month(now=None):
    """Return the tick month to check as a ``YYYY-MM`` string.

    During the grace period (1st-3rd of the month) the previous month is
    returned; afterwards the current month is returned.

    Args:
        now: optional datetime to evaluate instead of the current UTC time.
    """
    now = _naive_utc(now)
    if now.day <= GRACE_DAYS:
        # Still in grace period, check previous month (wrapping over January).
        if now.month == 1:
            return f"{now.year - 1}-12"
        return f"{now.year}-{now.month - 1:02d}"
    return now.strftime("%Y-%m")


def load_tick(agent_id, tick_month):
    """Load the tick file for an agent, or return ``None`` if it is absent.

    The file is read from ``TICKS_DIR/<tick_month>/<agent_id>.json``.

    Raises:
        json.JSONDecodeError: if the tick file exists but is not valid JSON.
    """
    tick_file = TICKS_DIR / tick_month / f"{agent_id}.json"
    try:
        with open(tick_file, encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        return None


def create_gitea_issue(title, body):
    """Create a Gitea issue in ``REPO_NAME``.

    Returns:
        ``(issue_number, html_url)`` on success, or ``None`` when the token
        is missing or the request fails (the error is printed).
    """
    if not GITEA_TOKEN:
        print("Error: GITEA_TOKEN not set")
        return None

    url = f"{GITEA_URL}/api/v1/repos/{REPO_NAME}/issues"
    payload = json.dumps({
        "title": title,
        "body": body,
        "labels": ISSUE_LABELS,
    }).encode("utf-8")

    req = urllib.request.Request(
        url,
        data=payload,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"token {GITEA_TOKEN}",
        },
        method="POST",
    )

    try:
        with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT_SECONDS) as response:
            result = json.loads(response.read().decode())
        return result.get("number"), result.get("html_url")
    except urllib.error.HTTPError as e:
        print(f"Error creating issue: {e.code} {e.reason}")
        print(e.read().decode())
        return None
    except Exception as e:
        # Deliberately broad: one failed API call must not abort the whole
        # monitoring run; the caller treats None as "issue not created".
        print(f"Error: {e}")
        return None


def generate_rca_body(agent, tick, tick_month, failure_type, now=None):
    """Generate the Markdown body of an RCA issue.

    Args:
        agent: registry entry; reads ``id``, ``name``, ``home_dir`` and
            ``config_path``.
        tick: parsed tick dict, or ``None`` when no tick was found.
        tick_month: month being checked, ``YYYY-MM``.
        failure_type: ``"MISSING_TICK"``, ``"CRITICAL"`` (or the legacy
            ``"CRITICAL_STATUS"``) or ``"OFFLINE"``; anything else renders
            an "Unknown failure" section.
        now: optional detection time; defaults to the current UTC time.

    Returns:
        The issue body as a string.
    """
    moment = _naive_utc(now)
    detected = moment.isoformat() + "Z"

    if failure_type == "MISSING_TICK":
        status_section = _MISSING_TICK_SECTION.format(
            now=moment.date().isoformat(),
            days_overdue=_days_overdue(tick_month, moment.date()),
        )
    elif failure_type in ("CRITICAL", "CRITICAL_STATUS"):
        # check_agent passes the upper-cased tick status ("CRITICAL"), so
        # both spellings must select this section.
        tick_data = tick or {}
        vitals = tick_data.get("vitals") or {}
        capabilities = tick_data.get("capabilities") or {}
        lost = [name for name, available in capabilities.items() if not available]
        status_section = _CRITICAL_SECTION.format(
            gateway=vitals.get("gateway_running", "unknown"),
            home=vitals.get("home_directory_accessible", "unknown"),
            config=vitals.get("config_valid", "unknown"),
            lost_caps=", ".join(lost) or "None",
        )
    elif failure_type == "OFFLINE":
        status_section = _OFFLINE_SECTION
    else:
        status_section = _UNKNOWN_SECTION

    agent_id = agent["id"]
    if tick:
        # NOTE(review): Gitea normally serves files under src/branch/<name>/;
        # the original tree/main path is kept -- confirm the link resolves.
        last_tick_link = f"{GITEA_URL}/{REPO_NAME}/tree/main/ticks/{tick_month}/{agent_id}.json"
    else:
        last_tick_link = "N/A - No tick found"

    return _RCA_TEMPLATE.format(
        agent_id=agent_id,
        agent_name=agent["name"],
        detected=detected,
        failure_type=failure_type,
        status_section=status_section,
        home_dir=agent["home_dir"],
        config_path=agent["config_path"],
        last_tick_link=last_tick_link,
        tick_file=str(TICKS_DIR / tick_month / f"{agent_id}.json"),
    )


def _open_rca_issue(agent, tick, tick_month, failure_type, title, results):
    """Create the RCA issue for one agent and record the outcome in *results*."""
    body = generate_rca_body(agent, tick, tick_month, failure_type)
    # create_gitea_issue returns None on failure; never unpack it blindly.
    created = create_gitea_issue(title, body)
    issue_num, issue_url = created if created else (None, None)
    if issue_num:
        print(f" → Issue #{issue_num} created: {issue_url}")
        results["issue_created"] = True
        results["issue_number"] = issue_num
    else:
        print(" ✗ Failed to create issue")


def check_agent(agent, tick_month, create_issues=True):
    """Check an agent's tick status and optionally open an RCA issue.

    Args:
        agent: registry entry for the agent (must contain ``id``).
        tick_month: month to check, ``YYYY-MM``.
        create_issues: when false, only report; never call Gitea.

    Returns:
        Dict with ``agent_id``, ``tick_present``, ``status`` (the tick's
        status, or ``"NO_TICK"``), ``issue_created`` and ``issue_number``.
    """
    agent_id = agent["id"]
    tick = load_tick(agent_id, tick_month)
    status = tick.get("status") if tick else "NO_TICK"

    results = {
        "agent_id": agent_id,
        "tick_present": tick is not None,
        "status": status,
        "issue_created": False,
        "issue_number": None,
    }

    if not tick:
        # Missing tick
        print(f" ✗ {agent_id}: MISSING TICK")
        if create_issues:
            title = f"RCA-AGENT: {agent_id} failed to submit monthly tick ({tick_month})"
            _open_rca_issue(agent, None, tick_month, "MISSING_TICK", title, results)
    elif status in ("critical", "offline"):
        # Failed status
        print(f" ✗ {agent_id}: STATUS {status.upper()}")
        if create_issues:
            title = f"RCA-AGENT: {agent_id} reported {status} status ({tick_month})"
            _open_rca_issue(agent, tick, tick_month, status.upper(), title, results)
    elif status == "degraded":
        # Degraded - warning but no RCA
        print(f" ⚠ {agent_id}: DEGRADED")
        print(" (No RCA required, but monitor)")
    elif status == "healthy":
        print(f" ✓ {agent_id}: HEALTHY")
    else:
        # main() counts anything that is not healthy/degraded as failed, so
        # do not label an unrecognized or absent status as healthy.
        print(f" ? {agent_id}: UNKNOWN STATUS {status!r}")

    return results


def main(argv=None):
    """Run the monitor over all active agents and print a summary.

    Args:
        argv: optional argument list; defaults to ``sys.argv[1:]``.

    Returns:
        0 when no agent failed, 1 when at least one agent is neither healthy
        nor degraded, 2 when the agent registry cannot be loaded.
    """
    parser = argparse.ArgumentParser(description="Agent Tick Monitor")
    parser.add_argument("--check-and-report", action="store_true", help="Run monthly check and create issues")
    parser.add_argument("--month", help="Specific month to check (YYYY-MM format)")
    parser.add_argument("--dry-run", action="store_true", help="Check but don't create issues")
    args = parser.parse_args(argv)

    try:
        registry = load_registry()
    except (OSError, json.JSONDecodeError) as e:
        print(f"Error: cannot load agent registry {REGISTRY_FILE}: {e}", file=sys.stderr)
        return 2

    tick_month = args.month or get_tick_month()

    print("=== Agent Tick Monitor ===")
    print(f"Checking month: {tick_month}")
    print(f"Time: {_naive_utc().isoformat()}Z")
    print()

    results = [
        check_agent(agent, tick_month, create_issues=not args.dry_run)
        for agent in registry["agents"]
        if agent["active"]
    ]

    print()
    print("=== Summary ===")
    total = len(results)
    healthy = sum(1 for r in results if r["status"] == "healthy")
    degraded = sum(1 for r in results if r["status"] == "degraded")
    failed = total - healthy - degraded
    issues = sum(1 for r in results if r["issue_created"])

    print(f"Total agents: {total}")
    print(f" Healthy: {healthy}")
    print(f" Degraded: {degraded}")
    print(f" Failed: {failed}")
    print(f" Issues created: {issues}")

    return 0 if failed == 0 else 1


if __name__ == "__main__":
    sys.exit(main())