Files
household-snapshots/scripts/agent_tick_monitor.py
Allegro 1c961396bb Implement Agent Tick Protocol v1.0
- docs/agent-tick-protocol.md: Full protocol specification
- config/agent-registry.json: Timmy Time agent definitions
- scripts/agent_tick_submitter.py: Monthly tick submission
- scripts/agent_tick_monitor.py: Monitoring and auto-RCA
- Monthly cron jobs for 1st (submit) and 4th (monitor) of month

Auto-creates Gitea issues with RCA templates for:
- Missing ticks (grace period: 72 hours)
- Critical/offline status

Baseline for all Timmy Time agents: allegro, adagio, timmy
2026-04-02 01:06:00 +00:00

295 lines
9.3 KiB
Python

#!/usr/bin/env python3
"""
Agent Tick Monitor
Monthly monitoring to verify all agents have submitted ticks.
Creates Gitea issues with RCA templates for missing or failed ticks.
"""
import argparse
import json
import os
import subprocess
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
# Local checkout of the household-snapshots repository.
REPO_DIR = Path("/root/wizards/household-snapshots")
# Tick files are looked up as ticks/<YYYY-MM>/<agent_id>.json.
TICKS_DIR = REPO_DIR.joinpath("ticks")
# JSON registry read by load_registry().
REGISTRY_FILE = REPO_DIR.joinpath("config", "agent-registry.json")

# Gitea connection settings; both can be overridden through the environment.
GITEA_URL = os.getenv("CLAW_CODE_GITEA_URL", "http://143.198.27.163:3000")
GITEA_TOKEN = os.getenv("GITEA_TOKEN", "")
# "<owner>/<repo>" path used when building API and web URLs.
REPO_NAME = "allegro/household-snapshots"
def run_cmd(cmd, cwd=None):
    """Execute *cmd* through the shell and collect what it produced.

    Args:
        cmd: Command line string handed to the shell.
        cwd: Optional working directory for the child process.

    Returns:
        A ``(stdout, stderr, returncode)`` tuple; both text streams have
        surrounding whitespace stripped.
    """
    completed = subprocess.run(
        cmd,
        cwd=cwd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    out_text = completed.stdout.strip()
    err_text = completed.stderr.strip()
    return out_text, err_text, completed.returncode
def load_registry():
    """Read the agent registry file and return its parsed JSON content."""
    return json.loads(REGISTRY_FILE.read_text())
def get_tick_month(now=None):
    """Return the ``YYYY-MM`` label of the month whose ticks should be checked.

    On the 1st-3rd of a month the current month is still inside its grace
    period, so the previous month is returned instead.

    Args:
        now: Optional ``datetime`` to evaluate. Defaults to the current UTC
            time; passing a fixed value makes the result deterministic.

    Returns:
        The month label as a ``YYYY-MM`` string.
    """
    if now is None:
        now = datetime.now(timezone.utc)
    if now.day <= 3:
        # One day before the 1st is always in the previous month, which also
        # covers the January -> December (previous year) rollover.
        now = now.replace(day=1) - timedelta(days=1)
    return now.strftime("%Y-%m")
def load_tick(agent_id, tick_month):
    """Return the parsed tick for *agent_id* in *tick_month*, or ``None``.

    The tick is read from ``ticks/<tick_month>/<agent_id>.json``; ``None``
    is returned when that file does not exist.
    """
    tick_path = TICKS_DIR.joinpath(tick_month, f"{agent_id}.json")
    if not tick_path.exists():
        return None
    return json.loads(tick_path.read_text())
def create_gitea_issue(title, body):
    """Create an issue in ``REPO_NAME`` through the Gitea REST API.

    Args:
        title: Issue title.
        body: Issue body (markdown).

    Returns:
        ``(issue_number, issue_url)`` on success and ``(None, None)`` on any
        failure. The result is always a 2-tuple so callers that unpack it do
        not raise ``TypeError`` when the request fails or no token is set.
    """
    import urllib.error
    import urllib.request

    if not GITEA_TOKEN:
        print("Error: GITEA_TOKEN not set")
        return None, None

    # NOTE(review): Gitea's CreateIssueOption documents `labels` as a list of
    # integer label IDs; sending names may be rejected by the server — confirm
    # against the deployed Gitea version.
    payload = json.dumps({
        "title": title,
        "body": body,
        "labels": ["agent-health", "rca-required", "auto-generated"],
    }).encode("utf-8")
    request = urllib.request.Request(
        f"{GITEA_URL}/api/v1/repos/{REPO_NAME}/issues",
        data=payload,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"token {GITEA_TOKEN}",
        },
        method="POST",
    )

    try:
        # Bounded wait so an unresponsive server cannot hang the monitor run.
        with urllib.request.urlopen(request, timeout=30) as response:
            result = json.loads(response.read().decode("utf-8"))
        return result.get("number"), result.get("html_url")
    except urllib.error.HTTPError as e:
        print(f"Error creating issue: {e.code} {e.reason}")
        print(e.read().decode("utf-8", errors="replace"))
        return None, None
    except Exception as e:
        # Best-effort: network errors or an unexpected response shape are
        # reported and turned into a failed result rather than a crash.
        print(f"Error: {e}")
        return None, None
# Markdown skeleton of the RCA issue; filled in by generate_rca_body().
_RCA_BODY_TEMPLATE = """## Agent Health Failure: {agent_id}
**Detected:** {detected}
**Agent:** {agent_name}
**Failure Type:** {failure_type}
**Auto-Generated By:** Agent Tick Monitor
### Expected Behavior
Agent should emit monthly tick within 72-hour window (by 3rd of month) with healthy status.
{status_section}
### Root Cause Analysis Required
Please investigate and document:
#### 1. What happened?
- [ ] Last successful operation identified
- [ ] Error logs reviewed
- [ ] System state at failure documented
#### 2. Why did it happen?
- [ ] Configuration drift assessed
- [ ] Resource exhaustion checked
- [ ] External dependencies verified
- [ ] Code regression analyzed
#### 3. How do we prevent recurrence?
- [ ] Monitoring improvements identified
- [ ] Automated recovery considered
- [ ] Alert tuning reviewed
#### 4. Recovery steps taken
- [ ] Actions performed
- [ ] Current status verified
- [ ] Validation completed
### Timeline
- [x] T+0: Issue created (auto)
- [ ] T+1h: Initial response from agent owner
- [ ] T+24h: RCA submitted
- [ ] T+7d: Resolution verified
### Related Resources
- Agent home: `{home_dir}`
- Agent config: `{config_path}`
- Last tick: {last_tick_link}
- Tick file: `{tick_file}`
### Checklist for Resolution
- [ ] Root cause identified
- [ ] Fix implemented
- [ ] Agent recovered to healthy status
- [ ] New tick submitted (if missing)
- [ ] Monitoring improved
- [ ] Issue closed with summary
---
*Auto-generated by Agent Tick Monitor v1.0*
*Evenia binds us. Failures are learned from.*
"""


def _days_overdue(tick_month, today):
    """Return whole days elapsed since the 3rd of *tick_month* (never negative)."""
    try:
        year, month = (int(part) for part in tick_month.split("-"))
        deadline = today.replace(year=year, month=month, day=3)
    except (AttributeError, ValueError):
        # Month label is not a valid YYYY-MM: fall back to the day of month.
        return max(0, today.day - 3)
    return max(0, (today - deadline).days)


def _status_section(tick, tick_month, failure_type, now):
    """Render the "Actual Behavior" markdown section for *failure_type*."""
    if failure_type == "MISSING_TICK":
        today = now.date()
        return (
            "### Actual Behavior\n"
            "- **Tick Status:** NOT SUBMITTED\n"
            "- **Expected By:** End of 72-hour window (3rd of month)\n"
            f"- **Current Date:** {today.isoformat()}\n"
            f"- **Days Overdue:** {_days_overdue(tick_month, today)}\n"
        )
    # "CRITICAL" is accepted as an alias because callers may pass the
    # upper-cased tick status rather than the "CRITICAL_STATUS" label.
    if failure_type in ("CRITICAL_STATUS", "CRITICAL"):
        # A critical tick may itself be incomplete; report absent fields as
        # "unknown" instead of raising KeyError.
        vitals = (tick or {}).get("vitals") or {}
        capabilities = (tick or {}).get("capabilities") or {}
        lost_caps = ", ".join(
            name for name, available in capabilities.items() if not available
        ) or "None"
        return (
            "### Actual Behavior\n"
            "- **Tick Status:** CRITICAL\n"
            "- **Vitals Check:**\n"
            f"- Gateway: {vitals.get('gateway_running', 'unknown')}\n"
            f"- Home Dir: {vitals.get('home_directory_accessible', 'unknown')}\n"
            f"- Config: {vitals.get('config_valid', 'unknown')}\n"
            f"- **Capabilities Lost:** {lost_caps}\n"
        )
    if failure_type == "OFFLINE":
        return (
            "### Actual Behavior\n"
            "- **Tick Status:** OFFLINE\n"
            "- **Agent unreachable** during tick window\n"
        )
    return "### Actual Behavior\n- Unknown failure\n"


def generate_rca_body(agent, tick, tick_month, failure_type):
    """Build the markdown body of an RCA issue for a failed agent tick.

    Args:
        agent: Registry entry; ``id``, ``name``, ``home_dir`` and
            ``config_path`` are read from it.
        tick: Parsed tick dict, or ``None`` when no tick was found.
        tick_month: Month label (``YYYY-MM``) the check refers to.
        failure_type: ``"MISSING_TICK"``, ``"CRITICAL_STATUS"`` (alias
            ``"CRITICAL"``) or ``"OFFLINE"``; anything else is rendered as an
            unknown failure.

    Returns:
        The issue body as a string.

    Raises:
        KeyError: If *agent* lacks one of the fields listed above.
    """
    # Naive UTC timestamp so the rendered value keeps the "...Z" form.
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    agent_id = agent["id"]

    if tick:
        # NOTE(review): Gitea file views normally live under /src/branch/<name>/;
        # confirm that this /tree/main/ link resolves on the target server.
        last_tick_link = (
            f"{GITEA_URL}/{REPO_NAME}/tree/main/ticks/{tick_month}/{agent_id}.json"
        )
    else:
        last_tick_link = "N/A - No tick found"

    return _RCA_BODY_TEMPLATE.format(
        agent_id=agent_id,
        agent_name=agent["name"],
        detected=now.isoformat() + "Z",
        failure_type=failure_type,
        status_section=_status_section(tick, tick_month, failure_type, now),
        home_dir=agent["home_dir"],
        config_path=agent["config_path"],
        last_tick_link=last_tick_link,
        tick_file=str(TICKS_DIR / tick_month / f"{agent_id}.json"),
    )
# Tick status -> failure type label understood by generate_rca_body().
_RCA_FAILURE_TYPES = {"critical": "CRITICAL_STATUS", "offline": "OFFLINE"}


def _file_rca_issue(agent, tick, tick_month, failure_type, title, results):
    """Create the RCA issue and record the outcome in *results*."""
    body = generate_rca_body(agent, tick, tick_month, failure_type)
    # Tolerate a bare None as well as a (None, None) pair on failure so the
    # unpack below never raises.
    issue_num, issue_url = create_gitea_issue(title, body) or (None, None)
    if issue_num:
        print(f" → Issue #{issue_num} created: {issue_url}")
        results["issue_created"] = True
        results["issue_number"] = issue_num
    else:
        print(" ✗ Failed to create issue")


def check_agent(agent, tick_month, create_issues=True):
    """Check one agent's tick for *tick_month* and report its health.

    Args:
        agent: Registry entry for the agent (``id`` is required).
        tick_month: Month label (``YYYY-MM``) to check.
        create_issues: When true, file a Gitea RCA issue for a missing tick
            or a ``critical``/``offline`` status.

    Returns:
        A dict with ``agent_id``, ``tick_present``, ``status``,
        ``issue_created`` and ``issue_number``.
    """
    agent_id = agent["id"]
    tick = load_tick(agent_id, tick_month)
    results = {
        "agent_id": agent_id,
        "tick_present": tick is not None,
        "status": tick["status"] if tick else "NO_TICK",
        "issue_created": False,
        "issue_number": None,
    }

    if not tick:
        print(f"{agent_id}: MISSING TICK")
        if create_issues:
            title = f"RCA-AGENT: {agent_id} failed to submit monthly tick ({tick_month})"
            _file_rca_issue(agent, None, tick_month, "MISSING_TICK", title, results)
        return results

    status = tick["status"]
    if status in _RCA_FAILURE_TYPES:
        print(f"{agent_id}: STATUS {status.upper()}")
        if create_issues:
            title = f"RCA-AGENT: {agent_id} reported {status} status ({tick_month})"
            _file_rca_issue(
                agent, tick, tick_month, _RCA_FAILURE_TYPES[status], title, results
            )
    elif status == "degraded":
        # Degraded is a warning only; no RCA issue is filed.
        print(f"{agent_id}: DEGRADED")
        print(" (No RCA required, but monitor)")
    else:
        # NOTE(review): every other status value is printed as healthy here,
        # while main() counts only the literal "healthy" as healthy.
        print(f"{agent_id}: HEALTHY")
    return results
def main(argv=None):
    """Run the tick check for every active agent and print a summary.

    Args:
        argv: Optional list of command-line arguments; defaults to
            ``sys.argv[1:]`` when ``None``.

    Returns:
        Process exit code: ``0`` when no agent failed, ``1`` otherwise. An
        agent counts as failed when its status is neither ``healthy`` nor
        ``degraded`` (this includes a missing tick).
    """
    parser = argparse.ArgumentParser(description="Agent Tick Monitor")
    # --check-and-report is accepted but never read: the check always runs.
    parser.add_argument("--check-and-report", action="store_true", help="Run monthly check and create issues")
    parser.add_argument("--month", help="Specific month to check (YYYY-MM format)")
    parser.add_argument("--dry-run", action="store_true", help="Check but don't create issues")
    args = parser.parse_args(argv)

    registry = load_registry()
    tick_month = args.month or get_tick_month()
    # Naive UTC timestamp so the printed value keeps the "...Z" form.
    started = datetime.now(timezone.utc).replace(tzinfo=None)

    print("=== Agent Tick Monitor ===")
    print(f"Checking month: {tick_month}")
    print(f"Time: {started.isoformat()}Z")
    print()

    results = [
        check_agent(agent, tick_month, create_issues=not args.dry_run)
        for agent in registry["agents"]
        if agent["active"]
    ]

    print()
    print("=== Summary ===")
    total = len(results)
    healthy = sum(1 for r in results if r["status"] == "healthy")
    degraded = sum(1 for r in results if r["status"] == "degraded")
    failed = total - healthy - degraded
    issues = sum(1 for r in results if r["issue_created"])
    print(f"Total agents: {total}")
    print(f" Healthy: {healthy}")
    print(f" Degraded: {degraded}")
    print(f" Failed: {failed}")
    print(f" Issues created: {issues}")
    return 0 if failed == 0 else 1


if __name__ == "__main__":
    sys.exit(main())