scripts/agent_tick_monitor.py

#!/usr/bin/env python3
"""
Agent Tick Monitor

Monthly monitoring to verify all agents have submitted ticks.
Creates Gitea issues with RCA templates for missing or failed ticks.
"""

import os
import sys
import json
import subprocess
import argparse
from datetime import datetime, timedelta
from pathlib import Path

# Local checkout of the household-snapshots repository; the tick files and
# the agent registry are both read from beneath this directory.
REPO_DIR = Path("/root/wizards/household-snapshots")
# Tick files are looked up as TICKS_DIR/<tick_month>/<agent_id>.json.
TICKS_DIR = REPO_DIR / "ticks"
# JSON registry; main() iterates its "agents" list.
REGISTRY_FILE = REPO_DIR / "config" / "agent-registry.json"
# Base URL of the Gitea instance, overridable via CLAW_CODE_GITEA_URL.
# NOTE(review): the default is plain http, and create_gitea_issue() sends the
# API token to this URL in an Authorization header — confirm this is intended.
GITEA_URL = os.environ.get("CLAW_CODE_GITEA_URL", "http://143.198.27.163:3000")
# API token read from the environment; when empty, issue creation is skipped.
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
# "<owner>/<repo>" path used for both the issues API URL and tick links.
REPO_NAME = "allegro/household-snapshots"

def run_cmd(cmd, cwd=None):
    """Execute *cmd* through the shell and collect its results.

    Returns a ``(stdout, stderr, returncode)`` tuple. Both streams are
    captured as text and stripped of surrounding whitespace.
    """
    completed = subprocess.run(
        cmd,
        cwd=cwd,
        shell=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, err = (stream.strip() for stream in (completed.stdout, completed.stderr))
    return out, err, completed.returncode

def load_registry():
    """Read REGISTRY_FILE and return its parsed JSON content."""
    with open(REGISTRY_FILE) as registry_fp:
        raw = registry_fp.read()
    return json.loads(raw)

def get_tick_month():
    """Return the month to check as a ``YYYY-MM`` string (UTC).

    From the 4th onward this is the current month. On the 1st-3rd the
    grace period is still running, so the previous month is returned
    instead (rolling back to December of the prior year in January).
    """
    today = datetime.utcnow()
    if today.day > 3:
        return today.strftime("%Y-%m")

    # Grace period: step back one month, wrapping across the year boundary.
    if today.month == 1:
        year, month = today.year - 1, 12
    else:
        year, month = today.year, today.month - 1
    return f"{year}-{month:02d}"

def load_tick(agent_id, tick_month):
    """Return the parsed tick JSON for *agent_id* in *tick_month*.

    The file is looked up at ``TICKS_DIR/<tick_month>/<agent_id>.json``;
    ``None`` is returned when that file does not exist.
    """
    tick_path = TICKS_DIR.joinpath(tick_month, f"{agent_id}.json")
    if not tick_path.exists():
        return None
    with open(tick_path) as tick_fp:
        return json.loads(tick_fp.read())

def create_gitea_issue(title, body):
    """Create an issue in REPO_NAME on the configured Gitea instance.

    Args:
        title: Issue title.
        body: Issue body (Markdown).

    Returns:
        A ``(number, html_url)`` tuple taken from the API response. On any
        failure (missing token, HTTP error, network error, timeout or an
        unusable response) the error is printed and ``(None, None)`` is
        returned, so callers can always unpack two values.
    """
    if not GITEA_TOKEN:
        print("Error: GITEA_TOKEN not set")
        return None, None

    import urllib.request
    import urllib.error

    url = f"{GITEA_URL}/api/v1/repos/{REPO_NAME}/issues"

    # NOTE(review): Gitea's CreateIssueOption documents "labels" as a list of
    # integer label IDs; these are label names — confirm the target instance
    # accepts them, otherwise the IDs must be looked up first.
    data = json.dumps({
        "title": title,
        "body": body,
        "labels": ["agent-health", "rca-required", "auto-generated"]
    }).encode('utf-8')

    req = urllib.request.Request(
        url,
        data=data,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"token {GITEA_TOKEN}"
        },
        method="POST"
    )

    try:
        # Bounded wait so an unresponsive server cannot hang the monitor.
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
            return result.get("number"), result.get("html_url")
    except urllib.error.HTTPError as e:
        print(f"Error creating issue: {e.code} {e.reason}")
        # The error body is diagnostic only; never let decoding it raise.
        print(e.read().decode(errors="replace"))
        return None, None
    except Exception as e:
        # Best-effort: report and carry on with the remaining agents.
        print(f"Error: {e}")
        return None, None

def generate_rca_body(agent, tick, tick_month, failure_type):
    """Generate the Markdown body of an RCA issue for one agent failure.

    Args:
        agent: Registry entry; must provide "id", "name", "home_dir" and
            "config_path".
        tick: Parsed tick dict, or None when no tick file was found.
        tick_month: Month being checked, as "YYYY-MM".
        failure_type: "MISSING_TICK", "CRITICAL" (the legacy spelling
            "CRITICAL_STATUS" is also accepted) or "OFFLINE". Any other
            value produces a generic "Unknown failure" section.

    Returns:
        The complete issue body as a string.
    """
    # Take a single timestamp so the header and the overdue count agree.
    current = datetime.utcnow()
    now = current.isoformat() + "Z"

    if failure_type == "MISSING_TICK":
        # NOTE(review): days overdue is derived from today's day-of-month,
        # regardless of which tick_month is being checked — confirm intended.
        status_section = """### Actual Behavior
- **Tick Status:** NOT SUBMITTED
- **Expected By:** End of 72-hour window (3rd of month)
- **Current Date:** {now}
- **Days Overdue:** {days_overdue}
""".format(now=now[:10], days_overdue=max(0, current.day - 3))
    elif failure_type in ("CRITICAL", "CRITICAL_STATUS"):
        # Callers pass the upper-cased tick status ("CRITICAL"); the
        # "CRITICAL_STATUS" spelling is kept for backward compatibility.
        # Tolerate ticks that omit vitals/capabilities rather than crashing
        # while reporting a failure.
        vitals = (tick or {}).get("vitals") or {}
        capabilities = (tick or {}).get("capabilities") or {}
        status_section = """### Actual Behavior
- **Tick Status:** CRITICAL
- **Vitals Check:**
  - Gateway: {gateway}
  - Home Dir: {home}
  - Config: {config}
- **Capabilities Lost:** {lost_caps}
""".format(
            gateway=vitals.get("gateway_running", "unknown"),
            home=vitals.get("home_directory_accessible", "unknown"),
            config=vitals.get("config_valid", "unknown"),
            lost_caps=", ".join(k for k, v in capabilities.items() if not v) or "None"
        )
    elif failure_type == "OFFLINE":
        status_section = """### Actual Behavior
- **Tick Status:** OFFLINE
- **Agent unreachable** during tick window
"""
    else:
        status_section = "### Actual Behavior\n- Unknown failure\n"

    body = """## Agent Health Failure: {agent_id}

**Detected:** {detected}
**Agent:** {agent_name}
**Failure Type:** {failure_type}
**Auto-Generated By:** Agent Tick Monitor

### Expected Behavior
Agent should emit monthly tick within 72-hour window (by 3rd of month) with healthy status.

{status_section}

### Root Cause Analysis Required

Please investigate and document:

#### 1. What happened?
- [ ] Last successful operation identified
- [ ] Error logs reviewed
- [ ] System state at failure documented

#### 2. Why did it happen?
- [ ] Configuration drift assessed
- [ ] Resource exhaustion checked
- [ ] External dependencies verified
- [ ] Code regression analyzed

#### 3. How do we prevent recurrence?
- [ ] Monitoring improvements identified
- [ ] Automated recovery considered
- [ ] Alert tuning reviewed

#### 4. Recovery steps taken
- [ ] Actions performed
- [ ] Current status verified
- [ ] Validation completed

### Timeline

- [x] T+0: Issue created (auto)
- [ ] T+1h: Initial response from agent owner
- [ ] T+24h: RCA submitted
- [ ] T+7d: Resolution verified

### Related Resources

- Agent home: `{home_dir}`
- Agent config: `{config_path}`
- Last tick: {last_tick_link}
- Tick file: `{tick_file}`

### Checklist for Resolution

- [ ] Root cause identified
- [ ] Fix implemented
- [ ] Agent recovered to healthy status
- [ ] New tick submitted (if missing)
- [ ] Monitoring improved
- [ ] Issue closed with summary

---
*Auto-generated by Agent Tick Monitor v1.0*
*Evenia binds us. Failures are learned from.*
""".format(
        agent_id=agent["id"],
        agent_name=agent["name"],
        detected=now,
        failure_type=failure_type,
        status_section=status_section,
        home_dir=agent["home_dir"],
        config_path=agent["config_path"],
        # Only link to the tick when one was actually loaded.
        last_tick_link=f"{GITEA_URL}/{REPO_NAME}/tree/main/ticks/{tick_month}/{agent['id']}.json" if tick else "N/A - No tick found",
        tick_file=str(TICKS_DIR / tick_month / f"{agent['id']}.json")
    )

    return body

def _open_rca_issue(agent, tick, tick_month, failure_type, title, results):
    """File one RCA issue and record the outcome in *results* (mutated in place)."""
    body = generate_rca_body(agent, tick, tick_month, failure_type)
    outcome = create_gitea_issue(title, body)
    # create_gitea_issue may signal failure with a bare None; never unpack that.
    issue_num, issue_url = outcome if outcome else (None, None)
    if issue_num:
        print(f"    → Issue #{issue_num} created: {issue_url}")
        results["issue_created"] = True
        results["issue_number"] = issue_num
    else:
        print(f"    ✗ Failed to create issue")


def check_agent(agent, tick_month, create_issues=True):
    """Check one agent's tick for *tick_month* and report its health.

    Args:
        agent: Registry entry for the agent; must contain "id".
        tick_month: Month to check, as "YYYY-MM".
        create_issues: When true, file a Gitea RCA issue for a missing tick
            or a critical/offline status.

    Returns:
        A dict with keys "agent_id", "tick_present", "status" (the tick's
        status, "unknown" if the tick has none, or "NO_TICK"),
        "issue_created" and "issue_number".
    """
    agent_id = agent["id"]
    tick = load_tick(agent_id, tick_month)
    # A tick without a "status" field is reported as unknown, not a crash.
    status = tick.get("status", "unknown") if tick else "NO_TICK"

    results = {
        "agent_id": agent_id,
        "tick_present": tick is not None,
        "status": status,
        "issue_created": False,
        "issue_number": None
    }

    if not tick:
        # Missing tick
        print(f"  ✗ {agent_id}: MISSING TICK")
        if create_issues:
            title = f"RCA-AGENT: {agent_id} failed to submit monthly tick ({tick_month})"
            _open_rca_issue(agent, None, tick_month, "MISSING_TICK", title, results)
    elif status in ("critical", "offline"):
        # Failed status
        print(f"  ✗ {agent_id}: STATUS {status.upper()}")
        if create_issues:
            title = f"RCA-AGENT: {agent_id} reported {status} status ({tick_month})"
            # generate_rca_body's critical template is keyed "CRITICAL_STATUS".
            failure_type = "CRITICAL_STATUS" if status == "critical" else status.upper()
            _open_rca_issue(agent, tick, tick_month, failure_type, title, results)
    elif status == "degraded":
        # Degraded - warning but no RCA
        print(f"  ⚠ {agent_id}: DEGRADED")
        print(f"    (No RCA required, but monitor)")
    elif status == "healthy":
        print(f"  ✓ {agent_id}: HEALTHY")
    else:
        # Unrecognised status: the summary counts it as failed, so do not
        # print it as healthy.
        print(f"  ? {agent_id}: UNKNOWN STATUS {status!r}")

    return results

def _month_arg(value):
    """argparse type: accept only a zero-padded ``YYYY-MM`` month string."""
    try:
        parsed = datetime.strptime(value, "%Y-%m")
    except ValueError:
        parsed = None
    # strptime also accepts unpadded months such as "2026-4"; tick directories
    # are named with zero padding, so require an exact round-trip.
    if parsed is None or f"{parsed.year:04d}-{parsed.month:02d}" != value:
        raise argparse.ArgumentTypeError(
            f"invalid month {value!r}; expected YYYY-MM"
        )
    return value


def main(argv=None):
    """Run the tick check for every active agent and print a summary.

    Args:
        argv: Optional list of command-line arguments; defaults to
            ``sys.argv[1:]`` when None.

    Returns:
        0 when no checked agent is counted as failed, 1 otherwise. An
        invalid command line exits with status 2 via argparse.
    """
    parser = argparse.ArgumentParser(description="Agent Tick Monitor")
    # NOTE: --check-and-report is accepted but not consulted below; the check
    # always runs, and --dry-run alone decides whether issues are created.
    parser.add_argument("--check-and-report", action="store_true", help="Run monthly check and create issues")
    # Validated so a mistyped month cannot make every agent look missing and
    # file bogus RCA issues.
    parser.add_argument("--month", type=_month_arg, help="Specific month to check (YYYY-MM format)")
    parser.add_argument("--dry-run", action="store_true", help="Check but don't create issues")
    args = parser.parse_args(argv)

    registry = load_registry()
    tick_month = args.month or get_tick_month()

    print("=== Agent Tick Monitor ===")
    print(f"Checking month: {tick_month}")
    print(f"Time: {datetime.utcnow().isoformat()}Z")
    print()

    results = [
        check_agent(agent, tick_month, create_issues=not args.dry_run)
        for agent in registry["agents"]
        if agent["active"]
    ]

    print()
    print("=== Summary ===")
    total = len(results)
    healthy = sum(1 for r in results if r["status"] == "healthy")
    degraded = sum(1 for r in results if r["status"] == "degraded")
    # Anything that is neither healthy nor degraded counts as failed.
    failed = total - healthy - degraded
    issues = sum(1 for r in results if r["issue_created"])

    print(f"Total agents: {total}")
    print(f"  Healthy: {healthy}")
    print(f"  Degraded: {degraded}")
    print(f"  Failed: {failed}")
    print(f"  Issues created: {issues}")

    return 0 if failed == 0 else 1

# Script entry point: the process exit status is main()'s return value
# (0 when no agent is counted as failed, 1 otherwise).
if __name__ == "__main__":
    sys.exit(main())
Implement Agent Tick Protocol v1.0

- docs/agent-tick-protocol.md: Full protocol specification
- config/agent-registry.json: Timmy Time agent definitions
- scripts/agent_tick_submitter.py: Monthly tick submission
- scripts/agent_tick_monitor.py: Monitoring and auto-RCA
- Monthly cron jobs for 1st (submit) and 4th (monitor) of month

Auto-creates Gitea issues with RCA templates for:
- Missing ticks (grace period: 72 hours)
- Critical/offline status

Baseline for all Timmy Time agents: allegro, adagio, timmy

2026-04-02 01:06:00 +00:00
			`#!/usr/bin/env python3`
			`"""`
			`Agent Tick Monitor`

			`Monthly monitoring to verify all agents have submitted ticks.`
			`Creates Gitea issues with RCA templates for missing or failed ticks.`
			`"""`

			`import os`
			`import sys`
			`import json`
			`import subprocess`
			`import argparse`
			`from datetime import datetime, timedelta`
			`from pathlib import Path`

			`REPO_DIR = Path("/root/wizards/household-snapshots")`
			`TICKS_DIR = REPO_DIR / "ticks"`
			`REGISTRY_FILE = REPO_DIR / "config" / "agent-registry.json"`
			`GITEA_URL = os.environ.get("CLAW_CODE_GITEA_URL", "http://143.198.27.163:3000")`
			`GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")`
			`REPO_NAME = "allegro/household-snapshots"`

			`def run_cmd(cmd, cwd=None):`
			`"""Run shell command and return output."""`
			`result = subprocess.run(`
			`cmd, shell=True, cwd=cwd, capture_output=True, text=True`
			`)`
			`return result.stdout.strip(), result.stderr.strip(), result.returncode`

			`def load_registry():`
			`"""Load agent registry."""`
			`with open(REGISTRY_FILE) as f:`
			`return json.load(f)`

			`def get_tick_month():`
			`"""Get the tick month (previous month if checking on 1st-3rd)."""`
			`now = datetime.utcnow()`
			`if now.day <= 3:`
			`# Still in grace period, check previous month`
			`if now.month == 1:`
			`return f"{now.year - 1}-12"`
			`else:`
			`return f"{now.year}-{now.month - 1:02d}"`
			`return now.strftime("%Y-%m")`

			`def load_tick(agent_id, tick_month):`
			`"""Load tick file for an agent."""`
			`tick_file = TICKS_DIR / tick_month / f"{agent_id}.json"`
			`if tick_file.exists():`
			`with open(tick_file) as f:`
			`return json.load(f)`
			`return None`

			`def create_gitea_issue(title, body):`
			`"""Create a Gitea issue."""`
			`if not GITEA_TOKEN:`
			`print("Error: GITEA_TOKEN not set")`
			`return None`

			`import urllib.request`
			`import urllib.error`

			`url = f"{GITEA_URL}/api/v1/repos/{REPO_NAME}/issues"`

			`data = json.dumps({`
			`"title": title,`
			`"body": body,`
			`"labels": ["agent-health", "rca-required", "auto-generated"]`
			`}).encode('utf-8')`

			`req = urllib.request.Request(`
			`url,`
			`data=data,`
			`headers={`
			`"Content-Type": "application/json",`
			`"Authorization": f"token {GITEA_TOKEN}"`
			`},`
			`method="POST"`
			`)`

			`try:`
			`with urllib.request.urlopen(req) as response:`
			`result = json.loads(response.read().decode())`
			`return result.get("number"), result.get("html_url")`
			`except urllib.error.HTTPError as e:`
			`print(f"Error creating issue: {e.code} {e.reason}")`
			`print(e.read().decode())`
			`return None`
			`except Exception as e:`
			`print(f"Error: {e}")`
			`return None`

			`def generate_rca_body(agent, tick, tick_month, failure_type):`
			`"""Generate RCA issue body."""`

			`now = datetime.utcnow().isoformat() + "Z"`

			`if failure_type == "MISSING_TICK":`
			`status_section = """### Actual Behavior`
			`- Tick Status: NOT SUBMITTED`
			`- Expected By: End of 72-hour window (3rd of month)`
			`- Current Date: {now}`
			`- Days Overdue: {days_overdue}`
			`""".format(now=now[:10], days_overdue=max(0, datetime.utcnow().day - 3))`
			`failure_desc = "Agent failed to submit monthly tick within grace period"`
			`elif failure_type == "CRITICAL_STATUS":`
			`status_section = """### Actual Behavior`
			`- Tick Status: CRITICAL`
			`- Vitals Check:`
			`- Gateway: {gateway}`
			`- Home Dir: {home}`
			`- Config: {config}`
			`- Capabilities Lost: {lost_caps}`
			`""".format(`
			`gateway=tick["vitals"]["gateway_running"],`
			`home=tick["vitals"]["home_directory_accessible"],`
			`config=tick["vitals"]["config_valid"],`
			`lost_caps=", ".join([k for k, v in tick["capabilities"].items() if not v]) or "None"`
			`)`
			`failure_desc = "Agent reported critical status in monthly tick"`
			`elif failure_type == "OFFLINE":`
			`status_section = """### Actual Behavior`
			`- Tick Status: OFFLINE`
			`- Agent unreachable during tick window`
			`"""`
			`failure_desc = "Agent marked as offline in monthly tick"`
			`else:`
			`status_section = "### Actual Behavior\n- Unknown failure\n"`
			`failure_desc = "Unknown failure type"`

			`body = """## Agent Health Failure: {agent_id}`

			`Detected: {detected}`
			`Agent: {agent_name}`
			`Failure Type: {failure_type}`
			`Auto-Generated By: Agent Tick Monitor`

			`### Expected Behavior`
			`Agent should emit monthly tick within 72-hour window (by 3rd of month) with healthy status.`

			`{status_section}`

			`### Root Cause Analysis Required`

			`Please investigate and document:`

			`#### 1. What happened?`
			`- [ ] Last successful operation identified`
			`- [ ] Error logs reviewed`
			`- [ ] System state at failure documented`

			`#### 2. Why did it happen?`
			`- [ ] Configuration drift assessed`
			`- [ ] Resource exhaustion checked`
			`- [ ] External dependencies verified`
			`- [ ] Code regression analyzed`

			`#### 3. How do we prevent recurrence?`
			`- [ ] Monitoring improvements identified`
			`- [ ] Automated recovery considered`
			`- [ ] Alert tuning reviewed`

			`#### 4. Recovery steps taken`
			`- [ ] Actions performed`
			`- [ ] Current status verified`
			`- [ ] Validation completed`

			`### Timeline`

			`- [x] T+0: Issue created (auto)`
			`- [ ] T+1h: Initial response from agent owner`
			`- [ ] T+24h: RCA submitted`
			`- [ ] T+7d: Resolution verified`

			`### Related Resources`

			- Agent home: `{home_dir}`
			- Agent config: `{config_path}`
			`- Last tick: {last_tick_link}`
			- Tick file: `{tick_file}`

			`### Checklist for Resolution`

			`- [ ] Root cause identified`
			`- [ ] Fix implemented`
			`- [ ] Agent recovered to healthy status`
			`- [ ] New tick submitted (if missing)`
			`- [ ] Monitoring improved`
			`- [ ] Issue closed with summary`

			`---`
			`Auto-generated by Agent Tick Monitor v1.0`
			`Evenia binds us. Failures are learned from.`
			`""".format(`
			`agent_id=agent["id"],`
			`agent_name=agent["name"],`
			`detected=now,`
			`failure_type=failure_type,`
			`status_section=status_section,`
			`home_dir=agent["home_dir"],`
			`config_path=agent["config_path"],`
			`last_tick_link=f"{GITEA_URL}/{REPO_NAME}/tree/main/ticks/{tick_month}/{agent['id']}.json" if tick else "N/A - No tick found",`
			`tick_file=str(TICKS_DIR / tick_month / f"{agent['id']}.json")`
			`)`

			`return body`

			`def check_agent(agent, tick_month, create_issues=True):`
			`"""Check an agent's tick status."""`
			`agent_id = agent["id"]`
			`tick = load_tick(agent_id, tick_month)`

			`results = {`
			`"agent_id": agent_id,`
			`"tick_present": tick is not None,`
			`"status": tick["status"] if tick else "NO_TICK",`
			`"issue_created": False,`
			`"issue_number": None`
			`}`

			`if not tick:`
			`# Missing tick`
			`print(f" ✗ {agent_id}: MISSING TICK")`
			`if create_issues:`
			`title = f"RCA-AGENT: {agent_id} failed to submit monthly tick ({tick_month})"`
			`body = generate_rca_body(agent, None, tick_month, "MISSING_TICK")`
			`issue_num, issue_url = create_gitea_issue(title, body)`
			`if issue_num:`
			`print(f" → Issue #{issue_num} created: {issue_url}")`
			`results["issue_created"] = True`
			`results["issue_number"] = issue_num`
			`else:`
			`print(f" ✗ Failed to create issue")`
			`elif tick["status"] in ["critical", "offline"]:`
			`# Failed status`
			`print(f" ✗ {agent_id}: STATUS {tick['status'].upper()}")`
			`if create_issues:`
			`title = f"RCA-AGENT: {agent_id} reported {tick['status']} status ({tick_month})"`
			`body = generate_rca_body(agent, tick, tick_month, tick["status"].upper())`
			`issue_num, issue_url = create_gitea_issue(title, body)`
			`if issue_num:`
			`print(f" → Issue #{issue_num} created: {issue_url}")`
			`results["issue_created"] = True`
			`results["issue_number"] = issue_num`
			`elif tick["status"] == "degraded":`
			`# Degraded - warning but no RCA`
			`print(f" ⚠ {agent_id}: DEGRADED")`
			`print(f" (No RCA required, but monitor)")`
			`else:`
			`# Healthy`
			`print(f" ✓ {agent_id}: HEALTHY")`

			`return results`

			`def main():`
			`parser = argparse.ArgumentParser(description="Agent Tick Monitor")`
			`parser.add_argument("--check-and-report", action="store_true", help="Run monthly check and create issues")`
			`parser.add_argument("--month", help="Specific month to check (YYYY-MM format)")`
			`parser.add_argument("--dry-run", action="store_true", help="Check but don't create issues")`
			`args = parser.parse_args()`

			`registry = load_registry()`
			`tick_month = args.month or get_tick_month()`

			`print(f"=== Agent Tick Monitor ===")`
			`print(f"Checking month: {tick_month}")`
			`print(f"Time: {datetime.utcnow().isoformat()}Z")`
			`print()`

			`results = []`
			`for agent in registry["agents"]:`
			`if agent["active"]:`
			`result = check_agent(agent, tick_month, create_issues=not args.dry_run)`
			`results.append(result)`

			`print()`
			`print("=== Summary ===")`
			`total = len(results)`
			`healthy = sum(1 for r in results if r["status"] == "healthy")`
			`degraded = sum(1 for r in results if r["status"] == "degraded")`
			`failed = total - healthy - degraded`
			`issues = sum(1 for r in results if r["issue_created"])`

			`print(f"Total agents: {total}")`
			`print(f" Healthy: {healthy}")`
			`print(f" Degraded: {degraded}")`
			`print(f" Failed: {failed}")`
			`print(f" Issues created: {issues}")`

			`return 0 if failed == 0 else 1`

			`if __name__ == "__main__":`
			`sys.exit(main())`