330 lines
11 KiB
Python
330 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""Create or refresh fleet incidents on Gitea from local infrastructure signals.
|
|
|
|
Refs: timmy-home #553
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Iterable
|
|
from urllib import request
|
|
|
|
# Gitea API root; the default for --base-url.
DEFAULT_BASE_URL = "https://forge.alexanderwhitestone.com/api/v1"
# Repository (owner/name) whose issue tracker is queried and written to.
DEFAULT_OWNER = "Timmy_Foundation"
DEFAULT_REPO = "timmy-home"
# File holding the Gitea API token; its stripped content is sent as the token.
DEFAULT_TOKEN_FILE = Path.home().joinpath(".config", "gitea", "token")
# JSON document read for the failover "timestamp" and per-host "fleet" states.
DEFAULT_FAILOVER_STATUS = Path.home().joinpath(".timmy", "failover_status.json")
# Directory scanned for "<process>.count" restart counter files.
DEFAULT_RESTART_STATE_DIR = Path("/var/lib/timmy/restarts")
# File whose modification time is used as the fleet health heartbeat.
DEFAULT_HEARTBEAT_FILE = Path("/var/lib/timmy/heartbeats/fleet_health.last")
|
|
|
|
|
|
@dataclass(frozen=True)
class Incident:
    """One detected infrastructure problem, ready to be filed as an issue."""

    # Stable identifier used to recognise the same incident on later runs.
    fingerprint: str
    # Issue title.
    title: str
    # Issue body; may begin with a "Fingerprint: ..." header line.
    body: str

    def latest_evidence(self) -> str:
        """Return the body without blank lines and without a leading fingerprint header.

        Only the first non-blank line is dropped, and only when it starts with
        ``"Fingerprint: "``. The joined result is stripped of surrounding
        whitespace.
        """
        kept = []
        for raw_line in self.body.splitlines():
            if raw_line.strip():
                kept.append(raw_line)
        if kept and kept[0].startswith("Fingerprint: "):
            del kept[0]
        return "\n".join(kept).strip()
|
|
|
|
|
|
class GiteaClient:
    """Small Gitea REST client covering the issue calls this script needs.

    Every request carries an ``Authorization: token ...`` header and targets
    the issue tracker of a single ``owner/repo``.
    """

    def __init__(self, token: str, owner: str = DEFAULT_OWNER, repo: str = DEFAULT_REPO, base_url: str = DEFAULT_BASE_URL):
        self.token = token
        self.owner = owner
        self.repo = repo
        # Request paths start with "/", so a trailing slash here would double up.
        self.base_url = base_url.rstrip("/")

    def _request(self, path: str, *, method: str = "GET", data: dict | None = None):
        """Send one API request and return the decoded JSON response.

        Args:
            path: API path starting with ``/``, appended to ``base_url``.
            method: HTTP method.
            data: Optional JSON-serialisable payload sent as the request body.

        Raises:
            urllib.error.URLError: On transport or HTTP errors (raised by urlopen).
            json.JSONDecodeError: If the response body is not valid JSON.
        """
        payload = None if data is None else json.dumps(data).encode()
        headers = {"Authorization": f"token {self.token}"}
        if payload is not None:
            headers["Content-Type"] = "application/json"
        req = request.Request(f"{self.base_url}{path}", data=payload, headers=headers, method=method)
        with request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read().decode())

    def list_open_issues(self):
        """Return every open issue of the repository, pull requests excluded.

        The listing endpoint is paginated and the server may return fewer
        items per page than the requested ``limit``. Reading only the first
        page would hide older open incidents and cause duplicates to be
        filed, so pages are fetched until one contributes no issue number
        that has not been seen before. That stop condition also terminates
        the loop if a server ignores the ``page`` parameter.
        """
        issues = []
        seen_numbers = set()
        page = 1
        while True:
            batch = self._request(
                f"/repos/{self.owner}/{self.repo}/issues?state=open&limit=100&page={page}"
            )
            fresh = [item for item in (batch or []) if item.get("number") not in seen_numbers]
            if not fresh:
                break
            seen_numbers.update(item.get("number") for item in fresh)
            # The endpoint also returns pull requests; those carry a
            # "pull_request" entry and are not incident candidates.
            issues.extend(item for item in fresh if not item.get("pull_request"))
            page += 1
        return issues

    def create_issue(self, title: str, body: str):
        """Create a new issue and return the API's JSON description of it."""
        return self._request(
            f"/repos/{self.owner}/{self.repo}/issues",
            method="POST",
            data={"title": title, "body": body},
        )

    def comment_issue(self, issue_number: int, body: str):
        """Add a comment to an existing issue and return the API's JSON response."""
        return self._request(
            f"/repos/{self.owner}/{self.repo}/issues/{issue_number}/comments",
            method="POST",
            data={"body": body},
        )
|
|
|
|
|
|
def load_json(path: Path):
    """Return the parsed JSON content of *path*, or ``None`` if it does not exist.

    The file is read directly instead of being checked with ``exists()``
    first, so a file removed between the check and the read cannot crash the
    caller. The content is decoded as UTF-8 rather than with the
    locale-dependent default encoding.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
        OSError: For read failures other than a missing path.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # NotADirectoryError: a parent component of the path is a regular
        # file, which also means the path does not exist.
        return None
    return json.loads(text)
|
|
|
|
|
|
def load_restart_counts(state_dir: Path) -> dict[str, int]:
    """Read ``*.count`` files in *state_dir* into a ``{stem: count}`` mapping.

    Files are visited in sorted path order. A file whose content cannot be
    parsed as an integer is skipped. A missing directory yields an empty
    mapping.
    """
    tallies: dict[str, int] = {}
    if state_dir.exists():
        counter_files = list(state_dir.glob("*.count"))
        counter_files.sort()
        for counter_file in counter_files:
            try:
                # The read stays inside the try: a decode failure is a
                # ValueError too and is skipped like a non-numeric value.
                value = int(counter_file.read_text().strip())
            except ValueError:
                continue
            tallies[counter_file.stem] = value
    return tallies
|
|
|
|
|
|
def heartbeat_is_stale(path: Path, *, now: datetime | None = None, max_age_seconds: int = 900) -> bool:
    """Return True when the heartbeat file is missing or too old.

    Args:
        path: Heartbeat file; its modification time is the heartbeat.
        now: Reference time; defaults to the current UTC time.
        max_age_seconds: Age in seconds above which the heartbeat is stale.

    The file is stat'ed once instead of checking ``exists()`` first, so a
    heartbeat removed between the two calls is reported as stale instead of
    raising. Any failure to stat the file is treated as a missing heartbeat.
    """
    if now is None:
        now = datetime.now(timezone.utc)
    try:
        modified_at = path.stat().st_mtime
    except OSError:
        return True
    return now.timestamp() - modified_at > max_age_seconds
|
|
|
|
|
|
def _iso(dt: datetime) -> str:
    """Format *dt* as an ISO-8601 UTC timestamp with a ``Z`` suffix."""
    utc_moment = dt.astimezone(timezone.utc)
    stamp = utc_moment.isoformat()
    return stamp.replace("+00:00", "Z")
|
|
|
|
|
|
def _build_body(fingerprint: str, *details: str) -> str:
    """Build an issue body: a fingerprint header line followed by the non-empty details."""
    body_lines = [f"Fingerprint: {fingerprint}"]
    # filter(None, ...) drops empty (falsy) detail strings.
    body_lines.extend(filter(None, details))
    return "\n".join(body_lines)
|
|
|
|
|
|
def build_incidents(
    *,
    failover_status: dict | None,
    restart_counts: dict[str, int],
    heartbeat_stale: bool,
    now: datetime | None = None,
    restart_escalation_threshold: int = 3,
) -> list[Incident]:
    """Turn the collected local signals into a list of incidents.

    Incidents are emitted in this order:
      1. one per fleet host whose status is not ``ONLINE`` (case-insensitive),
         hosts sorted;
      2. one per process whose restart count is strictly greater than
         ``restart_escalation_threshold``, processes sorted;
      3. one for the fleet health probe when ``heartbeat_stale`` is true.

    Args:
        failover_status: Parsed failover document; its ``"timestamp"`` and
            ``"fleet"`` entries are read. ``None`` or empty yields no host
            incidents.
        restart_counts: Restart count per process name.
        heartbeat_stale: Whether the fleet health heartbeat is stale.
        now: Detection time written into each body; defaults to current UTC.
        restart_escalation_threshold: Restart count that must be exceeded.
    """
    if now is None:
        now = datetime.now(timezone.utc)

    found: list[Incident] = []

    def record(fingerprint: str, title: str, *evidence: str) -> None:
        # Every body starts with the fingerprint header and the detection time.
        body = _build_body(fingerprint, f"Detected at: {_iso(now)}", *evidence)
        found.append(Incident(fingerprint=fingerprint, title=title, body=body))

    status_doc = failover_status or {}
    failover_timestamp = status_doc.get("timestamp")
    fleet = status_doc.get("fleet") or {}
    if failover_timestamp is not None:
        failover_detail = f"Failover status timestamp: {failover_timestamp}"
    else:
        failover_detail = "Failover status timestamp: unknown"

    for host, status in sorted(fleet.items()):
        if str(status).upper() != "ONLINE":
            record(
                f"host-offline:{host}",
                f"[AUTO] Fleet host offline: {host}",
                failover_detail,
                f"Host `{host}` reported `{status}` by failover monitor.",
            )

    for process_name, count in sorted(restart_counts.items()):
        if count > restart_escalation_threshold:
            record(
                f"restart-escalation:{process_name}",
                f"[AUTO] Restart escalation: {process_name}",
                f"Process `{process_name}` has crossed the restart escalation threshold with count={count}.",
            )

    if heartbeat_stale:
        record(
            "probe-stale:fleet-health",
            "[AUTO] Fleet health probe stale",
            "Heartbeat missing or older than the configured fleet health maximum age.",
        )

    return found
|
|
|
|
|
|
def _names_fingerprint(text: str, fingerprint: str) -> bool:
    """Return True when *fingerprint* occurs in *text* as a complete identifier.

    An occurrence does not count when it is directly followed by a character
    that would extend the host or process name (a letter, digit, ``_``,
    ``-``, or a ``.`` followed by a letter or digit). An empty fingerprint
    never matches.
    """
    if not fingerprint:
        return False
    start = text.find(fingerprint)
    while start != -1:
        end = start + len(fingerprint)
        following = text[end:end + 2]
        extends_name = bool(following) and (
            following[0].isalnum()
            or following[0] in "_-"
            # "web1.example.com" continues the name; "web1. Next sentence" does not.
            or (following[0] == "." and following[1:2].isalnum())
        )
        if not extends_name:
            return True
        start = text.find(fingerprint, start + 1)
    return False


def find_matching_issue(incident: Incident, open_issues: Iterable[dict]) -> dict | None:
    """Return the first open issue that already tracks *incident*, else ``None``.

    An issue matches when its title or body names the incident's fingerprint,
    or when its title equals the incident's title. The fingerprint must match
    as a whole identifier: a plain substring test would let
    ``host-offline:web1`` match an issue filed for ``host-offline:web10``.
    The wrong issue would then be commented on and the real incident would
    never be filed.
    """
    for issue in open_issues:
        haystack = "\n".join([issue.get("title") or "", issue.get("body") or ""])
        if _names_fingerprint(haystack, incident.fingerprint) or incident.title == issue.get("title"):
            return issue
    return None
|
|
|
|
|
|
def build_repeat_comment(incident: Incident) -> str:
    """Compose the comment posted when an already-filed incident is seen again."""
    header = "Autonomous infrastructure detector saw the same incident again.\n\n"
    fingerprint_part = f"Fingerprint: {incident.fingerprint}\n\n"
    evidence_part = f"Latest evidence:\n{incident.latest_evidence()}"
    return header + fingerprint_part + evidence_part
|
|
|
|
|
|
def sync_incidents(
    incidents: Iterable[Incident],
    client: GiteaClient,
    *,
    apply: bool = False,
    comment_existing: bool = True,
):
    """Reconcile *incidents* with the open issues and report one result per incident.

    For each incident, in order:
      * a matching open issue exists: action ``"commented"`` when ``apply``
        and ``comment_existing`` are both true (a repeat comment is posted),
        otherwise ``"existing"``;
      * no match and ``apply``: the issue is created (``"created"``) and
        added to the known issues so later incidents can match it;
      * no match and not ``apply``: ``"would_create"``.

    Returns:
        A list of dicts with the keys ``action``, ``fingerprint``,
        ``issue_number`` and ``title``.
    """
    known_issues = list(client.list_open_issues())
    outcomes = []

    def report(action, incident, issue_number, title):
        outcomes.append(
            {
                "action": action,
                "fingerprint": incident.fingerprint,
                "issue_number": issue_number,
                "title": title,
            }
        )

    for incident in incidents:
        match = find_matching_issue(incident, known_issues)
        if match:
            if apply and comment_existing:
                client.comment_issue(match["number"], build_repeat_comment(incident))
                action = "commented"
            else:
                action = "existing"
            report(action, incident, match["number"], match.get("title"))
        elif apply:
            created = client.create_issue(incident.title, incident.body)
            known_issues.append(created)
            report("created", incident, created["number"], created.get("title"))
        else:
            report("would_create", incident, None, incident.title)

    return outcomes
|
|
|
|
|
|
def parse_args():
    """Parse the command-line options of the incident detector."""
    parser = argparse.ArgumentParser(description="Create or refresh fleet incidents on Gitea from local infrastructure signals.")

    # Options are registered in the same order as they appear in --help.
    text_options = (
        ("--owner", DEFAULT_OWNER),
        ("--repo", DEFAULT_REPO),
        ("--base-url", DEFAULT_BASE_URL),
    )
    for flag, default in text_options:
        parser.add_argument(flag, default=default)

    path_options = (
        ("--token-file", DEFAULT_TOKEN_FILE),
        ("--failover-status", DEFAULT_FAILOVER_STATUS),
        ("--restart-state-dir", DEFAULT_RESTART_STATE_DIR),
        ("--heartbeat-file", DEFAULT_HEARTBEAT_FILE),
    )
    for flag, default in path_options:
        parser.add_argument(flag, type=Path, default=default)

    int_options = (
        ("--heartbeat-max-age-seconds", 900),
        ("--restart-escalation-threshold", 3),
    )
    for flag, default in int_options:
        parser.add_argument(flag, type=int, default=default)

    switches = (
        ("--apply", "Create/comment issues instead of reporting what would happen."),
        ("--no-comment-existing", "Do not comment on existing matching issues."),
        ("--json", "Emit machine-readable JSON output."),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    return parser.parse_args()
|
|
|
|
|
|
def main():
    """Collect local signals, build incidents, sync them to Gitea and print a report.

    Without a usable token the incidents are only reported locally
    (``local_only``). With a token but without ``--apply`` the open issues
    are still queried so the report can say what would happen.

    Raises:
        SystemExit: When ``--apply`` is given but the token file is missing
            or empty. The two cases are reported separately so an empty file
            is not reported as missing.
    """
    args = parse_args()
    now = datetime.now(timezone.utc)

    incidents = build_incidents(
        failover_status=load_json(args.failover_status),
        restart_counts=load_restart_counts(args.restart_state_dir),
        heartbeat_stale=heartbeat_is_stale(
            args.heartbeat_file,
            now=now,
            max_age_seconds=args.heartbeat_max_age_seconds,
        ),
        now=now,
        restart_escalation_threshold=args.restart_escalation_threshold,
    )

    payload = {
        "generated_at": _iso(now),
        "incidents": [incident.__dict__ for incident in incidents],
        "results": [],
    }

    token = None
    token_file_present = args.token_file.exists()
    if token_file_present:
        token = args.token_file.read_text().strip()

    if args.apply and not token:
        if token_file_present:
            raise SystemExit(f"Token file is empty: {args.token_file}")
        raise SystemExit(f"Token file not found: {args.token_file}")

    if token:
        client = GiteaClient(token=token, owner=args.owner, repo=args.repo, base_url=args.base_url)
        payload["results"] = sync_incidents(
            incidents,
            client,
            apply=args.apply,
            comment_existing=not args.no_comment_existing,
        )
    else:
        # No credentials: report the incidents without contacting Gitea.
        payload["results"] = [
            {
                "action": "local_only",
                "fingerprint": incident.fingerprint,
                "issue_number": None,
                "title": incident.title,
            }
            for incident in incidents
        ]

    if args.json:
        print(json.dumps(payload, indent=2))
    else:
        print(f"Generated at: {payload['generated_at']}")
        if not incidents:
            print("No autonomous infrastructure incidents detected.")
        for incident in incidents:
            print(f"- {incident.title} [{incident.fingerprint}]")
        for result in payload["results"]:
            print(f"  -> {result['action']}: {result['title']}")
|
|
|
|
|
|
# Run the detector only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|