Compare commits

...

1 Commits

Author SHA1 Message Date
Alexander Whitestone
d63654da22 ci: add self-healing smoke workflow (#549)
Some checks failed
Self-Healing Smoke / self-healing-smoke (pull_request) Successful in 21s
Smoke Test / smoke (pull_request) Failing after 24s
2026-04-15 01:05:04 -04:00
3 changed files with 103 additions and 0 deletions

View File

@@ -0,0 +1,34 @@
# Smoke workflow for the self-healing scripts: syntax-checks the shell
# scripts, byte-compiles the Python sources, then runs the doc tests.
name: Self-Healing Smoke
# Triggered by every pull request and by pushes to the main branch.
on:
  pull_request:
  push:
    branches: [main]
jobs:
  self-healing-smoke:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      # `bash -n` parses each script without executing any of its commands.
      - name: Shell syntax checks
        run: |
          bash -n scripts/fleet_health_probe.sh
          bash -n scripts/auto_restart_agent.sh
          bash -n scripts/backup_pipeline.sh
      # `py_compile` byte-compiles each file, surfacing syntax errors
      # without importing or running the module.
      - name: Python compile checks
        run: |
          python3 -m py_compile uni-wizard/daemons/health_daemon.py
          python3 -m py_compile scripts/fleet_milestones.py
          python3 -m py_compile scripts/sovereign_health_report.py
          python3 -m py_compile tests/docs/test_self_healing_infrastructure.py
          python3 -m py_compile tests/docs/test_self_healing_ci.py
      # NOTE(review): no step visible here installs pytest; presumably the
      # runner image already provides it -- confirm.
      - name: Phase-2 doc tests
        run: |
          pytest -q tests/docs/test_self_healing_infrastructure.py tests/docs/test_self_healing_ci.py

View File

@@ -0,0 +1,29 @@
from pathlib import Path
# Workflow file under test; a relative path, so it is resolved against the
# directory the tests are run from.
WORKFLOW = Path(".gitea", "workflows", "self-healing-smoke.yml")
def _content() -> str:
    """Return the full text of the workflow file at ``WORKFLOW``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale of the machine running the tests.

    Raises:
        FileNotFoundError: If the workflow file does not exist.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    return WORKFLOW.read_text(encoding="utf-8")
def test_self_healing_workflow_exists() -> None:
    """Check that the workflow path points at a regular file.

    ``is_file()`` is used rather than ``exists()`` so that a directory at the
    same path does not satisfy the check.
    """
    assert WORKFLOW.is_file(), f"workflow file not found: {WORKFLOW}"
def test_self_healing_workflow_checks_phase2_artifacts() -> None:
    """Check that the workflow text contains every expected snippet.

    These are plain substring checks against the raw file text: the workflow
    name, its triggers, the two actions it uses, and each command it runs.
    The YAML is not parsed.
    """
    required_snippets = (
        "name: Self-Healing Smoke",
        "pull_request:",
        "push:",
        "branches: [main]",
        "actions/checkout@v4",
        "actions/setup-python@v5",
        "bash -n scripts/fleet_health_probe.sh",
        "bash -n scripts/auto_restart_agent.sh",
        "bash -n scripts/backup_pipeline.sh",
        "python3 -m py_compile uni-wizard/daemons/health_daemon.py",
        "python3 -m py_compile scripts/fleet_milestones.py",
        "python3 -m py_compile scripts/sovereign_health_report.py",
        "pytest -q tests/docs/test_self_healing_infrastructure.py tests/docs/test_self_healing_ci.py",
    )
    workflow_text = _content()
    for snippet in required_snippets:
        assert snippet in workflow_text

View File

@@ -0,0 +1,40 @@
from pathlib import Path
# Files inspected by the tests below. All paths are relative, so they are
# resolved against the directory the tests are run from.

# Shell scripts.
HEALTH_PROBE = Path("scripts", "fleet_health_probe.sh")
AUTO_RESTART = Path("scripts", "auto_restart_agent.sh")
BACKUP_PIPELINE = Path("scripts", "backup_pipeline.sh")

# Service definition files.
HEALTH_SERVICE = Path("configs", "timmy-health.service")
TASK_ROUTER_SERVICE = Path("configs", "timmy-task-router.service")
AGENT_SERVICE = Path("configs", "timmy-agent.service")
def test_health_probe_has_thresholds_and_heartbeat() -> None:
    """Check that the health-probe script contains the expected snippets.

    Plain substring checks against the script text: the disk and memory
    threshold assignments, the heartbeat ``touch`` command, and the default
    for ``CRITICAL_PROCESSES``. The script is never executed.
    """
    expected = (
        "DISK_THRESHOLD=90",
        "MEM_THRESHOLD=90",
        'touch "${HEARTBEAT_DIR}/fleet_health.last"',
        'CRITICAL_PROCESSES="${CRITICAL_PROCESSES:-act_runner}"',
    )
    # Decode explicitly as UTF-8 so the read does not depend on the locale.
    content = HEALTH_PROBE.read_text(encoding="utf-8")
    for snippet in expected:
        assert snippet in content, f"{HEALTH_PROBE} is missing {snippet!r}"
def test_auto_restart_agent_has_retry_cap_and_escalation() -> None:
    """Check that the auto-restart script contains the expected snippets.

    Plain substring checks against the script text: the counter increment,
    the ``-le 3`` comparison, the escalation message, and the ``touch`` of
    the state file. The script is never executed.
    """
    expected = (
        'count=$((count + 1))',
        '[[ "$count" -le 3 ]]',
        'ESCALATION: $proc_name still dead after 3 restart attempts.',
        'touch "${STATE_DIR}/auto_restart.last"',
    )
    # Decode explicitly as UTF-8 so the read does not depend on the locale.
    content = AUTO_RESTART.read_text(encoding="utf-8")
    for snippet in expected:
        assert snippet in content, f"{AUTO_RESTART} is missing {snippet!r}"
def test_backup_pipeline_has_offsite_sync_and_retention() -> None:
    """Check that the backup script contains the expected snippets.

    Plain substring checks against the script text: the ``OFFSITE_TARGET``
    default, the ``rsync`` invocation, the ``find ... -mtime +7`` cleanup
    command, and the completion notification. The script is never executed.
    """
    expected = (
        'OFFSITE_TARGET="${OFFSITE_TARGET:-}"',
        'rsync -az --delete',
        'find "$BACKUP_ROOT" -mindepth 1 -maxdepth 1 -type d -mtime +7 -exec rm -rf {} +',
        'send_telegram "✅ Daily backup completed: ${DATESTAMP}"',
    )
    # The notification snippet contains a non-ASCII character, so the file
    # must be decoded as UTF-8 rather than with the locale's default encoding.
    content = BACKUP_PIPELINE.read_text(encoding="utf-8")
    for snippet in expected:
        assert snippet in content, f"{BACKUP_PIPELINE} is missing {snippet!r}"
def test_self_healing_services_restart_automatically() -> None:
    """Check that each service file mentions the expected restart settings.

    Every file must contain the text ``Restart=always`` and some
    ``RestartSec=`` entry. These are substring checks only; the files are
    not parsed.
    """
    for path in (HEALTH_SERVICE, TASK_ROUTER_SERVICE, AGENT_SERVICE):
        # Decode explicitly as UTF-8 so the read does not depend on the locale.
        content = path.read_text(encoding="utf-8")
        # Name the file in the message: inside a loop, a bare assert would
        # not say which service failed.
        assert "Restart=always" in content, f"{path} is missing 'Restart=always'"
        assert "RestartSec=" in content, f"{path} is missing 'RestartSec='"