Files
the-nexus/scripts/lazarus_checkpoint.py
Bezalel 259df5b5e6
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Staging Verification Gate / verify-staging (push) Has been cancelled
feat(lazarus): fleet health dashboard, pulse viz, and checkpoint/restore (#805 #869 #881)
2026-04-07 15:14:03 +00:00

141 lines
4.2 KiB
Python

#!/usr/bin/env python3
"""
Lazarus Checkpoint / Restore
============================
Save and resume mission cell state for agent resurrection.

Usage:
    python scripts/lazarus_checkpoint.py <mission_name>
    python scripts/lazarus_checkpoint.py --restore <mission_name>
    python scripts/lazarus_checkpoint.py --restore <mission_name> --identifier <YYYYMMDD_HHMMSS>
    python scripts/lazarus_checkpoint.py --list
"""
import os
import sys
import argparse
import json
import tarfile
import subprocess
from datetime import datetime, timezone
from pathlib import Path
# Root directory for stored checkpoints; each mission gets its own
# sub-directory holding the *.tar.gz archives and *.json metadata files.
CHECKPOINT_DIR = Path("/var/lib/lazarus/checkpoints")
# Known mission names mapped to the directory that is archived / restored.
# A mission name that is not listed here is used as the directory path itself
# (the lookups are MISSION_DIRS.get(mission, mission)).
MISSION_DIRS = {
    "bezalel": "/root/wizards/bezalel",
    "the-nexus": "/root/wizards/bezalel/workspace/the-nexus",
    "hermes-agent": "/root/wizards/bezalel/workspace/hermes-agent",
}
def shell(cmd: str, timeout: int = 60) -> tuple[int, str, str]:
    """Run *cmd* through the system shell and report the outcome.

    Args:
        cmd: Command line handed to the shell verbatim.
        timeout: Seconds to wait before the run is abandoned.

    Returns:
        ``(returncode, stdout, stderr)`` with both streams stripped of
        surrounding whitespace. If the run raises (including on timeout),
        ``(-1, "", <exception text>)`` is returned instead of propagating.
    """
    try:
        completed = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except Exception as exc:
        # Best-effort helper: callers inspect the return code rather than
        # handling exceptions.
        return -1, "", str(exc)
    stdout_text = completed.stdout.strip()
    stderr_text = completed.stderr.strip()
    return completed.returncode, stdout_text, stderr_text
def _git_head_sha(repo: Path, timeout: int = 60) -> str:
    """Return the HEAD commit SHA of *repo*, or "" if git cannot report one."""
    try:
        # Argument list + cwd instead of a shell string: the path is never
        # parsed by a shell, so spaces or metacharacters in it are harmless.
        proc = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            cwd=repo,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except (OSError, subprocess.SubprocessError):
        # git missing, directory unreadable, or the call timed out.
        return ""
    return proc.stdout.strip() if proc.returncode == 0 else ""


def checkpoint(mission: str) -> Path:
    """Archive a mission directory and record metadata next to the archive.

    Args:
        mission: A key of ``MISSION_DIRS``; any other value is used as the
            source directory path itself.

    Returns:
        Path of the ``<mission>_<YYYYMMDD_HHMMSS>.tar.gz`` archive created
        under ``CHECKPOINT_DIR / mission``.

    Raises:
        SystemExit: With status 1 when the source path does not exist.
    """
    src = Path(MISSION_DIRS.get(mission, mission))
    if not src.exists():
        print(f"ERROR: Source directory not found: {src}")
        sys.exit(1)

    # One instant feeds both the file-name stamp and the metadata so the two
    # can never disagree.
    now = datetime.now(timezone.utc)
    ts = now.strftime("%Y%m%d_%H%M%S")
    out_dir = CHECKPOINT_DIR / mission
    out_dir.mkdir(parents=True, exist_ok=True)
    tar_path = out_dir / f"{mission}_{ts}.tar.gz"
    meta_path = out_dir / f"{mission}_{ts}.json"

    # Record the commit the working tree was at, when the source is a git repo.
    git_sha = _git_head_sha(src) if (src / ".git").exists() else ""

    # The whole directory tree is archived as-is (.gitignore is not consulted).
    # Write under a temporary name and rename once complete, so an interrupted
    # run never leaves a truncated "*.tar.gz" for restore() to pick up.
    partial_path = tar_path.with_name(tar_path.name + ".partial")
    try:
        with tarfile.open(partial_path, "w:gz") as tar:
            tar.add(src, arcname=src.name)
        partial_path.replace(tar_path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial_path.unlink(missing_ok=True)

    # Metadata is written only after the archive exists, so a metadata file
    # always describes a real checkpoint.
    meta = {
        "mission": mission,
        "created_at": now.isoformat(),
        "source": str(src),
        "git_sha": git_sha,
    }
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print(f"CHECKPOINT {mission}: {tar_path}")
    print(f" Meta: {meta_path}")
    print(f" Git SHA: {git_sha or 'n/a'}")
    return tar_path
def restore(mission: str, identifier: str | None = None):
out_dir = CHECKPOINT_DIR / mission
if not out_dir.exists():
print(f"ERROR: No checkpoints found for {mission}")
sys.exit(1)
tars = sorted(out_dir.glob("*.tar.gz"))
if not tars:
print(f"ERROR: No tar.gz checkpoints for {mission}")
sys.exit(1)
if identifier:
tar_path = out_dir / f"{mission}_{identifier}.tar.gz"
if not tar_path.exists():
print(f"ERROR: Checkpoint not found: {tar_path}")
sys.exit(1)
else:
tar_path = tars[-1]
src = Path(MISSION_DIRS.get(mission, mission))
print(f"RESTORE {mission}: {tar_path}{src}")
with tarfile.open(tar_path, "r:gz") as tar:
tar.extractall(path=src.parent)
print("Restore complete. Restart agent to resume from checkpoint.")
def list_checkpoints():
    """Print, per mission, the checkpoint count and up to five latest archives.

    Missions are the sub-directories of ``CHECKPOINT_DIR``, visited in sorted
    order. When ``CHECKPOINT_DIR`` does not exist a single notice is printed.
    """
    if not CHECKPOINT_DIR.exists():
        print("No checkpoints stored.")
        return

    for entry in sorted(CHECKPOINT_DIR.iterdir()):
        if not entry.is_dir():
            # Stray files directly under the checkpoint root are not missions.
            continue
        archives = sorted(entry.glob("*.tar.gz"))
        total = len(archives)
        print(f"{entry.name}: {total} checkpoint(s)")
        # Only the tail of the sorted list is shown.
        for archive in archives[-5:]:
            print(f" {archive.name}")
def main() -> int:
parser = argparse.ArgumentParser(description="Lazarus Checkpoint / Restore")
parser.add_argument("mission", nargs="?", help="Mission name to checkpoint/restore")
parser.add_argument("--restore", action="store_true", help="Restore mode")
parser.add_argument("--identifier", help="Specific checkpoint identifier (YYYYMMDD_HHMMSS)")
parser.add_argument("--list", action="store_true", help="List all checkpoints")
args = parser.parse_args()
if args.list:
list_checkpoints()
return 0
if not args.mission:
print("ERROR: mission name required (or use --list)")
return 1
if args.restore:
restore(args.mission, args.identifier)
else:
checkpoint(args.mission)
return 0
# Run the CLI only when this file is executed as a script; the value returned
# by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())