#!/usr/bin/env bash
# backup_pipeline.sh — Nightly encrypted Hermes backup pipeline
# Refs: timmy-home #693, timmy-home #561
#
# Flow: tar the Hermes state dir -> encrypt with openssl (AES-256-CBC,
# PBKDF2, 200000 iterations) -> write a JSON manifest carrying the
# ciphertext SHA256 -> copy archive+manifest into the local backup root
# and at least one remote target (NAS directory and/or S3 URI) -> prune
# local backup dirs older than the retention window.
set -euo pipefail

# Every knob is environment-overridable so the regression test
# (tests/test_backup_pipeline.py) can point the pipeline at a temp tree.
DATESTAMP="${BACKUP_TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
BACKUP_SOURCE_DIR="${BACKUP_SOURCE_DIR:-${HOME}/.hermes}"
BACKUP_ROOT="${BACKUP_ROOT:-${HOME}/.timmy-backups/hermes}"
BACKUP_LOG_DIR="${BACKUP_LOG_DIR:-${BACKUP_ROOT}/logs}"
BACKUP_RETENTION_DAYS="${BACKUP_RETENTION_DAYS:-14}"
BACKUP_S3_URI="${BACKUP_S3_URI:-}"
BACKUP_NAS_TARGET="${BACKUP_NAS_TARGET:-}"
AWS_ENDPOINT_URL="${AWS_ENDPOINT_URL:-}"
BACKUP_NAME="hermes-backup-${DATESTAMP}"
LOCAL_BACKUP_DIR="${BACKUP_ROOT}/${DATESTAMP}"
# Scratch space for the plaintext tarball and temp passphrase file;
# removed unconditionally by the EXIT trap below.
STAGE_DIR="$(mktemp -d "${TMPDIR:-/tmp}/timmy-backup.XXXXXX")"
PLAINTEXT_ARCHIVE="${STAGE_DIR}/${BACKUP_NAME}.tar.gz"
ENCRYPTED_ARCHIVE="${STAGE_DIR}/${BACKUP_NAME}.tar.gz.enc"
MANIFEST_PATH="${STAGE_DIR}/${BACKUP_NAME}.json"
ALERT_LOG="${BACKUP_LOG_DIR}/backup_pipeline.log"
PASSFILE_CLEANUP=""

mkdir -p "$BACKUP_LOG_DIR"

# Timestamped log line to stdout and the pipeline log file.
log() {
    echo "[$(date -Iseconds)] $1" | tee -a "$ALERT_LOG"
}

# Log an error and abort the pipeline (non-zero exit triggers the trap).
fail() {
    log "ERROR: $1"
    exit 1
}

# Remove plaintext artifacts on every exit path so unencrypted data
# never survives a failure.
# NOTE(review): PASSFILE_CLEANUP is assigned inside the
# "$(resolve_passphrase_file)" command substitution below, so the parent
# shell running this trap never sees it non-empty — harmless today
# because the temp passphrase file lives inside STAGE_DIR, which is
# rm -rf'd here anyway; confirm before relocating the passphrase file.
cleanup() {
    rm -f "$PLAINTEXT_ARCHIVE"
    rm -rf "$STAGE_DIR"
    if [[ -n "$PASSFILE_CLEANUP" && -f "$PASSFILE_CLEANUP" ]]; then
        rm -f "$PASSFILE_CLEANUP"
    fi
}
trap cleanup EXIT

# Echo the path of a file holding the encryption passphrase.
# Prefers an operator-supplied BACKUP_PASSPHRASE_FILE; otherwise
# materializes BACKUP_PASSPHRASE into a 0600 file under the stage dir.
# Fails if neither is provided.
resolve_passphrase_file() {
    if [[ -n "${BACKUP_PASSPHRASE_FILE:-}" ]]; then
        [[ -f "$BACKUP_PASSPHRASE_FILE" ]] || fail "BACKUP_PASSPHRASE_FILE does not exist: $BACKUP_PASSPHRASE_FILE"
        echo "$BACKUP_PASSPHRASE_FILE"
        return
    fi

    if [[ -n "${BACKUP_PASSPHRASE:-}" ]]; then
        PASSFILE_CLEANUP="${STAGE_DIR}/backup.passphrase"
        # printf '%s' avoids appending a newline to the passphrase.
        printf '%s' "$BACKUP_PASSPHRASE" > "$PASSFILE_CLEANUP"
        chmod 600 "$PASSFILE_CLEANUP"
        echo "$PASSFILE_CLEANUP"
        return
    fi

    fail "Set BACKUP_PASSPHRASE_FILE or BACKUP_PASSPHRASE before running the backup pipeline."
}

# Print the SHA256 hex digest of a file, trying shasum, then sha256sum,
# then a pure-python fallback (streams in 1 MiB chunks).
sha256_file() {
    local path="$1"
    if command -v shasum >/dev/null 2>&1; then
        shasum -a 256 "$path" | awk '{print $1}'
    elif command -v sha256sum >/dev/null 2>&1; then
        sha256sum "$path" | awk '{print $1}'
    else
        python3 - <<'PY' "$path"
import hashlib
import pathlib
import sys
path = pathlib.Path(sys.argv[1])
h = hashlib.sha256()
with path.open('rb') as f:
    for chunk in iter(lambda: f.read(1024 * 1024), b''):
        h.update(chunk)
print(h.hexdigest())
PY
    fi
}

# Write the backup manifest JSON. Positional args:
#   1 manifest_path  2 source_dir  3 archive_name  4 archive_sha256
#   5 local_dir      6 s3_uri      7 nas_target    8 created_at
# Empty destination strings are recorded as JSON null.
write_manifest() {
    python3 - <<'PY' "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8"
import json
import sys
manifest_path, source_dir, archive_name, archive_sha256, local_dir, s3_uri, nas_target, created_at = sys.argv[1:]
manifest = {
    "created_at": created_at,
    "source_dir": source_dir,
    "archive_name": archive_name,
    "archive_sha256": archive_sha256,
    # Must stay in sync with the openssl enc invocation below.
    "encryption": {
        "type": "openssl",
        "cipher": "aes-256-cbc",
        "pbkdf2": True,
        "iterations": 200000,
    },
    "destinations": {
        "local_dir": local_dir,
        "s3_uri": s3_uri or None,
        "nas_target": nas_target or None,
    },
}
with open(manifest_path, 'w', encoding='utf-8') as handle:
    json.dump(manifest, handle, indent=2)
    handle.write('\n')
PY
}

# Copy archive + manifest into <nas_root>/<DATESTAMP>/.
# "NAS" here is any mounted directory path; plain cp, no network client.
upload_to_nas() {
    local archive_path="$1"
    local manifest_path="$2"
    local target_root="$3"

    local target_dir="${target_root%/}/${DATESTAMP}"
    mkdir -p "$target_dir"
    cp "$archive_path" "$manifest_path" "$target_dir/"
    log "Uploaded backup to NAS target: $target_dir"
}

# Upload archive + manifest to BACKUP_S3_URI via the aws CLI.
# Honors AWS_ENDPOINT_URL for S3-compatible stores.
upload_to_s3() {
    local archive_path="$1"
    local manifest_path="$2"

    command -v aws >/dev/null 2>&1 || fail "BACKUP_S3_URI is set but aws CLI is not installed."

    local args=()
    if [[ -n "$AWS_ENDPOINT_URL" ]]; then
        args+=(--endpoint-url "$AWS_ENDPOINT_URL")
    fi

    aws "${args[@]}" s3 cp "$archive_path" "${BACKUP_S3_URI%/}/$(basename "$archive_path")"
    aws "${args[@]}" s3 cp "$manifest_path" "${BACKUP_S3_URI%/}/$(basename "$manifest_path")"
    log "Uploaded backup to S3 target: $BACKUP_S3_URI"
}

# --- Preconditions: source must exist, and at least one remote target. ---
[[ -d "$BACKUP_SOURCE_DIR" ]] || fail "BACKUP_SOURCE_DIR does not exist: $BACKUP_SOURCE_DIR"
[[ -n "$BACKUP_NAS_TARGET" || -n "$BACKUP_S3_URI" ]] || fail "Set BACKUP_NAS_TARGET or BACKUP_S3_URI for remote backup storage."

PASSFILE="$(resolve_passphrase_file)"
mkdir -p "$LOCAL_BACKUP_DIR"

log "Creating archive from $BACKUP_SOURCE_DIR"
# Archive relative to the parent dir so the tarball unpacks as
# ".hermes/..." (the restore script relies on this layout).
tar -czf "$PLAINTEXT_ARCHIVE" -C "$(dirname "$BACKUP_SOURCE_DIR")" "$(basename "$BACKUP_SOURCE_DIR")"

log "Encrypting archive"
openssl enc -aes-256-cbc -salt -pbkdf2 -iter 200000 \
    -pass "file:${PASSFILE}" \
    -in "$PLAINTEXT_ARCHIVE" \
    -out "$ENCRYPTED_ARCHIVE"

ARCHIVE_SHA256="$(sha256_file "$ENCRYPTED_ARCHIVE")"
CREATED_AT="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
write_manifest "$MANIFEST_PATH" "$BACKUP_SOURCE_DIR" "$(basename "$ENCRYPTED_ARCHIVE")" "$ARCHIVE_SHA256" "$LOCAL_BACKUP_DIR" "$BACKUP_S3_URI" "$BACKUP_NAS_TARGET" "$CREATED_AT"

cp "$ENCRYPTED_ARCHIVE" "$MANIFEST_PATH" "$LOCAL_BACKUP_DIR/"
# Drop the plaintext as early as possible; the trap would catch it too.
rm -f "$PLAINTEXT_ARCHIVE"
log "Encrypted backup stored locally: ${LOCAL_BACKUP_DIR}/$(basename "$ENCRYPTED_ARCHIVE")"

if [[ -n "$BACKUP_NAS_TARGET" ]]; then
    upload_to_nas "$ENCRYPTED_ARCHIVE" "$MANIFEST_PATH" "$BACKUP_NAS_TARGET"
fi

if [[ -n "$BACKUP_S3_URI" ]]; then
    upload_to_s3 "$ENCRYPTED_ARCHIVE" "$MANIFEST_PATH"
fi

# Retention: only timestamp-named dirs ('20*') are pruned, so the logs/
# subdir under BACKUP_ROOT is never touched.
find "$BACKUP_ROOT" -mindepth 1 -maxdepth 1 -type d -name '20*' -mtime "+${BACKUP_RETENTION_DAYS}" -exec rm -rf {} + 2>/dev/null || true
log "Retention applied (${BACKUP_RETENTION_DAYS} days)"
log "Backup pipeline completed successfully"
#!/usr/bin/env bash
# restore_backup.sh — Restore an encrypted Hermes backup archive
# Usage: restore_backup.sh /path/to/hermes-backup-YYYYmmdd-HHMMSS.tar.gz.enc /restore/root
#
# Counterpart to backup_pipeline.sh: optionally verifies the archive's
# SHA256 against its sibling manifest, decrypts with the same openssl
# parameters (AES-256-CBC, PBKDF2, 200000 iterations), and extracts the
# tarball under the restore root.
set -euo pipefail

ARCHIVE_PATH="${1:-}"
RESTORE_ROOT="${2:-}"
# Scratch space for the decrypted tarball; removed by the EXIT trap.
STAGE_DIR="$(mktemp -d "${TMPDIR:-/tmp}/timmy-restore.XXXXXX")"
PLAINTEXT_ARCHIVE="${STAGE_DIR}/restore.tar.gz"
PASSFILE_CLEANUP=""

# Remove the decrypted tarball, stage dir, and any temp passphrase file
# on every exit path.
cleanup() {
    rm -f "$PLAINTEXT_ARCHIVE"
    rm -rf "$STAGE_DIR"
    if [[ -n "$PASSFILE_CLEANUP" && -f "$PASSFILE_CLEANUP" ]]; then
        rm -f "$PASSFILE_CLEANUP"
    fi
}
trap cleanup EXIT

# Print an error to stderr and abort.
fail() {
    echo "ERROR: $1" >&2
    exit 1
}

# Echo the path of a file holding the decryption passphrase.
# Mirrors backup_pipeline.sh: BACKUP_PASSPHRASE_FILE wins, else
# BACKUP_PASSPHRASE is written to a 0600 temp file, else abort.
resolve_passphrase_file() {
    if [[ -n "${BACKUP_PASSPHRASE_FILE:-}" ]]; then
        [[ -f "$BACKUP_PASSPHRASE_FILE" ]] || fail "BACKUP_PASSPHRASE_FILE does not exist: $BACKUP_PASSPHRASE_FILE"
        echo "$BACKUP_PASSPHRASE_FILE"
        return
    fi

    if [[ -n "${BACKUP_PASSPHRASE:-}" ]]; then
        PASSFILE_CLEANUP="${STAGE_DIR}/backup.passphrase"
        printf '%s' "$BACKUP_PASSPHRASE" > "$PASSFILE_CLEANUP"
        chmod 600 "$PASSFILE_CLEANUP"
        echo "$PASSFILE_CLEANUP"
        return
    fi

    fail "Set BACKUP_PASSPHRASE_FILE or BACKUP_PASSPHRASE before restoring a backup."
}

# Print the SHA256 hex digest of a file, trying shasum, then sha256sum,
# then a pure-python fallback (streams in 1 MiB chunks).
sha256_file() {
    local path="$1"
    if command -v shasum >/dev/null 2>&1; then
        shasum -a 256 "$path" | awk '{print $1}'
    elif command -v sha256sum >/dev/null 2>&1; then
        sha256sum "$path" | awk '{print $1}'
    else
        python3 - <<'PY' "$path"
import hashlib
import pathlib
import sys
path = pathlib.Path(sys.argv[1])
h = hashlib.sha256()
with path.open('rb') as f:
    for chunk in iter(lambda: f.read(1024 * 1024), b''):
        h.update(chunk)
print(h.hexdigest())
PY
    fi
}

# --- Argument validation ---
[[ -n "$ARCHIVE_PATH" ]] || fail "Usage: restore_backup.sh /path/to/archive.tar.gz.enc /restore/root"
[[ -n "$RESTORE_ROOT" ]] || fail "Usage: restore_backup.sh /path/to/archive.tar.gz.enc /restore/root"
[[ -f "$ARCHIVE_PATH" ]] || fail "Archive not found: $ARCHIVE_PATH"

# Derive the sibling manifest path from the archive naming convention
# produced by backup_pipeline.sh.
if [[ "$ARCHIVE_PATH" == *.tar.gz.enc ]]; then
    MANIFEST_PATH="${ARCHIVE_PATH%.tar.gz.enc}.json"
else
    MANIFEST_PATH=""
fi

# Integrity check is best-effort: performed only when the manifest is
# present next to the archive; a missing manifest is not an error.
if [[ -n "$MANIFEST_PATH" && -f "$MANIFEST_PATH" ]]; then
    EXPECTED_SHA="$(python3 - <<'PY' "$MANIFEST_PATH"
import json
import sys
with open(sys.argv[1], 'r', encoding='utf-8') as handle:
    manifest = json.load(handle)
print(manifest['archive_sha256'])
PY
)"
    ACTUAL_SHA="$(sha256_file "$ARCHIVE_PATH")"
    [[ "$EXPECTED_SHA" == "$ACTUAL_SHA" ]] || fail "Archive SHA256 mismatch: expected $EXPECTED_SHA got $ACTUAL_SHA"
fi

PASSFILE="$(resolve_passphrase_file)"
mkdir -p "$RESTORE_ROOT"

# Parameters must match backup_pipeline.sh's encryption invocation.
openssl enc -d -aes-256-cbc -salt -pbkdf2 -iter 200000 \
    -pass "file:${PASSFILE}" \
    -in "$ARCHIVE_PATH" \
    -out "$PLAINTEXT_ARCHIVE"

tar -xzf "$PLAINTEXT_ARCHIVE" -C "$RESTORE_ROOT"
echo "Restored backup into $RESTORE_ROOT"
ROOT = Path(__file__).resolve().parents[1]
BACKUP_SCRIPT = ROOT / "scripts" / "backup_pipeline.sh"
RESTORE_SCRIPT = ROOT / "scripts" / "restore_backup.sh"


class TestBackupPipeline(unittest.TestCase):
    """End-to-end checks for the encrypted backup/restore shell pipeline.

    Each test seeds a throwaway Hermes state tree, then drives the real
    scripts through ``subprocess`` with environment overrides pointing
    everything at the temp directory.
    """

    def setUp(self) -> None:
        self.tempdir = tempfile.TemporaryDirectory()
        self.base = Path(self.tempdir.name)
        self.home = self.base / "home"
        self.source_dir = self.home / ".hermes"
        # Miniature Hermes state: config, a session log, cron jobs, and
        # a binary blob standing in for the sqlite state file.
        for sub in ("sessions", "cron"):
            (self.source_dir / sub).mkdir(parents=True)
        (self.source_dir / "config.yaml").write_text("model: local-first\n")
        (self.source_dir / "sessions" / "session.jsonl").write_text('{"role":"assistant","content":"hello"}\n')
        (self.source_dir / "cron" / "jobs.json").write_text('{"jobs": 1}\n')
        (self.source_dir / "state.db").write_bytes(b"sqlite-state")

        self.backup_root = self.base / "backup-root"
        self.nas_target = self.base / "nas-target"
        self.restore_root = self.base / "restore-root"
        self.log_dir = self.base / "logs"
        self.passphrase_file = self.base / "backup.passphrase"
        self.passphrase_file.write_text("correct horse battery staple\n")

    def tearDown(self) -> None:
        self.tempdir.cleanup()

    def _env(self, *, include_remote: bool = True) -> dict[str, str]:
        """Build a subprocess environment aimed at the fixture tree."""
        overrides = {
            "HOME": str(self.home),
            "BACKUP_SOURCE_DIR": str(self.source_dir),
            "BACKUP_ROOT": str(self.backup_root),
            "BACKUP_LOG_DIR": str(self.log_dir),
            "BACKUP_PASSPHRASE_FILE": str(self.passphrase_file),
        }
        if include_remote:
            overrides["BACKUP_NAS_TARGET"] = str(self.nas_target)
        return {**os.environ, **overrides}

    def _run(self, argv: list, env: dict):
        """Run a script with captured text output, cwd at the repo root."""
        return subprocess.run(argv, capture_output=True, text=True, env=env, cwd=ROOT)

    def test_backup_encrypts_and_restore_round_trips(self) -> None:
        completed = self._run(["bash", str(BACKUP_SCRIPT)], self._env())
        self.assertEqual(completed.returncode, 0, msg=completed.stdout + completed.stderr)

        # Exactly one ciphertext archive lands on the NAS target.
        enc_archives = sorted(self.nas_target.rglob("*.tar.gz.enc"))
        self.assertEqual(len(enc_archives), 1, msg=f"expected one encrypted archive, found: {enc_archives}")
        archive_path = enc_archives[0]
        # The ciphertext must not contain recognizable plaintext.
        self.assertNotIn(b"model: local-first", archive_path.read_bytes())

        manifests = sorted(self.nas_target.rglob("*.json"))
        self.assertEqual(len(manifests), 1, msg=f"expected one manifest, found: {manifests}")

        # No unencrypted tarball may survive in either destination.
        leaked = sorted(self.backup_root.rglob("*.tar.gz")) + sorted(self.nas_target.rglob("*.tar.gz"))
        self.assertEqual(leaked, [], msg=f"plaintext archives leaked: {leaked}")

        restored = self._run(
            ["bash", str(RESTORE_SCRIPT), str(archive_path), str(self.restore_root)],
            self._env(),
        )
        self.assertEqual(restored.returncode, 0, msg=restored.stdout + restored.stderr)

        # Every seeded file round-trips byte-for-byte.
        tree = self.restore_root / ".hermes"
        self.assertTrue(tree.exists())
        self.assertEqual((tree / "config.yaml").read_text(), "model: local-first\n")
        self.assertEqual((tree / "sessions" / "session.jsonl").read_text(), '{"role":"assistant","content":"hello"}\n')
        self.assertEqual((tree / "cron" / "jobs.json").read_text(), '{"jobs": 1}\n')
        self.assertEqual((tree / "state.db").read_bytes(), b"sqlite-state")

    def test_backup_requires_remote_target(self) -> None:
        completed = self._run(["bash", str(BACKUP_SCRIPT)], self._env(include_remote=False))
        self.assertNotEqual(completed.returncode, 0)
        self.assertIn("BACKUP_NAS_TARGET or BACKUP_S3_URI", completed.stdout + completed.stderr)


if __name__ == "__main__":
    unittest.main(verbosity=2)