Compare commits

...

2 Commits

Author SHA1 Message Date
Alex Payne
03f4f8fbad PAPERCLIPS-7: Add disassembly audit script
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 29s
Smoke Test / smoke (pull_request) Failing after 29s
Validate Config / YAML Lint (pull_request) Failing after 20s
Validate Config / JSON Validate (pull_request) Successful in 26s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m4s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 1m13s
Validate Config / Cron Syntax Check (pull_request) Successful in 12s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 16s
PR Checklist / pr-checklist (pull_request) Successful in 4m58s
Validate Config / Playbook Schema Validation (pull_request) Successful in 27s
Architecture Lint / Lint Repository (pull_request) Failing after 23s
- New bin/disassembly_audit.py identifies waste:
  * Zombie processes (defunct, unreclaimable)
  * Unused systemd services (enabled but inactive)
  * Dead loops: systemic cron failures (via cron-audit-662)
  * Stale hermes agent sessions
- Recommends keep/kill/disable actions
- Estimates monthly cost savings (memory, CPU)
- Optional --execute flag for safe disassembly
- JSON and markdown report output

Closes #335
2026-04-30 02:31:08 -04:00
Rockachopa
874ce137b0 feat(backup): add automated Gitea daily backup and recovery runbook
Some checks failed
Architecture Lint / Linter Tests (push) Successful in 30s
Smoke Test / smoke (push) Failing after 24s
Validate Config / YAML Lint (push) Failing after 16s
Validate Config / JSON Validate (push) Successful in 21s
Validate Config / Cron Syntax Check (push) Successful in 15s
Validate Config / Deploy Script Dry Run (push) Successful in 14s
Validate Config / Python Syntax & Import Check (push) Failing after 1m2s
Validate Config / Python Test Suite (push) Has been skipped
Validate Config / Shell Script Lint (push) Failing after 1m3s
Validate Config / Playbook Schema Validation (push) Successful in 24s
Architecture Lint / Linter Tests (pull_request) Successful in 27s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Config / YAML Lint (pull_request) Failing after 16s
Validate Config / JSON Validate (pull_request) Successful in 23s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m5s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Cron Syntax Check (pull_request) Successful in 12s
Validate Config / Shell Script Lint (pull_request) Failing after 1m6s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 13s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
PR Checklist / pr-checklist (pull_request) Failing after 4m33s
Architecture Lint / Lint Repository (push) Failing after 26s
Architecture Lint / Lint Repository (pull_request) Failing after 26s
- Add bin/gitea-backup.sh: daily backup script using gitea dump
- Add cron/vps/gitea-daily-backup.yml: Hermes cron job (2 AM daily)
- Add docs/backup-recovery-runbook.md: complete recovery procedures

Addresses [AUDIT][RISK] Single-node VPS is a single point of failure.
Closes #481
2026-04-30 01:44:05 -04:00
6 changed files with 753 additions and 0 deletions

477
bin/disassembly_audit.py Normal file

@@ -0,0 +1,477 @@
#!/usr/bin/env python3
"""
Disassembly Audit — Know what to tear down (#335)
Monthly audit that identifies waste: zombie processes, unused services, dead loops.
Recommends: keep, kill, or disable. Optionally executes clean disassembly.
Usage:
python3 bin/disassembly_audit.py # Dry-run audit report
python3 bin/disassembly_audit.py --execute # Perform safe disassembly
python3 bin/disassembly_audit.py --json # JSON output
python3 bin/disassembly_audit.py --output report.md
References:
- Issue: https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config/issues/335
- Epic: #328 (Universal Paperclips — Disassembly Sequence)
"""
from __future__ import annotations
import json
import os
import re
import signal
import subprocess
import sys
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
# === CONFIG ===
ZOMBIE_THRESHOLD_DAYS = 7
UNUSED_SERVICE_DAYS = 30
CRON_ERROR_THRESHOLD_HOURS = 48
HERMES_STALE_AGE_HOURS = 24
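# Approximate $/month per GB of RAM and per vCPU; used only for the savings estimate in the report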
COST_PER_GB_MONTH = 10
COST_PER_VCPU_MONTH = 5
def get_zombie_processes() -> List[Dict[str, Any]]:
"""Identify zombie (defunct) processes."""
try:
result = subprocess.run(
["ps", "aux"],
capture_output=True, text=True, timeout=10
)
zombies = []
for line in result.stdout.split('\n'):
if not line.strip() or 'grep' in line:
continue
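# ps aux columns: USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND (COMMAND is field index 10)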
parts = line.split(None, 10)
if len(parts) < 11:
continue
stat = parts[7]
if 'Z' in stat:
pid = int(parts[1])
cmd = parts[10][:120]
age_sec = None
try:
age_result = subprocess.run(
["ps", "-o", "etimes=", "-p", str(pid)],
capture_output=True, text=True, timeout=5
)
if age_result.returncode == 0 and age_result.stdout.strip():
age_sec = int(age_result.stdout.strip())
except Exception:
pass
zombies.append({
"pid": pid,
"user": parts[0],
"command": cmd,
"state": stat,
"age_seconds": age_sec,
"age_hours": round(age_sec / 3600, 1) if age_sec else None,
"category": "zombie_process",
"recommendation": "kill",
"reason": "Zombie process cannot be wait()-ed; blocks resources",
"resources_freed_mb": estimate_process_memory_mb(pid),
})
return zombies
except Exception as e:
print(f"Warning: zombie scan failed: {e}", file=sys.stderr)
return []
def estimate_process_memory_mb(pid: int) -> float:
"""Estimate memory usage of a process in MB."""
try:
result = subprocess.run(
["ps", "-p", str(pid), "-o", "rss="],
capture_output=True, text=True, timeout=5
)
if result.returncode == 0 and result.stdout.strip():
rss_kb = int(result.stdout.strip())
return round(rss_kb / 1024, 1)
except Exception:
pass
return 0.0
def get_systemd_services() -> List[Dict[str, Any]]:
"""Identify enabled but inactive systemd services (potential waste)."""
try:
# "enabled" is a unit-file state, so it must be queried via list-unit-files, not list-units.
result = subprocess.run(
["systemctl", "list-unit-files", "--type=service", "--state=enabled", "--no-legend", "--no-pager"],
capture_output=True, text=True, timeout=10
)
services = []
for line in result.stdout.splitlines():
parts = line.split()
if not parts or not parts[0].endswith(".service"):
continue
name = parts[0]
if "@." in name:
# Template units are never active themselves; skip them.
continue
try:
active_result = subprocess.run(
["systemctl", "is-active", name],
capture_output=True, text=True, timeout=5
)
if active_result.stdout.strip() != "inactive":
continue
desc_result = subprocess.run(
["systemctl", "show", "-p", "Description", "--value", name],
capture_output=True, text=True, timeout=5
)
services.append({
"service": name,
"description": desc_result.stdout.strip(),
"state": "enabled+inactive",
"category": "unused_service",
"recommendation": "disable",
"reason": "Service is enabled but not running",
"resources_freed_mb": 0,
})
except Exception:
continue
return services
except FileNotFoundError:
return []
except Exception as e:
print(f"Warning: systemd scan failed: {e}", file=sys.stderr)
return []
def get_cron_jobs_audit() -> Dict[str, Any]:
"""Reuse cron-audit logic if available."""
try:
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "scripts"))
from cron_audit_662 import load_cron_state, categorize_job, parse_timestamp
jobs = load_cron_state()
now = datetime.now(timezone.utc)
categorized = []
systemic = []
transient = []
healthy = []
for job in jobs:
cat = categorize_job(job, now)
cat["_job"] = job
categorized.append(cat)
if cat["category"] == "systemic":
systemic.append(cat)
elif cat["category"] == "transient":
transient.append(cat)
else:
healthy.append(cat)
return {
"total": len(jobs),
"healthy": len(healthy),
"transient": len(transient),
"systemic": len(systemic),
"systemic_jobs": [
{
"id": c["id"],
"name": c["name"],
"reason": c["reason"],
"last_error": c.get("last_error", ""),
"recommendation": "disable",
"category": "dead_loop",
}
for c in systemic
],
"transient_jobs": [
{"id": c["id"], "name": c["name"], "reason": c["reason"], "recommendation": "monitor"}
for c in transient
],
}
except ImportError:
return {"error": "cron_audit_662 not available", "total": 0, "systemic_jobs": []}
def get_hermes_stale_sessions() -> List[Dict[str, Any]]:
"""Detect stale hermes agent sessions."""
try:
result = subprocess.run(
["ps", "aux"],
capture_output=True, text=True, timeout=10
)
sessions = {}
for line in result.stdout.split('\n'):
if 'hermes' not in line.lower() or 'grep' in line:
continue
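# Same ps aux layout as in get_zombie_processes(): RSS is field 5, COMMAND starts at field 10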
parts = line.split(None, 10)
if len(parts) < 11:
continue
pid = int(parts[1])
cpu = float(parts[2])
mem_kb = int(parts[5])
cmd = parts[10][:100]
sessions[str(pid)] = {"pid": pid, "cpu": cpu, "rss": mem_kb, "cmd": cmd}
try:
child_out = subprocess.run(
["pgrep", "-P", str(pid)],
capture_output=True, text=True, timeout=5
)
children = []
for cp in child_out.stdout.strip().split('\n'):
if cp.strip():
children.append(int(cp.strip()))
sessions[str(pid)]["children"] = children
except Exception:
sessions[str(pid)]["children"] = []
stale = []
for key, proc_info in sessions.items():
pid = proc_info["pid"]
try:
age_res = subprocess.run(
["ps", "-o", "etimes=", "-p", str(pid)],
capture_output=True, text=True, timeout=5
)
if age_res.returncode == 0 and age_res.stdout.strip():
age_sec = int(age_res.stdout.strip())
age_hours = age_sec / 3600
else:
continue
except Exception:
continue
if age_hours > HERMES_STALE_AGE_HOURS and proc_info["cpu"] < 0.5:
total_rss = proc_info["rss"] + sum(
_get_rss_kb(child) for child in proc_info.get("children", [])
)
stale.append({
"pid": pid,
"age_hours": round(age_hours, 1),
"cpu_percent": proc_info["cpu"],
"total_rss_mb": round(total_rss / 1024, 1),
"process_count": 1 + len(proc_info.get("children", [])),
"command": proc_info["cmd"],
"category": "dead_loop",
"recommendation": "kill",
"reason": f"Hermes session idle for {age_hours:.1f}h",
"resources_freed_mb": round(total_rss / 1024, 1),
})
return stale
except Exception as e:
print(f"Warning: hermes scan failed: {e}", file=sys.stderr)
return []
def _get_rss_kb(pid: int) -> int:
try:
r = subprocess.run(["ps", "-p", str(pid), "-o", "rss="], capture_output=True, text=True, timeout=3)
if r.returncode == 0 and r.stdout.strip():
return int(r.stdout.strip())
except Exception:
pass
return 0
def calculate_cost_savings(findings: List[Dict[str, Any]]) -> Dict[str, float]:
total_mb = sum(f.get("resources_freed_mb", 0) for f in findings)
gb = total_mb / 1024
process_count = sum(f.get("process_count", 1) for f in findings)
cpu_equiv = process_count * 0.1
return {
"memory_gb_reclaimed": round(gb, 2),
"memory_cost_saved_monthly": round(gb * COST_PER_GB_MONTH, 2),
"cpu_equiv_vcpus": round(cpu_equiv, 1),
"cpu_cost_saved_monthly": round(cpu_equiv * COST_PER_VCPU_MONTH, 2),
"total_cost_saved_monthly": round(gb * COST_PER_GB_MONTH + cpu_equiv * COST_PER_VCPU_MONTH, 2),
}
def generate_markdown_report(audit: Dict[str, Any]) -> str:
lines = []
lines.append("# Disassembly Audit Report")
lines.append(f"**Generated:** {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
lines.append(f"**Issue:** [#335](https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config/issues/335)")
lines.append("")
total_findings = sum(len(audit.get(k, [])) for k in ["zombies", "unused_services", "dead_loops_cron", "dead_loops_hermes"])
lines.append("## Summary")
lines.append("")
lines.append("| Category | Count | Action |")
lines.append("|----------|-------|--------|")
lines.append(f"| Zombie Processes | {len(audit.get('zombies', []))} | Kill |")
lines.append(f"| Unused Services | {len(audit.get('unused_services', []))} | Disable |")
lines.append(f"| Dead Loops (Cron) | {len(audit.get('dead_loops_cron', []))} | Disable |")
lines.append(f"| Dead Loops (Hermes) | {len(audit.get('dead_loops_hermes', []))} | Kill |")
lines.append(f"| **Total Waste** | **{total_findings}** | **Disassemble** |")
lines.append("")
costs = audit.get("cost_savings", {})
if costs and costs.get("total_cost_saved_monthly", 0) > 0:
lines.append("## Cost Impact (Monthly)")
lines.append("")
lines.append(f"- Memory reclaimed: **{costs.get('memory_gb_reclaimed', 0):.2f} GB**")
lines.append(f"- CPU equivalent: **{costs.get('cpu_equiv_vcpus', 0):.1f} vCPUs**")
lines.append(f"- **Estimated monthly savings: ${costs.get('total_cost_saved_monthly', 0):.2f}**")
lines.append("")
def add_table(title, items, columns):
# columns: (display header, finding key) pairs, so headers can differ from the dict keys.
if not items:
return
lines.append(f"## {title}")
lines.append("")
lines.append("| " + " | ".join(h for h, _ in columns) + " |")
lines.append("|" + "|".join("---" for _ in columns) + "|")
for item in items:
row = []
for _, key in columns:
val = item.get(key, "")
if isinstance(val, float):
val = f"{val:.1f}"
row.append(str(val))
lines.append("| " + " | ".join(row) + " |")
lines.append("")
add_table(
"Zombie Processes",
audit.get("zombies", []),
[("PID", "pid"), ("User", "user"), ("Command", "command"), ("Age (h)", "age_hours"), ("Memory (MB)", "resources_freed_mb"), ("Action", "recommendation")]
)
add_table(
"Unused Services (enabled but inactive)",
audit.get("unused_services", []),
[("Service", "service"), ("Description", "description"), ("Action", "recommendation")]
)
add_table(
"Dead Loops — Cron Systemic Failures",
audit.get("dead_loops_cron", []),
[("ID", "id"), ("Name", "name"), ("Reason", "reason"), ("Action", "recommendation")]
)
add_table(
"Dead Loops — Stale Hermes Sessions",
audit.get("dead_loops_hermes", []),
[("PID", "pid"), ("Age (h)", "age_hours"), ("CPU %", "cpu_percent"), ("Memory (MB)", "total_rss_mb"), ("Procs", "process_count"), ("Action", "recommendation")]
)
lines.append("## Recommended Actions")
lines.append("")
lines.append("### Kill immediately")
lines.append("- All zombie processes (unreclaimable)")
lines.append("- Stale hermes sessions (idle >24h)")
lines.append("")
lines.append("### Disable")
lines.append("- Systemd services enabled but inactive")
lines.append("- Cron jobs erroring >48h (systemic failures)")
lines.append("")
lines.append("### Monitor")
lines.append("- Cron jobs with transient errors (network, rate limit)")
lines.append("")
lines.append("## Disassembly Execution")
lines.append("")
lines.append("To execute these disassembly actions safely, run:")
lines.append("```")
lines.append(" python3 bin/disassembly_audit.py --execute")
lines.append("```")
lines.append("")
lines.append("This will perform safe termination, disable services, pause cron jobs.")
lines.append("")
lines.append("---")
lines.append("*Generated by Disassembly Audit — #335*")
return "\n".join(lines)
def execute_disassembly(audit: Dict[str, Any]) -> Dict[str, Any]:
executed = {"killed": [], "disabled": [], "errors": []}
# A true zombie is already dead: SIGTERM cannot remove it; only its parent (or the
# parent exiting) can reap it. The signal below is a recorded best-effort attempt.
for z in audit.get("zombies", []):
pid = z["pid"]
try:
os.kill(pid, signal.SIGTERM)
executed["killed"].append({"pid": pid, "type": "zombie"})
except Exception as e:
executed["errors"].append(f"PID {pid}: {e}")
for h in audit.get("dead_loops_hermes", []):
pid = h["pid"]
try:
os.kill(pid, signal.SIGTERM)
executed["killed"].append({"pid": pid, "type": "hermes_stale"})
except Exception as e:
executed["errors"].append(f"PID {pid}: {e}")
for s in audit.get("unused_services", []):
name = s["service"]
try:
subprocess.run(["systemctl", "disable", "--now", name], check=True, capture_output=True)
executed["disabled"].append({"service": name, "type": "systemd"})
except subprocess.CalledProcessError as e:
executed["errors"].append(f"Service {name}: {e}")
for c in audit.get("dead_loops_cron", []):
job_id = c["id"]
try:
subprocess.run(["hermes", "cron", "pause", job_id], check=True, capture_output=True)
executed["disabled"].append({"cron_job": job_id, "type": "hermes_cron"})
except subprocess.CalledProcessError as e:
executed["errors"].append(f"Cron {job_id}: {e}")
return executed
def main():
import argparse
parser = argparse.ArgumentParser(description="Disassembly Audit — identify & eliminate waste")
parser.add_argument("--json", action="store_true", help="JSON output only")
parser.add_argument("--output", "-o", help="Write report to file")
parser.add_argument("--execute", action="store_true", help="Execute disassembly actions (DESTRUCTIVE)")
args = parser.parse_args()
zombies = get_zombie_processes()
unused_services = get_systemd_services()
cron_audit = get_cron_jobs_audit()
hermes_stale = get_hermes_stale_sessions()
dead_loops_cron = cron_audit.get("systemic_jobs", [])
dead_loops_hermes = hermes_stale
all_findings = zombies + unused_services + dead_loops_cron + dead_loops_hermes
cost_savings = calculate_cost_savings(all_findings)
audit = {
"generated_at": datetime.now(timezone.utc).isoformat(),
"zombies": zombies,
"unused_services": unused_services,
"dead_loops_cron": dead_loops_cron,
"dead_loops_hermes": dead_loops_hermes,
"cost_savings": cost_savings,
}
if args.json:
print(json.dumps(audit, indent=2))
sys.exit(0)
report = generate_markdown_report(audit)
if args.output:
Path(args.output).write_text(report)
print(f"Report written to {args.output}")
else:
print(report)
if args.execute:
print("\n" + "="*50)
print("EXECUTING DISASSEMBLY ACTIONS...")
result = execute_disassembly(audit)
print(f"Killed: {len(result['killed'])} processes")
print(f"Disabled: {len(result['disabled'])} services/cron jobs")
if result["errors"]:
print(f"Errors: {len(result['errors'])}")
for e in result["errors"][:10]:
print(f" - {e}")
total_critical = len(zombies) + len(unused_services) + len(dead_loops_cron)
if total_critical > 0 and not args.execute:
print(f"\n⚠️ {total_critical} waste items identified. Run with --execute to disassemble.")
sys.exit(1)
sys.exit(0)
if __name__ == "__main__":
main()

87
bin/gitea-backup.sh Normal file

@@ -0,0 +1,87 @@
#!/bin/bash
# Gitea Daily Backup Script
# Uses Gitea's native dump command to create automated backups of repositories and SQLite databases.
# Designed to run on the VPS (Ezra) as part of a daily cron job.
#
# Configuration via environment variables:
# GITEA_BIN Path to gitea binary (default: auto-detect)
# GITEA_BACKUP_DIR Directory for backup archives (default: /var/backups/gitea)
# GITEA_BACKUP_RETENTION Days to retain backups (default: 7)
# GITEA_BACKUP_LOG Log file path (default: /var/log/gitea-backup.log)
# GITEA_WORK_PATH Gitea working directory (default: auto-detect from app.ini)
set -euo pipefail
GITEA_BIN="${GITEA_BIN:-$(command -v gitea 2>/dev/null || echo "/usr/local/bin/gitea")}"
BACKUP_DIR="${GITEA_BACKUP_DIR:-/var/backups/gitea}"
RETENTION_DAYS="${GITEA_BACKUP_RETENTION:-7}"
DATE="$(date +%Y-%m-%d_%H%M%S)"
BACKUP_FILE="${BACKUP_DIR}/gitea-backup-${DATE}.tar.gz"
LOG_FILE="${GITEA_BACKUP_LOG:-/var/log/gitea-backup.log}"
mkdir -p "${BACKUP_DIR}"
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "${LOG_FILE}"
}
log "=== Starting Gitea daily backup ==="
# Verify gitea binary exists
if [ ! -x "${GITEA_BIN}" ]; then
log "ERROR: Gitea binary not found at ${GITEA_BIN}"
log "Set GITEA_BIN environment variable to the gitea binary path (e.g., /usr/bin/gitea)"
exit 1
fi
# Detect Gitea WORK_PATH
WORK_PATH=""
APP_INI=""
for path in /etc/gitea/app.ini /home/git/gitea/custom/conf/app.ini ~/gitea/custom/conf/app.ini; do
if [ -f "$path" ]; then
APP_INI="$path"
break
fi
done
if [ -n "$APP_INI" ]; then
# Parse [app] WORK_PATH = /var/lib/gitea
WORK_PATH=$(sed -n 's/^[[:space:]]*WORK_PATH[[:space:]]*=[[:space:]]*//p' "$APP_INI" | head -1)
log "Detected WORK_PATH from app.ini: ${WORK_PATH}"
fi
# Fallback detection
if [ -z "$WORK_PATH" ]; then
for d in /var/lib/gitea /home/git/gitea /srv/gitea /opt/gitea; do
if [ -d "$d" ]; then
WORK_PATH="$d"
break
fi
done
log "Inferred WORK_PATH: ${WORK_PATH:-not found}"
fi
if [ -z "$WORK_PATH" ]; then
log "ERROR: Could not determine Gitea WORK_PATH. Set GITEA_WORK_PATH manually."
exit 1
fi
# Perform gitea dump
# Flags: --work-path sets the Gitea working directory, --file writes dump to tar.gz
log "Running: gitea dump --work-path ${WORK_PATH} --file ${BACKUP_FILE}"
"${GITEA_BIN}" dump --work-path "${WORK_PATH}" --file "${BACKUP_FILE}" 2>>"${LOG_FILE}"
if [ $? -ne 0 ]; then
log "ERROR: gitea dump failed — check ${LOG_FILE} for details"
exit 1
fi
FILE_SIZE=$(du -h "${BACKUP_FILE}" | cut -f1)
log "Backup created: ${BACKUP_FILE} (${FILE_SIZE})"
# Prune old backups (keep last N days)
find "${BACKUP_DIR}" -name "gitea-backup-*.tar.gz" -type f -mtime +$((${RETENTION_DAYS}-1)) -delete 2>/dev/null || true
log "Pruned backups older than ${RETENTION_DAYS} days"
log "=== Backup completed successfully ==="
exit 0

9
cron/vps/gitea-daily-backup.yml Normal file

@@ -0,0 +1,9 @@
- name: Daily Gitea Backup
  schedule: '0 2 * * *'  # 2:00 AM daily
  tasks:
    - name: Run Gitea daily backup
      shell: bash ~/.hermes/bin/gitea-backup.sh
      env:
        GITEA_BIN: /usr/local/bin/gitea
        GITEA_BACKUP_DIR: /var/backups/gitea
        GITEA_BACKUP_RETENTION: "7"

155
docs/backup-recovery-runbook.md Normal file

@@ -0,0 +1,155 @@
# Gitea Backup & Recovery Runbook
**Last updated:** 2026-04-30
**Scope:** Single-node VPS (Ezra, 143.198.27.163) running Gitea
**Backup Strategy:** Automated daily full dumps via `gitea dump`
---
## What Gets Backed Up
| Component | Method | Frequency | Retention |
|-----------|--------|-----------|-----------|
| All Gitea repositories (bare git dirs) | `gitea dump --file` | Daily at 2:00 AM | 7 days |
| SQLite databases (gitea.db, indexer.db, etc.) | Included in dump | Daily | 7 days |
| Attachments, avatars, hooks | Included in dump | Daily | 7 days |
**Backup location:** `/var/backups/gitea/gitea-backup-YYYY-MM-DD_HHMMSS.tar.gz`
**Log file:** `/var/log/gitea-backup.log`
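A quick way to confirm backups are actually landing, using the paths above (a minimal check, not part of the automated job):
```bash
# Newest archive should be from the last 24 hours
ls -lht /var/backups/gitea/ | head -5
# The last run should end with "=== Backup completed successfully ==="
tail -20 /var/log/gitea-backup.log
```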
---
## Backup Architecture
The backup script `bin/gitea-backup.sh` runs daily via Hermes cron (`cron/vps/gitea-daily-backup.yml`). It:
1. Locates the Gitea `WORK_PATH` by reading `/etc/gitea/app.ini` or falling back to common locations (`/var/lib/gitea`, `/home/git/gitea`)
2. Invokes `gitea dump --work-path <path> --file <backup-tar.gz>` — Gitea's native, consistent snapshot mechanism
3. Prunes archives older than 7 days
4. Logs all operations to `/var/log/gitea-backup.log`
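The same script can be run by hand for an ad-hoc backup outside the cron schedule (a minimal sketch, assuming the repo's copy of the script and a user that can read the Gitea data directory):
```bash
# One-off manual backup using the same script the cron job calls
sudo GITEA_BACKUP_DIR=/var/backups/gitea bash bin/gitea-backup.sh
```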
**Prerequisites on the VPS:**
- Gitea binary available at `/usr/local/bin/gitea` (or set `GITEA_BIN` env var)
- `gitea dump` command must be available (Gitea ≥ 1.12)
- SSH access to the VPS for manual recovery operations
- Sufficient disk space in `/var/backups/gitea` (typical dump: ~2–10 GB depending on repo count/size)
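A quick pre-flight check for that last prerequisite (a sketch; adjust the paths to the actual mount layout):
```bash
# Free space where backups land and where the live Gitea data sits
df -h /var/backups /var/lib/gitea
```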
---
## Recovery Time Objective (RTO) & Recovery Point Objective (RPO)
| Metric | Estimate |
|--------|----------|
| **RPO** (data loss window) | ≤ 24 hours (last daily backup) |
| **RTO** (time to restore) | **~45 minutes** (cold restore from backup tarball) |
| **Downtime impact** | Gitea offline during restore (~20 min) |
---
## Step-by-Step Recovery Procedure
### Phase 1 — Assess & Prepare (5 min)
1. SSH into Ezra VPS: `ssh root@143.198.27.163`
2. Stop Gitea so files are quiescent:
```bash
systemctl stop gitea
```
3. Confirm current Gitea data directory (for reference):
```bash
# WORK_PATH in app.ini points at the live data directory
grep '^[[:space:]]*WORK_PATH' /etc/gitea/app.ini
```
### Phase 2 — Restore from Backup (20 min)
4. Choose the backup tarball to restore from:
```bash
ls -lh /var/backups/gitea/
# Pick the most recent: gitea-backup-2026-04-29_020001.tar.gz
```
5. **Optional: Move current data aside** (safety copy):
```bash
mv /var/lib/gitea /var/lib/gitea.bak-$(date +%s)
```
6. Extract the backup in place:
```bash
mkdir -p /var/lib/gitea
tar -xzf /var/backups/gitea/gitea-backup-YYYY-MM-DD_HHMMSS.tar.gz -C /var/lib/gitea --strip-components=1
```
*Note:* `gitea dump` archives contain a single top-level directory `gitea-dump-<timestamp>`. The `--strip-components=1` puts its contents directly into `/var/lib/gitea`.
7. Set correct ownership (typically `git:git`):
```bash
chown -R git:git /var/lib/gitea
```
### Phase 3 — Restart & Validate (15 min)
8. Start Gitea:
```bash
systemctl start gitea
```
9. Wait 30 seconds, then verify:
```bash
systemctl status gitea
# Check HTTP endpoint
curl -s -o /dev/null -w '%{http_code}' http://localhost:3000/ # Should be 200
```
10. Log into Gitea UI and spot-check:
- Home page loads
- A few repositories are accessible
- Attachments (avatars) render
- Recent commits visible
11. If the web UI works but indices are stale, rebuild them (wait for background jobs to process):
```bash
gitea admin index rebuild-repo --all
```
### Post-Restore Checklist
- [ ] Admin UI reachable at `https://forge.alexanderwhitestone.com`
- [ ] Sample PRs/milestones/labels present
- [ ] Repository clone via SSH works: `git clone git@forge.alexanderwhitestone.com:Timmy_Foundation/timmy-config.git`
- [ ] Check backup script health: `tail -20 /var/log/gitea-backup.log`
- [ ] Re-enable any disabled integrations (webhooks, CI/CD runners)
- [ ] Notify the fleet: post to relevant channels confirming operational status
---
## Known Issues & Workarounds
| Symptom | Likely cause | Fix |
|---------|--------------|-----|
| `gitea: command not found` | Binary at non-standard path | Set `GITEA_BIN=/path/to/gitea` in cron env |
| `Permission denied` on backup dir | Cron user lacks write access to `/var/backups` | `mkdir -p /var/backups/gitea` and `chown` it to the user the cron job runs as |
| Restore fails: `"database or disk is full"` | Insufficient space on `/var/lib/gitea` | Expand disk or clean up old data first; backups require ~1.5x live data size |
| Old backup tarballs not deleting | Retention cron not firing | Check `systemctl status hermes-cron` and cron logs |
---
## Off-Site Replication (Future Work)
This backup is **on-site only** (same VPS). For true resilience, replicating to a secondary location is recommended:
- **Option A — rsync to second VPS** (Push nightly to `backup@backup-alexanderwhitestone.com:/backups/gitea/`)
- **Option B — S3-compatible bucket** with lifecycle policy
- **Option C — GitHub mirror of each repo** using `git push --mirror` (already considered as part of the broader #481 work)
Current scope: single-VPS backup only (single point of failure mitigated but not eliminated).
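If Option A is pursued, the nightly push could be as small as the sketch below; the destination host and path are the placeholders named in the option, not an existing target:
```bash
# Mirror the local archive directory to the secondary host; --delete propagates the 7-day pruning
rsync -az --delete /var/backups/gitea/ backup@backup-alexanderwhitestone.com:/backups/gitea/
```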
---
## Related Documentation
- `bin/gitea-backup.sh` — backup script source
- `cron/vps/gitea-daily-backup.yml` — Hermes cron definition
- Gitea official docs: <https://docs.gitea.com/administration/backup-and-restore>
- Hermes cron: <https://hermes-agent.nousresearch.com/docs>

25
tests/test_disassembly_audit.py Executable file

@@ -0,0 +1,25 @@
#!/usr/bin/env python3
"""Minimal smoke test for disassembly_audit.py (#335)"""
import sys
from pathlib import Path
import json
import subprocess
script_path = Path(__file__).resolve().parent.parent / "bin" / "disassembly_audit.py"
result = subprocess.run(
[sys.executable, str(script_path), "--json"],
capture_output=True, text=True, timeout=20
)
if result.returncode != 0:
    print(f"Script error: {result.stderr[:500]}")
    sys.exit(1)
data = json.loads(result.stdout)
assert "zombies" in data
assert "unused_services" in data
assert "dead_loops_cron" in data
assert "dead_loops_hermes" in data
assert "cost_savings" in data
print("SMOKE TEST: disassembly_audit.py generates valid report structure")