Compare commits
2 Commits
step35/595
...
step35/335
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
03f4f8fbad | ||
|
|
874ce137b0 |
477
bin/disassembly_audit.py
Normal file
477
bin/disassembly_audit.py
Normal file
@@ -0,0 +1,477 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Disassembly Audit — Know what to tear down (#335)
|
||||||
|
|
||||||
|
Monthly audit that identifies waste: zombie processes, unused services, dead loops.
|
||||||
|
Recommends: keep, kill, or migrate. Optionally executes clean disassembly.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 bin/disassembly_audit.py # Dry-run audit report
|
||||||
|
python3 bin/disassembly_audit.py --execute # Perform safe disassembly
|
||||||
|
python3 bin/disassembly_audit.py --json # JSON output
|
||||||
|
python3 bin/disassembly_audit.py --output report.md
|
||||||
|
|
||||||
|
References:
|
||||||
|
- Issue: https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config/issues/335
|
||||||
|
- Epic: #328 (Universal Paperclips — Disassembly Sequence)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import signal
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
# === CONFIG ===
# Thresholds for classifying waste.
# NOTE(review): ZOMBIE_THRESHOLD_DAYS, UNUSED_SERVICE_DAYS and
# CRON_ERROR_THRESHOLD_HOURS are not referenced anywhere in this file's
# visible code — confirm whether downstream consumers rely on them.
ZOMBIE_THRESHOLD_DAYS = 7        # days before a zombie counts as long-lived
UNUSED_SERVICE_DAYS = 30         # days of inactivity before a service counts as unused
CRON_ERROR_THRESHOLD_HOURS = 48  # hours of continuous cron errors => systemic failure
HERMES_STALE_AGE_HOURS = 24      # hermes sessions idle longer than this are stale
# Rough unit prices used by calculate_cost_savings() for the monthly estimate.
COST_PER_GB_MONTH = 10           # $ per GB-month of reclaimed memory
COST_PER_VCPU_MONTH = 5          # $ per vCPU-month equivalent
|
||||||
|
|
||||||
|
|
||||||
|
def get_zombie_processes() -> List[Dict[str, Any]]:
    """Identify zombie (defunct) processes via ``ps aux``.

    Returns:
        A list of finding dicts (pid, user, command, state, age fields,
        ``category``/``recommendation``/``reason``/``resources_freed_mb``)
        suitable for the audit report. The scan is best-effort: any failure
        is reported to stderr and yields an empty list.
    """
    try:
        result = subprocess.run(
            ["ps", "aux"],
            capture_output=True, text=True, timeout=10
        )
        zombies = []
        for line in result.stdout.split('\n'):
            if not line.strip() or 'grep' in line:
                continue
            # ps aux columns: USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
            # maxsplit=10 keeps the full command (with spaces) in parts[10].
            parts = line.split(None, 10)
            if len(parts) < 11:
                continue
            stat = parts[7]
            if 'Z' in stat:
                pid = int(parts[1])
                cmd = parts[10][:120]
                age_sec = None
                try:
                    age_result = subprocess.run(
                        ["ps", "-o", "etimes=", "-p", str(pid)],
                        capture_output=True, text=True, timeout=5
                    )
                    if age_result.returncode == 0 and age_result.stdout.strip():
                        age_sec = int(age_result.stdout.strip())
                except Exception:
                    pass  # age is optional; still report the zombie
                zombies.append({
                    "pid": pid,
                    "user": parts[0],
                    "command": cmd,
                    "state": stat,
                    "age_seconds": age_sec,
                    # BUG FIX: use `is not None` so an age of exactly 0 seconds
                    # reports 0.0 instead of None (0 is falsy).
                    "age_hours": round(age_sec / 3600, 1) if age_sec is not None else None,
                    "category": "zombie_process",
                    "recommendation": "kill",
                    # FIX: the old text ("cannot be wait()-ed") had the semantics
                    # backwards — a zombie is precisely a process *waiting* to be
                    # reaped by its parent's wait().
                    "reason": "Zombie process: parent has not reaped it; holds a PID slot",
                    "resources_freed_mb": estimate_process_memory_mb(pid),
                })
        return zombies
    except Exception as e:
        print(f"Warning: zombie scan failed: {e}", file=sys.stderr)
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def estimate_process_memory_mb(pid: int) -> float:
    """Best-effort resident-set size of *pid* in MB; 0.0 when unknown."""
    query = ["ps", "-p", str(pid), "-o", "rss="]
    try:
        proc = subprocess.run(query, capture_output=True, text=True, timeout=5)
        output = proc.stdout.strip()
        if proc.returncode == 0 and output:
            # ps reports RSS in KiB; convert to MB, one decimal place.
            return round(int(output) / 1024, 1)
    except Exception:
        pass
    return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def get_systemd_services() -> List[Dict[str, Any]]:
    """Identify enabled-but-inactive systemd services (potential waste).

    BUG FIX: "enabled" is a unit-*file* state, not a runtime unit state, so
    the previous ``systemctl list-units --state=enabled`` matched nothing and
    this scan silently returned [] on every host. We now enumerate enabled
    unit files via ``list-unit-files`` and then probe each one's runtime
    state with ``systemctl is-active``.

    Returns:
        Finding dicts for each enabled service that is currently inactive;
        empty list on non-systemd hosts or on scan failure.
    """
    try:
        result = subprocess.run(
            ["systemctl", "list-unit-files", "--type=service",
             "--state=enabled", "--output=json"],
            capture_output=True, text=True, timeout=10
        )
        try:
            data = json.loads(result.stdout)
        except json.JSONDecodeError:
            # Older systemd without JSON output, or empty result.
            return []

        services = []
        for svc in data:
            # list-unit-files rows use the "unit_file" key
            # (list-units rows use "unit").
            name = svc.get("unit_file", "")
            if not name:
                continue
            try:
                active_result = subprocess.run(
                    ["systemctl", "is-active", name],
                    capture_output=True, text=True, timeout=5
                )
                if active_result.stdout.strip() == "inactive":
                    services.append({
                        "service": name,
                        # Description is not part of list-unit-files output.
                        "description": "",
                        "state": "enabled+inactive",
                        "category": "unused_service",
                        "recommendation": "disable",
                        "reason": "Service is enabled but not running",
                        "resources_freed_mb": 0,
                    })
            except Exception:
                continue

        return services
    except FileNotFoundError:
        # Not a systemd host.
        return []
    except Exception as e:
        print(f"Warning: systemd scan failed: {e}", file=sys.stderr)
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def get_cron_jobs_audit() -> Dict[str, Any]:
    """Audit Hermes cron jobs by reusing ``scripts/cron_audit_662`` when present.

    Returns:
        Counts plus actionable job lists: systemic failures become
        ``dead_loop`` findings recommended for disable, transient errors are
        flagged for monitoring. When the helper module is unavailable, a stub
        dict with an ``"error"`` key is returned so callers can proceed.
    """
    try:
        # Make the sibling scripts/ directory importable — idempotently, so
        # repeated calls don't keep growing sys.path (old code inserted
        # unconditionally on every invocation).
        scripts_dir = str(Path(__file__).resolve().parent.parent / "scripts")
        if scripts_dir not in sys.path:
            sys.path.insert(0, scripts_dir)
        # CLEANUP: parse_timestamp was imported but never used; the
        # `categorized` list (with its `_job` back-references) was dead
        # bookkeeping — both removed.
        from cron_audit_662 import load_cron_state, categorize_job

        jobs = load_cron_state()
        now = datetime.now(timezone.utc)
        systemic = []
        transient = []
        healthy = []

        for job in jobs:
            cat = categorize_job(job, now)
            if cat["category"] == "systemic":
                systemic.append(cat)
            elif cat["category"] == "transient":
                transient.append(cat)
            else:
                healthy.append(cat)

        return {
            "total": len(jobs),
            "healthy": len(healthy),
            "transient": len(transient),
            "systemic": len(systemic),
            "systemic_jobs": [
                {
                    "id": c["id"],
                    "name": c["name"],
                    "reason": c["reason"],
                    "last_error": c.get("last_error", ""),
                    "recommendation": "disable",
                    "category": "dead_loop",
                }
                for c in systemic
            ],
            "transient_jobs": [
                {"id": c["id"], "name": c["name"], "reason": c["reason"], "recommendation": "monitor"}
                for c in transient
            ],
        }
    except ImportError:
        return {"error": "cron_audit_662 not available", "total": 0, "systemic_jobs": []}
|
||||||
|
|
||||||
|
|
||||||
|
def get_hermes_stale_sessions() -> List[Dict[str, Any]]:
    """Detect stale hermes agent sessions.

    Scans ``ps aux`` for processes whose command line mentions "hermes",
    collects per-process CPU/RSS and direct children (via ``pgrep -P``),
    then flags sessions older than HERMES_STALE_AGE_HOURS with CPU below
    0.5% as stale "dead_loop" findings recommended for kill.

    Returns:
        A list of finding dicts; empty on scan failure (reported to stderr).
    """
    try:
        result = subprocess.run(
            ["ps", "aux"],
            capture_output=True, text=True, timeout=10
        )
        sessions = {}
        for line in result.stdout.split('\n'):
            # Case-insensitive match on the whole ps line; skip grep artifacts.
            if 'hermes' not in line.lower() or 'grep' in line:
                continue
            # ps aux columns: USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
            parts = line.split(None, 10)
            if len(parts) < 11:
                continue
            pid = int(parts[1])
            cpu = float(parts[2])
            mem_kb = int(parts[5])  # RSS in KiB
            cmd = parts[10][:100]
            sessions[str(pid)] = {"pid": pid, "cpu": cpu, "rss": mem_kb, "cmd": cmd}
            # Record direct children so their RSS can be counted toward the
            # session's reclaimable memory.
            try:
                child_out = subprocess.run(
                    ["pgrep", "-P", str(pid)],
                    capture_output=True, text=True, timeout=5
                )
                children = []
                for cp in child_out.stdout.strip().split('\n'):
                    if cp.strip():
                        children.append(int(cp.strip()))
                sessions[str(pid)]["children"] = children
            except Exception:
                sessions[str(pid)]["children"] = []

        stale = []
        for key, proc_info in sessions.items():
            pid = proc_info["pid"]
            # Elapsed time in seconds since the process started.
            try:
                age_res = subprocess.run(
                    ["ps", "-o", "etimes=", "-p", str(pid)],
                    capture_output=True, text=True, timeout=5
                )
                if age_res.returncode == 0 and age_res.stdout.strip():
                    age_sec = int(age_res.stdout.strip())
                    age_hours = age_sec / 3600
                else:
                    # Process vanished between the two ps calls; skip it.
                    continue
            except Exception:
                continue

            # Stale = old AND essentially idle (CPU < 0.5%).
            if age_hours > HERMES_STALE_AGE_HOURS and proc_info["cpu"] < 0.5:
                total_rss = proc_info["rss"] + sum(
                    _get_rss_kb(child) for child in proc_info.get("children", [])
                )
                stale.append({
                    "pid": pid,
                    "age_hours": round(age_hours, 1),
                    "cpu_percent": proc_info["cpu"],
                    "total_rss_mb": round(total_rss / 1024, 1),
                    "process_count": 1 + len(proc_info.get("children", [])),
                    "command": proc_info["cmd"],
                    "category": "dead_loop",
                    "recommendation": "kill",
                    "reason": f"Hermes session idle for {age_hours:.1f}h",
                    "resources_freed_mb": round(total_rss / 1024, 1),
                })
        return stale
    except Exception as e:
        print(f"Warning: hermes scan failed: {e}", file=sys.stderr)
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def _get_rss_kb(pid: int) -> int:
|
||||||
|
try:
|
||||||
|
r = subprocess.run(["ps", "-p", str(pid), "-o", "rss="], capture_output=True, text=True, timeout=3)
|
||||||
|
if r.returncode == 0 and r.stdout.strip():
|
||||||
|
return int(r.stdout.strip())
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_cost_savings(findings: List[Dict[str, Any]]) -> Dict[str, float]:
    """Translate audit findings into a rough monthly cost-savings estimate.

    Memory is priced at COST_PER_GB_MONTH per reclaimed GB; each process in
    a finding counts as 0.1 vCPU-equivalent priced at COST_PER_VCPU_MONTH.
    Findings without a process_count default to one process each.
    """
    freed_mb = 0.0
    procs = 0
    for finding in findings:
        freed_mb += finding.get("resources_freed_mb", 0)
        procs += finding.get("process_count", 1)

    freed_gb = freed_mb / 1024
    vcpus = procs * 0.1
    mem_cost = freed_gb * COST_PER_GB_MONTH
    cpu_cost = vcpus * COST_PER_VCPU_MONTH

    return {
        "memory_gb_reclaimed": round(freed_gb, 2),
        "memory_cost_saved_monthly": round(mem_cost, 2),
        "cpu_equiv_vcpus": round(vcpus, 1),
        "cpu_cost_saved_monthly": round(cpu_cost, 2),
        "total_cost_saved_monthly": round(mem_cost + cpu_cost, 2),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def generate_markdown_report(audit: Dict[str, Any]) -> str:
    """Render the audit dict as a human-readable Markdown report.

    Args:
        audit: mapping with list-valued keys "zombies", "unused_services",
            "dead_loops_cron", "dead_loops_hermes" plus a "cost_savings" dict.

    Returns:
        The complete report as one newline-joined string.
    """
    lines = []
    lines.append("# Disassembly Audit Report")
    lines.append(f"**Generated:** {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
    # (stray f-prefix removed — the string has no placeholders)
    lines.append("**Issue:** [#335](https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config/issues/335)")
    lines.append("")

    total_findings = sum(len(audit.get(k, [])) for k in ["zombies", "unused_services", "dead_loops_cron", "dead_loops_hermes"])
    lines.append("## Summary")
    lines.append("")
    lines.append("| Category | Count | Action |")
    lines.append("|----------|-------|--------|")
    lines.append(f"| Zombie Processes | {len(audit.get('zombies', []))} | Kill |")
    lines.append(f"| Unused Services | {len(audit.get('unused_services', []))} | Disable |")
    lines.append(f"| Dead Loops (Cron) | {len(audit.get('dead_loops_cron', []))} | Disable |")
    lines.append(f"| Dead Loops (Hermes) | {len(audit.get('dead_loops_hermes', []))} | Kill |")
    lines.append(f"| **Total Waste** | **{total_findings}** | **Disassemble** |")
    lines.append("")

    costs = audit.get("cost_savings", {})
    if costs and costs.get("total_cost_saved_monthly", 0) > 0:
        lines.append("## Cost Impact (Monthly)")
        lines.append("")
        lines.append(f"- Memory reclaimed: **{costs.get('memory_gb_reclaimed', 0):.2f} GB**")
        lines.append(f"- CPU equivalent: **{costs.get('cpu_equiv_vcpus', 0):.1f} vCPUs**")
        lines.append(f"- **Estimated monthly savings: ${costs.get('total_cost_saved_monthly', 0):.2f}**")
        lines.append("")

    # BUG FIX: the old heuristic (lower-case + spaces->underscores) turned
    # headers like "Age (h)" into keys like "age_(h)" that never matched any
    # finding dict, so most table cells rendered as "—". Map each display
    # header to the actual dict key explicitly; fall back to the heuristic
    # for any header not listed.
    header_keys = {
        "PID": "pid",
        "User": "user",
        "Command": "command",
        "Age (h)": "age_hours",
        "Memory (MB)": "resources_freed_mb",
        "Action": "recommendation",
        "Service": "service",
        "Description": "description",
        "ID": "id",
        "Name": "name",
        "Reason": "reason",
        "CPU %": "cpu_percent",
        "Procs": "process_count",
    }

    def add_table(title, items, headers):
        # Append one Markdown table section (skipped entirely when empty).
        if not items:
            return
        lines.append(f"## {title}")
        lines.append("")
        header_row = "| " + " | ".join(headers) + " |"
        lines.append(header_row)
        lines.append("|" + "|".join("---" for _ in headers) + "|")
        for item in items:
            row = []
            for h in headers:
                key = header_keys.get(h, h.lower().replace(" ", "_"))
                val = item.get(key, "—")
                if isinstance(val, float):
                    val = f"{val:.1f}"
                row.append(str(val))
            lines.append("| " + " | ".join(row) + " |")
        lines.append("")

    add_table(
        "Zombie Processes",
        audit.get("zombies", []),
        ["PID", "User", "Command", "Age (h)", "Memory (MB)", "Action"]
    )
    add_table(
        "Unused Services (enabled but inactive)",
        audit.get("unused_services", []),
        ["Service", "Description", "Action"]
    )
    add_table(
        "Dead Loops — Cron Systemic Failures",
        audit.get("dead_loops_cron", []),
        ["ID", "Name", "Reason", "Action"]
    )
    add_table(
        "Dead Loops — Stale Hermes Sessions",
        audit.get("dead_loops_hermes", []),
        ["PID", "Age (h)", "CPU %", "Memory (MB)", "Procs", "Action"]
    )

    lines.append("## Recommended Actions")
    lines.append("")
    lines.append("### Kill immediately")
    lines.append("- All zombie processes (unreclaimable)")
    lines.append("- Stale hermes sessions (idle >24h)")
    lines.append("")
    lines.append("### Disable")
    lines.append("- Systemd services enabled but inactive")
    lines.append("- Cron jobs erroring >48h (systemic failures)")
    lines.append("")
    lines.append("### Monitor")
    lines.append("- Cron jobs with transient errors (network, rate limit)")
    lines.append("")
    lines.append("## Disassembly Execution")
    lines.append("")
    lines.append("To execute these disassembly actions safely, run:")
    lines.append("```")
    lines.append(" python3 bin/disassembly_audit.py --execute")
    lines.append("```")
    lines.append("")
    lines.append("This will perform safe termination, disable services, pause cron jobs.")
    lines.append("")
    lines.append("---")
    lines.append("*Generated by Disassembly Audit — #335*")
    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def execute_disassembly(audit: Dict[str, Any]) -> Dict[str, Any]:
    """Execute the audit's recommended actions (DESTRUCTIVE).

    Kills stale hermes sessions, nudges parents of zombies to reap them,
    disables unused systemd services, and pauses systemic cron jobs.

    Returns:
        Dict with "killed", "disabled" and "errors" lists summarizing what
        was done; individual failures are recorded, never raised.
    """
    executed = {"killed": [], "disabled": [], "errors": []}

    for z in audit.get("zombies", []):
        pid = z["pid"]
        try:
            # BUG FIX: a zombie has already exited and ignores SIGTERM — the
            # old code's os.kill(pid, SIGTERM) was a no-op. The only way to
            # clear a zombie is for its parent to wait() on it, so we look up
            # the parent and nudge it with SIGCHLD instead.
            ppid_out = subprocess.run(
                ["ps", "-o", "ppid=", "-p", str(pid)],
                capture_output=True, text=True, timeout=5
            )
            ppid = int(ppid_out.stdout.strip())
            if ppid > 1:
                os.kill(ppid, signal.SIGCHLD)
                executed["killed"].append({"pid": pid, "type": "zombie", "signaled_parent": ppid})
            else:
                # PID 1 reaps orphans automatically; nothing useful to do.
                executed["errors"].append(f"PID {pid}: parent is PID {ppid}; cannot reap manually")
        except Exception as e:
            executed["errors"].append(f"PID {pid}: {e}")

    for h in audit.get("dead_loops_hermes", []):
        pid = h["pid"]
        try:
            os.kill(pid, signal.SIGTERM)
            executed["killed"].append({"pid": pid, "type": "hermes_stale"})
        except Exception as e:
            executed["errors"].append(f"PID {pid}: {e}")

    for s in audit.get("unused_services", []):
        name = s["service"]
        try:
            subprocess.run(["systemctl", "disable", "--now", name], check=True, capture_output=True)
            executed["disabled"].append({"service": name, "type": "systemd"})
        except subprocess.CalledProcessError as e:
            executed["errors"].append(f"Service {name}: {e}")

    for c in audit.get("dead_loops_cron", []):
        job_id = c["id"]
        try:
            subprocess.run(["hermes", "cron", "pause", job_id], check=True, capture_output=True)
            executed["disabled"].append({"cron_job": job_id, "type": "hermes_cron"})
        except subprocess.CalledProcessError as e:
            executed["errors"].append(f"Cron {job_id}: {e}")

    return executed
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: gather findings, emit the report, optionally execute.

    Exit codes: 0 on a clean audit (or after --execute), 1 when waste was
    found in dry-run mode.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Disassembly Audit — identify & eliminate waste")
    parser.add_argument("--json", action="store_true", help="JSON output only")
    parser.add_argument("--output", "-o", help="Write report to file")
    parser.add_argument("--execute", action="store_true", help="Execute disassembly actions (DESTRUCTIVE)")
    args = parser.parse_args()

    # Run each scanner; all are best-effort and may return empty lists.
    zombies = get_zombie_processes()
    unused_services = get_systemd_services()
    cron_audit = get_cron_jobs_audit()
    hermes_stale = get_hermes_stale_sessions()

    dead_loops_cron = cron_audit.get("systemic_jobs", [])
    dead_loops_hermes = hermes_stale

    all_findings = zombies + unused_services + dead_loops_cron + dead_loops_hermes
    cost_savings = calculate_cost_savings(all_findings)

    audit = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "zombies": zombies,
        "unused_services": unused_services,
        "dead_loops_cron": dead_loops_cron,
        "dead_loops_hermes": dead_loops_hermes,
        "cost_savings": cost_savings,
    }

    # NOTE(review): --json exits here, so `--json --execute` never performs
    # disassembly — confirm this short-circuit is intentional.
    if args.json:
        print(json.dumps(audit, indent=2))
        sys.exit(0)

    report = generate_markdown_report(audit)
    if args.output:
        Path(args.output).write_text(report)
        print(f"Report written to {args.output}")
    else:
        print(report)

    if args.execute:
        print("\n" + "="*50)
        print("EXECUTING DISASSEMBLY ACTIONS...")
        result = execute_disassembly(audit)
        print(f"Killed: {len(result['killed'])} processes")
        print(f"Disabled: {len(result['disabled'])} services/cron jobs")
        if result["errors"]:
            print(f"Errors: {len(result['errors'])}")
            for e in result["errors"][:10]:
                print(f" - {e}")

    # Dry-run exit status: 1 signals "waste found" to callers (e.g. cron).
    # NOTE(review): hermes stale sessions are excluded from this count —
    # confirm that is deliberate.
    total_critical = len(zombies) + len(unused_services) + len(dead_loops_cron)
    if total_critical > 0 and not args.execute:
        print(f"\n⚠️ {total_critical} waste items identified. Run with --execute to disassemble.")
        sys.exit(1)

    sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the audit when executed directly.
    main()
|
||||||
87
bin/gitea-backup.sh
Normal file
87
bin/gitea-backup.sh
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
#!/bin/bash
# Gitea Daily Backup Script
# Uses Gitea's native dump command to create automated backups of repositories and SQLite databases.
# Designed to run on the VPS (Ezra) as part of a daily cron job.
#
# Configuration via environment variables:
#   GITEA_BIN              Path to gitea binary (default: auto-detect)
#   GITEA_BACKUP_DIR       Directory for backup archives (default: /var/backups/gitea)
#   GITEA_BACKUP_RETENTION Days to retain backups (default: 7)
#   GITEA_BACKUP_LOG       Log file path (default: /var/log/gitea-backup.log)

set -euo pipefail

GITEA_BIN="${GITEA_BIN:-$(command -v gitea 2>/dev/null || echo "/usr/local/bin/gitea")}"
BACKUP_DIR="${GITEA_BACKUP_DIR:-/var/backups/gitea}"
RETENTION_DAYS="${GITEA_BACKUP_RETENTION:-7}"
DATE="$(date +%Y-%m-%d_%H%M%S)"
BACKUP_FILE="${BACKUP_DIR}/gitea-backup-${DATE}.tar.gz"
LOG_FILE="${GITEA_BACKUP_LOG:-/var/log/gitea-backup.log}"

mkdir -p "${BACKUP_DIR}"

# Log to both stdout (for cron capture) and the log file.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "${LOG_FILE}"
}

log "=== Starting Gitea daily backup ==="

# Verify gitea binary exists
if [ ! -x "${GITEA_BIN}" ]; then
    log "ERROR: Gitea binary not found at ${GITEA_BIN}"
    log "Set GITEA_BIN environment variable to the gitea binary path (e.g., /usr/bin/gitea)"
    exit 1
fi

# Detect Gitea WORK_PATH from the first app.ini found in common locations.
WORK_PATH=""
APP_INI=""
for path in /etc/gitea/app.ini /home/git/gitea/custom/conf/app.ini ~/gitea/custom/conf/app.ini; do
    if [ -f "$path" ]; then
        APP_INI="$path"
        break
    fi
done

if [ -n "$APP_INI" ]; then
    # Parse [app] WORK_PATH = /var/lib/gitea
    WORK_PATH=$(sed -n 's/^[[:space:]]*WORK_PATH[[:space:]]*=[[:space:]]*//p' "$APP_INI" | head -1)
    log "Detected WORK_PATH from app.ini: ${WORK_PATH}"
fi

# Fallback detection: first existing well-known data directory.
if [ -z "$WORK_PATH" ]; then
    for d in /var/lib/gitea /home/git/gitea /srv/gitea /opt/gitea; do
        if [ -d "$d" ]; then
            WORK_PATH="$d"
            break
        fi
    done
    log "Inferred WORK_PATH: ${WORK_PATH:-not found}"
fi

if [ -z "$WORK_PATH" ]; then
    log "ERROR: Could not determine Gitea WORK_PATH. Set GITEA_WORK_PATH manually."
    exit 1
fi

# Perform gitea dump
# Flags: --work-path sets the Gitea working directory, --file writes dump to tar.gz
log "Running: gitea dump --work-path ${WORK_PATH} --file ${BACKUP_FILE}"
# BUG FIX: under `set -e`, a failing dump aborted the script before the old
# `if [ $? -ne 0 ]` post-check could ever run (the error branch was dead
# code). Testing the command directly keeps `set -e` from firing and lets
# us log the failure before exiting.
if ! "${GITEA_BIN}" dump --work-path "${WORK_PATH}" --file "${BACKUP_FILE}" 2>>"${LOG_FILE}"; then
    log "ERROR: gitea dump failed — check ${LOG_FILE} for details"
    exit 1
fi

FILE_SIZE=$(du -h "${BACKUP_FILE}" | cut -f1)
log "Backup created: ${BACKUP_FILE} (${FILE_SIZE})"

# Prune old backups (keep last N days)
find "${BACKUP_DIR}" -name "gitea-backup-*.tar.gz" -type f -mtime +$((${RETENTION_DAYS}-1)) -delete 2>/dev/null || true
log "Pruned backups older than ${RETENTION_DAYS} days"

log "=== Backup completed successfully ==="

exit 0
|
||||||
9
cron/vps/gitea-daily-backup.yml
Normal file
9
cron/vps/gitea-daily-backup.yml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# Hermes cron definition: daily Gitea backup.
# Runs bin/gitea-backup.sh (deployed to ~/.hermes/bin/) on the VPS.
# See docs/backup-recovery-runbook.md for the restore procedure.
- name: Daily Gitea Backup
  schedule: '0 2 * * *' # 2:00 AM daily
  tasks:
    - name: Run Gitea daily backup
      shell: bash ~/.hermes/bin/gitea-backup.sh
      env:
        # Same knobs the script reads; see the header of gitea-backup.sh.
        GITEA_BIN: /usr/local/bin/gitea
        GITEA_BACKUP_DIR: /var/backups/gitea
        GITEA_BACKUP_RETENTION: "7"
|
||||||
155
docs/backup-recovery-runbook.md
Normal file
155
docs/backup-recovery-runbook.md
Normal file
@@ -0,0 +1,155 @@
|
|||||||
|
# Gitea Backup & Recovery Runbook
|
||||||
|
|
||||||
|
**Last updated:** 2026-04-30
|
||||||
|
**Scope:** Single-node VPS (Ezra, 143.198.27.163) running Gitea
|
||||||
|
**Backup Strategy:** Automated daily full dumps via `gitea dump`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What Gets Backed Up
|
||||||
|
|
||||||
|
| Component | Method | Frequency | Retention |
|
||||||
|
|-----------|--------|-----------|-----------|
|
||||||
|
| All Gitea repositories (bare git dirs) | `gitea dump --file` | Daily at 2:00 AM | 7 days |
|
||||||
|
| SQLite databases (gitea.db, indexer.db, etc.) | Included in dump | Daily | 7 days |
|
||||||
|
| Attachments, avatars, hooks | Included in dump | Daily | 7 days |
|
||||||
|
|
||||||
|
**Backup location:** `/var/backups/gitea/gitea-backup-YYYY-MM-DD_HHMMSS.tar.gz`
|
||||||
|
|
||||||
|
**Log file:** `/var/log/gitea-backup.log`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Backup Architecture
|
||||||
|
|
||||||
|
The backup script `bin/gitea-backup.sh` runs daily via Hermes cron (`cron/vps/gitea-daily-backup.yml`). It:
|
||||||
|
|
||||||
|
1. Locates the Gitea `WORK_PATH` by reading `/etc/gitea/app.ini` or falling back to common locations (`/var/lib/gitea`, `/home/git/gitea`)
|
||||||
|
2. Invokes `gitea dump --work-path <path> --file <backup-tar.gz>` — Gitea's native, consistent snapshot mechanism
|
||||||
|
3. Prunes archives older than 7 days
|
||||||
|
4. Logs all operations to `/var/log/gitea-backup.log`
|
||||||
|
|
||||||
|
**Prerequisites on the VPS:**
|
||||||
|
- Gitea binary available at `/usr/local/bin/gitea` (or set `GITEA_BIN` env var)
|
||||||
|
- `gitea dump` command must be available (Gitea ≥ 1.12)
|
||||||
|
- SSH access to the VPS for manual recovery operations
|
||||||
|
- Sufficient disk space in `/var/backups/gitea` (typical dump: ~2–10 GB depending on repo count/size)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recovery Time Objective (RTO) & Recovery Point Objective (RPO)
|
||||||
|
|
||||||
|
| Metric | Estimate |
|
||||||
|
|--------|----------|
|
||||||
|
| **RPO** (data loss window) | ≤ 24 hours (last daily backup) |
|
||||||
|
| **RTO** (time to restore) | **~45 minutes** (cold restore from backup tarball) |
|
||||||
|
| **Downtime impact** | Gitea offline during restore (~20 min) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step-by-Step Recovery Procedure
|
||||||
|
|
||||||
|
### Phase 1 — Assess & Prepare (5 min)
|
||||||
|
|
||||||
|
1. SSH into Ezra VPS: `ssh root@143.198.27.163`
|
||||||
|
2. Stop Gitea so files are quiescent:
|
||||||
|
```bash
|
||||||
|
systemctl stop gitea
|
||||||
|
```
|
||||||
|
3. Confirm current Gitea data directory (for reference):
|
||||||
|
```bash
|
||||||
|
ls -ld /var/lib/gitea   # verify the data directory exists and note its owner
|
||||||
|
# Or check app.ini for WORK_PATH
|
||||||
|
grep -E '^[[:space:]]*WORK_PATH' /etc/gitea/app.ini
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 2 — Restore from Backup (20 min)
|
||||||
|
|
||||||
|
4. Choose the backup tarball to restore from:
|
||||||
|
```bash
|
||||||
|
ls -lh /var/backups/gitea/
|
||||||
|
# Pick the most recent: gitea-backup-2026-04-29_020001.tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Optional: Move current data aside** (safety copy):
|
||||||
|
```bash
|
||||||
|
mv /var/lib/gitea /var/lib/gitea.bak-$(date +%s)
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Extract the backup in place:
|
||||||
|
```bash
|
||||||
|
mkdir -p /var/lib/gitea
|
||||||
|
tar -xzf /var/backups/gitea/gitea-backup-YYYY-MM-DD_HHMMSS.tar.gz -C /var/lib/gitea --strip-components=1
|
||||||
|
```
|
||||||
|
*Note:* `gitea dump` archives contain a single top-level directory `gitea-dump-<timestamp>`. The `--strip-components=1` puts its contents directly into `/var/lib/gitea`.
|
||||||
|
|
||||||
|
7. Set correct ownership (typically `git:git`):
|
||||||
|
```bash
|
||||||
|
chown -R git:git /var/lib/gitea
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 3 — Restart & Validate (15 min)
|
||||||
|
|
||||||
|
8. Start Gitea:
|
||||||
|
```bash
|
||||||
|
systemctl start gitea
|
||||||
|
```
|
||||||
|
|
||||||
|
9. Wait 30 seconds, then verify:
|
||||||
|
```bash
|
||||||
|
systemctl status gitea
|
||||||
|
# Check HTTP endpoint
|
||||||
|
curl -s -o /dev/null -w '%{http_code}' http://localhost:3000/ # Should be 200
|
||||||
|
```
|
||||||
|
|
||||||
|
10. Log into Gitea UI and spot-check:
|
||||||
|
- Home page loads
|
||||||
|
- A few repositories are accessible
|
||||||
|
- Attachments (avatars) render
|
||||||
|
- Recent commits visible
|
||||||
|
|
||||||
|
11. If the web UI works but indices are stale, rebuild them (wait for background jobs to process):
|
||||||
|
```bash
|
||||||
|
gitea admin index rebuild-repo --all
|
||||||
|
```
|
||||||
|
|
||||||
|
### Post-Restore Checklist
|
||||||
|
|
||||||
|
- [ ] Admin UI reachable at `https://forge.alexanderwhitestone.com`
|
||||||
|
- [ ] Sample PRs/milestones/labels present
|
||||||
|
- [ ] Repository clone via SSH works: `git clone git@forge.alexanderwhitestone.com:Timmy_Foundation/timmy-config.git`
|
||||||
|
- [ ] Check backup script health: `cat /var/log/gitea-backup.log | tail -20`
|
||||||
|
- [ ] Re-enable any disabled integrations (webhooks, CI/CD runners)
|
||||||
|
- [ ] Notify the fleet: post to relevant channels confirming operational status
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Known Issues & Workarounds
|
||||||
|
|
||||||
|
| Symptom | Likely cause | Fix |
|
||||||
|
|---------|--------------|-----|
|
||||||
|
| `gitea: command not found` | Binary at non-standard path | Set `GITEA_BIN=/path/to/gitea` in cron env |
|
||||||
|
| `Permission denied` on backup dir | Cron user lacks write access to `/var/backups` | `mkdir /var/backups/gitea && chown root:root /var/backups/gitea` |
|
||||||
|
| Restore fails: `"database or disk is full"` | Insufficient space on `/var/lib/gitea` | Expand disk or clean up old data first; backups require ~1.5x live data size |
|
||||||
|
| Old backup tarballs not deleting | Retention cron not firing | Check `systemctl status hermes-cron` and cron logs |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Off-Site Replication (Future Work)
|
||||||
|
|
||||||
|
This backup is **on-site only** (same VPS). For true resilience, replicating to a secondary location is recommended:
|
||||||
|
|
||||||
|
- **Option A — rsync to second VPS** (Push nightly to `backup@backup-alexanderwhitestone.com:/backups/gitea/`)
|
||||||
|
- **Option B — S3-compatible bucket** with lifecycle policy
|
||||||
|
- **Option C — GitHub mirror of each repo** using `git push --mirror` (already considered in issue #481 broader work)
|
||||||
|
|
||||||
|
Current scope: single-VPS backup only (single point of failure mitigated but not eliminated).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Related Documentation
|
||||||
|
|
||||||
|
- `bin/gitea-backup.sh` — backup script source
|
||||||
|
- `cron/vps/gitea-daily-backup.yml` — Hermes cron definition
|
||||||
|
- Gitea official docs: <https://docs.gitea.com/administration/backup-and-restore>
|
||||||
|
- Hermes cron: <https://hermes-agent.nousresearch.com/docs>
|
||||||
25
tests/test_disassembly_audit.py
Executable file
25
tests/test_disassembly_audit.py
Executable file
@@ -0,0 +1,25 @@
|
|||||||
|
#!/usr/bin/env python3
"""Minimal smoke test for disassembly_audit.py (#335)."""

import sys
from pathlib import Path
import json
import subprocess


def test_disassembly_audit_json_structure():
    """Run the audit script with --json and validate the report skeleton.

    FIX: the old version ran everything at module level, so merely importing
    this file (e.g. during pytest collection) spawned the subprocess and
    could call sys.exit(). Wrapping it in a test function makes it a proper
    pytest test while keeping direct execution working (see __main__ below).
    """
    script_path = Path(__file__).resolve().parent.parent / "bin" / "disassembly_audit.py"
    result = subprocess.run(
        [sys.executable, str(script_path), "--json"],
        capture_output=True, text=True, timeout=20
    )
    assert result.returncode == 0, f"Script error: {result.stderr[:500]}"
    data = json.loads(result.stdout)
    # The JSON report must contain every top-level section.
    for key in ("zombies", "unused_services", "dead_loops_cron",
                "dead_loops_hermes", "cost_savings"):
        assert key in data, f"missing report key: {key}"


if __name__ == "__main__":
    # Preserve the original direct-run behavior: non-zero exit on failure,
    # success banner otherwise.
    try:
        test_disassembly_audit_json_structure()
    except AssertionError as exc:
        print(exc)
        sys.exit(1)
    print("SMOKE TEST: disassembly_audit.py generates valid report structure")
|
||||||
Reference in New Issue
Block a user