Compare commits
2 Commits
step35/595
...
step35/445
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ffd2d352c6 | ||
|
|
874ce137b0 |
@@ -24,12 +24,17 @@ import yaml
|
|||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
import sqlite3
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
|
||||||
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
|
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
|
||||||
CONFIG_PATH = HERMES_HOME / "config.yaml"
|
CONFIG_PATH = HERMES_HOME / "config.yaml"
|
||||||
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
|
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
|
||||||
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
|
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
|
||||||
FORGE_URL = "https://forge.alexanderwhitestone.com"
|
FORGE_URL = "https://forge.alexanderwhitestone.com"
|
||||||
|
# Golden-state fallback chain: Kimi → OpenRouter (Gemini 2.5 Pro) → Ollama (gemma4:latest)
|
||||||
|
PROVIDER_TIMEOUT = int(os.getenv("PROVIDER_TIMEOUT", "30"))
|
||||||
|
|
||||||
def load_config():
|
def load_config():
|
||||||
with open(CONFIG_PATH) as f:
|
with open(CONFIG_PATH) as f:
|
||||||
@@ -50,7 +55,7 @@ def save_state(state):
|
|||||||
with open(FALLBACK_STATE, "w") as f:
|
with open(FALLBACK_STATE, "w") as f:
|
||||||
json.dump(state, f, indent=2)
|
json.dump(state, f, indent=2)
|
||||||
|
|
||||||
def run(cmd, timeout=10):
|
def run(cmd, timeout=PROVIDER_TIMEOUT):
|
||||||
try:
|
try:
|
||||||
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
|
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
|
||||||
return r.returncode, r.stdout.strip(), r.stderr.strip()
|
return r.returncode, r.stdout.strip(), r.stderr.strip()
|
||||||
@@ -61,6 +66,23 @@ def run(cmd, timeout=10):
|
|||||||
|
|
||||||
# ─── HEALTH CHECKS ───
|
# ─── HEALTH CHECKS ───
|
||||||
|
|
||||||
|
|
||||||
|
def log_fallback_event(agent_name, provider, model, status, error_message=None):
|
||||||
|
"""Log fallback events to request_log SQLite DB (telemetry)."""
|
||||||
|
try:
|
||||||
|
log_path = Path.home() / ".local" / "timmy" / "request_log.db"
|
||||||
|
if log_path.exists():
|
||||||
|
conn = sqlite3.connect(str(log_path))
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute("""
|
||||||
|
INSERT INTO request_log (timestamp, agent_name, provider, model, endpoint, status, error_message)
|
||||||
|
VALUES (datetime('now'), ?, ?, ?, ?, ?, ?)
|
||||||
|
""", (agent_name, provider, model, 'fallback_switch', status, error_message))
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
except Exception:
|
||||||
|
pass # Silent if telemetry unavailable
|
||||||
|
|
||||||
def check_kimi():
|
def check_kimi():
|
||||||
"""Can we reach Kimi Coding API?"""
|
"""Can we reach Kimi Coding API?"""
|
||||||
key = os.environ.get("KIMI_API_KEY", "")
|
key = os.environ.get("KIMI_API_KEY", "")
|
||||||
@@ -89,12 +111,38 @@ def check_kimi():
|
|||||||
return True, f"HTTP {out}"
|
return True, f"HTTP {out}"
|
||||||
return False, f"HTTP {out} err={err[:80]}"
|
return False, f"HTTP {out} err={err[:80]}"
|
||||||
|
|
||||||
def check_local_llama():
|
def check_openrouter():
|
||||||
"""Is local llama.cpp serving?"""
|
"""Check OpenRouter API availability and credentials."""
|
||||||
code, out, err = run("curl -s http://localhost:8081/v1/models", timeout=5)
|
key = os.environ.get("OPENROUTER_API_KEY", "")
|
||||||
if code == 0 and "hermes" in out.lower():
|
if not key:
|
||||||
return True, "serving"
|
env_file = HERMES_HOME / ".env"
|
||||||
return False, f"exit={code}"
|
if env_file.exists():
|
||||||
|
for line in open(env_file):
|
||||||
|
line = line.strip()
|
||||||
|
if line.startswith("OPENROUTER_API_KEY="):
|
||||||
|
key = line.split("=", 1)[1].strip().strip('"\'')
|
||||||
|
break
|
||||||
|
if not key:
|
||||||
|
return False, "No OPENROUTER_API_KEY"
|
||||||
|
try:
|
||||||
|
req = urllib.request.Request(
|
||||||
|
"https://openrouter.ai/api/v1/models",
|
||||||
|
headers={"Authorization": "Bearer " + key}
|
||||||
|
)
|
||||||
|
resp = urllib.request.urlopen(req, timeout=PROVIDER_TIMEOUT)
|
||||||
|
if resp.status == 200:
|
||||||
|
data = json.loads(resp.read())
|
||||||
|
models = data.get("data", [])
|
||||||
|
return True, f"{len(models)} models available"
|
||||||
|
else:
|
||||||
|
return False, f"HTTP {resp.status}"
|
||||||
|
except urllib.error.HTTPError as e:
|
||||||
|
if e.code == 401:
|
||||||
|
return False, "Invalid OPENROUTER_API_KEY"
|
||||||
|
else:
|
||||||
|
return False, f"HTTP {e.code}"
|
||||||
|
except Exception as e:
|
||||||
|
return False, str(e)[:100]
|
||||||
|
|
||||||
def check_ollama():
|
def check_ollama():
|
||||||
"""Is Ollama running?"""
|
"""Is Ollama running?"""
|
||||||
@@ -127,15 +175,18 @@ def check_vps(ip, name):
|
|||||||
|
|
||||||
# ─── FALLBACK ACTIONS ───
|
# ─── FALLBACK ACTIONS ───
|
||||||
|
|
||||||
def fallback_to_local_model(cfg):
|
def fallback_to_openrouter(cfg):
|
||||||
"""Switch primary model from Kimi to local llama.cpp"""
|
"Switch primary model from Kimi to OpenRouter (Gemini 2.5 Pro)"
|
||||||
if not BACKUP_CONFIG.exists():
|
if not BACKUP_CONFIG.exists():
|
||||||
shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
|
shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
|
||||||
|
|
||||||
cfg["model"]["provider"] = "local-llama.cpp"
|
openrouter_cfg = cfg.get("providers", {}).get("openrouter", {})
|
||||||
cfg["model"]["default"] = "hermes3"
|
base_url = openrouter_cfg.get("base_url", "https://openrouter.ai/api/v1")
|
||||||
|
cfg["model"]["provider"] = "openrouter"
|
||||||
|
cfg["model"]["default"] = "google/gemini-2.5-pro"
|
||||||
|
cfg["model"]["base_url"] = base_url
|
||||||
save_config(cfg)
|
save_config(cfg)
|
||||||
return "Switched primary model to local-llama.cpp/hermes3"
|
return "Switched primary model to openrouter/google/gemini-2.5-pro"
|
||||||
|
|
||||||
def fallback_to_ollama(cfg):
|
def fallback_to_ollama(cfg):
|
||||||
"""Switch to Ollama if llama.cpp is also down"""
|
"""Switch to Ollama if llama.cpp is also down"""
|
||||||
@@ -179,11 +230,11 @@ def diagnose_and_fallback():
|
|||||||
kimi_ok, kimi_msg = check_kimi()
|
kimi_ok, kimi_msg = check_kimi()
|
||||||
results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
|
results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
|
||||||
|
|
||||||
llama_ok, llama_msg = check_local_llama()
|
openrouter_ok, openrouter_msg = check_openrouter()
|
||||||
results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}
|
results["checks"]["openrouter"] = {"ok": openrouter_ok, "msg": openrouter_msg}
|
||||||
|
|
||||||
ollama_ok, ollama_msg = check_ollama()
|
oopenrouter_ok, oopenrouter_msg = check_ollama()
|
||||||
results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}
|
results["checks"]["ollama"] = {"ok": oopenrouter_ok, "msg": oopenrouter_msg}
|
||||||
|
|
||||||
gitea_ok, gitea_msg = check_gitea()
|
gitea_ok, gitea_msg = check_gitea()
|
||||||
results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}
|
results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}
|
||||||
@@ -202,41 +253,79 @@ def diagnose_and_fallback():
|
|||||||
|
|
||||||
# ─── FALLBACK LOGIC ───
|
# ─── FALLBACK LOGIC ───
|
||||||
|
|
||||||
# Case 1: Primary (Kimi) down, local available
|
# Case 1: Primary (Kimi) down, try fallback chain (OpenRouter -> Ollama)
|
||||||
if not kimi_ok and current_provider == "kimi-coding":
|
if not kimi_ok and current_provider == "kimi-coding":
|
||||||
if llama_ok:
|
agent_name = cfg.get("agent", {}).get("name", "timmy")
|
||||||
msg = fallback_to_local_model(cfg)
|
applied = False
|
||||||
results["actions"].append(msg)
|
# Try OpenRouter fallback
|
||||||
state["active_fallbacks"].append("kimi->local-llama")
|
if openrouter_ok:
|
||||||
results["status"] = "degraded_local"
|
try:
|
||||||
elif ollama_ok:
|
msg = fallback_to_openrouter(cfg)
|
||||||
msg = fallback_to_ollama(cfg)
|
results["actions"].append(msg)
|
||||||
results["actions"].append(msg)
|
state["active_fallbacks"].append("kimi->openrouter")
|
||||||
state["active_fallbacks"].append("kimi->ollama")
|
results["status"] = "degraded_openrouter"
|
||||||
results["status"] = "degraded_ollama"
|
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "success")
|
||||||
else:
|
applied = True
|
||||||
msg = enter_safe_mode(state)
|
except Exception as e:
|
||||||
results["actions"].append(msg)
|
log(f"OpenRouter fallback failed: {e}")
|
||||||
results["status"] = "safe_mode"
|
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "error", str(e))
|
||||||
|
# If still not applied, try Ollama
|
||||||
|
if not applied and oopenrouter_ok:
|
||||||
|
try:
|
||||||
|
msg = fallback_to_ollama(cfg)
|
||||||
|
results["actions"].append(msg)
|
||||||
|
state["active_fallbacks"].append("kimi->ollama")
|
||||||
|
results["status"] = "degraded_ollama"
|
||||||
|
log_fallback_event(agent_name, "ollama", "gemma4:latest", "success")
|
||||||
|
applied = True
|
||||||
|
except Exception as e:
|
||||||
|
log(f"Ollama fallback failed: {e}")
|
||||||
|
log_fallback_event(agent_name, "ollama", "gemma4:latest", "error", str(e))
|
||||||
|
if not applied:
|
||||||
|
try:
|
||||||
|
msg = enter_safe_mode(state)
|
||||||
|
results["actions"].append(msg)
|
||||||
|
results["status"] = "safe_mode"
|
||||||
|
except Exception as e:
|
||||||
|
log(f"Safe mode failed: {e}")
|
||||||
|
|
||||||
# Case 2: Already on fallback, check if primary recovered
|
# Case 2: Already on fallback, check if primary recovered — restore with resilience
|
||||||
elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []):
|
elif kimi_ok:
|
||||||
msg = restore_config()
|
restored = False
|
||||||
results["actions"].append(msg)
|
agent_name = cfg.get("agent", {}).get("name", "timmy")
|
||||||
state["active_fallbacks"].remove("kimi->local-llama")
|
# Try restore from OpenRouter fallback
|
||||||
results["status"] = "recovered"
|
if "kimi->openrouter" in state.get("active_fallbacks", []):
|
||||||
elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []):
|
try:
|
||||||
msg = restore_config()
|
msg = restore_config()
|
||||||
results["actions"].append(msg)
|
results["actions"].append(msg)
|
||||||
state["active_fallbacks"].remove("kimi->ollama")
|
state["active_fallbacks"].remove("kimi->openrouter")
|
||||||
results["status"] = "recovered"
|
results["status"] = "recovered"
|
||||||
|
restored = True
|
||||||
|
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "restored")
|
||||||
|
except Exception as e:
|
||||||
|
log(f"Restore from OpenRouter failed: {e}")
|
||||||
|
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "error", str(e))
|
||||||
|
# Try restore from Ollama fallback if still not restored
|
||||||
|
if not restored and "kimi->ollama" in state.get("active_fallbacks", []):
|
||||||
|
try:
|
||||||
|
msg = restore_config()
|
||||||
|
results["actions"].append(msg)
|
||||||
|
state["active_fallbacks"].remove("kimi->ollama")
|
||||||
|
results["status"] = "recovered"
|
||||||
|
restored = True
|
||||||
|
log_fallback_event(agent_name, "ollama", "gemma4:latest", "restored")
|
||||||
|
except Exception as e:
|
||||||
|
log(f"Restore from Ollama failed: {e}")
|
||||||
|
log_fallback_event(agent_name, "ollama", "gemma4:latest", "error", str(e))
|
||||||
|
if not restored:
|
||||||
|
log("WARNING: Primary recovered but unable to restore config")
|
||||||
|
|
||||||
# Case 3: Gitea down — just flag it, work locally
|
# Case 3: Gitea down — just flag it, work locally
|
||||||
if not gitea_ok:
|
if not gitea_ok:
|
||||||
results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
|
results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
|
||||||
if "gitea_down" not in state.get("active_fallbacks", []):
|
if "gitea_down" not in state.get("active_fallbacks", []):
|
||||||
state["active_fallbacks"].append("gitea_down")
|
state["active_fallbacks"].append("gitea_down")
|
||||||
results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"] else 0)
|
results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_openrouter", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_openrouter", "degraded_ollama", "safe_mode"] else 0)
|
||||||
elif "gitea_down" in state.get("active_fallbacks", []):
|
elif "gitea_down" in state.get("active_fallbacks", []):
|
||||||
state["active_fallbacks"].remove("gitea_down")
|
state["active_fallbacks"].remove("gitea_down")
|
||||||
results["actions"].append("Gitea recovered — resume normal operations")
|
results["actions"].append("Gitea recovered — resume normal operations")
|
||||||
|
|||||||
87
bin/gitea-backup.sh
Normal file
87
bin/gitea-backup.sh
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Gitea Daily Backup Script
|
||||||
|
# Uses Gitea's native dump command to create automated backups of repositories and SQLite databases.
|
||||||
|
# Designed to run on the VPS (Ezra) as part of a daily cron job.
|
||||||
|
#
|
||||||
|
# Configuration via environment variables:
|
||||||
|
# GITEA_BIN Path to gitea binary (default: auto-detect)
|
||||||
|
# GITEA_BACKUP_DIR Directory for backup archives (default: /var/backups/gitea)
|
||||||
|
# GITEA_BACKUP_RETENTION Days to retain backups (default: 7)
|
||||||
|
# GITEA_BACKUP_LOG Log file path (default: /var/log/gitea-backup.log)
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
GITEA_BIN="${GITEA_BIN:-$(command -v gitea 2>/dev/null || echo "/usr/local/bin/gitea")}"
|
||||||
|
BACKUP_DIR="${GITEA_BACKUP_DIR:-/var/backups/gitea}"
|
||||||
|
RETENTION_DAYS="${GITEA_BACKUP_RETENTION:-7}"
|
||||||
|
DATE="$(date +%Y-%m-%d_%H%M%S)"
|
||||||
|
BACKUP_FILE="${BACKUP_DIR}/gitea-backup-${DATE}.tar.gz"
|
||||||
|
LOG_FILE="${GITEA_BACKUP_LOG:-/var/log/gitea-backup.log}"
|
||||||
|
|
||||||
|
mkdir -p "${BACKUP_DIR}"
|
||||||
|
|
||||||
|
log() {
|
||||||
|
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "${LOG_FILE}"
|
||||||
|
}
|
||||||
|
|
||||||
|
log "=== Starting Gitea daily backup ==="
|
||||||
|
|
||||||
|
# Verify gitea binary exists
|
||||||
|
if [ ! -x "${GITEA_BIN}" ]; then
|
||||||
|
log "ERROR: Gitea binary not found at ${GITEA_BIN}"
|
||||||
|
log "Set GITEA_BIN environment variable to the gitea binary path (e.g., /usr/bin/gitea)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Detect Gitea WORK_PATH
|
||||||
|
WORK_PATH=""
|
||||||
|
APP_INI=""
|
||||||
|
for path in /etc/gitea/app.ini /home/git/gitea/custom/conf/app.ini ~/gitea/custom/conf/app.ini; do
|
||||||
|
if [ -f "$path" ]; then
|
||||||
|
APP_INI="$path"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ -n "$APP_INI" ]; then
|
||||||
|
# Parse [app] WORK_PATH = /var/lib/gitea
|
||||||
|
WORK_PATH=$(sed -n 's/^[[:space:]]*WORK_PATH[[:space:]]*=[[:space:]]*//p' "$APP_INI" | head -1)
|
||||||
|
log "Detected WORK_PATH from app.ini: ${WORK_PATH}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Fallback detection
|
||||||
|
if [ -z "$WORK_PATH" ]; then
|
||||||
|
for d in /var/lib/gitea /home/git/gitea /srv/gitea /opt/gitea; do
|
||||||
|
if [ -d "$d" ]; then
|
||||||
|
WORK_PATH="$d"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
log "Inferred WORK_PATH: ${WORK_PATH:-not found}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$WORK_PATH" ]; then
|
||||||
|
log "ERROR: Could not determine Gitea WORK_PATH. Set GITEA_WORK_PATH manually."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Perform gitea dump
|
||||||
|
# Flags: --work-path sets the Gitea working directory, --file writes dump to tar.gz
|
||||||
|
log "Running: gitea dump --work-path ${WORK_PATH} --file ${BACKUP_FILE}"
|
||||||
|
"${GITEA_BIN}" dump --work-path "${WORK_PATH}" --file "${BACKUP_FILE}" 2>>"${LOG_FILE}"
|
||||||
|
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
log "ERROR: gitea dump failed — check ${LOG_FILE} for details"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
FILE_SIZE=$(du -h "${BACKUP_FILE}" | cut -f1)
|
||||||
|
log "Backup created: ${BACKUP_FILE} (${FILE_SIZE})"
|
||||||
|
|
||||||
|
# Prune old backups (keep last N days)
|
||||||
|
find "${BACKUP_DIR}" -name "gitea-backup-*.tar.gz" -type f -mtime +$((${RETENTION_DAYS}-1)) -delete 2>/dev/null || true
|
||||||
|
log "Pruned backups older than ${RETENTION_DAYS} days"
|
||||||
|
|
||||||
|
log "=== Backup completed successfully ==="
|
||||||
|
|
||||||
|
exit 0
|
||||||
9
cron/vps/gitea-daily-backup.yml
Normal file
9
cron/vps/gitea-daily-backup.yml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
- name: Daily Gitea Backup
|
||||||
|
schedule: '0 2 * * *' # 2:00 AM daily
|
||||||
|
tasks:
|
||||||
|
- name: Run Gitea daily backup
|
||||||
|
shell: bash ~/.hermes/bin/gitea-backup.sh
|
||||||
|
env:
|
||||||
|
GITEA_BIN: /usr/local/bin/gitea
|
||||||
|
GITEA_BACKUP_DIR: /var/backups/gitea
|
||||||
|
GITEA_BACKUP_RETENTION: "7"
|
||||||
155
docs/backup-recovery-runbook.md
Normal file
155
docs/backup-recovery-runbook.md
Normal file
@@ -0,0 +1,155 @@
|
|||||||
|
# Gitea Backup & Recovery Runbook
|
||||||
|
|
||||||
|
**Last updated:** 2026-04-30
|
||||||
|
**Scope:** Single-node VPS (Ezra, 143.198.27.163) running Gitea
|
||||||
|
**Backup Strategy:** Automated daily full dumps via `gitea dump`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What Gets Backed Up
|
||||||
|
|
||||||
|
| Component | Method | Frequency | Retention |
|
||||||
|
|-----------|--------|-----------|-----------|
|
||||||
|
| All Gitea repositories (bare git dirs) | `gitea dump --file` | Daily at 2:00 AM | 7 days |
|
||||||
|
| SQLite databases (gitea.db, indexer.db, etc.) | Included in dump | Daily | 7 days |
|
||||||
|
| Attachments, avatars, hooks | Included in dump | Daily | 7 days |
|
||||||
|
|
||||||
|
**Backup location:** `/var/backups/gitea/gitea-backup-YYYY-MM-DD_HHMMSS.tar.gz`
|
||||||
|
|
||||||
|
**Log file:** `/var/log/gitea-backup.log`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Backup Architecture
|
||||||
|
|
||||||
|
The backup script `bin/gitea-backup.sh` runs daily via Hermes cron (`cron/vps/gitea-daily-backup.yml`). It:
|
||||||
|
|
||||||
|
1. Locates the Gitea `WORK_PATH` by reading `/etc/gitea/app.ini` or falling back to common locations (`/var/lib/gitea`, `/home/git/gitea`)
|
||||||
|
2. Invokes `gitea dump --work-path <path> --file <backup-tar.gz>` — Gitea's native, consistent snapshot mechanism
|
||||||
|
3. Prunes archives older than 7 days
|
||||||
|
4. Logs all operations to `/var/log/gitea-backup.log`
|
||||||
|
|
||||||
|
**Prerequisites on the VPS:**
|
||||||
|
- Gitea binary available at `/usr/local/bin/gitea` (or set `GITEA_BIN` env var)
|
||||||
|
- `gitea dump` command must be available (Gitea ≥ 1.12)
|
||||||
|
- SSH access to the VPS for manual recovery operations
|
||||||
|
- Sufficient disk space in `/var/backups/gitea` (typical dump: ~2–10 GB depending on repo count/size)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recovery Time Objective (RTO) & Recovery Point Objective (RPO)
|
||||||
|
|
||||||
|
| Metric | Estimate |
|
||||||
|
|--------|----------|
|
||||||
|
| **RPO** (data loss window) | ≤ 24 hours (last daily backup) |
|
||||||
|
| **RTO** (time to restore) | **~45 minutes** (cold restore from backup tarball) |
|
||||||
|
| **Downtime impact** | Gitea offline during restore (~20 min) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step-by-Step Recovery Procedure
|
||||||
|
|
||||||
|
### Phase 1 — Assess & Prepare (5 min)
|
||||||
|
|
||||||
|
1. SSH into Ezra VPS: `ssh root@143.198.27.163`
|
||||||
|
2. Stop Gitea so files are quiescent:
|
||||||
|
```bash
|
||||||
|
systemctl stop gitea
|
||||||
|
```
|
||||||
|
3. Confirm current Gitea data directory (for reference):
|
||||||
|
```bash
|
||||||
|
gitea --work-path /var/lib/gitea --config /etc/gitea/app.ini dump --help 2>&1
|
||||||
|
# Or check app.ini for WORK_PATH
|
||||||
|
cat /etc/gitea/app.ini | grep '^WORK_PATH'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 2 — Restore from Backup (20 min)
|
||||||
|
|
||||||
|
4. Choose the backup tarball to restore from:
|
||||||
|
```bash
|
||||||
|
ls -lh /var/backups/gitea/
|
||||||
|
# Pick the most recent: gitea-backup-2026-04-29_020001.tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Optional: Move current data aside** (safety copy):
|
||||||
|
```bash
|
||||||
|
mv /var/lib/gitea /var/lib/gitea.bak-$(date +%s)
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Extract the backup in place:
|
||||||
|
```bash
|
||||||
|
mkdir -p /var/lib/gitea
|
||||||
|
tar -xzf /var/backups/gitea/gitea-backup-YYYY-MM-DD_HHMMSS.tar.gz -C /var/lib/gitea --strip-components=1
|
||||||
|
```
|
||||||
|
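*Note:* This step assumes the `gitea dump` archive contains a single top-level directory `gitea-dump-<timestamp>`, so that `--strip-components=1` puts its contents directly into `/var/lib/gitea`. Confirm the layout before extracting with `tar -tzf /var/backups/gitea/gitea-backup-YYYY-MM-DD_HHMMSS.tar.gz | head`; if the files sit at the archive root instead, omit `--strip-components=1`.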
|
||||||
|
|
||||||
|
7. Set correct ownership (typically `git:git`):
|
||||||
|
```bash
|
||||||
|
chown -R git:git /var/lib/gitea
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 3 — Restart & Validate (15 min)
|
||||||
|
|
||||||
|
8. Start Gitea:
|
||||||
|
```bash
|
||||||
|
systemctl start gitea
|
||||||
|
```
|
||||||
|
|
||||||
|
9. Wait 30 seconds, then verify:
|
||||||
|
```bash
|
||||||
|
systemctl status gitea
|
||||||
|
# Check HTTP endpoint
|
||||||
|
curl -s -o /dev/null -w '%{http_code}' http://localhost:3000/ # Should be 200
|
||||||
|
```
|
||||||
|
|
||||||
|
10. Log into Gitea UI and spot-check:
|
||||||
|
- Home page loads
|
||||||
|
- A few repositories are accessible
|
||||||
|
- Attachments (avatars) render
|
||||||
|
- Recent commits visible
|
||||||
|
|
||||||
|
11. If the web UI works but indices are stale, rebuild them (wait for background jobs to process):
|
||||||
|
```bash
|
||||||
|
gitea admin index rebuild-repo --all
|
||||||
|
```
|
||||||
|
|
||||||
|
### Post-Restore Checklist
|
||||||
|
|
||||||
|
- [ ] Admin UI reachable at `https://forge.alexanderwhitestone.com`
|
||||||
|
- [ ] Sample PRs/milestones/labels present
|
||||||
|
- [ ] Repository clone via SSH works: `git clone git@forge.alexanderwhitestone.com:Timmy_Foundation/timmy-config.git`
|
||||||
|
- [ ] Check backup script health: `cat /var/log/gitea-backup.log | tail -20`
|
||||||
|
- [ ] Re-enable any disabled integrations (webhooks, CI/CD runners)
|
||||||
|
- [ ] Notify the fleet: post to relevant channels confirming operational status
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Known Issues & Workarounds
|
||||||
|
|
||||||
|
| Symptom | Likely cause | Fix |
|
||||||
|
|---------|--------------|-----|
|
||||||
|
| `gitea: command not found` | Binary at non-standard path | Set `GITEA_BIN=/path/to/gitea` in cron env |
|
||||||
|
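| `Permission denied` on backup dir | Cron user lacks write access to `/var/backups` | As root: `mkdir -p /var/backups/gitea && chown <cron-user> /var/backups/gitea`, where `<cron-user>` is the account the backup job runs as (no change is needed if the job already runs as root) |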
|
||||||
|
| Restore fails: `"database or disk is full"` | Insufficient space on `/var/lib/gitea` | Expand disk or clean up old data first; backups require ~1.5x live data size |
|
||||||
|
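| Old backup tarballs not deleting | Backup job not firing, or failing before its final prune step (pruning runs at the end of `bin/gitea-backup.sh`, not as a separate job) | Check `systemctl status hermes-cron`, the cron logs, and `/var/log/gitea-backup.log` |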
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Off-Site Replication (Future Work)
|
||||||
|
|
||||||
|
This backup is **on-site only** (same VPS). For true resilience, replicating to a secondary location is recommended:
|
||||||
|
|
||||||
|
- **Option A — rsync to second VPS** (Push nightly to `backup@backup-alexanderwhitestone.com:/backups/gitea/`)
|
||||||
|
- **Option B — S3-compatible bucket** with lifecycle policy
|
||||||
|
- **Option C — GitHub mirror of each repo** using `git push --mirror` (already considered as part of the broader work in issue #481)
|
||||||
|
|
||||||
|
Current scope: single-VPS backup only (single point of failure mitigated but not eliminated).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Related Documentation
|
||||||
|
|
||||||
|
- `bin/gitea-backup.sh` — backup script source
|
||||||
|
- `cron/vps/gitea-daily-backup.yml` — Hermes cron definition
|
||||||
|
- Gitea official docs: <https://docs.gitea.com/administration/backup-and-restore>
|
||||||
|
- Hermes cron: <https://hermes-agent.nousresearch.com/docs>
|
||||||
Reference in New Issue
Block a user