Compare commits
6 Commits
step35/595
...
step35/347
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a027249618 | ||
|
|
ba4220d5ed | ||
|
|
2451f38bee | ||
|
|
54093991ab | ||
|
|
1ea6bf6e33 | ||
|
|
874ce137b0 |
@@ -28,6 +28,7 @@ WORKTREE_BASE="$HOME/worktrees"
|
|||||||
LOG_DIR="$HOME/.hermes/logs"
|
LOG_DIR="$HOME/.hermes/logs"
|
||||||
LOCK_DIR="$LOG_DIR/${AGENT}-locks"
|
LOCK_DIR="$LOG_DIR/${AGENT}-locks"
|
||||||
SKIP_FILE="$LOG_DIR/${AGENT}-skip-list.json"
|
SKIP_FILE="$LOG_DIR/${AGENT}-skip-list.json"
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
ACTIVE_FILE="$LOG_DIR/${AGENT}-active.json"
|
ACTIVE_FILE="$LOG_DIR/${AGENT}-active.json"
|
||||||
TIMEOUT=600
|
TIMEOUT=600
|
||||||
COOLDOWN=30
|
COOLDOWN=30
|
||||||
@@ -245,6 +246,13 @@ print(json.dumps({
|
|||||||
consecutive_failures=$((consecutive_failures + 1))
|
consecutive_failures=$((consecutive_failures + 1))
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# ── ANDON: Alert on 3+ consecutive failures ─────────────────────────────
|
||||||
|
if [ "$consecutive_failures" -ge 3 ]; then
|
||||||
|
"$SCRIPT_DIR/andon-alert.sh" CRITICAL \
|
||||||
|
"Agent $AGENT worker $worker_id: $consecutive_failures consecutive failures (last issue #$issue_num)" \
|
||||||
|
"agent-loop"
|
||||||
|
fi
|
||||||
|
|
||||||
# ── METRICS ──
|
# ── METRICS ──
|
||||||
python3 -c "
|
python3 -c "
|
||||||
import json, datetime
|
import json, datetime
|
||||||
|
|||||||
80
bin/andon-alert.sh
Executable file
80
bin/andon-alert.sh
Executable file
@@ -0,0 +1,80 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# andon-alert.sh — Andon (行燈) real-time signal light.
|
||||||
|
# Raises visible alerts for any failure in the fleet.
|
||||||
|
#
|
||||||
|
# Usage: andon-alert.sh <severity> "<message>" "<source>"
|
||||||
|
# severities: INFO, WARNING, CRITICAL, HALT
|
||||||
|
# source: component name (e.g., "model-health", "agent-loop", "watchdog", "jidoka")
|
||||||
|
#
|
||||||
|
# Behavior:
|
||||||
|
# INFO — log only
|
||||||
|
# WARNING — log + Telegram
|
||||||
|
# CRITICAL — log + Telegram + pause offending loop (SIGSTOP)
|
||||||
|
# HALT — log + Telegram + kill offending loop (SIGKILL) + create flag file
|
||||||
|
#
|
||||||
|
# State is written to ~/.hermes/andon-state/<source>.json for the burn monitor.
|
||||||
|
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
|
||||||
|
LOG_DIR="$HERMES_HOME/logs"
|
||||||
|
STATE_DIR="$HERMES_HOME/andon-state"
|
||||||
|
mkdir -p "$LOG_DIR" "$STATE_DIR"
|
||||||
|
|
||||||
|
LOG_FILE="$LOG_DIR/andon.log"
|
||||||
|
TELEGRAM_TOKEN="${TELEGRAM_TOKEN:-}"
|
||||||
|
TELEGRAM_CHAT="${TELEGRAM_CHAT:-"-1003664764329"}"
|
||||||
|
|
||||||
|
# Read token from legacy location if not set
|
||||||
|
if [ -z "$TELEGRAM_TOKEN" ] && [ -f "$HOME/.config/telegram/special_bot" ]; then
|
||||||
|
TELEGRAM_TOKEN=$(cat "$HOME/.config/telegram/special_bot" | tr -d '[:space:]')
|
||||||
|
fi
|
||||||
|
|
||||||
|
severity="${1:-INFO}"
|
||||||
|
message="${2:-No message}"
|
||||||
|
source="${3:-unknown}"
|
||||||
|
|
||||||
|
timestamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
|
||||||
|
echo "[$timestamp] [$severity] [$source] $message" >> "$LOG_FILE"
|
||||||
|
|
||||||
|
# Update state file for burn monitor
|
||||||
|
state_file="$STATE_DIR/${source}.json"
# Hand the values to Python through the environment instead of splicing them
# into the snippet: a quote, backslash, or newline in $message would otherwise
# break the Python source (or run as code), and because the redirect truncates
# the file first, the burn monitor would be left with an empty state file.
# Output is the same JSON object as before: severity, message, timestamp.
ANDON_SEVERITY="$severity" ANDON_MESSAGE="$message" ANDON_TIMESTAMP="$timestamp" \
  python3 -c '
import json, os, sys
json.dump({
    "severity": os.environ["ANDON_SEVERITY"],
    "message": os.environ["ANDON_MESSAGE"],
    "timestamp": os.environ["ANDON_TIMESTAMP"],
}, sys.stdout)
' > "$state_file"
|
||||||
|
|
||||||
|
#######################################
# Post a text message to the configured Telegram chat (best effort).
# Globals:   TELEGRAM_TOKEN (read), TELEGRAM_CHAT (read)
# Arguments: $1 - message text
# Outputs:   nothing; curl output is discarded
# Returns:   0 always; delivery failures are deliberately ignored so that an
#            unreachable Telegram API never blocks the alert path
#######################################
send_telegram() {
  local text="$1"
  # Silently skip when either credential is missing.
  if [ -n "$TELEGRAM_TOKEN" ] && [ -n "$TELEGRAM_CHAT" ]; then
    # --data-urlencode percent-encodes the message; with plain -d, characters
    # such as '&', '+', '%' or '=' in the text would corrupt the form body.
    curl -sf --max-time 10 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
      -d "chat_id=${TELEGRAM_CHAT}" \
      --data-urlencode "text=${text}" >/dev/null 2>&1 || true
  fi
}
|
||||||
|
|
||||||
|
# Telegram fan-out: WARNING and above notify the chat; anything else is log-only.
case "${severity^^}" in
  WARNING|CRITICAL|HALT)
    # $'\n' is a real newline. A "\n" inside double quotes is not interpreted
    # by bash and would reach Telegram as a literal backslash followed by 'n'.
    send_telegram "[$severity] [$source] $timestamp"$'\n'"$message"
    ;;
  *)
    ;;
esac
|
||||||
|
|
||||||
|
if [ "${severity^^}" = "CRITICAL" ]; then
  # Try to pause the offending loop (SIGSTOP).
  # `pkill -f "$source"` cannot be used directly: "$source" is one of this
  # script's own arguments, so the pattern also matches this process and the
  # alert would freeze itself before finishing. Enumerate the matches and skip
  # our own PID instead.
  paused=0
  for pid in $(pgrep -f -- "$source" 2>/dev/null); do
    [ "$pid" = "$$" ] && continue
    if kill -STOP "$pid" 2>/dev/null; then
      paused=1
    fi
  done
  if [ "$paused" -eq 1 ]; then
    echo "[$timestamp] Paused loop matching '$source'" >> "$LOG_FILE"
  else
    echo "[$timestamp] WARN: Could not pause loop for '$source'" >> "$LOG_FILE"
  fi
fi
|
||||||
|
|
||||||
|
if [ "${severity^^}" = "HALT" ]; then
  # Kill the loop and create a flag file.
  # `pkill -KILL -f "$source"` would also match this script ("$source" is one
  # of its own arguments) and kill it before the flag file is written, so
  # enumerate the matches and skip our own PID.
  killed=0
  for pid in $(pgrep -f -- "$source" 2>/dev/null); do
    [ "$pid" = "$$" ] && continue
    if kill -KILL "$pid" 2>/dev/null; then
      killed=1
    fi
  done
  if [ "$killed" -eq 1 ]; then
    echo "[$timestamp] Killed loop matching '$source'" >> "$LOG_FILE"
  else
    echo "[$timestamp] WARN: Could not kill loop for '$source'" >> "$LOG_FILE"
  fi
  # The flag is created whether or not a process was found.
  flag_file="$HERMES_HOME/andon-halt-${source}.flag"
  touch "$flag_file"
  echo "[$timestamp] Created halt flag: $flag_file" >> "$LOG_FILE"
fi
|
||||||
|
|
||||||
|
exit 0
|
||||||
77
bin/burn-monitor.sh
Executable file
77
bin/burn-monitor.sh
Executable file
@@ -0,0 +1,77 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# burn-monitor.sh — Andon Board: real-time signal light for the fleet.
|
||||||
|
# Displays color-coded status per lane (component).
|
||||||
|
#
|
||||||
|
# Reads per-source state from ~/.hermes/andon-state/*.json
|
||||||
|
# Lanes without state are shown GREEN (OK).
|
||||||
|
# Output suitable for terminal or cron email.
|
||||||
|
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
|
||||||
|
STATE_DIR="$HERMES_HOME/andon-state"
|
||||||
|
|
||||||
|
# Define all lanes we monitor (must match source names used by andon-alert)
|
||||||
|
LANES=("model-health" "agent-loop" "watchdog" "jidoka" "claude-loop" "gemini-loop")
|
||||||
|
|
||||||
|
# ANSI color codes
|
||||||
|
RED='\033[0;31m'
|
||||||
|
YELLOW='\033[0;33m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
BLUE='\033[0;34m'
|
||||||
|
RESET='\033[0m'
|
||||||
|
|
||||||
|
now=$(date -u '+%Y-%m-%d %H:%M UTC')
|
||||||
|
|
||||||
|
echo "======================================"
|
||||||
|
echo " Andon Board — Fleet Status ($now)"
|
||||||
|
echo "======================================"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
for lane in "${LANES[@]}"; do
|
||||||
|
state_file="$STATE_DIR/${lane}.json"
|
||||||
|
if [ -f "$state_file" ]; then
  # The path is handed to Python as argv[1] rather than spliced into the
  # snippet, so a quote or backslash in $HERMES_HOME cannot break the parse.
  # Any failure (unreadable file, invalid JSON) falls back to the default.
  severity=$(python3 -c 'import json,sys; print(json.load(open(sys.argv[1])).get("severity","UNKNOWN"))' "$state_file" 2>/dev/null || echo "UNKNOWN")
  message=$(python3 -c 'import json,sys; print(json.load(open(sys.argv[1])).get("message",""))' "$state_file" 2>/dev/null || echo "")
  timestamp=$(python3 -c 'import json,sys; print(json.load(open(sys.argv[1])).get("timestamp",""))' "$state_file" 2>/dev/null || echo "")
else
  # No state file means no alert has been recorded for this lane.
  severity="OK"
  message=""
  timestamp=""
fi
|
||||||
|
|
||||||
|
case "${severity^^}" in
|
||||||
|
CRITICAL|HALT)
|
||||||
|
color="$RED"
|
||||||
|
symbol="🔴"
|
||||||
|
;;
|
||||||
|
WARNING)
|
||||||
|
color="$YELLOW"
|
||||||
|
symbol="🟡"
|
||||||
|
;;
|
||||||
|
OK|INFO)
|
||||||
|
color="$GREEN"
|
||||||
|
symbol="🟢"
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
color="$BLUE"
|
||||||
|
symbol="⚪"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
printf "${color}%s${RESET} %-20s" "$symbol" "$lane"
|
||||||
|
if [ -n "$message" ]; then
|
||||||
|
# Truncate message
|
||||||
|
disp_msg="${message:0:80}"
|
||||||
|
printf " — %s" "$disp_msg"
|
||||||
|
fi
|
||||||
|
if [ -n "$timestamp" ]; then
|
||||||
|
printf " [%s]" "$timestamp"
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
done
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "Legend: 🔴 RED (critical/halt) 🟡 YELLOW (warning) 🟢 GREEN (ok)"
|
||||||
|
echo "For details: tail -f ~/.hermes/logs/andon.log"
|
||||||
|
echo ""
|
||||||
@@ -5,6 +5,7 @@ set -uo pipefail
|
|||||||
export PATH="/opt/homebrew/bin:$HOME/.local/bin:$HOME/.hermes/bin:/usr/local/bin:$PATH"
|
export PATH="/opt/homebrew/bin:$HOME/.local/bin:$HOME/.hermes/bin:/usr/local/bin:$PATH"
|
||||||
|
|
||||||
LOG="$HOME/.hermes/logs/claudemax-watchdog.log"
|
LOG="$HOME/.hermes/logs/claudemax-watchdog.log"
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
GITEA_URL="https://forge.alexanderwhitestone.com"
|
GITEA_URL="https://forge.alexanderwhitestone.com"
|
||||||
GITEA_TOKEN=$(tr -d '[:space:]' < "$HOME/.hermes/gitea_token_vps" 2>/dev/null || true)
|
GITEA_TOKEN=$(tr -d '[:space:]' < "$HOME/.hermes/gitea_token_vps" 2>/dev/null || true)
|
||||||
REPO_API="$GITEA_URL/api/v1/repos/Timmy_Foundation/the-nexus"
|
REPO_API="$GITEA_URL/api/v1/repos/Timmy_Foundation/the-nexus"
|
||||||
@@ -29,6 +30,7 @@ start_loop() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
log "$name not running. Restarting..."
|
log "$name not running. Restarting..."
|
||||||
|
"$SCRIPT_DIR/andon-alert.sh" WARNING "Restarted $name loop (was down)" "watchdog"
|
||||||
nohup bash -lc "$cmd" >/dev/null 2>&1 &
|
nohup bash -lc "$cmd" >/dev/null 2>&1 &
|
||||||
sleep 2
|
sleep 2
|
||||||
|
|
||||||
|
|||||||
87
bin/gitea-backup.sh
Normal file
87
bin/gitea-backup.sh
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Gitea Daily Backup Script
|
||||||
|
# Uses Gitea's native dump command to create automated backups of repositories and SQLite databases.
|
||||||
|
# Designed to run on the VPS (Ezra) as part of a daily cron job.
|
||||||
|
#
|
||||||
|
# Configuration via environment variables:
|
||||||
|
# GITEA_BIN Path to gitea binary (default: auto-detect)
|
||||||
|
# GITEA_BACKUP_DIR Directory for backup archives (default: /var/backups/gitea)
|
||||||
|
# GITEA_BACKUP_RETENTION Days to retain backups (default: 7)
|
||||||
|
# GITEA_BACKUP_LOG Log file path (default: /var/log/gitea-backup.log)
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
GITEA_BIN="${GITEA_BIN:-$(command -v gitea 2>/dev/null || echo "/usr/local/bin/gitea")}"
|
||||||
|
BACKUP_DIR="${GITEA_BACKUP_DIR:-/var/backups/gitea}"
|
||||||
|
RETENTION_DAYS="${GITEA_BACKUP_RETENTION:-7}"
|
||||||
|
DATE="$(date +%Y-%m-%d_%H%M%S)"
|
||||||
|
BACKUP_FILE="${BACKUP_DIR}/gitea-backup-${DATE}.tar.gz"
|
||||||
|
LOG_FILE="${GITEA_BACKUP_LOG:-/var/log/gitea-backup.log}"
|
||||||
|
|
||||||
|
mkdir -p "${BACKUP_DIR}"
|
||||||
|
|
||||||
|
log() {
|
||||||
|
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "${LOG_FILE}"
|
||||||
|
}
|
||||||
|
|
||||||
|
log "=== Starting Gitea daily backup ==="
|
||||||
|
|
||||||
|
# Verify gitea binary exists
|
||||||
|
if [ ! -x "${GITEA_BIN}" ]; then
|
||||||
|
log "ERROR: Gitea binary not found at ${GITEA_BIN}"
|
||||||
|
log "Set GITEA_BIN environment variable to the gitea binary path (e.g., /usr/bin/gitea)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Detect Gitea WORK_PATH
# An explicit GITEA_WORK_PATH wins: the error printed when detection fails
# tells the operator to set it, so it has to be honoured here.
WORK_PATH="${GITEA_WORK_PATH:-}"
APP_INI=""
for path in /etc/gitea/app.ini /home/git/gitea/custom/conf/app.ini ~/gitea/custom/conf/app.ini; do
  if [ -f "$path" ]; then
    APP_INI="$path"
    break
  fi
done

if [ -n "$WORK_PATH" ]; then
  log "Using WORK_PATH from GITEA_WORK_PATH: ${WORK_PATH}"
elif [ -n "$APP_INI" ]; then
  # Parse [app] WORK_PATH = /var/lib/gitea
  WORK_PATH=$(sed -n 's/^[[:space:]]*WORK_PATH[[:space:]]*=[[:space:]]*//p' "$APP_INI" | head -1)
  # Only report a detection when app.ini actually defines WORK_PATH; otherwise
  # the directory probing that follows takes over.
  if [ -n "$WORK_PATH" ]; then
    log "Detected WORK_PATH from app.ini: ${WORK_PATH}"
  fi
fi
|
||||||
|
|
||||||
|
# Fallback detection
|
||||||
|
if [ -z "$WORK_PATH" ]; then
|
||||||
|
for d in /var/lib/gitea /home/git/gitea /srv/gitea /opt/gitea; do
|
||||||
|
if [ -d "$d" ]; then
|
||||||
|
WORK_PATH="$d"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
log "Inferred WORK_PATH: ${WORK_PATH:-not found}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$WORK_PATH" ]; then
|
||||||
|
log "ERROR: Could not determine Gitea WORK_PATH. Set GITEA_WORK_PATH manually."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Perform gitea dump
# Flags: --work-path sets the Gitea working directory, --file writes dump to tar.gz
log "Running: gitea dump --work-path ${WORK_PATH} --file ${BACKUP_FILE}"
# The dump runs as the `if` condition on purpose: with `set -e` active, a
# failing standalone command aborts the script immediately, so a separate
# `$?` check afterwards would never run and the failure would go unlogged.
if ! "${GITEA_BIN}" dump --work-path "${WORK_PATH}" --file "${BACKUP_FILE}" 2>>"${LOG_FILE}"; then
  log "ERROR: gitea dump failed — check ${LOG_FILE} for details"
  exit 1
fi
|
||||||
|
|
||||||
|
FILE_SIZE=$(du -h "${BACKUP_FILE}" | cut -f1)
|
||||||
|
log "Backup created: ${BACKUP_FILE} (${FILE_SIZE})"
|
||||||
|
|
||||||
|
# Prune old backups (keep last N days)
|
||||||
|
find "${BACKUP_DIR}" -name "gitea-backup-*.tar.gz" -type f -mtime +$((${RETENTION_DAYS}-1)) -delete 2>/dev/null || true
|
||||||
|
log "Pruned backups older than ${RETENTION_DAYS} days"
|
||||||
|
|
||||||
|
log "=== Backup completed successfully ==="
|
||||||
|
|
||||||
|
exit 0
|
||||||
@@ -23,6 +23,7 @@ log() {
|
|||||||
log "Running model health check..."
|
log "Running model health check..."
|
||||||
if ! bash "$SCRIPT_DIR/model-health-check.sh"; then
|
if ! bash "$SCRIPT_DIR/model-health-check.sh"; then
|
||||||
log "FATAL: Model health check failed. Aborting loop startup."
|
log "FATAL: Model health check failed. Aborting loop startup."
|
||||||
|
"$SCRIPT_DIR/andon-alert.sh" CRITICAL "Model health check failed — API key invalid or unreachable" "model-health"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
log "Model health check passed."
|
log "Model health check passed."
|
||||||
|
|||||||
@@ -129,20 +129,42 @@ Preserved by timmy-orchestrator to prevent loss." 2>/dev/null && git p
|
|||||||
# Auto-assignment is opt-in because silent queue mutation resurrects old state.
|
# Auto-assignment is opt-in because silent queue mutation resurrects old state.
|
||||||
if [ "$unassigned_count" -gt 0 ]; then
|
if [ "$unassigned_count" -gt 0 ]; then
|
||||||
if [ "$AUTO_ASSIGN_UNASSIGNED" = "1" ]; then
|
if [ "$AUTO_ASSIGN_UNASSIGNED" = "1" ]; then
|
||||||
log "Assigning $unassigned_count issues to claude..."
|
log "Assigning $unassigned_count issues via dispatch router..."
|
||||||
while IFS= read -r line; do
|
DISPATCH_LOG="$LOG_DIR/dispatch_decisions.log"
|
||||||
local repo=$(echo "$line" | sed 's/.*REPO=\([^ ]*\).*/\1/')
|
while IFS= read -r line; do
|
||||||
local num=$(echo "$line" | sed 's/.*NUM=\([^ ]*\).*/\1/')
|
local repo=$(echo "$line" | sed 's/.*REPO=\([^ ]*\).*//')
|
||||||
curl -sf -X PATCH "$GITEA_URL/api/v1/repos/$repo/issues/$num" \
|
local num=$(echo "$line" | sed 's/.*NUM=\([^ ]*\).*//')
|
||||||
-H "Authorization: token $GITEA_TOKEN" \
|
local title=$(echo "$line" | sed 's/.*TITLE=//')
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"assignees":["claude"]}' >/dev/null 2>&1 && \
|
# Call dispatch_router to pick best agent
|
||||||
log " Assigned #$num ($repo) to claude"
|
local route_json
|
||||||
done < "$state_dir/unassigned.txt"
|
route_json=$(python3 "$SCRIPT_DIR/../scripts/dispatch_router.py" "$title" "$repo" 2>/dev/null) || route_json=""
|
||||||
else
|
|
||||||
log "Auto-assign disabled: leaving $unassigned_count unassigned issues untouched"
|
local recommended_agent="claude" # fallback
|
||||||
fi
|
local route_category="unknown"
|
||||||
fi
|
local route_score="0"
|
||||||
|
local route_reason="fallback"
|
||||||
|
|
||||||
|
if [ -n "$route_json" ]; then
|
||||||
|
recommended_agent=$(echo "$route_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('recommended_agent','claude'))" 2>/dev/null || echo "claude")
|
||||||
|
route_score=$(echo "$route_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('score',0))" 2>/dev/null || echo "0")
|
||||||
|
route_category=$(echo "$route_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('category','unknown'))" 2>/dev/null || echo "unknown")
|
||||||
|
route_reason=$(echo "$route_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('reason',''))" 2>/dev/null || echo "")
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Assign via API
|
||||||
|
curl -sf -X PATCH "$GITEA_URL/api/v1/repos/$repo/issues/$num" \\
|
||||||
|
-H "Authorization: token $GITEA_TOKEN" \\
|
||||||
|
-H "Content-Type: application/json" \\
|
||||||
|
-d "{\"assignees\":[\"$recommended_agent\"]}" >/dev/null 2>&1 && \\
|
||||||
|
log " Assigned #$num ($repo) to $recommended_agent [score=$route_score cat=$route_category]"
|
||||||
|
|
||||||
|
# Log dispatch decision for audit (RFC3339 timestamp)
|
||||||
|
printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
|
||||||
|
"$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$num" "$repo" "$title" "$recommended_agent" "$route_score" "$route_category|$route_reason" \
|
||||||
|
>> "$DISPATCH_LOG"
|
||||||
|
done < "$state_dir/unassigned.txt"
|
||||||
|
else fi
|
||||||
|
|
||||||
# Phase 2: PR review via Timmy (LLM)
|
# Phase 2: PR review via Timmy (LLM)
|
||||||
if [ "$pr_count" -gt 0 ]; then
|
if [ "$pr_count" -gt 0 ]; then
|
||||||
|
|||||||
9
cron/vps/gitea-daily-backup.yml
Normal file
9
cron/vps/gitea-daily-backup.yml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
- name: Daily Gitea Backup
|
||||||
|
schedule: '0 2 * * *' # 2:00 AM daily
|
||||||
|
tasks:
|
||||||
|
- name: Run Gitea daily backup
|
||||||
|
shell: bash ~/.hermes/bin/gitea-backup.sh
|
||||||
|
env:
|
||||||
|
GITEA_BIN: /usr/local/bin/gitea
|
||||||
|
GITEA_BACKUP_DIR: /var/backups/gitea
|
||||||
|
GITEA_BACKUP_RETENTION: "7"
|
||||||
155
docs/backup-recovery-runbook.md
Normal file
155
docs/backup-recovery-runbook.md
Normal file
@@ -0,0 +1,155 @@
|
|||||||
|
# Gitea Backup & Recovery Runbook
|
||||||
|
|
||||||
|
**Last updated:** 2026-04-30
|
||||||
|
**Scope:** Single-node VPS (Ezra, 143.198.27.163) running Gitea
|
||||||
|
**Backup Strategy:** Automated daily full dumps via `gitea dump`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What Gets Backed Up
|
||||||
|
|
||||||
|
| Component | Method | Frequency | Retention |
|
||||||
|
|-----------|--------|-----------|-----------|
|
||||||
|
| All Gitea repositories (bare git dirs) | `gitea dump --file` | Daily at 2:00 AM | 7 days |
|
||||||
|
| SQLite databases (gitea.db, indexer.db, etc.) | Included in dump | Daily | 7 days |
|
||||||
|
| Attachments, avatars, hooks | Included in dump | Daily | 7 days |
|
||||||
|
|
||||||
|
**Backup location:** `/var/backups/gitea/gitea-backup-YYYY-MM-DD_HHMMSS.tar.gz`
|
||||||
|
|
||||||
|
**Log file:** `/var/log/gitea-backup.log`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Backup Architecture
|
||||||
|
|
||||||
|
The backup script `bin/gitea-backup.sh` runs daily via Hermes cron (`cron/vps/gitea-daily-backup.yml`). It:
|
||||||
|
|
||||||
|
1. Locates the Gitea `WORK_PATH` by reading `/etc/gitea/app.ini` or falling back to common locations (`/var/lib/gitea`, `/home/git/gitea`)
|
||||||
|
2. Invokes `gitea dump --work-path <path> --file <backup-tar.gz>` — Gitea's native, consistent snapshot mechanism
|
||||||
|
3. Prunes archives older than 7 days
|
||||||
|
4. Logs all operations to `/var/log/gitea-backup.log`
|
||||||
|
|
||||||
|
**Prerequisites on the VPS:**
|
||||||
|
- Gitea binary available at `/usr/local/bin/gitea` (or set `GITEA_BIN` env var)
|
||||||
|
- `gitea dump` command must be available (Gitea ≥ 1.12)
|
||||||
|
- SSH access to the VPS for manual recovery operations
|
||||||
|
- Sufficient disk space in `/var/backups/gitea` (typical dump: ~2–10 GB depending on repo count/size)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recovery Time Objective (RTO) & Recovery Point Objective (RPO)
|
||||||
|
|
||||||
|
| Metric | Estimate |
|
||||||
|
|--------|----------|
|
||||||
|
| **RPO** (data loss window) | ≤ 24 hours (last daily backup) |
|
||||||
|
| **RTO** (time to restore) | **~45 minutes** (cold restore from backup tarball) |
|
||||||
|
| **Downtime impact** | Gitea offline during restore (~20 min) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step-by-Step Recovery Procedure
|
||||||
|
|
||||||
|
### Phase 1 — Assess & Prepare (5 min)
|
||||||
|
|
||||||
|
1. SSH into Ezra VPS: `ssh root@143.198.27.163`
|
||||||
|
2. Stop Gitea so files are quiescent:
|
||||||
|
```bash
|
||||||
|
systemctl stop gitea
|
||||||
|
```
|
||||||
|
3. Confirm current Gitea data directory (for reference):
|
||||||
|
```bash
|
||||||
|
gitea --work-path /var/lib/gitea --config /etc/gitea/app.ini dump --help 2>&1
|
||||||
|
# Or check app.ini for WORK_PATH
|
||||||
|
grep '^WORK_PATH' /etc/gitea/app.ini
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 2 — Restore from Backup (20 min)
|
||||||
|
|
||||||
|
4. Choose the backup tarball to restore from:
|
||||||
|
```bash
|
||||||
|
ls -lh /var/backups/gitea/
|
||||||
|
# Pick the most recent: gitea-backup-2026-04-29_020001.tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Optional: Move current data aside** (safety copy):
|
||||||
|
```bash
|
||||||
|
mv /var/lib/gitea /var/lib/gitea.bak-$(date +%s)
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Extract the backup in place:
|
||||||
|
```bash
|
||||||
|
mkdir -p /var/lib/gitea
|
||||||
|
tar -xzf /var/backups/gitea/gitea-backup-YYYY-MM-DD_HHMMSS.tar.gz -C /var/lib/gitea --strip-components=1
|
||||||
|
```
|
||||||
|
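*Note:* This command assumes the archive contains a single top-level directory `gitea-dump-<timestamp>`, whose contents `--strip-components=1` places directly into `/var/lib/gitea`. Confirm the layout first with `tar -tzf <archive> | head`: if files such as `app.ini` and `gitea-db.sql` sit at the archive root, omit `--strip-components=1`, because it would discard root-level files and flatten the top-level directories.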
|
||||||
|
|
||||||
|
7. Set correct ownership (typically `git:git`):
|
||||||
|
```bash
|
||||||
|
chown -R git:git /var/lib/gitea
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 3 — Restart & Validate (15 min)
|
||||||
|
|
||||||
|
8. Start Gitea:
|
||||||
|
```bash
|
||||||
|
systemctl start gitea
|
||||||
|
```
|
||||||
|
|
||||||
|
9. Wait 30 seconds, then verify:
|
||||||
|
```bash
|
||||||
|
systemctl status gitea
|
||||||
|
# Check HTTP endpoint
|
||||||
|
curl -s -o /dev/null -w '%{http_code}' http://localhost:3000/ # Should be 200
|
||||||
|
```
|
||||||
|
|
||||||
|
10. Log into Gitea UI and spot-check:
|
||||||
|
- Home page loads
|
||||||
|
- A few repositories are accessible
|
||||||
|
- Attachments (avatars) render
|
||||||
|
- Recent commits visible
|
||||||
|
|
||||||
|
11. If the web UI works but indices are stale, rebuild them (wait for background jobs to process):
|
||||||
|
```bash
|
||||||
|
gitea admin index rebuild-repo --all
|
||||||
|
```
|
||||||
|
|
||||||
|
### Post-Restore Checklist
|
||||||
|
|
||||||
|
- [ ] Admin UI reachable at `https://forge.alexanderwhitestone.com`
|
||||||
|
- [ ] Sample PRs/milestones/labels present
|
||||||
|
- [ ] Repository clone via SSH works: `git clone git@forge.alexanderwhitestone.com:Timmy_Foundation/timmy-config.git`
|
||||||
|
- [ ] Check backup script health: `cat /var/log/gitea-backup.log | tail -20`
|
||||||
|
- [ ] Re-enable any disabled integrations (webhooks, CI/CD runners)
|
||||||
|
- [ ] Notify the fleet: post to relevant channels confirming operational status
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Known Issues & Workarounds
|
||||||
|
|
||||||
|
| Symptom | Likely cause | Fix |
|
||||||
|
|---------|--------------|-----|
|
||||||
|
| `gitea: command not found` | Binary at non-standard path | Set `GITEA_BIN=/path/to/gitea` in cron env |
|
||||||
|
| `Permission denied` on backup dir | Cron user lacks write access to `/var/backups` | `mkdir /var/backups/gitea && chown root:root /var/backups/gitea` |
|
||||||
|
| Restore fails: `"database or disk is full"` | Insufficient space on `/var/lib/gitea` | Expand disk or clean up old data first; backups require ~1.5x live data size |
|
||||||
|
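| Old backup tarballs not deleting | Pruning runs only at the end of a successful `bin/gitea-backup.sh` run, so a failing dump or a cron job that is not firing skips it | Check `/var/log/gitea-backup.log` for errors, then `systemctl status hermes-cron` and cron logs |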
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Off-Site Replication (Future Work)
|
||||||
|
|
||||||
|
This backup is **on-site only** (same VPS). For true resilience, replicating to a secondary location is recommended:
|
||||||
|
|
||||||
|
- **Option A — rsync to second VPS** (Push nightly to `backup@backup-alexanderwhitestone.com:/backups/gitea/`)
|
||||||
|
- **Option B — S3-compatible bucket** with lifecycle policy
|
||||||
|
- **Option C — GitHub mirror of each repo** using `git push --mirror` (already considered in issue #481 broader work)
|
||||||
|
|
||||||
|
Current scope: single-VPS backup only (single point of failure mitigated but not eliminated).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Related Documentation
|
||||||
|
|
||||||
|
- `bin/gitea-backup.sh` — backup script source
|
||||||
|
- `cron/vps/gitea-daily-backup.yml` — Hermes cron definition
|
||||||
|
- Gitea official docs: <https://docs.gitea.com/administration/backup-and-restore>
|
||||||
|
- Hermes cron: <https://hermes-agent.nousresearch.com/docs>
|
||||||
Reference in New Issue
Block a user