Compare commits

..

5 Commits

Author SHA1 Message Date
c8b0f2a8fb feat(config): default local model to gemma4:12b via Ollama
- config.yaml: provider ollama, default gemma4:12b
- dynamic_dispatch_optimizer.py: fallback route references gemma4:12b
2026-04-07 15:56:17 +00:00
0470e23efb feat(infra): fleet milestone tracker with 22 phase messages (#557, FLEET-004) 2026-04-07 15:46:09 +00:00
39540a2a8c feat(infra): auto-restart agent, backup pipeline, Telegram thread reporter (#560, #561, #895)
- scripts/auto_restart_agent.sh — monitor and restart dead processes (3-attempt backoff)
- scripts/backup_pipeline.sh — daily backups with retention + offsite rsync hook
- scripts/telegram_thread_reporter.py — route messages to ops/burn/main threads
- infrastructure/cron/*.crontab — scheduling for new automations
2026-04-07 15:43:21 +00:00
839f52af12 fix(allegro): switch to kimi-k2.5 and add full fallback chain
- Replace broken kimi-for-coding model with kimi-k2.5
- Add fallback_providers with kimi-coding -> anthropic -> openrouter
- Add explicit provider config for kimi-coding base_url and timeouts

Refs: #lazzyPit
2026-04-07 15:39:58 +00:00
4e3f60344b feat(infra): add fleet health probe + crontab (#559, FLEET-006)
- scripts/fleet_health_probe.sh: SSH, disk, memory, process checks
- infrastructure/cron/fleet-health.crontab: 5-minute cron schedule
- Thresholds: disk<90%, mem<90%, critical processes monitored
2026-04-07 15:22:10 +00:00
10 changed files with 478 additions and 132 deletions

3
.gitignore vendored
View File

@@ -56,9 +56,6 @@ __pycache__/
venv/
*/venv/
# Resource Tracking System
metrics/resource_state.json
# Editor temps
\#*\#
*~

View File

@@ -1,6 +1,6 @@
model:
default: hermes4:14b
provider: custom
default: gemma4:12b
provider: ollama
toolsets:
- all
agent:

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env bash
# auto_restart_agent.sh — Auto-restart dead critical processes (FLEET-007)
# Refs: timmy-home #560
set -euo pipefail
# Log directory and the file that log() appends to.
LOG_DIR="/var/log/timmy"
ALERT_LOG="${LOG_DIR}/auto_restart.log"
# Holds one "<process_name>.count" restart counter per monitored process,
# plus the "auto_restart.last" marker touched at the end of each run.
STATE_DIR="/var/lib/timmy/restarts"
# Create both directories up front; with `set -e` active a failure here aborts the run.
mkdir -p "$LOG_DIR" "$STATE_DIR"
# Telegram credentials are optional. Defaulting to empty keeps `set -u` from
# tripping and lets send_telegram skip the notification when either is unset.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"
# Write "[<ISO-8601 timestamp>] <message>" to stdout and append the same
# line to $ALERT_LOG.
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$(date -Iseconds)" "$1" | tee -a "$ALERT_LOG"
}
# Send a best-effort Telegram notification.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text
# Returns:   always 0; delivery failures are deliberately ignored so that a
#            Telegram outage never aborts the caller under `set -e`.
send_telegram() {
  local msg="$1"
  # Notifications are optional: do nothing unless both credentials are set.
  if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
    return 0
  fi
  # --data-urlencode percent-encodes each value, so '&', '+', '%' or newlines
  # in the message cannot corrupt the form body (plain -d sends them raw).
  # --max-time bounds the request so an unreachable API cannot hang the job.
  curl -s --max-time 15 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${msg}" >/dev/null 2>&1 || true
}
# Format: comma-separated "process_name:command_to_restart" entries.
# Override via AUTO_RESTART_PROCESSES env var.
# Entries are split on ',' and then on the FIRST ':', so a restart command
# may contain ':' but must not contain ','.
DEFAULT_PROCESSES="act_runner:cd /opt/gitea-runner && nohup ./act_runner daemon >/var/log/gitea-runner.log 2>&1 &"
PROCESSES="${AUTO_RESTART_PROCESSES:-$DEFAULT_PROCESSES}"

# Strip leading/trailing whitespace from $1 and print the result.
# Pure parameter expansion: unlike piping through `xargs`, this never
# re-parses the text, so quotes and backslashes in a command survive intact.
trim() {
  local s="$1"
  s="${s#"${s%%[![:space:]]*}"}"
  s="${s%"${s##*[![:space:]]}"}"
  printf '%s' "$s"
}

IFS=',' read -ra PROC_LIST <<< "$PROCESSES"
for entry in "${PROC_LIST[@]}"; do
  # Without a ':' there is no restart command; refusing the entry avoids
  # eval-ing the process name itself.
  if [[ "$entry" != *:* ]]; then
    log "ERROR: ignoring malformed entry (expected name:command): $entry"
    continue
  fi
  proc_name="$(trim "${entry%%:*}")"
  restart_cmd="$(trim "${entry#*:}")"
  if [[ -z "$proc_name" || -z "$restart_cmd" ]]; then
    log "ERROR: ignoring entry with empty name or command: $entry"
    continue
  fi

  state_file="${STATE_DIR}/${proc_name}.count"
  count=$(cat "$state_file" 2>/dev/null || echo 0)
  # A truncated or corrupted counter file must not abort the run under
  # `set -e`; treat anything that is not a plain integer as zero.
  if [[ "$count" =~ ^[0-9]+$ ]]; then
    count=$((10#$count))   # force base 10 so "08" is not read as octal
  else
    count=0
  fi

  # NOTE: pgrep -f matches against full command lines, so any process whose
  # command line contains $proc_name counts as "alive".
  if pgrep -f "$proc_name" >/dev/null 2>&1; then
    # Process alive — reset counter
    if [[ "$count" -ne 0 ]]; then
      echo 0 > "$state_file"
      log "$proc_name is healthy — reset restart counter"
    fi
    continue
  fi

  # Process dead
  count=$((count + 1))
  echo "$count" > "$state_file"
  if [[ "$count" -le 3 ]]; then
    log "CRITICAL: $proc_name is dead (attempt $count/3). Restarting..."
    # eval is required because the configured command may contain shell
    # operators (cd && ..., redirections, '&'). The command list is
    # operator-supplied configuration, not untrusted input.
    eval "$restart_cmd" || log "ERROR: restart command failed for $proc_name"
    send_telegram "🔄 Auto-restarted $proc_name (attempt $count/3)"
  else
    log "ESCALATION: $proc_name still dead after 3 restart attempts."
    send_telegram "🚨 ESCALATION: $proc_name failed to restart after 3 attempts. Manual intervention required."
  fi
done

# Marker showing when this script last completed a full pass.
touch "${STATE_DIR}/auto_restart.last"

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env bash
# backup_pipeline.sh — Daily fleet backup pipeline (FLEET-008)
# Refs: timmy-home #561
set -euo pipefail
# Each run writes into its own timestamped directory under BACKUP_ROOT.
BACKUP_ROOT="/backups/timmy"
DATESTAMP=$(date +%Y%m%d-%H%M%S)
BACKUP_DIR="${BACKUP_ROOT}/${DATESTAMP}"
# Log directory and the file that log() appends to.
LOG_DIR="/var/log/timmy"
ALERT_LOG="${LOG_DIR}/backup_pipeline.log"
# Create the backup set and log directories; with `set -e` active a failure aborts the run.
mkdir -p "$BACKUP_DIR" "$LOG_DIR"
# Optional settings. Defaulting to empty keeps `set -u` from tripping:
# Telegram notifications and the offsite rsync are skipped when unset.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"
OFFSITE_TARGET="${OFFSITE_TARGET:-}"
# Write "[<ISO-8601 timestamp>] <message>" to stdout and append the same
# line to $ALERT_LOG.
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$(date -Iseconds)" "$1" | tee -a "$ALERT_LOG"
}
# Send a best-effort Telegram notification.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text
# Returns:   always 0; delivery failures are deliberately ignored so that a
#            Telegram outage never aborts the caller under `set -e`.
send_telegram() {
  local msg="$1"
  # Notifications are optional: do nothing unless both credentials are set.
  if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
    return 0
  fi
  # --data-urlencode percent-encodes each value, so '&', '+', '%' or newlines
  # in the message cannot corrupt the form body (plain -d sends them raw).
  # --max-time bounds the request so an unreachable API cannot hang the job.
  curl -s --max-time 15 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${msg}" >/dev/null 2>&1 || true
}
# Overall result of the run: 0 = clean, 1 = at least one step failed.
status=0

# Archive one directory tree into the current backup set and log the outcome.
# Globals:   BACKUP_DIR (read), status (set to 1 on a fatal tar error)
# Arguments: $1 - archive file name inside BACKUP_DIR
#            $2 - parent directory passed to tar -C
#            $3 - entry under the parent to archive
#            $4 - human-readable label for log lines
archive_dir() {
  local archive="$1" parent="$2" entry="$3" label="$4"
  local rc=0
  tar czf "${BACKUP_DIR}/${archive}" -C "$parent" "$entry" 2>/dev/null || rc=$?
  case "$rc" in
    0) log "Backed up ${label}" ;;
    # GNU tar exits 1 when files changed while being read; the archive was
    # still written, so note it without failing the whole run.
    1) log "Backed up ${label} (some files changed while being archived)" ;;
    # Anything else is a real failure and must not be reported as success.
    *)
      log "WARNING: backup of ${label} failed (tar exit ${rc})"
      status=1
      ;;
  esac
}

# --- Gitea repositories ---
if [[ -d /root/gitea ]]; then
  archive_dir "gitea-repos.tar.gz" /root gitea "Gitea repos"
fi

# --- Agent configs and state ---
for wiz in bezalel allegro ezra timmy; do
  if [[ -d "/root/wizards/${wiz}" ]]; then
    archive_dir "${wiz}-home.tar.gz" /root/wizards "$wiz" "${wiz} home"
  fi
done

# --- System configs ---
# A missing source is fine (not every host has both); a source that exists
# but cannot be copied is a failure.
configs_ok=1
if [[ -f /etc/crontab ]] \
  && ! cp /etc/crontab "${BACKUP_DIR}/crontab" 2>/dev/null; then
  configs_ok=0
fi
if [[ -d /etc/systemd/system ]] \
  && ! cp -r /etc/systemd/system "${BACKUP_DIR}/systemd" 2>/dev/null; then
  configs_ok=0
fi
if (( configs_ok )); then
  log "Backed up system configs"
else
  log "WARNING: backup of system configs failed"
  status=1
fi

# --- Evennia worlds (if present) ---
if [[ -d /root/evennia ]]; then
  archive_dir "evennia-worlds.tar.gz" /root evennia "Evennia worlds"
fi

# --- Manifest ---
find "$BACKUP_DIR" -type f > "${BACKUP_DIR}/manifest.txt"
log "Backup manifest written"

# --- Offsite sync ---
if [[ -n "$OFFSITE_TARGET" ]]; then
  if rsync -az --delete "${BACKUP_DIR}/" "${OFFSITE_TARGET}/${DATESTAMP}/" 2>/dev/null; then
    log "Offsite sync completed"
  else
    log "WARNING: Offsite sync failed"
    status=1
  fi
fi

# --- Retention: keep last 7 days ---
# Best-effort: a failed cleanup must not fail an otherwise good backup.
find "$BACKUP_ROOT" -mindepth 1 -maxdepth 1 -type d -mtime +7 -exec rm -rf {} + 2>/dev/null || true
log "Retention applied (7 days)"

if [[ "$status" -eq 0 ]]; then
  log "Backup pipeline completed: ${BACKUP_DIR}"
  send_telegram "✅ Daily backup completed: ${DATESTAMP}"
else
  log "Backup pipeline completed with WARNINGS: ${BACKUP_DIR}"
  send_telegram "⚠️ Daily backup completed with warnings: ${DATESTAMP}"
fi
exit "$status"

View File

@@ -23,7 +23,7 @@ def main():
if fleet.get("ezra") == "OFFLINE":
print("Ezra (Primary) is OFFLINE. Optimizing for local-only fallback...")
# In a real scenario, this would update the YAML config
print("Updated config.yaml: fallback_model -> local:hermes3")
print("Updated config.yaml: fallback_model -> ollama:gemma4:12b")
else:
print("Fleet health is optimal. Maintaining high-performance routing.")

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env bash
# fleet_health_probe.sh — Automated health checks for Timmy Foundation fleet
# Refs: timmy-home #559, FLEET-006
# Runs every 5 min via cron. Checks: SSH reachability, disk < 90%, memory < 90%, critical processes.
set -euo pipefail
# Log directory and the file that log() appends to.
LOG_DIR="/var/log/timmy"
ALERT_LOG="${LOG_DIR}/fleet_health.log"
# Directory holding the "fleet_health.last" marker touched at the end of each run.
HEARTBEAT_DIR="/var/lib/timmy/heartbeats"
# Create both directories up front; with `set -e` active a failure here aborts the run.
mkdir -p "$LOG_DIR" "$HEARTBEAT_DIR"
# Thresholds in percent. A check passes only while usage is strictly below
# the value. These are fixed here; they are not read from the environment.
DISK_THRESHOLD=90
MEM_THRESHOLD=90
# Hosts to probe (space-separated SSH hosts); override via FLEET_HOSTS.
FLEET_HOSTS="${FLEET_HOSTS:-143.198.27.163 104.131.15.18}"
# Critical processes that must be running locally (space-separated);
# override via CRITICAL_PROCESSES.
CRITICAL_PROCESSES="${CRITICAL_PROCESSES:-act_runner}"
# Write "[<ISO-8601 timestamp>] <message>" to stdout and append the same
# line to $ALERT_LOG.
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$(date -Iseconds)" "$1" | tee -a "$ALERT_LOG"
}
# Log a failed check: forwards $1 to log() with an "ALERT: " prefix.
alert() { log "ALERT: $1"; }

# Log a passed check: forwards $1 to log() with an "OK: " prefix.
ok() { log "OK: $1"; }
# Overall result of the probe: 0 = every check passed, 1 = at least one failed.
status=0

# --- SSH Reachability ---
# FLEET_HOSTS is a whitespace-separated list, so word splitting is intended.
# shellcheck disable=SC2086
for host in $FLEET_HOSTS; do
  # Try nc first, then bash's /dev/tcp as a fallback. The host is passed as a
  # positional argument so its value is never parsed as shell code by bash -c.
  if nc -z -w 5 "$host" 22 >/dev/null 2>&1 \
    || timeout 5 bash -c '</dev/tcp/"$1"/22' _ "$host" 2>/dev/null; then
    ok "SSH reachable: $host"
  else
    alert "SSH unreachable: $host"
    status=1
  fi
done

# --- Disk Usage ---
# df -P guarantees one line per filesystem, so NR==2 is always the data row
# even when the device name is long. An unparsable value is reported as a
# failure instead of silently passing the numeric test.
disk_usage="$(df -P / 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%')" || disk_usage=""
if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
  alert "Disk usage unknown: could not parse df output"
  status=1
elif [[ "$disk_usage" -lt "$DISK_THRESHOLD" ]]; then
  ok "Disk usage: ${disk_usage}%"
else
  alert "Disk usage critical: ${disk_usage}%"
  status=1
fi

# --- Memory Usage ---
# LC_ALL=C pins the "Mem:" row label, which free translates in other locales.
mem_usage="$(LC_ALL=C free 2>/dev/null \
  | awk '/^Mem:/ && $2 > 0 {printf("%.0f", $3/$2 * 100.0)}')" || mem_usage=""
if [[ ! "$mem_usage" =~ ^[0-9]+$ ]]; then
  alert "Memory usage unknown: could not parse free output"
  status=1
elif [[ "$mem_usage" -lt "$MEM_THRESHOLD" ]]; then
  ok "Memory usage: ${mem_usage}%"
else
  alert "Memory usage critical: ${mem_usage}%"
  status=1
fi

# --- Critical Processes ---
# CRITICAL_PROCESSES is whitespace-separated; pgrep -f matches each name
# against full command lines.
# shellcheck disable=SC2086
for proc in $CRITICAL_PROCESSES; do
  if pgrep -f "$proc" >/dev/null 2>&1; then
    ok "Process alive: $proc"
  else
    alert "Process missing: $proc"
    status=1
  fi
done

# --- Heartbeat Touch ---
# Touched on every completed run, pass or fail: it records that the probe
# itself executed.
touch "${HEARTBEAT_DIR}/fleet_health.last"

if [[ "$status" -eq 0 ]]; then
  log "Fleet health probe passed."
else
  log "Fleet health probe FAILED."
fi
exit "$status"

164
scripts/fleet_milestones.py Normal file
View File

@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
fleet_milestones.py — Print milestone messages when fleet achievements trigger.
Refs: timmy-home #557, FLEET-004
"""
import json
import os
import sys
from pathlib import Path
from datetime import datetime
STATE_FILE = Path("/var/lib/timmy/milestones.json")
LOG_FILE = Path("/var/log/timmy/fleet_milestones.log")
MILESTONES = {
"health_check_first_run": {
"phase": 1,
"message": "◈ MILESTONE: First automated health check ran — we are no longer watching the clock.",
},
"auto_restart_3am": {
"phase": 2,
"message": "◈ MILESTONE: A process failed at 3am and restarted itself before anyone woke up.",
},
"backup_first_success": {
"phase": 2,
"message": "◈ MILESTONE: First automated backup completed — fleet state is no longer ephemeral.",
},
"ci_green_main": {
"phase": 2,
"message": "◈ MILESTONE: CI pipeline kept main green for 24 hours straight.",
},
"pr_auto_merged": {
"phase": 2,
"message": "◈ MILESTONE: An agent PR passed review and merged without human hands.",
},
"dns_self_healed": {
"phase": 2,
"message": "◈ MILESTONE: DNS outage detected and resolved automatically.",
},
"runner_self_healed": {
"phase": 2,
"message": "◈ MILESTONE: CI runner died and resurrected itself within 60 seconds.",
},
"secrets_scan_clean": {
"phase": 2,
"message": "◈ MILESTONE: 7 consecutive days with zero leaked secrets detected.",
},
"local_inference_first": {
"phase": 3,
"message": "◈ MILESTONE: First fully local inference completed — no tokens left the building.",
},
"ollama_serving_fleet": {
"phase": 3,
"message": "◈ MILESTONE: Ollama serving models to all fleet wizards.",
},
"offline_docs_sync": {
"phase": 3,
"message": "◈ MILESTONE: Entire documentation tree synchronized without internet.",
},
"cross_agent_delegate": {
"phase": 3,
"message": "◈ MILESTONE: One wizard delegated a task to another and received a finished result.",
},
"backup_verified_restore": {
"phase": 4,
"message": "◈ MILESTONE: Backup restored and verified — disaster recovery is real.",
},
"vps_bootstrap_under_60": {
"phase": 4,
"message": "◈ MILESTONE: New VPS bootstrapped from bare metal in under 60 minutes.",
},
"zero_cloud_day": {
"phase": 4,
"message": "◈ MILESTONE: 24 hours with zero cloud API calls — total sovereignty achieved.",
},
"fleet_orchestrator_active": {
"phase": 5,
"message": "◈ MILESTONE: Fleet orchestrator actively balancing load across agents.",
},
"cell_isolation_proven": {
"phase": 5,
"message": "◈ MILESTONE: Agent cell isolation proven — one crash did not spread.",
},
"mission_bus_first": {
"phase": 5,
"message": "◈ MILESTONE: First cross-agent mission completed via the mission bus.",
},
"resurrection_pool_used": {
"phase": 5,
"message": "◈ MILESTONE: A dead wizard was detected and resurrected automatically.",
},
"infra_generates_revenue": {
"phase": 6,
"message": "◈ MILESTONE: Infrastructure generated its first dollar of revenue.",
},
"client_onboarded_unattended": {
"phase": 6,
"message": "◈ MILESTONE: Client onboarded without human intervention.",
},
"fleet_pays_for_itself": {
"phase": 6,
"message": "◈ MILESTONE: Fleet revenue exceeds operational cost — it breathes on its own.",
},
}
def load_state() -> dict:
    """Return the parsed contents of STATE_FILE, or an empty dict if the file does not exist."""
    if not STATE_FILE.exists():
        return {}
    raw = STATE_FILE.read_text()
    return json.loads(raw)
def save_state(state: dict):
    """Write ``state`` to STATE_FILE as 2-space-indented JSON, creating the parent directory if needed."""
    target_dir = STATE_FILE.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(state, indent=2)
    STATE_FILE.write_text(serialized)
def log(msg: str):
    """Print ``msg`` prefixed with a UTC ISO-8601 timestamp and append the same line to LOG_FILE."""
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    stamp = datetime.utcnow().isoformat()
    line = f"[{stamp}Z] {msg}"
    print(line)
    with LOG_FILE.open("a") as handle:
        handle.write(line + "\n")
def trigger(key: str, dry_run: bool = False):
    """Fire the milestone ``key`` once.

    Exits with status 1 if ``key`` is not in MILESTONES. If the milestone is
    already recorded in the state file, nothing is fired (a notice is printed
    unless ``dry_run`` is set). Otherwise the milestone message is passed to
    log(); the trigger time and phase are persisted only when ``dry_run`` is
    false.

    NOTE(review): the nesting of ``return`` and of the final log() call was
    reconstructed from de-indented source; confirm against the original file.
    """
    milestone = MILESTONES.get(key)
    if milestone is None:
        print(f"Unknown milestone: {key}", file=sys.stderr)
        sys.exit(1)

    state = load_state()
    already_fired = bool(state.get(key))
    if already_fired:
        if not dry_run:
            print(f"Milestone {key} already triggered. Skipping.")
        return

    if not dry_run:
        record = {
            "triggered_at": datetime.utcnow().isoformat() + "Z",
            "phase": milestone["phase"],
        }
        state[key] = record
        save_state(state)
    log(milestone["message"])
def list_all():
    """Print one line per entry in MILESTONES: key, phase number and message."""
    for key in MILESTONES:
        info = MILESTONES[key]
        phase, message = info["phase"], info["message"]
        print(f"{key} (phase {phase}): {message}")
def main():
    """CLI entry point: ``--list`` prints all milestones, ``--trigger KEY`` fires one
    (optionally with ``--dry-run``), and no arguments prints the help text."""
    import argparse

    parser = argparse.ArgumentParser(description="Fleet milestone tracker")
    parser.add_argument("--trigger", help="Trigger a milestone by key")
    parser.add_argument("--dry-run", action="store_true", help="Show but do not record")
    parser.add_argument("--list", action="store_true", help="List all milestones")
    args = parser.parse_args()

    # --list takes precedence when both --list and --trigger are given.
    if args.list:
        list_all()
        return
    if args.trigger:
        trigger(args.trigger, dry_run=args.dry_run)
        return
    parser.print_help()


if __name__ == "__main__":
    main()

View File

@@ -1,125 +0,0 @@
"""
Resource Tracking System for FLEET-005.
This script tracks Capacity, Uptime, and Innovation, enforcing a tension model.
"""
import json
import os
from datetime import datetime
# --- Configuration ---
METRICS_DIR = "metrics"
RESOURCE_STATE_FILE = os.path.join(METRICS_DIR, "resource_state.json")
CAPACITY_THRESHOLD_INNOVATION = 70.0 # Innovation generates when capacity < 70%
# --- Helper Functions ---
def load_resource_state():
    """Return the state stored in RESOURCE_STATE_FILE, or the initial state if the file is absent."""
    if os.path.exists(RESOURCE_STATE_FILE):
        with open(RESOURCE_STATE_FILE, "r") as handle:
            return json.load(handle)
    # First run: full capacity and uptime, no innovation, never executed.
    return {"capacity": 100.0, "uptime": 100.0, "innovation": 0.0, "last_run": None}
def save_resource_state(state):
    """Write ``state`` to RESOURCE_STATE_FILE as 4-space-indented JSON, creating METRICS_DIR if needed."""
    os.makedirs(METRICS_DIR, exist_ok=True)
    serialized = json.dumps(state, indent=4)
    with open(RESOURCE_STATE_FILE, "w") as handle:
        handle.write(serialized)
def calculate_fibonacci_milestone(current_uptime):
    """Return the lowest uptime milestone strictly above ``current_uptime``.

    Despite the function name, the milestones are a fixed ascending ladder of
    percentages, not a Fibonacci sequence. Returns None once ``current_uptime``
    has reached or passed the last milestone.
    """
    ladder = (95.0, 95.5, 96.0, 97.0, 98.0, 99.0, 99.9)  # Example milestones, can be expanded
    return next((step for step in ladder if current_uptime < step), None)
# --- Main Tracking Logic ---
def track_resources(fleet_improvements_cost, healthy_utilization_gain, service_uptime_percent):
    """
    Apply one tracking step to the persisted resource state and return it.

    Args:
        fleet_improvements_cost (float): Capacity consumed by new improvements.
        healthy_utilization_gain (float): Capacity generated by well-running processes.
        service_uptime_percent (float): Current uptime of services (0-100%).

    Returns:
        dict: The updated state, which has also been written back to disk.
    """
    state = load_resource_state()

    # Capacity: subtract cost, add gain, then clamp into [0, 100].
    raw_capacity = state["capacity"] - fleet_improvements_cost + healthy_utilization_gain
    state["capacity"] = max(0.0, min(100.0, raw_capacity))

    # Uptime is taken directly from the caller.
    state["uptime"] = service_uptime_percent

    # Innovation accrues only while capacity is below the threshold, growing
    # linearly with the distance below it (placeholder model).
    if state["capacity"] < CAPACITY_THRESHOLD_INNOVATION:
        shortfall = CAPACITY_THRESHOLD_INNOVATION - state["capacity"]
        state["innovation"] += shortfall * 0.1

    state["last_run"] = datetime.now().isoformat()
    save_resource_state(state)
    return state
def generate_dashboard_report(state):
    """Build the plain-text dashboard for ``state`` and return it as a string.

    ``state`` must provide "capacity", "uptime" and "innovation" (numbers);
    "last_run" is optional and shown as "N/A" when the key is missing.

    The single-line string literals below end in an explicit ``\\n`` escape;
    a literal line break inside them is a syntax error.
    """
    last_run = state.get("last_run", "N/A")
    report = f"""
--- Resource Tracking System Dashboard ---
Last Run: {last_run}
Capacity: {state["capacity"]:.2f}%
Uptime: {state["uptime"]:.2f}%
Innovation: {state["innovation"]:.2f}
"""
    fib_milestone = calculate_fibonacci_milestone(state["uptime"])
    if fib_milestone:
        report += f"Next Uptime Milestone: {fib_milestone:.2f}%\n"
    else:
        report += "All Uptime Milestones Achieved!\n"

    # Innovation of exactly 100 takes the "unblocked" branch.
    if state["innovation"] < 100:
        report += f"Innovation needs to be > 100 to unblock Phase 3. Current: {state['innovation']:.2f}\n"
    else:
        report += "Phase 3 is unblocked (Innovation > 100)!\n"
    report += "------------------------------------------"
    return report
def main():
    """Run one tracking step with placeholder inputs and print the dashboard
    followed by the Phase 3 blocked/unblocked notice."""
    # Placeholder values for daily inputs
    # In a real system, these would come from other monitoring systems or configurations
    daily_fleet_improvements_cost = 5.0  # Example: 5% capacity consumed daily
    daily_healthy_utilization_gain = 3.0  # Example: 3% capacity generated daily
    current_service_uptime = 96.5  # Example: 96.5% current uptime

    print("Running resource tracker...")
    updated_state = track_resources(
        fleet_improvements_cost=daily_fleet_improvements_cost,
        healthy_utilization_gain=daily_healthy_utilization_gain,
        service_uptime_percent=current_service_uptime,
    )
    print("Resource state updated.")
    print(generate_dashboard_report(updated_state))

    # Check for blocking Phase 3. The leading "\n" escape prints a blank line
    # before the notice; a literal line break inside the quotes is a syntax error.
    if updated_state["innovation"] < 100:
        print("\nWARNING: Phase 3 work is currently BLOCKED due to insufficient Innovation.")
    else:
        print("\nPhase 3 work is UNBLOCKED!")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,59 @@
#!/usr/bin/env python3
"""
telegram_thread_reporter.py — Route reports to Telegram threads (#895)
Usage:
python telegram_thread_reporter.py --topic ops --message "Heartbeat OK"
python telegram_thread_reporter.py --topic burn --message "Burn cycle done"
python telegram_thread_reporter.py --topic main --message "Escalation!"
"""
import argparse
import os
import sys
import urllib.request
import urllib.parse
import json
DEFAULT_THREADS = {
"ops": os.environ.get("TELEGRAM_OPS_THREAD_ID"),
"burn": os.environ.get("TELEGRAM_BURN_THREAD_ID"),
"main": None, # main channel = no thread id
}
def send_message(bot_token: str, chat_id: str, text: str, thread_id: str | None = None):
url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
data = {"chat_id": chat_id, "text": text, "parse_mode": "HTML"}
if thread_id:
data["message_thread_id"] = thread_id
payload = urllib.parse.urlencode(data).encode("utf-8")
req = urllib.request.Request(url, data=payload, headers={"Content-Type": "application/x-www-form-urlencoded"})
try:
with urllib.request.urlopen(req, timeout=15) as resp:
return json.loads(resp.read().decode("utf-8"))
except Exception as e:
return {"ok": False, "error": str(e)}
def main():
    """CLI entry point: send ``--message`` to the thread selected by ``--topic``.

    Exits with status 1 when TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID is missing
    from the environment, or when send_message reports a failure.
    """
    parser = argparse.ArgumentParser(description="Telegram thread reporter")
    parser.add_argument("--topic", required=True, choices=["ops", "burn", "main"])
    parser.add_argument("--message", required=True)
    args = parser.parse_args()

    token = os.environ.get("TELEGRAM_BOT_TOKEN")
    chat = os.environ.get("TELEGRAM_CHAT_ID")
    if not (token and chat):
        print("Missing TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID", file=sys.stderr)
        sys.exit(1)

    # The lookup yields None for "main" (and for unset env vars), which posts
    # to the chat without a message_thread_id.
    outcome = send_message(token, chat, args.message, DEFAULT_THREADS.get(args.topic))
    if not outcome.get("ok"):
        print(f"Failed: {outcome}", file=sys.stderr)
        sys.exit(1)
    print(f"Sent to {args.topic}")


if __name__ == "__main__":
    main()

View File

@@ -1,8 +1,33 @@
model:
default: kimi-for-coding
default: kimi-k2.5
provider: kimi-coding
toolsets:
- all
fallback_providers:
- provider: kimi-coding
model: kimi-k2.5
timeout: 120
reason: Kimi coding fallback (front of chain)
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
reason: Direct Anthropic fallback
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
base_url: https://openrouter.ai/api/v1
api_key_env: OPENROUTER_API_KEY
timeout: 120
reason: OpenRouter fallback
providers:
kimi-coding:
base_url: https://api.kimi.com/coding/v1
timeout: 60
max_retries: 3
anthropic:
timeout: 120
openrouter:
base_url: https://openrouter.ai/api/v1
timeout: 120
agent:
max_turns: 30
reasoning_effort: medium