timmy-home/scripts/big_brain_manager.py
Timmy 4093c74c19
feat(big-brain): auto-stop pod when idle (#577)
Cost control for the L40S pod ($0.79/hr).

Idle watchdog (cron every 15 min; a sketch follows this list):
- Tracks last inference request timestamp
- If idle > 30 min, stops pod via RunPod GraphQL API
- Logs stop/start events with timestamps to cost log
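
A minimal sketch of that watchdog pass, assuming the helpers that big_brain_manager.py imports further down (get_last_request_time, get_pod_status, stop_pod, log_event, DEFAULT_POD_ID); big_brain_idle_watchdog.py itself is not shown on this page, and the 30-minute threshold is taken from the description above:

from datetime import datetime, timezone
from big_brain_idle_watchdog import (
    get_last_request_time, get_pod_status, stop_pod, log_event,
    DEFAULT_POD_ID,
)

IDLE_LIMIT_MIN = 30  # stop after 30 idle minutes, per this commit message

def idle_check(pod_id: str = DEFAULT_POD_ID) -> None:
    """Illustrative cron entry point: stop the pod once it has been idle too long."""
    last = get_last_request_time()  # assumed to return an aware datetime, or None if never touched
    if last is None:
        return
    idle_min = (datetime.now(timezone.utc) - last).total_seconds() / 60
    if idle_min > IDLE_LIMIT_MIN and get_pod_status(pod_id).get("desiredStatus") == "RUNNING":
        stop_pod(pod_id)  # RunPod GraphQL mutation, per the description above
        log_event("stop", pod_id, f"idle {idle_min:.0f} min > {IDLE_LIMIT_MIN} min")

Because cron runs it every 15 minutes, each pass is a single stateless check; a missed run only delays the stop by one interval.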

Auto-resume manager (usage example after this list):
- Import before inference to ensure pod is RUNNING
- If stopped, resumes and polls until Ollama responds
- Updates timestamp on each request
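
For example, a call site in the inference path might look like this; run_inference and its body are hypothetical, while ensure_running and touch_request are the entry points defined in the file below:

from big_brain_manager import ensure_running, touch_request

def run_inference(prompt: str) -> None:
    state = ensure_running()  # resumes the pod if stopped, blocks until Ollama answers
    if state["action"] in ("error", "resume_failed", "timeout", "unexpected_status"):
        raise RuntimeError(f"Big Brain unavailable: {state}")
    touch_request()  # reset the idle clock for the watchdog
    # ... send the prompt to the pod's Ollama endpoint here ...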

Components:
- big_brain_idle_watchdog.py: idle check + pod stop
- big_brain_manager.py: auto-resume + status
- 11 tests covering all states and edge cases (one case sketched below)
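
Those tests are not part of this file; purely as an illustration of the kind of state coverage described, the resumed-from-stopped path might be exercised like this with unittest.mock:

from unittest import mock
import big_brain_manager as bbm

def test_resumes_stopped_pod():
    with mock.patch.object(bbm, "get_pod_status", return_value={"desiredStatus": "EXITED"}), \
         mock.patch.object(bbm, "resume_pod", return_value={"data": {}}), \
         mock.patch.object(bbm, "is_ollama_ready", return_value=True), \
         mock.patch.object(bbm, "log_event"), \
         mock.patch.object(bbm, "touch_last_request"), \
         mock.patch.object(bbm, "time"):  # skip the real 10 s poll sleep
        result = bbm.ensure_running("pod123")  # hypothetical pod ID
    assert result["action"] == "resumed"
    assert result["final_status"] == "RUNNING"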

Closes #577
2026-04-13 18:14:01 -04:00


#!/usr/bin/env python3
"""
Big Brain Manager — Auto-resume pod before inference.

Import this module or run standalone to ensure the Big Brain pod is
RUNNING before sending inference requests. If the pod is stopped,
resumes it and polls until ready.

Usage:
    python3 scripts/big_brain_manager.py --ensure-running  # resume if stopped
    python3 scripts/big_brain_manager.py --status          # just check status
    python3 scripts/big_brain_manager.py --stop            # manually stop
    python3 scripts/big_brain_manager.py --resume          # manually resume
    python3 scripts/big_brain_manager.py --json            # machine-readable

As a library:
    from big_brain_manager import ensure_running, touch_request
    ensure_running()   # blocks until pod is RUNNING
    touch_request()    # update idle timestamp
"""
import json
import os
import sys
import time
import urllib.request
import urllib.error
from datetime import datetime, timezone

# Reuse watchdog internals
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from big_brain_idle_watchdog import (
    get_pod_status, stop_pod, resume_pod, get_runpod_key,
    touch_last_request, get_last_request_time, log_event,
    DEFAULT_POD_ID, COST_PER_HOUR, TIMESTAMP_FILE,
)

OLLAMA_ENDPOINT = "https://{pod_id}-11434.proxy.runpod.net"
POLL_INTERVAL_SEC = 10
MAX_POLL_ATTEMPTS = 30  # 5 minutes max wait


def is_ollama_ready(pod_id: str) -> bool:
    """Check if Ollama is responding on the pod endpoint."""
    url = OLLAMA_ENDPOINT.format(pod_id=pod_id) + "/api/tags"
    try:
        req = urllib.request.Request(url, method="GET")
        with urllib.request.urlopen(req, timeout=10) as resp:
            return resp.status == 200
    except Exception:
        return False


def ensure_running(pod_id: str = DEFAULT_POD_ID, timeout_sec: int = 300) -> dict:
    """
    Ensure the Big Brain pod is running. Resume if stopped, wait until ready.
    Returns status dict with action taken and final state.
    """
    pod = get_pod_status(pod_id)
    status = pod.get("desiredStatus", "UNKNOWN")
    result = {"pod_id": pod_id, "initial_status": status, "action": "none"}

    if pod.get("error"):
        result["error"] = pod["error"]
        result["action"] = "error"
        return result

    if status == "RUNNING":
        # Verify Ollama is actually serving
        if is_ollama_ready(pod_id):
            result["action"] = "already_running"
            result["final_status"] = "RUNNING"
            touch_last_request()
            return result
        else:
            # Pod is running but Ollama not ready yet — wait
            result["action"] = "waiting_for_ollama"
    elif status in ("EXITED", "STOPPED"):
        # Resume the pod
        resume_result = resume_pod(pod_id)
        if "errors" in resume_result:
            result["action"] = "resume_failed"
            result["error"] = resume_result["errors"][0].get("message", "unknown")
            return result
        result["action"] = "resumed"
        log_event("resume", pod_id, "auto-resume before inference")
    else:
        result["action"] = "unexpected_status"
        result["final_status"] = status
        return result

    # Poll until Ollama is ready
    attempts = 0
    while attempts < (timeout_sec / POLL_INTERVAL_SEC):
        time.sleep(POLL_INTERVAL_SEC)
        attempts += 1
        if is_ollama_ready(pod_id):
            result["final_status"] = "RUNNING"
            result["wait_seconds"] = attempts * POLL_INTERVAL_SEC
            touch_last_request()
            return result

    result["action"] = "timeout"
    result["final_status"] = get_pod_status(pod_id).get("desiredStatus", "UNKNOWN")
    return result


def touch_request():
    """Public wrapper for updating the last request timestamp."""
    touch_last_request()


def main():
    import argparse
    parser = argparse.ArgumentParser(description="Big Brain pod manager — auto-resume for inference")
    parser.add_argument("--pod-id", default=DEFAULT_POD_ID, help="RunPod pod ID")
    parser.add_argument("--ensure-running", action="store_true", help="Resume if stopped, wait for ready")
    parser.add_argument("--status", action="store_true", help="Show pod status")
    parser.add_argument("--stop", action="store_true", help="Stop the pod")
    parser.add_argument("--resume", action="store_true", help="Resume the pod")
    parser.add_argument("--touch", action="store_true", help="Update last request timestamp")
    parser.add_argument("--json", dest="as_json", action="store_true", help="JSON output")
    args = parser.parse_args()

    if args.touch:
        touch_request()
        print(f"Timestamp updated: {TIMESTAMP_FILE}")
        return

    if args.ensure_running:
        result = ensure_running(args.pod_id)
        if args.as_json:
            print(json.dumps(result, indent=2))
        else:
            action = result["action"]
            if action == "already_running":
                print(f"[Big Brain] Already RUNNING and Ollama ready")
            elif action == "resumed":
                wait = result.get("wait_seconds", 0)
                print(f"[Big Brain] Resumed pod, Ollama ready in {wait}s")
            elif action == "resume_failed":
                print(f"[Big Brain] RESUME FAILED: {result.get('error', 'unknown')}", file=sys.stderr)
                sys.exit(1)
            elif action == "timeout":
                print(f"[Big Brain] Timed out waiting for Ollama", file=sys.stderr)
                sys.exit(1)
        return

    if args.stop:
        result = stop_pod(args.pod_id)
        if "errors" in result:
            print(f"Stop failed: {result['errors'][0].get('message')}", file=sys.stderr)
            sys.exit(1)
        log_event("stop", args.pod_id, "manual stop")
        print(f"Pod {args.pod_id} stopped")
        return

    if args.resume:
        result = resume_pod(args.pod_id)
        if "errors" in result:
            print(f"Resume failed: {result['errors'][0].get('message')}", file=sys.stderr)
            sys.exit(1)
        log_event("resume", args.pod_id, "manual resume")
        print(f"Pod {args.pod_id} resuming...")
        return

    # Default: status
    pod = get_pod_status(args.pod_id)
    last_req = get_last_request_time()
    ollama_ok = is_ollama_ready(args.pod_id) if pod.get("desiredStatus") == "RUNNING" else False

    if args.as_json:
        print(json.dumps({
            "pod": pod,
            "ollama_ready": ollama_ok,
            "last_request": last_req.isoformat() if last_req else None,
        }, indent=2))
    else:
        print(f"Pod: {args.pod_id}")
        print(f"Status: {pod.get('desiredStatus', 'UNKNOWN')}")
        print(f"Ollama: {'ready' if ollama_ok else 'not responding'}")
        print(f"Last request: {last_req.isoformat() if last_req else 'never'}")
        print(f"Cost/hr: ${COST_PER_HOUR}")
        if pod.get("uptimeSeconds"):
            hrs = pod["uptimeSeconds"] / 3600
            print(f"Uptime: {hrs:.1f} hrs (${hrs * COST_PER_HOUR:.2f})")


if __name__ == "__main__":
    main()