271 lines
8.5 KiB
Python
271 lines
8.5 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Deploy GPU instance on RunPod for Big Brain Gemma 4."""
|
||
|
|
import subprocess, json, os, time, requests
|
||
|
|
|
||
|
|
# --- Credentials -------------------------------------------------------------
# Read secrets from local config files. Context managers close the file
# handles deterministically (the original bare open().read() left them to GC).
with open(os.path.expanduser('~/.config/runpod/access_key')) as _f:
    RUNPOD_API_KEY = _f.read().strip()
with open(os.path.expanduser('~/.hermes/gitea_token_vps')) as _f:
    GITEA_TOKEN = _f.read().strip()

# Base URL of the Gitea repo API, used to post deployment status comments.
GITEA_FORGE = 'https://forge.alexanderwhitestone.com/api/v1/repos/Timmy_Foundation/timmy-home'
|
||
|
|
|
||
|
|
def log(msg):
    """Print *msg* to stdout, prefixed with the current HH:MM:SS timestamp."""
    stamp = time.strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}")
|
||
|
|
|
||
|
|
def comment_issue(issue_num, body):
    """Post *body* as a comment on Gitea issue *issue_num* (best-effort).

    Uses the already-imported ``requests`` library instead of shelling out to
    ``curl`` via subprocess. Failures (network errors, timeouts) are swallowed
    deliberately: a status comment must never abort the deployment flow — the
    original curl call likewise discarded its output, although its
    ``subprocess.run(timeout=10)`` could still raise TimeoutExpired.

    Args:
        issue_num: Gitea issue number in the timmy-home repo.
        body: Markdown comment body.
    """
    try:
        requests.post(
            f'{GITEA_FORGE}/issues/{issue_num}/comments',
            headers={
                'Authorization': f'token {GITEA_TOKEN}',
                'Content-Type': 'application/json',
            },
            json={"body": body},
            timeout=10,
        )
    except Exception:
        # Deliberate broad catch: commenting is informational only.
        pass
|
||
|
|
|
||
|
|
def graphql_query(query, variables=None):
    """Execute a GraphQL query/mutation against the RunPod API.

    Args:
        query: GraphQL document as a string.
        variables: Optional dict of GraphQL variables.

    Returns:
        The parsed JSON response body as a dict.
    """
    body = {"query": query}
    if variables:
        body["variables"] = variables

    headers = {
        'Authorization': f'Bearer {RUNPOD_API_KEY}',
        'Content-Type': 'application/json',
    }
    response = requests.post(
        'https://api.runpod.io/graphql',
        headers=headers,
        json=body,
        timeout=30,
    )
    return response.json()
|
||
|
|
|
||
|
|
def deploy_pod(gpu_type, name, cloud_type="COMMUNITY"):
    """Request an on-demand RunPod pod running the Ollama image.

    Args:
        gpu_type: RunPod GPU type id (e.g. "NVIDIA RTX 4090").
        name: Display name for the new pod.
        cloud_type: "COMMUNITY" or "SECURE".

    Returns:
        The parsed GraphQL response dict on success, or ``{"error": ...}``
        if the HTTP call itself raised.
    """
    mutation = """
    mutation($input: PodFindAndDeployOnDemandInput!) {
        podFindAndDeployOnDemand(input: $input) {
            id
            desiredStatus
            machineId
            warning
        }
    }
    """
    pod_input = {
        "cloudType": cloud_type,
        "gpuCount": 1,
        "gpuTypeId": gpu_type,
        "name": name,
        "containerDiskInGb": 100,
        "imageName": "runpod/ollama:latest",
        "ports": "11434/http",
        "volumeInGb": 50,
        "volumeMountPath": "/workspace",
    }

    try:
        return graphql_query(mutation, {"input": pod_input})
    except Exception as exc:
        return {"error": str(exc)}
|
||
|
|
|
||
|
|
def check_if_endpoint_exists(name):
    """Return RunPod endpoints whose name contains *name* (case-insensitive).

    Args:
        name: Endpoint name fragment to search for.

    Returns:
        List of matching endpoint dicts ({'id': ..., 'name': ...}); empty
        list when nothing matches or the query errored.
    """
    query = "{ endpoints { id name } }"
    result = graphql_query(query)
    # `or` guards: a GraphQL error response carries an explicit "data": null,
    # in which case .get('data', {}) returns None (key present, default
    # unused) and the chained .get would raise AttributeError.
    endpoints = (result.get('data') or {}).get('endpoints') or []
    matching = [e for e in endpoints if name.lower() in e.get('name', '').lower()]
    return matching
|
||
|
|
|
||
|
|
# --- Main deployment logic ---------------------------------------------------
log("Starting Big Brain GPU deployment")
# Log only a short prefix for identification. The original printed 30
# characters of the secret (first 20 + last 10), leaking most of the key
# into logs.
log(f"RunPod API key: {RUNPOD_API_KEY[:6]}... (redacted)")

# Step 1: Get available GPU types
log("\n=== Step 1: Getting GPU types ===")
gpu_query = "{ gpuTypes { id displayName memoryInGb secureCloud communityCloud } }"
result = graphql_query(gpu_query)

# `or` guards: GraphQL error responses carry an explicit "data": null, so
# .get('data', {}) alone would return None and crash the chained .get.
gpus = (result.get('data') or {}).get('gpuTypes') or []
log(f"Total GPU types: {len(gpus)}")

# Filter GPUs with 24GB+ VRAM for Gemma 3 27B
suitable_gpus = [gpu for gpu in gpus if gpu.get('memoryInGb', 0) >= 24]

log("\nGPUs with 24GB+ VRAM:")
for gpu in suitable_gpus[:15]:
    log(f"  {gpu.get('id')}: {gpu.get('displayName')} - {gpu.get('memoryInGb')}GB, Secure: {gpu.get('secureCloud')}, Community: {gpu.get('communityCloud')}")
|
||
|
|
|
||
|
|
# Step 2: Try to find GPU availability
# The error was "no instances available" - we need to find available ones
# The GPU ID format matters - try the ones from the list

# Name given to the pod on RunPod; also used when retrying other GPU types.
pod_name = "big-brain-timmy"

# Try different GPUs in order of preference (cheapest first with enough memory)
# Each entry is (RunPod gpuTypeId, cloudType). Community cloud is tried first
# (cheaper), then the same GPUs on the secure cloud as a fallback.
gpu_attempts = [
    ("NVIDIA RTX 4090", "COMMUNITY"),     # 24GB, ~$0.44/hr
    ("NVIDIA A40", "COMMUNITY"),          # 48GB
    ("NVIDIA RTX 3090", "COMMUNITY"),     # 24GB
    ("NVIDIA RTX 3090 Ti", "COMMUNITY"),  # 24GB
    ("NVIDIA L40S", "COMMUNITY"),         # 48GB
    ("NVIDIA A6000", "COMMUNITY"),        # 48GB
    # Try secure cloud
    ("NVIDIA RTX 4090", "SECURE"),
    ("NVIDIA A40", "SECURE"),
    ("NVIDIA L40S", "SECURE"),
]
|
||
|
|
|
||
|
|
# Step 2: walk the preference list until one GPU type deploys successfully.
log("\n=== Step 2: Attempting deployment ===")
deployed = False  # flipped to True on the first successful pod creation
for gpu_type, cloud_type in gpu_attempts:
    log(f"Trying {gpu_type} ({cloud_type})...")
    result = deploy_pod(gpu_type, pod_name, cloud_type)

    errors = result.get('errors', [])
    # `or` guards: GraphQL error responses carry an explicit "data": null,
    # so .get('data', {}) alone would return None and crash the chained .get.
    # NOTE: `data`, `gpu_type` and `cloud_type` intentionally survive this
    # loop -- the post-deployment step below reads them after `break`.
    data = (result.get('data') or {}).get('podFindAndDeployOnDemand') or {}

    if errors:
        for err in errors:
            msg = err.get('message', '')
            if 'no longer any instances' in msg or 'no instances' in msg:
                log("  No instances available")
            elif 'invalid' in msg.lower() or 'not found' in msg.lower():
                log(f"  GPU type not found: {msg[:100]}")
            else:
                log(f"  Error: {msg[:100]}")
    elif data and data.get('id'):
        log(f"  ✅ SUCCESS! Pod ID: {data.get('id')}")
        log(f"  Machine ID: {data.get('machineId')}")
        log(f"  Status: {data.get('desiredStatus')}")
        deployed = True
        break
    else:
        # Unexpected response shape (also hits deploy_pod's {"error": ...}).
        log(f"  Response: {json.dumps(result)[:200]}")
|
||
|
|
|
||
|
|
if deployed:
    # `data`, `gpu_type`, `cloud_type` come from the successful iteration of
    # the deployment loop above.
    pod_id = data.get('id')

    # Wait for pod to be running
    log(f"\n=== Step 3: Waiting for pod {pod_id} to start ===")
    pod_status_query = """
    query($podId: String!) {
        pod(id: $podId) {
            id
            desiredStatus
            runtimeStatus
            machineId
            ports
        }
    }
    """

    for attempt in range(30):  # Wait up to 15 minutes
        time.sleep(30)
        result = graphql_query(pod_status_query, {"podId": pod_id})
        # NOTE(review): if the API returns "data": null here, .get('data', {})
        # yields None and the chained .get raises -- confirm error shape.
        pod = result.get('data', {}).get('pod', {})
        runtime = pod.get('runtimeStatus', 'unknown')
        desired = pod.get('desiredStatus', 'unknown')
        log(f"  Attempt {attempt+1}: desired={desired}, runtime={runtime}")

        if runtime == 'RUNNING':
            log(f"  ✅ Pod is RUNNING!")

            # Get the IP/port
            # NOTE(review): `ip` already embeds "-11434" in the proxy
            # hostname, yet ":11434" is appended again below -- confirm which
            # of the two endpoint formats RunPod's proxy actually serves.
            ip = f"{pod_id}-11434.proxy.runpod.net"
            log(f"  Ollama endpoint: http://{ip}:11434")
            log(f"  Ollama endpoint: http://{pod_id}.proxy.runpod.net:11434")

            # Comment on Gitea tickets
            # f-string template: {{...}} renders as literal {...} in the
            # curl example; trailing \\ renders as a single backslash.
            comment_text = f"""# ✅ SUCCESS: GPU Instance Deployed

## Pod Details
- **Pod ID:** {pod_id}
- **GPU:** {gpu_type} ({cloud_type} cloud)
- **Status:** RUNNING
- **Endpoint:** http://{pod_id}.proxy.runpod.net:11434

## Next Steps

1. **SSH into pod:**
   ```bash
   ssh root@{pod_id}.proxy.runpod.net
   ```

2. **Pull Gemma 3 27B:**
   ```bash
   ollama pull gemma3:27b-instruct-q4_K_M
   ```

3. **Verify Ollama is working:**
   ```bash
   curl http://localhost:11434/api/tags
   ```

4. **Test inference:**
   ```bash
   curl http://localhost:11434/api/chat \\
     -H "Content-Type: application/json" \\
     -d '{{"model": "gemma3:27b-instruct-q4_K_M", "messages": [{{"role": "user", "content": "Hello from Timmy"}}]}}'
   ```

5. **Wire to Mac Hermes:**
   Add to `~/.hermes/config.yaml`:
   ```yaml
   providers:
     big_brain:
       base_url: "http://{pod_id}.proxy.runpod.net:11434/v1"
       api_key: ""
       model: "gemma3:27b-instruct-q4_K_M"
   ```

6. **Test Hermes:**
   ```bash
   hermes chat --model gemma3:27b-instruct-q4_K_M --provider big_brain
   ```"""

    comment_issue(543, comment_text)
            # Ticket 544 gets the same text retargeted at the Bezalel host.
            comment_issue(544, comment_text.replace("Timmy", "Bezalel").replace("Mac Hermes", "Bezalel Hermes"))

            log("\n🎉 Big Brain GPU deployed successfully!")
            log(f"Pod: {pod_id}")
            log(f"Endpoint: http://{pod_id}.proxy.runpod.net:11434")
            log(f"Gitea tickets updated with deployment details")
            break
        elif runtime == 'ERROR' or desired == 'TERMINATED' or desired == 'SUSPENDED':
            log(f"  ❌ Pod failed: runtime={runtime}, desired={desired}")
            break

    # `runtime` is always bound here: range(30) guarantees at least one pass.
    if runtime != 'RUNNING':
        log(f"\n⚠️ Pod is not running after waiting. Check RunPod dashboard.")
else:
    log("\n❌ No GPU instances available on RunPod")
    log("Try Vertex AI or check back later")

    # Comment on tickets
    comment_text = """# Deployment Status: RunPod Failed

## Issue
No GPU instances available on RunPod. All GPU types returned "no instances available" error.

## Alternatives
1. **Vertex AI** - Google Cloud's managed Gemma endpoints (see ticket for instructions)
2. **Lambda Labs** - Another GPU cloud provider
3. **Vast.ai** - Community GPU marketplace
4. **Wait for RunPod** - Check back in a few hours"""

    comment_issue(543, comment_text)
    comment_issue(544, comment_text)
|
||
|
|
|
||
|
|
# --- Trailing residue (commented out) ----------------------------------------
# The lines below leaked in from the generator that wrote this script: an
# unterminated triple-quoted string opener plus a call to `write_file`, which
# is not defined in this file. Left as comments so the module parses; safe to
# delete entirely.
#
# """
#
# Write the deployment script
# write_file('~/.timmy/big_brain_deploy.py', script_content)
#
# # Also run it (with timeout)
# print("Running deployment script... (will check Gitea tickets for results in parallel)")
|