big_brain_deploy.py

#!/usr/bin/env python3
"""Deploy GPU instance on RunPod for Big Brain Gemma 4."""
import subprocess, json, os, time, requests

# Read RunPod API key
RUNPOD_API_KEY = open(os.path.expanduser('~/.config/runpod/access_key')).read().strip()
GITEA_TOKEN = open(os.path.expanduser('~/.hermes/gitea_token_vps')).read().strip()
GITEA_FORGE = 'https://forge.alexanderwhitestone.com/api/v1/repos/Timmy_Foundation/timmy-home'

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}")

def comment_issue(issue_num, body):
    """Add comment to Gitea issue."""
    subprocess.run(
        ['curl', '-s', '-X', 'POST', f'{GITEA_FORGE}/issues/{issue_num}/comments',
         '-H', f'Authorization: token {GITEA_TOKEN}',
         '-H', 'Content-Type: application/json',
         '-d', json.dumps({"body": body})],
        capture_output=True, timeout=10
    )

def graphql_query(query, variables=None):
    """Run GraphQL query against RunPod API."""
    payload = {"query": query}
    if variables:
        payload["variables"] = variables
    
    r = requests.post(
        'https://api.runpod.io/graphql',
        headers={
            'Authorization': f'Bearer {RUNPOD_API_KEY}',
            'Content-Type': 'application/json',
        },
        json=payload,
        timeout=30
    )
    return r.json()

def deploy_pod(gpu_type, name, cloud_type="COMMUNITY"):
    """Deploy a RunPod pod with Ollama."""
    query = """
    mutation($input: PodFindAndDeployOnDemandInput!) {
      podFindAndDeployOnDemand(input: $input) {
        id
        desiredStatus
        machineId
        warning
      }
    }
    """
    
    variables = {
        "input": {
            "cloudType": cloud_type,
            "gpuCount": 1,
            "gpuTypeId": gpu_type,
            "name": name,
            "containerDiskInGb": 100,
            "imageName": "runpod/ollama:latest",
            "ports": "11434/http",
            "volumeInGb": 50,
            "volumeMountPath": "/workspace",
        }
    }
    
    try:
        result = graphql_query(query, variables)
        return result
    except Exception as e:
        return {"error": str(e)}

def check_if_endpoint_exists(name):
    """Check if endpoint already exists."""
    query = "{ endpoints { id name } }"
    result = graphql_query(query)
    endpoints = result.get('data', {}).get('endpoints', [])
    matching = [e for e in endpoints if name.lower() in e.get('name', '').lower()]
    return matching

# Main deployment logic
log("Starting Big Brain GPU deployment")
log(f"RunPod API key: {RUNPOD_API_KEY[:20]}...{RUNPOD_API_KEY[-10:]}")

# Step 1: Get available GPU types
log("\n=== Step 1: Getting GPU types ===")
gpu_query = "{ gpuTypes { id displayName memoryInGb secureCloud communityCloud } }"
result = graphql_query(gpu_query)

gpus = result.get('data', {}).get('gpuTypes', [])
log(f"Total GPU types: {len(gpus)}")

# Filter GPUs with 24GB+ VRAM for Gemma 3 27B
suitable_gpus = []
for gpu in gpus:
    mem = gpu.get('memoryInGb', 0)
    if mem >= 24:
        suitable_gpus.append(gpu)
        
log(f"\nGPUs with 24GB+ VRAM:")
for gpu in suitable_gpus[:15]:
    log(f"  {gpu.get('id')}: {gpu.get('displayName')} - {gpu.get('memoryInGb')}GB, Secure: {gpu.get('secureCloud')}, Community: {gpu.get('communityCloud')}")

# Step 2: Try to find GPU availability
# The error was "no instances available" - we need to find available ones
# The GPU ID format matters - try the ones from the list

pod_name = "big-brain-timmy"

# Try different GPUs in order of preference (cheapest first with enough memory)
gpu_attempts = [
    ("NVIDIA RTX 4090", "COMMUNITY"),    # 24GB, ~$0.44/hr
    ("NVIDIA A40", "COMMUNITY"),          # 48GB
    ("NVIDIA RTX 3090", "COMMUNITY"),    # 24GB
    ("NVIDIA RTX 3090 Ti", "COMMUNITY"), # 24GB
    ("NVIDIA L40S", "COMMUNITY"),        # 48GB
    ("NVIDIA A6000", "COMMUNITY"),       # 48GB
    # Try secure cloud
    ("NVIDIA RTX 4090", "SECURE"),
    ("NVIDIA A40", "SECURE"),
    ("NVIDIA L40S", "SECURE"),
]

log("\n=== Step 2: Attempting deployment ===")
deployed = False
for gpu_type, cloud_type in gpu_attempts:
    log(f"Trying {gpu_type} ({cloud_type})...")
    result = deploy_pod(gpu_type, pod_name, cloud_type)
    
    errors = result.get('errors', [])
    data = result.get('data', {}).get('podFindAndDeployOnDemand', {})
    
    if errors:
        for err in errors:
            msg = err.get('message', '')
            if 'no longer any instances' in msg or 'no instances' in msg:
                log(f"  No instances available")
            elif 'invalid' in msg.lower() or 'not found' in msg.lower():
                log(f"  GPU type not found: {msg[:100]}")
            else:
                log(f"  Error: {msg[:100]}")
    elif data and data.get('id'):
        log(f"  ✅ SUCCESS! Pod ID: {data.get('id')}")
        log(f"     Machine ID: {data.get('machineId')}")
        log(f"     Status: {data.get('desiredStatus')}")
        deployed = True
        break
    else:
        log(f"  Response: {json.dumps(result)[:200]}")

if deployed:
    pod_id = data.get('id')
    
    # Wait for pod to be running
    log(f"\n=== Step 3: Waiting for pod {pod_id} to start ===")
    pod_status_query = """
    query($podId: String!) {
      pod(id: $podId) {
        id
        desiredStatus
        runtimeStatus
        machineId
        ports
      }
    }
    """
    
    for attempt in range(30):  # Wait up to 15 minutes
        time.sleep(30)
        result = graphql_query(pod_status_query, {"podId": pod_id})
        pod = result.get('data', {}).get('pod', {})
        runtime = pod.get('runtimeStatus', 'unknown')
        desired = pod.get('desiredStatus', 'unknown')
        log(f"  Attempt {attempt+1}: desired={desired}, runtime={runtime}")
        
        if runtime == 'RUNNING':
            log(f"  ✅ Pod is RUNNING!")
            
            # Get the IP/port
            ip = f"{pod_id}-11434.proxy.runpod.net"
            log(f"  Ollama endpoint: http://{ip}:11434")
            log(f"  Ollama endpoint: http://{pod_id}.proxy.runpod.net:11434")
            
            # Comment on Gitea tickets
            comment_text = f"""# ✅ SUCCESS: GPU Instance Deployed

## Pod Details
- **Pod ID:** {pod_id}
- **GPU:** {gpu_type} ({cloud_type} cloud)
- **Status:** RUNNING
- **Endpoint:** http://{pod_id}.proxy.runpod.net:11434

## Next Steps

1. **SSH into pod:**
```bash
ssh root@{pod_id}.proxy.runpod.net
```

2. **Pull Gemma 3 27B:**
```bash
ollama pull gemma3:27b-instruct-q4_K_M
```

3. **Verify Ollama is working:**
```bash
curl http://localhost:11434/api/tags
```

4. **Test inference:**
```bash
curl http://localhost:11434/api/chat \\
  -H "Content-Type: application/json" \\
  -d '{{"model": "gemma3:27b-instruct-q4_K_M", "messages": [{{"role": "user", "content": "Hello from Timmy"}}]}}'
```

5. **Wire to Mac Hermes:**
Add to `~/.hermes/config.yaml`:
```yaml
providers:
  big_brain:
    base_url: "http://{pod_id}.proxy.runpod.net:11434/v1"
    api_key: ""
    model: "gemma3:27b-instruct-q4_K_M"
```

6. **Test Hermes:**
```bash
hermes chat --model gemma3:27b-instruct-q4_K_M --provider big_brain
```"""
            
            comment_issue(543, comment_text)
            comment_issue(544, comment_text.replace("Timmy", "Bezalel").replace("Mac Hermes", "Bezalel Hermes"))
            
            log("\n🎉 Big Brain GPU deployed successfully!")
            log(f"Pod: {pod_id}")
            log(f"Endpoint: http://{pod_id}.proxy.runpod.net:11434")
            log(f"Gitea tickets updated with deployment details")
            break
        elif runtime == 'ERROR' or desired == 'TERMINATED' or desired == 'SUSPENDED':
            log(f"  ❌ Pod failed: runtime={runtime}, desired={desired}")
            break
    
    if runtime != 'RUNNING':
        log(f"\n⚠️ Pod is not running after waiting. Check RunPod dashboard.")
else:
    log("\n❌ No GPU instances available on RunPod")
    log("Try Vertex AI or check back later")
    
    # Comment on tickets
    comment_text = """# Deployment Status: RunPod Failed

## Issue
No GPU instances available on RunPod. All GPU types returned "no instances available" error.

## Alternatives
1. **Vertex AI** - Google Cloud's managed Gemma endpoints (see ticket for instructions)
2. **Lambda Labs** - Another GPU cloud provider
3. **Vast.ai** - Community GPU marketplace
4. **Wait for RunPod** - Check back in a few hours"""
    
    comment_issue(543, comment_text)
    comment_issue(544, comment_text)

"""

Write the deployment script
write_file('~/.timmy/big_brain_deploy.py', script_content)

# Also run it (with timeout)
print("Running deployment script... (will check Gitea tickets for results in parallel)")
Tick #1520 - Timmy climbs the Tower. The servers hum. \| Bezalel examines the anvil: a thousand scars. \| Allegro visits the Tower. Reads the logs. (+5 more) 2026-04-06 22:42:40 -04:00			`#!/usr/bin/env python3`
			`"""Deploy GPU instance on RunPod for Big Brain Gemma 4."""`
			`import subprocess, json, os, time, requests`

			`# Read RunPod API key`
			`RUNPOD_API_KEY = open(os.path.expanduser('~/.config/runpod/access_key')).read().strip()`
			`GITEA_TOKEN = open(os.path.expanduser('~/.hermes/gitea_token_vps')).read().strip()`
			`GITEA_FORGE = 'https://forge.alexanderwhitestone.com/api/v1/repos/Timmy_Foundation/timmy-home'`

			`def log(msg):`
			`print(f"[{time.strftime('%H:%M:%S')}] {msg}")`

			`def comment_issue(issue_num, body):`
			`"""Add comment to Gitea issue."""`
			`subprocess.run(`
			`['curl', '-s', '-X', 'POST', f'{GITEA_FORGE}/issues/{issue_num}/comments',`
			`'-H', f'Authorization: token {GITEA_TOKEN}',`
			`'-H', 'Content-Type: application/json',`
			`'-d', json.dumps({"body": body})],`
			`capture_output=True, timeout=10`
			`)`

			`def graphql_query(query, variables=None):`
			`"""Run GraphQL query against RunPod API."""`
			`payload = {"query": query}`
			`if variables:`
			`payload["variables"] = variables`

			`r = requests.post(`
			`'https://api.runpod.io/graphql',`
			`headers={`
			`'Authorization': f'Bearer {RUNPOD_API_KEY}',`
			`'Content-Type': 'application/json',`
			`},`
			`json=payload,`
			`timeout=30`
			`)`
			`return r.json()`

			`def deploy_pod(gpu_type, name, cloud_type="COMMUNITY"):`
			`"""Deploy a RunPod pod with Ollama."""`
			`query = """`
			`mutation($input: PodFindAndDeployOnDemandInput!) {`
			`podFindAndDeployOnDemand(input: $input) {`
			`id`
			`desiredStatus`
			`machineId`
			`warning`
			`}`
			`}`
			`"""`

			`variables = {`
			`"input": {`
			`"cloudType": cloud_type,`
			`"gpuCount": 1,`
			`"gpuTypeId": gpu_type,`
			`"name": name,`
			`"containerDiskInGb": 100,`
			`"imageName": "runpod/ollama:latest",`
			`"ports": "11434/http",`
			`"volumeInGb": 50,`
			`"volumeMountPath": "/workspace",`
			`}`
			`}`

			`try:`
			`result = graphql_query(query, variables)`
			`return result`
			`except Exception as e:`
			`return {"error": str(e)}`

			`def check_if_endpoint_exists(name):`
			`"""Check if endpoint already exists."""`
			`query = "{ endpoints { id name } }"`
			`result = graphql_query(query)`
			`endpoints = result.get('data', {}).get('endpoints', [])`
			`matching = [e for e in endpoints if name.lower() in e.get('name', '').lower()]`
			`return matching`

			`# Main deployment logic`
			`log("Starting Big Brain GPU deployment")`
			`log(f"RunPod API key: {RUNPOD_API_KEY[:20]}...{RUNPOD_API_KEY[-10:]}")`

			`# Step 1: Get available GPU types`
			`log("\n=== Step 1: Getting GPU types ===")`
			`gpu_query = "{ gpuTypes { id displayName memoryInGb secureCloud communityCloud } }"`
			`result = graphql_query(gpu_query)`

			`gpus = result.get('data', {}).get('gpuTypes', [])`
			`log(f"Total GPU types: {len(gpus)}")`

			`# Filter GPUs with 24GB+ VRAM for Gemma 3 27B`
			`suitable_gpus = []`
			`for gpu in gpus:`
			`mem = gpu.get('memoryInGb', 0)`
			`if mem >= 24:`
			`suitable_gpus.append(gpu)`

			`log(f"\nGPUs with 24GB+ VRAM:")`
			`for gpu in suitable_gpus[:15]:`
			`log(f" {gpu.get('id')}: {gpu.get('displayName')} - {gpu.get('memoryInGb')}GB, Secure: {gpu.get('secureCloud')}, Community: {gpu.get('communityCloud')}")`

			`# Step 2: Try to find GPU availability`
			`# The error was "no instances available" - we need to find available ones`
			`# The GPU ID format matters - try the ones from the list`

			`pod_name = "big-brain-timmy"`

			`# Try different GPUs in order of preference (cheapest first with enough memory)`
			`gpu_attempts = [`
			`("NVIDIA RTX 4090", "COMMUNITY"), # 24GB, ~$0.44/hr`
			`("NVIDIA A40", "COMMUNITY"), # 48GB`
			`("NVIDIA RTX 3090", "COMMUNITY"), # 24GB`
			`("NVIDIA RTX 3090 Ti", "COMMUNITY"), # 24GB`
			`("NVIDIA L40S", "COMMUNITY"), # 48GB`
			`("NVIDIA A6000", "COMMUNITY"), # 48GB`
			`# Try secure cloud`
			`("NVIDIA RTX 4090", "SECURE"),`
			`("NVIDIA A40", "SECURE"),`
			`("NVIDIA L40S", "SECURE"),`
			`]`

			`log("\n=== Step 2: Attempting deployment ===")`
			`deployed = False`
			`for gpu_type, cloud_type in gpu_attempts:`
			`log(f"Trying {gpu_type} ({cloud_type})...")`
			`result = deploy_pod(gpu_type, pod_name, cloud_type)`

			`errors = result.get('errors', [])`
			`data = result.get('data', {}).get('podFindAndDeployOnDemand', {})`

			`if errors:`
			`for err in errors:`
			`msg = err.get('message', '')`
			`if 'no longer any instances' in msg or 'no instances' in msg:`
			`log(f" No instances available")`
			`elif 'invalid' in msg.lower() or 'not found' in msg.lower():`
			`log(f" GPU type not found: {msg[:100]}")`
			`else:`
			`log(f" Error: {msg[:100]}")`
			`elif data and data.get('id'):`
			`log(f" ✅ SUCCESS! Pod ID: {data.get('id')}")`
			`log(f" Machine ID: {data.get('machineId')}")`
			`log(f" Status: {data.get('desiredStatus')}")`
			`deployed = True`
			`break`
			`else:`
			`log(f" Response: {json.dumps(result)[:200]}")`

			`if deployed:`
			`pod_id = data.get('id')`

			`# Wait for pod to be running`
			`log(f"\n=== Step 3: Waiting for pod {pod_id} to start ===")`
			`pod_status_query = """`
			`query($podId: String!) {`
			`pod(id: $podId) {`
			`id`
			`desiredStatus`
			`runtimeStatus`
			`machineId`
			`ports`
			`}`
			`}`
			`"""`

			`for attempt in range(30): # Wait up to 15 minutes`
			`time.sleep(30)`
			`result = graphql_query(pod_status_query, {"podId": pod_id})`
			`pod = result.get('data', {}).get('pod', {})`
			`runtime = pod.get('runtimeStatus', 'unknown')`
			`desired = pod.get('desiredStatus', 'unknown')`
			`log(f" Attempt {attempt+1}: desired={desired}, runtime={runtime}")`

			`if runtime == 'RUNNING':`
			`log(f" ✅ Pod is RUNNING!")`

			`# Get the IP/port`
			`ip = f"{pod_id}-11434.proxy.runpod.net"`
			`log(f" Ollama endpoint: http://{ip}:11434")`
			`log(f" Ollama endpoint: http://{pod_id}.proxy.runpod.net:11434")`

			`# Comment on Gitea tickets`
			`comment_text = f"""# ✅ SUCCESS: GPU Instance Deployed`

			`## Pod Details`
			`- Pod ID: {pod_id}`
			`- GPU: {gpu_type} ({cloud_type} cloud)`
			`- Status: RUNNING`
			`- Endpoint: http://{pod_id}.proxy.runpod.net:11434`

			`## Next Steps`

			`1. SSH into pod:`
			```bash
			`ssh root@{pod_id}.proxy.runpod.net`
			```

			`2. Pull Gemma 3 27B:`
			```bash
			`ollama pull gemma3:27b-instruct-q4_K_M`
			```

			`3. Verify Ollama is working:`
			```bash
			`curl http://localhost:11434/api/tags`
			```

			`4. Test inference:`
			```bash
			`curl http://localhost:11434/api/chat \\`
			`-H "Content-Type: application/json" \\`
			`-d '{{"model": "gemma3:27b-instruct-q4_K_M", "messages": [{{"role": "user", "content": "Hello from Timmy"}}]}}'`
			```

			`5. Wire to Mac Hermes:`
			Add to `~/.hermes/config.yaml`:
			```yaml
			`providers:`
			`big_brain:`
			`base_url: "http://{pod_id}.proxy.runpod.net:11434/v1"`
			`api_key: ""`
			`model: "gemma3:27b-instruct-q4_K_M"`
			```

			`6. Test Hermes:`
			```bash
			`hermes chat --model gemma3:27b-instruct-q4_K_M --provider big_brain`
			```"""

			`comment_issue(543, comment_text)`
			`comment_issue(544, comment_text.replace("Timmy", "Bezalel").replace("Mac Hermes", "Bezalel Hermes"))`

			`log("\n🎉 Big Brain GPU deployed successfully!")`
			`log(f"Pod: {pod_id}")`
			`log(f"Endpoint: http://{pod_id}.proxy.runpod.net:11434")`
			`log(f"Gitea tickets updated with deployment details")`
			`break`
			`elif runtime == 'ERROR' or desired == 'TERMINATED' or desired == 'SUSPENDED':`
			`log(f" ❌ Pod failed: runtime={runtime}, desired={desired}")`
			`break`

			`if runtime != 'RUNNING':`
			`log(f"\n⚠️ Pod is not running after waiting. Check RunPod dashboard.")`
			`else:`
			`log("\n❌ No GPU instances available on RunPod")`
			`log("Try Vertex AI or check back later")`

			`# Comment on tickets`
			`comment_text = """# Deployment Status: RunPod Failed`

			`## Issue`
			`No GPU instances available on RunPod. All GPU types returned "no instances available" error.`

			`## Alternatives`
			`1. Vertex AI - Google Cloud's managed Gemma endpoints (see ticket for instructions)`
			`2. Lambda Labs - Another GPU cloud provider`
			`3. Vast.ai - Community GPU marketplace`
			`4. Wait for RunPod - Check back in a few hours"""`

			`comment_issue(543, comment_text)`
			`comment_issue(544, comment_text)`

			`"""`

			`Write the deployment script`
			`write_file('~/.timmy/big_brain_deploy.py', script_content)`

			`# Also run it (with timeout)`
			`print("Running deployment script... (will check Gitea tickets for results in parallel)")`