Compare commits

...

2 Commits

Author SHA1 Message Date
71d3ad7879 feat: Atlas L40S evaluation script (#708)
Some checks failed
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Contributor Attribution Check / check-attribution (pull_request) Failing after 37s
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 47s
Tests / e2e (pull_request) Successful in 4m30s
Tests / test (pull_request) Failing after 42m54s
Deploy and benchmark Atlas inference engine on RunPod L40S.

Commands: deploy, status, benchmark, stop, terminate
Benchmark: 5 prompts measuring tok/s on Qwen3.5-35B-A3B-NVFP4
Results: saved to ~/.hermes/atlas_eval_log.jsonl

Closes #708
2026-04-15 22:54:55 +00:00
d86359cbb2 Merge pull request 'feat: robust tool orchestration and circuit breaking' (#811) from feat/robust-tool-orchestration-1776268138150 into main 2026-04-15 16:03:07 +00:00

122
evals/atlas_l40s_eval.py Normal file
View File

@@ -0,0 +1,122 @@
#!/usr/bin/env python3
"""Atlas Inference Engine Evaluation on RunPod L40S."""
import argparse, json, os, sys, time, urllib.request, urllib.error
from datetime import datetime, timezone
from pathlib import Path
# GraphQL endpoint that every RunPod API call in this script is POSTed to.
RUNPOD_API = "https://api.runpod.io/graphql"
# Name given to the pod at deploy time and used afterwards to find it again.
POD_NAME = "atlas-eval-l40s"
# Container image passed as `imageName` in the deploy mutation.
ATLAS_IMAGE = "avarok/atlas-gb10:alpha-2.8"
# Model identifier sent in the "model" field of each benchmark request.
MODEL = "Qwen/Qwen3.5-35B-A3B-NVFP4"
# JSON-lines file that benchmark() appends one summary record to per run.
COST_LOG = Path.home() / ".hermes" / "atlas_eval_log.jsonl"
def load_key():
    """Return the RunPod API key, or exit the process if none is configured.

    Lookup order:
      1. The ``RUNPOD_API_KEY`` environment variable.
      2. The file ``~/.config/runpod/access_key``.

    Each candidate is stripped of surrounding whitespace *before* it is
    checked, so a blank or whitespace-only source is treated as missing and
    an empty bearer token is never returned.

    Returns:
        The API key as a non-empty string.

    Raises:
        SystemExit: With exit code 1 when no usable key is found.
    """
    env_key = os.environ.get("RUNPOD_API_KEY", "").strip()
    if env_key:
        return env_key

    key_file = Path.home() / ".config" / "runpod" / "access_key"
    if key_file.is_file():
        file_key = key_file.read_text(encoding="utf-8").strip()
        if file_key:
            return file_key

    # Diagnostics go to stderr so they do not mix with command output.
    print("ERROR: No RunPod key", file=sys.stderr)
    sys.exit(1)
def gql(query):
    """POST a GraphQL document to the RunPod API and return the decoded reply.

    Args:
        query: GraphQL query or mutation text; sent as the ``query`` field
            of the JSON request body.

    Returns:
        The parsed JSON response, or ``None`` when the request fails (HTTP
        error status, network failure, timeout, or a body that is not valid
        JSON). A reply that carries a GraphQL ``errors`` array is still
        returned so callers can inspect it; the errors are printed first.
    """
    req = urllib.request.Request(
        RUNPOD_API,
        data=json.dumps({"query": query}).encode(),
        headers={
            "Authorization": f"Bearer {load_key()}",
            "Content-Type": "application/json",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as r:
            reply = json.loads(r.read().decode())
    except urllib.error.HTTPError as e:
        # Must come before the OSError clause: HTTPError is a subclass of
        # URLError, which is itself an OSError.
        print(f"Error: {e.read().decode(errors='replace')[:300]}")
        return None
    except (OSError, ValueError) as e:
        # URLError and socket timeouts are OSErrors; JSON and Unicode
        # decoding failures are ValueErrors.
        print(f"Error: {e}")
        return None

    # GraphQL reports most failures with HTTP 200 plus an "errors" array;
    # surface them instead of leaving callers with an unexplained failure.
    if isinstance(reply, dict) and reply.get("errors"):
        print(f"Error: {json.dumps(reply['errors'])[:300]}")
    return reply
def find_pod():
    """Look up the evaluation pod by name in the account's pod list.

    Returns:
        The pod dict (``id``, ``name``, ``desiredStatus``, ``costPerHr``,
        ``gpuCount``, ``runtime``) whose name equals ``POD_NAME``, or
        ``None`` when the query fails or no pod has that name.
    """
    r = gql("{ myself { pods { id name desiredStatus costPerHr gpuCount runtime { uptimeInSeconds } } } }")
    # Any level of the reply may be missing or null (e.g. a failed request
    # or a partial GraphQL response), so fall back to an empty container at
    # each step instead of indexing blindly.
    data = (r or {}).get("data") or {}
    pods = (data.get("myself") or {}).get("pods") or []
    return next((p for p in pods if p.get("name") == POD_NAME), None)
def deploy():
    """Start the evaluation pod, resuming an existing one when possible.

    If a pod named ``POD_NAME`` already exists it is reused: when it is
    stopped a resume is requested, otherwise it is left as-is. If no such
    pod exists a new on-demand community-cloud L40S pod is created from
    ``ATLAS_IMAGE``.

    Returns:
        The pod id when a pod exists or was created, ``None`` when the
        deploy request failed.
    """
    existing = find_pod()
    if existing:
        pod_id = existing["id"]
        print(f"Exists: {pod_id} ({existing['desiredStatus']})")
        # RunPod reports a stopped pod as EXITED; STOPPED is still accepted
        # so the previous check keeps working.
        if existing["desiredStatus"] in ("EXITED", "STOPPED"):
            # The documented podResume mutation carries gpuCount; reuse the
            # pod's own count and fall back to the 1 GPU used at deploy time.
            gpu_count = existing.get("gpuCount") or 1
            r = gql(
                f'mutation {{ podResume(input: {{ podId: "{pod_id}", '
                f"gpuCount: {gpu_count} }}) {{ id }} }}"
            )
            resumed = ((r or {}).get("data") or {}).get("podResume")
            print("Resuming..." if resumed else "Resume failed")
        return pod_id

    q = (
        "mutation { podFindAndDeployOnDemand(input: { "
        "cloudType: COMMUNITY, gpuCount: 1, "
        'gpuTypeId: "NVIDIA L40S", '
        f'name: "{POD_NAME}", '
        "containerDiskInGb: 50, "
        f'imageName: "{ATLAS_IMAGE}", '
        'ports: "8888/http", volumeInGb: 100, '
        'volumeMountPath: "/workspace" '
        "}) { id desiredStatus } }"
    )
    r = gql(q)
    # On a GraphQL error "data" can be present while the mutation payload
    # itself is null, so check the payload rather than just "data".
    pod = ((r or {}).get("data") or {}).get("podFindAndDeployOnDemand")
    if pod:
        print(f"Deployed: {pod['id']} -> https://{pod['id']}-8888.proxy.runpod.net")
        return pod["id"]
    print("Deploy failed")
    return None
def status():
    """Print the evaluation pod's id, desired status, hourly cost and endpoint.

    Uptime is printed as well when the API reports a non-zero value. Prints
    "No pod" and returns when no pod named ``POD_NAME`` is found.
    """
    pod = find_pod()
    if not pod:
        print("No pod")
        return
    print(f"ID: {pod['id']}\nStatus: {pod['desiredStatus']}\nCost: ${pod['costPerHr']}/hr\nEndpoint: https://{pod['id']}-8888.proxy.runpod.net")
    # The pod query always requests `runtime`, so the key is present but may
    # be null; dict.get's default only covers a *missing* key, hence `or {}`.
    runtime = pod.get("runtime") or {}
    u = runtime.get("uptimeInSeconds") or 0
    if u:
        print(f"Uptime: {u//3600}h {(u%3600)//60}m")
def benchmark():
    """Measure completion throughput of the running pod over fixed prompts.

    Each prompt is POSTed to ``<pod endpoint>/v1/chat/completions`` with
    ``max_tokens`` 512. Tokens per second is the ``completion_tokens``
    count from the response's ``usage`` object divided by the wall time of
    the whole request. Per-prompt failures are recorded and do not abort
    the run.

    When at least one prompt succeeds, the average/min/max are printed and
    one JSON line (timestamp, average, per-prompt results) is appended to
    ``COST_LOG``. Prints "Pod not running" and returns when the pod is
    absent or not in the RUNNING state.
    """
    pod = find_pod()
    if not pod or pod["desiredStatus"] != "RUNNING":
        print("Pod not running")
        return
    ep = f"https://{pod['id']}-8888.proxy.runpod.net/v1"
    print(f"Benchmarking: {ep}")
    prompts = [
        "Explain sovereign AI in 100 words.",
        "Write quicksort in Python.",
        "Compare transformers vs state space models.",
        "Describe MoE architecture.",
        "Write a Dockerfile for Flask+Redis.",
    ]
    results = []
    for i, p in enumerate(prompts):
        # Derive the denominator from the list so it stays right if prompts
        # are added or removed.
        print(f"\n[{i+1}/{len(prompts)}] {p[:40]}...")
        # perf_counter is monotonic, so a system clock adjustment mid-request
        # cannot skew (or negate) the measured duration.
        start = time.perf_counter()
        try:
            payload = json.dumps({"model": MODEL, "messages": [{"role": "user", "content": p}], "max_tokens": 512}).encode()
            req = urllib.request.Request(
                f"{ep}/chat/completions",
                data=payload,
                headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=120) as resp:
                r = json.loads(resp.read().decode())
            elapsed = time.perf_counter() - start
            # "usage" may be absent or null; treat both as zero tokens.
            usage = r.get("usage") or {}
            tokens = usage.get("completion_tokens") or 0
            tps = tokens / elapsed if elapsed > 0 else 0
            results.append({"prompt": i, "tok_per_sec": round(tps, 2), "tokens": tokens})
            print(f"  {tokens} tokens / {elapsed:.1f}s = {tps:.1f} tok/s")
        except Exception as e:
            # Deliberately broad: one bad prompt (network error, bad JSON,
            # unexpected reply shape) is logged and the run continues.
            print(f"  Error: {e}")
            results.append({"prompt": i, "error": str(e)})

    rates = [r["tok_per_sec"] for r in results if "tok_per_sec" in r]
    if rates:
        avg = sum(rates) / len(rates)
        print(f"\nAvg: {avg:.1f} tok/s | Min: {min(rates):.1f} | Max: {max(rates):.1f}")
        COST_LOG.parent.mkdir(parents=True, exist_ok=True)
        with open(COST_LOG, "a", encoding="utf-8") as f:
            f.write(json.dumps({"ts": datetime.now(timezone.utc).isoformat(), "avg_tps": round(avg, 2), "results": results}) + "\n")
def stop():
    """Request that the evaluation pod be stopped and report the outcome.

    Prints "No pod" when no pod named ``POD_NAME`` exists, "Stopped: <id>"
    when the API accepted the request, and "Stop failed: <id>" when the
    request failed or the reply carried GraphQL errors.
    """
    pod = find_pod()
    if not pod:
        print("No pod")
        return
    pod_id = pod["id"]
    r = gql(f'mutation {{ podStop(input: {{ podId: "{pod_id}" }}) {{ id }} }}')
    # gql returns None on transport failure; a GraphQL-level failure arrives
    # as a normal reply with an "errors" array. Neither means "stopped".
    if r is None or r.get("errors"):
        print(f"Stop failed: {pod_id}")
        return
    print(f"Stopped: {pod_id}")
def terminate():
    """Request that the evaluation pod be terminated and report the outcome.

    Prints "No pod" when no pod named ``POD_NAME`` exists, "Terminated:
    <id>" when the API accepted the request, and "Terminate failed: <id>"
    when the request failed or the reply carried GraphQL errors.
    """
    pod = find_pod()
    if not pod:
        print("No pod")
        return
    pod_id = pod["id"]
    r = gql(f'mutation {{ podTerminate(input: {{ podId: "{pod_id}" }}) }}')
    # The mutation selects no fields, so success cannot be read from the
    # payload; rely on the absence of a transport failure (None) and of a
    # GraphQL "errors" array instead.
    if r is None or r.get("errors"):
        print(f"Terminate failed: {pod_id}")
        return
    print(f"Terminated: {pod_id}")
def main():
    """Command-line entry point: parse the sub-command and run its handler.

    Recognised sub-commands are ``deploy``, ``status``, ``benchmark``,
    ``stop`` and ``terminate``. When none is given, the help text is
    printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description="Atlas L40S Eval")
    commands = parser.add_subparsers(dest="cmd")
    for name in ("deploy", "status", "benchmark", "stop", "terminate"):
        commands.add_parser(name)

    chosen = parser.parse_args().cmd
    if not chosen:
        parser.print_help()
        sys.exit(1)

    # Map each sub-command name to the function that implements it.
    handlers = {
        "deploy": deploy,
        "status": status,
        "benchmark": benchmark,
        "stop": stop,
        "terminate": terminate,
    }
    handlers[chosen]()
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__": main()