> Compare commits — 1 commit (branch `fix/754`), commit `752453de65`.
> Files added: `docs/atlas-evaluation-runpod.md` (+112 lines), `scripts/atlas_benchmark.py` (+403 lines).

# Atlas Inference Engine — RunPod L40S Evaluation

## Status: PENDING

Atlas benchmarks are on DGX Spark (Blackwell SM120/121). Our hardware is
RunPod L40S (Ada Lovelace SM89). This evaluation tests compatibility.

## Hardware

| Spec | Value |
|------|-------|
| GPU | NVIDIA L40S |
| VRAM | 48 GB |
| Architecture | Ada Lovelace (SM89) |
| CUDA Compute | 8.9 |
| Provider | RunPod |

## Expected Issues

1. **CUDA compatibility**: Atlas uses custom CUDA kernels for Blackwell SM120/121.
   L40S is SM89 — kernels may not compile or may have PTX fallback.
2. **Quantization**: Atlas uses NVFP4. L40S supports FP8 natively but NVFP4
   may require Blackwell tensor cores.
3. **Performance**: Even if it works, L40S won't match Blackwell throughput.

## Test Procedure

### 1. Deploy on RunPod

```bash
# Start RunPod instance with:
# - Template: RunPod PyTorch 2.4
# - GPU: L40S
# - Volume: 100GB (model cache)

# SSH into pod
runpod ssh <pod-id>

# Pull and run Atlas
docker pull avarok/atlas-gb10:alpha-2.8
docker run -d --gpus all --ipc=host -p 8888:8888 \
  -v /root/.cache/huggingface:/root/.cache/huggingface \
  --name atlas \
  avarok/atlas-gb10:alpha-2.8 serve \
  Sehyo/Qwen3.5-35B-A3B-NVFP4 \
  --speculative --scheduling-policy slai \
  --max-seq-len 131072 --max-batch-size 1 \
  --max-prefill-tokens 0
```

### 2. Check Compatibility

```bash
# Watch for CUDA errors
docker logs -f atlas

# Expected success: "Model loaded" or similar
# Expected failure: "CUDA error" or "unsupported architecture"
```

### 3. Run Benchmark

```bash
python3 scripts/atlas_benchmark.py --base-url http://localhost:8888/v1
```

### 4. Compare with vLLM

```bash
# Start vLLM on another port
docker run -d --gpus all -p 8000:8000 \
  vllm/vllm-openai \
  --model Qwen/Qwen2.5-7B \
  --max-model-len 8192

# Run comparison
python3 scripts/atlas_benchmark.py \
  --base-url http://localhost:8888/v1 \
  --compare-vllm http://localhost:8000/v1
```

## Evaluation Checklist

- [ ] Atlas starts without CUDA errors on L40S
- [ ] Model loads successfully
- [ ] `/v1/models` returns model list
- [ ] Chat completions work
- [ ] Tool calls work (function calling)
- [ ] Cold start measured
- [ ] Throughput measured (tok/s)
- [ ] vLLM comparison completed
- [ ] Report saved to ~/.hermes/atlas-benchmark-report.json

## Results

(Fill in after evaluation)

| Metric | Atlas | vLLM | Notes |
|--------|-------|------|-------|
| Starts? | | | |
| CUDA compatible? | | | |
| Cold start | | | |
| tok/s (short) | | | |
| tok/s (code) | | | |
| tok/s (reasoning) | | | |
| tok/s (long) | | | |
| Tool calls work? | | | |
| Overall verdict | | | |

## Recommendation

(Pending evaluation results)

---

> File: `scripts/atlas_benchmark.py` (new file, +403 lines)
#!/usr/bin/env python3
|
||||
"""Atlas Inference Engine benchmark — RunPod L40S evaluation.
|
||||
|
||||
Tests Atlas on RunPod L40S (Ada Lovelace, SM89) and compares to vLLM.
|
||||
Atlas benchmarks are on DGX Spark (Blackwell SM120/121), so this validates
|
||||
whether it works on our hardware.
|
||||
|
||||
Usage:
|
||||
python3 scripts/atlas_benchmark.py --base-url http://localhost:8888/v1
|
||||
python3 scripts/atlas_benchmark.py --base-url http://localhost:8888/v1 --compare-vllm
|
||||
python3 scripts/atlas_benchmark.py --runpod-setup
|
||||
|
||||
Outputs JSON report to stdout and saves to ~/.hermes/atlas-benchmark-report.json
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Benchmark prompts
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Prompt suite run against each endpoint.
# Each spec: (scenario name, user prompt, completion-token budget).
_PROMPT_SPECS = (
    (
        "short_answer",
        "What is the capital of France?",
        50,
    ),
    (
        "code_generation",
        "Write a Python function that implements binary search on a sorted list.",
        200,
    ),
    (
        "reasoning",
        "If a train travels at 60 mph for 2.5 hours, then at 80 mph for 1.5 hours, what is the total distance traveled? Show your work step by step.",
        300,
    ),
    (
        "long_form",
        "Explain the difference between TCP and UDP protocols. Include use cases, advantages, disadvantages, and when to choose each one.",
        500,
    ),
    (
        "tool_use_simulation",
        "I need to find all Python files in the current directory that contain the word 'import'. What command would I use?",
        100,
    ),
)

BENCHMARK_PROMPTS = [
    {"name": name, "prompt": prompt, "max_tokens": budget}
    for name, prompt, budget in _PROMPT_SPECS
]
@dataclass
class BenchmarkResult:
    """Outcome of a single benchmark prompt against one endpoint."""

    name: str                      # scenario name from BENCHMARK_PROMPTS
    model: str                     # model id the server reported (or the one requested)
    provider: str                  # crude tag, e.g. "atlas" / "unknown"
    prompt_tokens: int             # from the API usage block (0 on failure)
    completion_tokens: int         # from the API usage block (0 on failure)
    total_time_ms: int             # wall time of the whole request
    time_to_first_token_ms: int    # equals total_time_ms for non-streaming calls
    tokens_per_second: float       # completion_tokens / elapsed, 1-decimal rounded
    success: bool                  # False when the request raised
    error: str = ""                # exception text when success is False
@dataclass
class BenchmarkReport:
    """Full benchmark run: endpoint metadata, per-prompt results, summary."""

    provider: str                    # e.g. "atlas"
    base_url: str                    # endpoint that was benchmarked
    model: str                       # model id used for all prompts
    gpu_info: str                    # nvidia-smi line or placeholder
    timestamp: str                   # ISO-8601 run time
    results: List[BenchmarkResult]   # one entry per benchmark prompt
    summary: Dict[str, Any]          # aggregate stats (tok/s, cold start, ...)

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict of the report.

        dataclasses.asdict recurses into nested dataclasses, so the
        BenchmarkResult entries in ``results`` are converted in the same
        pass — no second per-result conversion is needed (the previous
        implementation redundantly re-converted them).
        """
        return asdict(self)
# ---------------------------------------------------------------------------
# API calls
# ---------------------------------------------------------------------------

def call_openai_compat(
    base_url: str,
    model: str,
    messages: list,
    max_tokens: int = 200,
    api_key: str = "",
    timeout: int = 120,
) -> dict:
    """Call an OpenAI-compatible API endpoint."""
    import urllib.request

    # Non-streaming chat completion payload.
    payload = json.dumps(
        {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "stream": False,
        }
    ).encode()

    request_headers = {"Content-Type": "application/json"}
    if api_key:
        request_headers["Authorization"] = f"Bearer {api_key}"

    request = urllib.request.Request(
        f"{base_url.rstrip('/')}/chat/completions",
        data=payload,
        headers=request_headers,
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return json.loads(response.read())
def list_models(base_url: str, api_key: str = "") -> list:
    """List available models from the ``/models`` endpoint.

    Returns an empty list when the server is unreachable or returns
    malformed JSON, so callers can treat "no models" as "endpoint not
    available" (previously connection errors propagated as raw
    URLError tracebacks, defeating the caller's friendly failure path).
    """
    import urllib.request

    url = f"{base_url.rstrip('/')}/models"
    headers = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    req = urllib.request.Request(url, headers=headers, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
    except (OSError, ValueError):
        # OSError covers urllib.error.URLError / connection refused / timeout;
        # ValueError covers json.JSONDecodeError on a garbage response body.
        return []
    return data.get("data", [])
def measure_cold_start(base_url: str, model: str, api_key: str = "") -> dict:
    """Measure cold start time (time to first token on first request).

    Returns a dict with "cold_start_ms" and "success", plus "model" on
    success or "error" (the exception text) on failure.
    """
    probe = [{"role": "user", "content": "Hello. Reply with just 'Ready.'"}]

    started = time.monotonic()
    try:
        reply = call_openai_compat(base_url, model, probe, max_tokens=10, api_key=api_key)
    except Exception as exc:
        return {
            "cold_start_ms": int((time.monotonic() - started) * 1000),
            "success": False,
            "error": str(exc),
        }

    elapsed_ms = int((time.monotonic() - started) * 1000)
    return {
        "cold_start_ms": elapsed_ms,
        "success": True,
        "model": reply.get("model", model),
    }
def run_benchmark(
    base_url: str,
    model: str,
    prompt_config: dict,
    api_key: str = "",
) -> BenchmarkResult:
    """Run a single benchmark prompt and time it end to end.

    Args:
        base_url: API base URL (e.g. http://localhost:8888/v1).
        model: Model identifier to request.
        prompt_config: Entry from BENCHMARK_PROMPTS with "name", "prompt",
            and an optional "max_tokens" (default 200).
        api_key: Optional bearer token.

    Returns:
        BenchmarkResult with timing and token usage on success, or
        success=False plus the exception text on failure.
    """
    messages = [{"role": "user", "content": prompt_config["prompt"]}]
    max_tokens = prompt_config.get("max_tokens", 200)
    # Detect the provider tag once so success and failure results agree
    # (the failure path previously hard-coded "atlas", mislabeling errors
    # from the vLLM comparison endpoint).
    provider = "atlas" if "atlas" in base_url.lower() else "unknown"

    t0 = time.monotonic()
    try:
        result = call_openai_compat(
            base_url, model, messages,
            max_tokens=max_tokens, api_key=api_key,
        )
    except Exception as exc:
        return BenchmarkResult(
            name=prompt_config["name"],
            model=model,
            provider=provider,
            prompt_tokens=0,
            completion_tokens=0,
            total_time_ms=int((time.monotonic() - t0) * 1000),
            time_to_first_token_ms=0,
            tokens_per_second=0.0,
            success=False,
            error=str(exc),
        )

    elapsed = time.monotonic() - t0
    usage = result.get("usage", {})
    completion_tokens = usage.get("completion_tokens", 0)
    return BenchmarkResult(
        name=prompt_config["name"],
        model=result.get("model", model),
        provider=provider,
        prompt_tokens=usage.get("prompt_tokens", 0),
        completion_tokens=completion_tokens,
        total_time_ms=int(elapsed * 1000),
        # Non-streaming request: the first token arrives with the full body.
        time_to_first_token_ms=int(elapsed * 1000),
        tokens_per_second=round(completion_tokens / elapsed, 1) if elapsed > 0 else 0.0,
        success=True,
    )
def get_gpu_info() -> str:
    """Return GPU name/memory/driver from nvidia-smi, or a placeholder."""
    import subprocess

    query = [
        "nvidia-smi",
        "--query-gpu=name,memory.total,driver_version",
        "--format=csv,noheader",
    ]
    try:
        proc = subprocess.run(query, capture_output=True, text=True, timeout=5)
    except Exception:
        # nvidia-smi missing, not executable, or timed out.
        return "Unknown (nvidia-smi not available)"
    if proc.returncode != 0:
        return "Unknown (nvidia-smi not available)"
    return proc.stdout.strip()
# ---------------------------------------------------------------------------
# RunPod setup
# ---------------------------------------------------------------------------

# Printed verbatim by `--runpod-setup`; mirrors docs/atlas-evaluation-runpod.md.
RUNPOD_SETUP_COMMANDS = """# Atlas on RunPod L40S Setup

# 1. Start RunPod with L40S (48GB VRAM, Ada Lovelace SM89)
# Template: RunPod PyTorch 2.4
# GPU: L40S
# Volume: 50GB+ (for model cache)

# 2. Install Docker (if not present)
apt-get update && apt-get install -y docker.io

# 3. Pull Atlas image
docker pull avarok/atlas-gb10:alpha-2.8

# 4. Start Atlas with Qwen3.5-35B (smallest supported model)
docker run -d --gpus all --ipc=host -p 8888:8888 \\
    -v /root/.cache/huggingface:/root/.cache/huggingface \\
    --name atlas \\
    avarok/atlas-gb10:alpha-2.8 serve \\
    Sehyo/Qwen3.5-35B-A3B-NVFP4 \\
    --speculative --scheduling-policy slai \\
    --max-seq-len 131072 --max-batch-size 1 \\
    --max-prefill-tokens 0

# 5. Wait for model to load (watch logs)
docker logs -f atlas

# 6. Test endpoint
curl http://localhost:8888/v1/models

# 7. Run benchmark
python3 scripts/atlas_benchmark.py --base-url http://localhost:8888/v1

# 8. Compare with vLLM (if installed)
# Start vLLM:
# docker run -d --gpus all -p 8000:8000 vllm/vllm-openai \\
#   --model Qwen/Qwen2.5-7B --max-model-len 8192
# python3 scripts/atlas_benchmark.py --base-url http://localhost:8888/v1 --compare-vllm http://localhost:8000/v1

# NOTE: Atlas may NOT work on L40S (SM89). Benchmarks are on Blackwell (SM120/121).
# If you get CUDA errors, Atlas doesn't support your GPU architecture yet.
"""
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> int:
    """CLI entry point: probe the endpoint, run benchmarks, save a report.

    Returns a process exit code: 0 on success, 1 when the endpoint is
    unreachable or no model can be determined.
    """
    parser = argparse.ArgumentParser(description="Atlas Inference Engine benchmark")
    parser.add_argument("--base-url", default="http://localhost:8888/v1", help="Atlas API base URL")
    parser.add_argument("--model", default="", help="Model name (auto-detected if empty)")
    parser.add_argument("--api-key", default="", help="API key (if required)")
    parser.add_argument("--compare-vllm", default="", help="vLLM base URL for comparison")
    parser.add_argument("--runpod-setup", action="store_true", help="Print RunPod setup commands")
    parser.add_argument("--output", default="", help="Output file path")
    args = parser.parse_args()

    if args.runpod_setup:
        print(RUNPOD_SETUP_COMMANDS)
        return 0

    print("Atlas Benchmark")
    print("=" * 60)
    print(f"Base URL: {args.base_url}")
    print(f"GPU: {get_gpu_info()}")
    print()

    # Availability probe. list_models may raise (connection refused, DNS
    # failure, garbage response) — treat any failure as "not reachable" so
    # the user gets the friendly message below instead of a raw traceback.
    print("Checking Atlas availability...", end=" ", flush=True)
    try:
        models = list_models(args.base_url, args.api_key)
    except Exception:
        models = []
    if not models:
        print("FAILED")
        print("Atlas is not running or not reachable at", args.base_url)
        print("Run with --runpod-setup for deployment instructions.")
        return 1
    print(f"OK ({len(models)} models)")

    model = args.model or models[0].get("id", "")
    if not model:
        print("No model specified and none detected.")
        return 1
    print(f"Model: {model}")
    print()

    # Cold start: wall time of the very first request (includes any lazy
    # warm-up the server does).
    print("Measuring cold start...", end=" ", flush=True)
    cold = measure_cold_start(args.base_url, model, args.api_key)
    print(f"{cold['cold_start_ms']}ms {'OK' if cold['success'] else 'FAILED'}")
    if not cold["success"]:
        print(f"  Error: {cold.get('error', 'unknown')}")
    print()

    # Run the prompt suite against the Atlas endpoint.
    results = []
    for pc in BENCHMARK_PROMPTS:
        print(f"Benchmark: {pc['name']}...", end=" ", flush=True)
        result = run_benchmark(args.base_url, model, pc, args.api_key)
        results.append(result)
        if result.success:
            print(f"{result.tokens_per_second} tok/s ({result.total_time_ms}ms)")
        else:
            print(f"FAILED: {result.error}")

    # Aggregate throughput over successful runs only.
    successful = [r for r in results if r.success]
    total_tokens = sum(r.completion_tokens for r in successful)
    total_time = sum(r.total_time_ms for r in successful) / 1000
    avg_tps = round(total_tokens / total_time, 1) if total_time > 0 else 0

    print()
    print("Summary:")
    print(f"  Successful: {len(successful)}/{len(results)}")
    print(f"  Total tokens: {total_tokens}")
    print(f"  Average throughput: {avg_tps} tok/s")

    # Optional head-to-head against a vLLM endpoint. Initialize the
    # comparison values up front so the report build below never references
    # an unbound name (previously speedup existed only on one code path).
    vllm_results = []
    vllm_tps = 0.0
    speedup = None
    if args.compare_vllm:
        print()
        print(f"Comparing with vLLM at {args.compare_vllm}...")
        for pc in BENCHMARK_PROMPTS:
            print(f"  vLLM: {pc['name']}...", end=" ", flush=True)
            result = run_benchmark(args.compare_vllm, model, pc, args.api_key)
            vllm_results.append(result)
            if result.success:
                print(f"{result.tokens_per_second} tok/s")
            else:
                print("FAILED")

        vllm_success = [r for r in vllm_results if r.success]
        vllm_tokens = sum(r.completion_tokens for r in vllm_success)
        vllm_time = sum(r.total_time_ms for r in vllm_success) / 1000
        vllm_tps = round(vllm_tokens / vllm_time, 1) if vllm_time > 0 else 0

        if avg_tps > 0 and vllm_tps > 0:
            speedup = round(avg_tps / vllm_tps, 2)
            print(f"\n  Atlas: {avg_tps} tok/s | vLLM: {vllm_tps} tok/s | Speedup: {speedup}x")

    # Build the report.
    import datetime  # local import mirrors the original script's style

    report = BenchmarkReport(
        provider="atlas",
        base_url=args.base_url,
        model=model,
        gpu_info=get_gpu_info(),
        timestamp=datetime.datetime.now().isoformat(),
        results=results,
        summary={
            "successful_benchmarks": len(successful),
            "total_benchmarks": len(results),
            "total_completion_tokens": total_tokens,
            "average_tps": avg_tps,
            "cold_start_ms": cold.get("cold_start_ms", 0),
            "vllm_comparison": {
                "vllm_tps": vllm_tps,
                "speedup": speedup,
            } if vllm_results else None,
        },
    )

    # Save the report, creating ~/.hermes/ if needed.
    output_path = args.output or str(Path.home() / ".hermes" / "atlas-benchmark-report.json")
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(report.to_dict(), f, indent=2)
    print(f"\nReport saved to: {output_path}")

    # Also print JSON to stdout for piping.
    print("\n" + json.dumps(report.to_dict(), indent=2))

    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())