#!/usr/bin/env python3
|
|
"""
|
|
[EVAL] Model Evaluation Harness
|
|
Part of the Gemini Sovereign Infrastructure Suite.
|
|
|
|
Benchmarks GGUF models for speed and quality before deployment.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
import argparse
|
|
import requests
|
|
|
|
# Fixed prompt set run on every benchmark: a mix of code generation,
# explanation, factual recall, and creative writing.
BENCHMARK_PROMPTS = [
    "Write a Python script to sort a list of dictionaries by a key.",
    "Explain the concept of 'Sovereign AI' in three sentences.",
    "What is the capital of France?",
    "Write a short story about a robot learning to paint."
]
|
|
|
|
class ModelEval:
    """Benchmark a llama.cpp-compatible server for speed across BENCHMARK_PROMPTS."""

    def __init__(self, endpoint: str):
        """Store the server base URL, normalizing away any trailing slash."""
        self.endpoint = endpoint.rstrip("/")

    def log(self, message: str):
        """Print a progress message with the harness's ``[*]`` prefix."""
        print(f"[*] {message}")

    def run_benchmark(self) -> list:
        """Run every benchmark prompt against the endpoint and report results.

        Returns:
            The list of per-prompt result dicts (each with "prompt", "success",
            and either "duration"/"tps" or "error"), so callers can also
            post-process results programmatically instead of only reading the
            printed report.
        """
        self.log(f"Starting benchmark for {self.endpoint}...")
        results = []

        for prompt in BENCHMARK_PROMPTS:
            self.log(f"Testing prompt: {prompt[:30]}...")

            start_time = time.time()
            try:
                # llama.cpp server /completion endpoint
                response = requests.post(
                    f"{self.endpoint}/completion",
                    json={"prompt": prompt, "n_predict": 128},
                    timeout=60
                )
                duration = time.time() - start_time

                if response.status_code == 200:
                    data = response.json()
                    content = data.get("content", "")
                    # Rough estimate of tokens (4 chars per token is a common rule of thumb)
                    tokens = len(content) / 4
                    # Guard against a zero-length measured interval (clock
                    # resolution / instant cached responses) — previously this
                    # could raise ZeroDivisionError.
                    tps = tokens / duration if duration > 0 else 0.0

                    results.append({
                        "prompt": prompt,
                        "duration": duration,
                        "tps": tps,
                        "success": True
                    })
                else:
                    results.append({"prompt": prompt, "success": False, "error": response.text})
            except (requests.RequestException, ValueError) as e:
                # RequestException covers connection/timeout failures;
                # ValueError covers a 200 response whose body is not valid JSON.
                results.append({"prompt": prompt, "success": False, "error": str(e)})

        self.report(results)
        return results

    def report(self, results: list):
        """Pretty-print per-prompt outcomes and the average tok/s over successes."""
        print("\n--- Evaluation Report ---")
        total_tps = 0
        success_count = 0

        for r in results:
            if r["success"]:
                print(f"✅ {r['prompt'][:40]}... | {r['tps']:.2f} tok/s | {r['duration']:.2f}s")
                total_tps += r["tps"]
                success_count += 1
            else:
                print(f"❌ {r['prompt'][:40]}... | FAILED: {r['error']}")

        if success_count > 0:
            avg_tps = total_tps / success_count
            print(f"\nAverage Performance: {avg_tps:.2f} tok/s")
        else:
            print("\n[FAILURE] All benchmarks failed.")
|
|
|
|
def main():
    """CLI entry point: parse the target endpoint and kick off a benchmark run."""
    cli = argparse.ArgumentParser(description="Gemini Model Eval")
    cli.add_argument("endpoint", help="llama-server endpoint (e.g. http://localhost:8080)")
    opts = cli.parse_args()

    ModelEval(opts.endpoint).run_benchmark()


if __name__ == "__main__":
    main()
|