feat: implement model_eval.py
This commit is contained in:
95
scripts/model_eval.py
Normal file
95
scripts/model_eval.py
Normal file
@@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
[EVAL] Model Evaluation Harness
|
||||
Part of the Gemini Sovereign Infrastructure Suite.
|
||||
|
||||
Benchmarks GGUF models served by llama-server for generation speed (tokens per second) before deployment; output quality is not scored.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import argparse
|
||||
import requests
|
||||
|
||||
# Fixed set of prompts used for benchmarking; kept as a list so the order
# of the entries is preserved.
BENCHMARK_PROMPTS = [
    "Write a Python script to sort a list of dictionaries by a key.",
    "Explain the concept of 'Sovereign AI' in three sentences.",
    "What is the capital of France?",
    "Write a short story about a robot learning to paint.",
]
|
||||
|
||||
class ModelEval:
    """Throughput benchmark for a llama.cpp ``llama-server`` HTTP endpoint.

    Each prompt in ``BENCHMARK_PROMPTS`` is posted to ``<endpoint>/completion``
    and the elapsed time and tokens-per-second figure are recorded. A summary
    is printed to stdout once every prompt has been tried.
    """

    def __init__(self, endpoint: str):
        """Store the server base URL.

        Args:
            endpoint: Base URL of the server, e.g. ``http://localhost:8080``.
        """
        # Drop trailing slashes so "<endpoint>/completion" never contains "//".
        self.endpoint = endpoint.rstrip("/")

    def log(self, message: str):
        """Print a progress message prefixed with ``[*]``."""
        print(f"[*] {message}")

    @staticmethod
    def _count_tokens(data: dict) -> float:
        """Return the number of generated tokens described by a response body.

        A numeric ``tokens_predicted`` (or ``timings.predicted_n``) field in
        the response is used when present, because it is the server's own
        count. Otherwise the count is estimated from the text length.

        Args:
            data: Decoded JSON body of a ``/completion`` response.

        Returns:
            The token count as a float.
        """
        reported = [data.get("tokens_predicted")]
        timings = data.get("timings")
        if isinstance(timings, dict):
            reported.append(timings.get("predicted_n"))

        for value in reported:
            # bool is a subclass of int; a stray True/False is not a count.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                continue
            if value >= 0:
                return float(value)

        # Rough estimate of tokens (4 chars per token is a common rule of thumb)
        return len(data.get("content", "")) / 4

    @staticmethod
    def _tokens_per_second(tokens: float, duration: float) -> float:
        """Return ``tokens / duration``, or 0.0 when the interval is not positive."""
        if duration <= 0:
            return 0.0
        return tokens / duration

    def run_benchmark(self):
        """Run every benchmark prompt against the server and print a report.

        A non-200 response or any exception raised while handling a prompt is
        recorded as a failed result for that prompt; the remaining prompts
        still run.
        """
        self.log(f"Starting benchmark for {self.endpoint}...")
        results = []

        for prompt in BENCHMARK_PROMPTS:
            self.log(f"Testing prompt: {prompt[:30]}...")

            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by wall-clock adjustments during the request.
            start_time = time.perf_counter()
            try:
                # llama.cpp server /completion endpoint
                response = requests.post(
                    f"{self.endpoint}/completion",
                    json={"prompt": prompt, "n_predict": 128},
                    timeout=60,
                )
                duration = time.perf_counter() - start_time

                if response.status_code == 200:
                    tokens = self._count_tokens(response.json())
                    results.append({
                        "prompt": prompt,
                        "duration": duration,
                        "tps": self._tokens_per_second(tokens, duration),
                        "success": True,
                    })
                else:
                    results.append({"prompt": prompt, "success": False, "error": response.text})
            except Exception as e:
                # Deliberately broad: one bad prompt (network error, invalid
                # JSON, unexpected payload shape) must not abort the run.
                results.append({"prompt": prompt, "success": False, "error": str(e)})

        self.report(results)

    def report(self, results: list):
        """Print one line per result followed by the average throughput.

        Args:
            results: Result dicts as built by ``run_benchmark``. Successful
                entries carry ``prompt``, ``tps`` and ``duration``; failed
                entries carry ``prompt`` and ``error``.
        """
        print("\n--- Evaluation Report ---")
        total_tps = 0
        success_count = 0

        for r in results:
            if r["success"]:
                print(f"✅ {r['prompt'][:40]}... | {r['tps']:.2f} tok/s | {r['duration']:.2f}s")
                total_tps += r["tps"]
                success_count += 1
            else:
                print(f"❌ {r['prompt'][:40]}... | FAILED: {r['error']}")

        if success_count > 0:
            avg_tps = total_tps / success_count
            print(f"\nAverage Performance: {avg_tps:.2f} tok/s")
        else:
            print("\n[FAILURE] All benchmarks failed.")
|
||||
|
||||
def main():
    """Parse the command line and benchmark the given llama-server endpoint."""
    cli = argparse.ArgumentParser(description="Gemini Model Eval")
    cli.add_argument(
        "endpoint",
        help="llama-server endpoint (e.g. http://localhost:8080)",
    )
    target = cli.parse_args().endpoint

    ModelEval(target).run_benchmark()


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user