#!/usr/bin/env python3
"""
[EVAL] Model Evaluation Harness

Part of the Gemini Sovereign Infrastructure Suite.
Benchmarks GGUF models for speed and quality before deployment.
"""

import os
import sys
import time
import json
import argparse

import requests

# Fixed prompt set used for every benchmark run so results are comparable
# across models and endpoints.
BENCHMARK_PROMPTS = [
    "Write a Python script to sort a list of dictionaries by a key.",
    "Explain the concept of 'Sovereign AI' in three sentences.",
    "What is the capital of France?",
    "Write a short story about a robot learning to paint.",
]


class ModelEval:
    """Benchmarks a llama.cpp ``llama-server`` endpoint for generation speed.

    Sends each prompt in ``BENCHMARK_PROMPTS`` to the server's ``/completion``
    endpoint, measures wall time, estimates tokens/second, and prints a
    summary report.
    """

    def __init__(self, endpoint: str):
        # Normalize so f"{self.endpoint}/completion" never yields a "//".
        self.endpoint = endpoint.rstrip("/")

    def log(self, message: str) -> None:
        """Print a progress message with the script's ``[*]`` prefix."""
        print(f"[*] {message}")

    def run_benchmark(self) -> None:
        """Run every benchmark prompt against the endpoint and print a report.

        Each prompt produces a result dict with either
        ``{"prompt", "duration", "tps", "success": True}`` on success or
        ``{"prompt", "success": False, "error"}`` on HTTP/network failure.
        Network errors are captured per-prompt so one failure does not abort
        the whole run.
        """
        self.log(f"Starting benchmark for {self.endpoint}...")
        results = []

        for prompt in BENCHMARK_PROMPTS:
            self.log(f"Testing prompt: {prompt[:30]}...")
            # perf_counter is monotonic and high-resolution — unlike
            # time.time(), it cannot go backwards if the wall clock adjusts.
            start_time = time.perf_counter()
            try:
                # llama.cpp server /completion endpoint
                response = requests.post(
                    f"{self.endpoint}/completion",
                    json={"prompt": prompt, "n_predict": 128},
                    timeout=60,
                )
                duration = time.perf_counter() - start_time

                if response.status_code == 200:
                    data = response.json()
                    content = data.get("content", "")
                    # Rough estimate: ~4 chars per token is a common rule of thumb.
                    tokens = len(content) / 4
                    # Guard against a (theoretical) zero-length duration.
                    tps = tokens / duration if duration > 0 else 0.0
                    results.append({
                        "prompt": prompt,
                        "duration": duration,
                        "tps": tps,
                        "success": True,
                    })
                else:
                    results.append({
                        "prompt": prompt,
                        "success": False,
                        "error": response.text,
                    })
            except Exception as e:
                # Best-effort: record the failure and continue with the
                # remaining prompts rather than aborting the whole run.
                results.append({"prompt": prompt, "success": False, "error": str(e)})

        self.report(results)

    def report(self, results: list) -> None:
        """Print per-prompt lines and the average tokens/second.

        Args:
            results: List of result dicts as produced by ``run_benchmark``.
        """
        print("\n--- Evaluation Report ---")
        total_tps = 0
        success_count = 0
        for r in results:
            if r["success"]:
                print(f"✅ {r['prompt'][:40]}... | {r['tps']:.2f} tok/s | {r['duration']:.2f}s")
                total_tps += r["tps"]
                success_count += 1
            else:
                # NOTE: the original string literal was broken across two
                # physical lines (a syntax error); rejoined into one message.
                print(f"❌ {r['prompt'][:40]}... | FAILED: {r['error']}")

        if success_count > 0:
            avg_tps = total_tps / success_count
            print(f"\nAverage Performance: {avg_tps:.2f} tok/s")
        else:
            print("\n[FAILURE] All benchmarks failed.")


def main() -> None:
    """Parse the endpoint argument and run the benchmark suite."""
    parser = argparse.ArgumentParser(description="Gemini Model Eval")
    parser.add_argument(
        "endpoint",
        help="llama-server endpoint (e.g. http://localhost:8080)",
    )
    args = parser.parse_args()

    evaluator = ModelEval(args.endpoint)
    evaluator.run_benchmark()


if __name__ == "__main__":
    main()