diff --git a/scripts/model_eval.py b/scripts/model_eval.py
new file mode 100644
index 00000000..7d2664da
--- /dev/null
+++ b/scripts/model_eval.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""
+[EVAL] Model Evaluation Harness
+Part of the Gemini Sovereign Infrastructure Suite.
+
+Benchmarks GGUF models for speed and quality before deployment.
+"""
+
+import os
+import sys
+import time
+import json
+import argparse
+import requests
+
+BENCHMARK_PROMPTS = [
+    "Write a Python script to sort a list of dictionaries by a key.",
+    "Explain the concept of 'Sovereign AI' in three sentences.",
+    "What is the capital of France?",
+    "Write a short story about a robot learning to paint."
+]
+
+class ModelEval:
+    def __init__(self, endpoint: str):
+        self.endpoint = endpoint.rstrip("/")
+
+    def log(self, message: str):
+        print(f"[*] {message}")
+
+    def run_benchmark(self):
+        self.log(f"Starting benchmark for {self.endpoint}...")
+        results = []
+
+        for prompt in BENCHMARK_PROMPTS:
+            self.log(f"Testing prompt: {prompt[:30]}...")
+
+            start_time = time.time()
+            try:
+                # llama.cpp server /completion endpoint
+                response = requests.post(
+                    f"{self.endpoint}/completion",
+                    json={"prompt": prompt, "n_predict": 128},
+                    timeout=60
+                )
+                duration = time.time() - start_time
+
+                if response.status_code == 200:
+                    data = response.json()
+                    content = data.get("content", "")
+                    # Rough estimate of tokens (4 chars per token is a common rule of thumb)
+                    tokens = len(content) / 4
+                    tps = tokens / duration
+
+                    results.append({
+                        "prompt": prompt,
+                        "duration": duration,
+                        "tps": tps,
+                        "success": True
+                    })
+                else:
+                    results.append({"prompt": prompt, "success": False, "error": response.text})
+            except Exception as e:
+                results.append({"prompt": prompt, "success": False, "error": str(e)})
+
+        self.report(results)
+
+    def report(self, results: list):
+        print("\n--- Evaluation Report ---")
+        total_tps = 0
+        success_count = 0
+
+        for r in results:
+            if r["success"]:
+                print(f"✅ {r['prompt'][:40]}... | {r['tps']:.2f} tok/s | {r['duration']:.2f}s")
+                total_tps += r["tps"]
+                success_count += 1
+            else:
+                print(f"❌ {r['prompt'][:40]}... | FAILED: {r['error']}")
+
+        if success_count > 0:
+            avg_tps = total_tps / success_count
+            print(f"\nAverage Performance: {avg_tps:.2f} tok/s")
+        else:
+            print("\n[FAILURE] All benchmarks failed.")
+
+def main():
+    parser = argparse.ArgumentParser(description="Gemini Model Eval")
+    parser.add_argument("endpoint", help="llama-server endpoint (e.g. http://localhost:8080)")
+    args = parser.parse_args()
+
+    evaluator = ModelEval(args.endpoint)
+    evaluator.run_benchmark()
+
+if __name__ == "__main__":
+    main()
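
Usage note (a sketch, not part of the diff above): with a llama.cpp llama-server already serving the target GGUF model, the harness takes the server's base URL as its single positional argument, e.g. python3 scripts/model_eval.py http://localhost:8080 (the URL mirrors the example in the argparse help text). The same run can be driven programmatically; the import below assumes the script is executed from, or importable from, the scripts/ directory:

    # Sketch: programmatic invocation of the harness.
    # The endpoint URL is an assumption for illustration; point it at your own llama-server.
    from model_eval import ModelEval

    evaluator = ModelEval("http://localhost:8080")
    evaluator.run_benchmark()  # prints per-prompt tok/s (rough 4-chars-per-token estimate) and the average across successes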