feat: implement model_eval.py

2026-04-08 11:40:19 +00:00
parent 77f258efa5
commit 311ecf19db
1 changed files with 95 additions and 0 deletions
--- a/scripts/model_eval.py
+++ b/scripts/model_eval.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""
+[EVAL] Model Evaluation Harness
+Part of the Gemini Sovereign Infrastructure Suite.
+
+Benchmarks GGUF models for speed and quality before deployment.
+"""
+
+import os
+import sys
+import time
+import json
+import argparse
+import requests
+
+# Prompts sent to the server, one request per prompt, by
+# ModelEval.run_benchmark. The set mixes a coding task, a short explanation,
+# a one-line factual question and a creative-writing request.
+BENCHMARK_PROMPTS = [
+    "Write a Python script to sort a list of dictionaries by a key.",
+    "Explain the concept of 'Sovereign AI' in three sentences.",
+    "What is the capital of France?",
+    "Write a short story about a robot learning to paint."
+]
+
+class ModelEval:
+    """Benchmark a llama.cpp-style completion server over HTTP.
+
+    Each prompt is POSTed to ``<endpoint>/completion``, timed with a
+    wall clock, and summarised as tokens per second.
+    """
+
+    def __init__(self, endpoint: str, timeout: float = 60, n_predict: int = 128):
+        """Store the connection settings.
+
+        Args:
+            endpoint: Base URL of the server; trailing slashes are stripped.
+            timeout: Per-request timeout in seconds handed to ``requests``.
+            n_predict: Value sent as ``n_predict`` in every request body.
+        """
+        self.endpoint = endpoint.rstrip("/")
+        self.timeout = timeout
+        self.n_predict = n_predict
+
+    def log(self, message: str):
+        """Print a progress message prefixed with ``[*]``."""
+        print(f"[*] {message}")
+
+    def run_benchmark(self, prompts=None):
+        """Run every prompt once, print the report and return the results.
+
+        Args:
+            prompts: Iterable of prompt strings. Defaults to the module-level
+                ``BENCHMARK_PROMPTS`` when omitted.
+
+        Returns:
+            A list with one dict per prompt. Successful entries carry
+            ``prompt``, ``duration``, ``tps`` and ``success=True``; failed
+            entries carry ``prompt``, ``success=False`` and ``error``.
+        """
+        if prompts is None:
+            prompts = BENCHMARK_PROMPTS
+
+        self.log(f"Starting benchmark for {self.endpoint}...")
+        results = [self._run_prompt(prompt) for prompt in prompts]
+        self.report(results)
+        return results
+
+    def _run_prompt(self, prompt: str) -> dict:
+        """Send one prompt to the server and return its result record."""
+        self.log(f"Testing prompt: {prompt[:30]}...")
+
+        # perf_counter is monotonic and high resolution, so the measured
+        # duration cannot go negative or collapse to zero on a clock step.
+        start = time.perf_counter()
+        try:
+            # llama.cpp server /completion endpoint
+            response = requests.post(
+                f"{self.endpoint}/completion",
+                json={"prompt": prompt, "n_predict": self.n_predict},
+                timeout=self.timeout,
+            )
+            duration = time.perf_counter() - start
+
+            if response.status_code != 200:
+                return {"prompt": prompt, "success": False, "error": response.text}
+            data = response.json()
+        except (requests.RequestException, ValueError) as e:
+            # Transport problems and undecodable bodies are benchmark
+            # failures; anything else is a bug and should surface.
+            return {"prompt": prompt, "success": False, "error": str(e)}
+
+        if not isinstance(data, dict):
+            return {
+                "prompt": prompt,
+                "success": False,
+                "error": f"Unexpected response payload: {type(data).__name__}",
+            }
+
+        tokens = self._count_tokens(data)
+        # Guard the division instead of letting a zero duration be
+        # misreported as a failed request.
+        tps = tokens / duration if duration > 0 else 0.0
+        return {
+            "prompt": prompt,
+            "duration": duration,
+            "tps": tps,
+            "success": True,
+        }
+
+    @staticmethod
+    def _count_tokens(data: dict) -> float:
+        """Return the number of generated tokens for a response payload.
+
+        Uses the ``tokens_predicted`` field when the server supplies a
+        positive integer there; otherwise falls back to a rough estimate of
+        four characters per token based on ``content``.
+        """
+        predicted = data.get("tokens_predicted")
+        if (
+            isinstance(predicted, int)
+            and not isinstance(predicted, bool)
+            and predicted > 0
+        ):
+            return predicted
+
+        content = data.get("content") or ""
+        if not isinstance(content, str):
+            content = str(content)
+        # Rough estimate of tokens (4 chars per token is a common rule of thumb)
+        return len(content) / 4
+
+    def report(self, results: list):
+        """Print one line per result followed by the average throughput.
+
+        Args:
+            results: Result records as produced by ``run_benchmark``.
+        """
+        print("\n--- Evaluation Report ---")
+        successes = [r for r in results if r["success"]]
+
+        for r in results:
+            if r["success"]:
+                print(f"✅ {r['prompt'][:40]}... | {r['tps']:.2f} tok/s | {r['duration']:.2f}s")
+            else:
+                print(f"❌ {r['prompt'][:40]}... | FAILED: {r['error']}")
+
+        if successes:
+            avg_tps = sum(r["tps"] for r in successes) / len(successes)
+            print(f"\nAverage Performance: {avg_tps:.2f} tok/s")
+        else:
+            print("\n[FAILURE] All benchmarks failed.")
+
+def main():
+    """Read the server endpoint from the command line and benchmark it."""
+    cli = argparse.ArgumentParser(description="Gemini Model Eval")
+    cli.add_argument(
+        "endpoint",
+        help="llama-server endpoint (e.g. http://localhost:8080)",
+    )
+    endpoint = cli.parse_args().endpoint
+
+    ModelEval(endpoint).run_benchmark()
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()