import json
import os
import time
from typing import List, Dict

import requests

# ═══════════════════════════════════════════
# TURBOQUANT BENCHMARKING SUITE (Issue #16)
# ═══════════════════════════════════════════
# This script runs a standardized set of prompts against the local inference
# engine (Ollama) and logs the results. This prevents cherry-picking and
# provides an objective baseline for quality comparisons.

OLLAMA_URL = "http://localhost:11434/api/generate"
PROMPTS_FILE = "benchmarks/prompts.json"
# Evaluated once at import time, so every run in this process writes to the
# same timestamped file.
RESULTS_FILE = f"benchmarks/results_{int(time.time())}.json"


def _tokens_per_second(data: Dict) -> float:
    """Return generation throughput computed from one response payload.

    Reads ``eval_count`` and ``eval_duration`` from *data*. The duration is
    divided by 1e9 before use, i.e. it is treated as nanoseconds. Returns 0
    when the duration is missing or zero, and treats a missing or null
    ``eval_count`` as 0 instead of raising.
    """
    eval_duration = data.get("eval_duration")
    if not eval_duration:
        return 0
    eval_count = data.get("eval_count") or 0
    return eval_count / (eval_duration / 1e9)


def _run_prompt(model: str, item: Dict, timeout: float) -> Dict:
    """Send one prompt entry to OLLAMA_URL and return its result record.

    Never raises for a bad entry or a failed request: the problem is printed
    and returned as a record with ``"status": "failed"`` so that the caller
    can keep going and still save the other results.
    """
    # Validate up front: a malformed entry must not abort the whole run
    # (and must not raise again while the failure record is being built).
    if not isinstance(item, dict) or "id" not in item or "prompt" not in item:
        print(f"Error: malformed prompt entry: {item!r}")
        is_mapping = isinstance(item, dict)
        return {
            "id": item.get("id") if is_mapping else None,
            "prompt": item.get("prompt") if is_mapping else None,
            "error": "malformed prompt entry: expected an object with 'id' and 'prompt'",
            "status": "failed",
        }

    prompt_id = item["id"]
    prompt = item["prompt"]
    print(f"Running prompt: {prompt_id}...")

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments during a long generation.
    start_time = time.perf_counter()
    try:
        response = requests.post(
            OLLAMA_URL,
            json={"model": model, "prompt": prompt, "stream": False},
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()
        latency = time.perf_counter() - start_time
        return {
            "id": prompt_id,
            "prompt": prompt,
            "response": data.get("response"),
            "latency": latency,
            "tokens_per_second": _tokens_per_second(data),
            "status": "success",
        }
    except Exception as e:
        # Deliberate best-effort boundary: one failing prompt is recorded,
        # not propagated, so the remaining prompts still run.
        print(f"Error running prompt {prompt_id}: {e}")
        return {
            "id": prompt_id,
            "prompt": prompt,
            "error": str(e),
            # Recorded so timeouts are distinguishable from instant failures.
            "latency": time.perf_counter() - start_time,
            "status": "failed",
        }


def run_benchmark(model: str = "llama3", *, timeout: float = 60):
    """Run the benchmark suite for a specific model.

    Loads the prompt entries from PROMPTS_FILE, sends each one to OLLAMA_URL
    with streaming disabled, and writes all per-prompt records to
    RESULTS_FILE as JSON together with the model name and a timestamp.

    Args:
        model: Model name sent in the request payload.
        timeout: Per-request timeout in seconds passed to ``requests.post``.

    Returns:
        None. If PROMPTS_FILE is missing or does not contain a JSON list, an
        error is printed and nothing is written.
    """
    if not os.path.exists(PROMPTS_FILE):
        print(f"Error: {PROMPTS_FILE} not found.")
        return

    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(PROMPTS_FILE, 'r', encoding="utf-8") as f:
        prompts = json.load(f)

    if not isinstance(prompts, list):
        print(f"Error: {PROMPTS_FILE} must contain a JSON list of prompts.")
        return

    # Make sure the output location exists before spending time on the
    # prompts, so the final write cannot fail on a missing directory.
    results_dir = os.path.dirname(RESULTS_FILE)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    print(f"Starting benchmark for model: {model}")
    print(f"Saving results to: {RESULTS_FILE}")

    results: List[Dict] = [_run_prompt(model, item, timeout) for item in prompts]

    # Save results
    with open(RESULTS_FILE, 'w', encoding="utf-8") as f:
        json.dump({
            "model": model,
            "timestamp": time.time(),
            "results": results
        }, f, indent=2)

    print("Benchmark complete.")


if __name__ == "__main__":
    # Default to llama3 for testing
    run_benchmark("llama3")