# turboquant/benchmarks/run_benchmarks.py

import json
import time
import requests
import os
from typing import List, Dict

# ═══════════════════════════════════════════
# TURBOQUANT BENCHMARKING SUITE (Issue #16)
# ═══════════════════════════════════════════
# This script runs a standardized set of prompts against the local inference
# engine (Ollama) and logs the results. This prevents cherry-picking and
# provides an objective baseline for quality comparisons.

# Endpoint that run_benchmark() POSTs each prompt to.
OLLAMA_URL = "http://localhost:11434/api/generate"

# Input prompt list (JSON). The path is relative, so it is resolved against
# the current working directory.
PROMPTS_FILE = "benchmarks/prompts.json"

# Output path, stamped with the Unix time (whole seconds) at which this
# module was imported, so separate runs write to separate files.
RESULTS_FILE = "benchmarks/results_{}.json".format(int(time.time()))

def run_benchmark(model: str = "llama3", *, timeout: float = 60) -> None:
    """Run every prompt in PROMPTS_FILE against *model* and save the results.

    Each prompt is POSTed to OLLAMA_URL as a non-streaming request. One
    record is collected per prompt and the whole list is written to
    RESULTS_FILE as JSON, together with the model name and a timestamp.

    Successful records contain ``id``, ``prompt``, ``response``, ``latency``
    (seconds spent in the HTTP request), ``tokens_per_second`` and
    ``status == "success"``. Failed records contain ``id``, ``prompt``,
    ``error`` and ``status == "failed"``.

    Args:
        model: Model name sent in the request's "model" field.
        timeout: Timeout in seconds passed to ``requests.post`` for each
            prompt. Defaults to 60, the previously hard-coded value.

    Returns:
        None. If PROMPTS_FILE does not exist, an error is printed and
        nothing is run or written.
    """
    if not os.path.exists(PROMPTS_FILE):
        print(f"Error: {PROMPTS_FILE} not found.")
        return

    # Explicit encoding: the platform default is locale-dependent and can
    # fail on prompts containing non-ASCII text.
    with open(PROMPTS_FILE, 'r', encoding='utf-8') as f:
        prompts = json.load(f)

    results = []
    print(f"Starting benchmark for model: {model}")
    print(f"Saving results to: {RESULTS_FILE}")

    for item in prompts:
        prompt_id = item['id']
        prompt_text = item['prompt']
        print(f"Running prompt: {prompt_id}...")

        # perf_counter() is monotonic, so the measured latency cannot be
        # skewed (or go negative) if the system clock is adjusted mid-run.
        start = time.perf_counter()
        try:
            response = requests.post(OLLAMA_URL, json={
                "model": model,
                "prompt": prompt_text,
                "stream": False
            }, timeout=timeout)
            # Stop the clock before status checking / JSON parsing so the
            # latency reflects the request itself.
            latency = time.perf_counter() - start

            response.raise_for_status()
            data = response.json()

            # Treat missing *or* null counters as 0 so that an otherwise
            # successful response is not recorded as a failure.
            eval_count = data.get("eval_count") or 0
            eval_duration = data.get("eval_duration") or 0
            # eval_duration is divided by 1e9 to get seconds (the Ollama API
            # documents durations in nanoseconds).
            if eval_duration:
                tokens_per_second = eval_count / (eval_duration / 1e9)
            else:
                tokens_per_second = 0

            results.append({
                "id": prompt_id,
                "prompt": prompt_text,
                "response": data.get("response"),
                "latency": latency,
                "tokens_per_second": tokens_per_second,
                "status": "success"
            })
        except Exception as e:
            # Deliberately broad: one bad prompt (network error, HTTP error,
            # malformed body) must not abort the rest of the suite. The
            # failure is recorded in the results instead of being dropped.
            print(f"Error running prompt {prompt_id}: {e}")
            results.append({
                "id": prompt_id,
                "prompt": prompt_text,
                "error": str(e),
                "status": "failed"
            })

    # Save results. Create the target directory first so a missing folder
    # cannot discard a completed run.
    results_dir = os.path.dirname(RESULTS_FILE)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    with open(RESULTS_FILE, 'w', encoding='utf-8') as f:
        json.dump({
            "model": model,
            "timestamp": time.time(),
            "results": results
        }, f, indent=2)

    print("Benchmark complete.")

if __name__ == "__main__":
    # Script entry point: benchmark the llama3 model, the default used for testing.
    run_benchmark(model="llama3")