#!/usr/bin/env python3
"""
Benchmark local model inference performance.

Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning
"""

import argparse
import json
import statistics
import sys
import time
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional

import requests


@dataclass
class BenchmarkResult:
    """Results from a benchmark run."""
    model: str
    iterations: int
    total_time: float
    tokens_per_second: float
    time_to_first_token: float
    average_latency: float
    p95_latency: float
    errors: int
    timestamp: str


class ModelBenchmark:
    """Benchmark local model inference."""

    def __init__(self, endpoint: str = "http://localhost:11434"):
        self.endpoint = endpoint
        self.results: List[BenchmarkResult] = []

    def check_connection(self) -> bool:
        """Check if Ollama is running."""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def get_available_models(self) -> List[str]:
        """Get list of available models."""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            if response.status_code == 200:
                data = response.json()
                return [model["name"] for model in data.get("models", [])]
        except requests.RequestException:
            pass
        return []

    def benchmark_model(self,
                        model: str,
                        prompt: str = "Explain quantum computing in simple terms.",
                        iterations: int = 5,
                        max_tokens: int = 100) -> BenchmarkResult:
        """Benchmark a single model."""
        print(f"Benchmarking {model} ({iterations} iterations)...")

        latencies = []
        ttfts = []  # Time to first token
        total_tokens = 0
        errors = 0

        for i in range(iterations):
            try:
                start_time = time.time()

                # Generate response
                payload = {
                    "model": model,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "num_predict": max_tokens,
                        "temperature": 0.7
                    }
                }

                response = requests.post(
                    f"{self.endpoint}/api/generate",
                    json=payload,
                    timeout=60
                )

                end_time = time.time()
                latency = end_time - start_time

                if response.status_code == 200:
                    data = response.json()
                    response_text = data.get("response", "")
                    # Approximate token count by whitespace-splitting the response text
                    tokens = len(response_text.split())
                    total_tokens += tokens

                    # Estimate time to first token (simplified; a streaming request
                    # would be needed to measure this precisely)
                    ttft = latency * 0.1  # Rough estimate
                    ttfts.append(ttft)

                    latencies.append(latency)
                    print(f"  Iteration {i+1}: {latency:.2f}s, {tokens} tokens")
                else:
                    errors += 1
                    print(f"  Iteration {i+1}: Error {response.status_code}")

            except Exception as e:
                errors += 1
                print(f"  Iteration {i+1}: Exception {e}")

        # Calculate statistics
        if latencies:
            avg_latency = statistics.mean(latencies)
            p95_latency = (statistics.quantiles(latencies, n=20)[18]
                           if len(latencies) >= 2 else avg_latency)
            avg_ttft = statistics.mean(ttfts)
            total_time = sum(latencies)
            tokens_per_second = total_tokens / total_time if total_time > 0 else 0
        else:
            avg_latency = 0
            p95_latency = 0
            avg_ttft = 0
            total_time = 0
            tokens_per_second = 0

        result = BenchmarkResult(
            model=model,
            iterations=iterations,
            total_time=total_time,
            tokens_per_second=tokens_per_second,
            time_to_first_token=avg_ttft,
            average_latency=avg_latency,
            p95_latency=p95_latency,
            errors=errors,
            timestamp=datetime.now().isoformat()
        )

        self.results.append(result)
        return result

    def compare_models(self,
                       models: List[str],
                       prompt: Optional[str] = None,
                       iterations: int = 5) -> List[BenchmarkResult]:
        """Compare multiple models."""
        if prompt is None:
            prompt = "Explain quantum computing in simple terms."
        print(f"Comparing {len(models)} models...")
        print("=" * 60)

        results = []
        for model in models:
            result = self.benchmark_model(model, prompt, iterations)
            results.append(result)
            print()

        return results

    def print_comparison(self, results: List[BenchmarkResult]):
        """Print comparison table."""
        print("\n" + "=" * 80)
        print("MODEL COMPARISON")
        print("=" * 80)
        print(f"{'Model':<20} {'Tokens/s':<10} {'Avg Latency':<12} {'P95 Latency':<12} {'TTFT':<10} {'Errors':<6}")
        print("-" * 80)

        for result in results:
            print(f"{result.model:<20} {result.tokens_per_second:<10.1f} {result.average_latency:<12.2f} "
                  f"{result.p95_latency:<12.2f} {result.time_to_first_token:<10.2f} {result.errors:<6}")

        print("=" * 80)

    def save_results(self, filename: str = "benchmark_results.json"):
        """Save results to file."""
        data = {
            "timestamp": datetime.now().isoformat(),
            "endpoint": self.endpoint,
            "results": [
                {
                    "model": r.model,
                    "iterations": r.iterations,
                    "total_time": r.total_time,
                    "tokens_per_second": r.tokens_per_second,
                    "time_to_first_token": r.time_to_first_token,
                    "average_latency": r.average_latency,
                    "p95_latency": r.p95_latency,
                    "errors": r.errors,
                    "timestamp": r.timestamp
                }
                for r in self.results
            ]
        }

        with open(filename, 'w') as f:
            json.dump(data, f, indent=2)

        print(f"Results saved to {filename}")


def main():
    parser = argparse.ArgumentParser(description="Benchmark local model inference")
    parser.add_argument("--endpoint", default="http://localhost:11434",
                        help="Ollama endpoint")
    parser.add_argument("--models", nargs="+", help="Models to benchmark")
    parser.add_argument("--prompt", default="Explain quantum computing in simple terms.",
                        help="Test prompt")
    parser.add_argument("--iterations", type=int, default=5,
                        help="Iterations per model")
    parser.add_argument("--output", default="benchmark_results.json",
                        help="Output file")

    args = parser.parse_args()

    benchmark = ModelBenchmark(args.endpoint)

    # Check connection
    if not benchmark.check_connection():
        print(f"Error: Cannot connect to Ollama at {args.endpoint}")
        print("Make sure Ollama is running: ollama serve")
        return 1

    # Get models to benchmark
    if args.models:
        models = args.models
    else:
        models = benchmark.get_available_models()
        if not models:
            print("No models available. Pull a model first: ollama pull llama3")
            return 1
        print(f"Available models: {models}")
        print()

    # Run benchmarks
    results = benchmark.compare_models(models, args.prompt, args.iterations)

    # Print comparison
    benchmark.print_comparison(results)

    # Save results
    benchmark.save_results(args.output)

    return 0


if __name__ == "__main__":
    sys.exit(main())
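
# Example invocations (a sketch only: the script filename below is a placeholder,
# and the model names are assumptions; use whatever `ollama list` reports locally):
#
#   python benchmark_models.py
#   python benchmark_models.py --models llama3 mistral --iterations 10
#   python benchmark_models.py --endpoint http://localhost:11434 --output results.json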