- Added comprehensive local model fine-tuning guide
- Created benchmarking script for inference performance (usage sketch below)
- Added training data collection script for merged PRs
- Documented current stack (Ollama + llama.cpp + Hermes 4)
- Provided quantization options and best practices
- Included troubleshooting and monitoring guidance

Addresses issue #486 recommendations:

✓ Documented local model stack for reproducibility
✓ Created benchmarking tools for inference latency
✓ Provided training data collection pipeline
✓ Documented quantization options for faster inference
✓ Included fine-tuning pipeline documentation
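For reference, the benchmarking script below can also be driven programmatically. A minimal sketch, assuming the script is saved as `benchmark_models.py` (illustrative name, not fixed by this PR) and Ollama is running locally; the model tags are examples only:

```python
# Minimal usage sketch; the module name and model tags are assumptions.
from benchmark_models import ModelBenchmark

benchmark = ModelBenchmark(endpoint="http://localhost:11434")
if benchmark.check_connection():
    # Use whatever `ollama list` reports on your machine.
    results = benchmark.compare_models(["llama3", "hermes3"], iterations=3)
    benchmark.print_comparison(results)
    benchmark.save_results("benchmark_results.json")
```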
#!/usr/bin/env python3
"""
Benchmark local model inference performance.

Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning
"""
import argparse
import json
import statistics
import sys
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from typing import List, Optional

import requests

@dataclass
class BenchmarkResult:
    """Results from a benchmark run."""
    model: str
    iterations: int
    total_time: float
    tokens_per_second: float
    time_to_first_token: float
    average_latency: float
    p95_latency: float
    errors: int
    timestamp: str

class ModelBenchmark:
    """Benchmark local model inference."""

    def __init__(self, endpoint: str = "http://localhost:11434"):
        self.endpoint = endpoint
        self.results: List[BenchmarkResult] = []

    def check_connection(self) -> bool:
        """Check if Ollama is running."""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def get_available_models(self) -> List[str]:
        """Get list of available models."""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            if response.status_code == 200:
                data = response.json()
                return [model["name"] for model in data.get("models", [])]
        except (requests.RequestException, ValueError):
            pass
        return []

    def benchmark_model(self, model: str, prompt: str = "Explain quantum computing in simple terms.",
                        iterations: int = 5, max_tokens: int = 100) -> BenchmarkResult:
        """Benchmark a single model."""
        print(f"Benchmarking {model} ({iterations} iterations)...")

        latencies = []
        ttfts = []  # Time to first token
        total_tokens = 0
        errors = 0

        for i in range(iterations):
            try:
                start_time = time.time()

                # Request a non-streaming completion so the full response arrives in one payload
                payload = {
                    "model": model,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "num_predict": max_tokens,
                        "temperature": 0.7
                    }
                }

                response = requests.post(
                    f"{self.endpoint}/api/generate",
                    json=payload,
                    timeout=60
                )

                end_time = time.time()
                latency = end_time - start_time

                if response.status_code == 200:
                    data = response.json()
                    response_text = data.get("response", "")
                    # Whitespace-separated word count as a rough proxy for token count
                    tokens = len(response_text.split())
                    total_tokens += tokens

                    # Non-streaming requests don't expose true time to first token,
                    # so use a fixed fraction of total latency as a rough estimate
                    ttft = latency * 0.1
                    ttfts.append(ttft)
                    latencies.append(latency)

                    print(f" Iteration {i+1}: {latency:.2f}s, {tokens} tokens")
                else:
                    errors += 1
                    print(f" Iteration {i+1}: Error {response.status_code}")

            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: Exception {e}")

        # Calculate statistics
        if latencies:
            avg_latency = statistics.mean(latencies)
            p95_latency = statistics.quantiles(latencies, n=20)[18] if len(latencies) >= 2 else avg_latency
            avg_ttft = statistics.mean(ttfts)
            total_time = sum(latencies)
            tokens_per_second = total_tokens / total_time if total_time > 0 else 0
        else:
            avg_latency = 0
            p95_latency = 0
            avg_ttft = 0
            total_time = 0
            tokens_per_second = 0

        result = BenchmarkResult(
            model=model,
            iterations=iterations,
            total_time=total_time,
            tokens_per_second=tokens_per_second,
            time_to_first_token=avg_ttft,
            average_latency=avg_latency,
            p95_latency=p95_latency,
            errors=errors,
            timestamp=datetime.now().isoformat()
        )

        self.results.append(result)
        return result

    def compare_models(self, models: List[str], prompt: Optional[str] = None, iterations: int = 5) -> List[BenchmarkResult]:
        """Compare multiple models."""
        if prompt is None:
            prompt = "Explain quantum computing in simple terms."

        print(f"Comparing {len(models)} models...")
        print("=" * 60)

        results = []
        for model in models:
            result = self.benchmark_model(model, prompt, iterations)
            results.append(result)
            print()

        return results

    def print_comparison(self, results: List[BenchmarkResult]):
        """Print comparison table."""
        print("\n" + "=" * 80)
        print("MODEL COMPARISON")
        print("=" * 80)
        print(f"{'Model':<20} {'Tokens/s':<10} {'Avg Latency':<12} {'P95 Latency':<12} {'TTFT':<10} {'Errors':<6}")
        print("-" * 80)

        for result in results:
            print(f"{result.model:<20} {result.tokens_per_second:<10.1f} {result.average_latency:<12.2f} "
                  f"{result.p95_latency:<12.2f} {result.time_to_first_token:<10.2f} {result.errors:<6}")

        print("=" * 80)

    def save_results(self, filename: str = "benchmark_results.json"):
        """Save results to file."""
        data = {
            "timestamp": datetime.now().isoformat(),
            "endpoint": self.endpoint,
            # asdict() serializes each BenchmarkResult field-for-field
            "results": [asdict(r) for r in self.results]
        }

        with open(filename, 'w') as f:
            json.dump(data, f, indent=2)

        print(f"Results saved to {filename}")

def main():
    parser = argparse.ArgumentParser(description="Benchmark local model inference")
    parser.add_argument("--endpoint", default="http://localhost:11434", help="Ollama endpoint")
    parser.add_argument("--models", nargs="+", help="Models to benchmark")
    parser.add_argument("--prompt", default="Explain quantum computing in simple terms.", help="Test prompt")
    parser.add_argument("--iterations", type=int, default=5, help="Iterations per model")
    parser.add_argument("--output", default="benchmark_results.json", help="Output file")

    args = parser.parse_args()

    benchmark = ModelBenchmark(args.endpoint)

    # Check connection
    if not benchmark.check_connection():
        print(f"Error: Cannot connect to Ollama at {args.endpoint}")
        print("Make sure Ollama is running: ollama serve")
        return 1

    # Get models to benchmark
    if args.models:
        models = args.models
    else:
        models = benchmark.get_available_models()
        if not models:
            print("No models available. Pull a model first: ollama pull llama3")
            return 1

    print(f"Available models: {models}")
    print()

    # Run benchmarks
    results = benchmark.compare_models(models, args.prompt, args.iterations)

    # Print comparison
    benchmark.print_comparison(results)

    # Save results
    benchmark.save_results(args.output)

    return 0

if __name__ == "__main__":
|
|
import sys
|
|
sys.exit(main())
|