Fix #486: Add local model fine-tuning documentation and tools
- Added comprehensive local model fine-tuning guide
- Created benchmarking script for inference performance
- Added training data collection script for merged PRs
- Documented current stack (Ollama + llama.cpp + Hermes 4)
- Provided quantization options and best practices
- Included troubleshooting and monitoring guidance

Addresses issue #486 recommendations:
✓ Documented local model stack for reproducibility
✓ Created benchmarking tools for inference latency
✓ Provided training data collection pipeline
✓ Documented quantization options for faster inference
✓ Included fine-tuning pipeline documentation
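A minimal usage sketch of the new benchmarking module from Python, assuming Ollama is serving at the default endpoint and a model such as llama3 has already been pulled (the model name is illustrative):

from benchmark_inference import ModelBenchmark

benchmark = ModelBenchmark("http://localhost:11434")
if benchmark.check_connection():
    # Run a short comparison on one locally available model
    results = benchmark.compare_models(["llama3"], iterations=3)
    benchmark.print_comparison(results)
    benchmark.save_results("benchmark_results.json")

The script can also be run directly as a CLI; see the argparse options in main() below.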
236
scripts/local-models/benchmark_inference.py
Executable file
@@ -0,0 +1,236 @@
#!/usr/bin/env python3
"""
Benchmark local model inference performance.

Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning
"""
import argparse
import json
import statistics
import sys
import time
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional

import requests


@dataclass
class BenchmarkResult:
    """Results from a benchmark run."""
    model: str
    iterations: int
    total_time: float
    tokens_per_second: float
    time_to_first_token: float
    average_latency: float
    p95_latency: float
    errors: int
    timestamp: str


class ModelBenchmark:
    """Benchmark local model inference."""

    def __init__(self, endpoint: str = "http://localhost:11434"):
        self.endpoint = endpoint
        self.results: List[BenchmarkResult] = []

    def check_connection(self) -> bool:
        """Check if Ollama is running."""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def get_available_models(self) -> List[str]:
        """Get list of available models."""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            if response.status_code == 200:
                data = response.json()
                return [model["name"] for model in data.get("models", [])]
        except requests.RequestException:
            pass
        return []

    def benchmark_model(self, model: str, prompt: str = "Explain quantum computing in simple terms.",
                        iterations: int = 5, max_tokens: int = 100) -> BenchmarkResult:
        """Benchmark a single model."""
        print(f"Benchmarking {model} ({iterations} iterations)...")

        latencies = []
        ttfts = []  # Time to first token
        total_tokens = 0
        errors = 0

        for i in range(iterations):
            try:
                start_time = time.time()

                # Generate response
                payload = {
                    "model": model,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "num_predict": max_tokens,
                        "temperature": 0.7
                    }
                }

                response = requests.post(
                    f"{self.endpoint}/api/generate",
                    json=payload,
                    timeout=60
                )

                end_time = time.time()
                latency = end_time - start_time

                if response.status_code == 200:
                    data = response.json()
                    response_text = data.get("response", "")
                    # Rough token count: whitespace-split word count
                    tokens = len(response_text.split())
                    total_tokens += tokens

                    # Estimate time to first token (simplified)
                    ttft = latency * 0.1  # Rough estimate
                    ttfts.append(ttft)
                    latencies.append(latency)

                    print(f"  Iteration {i+1}: {latency:.2f}s, {tokens} tokens")
                else:
                    errors += 1
                    print(f"  Iteration {i+1}: Error {response.status_code}")

            except Exception as e:
                errors += 1
                print(f"  Iteration {i+1}: Exception {e}")

        # Calculate statistics
        if latencies:
            avg_latency = statistics.mean(latencies)
            p95_latency = statistics.quantiles(latencies, n=20)[18] if len(latencies) >= 2 else avg_latency
            avg_ttft = statistics.mean(ttfts)
            total_time = sum(latencies)
            tokens_per_second = total_tokens / total_time if total_time > 0 else 0
        else:
            avg_latency = 0
            p95_latency = 0
            avg_ttft = 0
            total_time = 0
            tokens_per_second = 0

        result = BenchmarkResult(
            model=model,
            iterations=iterations,
            total_time=total_time,
            tokens_per_second=tokens_per_second,
            time_to_first_token=avg_ttft,
            average_latency=avg_latency,
            p95_latency=p95_latency,
            errors=errors,
            timestamp=datetime.now().isoformat()
        )

        self.results.append(result)
        return result

    def compare_models(self, models: List[str], prompt: Optional[str] = None,
                       iterations: int = 5) -> List[BenchmarkResult]:
        """Compare multiple models."""
        if prompt is None:
            prompt = "Explain quantum computing in simple terms."

        print(f"Comparing {len(models)} models...")
        print("=" * 60)

        results = []
        for model in models:
            result = self.benchmark_model(model, prompt, iterations)
            results.append(result)
            print()

        return results

    def print_comparison(self, results: List[BenchmarkResult]):
        """Print comparison table."""
        print("\n" + "=" * 80)
        print("MODEL COMPARISON")
        print("=" * 80)
        print(f"{'Model':<20} {'Tokens/s':<10} {'Avg Latency':<12} {'P95 Latency':<12} {'TTFT':<10} {'Errors':<6}")
        print("-" * 80)

        for result in results:
            print(f"{result.model:<20} {result.tokens_per_second:<10.1f} {result.average_latency:<12.2f} "
                  f"{result.p95_latency:<12.2f} {result.time_to_first_token:<10.2f} {result.errors:<6}")

        print("=" * 80)

    def save_results(self, filename: str = "benchmark_results.json"):
        """Save results to file."""
        data = {
            "timestamp": datetime.now().isoformat(),
            "endpoint": self.endpoint,
            "results": [
                {
                    "model": r.model,
                    "iterations": r.iterations,
                    "total_time": r.total_time,
                    "tokens_per_second": r.tokens_per_second,
                    "time_to_first_token": r.time_to_first_token,
                    "average_latency": r.average_latency,
                    "p95_latency": r.p95_latency,
                    "errors": r.errors,
                    "timestamp": r.timestamp
                }
                for r in self.results
            ]
        }

        with open(filename, 'w') as f:
            json.dump(data, f, indent=2)

        print(f"Results saved to {filename}")


def main():
    parser = argparse.ArgumentParser(description="Benchmark local model inference")
    parser.add_argument("--endpoint", default="http://localhost:11434", help="Ollama endpoint")
    parser.add_argument("--models", nargs="+", help="Models to benchmark")
    parser.add_argument("--prompt", default="Explain quantum computing in simple terms.", help="Test prompt")
    parser.add_argument("--iterations", type=int, default=5, help="Iterations per model")
    parser.add_argument("--output", default="benchmark_results.json", help="Output file")

    args = parser.parse_args()

    benchmark = ModelBenchmark(args.endpoint)

    # Check connection
    if not benchmark.check_connection():
        print(f"Error: Cannot connect to Ollama at {args.endpoint}")
        print("Make sure Ollama is running: ollama serve")
        return 1

    # Get models to benchmark
    if args.models:
        models = args.models
    else:
        models = benchmark.get_available_models()
        if not models:
            print("No models available. Pull a model first: ollama pull llama3")
            return 1

    print(f"Available models: {models}")
    print()

    # Run benchmarks
    results = benchmark.compare_models(models, args.prompt, args.iterations)

    # Print comparison
    benchmark.print_comparison(results)

    # Save results
    benchmark.save_results(args.output)

    return 0


if __name__ == "__main__":
    sys.exit(main())