timmy-config/scripts/local-models/benchmark_inference.py
Alexander Whitestone 1350b9b177 Fix #486: Add local model fine-tuning documentation and tools
- Added comprehensive local model fine-tuning guide
- Created benchmarking script for inference performance
- Added training data collection script for merged PRs
- Documented current stack (Ollama + llama.cpp + Hermes 4)
- Provided quantization options and best practices
- Included troubleshooting and monitoring guidance

Addresses issue #486 recommendations:
✓ Documented local model stack for reproducibility
✓ Created benchmarking tools for inference latency
✓ Provided training data collection pipeline
✓ Documented quantization options for faster inference
✓ Included fine-tuning pipeline documentation
2026-04-13 21:43:12 -04:00

#!/usr/bin/env python3
"""
Benchmark local model inference performance.
Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning
"""
import argparse
import json
import statistics
import sys
import time
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional

import requests

@dataclass
class BenchmarkResult:
    """Results from a benchmark run."""
    model: str
    iterations: int
    total_time: float            # sum of per-request latencies, in seconds
    tokens_per_second: float     # approximate throughput over all iterations
    time_to_first_token: float   # rough estimate only (non-streaming requests)
    average_latency: float       # mean request latency, in seconds
    p95_latency: float           # 95th-percentile request latency, in seconds
    errors: int
    timestamp: str

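# Illustrative programmatic use of ModelBenchmark (the model name is a
# placeholder; assumes it has already been pulled into Ollama):
#   bench = ModelBenchmark()
#   result = bench.benchmark_model("llama3", iterations=3, max_tokens=64)
#   print(f"{result.tokens_per_second:.1f} tokens/s")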
class ModelBenchmark:
    """Benchmark local model inference against an Ollama server."""

    def __init__(self, endpoint: str = "http://localhost:11434"):
        self.endpoint = endpoint
        self.results: List[BenchmarkResult] = []

    def check_connection(self) -> bool:
        """Check if Ollama is running."""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def get_available_models(self) -> List[str]:
        """Get list of locally available models."""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            if response.status_code == 200:
                data = response.json()
                return [model["name"] for model in data.get("models", [])]
        except requests.RequestException:
            pass
        return []
    def benchmark_model(self, model: str, prompt: str = "Explain quantum computing in simple terms.",
                        iterations: int = 5, max_tokens: int = 100) -> BenchmarkResult:
        """Benchmark a single model."""
        print(f"Benchmarking {model} ({iterations} iterations)...")
        latencies = []
        ttfts = []  # Time to first token (rough estimates; see below)
        total_tokens = 0
        errors = 0
        for i in range(iterations):
            try:
                start_time = time.time()
                # Generate a non-streaming response
                payload = {
                    "model": model,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "num_predict": max_tokens,
                        "temperature": 0.7
                    }
                }
                response = requests.post(
                    f"{self.endpoint}/api/generate",
                    json=payload,
                    timeout=60
                )
                end_time = time.time()
                latency = end_time - start_time
                if response.status_code == 200:
                    data = response.json()
                    response_text = data.get("response", "")
                    # Prefer the server-reported completion token count when present;
                    # otherwise fall back to a rough whitespace-split word count.
                    tokens = data.get("eval_count") or len(response_text.split())
                    total_tokens += tokens
                    # Estimate time to first token: with stream=False the API gives
                    # no first-token timing, so use a crude 10% of total latency.
                    ttft = latency * 0.1
                    ttfts.append(ttft)
                    latencies.append(latency)
                    print(f" Iteration {i+1}: {latency:.2f}s, {tokens} tokens")
                else:
                    errors += 1
                    print(f" Iteration {i+1}: Error {response.status_code}")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: Exception {e}")
        # Calculate statistics
        if latencies:
            avg_latency = statistics.mean(latencies)
            p95_latency = (statistics.quantiles(latencies, n=20)[18]
                           if len(latencies) >= 2 else avg_latency)
            avg_ttft = statistics.mean(ttfts)
            total_time = sum(latencies)
            tokens_per_second = total_tokens / total_time if total_time > 0 else 0
        else:
            avg_latency = 0
            p95_latency = 0
            avg_ttft = 0
            total_time = 0
            tokens_per_second = 0
        result = BenchmarkResult(
            model=model,
            iterations=iterations,
            total_time=total_time,
            tokens_per_second=tokens_per_second,
            time_to_first_token=avg_ttft,
            average_latency=avg_latency,
            p95_latency=p95_latency,
            errors=errors,
            timestamp=datetime.now().isoformat()
        )
        self.results.append(result)
        return result
    def compare_models(self, models: List[str], prompt: Optional[str] = None,
                       iterations: int = 5) -> List[BenchmarkResult]:
        """Compare multiple models on the same prompt."""
        if prompt is None:
            prompt = "Explain quantum computing in simple terms."
        print(f"Comparing {len(models)} models...")
        print("=" * 60)
        results = []
        for model in models:
            result = self.benchmark_model(model, prompt, iterations)
            results.append(result)
            print()
        return results
    def print_comparison(self, results: List[BenchmarkResult]) -> None:
        """Print comparison table."""
        print("\n" + "=" * 80)
        print("MODEL COMPARISON")
        print("=" * 80)
        print(f"{'Model':<20} {'Tokens/s':<10} {'Avg Latency':<12} {'P95 Latency':<12} {'TTFT':<10} {'Errors':<6}")
        print("-" * 80)
        for result in results:
            print(f"{result.model:<20} {result.tokens_per_second:<10.1f} {result.average_latency:<12.2f} "
                  f"{result.p95_latency:<12.2f} {result.time_to_first_token:<10.2f} {result.errors:<6}")
        print("=" * 80)
    def save_results(self, filename: str = "benchmark_results.json") -> None:
        """Save results to file."""
        data = {
            "timestamp": datetime.now().isoformat(),
            "endpoint": self.endpoint,
            "results": [
                {
                    "model": r.model,
                    "iterations": r.iterations,
                    "total_time": r.total_time,
                    "tokens_per_second": r.tokens_per_second,
                    "time_to_first_token": r.time_to_first_token,
                    "average_latency": r.average_latency,
                    "p95_latency": r.p95_latency,
                    "errors": r.errors,
                    "timestamp": r.timestamp
                }
                for r in self.results
            ]
        }
        with open(filename, "w") as f:
            json.dump(data, f, indent=2)
        print(f"Results saved to {filename}")

def main() -> int:
    parser = argparse.ArgumentParser(description="Benchmark local model inference")
    parser.add_argument("--endpoint", default="http://localhost:11434", help="Ollama endpoint")
    parser.add_argument("--models", nargs="+", help="Models to benchmark")
    parser.add_argument("--prompt", default="Explain quantum computing in simple terms.", help="Test prompt")
    parser.add_argument("--iterations", type=int, default=5, help="Iterations per model")
    parser.add_argument("--output", default="benchmark_results.json", help="Output file")
    args = parser.parse_args()

    benchmark = ModelBenchmark(args.endpoint)

    # Check connection
    if not benchmark.check_connection():
        print(f"Error: Cannot connect to Ollama at {args.endpoint}")
        print("Make sure Ollama is running: ollama serve")
        return 1

    # Get models to benchmark
    if args.models:
        models = args.models
    else:
        models = benchmark.get_available_models()
        if not models:
            print("No models available. Pull a model first: ollama pull llama3")
            return 1
        print(f"Available models: {models}")
        print()

    # Run benchmarks
    results = benchmark.compare_models(models, args.prompt, args.iterations)

    # Print comparison
    benchmark.print_comparison(results)

    # Save results
    benchmark.save_results(args.output)
    return 0

if __name__ == "__main__":
import sys
sys.exit(main())