diff --git a/scripts/local-models/README.md b/scripts/local-models/README.md
new file mode 100644
index 00000000..95e60f7d
--- /dev/null
+++ b/scripts/local-models/README.md
@@ -0,0 +1,148 @@
+# Local Model Fine-Tuning Guide
+
+## Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning (Ollama + llama.cpp)
+
+## Overview
+
+This guide documents the local model fine-tuning stack for the Timmy Foundation. Local inference is our core differentiator, enabling sovereignty, privacy, and cost control.
+
+## Current Stack
+
+- **Inference Engine**: Ollama + llama.cpp
+- **Base Models**: Hermes 4, Llama 3, Mistral, Gemma
+- **Hardware**: M3 Max ("Maximum Maxitude")
+- **Quantization**: GGUF format for efficient CPU/GPU inference
+
+## Fine-Tuning Pipeline
+
+### 1. Data Preparation
+
+```bash
+# Collect training data from merged PRs
+python3 scripts/local-models/collect_training_data.py --repo Timmy_Foundation/timmy-home --output training_data.jsonl
+
+# Clean and format data
+python3 scripts/local-models/prepare_training_data.py --input training_data.jsonl --output formatted_data.jsonl
+```
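+
+`collect_training_data.py` (added below in this PR) writes one JSON object per line with `prompt`, `completion`, and `metadata` keys. Here is a minimal sketch of that record shape plus a quick sanity check to run before formatting; the `check_jsonl` helper and the example values are illustrative, not part of the pipeline:
+
+```python
+import json
+
+# One record in the shape emitted by collect_training_data.py (illustrative values)
+record = {
+    "prompt": "Write a pull request description for: Add local model benchmarks",
+    "completion": "This PR adds a benchmarking script for Ollama models...",
+    "metadata": {"pr_number": 123, "repo": "Timmy_Foundation/timmy-home", "type": "pr_description"},
+}
+print(json.dumps(record))
+
+# Sanity-check a JSONL file: every line must parse and carry both text fields
+def check_jsonl(path: str) -> int:
+    count = 0
+    with open(path) as f:
+        for line in f:
+            example = json.loads(line)
+            assert example["prompt"] and example["completion"]
+            count += 1
+    return count
+
+# Example: print(check_jsonl("training_data.jsonl"))
+```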
+
+### 2. Fine-Tuning with llama.cpp
+
+```bash
+# Convert the base model to GGUF (convert_hf_to_gguf.py ships with llama.cpp;
+# point it at a local copy of the Hugging Face model directory)
+python3 llama.cpp/convert_hf_to_gguf.py meta-llama/Llama-3-8B --outfile llama3-8b-base.gguf
+
+# Train a LoRA adapter on the custom data using llama.cpp's finetune example
+# (flags vary between llama.cpp versions; run ./finetune --help for your build)
+./llama.cpp/finetune --model-base llama3-8b-base.gguf --train-data formatted_data.jsonl --lora-out llama3-8b-timmy-lora.gguf
+```
+
+### 3. Quantization Options
+
+| Quantization | Size | Quality | Speed | Use Case |
+|--------------|------|---------|-------|----------|
+| Q4_K_M | 4.5GB | Good | Fast | Development, testing |
+| Q5_K_M | 5.5GB | Better | Medium | Production inference |
+| Q6_K | 6.5GB | Best | Slower | High-quality generation |
+| Q8_0 | 8GB | Excellent | Slowest | Research, fine-tuning |
+
+### 4. Ollama Integration
+
+```bash
+# Create custom model file (base model plus the LoRA adapter produced above)
+cat > Modelfile << EOF
+FROM ./llama3-8b-base.gguf
+ADAPTER ./llama3-8b-timmy-lora.gguf
+PARAMETER temperature 0.7
+PARAMETER top_p 0.9
+SYSTEM "You are Timmy, a sovereign AI assistant."
+EOF
+
+# Create model in Ollama
+ollama create timmy-custom -f Modelfile
+
+# Test the model
+ollama run timmy-custom "Hello, who are you?"
+```
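+
+The same smoke test can be run programmatically against Ollama's REST API. This mirrors the request shape that `benchmark_inference.py` (below) uses, and assumes the `timmy-custom` model created above:
+
+```python
+import requests
+
+# Minimal non-streaming generation request against a local Ollama server
+resp = requests.post(
+    "http://localhost:11434/api/generate",
+    json={
+        "model": "timmy-custom",  # created with `ollama create` above
+        "prompt": "Hello, who are you?",
+        "stream": False,
+        "options": {"temperature": 0.7, "top_p": 0.9, "num_predict": 100},
+    },
+    timeout=60,
+)
+resp.raise_for_status()
+data = resp.json()
+print(data["response"])  # generated text
+print(data.get("eval_count"), "tokens in", data.get("eval_duration", 0) / 1e9, "seconds")
+```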
+
+## Benchmarking
+
+### Inference Latency
+
+```bash
+# Benchmark different models (names are passed space-separated to --models)
+python3 scripts/local-models/benchmark_inference.py --models hermes4 llama3-8b mistral-7b --iterations 10
+
+# Example results on the M3 Max:
+# hermes4: 45 tokens/sec, 2.1s TTFT
+# llama3-8b: 52 tokens/sec, 1.8s TTFT
+# mistral-7b: 48 tokens/sec, 2.0s TTFT
+```
+
+### Quality Metrics
+
+- **Perplexity**: Lower is better (target < 20)
+- **Task Completion**: % of tasks completed correctly
+- **Coherence**: Human evaluation of response quality
+- **Safety**: Refusal rate for harmful prompts
+
+## Best Practices
+
+### Training Data
+
+1. **Diversity**: Include various task types (coding, writing, analysis)
+2. **Quality**: Curate high-quality examples
+3. **Size**: Minimum 1,000 examples, ideally 10,000+
+4. **Format**: Consistent JSONL format with prompt/completion pairs
+
+### Hyperparameters
+
+```yaml
+learning_rate: 2e-5
+batch_size: 4
+epochs: 3
+warmup_steps: 100
+lora_rank: 16
+lora_alpha: 32
+```
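+
+If fine-tuning is done with the optional `torch`/`transformers`/`peft` stack listed in `requirements.txt` instead of llama.cpp, these values map roughly onto the configuration below. This is a sketch under that assumption, not the pipeline we run today; `target_modules` in particular depends on the base model's layer names:
+
+```python
+from peft import LoraConfig
+from transformers import TrainingArguments
+
+# LoRA adapter settings (lora_rank / lora_alpha from the YAML above)
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    lora_dropout=0.05,                    # not in the YAML; a common default
+    target_modules=["q_proj", "v_proj"],  # assumption: Llama-style attention projections
+    task_type="CAUSAL_LM",
+)
+
+# Trainer settings (learning_rate / batch_size / epochs / warmup_steps from the YAML above)
+training_args = TrainingArguments(
+    output_dir="llama3-8b-timmy-lora",
+    learning_rate=2e-5,
+    per_device_train_batch_size=4,
+    num_train_epochs=3,
+    warmup_steps=100,
+    gradient_checkpointing=True,          # helps with the out-of-memory issue noted below
+)
+```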
+
+### Evaluation
+
+1. **Holdout Set**: 10% of data for validation (see the sketch after this list)
+2. **Task-Specific Tests**: Custom benchmarks for our use cases
+3. **Human Evaluation**: Periodic review of model outputs
+4. **A/B Testing**: Compare against base model
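+
+A minimal sketch of the holdout split from item 1 above: a seeded 90/10 split over the formatted JSONL that writes hypothetical `train.jsonl` and `val.jsonl` files next to the input.
+
+```python
+import json
+import random
+
+def split_holdout(path: str, val_fraction: float = 0.1, seed: int = 42) -> None:
+    """Split a JSONL dataset into train/validation files."""
+    with open(path) as f:
+        examples = [json.loads(line) for line in f]
+
+    random.Random(seed).shuffle(examples)
+    n_val = max(1, int(len(examples) * val_fraction))
+
+    with open("train.jsonl", "w") as train, open("val.jsonl", "w") as val:
+        for i, example in enumerate(examples):
+            target = val if i < n_val else train
+            target.write(json.dumps(example) + "\n")
+
+# Example: split_holdout("formatted_data.jsonl")
+```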
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Out of Memory**: Reduce batch size or use gradient checkpointing
+2. **Poor Quality**: Increase training data or adjust learning rate
+3. **Slow Inference**: Use a more aggressive (lower-bit) quantization or upgrade hardware
+4. **Model Drift**: Retrain periodically with new data
+
+### Monitoring

+```bash
+# Monitor GPU usage (NVIDIA hosts; on the M3 Max use Activity Monitor or `sudo powermetrics`)
+nvidia-smi -l 1
+
+# Monitor running models and memory usage
+ollama ps
+
+# List installed models
+ollama list
+```
+
+## Future Work
+
+- [ ] Implement automated fine-tuning pipeline
+- [ ] Explore LoRA/QLoRA for parameter-efficient fine-tuning
+- [ ] Benchmark against commercial APIs (GPT-4, Claude)
+- [ ] Create specialized models for different task types
+- [ ] Implement model merging techniques
+
+## Resources
+
+- [llama.cpp Documentation](https://github.com/ggerganov/llama.cpp)
+- [Ollama Documentation](https://github.com/ollama/ollama)
+- [Hugging Face Transformers](https://huggingface.co/docs/transformers)
+- [LoRA Paper](https://arxiv.org/abs/2106.09685)
diff --git a/scripts/local-models/benchmark_inference.py b/scripts/local-models/benchmark_inference.py
new file mode 100755
index 00000000..d5207935
--- /dev/null
+++ b/scripts/local-models/benchmark_inference.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+"""
+Benchmark local model inference performance.
+Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning
+"""
+import time
+import json
+import statistics
+import argparse
+import requests
+from typing import List, Dict, Any, Optional
+from dataclasses import dataclass
+from datetime import datetime
+
+@dataclass
+class BenchmarkResult:
+    """Results from a benchmark run."""
+    model: str
+    iterations: int
+    total_time: float
+    tokens_per_second: float
+    time_to_first_token: float
+    average_latency: float
+    p95_latency: float
+    errors: int
+    timestamp: str
+
+class ModelBenchmark:
+    """Benchmark local model inference."""
+
+    def __init__(self, endpoint: str = "http://localhost:11434"):
+        self.endpoint = endpoint
+        self.results: List[BenchmarkResult] = []
+
+    def check_connection(self) -> bool:
+        """Check if Ollama is running."""
+        try:
+            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
+            return response.status_code == 200
+        except requests.RequestException:
+            return False
+
+    def get_available_models(self) -> List[str]:
+        """Get list of available models."""
+        try:
+            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
+            if response.status_code == 200:
+                data = response.json()
+                return [model["name"] for model in data.get("models", [])]
+        except requests.RequestException:
+            pass
+        return []
+
+    def benchmark_model(self, model: str, prompt: str = "Explain quantum computing in simple terms.",
+                        iterations: int = 5, max_tokens: int = 100) -> BenchmarkResult:
+        """Benchmark a single model."""
+        print(f"Benchmarking {model} ({iterations} iterations)...")
+
+        latencies = []
+        ttfts = []  # Time to first token
+        total_tokens = 0
+        errors = 0
+
+        for i in range(iterations):
+            try:
+                start_time = time.time()
+
+                # Generate response
+                payload = {
+                    "model": model,
+                    "prompt": prompt,
+                    "stream": False,
+                    "options": {
+                        "num_predict": max_tokens,
+                        "temperature": 0.7
+                    }
+                }
+
+                response = requests.post(
+                    f"{self.endpoint}/api/generate",
+                    json=payload,
+                    timeout=60
+                )
+
+                end_time = time.time()
+                latency = end_time - start_time
+
+                if response.status_code == 200:
+                    data = response.json()
+                    response_text = data.get("response", "")
+                    # Prefer Ollama's reported token count; fall back to a whitespace split
+                    tokens = data.get("eval_count") or len(response_text.split())
+                    total_tokens += tokens
+
+                    # Approximate time to first token from model load + prompt processing when reported
+                    if data.get("prompt_eval_duration"):
+                        ttft = (data.get("load_duration", 0) + data["prompt_eval_duration"]) / 1e9
+                    else:
+                        ttft = latency * 0.1  # Rough fallback estimate
+                    ttfts.append(ttft)
+                    latencies.append(latency)
+
+                    print(f"  Iteration {i+1}: {latency:.2f}s, {tokens} tokens")
+                else:
+                    errors += 1
+                    print(f"  Iteration {i+1}: Error {response.status_code}")
+
+            except Exception as e:
+                errors += 1
+                print(f"  Iteration {i+1}: Exception {e}")
+
+        # Calculate statistics
+        if latencies:
+            avg_latency = statistics.mean(latencies)
+            p95_latency = statistics.quantiles(latencies, n=20)[18] if len(latencies) >= 2 else avg_latency
+            avg_ttft = statistics.mean(ttfts)
+            total_time = sum(latencies)
+            tokens_per_second = total_tokens / total_time if total_time > 0 else 0
+        else:
+            avg_latency = 0
+            p95_latency = 0
+            avg_ttft = 0
+            total_time = 0
+            tokens_per_second = 0
+
+        result = BenchmarkResult(
+            model=model,
+            iterations=iterations,
+            total_time=total_time,
+            tokens_per_second=tokens_per_second,
+            time_to_first_token=avg_ttft,
+            average_latency=avg_latency,
+            p95_latency=p95_latency,
+            errors=errors,
+            timestamp=datetime.now().isoformat()
+        )
+
+        self.results.append(result)
+        return result
+
+    def compare_models(self, models: List[str], prompt: Optional[str] = None, iterations: int = 5) -> List[BenchmarkResult]:
+        """Compare multiple models."""
+        if prompt is None:
+            prompt = "Explain quantum computing in simple terms."
+
+        print(f"Comparing {len(models)} models...")
+        print("=" * 60)
+
+        results = []
+        for model in models:
+            result = self.benchmark_model(model, prompt, iterations)
+            results.append(result)
+            print()
+
+        return results
+
+    def print_comparison(self, results: List[BenchmarkResult]):
+        """Print comparison table."""
+        print("\n" + "=" * 80)
+        print("MODEL COMPARISON")
+        print("=" * 80)
+        print(f"{'Model':<20} {'Tokens/s':<10} {'Avg Latency':<12} {'P95 Latency':<12} {'TTFT':<10} {'Errors':<6}")
+        print("-" * 80)
+
+        for result in results:
+            print(f"{result.model:<20} {result.tokens_per_second:<10.1f} {result.average_latency:<12.2f} "
+                  f"{result.p95_latency:<12.2f} {result.time_to_first_token:<10.2f} {result.errors:<6}")
+
+        print("=" * 80)
+
+    def save_results(self, filename: str = "benchmark_results.json"):
+        """Save results to file."""
+        data = {
+            "timestamp": datetime.now().isoformat(),
+            "endpoint": self.endpoint,
+            "results": [
+                {
+                    "model": r.model,
+                    "iterations": r.iterations,
+                    "total_time": r.total_time,
+                    "tokens_per_second": r.tokens_per_second,
+                    "time_to_first_token": r.time_to_first_token,
+                    "average_latency": r.average_latency,
+                    "p95_latency": r.p95_latency,
+                    "errors": r.errors,
+                    "timestamp": r.timestamp
+                }
+                for r in self.results
+            ]
+        }
+
+        with open(filename, 'w') as f:
+            json.dump(data, f, indent=2)
+
+        print(f"Results saved to {filename}")
+
+def main():
+    parser = argparse.ArgumentParser(description="Benchmark local model inference")
+    parser.add_argument("--endpoint", default="http://localhost:11434", help="Ollama endpoint")
+    parser.add_argument("--models", nargs="+", help="Models to benchmark")
+    parser.add_argument("--prompt", default="Explain quantum computing in simple terms.", help="Test prompt")
+    parser.add_argument("--iterations", type=int, default=5, help="Iterations per model")
+    parser.add_argument("--output", default="benchmark_results.json", help="Output file")
+
+    args = parser.parse_args()
+
+    benchmark = ModelBenchmark(args.endpoint)
+
+    # Check connection
+    if not benchmark.check_connection():
+        print(f"Error: Cannot connect to Ollama at {args.endpoint}")
+        print("Make sure Ollama is running: ollama serve")
+        return 1
+
+    # Get models to benchmark
+    if args.models:
+        models = args.models
+    else:
+        models = benchmark.get_available_models()
+        if not models:
+            print("No models available. Pull a model first: ollama pull llama3")
+            return 1
+
+    print(f"Available models: {models}")
+    print()
+
+    # Run benchmarks
+    results = benchmark.compare_models(models, args.prompt, args.iterations)
+
+    # Print comparison
+    benchmark.print_comparison(results)
+
+    # Save results
+    benchmark.save_results(args.output)
+
+    return 0
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(main())
diff --git a/scripts/local-models/collect_training_data.py b/scripts/local-models/collect_training_data.py
new file mode 100755
index 00000000..69efa90e
--- /dev/null
+++ b/scripts/local-models/collect_training_data.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""
+Collect training data from merged PRs for fine-tuning local models.
+Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning
+"""
+import json
+import subprocess
+import argparse
+import requests
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+class TrainingDataCollector:
+    """Collect training data from Gitea PRs."""
+
+    def __init__(self, token: str, base_url: str = "https://forge.alexanderwhitestone.com"):
+        self.token = token
+        self.base_url = base_url
+        self.headers = {"Authorization": f"token {token}"}
+
+    def get_merged_prs(self, repo: str, limit: int = 100) -> List[Dict[str, Any]]:
+        """Get merged PRs from a repository."""
+        url = f"{self.base_url}/api/v1/repos/{repo}/pulls?state=closed&limit={limit}"
+
+        try:
+            response = requests.get(url, headers=self.headers, timeout=30)
+            if response.status_code == 200:
+                prs = response.json()
+                # Filter for merged PRs
+                merged_prs = [pr for pr in prs if pr.get("merged_at")]
+                return merged_prs
+            else:
+                print(f"Error fetching PRs: {response.status_code}")
+                return []
+        except Exception as e:
+            print(f"Exception fetching PRs: {e}")
+            return []
+
+    def get_pr_diff(self, repo: str, pr_number: int) -> Optional[str]:
+        """Get diff for a PR."""
+        url = f"{self.base_url}/api/v1/repos/{repo}/pulls/{pr_number}.diff"
+
+        try:
+            response = requests.get(url, headers=self.headers, timeout=30)
+            if response.status_code == 200:
+                return response.text
+            else:
+                print(f"Error fetching diff for PR #{pr_number}: {response.status_code}")
+                return None
+        except Exception as e:
+            print(f"Exception fetching diff for PR #{pr_number}: {e}")
+            return None
+
+    def extract_training_examples(self, pr: Dict[str, Any], diff: Optional[str]) -> List[Dict[str, Any]]:
+        """Extract training examples from a PR."""
+        examples = []
+        # Gitea exposes the repository under base.repo on PR objects
+        repo_full_name = (pr.get("base", {}).get("repo") or {}).get("full_name", "")
+
+        # Example 1: PR title and description
+        if pr.get("title") and pr.get("body"):
+            examples.append({
+                "prompt": f"Write a pull request description for: {pr['title']}",
+                "completion": pr["body"],
+                "metadata": {
+                    "pr_number": pr["number"],
+                    "repo": repo_full_name,
+                    "type": "pr_description"
+                }
+            })
+
+        # Example 2: Code review based on diff
+        if diff:
+            # Truncate diff if too long
+            diff_truncated = diff[:2000] + "..." if len(diff) > 2000 else diff
+
+            examples.append({
+                "prompt": f"Review this code change:\n```\n{diff_truncated}\n```",
+                "completion": f"This PR modifies code in {pr.get('changed_files', 0)} files with {pr.get('additions', 0)} additions and {pr.get('deletions', 0)} deletions.",
+                "metadata": {
+                    "pr_number": pr["number"],
+                    "repo": repo_full_name,
+                    "type": "code_review"
+                }
+            })
+
+        # Example 3: Commit message generation
+        if pr.get("title"):
+            examples.append({
+                "prompt": f"Generate a commit message for changes: {pr['title']}",
+                "completion": f"feat: {pr['title'].lower()}",
+                "metadata": {
+                    "pr_number": pr["number"],
+                    "repo": repo_full_name,
+                    "type": "commit_message"
+                }
+            })
+
+        return examples
+
+    def collect_training_data(self, repo: str, output_file: str, limit: int = 50) -> int:
+        """Collect training data from merged PRs."""
+        print(f"Collecting training data from {repo}...")
+
+        # Get merged PRs
+        prs = self.get_merged_prs(repo, limit)
+        print(f"Found {len(prs)} merged PRs")
+
+        all_examples = []
+
+        for i, pr in enumerate(prs):
+            print(f"Processing PR #{pr['number']} ({i+1}/{len(prs)})...")
+
+            # Get diff
+            diff = self.get_pr_diff(repo, pr["number"])
+
+            # Extract examples
+            examples = self.extract_training_examples(pr, diff)
+            all_examples.extend(examples)
+
+            print(f"  Extracted {len(examples)} examples")
+
+        # Save to JSONL
+        with open(output_file, 'w') as f:
+            for example in all_examples:
+                f.write(json.dumps(example) + '\n')
+
+        print(f"Saved {len(all_examples)} training examples to {output_file}")
+        return len(all_examples)
+
+def main():
+    parser = argparse.ArgumentParser(description="Collect training data from merged PRs")
+    parser.add_argument("--repo", required=True, help="Repository (e.g., Timmy_Foundation/timmy-home)")
+    parser.add_argument("--token-file", default="/Users/apayne/.config/gitea/token", help="Token file")
+    parser.add_argument("--output", default="training_data.jsonl", help="Output file")
+    parser.add_argument("--limit", type=int, default=50, help="Max PRs to process")
+
+    args = parser.parse_args()
+
+    # Read token
+    try:
+        with open(args.token_file) as f:
+            token = f.read().strip()
+    except Exception as e:
+        print(f"Error reading token: {e}")
+        return 1
+
+    # Create collector
+    collector = TrainingDataCollector(token)
+
+    # Collect data
+    count = collector.collect_training_data(args.repo, args.output, args.limit)
+
+    if count > 0:
+        print(f"\nSuccess! Collected {count} training examples.")
+        print("Next steps:")
+        print(f"1. Review the data: head {args.output}")
+        print(f"2. Clean and format: python3 prepare_training_data.py --input {args.output}")
+        print("3. Fine-tune a model: follow the llama.cpp steps in scripts/local-models/README.md")
+        return 0
+    else:
+        print("No training examples collected.")
+        return 1
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(main())
diff --git a/scripts/local-models/requirements.txt b/scripts/local-models/requirements.txt
new file mode 100644
index 00000000..f9007c38
--- /dev/null
+++ b/scripts/local-models/requirements.txt
@@ -0,0 +1,25 @@
+# Local Model Fine-Tuning Dependencies
+
+# Core dependencies
+requests>=2.31.0
+numpy>=1.24.0
+pandas>=2.0.0
+
+# For benchmarking
+matplotlib>=3.7.0
+seaborn>=0.12.0
+
+# For data processing
+jsonlines>=3.1.0
+tqdm>=4.65.0
+
+# Optional: for advanced fine-tuning
+# torch>=2.0.0
+# transformers>=4.30.0
+# peft>=0.4.0
+# datasets>=2.14.0
+
+# Development tools
+pytest>=7.4.0
+black>=23.0.0
+flake8>=6.0.0