Fix #486: Add local model fine-tuning documentation and tools
- Added comprehensive local model fine-tuning guide
- Created benchmarking script for inference performance
- Added training data collection script for merged PRs
- Documented current stack (Ollama + llama.cpp + Hermes 4)
- Provided quantization options and best practices
- Included troubleshooting and monitoring guidance

Addresses issue #486 recommendations:
✓ Documented local model stack for reproducibility
✓ Created benchmarking tools for inference latency
✓ Provided training data collection pipeline
✓ Documented quantization options for faster inference
✓ Included fine-tuning pipeline documentation
This commit is contained in:
166
scripts/local-models/collect_training_data.py
Executable file
166
scripts/local-models/collect_training_data.py
Executable file
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Collect training data from merged PRs for fine-tuning local models.
|
||||
Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning
|
||||
"""
|
||||
import json
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional
|
||||
from datetime import datetime
|
||||
|
||||
class TrainingDataCollector:
    """Collect fine-tuning training examples from merged Gitea pull requests.

    Each merged PR yields up to three prompt/completion pairs (PR
    description, code-review summary, commit-message generation),
    serialized as JSONL by :meth:`collect_training_data`.
    """

    def __init__(self, token: str, base_url: str = "https://forge.alexanderwhitestone.com"):
        """Store the API token and base URL and build the auth header.

        Args:
            token: Gitea personal access token.
            base_url: Root URL of the Gitea instance (no trailing slash).
        """
        self.token = token
        self.base_url = base_url
        self.headers = {"Authorization": f"token {token}"}

    def get_merged_prs(self, repo: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Return merged PRs for *repo* (``owner/name`` form).

        The Gitea API has no ``state=merged`` filter, so closed PRs are
        fetched and filtered on the ``merged_at`` timestamp. Best-effort:
        returns an empty list on any HTTP or network failure.
        """
        url = f"{self.base_url}/api/v1/repos/{repo}/pulls?state=closed&limit={limit}"

        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            if response.status_code == 200:
                # Closed PRs include both merged and rejected; keep merged only.
                return [pr for pr in response.json() if pr.get("merged_at")]
            print(f"Error fetching PRs: {response.status_code}")
            return []
        except (requests.RequestException, ValueError) as e:
            # Network failure, timeout, or a malformed JSON body.
            print(f"Exception fetching PRs: {e}")
            return []

    def get_pr_diff(self, repo: str, pr_number: int) -> Optional[str]:
        """Return the unified diff text for a PR, or None on failure."""
        url = f"{self.base_url}/api/v1/repos/{repo}/pulls/{pr_number}.diff"

        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            if response.status_code == 200:
                return response.text
            print(f"Error fetching diff for PR #{pr_number}: {response.status_code}")
            return None
        except requests.RequestException as e:
            print(f"Exception fetching diff for PR #{pr_number}: {e}")
            return None

    def extract_training_examples(self, pr: Dict[str, Any], diff: Optional[str]) -> List[Dict[str, str]]:
        """Build prompt/completion training examples from one PR.

        Args:
            pr: PR object as returned by the Gitea API.
            diff: Unified diff text, or None when it could not be fetched
                (the previous ``str`` annotation was wrong: callers pass
                the result of :meth:`get_pr_diff`, which may be None).

        Returns:
            Up to three examples: PR description, code-review summary,
            and commit-message generation. Examples whose source fields
            are missing are skipped.
        """
        examples = []

        # Shared metadata for every example extracted from this PR.
        # NOTE(review): the Gitea PR payload nests the repository under
        # "base"/"head"; a top-level "repo" key may be absent, which
        # leaves this as "" — confirm against the API response.
        repo_name = pr.get("repo", {}).get("full_name", "")

        # Example 1: generate a PR description from its title.
        if pr.get("title") and pr.get("body"):
            examples.append({
                "prompt": f"Write a pull request description for: {pr['title']}",
                "completion": pr["body"],
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "pr_description"
                }
            })

        # Example 2: summarize a code change from its diff.
        if diff:
            # Cap prompt size; long diffs blow past typical context windows.
            diff_truncated = diff[:2000] + "..." if len(diff) > 2000 else diff

            examples.append({
                "prompt": f"Review this code change:\n```\n{diff_truncated}\n```",
                "completion": f"This PR modifies code in {pr.get('changed_files', 0)} files with {pr.get('additions', 0)} additions and {pr.get('deletions', 0)} deletions.",
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "code_review"
                }
            })

        # Example 3: generate a conventional-commit message from the title.
        if pr.get("title"):
            examples.append({
                "prompt": f"Generate a commit message for changes: {pr['title']}",
                "completion": f"feat: {pr['title'].lower()}",
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "commit_message"
                }
            })

        return examples

    def collect_training_data(self, repo: str, output_file: str, limit: int = 50) -> int:
        """Fetch merged PRs, extract examples, and write them as JSONL.

        Args:
            repo: Repository in ``owner/name`` form.
            output_file: Path of the JSONL file to (over)write.
            limit: Maximum number of PRs to request.

        Returns:
            Number of training examples written.
        """
        print(f"Collecting training data from {repo}...")

        prs = self.get_merged_prs(repo, limit)
        print(f"Found {len(prs)} merged PRs")

        all_examples = []

        for i, pr in enumerate(prs):
            print(f"Processing PR #{pr['number']} ({i+1}/{len(prs)})...")

            diff = self.get_pr_diff(repo, pr["number"])

            examples = self.extract_training_examples(pr, diff)
            all_examples.extend(examples)

            print(f" Extracted {len(examples)} examples")

        # One JSON object per line (JSONL). Explicit UTF-8 and
        # ensure_ascii=False so non-ASCII PR text survives round-tripping.
        with open(output_file, 'w', encoding='utf-8') as f:
            for example in all_examples:
                f.write(json.dumps(example, ensure_ascii=False) + '\n')

        print(f"Saved {len(all_examples)} training examples to {output_file}")
        return len(all_examples)
|
||||
|
||||
def main():
    """CLI entry point: collect PR training data and write a JSONL file.

    Returns:
        Process exit code: 0 when at least one example was collected,
        1 on token-read failure or when nothing was collected.
    """
    parser = argparse.ArgumentParser(description="Collect training data from merged PRs")
    parser.add_argument("--repo", required=True, help="Repository (e.g., Timmy_Foundation/timmy-home)")
    # Derive the default from the invoking user's home directory instead of
    # the previous hard-coded /Users/apayne path, so the script is portable.
    parser.add_argument(
        "--token-file",
        default=str(Path.home() / ".config" / "gitea" / "token"),
        help="Token file",
    )
    parser.add_argument("--output", default="training_data.jsonl", help="Output file")
    parser.add_argument("--limit", type=int, default=50, help="Max PRs to process")

    args = parser.parse_args()

    # Read token; OSError covers missing file, permissions, etc.
    try:
        token = Path(args.token_file).read_text().strip()
    except OSError as e:
        print(f"Error reading token: {e}")
        return 1

    # Create collector
    collector = TrainingDataCollector(token)

    # Collect data
    count = collector.collect_training_data(args.repo, args.output, args.limit)

    if count > 0:
        print(f"\nSuccess! Collected {count} training examples.")
        print("Next steps:")
        print(f"1. Review the data: head {args.output}")
        print(f"2. Clean and format: python3 prepare_training_data.py --input {args.output}")
        print("3. Fine-tune a model: ./llama.cpp/main -m base.gguf -f formatted_data.jsonl")
        return 0
    else:
        print("No training examples collected.")
        return 1
|
||||
|
||||
if __name__ == "__main__":
    # SystemExit carries main()'s return code, equivalent to sys.exit()
    # without needing the local import.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user