Fix #486: Add local model fine-tuning documentation and tools
- Added comprehensive local model fine-tuning guide
- Created benchmarking script for inference performance
- Added training data collection script for merged PRs
- Documented current stack (Ollama + llama.cpp + Hermes 4)
- Provided quantization options and best practices
- Included troubleshooting and monitoring guidance

Addresses issue #486 recommendations:
✓ Documented local model stack for reproducibility
✓ Created benchmarking tools for inference latency
✓ Provided training data collection pipeline
✓ Documented quantization options for faster inference
✓ Included fine-tuning pipeline documentation
This commit is contained in:
166
scripts/local-models/collect_training_data.py
Executable file
166
scripts/local-models/collect_training_data.py
Executable file
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Collect training data from merged PRs for fine-tuning local models.
|
||||
Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning
|
||||
"""
|
||||
import json
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional
|
||||
from datetime import datetime
|
||||
|
||||
class TrainingDataCollector:
    """Collect fine-tuning training examples from merged Gitea pull requests.

    Each merged PR yields up to three prompt/completion pairs (PR
    description, code-review summary, commit-message generation),
    serialized as JSONL by :meth:`collect_training_data`.
    """

    def __init__(self, token: str, base_url: str = "https://forge.alexanderwhitestone.com"):
        """Store the API token and base URL and build the auth header.

        Args:
            token: Gitea personal access token.
            base_url: Root URL of the Gitea instance (no trailing slash).
        """
        self.token = token
        self.base_url = base_url
        self.headers = {"Authorization": f"token {token}"}

    def get_merged_prs(self, repo: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Return merged PRs for *repo* (``owner/name`` form).

        The Gitea API has no ``state=merged`` filter, so closed PRs are
        fetched and filtered on the ``merged_at`` timestamp. Best-effort:
        returns an empty list on any HTTP or network failure.
        """
        url = f"{self.base_url}/api/v1/repos/{repo}/pulls?state=closed&limit={limit}"

        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            if response.status_code == 200:
                # Closed PRs include both merged and rejected; keep merged only.
                return [pr for pr in response.json() if pr.get("merged_at")]
            print(f"Error fetching PRs: {response.status_code}")
            return []
        except (requests.RequestException, ValueError) as e:
            # Network failure, timeout, or a malformed JSON body.
            print(f"Exception fetching PRs: {e}")
            return []

    def get_pr_diff(self, repo: str, pr_number: int) -> Optional[str]:
        """Return the unified diff text for a PR, or None on failure."""
        url = f"{self.base_url}/api/v1/repos/{repo}/pulls/{pr_number}.diff"

        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            if response.status_code == 200:
                return response.text
            print(f"Error fetching diff for PR #{pr_number}: {response.status_code}")
            return None
        except requests.RequestException as e:
            print(f"Exception fetching diff for PR #{pr_number}: {e}")
            return None

    def extract_training_examples(self, pr: Dict[str, Any], diff: Optional[str]) -> List[Dict[str, str]]:
        """Build prompt/completion training examples from one PR.

        Args:
            pr: PR object as returned by the Gitea API.
            diff: Unified diff text, or None when it could not be fetched
                (the previous ``str`` annotation was wrong: callers pass
                the result of :meth:`get_pr_diff`, which may be None).

        Returns:
            Up to three examples: PR description, code-review summary,
            and commit-message generation. Examples whose source fields
            are missing are skipped.
        """
        examples = []

        # Shared metadata for every example extracted from this PR.
        # NOTE(review): the Gitea PR payload nests the repository under
        # "base"/"head"; a top-level "repo" key may be absent, which
        # leaves this as "" — confirm against the API response.
        repo_name = pr.get("repo", {}).get("full_name", "")

        # Example 1: generate a PR description from its title.
        if pr.get("title") and pr.get("body"):
            examples.append({
                "prompt": f"Write a pull request description for: {pr['title']}",
                "completion": pr["body"],
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "pr_description"
                }
            })

        # Example 2: summarize a code change from its diff.
        if diff:
            # Cap prompt size; long diffs blow past typical context windows.
            diff_truncated = diff[:2000] + "..." if len(diff) > 2000 else diff

            examples.append({
                "prompt": f"Review this code change:\n```\n{diff_truncated}\n```",
                "completion": f"This PR modifies code in {pr.get('changed_files', 0)} files with {pr.get('additions', 0)} additions and {pr.get('deletions', 0)} deletions.",
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "code_review"
                }
            })

        # Example 3: generate a conventional-commit message from the title.
        if pr.get("title"):
            examples.append({
                "prompt": f"Generate a commit message for changes: {pr['title']}",
                "completion": f"feat: {pr['title'].lower()}",
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "commit_message"
                }
            })

        return examples

    def collect_training_data(self, repo: str, output_file: str, limit: int = 50) -> int:
        """Fetch merged PRs, extract examples, and write them as JSONL.

        Args:
            repo: Repository in ``owner/name`` form.
            output_file: Path of the JSONL file to (over)write.
            limit: Maximum number of PRs to request.

        Returns:
            Number of training examples written.
        """
        print(f"Collecting training data from {repo}...")

        prs = self.get_merged_prs(repo, limit)
        print(f"Found {len(prs)} merged PRs")

        all_examples = []

        for i, pr in enumerate(prs):
            print(f"Processing PR #{pr['number']} ({i+1}/{len(prs)})...")

            diff = self.get_pr_diff(repo, pr["number"])

            examples = self.extract_training_examples(pr, diff)
            all_examples.extend(examples)

            print(f" Extracted {len(examples)} examples")

        # One JSON object per line (JSONL). Explicit UTF-8 and
        # ensure_ascii=False so non-ASCII PR text survives round-tripping.
        with open(output_file, 'w', encoding='utf-8') as f:
            for example in all_examples:
                f.write(json.dumps(example, ensure_ascii=False) + '\n')

        print(f"Saved {len(all_examples)} training examples to {output_file}")
        return len(all_examples)
|
||||
|
||||
def main():
    """CLI entry point: collect PR training data and write a JSONL file.

    Returns:
        Process exit code: 0 when at least one example was collected,
        1 on token-read failure or when nothing was collected.
    """
    parser = argparse.ArgumentParser(description="Collect training data from merged PRs")
    parser.add_argument("--repo", required=True, help="Repository (e.g., Timmy_Foundation/timmy-home)")
    # Derive the default from the invoking user's home directory instead of
    # the previous hard-coded /Users/apayne path, so the script is portable.
    parser.add_argument(
        "--token-file",
        default=str(Path.home() / ".config" / "gitea" / "token"),
        help="Token file",
    )
    parser.add_argument("--output", default="training_data.jsonl", help="Output file")
    parser.add_argument("--limit", type=int, default=50, help="Max PRs to process")

    args = parser.parse_args()

    # Read token; OSError covers missing file, permissions, etc.
    try:
        token = Path(args.token_file).read_text().strip()
    except OSError as e:
        print(f"Error reading token: {e}")
        return 1

    # Create collector
    collector = TrainingDataCollector(token)

    # Collect data
    count = collector.collect_training_data(args.repo, args.output, args.limit)

    if count > 0:
        print(f"\nSuccess! Collected {count} training examples.")
        print("Next steps:")
        print(f"1. Review the data: head {args.output}")
        print(f"2. Clean and format: python3 prepare_training_data.py --input {args.output}")
        print("3. Fine-tune a model: ./llama.cpp/main -m base.gguf -f formatted_data.jsonl")
        return 0
    else:
        print("No training examples collected.")
        return 1
|
||||
|
||||
if __name__ == "__main__":
    # SystemExit carries main()'s return code, equivalent to sys.exit()
    # without needing the local import.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user