- Added comprehensive local model fine-tuning guide
- Created benchmarking script for inference performance
- Added training data collection script for merged PRs
- Documented current stack (Ollama + llama.cpp + Hermes 4)
- Provided quantization options and best practices
- Included troubleshooting and monitoring guidance

Addresses issue #486 recommendations:
✓ Documented local model stack for reproducibility
✓ Created benchmarking tools for inference latency
✓ Provided training data collection pipeline
✓ Documented quantization options for faster inference
✓ Included fine-tuning pipeline documentation
167 lines
6.2 KiB
Python
Executable File
167 lines
6.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Collect training data from merged PRs for fine-tuning local models.
|
|
Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning
|
|
"""
|
|
import json
|
|
import subprocess
|
|
import argparse
|
|
import requests
|
|
from pathlib import Path
|
|
from typing import List, Dict, Any, Optional
|
|
from datetime import datetime
|
|
|
|
class TrainingDataCollector:
    """Collect fine-tuning examples from merged pull requests on a Gitea server.

    Every request is authenticated with an ``Authorization: token <token>``
    header and sent to the REST API rooted at ``<base_url>/api/v1``.

    Attributes:
        token: API token used for authentication.
        base_url: Root URL of the Gitea instance (without ``/api/v1``).
        headers: HTTP headers sent with every request.
    """

    # Diffs longer than this many characters are cut and suffixed with "...".
    MAX_DIFF_CHARS = 2000

    # Per-request timeout, in seconds, for all HTTP calls.
    REQUEST_TIMEOUT = 30

    def __init__(self, token: str, base_url: str = "https://forge.alexanderwhitestone.com"):
        """Store the credentials and pre-build the authentication header.

        Args:
            token: Gitea API token.
            base_url: Root URL of the Gitea instance.
        """
        self.token = token
        self.base_url = base_url
        self.headers = {"Authorization": f"token {token}"}

    def get_merged_prs(self, repo: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Return the merged PRs found in the first page of closed PRs.

        A single page of at most ``limit`` closed PRs is requested and the
        entries without a ``merged_at`` value are dropped, so the result may
        hold fewer than ``limit`` items.
        NOTE(review): the server may cap the page size below ``limit`` --
        confirm against the instance configuration.

        Args:
            repo: Repository in ``owner/name`` form.
            limit: Page size requested from the API.

        Returns:
            The merged PR objects, or an empty list on any failure (the
            failure is reported on stdout).
        """
        url = f"{self.base_url}/api/v1/repos/{repo}/pulls"
        query = {"state": "closed", "limit": limit}

        try:
            response = requests.get(
                url, headers=self.headers, params=query, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as e:
            print(f"Exception fetching PRs: {e}")
            return []

        if response.status_code != 200:
            print(f"Error fetching PRs: {response.status_code}")
            return []

        try:
            payload = response.json()
        except ValueError as e:  # body was not valid JSON
            print(f"Exception fetching PRs: {e}")
            return []

        # Guard against a non-list body so the filter below cannot raise.
        if not isinstance(payload, list):
            print(f"Exception fetching PRs: unexpected payload type {type(payload).__name__}")
            return []

        return [pr for pr in payload if isinstance(pr, dict) and pr.get("merged_at")]

    def get_pr_diff(self, repo: str, pr_number: int) -> Optional[str]:
        """Fetch the unified diff of one pull request.

        Args:
            repo: Repository in ``owner/name`` form.
            pr_number: Index of the pull request inside ``repo``.

        Returns:
            The diff text, or ``None`` when the request fails or the server
            answers with a non-200 status (the failure is reported on stdout).
        """
        url = f"{self.base_url}/api/v1/repos/{repo}/pulls/{pr_number}.diff"

        try:
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"Exception fetching diff for PR #{pr_number}: {e}")
            return None

        if response.status_code != 200:
            print(f"Error fetching diff for PR #{pr_number}: {response.status_code}")
            return None
        return response.text

    @staticmethod
    def _repo_full_name(pr: Dict[str, Any]) -> str:
        """Return the PR's repository ``full_name``, or ``""`` if unavailable.

        The legacy top-level ``repo`` key is honoured first; otherwise the
        target repository under ``base.repo`` is used. Null or non-dict
        values at any level are tolerated.
        """
        base = pr.get("base")
        candidates = (pr.get("repo"), base.get("repo") if isinstance(base, dict) else None)
        for candidate in candidates:
            if isinstance(candidate, dict) and candidate.get("full_name"):
                return candidate["full_name"]
        return ""

    def _metadata(self, pr: Dict[str, Any], example_type: str) -> Dict[str, Any]:
        """Build the metadata record attached to every training example."""
        return {
            "pr_number": pr["number"],
            "repo": self._repo_full_name(pr),
            "type": example_type,
        }

    def extract_training_examples(
        self, pr: Dict[str, Any], diff: Optional[str]
    ) -> List[Dict[str, Any]]:
        """Derive prompt/completion pairs from one pull request.

        Up to three examples are produced, in this order:

        1. ``pr_description`` -- when the PR has both a title and a body.
        2. ``code_review`` -- when a non-empty diff is available.
        3. ``commit_message`` -- when the PR has a title.

        Args:
            pr: Pull request object as returned by the API; must contain
                ``number``.
            diff: Diff text of the PR, or ``None``/empty if unavailable.

        Returns:
            A list of dicts with ``prompt``, ``completion`` and ``metadata``.

        Raises:
            KeyError: If ``pr`` has no ``number`` key and at least one
                example would be produced.
        """
        examples: List[Dict[str, Any]] = []
        title = pr.get("title")
        body = pr.get("body")

        # Example 1: PR title -> PR description.
        if title and body:
            examples.append({
                "prompt": f"Write a pull request description for: {pr['title']}",
                "completion": pr["body"],
                "metadata": self._metadata(pr, "pr_description"),
            })

        # Example 2: diff -> summary of the change statistics.
        if diff:
            if len(diff) > self.MAX_DIFF_CHARS:
                # Keep prompts bounded; the ellipsis marks the cut.
                diff_truncated = diff[:self.MAX_DIFF_CHARS] + "..."
            else:
                diff_truncated = diff

            examples.append({
                "prompt": f"Review this code change:\n```\n{diff_truncated}\n```",
                "completion": f"This PR modifies code in {pr.get('changed_files', 0)} files with {pr.get('additions', 0)} additions and {pr.get('deletions', 0)} deletions.",
                "metadata": self._metadata(pr, "code_review"),
            })

        # Example 3: PR title -> conventional-commit style message.
        if title:
            examples.append({
                "prompt": f"Generate a commit message for changes: {pr['title']}",
                "completion": f"feat: {pr['title'].lower()}",
                "metadata": self._metadata(pr, "commit_message"),
            })

        return examples

    def collect_training_data(self, repo: str, output_file: str, limit: int = 50) -> int:
        """Collect examples from a repository's merged PRs into a JSONL file.

        ``output_file`` is always (re)written, even when no example was
        collected, with one JSON object per line.

        Args:
            repo: Repository in ``owner/name`` form.
            output_file: Path of the JSONL file to write.
            limit: Page size passed to :meth:`get_merged_prs`.

        Returns:
            The number of examples written.
        """
        print(f"Collecting training data from {repo}...")

        prs = self.get_merged_prs(repo, limit)
        print(f"Found {len(prs)} merged PRs")

        all_examples: List[Dict[str, Any]] = []

        for i, pr in enumerate(prs):
            print(f"Processing PR #{pr['number']} ({i+1}/{len(prs)})...")

            # A failed diff fetch yields None; the PR still contributes its
            # title/body based examples.
            diff = self.get_pr_diff(repo, pr["number"])

            examples = self.extract_training_examples(pr, diff)
            all_examples.extend(examples)

            print(f"  Extracted {len(examples)} examples")

        # json.dumps escapes non-ASCII by default, so the bytes written are
        # the same under any ASCII-compatible encoding; utf-8 is made explicit.
        with open(output_file, 'w', encoding="utf-8") as f:
            for example in all_examples:
                f.write(json.dumps(example) + '\n')

        print(f"Saved {len(all_examples)} training examples to {output_file}")
        return len(all_examples)
|
|
|
|
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point: collect training data for one repository.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        ``0`` when at least one training example was collected, ``1`` when
        the token could not be read, was empty, or nothing was collected.
    """
    import os  # local import: only needed to expand "~" in the token path

    parser = argparse.ArgumentParser(description="Collect training data from merged PRs")
    parser.add_argument("--repo", required=True, help="Repository (e.g., Timmy_Foundation/timmy-home)")
    # Default lives under the invoking user's home rather than a fixed account.
    parser.add_argument("--token-file", default="~/.config/gitea/token", help="Token file")
    parser.add_argument("--output", default="training_data.jsonl", help="Output file")
    parser.add_argument("--limit", type=int, default=50, help="Max PRs to process")

    args = parser.parse_args(argv)

    # Read token
    token_path = os.path.expanduser(args.token_file)
    try:
        with open(token_path, encoding="utf-8") as f:
            token = f.read().strip()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading token: {e}")
        return 1

    # An empty token would only produce authentication failures later on.
    if not token:
        print(f"Error reading token: {token_path} is empty")
        return 1

    # Create collector
    collector = TrainingDataCollector(token)

    # Collect data
    count = collector.collect_training_data(args.repo, args.output, args.limit)

    if count <= 0:
        print("No training examples collected.")
        return 1

    print(f"\nSuccess! Collected {count} training examples.")
    print("Next steps:")
    print(f"1. Review the data: head {args.output}")
    print(f"2. Clean and format: python3 prepare_training_data.py --input {args.output}")
    print("3. Fine-tune a model: ./llama.cpp/main -m base.gguf -f formatted_data.jsonl")
    return 0
|
|
|
|
if __name__ == "__main__":
    # Script entry: run main() and hand its return value to the interpreter
    # as the process exit status.
    import sys

    exit_status = main()
    sys.exit(exit_status)
|