Files
timmy-config/scripts/local-models/collect_training_data.py
Alexander Whitestone 1350b9b177 Fix #486: Add local model fine-tuning documentation and tools
- Added comprehensive local model fine-tuning guide
- Created benchmarking script for inference performance
- Added training data collection script for merged PRs
- Documented current stack (Ollama + llama.cpp + Hermes 4)
- Provided quantization options and best practices
- Included troubleshooting and monitoring guidance

Addresses issue #486 recommendations:
✓ Documented local model stack for reproducibility
✓ Created benchmarking tools for inference latency
✓ Provided training data collection pipeline
✓ Documented quantization options for faster inference
✓ Included fine-tuning pipeline documentation
2026-04-13 21:43:12 -04:00

167 lines
6.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Collect training data from merged PRs for fine-tuning local models.
Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning
"""
import json
import subprocess
import argparse
import requests
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
class TrainingDataCollector:
    """Collect fine-tuning training examples from merged Gitea pull requests.

    Talks to the Gitea REST API (v1) with a personal access token and turns
    each merged PR into prompt/completion pairs, written out as JSONL.
    """

    def __init__(self, token: str, base_url: str = "https://forge.alexanderwhitestone.com"):
        """Store the API token and target Gitea instance.

        Args:
            token: Gitea personal access token.
            base_url: Root URL of the Gitea instance, without a trailing slash.
        """
        self.token = token
        self.base_url = base_url
        # Gitea expects "token <value>" (not "Bearer") in the Authorization header.
        self.headers = {"Authorization": f"token {token}"}

    def get_merged_prs(self, repo: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Return up to *limit* closed PRs of *repo* that were actually merged.

        "Closed" includes abandoned PRs, so results are filtered to entries
        with a non-null ``merged_at`` timestamp. Returns [] on any error.
        """
        url = f"{self.base_url}/api/v1/repos/{repo}/pulls?state=closed&limit={limit}"
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            if response.status_code != 200:
                print(f"Error fetching PRs: {response.status_code}")
                return []
            return [pr for pr in response.json() if pr.get("merged_at")]
        except Exception as e:  # best-effort script: log and continue with no data
            print(f"Exception fetching PRs: {e}")
            return []

    def get_pr_diff(self, repo: str, pr_number: int) -> Optional[str]:
        """Return the unified diff text for PR *pr_number*, or None on error."""
        url = f"{self.base_url}/api/v1/repos/{repo}/pulls/{pr_number}.diff"
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            if response.status_code != 200:
                print(f"Error fetching diff for PR #{pr_number}: {response.status_code}")
                return None
            return response.text
        except Exception as e:
            print(f"Exception fetching diff for PR #{pr_number}: {e}")
            return None

    @staticmethod
    def _repo_full_name(pr: Dict[str, Any]) -> str:
        """Best-effort repository full name for a PR payload.

        Gitea nests the repository under ``base``/``head`` rather than a
        top-level ``repo`` key; the top-level key is still checked first
        for backward compatibility with other payload shapes.
        """
        repo = pr.get("repo") or pr.get("base", {}).get("repo") or {}
        return repo.get("full_name", "")

    def extract_training_examples(self, pr: Dict[str, Any], diff: Optional[str]) -> List[Dict[str, Any]]:
        """Turn one PR (plus its diff, if available) into prompt/completion pairs.

        Emits up to three examples: a PR-description task, a shallow
        code-review task conditioned on the diff, and a commit-message task.
        """
        examples: List[Dict[str, Any]] = []
        repo_name = self._repo_full_name(pr)
        # Example 1: reproduce the PR description from its title.
        if pr.get("title") and pr.get("body"):
            examples.append({
                "prompt": f"Write a pull request description for: {pr['title']}",
                "completion": pr["body"],
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "pr_description"
                }
            })
        # Example 2: a review-style summary conditioned on the diff.
        if diff:
            # Cap the prompt size so it stays within a small model's context.
            diff_truncated = diff[:2000] + "..." if len(diff) > 2000 else diff
            examples.append({
                "prompt": f"Review this code change:\n```\n{diff_truncated}\n```",
                "completion": f"This PR modifies code in {pr.get('changed_files', 0)} files with {pr.get('additions', 0)} additions and {pr.get('deletions', 0)} deletions.",
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "code_review"
                }
            })
        # Example 3: synthesize a conventional-commit message from the title.
        # NOTE(review): hard-codes the "feat:" prefix regardless of change type.
        if pr.get("title"):
            examples.append({
                "prompt": f"Generate a commit message for changes: {pr['title']}",
                "completion": f"feat: {pr['title'].lower()}",
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "commit_message"
                }
            })
        return examples

    def collect_training_data(self, repo: str, output_file: str, limit: int = 50) -> int:
        """Collect examples from up to *limit* merged PRs and write them as JSONL.

        Returns the number of examples written to *output_file*.
        """
        print(f"Collecting training data from {repo}...")
        prs = self.get_merged_prs(repo, limit)
        print(f"Found {len(prs)} merged PRs")
        all_examples = []
        for i, pr in enumerate(prs):
            print(f"Processing PR #{pr['number']} ({i+1}/{len(prs)})...")
            diff = self.get_pr_diff(repo, pr["number"])
            examples = self.extract_training_examples(pr, diff)
            all_examples.extend(examples)
            print(f"  Extracted {len(examples)} examples")
        # Explicit UTF-8 + ensure_ascii=False: PR bodies and diffs are routinely
        # non-ASCII, and the platform default encoding is not guaranteed.
        with open(output_file, 'w', encoding='utf-8') as f:
            for example in all_examples:
                f.write(json.dumps(example, ensure_ascii=False) + '\n')
        print(f"Saved {len(all_examples)} training examples to {output_file}")
        return len(all_examples)
def main() -> int:
    """CLI entry point: parse args, read the API token, run collection.

    Returns:
        0 when at least one training example was collected, 1 otherwise
        (including token-read failures).
    """
    parser = argparse.ArgumentParser(description="Collect training data from merged PRs")
    parser.add_argument("--repo", required=True, help="Repository (e.g., Timmy_Foundation/timmy-home)")
    # Resolve per-user instead of hard-coding one developer's home directory.
    parser.add_argument("--token-file", default=str(Path.home() / ".config" / "gitea" / "token"),
                        help="Token file")
    parser.add_argument("--output", default="training_data.jsonl", help="Output file")
    parser.add_argument("--limit", type=int, default=50, help="Max PRs to process")
    args = parser.parse_args()

    # Read the token; OSError covers missing/unreadable files.
    try:
        token = Path(args.token_file).read_text(encoding="utf-8").strip()
    except OSError as e:
        print(f"Error reading token: {e}")
        return 1

    collector = TrainingDataCollector(token)
    count = collector.collect_training_data(args.repo, args.output, args.limit)
    if count > 0:
        print(f"\nSuccess! Collected {count} training examples.")
        print("Next steps:")
        print(f"1. Review the data: head {args.output}")
        print(f"2. Clean and format: python3 prepare_training_data.py --input {args.output}")
        print("3. Fine-tune a model: ./llama.cpp/main -m base.gguf -f formatted_data.jsonl")
        return 0
    print("No training examples collected.")
    return 1
if __name__ == "__main__":
    # SystemExit carries main()'s return code to the shell; this is exactly
    # what sys.exit() raises, so no sys import is needed.
    raise SystemExit(main())