#!/usr/bin/env python3
"""
Collect training data from merged PRs for fine-tuning local models.

Issue #486: [AUDIT][SERVICE] Invest in local model fine-tuning
"""
import json
import subprocess
import argparse
import sys
import requests
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime


class TrainingDataCollector:
    """Collect training data from Gitea PRs.

    Talks to a Gitea instance's REST API (v1) and turns merged pull
    requests into prompt/completion pairs written out as JSONL.
    """

    def __init__(self, token: str, base_url: str = "https://forge.alexanderwhitestone.com"):
        """Store the API token and base URL and build the auth header.

        Args:
            token: Gitea personal access token.
            base_url: Root URL of the Gitea instance (no trailing slash).
        """
        self.token = token
        self.base_url = base_url
        self.headers = {"Authorization": f"token {token}"}

    def get_merged_prs(self, repo: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Return merged PRs from *repo* (``owner/name``), newest page only.

        Fetches closed PRs and keeps those with a ``merged_at`` timestamp,
        since Gitea's ``state=closed`` includes abandoned PRs too. Errors are
        printed and an empty list is returned (best-effort collection).
        """
        url = f"{self.base_url}/api/v1/repos/{repo}/pulls?state=closed&limit={limit}"
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            if response.status_code == 200:
                prs = response.json()
                # Filter for merged PRs only; closed-but-unmerged have no merged_at.
                return [pr for pr in prs if pr.get("merged_at")]
            print(f"Error fetching PRs: {response.status_code}")
            return []
        except (requests.RequestException, ValueError) as e:
            # RequestException: network/timeout; ValueError: malformed JSON body.
            print(f"Exception fetching PRs: {e}")
            return []

    def get_pr_diff(self, repo: str, pr_number: int) -> Optional[str]:
        """Return the unified diff text for PR *pr_number*, or None on failure.

        Uses Gitea's ``/pulls/{n}.diff`` endpoint; failures are printed and
        swallowed so one bad PR does not abort a whole collection run.
        """
        url = f"{self.base_url}/api/v1/repos/{repo}/pulls/{pr_number}.diff"
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            if response.status_code == 200:
                return response.text
            print(f"Error fetching diff for PR #{pr_number}: {response.status_code}")
            return None
        except requests.RequestException as e:
            print(f"Exception fetching diff for PR #{pr_number}: {e}")
            return None

    def extract_training_examples(self, pr: Dict[str, Any], diff: Optional[str],
                                  repo: str = "") -> List[Dict[str, str]]:
        """Build up to three prompt/completion examples from one PR.

        Args:
            pr: Gitea PR payload (dict as returned by the API).
            diff: Unified diff text, or None/empty if unavailable.
            repo: Repository full name (``owner/name``) used as a fallback
                for the metadata ``repo`` field. Optional for backward
                compatibility with callers that omit it.

        Returns:
            A list of ``{"prompt", "completion", "metadata"}`` dicts.
        """
        # Gitea PR payloads nest the repository under base/head, not at top
        # level, so a bare pr["repo"] lookup yields nothing. Fall back through
        # base.repo.full_name, then the caller-supplied name.
        base_repo = (pr.get("base") or {}).get("repo") or {}
        repo_name = (
            (pr.get("repo") or {}).get("full_name", "")
            or base_repo.get("full_name", "")
            or repo
        )

        examples: List[Dict[str, Any]] = []

        # Example 1: PR title -> PR description.
        if pr.get("title") and pr.get("body"):
            examples.append({
                "prompt": f"Write a pull request description for: {pr['title']}",
                "completion": pr["body"],
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "pr_description",
                },
            })

        # Example 2: diff -> code-review summary.
        if diff:
            # Truncate very long diffs so prompts stay within model context.
            diff_truncated = diff[:2000] + "..." if len(diff) > 2000 else diff
            examples.append({
                "prompt": f"Review this code change:\n```\n{diff_truncated}\n```",
                "completion": (
                    f"This PR modifies code in {pr.get('changed_files', 0)} files "
                    f"with {pr.get('additions', 0)} additions and "
                    f"{pr.get('deletions', 0)} deletions."
                ),
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "code_review",
                },
            })

        # Example 3: PR title -> conventional commit message.
        if pr.get("title"):
            examples.append({
                "prompt": f"Generate a commit message for changes: {pr['title']}",
                "completion": f"feat: {pr['title'].lower()}",
                "metadata": {
                    "pr_number": pr["number"],
                    "repo": repo_name,
                    "type": "commit_message",
                },
            })

        return examples

    def collect_training_data(self, repo: str, output_file: str, limit: int = 50) -> int:
        """Collect examples from up to *limit* merged PRs and write JSONL.

        Returns the number of examples written to *output_file*.
        """
        print(f"Collecting training data from {repo}...")

        prs = self.get_merged_prs(repo, limit)
        print(f"Found {len(prs)} merged PRs")

        all_examples: List[Dict[str, Any]] = []
        for i, pr in enumerate(prs):
            print(f"Processing PR #{pr['number']} ({i+1}/{len(prs)})...")
            diff = self.get_pr_diff(repo, pr["number"])
            # Pass repo so metadata carries the real owner/name even though
            # the PR payload lacks a top-level "repo" key.
            examples = self.extract_training_examples(pr, diff, repo)
            all_examples.extend(examples)
            print(f"  Extracted {len(examples)} examples")

        # JSONL: one example per line. UTF-8 + ensure_ascii=False so PR
        # bodies/diffs with non-ASCII text round-trip unmangled.
        with open(output_file, 'w', encoding='utf-8') as f:
            for example in all_examples:
                f.write(json.dumps(example, ensure_ascii=False) + '\n')

        print(f"Saved {len(all_examples)} training examples to {output_file}")
        return len(all_examples)


def main():
    """CLI entry point: parse args, read the token, run the collector."""
    parser = argparse.ArgumentParser(description="Collect training data from merged PRs")
    parser.add_argument("--repo", required=True,
                        help="Repository (e.g., Timmy_Foundation/timmy-home)")
    parser.add_argument("--token-file", default="/Users/apayne/.config/gitea/token",
                        help="Token file")
    parser.add_argument("--output", default="training_data.jsonl", help="Output file")
    parser.add_argument("--limit", type=int, default=50, help="Max PRs to process")
    args = parser.parse_args()

    # Read token from disk; fail fast with a nonzero exit if unreadable.
    try:
        with open(args.token_file) as f:
            token = f.read().strip()
    except OSError as e:
        print(f"Error reading token: {e}")
        return 1

    collector = TrainingDataCollector(token)
    count = collector.collect_training_data(args.repo, args.output, args.limit)

    if count > 0:
        print(f"\nSuccess! Collected {count} training examples.")
        print("Next steps:")
        print(f"1. Review the data: head {args.output}")
        print(f"2. Clean and format: python3 prepare_training_data.py --input {args.output}")
        print("3. Fine-tune a model: ./llama.cpp/main -m base.gguf -f formatted_data.jsonl")
        return 0
    else:
        print("No training examples collected.")
        return 1


if __name__ == "__main__":
    sys.exit(main())