#!/usr/bin/env python3
"""
Fix Training Data Code Block Indentation
Issue #750: Training data code blocks have inconsistent indentation

Normalizes the indentation of fenced (```) code blocks found in the string
values of JSONL training data files, using textwrap.dedent.

Usage:
    python3 fix_training_indentation.py --input data.jsonl
    python3 fix_training_indentation.py --input data.jsonl --output fixed.jsonl
    python3 fix_training_indentation.py --input data.jsonl --dry-run
"""

import argparse
import json
import re
import sys
import textwrap
from pathlib import Path


# Opening fence (``` + optional language tag + end of line), block body, and
# closing fence.
#
# The language tag accepts tags such as "c++", "objective-c" or "f#" and the
# line may end with trailing blanks or CRLF.  An opening fence that fails to
# match is skipped, and the scanner then mistakes its *closing* fence for the
# next opening one, which mis-pairs every later block in the same string.
# The tag must start with a word character so that a closing fence followed
# by punctuation ("```.") is not taken for an opening fence.
_CODE_BLOCK_RE = re.compile(
    r'(```(?:\w[\w+#.-]*)?[ \t]*\r?\n)(.*?)(```)',
    re.DOTALL,
)


def _dedent_code(code):
    """Return *code* with the indentation shared by its non-blank lines removed."""
    dedented = textwrap.dedent(code)

    # textwrap.dedent only strips a prefix of spaces/tabs that is identical on
    # every non-blank line.  If every non-blank line still starts with
    # whitespace after that (e.g. tabs mixed with spaces), fall back to
    # removing the smallest leading-whitespace width from each of them.
    lines = dedented.split('\n')
    widths = [len(line) - len(line.lstrip()) for line in lines if line.strip()]
    smallest = min(widths, default=0)
    if smallest > 0:
        dedented = '\n'.join(
            line[smallest:] if line.strip() else line for line in lines
        )
    return dedented


def fix_code_block_indentation(text):
    """
    Normalize the indentation of every fenced code block in *text*.

    A block is an opening fence (three backticks, an optional language tag,
    then a newline), the code, and the next run of three backticks.  Only the
    code between the fences is rewritten; the fences themselves and all text
    outside the blocks are returned unchanged.

    Args:
        text: String that may contain fenced code blocks.

    Returns:
        A ``(new_text, fixes)`` tuple where ``fixes`` is the number of blocks
        whose content was actually changed.  Falsy input and input without
        any fence are returned as-is with a count of 0.
    """
    if not text or '```' not in text:
        return text, 0

    fixes = 0

    def fix_block(match):
        nonlocal fixes
        opening, code, closing = match.groups()

        # Leave empty / whitespace-only blocks untouched.
        if not code.strip():
            return match.group(0)

        dedented = _dedent_code(code)
        if dedented != code:
            fixes += 1
        return opening + dedented + closing

    result = _CODE_BLOCK_RE.sub(fix_block, text)
    return result, fixes


def _fix_strings_in_value(value):
    """
    Recursively fix code blocks in every string reachable from *value*.

    Strings are fixed directly; lists and dict values are walked
    recursively; dict keys and all other JSON types are returned unchanged.

    Returns:
        A ``(new_value, fixes)`` tuple.
    """
    if isinstance(value, str):
        return fix_code_block_indentation(value)

    if isinstance(value, list):
        fixed_items = []
        total = 0
        for item in value:
            fixed_item, count = _fix_strings_in_value(item)
            fixed_items.append(fixed_item)
            total += count
        return fixed_items, total

    if isinstance(value, dict):
        fixed_map = {}
        total = 0
        for key, item in value.items():
            fixed_map[key], count = _fix_strings_in_value(item)
            total += count
        return fixed_map, total

    return value, 0


def process_jsonl_file(input_path, output_path=None, dry_run=False):
    """
    Fix code block indentation in every entry of a JSONL file.

    Each non-blank line is parsed as JSON and every string inside it,
    including strings nested in lists and objects, is passed through
    fix_code_block_indentation.  Lines that are not valid JSON are reported
    with a warning and copied to the output unchanged.  Blank lines are
    dropped.

    Args:
        input_path: Path of the JSONL file to read.
        output_path: Path to write; defaults to the input path with its
            suffix replaced by ``.fixed.jsonl``.
        dry_run: When true, only report what would change; nothing is
            written.

    Returns:
        A ``(total_fixes, entries_with_fixes)`` tuple; ``(0, 0)`` when the
        input file does not exist.
    """
    input_path = Path(input_path)
    if output_path is None:
        output_path = input_path.with_suffix('.fixed.jsonl')
    else:
        output_path = Path(output_path)

    if not input_path.exists():
        print(f"Error: {input_path} does not exist")
        return 0, 0

    total_entries = 0
    total_fixes = 0
    entries_with_fixes = 0
    fixed_lines = []

    # The input is fully consumed before the output is opened, so writing
    # back to the same path as the input stays safe.
    with open(input_path, 'r', encoding='utf-8') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Warning: Line {line_number} is not valid JSON: {e}")
                fixed_lines.append(line)
                continue

            total_entries += 1
            entry, entry_fixes = _fix_strings_in_value(entry)

            if entry_fixes > 0:
                entries_with_fixes += 1
                total_fixes += entry_fixes

            fixed_lines.append(json.dumps(entry, ensure_ascii=False))

    if dry_run:
        print(f"DRY RUN: Would fix {total_fixes} code blocks in {entries_with_fixes}/{total_entries} entries")
        return total_fixes, entries_with_fixes

    # Write fixed data
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in fixed_lines)

    print(f"Fixed {total_fixes} code blocks in {entries_with_fixes}/{total_entries} entries")
    print(f"Output: {output_path}")

    return total_fixes, entries_with_fixes


def main():
    """
    Command-line entry point.

    Returns:
        Process exit status: 0 on success, 1 when the input file is missing.
    """
    parser = argparse.ArgumentParser(description='Fix training data code block indentation')
    parser.add_argument('--input', required=True, help='Input JSONL file')
    parser.add_argument('--output', default=None, help='Output JSONL file (default: input.fixed.jsonl)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be fixed without writing')
    args = parser.parse_args()

    # Fail loudly on a missing input instead of falling through to the
    # "No fixes needed" message with a zero exit status.
    if not Path(args.input).exists():
        print(f"Error: {args.input} does not exist", file=sys.stderr)
        return 1

    fixes, _ = process_jsonl_file(args.input, args.output, args.dry_run)

    if fixes == 0:
        print("No fixes needed - code blocks are properly indented")
    elif not args.dry_run:
        print("Done!")
    return 0


if __name__ == '__main__':
    sys.exit(main())