# timmy-config/training/scripts/fix_training_indentation.py

#!/usr/bin/env python3
"""
Fix Training Data Code Block Indentation
Issue #750: Training data code blocks have inconsistent indentation

Normalizes code block indentation in JSONL training data files using textwrap.dedent.

Usage:
    python3 fix_training_indentation.py --input data.jsonl
    python3 fix_training_indentation.py --input data.jsonl --output fixed.jsonl
    python3 fix_training_indentation.py --input data.jsonl --dry-run
"""

import json
import re
import sys
import textwrap
from pathlib import Path


def fix_code_block_indentation(text):
    """
    Normalize the indentation of fenced code blocks found in ``text``.

    A block starts at a triple-backtick fence, optionally followed by a
    language tag (``python``, ``bash``, ``c++``, ``c#``, ``objective-c`` ...)
    and a newline, and ends at the next triple-backtick fence. The content
    between the fences is dedented so that its least-indented non-blank line
    starts at column 0; relative indentation between lines is preserved.
    Text outside the matched blocks is left untouched.

    Note that ``textwrap.dedent`` also normalizes whitespace-only lines, which
    includes any indentation sitting in front of the closing fence.

    Args:
        text: String that may contain fenced code blocks. Falsy values
            (``None``, ``""``) are returned unchanged.

    Returns:
        A ``(fixed_text, fixes)`` tuple where ``fixes`` is the number of code
        blocks whose content was changed.
    """
    if not text or '```' not in text:
        return text, 0

    # Opening fence + optional language tag + newline, then lazily up to the
    # next fence. The tag class is wider than \w so that tags such as "c++",
    # "c#" or "objective-c" (and a tag followed by stray blanks) are accepted.
    # If an opener is not recognized, the closing fence of that block gets
    # mistaken for the start of a new block and the prose that follows it is
    # dedented as if it were code.
    code_block_pattern = re.compile(
        r'(```[\w+#.\-]*[ \t]*\n)(.*?)(```)',
        re.DOTALL
    )
    fixes = 0

    def fix_block(match):
        nonlocal fixes
        opening, code, closing = match.groups()

        # Empty or whitespace-only blocks are returned exactly as found.
        if not code.strip():
            return match.group(0)

        # Remove the whitespace prefix shared by all non-blank lines.
        dedented = textwrap.dedent(code)

        # textwrap.dedent only strips a prefix that is character-for-character
        # identical on every line. When lines mix different whitespace
        # characters some indentation can remain on every line, so strip the
        # smallest remaining indent, counted in characters.
        lines = dedented.split('\n')
        min_indent = min(
            (len(line) - len(line.lstrip()) for line in lines if line.strip()),
            default=0,
        )
        if min_indent > 0:
            dedented = '\n'.join(
                line[min_indent:] if line.strip() else line for line in lines
            )

        if dedented != code:
            fixes += 1

        return opening + dedented + closing

    result = code_block_pattern.sub(fix_block, text)
    return result, fixes


def process_jsonl_file(input_path, output_path=None, dry_run=False):
    """Fix code block indentation in every record of a JSONL file.

    Each non-blank line is parsed as JSON and every string value in it, at
    any nesting depth inside objects and arrays, is passed through
    ``fix_code_block_indentation``. Lines that are not valid JSON are reported
    and copied to the output unchanged; blank lines are dropped.

    Args:
        input_path: Path of the JSONL file to read.
        output_path: Path of the file to write. Defaults to the input path
            with its suffix replaced by ``.fixed.jsonl``.
        dry_run: When true, only report what would change; nothing is written.

    Returns:
        A ``(total_fixes, entries_with_fixes)`` tuple: the number of code
        blocks changed and the number of records containing at least one
        changed block. ``(0, 0)`` is returned when the input does not exist.
    """
    input_path = Path(input_path)
    if output_path is None:
        output_path = input_path.with_suffix('.fixed.jsonl')
    else:
        output_path = Path(output_path)

    if not input_path.exists():
        print(f"Error: {input_path} does not exist")
        return 0, 0

    # The whole input is read before anything is written, so writing the
    # result back to the same path is safe.
    with open(input_path, 'r', encoding='utf-8') as f:
        raw_lines = f.readlines()

    total_entries = 0
    total_fixes = 0
    entries_with_fixes = 0
    fixed_lines = []

    for line_number, raw_line in enumerate(raw_lines, start=1):
        line = raw_line.strip()
        if not line:
            continue

        try:
            entry = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Warning: Line {line_number} is not valid JSON: {e}")
            fixed_lines.append(line)
            continue

        total_entries += 1

        # Walk the whole decoded value: records commonly nest their text
        # (e.g. a list of message objects), and a JSON line is not
        # guaranteed to be an object at all.
        entry, entry_fixes = _fix_nested_strings(entry)

        if entry_fixes > 0:
            entries_with_fixes += 1
            total_fixes += entry_fixes

        fixed_lines.append(json.dumps(entry, ensure_ascii=False))

    if dry_run:
        print(f"DRY RUN: Would fix {total_fixes} code blocks in {entries_with_fixes}/{total_entries} entries")
        return total_fixes, entries_with_fixes

    # Write fixed data, one JSON document per line.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{line}\n" for line in fixed_lines)

    print(f"Fixed {total_fixes} code blocks in {entries_with_fixes}/{total_entries} entries")
    print(f"Output: {output_path}")

    return total_fixes, entries_with_fixes


def _fix_nested_strings(value):
    """Apply fix_code_block_indentation to every string inside a decoded JSON value.

    Dicts and lists are updated in place (dict keys are never modified);
    any other non-string value is returned as is.

    Returns:
        A ``(value, fixes)`` tuple where ``fixes`` is the total number of
        code blocks changed anywhere inside ``value``.
    """
    if isinstance(value, str):
        return fix_code_block_indentation(value)

    if isinstance(value, dict):
        children = list(value.items())
    elif isinstance(value, list):
        children = list(enumerate(value))
    else:
        return value, 0

    fixes = 0
    for key, child in children:
        fixed_child, child_fixes = _fix_nested_strings(child)
        if child_fixes:
            value[key] = fixed_child
            fixes += child_fixes
    return value, fixes


def main():
    """Command-line entry point: parse arguments and run the fixer.

    Exits through ``parser.error`` (usage message on stderr, exit status 2)
    when the input file does not exist, so a missing file is never reported
    as "No fixes needed".
    """
    import argparse
    parser = argparse.ArgumentParser(description='Fix training data code block indentation')
    parser.add_argument('--input', required=True, help='Input JSONL file')
    parser.add_argument('--output', default=None, help='Output JSONL file (default: input.fixed.jsonl)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be fixed without writing')
    args = parser.parse_args()

    # process_jsonl_file returns (0, 0) for a missing file, which is
    # indistinguishable from "nothing to fix"; reject the path up front.
    if not Path(args.input).exists():
        parser.error(f"input file does not exist: {args.input}")

    fixes, _entries = process_jsonl_file(args.input, args.output, args.dry_run)

    if fixes == 0:
        print("No fixes needed - code blocks are properly indented")
    elif not args.dry_run:
        print("Done!")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()