#!/usr/bin/env python3
"""
normalize-code-blocks.py — Fix inconsistent indentation in training data code blocks.

When fenced code blocks are embedded in JSONL string values, indentation
accumulates from the surrounding context. This script removes the common
leading whitespace of each block with textwrap.dedent and trims blank lines
at the end of the block. Relative indentation inside a block is preserved;
the code is not re-indented to any particular width.

Usage:
    python3 scripts/normalize-code-blocks.py training/data/preference_pairs.jsonl
    python3 scripts/normalize-code-blocks.py --dry-run training/data/*.jsonl
    python3 scripts/normalize-code-blocks.py --check training/data/*.jsonl  # CI mode
"""

import argparse
import json
import re
import sys
import textwrap
from pathlib import Path

# Fence languages whose content is normalized. The empty string stands for an
# untagged fence (a bare ``` line).
NORMALIZED_LANGS = frozenset(
    {
        "",
        "python",
        "py",
        "bash",
        "sh",
        "javascript",
        "js",
        "typescript",
        "ts",
        "go",
        "rust",
        "ruby",
    }
)

# Matches ```<lang> ... ``` blocks inside string values.
#
# Every fence is matched regardless of its language tag so that opening and
# closing fences are always paired correctly. If blocks with an unlisted tag
# (e.g. ```json) were not consumed here, their *closing* fence would be taken
# for an opening one and the prose up to the next fence would be dedented as
# if it were code. normalize_code_block() decides which blocks are rewritten.
CODE_BLOCK_RE = re.compile(
    r"(?P<open>```(?P<lang>[^\s`]*)\s*\n)"
    r"(?P<code>.*?)"
    r"(?P<close>```)",
    re.DOTALL,
)


def normalize_code_block(match: re.Match) -> str:
    """Normalize indentation in a single code block.

    Args:
        match: A match carrying the named groups ``open``, ``code`` and
            ``close``. An optional ``lang`` group holds the fence language;
            when it is absent the block is treated as untagged.

    Returns:
        The replacement text for the whole match. Blocks that are empty or
        whose language is not in NORMALIZED_LANGS are returned unchanged.
    """
    lang = match.groupdict().get("lang") or ""
    if lang not in NORMALIZED_LANGS:
        return match.group(0)

    open_tag = match.group("open")
    code = match.group("code")
    close_tag = match.group("close")

    # Skip empty blocks
    if not code.strip():
        return match.group(0)

    # Remove the indentation shared by all non-blank lines.
    lines = textwrap.dedent(code).split("\n")

    # Strip leading/trailing blank lines without repeated O(n) list pops.
    start, end = 0, len(lines)
    while start < end and not lines[start].strip():
        start += 1
    while end > start and not lines[end - 1].strip():
        end -= 1

    normalized = "\n".join(lines[start:end])
    return f"{open_tag}{normalized}\n{close_tag}"


def process_line(line: str) -> tuple[str, int]:
    """Process a single JSONL line.

    Args:
        line: One line of a JSONL file, with or without its trailing newline.

    Returns:
        ``(new_line, num_fixes)`` where ``num_fixes`` is the number of code
        blocks whose text changed. When nothing changed, or the line is not
        valid JSON, the input line is returned byte-for-byte so that untouched
        records are never re-serialized.
    """
    try:
        obj = json.loads(line)
    except json.JSONDecodeError:
        return line, 0

    fixes = 0

    def counting_normalize(match: re.Match) -> str:
        # Count per block (not per string) so the totals reported by main()
        # really are "code blocks".
        nonlocal fixes
        replacement = normalize_code_block(match)
        if replacement != match.group(0):
            fixes += 1
        return replacement

    def fix_strings(value):
        # Walk the decoded JSON value and rewrite every string leaf.
        if isinstance(value, str):
            return CODE_BLOCK_RE.sub(counting_normalize, value)
        if isinstance(value, dict):
            return {key: fix_strings(item) for key, item in value.items()}
        if isinstance(value, list):
            return [fix_strings(item) for item in value]
        return value

    fixed_obj = fix_strings(obj)
    if fixes == 0:
        return line, 0
    return json.dumps(fixed_obj, ensure_ascii=False) + "\n", fixes


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.

    Exits with status 1 in ``--check`` mode when any block needs fixing,
    otherwise with status 0.
    """
    parser = argparse.ArgumentParser(description="Normalize code block indentation in JSONL training data")
    parser.add_argument("files", nargs="+", help="JSONL files to process")
    parser.add_argument("--dry-run", action="store_true", help="Show changes without writing")
    parser.add_argument("--check", action="store_true", help="CI mode: exit 1 if fixes needed")
    args = parser.parse_args(argv)

    total_fixes = 0
    total_lines = 0
    files_changed = 0

    for filepath in args.files:
        path = Path(filepath)
        if not path.exists():
            print(f"SKIP: {path} not found", file=sys.stderr)
            continue

        # Read as UTF-8 and split on newlines only. str.splitlines() would
        # also split on U+2028/U+2029/U+0085, which may legally appear
        # unescaped inside JSON strings and would break a record in two.
        with path.open(encoding="utf-8") as handle:
            lines = handle.readlines()

        fixed_lines = []
        file_fixes = 0

        for line in lines:
            if not line.strip():
                fixed_lines.append(line)
                continue
            fixed_line, n = process_line(line)
            fixed_lines.append(fixed_line)
            file_fixes += n
            total_lines += 1

        if file_fixes > 0:
            files_changed += 1
            total_fixes += file_fixes
            print(f"{'CHECK' if args.check else 'FIX'}: {path} — {file_fixes} code blocks normalized")

            if args.check or args.dry_run:
                # Show which lines would change; nothing is written.
                for lineno, (old, new) in enumerate(zip(lines, fixed_lines), start=1):
                    if old != new:
                        print(f"  Line {lineno}: indentation changed")
            else:
                path.write_text("".join(fixed_lines), encoding="utf-8")
                print(f"  Written: {path}")
        else:
            print(f"OK: {path} — no indentation issues")

    print(f"\nSummary: {total_fixes} code blocks fixed across {files_changed} files ({total_lines} lines processed)")

    if args.check and total_fixes > 0:
        print("FAIL: Code block indentation issues found. Run without --check to fix.")
        sys.exit(1)

    sys.exit(0)


if __name__ == "__main__":
    main()