diff --git a/scripts/normalize-code-blocks.py b/scripts/normalize-code-blocks.py new file mode 100644 index 00000000..32f63318 --- /dev/null +++ b/scripts/normalize-code-blocks.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +""" +normalize-code-blocks.py — Fix inconsistent indentation in training data code blocks. + +When code blocks are embedded in JSONL as triple-quoted strings, indentation +accumulates from the surrounding context. This script normalizes code block +content using textwrap.dedent and consistent 4-space indentation. + +Usage: + python3 scripts/normalize-code-blocks.py training/data/preference_pairs.jsonl + python3 scripts/normalize-code-blocks.py --dry-run training/data/*.jsonl + python3 scripts/normalize-code-blocks.py --check training/data/*.jsonl # CI mode +""" +import argparse +import json +import re +import sys +import textwrap +from pathlib import Path + +# Matches ```python ... ``` or ``` ... ``` blocks inside string values +CODE_BLOCK_RE = re.compile( + r"(?P```(?:python|py|bash|sh|javascript|js|typescript|ts|go|rust|ruby)?\s*\n)" + r"(?P.*?)" + r"(?P```)", + re.DOTALL, +) + + +def normalize_code_block(match: re.Match) -> str: + """Normalize indentation in a single code block.""" + open_tag = match.group("open") + code = match.group("code") + close_tag = match.group("close") + + # Skip empty blocks + if not code.strip(): + return match.group(0) + + # Dedent the code + dedented = textwrap.dedent(code) + + # Strip leading/trailing blank lines + lines = dedented.split("\n") + while lines and not lines[0].strip(): + lines.pop(0) + while lines and not lines[-1].strip(): + lines.pop() + + normalized = "\n".join(lines) + return f"{open_tag}{normalized}\n{close_tag}" + + +def process_line(line: str) -> tuple: + """Process a single JSONL line. Returns (new_line, num_fixes).""" + try: + obj = json.loads(line) + except json.JSONDecodeError: + return line, 0 + + fixes = 0 + + def fix_strings(obj): + nonlocal fixes + if isinstance(obj, str): + original = obj + fixed = CODE_BLOCK_RE.sub(normalize_code_block, obj) + if fixed != original: + fixes += 1 + return fixed + elif isinstance(obj, dict): + return {k: fix_strings(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [fix_strings(item) for item in obj] + return obj + + fixed_obj = fix_strings(obj) + return json.dumps(fixed_obj, ensure_ascii=False) + "\n", fixes + + +def process_file(filepath: str, dry_run: bool = False) -> dict: + """Process a single JSONL file. Returns stats dict.""" + path = Path(filepath) + if not path.exists(): + return {"file": filepath, "error": "not found", "fixes": 0, "lines": 0} + + lines = path.read_text(encoding="utf-8").splitlines() + fixed_lines = [] + total_fixes = 0 + + for line in lines: + if not line.strip(): + fixed_lines.append(line) + continue + new_line, fixes = process_line(line) + fixed_lines.append(new_line.rstrip("\n")) + total_fixes += fixes + + if total_fixes > 0 and not dry_run: + path.write_text("\n".join(fixed_lines) + "\n", encoding="utf-8") + + return { + "file": filepath, + "lines": len(lines), + "fixes": total_fixes, + "changed": total_fixes > 0, + } + + +def main(): + parser = argparse.ArgumentParser( + description="Normalize code block indentation in JSONL training data" + ) + parser.add_argument("files", nargs="+", help="JSONL files to process") + parser.add_argument("--dry-run", action="store_true", help="Show changes without writing") + parser.add_argument("--check", action="store_true", help="CI mode: exit 1 if fixes needed") + args = parser.parse_args() + + total_fixes = 0 + results = [] + + for filepath in args.files: + result = process_file(filepath, dry_run=args.dry_run or args.check) + results.append(result) + total_fixes += result["fixes"] + + if result["fixes"] > 0: + status = "FIXED" if not args.dry_run and not args.check else "WOULD FIX" + print(f" {status}: {result['file']} — {result['fixes']} code blocks normalized") + else: + print(f" OK: {result['file']}") + + print(f"\nTotal: {total_fixes} code blocks normalized across {len(results)} files") + + if args.check and total_fixes > 0: + print("FAIL: Code block indentation issues found. Run without --check to fix.") + sys.exit(1) + + +if __name__ == "__main__": + main()