#!/usr/bin/env python3
"""
normalize-code-blocks.py — Fix inconsistent indentation in training data code blocks.

When code blocks are embedded in JSONL as triple-quoted strings, indentation
accumulates from the surrounding context. This script normalizes code block
content using textwrap.dedent and consistent 4-space indentation.

Usage:
    python3 scripts/normalize-code-blocks.py training/data/preference_pairs.jsonl
    python3 scripts/normalize-code-blocks.py --dry-run training/data/*.jsonl
    python3 scripts/normalize-code-blocks.py --check training/data/*.jsonl   # CI mode
"""

import argparse
import json
import re
import sys
import textwrap
from pathlib import Path

# Matches ```python ... ``` or ``` ... ``` blocks inside string values.
# Named groups (required by normalize_code_block's match.group(...) calls):
#   open  — opening fence, optional language tag, trailing newline
#   code  — block body (DOTALL so it may span lines; lazy to stop at first fence)
#   close — closing fence
CODE_BLOCK_RE = re.compile(
    r"(?P<open>```(?:python|py|bash|sh|javascript|js|typescript|ts|go|rust|ruby)?\s*\n)"
    r"(?P<code>.*?)"
    r"(?P<close>```)",
    re.DOTALL,
)


def normalize_code_block(match: re.Match) -> str:
    """Normalize indentation in a single code block.

    Dedents the block body with textwrap.dedent, strips leading and
    trailing blank lines, and re-wraps it in the original fences.
    Empty / whitespace-only blocks are returned unchanged.
    """
    open_tag = match.group("open")
    code = match.group("code")
    close_tag = match.group("close")

    # Skip empty blocks — nothing to normalize.
    if not code.strip():
        return match.group(0)

    # Remove the common leading whitespace accumulated from the
    # surrounding context.
    dedented = textwrap.dedent(code)

    # Strip leading/trailing blank lines so the fence hugs the code.
    lines = dedented.split("\n")
    while lines and not lines[0].strip():
        lines.pop(0)
    while lines and not lines[-1].strip():
        lines.pop()

    normalized = "\n".join(lines)
    return f"{open_tag}{normalized}\n{close_tag}"


def process_line(line: str) -> tuple[str, int]:
    """Process a single JSONL line.

    Recursively normalizes code blocks in every string value of the
    JSON object. Lines that are not valid JSON are returned unchanged.

    Returns (new_line, num_fixes), where new_line ends with a newline
    for valid JSON input.
    """
    try:
        obj = json.loads(line)
    except json.JSONDecodeError:
        return line, 0

    fixes = 0

    def fix_strings(node):
        # Walk the JSON tree; rewrite strings, recurse into containers.
        nonlocal fixes
        if isinstance(node, str):
            original = node
            fixed = CODE_BLOCK_RE.sub(normalize_code_block, node)
            if fixed != original:
                fixes += 1
            return fixed
        elif isinstance(node, dict):
            return {k: fix_strings(v) for k, v in node.items()}
        elif isinstance(node, list):
            return [fix_strings(item) for item in node]
        return node

    fixed_obj = fix_strings(obj)
    return json.dumps(fixed_obj, ensure_ascii=False) + "\n", fixes


def process_file(filepath: str, dry_run: bool = False) -> dict:
    """Process a single JSONL file. Returns stats dict.

    With dry_run=True the file is never written, only analyzed.
    The file is rewritten only when at least one fix was made.
    """
    path = Path(filepath)
    if not path.exists():
        return {"file": filepath, "error": "not found", "fixes": 0, "lines": 0}

    lines = path.read_text(encoding="utf-8").splitlines()
    fixed_lines = []
    total_fixes = 0

    for line in lines:
        # Preserve blank lines verbatim.
        if not line.strip():
            fixed_lines.append(line)
            continue
        new_line, fixes = process_line(line)
        fixed_lines.append(new_line.rstrip("\n"))
        total_fixes += fixes

    if total_fixes > 0 and not dry_run:
        path.write_text("\n".join(fixed_lines) + "\n", encoding="utf-8")

    return {
        "file": filepath,
        "lines": len(lines),
        "fixes": total_fixes,
        "changed": total_fixes > 0,
    }


def main():
    """CLI entry point: parse args, process each file, report, set exit code."""
    parser = argparse.ArgumentParser(
        description="Normalize code block indentation in JSONL training data"
    )
    parser.add_argument("files", nargs="+", help="JSONL files to process")
    parser.add_argument("--dry-run", action="store_true", help="Show changes without writing")
    parser.add_argument("--check", action="store_true", help="CI mode: exit 1 if fixes needed")
    args = parser.parse_args()

    total_fixes = 0
    results = []
    for filepath in args.files:
        # --check implies no writes, same as --dry-run.
        result = process_file(filepath, dry_run=args.dry_run or args.check)
        results.append(result)
        total_fixes += result["fixes"]
        if result["fixes"] > 0:
            status = "FIXED" if not args.dry_run and not args.check else "WOULD FIX"
            print(f"  {status}: {result['file']} — {result['fixes']} code blocks normalized")
        else:
            print(f"  OK: {result['file']}")

    print(f"\nTotal: {total_fixes} code blocks normalized across {len(results)} files")

    if args.check and total_fixes > 0:
        print("FAIL: Code block indentation issues found. Run without --check to fix.")
        sys.exit(1)


if __name__ == "__main__":
    main()