#!/usr/bin/env python3
"""Validate scene description JSONL files against schema.

Every ``training-data/*.jsonl`` file is checked line by line against the
JSON Schema stored in ``training-data/schema.json``.  The script prints the
first problems it finds and exits with status 1 when any entry is invalid.
"""

import glob
import json
import sys
from pathlib import Path

try:
    import jsonschema
except ImportError:
    # Convenience bootstrap: install the one third-party dependency on first
    # use so the script can be run without a separate setup step.
    print("Installing jsonschema...")
    import subprocess

    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "jsonschema", "-q"]
    )
    import jsonschema


def validate_scene_entry(entry, schema):
    """Validate a single scene entry against the schema.

    Entries in the old format (recognised by having both ``"terse"`` and
    ``"rich"`` keys) are first rebuilt with the new format's key set; keys
    outside that set are dropped, absent optional keys become ``None``, and
    ``model`` / ``source_type`` default to ``"unknown"`` / ``"backfill"``.

    Args:
        entry: The decoded JSON value of one JSONL line.  Any JSON type is
            accepted; non-object values are passed to the schema unchanged
            so they are reported as ordinary validation failures.
        schema: The JSON Schema (as a dict) to validate against.

    Returns:
        ``(True, None)`` when the entry is valid, otherwise
        ``(False, message)`` with the validation error message.
    """
    # Only JSON objects can be old-format entries.  Without the isinstance
    # guard, `"terse" in entry` raises TypeError for numbers/booleans/null
    # and silently does a substring/element test for strings and arrays.
    if isinstance(entry, dict) and "terse" in entry and "rich" in entry:
        # Old format - normalize to new format
        entry = {
            "song": entry.get("song"),
            "beat": entry.get("beat"),
            "lyric_line": entry.get("lyric_line"),
            "scene": entry.get("scene"),
            "terse": entry["terse"],
            "rich": entry["rich"],
            "domain": entry.get("domain"),
            "source_session_id": entry.get("source_session_id"),
            "model": entry.get("model", "unknown"),
            "timestamp": entry.get("timestamp"),
            "source_type": entry.get("source_type", "backfill"),
        }

    try:
        jsonschema.validate(entry, schema)
    except jsonschema.ValidationError as e:
        return False, str(e.message)
    return True, None


def validate_file(filepath, schema):
    """Validate all entries in a JSONL file.

    Blank lines are skipped.  A line that is not valid JSON is reported and
    does not stop the remaining lines from being checked.

    Args:
        filepath: Path of the JSONL file to read.
        schema: The JSON Schema (as a dict) each entry must satisfy.

    Returns:
        A list of ``"<filepath>:<line number>: <problem>"`` strings, empty
        when every entry is valid.
    """
    errors = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"{filepath}:{line_num}: Invalid JSON: {e}")
                continue
            valid, error = validate_scene_entry(entry, schema)
            if not valid:
                errors.append(f"{filepath}:{line_num}: {error}")
    return errors


def main():
    """Validate every ``training-data/*.jsonl`` file and report the result.

    Prints at most the first 20 errors followed by a count of the rest and
    exits with status 1 if anything failed; otherwise prints a success line.
    """
    max_shown = 20  # cap the report so a badly broken file stays readable

    schema_path = Path("training-data/schema.json")
    with open(schema_path, encoding="utf-8") as f:
        schema = json.load(f)

    jsonl_files = sorted(glob.glob("training-data/*.jsonl"))

    all_errors = []
    for filepath in jsonl_files:
        all_errors.extend(validate_file(filepath, schema))

    if all_errors:
        print("Validation FAILED:")
        for error in all_errors[:max_shown]:  # Show first 20 errors
            print(f" {error}")
        if len(all_errors) > max_shown:
            print(f" ... and {len(all_errors) - max_shown} more errors")
        sys.exit(1)
    else:
        print(f"All {len(jsonl_files)} files validated successfully!")


if __name__ == "__main__":
    main()