#!/usr/bin/env python3
"""
validate.py — Validate scene description JSONL files against schema.

Usage:
    python3 validate.py scene-descriptions-r&b-soul.jsonl
    python3 validate.py *.jsonl
    python3 validate.py --schema schema.json *.jsonl

Exit codes:
    0 = all entries valid
    1 = validation errors found (including lines that are not valid JSON)
    2 = an input file or the schema is missing or cannot be loaded
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Any, List, Optional, Tuple

SCRIPT_DIR = Path(__file__).parent.resolve()
DEFAULT_SCHEMA = SCRIPT_DIR / "schema.json"

# M:SS or MM:SS. Used with fullmatch() so a trailing newline is rejected
# (a '$' anchor with match() would accept "1:23\n").
_TIMESTAMP_RE = re.compile(r"[0-9]{1,2}:[0-9]{2}")

_ENTRY_REQUIRED = (
    "song", "artist", "beat", "timestamp",
    "duration_seconds", "lyric_line", "scene",
)
_SCENE_REQUIRED = ("mood", "colors", "composition", "camera", "description")
_MAX_COLORS = 5
_MIN_DESCRIPTION_LEN = 10
_MAX_SHOWN_ERRORS = 20


def load_schema(schema_path=None):
    """Load the JSON schema and return a compiled validator.

    Args:
        schema_path: Path to a schema file, or None to use ``schema.json``
            next to this script.

    Returns:
        A ``jsonschema.Draft202012Validator`` for the schema, or None when
        the ``jsonschema`` package is not installed (callers then fall back
        to the built-in manual checks).

    Exits with status 2 when the schema file is missing or cannot be parsed.
    """
    try:
        import jsonschema
    except ImportError:
        # Fallback: manual validation. Tell the user if an explicit schema
        # was requested, because it is not going to be applied.
        if schema_path:
            print(
                "WARNING: jsonschema is not installed; ignoring --schema "
                "and using built-in checks",
                file=sys.stderr,
            )
        return None

    path = Path(schema_path) if schema_path else DEFAULT_SCHEMA
    if not path.exists():
        print(f"ERROR: Schema not found: {path}", file=sys.stderr)
        sys.exit(2)

    try:
        with open(path, encoding="utf-8") as f:
            schema = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        print(f"ERROR: Cannot load schema {path}: {exc}", file=sys.stderr)
        sys.exit(2)

    return jsonschema.Draft202012Validator(schema)


def _is_positive_int(value: Any) -> bool:
    """Return True for an int >= 1; bool is excluded although it subclasses int."""
    return isinstance(value, int) and not isinstance(value, bool) and value >= 1


def _check_top_level_string(entry: dict, field: str, line_num: int) -> List[str]:
    """Check that a top-level field, when present, is a non-empty string."""
    if field not in entry:
        return []
    value = entry[field]
    if not isinstance(value, str):
        return [f"line {line_num}: '{field}' must be string"]
    if len(value) == 0:
        return [f"line {line_num}: '{field}' must not be empty"]
    return []


def _check_scene_string(scene: dict, field: str, line_num: int) -> List[str]:
    """Check that a scene field, when present, is a non-empty string."""
    if field not in scene:
        return []
    value = scene[field]
    if not isinstance(value, str) or len(value) == 0:
        return [f"line {line_num}: scene.{field} must be non-empty string"]
    return []


def _check_scene(scene: dict, line_num: int) -> List[str]:
    """Validate the nested ``scene`` object and return its error messages."""
    errors = [
        f"line {line_num}: scene missing required field '{field}'"
        for field in _SCENE_REQUIRED
        if field not in scene
    ]

    errors.extend(_check_scene_string(scene, "mood", line_num))

    if "colors" in scene:
        colors = scene["colors"]
        if not isinstance(colors, list) or len(colors) == 0:
            errors.append(f"line {line_num}: scene.colors must be non-empty array")
        else:
            if len(colors) > _MAX_COLORS:
                errors.append(
                    f"line {line_num}: scene.colors has {len(colors)} items (max 5)"
                )
            # Items are inspected only for a real list: iterating a scalar
            # would raise, and iterating a string would check its characters.
            for i, color in enumerate(colors):
                if not isinstance(color, str) or len(color) == 0:
                    errors.append(
                        f"line {line_num}: scene.colors[{i}] must be non-empty string"
                    )

    errors.extend(_check_scene_string(scene, "composition", line_num))
    errors.extend(_check_scene_string(scene, "camera", line_num))

    if "description" in scene:
        description = scene["description"]
        if not isinstance(description, str):
            errors.append(f"line {line_num}: scene.description must be string")
        elif len(description) < _MIN_DESCRIPTION_LEN:
            errors.append(
                f"line {line_num}: scene.description too short "
                f"({len(description)} chars, min 10)"
            )

    return errors


def validate_entry_manual(entry, line_num):
    """Manual validation without jsonschema dependency.

    Args:
        entry: The decoded JSON value of one JSONL line.
        line_num: 1-based line number, used as the prefix of every message.

    Returns:
        A list of error strings; empty when the entry passes every check.
    """
    # A JSONL line may decode to a list, number, string or null; none of
    # those can be inspected field by field.
    if not isinstance(entry, dict):
        return [f"line {line_num}: entry must be a JSON object"]

    # Required top-level fields
    errors = [
        f"line {line_num}: missing required field '{field}'"
        for field in _ENTRY_REQUIRED
        if field not in entry
    ]

    # Type checks
    errors.extend(_check_top_level_string(entry, "song", line_num))
    errors.extend(_check_top_level_string(entry, "artist", line_num))

    if "beat" in entry and not _is_positive_int(entry["beat"]):
        errors.append(f"line {line_num}: 'beat' must be positive integer")

    if "timestamp" in entry:
        timestamp = entry["timestamp"]
        if not isinstance(timestamp, str) or not _TIMESTAMP_RE.fullmatch(timestamp):
            errors.append(
                f"line {line_num}: 'timestamp' must be M:SS or MM:SS format, "
                f"got '{timestamp}'"
            )

    if "duration_seconds" in entry and not _is_positive_int(entry["duration_seconds"]):
        errors.append(f"line {line_num}: 'duration_seconds' must be positive integer")

    errors.extend(_check_top_level_string(entry, "lyric_line", line_num))

    # Scene checks. A present-but-null scene is a type error, not a pass.
    if "scene" in entry:
        scene = entry["scene"]
        if isinstance(scene, dict):
            errors.extend(_check_scene(scene, line_num))
        else:
            errors.append(f"line {line_num}: 'scene' must be object")

    return errors


def validate_file(filepath, schema_validator=None):
    """Validate a single JSONL file.

    Blank lines are skipped and do not count as entries.

    Args:
        filepath: Path of the JSONL file to read (decoded as UTF-8).
        schema_validator: Validator returned by ``load_schema``, or None to
            use the built-in manual checks.

    Returns:
        A tuple ``(errors, line_count, valid_count)``: the list of error
        strings, the number of non-blank lines, and the number of lines
        that produced no error. A missing file yields one error and zero
        counts.
    """
    if not os.path.exists(filepath):
        print(f"ERROR: File not found: {filepath}", file=sys.stderr)
        # Always return a list so callers can take len() of the result.
        return [f"file not found: {filepath}"], 0, 0

    errors: List[str] = []
    line_count = 0
    valid_count = 0

    with open(filepath, encoding="utf-8") as f:
        for line_num, raw_line in enumerate(f, 1):
            line = raw_line.strip()
            if not line:
                continue
            line_count += 1

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"line {line_num}: JSON parse error: {e}")
                continue

            if schema_validator is not None:
                # Use jsonschema
                entry_errors = [
                    f"line {line_num}: {err.message}"
                    for err in schema_validator.iter_errors(entry)
                ]
            else:
                # Manual validation
                entry_errors = validate_entry_manual(entry, line_num)

            if entry_errors:
                errors.extend(entry_errors)
            else:
                valid_count += 1

    return errors, line_count, valid_count


def main():
    """Parse the command line, validate every file and exit with a status.

    Exit status is 2 when any input file is missing, otherwise 1 when any
    validation error was found, otherwise 0.
    """
    parser = argparse.ArgumentParser(description="Validate scene description JSONL files")
    parser.add_argument("files", nargs="+", help="JSONL files to validate")
    parser.add_argument("--schema", default=None, help="Schema file path")
    parser.add_argument("--quiet", action="store_true", help="Only show errors")
    args = parser.parse_args()

    schema_validator = load_schema(args.schema)

    total_errors = 0
    total_lines = 0
    total_valid = 0
    missing_files = 0

    for filepath in args.files:
        if not os.path.exists(filepath):
            # Reported here rather than through validate_file so the file
            # is counted towards exit status 2 and reported only once.
            print(f"ERROR: File not found: {filepath}", file=sys.stderr)
            missing_files += 1
            continue

        errors, line_count, valid_count = validate_file(filepath, schema_validator)
        total_errors += len(errors)
        total_lines += line_count
        total_valid += valid_count

        if errors:
            print(f"\n{filepath}: {len(errors)} errors in {line_count} entries")
            for err in errors[:_MAX_SHOWN_ERRORS]:  # Show first 20 errors
                print(f"  {err}")
            if len(errors) > _MAX_SHOWN_ERRORS:
                print(f"  ... and {len(errors) - _MAX_SHOWN_ERRORS} more")
        elif not args.quiet:
            print(f"{filepath}: OK ({valid_count} entries)")

    # Summary
    print(f"\nTotal: {total_valid}/{total_lines} valid, {total_errors} errors")

    if missing_files:
        sys.exit(2)
    sys.exit(0 if total_errors == 0 else 1)


if __name__ == "__main__":
    main()