#!/usr/bin/env python3
"""
validate-scene-data.py — Validate scene description JSONL files against schema.

Usage:
    python3 scripts/validate-scene-data.py training-data/*.jsonl
    python3 scripts/validate-scene-data.py training-data/scene-descriptions-rock.jsonl

Exit codes:
    0 = all entries valid
    1 = validation errors found
    2 = usage error or schema file not found

Refs: timmy-config#647
"""

import json
import os
import re
import sys
from pathlib import Path

# Try jsonschema, fall back to manual validation
try:
    import jsonschema
    HAS_JSONSCHEMA = True
except ImportError:
    HAS_JSONSCHEMA = False

# Fields every entry / every entry["scene"] object must carry.
REQUIRED_FIELDS = ("song", "artist", "beat", "timestamp", "lyric_line", "scene")
REQUIRED_SCENE_FIELDS = ("mood", "colors", "composition", "description")

# M:SS or MM:SS (minutes unbounded, seconds 00-59). Used with fullmatch() so
# that a trailing newline is rejected, which match() + "$" would let through.
TIMESTAMP_PATTERN = re.compile(r"[0-9]+:[0-5][0-9]")

MIN_DESCRIPTION_LENGTH = 10
MAX_ERRORS_SHOWN = 20


def load_schema():
    """Load the JSON schema from training-data/schema.json.

    The schema is looked up next to the repository root inferred from this
    script's location first, then relative to the current working directory.

    Returns:
        The parsed schema (whatever ``json.load`` yields for the file).

    Exits:
        With status 2 (after printing to stderr) when no schema file is found.
    """
    schema_path = Path(__file__).parent.parent / "training-data" / "schema.json"
    if not schema_path.exists():
        # Try relative to CWD
        schema_path = Path("training-data/schema.json")
    if not schema_path.exists():
        print(f"ERROR: Schema not found at {schema_path}", file=sys.stderr)
        sys.exit(2)
    with open(schema_path, encoding="utf-8") as f:
        return json.load(f)


def _check_non_empty_string(container, key, label, errors):
    """Append an error to *errors* if container[key] exists but is not a non-blank string."""
    if key not in container:
        return
    value = container[key]
    if not isinstance(value, str):
        errors.append(f"'{label}' must be a string")
    elif not value.strip():
        errors.append(f"'{label}' must not be empty")


def _validate_scene_manual(scene, errors):
    """Append to *errors* every problem found in the ``scene`` value of an entry."""
    if not isinstance(scene, dict):
        errors.append("'scene' must be an object")
        return

    errors.extend(
        f"Missing required scene field: {field}"
        for field in REQUIRED_SCENE_FIELDS
        if field not in scene
    )

    _check_non_empty_string(scene, "mood", "scene.mood", errors)

    if "colors" in scene:
        colors = scene["colors"]
        if not isinstance(colors, list):
            errors.append("'scene.colors' must be an array")
        elif not colors:
            errors.append("'scene.colors' must have at least 1 element")
        else:
            errors.extend(
                f"'scene.colors[{i}]' must be a non-empty string"
                for i, color in enumerate(colors)
                if not isinstance(color, str) or not color.strip()
            )

    _check_non_empty_string(scene, "composition", "scene.composition", errors)

    if "description" in scene:
        description = scene["description"]
        if not isinstance(description, str):
            errors.append("'scene.description' must be a string")
        elif len(description) < MIN_DESCRIPTION_LENGTH:
            errors.append(
                f"'scene.description' too short ({len(description)} chars, min 10)"
            )


def validate_entry_manual(entry, index):
    """Manual validation without jsonschema dependency.

    Args:
        entry: One decoded JSONL record (any JSON value).
        index: Position of the record in its file. Unused; kept so existing
            callers that pass it keep working.

    Returns:
        A list of human-readable error strings; empty when the entry is valid.
    """
    # A record that is not a JSON object cannot carry any of the fields below;
    # report that once instead of raising TypeError on the `in` checks.
    if not isinstance(entry, dict):
        return ["Entry must be a JSON object"]

    # Required top-level fields
    errors = [
        f"Missing required field: {field}"
        for field in REQUIRED_FIELDS
        if field not in entry
    ]

    # Type checks
    _check_non_empty_string(entry, "song", "song", errors)
    _check_non_empty_string(entry, "artist", "artist", errors)

    if "beat" in entry:
        beat = entry["beat"]
        # bool is a subclass of int in Python, but JSON true/false is not an integer.
        if not isinstance(beat, int) or isinstance(beat, bool):
            errors.append("'beat' must be an integer")
        elif beat < 1:
            errors.append("'beat' must be >= 1")

    if "timestamp" in entry:
        timestamp = entry["timestamp"]
        if not isinstance(timestamp, str) or not TIMESTAMP_PATTERN.fullmatch(timestamp):
            errors.append(
                f"'timestamp' must be M:SS or MM:SS format, got: {timestamp}"
            )

    _check_non_empty_string(entry, "lyric_line", "lyric_line", errors)

    # Scene validation
    if "scene" in entry:
        _validate_scene_manual(entry["scene"], errors)

    return errors


def _build_validator(schema):
    """Return a jsonschema validator instance for *schema*.

    The validator class is chosen from the schema's ``$schema`` keyword (or the
    library default). ``check_schema`` is called once here, so a malformed
    schema raises ``jsonschema.SchemaError`` before any entry is validated.
    """
    validator_cls = jsonschema.validators.validator_for(schema)
    validator_cls.check_schema(schema)
    return validator_cls(schema)


def _validate_entry_jsonschema(entry, validator):
    """Return one message per schema violation that *validator* finds in *entry*."""
    messages = []
    for err in validator.iter_errors(entry):
        location = "/".join(str(part) for part in err.absolute_path)
        messages.append(f"{location}: {err.message}" if location else err.message)
    return messages


def validate_file(filepath, schema):
    """Validate all entries in a JSONL file.

    Blank lines are skipped and not counted. Lines that are not valid JSON are
    counted and reported. Valid JSON is checked with jsonschema when it is
    installed, otherwise with ``validate_entry_manual``.

    Args:
        filepath: Path of the JSONL file to read (decoded as UTF-8).
        schema: Parsed JSON schema; only used when jsonschema is available.

    Returns:
        ``(total, errors)`` where ``total`` is the number of non-blank lines
        and ``errors`` is a list of messages prefixed with the line number.
    """
    # jsonschema.validate() raises on the first violation rather than returning
    # errors, so build a validator once and iterate over all violations instead.
    validator = _build_validator(schema) if HAS_JSONSCHEMA else None

    errors = []
    total = 0
    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            total += 1
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f" Line {line_num}: Invalid JSON — {e}")
                continue

            if validator is not None:
                entry_errors = _validate_entry_jsonschema(entry, validator)
            else:
                entry_errors = validate_entry_manual(entry, line_num)

            errors.extend(f" Line {line_num}: {err}" for err in entry_errors)
    return total, errors


def main():
    """Validate every file named on the command line and exit with a status code.

    Files that do not exist are reported as SKIP and do not count as errors.
    Exits 2 on a usage error, 1 if any validation error was found, else 0.
    """
    if len(sys.argv) < 2:
        print(
            "Usage: python3 scripts/validate-scene-data.py <file.jsonl> [file2.jsonl ...]",
            file=sys.stderr,
        )
        sys.exit(2)

    schema = load_schema()
    total_entries = 0
    total_errors = 0
    files_checked = 0

    for filepath in sys.argv[1:]:
        if not os.path.exists(filepath):
            print(f"SKIP: {filepath} (not found)")
            continue
        files_checked += 1
        count, errors = validate_file(filepath, schema)
        total_entries += count
        if errors:
            total_errors += len(errors)
            print(f"FAIL: {filepath} — {len(errors)} error(s) in {count} entries:")
            for err in errors[:MAX_ERRORS_SHOWN]:  # Limit output
                print(err)
            if len(errors) > MAX_ERRORS_SHOWN:
                print(f" ... and {len(errors) - MAX_ERRORS_SHOWN} more errors")
        else:
            print(f"PASS: {filepath} — {count} entries valid")

    print(f"\nSummary: {files_checked} files, {total_entries} entries, {total_errors} errors")
    if total_errors > 0:
        print("VALIDATION FAILED")
        sys.exit(1)
    else:
        print("ALL VALID")
        sys.exit(0)


if __name__ == "__main__":
    main()