#!/usr/bin/env python3
"""
validate-scene-data.py — Validate scene description JSONL files against schema.

Usage:
    python3 scripts/validate-scene-data.py training-data/*.jsonl
    python3 scripts/validate-scene-data.py training-data/scene-descriptions-rock.jsonl

Exit codes:
    0 = all entries valid
    1 = validation errors found
    2 = usage error or schema file not found

Refs: timmy-config#647
"""

import json
import os
import re
import sys
from pathlib import Path

# Try jsonschema, fall back to manual validation
try:
    import jsonschema
    HAS_JSONSCHEMA = True
except ImportError:
    HAS_JSONSCHEMA = False


# Fields every entry / every scene object must carry, in reporting order.
_REQUIRED_FIELDS = ("song", "artist", "beat", "timestamp", "lyric_line", "scene")
_REQUIRED_SCENE_FIELDS = ("mood", "colors", "composition", "description")

# M:SS or MM:SS (minutes unbounded, seconds 00-59). Used with fullmatch() so a
# trailing newline is rejected, which `re.match(... '$')` would let through.
_TIMESTAMP_RE = re.compile(r"[0-9]+:[0-5][0-9]")

_MIN_DESCRIPTION_LENGTH = 10
_MAX_ERRORS_SHOWN = 20


def load_schema():
    """Load the JSON schema from training-data/schema.json.

    The schema is looked up next to the repository layout implied by this
    script's location first, then relative to the current working directory.

    Returns:
        The parsed schema (whatever ``json.load`` yields for the file).

    Exits:
        With status 2 (message on stderr) when neither location exists.
    """
    candidates = [
        Path(__file__).parent.parent / "training-data" / "schema.json",
        Path("training-data/schema.json"),  # relative to CWD
    ]
    for schema_path in candidates:
        if schema_path.exists():
            with open(schema_path, encoding="utf-8") as f:
                return json.load(f)

    tried = ", ".join(str(path) for path in candidates)
    print(f"ERROR: Schema not found at {tried}", file=sys.stderr)
    sys.exit(2)


def _check_non_empty_string(container, key, label, errors):
    """Append an error to *errors* if container[key] is present but not a non-blank string."""
    if key not in container:
        return
    value = container[key]
    if not isinstance(value, str):
        errors.append(f"'{label}' must be a string")
    elif not value.strip():
        errors.append(f"'{label}' must not be empty")


def _validate_scene(scene, errors):
    """Append errors for the nested 'scene' object to *errors*."""
    if not isinstance(scene, dict):
        errors.append("'scene' must be an object")
        return

    for field in _REQUIRED_SCENE_FIELDS:
        if field not in scene:
            errors.append(f"Missing required scene field: {field}")

    _check_non_empty_string(scene, "mood", "scene.mood", errors)

    if "colors" in scene:
        colors = scene["colors"]
        if not isinstance(colors, list):
            errors.append("'scene.colors' must be an array")
        elif not colors:
            errors.append("'scene.colors' must have at least 1 element")
        else:
            for i, color in enumerate(colors):
                if not isinstance(color, str) or not color.strip():
                    errors.append(f"'scene.colors[{i}]' must be a non-empty string")

    _check_non_empty_string(scene, "composition", "scene.composition", errors)

    if "description" in scene:
        description = scene["description"]
        if not isinstance(description, str):
            errors.append("'scene.description' must be a string")
        elif len(description) < _MIN_DESCRIPTION_LENGTH:
            errors.append(
                f"'scene.description' too short ({len(description)} chars, "
                f"min {_MIN_DESCRIPTION_LENGTH})"
            )


def validate_entry_manual(entry, index):
    """Manual validation without jsonschema dependency.

    Args:
        entry: One decoded JSONL record (expected to be a dict).
        index: Line number of the record. Unused here; kept so the signature
            stays compatible with existing callers.

    Returns:
        A list of human-readable error strings; empty when the entry is valid.
    """
    # A JSONL line can decode to any JSON value; the field checks below only
    # make sense for objects and would raise TypeError on e.g. a bare number.
    if not isinstance(entry, dict):
        return ["Entry must be a JSON object"]

    errors = []

    # Required top-level fields
    for field in _REQUIRED_FIELDS:
        if field not in entry:
            errors.append(f"Missing required field: {field}")

    # Type checks
    _check_non_empty_string(entry, "song", "song", errors)
    _check_non_empty_string(entry, "artist", "artist", errors)

    if "beat" in entry:
        beat = entry["beat"]
        # bool is a subclass of int, so JSON true/false must be rejected explicitly.
        if isinstance(beat, bool) or not isinstance(beat, int):
            errors.append("'beat' must be an integer")
        elif beat < 1:
            errors.append("'beat' must be >= 1")

    if "timestamp" in entry:
        timestamp = entry["timestamp"]
        if not (isinstance(timestamp, str) and _TIMESTAMP_RE.fullmatch(timestamp)):
            errors.append(f"'timestamp' must be M:SS or MM:SS format, got: {timestamp}")

    _check_non_empty_string(entry, "lyric_line", "lyric_line", errors)

    # Scene validation
    if "scene" in entry:
        _validate_scene(entry["scene"], errors)

    return errors


def _build_schema_validator(schema):
    """Return a jsonschema validator instance for *schema* (requires jsonschema)."""
    validator_cls = jsonschema.validators.validator_for(schema)
    # Fail loudly on a malformed schema, as jsonschema.validate() would.
    validator_cls.check_schema(schema)
    return validator_cls(schema)


def _format_schema_error(error):
    """Render a jsonschema ValidationError as '<path>: <message>'."""
    location = ".".join(str(part) for part in error.absolute_path)
    return f"{location}: {error.message}" if location else error.message


def validate_file(filepath, schema):
    """Validate all entries in a JSONL file.

    Blank lines are skipped. Each remaining line is parsed as JSON and checked
    with jsonschema when it is installed, otherwise with the manual validator.

    Args:
        filepath: Path of the JSONL file to read (UTF-8).
        schema: Parsed JSON schema; only used on the jsonschema path.

    Returns:
        A ``(total, errors)`` tuple: the number of non-blank lines seen and a
        list of error strings, each prefixed with its line number.
    """
    errors = []
    total = 0
    # Build the validator once per file. jsonschema.validate() raises on the
    # first violation, so collect every violation with iter_errors() instead.
    validator = _build_schema_validator(schema) if HAS_JSONSCHEMA else None

    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            total += 1

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"  Line {line_num}: Invalid JSON — {e}")
                continue

            if validator is not None:
                found = sorted(
                    validator.iter_errors(entry),
                    # Paths mix str keys and int indexes; compare as strings.
                    key=lambda err: [str(part) for part in err.absolute_path],
                )
                entry_errors = [_format_schema_error(err) for err in found]
            else:
                entry_errors = validate_entry_manual(entry, line_num)

            errors.extend(f"  Line {line_num}: {err}" for err in entry_errors)

    return total, errors


def main():
    """Validate every file named on the command line and exit with a status code.

    Files that do not exist are reported as skipped and do not count as
    errors. Exits 0 when no errors were found, 1 when any were, and 2 when no
    file argument was given.
    """
    if len(sys.argv) < 2:
        print("Usage: python3 scripts/validate-scene-data.py file.jsonl [file2.jsonl ...]")
        sys.exit(2)

    schema = load_schema()
    total_entries = 0
    total_errors = 0
    files_checked = 0

    for filepath in sys.argv[1:]:
        if not os.path.exists(filepath):
            print(f"SKIP: {filepath} (not found)")
            continue

        files_checked += 1
        count, errors = validate_file(filepath, schema)
        total_entries += count

        if errors:
            total_errors += len(errors)
            print(f"FAIL: {filepath} — {len(errors)} error(s) in {count} entries:")
            for err in errors[:_MAX_ERRORS_SHOWN]:  # Limit output
                print(err)
            if len(errors) > _MAX_ERRORS_SHOWN:
                print(f"  ... and {len(errors) - _MAX_ERRORS_SHOWN} more errors")
        else:
            print(f"PASS: {filepath} — {count} entries valid")

    print(f"\nSummary: {files_checked} files, {total_entries} entries, {total_errors} errors")

    if total_errors > 0:
        print("VALIDATION FAILED")
        sys.exit(1)
    else:
        print("ALL VALID")
        sys.exit(0)


if __name__ == "__main__":
    main()