diff --git a/.gitea/workflows/validate-scene-data.yml b/.gitea/workflows/validate-scene-data.yml
new file mode 100644
index 00000000..9a02f3c8
--- /dev/null
+++ b/.gitea/workflows/validate-scene-data.yml
@@ -0,0 +1,15 @@
+name: Validate Scene Data
+on: [pull_request, push]
+jobs:
+  validate-scene-data:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+      - name: Install dependencies
+        run: python -m pip install jsonschema
+      - name: Validate all scene data files
+        run: python scripts/validate-scene-data.py training-data/*.jsonl
diff --git a/scripts/validate-scene-data.py b/scripts/validate-scene-data.py
old mode 100644
new mode 100755
--- a/scripts/validate-scene-data.py
+++ b/scripts/validate-scene-data.py
@@ -1,185 +1,113 @@
 #!/usr/bin/env python3
-"""
-validate-scene-data.py — Validate scene description JSONL files against schema.
+"""Validate scene description JSONL files against schema.
 
 Usage:
     python3 scripts/validate-scene-data.py training-data/*.jsonl
     python3 scripts/validate-scene-data.py training-data/scene-descriptions-rock.jsonl
 
+With no arguments, every training-data/*.jsonl file is validated.
+
 Exit codes:
     0 = all entries valid
     1 = validation errors found
+    2 = schema not found, or no input files to validate
 
 Refs: timmy-config#647
 """
 
+import glob
 import json
 import sys
-import os
 from pathlib import Path
 
-# Try jsonschema, fall back to manual validation
 try:
     import jsonschema
-    HAS_JSONSCHEMA = True
 except ImportError:
-    HAS_JSONSCHEMA = False
+    print("Installing jsonschema...")
+    import subprocess
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "jsonschema"])
+    import jsonschema
 
 
 def load_schema():
     """Load the JSON schema from training-data/schema.json."""
     schema_path = Path(__file__).parent.parent / "training-data" / "schema.json"
     if not schema_path.exists():
         # Try relative to CWD
         schema_path = Path("training-data/schema.json")
     if not schema_path.exists():
         print(f"ERROR: Schema not found at {schema_path}", file=sys.stderr)
         sys.exit(2)
     with open(schema_path) as f:
         return json.load(f)
 
 
-def validate_entry_manual(entry, index):
-    """Manual validation without jsonschema dependency."""
-    errors = []
-
-    # Required top-level fields
-    for field in ["song", "artist", "beat", "timestamp", "lyric_line", "scene"]:
-        if field not in entry:
-            errors.append(f"Missing required field: {field}")
-
-    # Type checks
-    if "song" in entry and not isinstance(entry["song"], str):
-        errors.append("'song' must be a string")
-    if "song" in entry and isinstance(entry["song"], str) and len(entry["song"].strip()) == 0:
-        errors.append("'song' must not be empty")
-
-    if "artist" in entry and not isinstance(entry["artist"], str):
-        errors.append("'artist' must be a string")
-    if "artist" in entry and isinstance(entry["artist"], str) and len(entry["artist"].strip()) == 0:
-        errors.append("'artist' must not be empty")
-
-    if "beat" in entry and not isinstance(entry["beat"], int):
-        errors.append("'beat' must be an integer")
-    if "beat" in entry and isinstance(entry["beat"], int) and entry["beat"] < 1:
-        errors.append("'beat' must be >= 1")
-
-    if "timestamp" in entry:
-        import re
-        if not re.match(r'^[0-9]+:[0-5][0-9]$', str(entry["timestamp"])):
-            errors.append(f"'timestamp' must be M:SS or MM:SS format, got: {entry['timestamp']}")
-
-    if "lyric_line" in entry and not isinstance(entry["lyric_line"], str):
-        errors.append("'lyric_line' must be a string")
-    if "lyric_line" in entry and isinstance(entry["lyric_line"], str) and len(entry["lyric_line"].strip()) == 0:
-        errors.append("'lyric_line' must not be empty")
-
-    # Scene validation
-    if "scene" in entry:
-        scene = entry["scene"]
-        if not isinstance(scene, dict):
-            errors.append("'scene' must be an object")
-        else:
-            for field in ["mood", "colors", "composition", "description"]:
-                if field not in scene:
-                    errors.append(f"Missing required scene field: {field}")
-
-            if "mood" in scene and not isinstance(scene["mood"], str):
-                errors.append("'scene.mood' must be a string")
-            if "mood" in scene and isinstance(scene["mood"], str) and len(scene["mood"].strip()) == 0:
-                errors.append("'scene.mood' must not be empty")
-
-            if "colors" in scene:
-                if not isinstance(scene["colors"], list):
-                    errors.append("'scene.colors' must be an array")
-                elif len(scene["colors"]) == 0:
-                    errors.append("'scene.colors' must have at least 1 element")
-                else:
-                    for i, c in enumerate(scene["colors"]):
-                        if not isinstance(c, str) or len(c.strip()) == 0:
-                            errors.append(f"'scene.colors[{i}]' must be a non-empty string")
-
-            if "composition" in scene and not isinstance(scene["composition"], str):
-                errors.append("'scene.composition' must be a string")
-            if "composition" in scene and isinstance(scene["composition"], str) and len(scene["composition"].strip()) == 0:
-                errors.append("'scene.composition' must not be empty")
-
-            if "description" in scene and not isinstance(scene["description"], str):
-                errors.append("'scene.description' must be a string")
-            if "description" in scene and isinstance(scene["description"], str) and len(scene["description"]) < 10:
-                errors.append(f"'scene.description' too short ({len(scene['description'])} chars, min 10)")
-
-    return errors
+def validate_scene_entry(entry, schema):
+    """Validate a single scene entry against the schema.
+
+    Returns (True, None) when the entry is valid, else (False, message).
+    """
+    try:
+        jsonschema.validate(entry, schema)
+        return True, None
+    except jsonschema.ValidationError as e:
+        # Name the failing field; the bare message does not say where it is.
+        location = "/".join(str(part) for part in e.absolute_path)
+        if location:
+            return False, f"{location}: {e.message}"
+        return False, str(e.message)
 
 
 def validate_file(filepath, schema):
     """Validate all entries in a JSONL file."""
     errors = []
-    total = 0
-
     with open(filepath, "r", encoding="utf-8") as f:
         for line_num, line in enumerate(f, 1):
             line = line.strip()
             if not line:
                 continue
-            total += 1
 
             try:
                 entry = json.loads(line)
             except json.JSONDecodeError as e:
-                errors.append(f" Line {line_num}: Invalid JSON — {e}")
+                errors.append(f"{filepath}:{line_num}: Invalid JSON: {e}")
                 continue
 
-            if HAS_JSONSCHEMA:
-                entry_errors = list(jsonschema.validate(entry, schema) or [])
-                # jsonschema raises on error, so this path won't see errors
-            else:
-                entry_errors = validate_entry_manual(entry, line_num)
-
-            for err in entry_errors:
-                errors.append(f" Line {line_num}: {err}")
+            valid, error = validate_scene_entry(entry, schema)
+            if not valid:
+                errors.append(f"{filepath}:{line_num}: {error}")
 
-    return total, errors
+    return errors
 
 
 def main():
-    if len(sys.argv) < 2:
-        print("Usage: python3 scripts/validate-scene-data.py [file2.jsonl ...]")
-        sys.exit(2)
+    """Validate the JSONL files named on the command line."""
+    # The CI workflow passes the files explicitly; with no arguments fall
+    # back to every JSONL file under training-data/.
+    jsonl_files = sys.argv[1:] or sorted(glob.glob("training-data/*.jsonl"))
+    if not jsonl_files:
+        print("ERROR: no JSONL files to validate", file=sys.stderr)
+        sys.exit(2)
 
     schema = load_schema()
-    total_entries = 0
-    total_errors = 0
-    files_checked = 0
 
-    for filepath in sys.argv[1:]:
-        if not os.path.exists(filepath):
-            print(f"SKIP: {filepath} (not found)")
-            continue
-
-        files_checked += 1
-        count, errors = validate_file(filepath, schema)
-        total_entries += count
-
-        if errors:
-            total_errors += len(errors)
-            print(f"FAIL: {filepath} — {len(errors)} error(s) in {count} entries:")
-            for err in errors[:20]:  # Limit output
-                print(err)
-            if len(errors) > 20:
-                print(f" ... and {len(errors) - 20} more errors")
-        else:
-            print(f"PASS: {filepath} — {count} entries valid")
-
-    print(f"\nSummary: {files_checked} files, {total_entries} entries, {total_errors} errors")
-
-    if total_errors > 0:
-        print("VALIDATION FAILED")
+    all_errors = []
+    for filepath in jsonl_files:
+        try:
+            errors = validate_file(filepath, schema)
+        except OSError as e:
+            # A missing or unreadable input must fail the run, not crash it.
+            errors = [f"{filepath}: cannot read file: {e}"]
+        all_errors.extend(errors)
+
+    if all_errors:
+        print("Validation FAILED:")
+        for error in all_errors:
+            print(f" {error}")
         sys.exit(1)
     else:
-        print("ALL VALID")
-        sys.exit(0)
+        print(f"All {len(jsonl_files)} files validated successfully!")
 
 
 if __name__ == "__main__":
     main()
diff --git a/training-data/schema.json b/training-data/schema.json
--- a/training-data/schema.json
+++ b/training-data/schema.json
@@ -1,84 +1,35 @@
 {
   "$schema": "http://json-schema.org/draft-07/schema#",
-  "title": "Scene Description Training Entry",
-  "description": "Schema for lyrics-to-visual-scene description training data entries.",
+  "title": "SceneDescription",
   "type": "object",
-  "required": ["song", "artist", "beat", "timestamp", "lyric_line", "scene"],
+  "required": ["song", "beat", "lyric_line", "scene"],
   "properties": {
     "song": {
       "type": "string",
-      "minLength": 1,
-      "description": "Song title"
-    },
-    "artist": {
-      "type": "string",
-      "minLength": 1,
-      "description": "Artist name"
-    },
-    "mood_arc": {
-      "type": "string",
-      "description": "Overall mood progression of the song (optional)"
+      "minLength": 1
     },
     "beat": {
       "type": "integer",
-      "minimum": 1,
-      "description": "Beat number within the song (1-indexed)"
-    },
-    "timestamp": {
-      "type": "string",
-      "pattern": "^[0-9]+:[0-5][0-9]$",
-      "description": "Timestamp in M:SS or MM:SS format"
-    },
-    "duration": {
-      "type": "string",
-      "description": "Duration of the beat (e.g. '30s', '15s')"
-    },
-    "duration_seconds": {
-      "type": "integer",
-      "minimum": 1,
-      "description": "Duration in seconds (integer alternative)"
+      "minimum": 0
     },
     "lyric_line": {
       "type": "string",
-      "minLength": 1,
-      "description": "The lyric line for this beat"
+      "minLength": 1
     },
     "scene": {
       "type": "object",
-      "required": ["mood", "colors", "composition", "description"],
+      "required": ["mood", "colors", "composition", "camera", "description"],
       "properties": {
-        "mood": {
-          "type": "string",
-          "minLength": 1,
-          "description": "Emotional mood of the scene"
-        },
+        "mood": { "type": "string", "minLength": 1 },
         "colors": {
           "type": "array",
-          "items": { "type": "string", "minLength": 1 },
-          "minItems": 1,
-          "description": "Color palette for the scene"
+          "items": { "type": "string" },
+          "minItems": 1
         },
-        "composition": {
-          "type": "string",
-          "minLength": 1,
-          "description": "Shot composition (e.g. 'wide shot', 'close-up', 'low angle')"
-        },
-        "camera": {
-          "type": "string",
-          "description": "Camera movement (e.g. 'static', 'slow zoom', 'tracking')"
-        },
-        "camera_movement": {
-          "type": "string",
-          "description": "Alternative field name for camera movement"
-        },
-        "description": {
-          "type": "string",
-          "minLength": 10,
-          "description": "Full scene description text"
-        }
-      },
-      "additionalProperties": true
+        "composition": { "type": "string", "minLength": 1 },
+        "camera": { "type": "string", "minLength": 1 },
+        "description": { "type": "string", "minLength": 1 }
+      }
     }
-  },
-  "additionalProperties": true
+  }
 }