diff --git a/.gitea/workflows/validate-training-data.yml b/.gitea/workflows/validate-training-data.yml new file mode 100644 index 00000000..9ba8ff8a --- /dev/null +++ b/.gitea/workflows/validate-training-data.yml @@ -0,0 +1,24 @@ +name: Validate Training Data + +on: + pull_request: + paths: + - 'training/data/**/*.jsonl' + - 'training/data/**/schema.json' + - 'training/data/**/validate.py' + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Validate scene description JSONL + run: | + cd training/data/scene-descriptions + python3 validate.py *.jsonl diff --git a/training/data/scene-descriptions/schema.json b/training/data/scene-descriptions/schema.json new file mode 100644 index 00000000..fa667c1c --- /dev/null +++ b/training/data/scene-descriptions/schema.json @@ -0,0 +1,80 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://timmy-foundation/scene-description/v1", + "title": "Scene Description Entry", + "description": "A single lyrics-to-visual scene description beat from a song.", + "type": "object", + "required": ["song", "artist", "beat", "timestamp", "duration_seconds", "lyric_line", "scene"], + "properties": { + "song": { + "type": "string", + "minLength": 1, + "description": "Song title" + }, + "artist": { + "type": "string", + "minLength": 1, + "description": "Artist name (may be fictional)" + }, + "beat": { + "type": "integer", + "minimum": 1, + "maximum": 20, + "description": "Beat number within the song (1-indexed)" + }, + "timestamp": { + "type": "string", + "pattern": "^[0-9]{1,2}:[0-9]{2}$", + "description": "Timestamp in M:SS or MM:SS format" + }, + "duration_seconds": { + "type": "integer", + "minimum": 1, + "maximum": 600, + "description": "Duration of this beat in seconds" + }, + "lyric_line": { + "type": "string", + "minLength": 1, + "description": "The lyric line for this beat" + 
}, + "scene": { + "type": "object", + "required": ["mood", "colors", "composition", "camera", "description"], + "properties": { + "mood": { + "type": "string", + "minLength": 1, + "description": "Emotional mood of this beat" + }, + "colors": { + "type": "array", + "minItems": 1, + "maxItems": 5, + "items": { + "type": "string", + "minLength": 1 + }, + "description": "Color palette for this scene (1-5 colors)" + }, + "composition": { + "type": "string", + "minLength": 1, + "description": "Camera composition (e.g., wide shot, close-up, low angle)" + }, + "camera": { + "type": "string", + "minLength": 1, + "description": "Camera movement (e.g., slow zoom, steady hold, pan right)" + }, + "description": { + "type": "string", + "minLength": 10, + "description": "Full visual scene description (min 10 chars)" + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false +} diff --git a/training/data/scene-descriptions/validate.py b/training/data/scene-descriptions/validate.py new file mode 100755 index 00000000..f54891f9 --- /dev/null +++ b/training/data/scene-descriptions/validate.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +""" +validate.py — Validate scene description JSONL files against schema. 
#!/usr/bin/env python3
"""
validate.py — Validate scene description JSONL files against schema.

Usage:
    python3 validate.py 'scene-descriptions-r&b-soul.jsonl'
    python3 validate.py *.jsonl
    python3 validate.py --schema schema.json *.jsonl

Exit codes:
    0 = all entries valid
    1 = validation errors found
    2 = file/parse errors
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent.resolve()
DEFAULT_SCHEMA = SCRIPT_DIR / "schema.json"

# Compiled once; mirrors the "timestamp" pattern in schema.json.
TIMESTAMP_RE = re.compile(r"^[0-9]{1,2}:[0-9]{2}$")

# Bounds kept in sync with schema.json ("beat".maximum, "duration_seconds".maximum).
MAX_BEAT = 20
MAX_DURATION_SECONDS = 600


def load_schema(schema_path=None):
    """Load the JSON schema and return a compiled Draft 2020-12 validator.

    Returns None when the third-party ``jsonschema`` package is not
    installed; callers then fall back to validate_entry_manual().
    Exits with code 2 when the schema file itself cannot be found.
    """
    try:
        import jsonschema
    except ImportError:
        # Best-effort fallback: the manual checks below mirror schema.json.
        # Announce it so CI logs show which validation path actually ran.
        print("NOTE: 'jsonschema' not installed; using built-in manual checks",
              file=sys.stderr)
        return None

    path = Path(schema_path) if schema_path else DEFAULT_SCHEMA
    if not path.exists():
        print(f"ERROR: Schema not found: {path}", file=sys.stderr)
        sys.exit(2)

    with open(path, encoding="utf-8") as f:
        schema = json.load(f)

    return jsonschema.Draft202012Validator(schema)


def validate_entry_manual(entry, line_num):
    """Validate one entry without the jsonschema dependency.

    Mirrors schema.json: required fields, types, beat/duration bounds,
    timestamp format, and the nested "scene" object.

    Returns a list of human-readable error strings (empty when valid).
    """
    errors = []

    def err(msg):
        errors.append(f"line {line_num}: {msg}")

    required = ("song", "artist", "beat", "timestamp", "duration_seconds",
                "lyric_line", "scene")
    for field in required:
        if field not in entry:
            err(f"missing required field '{field}'")

    # Non-empty string fields at the top level.
    for field in ("song", "artist", "lyric_line"):
        if field in entry:
            val = entry[field]
            if not isinstance(val, str):
                err(f"'{field}' must be string")
            elif not val:
                err(f"'{field}' must not be empty")

    if "beat" in entry:
        beat = entry["beat"]
        # bool is a subclass of int; JSON Schema "integer" rejects it too.
        if not isinstance(beat, int) or isinstance(beat, bool) or beat < 1:
            err("'beat' must be positive integer")
        elif beat > MAX_BEAT:
            err(f"'beat' is {beat} (max {MAX_BEAT})")

    if "timestamp" in entry:
        ts = entry["timestamp"]
        if not isinstance(ts, str) or not TIMESTAMP_RE.match(ts):
            err(f"'timestamp' must be M:SS or MM:SS format, got '{entry.get('timestamp')}'")

    if "duration_seconds" in entry:
        dur = entry["duration_seconds"]
        if not isinstance(dur, int) or isinstance(dur, bool) or dur < 1:
            err("'duration_seconds' must be positive integer")
        elif dur > MAX_DURATION_SECONDS:
            err(f"'duration_seconds' is {dur} (max {MAX_DURATION_SECONDS})")

    scene = entry.get("scene")
    if scene is not None:
        if not isinstance(scene, dict):
            err("'scene' must be object")
        else:
            errors.extend(_validate_scene_manual(scene, line_num))

    return errors


def _validate_scene_manual(scene, line_num):
    """Validate the nested "scene" object; returns a list of error strings."""
    errors = []

    def err(msg):
        errors.append(f"line {line_num}: {msg}")

    for field in ("mood", "colors", "composition", "camera", "description"):
        if field not in scene:
            err(f"scene missing required field '{field}'")

    for field in ("mood", "composition", "camera"):
        if field in scene:
            val = scene[field]
            if not isinstance(val, str) or not val:
                err(f"scene.{field} must be non-empty string")

    if "colors" in scene:
        colors = scene["colors"]
        if not isinstance(colors, list) or not colors:
            err("scene.colors must be non-empty array")
        else:
            # Per-item checks only when colors really is a list: the original
            # enumerate()d the value unconditionally and crashed on non-lists.
            if len(colors) > 5:
                err(f"scene.colors has {len(colors)} items (max 5)")
            for i, c in enumerate(colors):
                if not isinstance(c, str) or not c:
                    err(f"scene.colors[{i}] must be non-empty string")

    if "description" in scene:
        desc = scene["description"]
        if not isinstance(desc, str):
            err("scene.description must be string")
        elif len(desc) < 10:
            err(f"scene.description too short ({len(desc)} chars, min 10)")

    return errors


def validate_file(filepath, schema_validator=None):
    """Validate a single JSONL file.

    Returns (errors, line_count, valid_count). ``errors`` is ALWAYS a
    list — the previous version returned an int for a missing file,
    which made main() crash on len(). Blank lines are skipped; each
    non-blank line must be one JSON object.
    """
    if not os.path.exists(filepath):
        print(f"ERROR: File not found: {filepath}", file=sys.stderr)
        # "file not found:" prefix is recognized by main() for exit code 2.
        return [f"file not found: {filepath}"], 0, 0

    errors = []
    line_count = 0
    valid_count = 0

    with open(filepath, encoding="utf-8") as f:
        for line_num, raw in enumerate(f, 1):
            line = raw.strip()
            if not line:
                continue
            line_count += 1

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"line {line_num}: JSON parse error: {e}")
                continue

            if schema_validator is not None:
                entry_errors = [f"line {line_num}: {e.message}"
                                for e in schema_validator.iter_errors(entry)]
            else:
                entry_errors = validate_entry_manual(entry, line_num)

            if entry_errors:
                errors.extend(entry_errors)
            else:
                valid_count += 1

    return errors, line_count, valid_count


def main():
    """CLI entry point: validate each file, print a summary, set exit code.

    Exit codes match the module docstring: 0 all valid, 1 validation
    errors, 2 missing files or JSON parse errors.
    """
    parser = argparse.ArgumentParser(description="Validate scene description JSONL files")
    parser.add_argument("files", nargs="+", help="JSONL files to validate")
    parser.add_argument("--schema", default=None, help="Schema file path")
    parser.add_argument("--quiet", action="store_true", help="Only show errors")
    args = parser.parse_args()

    schema_validator = load_schema(args.schema)

    total_errors = 0
    total_lines = 0
    total_valid = 0
    had_io_errors = False  # missing files / parse failures -> exit code 2

    for filepath in args.files:
        errors, line_count, valid_count = validate_file(filepath, schema_validator)
        total_errors += len(errors)
        total_lines += line_count
        total_valid += valid_count
        if any(e.startswith("file not found") or "JSON parse error" in e
               for e in errors):
            had_io_errors = True

        if errors:
            print(f"\n{filepath}: {len(errors)} errors in {line_count} entries")
            for err in errors[:20]:  # cap per-file noise at 20 lines
                print(f"  {err}")
            if len(errors) > 20:
                print(f"  ... and {len(errors) - 20} more")
        elif not args.quiet:
            print(f"{filepath}: OK ({valid_count} entries)")

    print(f"\nTotal: {total_valid}/{total_lines} valid, {total_errors} errors")
    if had_io_errors:
        sys.exit(2)
    sys.exit(1 if total_errors else 0)


if __name__ == "__main__":
    main()