#!/usr/bin/env python3
"""Validate JSONL training data files against the scene description schema.

Only the subset of JSON Schema used by ``training-data/schema.json`` is
implemented (type, required, properties, additionalProperties, minLength,
pattern, minimum, maximum, minItems, items, oneOf), so the script has no
third-party dependencies.

Usage:
    python3 scripts/validate-scene-data.py training-data/scene-descriptions-*.jsonl
    python3 scripts/validate-scene-data.py --schema training-data/schema.json training-data/scene-descriptions-pop.jsonl

Exit codes:
    0 = all files valid
    1 = validation errors found
    2 = bad arguments, unreadable schema, or missing input files
"""

import argparse
import json
import re
import sys
from pathlib import Path

# Python types accepted for each JSON numeric type. ``bool`` is a subclass of
# ``int`` in Python and is rejected separately in ``_check``.
_NUMERIC_TYPES = {"number": (int, float), "integer": int}

# Maximum number of error lines printed per file before truncating.
_MAX_ERRORS_SHOWN = 10


def load_schema(path: str) -> dict:
    """Load and return the JSON schema stored at *path*.

    The file is always decoded as UTF-8 so the result does not depend on the
    locale of the machine running the validator.

    Raises:
        OSError: the file cannot be opened.
        json.JSONDecodeError: the file is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _check(val, spec, loc, path):
    """Check a value against a schema property.

    Args:
        val: the decoded JSON value to check.
        spec: the schema fragment describing the value.
        loc: "file:line" prefix used in every error message.
        path: dotted/indexed path of the value inside the entry.

    Returns:
        A list of error strings (empty when the value is valid).
    """
    errors = []

    # oneOf: exactly one branch must validate (JSON Schema semantics).
    if "oneOf" in spec:
        branches = spec["oneOf"]
        matched = sum(1 for branch in branches if not _check(val, branch, loc, path))
        if matched != 1:
            types = ", ".join(str(branch.get("type", "?")) for branch in branches)
            if matched == 0:
                errors.append(
                    f"{loc}: '{path}' expected one of [{types}], got {type(val).__name__}"
                )
            else:
                errors.append(
                    f"{loc}: '{path}' matches {matched} branches of [{types}], expected exactly one"
                )
        return errors

    kind = spec.get("type")

    if kind == "string":
        if not isinstance(val, str):
            errors.append(f"{loc}: '{path}' expected string, got {type(val).__name__}")
            return errors
        min_len = spec.get("minLength", 0)
        if min_len and not val.strip():
            # Deliberately stricter than JSON Schema: a whitespace-only value
            # is treated as empty, as the previous hand-written validator did.
            errors.append(f"{loc}: '{path}' is empty (min {min_len} chars)")
        elif len(val) < min_len:
            errors.append(
                f"{loc}: '{path}' is too short ({len(val)} chars, min {min_len})"
            )
        elif "pattern" in spec and not re.search(spec["pattern"], val):
            # JSON Schema patterns are unanchored, hence search() not match().
            errors.append(f"{loc}: '{path}'='{val}' doesn't match {spec['pattern']}")

    elif kind in _NUMERIC_TYPES:
        if isinstance(val, bool) or not isinstance(val, _NUMERIC_TYPES[kind]):
            errors.append(f"{loc}: '{path}' expected {kind}, got {type(val).__name__}")
        elif "minimum" in spec and val < spec["minimum"]:
            errors.append(f"{loc}: '{path}'={val} below minimum {spec['minimum']}")
        elif "maximum" in spec and val > spec["maximum"]:
            errors.append(f"{loc}: '{path}'={val} above maximum {spec['maximum']}")

    elif kind == "array":
        if not isinstance(val, list):
            errors.append(f"{loc}: '{path}' expected array, got {type(val).__name__}")
        elif len(val) < spec.get("minItems", 0):
            errors.append(
                f"{loc}: '{path}' has {len(val)} items, need >= {spec['minItems']}"
            )
        else:
            item_spec = spec.get("items", {})
            for j, item in enumerate(val):
                errors.extend(_check(item, item_spec, loc, f"{path}[{j}]"))

    elif kind == "object":
        if not isinstance(val, dict):
            errors.append(f"{loc}: '{path}' expected object, got {type(val).__name__}")
            return errors
        props = spec.get("properties", {})
        for name in spec.get("required", []):
            if name not in val:
                errors.append(f"{loc}: '{path}.{name}' is missing")
        for name, sub_spec in props.items():
            if name in val:
                errors.extend(_check(val[name], sub_spec, loc, f"{path}.{name}"))
        if spec.get("additionalProperties") is False:
            # Sorted so the message is stable between runs.
            extra = sorted(set(val) - set(props))
            if extra:
                errors.append(f"{loc}: '{path}' has unexpected fields: {', '.join(extra)}")

    return errors


def validate_entry(entry, schema, line_num, file_name):
    """Validate one decoded JSONL entry (a dict) against the top-level schema.

    Args:
        entry: the decoded JSON object for one line.
        schema: the top-level schema dict.
        line_num: 1-based line number, used in error messages.
        file_name: file name, used in error messages.

    Returns:
        A list of error strings prefixed with "file_name:line_num".
    """
    errors = []
    loc = f"{file_name}:{line_num}"
    props = schema.get("properties", {})

    for field in schema.get("required", []):
        if field not in entry:
            errors.append(f"{loc}: missing required field '{field}'")

    for field, spec in props.items():
        if field in entry:
            errors.extend(_check(entry[field], spec, loc, field))

    # Extra fields are only an error when the schema forbids them.
    if schema.get("additionalProperties") is False:
        extra = sorted(set(entry) - set(props))
        if extra:
            errors.append(f"{loc}: unexpected fields: {', '.join(extra)}")

    return errors


def validate_file(path, schema):
    """Validate every non-blank line of the JSONL file at *path*.

    Returns:
        A ``(count, errors)`` tuple: the number of non-blank lines seen and
        the list of error strings collected for the file.
    """
    errors = []
    count = 0
    with open(path, encoding="utf-8") as f:
        for n, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            count += 1
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"{path}:{n}: invalid JSON: {e}")
                continue
            if not isinstance(entry, dict):
                errors.append(f"{path}:{n}: not a JSON object")
                continue
            errors.extend(validate_entry(entry, schema, n, path))
    return count, errors


def _find_schema(first_file):
    """Return the first existing default schema location, or None."""
    candidates = [
        Path(first_file).parent / "schema.json",
        Path("training-data/schema.json"),
        # Relative to the repository, so the script works from any directory.
        Path(__file__).resolve().parent.parent / "training-data" / "schema.json",
    ]
    for candidate in candidates:
        if candidate.exists():
            return str(candidate)
    return None


def main(argv=None):
    """Command-line entry point; exits with the codes listed in the module docstring.

    Args:
        argv: argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Validate JSONL training data against the scene description schema."
    )
    parser.add_argument("files", nargs="+", help="JSONL files to validate")
    parser.add_argument(
        "--schema",
        default=None,
        help="path to schema.json (default: next to the first file, then training-data/)",
    )
    args = parser.parse_args(argv)

    schema_path = args.schema or _find_schema(args.files[0])
    if not schema_path or not Path(schema_path).exists():
        print("ERROR: schema not found. Use --schema path", file=sys.stderr)
        sys.exit(2)

    try:
        schema = load_schema(schema_path)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"ERROR: cannot load schema {schema_path}: {exc}", file=sys.stderr)
        sys.exit(2)

    files_checked = total_entries = total_errors = 0
    missing = []

    for fp in args.files:
        if not Path(fp).is_file():
            print(f"SKIP: {fp} (not found)", file=sys.stderr)
            missing.append(fp)
            continue
        files_checked += 1
        n, errs = validate_file(fp, schema)
        total_entries += n
        if errs:
            total_errors += len(errs)
            print(f"\n❌ {fp}: {len(errs)} errors in {n} entries")
            for e in errs[:_MAX_ERRORS_SHOWN]:
                print(f"  {e}")
            if len(errs) > _MAX_ERRORS_SHOWN:
                print(f"  ... +{len(errs) - _MAX_ERRORS_SHOWN} more")
        else:
            print(f"✅ {fp}: {n} entries valid")

    print("\n--- Summary ---")
    print(f"Files: {files_checked}  Entries: {total_entries}  Errors: {total_errors}")

    if total_errors:
        sys.exit(1)
    if missing:
        # Missing input (e.g. an unmatched glob) must not look like success.
        sys.exit(2)
    sys.exit(0)


if __name__ == "__main__":
    main()
Catches missing fields, wrong types, empty values, and unexpected fields.", "type": "object", - "required": ["song", "artist", "beat", "timestamp", "lyric_line", "scene"], + "required": ["song", "beat", "lyric_line", "scene"], "properties": { "song": { "type": "string", @@ -13,35 +13,49 @@ "artist": { "type": "string", "minLength": 1, - "description": "Artist name" + "description": "Artist or group name (optional — missing in some files)" }, - "mood_arc": { + "genre": { "type": "string", - "description": "Overall mood progression of the song (optional)" + "minLength": 1, + "description": "Musical genre" + }, + "bpm": { + "type": "number", + "minimum": 1, + "description": "Beats per minute" }, "beat": { "type": "integer", "minimum": 1, - "description": "Beat number within the song (1-indexed)" + "description": "Beat number within the song" }, "timestamp": { "type": "string", "pattern": "^[0-9]+:[0-5][0-9]$", "description": "Timestamp in M:SS or MM:SS format" }, + "duration_seconds": { + "type": "number", + "minimum": 0, + "description": "Duration in seconds" + }, "duration": { "type": "string", - "description": "Duration of the beat (e.g. '30s', '15s')" + "minLength": 1, + "description": "Duration as string (e.g.
'30s')" }, - "duration_seconds": { - "type": "integer", - "minimum": 1, - "description": "Duration in seconds (integer alternative)" + "mood_arc": { + "oneOf": [ + { "type": "string", "minLength": 1 }, + { "type": "array", "items": { "type": "string" } } + ], + "description": "Optional mood progression arc" }, "lyric_line": { "type": "string", "minLength": 1, - "description": "The lyric line for this beat" + "description": "Lyric line(s) for this beat" }, "scene": { "type": "object", @@ -56,29 +70,31 @@ "type": "array", "items": { "type": "string", "minLength": 1 }, "minItems": 1, - "description": "Color palette for the scene" + "description": "Visual color palette" }, "composition": { "type": "string", "minLength": 1, - "description": "Shot composition (e.g. 'wide shot', 'close-up', 'low angle')" + "description": "Shot composition description" }, "camera": { "type": "string", - "description": "Camera movement (e.g. 'static', 'slow zoom', 'tracking')" + "minLength": 1, + "description": "Camera movement or position" }, "camera_movement": { "type": "string", - "description": "Alternative field name for camera movement" + "minLength": 1, + "description": "Camera movement (alternate field name)" }, "description": { "type": "string", "minLength": 10, - "description": "Full scene description text" + "description": "Full visual scene description" } }, - "additionalProperties": true + "additionalProperties": false } }, - "additionalProperties": true + "additionalProperties": false }