Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 26s
Smoke Test / smoke (pull_request) Failing after 18s
Validate Config / YAML Lint (pull_request) Failing after 15s
Validate Config / JSON Validate (pull_request) Successful in 17s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m25s
Validate Config / Shell Script Lint (pull_request) Failing after 1m4s
Validate Config / Cron Syntax Check (pull_request) Successful in 11s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 13s
Validate Config / Playbook Schema Validation (pull_request) Successful in 20s
Validate Training Data / validate (pull_request) Successful in 22s
PR Checklist / pr-checklist (pull_request) Failing after 5m17s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
- schema.json: JSON Schema for scene description entries (song, artist, beat, timestamp, duration_seconds, lyric_line, scene with mood/colors/composition/camera/description)
- validate.py: CLI validator for JSONL files (jsonschema is optional; falls back to manual checks when it is not installed). Validates all fields, types, ranges, and empty values.
- .gitea/workflows/validate-training-data.yml: CI check on PR

Tested: 100/100 entries in scene-descriptions-r&b-soul.jsonl pass.

Closes #647
199 lines
7.3 KiB
Python
Executable File
199 lines
7.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
validate.py — Validate scene description JSONL files against schema.
|
|
|
|
Usage:
|
|
python3 validate.py 'scene-descriptions-r&b-soul.jsonl'   # quote: '&' is a shell operator
|
|
python3 validate.py *.jsonl
|
|
python3 validate.py --schema schema.json *.jsonl
|
|
|
|
Exit codes:
|
|
0 = all entries valid
|
|
1 = validation errors found
|
|
2 = file/parse errors
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Absolute directory that contains this script.
SCRIPT_DIR = Path(__file__).parent.resolve()
# Schema used when no --schema option is given: schema.json beside the script.
DEFAULT_SCHEMA = SCRIPT_DIR.joinpath("schema.json")
|
|
|
|
|
|
def load_schema(schema_path=None):
    """Build a jsonschema validator for the scene-description schema.

    Args:
        schema_path: Optional path to a JSON Schema file. When omitted,
            ``DEFAULT_SCHEMA`` is used.

    Returns:
        A ``jsonschema.Draft202012Validator`` for the loaded schema, or
        ``None`` when the optional ``jsonschema`` package is not installed
        (the caller is then expected to use manual validation).

    Exits:
        Calls ``sys.exit(2)`` when the schema file is missing, unreadable,
        or does not contain valid JSON.
    """
    try:
        import jsonschema
    except ImportError:
        # Fallback: manual validation. Tell the user when an explicitly
        # requested schema cannot be honoured instead of ignoring it silently.
        if schema_path:
            print(
                f"WARNING: jsonschema is not installed; ignoring schema "
                f"{schema_path} and using built-in checks",
                file=sys.stderr,
            )
        return None

    path = Path(schema_path) if schema_path else DEFAULT_SCHEMA
    if not path.exists():
        print(f"ERROR: Schema not found: {path}", file=sys.stderr)
        sys.exit(2)

    try:
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        with open(path, encoding="utf-8") as f:
            schema = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"ERROR: Cannot load schema {path}: {exc}", file=sys.stderr)
        sys.exit(2)

    return jsonschema.Draft202012Validator(schema)
|
|
|
|
|
|
def _is_positive_int(value):
    """Return True for integers >= 1; bool is rejected although it subclasses int."""
    return isinstance(value, int) and not isinstance(value, bool) and value >= 1


def _validate_scene_manual(scene, line_num):
    """Return the error messages for the 'scene' value of a single entry."""
    if not isinstance(scene, dict):
        # Also catches an explicit JSON null, which must not count as valid.
        return [f"line {line_num}: 'scene' must be object"]

    errors = [
        f"line {line_num}: scene missing required field '{field}'"
        for field in ("mood", "colors", "composition", "camera", "description")
        if field not in scene
    ]

    def require_text(field):
        # Missing fields were already reported above; only check present ones.
        if field in scene and not (isinstance(scene[field], str) and scene[field]):
            errors.append(f"line {line_num}: scene.{field} must be non-empty string")

    require_text("mood")

    if "colors" in scene:
        colors = scene["colors"]
        if not isinstance(colors, list) or not colors:
            errors.append(f"line {line_num}: scene.colors must be non-empty array")
        else:
            if len(colors) > 5:
                errors.append(
                    f"line {line_num}: scene.colors has {len(colors)} items (max 5)"
                )
            # Items are only inspected for a real list, so a scalar or string
            # value can neither crash nor be walked character by character.
            for index, color in enumerate(colors):
                if not isinstance(color, str) or not color:
                    errors.append(
                        f"line {line_num}: scene.colors[{index}] must be non-empty string"
                    )

    require_text("composition")
    require_text("camera")

    if "description" in scene:
        description = scene["description"]
        if not isinstance(description, str):
            errors.append(f"line {line_num}: scene.description must be string")
        elif len(description) < 10:
            errors.append(
                f"line {line_num}: scene.description too short "
                f"({len(description)} chars, min 10)"
            )

    return errors


def validate_entry_manual(entry, line_num):
    """Validate one decoded JSONL entry without the jsonschema dependency.

    Args:
        entry: The value decoded from one JSONL line.
        line_num: 1-based line number, used as a prefix in every message.

    Returns:
        A list of human-readable error strings; empty when the entry is valid.
        The checks cover the presence of all required fields, non-empty string
        fields, positive-integer ``beat`` and ``duration_seconds``, an
        ``M:SS``/``MM:SS`` ``timestamp`` and the nested ``scene`` object.
    """
    import re  # imported here so the function is self-contained

    if not isinstance(entry, dict):
        # Membership tests and .get() below are only meaningful on an object.
        return [
            f"line {line_num}: entry must be a JSON object, "
            f"got {type(entry).__name__}"
        ]

    # Required top-level fields
    errors = [
        f"line {line_num}: missing required field '{field}'"
        for field in (
            "song",
            "artist",
            "beat",
            "timestamp",
            "duration_seconds",
            "lyric_line",
            "scene",
        )
        if field not in entry
    ]

    def require_text(field):
        if field not in entry:
            return
        value = entry[field]
        if not isinstance(value, str):
            errors.append(f"line {line_num}: '{field}' must be string")
        elif not value:
            errors.append(f"line {line_num}: '{field}' must not be empty")

    # Type checks, reported in a fixed field order.
    require_text("song")
    require_text("artist")

    if "beat" in entry and not _is_positive_int(entry["beat"]):
        errors.append(f"line {line_num}: 'beat' must be positive integer")

    if "timestamp" in entry:
        timestamp = entry["timestamp"]
        if not isinstance(timestamp, str) or not re.match(
            r'^[0-9]{1,2}:[0-9]{2}$', timestamp
        ):
            errors.append(
                f"line {line_num}: 'timestamp' must be M:SS or MM:SS format, "
                f"got '{timestamp}'"
            )

    if "duration_seconds" in entry and not _is_positive_int(entry["duration_seconds"]):
        errors.append(f"line {line_num}: 'duration_seconds' must be positive integer")

    require_text("lyric_line")

    # Scene checks (a missing 'scene' key was already reported above).
    if "scene" in entry:
        errors.extend(_validate_scene_manual(entry["scene"], line_num))

    return errors
|
|
|
|
|
|
def validate_file(filepath, schema_validator=None):
    """Validate every non-blank line of a single JSONL file.

    Args:
        filepath: Path of the JSONL file to read (decoded as UTF-8).
        schema_validator: Optional object exposing ``iter_errors(entry)``
            whose results carry a ``message`` attribute (for example a
            jsonschema validator). When ``None``, ``validate_entry_manual``
            is used instead.

    Returns:
        A tuple ``(errors, line_count, valid_count)``: the list of error
        strings, the number of non-blank lines seen, and the number of
        entries that produced no error. A file that cannot be opened yields
        a single error message and zero counts, so the first element is
        always a list.
    """
    try:
        handle = open(filepath, encoding="utf-8")
    except OSError as exc:
        # Covers missing files, directories and permission problems alike.
        return [f"cannot open file: {exc}"], 0, 0

    errors = []
    line_count = 0
    valid_count = 0

    with handle:
        for line_num, raw_line in enumerate(handle, 1):
            line = raw_line.strip()
            if not line:
                # Blank lines are skipped but still advance the line number.
                continue
            line_count += 1

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as exc:
                errors.append(f"line {line_num}: JSON parse error: {exc}")
                continue

            if schema_validator is not None:
                # Use jsonschema
                entry_errors = [
                    f"line {line_num}: {err.message}"
                    for err in schema_validator.iter_errors(entry)
                ]
            else:
                # Manual validation
                entry_errors = validate_entry_manual(entry, line_num)

            if entry_errors:
                errors.extend(entry_errors)
            else:
                valid_count += 1

    return errors, line_count, valid_count
|
|
|
|
|
|
def main():
    """Command-line entry point: validate each given JSONL file and exit.

    Parses the command line, loads the schema validator once, validates
    every file, prints a per-file report (at most 20 errors each) and a
    final summary.

    Exit status:
        0 when every entry is valid, 1 when validation errors were found,
        2 when at least one input file does not exist.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Validate scene description JSONL files")
    parser.add_argument("files", nargs="+", help="JSONL files to validate")
    parser.add_argument("--schema", default=None, help="Schema file path")
    parser.add_argument("--quiet", action="store_true", help="Only show errors")
    args = parser.parse_args()

    schema_validator = load_schema(args.schema)

    total_errors = 0
    total_lines = 0
    total_valid = 0
    missing_files = 0
    max_shown = 20  # cap per-file output so one bad file cannot flood the log

    for filepath in args.files:
        # Checked here so a missing path is reported and skipped rather than
        # depending on what validate_file returns for it.
        if not os.path.exists(filepath):
            print(f"ERROR: File not found: {filepath}", file=sys.stderr)
            missing_files += 1
            continue

        errors, line_count, valid_count = validate_file(filepath, schema_validator)
        total_errors += len(errors)
        total_lines += line_count
        total_valid += valid_count

        if errors:
            print(f"\n{filepath}: {len(errors)} errors in {line_count} entries")
            for err in errors[:max_shown]:
                print(f"  {err}")
            if len(errors) > max_shown:
                print(f"  ... and {len(errors) - max_shown} more")
        elif not args.quiet:
            print(f"{filepath}: OK ({valid_count} entries)")

    # Summary
    print(f"\nTotal: {total_valid}/{total_lines} valid, {total_errors} errors")

    if missing_files:
        sys.exit(2)
    sys.exit(0 if total_errors == 0 else 1)
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|