# timmy-config/training/data/scene-descriptions/validate.py

#!/usr/bin/env python3
"""
validate.py — Validate scene description JSONL files against schema.

Usage:
    python3 validate.py 'scene-descriptions-r&b-soul.jsonl'
    python3 validate.py *.jsonl
    python3 validate.py --schema schema.json *.jsonl

Exit codes:
    0 = all entries valid
    1 = validation errors found
    2 = file/parse errors
"""

import json
import sys
import os
from pathlib import Path

# Absolute path of the directory that contains this script.
SCRIPT_DIR = Path(__file__).parent.resolve()
# Schema used when no explicit schema path is given: schema.json next to this script.
DEFAULT_SCHEMA = SCRIPT_DIR / "schema.json"


def load_schema(schema_path=None):
    """Load the JSON schema and wrap it in a Draft 2020-12 validator.

    Args:
        schema_path: Path of the schema file. When omitted or empty,
            DEFAULT_SCHEMA is used.

    Returns:
        A ``jsonschema.Draft202012Validator`` built from the schema, or
        ``None`` when the ``jsonschema`` package cannot be imported.

    Exits:
        Terminates the process with status 2 when the schema file is missing,
        cannot be read, or does not contain valid JSON.
    """
    try:
        import jsonschema
    except ImportError:
        # Fallback: manual validation
        return None

    path = Path(schema_path) if schema_path else DEFAULT_SCHEMA
    if not path.exists():
        print(f"ERROR: Schema not found: {path}", file=sys.stderr)
        sys.exit(2)

    try:
        # Decode explicitly as UTF-8 so the result does not depend on the locale.
        with open(path, encoding="utf-8") as f:
            schema = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; report them as a file/parse error instead of a traceback.
        print(f"ERROR: Cannot load schema {path}: {exc}", file=sys.stderr)
        sys.exit(2)

    return jsonschema.Draft202012Validator(schema)


def _is_positive_int(value):
    """Return True for an int >= 1; bool is rejected although it subclasses int."""
    return isinstance(value, int) and not isinstance(value, bool) and value >= 1


def validate_entry_manual(entry, line_num):
    """Validate one parsed JSONL entry without the jsonschema dependency.

    Args:
        entry: The value decoded from one JSONL line. Anything other than a
            dict is reported as a single error instead of raising.
        line_num: 1-based line number used as the prefix of every message.

    Returns:
        A list of human-readable error strings; empty when the entry is valid.
    """
    import re

    prefix = f"line {line_num}"

    # A JSONL line may decode to any JSON value; only objects carry fields.
    if not isinstance(entry, dict):
        return [f"{prefix}: entry must be a JSON object, got {type(entry).__name__}"]

    errors = []

    # Required top-level fields
    required = ["song", "artist", "beat", "timestamp", "duration_seconds", "lyric_line", "scene"]
    for field in required:
        if field not in entry:
            errors.append(f"{prefix}: missing required field '{field}'")

    def check_text(field):
        # Top-level text fields report a wrong type and an empty value separately.
        if field not in entry:
            return
        value = entry[field]
        if not isinstance(value, str):
            errors.append(f"{prefix}: '{field}' must be string")
        elif not value:
            errors.append(f"{prefix}: '{field}' must not be empty")

    # Type checks (kept in field order so messages come out in a stable order)
    check_text("song")
    check_text("artist")

    if "beat" in entry and not _is_positive_int(entry["beat"]):
        errors.append(f"{prefix}: 'beat' must be positive integer")

    if "timestamp" in entry:
        timestamp = entry["timestamp"]
        # fullmatch rejects a trailing newline, which '$' with re.match lets through.
        if not isinstance(timestamp, str) or not re.fullmatch(r"[0-9]{1,2}:[0-9]{2}", timestamp):
            errors.append(f"{prefix}: 'timestamp' must be M:SS or MM:SS format, got '{timestamp}'")

    if "duration_seconds" in entry and not _is_positive_int(entry["duration_seconds"]):
        errors.append(f"{prefix}: 'duration_seconds' must be positive integer")

    check_text("lyric_line")

    # Scene checks
    scene = entry.get("scene")
    if scene is None:
        return errors
    if not isinstance(scene, dict):
        errors.append(f"{prefix}: 'scene' must be object")
        return errors

    scene_required = ["mood", "colors", "composition", "camera", "description"]
    for field in scene_required:
        if field not in scene:
            errors.append(f"{prefix}: scene missing required field '{field}'")

    def check_scene_text(field):
        # Scene text fields use one combined "non-empty string" message.
        if field in scene and not (isinstance(scene[field], str) and scene[field]):
            errors.append(f"{prefix}: scene.{field} must be non-empty string")

    check_scene_text("mood")

    if "colors" in scene:
        colors = scene["colors"]
        if not isinstance(colors, list) or not colors:
            errors.append(f"{prefix}: scene.colors must be non-empty array")
        else:
            if len(colors) > 5:
                errors.append(f"{prefix}: scene.colors has {len(colors)} items (max 5)")
            # Items are inspected only for a real list; iterating a scalar
            # such as 5 would raise TypeError.
            for i, color in enumerate(colors):
                if not isinstance(color, str) or not color:
                    errors.append(f"{prefix}: scene.colors[{i}] must be non-empty string")

    check_scene_text("composition")
    check_scene_text("camera")

    if "description" in scene:
        description = scene["description"]
        if not isinstance(description, str):
            errors.append(f"{prefix}: scene.description must be string")
        elif len(description) < 10:
            errors.append(f"{prefix}: scene.description too short ({len(description)} chars, min 10)")

    return errors


def validate_file(filepath, schema_validator=None):
    """Validate every non-blank line of a single JSONL file.

    Args:
        filepath: Path of the JSONL file to read (decoded as UTF-8).
        schema_validator: Object exposing ``iter_errors(entry)`` whose yielded
            errors have a ``message`` attribute. When falsy, each entry is
            checked with validate_entry_manual instead.

    Returns:
        A tuple ``(errors, line_count, valid_count)``: ``errors`` is always a
        list of message strings, ``line_count`` is the number of non-blank
        lines read and ``valid_count`` the number that produced no errors.
        A file that cannot be opened yields one error and zero counts.
    """
    errors = []
    line_count = 0
    valid_count = 0

    try:
        f = open(filepath, encoding="utf-8")
    except FileNotFoundError:
        print(f"ERROR: File not found: {filepath}", file=sys.stderr)
        # Always return a list so callers can safely take len(errors).
        return [f"file not found: {filepath}"], 0, 0
    except OSError as exc:
        # Directories, permission problems, etc.
        print(f"ERROR: Cannot read {filepath}: {exc}", file=sys.stderr)
        return [f"cannot read file: {exc}"], 0, 0

    with f:
        try:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                line_count += 1

                try:
                    entry = json.loads(line)
                except json.JSONDecodeError as e:
                    errors.append(f"line {line_num}: JSON parse error: {e}")
                    continue

                if schema_validator:
                    # Use jsonschema
                    entry_errors = [
                        f"line {line_num}: {err.message}"
                        for err in schema_validator.iter_errors(entry)
                    ]
                else:
                    # Manual validation
                    entry_errors = validate_entry_manual(entry, line_num)

                if entry_errors:
                    errors.extend(entry_errors)
                else:
                    valid_count += 1
        except UnicodeDecodeError as exc:
            # Decoding happens lazily while iterating, so bad bytes surface here.
            errors.append(f"file is not valid UTF-8: {exc}")

    return errors, line_count, valid_count


def main(argv=None):
    """Command-line entry point: validate each file and print a summary.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``main()`` callers are unaffected.

    Exits:
        0 when every entry is valid, 1 when validation errors were found,
        2 when at least one input path is missing or is a directory.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Validate scene description JSONL files")
    parser.add_argument("files", nargs="+", help="JSONL files to validate")
    parser.add_argument("--schema", default=None, help="Schema file path")
    parser.add_argument("--quiet", action="store_true", help="Only show errors")
    args = parser.parse_args(argv)

    schema_validator = load_schema(args.schema)

    total_errors = 0
    total_lines = 0
    total_valid = 0
    unreadable_files = 0

    for filepath in args.files:
        # Reject unusable paths up front so the loop below can always rely on
        # validate_file returning a list of errors.
        if os.path.isdir(filepath):
            print(f"ERROR: Is a directory: {filepath}", file=sys.stderr)
            unreadable_files += 1
            continue
        if not os.path.exists(filepath):
            print(f"ERROR: File not found: {filepath}", file=sys.stderr)
            unreadable_files += 1
            continue

        errors, line_count, valid_count = validate_file(filepath, schema_validator)
        total_errors += len(errors)
        total_lines += line_count
        total_valid += valid_count

        if errors:
            print(f"\n{filepath}: {len(errors)} errors in {line_count} entries")
            for err in errors[:20]:  # Show first 20 errors
                print(f"  {err}")
            if len(errors) > 20:
                print(f"  ... and {len(errors) - 20} more")
        elif not args.quiet:
            print(f"{filepath}: OK ({valid_count} entries)")

    # Summary
    print(f"\nTotal: {total_valid}/{total_lines} valid, {total_errors} errors")

    # File-level failures take precedence over per-entry validation errors.
    if unreadable_files:
        sys.exit(2)
    sys.exit(0 if total_errors == 0 else 1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()