diff --git a/.gitea/workflows/validate-training-data.yml b/.gitea/workflows/validate-training-data.yml new file mode 100644 index 00000000..9ba8ff8a --- /dev/null +++ b/.gitea/workflows/validate-training-data.yml @@ -0,0 +1,24 @@ +name: Validate Training Data + +on: + pull_request: + paths: + - 'training/data/**/*.jsonl' + - 'training/data/**/schema.json' + - 'training/data/**/validate.py' + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Validate scene description JSONL + run: | + cd training/data/scene-descriptions + python3 validate.py *.jsonl diff --git a/training/data/scene-descriptions/schema.json b/training/data/scene-descriptions/schema.json new file mode 100644 index 00000000..fa667c1c --- /dev/null +++ b/training/data/scene-descriptions/schema.json @@ -0,0 +1,80 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://timmy-foundation/scene-description/v1", + "title": "Scene Description Entry", + "description": "A single lyrics-to-visual scene description beat from a song.", + "type": "object", + "required": ["song", "artist", "beat", "timestamp", "duration_seconds", "lyric_line", "scene"], + "properties": { + "song": { + "type": "string", + "minLength": 1, + "description": "Song title" + }, + "artist": { + "type": "string", + "minLength": 1, + "description": "Artist name (may be fictional)" + }, + "beat": { + "type": "integer", + "minimum": 1, + "maximum": 20, + "description": "Beat number within the song (1-indexed)" + }, + "timestamp": { + "type": "string", + "pattern": "^[0-9]{1,2}:[0-9]{2}$", + "description": "Timestamp in M:SS or MM:SS format" + }, + "duration_seconds": { + "type": "integer", + "minimum": 1, + "maximum": 600, + "description": "Duration of this beat in seconds" + }, + "lyric_line": { + "type": "string", + "minLength": 1, + "description": "The lyric line for this beat" + 
}, + "scene": { + "type": "object", + "required": ["mood", "colors", "composition", "camera", "description"], + "properties": { + "mood": { + "type": "string", + "minLength": 1, + "description": "Emotional mood of this beat" + }, + "colors": { + "type": "array", + "minItems": 1, + "maxItems": 5, + "items": { + "type": "string", + "minLength": 1 + }, + "description": "Color palette for this scene (1-5 colors)" + }, + "composition": { + "type": "string", + "minLength": 1, + "description": "Camera composition (e.g., wide shot, close-up, low angle)" + }, + "camera": { + "type": "string", + "minLength": 1, + "description": "Camera movement (e.g., slow zoom, steady hold, pan right)" + }, + "description": { + "type": "string", + "minLength": 10, + "description": "Full visual scene description (min 10 chars)" + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false +} diff --git a/training/data/scene-descriptions/validate.py b/training/data/scene-descriptions/validate.py new file mode 100755 index 00000000..f54891f9 --- /dev/null +++ b/training/data/scene-descriptions/validate.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +""" +validate.py — Validate scene description JSONL files against schema. 
#!/usr/bin/env python3
"""
validate.py — Validate scene description JSONL files against schema.

Usage:
    python3 validate.py 'scene-descriptions-r&b-soul.jsonl'
    python3 validate.py *.jsonl
    python3 validate.py --schema schema.json *.jsonl

Exit codes:
    0 = all entries valid
    1 = validation errors found
    2 = file/parse errors
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent.resolve()
DEFAULT_SCHEMA = SCRIPT_DIR / "schema.json"

# Compiled once; mirrors the "timestamp" pattern in schema.json.
TIMESTAMP_RE = re.compile(r"^[0-9]{1,2}:[0-9]{2}$")

# Bounds kept in sync with schema.json ("beat".maximum, "duration_seconds".maximum).
MAX_BEAT = 20
MAX_DURATION_SECONDS = 600


def load_schema(schema_path=None):
    """Load the JSON schema and return a compiled Draft 2020-12 validator.

    Returns None when the third-party ``jsonschema`` package is not
    installed; callers then fall back to validate_entry_manual().
    Exits with code 2 when the schema file itself cannot be found.
    """
    try:
        import jsonschema
    except ImportError:
        # Best-effort fallback: the manual checks below mirror schema.json.
        # Announce it so CI logs show which validation path actually ran.
        print("NOTE: 'jsonschema' not installed; using built-in manual checks",
              file=sys.stderr)
        return None

    path = Path(schema_path) if schema_path else DEFAULT_SCHEMA
    if not path.exists():
        print(f"ERROR: Schema not found: {path}", file=sys.stderr)
        sys.exit(2)

    with open(path, encoding="utf-8") as f:
        schema = json.load(f)

    return jsonschema.Draft202012Validator(schema)


def validate_entry_manual(entry, line_num):
    """Validate one entry without the jsonschema dependency.

    Mirrors schema.json: required fields, types, beat/duration bounds,
    timestamp format, and the nested "scene" object.

    Returns a list of human-readable error strings (empty when valid).
    """
    errors = []

    def err(msg):
        errors.append(f"line {line_num}: {msg}")

    required = ("song", "artist", "beat", "timestamp", "duration_seconds",
                "lyric_line", "scene")
    for field in required:
        if field not in entry:
            err(f"missing required field '{field}'")

    # Non-empty string fields at the top level.
    for field in ("song", "artist", "lyric_line"):
        if field in entry:
            val = entry[field]
            if not isinstance(val, str):
                err(f"'{field}' must be string")
            elif not val:
                err(f"'{field}' must not be empty")

    if "beat" in entry:
        beat = entry["beat"]
        # bool is a subclass of int; JSON Schema "integer" rejects it too.
        if not isinstance(beat, int) or isinstance(beat, bool) or beat < 1:
            err("'beat' must be positive integer")
        elif beat > MAX_BEAT:
            err(f"'beat' is {beat} (max {MAX_BEAT})")

    if "timestamp" in entry:
        ts = entry["timestamp"]
        if not isinstance(ts, str) or not TIMESTAMP_RE.match(ts):
            err(f"'timestamp' must be M:SS or MM:SS format, got '{entry.get('timestamp')}'")

    if "duration_seconds" in entry:
        dur = entry["duration_seconds"]
        if not isinstance(dur, int) or isinstance(dur, bool) or dur < 1:
            err("'duration_seconds' must be positive integer")
        elif dur > MAX_DURATION_SECONDS:
            err(f"'duration_seconds' is {dur} (max {MAX_DURATION_SECONDS})")

    scene = entry.get("scene")
    if scene is not None:
        if not isinstance(scene, dict):
            err("'scene' must be object")
        else:
            errors.extend(_validate_scene_manual(scene, line_num))

    return errors


def _validate_scene_manual(scene, line_num):
    """Validate the nested "scene" object; returns a list of error strings."""
    errors = []

    def err(msg):
        errors.append(f"line {line_num}: {msg}")

    for field in ("mood", "colors", "composition", "camera", "description"):
        if field not in scene:
            err(f"scene missing required field '{field}'")

    for field in ("mood", "composition", "camera"):
        if field in scene:
            val = scene[field]
            if not isinstance(val, str) or not val:
                err(f"scene.{field} must be non-empty string")

    if "colors" in scene:
        colors = scene["colors"]
        if not isinstance(colors, list) or not colors:
            err("scene.colors must be non-empty array")
        else:
            # Per-item checks only when colors really is a list: the original
            # enumerate()d the value unconditionally and crashed on non-lists.
            if len(colors) > 5:
                err(f"scene.colors has {len(colors)} items (max 5)")
            for i, c in enumerate(colors):
                if not isinstance(c, str) or not c:
                    err(f"scene.colors[{i}] must be non-empty string")

    if "description" in scene:
        desc = scene["description"]
        if not isinstance(desc, str):
            err("scene.description must be string")
        elif len(desc) < 10:
            err(f"scene.description too short ({len(desc)} chars, min 10)")

    return errors


def validate_file(filepath, schema_validator=None):
    """Validate a single JSONL file.

    Returns (errors, line_count, valid_count). ``errors`` is ALWAYS a
    list — the previous version returned an int for a missing file,
    which made main() crash on len(). Blank lines are skipped; each
    non-blank line must be one JSON object.
    """
    if not os.path.exists(filepath):
        print(f"ERROR: File not found: {filepath}", file=sys.stderr)
        # "file not found:" prefix is recognized by main() for exit code 2.
        return [f"file not found: {filepath}"], 0, 0

    errors = []
    line_count = 0
    valid_count = 0

    with open(filepath, encoding="utf-8") as f:
        for line_num, raw in enumerate(f, 1):
            line = raw.strip()
            if not line:
                continue
            line_count += 1

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"line {line_num}: JSON parse error: {e}")
                continue

            if schema_validator is not None:
                entry_errors = [f"line {line_num}: {e.message}"
                                for e in schema_validator.iter_errors(entry)]
            else:
                entry_errors = validate_entry_manual(entry, line_num)

            if entry_errors:
                errors.extend(entry_errors)
            else:
                valid_count += 1

    return errors, line_count, valid_count


def main():
    """CLI entry point: validate each file, print a summary, set exit code.

    Exit codes match the module docstring: 0 all valid, 1 validation
    errors, 2 missing files or JSON parse errors.
    """
    parser = argparse.ArgumentParser(description="Validate scene description JSONL files")
    parser.add_argument("files", nargs="+", help="JSONL files to validate")
    parser.add_argument("--schema", default=None, help="Schema file path")
    parser.add_argument("--quiet", action="store_true", help="Only show errors")
    args = parser.parse_args()

    schema_validator = load_schema(args.schema)

    total_errors = 0
    total_lines = 0
    total_valid = 0
    had_io_errors = False  # missing files / parse failures -> exit code 2

    for filepath in args.files:
        errors, line_count, valid_count = validate_file(filepath, schema_validator)
        total_errors += len(errors)
        total_lines += line_count
        total_valid += valid_count
        if any(e.startswith("file not found") or "JSON parse error" in e
               for e in errors):
            had_io_errors = True

        if errors:
            print(f"\n{filepath}: {len(errors)} errors in {line_count} entries")
            for err in errors[:20]:  # cap per-file noise at 20 lines
                print(f"  {err}")
            if len(errors) > 20:
                print(f"  ... and {len(errors) - 20} more")
        elif not args.quiet:
            print(f"{filepath}: OK ({valid_count} entries)")

    print(f"\nTotal: {total_valid}/{total_lines} valid, {total_errors} errors")
    if had_io_errors:
        sys.exit(2)
    sys.exit(1 if total_errors else 0)


if __name__ == "__main__":
    main()