Compare commits

...

3 Commits

Author SHA1 Message Date
2e6b95dd3d ci: add training-data/ path trigger + scripts/validate-scene-data.py (#647)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 20s
Smoke Test / smoke (pull_request) Failing after 16s
Validate Config / YAML Lint (pull_request) Failing after 15s
Validate Config / JSON Validate (pull_request) Successful in 14s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 45s
Validate Config / Cron Syntax Check (pull_request) Successful in 10s
Validate Config / Shell Script Lint (pull_request) Failing after 52s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 11s
Validate Config / Playbook Schema Validation (pull_request) Successful in 21s
PR Checklist / pr-checklist (pull_request) Failing after 3m50s
Validate Config / Python Test Suite (pull_request) Has been cancelled
Architecture Lint / Lint Repository (pull_request) Has been cancelled
2026-04-17 05:34:38 +00:00
40b68a13cb test: scene data validator tests (#647) 2026-04-17 05:34:36 +00:00
6fbf5bb649 Merge pull request 'feat: sidecar config validation on deploy' (#797) from feat/690-config-validation into main
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 14s
Smoke Test / smoke (pull_request) Failing after 15s
Validate Config / YAML Lint (pull_request) Failing after 13s
Validate Config / JSON Validate (pull_request) Successful in 16s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 42s
Validate Config / Shell Script Lint (pull_request) Failing after 45s
Validate Config / Cron Syntax Check (pull_request) Successful in 9s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 9s
Validate Config / Playbook Schema Validation (pull_request) Successful in 21s
PR Checklist / pr-checklist (pull_request) Failing after 3m31s
Validate Config / Python Test Suite (pull_request) Has been cancelled
Architecture Lint / Lint Repository (pull_request) Has been cancelled
2026-04-17 05:15:05 +00:00
2 changed files with 277 additions and 16 deletions

View File

@@ -5,7 +5,9 @@ on:
paths:
- 'training/data/**/*.jsonl'
- 'training/data/**/schema.json'
- 'training/data/**/validate.py'
- 'training-data/*.jsonl'
- 'training-data/schema.json'
- 'scripts/validate-scene-data.py'
jobs:
validate:
@@ -18,21 +20,11 @@ jobs:
with:
python-version: '3.11'
- name: Validate scene description JSONL
- name: Validate scene descriptions (training-data/)
run: |
python3 scripts/validate-scene-data.py training-data/scene-descriptions-*.jsonl
- name: Validate scene descriptions (training/data/)
run: |
cd training/data/scene-descriptions
python3 validate.py *.jsonl
- name: Validate training data provenance
run: |
cd training
python3 -c "
from training_pair_provenance import validate_provenance
import json, sys, glob
issues = 0
for f in glob.glob('data/*.jsonl'):
report = validate_provenance(f)
print(f'{f}: {report["coverage"]:.0f}% coverage ({report["with_provenance"]}/{report["total"]})')
if report['missing_provenance'] > 0:
print(f' WARNING: {report["missing_provenance"]} pairs missing provenance')
sys.exit(0)
"

View File

@@ -0,0 +1,269 @@
#!/usr/bin/env python3
"""
Tests for scene description data validator (#647).
Tests the validate-scene-data.py script against valid and invalid JSONL entries.
"""
import json
import tempfile
import unittest
from pathlib import Path
import importlib.util
import sys

# The validator under test lives in <repo>/scripts/. The CI workflow runs it as
# ``scripts/validate-scene-data.py``; a hyphenated file name is not a valid
# Python identifier, so a plain ``import`` statement cannot load it. Prefer an
# importable ``validate_scene_data`` module when one exists on the path and
# otherwise load the hyphenated script directly from its file location.
_SCRIPTS_DIR = Path(__file__).resolve().parent.parent / "scripts"
sys.path.insert(0, str(_SCRIPTS_DIR))

try:
    import validate_scene_data as _validator
except ModuleNotFoundError as _exc:
    # Only fall back when the validator module itself is missing; a missing
    # dependency *inside* the validator must still surface as an error.
    if _exc.name != "validate_scene_data":
        raise
    _spec = importlib.util.spec_from_file_location(
        "validate_scene_data", _SCRIPTS_DIR / "validate-scene-data.py"
    )
    if _spec is None or _spec.loader is None:
        raise
    _validator = importlib.util.module_from_spec(_spec)
    # Register before executing so the module can be found by name while it
    # is being initialised (e.g. by dataclasses or pickling helpers).
    sys.modules["validate_scene_data"] = _validator
    _spec.loader.exec_module(_validator)

validate_entry_manual = _validator.validate_entry_manual
load_schema = _validator.load_schema
validate_file = _validator.validate_file

SCHEMA_PATH = Path(__file__).resolve().parent.parent / "training-data" / "schema.json"
def _valid_entry(**overrides):
"""Create a valid scene description entry with optional overrides."""
entry = {
"song": "Thunder Road",
"artist": "Heartland",
"beat": 1,
"timestamp": "0:00",
"duration": "30s",
"lyric_line": "The screen door slams",
"scene": {
"mood": "hope",
"colors": ["gold", "sky blue", "white"],
"composition": "wide shot",
"camera": "static",
"description": "Open horizon. Golden light breaking through clouds. A figure stands silhouetted.",
},
}
entry.update(overrides)
return entry
class TestValidEntry(unittest.TestCase):
    """Valid entries should produce no errors."""

    def test_minimal_valid(self):
        """The unmodified baseline entry validates cleanly."""
        errors = validate_entry_manual(_valid_entry(), 0)
        self.assertEqual(errors, [])

    def test_with_optional_fields(self):
        """Extra top-level fields (``mood_arc``, ``duration_seconds``) are accepted."""
        entry = _valid_entry(mood_arc="rising", duration_seconds=30)
        errors = validate_entry_manual(entry, 0)
        self.assertEqual(errors, [])

    def test_all_composition_types(self):
        """Each listed composition value is accepted inside the scene object."""
        compositions = [
            "wide shot",
            "close-up",
            "over the shoulder",
            "low angle",
            "bird's eye",
        ]
        for comp in compositions:
            # subTest reports every failing value instead of stopping at the first.
            with self.subTest(composition=comp):
                entry = _valid_entry()
                # ``composition`` is a key of the nested scene. Passing it as a
                # keyword to _valid_entry() would only add an unrelated
                # top-level key and leave scene.composition at its default.
                entry["scene"]["composition"] = comp
                errors = validate_entry_manual(entry, 0)
                self.assertEqual(errors, [], f"Failed for composition: {comp}")
class TestMissingRequiredFields(unittest.TestCase):
    """Missing required fields should be caught."""

    def _errors_without(self, *key_path):
        """Validate a baseline entry after deleting the key at ``key_path``.

        ``key_path`` is a sequence of nested keys; the last one is removed
        from the dict reached by following the preceding ones.
        """
        entry = _valid_entry()
        container = entry
        for key in key_path[:-1]:
            container = container[key]
        del container[key_path[-1]]
        return validate_entry_manual(entry, 0)

    def _assert_mentions(self, errors, needle):
        """Assert that at least one error message contains ``needle``."""
        self.assertTrue(any(needle in message for message in errors))

    def test_missing_song(self):
        self._assert_mentions(self._errors_without("song"), "song")

    def test_missing_artist(self):
        self._assert_mentions(self._errors_without("artist"), "artist")

    def test_missing_beat(self):
        self._assert_mentions(self._errors_without("beat"), "beat")

    def test_missing_timestamp(self):
        self._assert_mentions(self._errors_without("timestamp"), "timestamp")

    def test_missing_lyric_line(self):
        self._assert_mentions(self._errors_without("lyric_line"), "lyric_line")

    def test_missing_scene(self):
        self._assert_mentions(self._errors_without("scene"), "scene")

    # For the nested fields, matching the bare field name also covers the
    # dotted "scene.<field>" spelling, since the latter contains the former.

    def test_missing_scene_mood(self):
        self._assert_mentions(self._errors_without("scene", "mood"), "mood")

    def test_missing_scene_colors(self):
        self._assert_mentions(self._errors_without("scene", "colors"), "colors")

    def test_missing_scene_description(self):
        self._assert_mentions(
            self._errors_without("scene", "description"), "description"
        )
class TestTypeValidation(unittest.TestCase):
    """Wrong types should be caught."""

    def _assert_error_with(self, errors, *needles):
        """Assert that a single error message contains every one of ``needles``."""
        self.assertTrue(
            any(all(needle in message for needle in needles) for message in errors)
        )

    def _errors_with_scene(self, key, value):
        """Validate a baseline entry whose ``scene[key]`` is set to ``value``."""
        entry = _valid_entry()
        entry["scene"][key] = value
        return validate_entry_manual(entry, 0)

    def test_song_not_string(self):
        found = validate_entry_manual(_valid_entry(song=42), 0)
        self._assert_error_with(found, "song", "string")

    def test_beat_not_integer(self):
        found = validate_entry_manual(_valid_entry(beat="one"), 0)
        self._assert_error_with(found, "beat")

    def test_beat_zero(self):
        found = validate_entry_manual(_valid_entry(beat=0), 0)
        self._assert_error_with(found, "beat")

    def test_colors_not_array(self):
        self._assert_error_with(
            self._errors_with_scene("colors", "red"), "colors", "array"
        )

    def test_colors_empty_array(self):
        self._assert_error_with(self._errors_with_scene("colors", []), "colors")

    def test_scene_not_object(self):
        found = validate_entry_manual(_valid_entry(scene="not an object"), 0)
        self._assert_error_with(found, "scene", "object")

    def test_timestamp_bad_format(self):
        found = validate_entry_manual(_valid_entry(timestamp="abc"), 0)
        self._assert_error_with(found, "timestamp")

    def test_timestamp_valid_formats(self):
        accepted = ("0:00", "1:30", "12:45", "99:59")
        for ts in accepted:
            found = validate_entry_manual(_valid_entry(timestamp=ts), 0)
            self.assertEqual(found, [], f"Failed for timestamp: {ts}")
class TestEmptyValues(unittest.TestCase):
    """Empty strings should be caught."""

    def _assert_error_with(self, errors, *needles):
        """Assert that a single error message contains every one of ``needles``."""
        self.assertTrue(
            any(all(needle in message for needle in needles) for message in errors)
        )

    def _errors_with_scene(self, key, value):
        """Validate a baseline entry whose ``scene[key]`` is set to ``value``."""
        entry = _valid_entry()
        entry["scene"][key] = value
        return validate_entry_manual(entry, 0)

    def test_empty_song(self):
        found = validate_entry_manual(_valid_entry(song=""), 0)
        self._assert_error_with(found, "song", "empty")

    def test_empty_artist(self):
        found = validate_entry_manual(_valid_entry(artist=""), 0)
        self._assert_error_with(found, "artist", "empty")

    def test_empty_lyric_line(self):
        found = validate_entry_manual(_valid_entry(lyric_line=""), 0)
        self._assert_error_with(found, "lyric_line", "empty")

    def test_empty_scene_mood(self):
        self._assert_error_with(self._errors_with_scene("mood", ""), "mood", "empty")

    def test_empty_color_in_array(self):
        # The blank colour sits at index 1, so the message should point there.
        found = self._errors_with_scene("colors", ["red", "", "blue"])
        self._assert_error_with(found, "colors[1]")
class TestDescriptionLength(unittest.TestCase):
    """Description minimum length check."""

    def _errors_for_description(self, text):
        """Validate a baseline entry whose scene description is ``text``."""
        entry = _valid_entry()
        entry["scene"]["description"] = text
        return validate_entry_manual(entry, 0)

    def test_short_description(self):
        found = self._errors_for_description("Short")
        flagged = any(
            "description" in message and "short" in message for message in found
        )
        self.assertTrue(flagged)

    def test_valid_description_length(self):
        found = self._errors_for_description(
            "A long and detailed scene description that exceeds ten characters."
        )
        # Only description-related complaints matter here; match case-insensitively.
        about_description = []
        for message in found:
            if "description" in message.lower():
                about_description.append(message)
        self.assertEqual(about_description, [])
class TestFileValidation(unittest.TestCase):
    """Test full-file validation."""

    def _write_lines(self, lines):
        """Write raw text ``lines`` to a temporary ``.jsonl`` file.

        The file is registered with ``addCleanup`` as soon as it exists, so it
        is removed after the test even if writing or a later assertion fails.

        Args:
            lines: Iterable of strings written verbatim (include newlines).

        Returns:
            Path: Location of the closed temporary file.
        """
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
        ) as handle:
            path = Path(handle.name)
            self.addCleanup(path.unlink)
            handle.writelines(lines)
        return path

    def _write_jsonl(self, entries):
        """Serialise ``entries`` one JSON object per line and return the file path."""
        return self._write_lines(json.dumps(entry) + "\n" for entry in entries)

    def _validate(self, path):
        """Run ``validate_file`` on ``path`` using the schema at ``SCHEMA_PATH``."""
        schema = load_schema(SCHEMA_PATH)
        return validate_file(path, schema)

    def test_valid_file(self):
        path = self._write_jsonl([_valid_entry(), _valid_entry(beat=2)])
        total, errors = self._validate(path)
        self.assertEqual(total, 2)
        self.assertEqual(errors, [])

    def test_invalid_entries_reported(self):
        entry_bad = _valid_entry()
        del entry_bad["song"]
        path = self._write_jsonl([_valid_entry(), entry_bad])
        total, errors = self._validate(path)
        self.assertEqual(total, 2)
        self.assertGreater(len(errors), 0)
        # The bad entry is on line 2. Strip the temp file's path/name before
        # searching: the random temp name may itself contain a "2", which
        # would make the check pass by accident if messages include the path.
        without_path = [
            e.replace(str(path), "").replace(path.name, "") for e in errors
        ]
        self.assertTrue(any("2" in e for e in without_path))  # line 2

    def test_malformed_json(self):
        path = self._write_lines(
            ["{invalid json\n", json.dumps(_valid_entry()) + "\n"]
        )
        total, errors = self._validate(path)
        self.assertEqual(total, 2)
        self.assertGreater(len(errors), 0)
        self.assertTrue(any("JSON" in e for e in errors))

    def test_empty_lines_ignored(self):
        # Two blank lines sit between the two valid entries.
        path = self._write_lines(
            [json.dumps(_valid_entry()) + "\n\n\n", json.dumps(_valid_entry()) + "\n"]
        )
        total, errors = self._validate(path)
        self.assertEqual(total, 2)
        self.assertEqual(errors, [])
# Run the test cases in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()