Compare commits
3 Commits
feat/690-c
...
feat/647-s
| Author | SHA1 | Date | |
|---|---|---|---|
| 2e6b95dd3d | |||
| 40b68a13cb | |||
| 6fbf5bb649 |
@@ -5,7 +5,9 @@ on:
|
||||
paths:
|
||||
- 'training/data/**/*.jsonl'
|
||||
- 'training/data/**/schema.json'
|
||||
- 'training/data/**/validate.py'
|
||||
- 'training-data/*.jsonl'
|
||||
- 'training-data/schema.json'
|
||||
- 'scripts/validate-scene-data.py'
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
@@ -18,21 +20,11 @@ jobs:
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Validate scene description JSONL
|
||||
- name: Validate scene descriptions (training-data/)
|
||||
run: |
|
||||
python3 scripts/validate-scene-data.py training-data/scene-descriptions-*.jsonl
|
||||
|
||||
- name: Validate scene descriptions (training/data/)
|
||||
run: |
|
||||
cd training/data/scene-descriptions
|
||||
python3 validate.py *.jsonl
|
||||
- name: Validate training data provenance
|
||||
run: |
|
||||
cd training
|
||||
python3 -c "
|
||||
from training_pair_provenance import validate_provenance
|
||||
import json, sys, glob
|
||||
issues = 0
|
||||
for f in glob.glob('data/*.jsonl'):
|
||||
report = validate_provenance(f)
|
||||
print(f'{f}: {report["coverage"]:.0f}% coverage ({report["with_provenance"]}/{report["total"]})')
|
||||
if report['missing_provenance'] > 0:
|
||||
print(f' WARNING: {report["missing_provenance"]} pairs missing provenance')
|
||||
sys.exit(0)
|
||||
"
|
||||
|
||||
269
tests/test_validate_scene_data.py
Normal file
269
tests/test_validate_scene_data.py
Normal file
@@ -0,0 +1,269 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for scene description data validator (#647).
|
||||
|
||||
Tests the validate-scene-data.py script against valid and invalid JSONL entries.
|
||||
"""
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
import importlib.util
import sys

# Repository layout assumed by this file: <root>/tests/<this file>,
# <root>/scripts/ (validator) and <root>/training-data/ (schema).
_REPO_ROOT = Path(__file__).resolve().parent.parent
_SCRIPTS_DIR = _REPO_ROOT / "scripts"
sys.path.insert(0, str(_SCRIPTS_DIR))

try:
    from validate_scene_data import validate_entry_manual, load_schema, validate_file
except ModuleNotFoundError as exc:
    # Only fall back when the validator module itself is missing; a missing
    # dependency *inside* the validator must surface unchanged.
    if exc.name != "validate_scene_data":
        raise
    # CI runs the validator as scripts/validate-scene-data.py. A hyphenated
    # filename is not importable by name, so load it explicitly by path.
    _spec = importlib.util.spec_from_file_location(
        "validate_scene_data", _SCRIPTS_DIR / "validate-scene-data.py"
    )
    _validator = importlib.util.module_from_spec(_spec)
    sys.modules["validate_scene_data"] = _validator
    _spec.loader.exec_module(_validator)
    validate_entry_manual = _validator.validate_entry_manual
    load_schema = _validator.load_schema
    validate_file = _validator.validate_file


SCHEMA_PATH = _REPO_ROOT / "training-data" / "schema.json"
|
||||
|
||||
|
||||
def _valid_entry(**overrides):
|
||||
"""Create a valid scene description entry with optional overrides."""
|
||||
entry = {
|
||||
"song": "Thunder Road",
|
||||
"artist": "Heartland",
|
||||
"beat": 1,
|
||||
"timestamp": "0:00",
|
||||
"duration": "30s",
|
||||
"lyric_line": "The screen door slams",
|
||||
"scene": {
|
||||
"mood": "hope",
|
||||
"colors": ["gold", "sky blue", "white"],
|
||||
"composition": "wide shot",
|
||||
"camera": "static",
|
||||
"description": "Open horizon. Golden light breaking through clouds. A figure stands silhouetted.",
|
||||
},
|
||||
}
|
||||
entry.update(overrides)
|
||||
return entry
|
||||
|
||||
|
||||
class TestValidEntry(unittest.TestCase):
    """Valid entries should produce no errors."""

    def test_minimal_valid(self):
        """The default fixture entry validates cleanly."""
        errors = validate_entry_manual(_valid_entry(), 0)
        self.assertEqual(errors, [])

    def test_with_optional_fields(self):
        """Extra top-level fields do not cause errors."""
        entry = _valid_entry(mood_arc="rising", duration_seconds=30)
        errors = validate_entry_manual(entry, 0)
        self.assertEqual(errors, [])

    def test_all_composition_types(self):
        """Each composition value is accepted when set on the scene."""
        compositions = ["wide shot", "close-up", "over the shoulder", "low angle", "bird's eye"]
        for comp in compositions:
            # subTest reports every failing value instead of stopping at the first.
            with self.subTest(composition=comp):
                entry = _valid_entry()
                # composition is a field of the nested scene; passing it as a
                # keyword override would only add an unrelated top-level key.
                entry["scene"]["composition"] = comp
                errors = validate_entry_manual(entry, 0)
                self.assertEqual(errors, [], f"Failed for composition: {comp}")
|
||||
|
||||
|
||||
class TestMissingRequiredFields(unittest.TestCase):
    """Missing required fields should be caught."""

    @staticmethod
    def _errors_after_removing(field, from_scene=False):
        """Validate a valid entry with *field* deleted from the entry or its scene."""
        record = _valid_entry()
        target = record["scene"] if from_scene else record
        del target[field]
        return validate_entry_manual(record, 0)

    def _assert_mentioned(self, messages, *needles):
        """Pass when at least one message contains at least one of *needles*."""
        hit = any(needle in msg for msg in messages for needle in needles)
        self.assertTrue(hit)

    def test_missing_song(self):
        self._assert_mentioned(self._errors_after_removing("song"), "song")

    def test_missing_artist(self):
        self._assert_mentioned(self._errors_after_removing("artist"), "artist")

    def test_missing_beat(self):
        self._assert_mentioned(self._errors_after_removing("beat"), "beat")

    def test_missing_timestamp(self):
        self._assert_mentioned(self._errors_after_removing("timestamp"), "timestamp")

    def test_missing_lyric_line(self):
        self._assert_mentioned(self._errors_after_removing("lyric_line"), "lyric_line")

    def test_missing_scene(self):
        self._assert_mentioned(self._errors_after_removing("scene"), "scene")

    def test_missing_scene_mood(self):
        found = self._errors_after_removing("mood", from_scene=True)
        self._assert_mentioned(found, "scene.mood", "mood")

    def test_missing_scene_colors(self):
        found = self._errors_after_removing("colors", from_scene=True)
        self._assert_mentioned(found, "scene.colors", "colors")

    def test_missing_scene_description(self):
        found = self._errors_after_removing("description", from_scene=True)
        self._assert_mentioned(found, "scene.description", "description")
|
||||
|
||||
|
||||
class TestTypeValidation(unittest.TestCase):
    """Wrong types should be caught."""

    def _assert_reported(self, messages, *needles):
        """Pass when a single message contains every one of *needles*."""
        self.assertTrue(any(all(n in msg for n in needles) for msg in messages))

    @staticmethod
    def _errors_with_colors(colors):
        """Validate a valid entry whose scene colors are replaced by *colors*."""
        record = _valid_entry()
        record["scene"]["colors"] = colors
        return validate_entry_manual(record, 0)

    def test_song_not_string(self):
        found = validate_entry_manual(_valid_entry(song=42), 0)
        self._assert_reported(found, "song", "string")

    def test_beat_not_integer(self):
        found = validate_entry_manual(_valid_entry(beat="one"), 0)
        self._assert_reported(found, "beat")

    def test_beat_zero(self):
        found = validate_entry_manual(_valid_entry(beat=0), 0)
        self._assert_reported(found, "beat")

    def test_colors_not_array(self):
        self._assert_reported(self._errors_with_colors("red"), "colors", "array")

    def test_colors_empty_array(self):
        self._assert_reported(self._errors_with_colors([]), "colors")

    def test_scene_not_object(self):
        found = validate_entry_manual(_valid_entry(scene="not an object"), 0)
        self._assert_reported(found, "scene", "object")

    def test_timestamp_bad_format(self):
        found = validate_entry_manual(_valid_entry(timestamp="abc"), 0)
        self._assert_reported(found, "timestamp")

    def test_timestamp_valid_formats(self):
        accepted = ("0:00", "1:30", "12:45", "99:59")
        for stamp in accepted:
            found = validate_entry_manual(_valid_entry(timestamp=stamp), 0)
            self.assertEqual(found, [], f"Failed for timestamp: {stamp}")
|
||||
|
||||
|
||||
class TestEmptyValues(unittest.TestCase):
    """Empty strings should be caught."""

    def _assert_reported(self, messages, *needles):
        """Pass when a single message contains every one of *needles*."""
        self.assertTrue(any(all(n in msg for n in needles) for msg in messages))

    def test_empty_song(self):
        found = validate_entry_manual(_valid_entry(song=""), 0)
        self._assert_reported(found, "song", "empty")

    def test_empty_artist(self):
        found = validate_entry_manual(_valid_entry(artist=""), 0)
        self._assert_reported(found, "artist", "empty")

    def test_empty_lyric_line(self):
        found = validate_entry_manual(_valid_entry(lyric_line=""), 0)
        self._assert_reported(found, "lyric_line", "empty")

    def test_empty_scene_mood(self):
        record = _valid_entry()
        record["scene"]["mood"] = ""
        found = validate_entry_manual(record, 0)
        self._assert_reported(found, "mood", "empty")

    def test_empty_color_in_array(self):
        record = _valid_entry()
        record["scene"]["colors"] = ["red", "", "blue"]
        found = validate_entry_manual(record, 0)
        # The blank colour sits at index 1, so that index must be named.
        self._assert_reported(found, "colors[1]")
|
||||
|
||||
|
||||
class TestDescriptionLength(unittest.TestCase):
    """Description minimum length check."""

    @staticmethod
    def _errors_for_description(text):
        """Validate a valid entry whose scene description is replaced by *text*."""
        record = _valid_entry()
        record["scene"]["description"] = text
        return validate_entry_manual(record, 0)

    def test_short_description(self):
        problems = self._errors_for_description("Short")
        flagged = any("description" in msg and "short" in msg for msg in problems)
        self.assertTrue(flagged)

    def test_valid_description_length(self):
        problems = self._errors_for_description(
            "A long and detailed scene description that exceeds ten characters."
        )
        # Only description-related complaints matter here; match case-insensitively.
        about_description = list(filter(lambda msg: "description" in msg.lower(), problems))
        self.assertEqual(about_description, [])
|
||||
|
||||
|
||||
class TestFileValidation(unittest.TestCase):
    """Test full-file validation."""

    def _write_lines(self, lines):
        """Write raw text *lines* to a temporary ``.jsonl`` file and return its path.

        The file is removed automatically when the test finishes, whether it
        passes or fails, and the handle is closed even if a write raises.
        """
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
        ) as handle:
            path = Path(handle.name)
            # Register cleanup before writing so a failed write cannot leak the file.
            self.addCleanup(path.unlink)
            handle.writelines(lines)
        return path

    def _write_jsonl(self, entries):
        """Serialize *entries* one JSON object per line and return the file path."""
        return self._write_lines(json.dumps(entry) + "\n" for entry in entries)

    def _validate(self, path):
        """Run ``validate_file`` on *path* with the repository schema."""
        schema = load_schema(SCHEMA_PATH)
        return validate_file(path, schema)

    def test_valid_file(self):
        path = self._write_jsonl([_valid_entry(), _valid_entry(beat=2)])
        total, errors = self._validate(path)
        self.assertEqual(total, 2)
        self.assertEqual(errors, [])

    def test_invalid_entries_reported(self):
        entry_bad = _valid_entry()
        del entry_bad["song"]
        path = self._write_jsonl([_valid_entry(), entry_bad])
        total, errors = self._validate(path)
        self.assertEqual(total, 2)
        self.assertGreater(len(errors), 0)
        self.assertTrue(any("2" in e for e in errors))  # line 2

    def test_malformed_json(self):
        path = self._write_lines(
            [
                "{invalid json\n",
                json.dumps(_valid_entry()) + "\n",
            ]
        )
        total, errors = self._validate(path)
        self.assertEqual(total, 2)
        self.assertGreater(len(errors), 0)
        self.assertTrue(any("JSON" in e for e in errors))

    def test_empty_lines_ignored(self):
        path = self._write_lines(
            [
                json.dumps(_valid_entry()) + "\n\n\n",
                json.dumps(_valid_entry()) + "\n",
            ]
        )
        total, errors = self._validate(path)
        self.assertEqual(total, 2)
        self.assertEqual(errors, [])
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user