"""
Tests for training/data/scene-descriptions/validate.py — Quality Gate

Covers:
- Unit tests for each validation type
- Rejection workflow (invalid entries rejected)
- Statistics tracking (line_count, valid_count, error_count)
- Integration tests with sample JSONL content
"""

import json
import os
import sys
import tempfile
import unittest
from pathlib import Path

# validate.py sits next to the data it checks rather than inside a package.
# This file lives in tests/, so two parents up is the repository root; the
# validator's directory is put on sys.path so it can be imported by name.
VALIDATE_DIR = Path(__file__).parent.parent / "training" / "data" / "scene-descriptions"
sys.path.insert(0, str(VALIDATE_DIR))

from validate import validate_entry_manual, validate_file  # noqa: E402


# ── Shared fixtures ──────────────────────────────────────────
def _minimal_scene(**overrides):
    """Return the smallest scene object these tests treat as valid.

    Keyword arguments replace the matching keys in place, so the key order
    of the resulting dict is always the same.
    """
    scene = {
        "mood": "m",
        "colors": ["c"],
        "composition": "c",
        "camera": "c",
        "description": "a" * 20,
    }
    scene.update(overrides)
    return scene


def _minimal_entry(**overrides):
    """Return the smallest entry these tests treat as valid.

    Keyword arguments replace the matching top-level keys (for example
    ``_minimal_entry(beat=0)``); a fresh scene dict is built on every call so
    tests never share mutable state.
    """
    entry = {
        "song": "S",
        "artist": "A",
        "beat": 1,
        "timestamp": "0:00",
        "duration_seconds": 30,
        "lyric_line": "x",
        "scene": _minimal_scene(),
    }
    entry.update(overrides)
    return entry


def _without(mapping, key):
    """Return a shallow copy of *mapping* with *key* removed."""
    return {k: v for k, v in mapping.items() if k != key}


class _ErrorAssertionsMixin:
    """Assertions over the list of error strings returned by the validator."""

    def assertErrorMentions(self, errors, *fragments):
        """Fail unless a single error contains every one of *fragments*."""
        matching = [e for e in errors if all(part in e for part in fragments)]
        self.assertTrue(
            matching,
            f"expected an error mentioning {fragments!r}, got: {errors}",
        )

    def assertNoErrorMentions(self, errors, fragment):
        """Fail if any error contains *fragment*."""
        matching = [e for e in errors if fragment in e]
        self.assertFalse(
            matching,
            f"unexpected error(s) mentioning {fragment!r}: {matching}",
        )


class _TempJsonlMixin:
    """Create throw-away ``.jsonl`` files that are removed after each test."""

    def _write_temp_lines(self, lines):
        """Write each string in *lines* followed by a newline; return the path.

        Removal is registered with ``addCleanup`` before anything is written,
        so the file is deleted even when the test later raises or fails.
        """
        handle = tempfile.NamedTemporaryFile(
            mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
        )
        self.addCleanup(os.unlink, handle.name)
        with handle:
            handle.writelines(line + "\n" for line in lines)
        return handle.name

    def _write_temp_entries(self, entries):
        """Serialise *entries* as one JSON document per line; return the path."""
        return self._write_temp_lines(json.dumps(entry) for entry in entries)


class TestValidateEntryManual(_ErrorAssertionsMixin, unittest.TestCase):
    """Unit tests for individual field validation.

    Every test calls ``validate_entry_manual(entry, 1)`` and inspects the
    returned list of error strings.  The second argument is passed as ``1``
    throughout (presumably a line number — confirm against validate.py).
    """

    def test_valid_entry(self):
        entry = {
            "song": "Test Song",
            "artist": "Test Artist",
            "beat": 1,
            "timestamp": "0:30",
            "duration_seconds": 30,
            "lyric_line": "A valid lyric line here",
            "scene": {
                "mood": "hope",
                "colors": ["gold", "blue"],
                "composition": "wide shot",
                "camera": "static",
                "description": "A golden field stretching to the horizon at dawn",
            },
        }
        errors = validate_entry_manual(entry, 1)
        self.assertEqual(errors, [], f"Valid entry should have no errors: {errors}")

    # ── Required fields ──────────────────────────────────────
    def test_missing_song(self):
        errors = validate_entry_manual(_without(_minimal_entry(), "song"), 1)
        self.assertErrorMentions(errors, "song")

    def test_missing_artist(self):
        errors = validate_entry_manual(_without(_minimal_entry(), "artist"), 1)
        self.assertErrorMentions(errors, "artist")

    def test_missing_beat(self):
        errors = validate_entry_manual(_without(_minimal_entry(), "beat"), 1)
        self.assertErrorMentions(errors, "beat")

    def test_missing_scene(self):
        errors = validate_entry_manual(_without(_minimal_entry(), "scene"), 1)
        self.assertErrorMentions(errors, "scene")

    # ── Type checks ──────────────────────────────────────────
    def test_song_empty_string(self):
        errors = validate_entry_manual(_minimal_entry(song=""), 1)
        self.assertErrorMentions(errors, "song", "empty")

    def test_beat_not_positive(self):
        errors = validate_entry_manual(_minimal_entry(beat=0), 1)
        self.assertErrorMentions(errors, "beat")

    def test_beat_string_rejected(self):
        errors = validate_entry_manual(_minimal_entry(beat="one"), 1)
        self.assertErrorMentions(errors, "beat")

    # ── Timestamp format ─────────────────────────────────────
    def test_timestamp_valid_mss(self):
        errors = validate_entry_manual(_minimal_entry(timestamp="0:30"), 1)
        self.assertNoErrorMentions(errors, "timestamp")

    def test_timestamp_valid_mmss(self):
        errors = validate_entry_manual(_minimal_entry(timestamp="12:45"), 1)
        self.assertNoErrorMentions(errors, "timestamp")

    def test_timestamp_invalid_no_colon(self):
        errors = validate_entry_manual(_minimal_entry(timestamp="90"), 1)
        self.assertErrorMentions(errors, "timestamp")

    def test_timestamp_invalid_letters(self):
        errors = validate_entry_manual(_minimal_entry(timestamp="ab:cd"), 1)
        self.assertErrorMentions(errors, "timestamp")

    # ── Scene validation ─────────────────────────────────────
    def test_scene_missing_mood(self):
        entry = _minimal_entry(scene=_without(_minimal_scene(), "mood"))
        errors = validate_entry_manual(entry, 1)
        self.assertErrorMentions(errors, "mood")

    def test_scene_missing_colors(self):
        entry = _minimal_entry(scene=_without(_minimal_scene(), "colors"))
        errors = validate_entry_manual(entry, 1)
        self.assertErrorMentions(errors, "colors")

    def test_scene_colors_empty_array(self):
        entry = _minimal_entry(scene=_minimal_scene(colors=[]))
        errors = validate_entry_manual(entry, 1)
        self.assertErrorMentions(errors, "colors", "non-empty")

    def test_scene_colors_too_many(self):
        # Six colours: one more than the "max 5" the validator reports.
        entry = _minimal_entry(
            scene=_minimal_scene(colors=["a", "b", "c", "d", "e", "f"])
        )
        errors = validate_entry_manual(entry, 1)
        self.assertErrorMentions(errors, "colors", "max 5")

    def test_scene_description_too_short(self):
        entry = _minimal_entry(scene=_minimal_scene(description="short"))
        errors = validate_entry_manual(entry, 1)
        self.assertErrorMentions(errors, "description", "too short")

    def test_scene_not_dict(self):
        errors = validate_entry_manual(_minimal_entry(scene="not a dict"), 1)
        self.assertErrorMentions(errors, "scene", "object")


class TestValidateFile(_ErrorAssertionsMixin, _TempJsonlMixin, unittest.TestCase):
    """Integration tests — validate_file with temp JSONL content.

    ``validate_file(path)`` is unpacked as ``(errors, line_count, valid_count)``
    in every test below.
    """

    def _write_temp_jsonl(self, entries):
        """Write entries to a temp JSONL file and return path."""
        return self._write_temp_entries(entries)

    def _valid_entry(self, **overrides):
        """Return a realistic valid entry, with top-level keys overridable."""
        base = {
            "song": "Test Song",
            "artist": "Test Artist",
            "beat": 1,
            "timestamp": "0:30",
            "duration_seconds": 30,
            "lyric_line": "A valid lyric line",
            "scene": {
                "mood": "hope",
                "colors": ["gold", "blue"],
                "composition": "wide shot",
                "camera": "static",
                "description": "A golden field stretching to the horizon at dawn",
            },
        }
        base.update(overrides)
        return base

    def test_all_valid(self):
        path = self._write_temp_jsonl([self._valid_entry() for _ in range(5)])
        errors, line_count, valid_count = validate_file(path)
        self.assertEqual(len(errors), 0)
        self.assertEqual(line_count, 5)
        self.assertEqual(valid_count, 5)

    def test_all_invalid(self):
        path = self._write_temp_jsonl([{"bad": "data"}, {"also": "bad"}])
        errors, line_count, valid_count = validate_file(path)
        self.assertGreater(len(errors), 0)
        self.assertEqual(line_count, 2)
        self.assertEqual(valid_count, 0)

    def test_mixed_valid_invalid(self):
        entries = [self._valid_entry(), {"bad": "data"}, self._valid_entry()]
        path = self._write_temp_jsonl(entries)
        errors, line_count, valid_count = validate_file(path)
        self.assertGreater(len(errors), 0)
        self.assertEqual(line_count, 3)
        self.assertEqual(valid_count, 2)

    def test_json_parse_error(self):
        path = self._write_temp_lines(
            [
                '{"valid": true}',
                "NOT JSON {{{",
                '{"also_valid": true}',
            ]
        )
        errors, line_count, valid_count = validate_file(path)
        self.assertErrorMentions(errors, "JSON parse")
        # The unparseable line is still a non-blank line, so it is counted.
        self.assertEqual(line_count, 3)

    def test_blank_lines_skipped(self):
        path = self._write_temp_lines(
            [
                json.dumps(self._valid_entry()),
                "",   # empty line
                " ",  # whitespace-only line
                json.dumps(self._valid_entry()),
            ]
        )
        errors, line_count, valid_count = validate_file(path)
        self.assertEqual(line_count, 2)
        self.assertEqual(valid_count, 2)

    def test_missing_file(self):
        errors, line_count, valid_count = validate_file("/nonexistent/file.jsonl")
        self.assertEqual(line_count, 0)
        self.assertEqual(valid_count, 0)


class TestStatisticsTracking(_TempJsonlMixin, unittest.TestCase):
    """Verify that validate_file tracks counts correctly."""

    def _write_temp(self, entries):
        """Write entries to a temp JSONL file and return path."""
        return self._write_temp_entries(entries)

    def test_error_count_matches(self):
        entries = [
            {"bad": 1},  # missing required
            {"bad": 2},  # missing required
            {"bad": 3},  # missing required
        ]
        path = self._write_temp(entries)
        errors, line_count, valid_count = validate_file(path)
        # Each entry lacks several required fields, so the total must be
        # strictly greater than one error per entry.
        self.assertGreater(len(errors), 3)
        self.assertEqual(valid_count, 0)

    def test_valid_count_accurate(self):
        valid = _minimal_entry()
        path = self._write_temp([valid, valid, valid])
        errors, line_count, valid_count = validate_file(path)
        self.assertEqual(line_count, 3)
        self.assertEqual(valid_count, 3)
        self.assertEqual(len(errors), 0)


class TestRealFiles(unittest.TestCase):
    """Integration tests against actual training data files."""

    def test_scene_descriptions_validate(self):
        """All scene description JSONL files should pass validation."""
        # Sorted so that failures are reported in a stable order.
        jsonl_files = sorted(VALIDATE_DIR.glob("*.jsonl"))
        if not jsonl_files:
            self.skipTest("No JSONL files found")

        for filepath in jsonl_files:
            # validate_file runs inside subTest so that a crash on one file
            # is reported against that file and the remaining files still run.
            with self.subTest(file=filepath.name):
                errors, line_count, valid_count = validate_file(str(filepath))
                self.assertEqual(
                    len(errors), 0,
                    f"{filepath.name} has {len(errors)} errors: {errors[:3]}",
                )
                self.assertEqual(line_count, valid_count)
                self.assertGreater(line_count, 0)


if __name__ == "__main__":
    unittest.main()