""" Tests for training/data/scene-descriptions/validate.py — Quality Gate Covers: - Unit tests for each validation type - Rejection workflow (invalid entries rejected) - Statistics tracking (line_count, valid_count, error_count) - Integration tests with sample JSONL content """ import json import os import sys import tempfile import unittest from pathlib import Path # Add the validate.py module to path VALIDATE_DIR = Path(__file__).parent.parent / "training" / "data" / "scene-descriptions" sys.path.insert(0, str(VALIDATE_DIR)) from validate import validate_entry_manual, validate_file class TestValidateEntryManual(unittest.TestCase): """Unit tests for individual field validation.""" def test_valid_entry(self): entry = { "song": "Test Song", "artist": "Test Artist", "beat": 1, "timestamp": "0:30", "duration_seconds": 30, "lyric_line": "A valid lyric line here", "scene": { "mood": "hope", "colors": ["gold", "blue"], "composition": "wide shot", "camera": "static", "description": "A golden field stretching to the horizon at dawn" } } errors = validate_entry_manual(entry, 1) self.assertEqual(errors, [], f"Valid entry should have no errors: {errors}") # ── Required fields ────────────────────────────────────── def test_missing_song(self): entry = {"artist": "A", "beat": 1, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("song" in e for e in errors)) def test_missing_artist(self): entry = {"song": "S", "beat": 1, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("artist" in e for e in errors)) def test_missing_beat(self): entry = {"song": "S", "artist": "A", "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("beat" in e for e in errors)) def test_missing_scene(self): entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x"} errors = validate_entry_manual(entry, 1) self.assertTrue(any("scene" in e for e in errors)) # ── Type checks ────────────────────────────────────────── def test_song_empty_string(self): entry = {"song": "", "artist": "A", "beat": 1, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("song" in e and "empty" in e for e in errors)) def test_beat_not_positive(self): entry = {"song": "S", "artist": "A", "beat": 0, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("beat" in e for e in errors)) def test_beat_string_rejected(self): entry = {"song": "S", "artist": "A", "beat": "one", "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("beat" in e for e in errors)) # ── Timestamp format ───────────────────────────────────── def test_timestamp_valid_mss(self): entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:30", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertFalse(any("timestamp" in e for e in errors)) def test_timestamp_valid_mmss(self): entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "12:45", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertFalse(any("timestamp" in e for e in errors)) def test_timestamp_invalid_no_colon(self): entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "90", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("timestamp" in e for e in errors)) def test_timestamp_invalid_letters(self): entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "ab:cd", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("timestamp" in e for e in errors)) # ── Scene validation ───────────────────────────────────── def test_scene_missing_mood(self): entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("mood" in e for e in errors)) def test_scene_missing_colors(self): entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("colors" in e for e in errors)) def test_scene_colors_empty_array(self): entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": [], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("colors" in e and "non-empty" in e for e in errors)) def test_scene_colors_too_many(self): entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["a","b","c","d","e","f"], "composition": "c", "camera": "c", "description": "a"*20}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("colors" in e and "max 5" in e for e in errors)) def test_scene_description_too_short(self): entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "short"}} errors = validate_entry_manual(entry, 1) self.assertTrue(any("description" in e and "too short" in e for e in errors)) def test_scene_not_dict(self): entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": "not a dict"} errors = validate_entry_manual(entry, 1) self.assertTrue(any("scene" in e and "object" in e for e in errors)) class TestValidateFile(unittest.TestCase): """Integration tests — validate_file with temp JSONL content.""" def _write_temp_jsonl(self, entries): """Write entries to a temp JSONL file and return path.""" f = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) for entry in entries: f.write(json.dumps(entry) + '\n') f.close() return f.name def _valid_entry(self, **overrides): base = { "song": "Test Song", "artist": "Test Artist", "beat": 1, "timestamp": "0:30", "duration_seconds": 30, "lyric_line": "A valid lyric line", "scene": { "mood": "hope", "colors": ["gold", "blue"], "composition": "wide shot", "camera": "static", "description": "A golden field stretching to the horizon at dawn" } } base.update(overrides) return base def test_all_valid(self): path = self._write_temp_jsonl([self._valid_entry() for _ in range(5)]) errors, line_count, valid_count = validate_file(path) os.unlink(path) self.assertEqual(len(errors), 0) self.assertEqual(line_count, 5) self.assertEqual(valid_count, 5) def test_all_invalid(self): entries = [{"bad": "data"}, {"also": "bad"}] path = self._write_temp_jsonl(entries) errors, line_count, valid_count = validate_file(path) os.unlink(path) self.assertGreater(len(errors), 0) self.assertEqual(line_count, 2) self.assertEqual(valid_count, 0) def test_mixed_valid_invalid(self): entries = [self._valid_entry(), {"bad": "data"}, self._valid_entry()] path = self._write_temp_jsonl(entries) errors, line_count, valid_count = validate_file(path) os.unlink(path) self.assertGreater(len(errors), 0) self.assertEqual(line_count, 3) self.assertEqual(valid_count, 2) def test_json_parse_error(self): f = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) f.write('{"valid": true}\n') f.write('NOT JSON {{{\n') f.write('{"also_valid": true}\n') f.close() errors, line_count, valid_count = validate_file(f.name) os.unlink(f.name) self.assertTrue(any("JSON parse" in e for e in errors)) self.assertEqual(line_count, 3) # blank lines skipped, but non-blank counted def test_blank_lines_skipped(self): f = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) f.write(json.dumps(self._valid_entry()) + '\n') f.write('\n') f.write(' \n') f.write(json.dumps(self._valid_entry()) + '\n') f.close() errors, line_count, valid_count = validate_file(f.name) os.unlink(f.name) self.assertEqual(line_count, 2) self.assertEqual(valid_count, 2) def test_missing_file(self): errors, line_count, valid_count = validate_file("/nonexistent/file.jsonl") self.assertEqual(line_count, 0) self.assertEqual(valid_count, 0) class TestStatisticsTracking(unittest.TestCase): """Verify that validate_file tracks counts correctly.""" def _write_temp(self, entries): f = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) for e in entries: f.write(json.dumps(e) + '\n') f.close() return f.name def test_error_count_matches(self): entries = [ {"bad": 1}, # missing required {"bad": 2}, # missing required {"bad": 3}, # missing required ] path = self._write_temp(entries) errors, line_count, valid_count = validate_file(path) os.unlink(path) # Each entry should have multiple missing field errors self.assertGreater(len(errors), 3) # at least one error per entry self.assertEqual(valid_count, 0) def test_valid_count_accurate(self): valid = { "song": "S", "artist": "A", "beat": 1, "timestamp": "0:00", "duration_seconds": 30, "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20} } entries = [valid, valid, valid] path = self._write_temp(entries) errors, line_count, valid_count = validate_file(path) os.unlink(path) self.assertEqual(line_count, 3) self.assertEqual(valid_count, 3) self.assertEqual(len(errors), 0) class TestRealFiles(unittest.TestCase): """Integration tests against actual training data files.""" def test_scene_descriptions_validate(self): """All scene description JSONL files should pass validation.""" scene_dir = VALIDATE_DIR jsonl_files = list(scene_dir.glob("*.jsonl")) if not jsonl_files: self.skipTest("No JSONL files found") for filepath in jsonl_files: errors, line_count, valid_count = validate_file(str(filepath)) with self.subTest(file=filepath.name): self.assertEqual(len(errors), 0, f"{filepath.name} has {len(errors)} errors: {errors[:3]}") self.assertEqual(line_count, valid_count) self.assertGreater(line_count, 0) if __name__ == "__main__": unittest.main()