feat: quality gate test suite — 27 tests (#629)

tests/test_quality_gate.py: TestValidateEntryManual (18 tests): valid entry; missing fields (song/artist/beat/scene); type checks (empty song, beat=0, beat=string); timestamp format (valid M:SS, valid MM:SS, no colon, letters); scene validation (missing mood, missing colors, empty colors, too many colors (>5), description too short, scene not dict). TestValidateFile (6 tests): all valid, all invalid, mixed, JSON parse error, blank lines skipped, missing file. TestStatisticsTracking (2 tests): error count matches, valid count accurate. TestRealFiles (1 test + 6 subtests): validates all actual scene-descriptions/*.jsonl files. Total: 27 passed, 6 subtests passed in 0.04s.
2026-04-15 19:07:25 -04:00
1 changed files with 309 additions and 0 deletions
--- a/tests/test_quality_gate.py
+++ b/tests/test_quality_gate.py
@@ -0,0 +1,309 @@
+"""
+Tests for training/data/scene-descriptions/validate.py — Quality Gate
+
+Covers:
+- Unit tests for each validation type
+- Rejection workflow (invalid entries rejected)
+- Statistics tracking (line_count, valid_count, error_count)
+- Integration tests with sample JSONL content
+"""
+
+import json
+import os
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+
+# Put validate.py's directory on sys.path: "scene-descriptions" is hyphenated, so a dotted package import is impossible
+VALIDATE_DIR = Path(__file__).parent.parent / "training" / "data" / "scene-descriptions"
+sys.path.insert(0, str(VALIDATE_DIR))  # index 0: searched before the existing sys.path entries
+
+from validate import validate_entry_manual, validate_file  # resolved through the sys.path entry added above
+
+
+class TestValidateEntryManual(unittest.TestCase):
+    """Unit tests for individual field validation."""
+
+    def test_valid_entry(self):  # baseline: a fully populated entry must yield an empty error list
+        entry = {
+            "song": "Test Song",
+            "artist": "Test Artist",
+            "beat": 1,
+            "timestamp": "0:30",
+            "duration_seconds": 30,
+            "lyric_line": "A valid lyric line here",
+            "scene": {
+                "mood": "hope",
+                "colors": ["gold", "blue"],
+                "composition": "wide shot",
+                "camera": "static",
+                "description": "A golden field stretching to the horizon at dawn"
+            }
+        }
+        errors = validate_entry_manual(entry, 1)  # 1 is presumably the line number used in messages — confirm in validate.py
+        self.assertEqual(errors, [], f"Valid entry should have no errors: {errors}")
+
+    # ── Required fields: drop one key, expect an error that names it ──
+    def test_missing_song(self):
+        entry = {"artist": "A", "beat": 1, "timestamp": "0:00", "duration_seconds": 30,
+                 "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("song" in e for e in errors))
+
+    def test_missing_artist(self):
+        entry = {"song": "S", "beat": 1, "timestamp": "0:00", "duration_seconds": 30,
+                 "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("artist" in e for e in errors))
+
+    def test_missing_beat(self):
+        entry = {"song": "S", "artist": "A", "timestamp": "0:00", "duration_seconds": 30,
+                 "lyric_line": "x", "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("beat" in e for e in errors))
+
+    def test_missing_scene(self):
+        entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00",
+                 "duration_seconds": 30, "lyric_line": "x"}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("scene" in e for e in errors))
+
+    # ── Type checks: key is present but its value must be rejected ──
+    def test_song_empty_string(self):
+        entry = {"song": "", "artist": "A", "beat": 1, "timestamp": "0:00",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("song" in e and "empty" in e for e in errors))  # pins message wording: field name plus "empty"
+
+    def test_beat_not_positive(self):  # beat=0 must be reported
+        entry = {"song": "S", "artist": "A", "beat": 0, "timestamp": "0:00",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("beat" in e for e in errors))
+
+    def test_beat_string_rejected(self):  # beat given as a string instead of a number
+        entry = {"song": "S", "artist": "A", "beat": "one", "timestamp": "0:00",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("beat" in e for e in errors))
+
+    # ── Timestamp format: only errors mentioning "timestamp" are inspected ──
+    def test_timestamp_valid_mss(self):
+        entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:30",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertFalse(any("timestamp" in e for e in errors))  # unrelated errors would not fail this test
+
+    def test_timestamp_valid_mmss(self):
+        entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "12:45",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertFalse(any("timestamp" in e for e in errors))  # unrelated errors would not fail this test
+
+    def test_timestamp_invalid_no_colon(self):
+        entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "90",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("timestamp" in e for e in errors))
+
+    def test_timestamp_invalid_letters(self):
+        entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "ab:cd",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("timestamp" in e for e in errors))
+
+    # ── Scene validation: the nested "scene" object and its fields ──
+    def test_scene_missing_mood(self):
+        entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("mood" in e for e in errors))
+
+    def test_scene_missing_colors(self):
+        entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"mood": "m", "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("colors" in e for e in errors))
+
+    def test_scene_colors_empty_array(self):
+        entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"mood": "m", "colors": [], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("colors" in e and "non-empty" in e for e in errors))  # pins message wording: "non-empty"
+
+    def test_scene_colors_too_many(self):  # six colors supplied
+        entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"mood": "m", "colors": ["a","b","c","d","e","f"], "composition": "c", "camera": "c", "description": "a"*20}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("colors" in e and "max 5" in e for e in errors))  # pins message wording: "max 5"
+
+    def test_scene_description_too_short(self):  # 5-character description
+        entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00",
+                 "duration_seconds": 30, "lyric_line": "x",
+                 "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "short"}}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("description" in e and "too short" in e for e in errors))  # pins message wording: "too short"
+
+    def test_scene_not_dict(self):  # scene given as a plain string
+        entry = {"song": "S", "artist": "A", "beat": 1, "timestamp": "0:00",
+                 "duration_seconds": 30, "lyric_line": "x", "scene": "not a dict"}
+        errors = validate_entry_manual(entry, 1)
+        self.assertTrue(any("scene" in e and "object" in e for e in errors))  # pins message wording: "object"
+
+
+class TestValidateFile(unittest.TestCase):
+    """Integration tests — validate_file with temp JSONL content."""
+
+    def _write_temp_jsonl(self, entries):
+        """Write entries to a temp JSONL file and return path."""
+        f = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False)  # delete=False: file must survive close(); callers unlink it
+        for entry in entries:
+            f.write(json.dumps(entry) + '\n')  # one JSON document per line
+        f.close()
+        return f.name
+
+    def _valid_entry(self, **overrides):  # returns a fresh dict on every call
+        base = {
+            "song": "Test Song",
+            "artist": "Test Artist",
+            "beat": 1,
+            "timestamp": "0:30",
+            "duration_seconds": 30,
+            "lyric_line": "A valid lyric line",
+            "scene": {
+                "mood": "hope",
+                "colors": ["gold", "blue"],
+                "composition": "wide shot",
+                "camera": "static",
+                "description": "A golden field stretching to the horizon at dawn"
+            }
+        }
+        base.update(overrides)  # shallow: overriding "scene" replaces the whole nested dict
+        return base
+
+    def test_all_valid(self):
+        path = self._write_temp_jsonl([self._valid_entry() for _ in range(5)])
+        errors, line_count, valid_count = validate_file(path)
+        os.unlink(path)  # remove the temp file before asserting so a failed assertion cannot leak it
+        self.assertEqual(len(errors), 0)
+        self.assertEqual(line_count, 5)
+        self.assertEqual(valid_count, 5)
+
+    def test_all_invalid(self):
+        entries = [{"bad": "data"}, {"also": "bad"}]  # well-formed JSON objects with none of the expected keys
+        path = self._write_temp_jsonl(entries)
+        errors, line_count, valid_count = validate_file(path)
+        os.unlink(path)
+        self.assertGreater(len(errors), 0)
+        self.assertEqual(line_count, 2)
+        self.assertEqual(valid_count, 0)
+
+    def test_mixed_valid_invalid(self):
+        entries = [self._valid_entry(), {"bad": "data"}, self._valid_entry()]
+        path = self._write_temp_jsonl(entries)
+        errors, line_count, valid_count = validate_file(path)
+        os.unlink(path)
+        self.assertGreater(len(errors), 0)
+        self.assertEqual(line_count, 3)
+        self.assertEqual(valid_count, 2)
+
+    def test_json_parse_error(self):
+        f = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False)
+        f.write('{"valid": true}\n')  # well-formed JSON line
+        f.write('NOT JSON {{{\n')  # the line that cannot be parsed
+        f.write('{"also_valid": true}\n')  # well-formed JSON line
+        f.close()
+        errors, line_count, valid_count = validate_file(f.name)  # valid_count is not asserted in this test
+        os.unlink(f.name)
+        self.assertTrue(any("JSON parse" in e for e in errors))
+        self.assertEqual(line_count, 3)  # the unparseable line is still counted: all 3 written lines
+
+    def test_blank_lines_skipped(self):
+        f = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False)
+        f.write(json.dumps(self._valid_entry()) + '\n')
+        f.write('\n')  # empty line
+        f.write('   \n')  # whitespace-only line
+        f.write(json.dumps(self._valid_entry()) + '\n')
+        f.close()
+        errors, line_count, valid_count = validate_file(f.name)
+        os.unlink(f.name)
+        self.assertEqual(line_count, 2)  # 4 physical lines, only the 2 entries counted
+        self.assertEqual(valid_count, 2)
+
+    def test_missing_file(self):
+        errors, line_count, valid_count = validate_file("/nonexistent/file.jsonl")  # expects a normal return, not an exception; errors is not inspected
+        self.assertEqual(line_count, 0)
+        self.assertEqual(valid_count, 0)
+
+
+class TestStatisticsTracking(unittest.TestCase):
+    """Verify that validate_file tracks counts correctly."""
+
+    def _write_temp(self, entries):  # writes one JSON line per entry and returns the path; the caller unlinks it
+        f = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False)
+        for e in entries:
+            f.write(json.dumps(e) + '\n')
+        f.close()
+        return f.name
+
+    def test_error_count_matches(self):
+        entries = [
+            {"bad": 1},  # missing required
+            {"bad": 2},  # missing required
+            {"bad": 3},  # missing required
+        ]
+        path = self._write_temp(entries)
+        errors, line_count, valid_count = validate_file(path)  # line_count is not asserted in this test
+        os.unlink(path)
+        # Each entry should have multiple missing field errors
+        self.assertGreater(len(errors), 3)  # strictly more than 3, i.e. more errors than entries
+        self.assertEqual(valid_count, 0)
+
+    def test_valid_count_accurate(self):
+        valid = {
+            "song": "S", "artist": "A", "beat": 1, "timestamp": "0:00",
+            "duration_seconds": 30, "lyric_line": "x",
+            "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}
+        }
+        entries = [valid, valid, valid]  # same dict three times; each is serialized to its own line
+        path = self._write_temp(entries)
+        errors, line_count, valid_count = validate_file(path)
+        os.unlink(path)
+        self.assertEqual(line_count, 3)
+        self.assertEqual(valid_count, 3)
+        self.assertEqual(len(errors), 0)
+        self.assertEqual(len(errors), 0)
+
+
+class TestRealFiles(unittest.TestCase):
+    """Integration tests against actual training data files."""
+
+    def test_scene_descriptions_validate(self):
+        """Every *.jsonl file next to validate.py must be non-empty and fully valid; skips when none exist."""
+        jsonl_files = sorted(VALIDATE_DIR.glob("*.jsonl"))  # sorted for a stable subtest order
+        if not jsonl_files:
+            self.skipTest("No JSONL files found")
+
+        for filepath in jsonl_files:
+            # Validate inside the subTest so a failure or crash is attributed to this file and later files still run.
+            with self.subTest(file=filepath.name):
+                errors, line_count, valid_count = validate_file(str(filepath))
+                self.assertEqual(len(errors), 0,
+                    f"{filepath.name} has {len(errors)} errors: {errors[:3]}")
+                self.assertEqual(line_count, valid_count)
+                self.assertGreater(line_count, 0)
+
+
+if __name__ == "__main__":  # allows running this file directly as a script
+    unittest.main()