#!/usr/bin/env python3
"""
Test that all 9 genre scene description files have 100 valid entries (#645).
"""

import json
import re
import unittest
from pathlib import Path

# Only the helpers and constants are exported by a star-import. Keeping the
# TestCase classes out stops a module that star-imports these helpers from
# re-collecting (and re-running) the tests defined here.
__all__ = [
    "DATA_DIR",
    "MIN_ENTRIES",
    "REQUIRED_GENRES",
    "REQUIRED_SCENE_FIELDS",
    "REQUIRED_TOP_FIELDS",
    "load_jsonl",
    "validate_entry",
]

DATA_DIR = Path(__file__).resolve().parent.parent / "training-data"

REQUIRED_GENRES = [
    "rock",
    "hip-hop",
    "electronic",
    "r&b-soul",
    "country",
    "jazz",
    "classical",
    "metal",
    "latin",
]

REQUIRED_TOP_FIELDS = ["song", "artist", "beat", "timestamp", "lyric_line", "scene"]
REQUIRED_SCENE_FIELDS = ["mood", "colors", "composition", "description"]
MIN_ENTRIES = 100

# "M:SS" with any number of minute digits and seconds in 00-59. Used with
# fullmatch() so that trailing characters (including a newline) are rejected.
_TIMESTAMP_RE = re.compile(r"[0-9]+:[0-5][0-9]")


def _is_valid_timestamp(value):
    """Return True when *value* is a string of the form ``M:SS``."""
    return isinstance(value, str) and _TIMESTAMP_RE.fullmatch(value) is not None


def _genre_path(genre):
    """Return the path of the JSONL data file for *genre* under DATA_DIR."""
    return DATA_DIR / f"scene-descriptions-{genre}.jsonl"


def _existing_genre_files():
    """Yield ``(genre, path)`` for each required genre whose data file exists."""
    for genre in REQUIRED_GENRES:
        path = _genre_path(genre)
        if path.exists():
            yield genre, path


def load_jsonl(path):
    """Parse a JSON Lines file and return its entries as a list.

    Blank (whitespace-only) lines are skipped. A line that is not valid JSON
    propagates ``json.JSONDecodeError`` from ``json.loads``.
    """
    entries = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries


def validate_entry(entry):
    """Check one scene-description entry and return a list of error strings.

    An empty list means the entry passed every check:

    * every name in REQUIRED_TOP_FIELDS is present and, except for ``beat``,
      truthy;
    * ``beat`` is an int (not a bool) that is >= 1;
    * ``timestamp`` is a string of the form ``M:SS``;
    * ``scene`` is a dict containing every name in REQUIRED_SCENE_FIELDS,
      with ``colors`` a non-empty list and string-valued fields non-blank.
    """
    if not isinstance(entry, dict):
        # Membership tests and indexing below assume a mapping.
        return ["entry must be an object"]

    errors = []

    for field in REQUIRED_TOP_FIELDS:
        if field not in entry:
            errors.append(f"missing top-level: {field}")
        elif not entry[field] and field != "beat":
            # "beat" is excluded here because it gets its own range check.
            errors.append(f"empty top-level: {field}")

    if "beat" in entry:
        beat = entry["beat"]
        # bool is a subclass of int, so a JSON `true` must be rejected explicitly.
        if isinstance(beat, bool) or not isinstance(beat, int) or beat < 1:
            errors.append(f"beat must be int >= 1, got {beat}")

    if "timestamp" in entry and not _is_valid_timestamp(entry["timestamp"]):
        errors.append(f"bad timestamp: {entry['timestamp']}")

    if "scene" not in entry:
        errors.append("missing scene object")
    elif not isinstance(entry["scene"], dict):
        errors.append("scene must be an object")
    else:
        scene = entry["scene"]
        for sf in REQUIRED_SCENE_FIELDS:
            if sf not in scene:
                errors.append(f"missing scene.{sf}")
                continue
            value = scene[sf]
            if sf == "colors":
                if not isinstance(value, list):
                    errors.append("scene.colors must be a list")
                elif not value:
                    errors.append("scene.colors is empty")
            elif isinstance(value, str) and not value.strip():
                errors.append(f"scene.{sf} is empty")

    return errors


class TestAllGenresPresent(unittest.TestCase):
    """Each required genre file must exist."""

    def test_genre_files_exist(self):
        for genre in REQUIRED_GENRES:
            # One sub-test per genre so every missing file is reported.
            with self.subTest(genre=genre):
                path = _genre_path(genre)
                self.assertTrue(path.exists(), f"Missing file: {path.name}")


class TestEntryCount(unittest.TestCase):
    """Each genre file must have at least 100 entries."""

    def test_minimum_entries(self):
        for genre in REQUIRED_GENRES:
            with self.subTest(genre=genre):
                path = _genre_path(genre)
                if not path.exists():
                    self.fail(f"Missing: {path.name}")
                entries = load_jsonl(path)
                self.assertGreaterEqual(
                    len(entries),
                    MIN_ENTRIES,
                    f"{genre}: only {len(entries)} entries (need {MIN_ENTRIES})",
                )


class TestSchemaCompliance(unittest.TestCase):
    """Every entry in every genre file must pass schema validation."""

    def test_all_entries_valid(self):
        failures = []
        for genre in REQUIRED_GENRES:
            path = _genre_path(genre)
            if not path.exists():
                failures.append(f"{genre}: file missing")
                continue
            for line_no, entry in enumerate(load_jsonl(path), start=1):
                failures.extend(
                    f"{genre} line {line_no}: {err}" for err in validate_entry(entry)
                )
        # Only the first 20 violations are shown to keep the report readable.
        self.assertEqual(
            failures, [], "Schema violations:\n" + "\n".join(failures[:20])
        )


class TestArtistAndTimestamp(unittest.TestCase):
    """Every entry must have non-empty artist and valid timestamp."""

    def test_artists_present(self):
        # Missing files are skipped here; TestAllGenresPresent reports them.
        for genre, path in _existing_genre_files():
            with self.subTest(genre=genre):
                for line_no, entry in enumerate(load_jsonl(path), start=1):
                    self.assertIn(
                        "artist", entry, f"{genre} line {line_no}: missing artist"
                    )
                    artist = entry["artist"]
                    self.assertTrue(
                        isinstance(artist, str) and artist.strip(),
                        f"{genre} line {line_no}: empty artist",
                    )

    def test_timestamps_valid(self):
        for genre, path in _existing_genre_files():
            with self.subTest(genre=genre):
                for line_no, entry in enumerate(load_jsonl(path), start=1):
                    ts = entry.get("timestamp", "")
                    # The helper type-checks first, so a non-string timestamp
                    # is a test failure rather than a TypeError.
                    self.assertTrue(
                        _is_valid_timestamp(ts),
                        f"{genre} line {line_no}: bad timestamp '{ts}'",
                    )


class TestSceneFields(unittest.TestCase):
    """Scene objects must have all required fields with valid values."""

    def test_scene_completeness(self):
        for genre, path in _existing_genre_files():
            with self.subTest(genre=genre):
                for line_no, entry in enumerate(load_jsonl(path), start=1):
                    where = f"{genre} line {line_no}"
                    scene = entry.get("scene", {})
                    # Without this, a string scene could satisfy assertIn via
                    # substring matching.
                    self.assertIsInstance(
                        scene, dict, f"{where}: scene not an object"
                    )
                    for field in REQUIRED_SCENE_FIELDS:
                        self.assertIn(
                            field, scene, f"{where}: missing scene.{field}"
                        )
                    self.assertIsInstance(
                        scene["colors"], list, f"{where}: colors not array"
                    )
                    self.assertGreater(
                        len(scene["colors"]), 0, f"{where}: empty colors"
                    )
                    self.assertIsInstance(
                        scene["description"],
                        str,
                        f"{where}: description not a string",
                    )
                    self.assertGreaterEqual(
                        len(scene["description"]),
                        10,
                        f"{where}: description too short",
                    )


if __name__ == "__main__":
    unittest.main()