# timmy-config/tests/test_scene_genre_completeness.py

#!/usr/bin/env python3
"""
Test that each of the 9 genre scene-description files has at least 100 valid entries (#645).
"""

import json
import re
import unittest
from pathlib import Path

# The "training-data" directory that sits beside this file's own directory
# (resolved path of this file -> its directory -> that directory's parent).
DATA_DIR = Path(__file__).resolve().parent.parent / "training-data"
# Genre slugs; each one is interpolated into the fixture file name
# "scene-descriptions-{genre}.jsonl" by the tests below.
REQUIRED_GENRES = [
    "rock", "hip-hop", "electronic", "r&b-soul", "country",
    "jazz", "classical", "metal", "latin",
]
# Keys validate_entry() requires on every entry object.
REQUIRED_TOP_FIELDS = ["song", "artist", "beat", "timestamp", "lyric_line", "scene"]
# Keys required inside each entry's "scene" object.
REQUIRED_SCENE_FIELDS = ["mood", "colors", "composition", "description"]
# Minimum number of entries each genre file must contain.
MIN_ENTRIES = 100


def load_jsonl(path):
    """Parse a JSON Lines file into a list of decoded values.

    Every line is stripped of surrounding whitespace. Lines that are empty
    after stripping are ignored; each remaining line is decoded with
    ``json.loads``.

    Args:
        path: Location of the UTF-8 encoded file to read.

    Returns:
        The decoded values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]


# "<minutes>:<seconds>" with seconds limited to 00-59. Always used with
# fullmatch(), so no anchors are needed and a trailing newline is rejected
# (a `$`-anchored match() would accept "1:23\n").
_TIMESTAMP_RE = re.compile(r"[0-9]+:[0-5][0-9]")


def _validate_scene(scene):
    """Return the list of error strings for an entry's ``scene`` dict."""
    errors = []
    for name in REQUIRED_SCENE_FIELDS:
        if name not in scene:
            errors.append(f"missing scene.{name}")
            continue
        value = scene[name]
        if name == "colors":
            if not isinstance(value, list):
                errors.append(
                    f"scene.colors must be a list, got {type(value).__name__}"
                )
            elif not value:
                errors.append("scene.colors is empty")
        elif isinstance(value, str) and not value.strip():
            errors.append(f"scene.{name} is empty")
    return errors


def validate_entry(entry):
    """Validate one decoded JSONL entry against the scene-description schema.

    Checks performed:
      * every key in ``REQUIRED_TOP_FIELDS`` is present and, except for
        ``beat``, truthy;
      * ``beat`` is an ``int`` (not a ``bool``) that is >= 1;
      * ``timestamp``, converted with ``str()``, has the form ``M+:SS`` with
        seconds in 00-59;
      * ``scene`` is a dict containing every key in ``REQUIRED_SCENE_FIELDS``,
        with a non-empty list for ``colors`` and no blank string values.

    Args:
        entry: A value decoded from one JSONL line; expected to be a dict.

    Returns:
        A list of human-readable error strings; empty when the entry is valid.
    """
    # A non-object line (list, number, string, ...) cannot be checked key by
    # key; report it once instead of raising on the lookups below.
    if not isinstance(entry, dict):
        return [f"entry must be an object, got {type(entry).__name__}"]

    errors = []
    for field in REQUIRED_TOP_FIELDS:
        if field not in entry:
            errors.append(f"missing top-level: {field}")
        elif not entry[field] and field != "beat":
            # beat is exempt here: its value gets a dedicated range check.
            errors.append(f"empty top-level: {field}")

    if "beat" in entry:
        beat = entry["beat"]
        # bool is a subclass of int, so JSON `true` would otherwise pass as 1.
        if isinstance(beat, bool) or not isinstance(beat, int) or beat < 1:
            errors.append(f"beat must be int >= 1, got {beat}")

    if "timestamp" in entry:
        if not _TIMESTAMP_RE.fullmatch(str(entry["timestamp"])):
            errors.append(f"bad timestamp: {entry['timestamp']}")

    if "scene" not in entry:
        errors.append("missing scene object")
    elif not isinstance(entry["scene"], dict):
        # Previously a present-but-non-dict scene produced no scene error.
        errors.append(
            f"scene must be an object, got {type(entry['scene']).__name__}"
        )
    else:
        errors.extend(_validate_scene(entry["scene"]))
    return errors


class TestAllGenresPresent(unittest.TestCase):
    """Each required genre file must exist."""

    def test_genre_files_exist(self):
        """Check that every genre's JSONL file is present under DATA_DIR.

        Each genre is checked in its own subTest so that one run reports
        every missing file instead of stopping at the first.
        """
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            with self.subTest(genre=genre):
                self.assertTrue(path.exists(), f"Missing file: {path.name}")


class TestEntryCount(unittest.TestCase):
    """Each genre file must have at least 100 entries."""

    def test_minimum_entries(self):
        """Check that every genre file holds at least MIN_ENTRIES entries.

        A missing file is reported as a failure for that genre. Each genre
        runs in its own subTest so one short or missing file does not hide
        problems in the remaining genres.
        """
        for genre in REQUIRED_GENRES:
            with self.subTest(genre=genre):
                path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
                if not path.exists():
                    # fail() raises, so nothing after it runs for this genre.
                    self.fail(f"Missing: {path.name}")
                entries = load_jsonl(path)
                self.assertGreaterEqual(
                    len(entries), MIN_ENTRIES,
                    f"{genre}: only {len(entries)} entries (need {MIN_ENTRIES})",
                )


class TestSchemaCompliance(unittest.TestCase):
    """Every entry in every genre file must pass schema validation."""

    def test_all_entries_valid(self):
        """Collect every validate_entry() error across all genres, then assert none.

        A missing genre file is recorded as a violation. The assertion
        message lists at most the first 20 collected violations.
        """
        problems = []
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if path.exists():
                for line_no, entry in enumerate(load_jsonl(path), start=1):
                    problems.extend(
                        f"{genre} line {line_no}: {err}"
                        for err in validate_entry(entry)
                    )
            else:
                problems.append(f"{genre}: file missing")
        report = "Schema violations:\n" + "\n".join(problems[:20])
        self.assertEqual(problems, [], report)


class TestArtistAndTimestamp(unittest.TestCase):
    """Every entry must have non-empty artist and valid timestamp.

    Genre files that do not exist are skipped by these tests; their absence
    is reported by TestAllGenresPresent.
    """

    # "<minutes>:<seconds>" with seconds limited to 00-59. Used with
    # fullmatch(), so a trailing newline is not tolerated.
    TIMESTAMP_RE = re.compile(r"[0-9]+:[0-5][0-9]")

    def test_artists_present(self):
        """Check that each entry has an ``artist`` that is a non-blank string."""
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if not path.exists():
                continue
            # One subTest per genre so a bad entry in one file does not hide
            # problems in the others.
            with self.subTest(genre=genre):
                for line_no, entry in enumerate(load_jsonl(path), start=1):
                    self.assertIn(
                        "artist", entry, f"{genre} line {line_no}: missing artist"
                    )
                    self.assertTrue(
                        isinstance(entry["artist"], str) and entry["artist"].strip(),
                        f"{genre} line {line_no}: empty artist",
                    )

    def test_timestamps_valid(self):
        """Check that each entry's ``timestamp`` is a string of the form M+:SS."""
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if not path.exists():
                continue
            with self.subTest(genre=genre):
                for line_no, entry in enumerate(load_jsonl(path), start=1):
                    ts = entry.get("timestamp", "")
                    # Guard the type first: passing a non-string to the regex
                    # would raise TypeError instead of a located failure.
                    self.assertTrue(
                        isinstance(ts, str) and self.TIMESTAMP_RE.fullmatch(ts),
                        f"{genre} line {line_no}: bad timestamp '{ts}'",
                    )


class TestSceneFields(unittest.TestCase):
    """Scene objects must have all required fields with valid values."""

    def test_scene_completeness(self):
        """Check each entry's ``scene`` for required keys and usable values.

        For every entry in every existing genre file: ``scene`` must be a
        dict containing every key in REQUIRED_SCENE_FIELDS, ``colors`` must
        be a non-empty list, and ``description`` must be a string of at
        least 10 characters. Missing genre files are skipped.
        """
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if not path.exists():
                continue
            # One subTest per genre so a bad entry in one file does not hide
            # problems in the others.
            with self.subTest(genre=genre):
                for line_no, entry in enumerate(load_jsonl(path), start=1):
                    where = f"{genre} line {line_no}"
                    scene = entry.get("scene", {})
                    # Without this, a string scene would turn the key checks
                    # into substring checks and None would raise TypeError.
                    self.assertIsInstance(scene, dict, f"{where}: scene not object")
                    for field in REQUIRED_SCENE_FIELDS:
                        self.assertIn(field, scene, f"{where}: missing scene.{field}")
                    self.assertIsInstance(
                        scene["colors"], list, f"{where}: colors not array"
                    )
                    self.assertGreater(
                        len(scene["colors"]), 0, f"{where}: empty colors"
                    )
                    # len() on a non-string (e.g. null or a number) would
                    # raise or measure the wrong thing.
                    self.assertIsInstance(
                        scene["description"], str, f"{where}: description not string"
                    )
                    self.assertGreaterEqual(
                        len(scene["description"]), 10,
                        f"{where}: description too short",
                    )


# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()