Files
timmy-config/tests/test_scene_genre_completeness.py
Alexander Whitestone eacc670681
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 24s
Smoke Test / smoke (pull_request) Failing after 44s
Validate Config / YAML Lint (pull_request) Failing after 31s
Validate Config / JSON Validate (pull_request) Successful in 36s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 57s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 23s
Validate Config / Cron Syntax Check (pull_request) Successful in 4s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 5s
Validate Config / Playbook Schema Validation (pull_request) Successful in 7s
PR Checklist / pr-checklist (pull_request) Failing after 12m4s
Architecture Lint / Lint Repository (pull_request) Failing after 24s
test: validate all 9 genre scene files have 100 valid entries (#645)
2026-04-21 11:20:25 +00:00

155 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""
Test that all 9 genre scene description files have 100 valid entries (#645).
"""
import json
import unittest
from pathlib import Path
# Directory holding the per-genre JSONL files: "training-data", located next
# to the directory that contains this test module.
DATA_DIR = Path(__file__).resolve().parent.parent.joinpath("training-data")

# Genre slugs that must each have a scene-description file.
REQUIRED_GENRES = [
    "rock",
    "hip-hop",
    "electronic",
    "r&b-soul",
    "country",
    "jazz",
    "classical",
    "metal",
    "latin",
]

# Keys every record must carry at the top level.
REQUIRED_TOP_FIELDS = [
    "song",
    "artist",
    "beat",
    "timestamp",
    "lyric_line",
    "scene",
]

# Keys every record's "scene" object must carry.
REQUIRED_SCENE_FIELDS = [
    "mood",
    "colors",
    "composition",
    "description",
]

# Smallest acceptable number of records per genre file.
MIN_ENTRIES = 100
def load_jsonl(path):
    """Parse a JSON Lines file into a list of decoded values.

    Lines that are empty or contain only whitespace are skipped.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        The decoded JSON values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with ``path`` and the 1-based physical line
            number so the offending record can be located in the data file.
    """
    entries = []
    with open(path, "r", encoding="utf-8") as f:
        for line_no, raw in enumerate(f, start=1):
            text = raw.strip()
            if not text:
                continue
            try:
                entries.append(json.loads(text))
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type so existing handlers keep
                # working, but say which file and line was malformed.
                raise json.JSONDecodeError(
                    f"{path}:{line_no}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return entries
def validate_entry(entry):
    """Check one scene-description record and list its schema violations.

    Args:
        entry: A decoded JSON object (dict) for a single record.

    Returns:
        A list of human-readable error strings; empty when the record
        passes every check. One record may yield several errors.
    """
    # Imported locally because the module header does not import ``re``.
    import re

    errors = []

    # Presence / non-emptiness of top-level keys. "beat" is exempt from the
    # emptiness check here because it gets its own numeric check below.
    for field in REQUIRED_TOP_FIELDS:
        if field not in entry:
            errors.append(f"missing top-level: {field}")
        elif not entry[field] and field != "beat":
            errors.append(f"empty top-level: {field}")

    if "beat" in entry:
        beat = entry["beat"]
        # bool is a subclass of int, so True would otherwise count as beat 1.
        if isinstance(beat, bool) or not isinstance(beat, int) or beat < 1:
            errors.append(f"beat must be int >= 1, got {beat}")

    if "timestamp" in entry:
        # fullmatch instead of match + "$": "$" also matches just before a
        # trailing newline, which would let "1:23\n" through.
        if not re.fullmatch(r"[0-9]+:[0-5][0-9]", str(entry["timestamp"])):
            errors.append(f"bad timestamp: {entry['timestamp']}")

    if "scene" not in entry:
        errors.append("missing scene object")
    elif not isinstance(entry["scene"], dict):
        # A truthy non-dict (e.g. a string) must not pass silently.
        errors.append("scene must be an object")
    else:
        scene = entry["scene"]
        for sf in REQUIRED_SCENE_FIELDS:
            if sf not in scene:
                errors.append(f"missing scene.{sf}")
            elif sf == "colors":
                if not isinstance(scene[sf], list):
                    errors.append("scene.colors must be a list")
                elif len(scene[sf]) == 0:
                    errors.append("scene.colors is empty")
            elif isinstance(scene[sf], str) and not scene[sf].strip():
                errors.append(f"scene.{sf} is empty")

    return errors
class TestAllGenresPresent(unittest.TestCase):
    """Each required genre file must exist."""

    def test_genre_files_exist(self):
        """Assert that DATA_DIR holds one JSONL file per required genre."""
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            # subTest keeps checking the remaining genres after a failure,
            # so one run reports every missing file rather than the first.
            with self.subTest(genre=genre):
                self.assertTrue(path.exists(), f"Missing file: {path.name}")
class TestEntryCount(unittest.TestCase):
    """Each genre file must have at least MIN_ENTRIES entries."""

    def test_minimum_entries(self):
        """Assert every genre file exists and has enough records."""
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            # Each genre is its own subTest: a missing or short file is
            # reported without stopping the checks for the other genres.
            with self.subTest(genre=genre):
                if not path.exists():
                    # fail() raises, so nothing after it runs for this genre.
                    self.fail(f"Missing: {path.name}")
                entries = load_jsonl(path)
                self.assertGreaterEqual(
                    len(entries),
                    MIN_ENTRIES,
                    f"{genre}: only {len(entries)} entries (need {MIN_ENTRIES})",
                )
class TestSchemaCompliance(unittest.TestCase):
    """All records across all genre files have to satisfy the schema."""

    def test_all_entries_valid(self):
        """Gather every violation first, then assert that none were found."""
        violations = []
        for genre in REQUIRED_GENRES:
            source = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if source.exists():
                for line_no, record in enumerate(load_jsonl(source), start=1):
                    violations.extend(
                        f"{genre} line {line_no}: {problem}"
                        for problem in validate_entry(record)
                    )
            else:
                violations.append(f"{genre}: file missing")

        # Only the first 20 violations go into the custom message.
        report = "Schema violations:\n" + "\n".join(violations[:20])
        self.assertEqual(violations, [], report)
class TestArtistAndTimestamp(unittest.TestCase):
    """Every entry must have non-empty artist and valid timestamp."""

    def test_artists_present(self):
        """Assert each record has a non-blank string under "artist"."""
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if not path.exists():
                # Missing files are skipped here rather than failed.
                continue
            # One subTest per genre so a bad file does not hide the others.
            with self.subTest(genre=genre):
                for i, entry in enumerate(load_jsonl(path), start=1):
                    self.assertIn(
                        "artist", entry, f"{genre} line {i}: missing artist"
                    )
                    artist = entry["artist"]
                    self.assertTrue(
                        isinstance(artist, str) and artist.strip(),
                        f"{genre} line {i}: empty artist",
                    )

    def test_timestamps_valid(self):
        """Assert each record's "timestamp" is a string shaped like M:SS."""
        # Imported locally because the module header does not import ``re``.
        import re

        # Compiled once; fullmatch rejects a trailing newline that
        # match() with "$" would accept.
        pattern = re.compile(r"[0-9]+:[0-5][0-9]")
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if not path.exists():
                continue
            with self.subTest(genre=genre):
                for i, entry in enumerate(load_jsonl(path), start=1):
                    ts = entry.get("timestamp", "")
                    # The isinstance guard turns a non-string timestamp into
                    # an assertion failure instead of a TypeError from re.
                    self.assertTrue(
                        isinstance(ts, str) and pattern.fullmatch(ts) is not None,
                        f"{genre} line {i}: bad timestamp '{ts}'",
                    )
class TestSceneFields(unittest.TestCase):
    """Scene objects must have all required fields with valid values."""

    def test_scene_completeness(self):
        """Assert each record's scene has every field, with usable values."""
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if not path.exists():
                # Missing files are skipped here rather than failed.
                continue
            # One subTest per genre so a bad file does not hide the others.
            with self.subTest(genre=genre):
                for i, entry in enumerate(load_jsonl(path), start=1):
                    where = f"{genre} line {i}"
                    scene = entry.get("scene", {})
                    # Without this, a string scene would be substring-matched
                    # by assertIn and a None scene would raise TypeError.
                    self.assertIsInstance(
                        scene, dict, f"{where}: scene not object"
                    )
                    for field in REQUIRED_SCENE_FIELDS:
                        self.assertIn(
                            field, scene, f"{where}: missing scene.{field}"
                        )
                    self.assertIsInstance(
                        scene["colors"], list, f"{where}: colors not array"
                    )
                    self.assertGreater(
                        len(scene["colors"]), 0, f"{where}: empty colors"
                    )
                    description = scene.get("description", "")
                    # len() on a non-string (e.g. null) would raise TypeError.
                    self.assertIsInstance(
                        description, str, f"{where}: description not string"
                    )
                    self.assertGreaterEqual(
                        len(description),
                        10,
                        f"{where}: description too short",
                    )
# Run the suite through unittest's command-line runner when this file is
# executed directly instead of being imported by a test collector.
if __name__ == "__main__":
    unittest.main()