Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 24s
Smoke Test / smoke (pull_request) Failing after 44s
Validate Config / YAML Lint (pull_request) Failing after 31s
Validate Config / JSON Validate (pull_request) Successful in 36s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 57s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 23s
Validate Config / Cron Syntax Check (pull_request) Successful in 4s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 5s
Validate Config / Playbook Schema Validation (pull_request) Successful in 7s
PR Checklist / pr-checklist (pull_request) Failing after 12m4s
Architecture Lint / Lint Repository (pull_request) Failing after 24s
155 lines
5.7 KiB
Python
155 lines
5.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test that all 9 genre scene description files have 100 valid entries (#645).
|
|
"""
|
|
|
|
import json
|
|
import unittest
|
|
from pathlib import Path
|
|
|
|
# Directory holding the JSONL data files: "training-data" under the
# grandparent directory of this file (a sibling of this file's own folder).
DATA_DIR = Path(__file__).resolve().parent.parent / "training-data"
# Genre slugs; each one is interpolated into the file name
# "scene-descriptions-{genre}.jsonl" by the tests below.
REQUIRED_GENRES = [
    "rock", "hip-hop", "electronic", "r&b-soul", "country",
    "jazz", "classical", "metal", "latin",
]
# Keys every entry must carry at the top level.
REQUIRED_TOP_FIELDS = ["song", "artist", "beat", "timestamp", "lyric_line", "scene"]
# Keys every entry's "scene" object must carry.
REQUIRED_SCENE_FIELDS = ["mood", "colors", "composition", "description"]
# Minimum number of entries each genre file must contain.
MIN_ENTRIES = 100
|
|
|
|
|
|
def load_jsonl(path):
    """Load a JSON Lines file into a list of decoded values.

    Blank and whitespace-only lines are skipped.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: A line is not valid JSON. The message is
            prefixed with the file path and the 1-based line number so the
            offending record can be located without re-parsing the file.
        OSError: The file cannot be opened.
    """
    entries = []
    with open(path, "r", encoding="utf-8") as f:
        for lineno, raw in enumerate(f, start=1):
            text = raw.strip()
            if not text:
                continue
            try:
                entries.append(json.loads(text))
            except json.JSONDecodeError as exc:
                # Same exception type as before, so existing handlers still
                # match; only the message gains file/line context.
                raise json.JSONDecodeError(
                    f"{path} line {lineno}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return entries
|
|
|
|
|
|
def validate_entry(entry, top_fields=None, scene_fields=None):
    """Check one scene-description entry and return its schema violations.

    Args:
        entry: Decoded JSON value for a single line of a genre file.
        top_fields: Keys that must be present and non-empty at the top
            level. Defaults to ``REQUIRED_TOP_FIELDS``.
        scene_fields: Keys that must be present inside ``entry["scene"]``.
            Defaults to ``REQUIRED_SCENE_FIELDS``.

    Returns:
        A list of human-readable error strings; empty when the entry is
        valid.
    """
    import re  # function-scope: this module has no file-level ``re`` import

    if top_fields is None:
        top_fields = REQUIRED_TOP_FIELDS
    if scene_fields is None:
        scene_fields = REQUIRED_SCENE_FIELDS

    # A JSONL line may decode to any JSON value; report it instead of
    # crashing on the membership tests below.
    if not isinstance(entry, dict):
        return [f"entry must be an object, got {type(entry).__name__}"]

    errors = []
    for field in top_fields:
        if field not in entry:
            errors.append(f"missing top-level: {field}")
        elif not entry[field] and field != "beat":
            # "beat" is excluded here because it gets its own range check.
            errors.append(f"empty top-level: {field}")

    if "beat" in entry:
        beat = entry["beat"]
        # bool is a subclass of int, so JSON ``true`` would otherwise pass
        # as the integer 1.
        if isinstance(beat, bool) or not isinstance(beat, int) or beat < 1:
            errors.append(f"beat must be int >= 1, got {beat}")

    if "timestamp" in entry:
        timestamp = entry["timestamp"]
        # fullmatch, unlike match() with "$", also rejects a trailing newline.
        if not isinstance(timestamp, str) or not re.fullmatch(
            r"[0-9]+:[0-5][0-9]", timestamp
        ):
            errors.append(f"bad timestamp: {timestamp}")

    if "scene" not in entry:
        errors.append("missing scene object")
        return errors

    scene = entry["scene"]
    if not isinstance(scene, dict):
        errors.append(f"scene must be an object, got {type(scene).__name__}")
        return errors

    for sf in scene_fields:
        if sf not in scene:
            errors.append(f"missing scene.{sf}")
            continue
        value = scene[sf]
        if sf == "colors":
            if not isinstance(value, list):
                errors.append(
                    f"scene.colors must be a list, got {type(value).__name__}"
                )
            elif not value:
                errors.append("scene.colors is empty")
        elif isinstance(value, str) and not value.strip():
            errors.append(f"scene.{sf} is empty")
    return errors
|
|
|
|
|
|
class TestAllGenresPresent(unittest.TestCase):
    """Each required genre file must exist."""

    def test_genre_files_exist(self):
        """Check DATA_DIR for one scene-description file per required genre."""
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            # subTest lets the loop continue after a failure, so every
            # missing genre is reported in one run instead of only the first.
            with self.subTest(genre=genre):
                self.assertTrue(path.exists(), f"Missing file: {path.name}")
|
|
|
|
|
|
class TestEntryCount(unittest.TestCase):
    """Each genre file must have at least MIN_ENTRIES entries."""

    def test_minimum_entries(self):
        """Load every genre file and compare its entry count to MIN_ENTRIES."""
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            # subTest isolates each genre, so one short or missing file does
            # not hide problems in the remaining genres.
            with self.subTest(genre=genre):
                if not path.exists():
                    # fail() raises, so nothing after it runs for this genre.
                    self.fail(f"Missing: {path.name}")
                entries = load_jsonl(path)
                self.assertGreaterEqual(
                    len(entries), MIN_ENTRIES,
                    f"{genre}: only {len(entries)} entries (need {MIN_ENTRIES})",
                )
|
|
|
|
|
|
class TestSchemaCompliance(unittest.TestCase):
    """Run validate_entry over every entry of every genre file."""

    def test_all_entries_valid(self):
        """Gather all violations first, then assert that none were found."""
        problems = []
        for genre in REQUIRED_GENRES:
            genre_file = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if genre_file.exists():
                for number, record in enumerate(load_jsonl(genre_file), start=1):
                    problems.extend(
                        f"{genre} line {number}: {issue}"
                        for issue in validate_entry(record)
                    )
            else:
                problems.append(f"{genre}: file missing")
        # Only the first 20 violations are spelled out in the failure message.
        summary = "Schema violations:\n" + "\n".join(problems[:20])
        self.assertEqual(problems, [], summary)
|
|
|
|
|
|
class TestArtistAndTimestamp(unittest.TestCase):
    """Every entry must have non-empty artist and valid timestamp."""

    def test_artists_present(self):
        """Each entry needs an "artist" key holding a non-blank string."""
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if not path.exists():
                # Missing files are skipped here rather than failed.
                continue
            for i, entry in enumerate(load_jsonl(path), start=1):
                self.assertIn("artist", entry, f"{genre} line {i}: missing artist")
                artist = entry["artist"]
                self.assertTrue(
                    isinstance(artist, str) and artist.strip(),
                    f"{genre} line {i}: empty artist",
                )

    def test_timestamps_valid(self):
        """Each entry's timestamp must be a string shaped like M:SS."""
        import re  # method-scope: this module has no file-level ``re`` import

        # Compiled once, outside the loops; fullmatch (unlike match with "$")
        # also rejects a trailing newline.
        pattern = re.compile(r"[0-9]+:[0-5][0-9]")
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if not path.exists():
                continue
            for i, entry in enumerate(load_jsonl(path), start=1):
                ts = entry.get("timestamp", "")
                # The isinstance guard turns a non-string timestamp into an
                # assertion failure with a location, not a TypeError from re.
                self.assertTrue(
                    isinstance(ts, str) and pattern.fullmatch(ts) is not None,
                    f"{genre} line {i}: bad timestamp '{ts}'",
                )
|
|
|
|
|
|
class TestSceneFields(unittest.TestCase):
    """Scene objects must have all required fields with valid values."""

    def test_scene_completeness(self):
        """Check field presence, the colors list and the description length."""
        for genre in REQUIRED_GENRES:
            path = DATA_DIR / f"scene-descriptions-{genre}.jsonl"
            if not path.exists():
                # Missing files are skipped here rather than failed.
                continue
            for i, entry in enumerate(load_jsonl(path), start=1):
                where = f"{genre} line {i}"
                scene = entry.get("scene", {})
                # Without this guard a string scene would turn assertIn into
                # a substring test and scene["colors"] into a TypeError.
                self.assertIsInstance(scene, dict, f"{where}: scene not object")
                for field in REQUIRED_SCENE_FIELDS:
                    self.assertIn(field, scene, f"{where}: missing scene.{field}")
                self.assertIsInstance(scene["colors"], list, f"{where}: colors not array")
                self.assertGreater(len(scene["colors"]), 0, f"{where}: empty colors")
                description = scene.get("description", "")
                # len() of a list or dict would otherwise satisfy the length
                # check, and len(None) would raise instead of failing cleanly.
                self.assertIsInstance(
                    description, str, f"{where}: description not string"
                )
                self.assertGreaterEqual(
                    len(description), 10,
                    f"{where}: description too short",
                )
|
|
|
|
|
|
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|