"""
Tests for training/data/scene-descriptions/validate.py — Quality Gate

Covers:
- Unit tests for each validation type
- Rejection workflow (invalid entries rejected)
- Statistics tracking (line_count, valid_count, error_count)
- Integration tests with sample JSONL content
"""

import json
import os
import sys
import tempfile
import unittest
from pathlib import Path

# Put the directory that contains validate.py on sys.path so it can be imported
VALIDATE_DIR = Path(__file__).parent.parent / "training" / "data" / "scene-descriptions"
sys.path.insert(0, str(VALIDATE_DIR))

from validate import validate_entry_manual, validate_file


class TestValidateEntryManual(unittest.TestCase):
    """Field-level checks on validate_entry_manual, one entry at a time.

    Apart from the fully populated entry in ``test_valid_entry``, every case
    starts from the same minimal baseline record and changes exactly one
    thing, so the test body shows only the field under scrutiny.
    """

    def _errors_for(self, drop=(), scene_drop=(), scene_changes=None, **changes):
        """Validate a baseline entry after applying the requested tweaks.

        drop          -- top-level keys to remove from the entry
        scene_drop    -- keys to remove from the nested scene object
        scene_changes -- replacement values for existing scene keys
        changes       -- replacement values for existing top-level keys

        Returns the list produced by validate_entry_manual(entry, 1).
        """
        scene = {
            "mood": "m",
            "colors": ["c"],
            "composition": "c",
            "camera": "c",
            "description": "a" * 20,
        }
        if scene_changes:
            scene.update(scene_changes)
        for key in scene_drop:
            del scene[key]

        record = {
            "song": "S",
            "artist": "A",
            "beat": 1,
            "timestamp": "0:00",
            "duration_seconds": 30,
            "lyric_line": "x",
            "scene": scene,
        }
        # Updating existing keys keeps their position, so key order is stable.
        record.update(changes)
        for key in drop:
            del record[key]

        return validate_entry_manual(record, 1)

    def _assert_mentions(self, messages, *needles):
        """Pass when at least one message contains every needle."""
        hit = any(all(n in msg for n in needles) for msg in messages)
        self.assertTrue(hit)

    def _assert_no_mention(self, messages, needle):
        """Pass when no message contains the needle."""
        hit = any(needle in msg for msg in messages)
        self.assertFalse(hit)

    def test_valid_entry(self):
        scene = {
            "mood": "hope",
            "colors": ["gold", "blue"],
            "composition": "wide shot",
            "camera": "static",
            "description": "A golden field stretching to the horizon at dawn",
        }
        entry = {
            "song": "Test Song",
            "artist": "Test Artist",
            "beat": 1,
            "timestamp": "0:30",
            "duration_seconds": 30,
            "lyric_line": "A valid lyric line here",
            "scene": scene,
        }
        errors = validate_entry_manual(entry, 1)
        self.assertEqual(errors, [], f"Valid entry should have no errors: {errors}")

    # ── Required fields ──────────────────────────────────────
    def test_missing_song(self):
        self._assert_mentions(self._errors_for(drop=("song",)), "song")

    def test_missing_artist(self):
        self._assert_mentions(self._errors_for(drop=("artist",)), "artist")

    def test_missing_beat(self):
        self._assert_mentions(self._errors_for(drop=("beat",)), "beat")

    def test_missing_scene(self):
        self._assert_mentions(self._errors_for(drop=("scene",)), "scene")

    # ── Type checks ──────────────────────────────────────────
    def test_song_empty_string(self):
        self._assert_mentions(self._errors_for(song=""), "song", "empty")

    def test_beat_not_positive(self):
        self._assert_mentions(self._errors_for(beat=0), "beat")

    def test_beat_string_rejected(self):
        self._assert_mentions(self._errors_for(beat="one"), "beat")

    # ── Timestamp format ─────────────────────────────────────
    def test_timestamp_valid_mss(self):
        self._assert_no_mention(self._errors_for(timestamp="0:30"), "timestamp")

    def test_timestamp_valid_mmss(self):
        self._assert_no_mention(self._errors_for(timestamp="12:45"), "timestamp")

    def test_timestamp_invalid_no_colon(self):
        self._assert_mentions(self._errors_for(timestamp="90"), "timestamp")

    def test_timestamp_invalid_letters(self):
        self._assert_mentions(self._errors_for(timestamp="ab:cd"), "timestamp")

    # ── Scene validation ─────────────────────────────────────
    def test_scene_missing_mood(self):
        self._assert_mentions(self._errors_for(scene_drop=("mood",)), "mood")

    def test_scene_missing_colors(self):
        self._assert_mentions(self._errors_for(scene_drop=("colors",)), "colors")

    def test_scene_colors_empty_array(self):
        found = self._errors_for(scene_changes={"colors": []})
        self._assert_mentions(found, "colors", "non-empty")

    def test_scene_colors_too_many(self):
        palette = ["a", "b", "c", "d", "e", "f"]
        found = self._errors_for(scene_changes={"colors": palette})
        self._assert_mentions(found, "colors", "max 5")

    def test_scene_description_too_short(self):
        found = self._errors_for(scene_changes={"description": "short"})
        self._assert_mentions(found, "description", "too short")

    def test_scene_not_dict(self):
        found = self._errors_for(scene="not a dict")
        self._assert_mentions(found, "scene", "object")


class TestValidateFile(unittest.TestCase):
    """Integration tests — validate_file with temp JSONL content.

    Temp files are created with ``delete=False`` so that validate_file can
    reopen them by path. Their removal is registered with ``addCleanup`` as
    soon as they exist, so they are deleted even when validate_file raises
    or an assertion fails.
    """

    def _write_temp_lines(self, lines):
        """Write raw text lines to a temp .jsonl file and return its path.

        Each item of ``lines`` is written followed by a newline. The file is
        scheduled for deletion at the end of the current test.
        """
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.jsonl', delete=False, encoding='utf-8'
        ) as f:
            # Register cleanup before writing so a failed write cannot leak the file.
            self.addCleanup(os.unlink, f.name)
            f.writelines(line + '\n' for line in lines)
        return f.name

    def _write_temp_jsonl(self, entries):
        """Write entries to a temp JSONL file and return path."""
        return self._write_temp_lines(json.dumps(entry) for entry in entries)

    def _valid_entry(self, **overrides):
        """Return a fully valid entry; keyword arguments replace top-level fields."""
        base = {
            "song": "Test Song",
            "artist": "Test Artist",
            "beat": 1,
            "timestamp": "0:30",
            "duration_seconds": 30,
            "lyric_line": "A valid lyric line",
            "scene": {
                "mood": "hope",
                "colors": ["gold", "blue"],
                "composition": "wide shot",
                "camera": "static",
                "description": "A golden field stretching to the horizon at dawn"
            }
        }
        base.update(overrides)
        return base

    def test_all_valid(self):
        path = self._write_temp_jsonl([self._valid_entry() for _ in range(5)])
        errors, line_count, valid_count = validate_file(path)
        self.assertEqual(len(errors), 0)
        self.assertEqual(line_count, 5)
        self.assertEqual(valid_count, 5)

    def test_all_invalid(self):
        entries = [{"bad": "data"}, {"also": "bad"}]
        path = self._write_temp_jsonl(entries)
        errors, line_count, valid_count = validate_file(path)
        self.assertGreater(len(errors), 0)
        self.assertEqual(line_count, 2)
        self.assertEqual(valid_count, 0)

    def test_mixed_valid_invalid(self):
        entries = [self._valid_entry(), {"bad": "data"}, self._valid_entry()]
        path = self._write_temp_jsonl(entries)
        errors, line_count, valid_count = validate_file(path)
        self.assertGreater(len(errors), 0)
        self.assertEqual(line_count, 3)
        self.assertEqual(valid_count, 2)

    def test_json_parse_error(self):
        path = self._write_temp_lines([
            '{"valid": true}',
            'NOT JSON {{{',
            '{"also_valid": true}',
        ])
        errors, line_count, _valid_count = validate_file(path)
        self.assertTrue(any("JSON parse" in e for e in errors))
        # The unparseable line still counts towards line_count.
        self.assertEqual(line_count, 3)

    def test_blank_lines_skipped(self):
        good = json.dumps(self._valid_entry())
        # An empty line and a whitespace-only line sit between two valid entries.
        path = self._write_temp_lines([good, '', '   ', good])
        errors, line_count, valid_count = validate_file(path)
        self.assertEqual(line_count, 2)
        self.assertEqual(valid_count, 2)

    def test_missing_file(self):
        errors, line_count, valid_count = validate_file("/nonexistent/file.jsonl")
        self.assertEqual(line_count, 0)
        self.assertEqual(valid_count, 0)


class TestStatisticsTracking(unittest.TestCase):
    """Verify that validate_file tracks counts correctly."""

    def _write_temp(self, entries):
        """Write entries as JSONL to a temp file and return its path.

        Removal of the file is registered with ``addCleanup``, so it is
        deleted even when validate_file raises or an assertion fails.
        """
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.jsonl', delete=False, encoding='utf-8'
        ) as f:
            # Register cleanup before writing so a failed write cannot leak the file.
            self.addCleanup(os.unlink, f.name)
            f.writelines(json.dumps(e) + '\n' for e in entries)
        return f.name

    def test_error_count_matches(self):
        entries = [
            {"bad": 1},  # missing required
            {"bad": 2},  # missing required
            {"bad": 3},  # missing required
        ]
        path = self._write_temp(entries)
        errors, line_count, valid_count = validate_file(path)
        # Every entry lacks several required fields, so the total number of
        # errors must be strictly greater than the number of entries.
        self.assertGreater(len(errors), len(entries))
        # Invalid entries still count as processed lines.
        self.assertEqual(line_count, len(entries))
        self.assertEqual(valid_count, 0)

    def test_valid_count_accurate(self):
        valid = {
            "song": "S", "artist": "A", "beat": 1, "timestamp": "0:00",
            "duration_seconds": 30, "lyric_line": "x",
            "scene": {"mood": "m", "colors": ["c"], "composition": "c", "camera": "c", "description": "a"*20}
        }
        entries = [valid, valid, valid]
        path = self._write_temp(entries)
        errors, line_count, valid_count = validate_file(path)
        self.assertEqual(line_count, 3)
        self.assertEqual(valid_count, 3)
        self.assertEqual(len(errors), 0)


class TestRealFiles(unittest.TestCase):
    """Integration tests against actual training data files."""

    def test_scene_descriptions_validate(self):
        """All scene description JSONL files should pass validation.

        Each file is checked in its own subTest, so a problem in one file
        (a failed assertion or an exception raised by validate_file) is
        reported for that file without hiding the results of the others.
        The test is skipped when the directory contains no ``*.jsonl`` files.
        """
        # Sorted so files are processed and reported in a stable order.
        jsonl_files = sorted(VALIDATE_DIR.glob("*.jsonl"))
        if not jsonl_files:
            self.skipTest("No JSONL files found")

        for filepath in jsonl_files:
            with self.subTest(file=filepath.name):
                # Validate inside the subTest so an exception here is
                # attributed to this file and the loop continues.
                errors, line_count, valid_count = validate_file(str(filepath))
                self.assertEqual(len(errors), 0,
                    f"{filepath.name} has {len(errors)} errors: {errors[:3]}")
                self.assertEqual(line_count, valid_count)
                self.assertGreater(line_count, 0)


# When this file is executed directly (rather than imported), run every
# test case defined in this module using unittest's command-line runner.
if __name__ == "__main__":
    unittest.main()