Add 5 new glitch detection patterns for agentic loop stability:
- Floating Assets: Stable Loop pattern with visual world-state verification
- Shader Failure: LoopGuard runtime checker with state monitoring
- Lightmap Errors: Perceptual Checkpointing with visual hashing
- Frustum Culling: Ground-and-Verify hierarchical verification
- Visual Attributes: DriftDetect self-supervised anomaly detection

Additional changes:
- Update schema.json to be backwards compatible with existing data
- Update validation script to normalize old format to new format
- Add CI validation workflow for provenance metadata
- Update documentation with pattern definitions and validation results

Acceptance:
- All 18+ JSONL files validate successfully against schema
- Validation script handles both old and new data formats
- CI workflow updated to include provenance validation
86 lines
2.7 KiB
Python
Executable File
86 lines
2.7 KiB
Python
Executable File
#!/usr/bin/env python3
"""Validate scene description JSONL files against schema."""

import json
import sys
from pathlib import Path

try:
    import jsonschema
except ImportError:
    # Best-effort bootstrap: fetch the validator on first run so the
    # script works in a bare environment without a prior pip step.
    print("Installing jsonschema...")
    import subprocess

    install_cmd = [sys.executable, "-m", "pip", "install", "jsonschema", "-q"]
    subprocess.check_call(install_cmd)
    import jsonschema
|
|
def validate_scene_entry(entry, schema):
    """Validate a single scene entry against the schema.

    Legacy entries (identified by carrying both "terse" and "rich")
    are first projected onto the new field layout before validation.

    Returns a ``(valid, error)`` pair: ``(True, None)`` on success,
    ``(False, message)`` when schema validation fails.
    """
    if "terse" in entry and "rich" in entry:
        # Old format — rebuild the record over the new schema's field
        # set, substituting defaults where the old data had no value.
        fallbacks = {"model": "unknown", "source_type": "backfill"}
        field_names = (
            "song", "beat", "lyric_line", "scene", "terse", "rich",
            "domain", "source_session_id", "model", "timestamp",
            "source_type",
        )
        entry = {name: entry.get(name, fallbacks.get(name)) for name in field_names}

    try:
        jsonschema.validate(entry, schema)
    except jsonschema.ValidationError as err:
        return False, str(err.message)
    return True, None
|
|
|
|
def validate_file(filepath, schema):
    """Validate all entries in a JSONL file.

    Parameters
    ----------
    filepath : str or Path
        Path to a JSONL file (one JSON object per line; blank lines
        are skipped).
    schema : dict
        JSON Schema that each entry is validated against.

    Returns
    -------
    list[str]
        Human-readable ``"file:line: message"`` errors; empty when
        every entry is valid.
    """
    errors = []
    # Explicit encoding: JSONL is UTF-8 by convention, and relying on
    # the platform default could mis-decode non-ASCII scene text.
    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"{filepath}:{line_num}: Invalid JSON: {e}")
                continue

            valid, error = validate_scene_entry(entry, schema)
            if not valid:
                errors.append(f"{filepath}:{line_num}: {error}")

    return errors
|
|
|
|
def main():
    """Validate every training-data JSONL file and report results.

    Loads ``training-data/schema.json``, validates each
    ``training-data/*.jsonl`` file against it, and exits with status 1
    (after printing up to 20 errors) if any entry fails; otherwise
    prints a success summary.
    """
    schema_path = Path("training-data/schema.json")
    # Explicit encoding keeps schema parsing stable across platforms.
    with open(schema_path, encoding="utf-8") as f:
        schema = json.load(f)

    # Path.glob over the already-imported pathlib replaces the ad-hoc
    # function-level `import glob`; sorting keeps output order stable.
    jsonl_files = sorted(Path("training-data").glob("*.jsonl"))

    all_errors = []
    for filepath in jsonl_files:
        all_errors.extend(validate_file(filepath, schema))

    if all_errors:
        print("Validation FAILED:")
        for error in all_errors[:20]:  # Show first 20 errors
            print(f"  {error}")
        if len(all_errors) > 20:
            print(f"  ... and {len(all_errors) - 20} more errors")
        sys.exit(1)
    else:
        print(f"All {len(jsonl_files)} files validated successfully!")


if __name__ == "__main__":
    main()
|