Files
timmy-config/scripts/validate-scene-data.py
Alexander Whitestone e4ba0c8b91 gemma-4-multimodal: Add validation patterns and schema
- Add 5 new glitch detection patterns for agentic loop stability:
  - Floating Assets: Stable Loop pattern with visual world-state verification
  - Shader Failure: LoopGuard runtime checker with state monitoring
  - Lightmap Errors: Perceptual Checkpointing with visual hashing
  - Frustum Culling: Ground-and-Verify hierarchical verification
  - Visual Attributes: DriftDetect self-supervised anomaly detection
- Update schema.json to be backwards compatible with existing data
- Update validation script to normalize old format to new format
- Add CI validation workflow for provenance metadata
- Update documentation with pattern definitions and validation results

Acceptance:
- All 18+ JSONL files validate successfully against schema
- Validation script handles both old and new data formats
- CI workflow updated to include provenance validation
2026-04-21 10:08:29 -04:00

86 lines
2.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""Validate scene description JSONL files against schema."""
import json
import sys
from pathlib import Path
try:
import jsonschema
except ImportError:
print("Installing jsonschema...")
import subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "jsonschema", "-q"])
import jsonschema
def validate_scene_entry(entry, schema):
    """Validate a single scene entry against the schema.

    An entry that is a JSON object carrying both ``terse`` and ``rich`` keys
    is treated as the old format and rebuilt with the new-format key set
    before validation. Keys missing from the old entry become ``None``,
    except ``model`` (defaults to ``"unknown"``) and ``source_type``
    (defaults to ``"backfill"``); keys outside that set are dropped.

    Args:
        entry: One decoded JSONL value (normally a dict).
        schema: The JSON Schema to validate against.

    Returns:
        ``(True, None)`` when the entry validates, otherwise
        ``(False, message)`` carrying the validation error message.
    """
    # Only JSON objects can be old-format records. Without the isinstance
    # guard, a scalar line such as ``42`` or ``null`` raises TypeError on the
    # ``in`` test, and a string/list containing both words raises
    # AttributeError on ``.get`` -- aborting the whole run instead of letting
    # the schema report the malformed entry.
    if isinstance(entry, dict) and "terse" in entry and "rich" in entry:
        # Old format - normalize to new format
        entry = {
            "song": entry.get("song"),
            "beat": entry.get("beat"),
            "lyric_line": entry.get("lyric_line"),
            "scene": entry.get("scene"),
            "terse": entry["terse"],
            "rich": entry["rich"],
            "domain": entry.get("domain"),
            "source_session_id": entry.get("source_session_id"),
            "model": entry.get("model", "unknown"),
            "timestamp": entry.get("timestamp"),
            "source_type": entry.get("source_type", "backfill"),
        }

    try:
        jsonschema.validate(entry, schema)
    except jsonschema.ValidationError as e:
        return False, str(e.message)
    return True, None
def validate_file(filepath, schema):
    """Validate all entries in a JSONL file.

    Args:
        filepath: Path of the JSONL file to read.
        schema: Schema passed through to ``validate_scene_entry``.

    Returns:
        A list of ``"<filepath>:<line number>: <problem>"`` strings; empty
        when every non-blank line parses as JSON and validates.
    """
    errors = []
    # Decode explicitly as UTF-8: the default encoding of open() follows the
    # platform locale, so non-ASCII text would be mis-decoded or rejected on
    # systems whose locale is not UTF-8.
    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                # Blank lines are skipped but still counted in line_num.
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"{filepath}:{line_num}: Invalid JSON: {e}")
                continue
            valid, error = validate_scene_entry(entry, schema)
            if not valid:
                errors.append(f"{filepath}:{line_num}: {error}")
    return errors
def main():
    """Validate every ``training-data/*.jsonl`` file against the schema.

    The schema is read from ``training-data/schema.json``. Both paths are
    relative to the current working directory. Prints a summary to stdout
    and exits with status 1 when any entry fails validation or when no
    JSONL files are found; returns normally when all files validate.
    """
    import glob

    schema_path = Path("training-data/schema.json")
    # Read the schema as UTF-8 rather than the locale-dependent default.
    with open(schema_path, encoding="utf-8") as f:
        schema = json.load(f)

    jsonl_files = sorted(glob.glob("training-data/*.jsonl"))
    if not jsonl_files:
        # An empty match usually means the script ran from the wrong
        # directory; printing "All 0 files validated" would be a false pass.
        print("Validation FAILED: no JSONL files found matching training-data/*.jsonl")
        sys.exit(1)

    all_errors = []
    for filepath in jsonl_files:
        all_errors.extend(validate_file(filepath, schema))

    if all_errors:
        print("Validation FAILED:")
        for error in all_errors[:20]:  # Show first 20 errors
            print(f" {error}")
        if len(all_errors) > 20:
            print(f" ... and {len(all_errors) - 20} more errors")
        sys.exit(1)
    else:
        print(f"All {len(jsonl_files)} files validated successfully!")


if __name__ == "__main__":
    main()