Files
timmy-config/scripts/validate-scene-data.py
Alexander Whitestone 96ec3b7141 gemma-4-multimodal: Add JSON schema validation for scene descriptions
- training-data/schema.json: JSON Schema for scene description data (backwards compatible with existing old format)
- scripts/validate-scene-data.py: Validation script that normalizes old format
- .gitea/workflows/validate-scene-data.yml: CI validation on PRs

Acceptance:
- Schema validates all existing and new data files
- Validation script runs successfully on all JSONL files
- CI workflow configured for automated checking
2026-04-21 10:08:07 -04:00

66 lines
1.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""Validate scene description JSONL files against schema."""
import json
import sys
from pathlib import Path
try:
    import jsonschema
except ImportError:
    # Best-effort bootstrap: the validator cannot run without jsonschema, so
    # install it into the environment of the interpreter running this script.
    print("Installing jsonschema...")
    import importlib
    import subprocess

    subprocess.check_call([sys.executable, "-m", "pip", "install", "jsonschema"])
    # The import system caches what it has seen on sys.path; refresh those
    # caches so the package installed a moment ago is found by the retry below.
    importlib.invalidate_caches()
    import jsonschema
def validate_scene_entry(entry, schema):
    """Check one decoded scene entry against the JSON schema.

    Args:
        entry: The decoded JSON value for a single JSONL line.
        schema: The parsed JSON Schema to validate against.

    Returns:
        ``(True, None)`` when the entry conforms, otherwise ``(False, message)``
        where ``message`` is the validation error's message as a string.
    """
    try:
        jsonschema.validate(entry, schema)
    except jsonschema.ValidationError as exc:
        # Only the short message is reported, not the full error dump.
        return False, str(exc.message)
    return True, None
def validate_file(filepath, schema):
    """Validate every non-blank line of a JSONL file against the schema.

    Args:
        filepath: Path of the JSONL file to read.
        schema: Parsed JSON Schema handed to ``validate_scene_entry``.

    Returns:
        A list of ``"<filepath>:<line number>: <problem>"`` strings, one for
        each line that is not valid JSON or fails schema validation. The list
        is empty when the file is clean. Line numbers are 1-based and count
        blank lines, so they match what an editor shows.
    """
    errors = []
    # JSON text is UTF-8; decode explicitly instead of relying on the
    # platform's default encoding, which differs between machines.
    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                # Blank / whitespace-only lines are tolerated and skipped.
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"{filepath}:{line_num}: Invalid JSON: {e}")
                continue
            valid, error = validate_scene_entry(entry, schema)
            if not valid:
                errors.append(f"{filepath}:{line_num}: {error}")
    return errors
def main():
    """Validate every ``training-data/*.jsonl`` file against the schema.

    Loads ``training-data/schema.json`` and runs ``validate_file`` on each
    matching JSONL file. Both paths are relative to the current working
    directory. Prints a report to stdout.

    Raises:
        SystemExit: With status 1 when any file has at least one error.
    """
    schema_path = Path("training-data/schema.json")
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(schema_path, encoding="utf-8") as f:
        schema = json.load(f)

    import glob

    # glob returns matches in arbitrary order; sort so the error report is
    # stable from run to run and across machines.
    jsonl_files = sorted(glob.glob("training-data/*.jsonl"))
    all_errors = []
    for filepath in jsonl_files:
        all_errors.extend(validate_file(filepath, schema))

    if all_errors:
        print("Validation FAILED:")
        for error in all_errors:
            print(f" {error}")
        sys.exit(1)
    else:
        print(f"All {len(jsonl_files)} files validated successfully!")


if __name__ == "__main__":
    main()