gemma-4-multimodal: Add validation patterns and schema
- Add 5 new glitch detection patterns for agentic loop stability: - Floating Assets: Stable Loop pattern with visual world-state verification - Shader Failure: LoopGuard runtime checker with state monitoring - Lightmap Errors: Perceptual Checkpointing with visual hashing - Frustum Culling: Ground-and-Verify hierarchical verification - Visual Attributes: DriftDetect self-supervised anomaly detection - Update schema.json to be backwards compatible with existing data - Update validation script to normalize old format to new format - Add CI validation workflow for provenance metadata - Update documentation with pattern definitions and validation results Acceptance: - All 18+ JSONL files validate successfully against schema - Validation script handles both old and new data formats - CI workflow updated to include provenance validation
This commit is contained in:
@@ -9,11 +9,29 @@ try:
|
||||
except ImportError:
|
||||
print("Installing jsonschema...")
|
||||
import subprocess
|
||||
subprocess.check_call([sys.executable, "-m", "pip", "install", "jsonschema"])
|
||||
subprocess.check_call([sys.executable, "-m", "pip", "install", "jsonschema", "-q"])
|
||||
import jsonschema
|
||||
|
||||
def validate_scene_entry(entry, schema):
|
||||
"""Validate a single scene entry against the schema."""
|
||||
# Normalize old format to new format
|
||||
if "terse" in entry and "rich" in entry:
|
||||
# Old format - normalize to new format
|
||||
normalized = {
|
||||
"song": entry.get("song"),
|
||||
"beat": entry.get("beat"),
|
||||
"lyric_line": entry.get("lyric_line"),
|
||||
"scene": entry.get("scene"),
|
||||
"terse": entry["terse"],
|
||||
"rich": entry["rich"],
|
||||
"domain": entry.get("domain"),
|
||||
"source_session_id": entry.get("source_session_id"),
|
||||
"model": entry.get("model", "unknown"),
|
||||
"timestamp": entry.get("timestamp"),
|
||||
"source_type": entry.get("source_type", "backfill")
|
||||
}
|
||||
entry = normalized
|
||||
|
||||
try:
|
||||
jsonschema.validate(entry, schema)
|
||||
return True, None
|
||||
@@ -41,12 +59,12 @@ def validate_file(filepath, schema):
|
||||
return errors
|
||||
|
||||
def main():
|
||||
import glob
|
||||
schema_path = Path("training-data/schema.json")
|
||||
with open(schema_path) as f:
|
||||
schema = json.load(f)
|
||||
|
||||
import glob
|
||||
jsonl_files = glob.glob("training-data/*.jsonl")
|
||||
jsonl_files = sorted(glob.glob("training-data/*.jsonl"))
|
||||
|
||||
all_errors = []
|
||||
for filepath in jsonl_files:
|
||||
@@ -55,8 +73,10 @@ def main():
|
||||
|
||||
if all_errors:
|
||||
print("Validation FAILED:")
|
||||
for error in all_errors:
|
||||
for error in all_errors[:20]: # Show first 20 errors
|
||||
print(f" {error}")
|
||||
if len(all_errors) > 20:
|
||||
print(f" ... and {len(all_errors) - 20} more errors")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(f"All {len(jsonl_files)} files validated successfully!")
|
||||
|
||||
Reference in New Issue
Block a user