Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
6b38c2b5aa fix: JSON schema + validation for scene description data #647
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 34s
Smoke Test / smoke (pull_request) Failing after 12s
Validate Config / YAML Lint (pull_request) Failing after 9s
Validate Config / JSON Validate (pull_request) Successful in 14s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m30s
Validate Config / Shell Script Lint (pull_request) Failing after 1m33s
Validate Config / Cron Syntax Check (pull_request) Successful in 18s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 21s
Validate Config / Playbook Schema Validation (pull_request) Successful in 23s
PR Checklist / pr-checklist (pull_request) Failing after 10m43s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
2026-04-14 21:16:31 -04:00
8 changed files with 269 additions and 6 deletions

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
"""
Glitch pattern definitions for 3D world anomaly detection.

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
"""
Full Nostr agent-to-agent communication demo - FINAL WORKING
"""

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
"""
Soul Eval Gate — The Conscience of the Training Pipeline

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
import json
from hermes_tools import browser_navigate, browser_vision

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
import json
from hermes_tools import browser_navigate, browser_vision

View File

@@ -0,0 +1,185 @@
#!/usr/bin/env python3
"""
validate-scene-data.py — Validate scene description JSONL files against schema.
Usage:
    python3 scripts/validate-scene-data.py training-data/*.jsonl
    python3 scripts/validate-scene-data.py training-data/scene-descriptions-rock.jsonl
Exit codes:
    0 = all entries valid
    1 = validation errors found
    2 = usage error or schema file not found
Refs: timmy-config#647
"""
import json
import sys
import os
from pathlib import Path
# Try jsonschema, fall back to manual validation
try:
import jsonschema
HAS_JSONSCHEMA = True
except ImportError:
HAS_JSONSCHEMA = False
def load_schema():
    """Load the JSON schema from training-data/schema.json.

    The schema is looked up first relative to this script's parent
    directory, then relative to the current working directory.

    Returns:
        The parsed schema (whatever ``json.load`` yields for the file).

    Exits:
        Calls ``sys.exit(2)`` after printing an error to stderr when the
        schema cannot be found in either location.
    """
    schema_path = Path(__file__).parent.parent / "training-data" / "schema.json"
    if not schema_path.exists():
        # Fall back to a path relative to the current working directory.
        schema_path = Path("training-data/schema.json")
    if not schema_path.exists():
        print(f"ERROR: Schema not found at {schema_path}", file=sys.stderr)
        sys.exit(2)
    # Read as UTF-8 explicitly (the data files are read the same way) so the
    # result does not depend on the platform's default locale encoding.
    with open(schema_path, encoding="utf-8") as f:
        return json.load(f)
def _check_nonempty_str(container, key, label, errors):
    """Append a type or emptiness error for the string field *key*, if present."""
    if key not in container:
        return
    value = container[key]
    if not isinstance(value, str):
        errors.append(f"'{label}' must be a string")
    elif not value.strip():
        errors.append(f"'{label}' must not be empty")


def _validate_scene_manual(scene):
    """Return the list of error messages for the nested ``scene`` value."""
    if not isinstance(scene, dict):
        return ["'scene' must be an object"]

    errors = [
        f"Missing required scene field: {field}"
        for field in ("mood", "colors", "composition", "description")
        if field not in scene
    ]
    _check_nonempty_str(scene, "mood", "scene.mood", errors)

    if "colors" in scene:
        colors = scene["colors"]
        if not isinstance(colors, list):
            errors.append("'scene.colors' must be an array")
        elif not colors:
            errors.append("'scene.colors' must have at least 1 element")
        else:
            for i, color in enumerate(colors):
                if not isinstance(color, str) or not color.strip():
                    errors.append(f"'scene.colors[{i}]' must be a non-empty string")

    _check_nonempty_str(scene, "composition", "scene.composition", errors)

    if "description" in scene:
        description = scene["description"]
        if not isinstance(description, str):
            errors.append("'scene.description' must be a string")
        elif len(description) < 10:
            errors.append(
                f"'scene.description' too short ({len(description)} chars, min 10)"
            )
    return errors


def validate_entry_manual(entry, index):
    """Manual validation without jsonschema dependency.

    Args:
        entry: One decoded JSON value taken from a JSONL line.
        index: Position of the entry in its file. Not used by the checks;
            kept so the call signature stays stable.

    Returns:
        A list of human-readable error strings; empty when the entry passes
        every check.
    """
    import re

    # A JSONL line may decode to any JSON value (number, string, array, ...).
    # Only objects can carry the required fields, and the membership and
    # subscript operations below would raise TypeError on other types.
    if not isinstance(entry, dict):
        return [f"Entry must be a JSON object, got {type(entry).__name__}"]

    # Required top-level fields
    errors = [
        f"Missing required field: {field}"
        for field in ("song", "artist", "beat", "timestamp", "lyric_line", "scene")
        if field not in entry
    ]

    # Type checks
    _check_nonempty_str(entry, "song", "song", errors)
    _check_nonempty_str(entry, "artist", "artist", errors)

    if "beat" in entry:
        beat = entry["beat"]
        # bool is a subclass of int in Python, so JSON true/false must be
        # rejected explicitly.
        if isinstance(beat, bool) or not isinstance(beat, int):
            errors.append("'beat' must be an integer")
        elif beat < 1:
            errors.append("'beat' must be >= 1")

    if "timestamp" in entry:
        if not re.match(r'^[0-9]+:[0-5][0-9]$', str(entry["timestamp"])):
            errors.append(
                f"'timestamp' must be M:SS or MM:SS format, got: {entry['timestamp']}"
            )

    _check_nonempty_str(entry, "lyric_line", "lyric_line", errors)

    # Scene validation
    if "scene" in entry:
        errors.extend(_validate_scene_manual(entry["scene"]))
    return errors
def validate_file(filepath, schema):
    """Validate all entries in a JSONL file.

    Blank lines are skipped and do not count as entries. Each remaining line
    is parsed as JSON and checked either with jsonschema (when it was
    importable) or with ``validate_entry_manual``.

    Args:
        filepath: Path of the JSONL file to read (opened as UTF-8).
        schema: Parsed JSON schema; only used on the jsonschema path.

    Returns:
        A ``(total, errors)`` tuple: the number of non-blank lines seen and a
        list of error strings, each prefixed with its line number.
    """
    errors = []
    total = 0

    # Build the validator once instead of per line. jsonschema.validate()
    # raises on the first problem and returns None, so it cannot be used to
    # collect errors; iter_errors() yields every violation without raising.
    validator = None
    if HAS_JSONSCHEMA:
        validator_cls = jsonschema.validators.validator_for(schema)
        validator = validator_cls(schema)

    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            total += 1
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f" Line {line_num}: Invalid JSON — {e}")
                continue

            if validator is not None:
                entry_errors = []
                for err in validator.iter_errors(entry):
                    # Prefix the message with the location of the offending
                    # value inside the entry, when there is one.
                    location = "/".join(str(part) for part in err.absolute_path)
                    entry_errors.append(
                        f"{location}: {err.message}" if location else err.message
                    )
            else:
                entry_errors = validate_entry_manual(entry, line_num)

            errors.extend(f" Line {line_num}: {err}" for err in entry_errors)
    return total, errors
def main():
    """Command-line entry point: validate every JSONL file named in argv.

    Prints a PASS/FAIL line per file (at most 20 errors shown per file) and a
    final summary. Exits with 2 when no file arguments are given, 1 when any
    validation error was found, and 0 otherwise.
    """
    if len(sys.argv) < 2:
        print("Usage: python3 scripts/validate-scene-data.py <file.jsonl> [file2.jsonl ...]")
        sys.exit(2)

    schema = load_schema()
    total_entries = 0
    total_errors = 0
    files_checked = 0

    for filepath in sys.argv[1:]:
        # NOTE(review): missing files are skipped, not counted as errors, so a
        # run where every argument is missing still ends with "ALL VALID".
        if not os.path.exists(filepath):
            print(f"SKIP: {filepath} (not found)")
            continue
        files_checked += 1
        count, errors = validate_file(filepath, schema)
        total_entries += count
        if errors:
            total_errors += len(errors)
            # Separator added: the path used to run straight into the count.
            print(f"FAIL: {filepath} — {len(errors)} error(s) in {count} entries:")
            for err in errors[:20]:  # Limit output
                print(err)
            if len(errors) > 20:
                print(f" ... and {len(errors) - 20} more errors")
        else:
            print(f"PASS: {filepath} — {count} entries valid")

    print(f"\nSummary: {files_checked} files, {total_entries} entries, {total_errors} errors")
    if total_errors > 0:
        print("VALIDATION FAILED")
        sys.exit(1)
    else:
        print("ALL VALID")
        sys.exit(0)


if __name__ == "__main__":
    main()

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
import json
from hermes_tools import browser_navigate, browser_vision

84
training-data/schema.json Normal file
View File

@@ -0,0 +1,84 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Scene Description Training Entry",
"description": "Schema for lyrics-to-visual-scene description training data entries.",
"type": "object",
"required": ["song", "artist", "beat", "timestamp", "lyric_line", "scene"],
"properties": {
"song": {
"type": "string",
"minLength": 1,
"description": "Song title"
},
"artist": {
"type": "string",
"minLength": 1,
"description": "Artist name"
},
"mood_arc": {
"type": "string",
"description": "Overall mood progression of the song (optional)"
},
"beat": {
"type": "integer",
"minimum": 1,
"description": "Beat number within the song (1-indexed)"
},
"timestamp": {
"type": "string",
"pattern": "^[0-9]+:[0-5][0-9]$",
"description": "Timestamp in M:SS or MM:SS format"
},
"duration": {
"type": "string",
"description": "Duration of the beat (e.g. '30s', '15s')"
},
"duration_seconds": {
"type": "integer",
"minimum": 1,
"description": "Duration in seconds (integer alternative)"
},
"lyric_line": {
"type": "string",
"minLength": 1,
"description": "The lyric line for this beat"
},
"scene": {
"type": "object",
"required": ["mood", "colors", "composition", "description"],
"properties": {
"mood": {
"type": "string",
"minLength": 1,
"description": "Emotional mood of the scene"
},
"colors": {
"type": "array",
"items": { "type": "string", "minLength": 1 },
"minItems": 1,
"description": "Color palette for the scene"
},
"composition": {
"type": "string",
"minLength": 1,
"description": "Shot composition (e.g. 'wide shot', 'close-up', 'low angle')"
},
"camera": {
"type": "string",
"description": "Camera movement (e.g. 'static', 'slow zoom', 'tracking')"
},
"camera_movement": {
"type": "string",
"description": "Alternative field name for camera movement"
},
"description": {
"type": "string",
"minLength": 10,
"description": "Full scene description text"
}
},
"additionalProperties": true
}
},
"additionalProperties": true
}