Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
6b38c2b5aa fix: JSON schema + validation for scene description data #647
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 34s
Smoke Test / smoke (pull_request) Failing after 12s
Validate Config / YAML Lint (pull_request) Failing after 9s
Validate Config / JSON Validate (pull_request) Successful in 14s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m30s
Validate Config / Shell Script Lint (pull_request) Failing after 1m33s
Validate Config / Cron Syntax Check (pull_request) Successful in 18s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 21s
Validate Config / Playbook Schema Validation (pull_request) Successful in 23s
PR Checklist / pr-checklist (pull_request) Failing after 10m43s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
2026-04-14 21:16:31 -04:00
8 changed files with 269 additions and 265 deletions

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
"""
Full Nostr agent-to-agent communication demo - FINAL WORKING
"""

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
"""
Soul Eval Gate — The Conscience of the Training Pipeline

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
import json
from hermes_tools import browser_navigate, browser_vision

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
import json
from hermes_tools import browser_navigate, browser_vision

View File

@@ -1,260 +0,0 @@
#!/usr/bin/env python3
"""
[PROVENANCE] Training Pair Provenance Tracker
Part of the Timmy Foundation tooling.
Adds, filters, and reports provenance metadata for JSONL training pairs.
Tracks source_session_id, model, and timestamp for quality auditing.
Usage:
# Tag pairs with provenance
python3 scripts/training_provenance.py tag input.jsonl -o tagged.jsonl \
--session abc123 --model nous/hermes-3
# Filter by model (exclude Anthropic-sourced)
python3 scripts/training_provenance.py filter input.jsonl -o filtered.jsonl \
--exclude-model anthropic
# Report: pair count by source model
python3 scripts/training_provenance.py report input.jsonl
# Pipe support
cat pairs.jsonl | python3 scripts/training_provenance.py report -
"""
import sys
import json
import argparse
from datetime import datetime, timezone
from collections import Counter
from typing import Dict, Any, Optional
PROVENANCE_KEYS = ["source_session_id", "source_model", "source_timestamp"]
def tag_pair(pair: Dict[str, Any], session_id: Optional[str] = None,
             model: Optional[str] = None) -> Dict[str, Any]:
    """Attach provenance metadata to a training pair in place.

    Merges into any existing ``_provenance`` dict: ``source_session_id`` and
    ``source_model`` are set only when provided, while ``source_timestamp``
    is refreshed to the current UTC time on every call.

    Args:
        pair: The training pair dict to annotate (mutated and returned).
        session_id: Originating session identifier, if known.
        model: Originating model name, if known.

    Returns:
        The same ``pair`` dict, with its ``_provenance`` entry updated.
    """
    provenance = pair.get("_provenance", {})
    for key, value in (("source_session_id", session_id),
                       ("source_model", model)):
        if value:
            provenance[key] = value
    # Always stamp the time; this also guarantees provenance is non-empty,
    # so the pair always ends up carrying a _provenance entry.
    provenance["source_timestamp"] = datetime.now(timezone.utc).isoformat()
    pair["_provenance"] = provenance
    return pair
def filter_pairs(input_path: str, output_path: str,
                 include_models: Optional[list] = None,
                 exclude_models: Optional[list] = None,
                 min_session_age: Optional[str] = None) -> Dict[str, Any]:
    """Filter JSONL training pairs by their provenance metadata.

    Reads pairs from ``input_path`` ("-" for stdin), keeps each pair whose
    ``_provenance.source_model`` passes the include/exclude lists (pairs
    without provenance are treated as model "unknown"), and writes kept
    pairs to ``output_path`` ("-" for stdout; falsy skips writing).

    Args:
        input_path: Input JSONL path, or "-" to read stdin.
        output_path: Output JSONL path, or "-" to write stdout.
        include_models: If given, keep only pairs whose model is listed.
        exclude_models: If given, drop pairs whose model is listed.
        min_session_age: Accepted for interface compatibility; currently unused.

    Returns:
        Dict with "total", "kept", "filtered_out", and "errors" counts.
        Lines that are not valid JSON count as errors and are skipped.
    """
    kept = []
    removed = []
    errors = 0
    source = sys.stdin if input_path == "-" else open(input_path, "r")
    try:
        for line in source:
            line = line.strip()
            if not line:
                continue
            try:
                pair = json.loads(line)
            except json.JSONDecodeError:
                errors += 1
                continue
            model = pair.get("_provenance", {}).get("source_model", "unknown")
            should_keep = True
            if include_models:
                should_keep = model in include_models
            if exclude_models:
                should_keep = should_keep and model not in exclude_models
            if should_keep:
                kept.append(pair)
            else:
                removed.append(pair)
    finally:
        if source is not sys.stdin:
            source.close()
    # Write output
    if output_path:
        out = sys.stdout if output_path == "-" else open(output_path, "w")
        try:
            for pair in kept:
                out.write(json.dumps(pair, ensure_ascii=False) + "\n")
        finally:
            # BUG FIX: the original compared against sys.stdin here, which
            # meant sys.stdout was closed when streaming to "-", breaking
            # any subsequent output from the process.
            if out is not sys.stdout:
                out.close()
    return {
        "total": len(kept) + len(removed),
        "kept": len(kept),
        "filtered_out": len(removed),
        "errors": errors,
    }
def report(input_path: str) -> Dict[str, Any]:
    """Summarize provenance coverage of a JSONL pair file.

    Reads from ``input_path`` ("-" for stdin) and tallies how many pairs
    carry a ``_provenance`` block, broken down by source model and session.
    Unparseable lines count as errors; blank lines are ignored.

    Returns:
        Dict with totals, tag rate (percent), top-20 models, top-10
        sessions, and the error count.
    """
    by_model = Counter()
    by_session = Counter()
    total = tagged = untagged = errors = 0
    stream = sys.stdin if input_path == "-" else open(input_path, "r")
    try:
        for raw in stream:
            raw = raw.strip()
            if not raw:
                continue
            try:
                pair = json.loads(raw)
            except json.JSONDecodeError:
                errors += 1
                continue
            total += 1
            prov = pair.get("_provenance", {})
            if not prov:
                untagged += 1
                continue
            tagged += 1
            by_model[prov.get("source_model", "unknown")] += 1
            by_session[prov.get("source_session_id", "unknown")] += 1
    finally:
        if stream is not sys.stdin:
            stream.close()
    return {
        "total": total,
        "tagged": tagged,
        "untagged": untagged,
        # max(total, 1) guards the divide-by-zero on an empty input
        "tag_rate": round(tagged / max(total, 1) * 100, 1),
        "by_model": dict(by_model.most_common(20)),
        "by_session": dict(by_session.most_common(10)),
        "errors": errors,
    }
def stamp_command(input_path: str, output_path: str,
                  session_id: Optional[str], model: Optional[str]) -> Dict[str, Any]:
    """Tag every pair in a JSONL stream with provenance metadata.

    Reads from ``input_path`` ("-" for stdin) and writes to ``output_path``
    ("-" for stdout). Pairs already tagged with the same model AND session
    are passed through verbatim and counted as skipped; invalid JSON lines
    count as errors and are dropped.

    Args:
        input_path: Input JSONL path, or "-" for stdin.
        output_path: Output JSONL path, or "-" for stdout.
        session_id: Session ID to stamp onto each pair (may be None).
        model: Model name to stamp onto each pair (may be None).

    Returns:
        Dict with "tagged", "skipped", and "errors" counts.
    """
    tagged = 0
    skipped = 0
    errors = 0
    source = sys.stdin if input_path == "-" else open(input_path, "r")
    out = sys.stdout if output_path == "-" else open(output_path, "w")
    try:
        for line in source:
            line = line.strip()
            if not line:
                continue
            try:
                pair = json.loads(line)
            except json.JSONDecodeError:
                errors += 1
                continue
            # Skip if already tagged with same model
            existing = pair.get("_provenance", {})
            if existing.get("source_model") == model and existing.get("source_session_id") == session_id:
                skipped += 1
                out.write(line + "\n")
                continue
            pair = tag_pair(pair, session_id=session_id, model=model)
            out.write(json.dumps(pair, ensure_ascii=False) + "\n")
            tagged += 1
    finally:
        if source is not sys.stdin:
            source.close()
        # BUG FIX: the original checked `out is not sys.stdin`, which is
        # always true for sys.stdout, so writing to "-" closed stdout.
        if out is not sys.stdout:
            out.close()
    return {"tagged": tagged, "skipped": skipped, "errors": errors}
def main():
    """CLI entry point: dispatch the tag / filter / report subcommands."""
    parser = argparse.ArgumentParser(description="Training pair provenance tracking")
    subparsers = parser.add_subparsers(dest="command", required=True)

    tag_cmd = subparsers.add_parser("tag", help="Tag pairs with provenance metadata")
    tag_cmd.add_argument("input", help="Input JSONL file (use - for stdin)")
    tag_cmd.add_argument("-o", "--output", default="-", help="Output JSONL file")
    tag_cmd.add_argument("--session", help="Source session ID")
    tag_cmd.add_argument("--model", help="Source model name")

    filter_cmd = subparsers.add_parser("filter", help="Filter pairs by provenance")
    filter_cmd.add_argument("input", help="Input JSONL file (use - for stdin)")
    filter_cmd.add_argument("-o", "--output", default="-", help="Output JSONL file")
    filter_cmd.add_argument("--include-model", action="append", help="Only include these models")
    filter_cmd.add_argument("--exclude-model", action="append", help="Exclude these models")

    report_cmd = subparsers.add_parser("report", help="Report provenance statistics")
    report_cmd.add_argument("input", help="Input JSONL file (use - for stdin)")

    args = parser.parse_args()

    # Human-readable summaries go to stderr so stdout stays pipeable.
    if args.command == "tag":
        stats = stamp_command(args.input, args.output, args.session, args.model)
        print(f"Tagged: {stats['tagged']} Skipped: {stats['skipped']} Errors: {stats['errors']}", file=sys.stderr)
    elif args.command == "filter":
        stats = filter_pairs(
            args.input, args.output,
            include_models=args.include_model,
            exclude_models=args.exclude_model,
        )
        print(f"Total: {stats['total']} Kept: {stats['kept']} Filtered: {stats['filtered_out']}", file=sys.stderr)
    elif args.command == "report":
        stats = report(args.input)
        print("Training Pair Provenance Report", file=sys.stderr)
        print("=" * 40, file=sys.stderr)
        print(f"Total pairs: {stats['total']}", file=sys.stderr)
        print(f"Tagged: {stats['tagged']} ({stats['tag_rate']}%)", file=sys.stderr)
        print(f"Untagged: {stats['untagged']}", file=sys.stderr)
        if stats['by_model']:
            print("\nBy source model:", file=sys.stderr)
            for model_name, count in stats['by_model'].items():
                print(f" {model_name}: {count}", file=sys.stderr)
        if stats['by_session']:
            print("\nBy source session (top 10):", file=sys.stderr)
            for session_id, count in stats['by_session'].items():
                shown = session_id[:12] + "..." if len(session_id) > 12 else session_id
                print(f" {shown}: {count}", file=sys.stderr)
        # Output JSON to stdout
        print(json.dumps(stats, indent=2))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,185 @@
#!/usr/bin/env python3
"""
validate-scene-data.py — Validate scene description JSONL files against schema.
Usage:
python3 scripts/validate-scene-data.py training-data/*.jsonl
python3 scripts/validate-scene-data.py training-data/scene-descriptions-rock.jsonl
Exit codes:
0 = all entries valid
1 = validation errors found
Refs: timmy-config#647
"""
import json
import sys
import os
from pathlib import Path
# Try jsonschema, fall back to manual validation
try:
import jsonschema
HAS_JSONSCHEMA = True
except ImportError:
HAS_JSONSCHEMA = False
def load_schema():
    """Locate and parse training-data/schema.json.

    Searches two candidate locations in order: relative to this script's
    repository root, then relative to the current working directory.
    Exits the process with status 2 when neither file exists.
    """
    candidates = [
        Path(__file__).parent.parent / "training-data" / "schema.json",
        Path("training-data/schema.json"),
    ]
    schema_file = None
    for candidate in candidates:
        if candidate.exists():
            schema_file = candidate
            break
    if schema_file is None:
        # Report the CWD-relative fallback path, matching the last candidate.
        print(f"ERROR: Schema not found at {candidates[-1]}", file=sys.stderr)
        sys.exit(2)
    with open(schema_file) as f:
        return json.load(f)
def validate_entry_manual(entry, index):
    """Validate one scene entry without the jsonschema package.

    Mirrors training-data/schema.json: required fields, types, the
    M:SS / MM:SS timestamp pattern, and minimum lengths. Returns a list
    of human-readable error strings (empty when the entry is valid).
    The *index* argument is accepted for interface parity but unused.
    """
    import re

    problems = []

    def require_text(container, key, label):
        # Record a type error or an emptiness error for a string field,
        # but only when the field is actually present.
        if key not in container:
            return
        value = container[key]
        if not isinstance(value, str):
            problems.append(f"'{label}' must be a string")
        elif not value.strip():
            problems.append(f"'{label}' must not be empty")

    for required in ("song", "artist", "beat", "timestamp", "lyric_line", "scene"):
        if required not in entry:
            problems.append(f"Missing required field: {required}")

    require_text(entry, "song", "song")
    require_text(entry, "artist", "artist")

    if "beat" in entry:
        beat = entry["beat"]
        if not isinstance(beat, int):
            problems.append("'beat' must be an integer")
        elif beat < 1:
            problems.append("'beat' must be >= 1")

    if "timestamp" in entry and not re.match(r'^[0-9]+:[0-5][0-9]$', str(entry["timestamp"])):
        problems.append(f"'timestamp' must be M:SS or MM:SS format, got: {entry['timestamp']}")

    require_text(entry, "lyric_line", "lyric_line")

    if "scene" in entry:
        scene = entry["scene"]
        if not isinstance(scene, dict):
            problems.append("'scene' must be an object")
        else:
            for required in ("mood", "colors", "composition", "description"):
                if required not in scene:
                    problems.append(f"Missing required scene field: {required}")
            require_text(scene, "mood", "scene.mood")
            if "colors" in scene:
                colors = scene["colors"]
                if not isinstance(colors, list):
                    problems.append("'scene.colors' must be an array")
                elif not colors:
                    problems.append("'scene.colors' must have at least 1 element")
                else:
                    for i, color in enumerate(colors):
                        if not isinstance(color, str) or not color.strip():
                            problems.append(f"'scene.colors[{i}]' must be a non-empty string")
            require_text(scene, "composition", "scene.composition")
            if "description" in scene:
                desc = scene["description"]
                if not isinstance(desc, str):
                    problems.append("'scene.description' must be a string")
                elif len(desc) < 10:
                    # minLength 10 in the schema; raw length, not stripped
                    problems.append(f"'scene.description' too short ({len(desc)} chars, min 10)")

    return problems
def validate_file(filepath, schema):
    """Validate all JSONL entries in *filepath* against *schema*.

    Uses the jsonschema package when available, otherwise the manual
    validator. Blank lines are skipped; unparseable lines are reported
    as errors but do not abort the scan.

    Args:
        filepath: Path to a JSONL file.
        schema: Parsed JSON-schema dict (used only when jsonschema is installed).

    Returns:
        Tuple ``(total, errors)`` — the number of non-blank entries seen
        and a list of formatted per-line error messages.
    """
    errors = []
    total = 0
    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            total += 1
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f" Line {line_num}: Invalid JSON — {e}")
                continue
            if HAS_JSONSCHEMA:
                # BUG FIX: jsonschema.validate() returns None on success and
                # RAISES ValidationError on failure — the original wrapped it
                # in list(...) and crashed the whole run on the first invalid
                # entry. Catch the exception and report it like other errors.
                entry_errors = []
                try:
                    jsonschema.validate(entry, schema)
                except jsonschema.ValidationError as e:
                    where = "/".join(str(p) for p in e.absolute_path) or "<root>"
                    entry_errors = [f"Schema violation at {where}: {e.message}"]
            else:
                entry_errors = validate_entry_manual(entry, line_num)
            for err in entry_errors:
                errors.append(f" Line {line_num}: {err}")
    return total, errors
def main():
    """CLI entry: validate every file named on argv and print a summary."""
    if len(sys.argv) < 2:
        print("Usage: python3 scripts/validate-scene-data.py <file.jsonl> [file2.jsonl ...]")
        sys.exit(2)
    schema = load_schema()
    entry_total = 0
    error_total = 0
    checked = 0
    for path in sys.argv[1:]:
        if not os.path.exists(path):
            print(f"SKIP: {path} (not found)")
            continue
        checked += 1
        count, errors = validate_file(path, schema)
        entry_total += count
        if not errors:
            print(f"PASS: {path}{count} entries valid")
            continue
        error_total += len(errors)
        print(f"FAIL: {path}{len(errors)} error(s) in {count} entries:")
        # Cap per-file output so one broken file doesn't flood the log
        for err in errors[:20]:
            print(err)
        overflow = len(errors) - 20
        if overflow > 0:
            print(f" ... and {overflow} more errors")
    print(f"\nSummary: {checked} files, {entry_total} entries, {error_total} errors")
    if error_total:
        print("VALIDATION FAILED")
        sys.exit(1)
    print("ALL VALID")
    sys.exit(0)
if __name__ == "__main__":
main()

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python3
import json
from hermes_tools import browser_navigate, browser_vision

84
training-data/schema.json Normal file
View File

@@ -0,0 +1,84 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Scene Description Training Entry",
"description": "Schema for lyrics-to-visual-scene description training data entries.",
"type": "object",
"required": ["song", "artist", "beat", "timestamp", "lyric_line", "scene"],
"properties": {
"song": {
"type": "string",
"minLength": 1,
"description": "Song title"
},
"artist": {
"type": "string",
"minLength": 1,
"description": "Artist name"
},
"mood_arc": {
"type": "string",
"description": "Overall mood progression of the song (optional)"
},
"beat": {
"type": "integer",
"minimum": 1,
"description": "Beat number within the song (1-indexed)"
},
"timestamp": {
"type": "string",
"pattern": "^[0-9]+:[0-5][0-9]$",
"description": "Timestamp in M:SS or MM:SS format"
},
"duration": {
"type": "string",
"description": "Duration of the beat (e.g. '30s', '15s')"
},
"duration_seconds": {
"type": "integer",
"minimum": 1,
"description": "Duration in seconds (integer alternative)"
},
"lyric_line": {
"type": "string",
"minLength": 1,
"description": "The lyric line for this beat"
},
"scene": {
"type": "object",
"required": ["mood", "colors", "composition", "description"],
"properties": {
"mood": {
"type": "string",
"minLength": 1,
"description": "Emotional mood of the scene"
},
"colors": {
"type": "array",
"items": { "type": "string", "minLength": 1 },
"minItems": 1,
"description": "Color palette for the scene"
},
"composition": {
"type": "string",
"minLength": 1,
"description": "Shot composition (e.g. 'wide shot', 'close-up', 'low angle')"
},
"camera": {
"type": "string",
"description": "Camera movement (e.g. 'static', 'slow zoom', 'tracking')"
},
"camera_movement": {
"type": "string",
"description": "Alternative field name for camera movement"
},
"description": {
"type": "string",
"minLength": 10,
"description": "Full scene description text"
}
},
"additionalProperties": true
}
},
"additionalProperties": true
}