Compare commits
1 commit
fix/691-tr...fix/issue-
6b38c2b5aa
@@ -1,4 +1,3 @@
#!/usr/bin/env python3
"""
Full Nostr agent-to-agent communication demo - FINAL WORKING
"""
@@ -1,4 +1,3 @@
#!/usr/bin/env python3
"""
Soul Eval Gate — The Conscience of the Training Pipeline
@@ -1,4 +1,3 @@
#!/usr/bin/env python3
import json
from hermes_tools import browser_navigate, browser_vision
@@ -1,4 +1,3 @@
#!/usr/bin/env python3
import json
from hermes_tools import browser_navigate, browser_vision
scripts/training_provenance.py (deleted file, 260 lines)
@@ -1,260 +0,0 @@
#!/usr/bin/env python3
"""
[PROVENANCE] Training Pair Provenance Tracker
Part of the Timmy Foundation tooling.

Adds, filters, and reports provenance metadata for JSONL training pairs.
Tracks source_session_id, model, and timestamp for quality auditing.

Usage:
    # Tag pairs with provenance
    python3 scripts/training_provenance.py tag input.jsonl -o tagged.jsonl \
        --session abc123 --model nous/hermes-3

    # Filter by model (exclude Anthropic-sourced)
    python3 scripts/training_provenance.py filter input.jsonl -o filtered.jsonl \
        --exclude-model anthropic

    # Report: pair count by source model
    python3 scripts/training_provenance.py report input.jsonl

    # Pipe support
    cat pairs.jsonl | python3 scripts/training_provenance.py report -
"""

import sys
import json
import argparse
from datetime import datetime, timezone
from collections import Counter
from typing import Dict, Any, Optional


PROVENANCE_KEYS = ["source_session_id", "source_model", "source_timestamp"]


def tag_pair(pair: Dict[str, Any], session_id: Optional[str] = None,
             model: Optional[str] = None) -> Dict[str, Any]:
    """Add provenance metadata to a training pair."""
    meta = pair.get("_provenance", {})

    if session_id:
        meta["source_session_id"] = session_id
    if model:
        meta["source_model"] = model
    meta["source_timestamp"] = datetime.now(timezone.utc).isoformat()

    if meta:
        pair["_provenance"] = meta

    return pair


def filter_pairs(input_path: str, output_path: str,
                 include_models: Optional[list] = None,
                 exclude_models: Optional[list] = None,
                 min_session_age: Optional[str] = None) -> Dict[str, Any]:
    """Filter pairs by provenance metadata."""
    kept = []
    removed = []
    errors = 0

    source = sys.stdin if input_path == "-" else open(input_path, "r")

    try:
        for line in source:
            line = line.strip()
            if not line:
                continue
            try:
                pair = json.loads(line)
            except json.JSONDecodeError:
                errors += 1
                continue

            prov = pair.get("_provenance", {})
            model = prov.get("source_model", "unknown")

            should_keep = True

            if include_models:
                should_keep = should_keep and model in include_models

            if exclude_models:
                should_keep = should_keep and model not in exclude_models

            if should_keep:
                kept.append(pair)
            else:
                removed.append(pair)
    finally:
        if source is not sys.stdin:
            source.close()

    # Write output
    if output_path:
        out = sys.stdout if output_path == "-" else open(output_path, "w")
        try:
            for pair in kept:
                out.write(json.dumps(pair, ensure_ascii=False) + "\n")
        finally:
            if out is not sys.stdout:  # never close stdout
                out.close()

    return {
        "total": len(kept) + len(removed),
        "kept": len(kept),
        "filtered_out": len(removed),
        "errors": errors,
    }


def report(input_path: str) -> Dict[str, Any]:
    """Report pair counts by source model and session."""
    model_counts = Counter()
    session_counts = Counter()
    tagged = 0
    untagged = 0
    total = 0
    errors = 0

    source = sys.stdin if input_path == "-" else open(input_path, "r")

    try:
        for line in source:
            line = line.strip()
            if not line:
                continue
            try:
                pair = json.loads(line)
            except json.JSONDecodeError:
                errors += 1
                continue

            total += 1
            prov = pair.get("_provenance", {})

            if prov:
                tagged += 1
                model = prov.get("source_model", "unknown")
                session = prov.get("source_session_id", "unknown")
                model_counts[model] += 1
                session_counts[session] += 1
            else:
                untagged += 1
    finally:
        if source is not sys.stdin:
            source.close()

    return {
        "total": total,
        "tagged": tagged,
        "untagged": untagged,
        "tag_rate": round(tagged / max(total, 1) * 100, 1),
        "by_model": dict(model_counts.most_common(20)),
        "by_session": dict(session_counts.most_common(10)),
        "errors": errors,
    }


def stamp_command(input_path: str, output_path: str,
                  session_id: Optional[str], model: Optional[str]) -> Dict[str, Any]:
    """Tag all pairs in a file with provenance metadata."""
    tagged = 0
    skipped = 0
    errors = 0

    source = sys.stdin if input_path == "-" else open(input_path, "r")
    out = sys.stdout if output_path == "-" else open(output_path, "w")

    try:
        for line in source:
            line = line.strip()
            if not line:
                continue
            try:
                pair = json.loads(line)
            except json.JSONDecodeError:
                errors += 1
                continue

            # Skip if already tagged with the same session and model
            existing = pair.get("_provenance", {})
            if existing.get("source_model") == model and existing.get("source_session_id") == session_id:
                skipped += 1
                out.write(line + "\n")
                continue

            pair = tag_pair(pair, session_id=session_id, model=model)
            out.write(json.dumps(pair, ensure_ascii=False) + "\n")
            tagged += 1
    finally:
        if source is not sys.stdin:
            source.close()
        if out is not sys.stdout:  # never close stdout
            out.close()

    return {"tagged": tagged, "skipped": skipped, "errors": errors}


def main():
    parser = argparse.ArgumentParser(description="Training pair provenance tracking")
    sub = parser.add_subparsers(dest="command", required=True)

    # tag subcommand
    tag_p = sub.add_parser("tag", help="Tag pairs with provenance metadata")
    tag_p.add_argument("input", help="Input JSONL file (use - for stdin)")
    tag_p.add_argument("-o", "--output", default="-", help="Output JSONL file")
    tag_p.add_argument("--session", help="Source session ID")
    tag_p.add_argument("--model", help="Source model name")

    # filter subcommand
    filt_p = sub.add_parser("filter", help="Filter pairs by provenance")
    filt_p.add_argument("input", help="Input JSONL file (use - for stdin)")
    filt_p.add_argument("-o", "--output", default="-", help="Output JSONL file")
    filt_p.add_argument("--include-model", action="append", help="Only include these models")
    filt_p.add_argument("--exclude-model", action="append", help="Exclude these models")

    # report subcommand
    rpt_p = sub.add_parser("report", help="Report provenance statistics")
    rpt_p.add_argument("input", help="Input JSONL file (use - for stdin)")

    args = parser.parse_args()

    if args.command == "tag":
        result = stamp_command(args.input, args.output, args.session, args.model)
        print(f"Tagged: {result['tagged']} Skipped: {result['skipped']} Errors: {result['errors']}", file=sys.stderr)

    elif args.command == "filter":
        result = filter_pairs(
            args.input, args.output,
            include_models=args.include_model,
            exclude_models=args.exclude_model,
        )
        print(f"Total: {result['total']} Kept: {result['kept']} Filtered: {result['filtered_out']}", file=sys.stderr)

    elif args.command == "report":
        result = report(args.input)
        print("Training Pair Provenance Report", file=sys.stderr)
        print("=" * 40, file=sys.stderr)
        print(f"Total pairs: {result['total']}", file=sys.stderr)
        print(f"Tagged: {result['tagged']} ({result['tag_rate']}%)", file=sys.stderr)
        print(f"Untagged: {result['untagged']}", file=sys.stderr)

        if result['by_model']:
            print("\nBy source model:", file=sys.stderr)
            for model, count in result['by_model'].items():
                print(f"  {model}: {count}", file=sys.stderr)

        if result['by_session']:
            print("\nBy source session (top 10):", file=sys.stderr)
            for session, count in result['by_session'].items():
                session_short = session[:12] + "..." if len(session) > 12 else session
                print(f"  {session_short}: {count}", file=sys.stderr)

        # Output JSON to stdout
        print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()
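For reference, a pair tagged by tag_pair carries its original fields plus a _provenance object. A minimal sketch of the shape, with hypothetical values (the "messages" key stands in for whatever fields the pair already has; "abc123" and "nous/hermes-3" are the placeholder values from the usage docstring):

# Hypothetical tagged pair; shape only, all values illustrative
{
    "messages": ["..."],                                  # the pair's own fields, untouched
    "_provenance": {
        "source_session_id": "abc123",                    # from --session
        "source_model": "nous/hermes-3",                  # from --model
        "source_timestamp": "2025-01-01T00:00:00+00:00",  # UTC ISO 8601, stamped automatically
    },
}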
scripts/validate-scene-data.py (new file, 185 lines)
@@ -0,0 +1,185 @@
#!/usr/bin/env python3
"""
validate-scene-data.py — Validate scene description JSONL files against schema.

Usage:
    python3 scripts/validate-scene-data.py training-data/*.jsonl
    python3 scripts/validate-scene-data.py training-data/scene-descriptions-rock.jsonl

Exit codes:
    0 = all entries valid
    1 = validation errors found

Refs: timmy-config#647
"""

import json
import os
import re
import sys
from pathlib import Path

# Try jsonschema, fall back to manual validation
try:
    import jsonschema
    HAS_JSONSCHEMA = True
except ImportError:
    HAS_JSONSCHEMA = False


def load_schema():
    """Load the JSON schema from training-data/schema.json."""
    schema_path = Path(__file__).parent.parent / "training-data" / "schema.json"
    if not schema_path.exists():
        # Try relative to CWD
        schema_path = Path("training-data/schema.json")
    if not schema_path.exists():
        print(f"ERROR: Schema not found at {schema_path}", file=sys.stderr)
        sys.exit(2)
    with open(schema_path) as f:
        return json.load(f)


def validate_entry_manual(entry, index):
    """Manual validation without the jsonschema dependency."""
    errors = []

    # Required top-level fields
    for field in ["song", "artist", "beat", "timestamp", "lyric_line", "scene"]:
        if field not in entry:
            errors.append(f"Missing required field: {field}")

    # Type checks
    if "song" in entry and not isinstance(entry["song"], str):
        errors.append("'song' must be a string")
    if "song" in entry and isinstance(entry["song"], str) and len(entry["song"].strip()) == 0:
        errors.append("'song' must not be empty")

    if "artist" in entry and not isinstance(entry["artist"], str):
        errors.append("'artist' must be a string")
    if "artist" in entry and isinstance(entry["artist"], str) and len(entry["artist"].strip()) == 0:
        errors.append("'artist' must not be empty")

    if "beat" in entry and not isinstance(entry["beat"], int):
        errors.append("'beat' must be an integer")
    if "beat" in entry and isinstance(entry["beat"], int) and entry["beat"] < 1:
        errors.append("'beat' must be >= 1")

    if "timestamp" in entry:
        if not re.match(r'^[0-9]+:[0-5][0-9]$', str(entry["timestamp"])):
            errors.append(f"'timestamp' must be M:SS or MM:SS format, got: {entry['timestamp']}")

    if "lyric_line" in entry and not isinstance(entry["lyric_line"], str):
        errors.append("'lyric_line' must be a string")
    if "lyric_line" in entry and isinstance(entry["lyric_line"], str) and len(entry["lyric_line"].strip()) == 0:
        errors.append("'lyric_line' must not be empty")

    # Scene validation
    if "scene" in entry:
        scene = entry["scene"]
        if not isinstance(scene, dict):
            errors.append("'scene' must be an object")
        else:
            for field in ["mood", "colors", "composition", "description"]:
                if field not in scene:
                    errors.append(f"Missing required scene field: {field}")

            if "mood" in scene and not isinstance(scene["mood"], str):
                errors.append("'scene.mood' must be a string")
            if "mood" in scene and isinstance(scene["mood"], str) and len(scene["mood"].strip()) == 0:
                errors.append("'scene.mood' must not be empty")

            if "colors" in scene:
                if not isinstance(scene["colors"], list):
                    errors.append("'scene.colors' must be an array")
                elif len(scene["colors"]) == 0:
                    errors.append("'scene.colors' must have at least 1 element")
                else:
                    for i, c in enumerate(scene["colors"]):
                        if not isinstance(c, str) or len(c.strip()) == 0:
                            errors.append(f"'scene.colors[{i}]' must be a non-empty string")

            if "composition" in scene and not isinstance(scene["composition"], str):
                errors.append("'scene.composition' must be a string")
            if "composition" in scene and isinstance(scene["composition"], str) and len(scene["composition"].strip()) == 0:
                errors.append("'scene.composition' must not be empty")

            if "description" in scene and not isinstance(scene["description"], str):
                errors.append("'scene.description' must be a string")
            if "description" in scene and isinstance(scene["description"], str) and len(scene["description"]) < 10:
                errors.append(f"'scene.description' too short ({len(scene['description'])} chars, min 10)")

    return errors


def validate_file(filepath, schema):
    """Validate all entries in a JSONL file."""
    errors = []
    total = 0

    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            total += 1

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"  Line {line_num}: Invalid JSON — {e}")
                continue

            if HAS_JSONSCHEMA:
                # jsonschema.validate raises on the first violation,
                # so catch it and record the message
                try:
                    jsonschema.validate(entry, schema)
                    entry_errors = []
                except jsonschema.ValidationError as e:
                    entry_errors = [e.message]
            else:
                entry_errors = validate_entry_manual(entry, line_num)

            for err in entry_errors:
                errors.append(f"  Line {line_num}: {err}")

    return total, errors


def main():
    if len(sys.argv) < 2:
        print("Usage: python3 scripts/validate-scene-data.py <file.jsonl> [file2.jsonl ...]")
        sys.exit(2)

    schema = load_schema()
    total_entries = 0
    total_errors = 0
    files_checked = 0

    for filepath in sys.argv[1:]:
        if not os.path.exists(filepath):
            print(f"SKIP: {filepath} (not found)")
            continue

        files_checked += 1
        count, errors = validate_file(filepath, schema)
        total_entries += count

        if errors:
            total_errors += len(errors)
            print(f"FAIL: {filepath} — {len(errors)} error(s) in {count} entries:")
            for err in errors[:20]:  # Limit output
                print(err)
            if len(errors) > 20:
                print(f"  ... and {len(errors) - 20} more errors")
        else:
            print(f"PASS: {filepath} — {count} entries valid")

    print(f"\nSummary: {files_checked} files, {total_entries} entries, {total_errors} errors")

    if total_errors > 0:
        print("VALIDATION FAILED")
        sys.exit(1)
    else:
        print("ALL VALID")
        sys.exit(0)


if __name__ == "__main__":
    main()
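One caveat with the jsonschema path above: jsonschema.validate stops at the first violation, so each invalid entry reports a single error, while the manual path can report several. If parity is wanted, jsonschema's Draft7Validator can collect every violation per entry; a sketch (the library API is real, the helper name is invented):

from jsonschema import Draft7Validator

def all_schema_errors(entry, schema):
    # iter_errors yields one ValidationError per violation instead of raising
    validator = Draft7Validator(schema)
    return [err.message for err in validator.iter_errors(entry)]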
@@ -1,4 +1,3 @@
#!/usr/bin/env python3
import json
from hermes_tools import browser_navigate, browser_vision
training-data/schema.json (new file, 84 lines)
@@ -0,0 +1,84 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Scene Description Training Entry",
  "description": "Schema for lyrics-to-visual-scene description training data entries.",
  "type": "object",
  "required": ["song", "artist", "beat", "timestamp", "lyric_line", "scene"],
  "properties": {
    "song": {
      "type": "string",
      "minLength": 1,
      "description": "Song title"
    },
    "artist": {
      "type": "string",
      "minLength": 1,
      "description": "Artist name"
    },
    "mood_arc": {
      "type": "string",
      "description": "Overall mood progression of the song (optional)"
    },
    "beat": {
      "type": "integer",
      "minimum": 1,
      "description": "Beat number within the song (1-indexed)"
    },
    "timestamp": {
      "type": "string",
      "pattern": "^[0-9]+:[0-5][0-9]$",
      "description": "Timestamp in M:SS or MM:SS format"
    },
    "duration": {
      "type": "string",
      "description": "Duration of the beat (e.g. '30s', '15s')"
    },
    "duration_seconds": {
      "type": "integer",
      "minimum": 1,
      "description": "Duration in seconds (integer alternative)"
    },
    "lyric_line": {
      "type": "string",
      "minLength": 1,
      "description": "The lyric line for this beat"
    },
    "scene": {
      "type": "object",
      "required": ["mood", "colors", "composition", "description"],
      "properties": {
        "mood": {
          "type": "string",
          "minLength": 1,
          "description": "Emotional mood of the scene"
        },
        "colors": {
          "type": "array",
          "items": { "type": "string", "minLength": 1 },
          "minItems": 1,
          "description": "Color palette for the scene"
        },
        "composition": {
          "type": "string",
          "minLength": 1,
          "description": "Shot composition (e.g. 'wide shot', 'close-up', 'low angle')"
        },
        "camera": {
          "type": "string",
          "description": "Camera movement (e.g. 'static', 'slow zoom', 'tracking')"
        },
        "camera_movement": {
          "type": "string",
          "description": "Alternative field name for camera movement"
        },
        "description": {
          "type": "string",
          "minLength": 10,
          "description": "Full scene description text"
        }
      },
      "additionalProperties": true
    }
  },
  "additionalProperties": true
}
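To make the schema concrete, here is a hypothetical entry that should validate cleanly (every field value is invented for illustration), checked with the same jsonschema.validate call the validator script uses:

import json
import jsonschema

with open("training-data/schema.json") as f:
    schema = json.load(f)

# Hypothetical entry; all values invented for illustration
sample = {
    "song": "Example Song",
    "artist": "Example Artist",
    "beat": 1,
    "timestamp": "0:15",
    "lyric_line": "An example lyric line",
    "scene": {
        "mood": "melancholic",
        "colors": ["slate blue", "amber"],
        "composition": "wide shot",
        "description": "A lone figure on an empty street at dusk.",
    },
}

jsonschema.validate(sample, schema)  # raises jsonschema.ValidationError if invalid
print("sample entry is valid")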