---
# Runs the training-data validators on pull requests that touch JSONL data,
# a schema.json, or a validate.py script anywhere under training/data/.
name: Validate Training Data

on:
  pull_request:
    paths:
      - 'training/data/**/*.jsonl'
      - 'training/data/**/schema.json'
      - 'training/data/**/validate.py'

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is read as a string, not a float.
          python-version: '3.11'

      - name: Validate scene description JSONL
        run: |
          cd training/data/scene-descriptions
          python3 validate.py *.jsonl

      # Prints provenance coverage for each JSONL file directly under
      # training/data/. This step is report-only: it warns about pairs that
      # lack provenance but always exits 0, so it does not fail the job on
      # missing provenance.
      # NOTE(review): the glob below is non-recursive, while the trigger
      # paths above use '**' -- confirm that files in subdirectories are
      # intentionally skipped here.
      - name: Validate training data provenance
        # The script is passed on stdin through a quoted heredoc (<<'PY') so
        # the shell hands it to Python verbatim. Embedding it in
        # `python3 -c "..."` let the shell strip the double quotes around the
        # dict keys, turning report["coverage"] into report[coverage].
        run: |
          cd training
          python3 - <<'PY'
          from training_pair_provenance import validate_provenance
          import glob
          import sys

          for f in glob.glob('data/*.jsonl'):
              report = validate_provenance(f)
              print(
                  f'{f}: {report["coverage"]:.0f}% coverage '
                  f'({report["with_provenance"]}/{report["total"]})'
              )
              if report['missing_provenance'] > 0:
                  print(f' WARNING: {report["missing_provenance"]} pairs missing provenance')

          # Warning-only check: never block the pull request.
          sys.exit(0)
          PY