# Extracted from a file-viewer listing (original: 39 lines, 1.1 KiB, YAML).
# Workflow: validates training data files on pull requests.
#
# Runs two checks on an Ubuntu runner with Python 3.11:
#   1. training/data/scene-descriptions/validate.py against the *.jsonl files
#      in that directory.
#   2. An advisory provenance coverage report for training/data/*.jsonl.
name: Validate Training Data

on:
  pull_request:
    # Only run when training data, its schema, or its validator changes.
    paths:
      - 'training/data/**/*.jsonl'
      - 'training/data/**/schema.json'
      - 'training/data/**/validate.py'

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is a string, not a float.
          python-version: '3.11'

      - name: Validate scene description JSONL
        # The shell expands *.jsonl relative to the scene-descriptions
        # directory before validate.py is invoked.
        run: |
          cd training/data/scene-descriptions
          python3 validate.py *.jsonl

      - name: Validate training data provenance
        # The script is fed to Python through a quoted heredoc rather than
        # `python3 -c "..."`: the code contains double quotes
        # (report["coverage"], ...), which would terminate a double-quoted
        # shell string and leave Python with bare, undefined names.
        # Quoting the delimiter ('PY') disables all shell expansion in the body.
        #
        # NOTE(review): the glob below is not recursive, while the trigger
        # paths above match nested directories -- confirm that only top-level
        # data/*.jsonl files are meant to be checked here.
        # NOTE(review): assumes training_pair_provenance is importable from
        # the training/ directory -- confirm.
        run: |
          cd training
          python3 - <<'PY'
          import glob
          import sys

          from training_pair_provenance import validate_provenance

          # sorted() keeps the report order stable between runs.
          for path in sorted(glob.glob('data/*.jsonl')):
              report = validate_provenance(path)
              print(
                  f'{path}: {report["coverage"]:.0f}% coverage '
                  f'({report["with_provenance"]}/{report["total"]})'
              )
              if report['missing_provenance'] > 0:
                  print(f' WARNING: {report["missing_provenance"]} pairs missing provenance')

          # Advisory only: missing provenance is reported as a warning and
          # never fails the job.
          sys.exit(0)
          PY