diff --git a/.gitea/workflows/validate-training-data.yml b/.gitea/workflows/validate-training-data.yml index 449fdf98..b257fd09 100644 --- a/.gitea/workflows/validate-training-data.yml +++ b/.gitea/workflows/validate-training-data.yml @@ -5,7 +5,9 @@ on: paths: - 'training/data/**/*.jsonl' - 'training/data/**/schema.json' - - 'training/data/**/validate.py' + - 'training-data/*.jsonl' + - 'training-data/schema.json' + - 'scripts/validate-scene-data.py' jobs: validate: @@ -18,21 +20,11 @@ jobs: with: python-version: '3.11' - - name: Validate scene description JSONL + - name: Validate scene descriptions (training-data/) + run: | + python3 scripts/validate-scene-data.py training-data/scene-descriptions-*.jsonl + + - name: Validate scene descriptions (training/data/) run: | cd training/data/scene-descriptions python3 validate.py *.jsonl - - name: Validate training data provenance - run: | - cd training - python3 -c " - from training_pair_provenance import validate_provenance - import json, sys, glob - issues = 0 - for f in glob.glob('data/*.jsonl'): - report = validate_provenance(f) - print(f'{f}: {report["coverage"]:.0f}% coverage ({report["with_provenance"]}/{report["total"]})') - if report['missing_provenance'] > 0: - print(f' WARNING: {report["missing_provenance"]} pairs missing provenance') - sys.exit(0) - "