Files
timmy-config/.gitea/workflows/validate-training-data.yml

39 lines
1.1 KiB
YAML

# Workflow that checks training-data files when a pull request touches them.
name: Validate Training Data

# NOTE: generic YAML 1.1 parsers read a bare `on` key as a boolean; the
# workflow loader treats it as the trigger key, so it is left unquoted.
on:
  pull_request:
    # Run only when a JSONL data file, a schema, or a validator script under
    # training/data changes.
    paths:
      - 'training/data/**/*.jsonl'
      - 'training/data/**/schema.json'
      - 'training/data/**/validate.py'
jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is passed as a string, not parsed as a float.
          python-version: '3.11'

      - name: Validate scene description JSONL
        run: |
          cd training/data/scene-descriptions
          python3 validate.py *.jsonl

      - name: Validate training data provenance
        # The Python is fed through a quoted heredoc instead of
        # `python3 -c "..."`: the script itself contains double quotes
        # (report["coverage"], ...), which a surrounding shell double-quoted
        # string would strip, leaving `report[coverage]` and a NameError.
        # `python3 -` still puts the current directory on sys.path, so the
        # local training_pair_provenance module remains importable.
        run: |
          cd training
          python3 - <<'PY'
          import glob
          import sys

          from training_pair_provenance import validate_provenance

          # NOTE(review): this glob is not recursive, so JSONL files in
          # subdirectories (e.g. data/scene-descriptions/) are not checked
          # here -- confirm that is intended.
          for path in sorted(glob.glob('data/*.jsonl')):
              report = validate_provenance(path)
              print(f'{path}: {report["coverage"]:.0f}% coverage ({report["with_provenance"]}/{report["total"]})')
              if report['missing_provenance'] > 0:
                  print(f' WARNING: {report["missing_provenance"]} pairs missing provenance')

          # Report-only step: missing provenance is printed as a warning and
          # never fails the job.
          sys.exit(0)
          PY