diff --git a/.gitea/workflows/validate-training-data.yml b/.gitea/workflows/validate-training-data.yml
index 9ba8ff8a..449fdf98 100644
--- a/.gitea/workflows/validate-training-data.yml
+++ b/.gitea/workflows/validate-training-data.yml
@@ -22,3 +22,24 @@ jobs:
         run: |
           cd training/data/scene-descriptions
           python3 validate.py *.jsonl
+      - name: Validate training data provenance
+        run: |
+          cd training
+          # Feed the script through a quoted heredoc so the shell performs no quote
+          # processing: inside python3 -c "..." the double quotes in report["coverage"]
+          # ended the shell string and Python received report[coverage] (NameError).
+          python3 - <<'PY'
+          import glob
+          import sys
+
+          from training_pair_provenance import validate_provenance
+
+          # Advisory check: print per-file provenance coverage and warn about gaps.
+          # The step always exits 0, so missing provenance does not fail the job.
+          for path in sorted(glob.glob('data/*.jsonl')):
+              report = validate_provenance(path)
+              print(f'{path}: {report["coverage"]:.0f}% coverage ({report["with_provenance"]}/{report["total"]})')
+              if report['missing_provenance'] > 0:
+                  print(f' WARNING: {report["missing_provenance"]} pairs missing provenance')
+          sys.exit(0)
+          PY