From 7c716f8ca14155ead348599b99190d8a469d6e24 Mon Sep 17 00:00:00 2001
From: Merge Bot
Date: Thu, 16 Apr 2026 05:07:23 +0000
Subject: [PATCH] Merge PR #760: .gitea/workflows/validate-training-data.yml (changed)

---
 .gitea/workflows/validate-training-data.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.gitea/workflows/validate-training-data.yml b/.gitea/workflows/validate-training-data.yml
index 9ba8ff8a..449fdf98 100644
--- a/.gitea/workflows/validate-training-data.yml
+++ b/.gitea/workflows/validate-training-data.yml
@@ -22,3 +22,17 @@ jobs:
         run: |
           cd training/data/scene-descriptions
           python3 validate.py *.jsonl
+      - name: Validate training data provenance
+        run: |
+          cd training
+          # Advisory check: missing provenance only warns; it does not fail the job.
+          # Quoted heredoc keeps the double quotes inside the f-strings away from the shell.
+          python3 - <<'PY'
+          import glob
+          from training_pair_provenance import validate_provenance
+          for f in glob.glob('data/*.jsonl'):
+              report = validate_provenance(f)
+              print(f'{f}: {report["coverage"]:.0f}% coverage ({report["with_provenance"]}/{report["total"]})')
+              if report['missing_provenance'] > 0:
+                  print(f' WARNING: {report["missing_provenance"]} pairs missing provenance')
+          PY