Files
timmy-config/training/validate_provenance.py

81 lines
2.3 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Validate provenance metadata on training pairs.
Usage:
python validate_provenance.py data/merged_training_data.jsonl
python validate_provenance.py data/curated_dataset.jsonl --strict
"""
import argparse
import json
import sys
from pathlib import Path
try:
from training_pair_provenance import validate_provenance, get_provenance_stats, print_provenance_report
except ImportError:
sys.path.insert(0, str(Path(__file__).parent))
from training_pair_provenance import validate_provenance, get_provenance_stats, print_provenance_report
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSONL file into a list of parsed records.

    Blank (whitespace-only) lines are skipped. The file is always decoded as
    UTF-8 so the result does not depend on the platform's default encoding.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the offending line of the input file.
    """
    entries = []
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type, but say which line of the
                # input file is broken; json only knows about the one-line
                # string it was handed.
                raise json.JSONDecodeError(
                    f"{exc.msg} (input line {lineno} of {path})",
                    exc.doc,
                    exc.pos,
                ) from exc
    return entries
def main():
    """Command-line entry point: validate provenance on a JSONL file of pairs.

    Parses the command line, loads the input file, runs
    ``validate_provenance`` on every pair, prints a summary (details for at
    most the first 10 failing pairs), and optionally prints the provenance
    report when ``--report`` is given.

    Always terminates the process via ``sys.exit``: status 0 when every pair
    validates, status 1 when the input is missing or unreadable or when any
    pair fails validation.
    """
    parser = argparse.ArgumentParser(description="Validate provenance metadata")
    parser.add_argument("input", type=str, help="Path to JSONL training data")
    parser.add_argument("--strict", action="store_true", help="Fail on any missing provenance")
    parser.add_argument("--report", action="store_true", help="Print detailed report")
    args = parser.parse_args()

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"ERROR: {input_path} not found", file=sys.stderr)
        sys.exit(1)

    # A malformed or unreadable file is a user-facing input problem, so report
    # it as a one-line error instead of a traceback. JSON and Unicode decode
    # errors are both ValueError subclasses; OSError covers e.g. a directory
    # or a permission problem.
    try:
        pairs = load_jsonl(input_path)
    except (OSError, ValueError) as exc:
        print(f"ERROR: could not load {input_path}: {exc}", file=sys.stderr)
        sys.exit(1)
    print(f"Loaded {len(pairs)} pairs from {input_path}")

    # Validate each pair, remembering the index of every failing one.
    errors = []
    for i, pair in enumerate(pairs):
        valid, pair_errors = validate_provenance(pair)
        if not valid:
            errors.append((i, pair_errors))

    # Print results
    if errors:
        print(f"\nFAILED: {len(errors)} pairs with provenance issues")
        for idx, pair_errors in errors[:10]:  # Show first 10
            print(f"  Pair {idx}: {', '.join(pair_errors)}")
        if len(errors) > 10:
            print(f"  ... and {len(errors) - 10} more")
    else:
        print("\nPASSED: All pairs have valid provenance")

    # Stats are only needed for the detailed report, so compute them lazily.
    if args.report:
        stats = get_provenance_stats(pairs)
        print()
        print_provenance_report(stats)

    # CI gate
    # NOTE(review): the exit status is 1 whenever there are errors, with or
    # without --strict; the flag currently only adds the message below.
    # Confirm whether non-strict runs are meant to exit 0.
    if args.strict and errors:
        print("\nStrict mode: failing due to provenance errors")
    sys.exit(1 if errors else 0)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()