#!/usr/bin/env python3
"""
Validate provenance metadata on training pairs.

Usage:
    python validate_provenance.py data/merged_training_data.jsonl
    python validate_provenance.py data/curated_dataset.jsonl --strict
"""

import argparse
import json
import sys
from pathlib import Path

try:
    from training_pair_provenance import validate_provenance, get_provenance_stats, print_provenance_report
except ImportError:
    # Fall back to the directory containing this script so the helper module
    # resolves even when that directory is not already on sys.path.
    sys.path.insert(0, str(Path(__file__).parent))
    from training_pair_provenance import validate_provenance, get_provenance_stats, print_provenance_report


def load_jsonl(path: Path) -> list[dict]:
    """Load a JSONL file, skipping blank lines.

    Args:
        path: File to read. It is decoded as UTF-8 regardless of the
            platform's default encoding.

    Returns:
        The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and the 1-based line number of the offending line.
    """
    entries = []
    # Explicit encoding: the locale default is not UTF-8 on every platform.
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Re-raise as the same exception type, but say which line of
                # the file is bad; json.loads only knows about the single
                # line it was handed.
                raise json.JSONDecodeError(
                    f"{path}, line {lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return entries


def main():
    """Validate every pair in a JSONL file and exit with a CI-friendly status.

    Parses the command line, loads the input file, runs
    ``validate_provenance`` on each entry, prints a summary (the first 10
    failing pairs, by 0-based index), and optionally prints the provenance
    report when ``--report`` is given.

    Exit status is 0 when every pair validates and 1 when the input is
    missing, cannot be loaded, or any pair fails validation.
    """
    parser = argparse.ArgumentParser(description="Validate provenance metadata")
    parser.add_argument("input", type=str, help="Path to JSONL training data")
    parser.add_argument("--strict", action="store_true", help="Fail on any missing provenance")
    parser.add_argument("--report", action="store_true", help="Print detailed report")
    args = parser.parse_args()

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"ERROR: {input_path} not found")
        sys.exit(1)

    # Unreadable, non-UTF-8, or malformed input is reported in the same
    # "ERROR: ..." style as a missing file instead of as a raw traceback.
    # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
    try:
        pairs = load_jsonl(input_path)
    except (OSError, ValueError) as err:
        print(f"ERROR: could not load {input_path}: {err}")
        sys.exit(1)
    print(f"Loaded {len(pairs)} pairs from {input_path}")

    # Validate each pair, remembering the index of every failure.
    errors = []
    for i, pair in enumerate(pairs):
        valid, pair_errors = validate_provenance(pair)
        if not valid:
            errors.append((i, pair_errors))

    # Print results
    if errors:
        print(f"\nFAILED: {len(errors)} pairs with provenance issues")
        for idx, pair_errors in errors[:10]:  # Show first 10
            print(f" Pair {idx}: {', '.join(pair_errors)}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")
    else:
        print("\nPASSED: All pairs have valid provenance")

    # Stats
    stats = get_provenance_stats(pairs)
    if args.report:
        print()
        print_provenance_report(stats)

    # CI gate
    # NOTE(review): the exit status is 1 whenever there are errors, with or
    # without --strict; the flag currently only adds the message below.
    # Confirm whether non-strict runs are meant to exit 0 before changing it.
    if args.strict and errors:
        print("\nStrict mode: failing due to provenance errors")
    sys.exit(1 if errors else 0)


if __name__ == "__main__":
    main()