81 lines
2.3 KiB
Python
81 lines
2.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Validate provenance metadata on training pairs.
|
|
|
|
Usage:
|
|
python validate_provenance.py data/merged_training_data.jsonl
|
|
python validate_provenance.py data/curated_dataset.jsonl --strict
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
try:
|
|
from training_pair_provenance import validate_provenance, get_provenance_stats, print_provenance_report
|
|
except ImportError:
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
from training_pair_provenance import validate_provenance, get_provenance_stats, print_provenance_report
|
|
|
|
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSONL file into a list of parsed records.

    Each non-blank line is parsed as one JSON document. Blank and
    whitespace-only lines are skipped.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        The parsed value of every non-blank line, in file order. An empty
        file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    entries = []
    # Decode explicitly as UTF-8 rather than relying on the locale's default
    # encoding, which varies between machines.
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type, but say which line of the file is bad;
                # the decoder only knows the position within this one line.
                raise json.JSONDecodeError(
                    f"{path}, line {lineno}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return entries
|
|
|
|
|
|
def main():
    """Command-line entry point: validate provenance on a JSONL file.

    Loads every pair from the ``input`` path, runs ``validate_provenance``
    on each one, and prints a pass/fail summary that lists at most the
    first ten failing pairs. With ``--report`` it also prints the
    provenance statistics report.

    Exit status:
        0: every pair validated, or problems were found but ``--strict``
           was not given (problems are reported only).
        1: the input path does not exist, or ``--strict`` was given and
           at least one pair failed validation.
    """
    max_shown = 10  # cap on failing pairs listed individually

    parser = argparse.ArgumentParser(description="Validate provenance metadata")
    parser.add_argument("input", type=str, help="Path to JSONL training data")
    parser.add_argument("--strict", action="store_true", help="Fail on any missing provenance")
    parser.add_argument("--report", action="store_true", help="Print detailed report")
    args = parser.parse_args()

    input_path = Path(args.input)
    if not input_path.exists():
        # Fatal diagnostics go to stderr so they are not mixed into the report.
        print(f"ERROR: {input_path} not found", file=sys.stderr)
        sys.exit(1)

    pairs = load_jsonl(input_path)
    print(f"Loaded {len(pairs)} pairs from {input_path}")

    # Validate each pair, remembering the index and messages of every failure.
    errors = []
    for i, pair in enumerate(pairs):
        valid, pair_errors = validate_provenance(pair)
        if not valid:
            errors.append((i, pair_errors))

    # Print results
    if errors:
        print(f"\nFAILED: {len(errors)} pairs with provenance issues")
        for idx, pair_errors in errors[:max_shown]:
            print(f"  Pair {idx}: {', '.join(pair_errors)}")
        if len(errors) > max_shown:
            print(f"  ... and {len(errors) - max_shown} more")
    else:
        print("\nPASSED: All pairs have valid provenance")

    # Stats are only needed for the detailed report, so compute them lazily.
    if args.report:
        stats = get_provenance_stats(pairs)
        print()
        print_provenance_report(stats)

    # CI gate: only --strict turns validation problems into a failing exit
    # status. Previously the script exited 1 on any error regardless of the
    # flag, which made --strict a no-op.
    if args.strict and errors:
        print("\nStrict mode: failing due to provenance errors")
        sys.exit(1)

    sys.exit(0)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|