#!/usr/bin/env python3
"""
provenance_validate.py — Validate provenance metadata on training data.

Checks:
- All pairs have required provenance fields
- source_session_id, model, timestamp present
- Coverage report by model and source

Usage:
    python3 provenance_validate.py training-data/*.jsonl
    python3 provenance_validate.py --threshold 80 training-data/*.jsonl
"""
import glob
import json
import sys
from collections import Counter
from pathlib import Path
from typing import List

# Fields every training pair must carry to count as "with provenance".
REQUIRED_FIELDS = ["source_session_id", "model", "timestamp"]


def validate_file(filepath: str) -> dict:
    """Validate provenance on a single JSONL file.

    Args:
        filepath: Path to a JSONL file, one JSON object ("pair") per line.

    Returns:
        Dict with the file name, pair totals, coverage percentage, per-field
        missing counts, and tallies keyed by model and by source.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be read.
    """
    # Explicit UTF-8: don't depend on the platform's default locale encoding.
    with open(filepath, encoding="utf-8") as fh:
        pairs = [json.loads(line) for line in fh if line.strip()]

    total = len(pairs)
    with_provenance = 0
    missing_by_field = {field: 0 for field in REQUIRED_FIELDS}
    by_model: Counter = Counter()
    by_source: Counter = Counter()

    for pair in pairs:
        has_all = True
        for field in REQUIRED_FIELDS:
            # Absent key and present-but-falsy value (e.g. "") both count
            # as missing provenance.
            if not pair.get(field):
                missing_by_field[field] += 1
                has_all = False
        if has_all:
            with_provenance += 1

        by_model[pair.get("model", "unknown")] += 1
        # Prefer the newer "source_type" key; fall back to legacy "source".
        by_source[pair.get("source_type", pair.get("source", "unknown"))] += 1

    coverage = (with_provenance / total * 100) if total > 0 else 0
    return {
        "file": str(filepath),
        "total": total,
        "with_provenance": with_provenance,
        "coverage_pct": round(coverage, 1),
        "missing_by_field": missing_by_field,
        # Convert Counters back to plain dicts so the result shape is
        # identical to the previous implementation (and JSON-friendly).
        "by_model": dict(by_model),
        "by_source": dict(by_source),
    }


def validate_all(files: List[str], threshold: float = 0) -> dict:
    """Validate provenance across multiple files.

    Args:
        files: JSONL file paths to validate.
        threshold: Minimum overall coverage percentage required to pass.

    Returns:
        Dict with per-file results plus aggregate pair counts, overall
        coverage percentage, and whether the aggregate meets the threshold.
    """
    results = [validate_file(path) for path in files]
    total_pairs = sum(r["total"] for r in results)
    total_with_prov = sum(r["with_provenance"] for r in results)
    overall_coverage = (total_with_prov / total_pairs * 100) if total_pairs > 0 else 0
    return {
        "files": results,
        "total_pairs": total_pairs,
        "total_with_provenance": total_with_prov,
        "overall_coverage_pct": round(overall_coverage, 1),
        "passes_threshold": overall_coverage >= threshold,
    }


def main():
    """CLI entry point: parse args, validate files, report, set exit code."""
    import argparse

    parser = argparse.ArgumentParser(description="Validate training data provenance")
    parser.add_argument("files", nargs="+", help="JSONL files to validate")
    parser.add_argument("--threshold", type=float, default=0,
                        help="Minimum coverage percentage to pass")
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()

    # Expand globs ourselves so the tool also works from shells that don't
    # (e.g. Windows cmd). glob.glob handles both relative and absolute
    # patterns (Path(".").glob rejects absolute ones), treats a literal path
    # as itself, and only returns paths that actually exist.
    files = []
    for pattern in args.files:
        files.extend(glob.glob(pattern))

    if not files:
        print("No files found", file=sys.stderr)
        sys.exit(1)

    result = validate_all(files, args.threshold)

    if args.json:
        print(json.dumps(result, indent=2))
    else:
        print(f"\n{'='*50}")
        print(" PROVENANCE VALIDATION REPORT")
        print(f"{'='*50}")
        print(f" Total pairs: {result['total_pairs']}")
        print(f" With provenance: {result['total_with_provenance']}")
        print(f" Coverage: {result['overall_coverage_pct']}%")
        if args.threshold > 0:
            status = "PASS" if result["passes_threshold"] else "FAIL"
            print(f" Threshold: {args.threshold}% [{status}]")
        print(f"\n Per file:")
        for f in result["files"]:
            icon = "✓" if f["coverage_pct"] >= args.threshold else "✗"
            print(f" {icon} {f['file']}: {f['coverage_pct']}% ({f['with_provenance']}/{f['total']})")
        print(f"{'='*50}\n")

    # Non-zero exit when a requested threshold is not met (CI-friendly).
    if args.threshold > 0 and not result["passes_threshold"]:
        sys.exit(1)
    sys.exit(0)


if __name__ == "__main__":
    main()