136
scripts/provenance_validate.py
Normal file
136
scripts/provenance_validate.py
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
provenance_validate.py — Validate provenance metadata on training data.
|
||||
|
||||
Checks:
|
||||
- All pairs have required provenance fields
|
||||
- source_session_id, model, timestamp present
|
||||
- Coverage report by model and source
|
||||
|
||||
Usage:
|
||||
python3 provenance_validate.py training-data/*.jsonl
|
||||
python3 provenance_validate.py --threshold 80 training-data/*.jsonl
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
REQUIRED_FIELDS = ["source_session_id", "model", "timestamp"]
|
||||
|
||||
|
||||
def validate_file(filepath: str) -> dict:
|
||||
"""Validate provenance on a single JSONL file."""
|
||||
pairs = []
|
||||
with open(filepath) as f:
|
||||
for line in f:
|
||||
if line.strip():
|
||||
pairs.append(json.loads(line))
|
||||
|
||||
total = len(pairs)
|
||||
with_provenance = 0
|
||||
missing_by_field = {f: 0 for f in REQUIRED_FIELDS}
|
||||
by_model = {}
|
||||
by_source = {}
|
||||
|
||||
for pair in pairs:
|
||||
has_all = True
|
||||
for field in REQUIRED_FIELDS:
|
||||
if field not in pair or not pair[field]:
|
||||
missing_by_field[field] += 1
|
||||
has_all = False
|
||||
|
||||
if has_all:
|
||||
with_provenance += 1
|
||||
model = pair.get("model", "unknown")
|
||||
source = pair.get("source_type", pair.get("source", "unknown"))
|
||||
by_model[model] = by_model.get(model, 0) + 1
|
||||
by_source[source] = by_source.get(source, 0) + 1
|
||||
|
||||
coverage = (with_provenance / total * 100) if total > 0 else 0
|
||||
|
||||
return {
|
||||
"file": str(filepath),
|
||||
"total": total,
|
||||
"with_provenance": with_provenance,
|
||||
"coverage_pct": round(coverage, 1),
|
||||
"missing_by_field": missing_by_field,
|
||||
"by_model": by_model,
|
||||
"by_source": by_source,
|
||||
}
|
||||
|
||||
|
||||
def validate_all(files: List[str], threshold: float = 0) -> dict:
|
||||
"""Validate provenance across multiple files."""
|
||||
results = []
|
||||
total_pairs = 0
|
||||
total_with_prov = 0
|
||||
|
||||
for filepath in files:
|
||||
result = validate_file(filepath)
|
||||
results.append(result)
|
||||
total_pairs += result["total"]
|
||||
total_with_prov += result["with_provenance"]
|
||||
|
||||
overall_coverage = (total_with_prov / total_pairs * 100) if total_pairs > 0 else 0
|
||||
|
||||
return {
|
||||
"files": results,
|
||||
"total_pairs": total_pairs,
|
||||
"total_with_provenance": total_with_prov,
|
||||
"overall_coverage_pct": round(overall_coverage, 1),
|
||||
"passes_threshold": overall_coverage >= threshold,
|
||||
}
|
||||
|
||||
|
||||
def main():
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description="Validate training data provenance")
|
||||
parser.add_argument("files", nargs="+", help="JSONL files to validate")
|
||||
parser.add_argument("--threshold", type=float, default=0,
|
||||
help="Minimum coverage percentage to pass")
|
||||
parser.add_argument("--json", action="store_true", help="JSON output")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Expand globs
|
||||
files = []
|
||||
for pattern in args.files:
|
||||
expanded = list(Path(".").glob(pattern)) if "*" in pattern else [Path(pattern)]
|
||||
files.extend(str(f) for f in expanded if f.exists())
|
||||
|
||||
if not files:
|
||||
print("No files found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
result = validate_all(files, args.threshold)
|
||||
|
||||
if args.json:
|
||||
print(json.dumps(result, indent=2))
|
||||
else:
|
||||
print(f"\n{'='*50}")
|
||||
print(" PROVENANCE VALIDATION REPORT")
|
||||
print(f"{'='*50}")
|
||||
print(f" Total pairs: {result['total_pairs']}")
|
||||
print(f" With provenance: {result['total_with_provenance']}")
|
||||
print(f" Coverage: {result['overall_coverage_pct']}%")
|
||||
|
||||
if args.threshold > 0:
|
||||
status = "PASS" if result["passes_threshold"] else "FAIL"
|
||||
print(f" Threshold: {args.threshold}% [{status}]")
|
||||
|
||||
print(f"\n Per file:")
|
||||
for f in result["files"]:
|
||||
icon = "✓" if f["coverage_pct"] >= args.threshold else "✗"
|
||||
print(f" {icon} {f['file']}: {f['coverage_pct']}% ({f['with_provenance']}/{f['total']})")
|
||||
|
||||
print(f"{'='*50}\n")
|
||||
|
||||
# Exit code
|
||||
if args.threshold > 0 and not result["passes_threshold"]:
|
||||
sys.exit(1)
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user