#!/usr/bin/env python3
"""
backfill_training_provenance.py — Add provenance to all training data files.

Runs the backfill function from training.provenance on all JSONL files
in training-data/ and training/data/.

Usage:
    python3 scripts/backfill_training_provenance.py
    python3 scripts/backfill_training_provenance.py --dry-run
    python3 scripts/backfill_training_provenance.py --json
"""

import json
import os
import sys
from pathlib import Path
from datetime import datetime, timezone

# Add training to path
sys.path.insert(0, str(Path(__file__).parent.parent / "training"))
from provenance import add_provenance

# Directories that are searched recursively for *.jsonl files.
DATA_DIRS = [
    Path.home() / "timmy-config" / "training-data",
    Path.home() / "timmy-config" / "training" / "data",
]


def backfill_file(filepath: Path, dry_run: bool = False) -> dict:
    """Add provenance to a single JSONL file.

    Every non-blank line is parsed as JSON. A record (JSON object) whose
    "source_session_id" key is missing or falsy is replaced by the return
    value of ``add_provenance(record, session_id="backfill",
    model="unknown", source_type="backfill")``. Lines that are not valid
    JSON, or that decode to something other than an object, are counted as
    parse errors and kept verbatim so that rewriting the file never drops
    data.

    Args:
        filepath: Path of the JSONL file to process.
        dry_run: When true, compute the statistics but never write.

    Returns:
        A dict with the keys "file", "total" (records processed), "added",
        "already_had" and "parse_errors".
    """
    # Each entry is (is_record, value): a record to serialize, or the
    # stripped text of a line that must be written back unchanged.
    entries = []
    added = 0
    already_had = 0
    parse_errors = 0

    with open(filepath, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            try:
                pair = json.loads(line)
            except json.JSONDecodeError:
                pair = None

            # Only JSON objects can carry provenance; anything else is
            # preserved as-is instead of being dropped or crashing.
            if not isinstance(pair, dict):
                parse_errors += 1
                entries.append((False, line))
                continue

            if pair.get("source_session_id"):
                already_had += 1
            else:
                pair = add_provenance(
                    pair,
                    session_id="backfill",
                    model="unknown",
                    source_type="backfill",
                )
                added += 1
            entries.append((True, pair))

    if not dry_run and added > 0:
        # Serialize everything before opening the file for writing, so a
        # serialization failure cannot leave a truncated file behind.
        serialized = [
            json.dumps(value, ensure_ascii=False) if is_record else value
            for is_record, value in entries
        ]
        with open(filepath, "w", encoding="utf-8") as f:
            f.writelines(text + "\n" for text in serialized)

    return {
        "file": str(filepath),
        "total": added + already_had,
        "added": added,
        "already_had": already_had,
        "parse_errors": parse_errors,
    }


def main():
    """Run the backfill over every *.jsonl file under DATA_DIRS and report.

    Supports ``--dry-run`` (do not write changes) and ``--json`` (print the
    per-file results and totals as JSON instead of the text summary).
    Directories in DATA_DIRS that do not exist are skipped.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Backfill provenance on training data")
    parser.add_argument("--dry-run", action="store_true", help="Don't write changes")
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()

    results = []
    for data_dir in DATA_DIRS:
        if not data_dir.exists():
            continue
        # Sorted so files are processed and reported in a stable order.
        for filepath in sorted(data_dir.rglob("*.jsonl")):
            results.append(backfill_file(filepath, dry_run=args.dry_run))

    total_pairs = sum(r["total"] for r in results)
    total_added = sum(r["added"] for r in results)
    total_already_had = sum(r["already_had"] for r in results)
    total_parse_errors = sum(r["parse_errors"] for r in results)

    if args.json:
        print(json.dumps({
            "results": results,
            "total_pairs": total_pairs,
            "total_added": total_added,
            "total_parse_errors": total_parse_errors,
        }, indent=2))
    else:
        print(f"\nProvenance Backfill {'(dry run)' if args.dry_run else ''}")
        print(f"{'='*50}")
        print(f"Files processed: {len(results)}")
        print(f"Total pairs: {total_pairs}")
        print(f"Provenance added: {total_added}")
        print(f"Already had: {total_already_had}")
        # Surface skipped lines instead of hiding them in the per-file data.
        if total_parse_errors:
            print(f"Parse errors: {total_parse_errors}")
        print(f"{'='*50}")


if __name__ == "__main__":
    main()