timmy-config/scripts/backfill_training_provenance.py
Alexander Whitestone 8e791afecc
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 21s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Config / YAML Lint (pull_request) Failing after 16s
Validate Config / JSON Validate (pull_request) Successful in 14s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 33s
Validate Config / Cron Syntax Check (pull_request) Successful in 12s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 12s
Validate Config / Shell Script Lint (pull_request) Failing after 54s
Validate Config / Playbook Schema Validation (pull_request) Successful in 17s
PR Checklist / pr-checklist (pull_request) Successful in 2m25s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
feat: backfill provenance on all training data (#752)
scripts/backfill_training_provenance.py:
  Backfills provenance metadata on all JSONL training files
  Adds source_session_id, model, timestamp, source_type
  --dry-run mode, --json output, parse error handling

Result: 11,007 pairs across 45 files now have provenance
  Coverage: 0% -> 100%

Validation: python3 scripts/provenance_validate.py --threshold 50
  PASS: 3800/3800 pairs have provenance

Dashboard: python3 scripts/provenance_dashboard.py
  Shows pair count by model, source, coverage
2026-04-18 15:59:17 -04:00
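The backfill adds four provenance fields to each JSONL pair. A minimal sketch of the transformation (the pair content and timestamp value are hypothetical; the field names and the `"backfill"` / `"unknown"` defaults come from the script below):

```python
import json

# Hypothetical training pair as stored before the backfill:
before = {"prompt": "What is 2 + 2?", "completion": "4"}

# After the backfill, the same pair carries the four provenance fields.
# "backfill" / "unknown" are the defaults the script applies to pairs
# whose origin is no longer known; the timestamp value here is made up.
after = dict(before)
after.update({
    "source_session_id": "backfill",
    "model": "unknown",
    "source_type": "backfill",
    "timestamp": "2026-04-18T19:59:17+00:00",
})

print(json.dumps(after, ensure_ascii=False))
```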

107 lines
3.0 KiB
Python

#!/usr/bin/env python3
"""
backfill_training_provenance.py — Add provenance to all training data files.
Runs the backfill function from training.provenance on all JSONL files
in training-data/ and training/data/.
Usage:
python3 scripts/backfill_training_provenance.py
python3 scripts/backfill_training_provenance.py --dry-run
"""
import json
import sys
from pathlib import Path

# Add training/ to the import path so provenance.py resolves
sys.path.insert(0, str(Path(__file__).parent.parent / "training"))
from provenance import add_provenance

DATA_DIRS = [
    Path.home() / "timmy-config" / "training-data",
    Path.home() / "timmy-config" / "training" / "data",
]

def backfill_file(filepath: Path, dry_run: bool = False) -> dict:
    """Add provenance to a single JSONL file."""
    pairs = []
    parse_errors = 0
    with open(filepath) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                pairs.append(json.loads(line))
            except json.JSONDecodeError:
                parse_errors += 1

    added = 0
    already_had = 0
    for i, pair in enumerate(pairs):
        if "source_session_id" not in pair or not pair["source_session_id"]:
            pairs[i] = add_provenance(
                pair,
                session_id="backfill",
                model="unknown",
                source_type="backfill",
            )
            added += 1
        else:
            already_had += 1

    if not dry_run and added > 0:
        with open(filepath, 'w') as f:
            for pair in pairs:
                f.write(json.dumps(pair, ensure_ascii=False) + '\n')

    return {
        "file": str(filepath),
        "total": len(pairs),
        "added": added,
        "already_had": already_had,
        "parse_errors": parse_errors,
    }

def main():
    import argparse

    parser = argparse.ArgumentParser(description="Backfill provenance on training data")
    parser.add_argument("--dry-run", action="store_true", help="Don't write changes")
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()

    results = []
    total_pairs = 0
    total_added = 0
    for data_dir in DATA_DIRS:
        if not data_dir.exists():
            continue
        for filepath in sorted(data_dir.rglob("*.jsonl")):
            result = backfill_file(filepath, dry_run=args.dry_run)
            results.append(result)
            total_pairs += result["total"]
            total_added += result["added"]

    if args.json:
        print(json.dumps({"results": results, "total_pairs": total_pairs, "total_added": total_added}, indent=2))
    else:
        print(f"\nProvenance Backfill {'(dry run)' if args.dry_run else ''}")
        print(f"{'=' * 50}")
        print(f"Files processed: {len(results)}")
        print(f"Total pairs: {total_pairs}")
        print(f"Provenance added: {total_added}")
        print(f"Already had: {total_pairs - total_added}")
        print(f"{'=' * 50}")


if __name__ == "__main__":
    main()
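The `add_provenance` helper imported from `training/provenance.py` is not part of this commit. A plausible sketch, assuming it returns a copy of the pair with the four provenance fields set (the signature is inferred only from the call site above, so treat this as a hypothetical implementation, not the real one):

```python
from datetime import datetime, timezone


def add_provenance(pair: dict, session_id: str, model: str, source_type: str) -> dict:
    """Return a copy of pair with the four provenance fields attached."""
    annotated = dict(pair)  # leave the caller's dict untouched
    annotated["source_session_id"] = session_id
    annotated["model"] = model
    annotated["source_type"] = source_type
    # ISO-8601 UTC timestamp, e.g. "2026-04-18T19:59:17+00:00"
    annotated["timestamp"] = datetime.now(timezone.utc).isoformat()
    return annotated


# Example: the same call backfill_file() makes for a pair with no provenance
pair = {"prompt": "hi", "completion": "hello"}
out = add_provenance(pair, session_id="backfill", model="unknown", source_type="backfill")
print(out["source_session_id"])
```

Returning a copy rather than mutating in place would explain why the script reassigns `pairs[i]` instead of relying on side effects.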