#!/usr/bin/env python3
"""
|
||
|
|
backfill_training_provenance.py — Add provenance to all training data files.
|
||
|
|
|
||
|
|
Runs the backfill function from training.provenance on all JSONL files
|
||
|
|
in training-data/ and training/data/.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python3 scripts/backfill_training_provenance.py
|
||
|
|
python3 scripts/backfill_training_provenance.py --dry-run
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
import os  # NOTE(review): appears unused in this script — confirm before removing
import sys
from datetime import datetime, timezone  # NOTE(review): appears unused — confirm before removing
from pathlib import Path
# The training/ directory (a sibling of this script's directory) holds the
# provenance module; prepend it to sys.path so the import below works when
# the script is run directly.
_training_dir = Path(__file__).parent.parent / "training"
sys.path.insert(0, str(_training_dir))

from provenance import add_provenance
|
||
|
|
|
||
|
|
# Directories scanned for *.jsonl training files (both are searched by main()).
_CONFIG_ROOT = Path.home() / "timmy-config"

DATA_DIRS = [
    _CONFIG_ROOT / "training-data",
    _CONFIG_ROOT / "training" / "data",
]
|
||
|
|
|
||
|
|
|
||
|
|
def backfill_file(filepath: Path, dry_run: bool = False) -> dict:
    """Add backfill provenance to each pair in one JSONL file.

    Pairs that already carry a non-empty ``source_session_id`` are left
    untouched; all others get placeholder provenance via ``add_provenance``.

    Bug fix vs. the previous version: lines that fail to parse as JSON were
    counted in ``parse_errors`` but silently DROPPED whenever the file was
    rewritten. They are now preserved verbatim in their original position.

    Args:
        filepath: Path to the JSONL file to process.
        dry_run: When True, report what would change but never rewrite.

    Returns:
        Summary dict with keys ``file``, ``total`` (parsed pairs only),
        ``added``, ``already_had`` and ``parse_errors``.
    """
    # Each entry is (parsed_pair_or_None, raw_line); None marks an
    # unparseable line that must be passed through unchanged on rewrite.
    entries = []
    parse_errors = 0
    with open(filepath) as f:
        for raw in f:
            raw = raw.strip()
            if not raw:
                continue
            try:
                entries.append((json.loads(raw), raw))
            except json.JSONDecodeError:
                parse_errors += 1
                entries.append((None, raw))

    added = 0
    already_had = 0

    for i, (pair, raw) in enumerate(entries):
        if pair is None:
            continue  # unparseable — passed through untouched
        # NOTE(review): assumes every parsed line is a JSON object (dict),
        # as the original code did — confirm against the data producers.
        if not pair.get("source_session_id"):
            entries[i] = (
                add_provenance(
                    pair,
                    session_id="backfill",
                    model="unknown",
                    source_type="backfill",
                ),
                raw,
            )
            added += 1
        else:
            already_had += 1

    # Only rewrite when something actually changed (and not in dry-run mode).
    if not dry_run and added > 0:
        with open(filepath, "w") as f:
            for pair, raw in entries:
                if pair is None:
                    f.write(raw + "\n")  # preserve unparseable lines verbatim
                else:
                    f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    return {
        "file": str(filepath),
        "total": len(entries) - parse_errors,
        "added": added,
        "already_had": already_had,
        "parse_errors": parse_errors,
    }
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """CLI entry point: backfill every JSONL file under DATA_DIRS and report."""
    import argparse

    parser = argparse.ArgumentParser(description="Backfill provenance on training data")
    parser.add_argument("--dry-run", action="store_true", help="Don't write changes")
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()

    # Process every *.jsonl under each existing data directory, in sorted order.
    results = [
        backfill_file(path, dry_run=args.dry_run)
        for data_dir in DATA_DIRS
        if data_dir.exists()
        for path in sorted(data_dir.rglob("*.jsonl"))
    ]
    total_pairs = sum(r["total"] for r in results)
    total_added = sum(r["added"] for r in results)

    if args.json:
        payload = {
            "results": results,
            "total_pairs": total_pairs,
            "total_added": total_added,
        }
        print(json.dumps(payload, indent=2))
    else:
        mode = '(dry run)' if args.dry_run else ''
        bar = '=' * 50
        print(f"\nProvenance Backfill {mode}")
        print(bar)
        print(f"Files processed: {len(results)}")
        print(f"Total pairs: {total_pairs}")
        print(f"Provenance added: {total_added}")
        print(f"Already had: {total_pairs - total_added}")
        print(bar)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
|
||
|
|
main()
|