Files
timmy-home/scripts/twitter_archive/build_dpo_pairs.py
2026-03-27 18:09:28 -04:00

86 lines
2.5 KiB
Python

#!/usr/bin/env python3
"""Build local-only DPO pairs from archive batch artifacts."""
from __future__ import annotations
import json
from datetime import datetime, timezone
from .common import (
CANDIDATES_DIR,
TRAINING_DPO_DIR,
ensure_layout,
gather_evidence_ids,
load_json,
write_json,
append_jsonl,
write_progress_snapshot,
)
# JSON state file listing the batch ids that main() has already converted
# into pairs; it is read at the start of a run and rewritten at the end.
STATE_FILE = TRAINING_DPO_DIR / "processed_batches.json"
def _clean_text(value: object) -> str:
    """Return ``value`` stripped of surrounding whitespace, or ``""`` if it is not a string.

    Batch files are JSON, so a key can be present with ``null`` or a
    non-string value. Mapping those to ``""`` lets the caller skip the batch
    as incomplete instead of crashing on ``.strip()``.
    """
    return value.strip() if isinstance(value, str) else ""


def main() -> None:
    """Build DPO pairs from unprocessed candidate batches and record progress.

    Reads every ``batch_*.json`` under ``CANDIDATES_DIR`` in sorted path
    order. A batch is skipped when its id is already listed in ``STATE_FILE``
    (or was already emitted earlier in this run), when the file does not hold
    a JSON object, or when any of ``prompt``, ``chosen`` or ``rejected`` is
    missing, non-string or blank. Each remaining batch yields one pair, and
    all new pairs are appended to ``pairs_<UTC yyyymmdd>.jsonl`` under
    ``TRAINING_DPO_DIR``.

    Afterwards the processed-id list is written back to ``STATE_FILE`` and a
    single-line JSON status summary is printed to stdout.

    Batches skipped as incomplete are *not* marked as processed, so they are
    examined again on the next run.
    """
    ensure_layout()

    state = load_json(STATE_FILE, {"processed_batches": []})
    processed = set(state.get("processed_batches", []))

    # The output file is keyed by the current UTC date, so all runs on the
    # same UTC day append to the same file.
    date_key = datetime.now(timezone.utc).strftime("%Y%m%d")
    output_file = TRAINING_DPO_DIR / f"pairs_{date_key}.jsonl"

    new_pairs = []
    for path in sorted(CANDIDATES_DIR.glob("batch_*.json")):
        batch = load_json(path, {})
        if not isinstance(batch, dict):
            # Not a JSON object: there is nothing to extract, so treat it
            # like any other incomplete batch.
            continue

        # Fall back to the file stem when the id is absent, null or empty so
        # the processed list never contains None (which would break sorting).
        batch_id = batch.get("batch_id") or path.stem
        if batch_id in processed:
            continue

        prompt = _clean_text(batch.get("prompt"))
        chosen = _clean_text(batch.get("chosen"))
        rejected = _clean_text(batch.get("rejected"))
        if not (prompt and chosen and rejected):
            continue

        evidence_ids = gather_evidence_ids(batch)
        # Every pair carries the two base flags; pairs without any evidence
        # ids are additionally tagged so they can be told apart later.
        safety_flags = ["archive-private", "evidence-required"]
        if not evidence_ids:
            safety_flags.append("missing-evidence")

        new_pairs.append(
            {
                "prompt": prompt,
                "chosen": chosen,
                "rejected": rejected,
                "evidence_ids": evidence_ids,
                "source_session": {
                    "draft": batch.get("draft_session_id"),
                    "critique": batch.get("critique_session_id"),
                },
                "task_type": "analysis",
                "rubric_scores": batch.get("rubric_scores", {}),
                "batch_id": batch_id,
                "safety_flags": safety_flags,
            }
        )
        # Record the id immediately so a second file with the same batch_id
        # in this run does not produce a duplicate pair.
        processed.add(batch_id)

    append_jsonl(output_file, new_pairs)
    state["processed_batches"] = sorted(processed)
    write_json(STATE_FILE, state)
    snapshot = write_progress_snapshot()

    print(
        json.dumps(
            {
                "status": "ok",
                "pairs_written": len(new_pairs),
                "output_file": output_file.name,
                "processed_batches": len(state["processed_batches"]),
                "progress": snapshot,
            },
            sort_keys=True,
        )
    )
# Run the builder only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()