86 lines
2.5 KiB
Python
86 lines
2.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Build local-only DPO pairs from archive batch artifacts."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from datetime import datetime, timezone
|
|
|
|
from .common import (
|
|
CANDIDATES_DIR,
|
|
TRAINING_DPO_DIR,
|
|
ensure_layout,
|
|
gather_evidence_ids,
|
|
load_json,
|
|
write_json,
|
|
append_jsonl,
|
|
write_progress_snapshot,
|
|
)
|
|
|
|
# JSON state file read and rewritten by main(); its "processed_batches" list
# holds the ids of batches that have already been turned into pairs.
STATE_FILE = TRAINING_DPO_DIR.joinpath("processed_batches.json")
|
|
|
|
|
|
def _clean_text(value: object) -> str:
    """Return *value* stripped of surrounding whitespace, or ``""`` if it is not a string.

    Batch artifacts are JSON, so a field can be ``null`` or a non-string
    value even when the key is present; those are treated like a missing
    field instead of raising ``AttributeError`` on ``.strip()``.
    """
    return value.strip() if isinstance(value, str) else ""


def main() -> None:
    """Turn unprocessed candidate batches into DPO pairs and record progress.

    Steps, in order:

    1. Call ``ensure_layout()`` and load the processed-batch state from
       ``STATE_FILE``.
    2. Walk ``CANDIDATES_DIR/batch_*.json`` in sorted order. A batch is
       skipped when its id was already processed (in an earlier run or
       earlier in this run), when its payload is not a JSON object, or when
       any of ``prompt`` / ``chosen`` / ``rejected`` is missing, empty, or
       not a string. Skipped-as-incomplete batches are NOT marked processed,
       so they are reconsidered on the next run.
    3. Append the new pairs to ``pairs_<YYYYMMDD>.jsonl`` (UTC date) under
       ``TRAINING_DPO_DIR``, persist the updated state, and print a one-line
       JSON summary to stdout.
    """
    ensure_layout()
    state = load_json(STATE_FILE, {"processed_batches": []})
    processed = set(state.get("processed_batches", []))

    # The file name uses the UTC date so it does not depend on the local zone.
    date_key = datetime.now(timezone.utc).strftime("%Y%m%d")
    output_file = TRAINING_DPO_DIR / f"pairs_{date_key}.jsonl"

    new_pairs = []
    for path in sorted(CANDIDATES_DIR.glob("batch_*.json")):
        batch = load_json(path, {})
        if not isinstance(batch, dict):
            # A payload that is not a JSON object cannot carry the pair
            # fields; skip it rather than abort the whole run.
            continue

        # ``dict.get`` only applies its default when the key is absent, so an
        # explicit ``null``/empty id must fall back to the file stem as well;
        # a ``None`` id would otherwise break the sort of the state list.
        batch_id = batch.get("batch_id") or path.stem
        if batch_id in processed:
            continue

        prompt = _clean_text(batch.get("prompt"))
        chosen = _clean_text(batch.get("chosen"))
        rejected = _clean_text(batch.get("rejected"))
        if not (prompt and chosen and rejected):
            # Incomplete batch: leave it unmarked so a later run can retry.
            continue

        evidence_ids = gather_evidence_ids(batch)
        safety_flags = ["archive-private", "evidence-required"]
        if not evidence_ids:
            safety_flags.append("missing-evidence")

        new_pairs.append(
            {
                "prompt": prompt,
                "chosen": chosen,
                "rejected": rejected,
                "evidence_ids": evidence_ids,
                "source_session": {
                    "draft": batch.get("draft_session_id"),
                    "critique": batch.get("critique_session_id"),
                },
                "task_type": "analysis",
                "rubric_scores": batch.get("rubric_scores", {}),
                "batch_id": batch_id,
                "safety_flags": safety_flags,
            }
        )
        # Record the id immediately so a second candidate file carrying the
        # same batch id in this run does not emit a duplicate pair.
        processed.add(batch_id)

    append_jsonl(output_file, new_pairs)
    state["processed_batches"] = sorted(processed)
    write_json(STATE_FILE, state)
    snapshot = write_progress_snapshot()

    summary = {
        "status": "ok",
        "pairs_written": len(new_pairs),
        "output_file": output_file.name,
        "processed_batches": len(state["processed_batches"]),
        "progress": snapshot,
    }
    print(json.dumps(summary, sort_keys=True))
|
|
|
|
|
|
# Script entry point: build pairs only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()
|