#!/usr/bin/env python3
"""Evaluate whether the latest archive candidate is promotable."""

from __future__ import annotations

import argparse
import json
from pathlib import Path

from .common import TRAINING_EVALS_DIR

# Promotion thresholds applied by the gate check.
MIN_RELATIVE_IMPROVEMENT = 0.05  # candidate must beat baseline by >= 5% (relative)
MIN_EVIDENCE_CITATION_RATE = 0.95  # minimum accepted "evidence_citation_rate"


def load_eval(path: Path) -> dict:
    """Parse the eval JSON file at *path* and return the decoded value.

    The file is read as bytes so that ``json.loads`` performs its own
    UTF-8/16/32 detection instead of relying on the locale's default text
    encoding.

    Raises:
        OSError: if the file cannot be read (e.g. it does not exist).
        ValueError: if the content is not valid JSON
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
            subclasses of ``ValueError``).
    """
    return json.loads(path.read_bytes())


def _float_field(payload: dict, key: str) -> float:
    """Return ``payload[key]`` as a float; a missing or ``null`` value is 0.0."""
    value = payload.get(key)
    return 0.0 if value is None else float(value)


def _regression_flag(payload: dict, key: str) -> bool:
    """Return the regression flag stored under *key*.

    A missing flag counts as a regression (fail-closed). An explicit JSON
    ``null`` is treated the same way, so that an unset flag can never open
    the gate.
    """
    value = payload.get(key)
    return True if value is None else bool(value)


def _failure(reason: str, eval_path: Path) -> dict:
    """Build a failing report for an eval file that could not be evaluated."""
    return {"pass": False, "reason": reason, "eval_file": eval_path.name}


def _evaluate(payload: dict, eval_name: str) -> dict:
    """Apply the promotion gates to *payload* and return the report dict.

    Raises:
        TypeError, ValueError: if a numeric field cannot be converted to float.
    """
    baseline = _float_field(payload, "baseline_composite")
    candidate = _float_field(payload, "candidate_composite")
    # A non-positive baseline gives no meaningful ratio; treat it as no gain.
    improvement = (candidate - baseline) / baseline if baseline > 0 else 0.0

    refusal_regression = _regression_flag(
        payload, "refusal_over_fabrication_regression"
    )
    source_regression = _regression_flag(payload, "source_distinction_regression")
    evidence_rate = _float_field(payload, "evidence_citation_rate")

    gate_pass = (
        improvement >= MIN_RELATIVE_IMPROVEMENT
        and not refusal_regression
        and not source_regression
        and evidence_rate >= MIN_EVIDENCE_CITATION_RATE
    )

    return {
        "pass": gate_pass,
        "eval_file": eval_name,
        "candidate_id": payload.get("candidate_id"),
        "baseline_composite": baseline,
        "candidate_composite": candidate,
        "improvement_pct": round(improvement * 100, 2),
        "evidence_citation_rate": evidence_rate,
        "refusal_over_fabrication_regression": refusal_regression,
        "source_distinction_regression": source_regression,
        "rollback_model": payload.get("rollback_model"),
    }


def main() -> None:
    """Run the promotion gate check and print a JSON report to stdout.

    The eval file is ``--eval-file`` when given; otherwise the last
    ``run_*.json`` under ``TRAINING_EVALS_DIR`` in sorted filename order.
    Every handled outcome, including a failed gate or an unusable eval file,
    is reported through the ``"pass"`` key of the printed JSON; the function
    itself always returns normally.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate archive candidate promotion gates"
    )
    parser.add_argument(
        "--eval-file", default="", help="Specific eval JSON file to inspect"
    )
    args = parser.parse_args()

    if args.eval_file:
        eval_path = Path(args.eval_file).expanduser()
    else:
        # NOTE(review): "latest" is decided by lexicographic filename order;
        # this presumes run names sort chronologically -- confirm.
        eval_files = sorted(TRAINING_EVALS_DIR.glob("run_*.json"))
        if not eval_files:
            print(json.dumps({"pass": False, "reason": "missing eval file"}))
            return
        eval_path = eval_files[-1]

    try:
        payload = load_eval(eval_path)
    except FileNotFoundError:
        report = _failure("missing eval file", eval_path)
    except (OSError, ValueError) as exc:
        report = _failure(f"unreadable eval file: {exc}", eval_path)
    else:
        if not isinstance(payload, dict):
            report = _failure("eval file is not a JSON object", eval_path)
        else:
            try:
                report = _evaluate(payload, eval_path.name)
            except (TypeError, ValueError) as exc:
                report = _failure(f"invalid eval field: {exc}", eval_path)

    print(json.dumps(report, sort_keys=True))


if __name__ == "__main__":
    main()