diff --git a/playbooks/dpo-trainer.yaml b/playbooks/dpo-trainer.yaml
new file mode 100644
index 00000000..8c972d87
--- /dev/null
+++ b/playbooks/dpo-trainer.yaml
@@ -0,0 +1,54 @@
+name: dpo-trainer
+description: >
+  Executes a Direct Preference Optimization (DPO) training cycle
+  using native mlx_lm and autolora eval utilities.
+  Ensures model alignment with SOUL.md constraints.
+
+model:
+  preferred: claude-opus-4-6
+  fallback: claude-sonnet-4-20250514
+  max_turns: 20
+  temperature: 0.1
+
+tools:
+  - terminal
+  - file
+  - search_files
+
+trigger:
+  issue_label: training
+  manual: true
+
+repos:
+  - Timmy_Foundation/timmy-config
+  - Timmy_Foundation/autolora
+
+steps:
+  - read_issue
+  - clone_repo
+  - run_pre_eval
+  - execute_mlx_dpo
+  - fuse_adapters
+  - run_post_eval
+  - verify_metrics
+  - comment_on_issue
+
+output: training_report
+timeout_minutes: 120
+
+system_prompt: |
+  You are the automated training orchestrator for Timmy's brain.
+
+  YOUR ISSUE: #{{issue_number}} — {{issue_title}}
+
+  APPROACH (zero-code native):
+  1. Run baseline eval: `python autolora/eval/run_cycle.py --model {base_model} --test-set autolora/data/prompts_vibes.yaml`
+  2. Execute DPO training: `python -m mlx_lm.lora --config timmy-config/training/configs/dpo_X.yaml` against `preference_pairs.jsonl`
+  3. Fuse the weights using `mlx_lm.fuse`.
+  4. Run post-eval exactly as step 1, but against the fused model.
+  5. Use `autolora/eval/compare.py` to ensure the Faith/Crisis constraints from SOUL.md were preserved or improved.
+
+  RULES:
+  - Do not write wrapper Python or Bash scripts. Use the CLIs natively.
+  - If the post-eval degrades on 'crisis' or 'pastoral_care', REJECT the adapter and fail the issue.
+  - Always output the pre/post comparison metrics to the issue comment.
diff --git a/training/configs/dpo_14b.yaml b/training/configs/dpo_14b.yaml
new file mode 100644
index 00000000..2a1e4a44
--- /dev/null
+++ b/training/configs/dpo_14b.yaml
@@ -0,0 +1,21 @@
+# MLX DPO Training Configuration for Hermes 4 (14B Class)
+# Optimized for Apple Silicon execution (deep reasoning).
+
+model: "NousResearch/Hermes-4-14B"
+train: true
+
+# Use the curated DPO preference pairs dataset
+data: "data/"
+
+# Output adapter configuration
+adapter_path: "adapters/dpo_14b_adapter"
+save_every: 200
+
+# DPO parameters
+loss: "dpo"
+iters: 1000
+batch_size: 1
+lora_layers: 16
+learning_rate: 1e-5
+lora_parameters:
+  keys: ['q_proj', 'v_proj']
diff --git a/training/configs/dpo_32b.yaml b/training/configs/dpo_32b.yaml
new file mode 100644
index 00000000..ae6e1da7
--- /dev/null
+++ b/training/configs/dpo_32b.yaml
@@ -0,0 +1,21 @@
+# MLX DPO Training Configuration for Hermes 4 (32B Class)
+# Optimized for Apple Silicon hardware with 64GB+ unified memory.
+
+model: "NousResearch/Hermes-4-32B"
+train: true
+
+# Use the curated DPO preference pairs dataset
+data: "data/"
+
+# Output adapter configuration
+adapter_path: "adapters/dpo_32b_adapter"
+save_every: 200
+
+# DPO parameters
+loss: "dpo"
+iters: 1000
+batch_size: 1
+lora_layers: 16
+learning_rate: 5e-6
+lora_parameters:
+  keys: ['q_proj', 'v_proj']
diff --git a/training/configs/dpo_3b.yaml b/training/configs/dpo_3b.yaml
new file mode 100644
index 00000000..730479e0
--- /dev/null
+++ b/training/configs/dpo_3b.yaml
@@ -0,0 +1,21 @@
+# MLX DPO Training Configuration for Hermes 4 (3B Class)
+# Optimized for Apple Silicon execution with maximum responsiveness.
+
+model: "NousResearch/Hermes-4-3B"
+train: true
+
+# Use the curated DPO preference pairs dataset
+data: "data/"
+
+# Output adapter configuration
+adapter_path: "adapters/dpo_3b_adapter"
+save_every: 200
+
+# DPO parameters
+loss: "dpo"
+iters: 1000
+batch_size: 2
+lora_layers: 16
+learning_rate: 1e-5
+lora_parameters:
+  keys: ['q_proj', 'v_proj']
diff --git a/training/configs/dpo_8b.yaml b/training/configs/dpo_8b.yaml
new file mode 100644
index 00000000..fa97b8b0
--- /dev/null
+++ b/training/configs/dpo_8b.yaml
@@ -0,0 +1,21 @@
+# MLX DPO Training Configuration for Hermes 4 (8B Class)
+# Optimized for Apple Silicon execution (daily driver capability).
+
+model: "NousResearch/Hermes-4-8B"
+train: true
+
+# Use the curated DPO preference pairs dataset
+data: "data/"
+
+# Output adapter configuration
+adapter_path: "adapters/dpo_8b_adapter"
+save_every: 200
+
+# DPO parameters
+loss: "dpo"
+iters: 1000
+batch_size: 2
+lora_layers: 16
+learning_rate: 1e-5
+lora_parameters:
+  keys: ['q_proj', 'v_proj']
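
Taken together, the playbook's system_prompt and the 8B config imply the command sequence below. This is a minimal sketch, not part of the patch: the fused-model save path and the `compare.py` arguments are assumptions (the patch never shows that script's interface), and it presumes an mlx_lm build whose LoRA entry point understands the `loss: "dpo"` key used by these configs (stock mlx_lm releases may not).

    # 1. Baseline eval of the untouched base model (command quoted from the system_prompt)
    python autolora/eval/run_cycle.py --model NousResearch/Hermes-4-8B \
        --test-set autolora/data/prompts_vibes.yaml

    # 2. DPO training run driven entirely by the checked-in config
    python -m mlx_lm.lora --config timmy-config/training/configs/dpo_8b.yaml

    # 3. Fuse the trained adapter into a standalone model
    #    (the save path is a hypothetical choice, not defined in the patch)
    python -m mlx_lm.fuse --model NousResearch/Hermes-4-8B \
        --adapter-path adapters/dpo_8b_adapter \
        --save-path fused/hermes-4-8b-dpo

    # 4. Post-eval: identical to step 1, pointed at the fused model
    python autolora/eval/run_cycle.py --model fused/hermes-4-8b-dpo \
        --test-set autolora/data/prompts_vibes.yaml

    # 5. Compare pre/post metrics; the argument names here are assumptions
    python autolora/eval/compare.py pre_eval.json post_eval.json

Note the per-size tuning already encoded in the configs: the 3B and 8B classes fit batch_size 2 while 14B and 32B drop to 1, and the 32B run halves the learning rate to 5e-6, presumably to keep updates conservative on the largest model.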
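
Each config points `data:` at a directory rather than a file, and the playbook trains against `preference_pairs.jsonl`. The record schema is not shown anywhere in this patch; the sketch below assumes the common DPO convention of prompt/chosen/rejected fields, with hypothetical contents:

    {"prompt": "A user shares that they are in crisis. How do you respond?", "chosen": "A calm, present, pastoral answer...", "rejected": "A dismissive or purely procedural answer..."}

If mlx_lm's loader also expects train/valid splits inside `data/`, the pairs would need to be laid out accordingly; the patch leaves that layout to the runner.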