Compare commits
2 Commits
timmy/orch
...
feature/dp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
64e0565563 | ||
|
|
3a5a0ad0b9 |
54
playbooks/dpo-trainer.yaml
Normal file
54
playbooks/dpo-trainer.yaml
Normal file
@@ -0,0 +1,54 @@
---
# Playbook: automated DPO (Direct Preference Optimization) training cycle.
# Reconstructed from a diff-page paste — the original indentation was lost,
# so nesting below follows common playbook conventions; verify against the
# orchestrator's schema.
name: dpo-trainer
description: >
  Executes a Direct Preference Optimization (DPO) training cycle
  using native mlx_lm and autolora eval utilities.
  Ensures model alignment with SOUL.md constraints.

model:
  preferred: claude-opus-4-6
  fallback: claude-sonnet-4-20250514

# NOTE(review): assumed to be top-level run limits rather than keys of
# `model:` — the paste destroyed indentation; confirm with the consumer.
max_turns: 20
temperature: 0.1

# Capabilities granted to the orchestrating agent.
tools:
  - terminal
  - file
  - search_files

# Playbook fires on issues labelled `training`, or when started manually.
trigger:
  issue_label: training
  manual: true

repos:
  - Timmy_Foundation/timmy-config
  - Timmy_Foundation/autolora

# Pipeline stages, in execution order.
steps:
  - read_issue
  - clone_repo
  - run_pre_eval
  - execute_mlx_dpo
  - fuse_adapters
  - run_post_eval
  - verify_metrics
  - comment_on_issue

output: training_report
timeout_minutes: 120

# Literal block scalar: newlines preserved, `#` and `{{ }}` are plain text here.
system_prompt: |
  You are the automated training orchestrator for Timmy's brain.

  YOUR ISSUE: #{{issue_number}} — {{issue_title}}

  APPROACH (zero-code native):
  1. Run baseline eval: `python autolora/eval/run_cycle.py --model {base_model} --test-set autolora/data/prompts_vibes.yaml`
  2. Execute DPO training: `python -m mlx_lm.lora --config timmy-config/training/configs/dpo_X.yaml` against `preference_pairs.jsonl`
  3. Fuse the weights using `mlx_lm.fuse`.
  4. Run post-eval exactly as step 1 but against the fused model.
  5. Use `autolora/eval/compare.py` to ensure Faith/Crisis constraints from SOUL.md were preserved or improved.

  RULES:
  - Do not write wrapper Python or Bash scripts. Use the CLIs natively.
  - If the post-eval degrades on 'crisis' or 'pastoral_care', REJECT the adapter and fail the issue.
  - Always output the pre/post comparison metrics to the issue comment.
21
training/configs/dpo_14b.yaml
Normal file
21
training/configs/dpo_14b.yaml
Normal file
@@ -0,0 +1,21 @@
---
# MLX DPO Training Configuration for Hermes 4 (14B Class)
# Optimized for Apple Silicon execution (deep reasoning).

model: "NousResearch/Hermes-4-14B"
train: true

# Use the curated DPO preference pairs dataset
data: "data/"

# Output adapter configuration
adapter_path: "adapters/dpo_14b_adapter"
save_every: 200

# DPO parameters
loss: "dpo"
iters: 1000
batch_size: 1
lora_layers: 16
# Written as 1.0e-5 (not 1e-5): YAML 1.1 loaders such as PyYAML resolve
# exponent-only literals without a decimal point as the STRING "1e-5",
# which would break numeric config handling.
learning_rate: 1.0e-5
lora_parameters:
  keys: ['q_proj', 'v_proj']
21
training/configs/dpo_32b.yaml
Normal file
21
training/configs/dpo_32b.yaml
Normal file
@@ -0,0 +1,21 @@
---
# MLX DPO Training Configuration for Hermes 4 (32B Class)
# Optimized for 64GB+ Apple Silicon hardware limit.

model: "NousResearch/Hermes-4-32B"
train: true

# Use the curated DPO preference pairs dataset
data: "data/"

# Output adapter configuration
adapter_path: "adapters/dpo_32b_adapter"
save_every: 200

# DPO parameters
loss: "dpo"
iters: 1000
batch_size: 1
lora_layers: 16
# Written as 5.0e-6 (not 5e-6): YAML 1.1 loaders such as PyYAML resolve
# exponent-only literals without a decimal point as the STRING "5e-6",
# which would break numeric config handling.
learning_rate: 5.0e-6
lora_parameters:
  keys: ['q_proj', 'v_proj']
21
training/configs/dpo_3b.yaml
Normal file
21
training/configs/dpo_3b.yaml
Normal file
@@ -0,0 +1,21 @@
---
# MLX DPO Training Configuration for Hermes 4 (3B Class)
# Optimized for Apple Silicon execution with max reactivity.

model: "NousResearch/Hermes-4-3B"
train: true

# Use the curated DPO preference pairs dataset
data: "data/"

# Output adapter configuration
adapter_path: "adapters/dpo_3b_adapter"
save_every: 200

# DPO parameters
loss: "dpo"
iters: 1000
batch_size: 2
lora_layers: 16
# Written as 1.0e-5 (not 1e-5): YAML 1.1 loaders such as PyYAML resolve
# exponent-only literals without a decimal point as the STRING "1e-5",
# which would break numeric config handling.
learning_rate: 1.0e-5
lora_parameters:
  keys: ['q_proj', 'v_proj']
21
training/configs/dpo_8b.yaml
Normal file
21
training/configs/dpo_8b.yaml
Normal file
@@ -0,0 +1,21 @@
---
# MLX DPO Training Configuration for Hermes 4 (8B Class)
# Optimized for Apple Silicon execution (daily driver capability).
# NOTE(review): the model below is a Hermes-3 4-bit build, not Hermes 4 as
# the header claims — confirm which is intended.

model: "mlx-community/Hermes-3-Llama-3.1-8B-4bit"
train: true

# Use the curated DPO preference pairs dataset
# NOTE(review): this config roots paths under `autolora/` and uses a
# kebab-case adapter name, unlike the sibling dpo_3b/14b/32b configs
# ("data/", snake_case) — confirm the divergence is deliberate.
data: "autolora/data/dpo/"

# Output adapter configuration
adapter_path: "autolora/adapters/dpo-8b-adapter"
save_every: 200

# DPO parameters
loss: "dpo"
iters: 1000
batch_size: 2
lora_layers: 16
# Written as 1.0e-5 (not 1e-5): YAML 1.1 loaders such as PyYAML resolve
# exponent-only literals without a decimal point as the STRING "1e-5",
# which would break numeric config handling.
learning_rate: 1.0e-5
lora_parameters:
  keys: ['q_proj', 'v_proj']
Reference in New Issue
Block a user