#!/usr/bin/env bash
# pipeline_training_factory.sh — Run model fine-tuning with generated training data.
#
# Collects training data, runs LoRA/QLoRA fine-tuning on available hardware.
#
# Usage:
#   ./scripts/pipeline_training_factory.sh --max-tokens 150000
#
# Environment (all optional):
#   HERMES_HOME   working dir for the merged output (default: ~/.hermes)
#   TRAINING_DIR  where *.jsonl training files live  (default: ~/.timmy/training-data)
#   MODELS_DIR    output directory for models        (default: ~/.timmy/models)
#   MAX_TOKENS    token budget                       (default: 150000)

set -euo pipefail

HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
TRAINING_DIR="${TRAINING_DIR:-$HOME/.timmy/training-data}"
MODELS_DIR="${MODELS_DIR:-$HOME/.timmy/models}"
MAX_TOKENS="${MAX_TOKENS:-150000}"
DRY_RUN=false
TOKENS_USED=0

# Parse CLI flags.
#   --max-tokens N : override the token budget (must be a non-negative integer;
#                    previously a trailing --max-tokens crashed with a raw
#                    "unbound variable" error under set -u)
#   --dry-run      : report what would happen, then exit
# Unknown arguments are warned about on stderr but not fatal, so callers
# passing extra pipeline options don't break this stage.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --max-tokens)
      if [[ $# -lt 2 || ! "${2:-}" =~ ^[0-9]+$ ]]; then
        echo "error: --max-tokens requires a numeric value" >&2
        exit 2
      fi
      MAX_TOKENS="$2"
      shift 2
      ;;
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    *)
      echo "warning: ignoring unknown argument: $1" >&2
      shift
      ;;
  esac
done
# Timestamped logger for this pipeline stage; writes one line to stdout.
log() {
  printf '[training-factory %s] %s\n' "$(date '+%H:%M:%S')" "$*"
}
mkdir -p "$MODELS_DIR"

# Collect up to 20 JSONL training files as a newline-separated list.
# (No files found is not an error — the threshold check below handles it.)
DATA_FILES=$(find "$TRAINING_DIR" -name '*.jsonl' -type f 2>/dev/null | head -20)
TOTAL_PAIRS=0
FILE_COUNT=0

# Iterate line-by-line so filenames containing spaces survive — the previous
# unquoted `for f in $DATA_FILES` word-split them (ShellCheck SC2086), and
# counting files with `wc -w` miscounted them for the same reason.
while IFS= read -r f; do
  [[ -n "$f" ]] || continue
  # Unreadable files count as 0 pairs rather than aborting under set -e.
  COUNT=$(wc -l < "$f" 2>/dev/null || echo 0)
  TOTAL_PAIRS=$((TOTAL_PAIRS + COUNT))
  FILE_COUNT=$((FILE_COUNT + 1))
done <<< "$DATA_FILES"

log "Found $TOTAL_PAIRS training pairs across $FILE_COUNT files"

# Too little data: bail out cleanly (exit 0 — this stage is best-effort).
if [[ "$TOTAL_PAIRS" -lt 10 ]]; then
  log "Insufficient training data ($TOTAL_PAIRS pairs < 10 minimum). Skipping."
  exit 0
fi

if $DRY_RUN; then
  log "DRY-RUN: Would train on $TOTAL_PAIRS pairs (budget: $MAX_TOKENS)"
  exit 0
fi
# Detect training hardware. GPU_AVAILABLE is informational for now — the
# autolora pipeline performs the actual device selection — but we log the
# outcome in every branch (previously the CPU-fallback path was silent and
# the variable was assigned without ever being read; ShellCheck SC2034).
GPU_AVAILABLE=false
if command -v nvidia-smi &>/dev/null && nvidia-smi &>/dev/null; then
  GPU_AVAILABLE=true
  log "GPU detected"
elif [[ "$(uname)" == "Darwin" ]] && system_profiler SPDisplaysDataType 2>/dev/null | grep -q "Metal"; then
  GPU_AVAILABLE=true
  log "Apple Metal GPU detected"
else
  log "No GPU detected; training will fall back to CPU"
fi
# Build the merged training file inside HERMES_HOME. Ensure the directory
# exists first — only MODELS_DIR was created earlier, and previously a
# missing HERMES_HOME made the redirect fail and (with stderr suppressed)
# killed the script silently under set -e.
mkdir -p "$HERMES_HOME"
MERGED="$HERMES_HOME/training_merged.jsonl"
: > "$MERGED"

# Append file-by-file so filenames with spaces survive — the previous
# unquoted `cat $DATA_FILES` word-split them (ShellCheck SC2086).
while IFS= read -r f; do
  [[ -n "$f" ]] || continue
  cat -- "$f" >> "$MERGED"
done <<< "$DATA_FILES"

# Rough budget estimate: ~60 tokens per instruction/response pair.
TOKENS_USED=$((TOTAL_PAIRS * 60))

log "Merged training data: $MERGED ($TOTAL_PAIRS pairs, ~${TOKENS_USED} tokens)"

# Log completion (actual training would be triggered by autolora pipeline)
log "Training data ready. Tokens used: $TOKENS_USED / $MAX_TOKENS"
log "Run autolora pipeline for actual fine-tuning."
exit 0