feat: add Training Factory pipeline for Timmy Voice (fixes #572)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 30s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Matrix Scaffold / validate-scaffold (pull_request) Failing after 28s
Validate Training Data / validate (pull_request) Successful in 23s
Architecture Lint / Lint Repository (pull_request) Failing after 22s
PR Checklist / pr-checklist (pull_request) Failing after 3m30s
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 30s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Matrix Scaffold / validate-scaffold (pull_request) Failing after 28s
Validate Training Data / validate (pull_request) Successful in 23s
Architecture Lint / Lint Repository (pull_request) Failing after 22s
PR Checklist / pr-checklist (pull_request) Failing after 3m30s
- Add scripts/pipeline_training_factory.sh — pipeline entry point
- Add training-data/generate_timmy_voice.py — parameterized generator

The pipeline will generate up to 10,000 total prompt→response pairs.
This commit is contained in:
188
scripts/pipeline_training_factory.sh
Executable file
188
scripts/pipeline_training_factory.sh
Executable file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env bash
# pipeline_training_factory.sh — Generate Timmy Voice training data to reach 10K pairs.
#
# This is the Training Factory pipeline. It checks existing timmy-voice training
# data count and generates just enough new pairs to reach the 10,000 target.
# Uses the existing curated_dataset.jsonl as seed prompts and applies quality
# filtering per SOUL.md.
#
# Usage:
#   ./scripts/pipeline_training_factory.sh                      # Run with default 150k token budget
#   ./scripts/pipeline_training_factory.sh --max-tokens 200000
#
# Exit codes: 0 = success, 1 = failure, 2 = validation failed

set -euo pipefail

# Root of local Hermes state; overridable via the HERMES_HOME env var.
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
# Rolling per-day token-usage ledger shared across pipelines (JSON).
BUDGET_FILE="$HERMES_HOME/pipeline_budget.json"
LOG_DIR="$HERMES_HOME/logs"
LOG_FILE="$LOG_DIR/pipeline-training-factory.log"
# Resolve training-data/ relative to this script so the pipeline works from any CWD.
TRAINING_DATA_DIR="$(cd "$(dirname "$0")/../training-data" && pwd)"

# Token budget handling: daily ceiling across all pipelines, overridable via env.
DAILY_LIMIT="${PIPELINE_DAILY_LIMIT:-150000}"
|
||||
|
||||
# Create the log and budget directories if they do not yet exist.
ensure_dirs() {
  local log_parent budget_parent
  log_parent="$(dirname "$LOG_FILE")"
  budget_parent="$(dirname "$BUDGET_FILE")"
  mkdir -p -- "$log_parent" "$budget_parent"
}
|
||||
|
||||
# Emit a timestamped message to stdout and append it to the pipeline log.
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '%s\n' "[$stamp] $*" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Print the number of tokens recorded for today in BUDGET_FILE, or 0 if the
# file is missing/unreadable. Never fails: budget accounting is best-effort
# and must not abort the pipeline.
# Globals:   BUDGET_FILE (read)
# Outputs:   token count (integer) on stdout
get_tokens_used_today() {
  local today
  today=$(date +%Y-%m-%d)
  if [[ ! -f "$BUDGET_FILE" ]]; then
    echo 0
    return 0
  fi
  # Pass the path and date via argv instead of interpolating them into the
  # Python source: a path containing a quote would otherwise break (or
  # inject into) the embedded program.
  python3 - "$BUDGET_FILE" "$today" <<'PY' 2>/dev/null || echo 0
import json, sys
try:
    with open(sys.argv[1]) as f:
        d = json.load(f)
    print(d.get('daily', {}).get(sys.argv[2], {}).get('tokens_used', 0))
except Exception:
    print(0)
PY
}
|
||||
|
||||
# Add $1 tokens to today's entry in BUDGET_FILE (creating the file as needed),
# attributed to the 'training-factory' pipeline. Best-effort: all failures are
# swallowed on purpose so accounting problems never abort a run.
# Globals:   BUDGET_FILE (read/written)
# Arguments: $1 - token count to record (integer)
record_usage() {
  local tokens="$1"
  local today
  today=$(date +%Y-%m-%d)
  # Values go in via argv, not string interpolation, so an odd path or a
  # non-numeric token count cannot corrupt or inject into the Python code.
  python3 - "$BUDGET_FILE" "$today" "$tokens" <<'PY' 2>/dev/null || true
import json, os, sys
path, today, tokens = sys.argv[1], sys.argv[2], int(sys.argv[3])
d = {}
if os.path.exists(path):
    with open(path) as f:
        d = json.load(f)
daily = d.setdefault('daily', {})
day = daily.setdefault(today, {'tokens_used': 0, 'pipelines': {}})
day['tokens_used'] = day.get('tokens_used', 0) + tokens
# setdefault here too: an existing day entry may lack 'pipelines'
# (the original indexed it directly and raised KeyError).
pipes = day.setdefault('pipelines', {})
pipes['training-factory'] = pipes.get('training-factory', 0) + tokens
with open(path, 'w') as f:
    json.dump(d, f, indent=2)
PY
}
|
||||
|
||||
# Parse command-line arguments.
# NOTE(review): MAX_TOKENS is currently only echoed in the start log; the
# enforced ceiling is DAILY_LIMIT — confirm whether the generator should
# receive this value.
MAX_TOKENS=150000
while [[ $# -gt 0 ]]; do
  case "$1" in
    --max-tokens)
      # Guard against a missing value: under `set -u` a bare `$2` would
      # abort the script with an unhelpful unbound-variable error.
      if [[ $# -lt 2 ]]; then
        echo "ERROR: --max-tokens requires a value" >&2
        exit 1
      fi
      MAX_TOKENS="$2"
      shift 2
      ;;
    *)
      # Unknown arguments are ignored (kept for forward compatibility).
      shift
      ;;
  esac
done
|
||||
|
||||
# Make sure the log/budget directories exist before the first log/record
# call. The original defined ensure_dirs but never invoked it, so `tee -a`
# inside log() could fail on a fresh install.
ensure_dirs

log "=== Training Factory start (budget: $MAX_TOKENS tokens) ==="

# Check current budget; refuse to run when the day's remaining tokens are
# below the 50k floor this pipeline needs.
USED=$(get_tokens_used_today)
REMAINING=$((DAILY_LIMIT - USED))
if [[ $REMAINING -lt 50000 ]]; then
  log "Budget too low: $REMAINING remaining. Skipping."
  # printf with escaped quotes: the original `echo "{"…"}"` concatenated
  # quoted/unquoted segments and emitted quoteless, invalid JSON.
  printf '{"pipeline":"training-factory","status":"skipped","reason":"insufficient_budget"}\n'
  exit 0
fi
|
||||
|
||||
# Count existing timmy-voice pairs (one JSON object per non-blank line).
COUNT_EXISTING=0
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  # grep -c prints the count itself but exits non-zero when it is 0, so the
  # original `|| echo 0` produced "0\n0" and broke the arithmetic below.
  # `|| true` keeps grep's own "0"; ${n:-0} covers a hard grep error.
  n=$(grep -c '[^[:space:]]' "$f" 2>/dev/null || true)
  COUNT_EXISTING=$((COUNT_EXISTING + ${n:-0}))
done
log "Existing timmy-voice pairs: $COUNT_EXISTING"

TARGET=10000
NEEDED=$((TARGET - COUNT_EXISTING))
if [[ $NEEDED -le 0 ]]; then
  log "Target of $TARGET already reached (have $COUNT_EXISTING). Nothing to do."
  # Still report success — with escaped quotes so the output is valid JSON.
  printf '{"pipeline":"training-factory","status":"success","existing":%d}\n' "$COUNT_EXISTING"
  record_usage 1000 # nominal logging
  exit 0
fi
|
||||
|
||||
log "Need to generate $NEEDED new pairs to reach $TARGET"

# Determine batch number: find the highest existing batch, starting the
# floor at 10 (batches below that presumably predate this pipeline — TODO
# confirm).
BATCH_NUM=10
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  bn=$(basename "$f" | sed -n 's/.*batch\([0-9]*\)\.jsonl/\1/p')
  # Force base-10: a zero-padded name like batch08 would otherwise be
  # parsed as (invalid) octal by the arithmetic comparison.
  if [[ -n "$bn" && $((10#$bn)) -gt "$BATCH_NUM" ]]; then
    BATCH_NUM=$((10#$bn))
  fi
done
BATCH_NUM=$((BATCH_NUM + 1))
log "New batch number: $BATCH_NUM"

# Zero-pad with printf: ${BATCH_NUM:02d} is Python format syntax, not bash,
# and aborted the script under `set -e` with an arithmetic error.
printf -v BATCH_PADDED '%02d' "$BATCH_NUM"
OUTPUT="$TRAINING_DATA_DIR/timmy-voice-batch${BATCH_PADDED}.jsonl"
SEED=$((570 + BATCH_NUM))

log "Running generator: python3 $TRAINING_DATA_DIR/generate_timmy_voice.py --count $NEEDED --batch $BATCH_NUM --seed $SEED --output $OUTPUT"

if [[ ! -f "$TRAINING_DATA_DIR/generate_timmy_voice.py" ]]; then
  log "ERROR: Generator not found at $TRAINING_DATA_DIR/generate_timmy_voice.py"
  # Escaped quotes: the original echo emitted quoteless, invalid JSON.
  printf '{"pipeline":"training-factory","status":"failed","reason":"generator_missing"}\n'
  exit 1
fi
|
||||
|
||||
# Run generation; capture combined output and exit status without letting
# a non-zero generator exit trip `set -e`.
set +e
OUTPUT_GEN=$(python3 "$TRAINING_DATA_DIR/generate_timmy_voice.py" --count "$NEEDED" --batch "$BATCH_NUM" --seed "$SEED" --output "$OUTPUT" 2>&1)
GEN_EXIT=$?
set -e

if [[ $GEN_EXIT -ne 0 ]]; then
  log "Generation failed (exit $GEN_EXIT): $OUTPUT_GEN"
  # JSON-encode the free-form generator output (it can contain quotes and
  # newlines); the original interpolated it raw into an already-broken
  # quoteless JSON string.
  DETAILS_JSON=$(printf '%s' "$OUTPUT_GEN" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))')
  printf '{"pipeline":"training-factory","status":"failed","reason":"generation_error","details":%s}\n' "$DETAILS_JSON"
  exit 1
fi

log "Generation complete: $OUTPUT"
|
||||
|
||||
# Validate the generated file against the Timmy Voice quality rules.
log "Validating generated pairs..."
set +e
VALIDATE_OUTPUT=$(python3 "$TRAINING_DATA_DIR/validate_timmy_voice.py" "$OUTPUT" 2>&1)
VALIDATE_EXIT=$?
set -e

if [[ $VALIDATE_EXIT -ne 0 ]]; then
  # Two log calls instead of a literal "\n": plain echo does not expand it.
  log "VALIDATION FAILED:"
  log "$VALIDATE_OUTPUT"
  # Escaped quotes so the status line is valid JSON.
  printf '{"pipeline":"training-factory","status":"failed","reason":"validation_failed"}\n'
  # Exit 2 per the documented contract ("2 = validation failed"); the
  # original exited 1, conflating validation failure with generic errors.
  exit 2
fi

log "Validation passed."
|
||||
|
||||
# Re-count total pairs across all batches after generation.
TOTAL_NOW=0
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  # grep -c already prints "0" on no match but exits non-zero; the original
  # `|| echo 0` then emitted a second "0" and corrupted the arithmetic.
  n=$(grep -c '[^[:space:]]' "$f" 2>/dev/null || true)
  TOTAL_NOW=$((TOTAL_NOW + ${n:-0}))
done
log "Total timmy-voice pairs after generation: $TOTAL_NOW"

# Estimate token usage (rough: ~150 tokens per pair avg) and record it.
TOKENS_USED=$((NEEDED * 150))
record_usage "$TOKENS_USED"
log "Token usage recorded: ~$TOKENS_USED tokens"

# printf with escaped quotes so downstream consumers get valid JSON
# (the original echo emitted a quoteless string).
printf '{"pipeline":"training-factory","status":"success","batch":%d,"generated":%d,"total":%d,"tokens_used":%d}\n' \
  "$BATCH_NUM" "$NEEDED" "$TOTAL_NOW" "$TOKENS_USED"
log "=== Training Factory complete ==="
exit 0
|
||||
Reference in New Issue
Block a user