Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 30s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Matrix Scaffold / validate-scaffold (pull_request) Failing after 28s
Validate Training Data / validate (pull_request) Successful in 23s
Architecture Lint / Lint Repository (pull_request) Failing after 22s
PR Checklist / pr-checklist (pull_request) Failing after 3m30s
- Add scripts/pipeline_training_factory.sh — pipeline entry point
- Add training-data/generate_timmy_voice.py — parameterized generator

The pipeline will generate up to 10K total prompt→response pairs.
189 lines
5.6 KiB
Bash
Executable File
189 lines
5.6 KiB
Bash
Executable File
#!/usr/bin/env bash
# pipeline_training_factory.sh — Generate Timmy Voice training data to reach 10K pairs.
#
# This is the Training Factory pipeline. It checks existing timmy-voice training
# data count and generates just enough new pairs to reach the 10,000 target.
# Uses the existing curated_dataset.jsonl as seed prompts and applies quality
# filtering per SOUL.md.
#
# Usage:
#   ./scripts/pipeline_training_factory.sh                     # default 150k token budget
#   ./scripts/pipeline_training_factory.sh --max-tokens 200000
#
# Exit codes: 0 = success, 1 = failure, 2 = validation failed

set -euo pipefail

# All pipeline state (budget ledger, logs) lives under $HERMES_HOME (env-overridable).
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
BUDGET_FILE="$HERMES_HOME/pipeline_budget.json"
LOG_DIR="$HERMES_HOME/logs"
LOG_FILE="$LOG_DIR/pipeline-training-factory.log"

# Resolve training-data/ relative to this script. Fail with an explicit message:
# under plain `set -e` a failed $(cd ...) would abort the script silently.
# ${BASH_SOURCE[0]} is preferred over $0 so the path survives `bash script.sh`
# and sourcing alike.
TRAINING_DATA_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/../training-data" && pwd)" || {
  echo "ERROR: training-data directory not found relative to this script" >&2
  exit 1
}

# Token budget handling: shared daily limit across pipelines (env-overridable).
DAILY_LIMIT="${PIPELINE_DAILY_LIMIT:-150000}"
ensure_dirs() {
  # Create the parent directories for the log file and budget ledger
  # so later appends/writes cannot fail on a missing path.
  local log_parent budget_parent
  log_parent="$(dirname "$LOG_FILE")"
  budget_parent="$(dirname "$BUDGET_FILE")"
  mkdir -p "$log_parent" "$budget_parent"
}
log() {
  # Emit a timestamped message to stdout and append it to $LOG_FILE.
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG_FILE"
}
get_tokens_used_today() {
  # Print the number of tokens recorded for today in $BUDGET_FILE.
  # Prints 0 when the file is missing, unreadable, malformed, or has no
  # entry for today; always succeeds so callers can use $( ) safely.
  local today
  today=$(date +%Y-%m-%d)
  if [[ ! -f "$BUDGET_FILE" ]]; then
    echo 0
    return 0
  fi
  # Pass the path and date via argv instead of interpolating them into the
  # Python source: a quote or backslash in $BUDGET_FILE would otherwise
  # corrupt (or inject into) the program text.
  python3 - "$BUDGET_FILE" "$today" 2>/dev/null <<'PY' || echo 0
import json, sys
try:
    with open(sys.argv[1]) as f:
        d = json.load(f)
    print(d.get('daily', {}).get(sys.argv[2], {}).get('tokens_used', 0))
except Exception:
    print(0)
PY
}
record_usage() {
  # Add $1 tokens to today's totals in $BUDGET_FILE — both the day-wide
  # counter and the 'training-factory' per-pipeline counter.
  # Best-effort by design: failures are swallowed so a bookkeeping problem
  # never aborts the pipeline itself.
  local tokens="$1"
  local today
  today=$(date +%Y-%m-%d)
  # argv-passing avoids interpolating $BUDGET_FILE/$tokens into Python
  # source (quote breakage / code injection); int() rejects non-numeric
  # input instead of evaluating it as an expression.
  python3 - "$BUDGET_FILE" "$today" "$tokens" 2>/dev/null <<'PY' || true
import json, os, sys
path, day, tokens = sys.argv[1], sys.argv[2], int(sys.argv[3])
d = {}
if os.path.exists(path):
    with open(path) as f:
        d = json.load(f)
daily = d.setdefault('daily', {})
rec = daily.setdefault(day, {'tokens_used': 0, 'pipelines': {}})
rec['tokens_used'] = rec.get('tokens_used', 0) + tokens
# setdefault here too: an existing day record written by another pipeline
# might lack 'pipelines'; the old direct index would raise KeyError.
pipes = rec.setdefault('pipelines', {})
pipes['training-factory'] = pipes.get('training-factory', 0) + tokens
with open(path, 'w') as f:
    json.dump(d, f, indent=2)
PY
}
# ---------------------------------------------------------------------------
# Argument parsing
#   --max-tokens N   override the per-run token budget (default 150000)
# Unknown arguments are reported to stderr and ignored.
# ---------------------------------------------------------------------------
MAX_TOKENS=150000
while [[ $# -gt 0 ]]; do
  case "$1" in
    --max-tokens)
      # Validate up front: without this, a missing value dies with set -u's
      # opaque "unbound variable", and a non-numeric value breaks arithmetic later.
      if [[ $# -lt 2 || ! "$2" =~ ^[0-9]+$ ]]; then
        echo "ERROR: --max-tokens requires a numeric argument" >&2
        exit 1
      fi
      MAX_TOKENS="$2"
      shift 2
      ;;
    *)
      # Previously swallowed silently; a warning makes typos visible.
      echo "WARNING: ignoring unknown argument: $1" >&2
      shift
      ;;
  esac
done
log "=== Training Factory start (budget: $MAX_TOKENS tokens) ==="

# ---------------------------------------------------------------------------
# Budget gate: skip (exit 0, status "skipped") when too little of today's
# shared token budget remains to be worth starting a generation run.
# ---------------------------------------------------------------------------
USED=$(get_tokens_used_today)
REMAINING=$((DAILY_LIMIT - USED))
MIN_BUDGET=50000  # below this a run is not worth starting
if [[ $REMAINING -lt $MIN_BUDGET ]]; then
  log "Budget too low: $REMAINING remaining. Skipping."
  # Single-quoted so the inner double quotes reach stdout: the previous
  # unescaped echo stripped them and emitted invalid JSON.
  echo '{"pipeline":"training-factory","status":"skipped","reason":"insufficient_budget"}'
  exit 0
fi
# ---------------------------------------------------------------------------
# Count existing timmy-voice pairs: one pair per non-blank line of each
# timmy-voice-batch*.jsonl file.
# ---------------------------------------------------------------------------
COUNT_EXISTING=0
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  # grep -c prints the count even when it is 0 but then exits non-zero, so
  # the old `|| echo 0` appended a second line ("0<newline>0") and broke the
  # arithmetic below. Capture whatever grep printed; default only when empty
  # (grep error, e.g. unreadable file).
  n=$(grep -c '[^[:space:]]' "$f" 2>/dev/null || true)
  COUNT_EXISTING=$((COUNT_EXISTING + ${n:-0}))
done
log "Existing timmy-voice pairs: $COUNT_EXISTING"
# ---------------------------------------------------------------------------
# Early exit when the 10K target is already met.
# ---------------------------------------------------------------------------
TARGET=10000
NEEDED=$((TARGET - COUNT_EXISTING))
if [[ $NEEDED -le 0 ]]; then
  log "Target of $TARGET already reached (have $COUNT_EXISTING). Nothing to do."
  # printf keeps the JSON field quotes intact while interpolating the count;
  # the previous unescaped echo emitted invalid JSON.
  printf '{"pipeline":"training-factory","status":"success","existing":%d}\n' "$COUNT_EXISTING"
  record_usage 1000  # nominal charge for the check/logging itself
  exit 0
fi
log "Need to generate $NEEDED new pairs to reach $TARGET"

# ---------------------------------------------------------------------------
# Pick the next batch number: one past the highest existing batch, with a
# floor of 10 (presumably batches <= 10 are pre-existing curated data —
# TODO confirm against the repository's batch naming history).
# ---------------------------------------------------------------------------
BATCH_NUM=10
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  bn=$(basename "$f" | sed -n 's/.*batch\([0-9]*\)\.jsonl/\1/p')
  # Force base 10: a zero-padded name like batch08 would otherwise be read
  # as invalid octal and abort the comparison under set -e.
  if [[ -n "$bn" && $((10#$bn)) -gt $BATCH_NUM ]]; then
    BATCH_NUM=$((10#$bn))
  fi
done
BATCH_NUM=$((BATCH_NUM + 1))
log "New batch number: $BATCH_NUM"

# Zero-pad the batch number for the filename. The previous
# ${BATCH_NUM:02d} was Python format syntax, not bash — bash parses it as
# substring expansion with an invalid offset and aborts under set -e.
printf -v BATCH_PADDED '%02d' "$BATCH_NUM"
OUTPUT="$TRAINING_DATA_DIR/timmy-voice-batch${BATCH_PADDED}.jsonl"
SEED=$((570 + BATCH_NUM))  # deterministic per-batch RNG seed
GENERATOR="$TRAINING_DATA_DIR/generate_timmy_voice.py"
log "Running generator: python3 $GENERATOR --count $NEEDED --batch $BATCH_NUM --seed $SEED --output $OUTPUT"

if [[ ! -f "$GENERATOR" ]]; then
  log "ERROR: Generator not found at $GENERATOR"
  # Single-quoted so the JSON field quotes survive; the old unescaped echo
  # emitted invalid JSON.
  echo '{"pipeline":"training-factory","status":"failed","reason":"generator_missing"}'
  exit 1
fi
# ---------------------------------------------------------------------------
# Run generation. set +e around the call so the exit code can be captured
# and reported as structured JSON instead of set -e killing the script.
# ---------------------------------------------------------------------------
set +e
OUTPUT_GEN=$(python3 "$TRAINING_DATA_DIR/generate_timmy_voice.py" \
  --count "$NEEDED" --batch "$BATCH_NUM" --seed "$SEED" --output "$OUTPUT" 2>&1)
GEN_EXIT=$?
set -e

if [[ $GEN_EXIT -ne 0 ]]; then
  log "Generation failed (exit $GEN_EXIT): $OUTPUT_GEN"
  # Escape backslashes and quotes, and flatten newlines, so arbitrary
  # generator output cannot corrupt the JSON status line (the old unescaped
  # echo also dropped the field quotes entirely).
  details="${OUTPUT_GEN//\\/\\\\}"
  details="${details//\"/\\\"}"
  details="${details//$'\n'/ }"
  printf '{"pipeline":"training-factory","status":"failed","reason":"generation_error","details":"%s"}\n' "$details"
  exit 1
fi

log "Generation complete: $OUTPUT"
# ---------------------------------------------------------------------------
# Validate the generated file with the project validator; exit 2 on failure
# (the exit code the file header documents for validation failure).
# ---------------------------------------------------------------------------
log "Validating generated pairs..."
set +e
VALIDATE_OUTPUT=$(python3 "$TRAINING_DATA_DIR/validate_timmy_voice.py" "$OUTPUT" 2>&1)
VALIDATE_EXIT=$?
set -e

if [[ $VALIDATE_EXIT -ne 0 ]]; then
  # Two log calls: log() uses plain echo, so the old literal "\n" in the
  # message never produced an actual line break.
  log "VALIDATION FAILED:"
  log "$VALIDATE_OUTPUT"
  # Single-quoted so the JSON field quotes survive (old echo emitted
  # invalid JSON).
  echo '{"pipeline":"training-factory","status":"failed","reason":"validation_failed"}'
  # Header contract: 2 = validation failed (old code returned 1 here).
  exit 2
fi

log "Validation passed."
# ---------------------------------------------------------------------------
# Re-count the grand total after generation, record estimated token usage,
# and emit the machine-readable success line.
# ---------------------------------------------------------------------------
TOTAL_NOW=0
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  # grep -c prints "0" AND exits 1 on zero matches, so the old `|| echo 0`
  # produced two lines and broke the arithmetic. Default only when grep
  # itself errored and printed nothing.
  n=$(grep -c '[^[:space:]]' "$f" 2>/dev/null || true)
  TOTAL_NOW=$((TOTAL_NOW + ${n:-0}))
done
log "Total timmy-voice pairs after generation: $TOTAL_NOW"

# Rough cost estimate: ~150 tokens per generated pair on average.
TOKENS_USED=$((NEEDED * 150))
record_usage "$TOKENS_USED"
log "Token usage recorded: ~$TOKENS_USED tokens"

# printf keeps the JSON field quotes intact while interpolating the numbers;
# the previous unescaped echo emitted invalid JSON.
printf '{"pipeline":"training-factory","status":"success","batch":%d,"generated":%d,"total":%d,"tokens_used":%d}\n' \
  "$BATCH_NUM" "$NEEDED" "$TOTAL_NOW" "$TOKENS_USED"
log "=== Training Factory complete ==="
exit 0