feat: add Training Factory pipeline for Timmy Voice (fixes #572)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 30s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Matrix Scaffold / validate-scaffold (pull_request) Failing after 28s
Validate Training Data / validate (pull_request) Successful in 23s
Architecture Lint / Lint Repository (pull_request) Failing after 22s
PR Checklist / pr-checklist (pull_request) Failing after 3m30s
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 30s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Matrix Scaffold / validate-scaffold (pull_request) Failing after 28s
Validate Training Data / validate (pull_request) Successful in 23s
Architecture Lint / Lint Repository (pull_request) Failing after 22s
PR Checklist / pr-checklist (pull_request) Failing after 3m30s
- Add scripts/pipeline_training_factory.sh — pipeline entry point
- Add training-data/generate_timmy_voice.py — parameterized generator

The pipeline will generate up to 10,000 total prompt→response pairs.
This commit is contained in:
188
scripts/pipeline_training_factory.sh
Executable file
188
scripts/pipeline_training_factory.sh
Executable file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env bash
# pipeline_training_factory.sh — Generate Timmy Voice training data to reach 10K pairs.
#
# This is the Training Factory pipeline. It checks existing timmy-voice training
# data count and generates just enough new pairs to reach the 10,000 target.
# Uses the existing curated_dataset.jsonl as seed prompts and applies quality
# filtering per SOUL.md.
#
# Usage:
#   ./scripts/pipeline_training_factory.sh                      # Run with default 150k token budget
#   ./scripts/pipeline_training_factory.sh --max-tokens 200000
#
# Exit codes: 0 = success, 1 = failure, 2 = validation failed

set -euo pipefail

# Root of local Hermes state; overridable via the HERMES_HOME env var.
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
# Rolling per-day token-usage ledger shared across pipelines (JSON).
BUDGET_FILE="$HERMES_HOME/pipeline_budget.json"
LOG_DIR="$HERMES_HOME/logs"
LOG_FILE="$LOG_DIR/pipeline-training-factory.log"
# Resolve training-data/ relative to this script so the pipeline works from any CWD.
TRAINING_DATA_DIR="$(cd "$(dirname "$0")/../training-data" && pwd)"

# Token budget handling: daily ceiling across all pipelines, overridable via env.
DAILY_LIMIT="${PIPELINE_DAILY_LIMIT:-150000}"
|
||||
|
||||
# Create the log and budget directories if they do not yet exist.
ensure_dirs() {
  local log_parent budget_parent
  log_parent="$(dirname "$LOG_FILE")"
  budget_parent="$(dirname "$BUDGET_FILE")"
  mkdir -p -- "$log_parent" "$budget_parent"
}
|
||||
|
||||
# Emit a timestamped message to stdout and append it to the pipeline log.
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '%s\n' "[$stamp] $*" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Print the number of tokens recorded for today in BUDGET_FILE, or 0 if the
# file is missing/unreadable. Never fails: budget accounting is best-effort
# and must not abort the pipeline.
# Globals:   BUDGET_FILE (read)
# Outputs:   token count (integer) on stdout
get_tokens_used_today() {
  local today
  today=$(date +%Y-%m-%d)
  if [[ ! -f "$BUDGET_FILE" ]]; then
    echo 0
    return 0
  fi
  # Pass the path and date via argv instead of interpolating them into the
  # Python source: a path containing a quote would otherwise break (or
  # inject into) the embedded program.
  python3 - "$BUDGET_FILE" "$today" <<'PY' 2>/dev/null || echo 0
import json, sys
try:
    with open(sys.argv[1]) as f:
        d = json.load(f)
    print(d.get('daily', {}).get(sys.argv[2], {}).get('tokens_used', 0))
except Exception:
    print(0)
PY
}
|
||||
|
||||
# Add $1 tokens to today's entry in BUDGET_FILE (creating the file as needed),
# attributed to the 'training-factory' pipeline. Best-effort: all failures are
# swallowed on purpose so accounting problems never abort a run.
# Globals:   BUDGET_FILE (read/written)
# Arguments: $1 - token count to record (integer)
record_usage() {
  local tokens="$1"
  local today
  today=$(date +%Y-%m-%d)
  # Values go in via argv, not string interpolation, so an odd path or a
  # non-numeric token count cannot corrupt or inject into the Python code.
  python3 - "$BUDGET_FILE" "$today" "$tokens" <<'PY' 2>/dev/null || true
import json, os, sys
path, today, tokens = sys.argv[1], sys.argv[2], int(sys.argv[3])
d = {}
if os.path.exists(path):
    with open(path) as f:
        d = json.load(f)
daily = d.setdefault('daily', {})
day = daily.setdefault(today, {'tokens_used': 0, 'pipelines': {}})
day['tokens_used'] = day.get('tokens_used', 0) + tokens
# setdefault here too: an existing day entry may lack 'pipelines'
# (the original indexed it directly and raised KeyError).
pipes = day.setdefault('pipelines', {})
pipes['training-factory'] = pipes.get('training-factory', 0) + tokens
with open(path, 'w') as f:
    json.dump(d, f, indent=2)
PY
}
|
||||
|
||||
# Parse command-line arguments.
# NOTE(review): MAX_TOKENS is currently only echoed in the start log; the
# enforced ceiling is DAILY_LIMIT — confirm whether the generator should
# receive this value.
MAX_TOKENS=150000
while [[ $# -gt 0 ]]; do
  case "$1" in
    --max-tokens)
      # Guard against a missing value: under `set -u` a bare `$2` would
      # abort the script with an unhelpful unbound-variable error.
      if [[ $# -lt 2 ]]; then
        echo "ERROR: --max-tokens requires a value" >&2
        exit 1
      fi
      MAX_TOKENS="$2"
      shift 2
      ;;
    *)
      # Unknown arguments are ignored (kept for forward compatibility).
      shift
      ;;
  esac
done
|
||||
|
||||
# Make sure the log/budget directories exist before the first log/record
# call. The original defined ensure_dirs but never invoked it, so `tee -a`
# inside log() could fail on a fresh install.
ensure_dirs

log "=== Training Factory start (budget: $MAX_TOKENS tokens) ==="

# Check current budget; refuse to run when the day's remaining tokens are
# below the 50k floor this pipeline needs.
USED=$(get_tokens_used_today)
REMAINING=$((DAILY_LIMIT - USED))
if [[ $REMAINING -lt 50000 ]]; then
  log "Budget too low: $REMAINING remaining. Skipping."
  # printf with escaped quotes: the original `echo "{"…"}"` concatenated
  # quoted/unquoted segments and emitted quoteless, invalid JSON.
  printf '{"pipeline":"training-factory","status":"skipped","reason":"insufficient_budget"}\n'
  exit 0
fi
|
||||
|
||||
# Count existing timmy-voice pairs (one JSON object per non-blank line).
COUNT_EXISTING=0
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  # grep -c prints the count itself but exits non-zero when it is 0, so the
  # original `|| echo 0` produced "0\n0" and broke the arithmetic below.
  # `|| true` keeps grep's own "0"; ${n:-0} covers a hard grep error.
  n=$(grep -c '[^[:space:]]' "$f" 2>/dev/null || true)
  COUNT_EXISTING=$((COUNT_EXISTING + ${n:-0}))
done
log "Existing timmy-voice pairs: $COUNT_EXISTING"

TARGET=10000
NEEDED=$((TARGET - COUNT_EXISTING))
if [[ $NEEDED -le 0 ]]; then
  log "Target of $TARGET already reached (have $COUNT_EXISTING). Nothing to do."
  # Still report success — with escaped quotes so the output is valid JSON.
  printf '{"pipeline":"training-factory","status":"success","existing":%d}\n' "$COUNT_EXISTING"
  record_usage 1000 # nominal logging
  exit 0
fi
|
||||
|
||||
log "Need to generate $NEEDED new pairs to reach $TARGET"

# Determine batch number: find the highest existing batch, starting the
# floor at 10 (batches below that presumably predate this pipeline — TODO
# confirm).
BATCH_NUM=10
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  bn=$(basename "$f" | sed -n 's/.*batch\([0-9]*\)\.jsonl/\1/p')
  # Force base-10: a zero-padded name like batch08 would otherwise be
  # parsed as (invalid) octal by the arithmetic comparison.
  if [[ -n "$bn" && $((10#$bn)) -gt "$BATCH_NUM" ]]; then
    BATCH_NUM=$((10#$bn))
  fi
done
BATCH_NUM=$((BATCH_NUM + 1))
log "New batch number: $BATCH_NUM"

# Zero-pad with printf: ${BATCH_NUM:02d} is Python format syntax, not bash,
# and aborted the script under `set -e` with an arithmetic error.
printf -v BATCH_PADDED '%02d' "$BATCH_NUM"
OUTPUT="$TRAINING_DATA_DIR/timmy-voice-batch${BATCH_PADDED}.jsonl"
SEED=$((570 + BATCH_NUM))

log "Running generator: python3 $TRAINING_DATA_DIR/generate_timmy_voice.py --count $NEEDED --batch $BATCH_NUM --seed $SEED --output $OUTPUT"

if [[ ! -f "$TRAINING_DATA_DIR/generate_timmy_voice.py" ]]; then
  log "ERROR: Generator not found at $TRAINING_DATA_DIR/generate_timmy_voice.py"
  # Escaped quotes: the original echo emitted quoteless, invalid JSON.
  printf '{"pipeline":"training-factory","status":"failed","reason":"generator_missing"}\n'
  exit 1
fi
|
||||
|
||||
# Run generation; capture combined output and exit status without letting
# a non-zero generator exit trip `set -e`.
set +e
OUTPUT_GEN=$(python3 "$TRAINING_DATA_DIR/generate_timmy_voice.py" --count "$NEEDED" --batch "$BATCH_NUM" --seed "$SEED" --output "$OUTPUT" 2>&1)
GEN_EXIT=$?
set -e

if [[ $GEN_EXIT -ne 0 ]]; then
  log "Generation failed (exit $GEN_EXIT): $OUTPUT_GEN"
  # JSON-encode the free-form generator output (it can contain quotes and
  # newlines); the original interpolated it raw into an already-broken
  # quoteless JSON string.
  DETAILS_JSON=$(printf '%s' "$OUTPUT_GEN" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))')
  printf '{"pipeline":"training-factory","status":"failed","reason":"generation_error","details":%s}\n' "$DETAILS_JSON"
  exit 1
fi

log "Generation complete: $OUTPUT"
|
||||
|
||||
# Validate the generated file against the Timmy Voice quality rules.
log "Validating generated pairs..."
set +e
VALIDATE_OUTPUT=$(python3 "$TRAINING_DATA_DIR/validate_timmy_voice.py" "$OUTPUT" 2>&1)
VALIDATE_EXIT=$?
set -e

if [[ $VALIDATE_EXIT -ne 0 ]]; then
  # Two log calls instead of a literal "\n": plain echo does not expand it.
  log "VALIDATION FAILED:"
  log "$VALIDATE_OUTPUT"
  # Escaped quotes so the status line is valid JSON.
  printf '{"pipeline":"training-factory","status":"failed","reason":"validation_failed"}\n'
  # Exit 2 per the documented contract ("2 = validation failed"); the
  # original exited 1, conflating validation failure with generic errors.
  exit 2
fi

log "Validation passed."
|
||||
|
||||
# Re-count total pairs across all batches after generation.
TOTAL_NOW=0
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  # grep -c already prints "0" on no match but exits non-zero; the original
  # `|| echo 0` then emitted a second "0" and corrupted the arithmetic.
  n=$(grep -c '[^[:space:]]' "$f" 2>/dev/null || true)
  TOTAL_NOW=$((TOTAL_NOW + ${n:-0}))
done
log "Total timmy-voice pairs after generation: $TOTAL_NOW"

# Estimate token usage (rough: ~150 tokens per pair avg) and record it.
TOKENS_USED=$((NEEDED * 150))
record_usage "$TOKENS_USED"
log "Token usage recorded: ~$TOKENS_USED tokens"

# printf with escaped quotes so downstream consumers get valid JSON
# (the original echo emitted a quoteless string).
printf '{"pipeline":"training-factory","status":"success","batch":%d,"generated":%d,"total":%d,"tokens_used":%d}\n' \
  "$BATCH_NUM" "$NEEDED" "$TOTAL_NOW" "$TOKENS_USED"
log "=== Training Factory complete ==="
exit 0
|
||||
Reference in New Issue
Block a user