Files
timmy-config/scripts/pipeline_training_factory.sh
Rockachopa 438c62eb09
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 30s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Matrix Scaffold / validate-scaffold (pull_request) Failing after 28s
Validate Training Data / validate (pull_request) Successful in 23s
Architecture Lint / Lint Repository (pull_request) Failing after 22s
PR Checklist / pr-checklist (pull_request) Failing after 3m30s
feat: add Training Factory pipeline for Timmy Voice (fixes #572)
- Add scripts/pipeline_training_factory.sh — pipeline entry point
- Add training-data/generate_timmy_voice.py — parameterized generator

Pipeline will generate up to 10K total prompt→response pairs.
2026-04-26 12:07:48 -04:00

189 lines
5.6 KiB
Bash
Executable File

#!/usr/bin/env bash
# pipeline_training_factory.sh — Generate Timmy Voice training data to reach 10K pairs.
#
# This is the Training Factory pipeline. It checks existing timmy-voice training
# data count and generates just enough new pairs to reach the 10,000 target.
# Uses the existing curated_dataset.jsonl as seed prompts and applies quality
# filtering per SOUL.md.
#
# Usage:
# ./scripts/pipeline_training_factory.sh # Run with default 150k token budget
# ./scripts/pipeline_training_factory.sh --max-tokens 200000
#
# Exit codes: 0 = success, 1 = failure, 2 = validation failed
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# HERMES_HOME may be overridden from the environment; the budget ledger and the
# pipeline log both live underneath it.
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
BUDGET_FILE="$HERMES_HOME/pipeline_budget.json"
LOG_DIR="$HERMES_HOME/logs"
LOG_FILE="$LOG_DIR/pipeline-training-factory.log"

# Resolve the directory holding this script (it always exists) and derive the
# sibling training-data path from it. We deliberately do NOT cd into
# training-data here: if that directory is missing, the later generator check
# can report a structured failure instead of the script dying on `cd`.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
TRAINING_DATA_DIR="$(dirname -- "$SCRIPT_DIR")/training-data"

# Token budget handling: daily ceiling, overridable via PIPELINE_DAILY_LIMIT.
DAILY_LIMIT="${PIPELINE_DAILY_LIMIT:-150000}"
if [[ ! "$DAILY_LIMIT" =~ ^[0-9]+$ ]]; then
  printf 'ERROR: PIPELINE_DAILY_LIMIT must be a non-negative integer, got: %s\n' \
    "$DAILY_LIMIT" >&2
  exit 1
fi
# Force base 10 so a zero-padded value is not read as octal in later arithmetic.
DAILY_LIMIT=$((10#$DAILY_LIMIT))
# ensure_dirs — create the parent directories of the log file and of the budget
# ledger if they are missing.
# Globals:  LOG_FILE (read), BUDGET_FILE (read)
# Returns:  exit status of mkdir
ensure_dirs() {
  local log_parent budget_parent
  log_parent="$(dirname "$LOG_FILE")"
  budget_parent="$(dirname "$BUDGET_FILE")"
  mkdir -p "$log_parent" "$budget_parent"
}
# log — print a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: message words, joined with spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout; same line appended to
#            LOG_FILE
# Returns:   always 0. Logging is best-effort: the log directory is created on
#            demand, and an unwritable log file only produces a warning on
#            stderr instead of aborting the pipeline under `set -e`/`pipefail`.
log() {
  local line
  line="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
  printf '%s\n' "$line"

  mkdir -p -- "$(dirname -- "$LOG_FILE")" 2>/dev/null || true
  # Braces so that a failed redirection is itself silenced by 2>/dev/null.
  if ! { printf '%s\n' "$line" >>"$LOG_FILE"; } 2>/dev/null; then
    printf 'log: cannot append to %s\n' "$LOG_FILE" >&2
  fi
  return 0
}
# get_tokens_used_today — print how many tokens the budget ledger records for
# today's date.
# Globals:  BUDGET_FILE (read)
# Outputs:  a single integer on stdout; 0 when the ledger is missing,
#           unreadable, malformed, or has no entry for today
# Returns:  0
get_tokens_used_today() {
  local today
  today=$(date +%Y-%m-%d)

  if [[ ! -f "$BUDGET_FILE" ]]; then
    echo 0
    return 0
  fi

  # The path and date are passed as argv rather than spliced into the Python
  # source, so a quote or backslash in the path cannot break the program.
  # The value is coerced to int because callers use it in shell arithmetic.
  python3 - "$BUDGET_FILE" "$today" 2>/dev/null <<'PY' || echo 0
import json
import sys

path, today = sys.argv[1], sys.argv[2]
try:
    with open(path) as f:
        ledger = json.load(f)
    used = int(ledger.get('daily', {}).get(today, {}).get('tokens_used', 0))
except Exception:
    used = 0
print(used)
PY
}
# record_usage — add a token count to today's totals in the budget ledger.
# Globals:   BUDGET_FILE (read; file is created or rewritten)
# Arguments: $1 - number of tokens to add (non-negative integer)
# Outputs:   warnings on stderr when the update cannot be performed
# Returns:   always 0 — accounting is best-effort and must not abort the
#            pipeline, but failures are reported instead of silently dropped.
record_usage() {
  local tokens="${1:-}"
  local today
  today=$(date +%Y-%m-%d)

  # Reject anything that is not a plain integer before it reaches Python.
  if [[ ! "$tokens" =~ ^[0-9]+$ ]]; then
    printf 'record_usage: ignoring non-numeric token count: %s\n' "$tokens" >&2
    return 0
  fi

  # All values travel as argv, never as interpolated Python source.
  python3 - "$BUDGET_FILE" "$today" "$tokens" <<'PY' ||
import json
import os
import sys

path, today, tokens = sys.argv[1], sys.argv[2], int(sys.argv[3])

ledger = {}
if os.path.exists(path):
    try:
        with open(path) as f:
            ledger = json.load(f)
    except ValueError:
        # A corrupt ledger would otherwise block accounting forever.
        sys.stderr.write(
            'record_usage: %s is not valid JSON; starting a fresh ledger\n' % path)
        ledger = {}
if not isinstance(ledger, dict):
    ledger = {}

daily = ledger.setdefault('daily', {})
day = daily.setdefault(today, {})
day['tokens_used'] = day.get('tokens_used', 0) + tokens
# setdefault: an existing day entry may lack the 'pipelines' map.
pipelines = day.setdefault('pipelines', {})
pipelines['training-factory'] = pipelines.get('training-factory', 0) + tokens

parent = os.path.dirname(path)
if parent:
    os.makedirs(parent, exist_ok=True)
# Write to a sibling temp file and rename so an interrupted run cannot leave
# a truncated ledger behind.
tmp_path = '%s.%d.tmp' % (path, os.getpid())
with open(tmp_path, 'w') as f:
    json.dump(ledger, f, indent=2)
os.replace(tmp_path, path)
PY
    printf 'record_usage: could not update %s\n' "$BUDGET_FILE" >&2
  return 0
}
# parse_args — read command-line options into globals.
# Globals:   MAX_TOKENS (written)
# Arguments: the script's positional parameters
# Options:   --max-tokens N | --max-tokens=N   token budget for this run
#            Any other argument is ignored, as before.
# Exits:     1 with a message on stderr when --max-tokens has no value or the
#            value is not a non-negative integer (previously a missing value
#            crashed with an "unbound variable" error under `set -u`).
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --max-tokens)
        if (( $# < 2 )); then
          printf 'ERROR: --max-tokens requires a value\n' >&2
          exit 1
        fi
        MAX_TOKENS="$2"
        shift 2
        ;;
      --max-tokens=*)
        MAX_TOKENS="${1#*=}"
        shift
        ;;
      *)
        shift
        ;;
    esac
  done

  if [[ ! "$MAX_TOKENS" =~ ^[0-9]+$ ]]; then
    printf 'ERROR: --max-tokens must be a non-negative integer, got: %s\n' \
      "$MAX_TOKENS" >&2
    exit 1
  fi
  # Force base 10 so a zero-padded value is not treated as octal later on.
  MAX_TOKENS=$((10#$MAX_TOKENS))
}

# Parse args
MAX_TOKENS=150000
parse_args "$@"
# Create the log and budget directories before anything is logged; without
# this the very first log write fails on a host where ~/.hermes/logs does not
# exist yet.
ensure_dirs
log "=== Training Factory start (budget: $MAX_TOKENS tokens) ==="

# Check current budget
USED=$(get_tokens_used_today)
# The value feeds shell arithmetic, so refuse anything that is not an integer.
if [[ ! "$USED" =~ ^[0-9]+$ ]]; then
  log "WARNING: could not read today's token usage (got '$USED'); assuming 0"
  USED=0
fi
REMAINING=$((DAILY_LIMIT - USED))
if (( REMAINING < 50000 )); then
  log "Budget too low: $REMAINING remaining. Skipping."
  # Single-quoted so the inner double quotes survive and the line is valid JSON.
  printf '%s\n' '{"pipeline":"training-factory","status":"skipped","reason":"insufficient_budget"}'
  exit 0
fi
# count_voice_pairs — count non-blank lines across all timmy-voice batch files.
# Arguments: $1 - directory containing timmy-voice-batch*.jsonl files
# Outputs:   the total on stdout (0 when there are no batch files)
# Returns:   0
count_voice_pairs() {
  local dir="$1"
  local total=0
  local file n
  for file in "$dir"/timmy-voice-batch*.jsonl; do
    [[ -f "$file" ]] || continue
    # grep -c prints "0" AND exits 1 when nothing matches, so the usual
    # `|| echo 0` fallback would yield "0\n0" and break the arithmetic below.
    n=$(grep -c '[^[:space:]]' "$file" 2>/dev/null) || true
    [[ "$n" =~ ^[0-9]+$ ]] || n=0
    total=$((total + n))
  done
  printf '%s\n' "$total"
}

# Count existing timmy-voice pairs
COUNT_EXISTING=$(count_voice_pairs "$TRAINING_DATA_DIR")
log "Existing timmy-voice pairs: $COUNT_EXISTING"
# Work out how many pairs are still missing; stop early when the target is met.
TARGET=10000
NEEDED=$((TARGET - COUNT_EXISTING))
if (( NEEDED <= 0 )); then
  log "Target of $TARGET already reached (have $COUNT_EXISTING). Nothing to do."
  # Still report success. The format string is single-quoted so the JSON keeps
  # its double quotes; the count is substituted through %s.
  printf '{"pipeline":"training-factory","status":"success","existing":%s}\n' \
    "$COUNT_EXISTING"
  record_usage 1000 # nominal logging
  exit 0
fi
log "Need to generate $NEEDED new pairs to reach $TARGET"
# next_batch_number — print the batch number to use for a new batch file.
# Arguments: $1 - directory containing timmy-voice-batch*.jsonl files
#            $2 - lowest "highest existing" value to assume (default 10, so the
#                 first generated batch is 11)
# Outputs:   max(existing batch numbers, floor) + 1 on stdout
# Returns:   0
next_batch_number() {
  local dir="$1"
  local highest="${2:-10}"
  local pattern='batch([0-9]+)\.jsonl$'
  local file digits
  for file in "$dir"/timmy-voice-batch*.jsonl; do
    [[ -f "$file" ]] || continue
    [[ "${file##*/}" =~ $pattern ]] || continue
    digits="${BASH_REMATCH[1]}"
    # 10# forces base 10: zero-padded names such as batch08 would otherwise be
    # parsed as (invalid) octal.
    if (( 10#$digits > highest )); then
      highest=$((10#$digits))
    fi
  done
  printf '%d\n' "$((highest + 1))"
}

# Determine batch number (one past the highest existing batch)
BATCH_NUM=$(next_batch_number "$TRAINING_DATA_DIR")
log "New batch number: $BATCH_NUM"

# Bash has no ${var:02d}; zero-pad with printf instead.
printf -v BATCH_LABEL '%02d' "$BATCH_NUM"
OUTPUT="$TRAINING_DATA_DIR/timmy-voice-batch${BATCH_LABEL}.jsonl"
SEED=$((570 + BATCH_NUM))
# json_string — print $1 as a double-quoted JSON string literal.
# Backslash, double quote, newline, carriage return and tab are escaped; any
# other ASCII control character is dropped so the result is always valid JSON.
json_string() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  s=$(printf '%s' "$s" | LC_ALL=C tr -d '\000-\037')
  printf '"%s"' "$s"
}

# Verify the generator exists before announcing that it is being run.
GENERATOR="$TRAINING_DATA_DIR/generate_timmy_voice.py"
if [[ ! -f "$GENERATOR" ]]; then
  log "ERROR: Generator not found at $GENERATOR"
  printf '%s\n' '{"pipeline":"training-factory","status":"failed","reason":"generator_missing"}'
  exit 1
fi
log "Running generator: python3 $GENERATOR --count $NEEDED --batch $BATCH_NUM --seed $SEED --output $OUTPUT"

# Run generation. `|| GEN_EXIT=$?` captures the status without tripping set -e.
GEN_EXIT=0
OUTPUT_GEN=$(python3 "$GENERATOR" --count "$NEEDED" --batch "$BATCH_NUM" \
  --seed "$SEED" --output "$OUTPUT" 2>&1) || GEN_EXIT=$?
if (( GEN_EXIT != 0 )); then
  log "Generation failed (exit $GEN_EXIT): $OUTPUT_GEN"
  # The generator output is free text, so escape it before embedding in JSON.
  printf '{"pipeline":"training-factory","status":"failed","reason":"generation_error","details":%s}\n' \
    "$(json_string "$OUTPUT_GEN")"
  exit 1
fi
log "Generation complete: $OUTPUT"
# Validate the generated file
log "Validating generated pairs..."
# `|| VALIDATE_EXIT=$?` captures the status without tripping set -e.
VALIDATE_EXIT=0
VALIDATE_OUTPUT=$(python3 "$TRAINING_DATA_DIR/validate_timmy_voice.py" "$OUTPUT" 2>&1) \
  || VALIDATE_EXIT=$?
if (( VALIDATE_EXIT != 0 )); then
  # $'\n' is a real newline; "\n" inside double quotes would be logged literally.
  log "VALIDATION FAILED:"$'\n'"$VALIDATE_OUTPUT"
  # Move the rejected batch aside so the next run does not count its lines
  # toward the target. It is kept, not deleted, for inspection.
  if [[ -f "$OUTPUT" ]]; then
    if mv -f -- "$OUTPUT" "$OUTPUT.rejected"; then
      log "Rejected batch moved to $OUTPUT.rejected"
    fi
  fi
  printf '%s\n' '{"pipeline":"training-factory","status":"failed","reason":"validation_failed"}'
  # Exit code 2 is the documented "validation failed" status of this script.
  exit 2
fi
log "Validation passed."
# Re-count total after generation
TOTAL_NOW=0
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  # grep -c prints "0" and exits 1 when nothing matches; a `|| echo 0`
  # fallback would produce "0\n0" and break the arithmetic below.
  n=$(grep -c '[^[:space:]]' "$f" 2>/dev/null) || true
  [[ "$n" =~ ^[0-9]+$ ]] || n=0
  TOTAL_NOW=$((TOTAL_NOW + n))
done
log "Total timmy-voice pairs after generation: $TOTAL_NOW"

# Estimate token usage (rough: ~150 tokens per pair avg)
TOKENS_USED=$((NEEDED * 150))
record_usage "$TOKENS_USED"
log "Token usage recorded: ~$TOKENS_USED tokens"

# Final machine-readable status. The format string is single-quoted so the
# JSON keeps its double quotes; the numbers are substituted through %s.
printf '{"pipeline":"training-factory","status":"success","batch":%s,"generated":%s,"total":%s,"tokens_used":%s}\n' \
  "$BATCH_NUM" "$NEEDED" "$TOTAL_NOW" "$TOKENS_USED"
log "=== Training Factory complete ==="
exit 0