Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 30s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Matrix Scaffold / validate-scaffold (pull_request) Failing after 28s
Validate Training Data / validate (pull_request) Successful in 23s
Architecture Lint / Lint Repository (pull_request) Failing after 22s
PR Checklist / pr-checklist (pull_request) Failing after 3m30s
- Add scripts/pipeline_training_factory.sh — pipeline entry point
- Add training-data/generate_timmy_voice.py — parameterized generator

The pipeline will generate up to 10K total prompt→response pairs.
189 lines
5.6 KiB
Bash
Executable File
189 lines
5.6 KiB
Bash
Executable File
#!/usr/bin/env bash
# pipeline_training_factory.sh — Generate Timmy Voice training data to reach 10K pairs.
#
# This is the Training Factory pipeline. It checks existing timmy-voice training
# data count and generates just enough new pairs to reach the 10,000 target.
# Uses the existing curated_dataset.jsonl as seed prompts and applies quality
# filtering per SOUL.md.
#
# Usage:
#   ./scripts/pipeline_training_factory.sh                     # default 150k token budget
#   ./scripts/pipeline_training_factory.sh --max-tokens 200000
#
# Exit codes: 0 = success, 1 = failure, 2 = validation failed

set -euo pipefail

# All pipeline state (budget ledger, logs) lives under $HERMES_HOME (env-overridable).
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
BUDGET_FILE="$HERMES_HOME/pipeline_budget.json"
LOG_DIR="$HERMES_HOME/logs"
LOG_FILE="$LOG_DIR/pipeline-training-factory.log"

# Resolve training-data/ relative to this script. Fail with an explicit message:
# under plain `set -e` a failed $(cd ...) would abort the script silently.
# ${BASH_SOURCE[0]} is preferred over $0 so the path survives `bash script.sh`
# and sourcing alike.
TRAINING_DATA_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/../training-data" && pwd)" || {
  echo "ERROR: training-data directory not found relative to this script" >&2
  exit 1
}

# Token budget handling: shared daily limit across pipelines (env-overridable).
DAILY_LIMIT="${PIPELINE_DAILY_LIMIT:-150000}"
ensure_dirs() {
  # Create the parent directories for the log file and budget ledger
  # so later appends/writes cannot fail on a missing path.
  local log_parent budget_parent
  log_parent="$(dirname "$LOG_FILE")"
  budget_parent="$(dirname "$BUDGET_FILE")"
  mkdir -p "$log_parent" "$budget_parent"
}
log() {
  # Emit a timestamped message to stdout and append it to $LOG_FILE.
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG_FILE"
}
get_tokens_used_today() {
  # Print the number of tokens recorded for today in $BUDGET_FILE.
  # Prints 0 when the file is missing, unreadable, malformed, or has no
  # entry for today; always succeeds so callers can use $( ) safely.
  local today
  today=$(date +%Y-%m-%d)
  if [[ ! -f "$BUDGET_FILE" ]]; then
    echo 0
    return 0
  fi
  # Pass the path and date via argv instead of interpolating them into the
  # Python source: a quote or backslash in $BUDGET_FILE would otherwise
  # corrupt (or inject into) the program text.
  python3 - "$BUDGET_FILE" "$today" 2>/dev/null <<'PY' || echo 0
import json, sys
try:
    with open(sys.argv[1]) as f:
        d = json.load(f)
    print(d.get('daily', {}).get(sys.argv[2], {}).get('tokens_used', 0))
except Exception:
    print(0)
PY
}
record_usage() {
  # Add $1 tokens to today's totals in $BUDGET_FILE — both the day-wide
  # counter and the 'training-factory' per-pipeline counter.
  # Best-effort by design: failures are swallowed so a bookkeeping problem
  # never aborts the pipeline itself.
  local tokens="$1"
  local today
  today=$(date +%Y-%m-%d)
  # argv-passing avoids interpolating $BUDGET_FILE/$tokens into Python
  # source (quote breakage / code injection); int() rejects non-numeric
  # input instead of evaluating it as an expression.
  python3 - "$BUDGET_FILE" "$today" "$tokens" 2>/dev/null <<'PY' || true
import json, os, sys
path, day, tokens = sys.argv[1], sys.argv[2], int(sys.argv[3])
d = {}
if os.path.exists(path):
    with open(path) as f:
        d = json.load(f)
daily = d.setdefault('daily', {})
rec = daily.setdefault(day, {'tokens_used': 0, 'pipelines': {}})
rec['tokens_used'] = rec.get('tokens_used', 0) + tokens
# setdefault here too: an existing day record written by another pipeline
# might lack 'pipelines'; the old direct index would raise KeyError.
pipes = rec.setdefault('pipelines', {})
pipes['training-factory'] = pipes.get('training-factory', 0) + tokens
with open(path, 'w') as f:
    json.dump(d, f, indent=2)
PY
}
# ---------------------------------------------------------------------------
# Argument parsing
#   --max-tokens N   override the per-run token budget (default 150000)
# Unknown arguments are reported to stderr and ignored.
# ---------------------------------------------------------------------------
MAX_TOKENS=150000
while [[ $# -gt 0 ]]; do
  case "$1" in
    --max-tokens)
      # Validate up front: without this, a missing value dies with set -u's
      # opaque "unbound variable", and a non-numeric value breaks arithmetic later.
      if [[ $# -lt 2 || ! "$2" =~ ^[0-9]+$ ]]; then
        echo "ERROR: --max-tokens requires a numeric argument" >&2
        exit 1
      fi
      MAX_TOKENS="$2"
      shift 2
      ;;
    *)
      # Previously swallowed silently; a warning makes typos visible.
      echo "WARNING: ignoring unknown argument: $1" >&2
      shift
      ;;
  esac
done
log "=== Training Factory start (budget: $MAX_TOKENS tokens) ==="

# ---------------------------------------------------------------------------
# Budget gate: skip (exit 0, status "skipped") when too little of today's
# shared token budget remains to be worth starting a generation run.
# ---------------------------------------------------------------------------
USED=$(get_tokens_used_today)
REMAINING=$((DAILY_LIMIT - USED))
MIN_BUDGET=50000  # below this a run is not worth starting
if [[ $REMAINING -lt $MIN_BUDGET ]]; then
  log "Budget too low: $REMAINING remaining. Skipping."
  # Single-quoted so the inner double quotes reach stdout: the previous
  # unescaped echo stripped them and emitted invalid JSON.
  echo '{"pipeline":"training-factory","status":"skipped","reason":"insufficient_budget"}'
  exit 0
fi
# ---------------------------------------------------------------------------
# Count existing timmy-voice pairs: one pair per non-blank line of each
# timmy-voice-batch*.jsonl file.
# ---------------------------------------------------------------------------
COUNT_EXISTING=0
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  # grep -c prints the count even when it is 0 but then exits non-zero, so
  # the old `|| echo 0` appended a second line ("0<newline>0") and broke the
  # arithmetic below. Capture whatever grep printed; default only when empty
  # (grep error, e.g. unreadable file).
  n=$(grep -c '[^[:space:]]' "$f" 2>/dev/null || true)
  COUNT_EXISTING=$((COUNT_EXISTING + ${n:-0}))
done
log "Existing timmy-voice pairs: $COUNT_EXISTING"
# ---------------------------------------------------------------------------
# Early exit when the 10K target is already met.
# ---------------------------------------------------------------------------
TARGET=10000
NEEDED=$((TARGET - COUNT_EXISTING))
if [[ $NEEDED -le 0 ]]; then
  log "Target of $TARGET already reached (have $COUNT_EXISTING). Nothing to do."
  # printf keeps the JSON field quotes intact while interpolating the count;
  # the previous unescaped echo emitted invalid JSON.
  printf '{"pipeline":"training-factory","status":"success","existing":%d}\n' "$COUNT_EXISTING"
  record_usage 1000  # nominal charge for the check/logging itself
  exit 0
fi
log "Need to generate $NEEDED new pairs to reach $TARGET"

# ---------------------------------------------------------------------------
# Pick the next batch number: one past the highest existing batch, with a
# floor of 10 (presumably batches <= 10 are pre-existing curated data —
# TODO confirm against the repository's batch naming history).
# ---------------------------------------------------------------------------
BATCH_NUM=10
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  bn=$(basename "$f" | sed -n 's/.*batch\([0-9]*\)\.jsonl/\1/p')
  # Force base 10: a zero-padded name like batch08 would otherwise be read
  # as invalid octal and abort the comparison under set -e.
  if [[ -n "$bn" && $((10#$bn)) -gt $BATCH_NUM ]]; then
    BATCH_NUM=$((10#$bn))
  fi
done
BATCH_NUM=$((BATCH_NUM + 1))
log "New batch number: $BATCH_NUM"

# Zero-pad the batch number for the filename. The previous
# ${BATCH_NUM:02d} was Python format syntax, not bash — bash parses it as
# substring expansion with an invalid offset and aborts under set -e.
printf -v BATCH_PADDED '%02d' "$BATCH_NUM"
OUTPUT="$TRAINING_DATA_DIR/timmy-voice-batch${BATCH_PADDED}.jsonl"
SEED=$((570 + BATCH_NUM))  # deterministic per-batch RNG seed
GENERATOR="$TRAINING_DATA_DIR/generate_timmy_voice.py"
log "Running generator: python3 $GENERATOR --count $NEEDED --batch $BATCH_NUM --seed $SEED --output $OUTPUT"

if [[ ! -f "$GENERATOR" ]]; then
  log "ERROR: Generator not found at $GENERATOR"
  # Single-quoted so the JSON field quotes survive; the old unescaped echo
  # emitted invalid JSON.
  echo '{"pipeline":"training-factory","status":"failed","reason":"generator_missing"}'
  exit 1
fi
# ---------------------------------------------------------------------------
# Run generation. set +e around the call so the exit code can be captured
# and reported as structured JSON instead of set -e killing the script.
# ---------------------------------------------------------------------------
set +e
OUTPUT_GEN=$(python3 "$TRAINING_DATA_DIR/generate_timmy_voice.py" \
  --count "$NEEDED" --batch "$BATCH_NUM" --seed "$SEED" --output "$OUTPUT" 2>&1)
GEN_EXIT=$?
set -e

if [[ $GEN_EXIT -ne 0 ]]; then
  log "Generation failed (exit $GEN_EXIT): $OUTPUT_GEN"
  # Escape backslashes and quotes, and flatten newlines, so arbitrary
  # generator output cannot corrupt the JSON status line (the old unescaped
  # echo also dropped the field quotes entirely).
  details="${OUTPUT_GEN//\\/\\\\}"
  details="${details//\"/\\\"}"
  details="${details//$'\n'/ }"
  printf '{"pipeline":"training-factory","status":"failed","reason":"generation_error","details":"%s"}\n' "$details"
  exit 1
fi

log "Generation complete: $OUTPUT"
# ---------------------------------------------------------------------------
# Validate the generated file with the project validator; exit 2 on failure
# (the exit code the file header documents for validation failure).
# ---------------------------------------------------------------------------
log "Validating generated pairs..."
set +e
VALIDATE_OUTPUT=$(python3 "$TRAINING_DATA_DIR/validate_timmy_voice.py" "$OUTPUT" 2>&1)
VALIDATE_EXIT=$?
set -e

if [[ $VALIDATE_EXIT -ne 0 ]]; then
  # Two log calls: log() uses plain echo, so the old literal "\n" in the
  # message never produced an actual line break.
  log "VALIDATION FAILED:"
  log "$VALIDATE_OUTPUT"
  # Single-quoted so the JSON field quotes survive (old echo emitted
  # invalid JSON).
  echo '{"pipeline":"training-factory","status":"failed","reason":"validation_failed"}'
  # Header contract: 2 = validation failed (old code returned 1 here).
  exit 2
fi

log "Validation passed."
# ---------------------------------------------------------------------------
# Re-count the grand total after generation, record estimated token usage,
# and emit the machine-readable success line.
# ---------------------------------------------------------------------------
TOTAL_NOW=0
for f in "$TRAINING_DATA_DIR"/timmy-voice-batch*.jsonl; do
  [[ -f "$f" ]] || continue
  # grep -c prints "0" AND exits 1 on zero matches, so the old `|| echo 0`
  # produced two lines and broke the arithmetic. Default only when grep
  # itself errored and printed nothing.
  n=$(grep -c '[^[:space:]]' "$f" 2>/dev/null || true)
  TOTAL_NOW=$((TOTAL_NOW + ${n:-0}))
done
log "Total timmy-voice pairs after generation: $TOTAL_NOW"

# Rough cost estimate: ~150 tokens per generated pair on average.
TOKENS_USED=$((NEEDED * 150))
record_usage "$TOKENS_USED"
log "Token usage recorded: ~$TOKENS_USED tokens"

# printf keeps the JSON field quotes intact while interpolating the numbers;
# the previous unescaped echo emitted invalid JSON.
printf '{"pipeline":"training-factory","status":"success","batch":%d,"generated":%d,"total":%d,"tokens_used":%d}\n' \
  "$BATCH_NUM" "$NEEDED" "$TOTAL_NOW" "$TOKENS_USED"
log "=== Training Factory complete ==="
exit 0