feat: Add playground-factory pipeline script (#624)
This commit is contained in:
80
scripts/pipeline_playground_factory.sh
Normal file
80
scripts/pipeline_playground_factory.sh
Normal file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env bash
|
||||
# pipeline_playground_factory.sh — Generate training data pairs from sessions.
|
||||
#
|
||||
# Exports session transcripts into terse→rich prompt pairs for model training.
|
||||
# Reads from $HERMES_HOME/sessions/ (default ~/.hermes/sessions/), writes to $TRAINING_DIR (default ~/.timmy/training-data/playground/).
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/pipeline_playground_factory.sh --max-tokens 100000
|
||||
# ./scripts/pipeline_playground_factory.sh --dry-run
|
||||
|
||||
set -euo pipefail

# Configuration. HERMES_HOME, TRAINING_DIR and MAX_TOKENS may be preset in the
# environment; the values below are the fallbacks.
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
TRAINING_DIR="${TRAINING_DIR:-$HOME/.timmy/training-data/playground}"
SESSIONS_DIR="${HERMES_HOME}/sessions"
MAX_TOKENS="${MAX_TOKENS:-100000}"
DRY_RUN=false
TOKENS_USED=0

#######################################
# Parse command-line options.
# Globals:   MAX_TOKENS (written), DRY_RUN (written)
# Arguments: the script's arguments:
#              --max-tokens N   token budget for this run
#              --dry-run        only report which sessions would be processed
# Outputs:   diagnostics on stderr
# Returns:   0 on success, 2 on a missing or non-numeric --max-tokens value
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --max-tokens)
        # Check the arity first: under `set -u` a bare "$2" would abort with
        # an unhelpful "unbound variable" message.
        if (( $# < 2 )); then
          printf 'error: --max-tokens requires a value\n' >&2
          return 2
        fi
        MAX_TOKENS=$2
        shift 2
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      *)
        # Unknown arguments are still ignored, but no longer silently.
        printf 'warning: ignoring unknown argument: %s\n' "$1" >&2
        shift
        ;;
    esac
  done

  # MAX_TOKENS is compared numerically later on, so reject anything that is
  # not a plain non-negative integer (this also covers a bad environment value).
  if [[ ! "$MAX_TOKENS" =~ ^[0-9]+$ ]]; then
    printf 'error: max tokens must be a non-negative integer, got: %s\n' \
      "$MAX_TOKENS" >&2
    return 2
  fi
  # Force base 10 so a value with leading zeros is not read as octal.
  MAX_TOKENS=$((10#$MAX_TOKENS))
}

parse_args "$@" || exit 2
|
||||
|
||||
# Write a message to stdout, prefixed with the script tag and the current
# time of day (HH:MM:SS). All arguments are joined into a single message.
log() {
  printf '[playground-factory %s] %s\n' "$(date '+%H:%M:%S')" "$*"
}
|
||||
|
||||
#######################################
# Print the training pairs found in one session transcript, one JSON object
# per line.
#
# A pair is a 'user' message immediately followed by an 'assistant' message.
# The user text is cut to 200 characters ("terse") and the assistant text to
# 2000 characters ("rich"). Pairs whose terse text is 10 characters or fewer,
# or whose rich text is 50 characters or fewer, are dropped.
#
# Arguments: $1 - path to a session JSON file
# Outputs:   JSON lines with keys 'terse', 'rich' and 'source' on stdout
# Returns:   non-zero if python3 cannot read or parse the file
#######################################
extract_pairs() {
  # The path is passed as an argument instead of being spliced into the Python
  # source, so quotes or backslashes in a file name cannot break the program.
  python3 - "$1" <<'PY'
import json
import sys

path = sys.argv[1]
# Read as UTF-8 explicitly instead of relying on the locale's default encoding.
with open(path, encoding="utf-8") as f:
    data = json.load(f)

messages = data.get("messages", [])
for prompt, reply in zip(messages, messages[1:]):
    if prompt.get("role") != "user" or reply.get("role") != "assistant":
        continue
    terse = prompt.get("content", "")
    rich = reply.get("content", "")
    # Skip pairs whose content is not plain text (e.g. null); previously one
    # such message raised and discarded every pair in the session.
    if not isinstance(terse, str) or not isinstance(rich, str):
        continue
    terse, rich = terse[:200], rich[:2000]
    if len(terse) > 10 and len(rich) > 50:
        print(json.dumps({"terse": terse, "rich": rich, "source": path}))
PY
}

#######################################
# List candidate session files, one path per line.
# Globals:   SESSIONS_DIR (read)
# Arguments: $1 - reference export file, or empty when no export exists yet
# Outputs:   paths of session_*.json files on stdout
#######################################
list_new_sessions() {
  local reference=$1
  if [[ -n "$reference" ]]; then
    find "$SESSIONS_DIR" -name 'session_*.json' -type f -newer "$reference"
  else
    # First run: there is no export to compare against. `find -newer ""`
    # fails and lists nothing, so the filter must be left out entirely.
    find "$SESSIONS_DIR" -name 'session_*.json' -type f
  fi
}

#######################################
# Export terse/rich pairs for sessions newer than the latest export.
# Globals:   TRAINING_DIR, SESSIONS_DIR, MAX_TOKENS, DRY_RUN (read)
#            LATEST_EXPORT, SESSION_COUNT, PAIR_COUNT, TOKENS_USED (written)
# Outputs:   progress and summary lines via log; one
#            <session name>.jsonl file per session that yields pairs
# Returns:   non-zero if TRAINING_DIR cannot be created
#######################################
export_sessions() {
  local -r max_sessions=50    # at most this many sessions per run
  local -r tokens_per_pair=50 # flat per-pair figure charged to the budget
  local session_file session_name pairs outfile count

  SESSION_COUNT=0
  PAIR_COUNT=0

  mkdir -p -- "$TRAINING_DIR" || return 1

  # "Latest" is the last export path in sort order; its modification time is
  # the cut-off for selecting sessions.
  # NOTE(review): sessions left over from a run that hit the session cap or
  # the token budget may be older than this cut-off and then are not selected
  # on later runs - confirm whether that is intended.
  LATEST_EXPORT=$(find "$TRAINING_DIR" -name '*.jsonl' -type f -print 2>/dev/null \
    | sort | tail -n 1)

  if [[ ! -d "$SESSIONS_DIR" ]]; then
    log "No sessions directory at $SESSIONS_DIR; nothing to export."
  fi

  # Read paths line by line on a dedicated descriptor so names containing
  # spaces stay intact and commands in the loop body cannot consume the list.
  while IFS= read -r -u 3 session_file; do
    if (( TOKENS_USED >= MAX_TOKENS )); then
      log "Token budget exhausted ($TOKENS_USED >= $MAX_TOKENS). Stopping."
      break
    fi

    SESSION_COUNT=$((SESSION_COUNT + 1))
    session_name=${session_file##*/}

    if [[ "$DRY_RUN" == true ]]; then
      log "DRY-RUN: Would process $session_name"
      continue
    fi

    # A session that cannot be parsed is reported and skipped rather than
    # silently treated as "no pairs".
    if ! pairs=$(extract_pairs "$session_file"); then
      log "WARN: could not extract pairs from $session_file; skipping."
      continue
    fi
    [[ -n "$pairs" ]] || continue

    outfile="$TRAINING_DIR/${session_name%.json}.jsonl"
    printf '%s\n' "$pairs" >"$outfile"
    # The arithmetic wrapper strips the padding some wc implementations emit.
    count=$(( $(wc -l <<<"$pairs") ))
    PAIR_COUNT=$((PAIR_COUNT + count))
    TOKENS_USED=$((TOKENS_USED + count * tokens_per_pair))
  done 3< <(list_new_sessions "$LATEST_EXPORT" 2>/dev/null \
    | sort | head -n "$max_sessions")

  log "Done: $SESSION_COUNT sessions processed, $PAIR_COUNT pairs generated, $TOKENS_USED tokens used."
}

# The script's exit status is that of export_sessions (0 on success).
export_sessions
|
||||
Reference in New Issue
Block a user