#!/usr/bin/env bash
# pipeline_playground_factory.sh — Generate training data pairs from sessions.
#
# Exports session transcripts into terse→rich prompt pairs for model training.
# Reads session_*.json files from $HERMES_HOME/sessions and writes one
# <session>.jsonl file per session into $TRAINING_DIR.
#
# Usage:
#   ./scripts/pipeline_playground_factory.sh --max-tokens 100000
#   ./scripts/pipeline_playground_factory.sh --dry-run
#
# Environment:
#   HERMES_HOME   directory containing sessions/ (default: $HOME/.hermes)
#   TRAINING_DIR  output directory
#                 (default: $HOME/.timmy/training-data/playground)
#   MAX_TOKENS    token budget used when --max-tokens is not given
#                 (default: 100000)

set -euo pipefail

# At most this many sessions are considered per invocation.
readonly MAX_SESSIONS_PER_RUN=50
# Flat per-pair cost charged against the token budget (an estimate, not a
# measurement of the actual text).
readonly TOKENS_PER_PAIR=50

# Progress messages go to stdout, prefixed with a timestamp.
log() { printf '[playground-factory %s] %s\n' "$(date '+%H:%M:%S')" "$*"; }

# Diagnostics go to stderr so they never mix with progress output.
warn() { log "WARNING: $*" >&2; }

die() {
  log "ERROR: $*" >&2
  exit 1
}

#######################################
# Print the most recently modified *.jsonl file under a directory.
# Arguments: $1 - directory to search (may not exist)
# Outputs:   path of the newest export on stdout; nothing if there is none
#######################################
newest_export() {
  local dir=$1
  local newest="" candidate

  [[ -d "$dir" ]] || return 0

  while IFS= read -r -d '' candidate; do
    # Compare modification times; sorting file names would not find the
    # newest file.
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done < <(find "$dir" -type f -name '*.jsonl' -print0)

  printf '%s' "$newest"
}

#######################################
# Emit one JSON object per user→assistant message pair in a session file.
# A pair is kept when the user text (cut to 200 chars) is longer than 10
# chars and the assistant text (cut to 2000 chars) is longer than 50 chars.
# Arguments: $1 - path to a session JSON file
# Outputs:   JSON lines with the keys terse, rich and source on stdout
# Returns:   non-zero if the file cannot be read or parsed
#######################################
extract_pairs() {
  # The path travels as an argument, never spliced into the Python source,
  # so quotes or other special characters in it cannot alter the program.
  python3 - "$1" <<'PY'
import json
import sys

path = sys.argv[1]
try:
    with open(path, encoding="utf-8") as fh:
        data = json.load(fh)
except (OSError, ValueError) as exc:
    sys.exit(f"cannot read {path}: {exc}")

messages = data.get("messages", []) if isinstance(data, dict) else []
if not isinstance(messages, list):
    messages = []


def text_of(message):
    # Non-string content (null, lists, ...) counts as empty text instead of
    # aborting the whole session.
    content = message.get("content")
    return content if isinstance(content, str) else ""


for current, following in zip(messages, messages[1:]):
    if not (isinstance(current, dict) and isinstance(following, dict)):
        continue
    if current.get("role") != "user" or following.get("role") != "assistant":
        continue
    terse = text_of(current)[:200]
    rich = text_of(following)[:2000]
    if len(terse) > 10 and len(rich) > 50:
        print(json.dumps({"terse": terse, "rich": rich, "source": path}))
PY
}

#######################################
# Entry point: parse arguments, select sessions, export pairs.
# Globals:   HERMES_HOME, TRAINING_DIR, MAX_TOKENS (read)
# Arguments: [--max-tokens N] [--dry-run]
# Outputs:   progress on stdout, warnings and errors on stderr
# Returns:   0 on success; exits 1 on invalid arguments or setup failure
#######################################
main() {
  local hermes_home="${HERMES_HOME:-$HOME/.hermes}"
  local training_dir="${TRAINING_DIR:-$HOME/.timmy/training-data/playground}"
  local sessions_dir="${hermes_home}/sessions"
  local max_tokens="${MAX_TOKENS:-100000}"
  local dry_run=false
  local tokens_used=0 session_count=0 pair_count=0

  while (($# > 0)); do
    case "$1" in
      --max-tokens)
        (($# >= 2)) || die "--max-tokens requires a value"
        max_tokens=$2
        shift 2
        ;;
      --dry-run)
        dry_run=true
        shift
        ;;
      *)
        # Unknown arguments stay non-fatal, but are no longer silent.
        warn "ignoring unknown argument: $1"
        shift
        ;;
    esac
  done

  [[ "$max_tokens" =~ ^[0-9]+$ ]] \
    || die "token budget must be a non-negative integer, got '$max_tokens'"
  # Force base 10 so a value such as 0800 is not read as invalid octal.
  max_tokens=$((10#$max_tokens))

  if [[ ! -d "$sessions_dir" ]]; then
    warn "sessions directory not found: $sessions_dir"
    log "Done: $session_count sessions processed, $pair_count pairs generated, $tokens_used tokens used."
    return 0
  fi

  # A dry run must not touch the filesystem or require python3.
  if [[ "$dry_run" != true ]]; then
    command -v python3 >/dev/null 2>&1 || die "python3 is required"
    mkdir -p -- "$training_dir" || die "cannot create $training_dir"
  fi

  # Only sessions modified after the newest export are considered. On the
  # first run there is no export, so -newer is left out and every session
  # qualifies.
  # NOTE(review): sessions left over when the per-run cap or the token budget
  # stops a run are older than the exports that run wrote, so this filter
  # will not select them again unless they are modified.
  local watermark
  watermark=$(newest_export "$training_dir")
  local -a find_args=("$sessions_dir" -type f -name 'session_*.json')
  if [[ -n "$watermark" ]]; then
    find_args+=(-newer "$watermark")
  fi

  local session_file session_name outfile pairs count
  # fd 3 keeps the file list away from the stdin of commands in the loop.
  while IFS= read -r -d '' -u 3 session_file; do
    if ((session_count >= MAX_SESSIONS_PER_RUN)); then
      break
    fi
    if ((tokens_used >= max_tokens)); then
      log "Token budget exhausted ($tokens_used >= $max_tokens). Stopping."
      break
    fi

    session_count=$((session_count + 1))
    session_name=${session_file##*/}

    if [[ "$dry_run" == true ]]; then
      log "DRY-RUN: Would process $session_name"
      continue
    fi

    if ! pairs=$(extract_pairs "$session_file"); then
      warn "skipping $session_name: could not extract pairs"
      continue
    fi
    # Sessions that yield no pairs produce no output file.
    [[ -n "$pairs" ]] || continue

    outfile="$training_dir/${session_name%.json}.jsonl"
    printf '%s\n' "$pairs" >"$outfile"
    count=$(wc -l <"$outfile")
    pair_count=$((pair_count + count))
    tokens_used=$((tokens_used + count * TOKENS_PER_PAIR))
  done 3< <(find "${find_args[@]}" -print0 | LC_ALL=C sort -z)

  log "Done: $session_count sessions processed, $pair_count pairs generated, $tokens_used tokens used."
}

# Run only when executed as a script, so the functions can also be sourced.
if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
  main "$@"
fi