Compare commits
3 Commits
fix/660-py
...
burn/624-n
| Author | SHA1 | Date | |
|---|---|---|---|
| 9ea593156b | |||
| 05403d85be | |||
| 7409f53bed |
79
scripts/pipeline_knowledge_mine.sh
Normal file
79
scripts/pipeline_knowledge_mine.sh
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
#!/usr/bin/env bash
# pipeline_knowledge_mine.sh — Extract structured knowledge from session archives.
#
# Mines sessions for facts, skills, and patterns. Populates memory palaces.
#
# Usage:
#   ./scripts/pipeline_knowledge_mine.sh --max-tokens 80000

set -euo pipefail

HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
SESSIONS_DIR="${HERMES_HOME}/sessions"
KNOWLEDGE_DIR="${KNOWLEDGE_DIR:-$HOME/.timmy/knowledge}"
MAX_TOKENS="${MAX_TOKENS:-80000}"
DRY_RUN=false
TOKENS_USED=0

while [[ $# -gt 0 ]]; do
  case "$1" in
    --max-tokens) MAX_TOKENS="$2"; shift 2 ;;
    --dry-run) DRY_RUN=true; shift ;;
    *) shift ;;
  esac
done

log() { echo "[knowledge-mine $(date '+%H:%M:%S')] $*"; }

mkdir -p "$KNOWLEDGE_DIR"

# Find sessions from the last 24 hours. mapfile keeps paths with spaces
# intact and gives a clean element count — the old
# `$(echo "$RECENT" | grep -c '.' || echo 0)` produced the two-line string
# "0\n0" when no sessions were found (grep prints 0 AND exits 1).
mapfile -t RECENT < <(find "$SESSIONS_DIR" -name 'session_*.json' -type f -mtime -1 2>/dev/null | sort | head -30)
SESSION_COUNT=${#RECENT[@]}

log "Mining $SESSION_COUNT recent sessions"

EXTRACTED=0
# ${arr[@]+...} guards the empty-array case under `set -u` on older bash.
for session_file in ${RECENT[@]+"${RECENT[@]}"}; do
  if (( TOKENS_USED >= MAX_TOKENS )); then
    log "Token budget exhausted. Stopping."
    break
  fi

  if $DRY_RUN; then
    log "DRY-RUN: Would mine $(basename "$session_file")"
    continue
  fi

  # Extract facts using Python. Paths are passed as argv instead of being
  # interpolated into the program text, so quotes/backslashes in a filename
  # cannot break (or inject into) the code. Python exits non-zero when a
  # session yields no facts, so EXTRACTED really counts "files with
  # extractable knowledge" as the final log line claims.
  if python3 - "$session_file" "$KNOWLEDGE_DIR" <<'PY' 2>/dev/null
import json, os, sys

session_path, knowledge_dir = sys.argv[1], sys.argv[2]
with open(session_path) as f:
    data = json.load(f)

KEYWORDS = ('saved', 'created', 'deployed', 'fixed', 'merged', 'configured')
facts = []
for msg in data.get('messages', []):
    if msg.get('role') != 'assistant':
        continue
    content = msg.get('content', '')
    # Look for memory saves, skill creates, config changes
    if any(kw in content.lower() for kw in KEYWORDS):
        facts.append({
            'session': os.path.basename(session_path),
            'summary': content[:300],
            'type': 'action',
        })

if facts:
    outpath = os.path.join(
        knowledge_dir,
        os.path.basename(session_path).replace('.json', '.facts.json'))
    with open(outpath, 'w') as f:
        json.dump(facts, f, indent=2)
    print(len(facts))
raise SystemExit(0 if facts else 1)
PY
  then
    EXTRACTED=$((EXTRACTED + 1))
  fi

  TOKENS_USED=$((TOKENS_USED + 2000))  # flat per-session cost estimate
done

log "Done: $SESSION_COUNT sessions scanned, $EXTRACTED files with extractable knowledge, $TOKENS_USED tokens used."
exit 0
80
scripts/pipeline_playground_factory.sh
Normal file
80
scripts/pipeline_playground_factory.sh
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
#!/usr/bin/env bash
# pipeline_playground_factory.sh — Generate training data pairs from sessions.
#
# Exports session transcripts into terse→rich prompt pairs for model training.
# Reads from ~/.hermes/sessions/, writes to timmy-config/training-data/.
#
# Usage:
#   ./scripts/pipeline_playground_factory.sh --max-tokens 100000
#   ./scripts/pipeline_playground_factory.sh --dry-run

set -euo pipefail

HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
TRAINING_DIR="${TRAINING_DIR:-$HOME/.timmy/training-data/playground}"
SESSIONS_DIR="${HERMES_HOME}/sessions"
MAX_TOKENS="${MAX_TOKENS:-100000}"
DRY_RUN=false
TOKENS_USED=0

# Parse args
while [[ $# -gt 0 ]]; do
  case "$1" in
    --max-tokens) MAX_TOKENS="$2"; shift 2 ;;
    --dry-run) DRY_RUN=true; shift ;;
    *) shift ;;
  esac
done

log() { echo "[playground-factory $(date '+%H:%M:%S')] $*"; }

mkdir -p "$TRAINING_DIR"

# Find sessions newer than the latest export. On the very first run there is
# no previous export; the original passed `-newer ""` to find, which always
# failed (silently, via 2>/dev/null) so nothing was ever exported. Only add
# the -newer predicate when a prior export actually exists.
LATEST_EXPORT=$(find "$TRAINING_DIR" -name '*.jsonl' -type f -print 2>/dev/null | sort | tail -1)
FIND_ARGS=("$SESSIONS_DIR" -name 'session_*.json' -type f)
if [[ -n "$LATEST_EXPORT" ]]; then
  FIND_ARGS+=(-newer "$LATEST_EXPORT")
fi

SESSION_COUNT=0
PAIR_COUNT=0

# mapfile keeps paths with spaces intact (the old unquoted $(find …) loop
# word-split them).
mapfile -t SESSION_FILES < <(find "${FIND_ARGS[@]}" 2>/dev/null | sort | head -50)

# ${arr[@]+...} guards the empty-array case under `set -u` on older bash.
for session_file in ${SESSION_FILES[@]+"${SESSION_FILES[@]}"}; do
  if (( TOKENS_USED >= MAX_TOKENS )); then
    log "Token budget exhausted ($TOKENS_USED >= $MAX_TOKENS). Stopping."
    break
  fi

  SESSION_COUNT=$((SESSION_COUNT + 1))

  if $DRY_RUN; then
    log "DRY-RUN: Would process $(basename "$session_file")"
    continue
  fi

  # Extract pairs from session (Python does the heavy lifting). The path is
  # passed as argv rather than interpolated into the program text, so quotes
  # or backslashes in the filename cannot break (or inject into) the code.
  PAIRS=$(python3 - "$session_file" 2>/dev/null <<'PY' || echo ""
import json, sys

session_path = sys.argv[1]
with open(session_path) as f:
    data = json.load(f)

messages = data.get('messages', [])
for i in range(len(messages) - 1):
    # A user turn immediately followed by an assistant turn forms one pair.
    if messages[i].get('role') == 'user' and messages[i + 1].get('role') == 'assistant':
        terse = messages[i].get('content', '')[:200]
        rich = messages[i + 1].get('content', '')[:2000]
        # Skip degenerate pairs (trivially short prompt or reply).
        if len(terse) > 10 and len(rich) > 50:
            print(json.dumps({'terse': terse, 'rich': rich, 'source': session_path}))
PY
)

  if [[ -n "$PAIRS" ]]; then
    OUTFILE="$TRAINING_DIR/$(basename "$session_file" .json).jsonl"
    printf '%s\n' "$PAIRS" > "$OUTFILE"
    COUNT=$(printf '%s\n' "$PAIRS" | wc -l)
    PAIR_COUNT=$((PAIR_COUNT + COUNT))
    TOKENS_USED=$((TOKENS_USED + COUNT * 50))  # rough 50-token cost per pair
  fi
done

log "Done: $SESSION_COUNT sessions processed, $PAIR_COUNT pairs generated, $TOKENS_USED tokens used."
exit 0
71
scripts/pipeline_training_factory.sh
Normal file
71
scripts/pipeline_training_factory.sh
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
#!/usr/bin/env bash
# pipeline_training_factory.sh — Run model fine-tuning with generated training data.
#
# Collects training data, runs LoRA/QLoRA fine-tuning on available hardware.
#
# Usage:
#   ./scripts/pipeline_training_factory.sh --max-tokens 150000

set -euo pipefail

HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
TRAINING_DIR="${TRAINING_DIR:-$HOME/.timmy/training-data}"
MODELS_DIR="${MODELS_DIR:-$HOME/.timmy/models}"
MAX_TOKENS="${MAX_TOKENS:-150000}"
DRY_RUN=false
TOKENS_USED=0

while [[ $# -gt 0 ]]; do
  case "$1" in
    --max-tokens) MAX_TOKENS="$2"; shift 2 ;;
    --dry-run) DRY_RUN=true; shift ;;
    *) shift ;;
  esac
done

log() { echo "[training-factory $(date '+%H:%M:%S')] $*"; }

mkdir -p "$MODELS_DIR"

# Collect all JSONL training data into an array so paths with spaces survive
# intact (the old unquoted $DATA_FILES expansion word-split them, and
# `wc -w` miscounted such files).
mapfile -t DATA_FILES < <(find "$TRAINING_DIR" -name '*.jsonl' -type f 2>/dev/null | head -20)

TOTAL_PAIRS=0
# ${arr[@]+...} guards the empty-array case under `set -u` on older bash.
for f in ${DATA_FILES[@]+"${DATA_FILES[@]}"}; do
  COUNT=$(wc -l < "$f" 2>/dev/null || echo 0)
  TOTAL_PAIRS=$((TOTAL_PAIRS + COUNT))
done

log "Found $TOTAL_PAIRS training pairs across ${#DATA_FILES[@]} files"

if [[ "$TOTAL_PAIRS" -lt 10 ]]; then
  log "Insufficient training data ($TOTAL_PAIRS pairs < 10 minimum). Skipping."
  exit 0
fi

if $DRY_RUN; then
  log "DRY-RUN: Would train on $TOTAL_PAIRS pairs (budget: $MAX_TOKENS)"
  exit 0
fi

# Check for GPU (informational only in this script; actual training is
# delegated to the autolora pipeline — see the final log lines).
GPU_AVAILABLE=false
if command -v nvidia-smi &>/dev/null && nvidia-smi &>/dev/null; then
  GPU_AVAILABLE=true
  log "GPU detected"
elif [[ "$(uname)" == "Darwin" ]] && system_profiler SPDisplaysDataType 2>/dev/null | grep -q "Metal"; then
  GPU_AVAILABLE=true
  log "Apple Metal GPU detected"
fi
# shellcheck disable=SC2034 — set for logging above; not consumed further here.
: "$GPU_AVAILABLE"

# Build merged training file. Ensure the target directory exists first — the
# original only created MODELS_DIR, so on a fresh host the redirect into
# $HERMES_HOME failed under `set -e` with stderr suppressed.
MERGED="$HERMES_HOME/training_merged.jsonl"
mkdir -p "$HERMES_HOME"
cat -- "${DATA_FILES[@]}" > "$MERGED"
TOKENS_USED=$((TOTAL_PAIRS * 60))  # rough 60-token estimate per pair

log "Merged training data: $MERGED ($TOTAL_PAIRS pairs, ~${TOKENS_USED} tokens)"

# Log completion (actual training would be triggered by autolora pipeline)
log "Training data ready. Tokens used: $TOKENS_USED / $MAX_TOKENS"
log "Run autolora pipeline for actual fine-tuning."
exit 0
Reference in New Issue
Block a user