#!/usr/bin/env python3
"""
Session Transcript → Training Pair Harvester

Scans Hermes session JSONL files for Q&A patterns and extracts terse→rich
training pairs. Outputs JSONL matching the timmy-config training pairs spec.

Usage:
    python3 scripts/session_pair_harvester.py ~/.hermes/sessions/
    python3 scripts/session_pair_harvester.py session.jsonl --output pairs.jsonl
    python3 scripts/session_pair_harvester.py --dir ~/.hermes/sessions/ --min-ratio 2.0

Output format:
    {"terse": "user short prompt", "rich": "ai detailed response", "source": "session_id", "model": "..."}
"""

import argparse
import hashlib
import json
import sys
from pathlib import Path

from session_reader import extract_conversation, read_session


def compute_hash(text: str) -> str:
    """Content hash for deduplication."""
    return hashlib.sha256(text.encode()).hexdigest()[:16]


def extract_pairs_from_conversation(conversation: list, session_id: str, model: str,
                                    min_ratio: float = 1.5,
                                    min_response_words: int = 20) -> list:
    """Extract terse→rich pairs from a normalized conversation."""
    pairs = []
    seen_hashes = set()

    for i, msg in enumerate(conversation):
        # Look for assistant responses
        if msg.get('role') != 'assistant':
            continue

        response_text = msg.get('content', '')
        if not response_text or len(response_text.split()) < min_response_words:
            continue

        # Find the preceding user message
        prompt_text = ""
        for j in range(i - 1, -1, -1):
            if conversation[j].get('role') == 'user':
                prompt_text = conversation[j].get('content', '')
                break

        if not prompt_text:
            continue

        # Filter: skip tool results and system prompts embedded as user messages
        if prompt_text.startswith('{') and 'output' in prompt_text[:100]:
            continue
        if prompt_text.startswith('# SOUL.md') or prompt_text.startswith('You are'):
            continue

        # Quality filters
        prompt_words = len(prompt_text.split())
        response_words = len(response_text.split())
        if prompt_words == 0 or response_words == 0:
            continue

        ratio = response_words / prompt_words
        if ratio < min_ratio:
            continue

        # Skip responses that are almost entirely code fences with no prose
        code_blocks = response_text.count('```')
        if code_blocks >= 4 and len(response_text.replace('```', '').strip()) < 50:
            continue

        # Skip responses that open with raw tool/function call payloads
        if 'tool_call' in response_text[:100] or 'function_call' in response_text[:100]:
            continue

        # Deduplicate within the conversation
        content_hash = compute_hash(prompt_text + response_text[:200])
        if content_hash in seen_hashes:
            continue
        seen_hashes.add(content_hash)

        pairs.append({
            'terse': prompt_text.strip(),
            'rich': response_text.strip(),
            'source': session_id,
            'model': model,
            'prompt_words': prompt_words,
            'response_words': response_words,
            'ratio': round(ratio, 2),
        })

    return pairs


def extract_from_jsonl_file(path: str, **kwargs) -> list:
    """Read a session file and extract training pairs using the normalized conversation."""
    session_messages = read_session(path)
    if not session_messages:
        return []

    conversation = extract_conversation(session_messages)

    # Derive session_id and model from the first real message's metadata
    first_msg = next((m for m in session_messages if m.get('role') or m.get('from')), {})
    session_id = first_msg.get('meta_session_id', Path(path).name)
    model = first_msg.get('model', 'unknown')

    return extract_pairs_from_conversation(conversation, session_id, model, **kwargs)


def deduplicate_pairs(pairs: list) -> list:
    """Remove duplicate pairs across files."""
    seen = set()
    unique = []
    for pair in pairs:
        key = compute_hash(pair["terse"] + pair["rich"][:200])
        if key not in seen:
            seen.add(key)
            unique.append(pair)
    return unique
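# Illustrative sketch (assumes the default thresholds): a minimal conversation
# that survives every filter above yields exactly one pair. The message shapes
# below are hypothetical stand-ins for whatever extract_conversation() emits.
#
#   conversation = [
#       {"role": "user", "content": "how does the dedup work?"},
#       {"role": "assistant", "content": "<a 30+ word explanation ...>"},
#   ]
#   extract_pairs_from_conversation(conversation, "sess-abc", "some-model")
#   # -> [{"terse": "how does the dedup work?", "rich": "<...>", ...}]
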
def main():
    parser = argparse.ArgumentParser(description="Harvest training pairs from session transcripts")
    parser.add_argument("input", nargs="?", help="Session JSONL file or directory")
    parser.add_argument("--dir", "-d", help="Directory to scan for session files")
    parser.add_argument("--output", "-o", default="harvested_pairs.jsonl", help="Output file")
    parser.add_argument("--min-ratio", type=float, default=1.5, help="Min response/prompt word ratio")
    parser.add_argument("--min-words", type=int, default=20, help="Min response word count")
    parser.add_argument("--dry-run", action="store_true", help="Print stats without writing")
    args = parser.parse_args()

    all_pairs = []
    files_scanned = 0

    scan_dir = args.dir or args.input
    if not scan_dir:
        parser.print_help()
        sys.exit(1)

    scan_path = Path(scan_dir)
    if scan_path.is_dir():
        jsonl_files = sorted(scan_path.rglob("*.jsonl"))
        print(f"Scanning {len(jsonl_files)} files in {scan_dir}...", file=sys.stderr)
        for fpath in jsonl_files:
            pairs = extract_from_jsonl_file(
                str(fpath), min_ratio=args.min_ratio, min_response_words=args.min_words
            )
            all_pairs.extend(pairs)
            files_scanned += 1
    else:
        pairs = extract_from_jsonl_file(
            str(scan_path), min_ratio=args.min_ratio, min_response_words=args.min_words
        )
        all_pairs.extend(pairs)
        files_scanned = 1

    # Deduplicate
    unique_pairs = deduplicate_pairs(all_pairs)

    # Stats
    if unique_pairs:
        avg_prompt = sum(p["prompt_words"] for p in unique_pairs) / len(unique_pairs)
        avg_response = sum(p["response_words"] for p in unique_pairs) / len(unique_pairs)
        avg_ratio = sum(p["ratio"] for p in unique_pairs) / len(unique_pairs)
    else:
        avg_prompt = avg_response = avg_ratio = 0

    stats = {
        "files_scanned": files_scanned,
        "raw_pairs": len(all_pairs),
        "unique_pairs": len(unique_pairs),
        "duplicates_removed": len(all_pairs) - len(unique_pairs),
        "avg_prompt_words": round(avg_prompt, 1),
        "avg_response_words": round(avg_response, 1),
        "avg_ratio": round(avg_ratio, 2),
    }
    print(json.dumps(stats, indent=2), file=sys.stderr)

    if args.dry_run:
        # Print sample pairs
        for pair in unique_pairs[:3]:
            print(f"\n--- Source: {pair['source']} (ratio: {pair['ratio']}) ---", file=sys.stderr)
            print(f"TERSE: {pair['terse'][:100]}...", file=sys.stderr)
            print(f"RICH: {pair['rich'][:150]}...", file=sys.stderr)
        return

    # Write output
    output_path = Path(args.output)
    with open(output_path, "w") as f:
        for pair in unique_pairs:
            # Strip internal fields for output
            output = {
                "terse": pair["terse"],
                "rich": pair["rich"],
                "source": pair["source"],
                "model": pair["model"],
            }
            f.write(json.dumps(output) + "\n")

    print(f"\nWrote {len(unique_pairs)} pairs to {output_path}", file=sys.stderr)


if __name__ == "__main__":
    main()
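
# Reading the harvested pairs back downstream is one json.loads() per line
# (a minimal sketch, assuming the default --output path):
#
#   with open("harvested_pairs.jsonl") as f:
#       pairs = [json.loads(line) for line in f]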