#!/usr/bin/env python3
"""Export Timmy session logs as LoRA training data (ChatML JSONL).

Reads session JSONL files written by ``SessionLogger`` and converts them into
conversation pairs suitable for fine-tuning with ``mlx_lm.lora``.

Output format — one JSON object per line::

    {"messages": [
        {"role": "system", "content": "<system prompt>"},
        {"role": "user", "content": "<user message>"},
        {"role": "assistant", "content": "<assistant response>"}
    ]}

Tool calls that appear between a user turn and the next assistant message are
embedded in the assistant content using the Hermes 4 ``<tool_call>`` XML
format so the fine-tuned model learns both when to call tools and what JSON
to emit.

Usage::

    # Export all session logs (default paths)
    python scripts/export_trajectories.py

    # Custom source / destination
    python scripts/export_trajectories.py \\
        --logs-dir ~/custom-logs \\
        --output ~/timmy-training-data.jsonl \\
        --min-turns 2 \\
        --verbose

Epic: #1091 Project Bannerlord — AutoLoRA Sovereignty Loop (Step 3 of 7)
Refs: #1103
"""

from __future__ import annotations

import argparse
import json
import logging
import sys
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)

# ── Constants ─────────────────────────────────────────────────────────────────

TIMMY_SYSTEM_PROMPT = (
    "You are Timmy, Alexander's personal AI agent running on a local Mac. "
    "You are concise, direct, and action-oriented. "
    "You have access to a broad set of tools — use them proactively. "
    "When you need to call a tool, output it in this format:\n"
    "<tool_call>\n"
    '{"name": "function_name", "arguments": {"param": "value"}}\n'
    "</tool_call>\n\n"
    "Always provide structured, accurate responses."
)


# ── Entry grouping ─────────────────────────────────────────────────────────────


def _load_entries(logs_dir: Path) -> list[dict[str, Any]]:
    """Load every entry from the ``session_*.jsonl`` files under *logs_dir*.

    Files are visited in sorted filename order and each file's entries keep
    their on-disk order, so the result is chronological only to the extent
    that filenames sort chronologically.

    Blank lines, malformed JSON lines, and lines whose JSON value is not an
    object are skipped with a warning. A file that cannot be read or decoded
    is skipped with a warning; entries already read from it are kept.

    Args:
        logs_dir: Directory to scan (non-recursively) for session logs.

    Returns:
        Flat list of the decoded JSON objects.
    """
    entries: list[dict[str, Any]] = []
    for log_file in sorted(logs_dir.glob("session_*.jsonl")):
        try:
            # Explicit UTF-8 so decoding does not depend on the platform locale.
            with open(log_file, encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    try:
                        entry = json.loads(line)
                    except json.JSONDecodeError:
                        logger.warning("Skipping malformed line in %s", log_file.name)
                        continue
                    # Valid JSON that is not an object (list, number, ...) has
                    # no ``.get`` and would crash the grouping step later.
                    if not isinstance(entry, dict):
                        logger.warning(
                            "Skipping non-object entry in %s", log_file.name
                        )
                        continue
                    entries.append(entry)
        except (OSError, UnicodeError) as exc:
            logger.warning("Cannot read %s: %s", log_file, exc)
    return entries


def _as_text(value: Any) -> str:
    """Return *value* stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def _format_tool_call(entry: dict[str, Any]) -> str:
    """Render a tool_call entry as a Hermes 4 ``<tool_call>`` XML block.

    Missing ``tool`` / ``args`` keys fall back to ``"unknown"`` / ``{}``.
    """
    payload = {"name": entry.get("tool", "unknown"), "arguments": entry.get("args", {})}
    return f"<tool_call>\n{json.dumps(payload)}\n</tool_call>"


def _format_tool_result(entry: dict[str, Any]) -> str:
    """Render a tool result observation as a ``<tool_response>`` XML block.

    The whole payload goes through ``json.dumps`` so that a tool name
    containing quotes or backslashes still yields valid JSON.
    """
    payload = {"name": entry.get("tool", "unknown"), "result": entry.get("result", "")}
    return f"<tool_response>\n{json.dumps(payload)}\n</tool_response>"


def _group_into_turns(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Group raw session entries into user/assistant turn pairs.

    A turn opens at each ``message`` entry with role ``user`` and collects
    every following ``timmy`` message and ``tool_call`` entry until the next
    user message. Entries seen before the first user message are ignored,
    and a user message that receives no assistant content is discarded.

    Args:
        entries: Session log entries, in conversation order.

    Returns:
        A list of dicts with keys:
            ``user``      - user message content
            ``assistant`` - assembled assistant content (responses + tool calls)
    """
    turns: list[dict[str, Any]] = []
    pending_user: str | None = None
    assistant_parts: list[str] = []

    def flush() -> None:
        # Only emit a turn that has both a user message and assistant content.
        if pending_user is not None and assistant_parts:
            turns.append(
                {
                    "user": pending_user,
                    "assistant": "\n".join(assistant_parts).strip(),
                }
            )

    for entry in entries:
        etype = entry.get("type", "")
        role = entry.get("role", "")

        if etype == "message" and role == "user":
            flush()
            pending_user = _as_text(entry.get("content"))
            assistant_parts = []
        elif pending_user is None:
            # Nothing to attach assistant output to yet.
            continue
        elif etype == "message" and role == "timmy":
            content = _as_text(entry.get("content"))
            if content:
                assistant_parts.append(content)
        elif etype == "tool_call":
            assistant_parts.append(_format_tool_call(entry))
            # Also append the tool result as context so the model learns the
            # full loop. Falsy results (empty string, 0, None) are omitted.
            if entry.get("result"):
                assistant_parts.append(_format_tool_result(entry))
        # decision / error entries are skipped — they are meta-data, not conversation

    flush()
    return turns


# ── Conversion ────────────────────────────────────────────────────────────────


def turns_to_training_examples(
    turns: list[dict[str, Any]],
    system_prompt: str = TIMMY_SYSTEM_PROMPT,
    min_assistant_len: int = 10,
) -> list[dict[str, Any]]:
    """Convert grouped turns into mlx-lm training examples.

    Each example has a ``messages`` list in ChatML order:
    ``[system, user, assistant]``.

    Args:
        turns: Output of ``_group_into_turns``.
        system_prompt: System prompt prepended to every example.
        min_assistant_len: Skip examples where the assistant turn is shorter
            than this many characters (filters out empty/trivial turns).

    Returns:
        List of training example dicts. Turns with an empty user message are
        skipped as well.
    """
    examples: list[dict[str, Any]] = []
    for turn in turns:
        user_text = _as_text(turn.get("user"))
        assistant_text = _as_text(turn.get("assistant"))
        if not user_text or len(assistant_text) < min_assistant_len:
            continue
        examples.append(
            {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text},
                    {"role": "assistant", "content": assistant_text},
                ]
            }
        )
    return examples


def export_training_data(
    logs_dir: Path,
    output_path: Path,
    min_turns: int = 1,
    min_assistant_len: int = 10,
    verbose: bool = False,
) -> int:
    """Full export pipeline: load → group → convert → write.

    Args:
        logs_dir: Directory containing ``session_*.jsonl`` files.
        output_path: Destination ``.jsonl`` file for training data. Parent
            directories are created as needed; an existing file is overwritten.
        min_turns: Expected minimum number of conversation turns. Informational
            only: a warning is logged when fewer are found, but the export
            still proceeds.
        min_assistant_len: Minimum assistant response length to include.
        verbose: Print progress to stdout.

    Returns:
        Number of training examples written. When this is 0 a warning is
        printed and no output file is created.
    """
    if verbose:
        print(f"Loading session logs from: {logs_dir}")

    entries = _load_entries(logs_dir)
    if verbose:
        print(f"  Loaded {len(entries)} raw entries")

    turns = _group_into_turns(entries)
    if verbose:
        print(f"  Grouped into {len(turns)} conversation turns")
    if len(turns) < min_turns:
        logger.warning(
            "Only %d conversation turn(s) found; expected at least %d",
            len(turns),
            min_turns,
        )

    examples = turns_to_training_examples(turns, min_assistant_len=min_assistant_len)
    if verbose:
        print(f"  Generated {len(examples)} training examples")

    if not examples:
        print("WARNING: No training examples generated. Check that session logs exist.")
        return 0

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex) + "\n" for ex in examples)

    if verbose:
        print(f"  Wrote {len(examples)} examples → {output_path}")

    return len(examples)


# ── CLI ───────────────────────────────────────────────────────────────────────


def _default_logs_dir() -> Path:
    """Return the default logs directory (repo root / logs).

    The repo root is the nearest directory containing ``pyproject.toml``,
    searching this script's own directory and up to five levels above it.
    Falls back to ``~/logs`` when none is found.
    """
    script_dir = Path(__file__).resolve().parent
    for candidate in (script_dir, *list(script_dir.parents)[:5]):
        if (candidate / "pyproject.toml").exists():
            return candidate / "logs"
    return Path.home() / "logs"


def _default_output_path() -> Path:
    """Return the default output file, ``~/timmy-training-data.jsonl``."""
    return Path.home() / "timmy-training-data.jsonl"


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 when at least one example was exported, 1 when
        the logs directory is missing or nothing was exported.
    """
    parser = argparse.ArgumentParser(
        description="Export Timmy session logs as LoRA training data (ChatML JSONL)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--logs-dir",
        type=Path,
        default=_default_logs_dir(),
        help="Directory containing session_*.jsonl files (default: <repo root>/logs)",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=_default_output_path(),
        help="Output JSONL path (default: ~/timmy-training-data.jsonl)",
    )
    parser.add_argument(
        "--min-turns",
        type=int,
        default=1,
        help="Minimum turns to process (informational, default: 1)",
    )
    parser.add_argument(
        "--min-assistant-len",
        type=int,
        default=10,
        help="Minimum assistant response length in chars (default: 10)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print progress information",
    )

    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.WARNING,
        format="%(levelname)s: %(message)s",
    )

    if not args.logs_dir.exists():
        print(f"ERROR: Logs directory not found: {args.logs_dir}")
        print("Run the Timmy dashboard first to generate session logs.")
        return 1

    count = export_training_data(
        logs_dir=args.logs_dir,
        output_path=args.output,
        min_turns=args.min_turns,
        min_assistant_len=args.min_assistant_len,
        verbose=args.verbose,
    )

    if count > 0:
        print(f"Exported {count} training examples to: {args.output}")
        print()
        print("Next steps:")
        print("  mkdir -p ~/timmy-lora-training")
        print(f"  cp {args.output} ~/timmy-lora-training/train.jsonl")
        print("  python scripts/lora_finetune.py --data ~/timmy-lora-training")
    else:
        print("No training examples exported.")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())