# Source: timmy-config/training/ingest_trajectories.py
#!/usr/bin/env python3
"""
Ingest Nexus Trajectories into AutoLoRA Training Pipeline
Reads trajectory JSONL files produced by the Nexus consciousness loop
and merges them with the existing curated dataset for training.
Applies quality filtering:
- Skip trivial thoughts (< 20 chars)
- Skip "nothing happened" cycles
- Skip cycles where the model just echoed the perception
- Keep cycles with genuine thought, discovery, or action
Usage:
python ingest_nexus_trajectories.py \\
--trajectories ~/.nexus/trajectories/ \\
--curated ../data/curated_dataset.jsonl \\
--output ../data/merged_training_data.jsonl
"""
import argparse
import json
from pathlib import Path
from difflib import SequenceMatcher
def load_jsonl(path: Path) -> list[dict]:
    """Load newline-delimited JSON from *path*.

    Args:
        path: File containing one JSON object per line.

    Returns:
        List of parsed objects; blank lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    entries: list[dict] = []
    # Explicit encoding so parsing doesn't depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries
def is_quality_cycle(cycle: dict, min_thought_len: int = 30) -> bool:
    """Return True if *cycle* passes the quality filters for training.

    A cycle is kept only when its first model ("gpt") turn is long enough,
    contains no trivial "nothing happened" phrasing, and is not a near-echo
    of the first perception ("human") turn.

    Args:
        cycle: Trajectory entry with a "conversations" list of
            {"from": ..., "value": ...} turns.
        min_thought_len: Minimum character length for a thought to count.

    Returns:
        True if the cycle should be included in training data.
    """
    convos = cycle.get("conversations", [])
    # .get() so malformed turns (missing "from"/"value") are filtered out
    # instead of raising KeyError mid-ingestion.
    gpt_turns = [c for c in convos if c.get("from") == "gpt"]
    human_turns = [c for c in convos if c.get("from") == "human"]
    if not gpt_turns:
        return False
    thought = gpt_turns[0].get("value", "")
    # Too short to carry real content.
    if len(thought) < min_thought_len:
        return False
    # Trivial "idle" phrasings that indicate no genuine cognition.
    trivial = (
        "nothing has happened",
        "nothing to think about",
        "i have no new perceptions",
        "i rest",
        "i wait",
    )
    thought_lower = thought.lower()
    if any(t in thought_lower for t in trivial):
        return False
    # Echo check — a thought nearly identical to the perception means the
    # model parroted its input rather than thinking. Only the first 500
    # chars are compared to bound SequenceMatcher's quadratic cost.
    if human_turns:
        perception = human_turns[0].get("value", "")
        similarity = SequenceMatcher(
            None, perception[:500], thought[:500]
        ).ratio()
        if similarity > 0.7:
            return False
    return True
def merge_datasets(
    trajectory_dir: Path,
    curated_path: Path,
    output_path: Path,
    min_thought_len: int = 30,
) -> dict:
    """Merge curated exemplars with quality-filtered Nexus trajectories.

    Args:
        trajectory_dir: Directory scanned for trajectory_*.jsonl files.
        curated_path: Existing curated dataset; may be absent.
        output_path: Destination JSONL file; parent dirs are created.
        min_thought_len: Passed through to is_quality_cycle().

    Returns:
        Stats dict with counts for each stage of the merge.
    """
    stats = {
        "curated_count": 0,
        "trajectory_files": 0,
        "trajectory_raw": 0,
        "trajectory_quality": 0,
        "total_output": 0,
    }

    # Curated data is optional — a missing file just means an empty base set.
    curated: list[dict] = []
    if curated_path.exists():
        curated = load_jsonl(curated_path)
        stats["curated_count"] = len(curated)

    # Load and filter trajectories. sorted() keeps a stable file order
    # (presumably timestamped filenames — verify against the producer).
    quality_trajectories: list[dict] = []
    for traj_file in sorted(trajectory_dir.glob("trajectory_*.jsonl")):
        stats["trajectory_files"] += 1
        cycles = load_jsonl(traj_file)
        stats["trajectory_raw"] += len(cycles)
        quality_trajectories.extend(
            cycle for cycle in cycles if is_quality_cycle(cycle, min_thought_len)
        )
    stats["trajectory_quality"] = len(quality_trajectories)

    # Merge: curated first (gold standard), then quality trajectories.
    merged = curated + quality_trajectories
    stats["total_output"] = len(merged)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so output bytes don't depend on the platform locale.
    with open(output_path, "w", encoding="utf-8") as f:
        for entry in merged:
            f.write(json.dumps(entry) + "\n")
    return stats
def main():
    """CLI entry point: parse arguments, run the merge, print a summary."""
    data_dir = Path(__file__).parent.parent / "data"
    parser = argparse.ArgumentParser(
        description="Ingest Nexus trajectories into AutoLoRA pipeline"
    )
    parser.add_argument(
        "--trajectories",
        type=str,
        default=str(Path.home() / ".nexus" / "trajectories"),
        help="Path to Nexus trajectory directory",
    )
    parser.add_argument(
        "--curated",
        type=str,
        default=str(data_dir / "curated_dataset.jsonl"),
        help="Path to existing curated dataset",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=str(data_dir / "merged_training_data.jsonl"),
        help="Output merged dataset path",
    )
    parser.add_argument(
        "--min-thought-len",
        type=int,
        default=30,
        help="Minimum thought length to include (default: 30)",
    )
    args = parser.parse_args()

    stats = merge_datasets(
        trajectory_dir=Path(args.trajectories),
        curated_path=Path(args.curated),
        output_path=Path(args.output),
        min_thought_len=args.min_thought_len,
    )

    # Assemble the report once and emit it in a single write.
    summary = [
        "Nexus Trajectory Ingestion Complete",
        "=" * 40,
        f" Curated exemplars: {stats['curated_count']}",
        f" Trajectory files: {stats['trajectory_files']}",
        f" Raw cycles: {stats['trajectory_raw']}",
        f" Quality cycles: {stats['trajectory_quality']}",
        f" Total training data: {stats['total_output']}",
        f"\nOutput: {args.output}",
    ]
    print("\n".join(summary))


if __name__ == "__main__":
    main()