Per direction shift (the-nexus#542). Replaces the autolora repo (1,500 lines of custom pipeline code) with config files for existing tools: - axolotl.yaml: replaces train_modal.py (239 lines) - mlx-lora.yaml: replaces MLX training scripts - eval-tasks.yaml: replaces run_eval.py (300 lines) - Makefile: replaces run_vibes.py, compare.py, convert_to_mlx.py Data migrated as-is: - curated_dataset.jsonl (26 gold-standard conversations) - preference_pairs.jsonl (DPO pairs) - prompts_vibes.yaml, prompts_nexus_vibes.yaml - v0-baseline eval results (historical record) Thin glue kept: - build_curated.py (data authoring, not infrastructure) - ingest_trajectories.py (domain-specific quality filter) Dependencies: pip install axolotl mlx-lm lm-evaluation-harness
174 lines · 4.9 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Ingest Nexus Trajectories into AutoLoRA Training Pipeline
|
|
|
|
Reads trajectory JSONL files produced by the Nexus consciousness loop
|
|
and merges them with the existing curated dataset for training.
|
|
|
|
Applies quality filtering:
|
|
- Skip trivial thoughts (< 30 chars by default; see --min-thought-len)
|
|
- Skip "nothing happened" cycles
|
|
- Skip cycles where the model just echoed the perception
|
|
- Keep cycles with genuine thought, discovery, or action
|
|
|
|
Usage:
|
|
python ingest_nexus_trajectories.py \\
|
|
--trajectories ~/.nexus/trajectories/ \\
|
|
--curated ../data/curated_dataset.jsonl \\
|
|
--output ../data/merged_training_data.jsonl
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
from pathlib import Path
|
|
from difflib import SequenceMatcher
|
|
|
|
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSONL file, skipping blank lines.

    Args:
        path: Path to a .jsonl file (one JSON object per line).

    Returns:
        Parsed entries as dicts, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    entries: list[dict] = []
    # Explicit UTF-8: JSON is UTF-8 by spec, and relying on the platform
    # default encoding breaks on Windows (cp1252).
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries
|
|
|
|
|
|
def is_quality_cycle(cycle: dict, min_thought_len: int = 30) -> bool:
    """Return True if a trajectory cycle is worth keeping for training.

    A cycle is rejected when any of these hold:
      - it has no model ("gpt") turn,
      - the first model thought is shorter than min_thought_len chars,
      - the thought contains a known trivial / "nothing happened" phrase,
      - the thought is >70% similar to the perception (the model just
        echoed its input instead of thinking).

    Args:
        cycle: One trajectory entry with a "conversations" list of
            {"from": ..., "value": ...} turns.
        min_thought_len: Minimum thought length (chars) to keep.

    Returns:
        True if the cycle passes all quality filters.
    """
    convos = cycle.get("conversations", [])
    # .get() instead of ["from"]: one malformed entry in a trajectory file
    # must not crash the whole ingestion run with a KeyError.
    gpt_turns = [c for c in convos if c.get("from") == "gpt"]
    human_turns = [c for c in convos if c.get("from") == "human"]

    if not gpt_turns:
        return False

    thought = gpt_turns[0].get("value", "")

    # Too short to contain a genuine thought
    if len(thought) < min_thought_len:
        return False

    # Canned "idle" phrasings carry no training signal
    trivial = [
        "nothing has happened",
        "nothing to think about",
        "i have no new perceptions",
        "i rest",
        "i wait",
    ]
    thought_lower = thought.lower()
    if any(t in thought_lower for t in trivial):
        return False

    # Echo check — if the thought is too similar to the perception,
    # the model is just parroting, not thinking. Only the first 500
    # chars are compared to bound SequenceMatcher's quadratic cost.
    if human_turns:
        perception = human_turns[0].get("value", "")
        similarity = SequenceMatcher(
            None, perception[:500], thought[:500]
        ).ratio()
        if similarity > 0.7:
            return False

    return True
|
|
|
|
|
|
def merge_datasets(
    trajectory_dir: Path,
    curated_path: Path,
    output_path: Path,
    min_thought_len: int = 30,
) -> dict:
    """Merge nexus trajectories with the curated dataset.

    Reads every trajectory_*.jsonl file in trajectory_dir, keeps only
    cycles that pass is_quality_cycle(), appends them after the curated
    exemplars, and writes the merged set as JSONL to output_path.

    Args:
        trajectory_dir: Directory containing trajectory_*.jsonl files.
        curated_path: Existing curated dataset; silently skipped if absent.
        output_path: Destination JSONL path (parent dirs created as needed).
        min_thought_len: Passed through to is_quality_cycle().

    Returns:
        Counter dict: curated_count, trajectory_files, trajectory_raw,
        trajectory_quality, total_output.
    """
    stats = {
        "curated_count": 0,
        "trajectory_files": 0,
        "trajectory_raw": 0,
        "trajectory_quality": 0,
        "total_output": 0,
    }

    # Load curated exemplars; a missing curated file is not fatal.
    curated = []
    if curated_path.exists():
        curated = load_jsonl(curated_path)
        stats["curated_count"] = len(curated)

    # Load and quality-filter trajectories. sorted() keeps the merge
    # order deterministic regardless of filesystem listing order.
    quality_trajectories = []
    for traj_file in sorted(trajectory_dir.glob("trajectory_*.jsonl")):
        stats["trajectory_files"] += 1
        cycles = load_jsonl(traj_file)
        stats["trajectory_raw"] += len(cycles)

        for cycle in cycles:
            if is_quality_cycle(cycle, min_thought_len):
                quality_trajectories.append(cycle)

    stats["trajectory_quality"] = len(quality_trajectories)

    # Merge: curated first (gold standard), then quality trajectories
    merged = curated + quality_trajectories
    stats["total_output"] = len(merged)

    # Write as JSONL. Explicit UTF-8 so the output does not depend on the
    # platform default encoding (e.g. cp1252 on Windows).
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for entry in merged:
            f.write(json.dumps(entry) + "\n")

    return stats
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the merge, print a summary."""
    data_dir = Path(__file__).parent.parent / "data"

    parser = argparse.ArgumentParser(
        description="Ingest Nexus trajectories into AutoLoRA pipeline"
    )
    parser.add_argument(
        "--trajectories",
        type=str,
        default=str(Path.home() / ".nexus" / "trajectories"),
        help="Path to Nexus trajectory directory",
    )
    parser.add_argument(
        "--curated",
        type=str,
        default=str(data_dir / "curated_dataset.jsonl"),
        help="Path to existing curated dataset",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=str(data_dir / "merged_training_data.jsonl"),
        help="Output merged dataset path",
    )
    parser.add_argument(
        "--min-thought-len",
        type=int,
        default=30,
        help="Minimum thought length to include (default: 30)",
    )
    opts = parser.parse_args()

    summary = merge_datasets(
        trajectory_dir=Path(opts.trajectories),
        curated_path=Path(opts.curated),
        output_path=Path(opts.output),
        min_thought_len=opts.min_thought_len,
    )

    # Assemble the report once and emit it in a single write.
    report = [
        "Nexus Trajectory Ingestion Complete",
        "=" * 40,
        f" Curated exemplars: {summary['curated_count']}",
        f" Trajectory files: {summary['trajectory_files']}",
        f" Raw cycles: {summary['trajectory_raw']}",
        f" Quality cycles: {summary['trajectory_quality']}",
        f" Total training data: {summary['total_output']}",
        f"\nOutput: {opts.output}",
    ]
    print("\n".join(report))


if __name__ == "__main__":
    main()
|