#!/usr/bin/env python3
"""
Ingest Nexus Trajectories into AutoLoRA Training Pipeline

Reads trajectory JSONL files produced by the Nexus consciousness loop
and merges them with the existing curated dataset for training.

Applies quality filtering:
- Skip trivial thoughts (shorter than the configurable minimum, default 30 chars)
- Skip "nothing happened" cycles
- Skip cycles where the model just echoed the perception
- Keep cycles with genuine thought, discovery, or action

Adds provenance metadata to every ingested pair.

Usage:
    python ingest_trajectories.py \
        --trajectories ~/.nexus/trajectories/ \
        --curated ../data/curated_dataset.jsonl \
        --output ../data/merged_training_data.jsonl
"""
|
|
|
|
import argparse
|
|
import json
|
|
from pathlib import Path
|
|
from difflib import SequenceMatcher
|
|
|
|
from training_pair_provenance import ProvenanceTracker
|
|
|
|
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSONL file into a list of dicts.

    Blank lines are skipped. Propagates json.JSONDecodeError on a
    malformed line.
    """
    entries: list[dict] = []
    # JSON text is defined as UTF-8 (RFC 8259); don't rely on the locale
    # default encoding, which differs on Windows.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries
|
|
|
|
|
|
def is_quality_cycle(cycle: dict, min_thought_len: int = 30) -> bool:
    """Return True if a trajectory cycle is worth training on.

    Rejects cycles that have no gpt turn, whose thought is shorter than
    *min_thought_len*, that contain a known trivial/idle phrase, or that
    merely echo the perception (> 70% similar to the human turn).
    """
    convos = cycle.get("conversations", [])
    # Use .get() so a malformed turn without a "from" (or "value") key is
    # filtered out instead of crashing the whole ingestion run.
    gpt_turns = [c for c in convos if c.get("from") == "gpt"]
    human_turns = [c for c in convos if c.get("from") == "human"]

    if not gpt_turns:
        return False

    thought = gpt_turns[0].get("value", "")

    # Too short to carry real content.
    if len(thought) < min_thought_len:
        return False

    # Canned idle phrases indicate a "nothing happened" cycle.
    trivial = [
        "nothing has happened",
        "nothing to think about",
        "i have no new perceptions",
        "i rest",
        "i wait",
    ]
    thought_lower = thought.lower()
    if any(t in thought_lower for t in trivial):
        return False

    # Echo check — if the thought is too similar to the perception,
    # the model is just parroting, not thinking. Compare only the first
    # 500 chars: SequenceMatcher is quadratic in the worst case.
    if human_turns:
        perception = human_turns[0].get("value", "")
        similarity = SequenceMatcher(
            None, perception[:500], thought[:500]
        ).ratio()
        if similarity > 0.7:
            return False

    return True
|
|
|
|
|
|
def _extract_session_id(path: Path) -> str:
|
|
"""Extract session ID from trajectory filename."""
|
|
stem = path.stem
|
|
# trajectory_20260315_123456_abc123.jsonl -> 20260315_123456_abc123
|
|
if stem.startswith("trajectory_"):
|
|
return stem[len("trajectory_"):]
|
|
return stem
|
|
|
|
|
|
def _extract_model(cycle: dict) -> str:
|
|
"""Extract model name from cycle metadata."""
|
|
return cycle.get("model", "unknown")
|
|
|
|
|
|
def merge_datasets(
    trajectory_dir: Path,
    curated_path: Path,
    output_path: Path,
    min_thought_len: int = 30,
) -> dict:
    """Merge nexus trajectories with curated dataset. Annotates provenance.

    Curated pairs come first (gold standard), followed by quality-filtered
    trajectory cycles. Writes the merged JSONL to *output_path* and returns
    a stats dict of counts and provenance-validation results.
    """
    tracker = ProvenanceTracker()
    stats = dict.fromkeys(
        [
            "curated_count",
            "trajectory_files",
            "trajectory_raw",
            "trajectory_quality",
            "total_output",
            "provenance_valid",
            "provenance_invalid",
        ],
        0,
    )

    # Curated exemplars are annotated as the gold-standard source.
    curated: list[dict] = []
    if curated_path.exists():
        raw_curated = load_jsonl(curated_path)
        stats["curated_count"] = len(raw_curated)
        for pair in raw_curated:
            curated.append(
                tracker.annotate(
                    pair,
                    source="curated",
                    model="timmy-curated",
                    session_id=pair.get("id", "curated"),
                )
            )

    # Trajectory cycles pass the quality filter before being annotated.
    quality_trajectories: list[dict] = []
    for traj_file in sorted(trajectory_dir.glob("trajectory_*.jsonl")):
        stats["trajectory_files"] += 1
        session_id = _extract_session_id(traj_file)
        cycles = load_jsonl(traj_file)
        stats["trajectory_raw"] += len(cycles)

        for cycle in cycles:
            if not is_quality_cycle(cycle, min_thought_len):
                tracker.exclude(cycle, "quality_filter")
                continue
            annotated = tracker.annotate(
                cycle,
                source="trajectory",
                model=_extract_model(cycle),
                session_id=session_id,
                timestamp=cycle.get("started_at", ""),
            )
            quality_trajectories.append(annotated)

    stats["trajectory_quality"] = len(quality_trajectories)

    # Merge: curated first (gold standard), then quality trajectories.
    merged = curated + quality_trajectories
    stats["total_output"] = len(merged)

    # Tally provenance validation over the merged set.
    for pair in merged:
        bucket = "provenance_invalid" if tracker.validate(pair) else "provenance_valid"
        stats[bucket] += 1

    # Persist as JSONL, creating parent directories as needed.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        for entry in merged:
            f.write(json.dumps(entry) + "\n")

    print(tracker.report_text())
    return stats
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the merge, print a summary."""
    # Default data directory lives next to this script's parent package.
    default_data = Path(__file__).parent.parent / "data"

    parser = argparse.ArgumentParser(
        description="Ingest Nexus trajectories into AutoLoRA pipeline"
    )
    parser.add_argument(
        "--trajectories",
        type=str,
        default=str(Path.home() / ".nexus" / "trajectories"),
        help="Path to Nexus trajectory directory",
    )
    parser.add_argument(
        "--curated",
        type=str,
        default=str(default_data / "curated_dataset.jsonl"),
        help="Path to existing curated dataset",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=str(default_data / "merged_training_data.jsonl"),
        help="Output merged dataset path",
    )
    parser.add_argument(
        "--min-thought-len",
        type=int,
        default=30,
        help="Minimum thought length to include (default: 30)",
    )
    args = parser.parse_args()

    stats = merge_datasets(
        trajectory_dir=Path(args.trajectories),
        curated_path=Path(args.curated),
        output_path=Path(args.output),
        min_thought_len=args.min_thought_len,
    )

    # Human-readable run summary.
    print("Nexus Trajectory Ingestion Complete")
    print("=" * 40)
    print(f" Curated exemplars: {stats['curated_count']}")
    print(f" Trajectory files: {stats['trajectory_files']}")
    print(f" Raw cycles: {stats['trajectory_raw']}")
    print(f" Quality cycles: {stats['trajectory_quality']}")
    print(f" Total training data: {stats['total_output']}")
    print(f" Provenance valid: {stats['provenance_valid']}")
    print(f" Provenance invalid: {stats['provenance_invalid']}")
    print(f"\nOutput: {args.output}")
|
|
|
|
|
|
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|