Files
timmy-config/training/ingest_trajectories.py

221 lines
6.5 KiB
Python

#!/usr/bin/env python3
"""
Ingest Nexus Trajectories into AutoLoRA Training Pipeline
Reads trajectory JSONL files produced by the Nexus consciousness loop
and merges them with the existing curated dataset for training.
Applies quality filtering:
- Skip trivial thoughts (< 20 chars)
- Skip "nothing happened" cycles
- Skip cycles where the model just echoed the perception
- Keep cycles with genuine thought, discovery, or action
Adds provenance metadata to every ingested pair.
Usage:
python ingest_trajectories.py \
--trajectories ~/.nexus/trajectories/ \
--curated ../data/curated_dataset.jsonl \
--output ../data/merged_training_data.jsonl
"""
import argparse
import json
from pathlib import Path
from difflib import SequenceMatcher
from training_pair_provenance import ProvenanceTracker
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSONL file into a list of dicts.

    Blank lines are skipped; every non-blank line must be valid JSON.

    Args:
        path: Path to the JSONL file.

    Returns:
        Parsed JSON objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    entries = []
    # Explicit encoding: JSON is UTF-8 by spec; relying on the platform
    # default locale encoding is a portability bug (e.g. cp1252 on Windows).
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries
def is_quality_cycle(cycle: dict, min_thought_len: int = 30) -> bool:
    """Decide whether a trajectory cycle is worth keeping for training.

    A cycle is rejected when it has no model ("gpt") turn, the thought is
    shorter than ``min_thought_len``, the thought matches a known trivial
    pattern, or the thought merely echoes the perception.

    Args:
        cycle: Trajectory cycle with a "conversations" list of
            ``{"from": ..., "value": ...}`` turns.
        min_thought_len: Minimum length of the first gpt turn's text.

    Returns:
        True if the cycle passes all quality filters.
    """
    convos = cycle.get("conversations", [])
    # Use .get() so malformed turns (missing "from"/"value" keys) are
    # filtered out instead of raising KeyError and aborting ingestion —
    # a quality filter should reject bad data, not crash on it.
    gpt_turns = [c for c in convos if c.get("from") == "gpt"]
    human_turns = [c for c in convos if c.get("from") == "human"]
    if not gpt_turns:
        return False
    thought = gpt_turns[0].get("value", "")
    # Too short to contain a genuine thought.
    if len(thought) < min_thought_len:
        return False
    # Trivial "idle" patterns produced by uneventful cycles.
    trivial = [
        "nothing has happened",
        "nothing to think about",
        "i have no new perceptions",
        "i rest",
        "i wait",
    ]
    thought_lower = thought.lower()
    if any(t in thought_lower for t in trivial):
        return False
    # Echo check — if the thought is too similar to the perception,
    # the model is just parroting, not thinking. Only the first 500
    # chars are compared to keep SequenceMatcher cheap on long turns.
    if human_turns:
        perception = human_turns[0].get("value", "")
        similarity = SequenceMatcher(None, perception[:500], thought[:500]).ratio()
        if similarity > 0.7:
            return False
    return True
def _extract_session_id(path: Path) -> str:
"""Extract session ID from trajectory filename."""
stem = path.stem
# trajectory_20260315_123456_abc123.jsonl -> 20260315_123456_abc123
if stem.startswith("trajectory_"):
return stem[len("trajectory_"):]
return stem
def _extract_model(cycle: dict) -> str:
"""Extract model name from cycle metadata."""
return cycle.get("model", "unknown")
def merge_datasets(
    trajectory_dir: Path,
    curated_path: Path,
    output_path: Path,
    min_thought_len: int = 30,
) -> dict:
    """Merge Nexus trajectories with the curated dataset, annotating provenance.

    Curated pairs come first in the output (gold standard), followed by
    trajectory cycles that pass the quality filter. Every kept pair is
    annotated and validated by the ProvenanceTracker; filtered cycles are
    recorded as excluded. The merged dataset is written as JSONL.

    Args:
        trajectory_dir: Directory containing trajectory_*.jsonl files.
        curated_path: Path to the curated dataset (may not exist yet).
        output_path: Destination JSONL path; parent dirs are created.
        min_thought_len: Forwarded to is_quality_cycle().

    Returns:
        Stats dict: curated_count, trajectory_files, trajectory_raw,
        trajectory_quality, total_output, provenance_valid,
        provenance_invalid.
    """
    tracker = ProvenanceTracker()
    stats = {
        "curated_count": 0,
        "trajectory_files": 0,
        "trajectory_raw": 0,
        "trajectory_quality": 0,
        "total_output": 0,
        "provenance_valid": 0,
        "provenance_invalid": 0,
    }
    # Load curated exemplars (optional — first run may have none).
    curated = []
    if curated_path.exists():
        curated = load_jsonl(curated_path)
        stats["curated_count"] = len(curated)
        curated = [
            tracker.annotate(
                pair,
                source="curated",
                model="timmy-curated",
                session_id=pair.get("id", "curated"),
            )
            for pair in curated
        ]
    # Load trajectory files; keep only cycles that pass the quality filter.
    quality_trajectories = []
    for traj_file in sorted(trajectory_dir.glob("trajectory_*.jsonl")):
        stats["trajectory_files"] += 1
        session_id = _extract_session_id(traj_file)
        cycles = load_jsonl(traj_file)
        stats["trajectory_raw"] += len(cycles)
        for cycle in cycles:
            if is_quality_cycle(cycle, min_thought_len):
                annotated = tracker.annotate(
                    cycle,
                    source="trajectory",
                    model=_extract_model(cycle),
                    session_id=session_id,
                    timestamp=cycle.get("started_at", ""),
                )
                quality_trajectories.append(annotated)
            else:
                # Called for the tracker's exclusion record; the return
                # value is unused since the cycle is dropped (the original
                # code's reassignment here was a dead store).
                tracker.exclude(cycle, "quality_filter")
    stats["trajectory_quality"] = len(quality_trajectories)
    # Merge: curated first (gold standard), then quality trajectories.
    merged = curated + quality_trajectories
    stats["total_output"] = len(merged)
    # Validate provenance on every merged pair; validate() returns a
    # (possibly empty) error collection — truthy means invalid.
    for pair in merged:
        if tracker.validate(pair):
            stats["provenance_invalid"] += 1
        else:
            stats["provenance_valid"] += 1
    # Write merged dataset. Explicit UTF-8: JSON is UTF-8 by spec and
    # the platform default encoding is not guaranteed to be.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for entry in merged:
            f.write(json.dumps(entry) + "\n")
    print(tracker.report_text())
    return stats
def main():
    """CLI entry point: parse arguments and run the merge."""
    # Repo-relative data directory, shared by two defaults below.
    data_dir = Path(__file__).parent.parent / "data"

    parser = argparse.ArgumentParser(
        description="Ingest Nexus trajectories into AutoLoRA pipeline"
    )
    parser.add_argument(
        "--trajectories",
        type=str,
        default=str(Path.home() / ".nexus" / "trajectories"),
        help="Path to Nexus trajectory directory",
    )
    parser.add_argument(
        "--curated",
        type=str,
        default=str(data_dir / "curated_dataset.jsonl"),
        help="Path to existing curated dataset",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=str(data_dir / "merged_training_data.jsonl"),
        help="Output merged dataset path",
    )
    parser.add_argument(
        "--min-thought-len",
        type=int,
        default=30,
        help="Minimum thought length to include (default: 30)",
    )
    args = parser.parse_args()

    stats = merge_datasets(
        trajectory_dir=Path(args.trajectories),
        curated_path=Path(args.curated),
        output_path=Path(args.output),
        min_thought_len=args.min_thought_len,
    )

    print("Nexus Trajectory Ingestion Complete")
    print("=" * 40)
    # Label/value table — emits the exact same lines as the original
    # hand-written print sequence.
    summary = [
        ("Curated exemplars", stats["curated_count"]),
        ("Trajectory files", stats["trajectory_files"]),
        ("Raw cycles", stats["trajectory_raw"]),
        ("Quality cycles", stats["trajectory_quality"]),
        ("Total training data", stats["total_output"]),
        ("Provenance valid", stats["provenance_valid"]),
        ("Provenance invalid", stats["provenance_invalid"]),
    ]
    for label, value in summary:
        print(f" {label}: {value}")
    print(f"\nOutput: {args.output}")


if __name__ == "__main__":
    main()