# the-nexus/nexus/trajectory_logger.py

"""
Nexus Trajectory Logger — AutoLoRA Training Data from Lived Experience

Every perceive→think→act cycle is a potential training sample.
This logger writes them in ShareGPT JSONL format, compatible with
the existing AutoLoRA pipeline (build_curated_dataset.py, train_modal.py).

The key insight: the model trains on its own embodied experiences.
Over time, the LoRA adapter shapes the base model into something
that was born in the Nexus, not fine-tuned toward it.
"""

import json
import logging
import time
from pathlib import Path
from typing import Optional

# Fallback directory for trajectory files when the caller does not supply one:
# a "trajectories" folder under ".nexus" in the current user's home directory.
DEFAULT_LOG_DIR = Path.home().joinpath(".nexus", "trajectories")


class TrajectoryLogger:
    """Append perceive→think→act cycles to daily ShareGPT-style JSONL files.

    Each call to :meth:`log_cycle` builds one record (a dict with a
    ``"conversations"`` list of ``{"from", "value"}`` turns), keeps it in
    ``self.cycles`` for the lifetime of the logger, and appends it as one
    JSON line to ``trajectory_<YYYY-MM-DD>.jsonl`` inside ``log_dir``.

    Attributes:
        log_dir: Directory holding the daily trajectory files.
        system_prompt: Text stored as the ``system`` turn of every record.
        session_id: ``"nexus_<unix seconds at construction>"``.
        cycles: Every record logged through this instance, in order.
        log_file: Daily file most recently selected for appending.
    """

    # Thresholds used by the export quality filter. A cycle is kept only if
    # some "gpt" turn is at least MIN_THOUGHT_CHARS long and contains none of
    # the (lower-case) TRIVIAL_THOUGHT_MARKERS.
    MIN_THOUGHT_CHARS = 20
    TRIVIAL_THOUGHT_MARKERS = ("nothing has happened",)

    def __init__(self, log_dir: Optional[Path] = None, system_prompt: str = ""):
        """Create the log directory and start a new session.

        Args:
            log_dir: Directory for trajectory files; a plain string is
                accepted as well as a ``Path``. Falsy values fall back to
                ``DEFAULT_LOG_DIR``.
            system_prompt: Stored verbatim as the ``system`` turn of each
                logged cycle.
        """
        # Coerce so that callers passing a str do not crash on .mkdir().
        self.log_dir = Path(log_dir) if log_dir else DEFAULT_LOG_DIR
        self.log_dir.mkdir(parents=True, exist_ok=True)
        self.system_prompt = system_prompt

        # Current session
        self.session_id = f"nexus_{int(time.time())}"
        self.cycles: list[dict] = []

        # Active log file — one per day. The day is remembered so that
        # log_cycle() can switch files when the local date changes.
        self._log_day = time.strftime("%Y-%m-%d")
        self.log_file = self._daily_log_path(self._log_day)

    def _daily_log_path(self, day: str) -> Path:
        """Return the trajectory file path for the given ``YYYY-MM-DD`` day."""
        return self.log_dir / f"trajectory_{day}.jsonl"

    def _active_log_file(self) -> Path:
        """Return the file to append to, rolling over when the date changed.

        ``self.log_file`` is left untouched while the local date still matches
        the day it was selected for, so a manually assigned ``log_file`` is
        honoured until the next date change.
        """
        today = time.strftime("%Y-%m-%d")
        if today != self._log_day:
            self._log_day = today
            self.log_file = self._daily_log_path(today)
        return self.log_file

    def log_cycle(
        self,
        perception: str,
        thought: str,
        actions: list[str],
        cycle_ms: int = 0,
    ) -> str:
        """Log one perceive→think→act cycle as a training sample.

        The record is a ShareGPT-style dict whose ``conversations`` are:
        a ``system`` turn (the system prompt), a ``human`` turn (the
        perception), a ``gpt`` turn (the thought), and then one extra
        ``human`` turn per non-empty action, prefixed ``[World responds]: ``.

        Args:
            perception: What the world showed the model.
            thought: What the model produced in response.
            actions: Descriptions of action outcomes; empty entries are
                skipped and ``None`` is treated as no actions.
            cycle_ms: Caller-supplied cycle duration, stored unchanged.

        Returns:
            The record id, ``"<session_id>_cycle_<index>"``.
        """
        conversations = [
            {"from": "system", "value": self.system_prompt},
            {"from": "human", "value": perception},
            {"from": "gpt", "value": thought},
        ]
        # Action outcomes become follow-up context turns after the thought.
        conversations.extend(
            {"from": "human", "value": f"[World responds]: {action_desc}"}
            for action_desc in (actions or ())
            if action_desc
        )

        cycle = {
            "id": f"{self.session_id}_cycle_{len(self.cycles)}",
            "model": "nexus-embodied",
            # Local wall-clock time at the moment the cycle is logged.
            "started_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
            "cycle_ms": cycle_ms,
            "conversations": conversations,
            "message_count": len(conversations),
        }
        self.cycles.append(cycle)

        # Append to the (possibly rolled-over) daily log file.
        with open(self._active_log_file(), "a", encoding="utf-8") as f:
            f.write(json.dumps(cycle) + "\n")

        return cycle["id"]

    def get_session_stats(self) -> dict:
        """Return counts for the cycles logged through this instance.

        Returns:
            A dict with ``session_id``, ``cycles`` (number of records),
            ``log_file`` (as a string) and ``total_turns`` (sum of the
            conversation lengths over all records).
        """
        return {
            "session_id": self.session_id,
            "cycles": len(self.cycles),
            "log_file": str(self.log_file),
            "total_turns": sum(len(c["conversations"]) for c in self.cycles),
        }

    def _read_cycles(self, traj_file: Path) -> list[dict]:
        """Parse one JSONL trajectory file, skipping unusable lines.

        Blank lines are ignored. Lines that are not valid JSON (for example a
        line truncated by an interrupted append) or that do not decode to a
        dict are skipped with a warning instead of aborting the whole read.
        """
        records = []
        with open(traj_file, encoding="utf-8") as f:
            for line_no, raw in enumerate(f, start=1):
                text = raw.strip()
                if not text:
                    continue
                try:
                    record = json.loads(text)
                except json.JSONDecodeError:
                    record = None
                if not isinstance(record, dict):
                    logging.getLogger(__name__).warning(
                        "Skipping malformed trajectory line %d in %s",
                        line_no,
                        traj_file,
                    )
                    continue
                records.append(record)
        return records

    def _has_meaningful_thought(self, cycle: dict) -> bool:
        """Return True if any ``gpt`` turn passes the quality filter.

        A turn passes when its value is a string of at least
        ``MIN_THOUGHT_CHARS`` characters that contains none of the
        ``TRIVIAL_THOUGHT_MARKERS`` (compared case-insensitively). Turns with
        missing keys or non-string values count as not meaningful.
        """
        for turn in cycle.get("conversations") or []:
            if not isinstance(turn, dict) or turn.get("from") != "gpt":
                continue
            value = turn.get("value")
            if not isinstance(value, str) or len(value) < self.MIN_THOUGHT_CHARS:
                continue
            lowered = value.lower()
            if any(marker in lowered for marker in self.TRIVIAL_THOUGHT_MARKERS):
                continue
            return True
        return False

    def export_for_training(self, output_path: Optional[Path] = None) -> Path:
        """Merge all daily trajectory files into a single filtered JSONL file.

        Every ``trajectory_*.jsonl`` file in ``log_dir`` is read in sorted
        name order; cycles without a meaningful ``gpt`` turn (see
        ``_has_meaningful_thought``) are dropped, and the rest are written one
        JSON object per line, overwriting any existing output file.

        Args:
            output_path: Destination file (``Path`` or string). Defaults to
                ``nexus_training_data.jsonl`` inside ``log_dir``. Missing
                parent directories are created.

        Returns:
            The path of the written file.
        """
        if output_path:
            output = Path(output_path)
        else:
            output = self.log_dir / "nexus_training_data.jsonl"
        output.parent.mkdir(parents=True, exist_ok=True)

        all_cycles = []
        for traj_file in sorted(self.log_dir.glob("trajectory_*.jsonl")):
            all_cycles.extend(self._read_cycles(traj_file))

        # Quality filter — only keep cycles where the model actually
        # produced meaningful thought (not just "Nothing has happened").
        quality_cycles = [c for c in all_cycles if self._has_meaningful_thought(c)]

        with open(output, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(cycle) + "\n" for cycle in quality_cycles)

        return output

    def list_trajectory_files(self) -> list[dict]:
        """List all trajectory files in ``log_dir`` with basic stats.

        Returns:
            One dict per ``trajectory_*.jsonl`` file, in sorted name order,
            with ``file`` (path string), ``date`` (file stem without the
            ``trajectory_`` prefix), ``cycles`` (number of non-blank lines)
            and ``size_kb`` (file size in bytes divided by 1024).
        """
        files = []
        for traj_file in sorted(self.log_dir.glob("trajectory_*.jsonl")):
            with open(traj_file, encoding="utf-8") as f:
                count = sum(1 for line in f if line.strip())
            files.append({
                "file": str(traj_file),
                # Strip only the leading prefix, not later occurrences.
                "date": traj_file.stem.removeprefix("trajectory_"),
                "cycles": count,
                "size_kb": traj_file.stat().st_size / 1024,
            })
        return files