From a5baa36fd68227de06d910854c8d850d6a8cd8cd Mon Sep 17 00:00:00 2001
From: Merge Bot
Date: Thu, 16 Apr 2026 05:07:33 +0000
Subject: [PATCH] Merge PR #760: training/ingest_trajectories.py (changed)

---
 training/ingest_trajectories.py | 46 ++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/training/ingest_trajectories.py b/training/ingest_trajectories.py
index 46645a29..deae00a2 100644
--- a/training/ingest_trajectories.py
+++ b/training/ingest_trajectories.py
@@ -11,10 +11,14 @@ Applies quality filtering:
 - Skip cycles where the model just echoed the perception
 - Keep cycles with genuine thought, discovery, or action
 
+Provenance:
+  - Each ingested pair gets provenance metadata attached
+  - Tracks source_session_id, model, timestamp, source="trajectory"
+
 Usage:
-    python ingest_nexus_trajectories.py \\
-        --trajectories ~/.nexus/trajectories/ \\
-        --curated ../data/curated_dataset.jsonl \\
+    python ingest_trajectories.py \\
+        --trajectories ~/.nexus/trajectories/ \\
+        --curated ../data/curated_dataset.jsonl \\
         --output ../data/merged_training_data.jsonl
 """
 
@@ -23,6 +27,30 @@ import json
 from pathlib import Path
 from difflib import SequenceMatcher
 
+try:
+    from training_pair_provenance import attach_provenance, extract_trajectory_provenance
+except ImportError:
+    # Fallback: inline provenance for standalone use
+    from datetime import datetime, timezone
+
+    def attach_provenance(pair, source, source_session_id, model, timestamp=None, extras=None):
+        pair["provenance"] = {
+            "source": source,
+            "source_session_id": source_session_id,
+            "model": model,
+            "timestamp": timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        }
+        if extras:
+            pair["provenance"].update(extras)
+        return pair
+
+    def extract_trajectory_provenance(entry):
+        return {
+            "source_session_id": entry.get("id") or entry.get("session_id") or "unknown",
+            "model": entry.get("model", "unknown"),
+            "timestamp": entry.get("started_at") or entry.get("timestamp") or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        }
+
 
 def load_jsonl(path: Path) -> list[dict]:
     """Load a JSONL file."""
@@ -87,6 +115,7 @@ def merge_datasets(
         "trajectory_files": 0,
         "trajectory_raw": 0,
         "trajectory_quality": 0,
+        "provenance_attached": 0,
         "total_output": 0,
     }
 
@@ -105,6 +134,16 @@ def merge_datasets(
 
     for cycle in cycles:
         if is_quality_cycle(cycle, min_thought_len):
+            # Extract provenance from trajectory entry
+            prov = extract_trajectory_provenance(cycle)
+            cycle = attach_provenance(
+                cycle,
+                source="trajectory",
+                source_session_id=prov["source_session_id"],
+                model=prov["model"],
+                timestamp=prov["timestamp"],
+            )
+            stats["provenance_attached"] += 1
             quality_trajectories.append(cycle)
 
     stats["trajectory_quality"] = len(quality_trajectories)
@@ -165,6 +204,7 @@ def main():
     print(f"  Trajectory files: {stats['trajectory_files']}")
     print(f"  Raw cycles: {stats['trajectory_raw']}")
     print(f"  Quality cycles: {stats['trajectory_quality']}")
+    print(f"  Provenance attached: {stats['provenance_attached']}")
     print(f"  Total training data: {stats['total_output']}")
     print(f"\nOutput: {args.output}")