Merge PR #760: training/ingest_trajectories.py (changed)

This commit is contained in:
Merge Bot
2026-04-16 05:07:33 +00:00
parent b5455cea8a
commit a5baa36fd6

View File

@@ -11,10 +11,14 @@ Applies quality filtering:
- Skip cycles where the model just echoed the perception
- Keep cycles with genuine thought, discovery, or action
Provenance:
- Each ingested pair gets provenance metadata attached
- Tracks source_session_id, model, timestamp, source="trajectory"
Usage:
python ingest_nexus_trajectories.py \\
--trajectories ~/.nexus/trajectories/ \\
--curated ../data/curated_dataset.jsonl \\
python ingest_trajectories.py \
--trajectories ~/.nexus/trajectories/ \
--curated ../data/curated_dataset.jsonl \
--output ../data/merged_training_data.jsonl
"""
@@ -23,6 +27,30 @@ import json
from pathlib import Path
from difflib import SequenceMatcher
try:
    from training_pair_provenance import attach_provenance, extract_trajectory_provenance
except ImportError:
    # Fallback: inline provenance helpers so this script still works standalone
    # when the training_pair_provenance module cannot be imported.
    from datetime import datetime, timezone

    def _utc_timestamp() -> str:
        """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
        return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    def attach_provenance(pair, source, source_session_id, model, timestamp=None, extras=None):
        """Attach a ``provenance`` dict to *pair* in place and return *pair*.

        Args:
            pair: Mutable mapping to annotate; its ``"provenance"`` key is
                overwritten.
            source: Stored under ``provenance["source"]``.
            source_session_id: Stored under ``provenance["source_session_id"]``.
            model: Stored under ``provenance["model"]``.
            timestamp: Stored under ``provenance["timestamp"]``; when falsy,
                the current UTC time is used instead.
            extras: Optional mapping merged into the provenance dict last, so
                its keys override the core fields on collision.

        Returns:
            The same *pair* object that was passed in.
        """
        pair["provenance"] = {
            "source": source,
            "source_session_id": source_session_id,
            "model": model,
            "timestamp": timestamp or _utc_timestamp(),
        }
        if extras:
            pair["provenance"].update(extras)
        return pair

    def extract_trajectory_provenance(entry):
        """Derive provenance fields from a trajectory *entry* mapping.

        Each field falls back when its key is missing or its value is falsy:
        the session id tries ``"id"`` then ``"session_id"`` then ``"unknown"``;
        the model falls back to ``"unknown"``; the timestamp tries
        ``"started_at"`` then ``"timestamp"`` then the current UTC time.

        Returns:
            A dict with keys ``source_session_id``, ``model`` and ``timestamp``.
        """
        return {
            "source_session_id": entry.get("id") or entry.get("session_id") or "unknown",
            # `or` (rather than a .get default) so a present-but-empty/None
            # model is normalised the same way as the session id above.
            "model": entry.get("model") or "unknown",
            "timestamp": entry.get("started_at") or entry.get("timestamp") or _utc_timestamp(),
        }
def load_jsonl(path: Path) -> list[dict]:
"""Load a JSONL file."""
@@ -87,6 +115,7 @@ def merge_datasets(
"trajectory_files": 0,
"trajectory_raw": 0,
"trajectory_quality": 0,
"provenance_attached": 0,
"total_output": 0,
}
@@ -105,6 +134,16 @@ def merge_datasets(
for cycle in cycles:
if is_quality_cycle(cycle, min_thought_len):
# Extract provenance from trajectory entry
prov = extract_trajectory_provenance(cycle)
cycle = attach_provenance(
cycle,
source="trajectory",
source_session_id=prov["source_session_id"],
model=prov["model"],
timestamp=prov["timestamp"],
)
stats["provenance_attached"] += 1
quality_trajectories.append(cycle)
stats["trajectory_quality"] = len(quality_trajectories)
@@ -165,6 +204,7 @@ def main():
print(f" Trajectory files: {stats['trajectory_files']}")
print(f" Raw cycles: {stats['trajectory_raw']}")
print(f" Quality cycles: {stats['trajectory_quality']}")
print(f" Provenance attached: {stats['provenance_attached']}")
print(f" Total training data: {stats['total_output']}")
print(f"\nOutput: {args.output}")