Merge PR #760: training/ingest_trajectories.py (changed)
This commit is contained in:
@@ -11,10 +11,14 @@ Applies quality filtering:
|
||||
- Skip cycles where the model just echoed the perception
|
||||
- Keep cycles with genuine thought, discovery, or action
|
||||
|
||||
Provenance:
|
||||
- Each ingested pair gets provenance metadata attached
|
||||
- Tracks source_session_id, model, timestamp, source="trajectory"
|
||||
|
||||
Usage:
|
||||
python ingest_nexus_trajectories.py \\
|
||||
--trajectories ~/.nexus/trajectories/ \\
|
||||
--curated ../data/curated_dataset.jsonl \\
|
||||
python ingest_trajectories.py \
|
||||
--trajectories ~/.nexus/trajectories/ \
|
||||
--curated ../data/curated_dataset.jsonl \
|
||||
--output ../data/merged_training_data.jsonl
|
||||
"""
|
||||
|
||||
@@ -23,6 +27,30 @@ import json
|
||||
from pathlib import Path
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
try:
|
||||
from training_pair_provenance import attach_provenance, extract_trajectory_provenance
|
||||
except ImportError:
|
||||
# Fallback: inline provenance for standalone use
|
||||
from datetime import datetime, timezone
|
||||
|
||||
def attach_provenance(pair, source, source_session_id, model, timestamp=None, extras=None):
    """Attach a ``provenance`` record to *pair* and return the same object.

    The record always carries ``source``, ``source_session_id``, ``model``
    and ``timestamp``. *pair* is modified in place; any existing
    ``"provenance"`` entry is replaced.

    Args:
        pair: Mapping that receives the ``"provenance"`` key.
        source: Value stored under ``"source"``.
        source_session_id: Value stored under ``"source_session_id"``.
        model: Value stored under ``"model"``.
        timestamp: Value stored under ``"timestamp"``. When falsy, the
            current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ`` is used.
        extras: Optional mapping merged into the record after the core
            fields, so its keys win on collision.

    Returns:
        The *pair* object that was passed in.
    """
    stamp = timestamp
    if not stamp:
        # No usable timestamp supplied: stamp with "now" in UTC.
        stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    record = {
        "source": source,
        "source_session_id": source_session_id,
        "model": model,
        "timestamp": stamp,
    }
    pair["provenance"] = record

    if extras:
        pair["provenance"].update(extras)

    return pair
|
||||
|
||||
def extract_trajectory_provenance(entry):
    """Derive provenance fields from a trajectory entry.

    Args:
        entry: Mapping exposing ``.get``; the keys ``id``, ``session_id``,
            ``model``, ``started_at`` and ``timestamp`` are consulted.

    Returns:
        A dict with three keys:

        * ``source_session_id`` -- ``entry["id"]``, else
          ``entry["session_id"]``, else ``"unknown"``.
        * ``model`` -- ``entry["model"]``, else ``"unknown"``.
        * ``timestamp`` -- ``entry["started_at"]``, else
          ``entry["timestamp"]``, else the current UTC time formatted as
          ``YYYY-MM-DDTHH:MM:SSZ``.

        A value counts as missing when it is absent *or* falsy (``None``,
        empty string), so a ``"model": null`` record falls back to
        ``"unknown"`` just like the other two fields.
    """
    session_id = entry.get("id") or entry.get("session_id") or "unknown"

    # Use `or` rather than a .get() default: the default only applies when the
    # key is absent, which would let an explicit null/empty model through.
    model = entry.get("model") or "unknown"

    timestamp = entry.get("started_at") or entry.get("timestamp")
    if not timestamp:
        timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    return {
        "source_session_id": session_id,
        "model": model,
        "timestamp": timestamp,
    }
|
||||
|
||||
|
||||
def load_jsonl(path: Path) -> list[dict]:
|
||||
"""Load a JSONL file."""
|
||||
@@ -87,6 +115,7 @@ def merge_datasets(
|
||||
"trajectory_files": 0,
|
||||
"trajectory_raw": 0,
|
||||
"trajectory_quality": 0,
|
||||
"provenance_attached": 0,
|
||||
"total_output": 0,
|
||||
}
|
||||
|
||||
@@ -105,6 +134,16 @@ def merge_datasets(
|
||||
|
||||
for cycle in cycles:
|
||||
if is_quality_cycle(cycle, min_thought_len):
|
||||
# Extract provenance from trajectory entry
|
||||
prov = extract_trajectory_provenance(cycle)
|
||||
cycle = attach_provenance(
|
||||
cycle,
|
||||
source="trajectory",
|
||||
source_session_id=prov["source_session_id"],
|
||||
model=prov["model"],
|
||||
timestamp=prov["timestamp"],
|
||||
)
|
||||
stats["provenance_attached"] += 1
|
||||
quality_trajectories.append(cycle)
|
||||
|
||||
stats["trajectory_quality"] = len(quality_trajectories)
|
||||
@@ -165,6 +204,7 @@ def main():
|
||||
print(f" Trajectory files: {stats['trajectory_files']}")
|
||||
print(f" Raw cycles: {stats['trajectory_raw']}")
|
||||
print(f" Quality cycles: {stats['trajectory_quality']}")
|
||||
print(f" Provenance attached: {stats['provenance_attached']}")
|
||||
print(f" Total training data: {stats['total_output']}")
|
||||
print(f"\nOutput: {args.output}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user