feat: training data pipeline — knowledge entries → JSONL training pairs
Some checks failed
Test / pytest (pull_request) Failing after 7s
Some checks failed
Test / pytest (pull_request) Failing after 7s
Add scripts/knowledge_to_training_pairs.py, which reads quality-gated knowledge entries from knowledge/index.json and emits terse→rich training pairs in JSONL format.

Features:
- Derives terse queries from facts via category-aware heuristics
- Configurable quality filters: min-confidence, model-filter, date range
- Output includes domain, source_confidence, source_model
- Smoke tests added in tests/test_knowledge_to_training_pairs.py

Deliverables for #199:
1. Pipeline script: scripts/knowledge_to_training_pairs.py
2. End-to-end: knowledge/index.json → training_pairs.jsonl (or custom JSONL)
3. Config: min-confidence, model-filter, after/before date filters
4. Test: 9 smoke tests covering conversion, filtering, and end-to-end run

Closes #199
This commit is contained in:
174
tests/test_knowledge_to_training_pairs.py
Normal file
174
tests/test_knowledge_to_training_pairs.py
Normal file
@@ -0,0 +1,174 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Smoke tests for knowledge_to_training_pairs.py
|
||||
|
||||
Tests:
|
||||
- Output is valid JSONL
|
||||
- Each line has required fields (terse, rich, domain, source_confidence, source_model)
|
||||
- Confidence values are in [0,1]
|
||||
- Terse is non-empty and reasonably short (< 200 chars)
|
||||
- Rich matches the original fact
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Add scripts dir to path for imports
|
||||
SCRIPT_DIR = Path(__file__).parent.parent / "scripts"
|
||||
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
|
||||
from knowledge_to_training_pairs import (
|
||||
fact_to_terse,
|
||||
filter_entries,
|
||||
entry_to_pair,
|
||||
parse_date,
|
||||
)
|
||||
|
||||
|
||||
def test_fact_to_terse_pitfall():
    """Check the terse query derived from a "pitfall" fact.

    The query must start with "How do I", contain a question mark, and be
    shorter than 150 characters.
    """
    query = fact_to_terse(
        "deploy-crons.py leaves jobs in mixed model format",
        "pitfall",
        "hermes-agent",
    )

    assert query.startswith("How do I")
    assert "?" in query
    assert len(query) < 150
    print("PASS: test_fact_to_terse_pitfall")
|
||||
|
||||
|
||||
def test_fact_to_terse_fact():
    """Check the terse query derived from a plain "fact" entry.

    The query must start with "What should I know about" and contain a
    question mark.
    """
    query = fact_to_terse(
        "Python is a high-level programming language",
        "fact",
        "global",
    )

    assert query.startswith("What should I know about")
    assert "?" in query
    print("PASS: test_fact_to_terse_fact")
|
||||
|
||||
|
||||
def test_fact_to_terse_pattern():
    """Check that a "pattern" fact yields a "recommended/best way" query."""
    query = fact_to_terse("Use sparse checkout for large repos", "pattern", "devops")

    # Either phrasing is accepted.
    accepted_phrases = ("recommended way", "best way")
    assert any(phrase in query for phrase in accepted_phrases)
    print("PASS: test_fact_to_terse_pattern")
|
||||
|
||||
|
||||
def test_entry_to_pair_structure():
    """Check the shape of the pair produced from one complete entry.

    The pair must exist, carry all five output fields, keep the fact text
    and domain unchanged, and report a confidence within [0, 1].
    """
    sample = dict(
        id="test:001",
        fact="Test fact text.",
        category="fact",
        domain="test-domain",
        confidence=0.85,
        model="test-model",
    )

    pair = entry_to_pair(sample)
    assert pair is not None

    # Required output fields, checked in the same order as documented.
    for field in ("terse", "rich", "domain", "source_confidence", "source_model"):
        assert field in pair

    assert pair["rich"] == "Test fact text."
    assert pair["domain"] == "test-domain"
    assert 0.0 <= pair["source_confidence"] <= 1.0
    print("PASS: test_entry_to_pair_structure")
|
||||
|
||||
|
||||
def test_filter_by_confidence():
    """Check that entries below ``min_confidence`` are dropped.

    Of three entries (0.9, 0.4, 0.6) with a 0.5 threshold, exactly two
    must remain and each must have confidence >= 0.5.
    """
    samples = [
        {"fact": label, "confidence": score}
        for label, score in (("A", 0.9), ("B", 0.4), ("C", 0.6))
    ]

    kept = filter_entries(samples, min_confidence=0.5)

    assert len(kept) == 2
    assert all(item["confidence"] >= 0.5 for item in kept)
    print("PASS: test_filter_by_confidence")
|
||||
|
||||
|
||||
def test_filter_by_model():
    """Check that only entries from the listed models are kept.

    Of three entries (claude-sonnet, gpt-4, unknown), filtering on the
    first two models must leave exactly two entries, both from that list.
    """
    samples = [
        {"fact": label, "model": model_name}
        for label, model_name in (
            ("A", "claude-sonnet"),
            ("B", "gpt-4"),
            ("C", "unknown"),
        )
    ]

    kept = filter_entries(samples, model_filter=["claude-sonnet", "gpt-4"])

    assert len(kept) == 2
    assert all(item["model"] in ("claude-sonnet", "gpt-4") for item in kept)
    print("PASS: test_filter_by_model")
|
||||
|
||||
|
||||
def test_filter_by_date():
    """Check the ``after`` date filter against a 2026-04-01 cutoff.

    Two entries carry ``last_confirmed`` (one after, one before the
    cutoff) and one carries only ``first_seen`` (after the cutoff).
    Exactly two entries must survive.
    """
    samples = [
        {"fact": "A", "last_confirmed": "2026-04-10"},
        {"fact": "B", "last_confirmed": "2026-03-01"},
        {"fact": "C", "first_seen": "2026-04-15"},
    ]

    cutoff = parse_date("2026-04-01")
    kept = filter_entries(samples, after=cutoff)

    assert len(kept) == 2
    print("PASS: test_filter_by_date")
|
||||
|
||||
|
||||
def test_end_to_end_jsonl_output():
    """Integration test: dry-run the pipeline script and check its stats.

    Runs ``scripts/knowledge_to_training_pairs.py --dry-run`` with the
    repository root as working directory, decodes the first JSON object
    found on stderr, and asserts that it reports at least one input entry
    and at least one generated pair.

    Raises:
        AssertionError: if the script exits non-zero, no JSON object is
            found on stderr, the object cannot be decoded, or either
            counter is not positive.
    """
    import subprocess

    repo_dir = SCRIPT_DIR.parent
    result = subprocess.run(
        # Use the interpreter running the tests: a bare "python3" may be a
        # different installation (or missing) inside a venv or on CI.
        [sys.executable, "scripts/knowledge_to_training_pairs.py", "--dry-run"],
        capture_output=True,
        text=True,
        cwd=repo_dir,
    )
    # Surface the script's own output so a CI failure is diagnosable.
    assert result.returncode == 0, (
        f"Pipeline exited with code {result.returncode}:\n{result.stderr}"
    )
    stderr = result.stderr.strip()

    # The first '{' on stderr is taken as the start of the stats object.
    start = stderr.find("{")
    assert start >= 0, "Stats JSON not found in stderr"

    # raw_decode parses exactly one JSON value and ignores trailing text.
    # Unlike manual brace counting, it is not confused by '{' or '}'
    # appearing inside JSON string values.
    try:
        stats, _ = json.JSONDecoder().raw_decode(stderr, start)
    except json.JSONDecodeError as exc:
        raise AssertionError("Invalid or unterminated stats JSON in stderr") from exc

    assert stats["input_entries"] > 0
    assert stats["pairs_generated"] > 0
    print("PASS: test_end_to_end_jsonl_output")
|
||||
|
||||
|
||||
def test_terse_length_constraint():
    """Terse queries must stay under 200 characters for training."""
    # (fact, category, domain) triples sampled from actual knowledge entries.
    samples = (
        ("deploy-crons.py leaves jobs in mixed model format", "pitfall", "hermes-agent"),
        ("Cron jobs with blank fallback_model fields trigger warnings", "pitfall", "hermes-agent"),
        ("Use the Gitea REST API when clone times out", "pattern", "devops"),
    )

    for sample in samples:
        terse = fact_to_terse(*sample)
        assert len(terse) < 200, f"Terse too long ({len(terse)}): {terse}"

    print("PASS: test_terse_length_constraint")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running the smoke tests directly, without pytest. Tests run in
    # declaration order; the first failing assertion aborts the run.
    for smoke_test in (
        test_fact_to_terse_pitfall,
        test_fact_to_terse_fact,
        test_fact_to_terse_pattern,
        test_entry_to_pair_structure,
        test_filter_by_confidence,
        test_filter_by_model,
        test_filter_by_date,
        test_end_to_end_jsonl_output,
        test_terse_length_constraint,
    ):
        smoke_test()

    print("\nAll smoke tests passed.")
|
||||
Reference in New Issue
Block a user