feat: training data pipeline — knowledge entries → JSONL training pairs

Add scripts/knowledge_to_training_pairs.py, which reads quality-gated knowledge entries from knowledge/index.json and emits terse→rich training pairs in JSONL format.

Features:
- Derives terse queries from facts via category-aware heuristics
- Configurable quality filters: min-confidence, model-filter, date range
- Output includes domain, source_confidence, source_model
- Smoke tests added in tests/test_knowledge_to_training_pairs.py

Deliverables for #199:
1. Pipeline script: scripts/knowledge_to_training_pairs.py
2. End-to-end: knowledge/index.json → training_pairs.jsonl (or custom JSONL)
3. Config: min-confidence, model-filter, after/before date filters
4. Test: 9 smoke tests covering conversion, filtering, and end-to-end run

Closes #199
2026-04-26 13:03:06 -04:00
parent 4b5a675355
commit 86eb1c9a50
2 changed files with 429 additions and 0 deletions
--- a/tests/test_knowledge_to_training_pairs.py
+++ b/tests/test_knowledge_to_training_pairs.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+"""
+Smoke tests for knowledge_to_training_pairs.py
+
+Tests:
+  - Output is valid JSONL
+  - Each line has required fields (terse, rich, domain, source_confidence, source_model)
+  - Confidence values are in [0,1]
+  - Terse is non-empty and reasonably short (< 200 chars)
+  - Rich matches the original fact
+"""
+
+import json
+import sys
+import os
+import tempfile
+from pathlib import Path
+
+# Put the repo's scripts/ directory at the front of sys.path so the import below resolves
+SCRIPT_DIR = Path(__file__).parent.parent / "scripts"
+sys.path.insert(0, str(SCRIPT_DIR))
+
+from knowledge_to_training_pairs import (
+    fact_to_terse,
+    filter_entries,
+    entry_to_pair,
+    parse_date,
+)
+
+
+def test_fact_to_terse_pitfall():
+    """A pitfall fact is turned into a short "How do I ...?" question."""
+    question = fact_to_terse(
+        "deploy-crons.py leaves jobs in mixed model format", "pitfall", "hermes-agent"
+    )
+    assert question.startswith("How do I")
+    assert "?" in question
+    assert len(question) < 150
+    print("PASS: test_fact_to_terse_pitfall")
+
+
+def test_fact_to_terse_fact():
+    """A plain fact in the global domain yields a "What should I know about ...?" query."""
+    query = fact_to_terse("Python is a high-level programming language", "fact", "global")
+    assert query.startswith("What should I know about")
+    assert "?" in query
+    print("PASS: test_fact_to_terse_fact")
+
+
+def test_fact_to_terse_pattern():
+    """A pattern fact yields a query phrased as the "recommended way" or "best way"."""
+    query = fact_to_terse("Use sparse checkout for large repos", "pattern", "devops")
+    assert any(phrase in query for phrase in ("recommended way", "best way"))
+    print("PASS: test_fact_to_terse_pattern")
+
+
+def test_entry_to_pair_structure():
+    """entry_to_pair yields the five required keys; rich and domain mirror the entry."""
+    sample = {
+        "id": "test:001",
+        "fact": "Test fact text.",
+        "category": "fact",
+        "domain": "test-domain",
+        "confidence": 0.85,
+        "model": "test-model",
+    }
+    pair = entry_to_pair(sample)
+    assert pair is not None
+    # Presence first, so a missing key fails here rather than as a KeyError below.
+    for key in ("terse", "rich", "domain", "source_confidence", "source_model"):
+        assert key in pair
+    # Values: rich is the fact text, domain is passed through, confidence is in [0, 1].
+    assert pair["rich"] == "Test fact text."
+    assert pair["domain"] == "test-domain"
+    assert 0.0 <= pair["source_confidence"] <= 1.0
+    print("PASS: test_entry_to_pair_structure")
+
+
+def test_filter_by_confidence():
+    """Filtering at min_confidence=0.5 leaves exactly the two entries at or above it."""
+    scores = {"A": 0.9, "B": 0.4, "C": 0.6}
+    samples = [{"fact": name, "confidence": score} for name, score in scores.items()]
+    kept = filter_entries(samples, min_confidence=0.5)
+    assert len(kept) == 2
+    # Every survivor must itself satisfy the threshold.
+    for entry in kept:
+        assert entry["confidence"] >= 0.5
+    print("PASS: test_filter_by_confidence")
+
+
+def test_filter_by_model():
+    """model_filter keeps only the entries whose model is one of the listed names."""
+    wanted = ("claude-sonnet", "gpt-4")
+    models = {"A": "claude-sonnet", "B": "gpt-4", "C": "unknown"}
+    samples = [{"fact": name, "model": model} for name, model in models.items()]
+    kept = filter_entries(samples, model_filter=list(wanted))
+    assert len(kept) == 2
+    for entry in kept:
+        assert entry["model"] in wanted
+    print("PASS: test_filter_by_model")
+
+
+def test_filter_by_date():
+    """Only entries dated on or after the `after` cutoff survive: A and C, not B."""
+    entries = [
+        {"fact": "A", "last_confirmed": "2026-04-10"},
+        {"fact": "B", "last_confirmed": "2026-03-01"},  # before the cutoff
+        {"fact": "C", "first_seen": "2026-04-15"},  # carries only first_seen
+    ]
+    filtered = filter_entries(entries, after=parse_date("2026-04-01"))
+    assert sorted(e["fact"] for e in filtered) == ["A", "C"]  # identity, not just count
+    print("PASS: test_filter_by_date")
+
+
+def test_end_to_end_jsonl_output():
+    """Integration test: run the script with --dry-run and check its stats.
+
+    The script is launched from the repository root using the interpreter
+    that is running this test. The first JSON object on stderr is read as the
+    run statistics, which must report at least one input entry and at least
+    one generated pair.
+    """
+    import subprocess
+
+    repo_dir = SCRIPT_DIR.parent
+    result = subprocess.run(
+        # sys.executable keeps the child on the same interpreter/venv as this
+        # test; a bare "python3" may be missing or resolve to a different one.
+        [sys.executable, "scripts/knowledge_to_training_pairs.py", "--dry-run"],
+        capture_output=True, text=True, cwd=repo_dir
+    )
+    assert result.returncode == 0, f"Script exited {result.returncode}: {result.stderr}"
+    stderr = result.stderr.strip()
+
+    # The stats JSON object is at the top of stderr.
+    start = stderr.find('{')
+    assert start >= 0, "Stats JSON not found in stderr"
+    # raw_decode parses exactly one JSON value beginning at `start` and ignores
+    # any text after it. Unlike counting braces by hand, it is not thrown off
+    # by '{' or '}' inside string values.
+    try:
+        stats, _ = json.JSONDecoder().raw_decode(stderr, start)
+    except json.JSONDecodeError as exc:
+        raise AssertionError(f"Malformed stats JSON in stderr: {exc}") from exc
+    assert stats["input_entries"] > 0
+    assert stats["pairs_generated"] > 0
+    print("PASS: test_end_to_end_jsonl_output")
+
+
+def test_terse_length_constraint():
+    """Every generated terse query stays under 200 characters."""
+    # (fact, category, domain) triples sampled from actual knowledge entries
+    samples = (
+        ("deploy-crons.py leaves jobs in mixed model format", "pitfall", "hermes-agent"),
+        ("Cron jobs with blank fallback_model fields trigger warnings", "pitfall", "hermes-agent"),
+        ("Use the Gitea REST API when clone times out", "pattern", "devops"),
+    )
+    for sample in samples:
+        query = fact_to_terse(*sample)
+        assert len(query) < 200, f"Terse too long ({len(query)}): {query}"
+    print("PASS: test_terse_length_constraint")
+
+
+if __name__ == "__main__":
+    # Script mode: run each smoke test in the order listed below.
+    # A failing assert raises immediately, so the summary prints only on success.
+    smoke_tests = (
+        test_fact_to_terse_pitfall, test_fact_to_terse_fact, test_fact_to_terse_pattern,
+        test_entry_to_pair_structure, test_filter_by_confidence, test_filter_by_model,
+        test_filter_by_date, test_end_to_end_jsonl_output, test_terse_length_constraint,
+    )
+    for smoke_test in smoke_tests:
+        smoke_test()
+    print("\nAll smoke tests passed.")