Main harvester module that chains: session_reader → extraction prompt → LLM → validate → deduplicate → store

Includes:
- scripts/harvester.py — main module (reader + prompt + storage pipeline)
- scripts/session_reader.py — JSONL transcript parser
- scripts/test_harvester_pipeline.py — smoke tests (all passing)

Pipeline:
1. Read session JSONL via session_reader
2. Truncate long sessions (first 50 + last 50 messages)
3. Send transcript + extraction prompt to LLM (mimo-v2-pro)
4. Parse structured JSON response (facts/pitfalls/patterns/quirks/questions)
5. Validate fields + confidence threshold
6. Deduplicate against knowledge/index.json (fingerprint + word overlap)
7. Write to knowledge store (index.json + per-repo markdown)

CLI:
- Single:  python3 harvester.py --session <path> --output knowledge/
- Batch:   python3 harvester.py --batch --since 2026-04-01 --limit 100
- Dry-run: python3 harvester.py --session <path> --dry-run
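The fingerprint + word-overlap deduplication named in step 6 lives in harvester.py and is only imported by the smoke tests below. As a minimal sketch of the idea: the normalization, the hashing scheme, and the 0.8 overlap threshold here are illustrative assumptions, not harvester.py's actual values.

import hashlib
import re


def fact_fingerprint(fact: dict) -> str:
    # Normalize the fact text (lowercase, strip punctuation, collapse
    # whitespace) and hash it, so exact duplicates collide regardless
    # of formatting. The normalization rules are assumed.
    text = re.sub(r"[^a-z0-9 ]", "", fact["fact"].lower())
    return hashlib.sha256(" ".join(text.split()).encode()).hexdigest()


def word_overlap(a: str, b: str) -> float:
    # Jaccard overlap of the two word sets; 1.0 means identical vocabulary.
    wa, wb = set(a.lower().split()), set(b.lower().split())
    return len(wa & wb) / len(wa | wb) if (wa or wb) else 0.0


def deduplicate(new_facts, existing, threshold=0.8):
    # Drop a new fact if an existing (or already-accepted) fact has the same
    # fingerprint, or if any existing fact's wording overlaps too heavily.
    # The 0.8 cutoff is an assumed value for illustration.
    seen = {fact_fingerprint(f) for f in existing}
    kept = []
    for fact in new_facts:
        fp = fact_fingerprint(fact)
        if fp in seen:
            continue
        if any(word_overlap(fact["fact"], e["fact"]) >= threshold for e in existing):
            continue
        kept.append(fact)
        seen.add(fp)
    return kept

This matches the behavior the tests below assert: an exact duplicate is dropped, a distinct fact survives.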
#!/usr/bin/env python3
"""
Smoke test for harvester pipeline — verifies the full chain:
session_reader -> prompt -> LLM (mocked) -> validate -> deduplicate -> store

Does NOT call the real LLM. Tests plumbing only.
"""

import json
import sys
import tempfile
import os
from pathlib import Path

# Make session_reader and harvester importable regardless of the caller's CWD
SCRIPT_DIR = Path(__file__).parent.absolute()
sys.path.insert(0, str(SCRIPT_DIR))

from session_reader import read_session, extract_conversation, truncate_for_context, messages_to_text
from harvester import validate_fact, deduplicate, load_existing_knowledge, fact_fingerprint


def test_session_reader():
    """Test that session_reader parses JSONL correctly."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
        f.write('{"role": "user", "content": "Hello", "timestamp": "2026-04-13T10:00:00Z"}\n')
        f.write('{"role": "assistant", "content": "Hi there", "timestamp": "2026-04-13T10:00:01Z"}\n')
        f.write('{"role": "user", "content": "Clone the repo", "timestamp": "2026-04-13T10:00:02Z"}\n')
        f.write('{"role": "assistant", "content": "Cloned successfully", "timestamp": "2026-04-13T10:00:05Z"}\n')
        path = f.name

    messages = read_session(path)
    assert len(messages) == 4, f"Expected 4 messages, got {len(messages)}"

    conv = extract_conversation(messages)
    assert len(conv) == 4, f"Expected 4 conversation turns, got {len(conv)}"

    text = messages_to_text(conv)
    assert "USER: Hello" in text
    assert "ASSISTANT: Hi there" in text

    truncated = truncate_for_context(conv, head=2, tail=2)
    assert len(truncated) == 4  # 4 <= head+tail, so no truncation

    os.unlink(path)
    print(" [PASS] session_reader pipeline works")


def test_validate_fact():
    """Test fact validation."""
    good = {"fact": "Gitea token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9}
    assert validate_fact(good), "Valid fact should pass"

    bad_missing = {"fact": "Something", "category": "fact"}
    assert not validate_fact(bad_missing), "Missing fields should fail"

    bad_category = {"fact": "Something", "category": "nonsense", "repo": "x", "confidence": 0.5}
    assert not validate_fact(bad_category), "Bad category should fail"

    bad_conf = {"fact": "Something", "category": "fact", "repo": "x", "confidence": 1.5}
    assert not validate_fact(bad_conf), "Confidence > 1.0 should fail"

    print(" [PASS] fact validation works")


def test_deduplicate():
    """Test deduplication."""
    existing = [
        {"fact": "Token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9}
    ]
    new = [
        {"fact": "Token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9},  # exact dup
        {"fact": "Deploy uses Ansible on port 22", "category": "pattern", "repo": "fleet", "confidence": 0.8},  # unique
    ]
    result = deduplicate(new, existing)
    assert len(result) == 1, f"Expected 1 unique, got {len(result)}"
    assert result[0]["fact"] == "Deploy uses Ansible on port 22"
    print(" [PASS] deduplication works")


def test_knowledge_store_roundtrip():
    """Test loading and writing knowledge index."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Load empty index
        index = load_existing_knowledge(tmpdir)
        assert index["total_facts"] == 0

        # Write a fact
        new_facts = [{"fact": "Test fact", "category": "fact", "repo": "test", "confidence": 0.9}]

        # Use harvester's write function
        from harvester import write_knowledge
        write_knowledge(index, new_facts, tmpdir, source_session="test.jsonl")

        # Reload and verify
        index2 = load_existing_knowledge(tmpdir)
        assert index2["total_facts"] == 1
        assert index2["facts"][0]["fact"] == "Test fact"
        assert index2["facts"][0]["source_session"] == "test.jsonl"

        # Check markdown was written
        md_path = Path(tmpdir) / "repos" / "test.md"
        assert md_path.exists(), "Markdown file should be created"

    print(" [PASS] knowledge store roundtrip works")


def test_full_chain_no_llm():
    """Test the full pipeline minus the LLM call."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
        f.write('{"role": "user", "content": "Clone compounding-intelligence", "timestamp": "2026-04-13T10:00:00Z"}\n')
        f.write('{"role": "assistant", "content": "Cloned successfully", "timestamp": "2026-04-13T10:00:05Z"}\n')
        session_path = f.name

    with tempfile.TemporaryDirectory() as knowledge_dir:
        # Step 1: Read
        messages = read_session(session_path)
        assert len(messages) == 2

        # Step 2: Extract conversation
        conv = extract_conversation(messages)
        assert len(conv) == 2

        # Step 3: Truncate
        truncated = truncate_for_context(conv, head=50, tail=50)

        # Step 4: Convert to text (this goes to the LLM)
        transcript = messages_to_text(truncated)
        assert "Clone compounding-intelligence" in transcript

        # Step 5: LLM call + validate — we simulate the LLM output here
        mock_facts = [
            {"fact": "compounding-intelligence repo was cloned", "category": "fact", "repo": "compounding-intelligence", "confidence": 0.9}
        ]
        valid = [f for f in mock_facts if validate_fact(f)]

        # Step 6: Deduplicate
        index = load_existing_knowledge(knowledge_dir)
        new_facts = deduplicate(valid, index.get("facts", []))
        assert len(new_facts) == 1

        # Step 7: Store
        from harvester import write_knowledge
        write_knowledge(index, new_facts, knowledge_dir, source_session=session_path)

        # Verify
        index2 = load_existing_knowledge(knowledge_dir)
        assert index2["total_facts"] == 1

    os.unlink(session_path)
    print(" [PASS] full chain (reader -> validate -> dedup -> store) works")


if __name__ == "__main__":
    print("Running harvester pipeline smoke tests...")
    test_session_reader()
    test_validate_fact()
    test_deduplicate()
    test_knowledge_store_roundtrip()
    test_full_chain_no_llm()
    print("\nAll tests passed.")
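For reference, test_validate_fact pins down the contract for validate_fact: required fields, an allowed category set, and confidence bounded to [0, 1]. A minimal sketch that satisfies those assertions; the full category list is an assumption derived from the facts/pitfalls/patterns/quirks/questions schema in the header, and harvester.py may also enforce a minimum confidence threshold that the tests do not exercise.

# Assumed category set; only "fact", "pattern", and "tool-quirk" are
# actually exercised by the tests above.
ALLOWED_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"}


def validate_fact(fact):
    required = ("fact", "category", "repo", "confidence")
    if not all(key in fact for key in required):
        return False  # bad_missing: no repo/confidence -> rejected
    if fact["category"] not in ALLOWED_CATEGORIES:
        return False  # bad_category: "nonsense" -> rejected
    return 0.0 <= fact["confidence"] <= 1.0  # bad_conf: 1.5 -> rejected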