#!/usr/bin/env python3
"""
Smoke test for harvester pipeline — verifies the full chain:
session_reader -> prompt -> LLM (mocked) -> validate -> deduplicate -> store

Does NOT call the real LLM. Tests plumbing only.
"""
import json
import sys
import tempfile
import os
from pathlib import Path

# Setup path so the project-local modules resolve regardless of CWD.
SCRIPT_DIR = Path(__file__).parent.absolute()
sys.path.insert(0, str(SCRIPT_DIR))

from session_reader import read_session, extract_conversation, truncate_for_context, messages_to_text
# write_knowledge is imported here once instead of locally inside two tests.
from harvester import validate_fact, deduplicate, load_existing_knowledge, fact_fingerprint, write_knowledge


def test_session_reader():
    """Test that session_reader parses JSONL correctly."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
        f.write('{"role": "user", "content": "Hello", "timestamp": "2026-04-13T10:00:00Z"}\n')
        f.write('{"role": "assistant", "content": "Hi there", "timestamp": "2026-04-13T10:00:01Z"}\n')
        f.write('{"role": "user", "content": "Clone the repo", "timestamp": "2026-04-13T10:00:02Z"}\n')
        f.write('{"role": "assistant", "content": "Cloned successfully", "timestamp": "2026-04-13T10:00:05Z"}\n')
        path = f.name
    # try/finally ensures the delete=False temp file is removed even when an
    # assertion fails (previously a failed assert leaked the file).
    try:
        messages = read_session(path)
        assert len(messages) == 4, f"Expected 4 messages, got {len(messages)}"

        conv = extract_conversation(messages)
        assert len(conv) == 4, f"Expected 4 conversation turns, got {len(conv)}"

        text = messages_to_text(conv)
        assert "USER: Hello" in text
        assert "ASSISTANT: Hi there" in text

        truncated = truncate_for_context(conv, head=2, tail=2)
        assert len(truncated) == 4  # 4 <= head+tail, so no truncation
    finally:
        os.unlink(path)
    print(" [PASS] session_reader pipeline works")


def test_validate_fact():
    """Test fact validation."""
    good = {"fact": "Gitea token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9}
    assert validate_fact(good), "Valid fact should pass"

    bad_missing = {"fact": "Something", "category": "fact"}
    assert not validate_fact(bad_missing), "Missing fields should fail"

    bad_category = {"fact": "Something", "category": "nonsense", "repo": "x", "confidence": 0.5}
    assert not validate_fact(bad_category), "Bad category should fail"

    bad_conf = {"fact": "Something", "category": "fact", "repo": "x", "confidence": 1.5}
    assert not validate_fact(bad_conf), "Confidence > 1.0 should fail"

    print(" [PASS] fact validation works")


def test_deduplicate():
    """Test deduplication."""
    existing = [
        {"fact": "Token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9}
    ]
    new = [
        {"fact": "Token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9},  # exact dup
        {"fact": "Deploy uses Ansible on port 22", "category": "pattern", "repo": "fleet", "confidence": 0.8},  # unique
    ]
    result = deduplicate(new, existing)
    assert len(result) == 1, f"Expected 1 unique, got {len(result)}"
    assert result[0]["fact"] == "Deploy uses Ansible on port 22"
    print(" [PASS] deduplication works")


def test_knowledge_store_roundtrip():
    """Test loading and writing knowledge index."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Load empty index
        index = load_existing_knowledge(tmpdir)
        assert index["total_facts"] == 0

        # Write a fact
        new_facts = [{"fact": "Test fact", "category": "fact", "repo": "test", "confidence": 0.9}]
        write_knowledge(index, new_facts, tmpdir, source_session="test.jsonl")

        # Reload and verify
        index2 = load_existing_knowledge(tmpdir)
        assert index2["total_facts"] == 1
        assert index2["facts"][0]["fact"] == "Test fact"
        assert index2["facts"][0]["source_session"] == "test.jsonl"

        # Check markdown was written
        md_path = Path(tmpdir) / "repos" / "test.md"
        assert md_path.exists(), "Markdown file should be created"

    print(" [PASS] knowledge store roundtrip works")


def test_full_chain_no_llm():
    """Test the full pipeline minus the LLM call."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
        f.write('{"role": "user", "content": "Clone compounding-intelligence", "timestamp": "2026-04-13T10:00:00Z"}\n')
        f.write('{"role": "assistant", "content": "Cloned successfully", "timestamp": "2026-04-13T10:00:05Z"}\n')
        session_path = f.name
    # try/finally ensures the delete=False session file is removed even when an
    # assertion fails mid-pipeline (previously a failed assert leaked the file).
    try:
        with tempfile.TemporaryDirectory() as knowledge_dir:
            # Step 1: Read
            messages = read_session(session_path)
            assert len(messages) == 2

            # Step 2: Extract conversation
            conv = extract_conversation(messages)
            assert len(conv) == 2

            # Step 3: Truncate
            truncated = truncate_for_context(conv, head=50, tail=50)

            # Step 4: Convert to text (this goes to the LLM)
            transcript = messages_to_text(truncated)
            assert "Clone compounding-intelligence" in transcript

            # Step 5-7: Would be LLM call, validate, deduplicate
            # We simulate LLM output here
            mock_facts = [
                {"fact": "compounding-intelligence repo was cloned", "category": "fact", "repo": "compounding-intelligence", "confidence": 0.9}
            ]
            valid = [f for f in mock_facts if validate_fact(f)]

            # Step 6: Deduplicate
            index = load_existing_knowledge(knowledge_dir)
            new_facts = deduplicate(valid, index.get("facts", []))
            assert len(new_facts) == 1

            # Step 7: Store
            write_knowledge(index, new_facts, knowledge_dir, source_session=session_path)

            # Verify
            index2 = load_existing_knowledge(knowledge_dir)
            assert index2["total_facts"] == 1
    finally:
        os.unlink(session_path)
    print(" [PASS] full chain (reader -> validate -> dedup -> store) works")


if __name__ == "__main__":
    print("Running harvester pipeline smoke tests...")
    test_session_reader()
    test_validate_fact()
    test_deduplicate()
    test_knowledge_store_roundtrip()
    test_full_chain_no_llm()
    print("\nAll tests passed.")