Main harvester module that chains: session_reader → extraction prompt → LLM → validate → deduplicate → store

Includes:
- scripts/harvester.py — main module (reader + prompt + storage pipeline)
- scripts/session_reader.py — JSONL transcript parser
- scripts/test_harvester_pipeline.py — smoke tests (all passing)

Pipeline:
1. Read session JSONL via session_reader
2. Truncate long sessions (first 50 + last 50 messages)
3. Send transcript + extraction prompt to LLM (mimo-v2-pro)
4. Parse structured JSON response (facts/pitfalls/patterns/quirks/questions)
5. Validate fields + confidence threshold
6. Deduplicate against knowledge/index.json (fingerprint + word overlap)
7. Write to knowledge store (index.json + per-repo markdown)

CLI:
- Single:  python3 harvester.py --session <path> --output knowledge/
- Batch:   python3 harvester.py --batch --since 2026-04-01 --limit 100
- Dry-run: python3 harvester.py --session <path> --dry-run
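The fingerprint + word-overlap deduplication named in step 6 lives in harvester.py and is only imported by the smoke tests below. As a minimal sketch of the idea: the normalization, the hashing scheme, and the 0.8 overlap threshold here are illustrative assumptions, not harvester.py's actual values.

import hashlib
import re


def fact_fingerprint(fact: dict) -> str:
    # Normalize the fact text (lowercase, strip punctuation, collapse
    # whitespace) and hash it, so exact duplicates collide regardless
    # of formatting. The normalization rules are assumed.
    text = re.sub(r"[^a-z0-9 ]", "", fact["fact"].lower())
    return hashlib.sha256(" ".join(text.split()).encode()).hexdigest()


def word_overlap(a: str, b: str) -> float:
    # Jaccard overlap of the two word sets; 1.0 means identical vocabulary.
    wa, wb = set(a.lower().split()), set(b.lower().split())
    return len(wa & wb) / len(wa | wb) if (wa or wb) else 0.0


def deduplicate(new_facts, existing, threshold=0.8):
    # Drop a new fact if an existing (or already-accepted) fact has the same
    # fingerprint, or if any existing fact's wording overlaps too heavily.
    # The 0.8 cutoff is an assumed value for illustration.
    seen = {fact_fingerprint(f) for f in existing}
    kept = []
    for fact in new_facts:
        fp = fact_fingerprint(fact)
        if fp in seen:
            continue
        if any(word_overlap(fact["fact"], e["fact"]) >= threshold for e in existing):
            continue
        kept.append(fact)
        seen.add(fp)
    return kept

This matches the behavior the tests below assert: an exact duplicate is dropped, a distinct fact survives.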
#!/usr/bin/env python3
"""
Smoke test for harvester pipeline — verifies the full chain:
session_reader -> prompt -> LLM (mocked) -> validate -> deduplicate -> store

Does NOT call the real LLM. Tests plumbing only.
"""

import json
import sys
import tempfile
import os
from pathlib import Path

# Make session_reader and harvester importable regardless of the caller's CWD
SCRIPT_DIR = Path(__file__).parent.absolute()
sys.path.insert(0, str(SCRIPT_DIR))

from session_reader import read_session, extract_conversation, truncate_for_context, messages_to_text
from harvester import validate_fact, deduplicate, load_existing_knowledge, fact_fingerprint


def test_session_reader():
    """Test that session_reader parses JSONL correctly."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
        f.write('{"role": "user", "content": "Hello", "timestamp": "2026-04-13T10:00:00Z"}\n')
        f.write('{"role": "assistant", "content": "Hi there", "timestamp": "2026-04-13T10:00:01Z"}\n')
        f.write('{"role": "user", "content": "Clone the repo", "timestamp": "2026-04-13T10:00:02Z"}\n')
        f.write('{"role": "assistant", "content": "Cloned successfully", "timestamp": "2026-04-13T10:00:05Z"}\n')
        path = f.name

    messages = read_session(path)
    assert len(messages) == 4, f"Expected 4 messages, got {len(messages)}"

    conv = extract_conversation(messages)
    assert len(conv) == 4, f"Expected 4 conversation turns, got {len(conv)}"

    text = messages_to_text(conv)
    assert "USER: Hello" in text
    assert "ASSISTANT: Hi there" in text

    truncated = truncate_for_context(conv, head=2, tail=2)
    assert len(truncated) == 4  # 4 <= head+tail, so no truncation

    os.unlink(path)
    print(" [PASS] session_reader pipeline works")


def test_validate_fact():
    """Test fact validation."""
    good = {"fact": "Gitea token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9}
    assert validate_fact(good), "Valid fact should pass"

    bad_missing = {"fact": "Something", "category": "fact"}
    assert not validate_fact(bad_missing), "Missing fields should fail"

    bad_category = {"fact": "Something", "category": "nonsense", "repo": "x", "confidence": 0.5}
    assert not validate_fact(bad_category), "Bad category should fail"

    bad_conf = {"fact": "Something", "category": "fact", "repo": "x", "confidence": 1.5}
    assert not validate_fact(bad_conf), "Confidence > 1.0 should fail"

    print(" [PASS] fact validation works")


def test_deduplicate():
    """Test deduplication."""
    existing = [
        {"fact": "Token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9}
    ]
    new = [
        {"fact": "Token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9},  # exact dup
        {"fact": "Deploy uses Ansible on port 22", "category": "pattern", "repo": "fleet", "confidence": 0.8},  # unique
    ]
    result = deduplicate(new, existing)
    assert len(result) == 1, f"Expected 1 unique, got {len(result)}"
    assert result[0]["fact"] == "Deploy uses Ansible on port 22"
    print(" [PASS] deduplication works")


def test_knowledge_store_roundtrip():
    """Test loading and writing knowledge index."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Load empty index
        index = load_existing_knowledge(tmpdir)
        assert index["total_facts"] == 0

        # Write a fact
        new_facts = [{"fact": "Test fact", "category": "fact", "repo": "test", "confidence": 0.9}]

        # Use harvester's write function
        from harvester import write_knowledge
        write_knowledge(index, new_facts, tmpdir, source_session="test.jsonl")

        # Reload and verify
        index2 = load_existing_knowledge(tmpdir)
        assert index2["total_facts"] == 1
        assert index2["facts"][0]["fact"] == "Test fact"
        assert index2["facts"][0]["source_session"] == "test.jsonl"

        # Check markdown was written
        md_path = Path(tmpdir) / "repos" / "test.md"
        assert md_path.exists(), "Markdown file should be created"

    print(" [PASS] knowledge store roundtrip works")


def test_full_chain_no_llm():
    """Test the full pipeline minus the LLM call."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
        f.write('{"role": "user", "content": "Clone compounding-intelligence", "timestamp": "2026-04-13T10:00:00Z"}\n')
        f.write('{"role": "assistant", "content": "Cloned successfully", "timestamp": "2026-04-13T10:00:05Z"}\n')
        session_path = f.name

    with tempfile.TemporaryDirectory() as knowledge_dir:
        # Step 1: Read
        messages = read_session(session_path)
        assert len(messages) == 2

        # Step 2: Extract conversation
        conv = extract_conversation(messages)
        assert len(conv) == 2

        # Step 3: Truncate
        truncated = truncate_for_context(conv, head=50, tail=50)

        # Step 4: Convert to text (this goes to the LLM)
        transcript = messages_to_text(truncated)
        assert "Clone compounding-intelligence" in transcript

        # Step 5: LLM call + validate — we simulate the LLM output here
        mock_facts = [
            {"fact": "compounding-intelligence repo was cloned", "category": "fact", "repo": "compounding-intelligence", "confidence": 0.9}
        ]
        valid = [f for f in mock_facts if validate_fact(f)]

        # Step 6: Deduplicate
        index = load_existing_knowledge(knowledge_dir)
        new_facts = deduplicate(valid, index.get("facts", []))
        assert len(new_facts) == 1

        # Step 7: Store
        from harvester import write_knowledge
        write_knowledge(index, new_facts, knowledge_dir, source_session=session_path)

        # Verify
        index2 = load_existing_knowledge(knowledge_dir)
        assert index2["total_facts"] == 1

    os.unlink(session_path)
    print(" [PASS] full chain (reader -> validate -> dedup -> store) works")


if __name__ == "__main__":
    print("Running harvester pipeline smoke tests...")
    test_session_reader()
    test_validate_fact()
    test_deduplicate()
    test_knowledge_store_roundtrip()
    test_full_chain_no_llm()
    print("\nAll tests passed.")
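For reference, test_validate_fact pins down the contract for validate_fact: required fields, an allowed category set, and confidence bounded to [0, 1]. A minimal sketch that satisfies those assertions; the full category list is an assumption derived from the facts/pitfalls/patterns/quirks/questions schema in the header, and harvester.py may also enforce a minimum confidence threshold that the tests do not exercise.

# Assumed category set; only "fact", "pattern", and "tool-quirk" are
# actually exercised by the tests above.
ALLOWED_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"}


def validate_fact(fact):
    required = ("fact", "category", "repo", "confidence")
    if not all(key in fact for key in required):
        return False  # bad_missing: no repo/confidence -> rejected
    if fact["category"] not in ALLOWED_CATEGORIES:
        return False  # bad_category: "nonsense" -> rejected
    return 0.0 <= fact["confidence"] <= 1.0  # bad_conf: 1.5 -> rejected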