""" Tests for session_pair_harvester — training pair extraction from sessions. """ import json import tempfile import unittest from pathlib import Path import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent / "scripts")) from session_pair_harvester import ( extract_pairs_from_conversation, extract_from_jsonl_file, deduplicate_pairs, compute_hash, ) class TestSessionPairHarvester(unittest.TestCase): def test_compute_hash_consistent(self): h1 = compute_hash("hello world") h2 = compute_hash("hello world") self.assertEqual(h1, h2) self.assertEqual(len(h1), 16) def test_extract_simple_qa_pair(self): """A simple user→assistant exchange produces one pair.""" conversation = [ {"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris. It is a major European city renowned for its art, fashion, gastronomy, cultural heritage, and historical significance. The city attracts millions of tourists annually."}, ] pairs = extract_pairs_from_conversation(conversation, "test_session", "test-model") self.assertEqual(len(pairs), 1) self.assertEqual(pairs[0]["terse"], "What is the capital of France?") self.assertIn("Paris", pairs[0]["rich"]) self.assertEqual(pairs[0]["source"], "test_session") def test_min_ratio_filter(self): """Very short responses are filtered out.""" conversation = [ {"role": "user", "content": "Yes"}, {"role": "assistant", "content": "No."}, ] # Default min_ratio = 1.5, min_words = 20 for response pairs = extract_pairs_from_conversation(conversation, "s", "m", min_response_words=3) self.assertEqual(len(pairs), 0) def test_min_words_filter(self): """Assistant responses below min word count are skipped.""" conversation = [ {"role": "user", "content": "Explain the project architecture in detail"}, {"role": "assistant", "content": "OK."}, ] pairs = extract_pairs_from_conversation(conversation, "s", "m", min_response_words=5) self.assertEqual(len(pairs), 0) def test_skip_non_assistant_messages(self): """System and tool messages are ignored.""" conversation = [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there! How can I help you today?"}, ] pairs = extract_pairs_from_conversation(conversation, "s", "m", min_response_words=3) self.assertEqual(len(pairs), 1) self.assertEqual(pairs[0]["terse"], "Hello") def test_multiple_pairs_from_one_session(self): """A conversation with several Q&A turns yields multiple pairs.""" conversation = [ {"role": "user", "content": "First question?"}, {"role": "assistant", "content": "Here is a detailed and comprehensive answer that thoroughly explores multiple aspects of the subject. It provides background context and practical implications for the reader."}, {"role": "user", "content": "Second?"}, {"role": "assistant", "content": "Another comprehensive response with detailed examples. This includes practical code blocks and thorough explanations to ensure deep understanding of the topic at hand."}, ] pairs = extract_pairs_from_conversation(conversation, "s", "m", min_ratio=1.0) self.assertEqual(len(pairs), 2) def test_deduplication_removes_duplicates(self): """Identical pairs across sessions are deduplicated.""" pairs = [ {"terse": "q1", "rich": "a1", "source": "s1", "model": "m"}, {"terse": "q1", "rich": "a1", "source": "s2", "model": "m"}, {"terse": "q2", "rich": "a2", "source": "s1", "model": "m"}, ] unique = deduplicate_pairs(pairs) self.assertEqual(len(unique), 2) sources = {p["source"] for p in unique} # First unique pair can be from either s1 or s2 self.assertIn("s1", sources) def test_integration_with_test_sessions(self): """Harvester finds pairs in real test session files.""" repo_root = Path(__file__).parent.parent test_sessions_dir = repo_root / "test_sessions" if not test_sessions_dir.exists(): self.skipTest("test_sessions not found") pairs = [] for jsonl_file in sorted(test_sessions_dir.glob("*.jsonl")): pairs.extend(extract_from_jsonl_file(str(jsonl_file))) self.assertGreater(len(pairs), 0, "Should extract at least one pair from test_sessions") for p in pairs: self.assertIn("terse", p) self.assertIn("rich", p) self.assertIn("source", p) self.assertIn("model", p) # Verify content exists self.assertGreater(len(p["terse"]), 0) self.assertGreater(len(p["rich"]), 0) if __name__ == "__main__": unittest.main()