#!/usr/bin/env python3
"""
Comprehensive test script for the knowledge extraction prompt.

Validates prompt structure, requirements, and consistency.
"""

import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

PROMPT_PATH = Path("templates/harvest-prompt.md")


def test_prompt_structure():
    """Test that the prompt has the required structure."""
    if not PROMPT_PATH.exists():
        return False, "harvest-prompt.md not found"

    content = PROMPT_PATH.read_text()

    # Check for required sections (matched case-insensitively).
    required_sections = [
        "System Prompt",
        "Instructions",
        "Categories",
        "Output Format",
        "Confidence Scoring",
        "Constraints",
        "Example",
    ]
    for section in required_sections:
        if section.lower() not in content.lower():
            return False, f"Missing required section: {section}"

    # Check for required knowledge categories.
    required_categories = ["fact", "pitfall", "pattern", "tool-quirk", "question"]
    for category in required_categories:
        if category not in content:
            return False, f"Missing required category: {category}"

    # Check for required output fields.
    required_fields = ["fact", "category", "repo", "confidence"]
    for field in required_fields:
        if field not in content:
            return False, f"Missing required output field: {field}"

    # Check prompt size: the target is ~1k tokens, roughly 4k characters.
    if len(content) > 5000:
        return False, f"Prompt too large: {len(content)} chars (max ~5000)"
    if len(content) < 1000:
        return False, f"Prompt too small: {len(content)} chars (min ~1000)"

    return True, "Prompt structure is valid"


def test_confidence_scoring():
    """Test that confidence scoring is properly defined."""
    if not PROMPT_PATH.exists():
        return False, "harvest-prompt.md not found"

    content = PROMPT_PATH.read_text()

    # Each confidence band must appear alongside its description.
    confidence_levels = [
        ("0.9-1.0", "explicitly stated"),
        ("0.7-0.8", "clearly implied"),
        ("0.5-0.6", "suggested"),
        ("0.3-0.4", "inferred"),
        ("0.1-0.2", "speculative"),
    ]
    for level, description in confidence_levels:
        if level not in content:
            return False, f"Missing confidence level: {level}"
        if description.lower() not in content.lower():
            return False, f"Missing confidence description: {description}"

    return True, "Confidence scoring is properly defined"


def test_example_quality():
    """Test that examples are clear and complete."""
    if not PROMPT_PATH.exists():
        return False, "harvest-prompt.md not found"

    content = PROMPT_PATH.read_text()

    if "example" not in content.lower():
        return False, "No examples provided"

    # Scan from the first occurrence of "example" for a JSON block.
    example_section = content[content.lower().find("example"):]
    json_match = re.search(r'\{[\s\S]*"knowledge"[\s\S]*\}', example_section)
    if not json_match:
        return False, "No JSON example found"

    example_json = json_match.group(0)

    # The example should demonstrate every category.
    for category in ["fact", "pitfall", "pattern", "tool-quirk", "question"]:
        if category not in example_json:
            return False, f"Example missing category: {category}"

    return True, "Examples are clear and complete"


def test_constraint_coverage():
    """Test that constraints cover all requirements."""
    if not PROMPT_PATH.exists():
        return False, "harvest-prompt.md not found"

    content = PROMPT_PATH.read_text()

    required_constraints = [
        "No hallucination",
        "only extract",
        "explicitly",
        "partial",
        "failed sessions",
        "1k tokens",
    ]
    for constraint in required_constraints:
        if constraint.lower() not in content.lower():
            return False, f"Missing constraint: {constraint}"

    return True, "Constraints cover all requirements"


def test_test_sessions():
    """Test that test sessions exist and are valid JSONL."""
    test_sessions_dir = Path("test_sessions")
    if not test_sessions_dir.exists():
        return False, "test_sessions directory not found"

    session_files = list(test_sessions_dir.glob("*.jsonl"))
    if len(session_files) < 5:
        return False, f"Only {len(session_files)} test sessions found, need at least 5"

    # Every line of every session file must be valid JSON.
    for session_file in session_files:
        lines = session_file.read_text().strip().split("\n")
        for i, line in enumerate(lines, 1):
            try:
                json.loads(line)
            except json.JSONDecodeError as e:
                return False, f"Invalid JSON in {session_file.name}, line {i}: {e}"

    return True, f"Found {len(session_files)} valid test sessions"


def run_all_tests():
    """Run all tests and return (all_passed, results)."""
    tests = [
        ("Prompt Structure", test_prompt_structure),
        ("Confidence Scoring", test_confidence_scoring),
        ("Example Quality", test_example_quality),
        ("Constraint Coverage", test_constraint_coverage),
        ("Test Sessions", test_test_sessions),
    ]

    results = []
    all_passed = True
    for test_name, test_func in tests:
        try:
            passed, message = test_func()
        except Exception as e:  # A crashing test counts as a failure.
            passed, message = False, f"Error: {e}"
        results.append({"test": test_name, "passed": passed, "message": message})
        if not passed:
            all_passed = False

    # Print a summary.
    print("=" * 60)
    print("HARVEST PROMPT TEST RESULTS")
    print("=" * 60)
    for result in results:
        status = "✓ PASS" if result["passed"] else "✗ FAIL"
        print(f"{status}: {result['test']}")
        print(f"  {result['message']}")
        print()
    print("=" * 60)
    print("ALL TESTS PASSED!" if all_passed else "SOME TESTS FAILED!")
    print("=" * 60)

    return all_passed, results


if __name__ == "__main__":
    all_passed, results = run_all_tests()

    # Save results to a file, stamped with the actual run time rather
    # than a hardcoded value.
    with open("test_results.json", "w") as f:
        json.dump(
            {
                "all_passed": all_passed,
                "results": results,
                "timestamp": datetime.now(timezone.utc).isoformat(),
            },
            f,
            indent=2,
        )
    print("Results saved to test_results.json")

    # Exit with an appropriate code for CI.
    sys.exit(0 if all_passed else 1)
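
# ---------------------------------------------------------------------------
# Fixture note: test_test_sessions() only requires that test_sessions/ holds
# at least five *.jsonl files in which every line parses as standalone JSON.
# A minimal sketch of an acceptable fixture line is shown below; the field
# names ("role", "content") are illustrative assumptions, not something the
# validator checks:
#
#   {"role": "user", "content": "How should the harvest prompt score facts?"}
# ---------------------------------------------------------------------------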