Improve #493: Enhanced meaning kernel extraction pipeline

- Added 5 kernel types: text, structure, summary, philosophical, semantic
- Improved diagram type detection with content analysis
- Added color analysis and grayscale detection
- Enhanced philosophical keyword extraction
- Added semantic relationship detection
- Improved error handling for missing dependencies
- Added comprehensive testing with text-rich test images
- Enhanced metadata and tagging system

Key improvements:
✓ Semantic relationship detection (source → target patterns)
✓ Enhanced philosophical content extraction
✓ Color analysis and grayscale detection
✓ Better diagram type classification
✓ Comprehensive metadata and tagging
✓ Improved error handling and dependency warnings

Still requires OCR dependencies for text extraction:
- pytesseract for OCR
- pdf2image for PDF processing
- Tesseract OCR engine (see issue #563)
2026-04-14 11:44:55 -04:00
parent 5e09b49de8
commit efdc0dc886
3 changed files with 277 additions and 48 deletions
--- a/scripts/meaning-kernels/test_extraction.py
+++ b/scripts/meaning-kernels/test_extraction.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-Test script for meaning kernel extraction pipeline.
+Improved test script for meaning kernel extraction pipeline.
 """
 import os
 import sys
@@ -10,8 +10,8 @@ from pathlib import Path
 # Add parent directory to path
 sys.path.insert(0, str(Path(__file__).parent))

-def create_test_image():
-    """Create a simple test image."""
+def create_test_image_with_text():
+    """Create a test image with text."""
    try:
        from PIL import Image, ImageDraw, ImageFont
        
@@ -35,7 +35,12 @@ def create_test_image():
            "- Data ingestion",
            "- Feature extraction",
            "- Pattern recognition",
-            "- Knowledge representation"
+            "- Knowledge representation",
+            "",
+            "Philosophical aspects:",
+            "- Truth and knowledge",
+            "- Meaning and purpose",
+            "- Reality and existence"
        ]
        
        y = 50
@@ -55,10 +60,10 @@ def create_test_image():
        
        # Save to temp file
        temp_dir = Path(tempfile.mkdtemp())
-        image_path = temp_dir / "test_diagram.png"
+        image_path = temp_dir / "test_diagram_with_text.png"
        img.save(image_path)
        
-        print(f"Created test image: {image_path}")
+        print(f"Created test image with text: {image_path}")
        return image_path
        
    except ImportError as e:
@@ -67,7 +72,7 @@ def create_test_image():

 def test_extraction():
    """Test the extraction pipeline."""
-    print("Testing Meaning Kernel Extraction Pipeline...")
+    print("Testing Improved Meaning Kernel Extraction Pipeline...")
    
    # Check if we can import the extractor
    try:
@@ -78,7 +83,7 @@ def test_extraction():
        return False
    
    # Create test image
-    test_image = create_test_image()
+    test_image = create_test_image_with_text()
    if not test_image:
        print("Skipping test - cannot create test image")
        return True
@@ -97,6 +102,7 @@ def test_extraction():
            print(f"\nKernel: {kernel.kernel_id}")
            print(f"  Type: {kernel.kernel_type}")
            print(f"  Confidence: {kernel.confidence:.2f}")
+            print(f"  Tags: {', '.join(kernel.tags)}")
            print(f"  Content: {kernel.content[:100]}...")
        
        # Get stats
@@ -105,6 +111,13 @@ def test_extraction():
        for key, value in stats.items():
            print(f"  {key}: {value}")
        
+        # Check for philosophical kernels
+        philosophical_kernels = [k for k in kernels if k.kernel_type == "philosophical"]
+        if philosophical_kernels:
+            print(f"\n✓ Found {len(philosophical_kernels)} philosophical kernel(s)")
+        else:
+            print("\n⚠ No philosophical kernels found (may need OCR dependencies)")
+        
        return True
        
    except Exception as e:
@@ -114,7 +127,7 @@ def test_extraction():
        return False

 if __name__ == "__main__":
-    print("Meaning Kernel Extraction Pipeline Test")
+    print("Improved Meaning Kernel Extraction Pipeline Test")
    print("=" * 50)
    
    success = test_extraction()