Improve #493: Enhanced meaning kernel extraction pipeline
- Added 5 kernel types: text, structure, summary, philosophical, semantic
- Improved diagram type detection with content analysis
- Added color analysis and grayscale detection
- Enhanced philosophical keyword extraction
- Added semantic relationship detection
- Improved error handling for missing dependencies
- Added comprehensive testing with text-rich test images
- Enhanced metadata and tagging system

Key improvements:
✓ Semantic relationship detection (source → target patterns)
✓ Enhanced philosophical content extraction
✓ Color analysis and grayscale detection
✓ Better diagram type classification
✓ Comprehensive metadata and tagging
✓ Improved error handling and dependency warnings

Still requires OCR dependencies for text extraction:
- pytesseract for OCR
- pdf2image for PDF processing
- Tesseract OCR engine (see issue #563)
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for meaning kernel extraction pipeline.
|
||||
Improved test script for meaning kernel extraction pipeline.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
@@ -10,8 +10,8 @@ from pathlib import Path
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
def create_test_image():
|
||||
"""Create a simple test image."""
|
||||
def create_test_image_with_text():
|
||||
"""Create a test image with text."""
|
||||
try:
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
@@ -35,7 +35,12 @@ def create_test_image():
|
||||
"- Data ingestion",
|
||||
"- Feature extraction",
|
||||
"- Pattern recognition",
|
||||
"- Knowledge representation"
|
||||
"- Knowledge representation",
|
||||
"",
|
||||
"Philosophical aspects:",
|
||||
"- Truth and knowledge",
|
||||
"- Meaning and purpose",
|
||||
"- Reality and existence"
|
||||
]
|
||||
|
||||
y = 50
|
||||
@@ -55,10 +60,10 @@ def create_test_image():
|
||||
|
||||
# Save to temp file
|
||||
temp_dir = Path(tempfile.mkdtemp())
|
||||
image_path = temp_dir / "test_diagram.png"
|
||||
image_path = temp_dir / "test_diagram_with_text.png"
|
||||
img.save(image_path)
|
||||
|
||||
print(f"Created test image: {image_path}")
|
||||
print(f"Created test image with text: {image_path}")
|
||||
return image_path
|
||||
|
||||
except ImportError as e:
|
||||
@@ -67,7 +72,7 @@ def create_test_image():
|
||||
|
||||
def test_extraction():
|
||||
"""Test the extraction pipeline."""
|
||||
print("Testing Meaning Kernel Extraction Pipeline...")
|
||||
print("Testing Improved Meaning Kernel Extraction Pipeline...")
|
||||
|
||||
# Check if we can import the extractor
|
||||
try:
|
||||
@@ -78,7 +83,7 @@ def test_extraction():
|
||||
return False
|
||||
|
||||
# Create test image
|
||||
test_image = create_test_image()
|
||||
test_image = create_test_image_with_text()
|
||||
if not test_image:
|
||||
print("Skipping test - cannot create test image")
|
||||
return True
|
||||
@@ -97,6 +102,7 @@ def test_extraction():
|
||||
print(f"\nKernel: {kernel.kernel_id}")
|
||||
print(f" Type: {kernel.kernel_type}")
|
||||
print(f" Confidence: {kernel.confidence:.2f}")
|
||||
print(f" Tags: {', '.join(kernel.tags)}")
|
||||
print(f" Content: {kernel.content[:100]}...")
|
||||
|
||||
# Get stats
|
||||
@@ -105,6 +111,13 @@ def test_extraction():
|
||||
for key, value in stats.items():
|
||||
print(f" {key}: {value}")
|
||||
|
||||
# Check for philosophical kernels
|
||||
philosophical_kernels = [k for k in kernels if k.kernel_type == "philosophical"]
|
||||
if philosophical_kernels:
|
||||
print(f"\n✓ Found {len(philosophical_kernels)} philosophical kernel(s)")
|
||||
else:
|
||||
print("\n⚠ No philosophical kernels found (may need OCR dependencies)")
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
@@ -114,7 +127,7 @@ def test_extraction():
|
||||
return False
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Meaning Kernel Extraction Pipeline Test")
|
||||
print("Improved Meaning Kernel Extraction Pipeline Test")
|
||||
print("=" * 50)
|
||||
|
||||
success = test_extraction()
|
||||
|
||||
Reference in New Issue
Block a user