diff --git a/scripts/meaning-kernels/README.md b/scripts/meaning-kernels/README.md new file mode 100644 index 00000000..51b332f7 --- /dev/null +++ b/scripts/meaning-kernels/README.md @@ -0,0 +1,157 @@ +# Meaning Kernel Extraction Pipeline + +## Issue #493: [Multimodal] Extract Meaning Kernels from Research Diagrams + +## Overview + +This pipeline extracts structured meaning kernels from academic PDF diagrams and images. It processes visual content to generate machine-readable text representations. + +## Features + +- **PDF Processing**: Converts PDF pages to images for analysis +- **OCR Text Extraction**: Extracts text from diagrams using Tesseract +- **Structure Analysis**: Analyzes diagram type, dimensions, orientation +- **Multiple Kernel Types**: Generates text, structure, summary, and philosophical kernels +- **Confidence Scoring**: Each kernel includes confidence metrics +- **Batch Processing**: Supports single files and directories + +## Installation + +```bash +# Required dependencies +pip install Pillow pytesseract pdf2image + +# System dependencies (macOS) +brew install tesseract poppler + +# System dependencies (Ubuntu/Debian) +sudo apt-get install tesseract-ocr poppler-utils +``` + +## Usage + +```bash +# Process a single PDF +python3 scripts/meaning-kernels/extract_meaning_kernels.py research_paper.pdf + +# Process a single image +python3 scripts/meaning-kernels/extract_meaning_kernels.py diagram.png + +# Process a directory +python3 scripts/meaning-kernels/extract_meaning_kernels.py /path/to/diagrams/ + +# Specify output directory +python3 scripts/meaning-kernels/extract_meaning_kernels.py paper.pdf -o ./output + +# Run tests +python3 scripts/meaning-kernels/test_extraction.py +``` + +## Output Structure + +``` +output_directory/ +├── page_001.png # Converted page images +├── page_002.png +├── meaning_kernels.json # Structured kernel data +├── meaning_kernels.md # Human-readable report +└── extraction_stats.json # Processing statistics +``` + +## 
Kernel Types + +### 1. Text Kernels +Extracted from OCR processing of diagrams. +```json +{ + "kernel_id": "kernel_20260413_123456_p1_text", + "content": "Extracted text from diagram", + "kernel_type": "text", + "confidence": 0.85, + "metadata": { + "word_count": 42, + "diagram_type": "flowchart" + } +} +``` + +### 2. Structure Kernels +Diagram structure analysis. +```json +{ + "kernel_id": "kernel_20260413_123456_p1_structure", + "content": "Diagram type: flowchart. Dimensions: 800x600. Aspect ratio: 1.33.", + "kernel_type": "structure", + "confidence": 0.9, + "metadata": { + "dimensions": {"width": 800, "height": 600}, + "aspect_ratio": 1.33, + "diagram_type": "flowchart" + } +} +``` + +### 3. Summary Kernels +Combined analysis summary. +```json +{ + "kernel_id": "kernel_20260413_123456_p1_summary", + "content": "Research diagram analysis: flowchart diagram. Contains text: Input → Processing → Output...", + "kernel_type": "summary", + "confidence": 0.7, + "metadata": { + "has_text": true, + "text_length": 150 + } +} +``` + +### 4. Philosophical Kernels +Extracted philosophical themes (when detected). +```json +{ + "kernel_id": "kernel_20260413_123456_p1_philosophical", + "content": "Philosophical themes detected: knowledge, truth. 
Source text explores concepts of knowledge.", + "kernel_type": "philosophical", + "confidence": 0.6, + "metadata": { + "extraction_method": "keyword_analysis", + "source_text_length": 200 + } +} +``` + +## Configuration + +Create a JSON config file: +```json +{ + "ocr_confidence_threshold": 50, + "min_text_length": 10, + "diagram_types": ["flowchart", "hierarchy", "network"], + "extract_philosophical": true, + "philosophical_keywords": ["truth", "knowledge", "wisdom", "meaning"] +} +``` + +## Limitations + +- OCR quality depends on diagram clarity +- Structure analysis is simplified +- Philosophical extraction is keyword-based +- Large PDFs can be resource-intensive + +## Future Enhancements + +- Computer vision for diagram element detection +- LLM integration for semantic analysis +- Specialized processors for different diagram types +- Integration with knowledge graphs +- API endpoint for web integration + +## Files + +- `extract_meaning_kernels.py` - Main extraction pipeline +- `test_extraction.py` - Test script +- `requirements.txt` - Python dependencies +- `README.md` - This documentation diff --git a/scripts/meaning-kernels/__pycache__/extract_meaning_kernels.cpython-312.pyc b/scripts/meaning-kernels/__pycache__/extract_meaning_kernels.cpython-312.pyc new file mode 100644 index 00000000..70dba65e Binary files /dev/null and b/scripts/meaning-kernels/__pycache__/extract_meaning_kernels.cpython-312.pyc differ diff --git a/scripts/meaning-kernels/extract_meaning_kernels.py b/scripts/meaning-kernels/extract_meaning_kernels.py new file mode 100755 index 00000000..2af13bca --- /dev/null +++ b/scripts/meaning-kernels/extract_meaning_kernels.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 +""" +Meaning Kernel Extraction Pipeline +Extract structured meaning kernels from academic PDF diagrams. 
"""
Meaning Kernel Extraction Pipeline
Extract structured meaning kernels from academic PDF diagrams.

Issue #493: [Multimodal] Extract Meaning Kernels from Research Diagrams
"""
import os
import sys
import json
import argparse
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
import hashlib

# Optional third-party imports: the pipeline is meant to degrade gracefully
# (warn and disable a feature) instead of failing at import time.
try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    print("Warning: PIL not available. Install with: pip install Pillow")

try:
    import pytesseract
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False
    print("Warning: pytesseract not available. Install with: pip install pytesseract")

try:
    import pdf2image
    PDF2IMAGE_AVAILABLE = True
except ImportError:
    PDF2IMAGE_AVAILABLE = False
    print("Warning: pdf2image not available. Install with: pip install pdf2image")


class MeaningKernel:
    """One extracted unit of meaning from a diagram.

    A kernel carries the extracted content plus provenance (source path,
    timestamp, content hash) and a confidence score. ``kernel_type`` is one
    of: text, structure, summary, philosophical.
    """

    def __init__(self, kernel_id: str, content: str, source: str,
                 kernel_type: str = "text", confidence: float = 0.0,
                 metadata: Optional[Dict[str, Any]] = None):
        # BUGFIX: `metadata` (and the analogous `config` params below) default
        # to None, so the annotation must be Optional[...], not Dict[...].
        self.kernel_id = kernel_id
        self.content = content
        self.source = source
        self.kernel_type = kernel_type  # text, structure, summary, philosophical
        self.confidence = confidence
        self.metadata = metadata or {}
        self.timestamp = datetime.now().isoformat()
        self.hash = self._generate_hash()

    def _generate_hash(self) -> str:
        """Generate a 16-hex-char hash unique to this kernel's identity fields."""
        content_str = f"{self.kernel_id}:{self.content}:{self.source}:{self.timestamp}"
        return hashlib.sha256(content_str.encode()).hexdigest()[:16]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary for JSON serialization."""
        return {
            "kernel_id": self.kernel_id,
            "content": self.content,
            "source": self.source,
            "kernel_type": self.kernel_type,
            "confidence": self.confidence,
            "metadata": self.metadata,
            "timestamp": self.timestamp,
            "hash": self.hash,
        }

    def __str__(self) -> str:
        return f"Kernel[{self.kernel_id}]: {self.content[:100]}..."


class DiagramAnalyzer:
    """Analyze diagram images: basic metrics, OCR text, and a type heuristic."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}

    def analyze_image(self, image_path: str) -> Dict[str, Any]:
        """Analyze one image and return a dict of findings.

        Raises:
            ImportError: if PIL is not installed.
        """
        if not PIL_AVAILABLE:
            raise ImportError("PIL is required for image analysis")

        image = Image.open(image_path)

        # Basic image analysis
        analysis = {
            "dimensions": {"width": image.width, "height": image.height},
            "aspect_ratio": image.width / image.height,
            "mode": image.mode,
            "format": image.format,
            "size_bytes": os.path.getsize(image_path),
        }

        # OCR text extraction — best effort: an OCR failure is recorded in
        # the analysis dict ("ocr_error") rather than aborting the pipeline.
        if TESSERACT_AVAILABLE:
            try:
                ocr_text = pytesseract.image_to_string(image)
                analysis["ocr_text"] = ocr_text.strip()
                analysis["ocr_confidence"] = self._estimate_ocr_confidence(image)
            except Exception as e:
                analysis["ocr_text"] = ""
                analysis["ocr_confidence"] = 0.0
                analysis["ocr_error"] = str(e)

        analysis["diagram_type"] = self._estimate_diagram_type(image)

        return analysis

    # BUGFIX: the `Image.Image` annotations below are quoted. Unquoted, they
    # are evaluated when the class is created, so a missing PIL raised
    # NameError at import time — defeating the try/except import guard above.
    def _estimate_ocr_confidence(self, image: "Image.Image") -> float:
        """Estimate OCR confidence (simplified placeholder).

        In reality, would use pytesseract's confidence output.
        """
        return 0.8  # Placeholder

    def _estimate_diagram_type(self, image: "Image.Image") -> str:
        """Classify the diagram by aspect-ratio heuristics (width/height)."""
        width, height = image.size
        aspect_ratio = width / height

        if aspect_ratio > 2:
            return "flowchart"
        elif aspect_ratio < 0.5:
            return "vertical_hierarchy"
        elif 0.8 <= aspect_ratio <= 1.2:
            return "square_diagram"
        else:
            return "standard_diagram"
class MeaningKernelExtractor:
    """Extract meaning kernels from diagrams (PDF pages or single images).

    Tracks running statistics in ``self.stats`` and accumulates every
    generated kernel in ``self.kernels``.
    """

    # NOTE: return annotations referencing MeaningKernel are strings so the
    # methods don't depend on definition order when the class is evaluated.
    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.analyzer = DiagramAnalyzer(config)
        self.kernels: List["MeaningKernel"] = []
        self.stats = {
            "pages_processed": 0,
            "diagrams_analyzed": 0,
            "kernels_extracted": 0,
            "errors": 0,
        }

    def extract_from_pdf(self, pdf_path: str, output_dir: Optional[str] = None) -> "List[MeaningKernel]":
        """Extract meaning kernels from every page of a PDF.

        Pages are rendered to PNG at 300 dpi under the output directory
        (default: "<pdf stem>_kernels" next to the PDF), then each page image
        is analyzed. Returns all kernels; on conversion failure returns an
        empty list and increments stats["errors"].

        Raises:
            ImportError: if pdf2image is not installed.
            FileNotFoundError: if the PDF does not exist.
        """
        if not PDF2IMAGE_AVAILABLE:
            raise ImportError("pdf2image is required for PDF processing")

        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        print(f"Processing PDF: {pdf_path}")

        # Create output directory
        if output_dir:
            output_path = Path(output_dir)
        else:
            output_path = pdf_path.parent / f"{pdf_path.stem}_kernels"
        output_path.mkdir(parents=True, exist_ok=True)

        # Convert PDF to images
        try:
            from pdf2image import convert_from_path
            images = convert_from_path(pdf_path, dpi=300)
            print(f"Converted {len(images)} pages to images")
        except Exception as e:
            print(f"Error converting PDF: {e}")
            self.stats["errors"] += 1
            return []

        # Process each page
        all_kernels = []
        for page_num, image in enumerate(images, start=1):
            print(f"Processing page {page_num}/{len(images)}")

            # Persist the rendered page so the report can reference it
            temp_image_path = output_path / f"page_{page_num:03d}.png"
            image.save(temp_image_path)

            page_kernels = self.extract_from_image(temp_image_path, page_num)
            all_kernels.extend(page_kernels)

            self.stats["pages_processed"] += 1

        self._save_kernels(all_kernels, output_path)

        return all_kernels

    def extract_from_image(self, image_path: str, page_num: Optional[int] = None) -> "List[MeaningKernel]":
        """Extract meaning kernels from a single image.

        Returns an empty list (and increments stats["errors"]) if analysis fails.
        """
        print(f"Processing image: {image_path}")

        try:
            analysis = self.analyzer.analyze_image(str(image_path))
        except Exception as e:
            print(f"Error analyzing image: {e}")
            self.stats["errors"] += 1
            return []

        kernels = self._generate_kernels(analysis, str(image_path), page_num)

        self.stats["diagrams_analyzed"] += 1
        self.stats["kernels_extracted"] += len(kernels)

        return kernels

    def _generate_kernels(self, analysis: Dict[str, Any], source: str,
                          page_num: Optional[int] = None) -> "List[MeaningKernel]":
        """Turn one image analysis into text/structure/summary/philosophical kernels."""
        kernels = []

        # Create base ID
        base_id = f"kernel_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        # BUGFIX: was `if page_num:`, which silently dropped a falsy page
        # number (e.g. 0) from the kernel id.
        if page_num is not None:
            base_id += f"_p{page_num}"

        # 1. Text kernel (from OCR)
        if analysis.get("ocr_text"):
            text_kernel = MeaningKernel(
                kernel_id=f"{base_id}_text",
                content=analysis["ocr_text"],
                source=source,
                kernel_type="text",
                confidence=analysis.get("ocr_confidence", 0.0),
                metadata={
                    "word_count": len(analysis["ocr_text"].split()),
                    "diagram_type": analysis.get("diagram_type", "unknown"),
                },
            )
            kernels.append(text_kernel)

        # 2. Structure kernel
        structure_content = f"Diagram type: {analysis.get('diagram_type', 'unknown')}. "
        structure_content += f"Dimensions: {analysis['dimensions']['width']}x{analysis['dimensions']['height']}. "
        structure_content += f"Aspect ratio: {analysis['aspect_ratio']:.2f}."

        structure_kernel = MeaningKernel(
            kernel_id=f"{base_id}_structure",
            content=structure_content,
            source=source,
            kernel_type="structure",
            confidence=0.9,
            metadata={
                "dimensions": analysis["dimensions"],
                "aspect_ratio": analysis["aspect_ratio"],
                "diagram_type": analysis.get("diagram_type", "unknown"),
            },
        )
        kernels.append(structure_kernel)

        # 3. Summary kernel
        summary = f"Research diagram analysis: {analysis.get('diagram_type', 'unknown')} diagram. "
        if analysis.get("ocr_text"):
            summary += f"Contains text: {analysis['ocr_text'][:200]}..."
        else:
            summary += "No text detected."

        summary_kernel = MeaningKernel(
            kernel_id=f"{base_id}_summary",
            content=summary,
            source=source,
            kernel_type="summary",
            confidence=0.7,
            metadata={
                "has_text": bool(analysis.get("ocr_text")),
                "text_length": len(analysis.get("ocr_text", "")),
            },
        )
        kernels.append(summary_kernel)

        # 4. Philosophical kernel (only when enough text was extracted)
        if analysis.get("ocr_text") and len(analysis["ocr_text"]) > 50:
            philosophical_content = self._extract_philosophical_content(analysis["ocr_text"])
            if philosophical_content:
                philosophical_kernel = MeaningKernel(
                    kernel_id=f"{base_id}_philosophical",
                    content=philosophical_content,
                    source=source,
                    kernel_type="philosophical",
                    confidence=0.6,
                    metadata={
                        "extraction_method": "keyword_analysis",
                        "source_text_length": len(analysis["ocr_text"]),
                    },
                )
                kernels.append(philosophical_kernel)

        # Add to internal list
        self.kernels.extend(kernels)

        return kernels

    def _extract_philosophical_content(self, text: str) -> Optional[str]:
        """Extract philosophical content from text (simplified keyword scan).

        NOTE(review): uses substring matching, so e.g. "untruthful" matches
        "truth" — acceptable for this heuristic, but a word-boundary match
        would be stricter.
        """
        philosophical_keywords = [
            "truth", "knowledge", "wisdom", "meaning", "purpose",
            "existence", "reality", "consciousness", "ethics", "morality",
            "beauty", "justice", "freedom", "responsibility", "identity",
        ]

        text_lower = text.lower()
        found_keywords = [kw for kw in philosophical_keywords if kw in text_lower]

        if found_keywords:
            return (f"Philosophical themes detected: {', '.join(found_keywords)}. "
                    f"Source text explores concepts of {found_keywords[0]}.")

        return None

    def _save_kernels(self, kernels: "List[MeaningKernel]", output_path: Path):
        """Write kernels as JSON + Markdown report, plus extraction statistics."""
        if not kernels:
            print("No kernels to save")
            return

        # Save as JSON
        json_path = output_path / "meaning_kernels.json"
        kernels_data = [k.to_dict() for k in kernels]

        with open(json_path, 'w') as f:
            json.dump(kernels_data, f, indent=2)

        # Save as Markdown
        md_path = output_path / "meaning_kernels.md"
        with open(md_path, 'w') as f:
            f.write("# Meaning Kernels Extraction Report\n")
            f.write(f"Generated: {datetime.now().isoformat()}\n")
            f.write(f"Total kernels: {len(kernels)}\n\n")

            # Group by type for a readable report
            by_type = {}
            for kernel in kernels:
                by_type.setdefault(kernel.kernel_type, []).append(kernel)

            for kernel_type, type_kernels in by_type.items():
                f.write(f"## {kernel_type.title()} Kernels ({len(type_kernels)})\n\n")
                for kernel in type_kernels:
                    f.write(f"### {kernel.kernel_id}\n")
                    f.write(f"- **Source**: {kernel.source}\n")
                    f.write(f"- **Confidence**: {kernel.confidence:.2f}\n")
                    f.write(f"- **Timestamp**: {kernel.timestamp}\n")
                    f.write(f"- **Content**: {kernel.content}\n")
                    f.write(f"- **Metadata**: {json.dumps(kernel.metadata, indent=2)}\n\n")

        # Save statistics
        stats_path = output_path / "extraction_stats.json"
        with open(stats_path, 'w') as f:
            json.dump(self.stats, f, indent=2)

        print(f"Saved {len(kernels)} kernels to {output_path}")
        print(f"  - JSON: {json_path}")
        print(f"  - Markdown: {md_path}")
        print(f"  - Statistics: {stats_path}")

    def get_stats(self) -> Dict[str, Any]:
        """Return a copy of the extraction statistics."""
        return self.stats.copy()
def main():
    """Command line interface for the meaning-kernel extraction pipeline.

    Accepts a PDF, an image, or a directory of either; optional output
    directory and JSON config. Exits 0 on success, 1 on unsupported input
    or any recorded extraction error.
    """
    parser = argparse.ArgumentParser(description="Extract meaning kernels from research diagrams")
    parser.add_argument("input", help="Input PDF or image file/directory")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument("-c", "--config", help="Configuration file (JSON)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")

    args = parser.parse_args()

    # Load config if provided
    config = {}
    if args.config:
        with open(args.config) as f:
            config = json.load(f)

    # Create extractor
    extractor = MeaningKernelExtractor(config)

    # Process input. Single source of truth for recognized image types
    # (previously duplicated between the file and directory branches).
    input_path = Path(args.input)
    image_suffixes = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp'}

    # CLEANUP: the returned kernel lists were assigned but never used here;
    # results are still accumulated on extractor.kernels / extractor.stats.
    if input_path.is_file():
        suffix = input_path.suffix.lower()
        if suffix == '.pdf':
            extractor.extract_from_pdf(input_path, args.output)
        elif suffix in image_suffixes:
            extractor.extract_from_image(input_path)
        else:
            print(f"Unsupported file type: {input_path.suffix}")
            sys.exit(1)
    elif input_path.is_dir():
        # Process all PDFs and images in the directory (non-recursive).
        # sorted() makes the processing order deterministic across platforms.
        for file_path in sorted(input_path.iterdir()):
            suffix = file_path.suffix.lower()
            if suffix == '.pdf':
                extractor.extract_from_pdf(file_path, args.output)
            elif suffix in image_suffixes:
                extractor.extract_from_image(file_path)
    else:
        print(f"Input not found: {input_path}")
        sys.exit(1)

    # Print summary
    stats = extractor.get_stats()
    print("\n" + "=" * 50)
    print("EXTRACTION SUMMARY")
    print("=" * 50)
    print(f"Pages processed: {stats['pages_processed']}")
    print(f"Diagrams analyzed: {stats['diagrams_analyzed']}")
    print(f"Kernels extracted: {stats['kernels_extracted']}")
    print(f"Errors: {stats['errors']}")
    print("=" * 50)

    # Exit with appropriate code
    sys.exit(0 if stats['errors'] == 0 else 1)


if __name__ == "__main__":
    main()
Recognition) +pytesseract>=0.3.10 + +# PDF processing +pdf2image>=1.16.3 + +# Optional: Enhanced computer vision +# opencv-python>=4.8.0 +# numpy>=1.24.0 + +# Development tools +pytest>=7.4.0 +black>=23.0.0 +flake8>=6.0.0 diff --git a/scripts/meaning-kernels/test_extraction.py b/scripts/meaning-kernels/test_extraction.py new file mode 100755 index 00000000..cd77e419 --- /dev/null +++ b/scripts/meaning-kernels/test_extraction.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +""" +Test script for meaning kernel extraction pipeline. +""" +import os +import sys +import tempfile +from pathlib import Path + +# Add parent directory to path +sys.path.insert(0, str(Path(__file__).parent)) + +def create_test_image(): + """Create a simple test image.""" + try: + from PIL import Image, ImageDraw, ImageFont + + # Create image + img = Image.new('RGB', (800, 600), color='white') + draw = ImageDraw.Draw(img) + + # Draw some content + try: + font = ImageFont.truetype("Arial", 20) + except: + font = ImageFont.load_default() + + # Draw text + text_lines = [ + "Research Diagram: Knowledge Extraction Pipeline", + "", + "Input → Processing → Output", + "", + "Key Concepts:", + "- Data ingestion", + "- Feature extraction", + "- Pattern recognition", + "- Knowledge representation" + ] + + y = 50 + for line in text_lines: + draw.text((50, y), line, fill='black', font=font) + y += 30 + + # Draw a simple flowchart + draw.rectangle([300, 200, 500, 250], outline='blue', width=2) + draw.text((320, 210), "Process", fill='blue', font=font) + + draw.line([500, 225, 600, 225], fill='black', width=2) + draw.polygon([600, 225, 590, 215, 590, 235], fill='black') + + draw.rectangle([600, 200, 750, 250], outline='green', width=2) + draw.text((620, 210), "Output", fill='green', font=font) + + # Save to temp file + temp_dir = Path(tempfile.mkdtemp()) + image_path = temp_dir / "test_diagram.png" + img.save(image_path) + + print(f"Created test image: {image_path}") + return image_path + + except ImportError as 
e: + print(f"Cannot create test image: {e}") + return None + +def test_extraction(): + """Test the extraction pipeline.""" + print("Testing Meaning Kernel Extraction Pipeline...") + + # Check if we can import the extractor + try: + from extract_meaning_kernels import MeaningKernelExtractor + print("✓ Successfully imported MeaningKernelExtractor") + except ImportError as e: + print(f"✗ Failed to import: {e}") + return False + + # Create test image + test_image = create_test_image() + if not test_image: + print("Skipping test - cannot create test image") + return True + + # Test extraction + try: + extractor = MeaningKernelExtractor() + + print("\nExtracting kernels from test image...") + kernels = extractor.extract_from_image(test_image) + + print(f"✓ Extracted {len(kernels)} kernels") + + # Print kernel details + for kernel in kernels: + print(f"\nKernel: {kernel.kernel_id}") + print(f" Type: {kernel.kernel_type}") + print(f" Confidence: {kernel.confidence:.2f}") + print(f" Content: {kernel.content[:100]}...") + + # Get stats + stats = extractor.get_stats() + print(f"\nStatistics:") + for key, value in stats.items(): + print(f" {key}: {value}") + + return True + + except Exception as e: + print(f"✗ Extraction test failed: {e}") + import traceback + traceback.print_exc() + return False + +if __name__ == "__main__": + print("Meaning Kernel Extraction Pipeline Test") + print("=" * 50) + + success = test_extraction() + + print("\n" + "=" * 50) + if success: + print("✓ All tests passed!") + sys.exit(0) + else: + print("✗ Some tests failed") + sys.exit(1)