#!/usr/bin/env python3
"""
Multimodal Meaning Kernel Extraction Pipeline

Extracts structured meaning kernels from academic PDF diagrams.
Issue #493: [Multimodal] Extract Meaning Kernels from Research Diagrams
"""

import sys
import json
import argparse
import hashlib
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional

# Optional dependencies: the pipeline degrades gracefully when a vision
# library is missing, but the corresponding stage is skipped or raises.
try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    print("Warning: PIL not available. Install with: pip install Pillow")

try:
    import pytesseract
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False
    print("Warning: pytesseract not available. Install with: pip install pytesseract")

try:
    import pdf2image
    PDF2IMAGE_AVAILABLE = True
except ImportError:
    PDF2IMAGE_AVAILABLE = False
    print("Warning: pdf2image not available. Install with: pip install pdf2image")


class MeaningKernel:
    """Represents an extracted meaning kernel from a diagram."""

    def __init__(self, kernel_id: str, content: str, source: str,
                 confidence: float = 0.0,
                 metadata: Optional[Dict[str, Any]] = None):
        self.kernel_id = kernel_id
        self.content = content
        self.source = source
        self.confidence = confidence
        self.metadata = metadata or {}
        self.timestamp = datetime.now().isoformat()
        self.hash = self._generate_hash()

    def _generate_hash(self) -> str:
        """Generate a unique hash for this kernel."""
        content_str = f"{self.kernel_id}:{self.content}:{self.source}:{self.timestamp}"
        return hashlib.sha256(content_str.encode()).hexdigest()[:16]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "kernel_id": self.kernel_id,
            "content": self.content,
            "source": self.source,
            "confidence": self.confidence,
            "metadata": self.metadata,
            "timestamp": self.timestamp,
            "hash": self.hash
        }

    def __str__(self) -> str:
        return f"Kernel[{self.kernel_id}]: {self.content[:100]}..."
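
# A minimal usage sketch for MeaningKernel (illustrative values only; the
# pipeline below constructs kernels automatically):
#
#     kernel = MeaningKernel(
#         kernel_id="demo_text",
#         content="Encoder -> Attention -> Decoder",
#         source="figure_1.png",
#         confidence=0.92,
#         metadata={"type": "text_extraction"},
#     )
#     print(kernel.hash)  # 16-hex-char SHA-256 prefix, unique per timestamp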
class DiagramProcessor:
    """Processes diagrams from PDFs to extract meaning kernels."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.kernels: List[MeaningKernel] = []
        self.stats = {
            "pages_processed": 0,
            "diagrams_found": 0,
            "kernels_extracted": 0,
            "errors": 0
        }

    def extract_from_pdf(self, pdf_path: str,
                         output_dir: Optional[str] = None) -> List[MeaningKernel]:
        """Extract meaning kernels from a PDF file."""
        if not PDF2IMAGE_AVAILABLE:
            raise ImportError("pdf2image is required for PDF processing")

        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        print(f"Processing PDF: {pdf_path}")

        # Create output directory
        if output_dir:
            output_path = Path(output_dir)
        else:
            output_path = pdf_path.parent / f"{pdf_path.stem}_kernels"
        output_path.mkdir(parents=True, exist_ok=True)

        # Convert PDF pages to images
        try:
            from pdf2image import convert_from_path
            images = convert_from_path(pdf_path, dpi=300)
            print(f"Converted {len(images)} pages to images")
        except Exception as e:
            print(f"Error converting PDF: {e}")
            self.stats["errors"] += 1
            return []

        # Process each page
        all_kernels = []
        for i, image in enumerate(images):
            page_num = i + 1
            print(f"Processing page {page_num}/{len(images)}")

            # Save image temporarily
            temp_image_path = output_path / f"page_{page_num:03d}.png"
            image.save(temp_image_path)

            # Process the image
            page_kernels = self.extract_from_image(temp_image_path, page_num)
            all_kernels.extend(page_kernels)
            self.stats["pages_processed"] += 1

        # Save all kernels
        self._save_kernels(all_kernels, output_path)
        return all_kernels

    def extract_from_image(self, image_path: str,
                           page_num: Optional[int] = None) -> List[MeaningKernel]:
        """Extract meaning kernels from an image."""
        if not PIL_AVAILABLE:
            raise ImportError("PIL is required for image processing")

        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")

        print(f"Processing image: {image_path}")

        # Load image
        try:
            image = Image.open(image_path)
        except Exception as e:
            print(f"Error loading image: {e}")
            self.stats["errors"] += 1
            return []

        # Extract text using OCR
        extracted_text = self._extract_text_from_image(image)

        # Analyze image structure
        structure_analysis = self._analyze_image_structure(image)

        # Generate kernels
        kernels = self._generate_kernels(
            extracted_text, structure_analysis, str(image_path), page_num
        )

        self.stats["diagrams_found"] += 1
        self.stats["kernels_extracted"] += len(kernels)
        return kernels

    # Annotation is quoted so the method definition does not raise a
    # NameError when PIL is unavailable at import time.
    def _extract_text_from_image(self, image: "Image.Image") -> Dict[str, Any]:
        """Extract text from image using OCR."""
        text_data = {
            "full_text": "",
            "lines": [],
            "confidence": 0.0,
            "words": []
        }

        if TESSERACT_AVAILABLE:
            try:
                # Get detailed OCR data
                data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

                # Extract text with confidence
                texts = []
                confidences = []
                for i, text in enumerate(data['text']):
                    if int(data['conf'][i]) > 0:  # Filter out low confidence
                        texts.append(text)
                        confidences.append(int(data['conf'][i]))

                text_data['full_text'] = ' '.join(texts)
                text_data['lines'] = self._group_text_into_lines(data)
                text_data['confidence'] = sum(confidences) / len(confidences) if confidences else 0
                text_data['words'] = texts
            except Exception as e:
                print(f"OCR error: {e}")

        return text_data
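
    # For reference: pytesseract.image_to_data(..., output_type=Output.DICT)
    # returns parallel lists keyed by 'text', 'conf', 'block_num', 'par_num',
    # 'line_num', and 'word_num', plus bounding-box fields ('left', 'top',
    # 'width', 'height'). Non-word rows carry conf == -1, which the filters
    # above and below skip. An illustrative slice of such a dict:
    #
    #     {'text': ['', 'Encoder', 'block'], 'conf': [-1, 96, 91],
    #      'block_num': [1, 1, 1], 'par_num': [0, 1, 1], 'line_num': [0, 1, 1]}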
    def _group_text_into_lines(self, ocr_data: Dict) -> List[str]:
        """Group OCR words into lines using block/paragraph/line numbers."""
        lines = []
        current_line = []
        current_block = -1
        current_par = -1
        current_line_num = -1

        for i in range(len(ocr_data['text'])):
            if int(ocr_data['conf'][i]) <= 0:
                continue

            block_num = ocr_data['block_num'][i]
            par_num = ocr_data['par_num'][i]
            line_num = ocr_data['line_num'][i]

            # A change in any of the three counters starts a new line
            if (block_num != current_block or
                    par_num != current_par or
                    line_num != current_line_num):
                if current_line:
                    lines.append(' '.join(current_line))
                current_line = []
                current_block = block_num
                current_par = par_num
                current_line_num = line_num

            current_line.append(ocr_data['text'][i])

        if current_line:
            lines.append(' '.join(current_line))
        return lines

    def _analyze_image_structure(self, image: "Image.Image") -> Dict[str, Any]:
        """Analyze image structure (simplified version)."""
        # This is a simplified version - a real implementation would use
        # computer vision to detect diagrams, arrows, boxes, etc.
        width, height = image.size
        aspect_ratio = width / height

        # Basic analysis
        analysis = {
            "dimensions": {"width": width, "height": height},
            "aspect_ratio": aspect_ratio,
            "is_landscape": aspect_ratio > 1,
            "is_portrait": aspect_ratio < 1,
            "estimated_diagram_type": self._estimate_diagram_type(width, height),
            "complexity": "medium"  # placeholder
        }
        return analysis

    def _estimate_diagram_type(self, width: int, height: int) -> str:
        """Estimate diagram type based on dimensions (simplified)."""
        aspect_ratio = width / height
        if aspect_ratio > 2:
            return "flowchart"
        elif aspect_ratio < 0.5:
            return "vertical_hierarchy"
        elif 0.8 <= aspect_ratio <= 1.2:
            return "square_diagram"
        else:
            return "standard_diagram"
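
    # Quick reference for the aspect-ratio heuristic above (illustrative sizes):
    #
    #     _estimate_diagram_type(3000, 1000)  -> "flowchart"           (ratio 3.00)
    #     _estimate_diagram_type(1000, 3000)  -> "vertical_hierarchy"  (ratio 0.33)
    #     _estimate_diagram_type(1000, 1000)  -> "square_diagram"      (ratio 1.00)
    #     _estimate_diagram_type(1600, 1000)  -> "standard_diagram"    (ratio 1.60)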
    def _generate_kernels(self, text_data: Dict[str, Any],
                          structure: Dict[str, Any],
                          source: str,
                          page_num: Optional[int] = None) -> List[MeaningKernel]:
        """Generate meaning kernels from extracted data."""
        kernels = []

        # Create base ID
        base_id = f"kernel_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        if page_num is not None:
            base_id += f"_p{page_num}"

        # 1. Text-based kernel
        if text_data['full_text'].strip():
            text_kernel = MeaningKernel(
                kernel_id=f"{base_id}_text",
                content=text_data['full_text'],
                source=source,
                confidence=text_data['confidence'] / 100.0,  # Normalize Tesseract's 0-100 scale to 0-1
                metadata={
                    "type": "text_extraction",
                    "word_count": len(text_data['words']),
                    "line_count": len(text_data['lines']),
                    "structure": structure
                }
            )
            kernels.append(text_kernel)

        # 2. Structure-based kernel
        orientation = ('landscape' if structure['is_landscape']
                       else 'portrait' if structure['is_portrait'] else 'square')
        structure_content = (
            f"Diagram type: {structure['estimated_diagram_type']}. "
            f"Dimensions: {structure['dimensions']['width']}x{structure['dimensions']['height']}. "
            f"Aspect ratio: {structure['aspect_ratio']:.2f}. "
            f"Orientation: {orientation}."
        )
        structure_kernel = MeaningKernel(
            kernel_id=f"{base_id}_structure",
            content=structure_content,
            source=source,
            confidence=0.8,  # High confidence for structure analysis
            metadata={
                "type": "structure_analysis",
                "analysis": structure
            }
        )
        kernels.append(structure_kernel)

        # 3. Summary kernel (combines text and structure)
        if text_data['full_text'].strip():
            summary = (
                f"Research diagram analysis: {structure['estimated_diagram_type']} "
                f"with text content. "
                f"Key elements: {text_data['full_text'][:200]}..."
            )
            summary_kernel = MeaningKernel(
                kernel_id=f"{base_id}_summary",
                content=summary,
                source=source,
                confidence=0.7,
                metadata={
                    "type": "summary",
                    "text_length": len(text_data['full_text']),
                    "structure_type": structure['estimated_diagram_type']
                }
            )
            kernels.append(summary_kernel)

        # Add to internal list
        self.kernels.extend(kernels)
        return kernels

    def _save_kernels(self, kernels: List[MeaningKernel], output_path: Path):
        """Save kernels to files."""
        if not kernels:
            print("No kernels to save")
            return

        # Save as JSON
        json_path = output_path / "meaning_kernels.json"
        kernels_data = [k.to_dict() for k in kernels]
        with open(json_path, 'w') as f:
            json.dump(kernels_data, f, indent=2)

        # Save as Markdown for readability
        md_path = output_path / "meaning_kernels.md"
        with open(md_path, 'w') as f:
            f.write("# Meaning Kernels Extraction Report\n")
            f.write(f"Generated: {datetime.now().isoformat()}\n")
            f.write(f"Total kernels: {len(kernels)}\n\n")
            for kernel in kernels:
                f.write(f"## Kernel: {kernel.kernel_id}\n")
                f.write(f"- **Source**: {kernel.source}\n")
                f.write(f"- **Confidence**: {kernel.confidence:.2f}\n")
                f.write(f"- **Timestamp**: {kernel.timestamp}\n")
                f.write(f"- **Hash**: {kernel.hash}\n")
                f.write(f"- **Content**: {kernel.content}\n")
                f.write(f"- **Metadata**: {json.dumps(kernel.metadata, indent=2)}\n\n")

        # Save statistics
        stats_path = output_path / "extraction_stats.json"
        with open(stats_path, 'w') as f:
            json.dump(self.stats, f, indent=2)

        print(f"Saved {len(kernels)} kernels to {output_path}")
        print(f" - JSON: {json_path}")
        print(f" - Markdown: {md_path}")
        print(f" - Statistics: {stats_path}")

    def get_stats(self) -> Dict[str, Any]:
        """Get processing statistics."""
        return self.stats.copy()


def main():
    """Command line interface for the pipeline."""
    parser = argparse.ArgumentParser(description="Extract meaning kernels from research diagrams")
    parser.add_argument("input", help="Input PDF or image file/directory")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument("-c", "--config", help="Configuration file (JSON)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    args = parser.parse_args()

    # Load config if provided
    config = {}
    if args.config:
        with open(args.config) as f:
            config = json.load(f)

    # Create processor
    processor = DiagramProcessor(config)

    # Process input (extracted kernels are saved by the processor itself)
    input_path = Path(args.input)
    image_suffixes = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp'}
    if input_path.is_file():
        if input_path.suffix.lower() == '.pdf':
            processor.extract_from_pdf(input_path, args.output)
        elif input_path.suffix.lower() in image_suffixes:
            processor.extract_from_image(input_path)
        else:
            print(f"Unsupported file type: {input_path.suffix}")
            sys.exit(1)
    elif input_path.is_dir():
        # Process all PDFs and images in the directory
        for file_path in sorted(input_path.iterdir()):
            if file_path.suffix.lower() == '.pdf':
                processor.extract_from_pdf(file_path, args.output)
            elif file_path.suffix.lower() in image_suffixes:
                processor.extract_from_image(file_path)
    else:
        print(f"Input not found: {input_path}")
        sys.exit(1)

    # Print summary
    stats = processor.get_stats()
    print("\n" + "=" * 50)
    print("EXTRACTION SUMMARY")
    print("=" * 50)
    print(f"Pages processed: {stats['pages_processed']}")
    print(f"Diagrams found: {stats['diagrams_found']}")
    print(f"Kernels extracted: {stats['kernels_extracted']}")
    print(f"Errors: {stats['errors']}")
    print("=" * 50)

    # Exit with appropriate code
    sys.exit(0 if stats['errors'] == 0 else 1)
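
# Example invocations (the script filename and all paths are hypothetical):
#
#     python meaning_kernel_pipeline.py paper.pdf -o ./kernels
#     python meaning_kernel_pipeline.py figures/page_003.png
#     python meaning_kernel_pipeline.py ./papers/ -c config.json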
if __name__ == "__main__":
    main()