diff --git a/scripts/meaning-kernels/__pycache__/extract_meaning_kernels.cpython-312.pyc b/scripts/meaning-kernels/__pycache__/extract_meaning_kernels.cpython-312.pyc index 70dba65e..017083ea 100644 Binary files a/scripts/meaning-kernels/__pycache__/extract_meaning_kernels.cpython-312.pyc and b/scripts/meaning-kernels/__pycache__/extract_meaning_kernels.cpython-312.pyc differ diff --git a/scripts/meaning-kernels/extract_meaning_kernels.py b/scripts/meaning-kernels/extract_meaning_kernels.py index 2af13bca..81d78017 100755 --- a/scripts/meaning-kernels/extract_meaning_kernels.py +++ b/scripts/meaning-kernels/extract_meaning_kernels.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Meaning Kernel Extraction Pipeline +Improved Meaning Kernel Extraction Pipeline Extract structured meaning kernels from academic PDF diagrams. Issue #493: [Multimodal] Extract Meaning Kernels from Research Diagrams """ @@ -8,9 +8,10 @@ import os import sys import json import argparse +import re from pathlib import Path from datetime import datetime -from typing import List, Dict, Any, Optional +from typing import List, Dict, Any, Optional, Tuple import hashlib # Try to import vision libraries @@ -40,13 +41,14 @@ class MeaningKernel: def __init__(self, kernel_id: str, content: str, source: str, kernel_type: str = "text", confidence: float = 0.0, - metadata: Dict[str, Any] = None): + metadata: Dict[str, Any] = None, tags: List[str] = None): self.kernel_id = kernel_id self.content = content self.source = source - self.kernel_type = kernel_type # text, structure, summary, philosophical + self.kernel_type = kernel_type # text, structure, summary, philosophical, semantic self.confidence = confidence self.metadata = metadata or {} + self.tags = tags or [] self.timestamp = datetime.now().isoformat() self.hash = self._generate_hash() @@ -64,18 +66,26 @@ class MeaningKernel: "kernel_type": self.kernel_type, "confidence": self.confidence, "metadata": self.metadata, + "tags": self.tags, "timestamp": 
self.timestamp, "hash": self.hash } def __str__(self) -> str: - return f"Kernel[{self.kernel_id}]: {self.content[:100]}..." + return f"Kernel[{self.kernel_id}] ({self.kernel_type}): {self.content[:100]}..." class DiagramAnalyzer: """Analyze diagrams using multiple methods.""" def __init__(self, config: Dict[str, Any] = None): self.config = config or {} + self.philosophical_keywords = self.config.get("philosophical_keywords", [ + "truth", "knowledge", "wisdom", "meaning", "purpose", + "existence", "reality", "consciousness", "ethics", "morality", + "beauty", "justice", "freedom", "responsibility", "identity", + "causality", "determinism", "free will", "rationality", "logic", + "metaphysics", "epistemology", "ontology", "phenomenology" + ]) def analyze_image(self, image_path: str) -> Dict[str, Any]: """Analyze an image using multiple methods.""" @@ -90,43 +100,183 @@ class DiagramAnalyzer: "aspect_ratio": image.width / image.height, "mode": image.mode, "format": image.format, - "size_bytes": os.path.getsize(image_path) + "size_bytes": os.path.getsize(image_path), + "color_analysis": self._analyze_colors(image) } # OCR text extraction if TESSERACT_AVAILABLE: try: - ocr_text = pytesseract.image_to_string(image) - analysis["ocr_text"] = ocr_text.strip() - analysis["ocr_confidence"] = self._estimate_ocr_confidence(image) + ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT) + ocr_text = " ".join([text for text in ocr_data['text'] if text.strip()]) + analysis["ocr_text"] = ocr_text + analysis["ocr_confidence"] = self._calculate_ocr_confidence(ocr_data) + analysis["ocr_word_count"] = len(ocr_text.split()) + analysis["ocr_lines"] = self._extract_ocr_lines(ocr_data) except Exception as e: analysis["ocr_text"] = "" analysis["ocr_confidence"] = 0.0 analysis["ocr_error"] = str(e) # Diagram type estimation - analysis["diagram_type"] = self._estimate_diagram_type(image) + analysis["diagram_type"] = self._estimate_diagram_type(image, analysis) + + # 
Content analysis + analysis["content_analysis"] = self._analyze_content(analysis) return analysis - def _estimate_ocr_confidence(self, image: Image.Image) -> float: - """Estimate OCR confidence (simplified).""" - # In reality, would use pytesseract's confidence output - return 0.8 # Placeholder + def _analyze_colors(self, image: Image.Image) -> Dict[str, Any]: + """Analyze color distribution in image.""" + # Convert to RGB if necessary + if image.mode != 'RGB': + image = image.convert('RGB') + + # Get colors + colors = image.getcolors(maxcolors=10000) + if colors: + # Sort by frequency + colors.sort(key=lambda x: x[0], reverse=True) + total_pixels = image.width * image.height + + # Get dominant colors + dominant_colors = [] + for count, color in colors[:5]: + percentage = (count / total_pixels) * 100 + dominant_colors.append({ + "color": color, + "count": count, + "percentage": round(percentage, 2) + }) + + return { + "dominant_colors": dominant_colors, + "unique_colors": len(colors), + "is_grayscale": self._is_grayscale(image) + } + + return {"dominant_colors": [], "unique_colors": 0} - def _estimate_diagram_type(self, image: Image.Image) -> str: + def _is_grayscale(self, image: Image.Image) -> bool: + """Check if image is grayscale.""" + # Sample some pixels + width, height = image.size + for x in range(0, width, width // 10): + for y in range(0, height, height // 10): + r, g, b = image.getpixel((x, y)) + if not (r == g == b): + return False + return True + + def _calculate_ocr_confidence(self, ocr_data: Dict[str, Any]) -> float: + """Calculate average OCR confidence.""" + confidences = [int(conf) for conf in ocr_data['conf'] if int(conf) > 0] + if confidences: + return sum(confidences) / len(confidences) / 100.0 + return 0.0 + + def _extract_ocr_lines(self, ocr_data: Dict[str, Any]) -> List[str]: + """Extract text lines from OCR data.""" + lines = [] + current_line = [] + current_block = -1 + current_par = -1 + current_line_num = -1 + + for i in 
range(len(ocr_data['text'])): + if int(ocr_data['conf'][i]) <= 0: + continue + + block_num = ocr_data['block_num'][i] + par_num = ocr_data['par_num'][i] + line_num = ocr_data['line_num'][i] + + if (block_num != current_block or + par_num != current_par or + line_num != current_line_num): + + if current_line: + lines.append(' '.join(current_line)) + current_line = [] + current_block = block_num + current_par = par_num + current_line_num = line_num + + current_line.append(ocr_data['text'][i]) + + if current_line: + lines.append(' '.join(current_line)) + + return lines + + def _estimate_diagram_type(self, image: Image.Image, analysis: Dict[str, Any]) -> str: """Estimate diagram type based on image characteristics.""" width, height = image.size aspect_ratio = width / height + # Check for flowchart characteristics if aspect_ratio > 2: return "flowchart" elif aspect_ratio < 0.5: return "vertical_hierarchy" elif 0.8 <= aspect_ratio <= 1.2: + # Check for circular patterns + if self._has_circular_patterns(image): + return "circular_diagram" return "square_diagram" - else: - return "standard_diagram" + + # Check OCR content for clues + ocr_text = analysis.get("ocr_text", "").lower() + if any(word in ocr_text for word in ["process", "flow", "step", "arrow"]): + return "process_diagram" + elif any(word in ocr_text for word in ["system", "component", "module"]): + return "system_diagram" + elif any(word in ocr_text for word in ["data", "information", "input", "output"]): + return "data_diagram" + + return "standard_diagram" + + def _has_circular_patterns(self, image: Image.Image) -> bool: + """Check for circular patterns in image (simplified).""" + # This is a simplified check - real implementation would use computer vision + return False + + def _analyze_content(self, analysis: Dict[str, Any]) -> Dict[str, Any]: + """Analyze content for themes and patterns.""" + ocr_text = analysis.get("ocr_text", "") + + content_analysis = { + "word_count": len(ocr_text.split()), + 
"has_text": bool(ocr_text), + "themes": [], + "entities": [], + "relationships": [] + } + + if ocr_text: + # Extract potential entities (capitalized words) + words = ocr_text.split() + entities = [word for word in words if word[0].isupper() and len(word) > 2] + content_analysis["entities"] = list(set(entities))[:10] + + # Look for relationships + relationship_patterns = [ + r"(\w+)\s*->\s*(\w+)", + r"(\w+)\s*→\s*(\w+)", + r"(\w+)\s*to\s*(\w+)", + r"(\w+)\s*from\s*(\w+)" + ] + + for pattern in relationship_patterns: + matches = re.findall(pattern, ocr_text) + for match in matches: + content_analysis["relationships"].append({ + "source": match[0], + "target": match[1], + "type": "connection" + }) + + return content_analysis class MeaningKernelExtractor: """Extract meaning kernels from diagrams.""" @@ -139,17 +289,34 @@ class MeaningKernelExtractor: "pages_processed": 0, "diagrams_analyzed": 0, "kernels_extracted": 0, - "errors": 0 + "errors": 0, + "dependency_warnings": 0 } + + # Check dependencies and update stats + if not PIL_AVAILABLE: + self.stats["dependency_warnings"] += 1 + if not TESSERACT_AVAILABLE: + self.stats["dependency_warnings"] += 1 + if not PDF2IMAGE_AVAILABLE: + self.stats["dependency_warnings"] += 1 def extract_from_pdf(self, pdf_path: str, output_dir: str = None) -> List[MeaningKernel]: """Extract meaning kernels from a PDF file.""" if not PDF2IMAGE_AVAILABLE: - raise ImportError("pdf2image is required for PDF processing") + print("Error: pdf2image is required for PDF processing") + print("Install with: pip install pdf2image") + print("System dependencies:") + print(" macOS: brew install poppler") + print(" Ubuntu: sudo apt-get install poppler-utils") + self.stats["errors"] += 1 + return [] pdf_path = Path(pdf_path) if not pdf_path.exists(): - raise FileNotFoundError(f"PDF not found: {pdf_path}") + print(f"Error: PDF not found: {pdf_path}") + self.stats["errors"] += 1 + return [] print(f"Processing PDF: {pdf_path}") @@ -229,16 +396,26 @@ class 
MeaningKernelExtractor: kernel_type="text", confidence=analysis.get("ocr_confidence", 0.0), metadata={ - "word_count": len(analysis["ocr_text"].split()), + "word_count": analysis.get("ocr_word_count", 0), + "line_count": len(analysis.get("ocr_lines", [])), "diagram_type": analysis.get("diagram_type", "unknown") - } + }, + tags=["ocr", "text", "extracted"] ) kernels.append(text_kernel) # 2. Structure kernel structure_content = f"Diagram type: {analysis.get('diagram_type', 'unknown')}. " structure_content += f"Dimensions: {analysis['dimensions']['width']}x{analysis['dimensions']['height']}. " - structure_content += f"Aspect ratio: {analysis['aspect_ratio']:.2f}." + structure_content += f"Aspect ratio: {analysis['aspect_ratio']:.2f}. " + + # Add color information + color_analysis = analysis.get("color_analysis", {}) + if color_analysis.get("is_grayscale"): + structure_content += "Grayscale image. " + elif color_analysis.get("dominant_colors"): + top_color = color_analysis["dominant_colors"][0] + structure_content += f"Dominant color: RGB{top_color['color']} ({top_color['percentage']}%). " structure_kernel = MeaningKernel( kernel_id=f"{base_id}_structure", @@ -249,8 +426,10 @@ class MeaningKernelExtractor: metadata={ "dimensions": analysis["dimensions"], "aspect_ratio": analysis["aspect_ratio"], - "diagram_type": analysis.get("diagram_type", "unknown") - } + "diagram_type": analysis.get("diagram_type", "unknown"), + "color_analysis": color_analysis + }, + tags=["structure", "layout", "visual"] ) kernels.append(structure_kernel) @@ -261,6 +440,11 @@ class MeaningKernelExtractor: else: summary += "No text detected." + # Add content analysis + content_analysis = analysis.get("content_analysis", {}) + if content_analysis.get("entities"): + summary += f" Entities: {', '.join(content_analysis['entities'][:5])}." 
+ summary_kernel = MeaningKernel( kernel_id=f"{base_id}_summary", content=summary, @@ -269,14 +453,16 @@ class MeaningKernelExtractor: confidence=0.7, metadata={ "has_text": bool(analysis.get("ocr_text")), - "text_length": len(analysis.get("ocr_text", "")) - } + "text_length": len(analysis.get("ocr_text", "")), + "entities": content_analysis.get("entities", []), + "relationships": content_analysis.get("relationships", []) + }, + tags=["summary", "overview", "analysis"] ) kernels.append(summary_kernel) # 4. Philosophical kernel (if we have text) if analysis.get("ocr_text") and len(analysis["ocr_text"]) > 50: - # Simple philosophical extraction philosophical_content = self._extract_philosophical_content(analysis["ocr_text"]) if philosophical_content: philosophical_kernel = MeaningKernel( @@ -287,33 +473,61 @@ class MeaningKernelExtractor: confidence=0.6, metadata={ "extraction_method": "keyword_analysis", - "source_text_length": len(analysis["ocr_text"]) - } + "source_text_length": len(analysis["ocr_text"]), + "keywords_found": self._find_philosophical_keywords(analysis["ocr_text"]) + }, + tags=["philosophical", "meaning", "conceptual"] ) kernels.append(philosophical_kernel) + # 5. Semantic kernel (if we have relationships) + content_analysis = analysis.get("content_analysis", {}) + if content_analysis.get("relationships"): + relationships = content_analysis["relationships"] + semantic_content = f"Semantic relationships detected: {len(relationships)} connections. " + for rel in relationships[:3]: + semantic_content += f"{rel['source']} → {rel['target']}. 
" + + semantic_kernel = MeaningKernel( + kernel_id=f"{base_id}_semantic", + content=semantic_content, + source=source, + kernel_type="semantic", + confidence=0.8, + metadata={ + "relationship_count": len(relationships), + "relationships": relationships + }, + tags=["semantic", "relationships", "connections"] + ) + kernels.append(semantic_kernel) + # Add to internal list self.kernels.extend(kernels) return kernels def _extract_philosophical_content(self, text: str) -> Optional[str]: - """Extract philosophical content from text (simplified).""" + """Extract philosophical content from text.""" # Look for philosophical keywords - philosophical_keywords = [ - "truth", "knowledge", "wisdom", "meaning", "purpose", - "existence", "reality", "consciousness", "ethics", "morality", - "beauty", "justice", "freedom", "responsibility", "identity" - ] - - text_lower = text.lower() - found_keywords = [kw for kw in philosophical_keywords if kw in text_lower] + found_keywords = self._find_philosophical_keywords(text) if found_keywords: return f"Philosophical themes detected: {', '.join(found_keywords)}. " f"Source text explores concepts of {found_keywords[0]}." 
return None + def _find_philosophical_keywords(self, text: str) -> List[str]: + """Find philosophical keywords in text.""" + text_lower = text.lower() + found_keywords = [] + + for keyword in self.analyzer.philosophical_keywords: + if keyword in text_lower: + found_keywords.append(keyword) + + return found_keywords + def _save_kernels(self, kernels: List[MeaningKernel], output_path: Path): """Save kernels to files.""" if not kernels: @@ -346,6 +560,7 @@ class MeaningKernelExtractor: f.write(f"- **Source**: {kernel.source}\n") f.write(f"- **Confidence**: {kernel.confidence:.2f}\n") f.write(f"- **Timestamp**: {kernel.timestamp}\n") + f.write(f"- **Tags**: {', '.join(kernel.tags)}\n") f.write(f"- **Content**: {kernel.content}\n") f.write(f"- **Metadata**: {json.dumps(kernel.metadata, indent=2)}\n\n") @@ -416,6 +631,7 @@ def main(): print(f"Diagrams analyzed: {stats['diagrams_analyzed']}") print(f"Kernels extracted: {stats['kernels_extracted']}") print(f"Errors: {stats['errors']}") + print(f"Dependency warnings: {stats['dependency_warnings']}") print("="*50) # Exit with appropriate code diff --git a/scripts/meaning-kernels/test_extraction.py b/scripts/meaning-kernels/test_extraction.py index cd77e419..96842737 100755 --- a/scripts/meaning-kernels/test_extraction.py +++ b/scripts/meaning-kernels/test_extraction.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Test script for meaning kernel extraction pipeline. +Improved test script for meaning kernel extraction pipeline. 
""" import os import sys @@ -10,8 +10,8 @@ from pathlib import Path # Add parent directory to path sys.path.insert(0, str(Path(__file__).parent)) -def create_test_image(): - """Create a simple test image.""" +def create_test_image_with_text(): + """Create a test image with text.""" try: from PIL import Image, ImageDraw, ImageFont @@ -35,7 +35,12 @@ def create_test_image(): "- Data ingestion", "- Feature extraction", "- Pattern recognition", - "- Knowledge representation" + "- Knowledge representation", + "", + "Philosophical aspects:", + "- Truth and knowledge", + "- Meaning and purpose", + "- Reality and existence" ] y = 50 @@ -55,10 +60,10 @@ def create_test_image(): # Save to temp file temp_dir = Path(tempfile.mkdtemp()) - image_path = temp_dir / "test_diagram.png" + image_path = temp_dir / "test_diagram_with_text.png" img.save(image_path) - print(f"Created test image: {image_path}") + print(f"Created test image with text: {image_path}") return image_path except ImportError as e: @@ -67,7 +72,7 @@ def create_test_image(): def test_extraction(): """Test the extraction pipeline.""" - print("Testing Meaning Kernel Extraction Pipeline...") + print("Testing Improved Meaning Kernel Extraction Pipeline...") # Check if we can import the extractor try: @@ -78,7 +83,7 @@ def test_extraction(): return False # Create test image - test_image = create_test_image() + test_image = create_test_image_with_text() if not test_image: print("Skipping test - cannot create test image") return True @@ -97,6 +102,7 @@ def test_extraction(): print(f"\nKernel: {kernel.kernel_id}") print(f" Type: {kernel.kernel_type}") print(f" Confidence: {kernel.confidence:.2f}") + print(f" Tags: {', '.join(kernel.tags)}") print(f" Content: {kernel.content[:100]}...") # Get stats @@ -105,6 +111,13 @@ def test_extraction(): for key, value in stats.items(): print(f" {key}: {value}") + # Check for philosophical kernels + philosophical_kernels = [k for k in kernels if k.kernel_type == "philosophical"] + if 
philosophical_kernels: + print(f"\n✓ Found {len(philosophical_kernels)} philosophical kernel(s)") + else: + print("\n⚠ No philosophical kernels found (may need OCR dependencies)") + return True except Exception as e: @@ -114,7 +127,7 @@ def test_extraction(): return False if __name__ == "__main__": - print("Meaning Kernel Extraction Pipeline Test") + print("Improved Meaning Kernel Extraction Pipeline Test") print("=" * 50) success = test_extraction()