timmy-config/scripts/multimodal/extract_meaning_kernels.py

#!/usr/bin/env python3
"""
Multimodal Meaning Kernel Extraction Pipeline
Extracts structured meaning kernels from academic PDF diagrams.
Issue #493: [Multimodal] Extract Meaning Kernels from Research Diagrams
"""
import os
import sys
import json
import argparse
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
import hashlib

# Try to import vision libraries
try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    print("Warning: PIL not available. Install with: pip install Pillow")

try:
    import pytesseract
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False
    print("Warning: pytesseract not available. Install with: pip install pytesseract")

try:
    import pdf2image
    PDF2IMAGE_AVAILABLE = True
except ImportError:
    PDF2IMAGE_AVAILABLE = False
    print("Warning: pdf2image not available. Install with: pip install pdf2image")

class MeaningKernel:
    """Represents an extracted meaning kernel from a diagram."""

    def __init__(self, kernel_id: str, content: str, source: str,
                 confidence: float = 0.0, metadata: Dict[str, Any] = None):
        self.kernel_id = kernel_id
        self.content = content
        self.source = source
        self.confidence = confidence
        self.metadata = metadata or {}
        self.timestamp = datetime.now().isoformat()
        self.hash = self._generate_hash()

    def _generate_hash(self) -> str:
        """Generate a unique hash for this kernel."""
        content_str = f"{self.kernel_id}:{self.content}:{self.source}:{self.timestamp}"
        return hashlib.sha256(content_str.encode()).hexdigest()[:16]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "kernel_id": self.kernel_id,
            "content": self.content,
            "source": self.source,
            "confidence": self.confidence,
            "metadata": self.metadata,
            "timestamp": self.timestamp,
            "hash": self.hash
        }

    def __str__(self) -> str:
        return f"Kernel[{self.kernel_id}]: {self.content[:100]}..."

class DiagramProcessor:
    """Processes diagrams from PDFs to extract meaning kernels."""

    def __init__(self, config: Dict[str, Any] = None):
        self.config = config or {}
        self.kernels: List[MeaningKernel] = []
        self.stats = {
            "pages_processed": 0,
            "diagrams_found": 0,
            "kernels_extracted": 0,
            "errors": 0
        }

    def extract_from_pdf(self, pdf_path: str, output_dir: str = None) -> List[MeaningKernel]:
        """Extract meaning kernels from a PDF file."""
        if not PDF2IMAGE_AVAILABLE:
            raise ImportError("pdf2image is required for PDF processing")

        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        print(f"Processing PDF: {pdf_path}")

        # Create output directory
        if output_dir:
            output_path = Path(output_dir)
        else:
            output_path = pdf_path.parent / f"{pdf_path.stem}_kernels"
        output_path.mkdir(parents=True, exist_ok=True)

        # Convert PDF to images
        try:
            from pdf2image import convert_from_path
            images = convert_from_path(pdf_path, dpi=300)
            print(f"Converted {len(images)} pages to images")
        except Exception as e:
            print(f"Error converting PDF: {e}")
            self.stats["errors"] += 1
            return []

        # Process each page
        all_kernels = []
        for i, image in enumerate(images):
            page_num = i + 1
            print(f"Processing page {page_num}/{len(images)}")

            # Save image temporarily
            temp_image_path = output_path / f"page_{page_num:03d}.png"
            image.save(temp_image_path)

            # Process the image
            page_kernels = self.extract_from_image(temp_image_path, page_num)
            all_kernels.extend(page_kernels)

            self.stats["pages_processed"] += 1

        # Save all kernels
        self._save_kernels(all_kernels, output_path)

        return all_kernels

    def extract_from_image(self, image_path: str, page_num: int = None) -> List[MeaningKernel]:
        """Extract meaning kernels from an image."""
        if not PIL_AVAILABLE:
            raise ImportError("PIL is required for image processing")

        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")

        print(f"Processing image: {image_path}")

        # Load image
        try:
            image = Image.open(image_path)
        except Exception as e:
            print(f"Error loading image: {e}")
            self.stats["errors"] += 1
            return []

        # Extract text using OCR
        extracted_text = self._extract_text_from_image(image)

        # Analyze image structure
        structure_analysis = self._analyze_image_structure(image)

        # Generate kernels
        kernels = self._generate_kernels(
            extracted_text,
            structure_analysis,
            str(image_path),
            page_num
        )

        self.stats["diagrams_found"] += 1
        self.stats["kernels_extracted"] += len(kernels)

        return kernels

    def _extract_text_from_image(self, image: Image.Image) -> Dict[str, Any]:
        """Extract text from image using OCR."""
        text_data = {
            "full_text": "",
            "lines": [],
            "confidence": 0.0,
            "words": []
        }

        if TESSERACT_AVAILABLE:
            try:
                # Get detailed OCR data
                data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

                # Extract text with confidence
                texts = []
                confidences = []

                for i, text in enumerate(data['text']):
                    if int(data['conf'][i]) > 0:  # Filter out low confidence
                        texts.append(text)
                        confidences.append(int(data['conf'][i]))

                text_data['full_text'] = ' '.join(texts)
                text_data['lines'] = self._group_text_into_lines(data)
                text_data['confidence'] = sum(confidences) / len(confidences) if confidences else 0
                text_data['words'] = texts

            except Exception as e:
                print(f"OCR error: {e}")

        return text_data

    def _group_text_into_lines(self, ocr_data: Dict) -> List[str]:
        """Group OCR words into lines."""
        lines = []
        current_line = []
        current_block = -1
        current_par = -1
        current_line_num = -1

        for i in range(len(ocr_data['text'])):
            if int(ocr_data['conf'][i]) <= 0:
                continue

            block_num = ocr_data['block_num'][i]
            par_num = ocr_data['par_num'][i]
            line_num = ocr_data['line_num'][i]

            if (block_num != current_block or
                par_num != current_par or
                line_num != current_line_num):

                if current_line:
                    lines.append(' '.join(current_line))
                current_line = []
                current_block = block_num
                current_par = par_num
                current_line_num = line_num

            current_line.append(ocr_data['text'][i])

        if current_line:
            lines.append(' '.join(current_line))

        return lines

    def _analyze_image_structure(self, image: Image.Image) -> Dict[str, Any]:
        """Analyze image structure (simplified version)."""
        # This is a simplified version - real implementation would use
        # computer vision to detect diagrams, arrows, boxes, etc.

        width, height = image.size
        aspect_ratio = width / height

        # Basic analysis
        analysis = {
            "dimensions": {"width": width, "height": height},
            "aspect_ratio": aspect_ratio,
            "is_landscape": aspect_ratio > 1,
            "is_portrait": aspect_ratio < 1,
            "estimated_diagram_type": self._estimate_diagram_type(width, height),
            "complexity": "medium"  # placeholder
        }

        return analysis

    def _estimate_diagram_type(self, width: int, height: int) -> str:
        """Estimate diagram type based on dimensions (simplified)."""
        aspect_ratio = width / height

        if aspect_ratio > 2:
            return "flowchart"
        elif aspect_ratio < 0.5:
            return "vertical_hierarchy"
        elif 0.8 <= aspect_ratio <= 1.2:
            return "square_diagram"
        else:
            return "standard_diagram"

    def _generate_kernels(self, text_data: Dict[str, Any],
                         structure: Dict[str, Any],
                         source: str,
                         page_num: int = None) -> List[MeaningKernel]:
        """Generate meaning kernels from extracted data."""
        kernels = []

        # Create base ID
        base_id = f"kernel_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        if page_num:
            base_id += f"_p{page_num}"

        # 1. Text-based kernel
        if text_data['full_text'].strip():
            text_kernel = MeaningKernel(
                kernel_id=f"{base_id}_text",
                content=text_data['full_text'],
                source=source,
                confidence=text_data['confidence'] / 100.0,  # Normalize to 0-1
                metadata={
                    "type": "text_extraction",
                    "word_count": len(text_data['words']),
                    "line_count": len(text_data['lines']),
                    "structure": structure
                }
            )
            kernels.append(text_kernel)

        # 2. Structure-based kernel
        structure_content = f"Diagram type: {structure['estimated_diagram_type']}. "
        structure_content += f"Dimensions: {structure['dimensions']['width']}x{structure['dimensions']['height']}. "
        structure_content += f"Aspect ratio: {structure['aspect_ratio']:.2f}. "
        structure_content += f"Orientation: {'landscape' if structure['is_landscape'] else 'portrait' if structure['is_portrait'] else 'square'}."

        structure_kernel = MeaningKernel(
            kernel_id=f"{base_id}_structure",
            content=structure_content,
            source=source,
            confidence=0.8,  # High confidence for structure analysis
            metadata={
                "type": "structure_analysis",
                "analysis": structure
            }
        )
        kernels.append(structure_kernel)

        # 3. Summary kernel (combines text and structure)
        if text_data['full_text'].strip():
            summary = f"Research diagram analysis: {structure['estimated_diagram_type']} with text content. "
            summary += f"Key elements: {text_data['full_text'][:200]}..."

            summary_kernel = MeaningKernel(
                kernel_id=f"{base_id}_summary",
                content=summary,
                source=source,
                confidence=0.7,
                metadata={
                    "type": "summary",
                    "text_length": len(text_data['full_text']),
                    "structure_type": structure['estimated_diagram_type']
                }
            )
            kernels.append(summary_kernel)

        # Add to internal list
        self.kernels.extend(kernels)

        return kernels

    def _save_kernels(self, kernels: List[MeaningKernel], output_path: Path):
        """Save kernels to files."""
        if not kernels:
            print("No kernels to save")
            return

        # Save as JSON
        json_path = output_path / "meaning_kernels.json"
        kernels_data = [k.to_dict() for k in kernels]

        with open(json_path, 'w') as f:
            json.dump(kernels_data, f, indent=2)

        # Save as Markdown for readability
        md_path = output_path / "meaning_kernels.md"
        with open(md_path, 'w') as f:
            f.write(f"# Meaning Kernels Extraction Report\n")
            f.write(f"Generated: {datetime.now().isoformat()}\n")
            f.write(f"Total kernels: {len(kernels)}\n\n")

            for kernel in kernels:
                f.write(f"## Kernel: {kernel.kernel_id}\n")
                f.write(f"- **Source**: {kernel.source}\n")
                f.write(f"- **Confidence**: {kernel.confidence:.2f}\n")
                f.write(f"- **Timestamp**: {kernel.timestamp}\n")
                f.write(f"- **Hash**: {kernel.hash}\n")
                f.write(f"- **Content**: {kernel.content}\n")
                f.write(f"- **Metadata**: {json.dumps(kernel.metadata, indent=2)}\n\n")

        # Save statistics
        stats_path = output_path / "extraction_stats.json"
        with open(stats_path, 'w') as f:
            json.dump(self.stats, f, indent=2)

        print(f"Saved {len(kernels)} kernels to {output_path}")
        print(f"  - JSON: {json_path}")
        print(f"  - Markdown: {md_path}")
        print(f"  - Statistics: {stats_path}")

    def get_stats(self) -> Dict[str, Any]:
        """Get processing statistics."""
        return self.stats.copy()

def main():
    """Command line interface for the pipeline."""
    parser = argparse.ArgumentParser(description="Extract meaning kernels from research diagrams")
    parser.add_argument("input", help="Input PDF or image file/directory")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument("-c", "--config", help="Configuration file (JSON)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")

    args = parser.parse_args()

    # Load config if provided
    config = {}
    if args.config:
        with open(args.config) as f:
            config = json.load(f)

    # Create processor
    processor = DiagramProcessor(config)

    # Process input
    input_path = Path(args.input)

    if input_path.is_file():
        if input_path.suffix.lower() == '.pdf':
            kernels = processor.extract_from_pdf(input_path, args.output)
        elif input_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.tiff', '.bmp']:
            kernels = processor.extract_from_image(input_path)
        else:
            print(f"Unsupported file type: {input_path.suffix}")
            sys.exit(1)
    elif input_path.is_dir():
        # Process all PDFs and images in directory
        all_kernels = []
        for file_path in input_path.iterdir():
            if file_path.suffix.lower() == '.pdf':
                kernels = processor.extract_from_pdf(file_path, args.output)
                all_kernels.extend(kernels)
            elif file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.tiff', '.bmp']:
                kernels = processor.extract_from_image(file_path)
                all_kernels.extend(kernels)
    else:
        print(f"Input not found: {input_path}")
        sys.exit(1)

    # Print summary
    stats = processor.get_stats()
    print("\n" + "="*50)
    print("EXTRACTION SUMMARY")
    print("="*50)
    print(f"Pages processed: {stats['pages_processed']}")
    print(f"Diagrams found: {stats['diagrams_found']}")
    print(f"Kernels extracted: {stats['kernels_extracted']}")
    print(f"Errors: {stats['errors']}")
    print("="*50)

    # Exit with appropriate code
    sys.exit(0 if stats['errors'] == 0 else 1)

if __name__ == "__main__":
    main()