Improve #493: Enhanced meaning kernel extraction pipeline
- Added 5 kernel types: text, structure, summary, philosophical, semantic
- Improved diagram type detection with content analysis
- Added color analysis and grayscale detection
- Enhanced philosophical keyword extraction
- Added semantic relationship detection
- Improved error handling for missing dependencies
- Added comprehensive testing with text-rich test images
- Enhanced metadata and tagging system

Key improvements:
✓ Semantic relationship detection (source → target patterns)
✓ Enhanced philosophical content extraction
✓ Color analysis and grayscale detection
✓ Better diagram type classification
✓ Comprehensive metadata and tagging
✓ Improved error handling and dependency warnings

Still requires OCR dependencies for text extraction:
- pytesseract for OCR
- pdf2image for PDF processing
- Tesseract OCR engine (see issue #563)
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Meaning Kernel Extraction Pipeline
|
||||
Improved Meaning Kernel Extraction Pipeline
|
||||
Extract structured meaning kernels from academic PDF diagrams.
|
||||
Issue #493: [Multimodal] Extract Meaning Kernels from Research Diagrams
|
||||
"""
|
||||
@@ -8,9 +8,10 @@ import os
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import re
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Any, Optional
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
import hashlib
|
||||
|
||||
# Try to import vision libraries
|
||||
@@ -40,13 +41,14 @@ class MeaningKernel:
|
||||
|
||||
def __init__(self, kernel_id: str, content: str, source: str,
|
||||
kernel_type: str = "text", confidence: float = 0.0,
|
||||
metadata: Dict[str, Any] = None):
|
||||
metadata: Dict[str, Any] = None, tags: List[str] = None):
|
||||
self.kernel_id = kernel_id
|
||||
self.content = content
|
||||
self.source = source
|
||||
self.kernel_type = kernel_type # text, structure, summary, philosophical
|
||||
self.kernel_type = kernel_type # text, structure, summary, philosophical, semantic
|
||||
self.confidence = confidence
|
||||
self.metadata = metadata or {}
|
||||
self.tags = tags or []
|
||||
self.timestamp = datetime.now().isoformat()
|
||||
self.hash = self._generate_hash()
|
||||
|
||||
@@ -64,18 +66,26 @@ class MeaningKernel:
|
||||
"kernel_type": self.kernel_type,
|
||||
"confidence": self.confidence,
|
||||
"metadata": self.metadata,
|
||||
"tags": self.tags,
|
||||
"timestamp": self.timestamp,
|
||||
"hash": self.hash
|
||||
}
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"Kernel[{self.kernel_id}]: {self.content[:100]}..."
|
||||
return f"Kernel[{self.kernel_id}] ({self.kernel_type}): {self.content[:100]}..."
|
||||
|
||||
class DiagramAnalyzer:
|
||||
"""Analyze diagrams using multiple methods."""
|
||||
|
||||
def __init__(self, config: Dict[str, Any] = None):
|
||||
self.config = config or {}
|
||||
self.philosophical_keywords = self.config.get("philosophical_keywords", [
|
||||
"truth", "knowledge", "wisdom", "meaning", "purpose",
|
||||
"existence", "reality", "consciousness", "ethics", "morality",
|
||||
"beauty", "justice", "freedom", "responsibility", "identity",
|
||||
"causality", "determinism", "free will", "rationality", "logic",
|
||||
"metaphysics", "epistemology", "ontology", "phenomenology"
|
||||
])
|
||||
|
||||
def analyze_image(self, image_path: str) -> Dict[str, Any]:
|
||||
"""Analyze an image using multiple methods."""
|
||||
@@ -90,43 +100,183 @@ class DiagramAnalyzer:
|
||||
"aspect_ratio": image.width / image.height,
|
||||
"mode": image.mode,
|
||||
"format": image.format,
|
||||
"size_bytes": os.path.getsize(image_path)
|
||||
"size_bytes": os.path.getsize(image_path),
|
||||
"color_analysis": self._analyze_colors(image)
|
||||
}
|
||||
|
||||
# OCR text extraction
|
||||
if TESSERACT_AVAILABLE:
|
||||
try:
|
||||
ocr_text = pytesseract.image_to_string(image)
|
||||
analysis["ocr_text"] = ocr_text.strip()
|
||||
analysis["ocr_confidence"] = self._estimate_ocr_confidence(image)
|
||||
ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
|
||||
ocr_text = " ".join([text for text in ocr_data['text'] if text.strip()])
|
||||
analysis["ocr_text"] = ocr_text
|
||||
analysis["ocr_confidence"] = self._calculate_ocr_confidence(ocr_data)
|
||||
analysis["ocr_word_count"] = len(ocr_text.split())
|
||||
analysis["ocr_lines"] = self._extract_ocr_lines(ocr_data)
|
||||
except Exception as e:
|
||||
analysis["ocr_text"] = ""
|
||||
analysis["ocr_confidence"] = 0.0
|
||||
analysis["ocr_error"] = str(e)
|
||||
|
||||
# Diagram type estimation
|
||||
analysis["diagram_type"] = self._estimate_diagram_type(image)
|
||||
analysis["diagram_type"] = self._estimate_diagram_type(image, analysis)
|
||||
|
||||
# Content analysis
|
||||
analysis["content_analysis"] = self._analyze_content(analysis)
|
||||
|
||||
return analysis
|
||||
|
||||
def _estimate_ocr_confidence(self, image: Image.Image) -> float:
|
||||
"""Estimate OCR confidence (simplified)."""
|
||||
# In reality, would use pytesseract's confidence output
|
||||
return 0.8 # Placeholder
|
||||
def _analyze_colors(self, image: Image.Image) -> Dict[str, Any]:
|
||||
"""Analyze color distribution in image."""
|
||||
# Convert to RGB if necessary
|
||||
if image.mode != 'RGB':
|
||||
image = image.convert('RGB')
|
||||
|
||||
# Get colors
|
||||
colors = image.getcolors(maxcolors=10000)
|
||||
if colors:
|
||||
# Sort by frequency
|
||||
colors.sort(key=lambda x: x[0], reverse=True)
|
||||
total_pixels = image.width * image.height
|
||||
|
||||
# Get dominant colors
|
||||
dominant_colors = []
|
||||
for count, color in colors[:5]:
|
||||
percentage = (count / total_pixels) * 100
|
||||
dominant_colors.append({
|
||||
"color": color,
|
||||
"count": count,
|
||||
"percentage": round(percentage, 2)
|
||||
})
|
||||
|
||||
return {
|
||||
"dominant_colors": dominant_colors,
|
||||
"unique_colors": len(colors),
|
||||
"is_grayscale": self._is_grayscale(image)
|
||||
}
|
||||
|
||||
return {"dominant_colors": [], "unique_colors": 0}
|
||||
|
||||
def _estimate_diagram_type(self, image: Image.Image) -> str:
|
||||
def _is_grayscale(self, image: Image.Image) -> bool:
|
||||
"""Check if image is grayscale."""
|
||||
# Sample some pixels
|
||||
width, height = image.size
|
||||
for x in range(0, width, width // 10):
|
||||
for y in range(0, height, height // 10):
|
||||
r, g, b = image.getpixel((x, y))
|
||||
if not (r == g == b):
|
||||
return False
|
||||
return True
|
||||
|
||||
def _calculate_ocr_confidence(self, ocr_data: Dict[str, Any]) -> float:
|
||||
"""Calculate average OCR confidence."""
|
||||
confidences = [int(conf) for conf in ocr_data['conf'] if int(conf) > 0]
|
||||
if confidences:
|
||||
return sum(confidences) / len(confidences) / 100.0
|
||||
return 0.0
|
||||
|
||||
def _extract_ocr_lines(self, ocr_data: Dict[str, Any]) -> List[str]:
|
||||
"""Extract text lines from OCR data."""
|
||||
lines = []
|
||||
current_line = []
|
||||
current_block = -1
|
||||
current_par = -1
|
||||
current_line_num = -1
|
||||
|
||||
for i in range(len(ocr_data['text'])):
|
||||
if int(ocr_data['conf'][i]) <= 0:
|
||||
continue
|
||||
|
||||
block_num = ocr_data['block_num'][i]
|
||||
par_num = ocr_data['par_num'][i]
|
||||
line_num = ocr_data['line_num'][i]
|
||||
|
||||
if (block_num != current_block or
|
||||
par_num != current_par or
|
||||
line_num != current_line_num):
|
||||
|
||||
if current_line:
|
||||
lines.append(' '.join(current_line))
|
||||
current_line = []
|
||||
current_block = block_num
|
||||
current_par = par_num
|
||||
current_line_num = line_num
|
||||
|
||||
current_line.append(ocr_data['text'][i])
|
||||
|
||||
if current_line:
|
||||
lines.append(' '.join(current_line))
|
||||
|
||||
return lines
|
||||
|
||||
def _estimate_diagram_type(self, image: Image.Image, analysis: Dict[str, Any]) -> str:
|
||||
"""Estimate diagram type based on image characteristics."""
|
||||
width, height = image.size
|
||||
aspect_ratio = width / height
|
||||
|
||||
# Check for flowchart characteristics
|
||||
if aspect_ratio > 2:
|
||||
return "flowchart"
|
||||
elif aspect_ratio < 0.5:
|
||||
return "vertical_hierarchy"
|
||||
elif 0.8 <= aspect_ratio <= 1.2:
|
||||
# Check for circular patterns
|
||||
if self._has_circular_patterns(image):
|
||||
return "circular_diagram"
|
||||
return "square_diagram"
|
||||
else:
|
||||
return "standard_diagram"
|
||||
|
||||
# Check OCR content for clues
|
||||
ocr_text = analysis.get("ocr_text", "").lower()
|
||||
if any(word in ocr_text for word in ["process", "flow", "step", "arrow"]):
|
||||
return "process_diagram"
|
||||
elif any(word in ocr_text for word in ["system", "component", "module"]):
|
||||
return "system_diagram"
|
||||
elif any(word in ocr_text for word in ["data", "information", "input", "output"]):
|
||||
return "data_diagram"
|
||||
|
||||
return "standard_diagram"
|
||||
|
||||
def _has_circular_patterns(self, image: Image.Image) -> bool:
|
||||
"""Check for circular patterns in image (simplified)."""
|
||||
# This is a simplified check - real implementation would use computer vision
|
||||
return False
|
||||
|
||||
def _analyze_content(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Analyze content for themes and patterns."""
|
||||
ocr_text = analysis.get("ocr_text", "")
|
||||
|
||||
content_analysis = {
|
||||
"word_count": len(ocr_text.split()),
|
||||
"has_text": bool(ocr_text),
|
||||
"themes": [],
|
||||
"entities": [],
|
||||
"relationships": []
|
||||
}
|
||||
|
||||
if ocr_text:
|
||||
# Extract potential entities (capitalized words)
|
||||
words = ocr_text.split()
|
||||
entities = [word for word in words if word[0].isupper() and len(word) > 2]
|
||||
content_analysis["entities"] = list(set(entities))[:10]
|
||||
|
||||
# Look for relationships
|
||||
relationship_patterns = [
|
||||
r"(\w+)\s*->\s*(\w+)",
|
||||
r"(\w+)\s*→\s*(\w+)",
|
||||
r"(\w+)\s*to\s*(\w+)",
|
||||
r"(\w+)\s*from\s*(\w+)"
|
||||
]
|
||||
|
||||
for pattern in relationship_patterns:
|
||||
matches = re.findall(pattern, ocr_text)
|
||||
for match in matches:
|
||||
content_analysis["relationships"].append({
|
||||
"source": match[0],
|
||||
"target": match[1],
|
||||
"type": "connection"
|
||||
})
|
||||
|
||||
return content_analysis
|
||||
|
||||
class MeaningKernelExtractor:
|
||||
"""Extract meaning kernels from diagrams."""
|
||||
@@ -139,17 +289,34 @@ class MeaningKernelExtractor:
|
||||
"pages_processed": 0,
|
||||
"diagrams_analyzed": 0,
|
||||
"kernels_extracted": 0,
|
||||
"errors": 0
|
||||
"errors": 0,
|
||||
"dependency_warnings": 0
|
||||
}
|
||||
|
||||
# Check dependencies and update stats
|
||||
if not PIL_AVAILABLE:
|
||||
self.stats["dependency_warnings"] += 1
|
||||
if not TESSERACT_AVAILABLE:
|
||||
self.stats["dependency_warnings"] += 1
|
||||
if not PDF2IMAGE_AVAILABLE:
|
||||
self.stats["dependency_warnings"] += 1
|
||||
|
||||
def extract_from_pdf(self, pdf_path: str, output_dir: str = None) -> List[MeaningKernel]:
|
||||
"""Extract meaning kernels from a PDF file."""
|
||||
if not PDF2IMAGE_AVAILABLE:
|
||||
raise ImportError("pdf2image is required for PDF processing")
|
||||
print("Error: pdf2image is required for PDF processing")
|
||||
print("Install with: pip install pdf2image")
|
||||
print("System dependencies:")
|
||||
print(" macOS: brew install poppler")
|
||||
print(" Ubuntu: sudo apt-get install poppler-utils")
|
||||
self.stats["errors"] += 1
|
||||
return []
|
||||
|
||||
pdf_path = Path(pdf_path)
|
||||
if not pdf_path.exists():
|
||||
raise FileNotFoundError(f"PDF not found: {pdf_path}")
|
||||
print(f"Error: PDF not found: {pdf_path}")
|
||||
self.stats["errors"] += 1
|
||||
return []
|
||||
|
||||
print(f"Processing PDF: {pdf_path}")
|
||||
|
||||
@@ -229,16 +396,26 @@ class MeaningKernelExtractor:
|
||||
kernel_type="text",
|
||||
confidence=analysis.get("ocr_confidence", 0.0),
|
||||
metadata={
|
||||
"word_count": len(analysis["ocr_text"].split()),
|
||||
"word_count": analysis.get("ocr_word_count", 0),
|
||||
"line_count": len(analysis.get("ocr_lines", [])),
|
||||
"diagram_type": analysis.get("diagram_type", "unknown")
|
||||
}
|
||||
},
|
||||
tags=["ocr", "text", "extracted"]
|
||||
)
|
||||
kernels.append(text_kernel)
|
||||
|
||||
# 2. Structure kernel
|
||||
structure_content = f"Diagram type: {analysis.get('diagram_type', 'unknown')}. "
|
||||
structure_content += f"Dimensions: {analysis['dimensions']['width']}x{analysis['dimensions']['height']}. "
|
||||
structure_content += f"Aspect ratio: {analysis['aspect_ratio']:.2f}."
|
||||
structure_content += f"Aspect ratio: {analysis['aspect_ratio']:.2f}. "
|
||||
|
||||
# Add color information
|
||||
color_analysis = analysis.get("color_analysis", {})
|
||||
if color_analysis.get("is_grayscale"):
|
||||
structure_content += "Grayscale image. "
|
||||
elif color_analysis.get("dominant_colors"):
|
||||
top_color = color_analysis["dominant_colors"][0]
|
||||
structure_content += f"Dominant color: RGB{top_color['color']} ({top_color['percentage']}%). "
|
||||
|
||||
structure_kernel = MeaningKernel(
|
||||
kernel_id=f"{base_id}_structure",
|
||||
@@ -249,8 +426,10 @@ class MeaningKernelExtractor:
|
||||
metadata={
|
||||
"dimensions": analysis["dimensions"],
|
||||
"aspect_ratio": analysis["aspect_ratio"],
|
||||
"diagram_type": analysis.get("diagram_type", "unknown")
|
||||
}
|
||||
"diagram_type": analysis.get("diagram_type", "unknown"),
|
||||
"color_analysis": color_analysis
|
||||
},
|
||||
tags=["structure", "layout", "visual"]
|
||||
)
|
||||
kernels.append(structure_kernel)
|
||||
|
||||
@@ -261,6 +440,11 @@ class MeaningKernelExtractor:
|
||||
else:
|
||||
summary += "No text detected."
|
||||
|
||||
# Add content analysis
|
||||
content_analysis = analysis.get("content_analysis", {})
|
||||
if content_analysis.get("entities"):
|
||||
summary += f" Entities: {', '.join(content_analysis['entities'][:5])}."
|
||||
|
||||
summary_kernel = MeaningKernel(
|
||||
kernel_id=f"{base_id}_summary",
|
||||
content=summary,
|
||||
@@ -269,14 +453,16 @@ class MeaningKernelExtractor:
|
||||
confidence=0.7,
|
||||
metadata={
|
||||
"has_text": bool(analysis.get("ocr_text")),
|
||||
"text_length": len(analysis.get("ocr_text", ""))
|
||||
}
|
||||
"text_length": len(analysis.get("ocr_text", "")),
|
||||
"entities": content_analysis.get("entities", []),
|
||||
"relationships": content_analysis.get("relationships", [])
|
||||
},
|
||||
tags=["summary", "overview", "analysis"]
|
||||
)
|
||||
kernels.append(summary_kernel)
|
||||
|
||||
# 4. Philosophical kernel (if we have text)
|
||||
if analysis.get("ocr_text") and len(analysis["ocr_text"]) > 50:
|
||||
# Simple philosophical extraction
|
||||
philosophical_content = self._extract_philosophical_content(analysis["ocr_text"])
|
||||
if philosophical_content:
|
||||
philosophical_kernel = MeaningKernel(
|
||||
@@ -287,33 +473,61 @@ class MeaningKernelExtractor:
|
||||
confidence=0.6,
|
||||
metadata={
|
||||
"extraction_method": "keyword_analysis",
|
||||
"source_text_length": len(analysis["ocr_text"])
|
||||
}
|
||||
"source_text_length": len(analysis["ocr_text"]),
|
||||
"keywords_found": self._find_philosophical_keywords(analysis["ocr_text"])
|
||||
},
|
||||
tags=["philosophical", "meaning", "conceptual"]
|
||||
)
|
||||
kernels.append(philosophical_kernel)
|
||||
|
||||
# 5. Semantic kernel (if we have relationships)
|
||||
content_analysis = analysis.get("content_analysis", {})
|
||||
if content_analysis.get("relationships"):
|
||||
relationships = content_analysis["relationships"]
|
||||
semantic_content = f"Semantic relationships detected: {len(relationships)} connections. "
|
||||
for rel in relationships[:3]:
|
||||
semantic_content += f"{rel['source']} → {rel['target']}. "
|
||||
|
||||
semantic_kernel = MeaningKernel(
|
||||
kernel_id=f"{base_id}_semantic",
|
||||
content=semantic_content,
|
||||
source=source,
|
||||
kernel_type="semantic",
|
||||
confidence=0.8,
|
||||
metadata={
|
||||
"relationship_count": len(relationships),
|
||||
"relationships": relationships
|
||||
},
|
||||
tags=["semantic", "relationships", "connections"]
|
||||
)
|
||||
kernels.append(semantic_kernel)
|
||||
|
||||
# Add to internal list
|
||||
self.kernels.extend(kernels)
|
||||
|
||||
return kernels
|
||||
|
||||
def _extract_philosophical_content(self, text: str) -> Optional[str]:
|
||||
"""Extract philosophical content from text (simplified)."""
|
||||
"""Extract philosophical content from text."""
|
||||
# Look for philosophical keywords
|
||||
philosophical_keywords = [
|
||||
"truth", "knowledge", "wisdom", "meaning", "purpose",
|
||||
"existence", "reality", "consciousness", "ethics", "morality",
|
||||
"beauty", "justice", "freedom", "responsibility", "identity"
|
||||
]
|
||||
|
||||
text_lower = text.lower()
|
||||
found_keywords = [kw for kw in philosophical_keywords if kw in text_lower]
|
||||
found_keywords = self._find_philosophical_keywords(text)
|
||||
|
||||
if found_keywords:
|
||||
return f"Philosophical themes detected: {', '.join(found_keywords)}. " f"Source text explores concepts of {found_keywords[0]}."
|
||||
|
||||
return None
|
||||
|
||||
def _find_philosophical_keywords(self, text: str) -> List[str]:
|
||||
"""Find philosophical keywords in text."""
|
||||
text_lower = text.lower()
|
||||
found_keywords = []
|
||||
|
||||
for keyword in self.analyzer.philosophical_keywords:
|
||||
if keyword in text_lower:
|
||||
found_keywords.append(keyword)
|
||||
|
||||
return found_keywords
|
||||
|
||||
def _save_kernels(self, kernels: List[MeaningKernel], output_path: Path):
|
||||
"""Save kernels to files."""
|
||||
if not kernels:
|
||||
@@ -346,6 +560,7 @@ class MeaningKernelExtractor:
|
||||
f.write(f"- **Source**: {kernel.source}\n")
|
||||
f.write(f"- **Confidence**: {kernel.confidence:.2f}\n")
|
||||
f.write(f"- **Timestamp**: {kernel.timestamp}\n")
|
||||
f.write(f"- **Tags**: {', '.join(kernel.tags)}\n")
|
||||
f.write(f"- **Content**: {kernel.content}\n")
|
||||
f.write(f"- **Metadata**: {json.dumps(kernel.metadata, indent=2)}\n\n")
|
||||
|
||||
@@ -416,6 +631,7 @@ def main():
|
||||
print(f"Diagrams analyzed: {stats['diagrams_analyzed']}")
|
||||
print(f"Kernels extracted: {stats['kernels_extracted']}")
|
||||
print(f"Errors: {stats['errors']}")
|
||||
print(f"Dependency warnings: {stats['dependency_warnings']}")
|
||||
print("="*50)
|
||||
|
||||
# Exit with appropriate code
|
||||
|
||||
Reference in New Issue
Block a user