Compare commits
3 Commits
timmy/483-
...
dispatch/4
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
efdc0dc886 | ||
|
|
5e09b49de8 | ||
|
|
69cca2d7a0 |
180
.gitea/workflows/playwright-install.yml
Normal file
180
.gitea/workflows/playwright-install.yml
Normal file
@@ -0,0 +1,180 @@
|
||||
# playwright-install.yml — Install Playwright for visual smoke tests
|
||||
# Refs: Issue #561, PR #558 (nexus_smoke_test.py)
|
||||
#
|
||||
# Installs Playwright and Chromium for visual smoke tests.
|
||||
# Can be reused by other workflows that need browser automation.
|
||||
|
||||
name: Install Playwright
|
||||
|
||||
on:
|
||||
# Run when called by other workflows
|
||||
workflow_call:
|
||||
inputs:
|
||||
install_chromium:
|
||||
description: 'Install Chromium browser'
|
||||
required: false
|
||||
default: true
|
||||
type: boolean
|
||||
install_deps:
|
||||
description: 'Install system dependencies'
|
||||
required: false
|
||||
default: true
|
||||
type: boolean
|
||||
|
||||
# Run on push to main when Playwright files change
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'scripts/**/nexus_smoke_test.py'
|
||||
- 'scripts/**/*playwright*'
|
||||
- '.gitea/workflows/playwright-install.yml'
|
||||
|
||||
# Run on PRs that touch Playwright files
|
||||
pull_request:
|
||||
paths:
|
||||
- 'scripts/**/nexus_smoke_test.py'
|
||||
- 'scripts/**/*playwright*'
|
||||
- '.gitea/workflows/playwright-install.yml'
|
||||
|
||||
jobs:
|
||||
install-playwright:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
cache: 'pip'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install playwright
|
||||
|
||||
- name: Install Playwright browsers
|
||||
if: inputs.install_chromium
|
||||
run: |
|
||||
playwright install chromium
|
||||
playwright install-deps chromium
|
||||
|
||||
- name: Install system dependencies
|
||||
if: inputs.install_deps
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y \
|
||||
libnss3 \
|
||||
libnspr4 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
libcups2 \
|
||||
libdrm2 \
|
||||
libxkbcommon0 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
libgbm1 \
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
libasound2 \
|
||||
libatspi2.0-0 \
|
||||
libwayland-client0
|
||||
|
||||
- name: Verify Playwright installation
|
||||
run: |
|
||||
python -c "import playwright; print(f'Playwright version: {playwright.__version__}')"
|
||||
python -c "from playwright.sync_api import sync_playwright; print('Playwright API imported successfully')"
|
||||
playwright --version
|
||||
|
||||
- name: Test Chromium launch
|
||||
if: inputs.install_chromium
|
||||
run: |
|
||||
python -c "
|
||||
from playwright.sync_api import sync_playwright
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch()
|
||||
page = browser.new_page()
|
||||
page.goto('data:text/html,<h1>Test</h1>')
|
||||
print(f'Page title: {page.title()}')
|
||||
browser.close()
|
||||
print('Chromium launched and closed successfully')
|
||||
"
|
||||
|
||||
- name: Cache Playwright browsers
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/ms-playwright
|
||||
key: ${{ runner.os }}-playwright-${{ hashFiles('**/playwright-install.yml') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-playwright-
|
||||
|
||||
- name: Output installation info
|
||||
run: |
|
||||
echo "Playwright installation completed"
|
||||
echo "Python version: $(python --version)"
|
||||
echo "Playwright version: $(playwright --version)"
|
||||
echo "Cache directory: ~/.cache/ms-playwright"
|
||||
if [ -d ~/.cache/ms-playwright ]; then
|
||||
echo "Cached browsers:"
|
||||
ls -la ~/.cache/ms-playwright
|
||||
fi
|
||||
|
||||
# Job to test Nexus smoke test with Playwright
|
||||
test-nexus-smoke:
|
||||
needs: install-playwright
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
cache: 'pip'
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install playwright Pillow
|
||||
playwright install chromium
|
||||
playwright install-deps chromium
|
||||
|
||||
- name: Test Nexus smoke test script
|
||||
run: |
|
||||
if [ -f "scripts/nexus_smoke_test.py" ]; then
|
||||
echo "Testing nexus_smoke_test.py..."
|
||||
python scripts/nexus_smoke_test.py --help
|
||||
echo "Script is executable"
|
||||
else
|
||||
echo "nexus_smoke_test.py not found, skipping test"
|
||||
fi
|
||||
|
||||
- name: Test Playwright integration
|
||||
run: |
|
||||
python -c "
|
||||
import sys
|
||||
sys.path.insert(0, 'scripts')
|
||||
try:
|
||||
# Try to import the smoke test module
|
||||
from nexus_smoke_test import NexusSmokeTest
|
||||
print('Successfully imported NexusSmokeTest')
|
||||
|
||||
# Test Playwright initialization
|
||||
test = NexusSmokeTest()
|
||||
print('NexusSmokeTest initialized successfully')
|
||||
except ImportError as e:
|
||||
print(f'Import error: {e}')
|
||||
print('This is expected if nexus_smoke_test.py does not exist yet')
|
||||
except Exception as e:
|
||||
print(f'Error: {e}')
|
||||
"
|
||||
185
docs/playwright-ci-installation.md
Normal file
185
docs/playwright-ci-installation.md
Normal file
@@ -0,0 +1,185 @@
|
||||
# Playwright CI Installation
|
||||
|
||||
## Issue #561: [CI] Install Playwright in CI for Nexus visual smoke tests
|
||||
|
||||
## Problem
|
||||
|
||||
The visual smoke test (`nexus_smoke_test.py`, PR #558) supports Playwright for screenshot capture with JS error detection, but CI runners don't have Playwright installed.
|
||||
|
||||
## Solution
|
||||
|
||||
Created a reusable CI workflow that installs Playwright and Chromium for visual smoke tests.
|
||||
|
||||
## Workflow: `.gitea/workflows/playwright-install.yml`
|
||||
|
||||
### Features
|
||||
|
||||
1. **Reusable Workflow**: Can be called by other workflows using `workflow_call`
|
||||
2. **Conditional Installation**: Options to install Chromium and system dependencies
|
||||
3. **Caching**: Caches Playwright browsers for faster subsequent runs
|
||||
4. **Verification**: Tests Playwright installation and Chromium launch
|
||||
5. **Integration Testing**: Tests Nexus smoke test script with Playwright
|
||||
|
||||
### Usage
|
||||
|
||||
#### 1. Call from Another Workflow
|
||||
|
||||
```yaml
|
||||
jobs:
|
||||
visual-tests:
|
||||
uses: ./.gitea/workflows/playwright-install.yml
|
||||
with:
|
||||
install_chromium: true
|
||||
install_deps: true
|
||||
```
|
||||
|
||||
#### 2. Run Standalone
|
||||
|
||||
```bash
|
||||
# Trigger manually
|
||||
gitea-cli workflow run playwright-install.yml
|
||||
|
||||
# Or push to trigger
|
||||
git push origin main
|
||||
```
|
||||
|
||||
#### 3. Use in PR Checks
|
||||
|
||||
```yaml
|
||||
name: Visual Smoke Tests
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- 'scripts/**/nexus_smoke_test.py'
|
||||
|
||||
jobs:
|
||||
smoke-test:
|
||||
uses: ./.gitea/workflows/playwright-install.yml
|
||||
with:
|
||||
install_chromium: true
|
||||
|
||||
run-smoke-test:
|
||||
needs: smoke-test
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: ./.gitea/workflows/playwright-install.yml
|
||||
- name: Run smoke test
|
||||
run: python scripts/nexus_smoke_test.py --programmatic
|
||||
```
|
||||
|
||||
## Installation Details
|
||||
|
||||
### Python Dependencies
|
||||
```bash
|
||||
pip install playwright
|
||||
```
|
||||
|
||||
### Browser Installation
|
||||
```bash
|
||||
playwright install chromium
|
||||
playwright install-deps chromium
|
||||
```
|
||||
|
||||
### System Dependencies
|
||||
- libnss3
|
||||
- libnspr4
|
||||
- libatk1.0-0
|
||||
- libatk-bridge2.0-0
|
||||
- libcups2
|
||||
- libdrm2
|
||||
- libxkbcommon0
|
||||
- libxcomposite1
|
||||
- libxdamage1
|
||||
- libxfixes3
|
||||
- libxrandr2
|
||||
- libgbm1
|
||||
- libpango-1.0-0
|
||||
- libcairo2
|
||||
- libasound2
|
||||
- libatspi2.0-0
|
||||
- libwayland-client0
|
||||
|
||||
## Benefits
|
||||
|
||||
### With Playwright
|
||||
- Full headless Chromium browser
|
||||
- JS console error detection
|
||||
- Three.js scene verification
|
||||
- Network idle wait for SPA rendering
|
||||
- Dynamic content capture
|
||||
- `page.evaluate()` for custom checks
|
||||
|
||||
### Without Playwright (fallback)
|
||||
- wkhtmltoimage (no JS execution)
|
||||
- Limited screenshot capability
|
||||
- No JS error detection
|
||||
- No dynamic content capture
|
||||
|
||||
## Verification
|
||||
|
||||
The workflow includes verification steps:
|
||||
|
||||
1. **Import Test**: Verify Playwright can be imported
|
||||
2. **Version Check**: Confirm Playwright version
|
||||
3. **Launch Test**: Test Chromium browser launch
|
||||
4. **Cache Check**: Verify browser caching
|
||||
|
||||
## Integration with Nexus Smoke Test
|
||||
|
||||
The Nexus smoke test (`nexus_smoke_test.py`) automatically detects Playwright:
|
||||
|
||||
```python
|
||||
def _get_screenshot_backend(self):
|
||||
"""Get the best available screenshot backend."""
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright
|
||||
return "playwright"
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import subprocess
|
||||
subprocess.run(["wkhtmltoimage", "--version"], check=True)
|
||||
return "wkhtmltoimage"
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Playwright not found**: Ensure `pip install playwright` runs before usage
|
||||
2. **Chromium not launching**: Check system dependencies are installed
|
||||
3. **Cache miss**: Verify cache key includes workflow file hash
|
||||
4. **Permission denied**: Ensure `playwright install-deps` runs with sudo
|
||||
|
||||
### Debug Commands
|
||||
|
||||
```bash
|
||||
# Check Playwright installation
|
||||
python -c "import playwright; print(playwright.__version__)"
|
||||
|
||||
# Check Chromium installation
|
||||
playwright --version
|
||||
|
||||
# Test browser launch
|
||||
python -c "
|
||||
from playwright.sync_api import sync_playwright
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch()
|
||||
print('Chromium launched successfully')
|
||||
browser.close()
|
||||
"
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
- Issue #561: [CI] Install Playwright in CI for Nexus visual smoke tests
|
||||
- PR #558: feat: Visual Smoke Test for The Nexus #490
|
||||
- Issue #490: Visual Smoke Test for The Nexus
|
||||
- Playwright Documentation: https://playwright.dev/python/
|
||||
157
scripts/meaning-kernels/README.md
Normal file
157
scripts/meaning-kernels/README.md
Normal file
@@ -0,0 +1,157 @@
|
||||
# Meaning Kernel Extraction Pipeline
|
||||
|
||||
## Issue #493: [Multimodal] Extract Meaning Kernels from Research Diagrams
|
||||
|
||||
## Overview
|
||||
|
||||
This pipeline extracts structured meaning kernels from academic PDF diagrams and images. It processes visual content to generate machine-readable text representations.
|
||||
|
||||
## Features
|
||||
|
||||
- **PDF Processing**: Converts PDF pages to images for analysis
|
||||
- **OCR Text Extraction**: Extracts text from diagrams using Tesseract
|
||||
- **Structure Analysis**: Analyzes diagram type, dimensions, orientation
|
||||
- **Multiple Kernel Types**: Generates text, structure, summary, and philosophical kernels
|
||||
- **Confidence Scoring**: Each kernel includes confidence metrics
|
||||
- **Batch Processing**: Supports single files and directories
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
# Required dependencies
|
||||
pip install Pillow pytesseract pdf2image
|
||||
|
||||
# System dependencies (macOS)
|
||||
brew install tesseract poppler
|
||||
|
||||
# System dependencies (Ubuntu/Debian)
|
||||
sudo apt-get install tesseract-ocr poppler-utils
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Process a single PDF
|
||||
python3 scripts/meaning-kernels/extract_meaning_kernels.py research_paper.pdf
|
||||
|
||||
# Process a single image
|
||||
python3 scripts/meaning-kernels/extract_meaning_kernels.py diagram.png
|
||||
|
||||
# Process a directory
|
||||
python3 scripts/meaning-kernels/extract_meaning_kernels.py /path/to/diagrams/
|
||||
|
||||
# Specify output directory
|
||||
python3 scripts/meaning-kernels/extract_meaning_kernels.py paper.pdf -o ./output
|
||||
|
||||
# Run tests
|
||||
python3 scripts/meaning-kernels/test_extraction.py
|
||||
```
|
||||
|
||||
## Output Structure
|
||||
|
||||
```
|
||||
output_directory/
|
||||
├── page_001.png # Converted page images
|
||||
├── page_002.png
|
||||
├── meaning_kernels.json # Structured kernel data
|
||||
├── meaning_kernels.md # Human-readable report
|
||||
└── extraction_stats.json # Processing statistics
|
||||
```
|
||||
|
||||
## Kernel Types
|
||||
|
||||
### 1. Text Kernels
|
||||
Extracted from OCR processing of diagrams.
|
||||
```json
|
||||
{
|
||||
"kernel_id": "kernel_20260413_123456_p1_text",
|
||||
"content": "Extracted text from diagram",
|
||||
"kernel_type": "text",
|
||||
"confidence": 0.85,
|
||||
"metadata": {
|
||||
"word_count": 42,
|
||||
"diagram_type": "flowchart"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Structure Kernels
|
||||
Diagram structure analysis.
|
||||
```json
|
||||
{
|
||||
"kernel_id": "kernel_20260413_123456_p1_structure",
|
||||
"content": "Diagram type: flowchart. Dimensions: 800x600. Aspect ratio: 1.33.",
|
||||
"kernel_type": "structure",
|
||||
"confidence": 0.9,
|
||||
"metadata": {
|
||||
"dimensions": {"width": 800, "height": 600},
|
||||
"aspect_ratio": 1.33,
|
||||
"diagram_type": "flowchart"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Summary Kernels
|
||||
Combined analysis summary.
|
||||
```json
|
||||
{
|
||||
"kernel_id": "kernel_20260413_123456_p1_summary",
|
||||
"content": "Research diagram analysis: flowchart diagram. Contains text: Input → Processing → Output...",
|
||||
"kernel_type": "summary",
|
||||
"confidence": 0.7,
|
||||
"metadata": {
|
||||
"has_text": true,
|
||||
"text_length": 150
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4. Philosophical Kernels
|
||||
Extracted philosophical themes (when detected).
|
||||
```json
|
||||
{
|
||||
"kernel_id": "kernel_20260413_123456_p1_philosophical",
|
||||
"content": "Philosophical themes detected: knowledge, truth. Source text explores concepts of knowledge.",
|
||||
"kernel_type": "philosophical",
|
||||
"confidence": 0.6,
|
||||
"metadata": {
|
||||
"extraction_method": "keyword_analysis",
|
||||
"source_text_length": 200
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Create a JSON config file:
|
||||
```json
|
||||
{
|
||||
"ocr_confidence_threshold": 50,
|
||||
"min_text_length": 10,
|
||||
"diagram_types": ["flowchart", "hierarchy", "network"],
|
||||
"extract_philosophical": true,
|
||||
"philosophical_keywords": ["truth", "knowledge", "wisdom", "meaning"]
|
||||
}
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- OCR quality depends on diagram clarity
|
||||
- Structure analysis is simplified
|
||||
- Philosophical extraction is keyword-based
|
||||
- Large PDFs can be resource-intensive
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
- Computer vision for diagram element detection
|
||||
- LLM integration for semantic analysis
|
||||
- Specialized processors for different diagram types
|
||||
- Integration with knowledge graphs
|
||||
- API endpoint for web integration
|
||||
|
||||
## Files
|
||||
|
||||
- `extract_meaning_kernels.py` - Main extraction pipeline
|
||||
- `test_extraction.py` - Test script
|
||||
- `requirements.txt` - Python dependencies
|
||||
- `README.md` - This documentation
|
||||
Binary file not shown.
641
scripts/meaning-kernels/extract_meaning_kernels.py
Executable file
641
scripts/meaning-kernels/extract_meaning_kernels.py
Executable file
@@ -0,0 +1,641 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Improved Meaning Kernel Extraction Pipeline
|
||||
Extract structured meaning kernels from academic PDF diagrams.
|
||||
Issue #493: [Multimodal] Extract Meaning Kernels from Research Diagrams
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import re
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
import hashlib
|
||||
|
||||
# Try to import vision libraries
|
||||
try:
|
||||
from PIL import Image
|
||||
PIL_AVAILABLE = True
|
||||
except ImportError:
|
||||
PIL_AVAILABLE = False
|
||||
print("Warning: PIL not available. Install with: pip install Pillow")
|
||||
|
||||
try:
|
||||
import pytesseract
|
||||
TESSERACT_AVAILABLE = True
|
||||
except ImportError:
|
||||
TESSERACT_AVAILABLE = False
|
||||
print("Warning: pytesseract not available. Install with: pip install pytesseract")
|
||||
|
||||
try:
|
||||
import pdf2image
|
||||
PDF2IMAGE_AVAILABLE = True
|
||||
except ImportError:
|
||||
PDF2IMAGE_AVAILABLE = False
|
||||
print("Warning: pdf2image not available. Install with: pip install pdf2image")
|
||||
|
||||
class MeaningKernel:
|
||||
"""Represents an extracted meaning kernel."""
|
||||
|
||||
def __init__(self, kernel_id: str, content: str, source: str,
|
||||
kernel_type: str = "text", confidence: float = 0.0,
|
||||
metadata: Dict[str, Any] = None, tags: List[str] = None):
|
||||
self.kernel_id = kernel_id
|
||||
self.content = content
|
||||
self.source = source
|
||||
self.kernel_type = kernel_type # text, structure, summary, philosophical, semantic
|
||||
self.confidence = confidence
|
||||
self.metadata = metadata or {}
|
||||
self.tags = tags or []
|
||||
self.timestamp = datetime.now().isoformat()
|
||||
self.hash = self._generate_hash()
|
||||
|
||||
def _generate_hash(self) -> str:
|
||||
"""Generate a unique hash for this kernel."""
|
||||
content_str = f"{self.kernel_id}:{self.content}:{self.source}:{self.timestamp}"
|
||||
return hashlib.sha256(content_str.encode()).hexdigest()[:16]
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Convert to dictionary for serialization."""
|
||||
return {
|
||||
"kernel_id": self.kernel_id,
|
||||
"content": self.content,
|
||||
"source": self.source,
|
||||
"kernel_type": self.kernel_type,
|
||||
"confidence": self.confidence,
|
||||
"metadata": self.metadata,
|
||||
"tags": self.tags,
|
||||
"timestamp": self.timestamp,
|
||||
"hash": self.hash
|
||||
}
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"Kernel[{self.kernel_id}] ({self.kernel_type}): {self.content[:100]}..."
|
||||
|
||||
class DiagramAnalyzer:
|
||||
"""Analyze diagrams using multiple methods."""
|
||||
|
||||
def __init__(self, config: Dict[str, Any] = None):
|
||||
self.config = config or {}
|
||||
self.philosophical_keywords = self.config.get("philosophical_keywords", [
|
||||
"truth", "knowledge", "wisdom", "meaning", "purpose",
|
||||
"existence", "reality", "consciousness", "ethics", "morality",
|
||||
"beauty", "justice", "freedom", "responsibility", "identity",
|
||||
"causality", "determinism", "free will", "rationality", "logic",
|
||||
"metaphysics", "epistemology", "ontology", "phenomenology"
|
||||
])
|
||||
|
||||
def analyze_image(self, image_path: str) -> Dict[str, Any]:
|
||||
"""Analyze an image using multiple methods."""
|
||||
if not PIL_AVAILABLE:
|
||||
raise ImportError("PIL is required for image analysis")
|
||||
|
||||
image = Image.open(image_path)
|
||||
|
||||
# Basic image analysis
|
||||
analysis = {
|
||||
"dimensions": {"width": image.width, "height": image.height},
|
||||
"aspect_ratio": image.width / image.height,
|
||||
"mode": image.mode,
|
||||
"format": image.format,
|
||||
"size_bytes": os.path.getsize(image_path),
|
||||
"color_analysis": self._analyze_colors(image)
|
||||
}
|
||||
|
||||
# OCR text extraction
|
||||
if TESSERACT_AVAILABLE:
|
||||
try:
|
||||
ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
|
||||
ocr_text = " ".join([text for text in ocr_data['text'] if text.strip()])
|
||||
analysis["ocr_text"] = ocr_text
|
||||
analysis["ocr_confidence"] = self._calculate_ocr_confidence(ocr_data)
|
||||
analysis["ocr_word_count"] = len(ocr_text.split())
|
||||
analysis["ocr_lines"] = self._extract_ocr_lines(ocr_data)
|
||||
except Exception as e:
|
||||
analysis["ocr_text"] = ""
|
||||
analysis["ocr_confidence"] = 0.0
|
||||
analysis["ocr_error"] = str(e)
|
||||
|
||||
# Diagram type estimation
|
||||
analysis["diagram_type"] = self._estimate_diagram_type(image, analysis)
|
||||
|
||||
# Content analysis
|
||||
analysis["content_analysis"] = self._analyze_content(analysis)
|
||||
|
||||
return analysis
|
||||
|
||||
def _analyze_colors(self, image: Image.Image) -> Dict[str, Any]:
|
||||
"""Analyze color distribution in image."""
|
||||
# Convert to RGB if necessary
|
||||
if image.mode != 'RGB':
|
||||
image = image.convert('RGB')
|
||||
|
||||
# Get colors
|
||||
colors = image.getcolors(maxcolors=10000)
|
||||
if colors:
|
||||
# Sort by frequency
|
||||
colors.sort(key=lambda x: x[0], reverse=True)
|
||||
total_pixels = image.width * image.height
|
||||
|
||||
# Get dominant colors
|
||||
dominant_colors = []
|
||||
for count, color in colors[:5]:
|
||||
percentage = (count / total_pixels) * 100
|
||||
dominant_colors.append({
|
||||
"color": color,
|
||||
"count": count,
|
||||
"percentage": round(percentage, 2)
|
||||
})
|
||||
|
||||
return {
|
||||
"dominant_colors": dominant_colors,
|
||||
"unique_colors": len(colors),
|
||||
"is_grayscale": self._is_grayscale(image)
|
||||
}
|
||||
|
||||
return {"dominant_colors": [], "unique_colors": 0}
|
||||
|
||||
def _is_grayscale(self, image: Image.Image) -> bool:
|
||||
"""Check if image is grayscale."""
|
||||
# Sample some pixels
|
||||
width, height = image.size
|
||||
for x in range(0, width, width // 10):
|
||||
for y in range(0, height, height // 10):
|
||||
r, g, b = image.getpixel((x, y))
|
||||
if not (r == g == b):
|
||||
return False
|
||||
return True
|
||||
|
||||
def _calculate_ocr_confidence(self, ocr_data: Dict[str, Any]) -> float:
|
||||
"""Calculate average OCR confidence."""
|
||||
confidences = [int(conf) for conf in ocr_data['conf'] if int(conf) > 0]
|
||||
if confidences:
|
||||
return sum(confidences) / len(confidences) / 100.0
|
||||
return 0.0
|
||||
|
||||
def _extract_ocr_lines(self, ocr_data: Dict[str, Any]) -> List[str]:
|
||||
"""Extract text lines from OCR data."""
|
||||
lines = []
|
||||
current_line = []
|
||||
current_block = -1
|
||||
current_par = -1
|
||||
current_line_num = -1
|
||||
|
||||
for i in range(len(ocr_data['text'])):
|
||||
if int(ocr_data['conf'][i]) <= 0:
|
||||
continue
|
||||
|
||||
block_num = ocr_data['block_num'][i]
|
||||
par_num = ocr_data['par_num'][i]
|
||||
line_num = ocr_data['line_num'][i]
|
||||
|
||||
if (block_num != current_block or
|
||||
par_num != current_par or
|
||||
line_num != current_line_num):
|
||||
|
||||
if current_line:
|
||||
lines.append(' '.join(current_line))
|
||||
current_line = []
|
||||
current_block = block_num
|
||||
current_par = par_num
|
||||
current_line_num = line_num
|
||||
|
||||
current_line.append(ocr_data['text'][i])
|
||||
|
||||
if current_line:
|
||||
lines.append(' '.join(current_line))
|
||||
|
||||
return lines
|
||||
|
||||
def _estimate_diagram_type(self, image: Image.Image, analysis: Dict[str, Any]) -> str:
|
||||
"""Estimate diagram type based on image characteristics."""
|
||||
width, height = image.size
|
||||
aspect_ratio = width / height
|
||||
|
||||
# Check for flowchart characteristics
|
||||
if aspect_ratio > 2:
|
||||
return "flowchart"
|
||||
elif aspect_ratio < 0.5:
|
||||
return "vertical_hierarchy"
|
||||
elif 0.8 <= aspect_ratio <= 1.2:
|
||||
# Check for circular patterns
|
||||
if self._has_circular_patterns(image):
|
||||
return "circular_diagram"
|
||||
return "square_diagram"
|
||||
|
||||
# Check OCR content for clues
|
||||
ocr_text = analysis.get("ocr_text", "").lower()
|
||||
if any(word in ocr_text for word in ["process", "flow", "step", "arrow"]):
|
||||
return "process_diagram"
|
||||
elif any(word in ocr_text for word in ["system", "component", "module"]):
|
||||
return "system_diagram"
|
||||
elif any(word in ocr_text for word in ["data", "information", "input", "output"]):
|
||||
return "data_diagram"
|
||||
|
||||
return "standard_diagram"
|
||||
|
||||
def _has_circular_patterns(self, image: Image.Image) -> bool:
|
||||
"""Check for circular patterns in image (simplified)."""
|
||||
# This is a simplified check - real implementation would use computer vision
|
||||
return False
|
||||
|
||||
def _analyze_content(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Analyze content for themes and patterns."""
|
||||
ocr_text = analysis.get("ocr_text", "")
|
||||
|
||||
content_analysis = {
|
||||
"word_count": len(ocr_text.split()),
|
||||
"has_text": bool(ocr_text),
|
||||
"themes": [],
|
||||
"entities": [],
|
||||
"relationships": []
|
||||
}
|
||||
|
||||
if ocr_text:
|
||||
# Extract potential entities (capitalized words)
|
||||
words = ocr_text.split()
|
||||
entities = [word for word in words if word[0].isupper() and len(word) > 2]
|
||||
content_analysis["entities"] = list(set(entities))[:10]
|
||||
|
||||
# Look for relationships
|
||||
relationship_patterns = [
|
||||
r"(\w+)\s*->\s*(\w+)",
|
||||
r"(\w+)\s*→\s*(\w+)",
|
||||
r"(\w+)\s*to\s*(\w+)",
|
||||
r"(\w+)\s*from\s*(\w+)"
|
||||
]
|
||||
|
||||
for pattern in relationship_patterns:
|
||||
matches = re.findall(pattern, ocr_text)
|
||||
for match in matches:
|
||||
content_analysis["relationships"].append({
|
||||
"source": match[0],
|
||||
"target": match[1],
|
||||
"type": "connection"
|
||||
})
|
||||
|
||||
return content_analysis
|
||||
|
||||
class MeaningKernelExtractor:
|
||||
"""Extract meaning kernels from diagrams."""
|
||||
|
||||
def __init__(self, config: Dict[str, Any] = None):
|
||||
self.config = config or {}
|
||||
self.analyzer = DiagramAnalyzer(config)
|
||||
self.kernels: List[MeaningKernel] = []
|
||||
self.stats = {
|
||||
"pages_processed": 0,
|
||||
"diagrams_analyzed": 0,
|
||||
"kernels_extracted": 0,
|
||||
"errors": 0,
|
||||
"dependency_warnings": 0
|
||||
}
|
||||
|
||||
# Check dependencies and update stats
|
||||
if not PIL_AVAILABLE:
|
||||
self.stats["dependency_warnings"] += 1
|
||||
if not TESSERACT_AVAILABLE:
|
||||
self.stats["dependency_warnings"] += 1
|
||||
if not PDF2IMAGE_AVAILABLE:
|
||||
self.stats["dependency_warnings"] += 1
|
||||
|
||||
def extract_from_pdf(self, pdf_path: str, output_dir: str = None) -> List[MeaningKernel]:
|
||||
"""Extract meaning kernels from a PDF file."""
|
||||
if not PDF2IMAGE_AVAILABLE:
|
||||
print("Error: pdf2image is required for PDF processing")
|
||||
print("Install with: pip install pdf2image")
|
||||
print("System dependencies:")
|
||||
print(" macOS: brew install poppler")
|
||||
print(" Ubuntu: sudo apt-get install poppler-utils")
|
||||
self.stats["errors"] += 1
|
||||
return []
|
||||
|
||||
pdf_path = Path(pdf_path)
|
||||
if not pdf_path.exists():
|
||||
print(f"Error: PDF not found: {pdf_path}")
|
||||
self.stats["errors"] += 1
|
||||
return []
|
||||
|
||||
print(f"Processing PDF: {pdf_path}")
|
||||
|
||||
# Create output directory
|
||||
if output_dir:
|
||||
output_path = Path(output_dir)
|
||||
else:
|
||||
output_path = pdf_path.parent / f"{pdf_path.stem}_kernels"
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Convert PDF to images
|
||||
try:
|
||||
from pdf2image import convert_from_path
|
||||
images = convert_from_path(pdf_path, dpi=300)
|
||||
print(f"Converted {len(images)} pages to images")
|
||||
except Exception as e:
|
||||
print(f"Error converting PDF: {e}")
|
||||
self.stats["errors"] += 1
|
||||
return []
|
||||
|
||||
# Process each page
|
||||
all_kernels = []
|
||||
for i, image in enumerate(images):
|
||||
page_num = i + 1
|
||||
print(f"Processing page {page_num}/{len(images)}")
|
||||
|
||||
# Save image temporarily
|
||||
temp_image_path = output_path / f"page_{page_num:03d}.png"
|
||||
image.save(temp_image_path)
|
||||
|
||||
# Extract kernels from image
|
||||
page_kernels = self.extract_from_image(temp_image_path, page_num)
|
||||
all_kernels.extend(page_kernels)
|
||||
|
||||
self.stats["pages_processed"] += 1
|
||||
|
||||
# Save all kernels
|
||||
self._save_kernels(all_kernels, output_path)
|
||||
|
||||
return all_kernels
|
||||
|
||||
def extract_from_image(self, image_path: str, page_num: int = None) -> List[MeaningKernel]:
|
||||
"""Extract meaning kernels from an image."""
|
||||
print(f"Processing image: {image_path}")
|
||||
|
||||
# Analyze image
|
||||
try:
|
||||
analysis = self.analyzer.analyze_image(str(image_path))
|
||||
except Exception as e:
|
||||
print(f"Error analyzing image: {e}")
|
||||
self.stats["errors"] += 1
|
||||
return []
|
||||
|
||||
# Generate kernels
|
||||
kernels = self._generate_kernels(analysis, str(image_path), page_num)
|
||||
|
||||
self.stats["diagrams_analyzed"] += 1
|
||||
self.stats["kernels_extracted"] += len(kernels)
|
||||
|
||||
return kernels
|
||||
|
||||
def _generate_kernels(self, analysis: Dict[str, Any], source: str, page_num: Optional[int] = None) -> List["MeaningKernel"]:
    """Generate meaning kernels from an image analysis result.

    Builds up to five kernels: text (when OCR text exists), structure,
    summary, philosophical (when OCR text is long enough and contains
    philosophical keywords) and semantic (when relationships were
    detected). All generated kernels are also appended to
    ``self.kernels`` as a side effect.

    Args:
        analysis: Analyzer output. Must contain 'dimensions' and
            'aspect_ratio' (indexed directly below); all other keys are
            read defensively with ``.get``.
        source: Identifier (image path) recorded on every kernel.
        page_num: 1-based PDF page number, or None for standalone images.

    Returns:
        The list of kernels generated for this analysis.
    """
    kernels = []
    # Hoisted: this was previously fetched twice (summary + semantic).
    content_analysis = analysis.get("content_analysis", {})

    # Base ID shared by every kernel of this analysis; the page suffix
    # keeps kernels from different PDF pages distinguishable.
    base_id = f"kernel_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    # Fixed: was `if page_num:`, which silently dropped the suffix for
    # page number 0.
    if page_num is not None:
        base_id += f"_p{page_num}"

    # 1. Text kernel (from OCR)
    if analysis.get("ocr_text"):
        text_kernel = MeaningKernel(
            kernel_id=f"{base_id}_text",
            content=analysis["ocr_text"],
            source=source,
            kernel_type="text",
            confidence=analysis.get("ocr_confidence", 0.0),
            metadata={
                "word_count": analysis.get("ocr_word_count", 0),
                "line_count": len(analysis.get("ocr_lines", [])),
                "diagram_type": analysis.get("diagram_type", "unknown")
            },
            tags=["ocr", "text", "extracted"]
        )
        kernels.append(text_kernel)

    # 2. Structure kernel: layout, geometry and color facts.
    structure_content = f"Diagram type: {analysis.get('diagram_type', 'unknown')}. "
    structure_content += f"Dimensions: {analysis['dimensions']['width']}x{analysis['dimensions']['height']}. "
    structure_content += f"Aspect ratio: {analysis['aspect_ratio']:.2f}. "

    color_analysis = analysis.get("color_analysis", {})
    if color_analysis.get("is_grayscale"):
        structure_content += "Grayscale image. "
    elif color_analysis.get("dominant_colors"):
        top_color = color_analysis["dominant_colors"][0]
        structure_content += f"Dominant color: RGB{top_color['color']} ({top_color['percentage']}%). "

    structure_kernel = MeaningKernel(
        kernel_id=f"{base_id}_structure",
        content=structure_content,
        source=source,
        kernel_type="structure",
        confidence=0.9,
        metadata={
            "dimensions": analysis["dimensions"],
            "aspect_ratio": analysis["aspect_ratio"],
            "diagram_type": analysis.get("diagram_type", "unknown"),
            "color_analysis": color_analysis
        },
        tags=["structure", "layout", "visual"]
    )
    kernels.append(structure_kernel)

    # 3. Summary kernel: human-readable overview of the analysis.
    summary = f"Research diagram analysis: {analysis.get('diagram_type', 'unknown')} diagram. "
    if analysis.get("ocr_text"):
        summary += f"Contains text: {analysis['ocr_text'][:200]}..."
    else:
        summary += "No text detected."

    if content_analysis.get("entities"):
        summary += f" Entities: {', '.join(content_analysis['entities'][:5])}."

    summary_kernel = MeaningKernel(
        kernel_id=f"{base_id}_summary",
        content=summary,
        source=source,
        kernel_type="summary",
        confidence=0.7,
        metadata={
            "has_text": bool(analysis.get("ocr_text")),
            "text_length": len(analysis.get("ocr_text", "")),
            "entities": content_analysis.get("entities", []),
            "relationships": content_analysis.get("relationships", [])
        },
        tags=["summary", "overview", "analysis"]
    )
    kernels.append(summary_kernel)

    # 4. Philosophical kernel — only when there is enough OCR text for
    # keyword analysis to be meaningful (> 50 chars).
    if analysis.get("ocr_text") and len(analysis["ocr_text"]) > 50:
        philosophical_content = self._extract_philosophical_content(analysis["ocr_text"])
        if philosophical_content:
            philosophical_kernel = MeaningKernel(
                kernel_id=f"{base_id}_philosophical",
                content=philosophical_content,
                source=source,
                kernel_type="philosophical",
                confidence=0.6,
                metadata={
                    "extraction_method": "keyword_analysis",
                    "source_text_length": len(analysis["ocr_text"]),
                    "keywords_found": self._find_philosophical_keywords(analysis["ocr_text"])
                },
                tags=["philosophical", "meaning", "conceptual"]
            )
            kernels.append(philosophical_kernel)

    # 5. Semantic kernel — only when relationships were detected.
    if content_analysis.get("relationships"):
        relationships = content_analysis["relationships"]
        semantic_content = f"Semantic relationships detected: {len(relationships)} connections. "
        for rel in relationships[:3]:
            semantic_content += f"{rel['source']} → {rel['target']}. "

        semantic_kernel = MeaningKernel(
            kernel_id=f"{base_id}_semantic",
            content=semantic_content,
            source=source,
            kernel_type="semantic",
            confidence=0.8,
            metadata={
                "relationship_count": len(relationships),
                "relationships": relationships
            },
            tags=["semantic", "relationships", "connections"]
        )
        kernels.append(semantic_kernel)

    # Keep a running record of everything generated on this instance.
    self.kernels.extend(kernels)

    return kernels
|
||||
|
||||
def _extract_philosophical_content(self, text: str) -> Optional[str]:
|
||||
"""Extract philosophical content from text."""
|
||||
# Look for philosophical keywords
|
||||
found_keywords = self._find_philosophical_keywords(text)
|
||||
|
||||
if found_keywords:
|
||||
return f"Philosophical themes detected: {', '.join(found_keywords)}. " f"Source text explores concepts of {found_keywords[0]}."
|
||||
|
||||
return None
|
||||
|
||||
def _find_philosophical_keywords(self, text: str) -> List[str]:
|
||||
"""Find philosophical keywords in text."""
|
||||
text_lower = text.lower()
|
||||
found_keywords = []
|
||||
|
||||
for keyword in self.analyzer.philosophical_keywords:
|
||||
if keyword in text_lower:
|
||||
found_keywords.append(keyword)
|
||||
|
||||
return found_keywords
|
||||
|
||||
def _save_kernels(self, kernels: List[MeaningKernel], output_path: Path):
|
||||
"""Save kernels to files."""
|
||||
if not kernels:
|
||||
print("No kernels to save")
|
||||
return
|
||||
|
||||
# Save as JSON
|
||||
json_path = output_path / "meaning_kernels.json"
|
||||
kernels_data = [k.to_dict() for k in kernels]
|
||||
|
||||
with open(json_path, 'w') as f:
|
||||
json.dump(kernels_data, f, indent=2)
|
||||
|
||||
# Save as Markdown
|
||||
md_path = output_path / "meaning_kernels.md"
|
||||
with open(md_path, 'w') as f:
|
||||
f.write(f"# Meaning Kernels Extraction Report\n")
|
||||
f.write(f"Generated: {datetime.now().isoformat()}\n")
|
||||
f.write(f"Total kernels: {len(kernels)}\n\n")
|
||||
|
||||
# Group by type
|
||||
by_type = {}
|
||||
for kernel in kernels:
|
||||
by_type.setdefault(kernel.kernel_type, []).append(kernel)
|
||||
|
||||
for kernel_type, type_kernels in by_type.items():
|
||||
f.write(f"## {kernel_type.title()} Kernels ({len(type_kernels)})\n\n")
|
||||
for kernel in type_kernels:
|
||||
f.write(f"### {kernel.kernel_id}\n")
|
||||
f.write(f"- **Source**: {kernel.source}\n")
|
||||
f.write(f"- **Confidence**: {kernel.confidence:.2f}\n")
|
||||
f.write(f"- **Timestamp**: {kernel.timestamp}\n")
|
||||
f.write(f"- **Tags**: {', '.join(kernel.tags)}\n")
|
||||
f.write(f"- **Content**: {kernel.content}\n")
|
||||
f.write(f"- **Metadata**: {json.dumps(kernel.metadata, indent=2)}\n\n")
|
||||
|
||||
# Save statistics
|
||||
stats_path = output_path / "extraction_stats.json"
|
||||
with open(stats_path, 'w') as f:
|
||||
json.dump(self.stats, f, indent=2)
|
||||
|
||||
print(f"Saved {len(kernels)} kernels to {output_path}")
|
||||
print(f" - JSON: {json_path}")
|
||||
print(f" - Markdown: {md_path}")
|
||||
print(f" - Statistics: {stats_path}")
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
    """Return a snapshot copy of the running extraction statistics
    (mutating the result does not affect the extractor's own stats)."""
    return dict(self.stats)
|
||||
|
||||
def main():
    """Command line interface for the meaning-kernel extractor.

    Accepts a PDF, an image, or a directory of either; exits 0 when the
    run finishes without extraction errors, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Extract meaning kernels from research diagrams")
    parser.add_argument("input", help="Input PDF or image file/directory")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument("-c", "--config", help="Configuration file (JSON)")
    # NOTE(review): --verbose is accepted but not consumed anywhere in
    # this function — kept for CLI compatibility.
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")

    args = parser.parse_args()

    # Load config if provided
    config = {}
    if args.config:
        with open(args.config) as f:
            config = json.load(f)

    extractor = MeaningKernelExtractor(config)

    # Image extensions handled directly (shared by the single-file and
    # directory branches; previously duplicated inline).
    image_exts = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')

    input_path = Path(args.input)

    if input_path.is_file():
        suffix = input_path.suffix.lower()
        if suffix == '.pdf':
            extractor.extract_from_pdf(input_path, args.output)
        elif suffix in image_exts:
            extractor.extract_from_image(input_path)
        else:
            print(f"Unsupported file type: {input_path.suffix}")
            sys.exit(1)
    elif input_path.is_dir():
        # Process all PDFs and images in the directory; other file types
        # are silently skipped. (The previous per-call kernel accumulator
        # was never read, so it has been removed — results are reported
        # via the extractor's stats below.)
        for file_path in input_path.iterdir():
            suffix = file_path.suffix.lower()
            if suffix == '.pdf':
                extractor.extract_from_pdf(file_path, args.output)
            elif suffix in image_exts:
                extractor.extract_from_image(file_path)
    else:
        print(f"Input not found: {input_path}")
        sys.exit(1)

    # Print summary
    stats = extractor.get_stats()
    print("\n" + "=" * 50)
    print("EXTRACTION SUMMARY")
    print("=" * 50)
    print(f"Pages processed: {stats['pages_processed']}")
    print(f"Diagrams analyzed: {stats['diagrams_analyzed']}")
    print(f"Kernels extracted: {stats['kernels_extracted']}")
    print(f"Errors: {stats['errors']}")
    print(f"Dependency warnings: {stats['dependency_warnings']}")
    print("=" * 50)

    # Exit code reflects whether any extraction errors occurred.
    sys.exit(0 if stats['errors'] == 0 else 1)


if __name__ == "__main__":
    main()
|
||||
19
scripts/meaning-kernels/requirements.txt
Normal file
19
scripts/meaning-kernels/requirements.txt
Normal file
@@ -0,0 +1,19 @@
|
||||
# Meaning Kernel Extraction Dependencies
|
||||
|
||||
# Image processing
|
||||
Pillow>=10.0.0
|
||||
|
||||
# OCR (Optical Character Recognition)
|
||||
pytesseract>=0.3.10
|
||||
|
||||
# PDF processing
|
||||
pdf2image>=1.16.3
|
||||
|
||||
# Optional: Enhanced computer vision
|
||||
# opencv-python>=4.8.0
|
||||
# numpy>=1.24.0
|
||||
|
||||
# Development tools
|
||||
pytest>=7.4.0
|
||||
black>=23.0.0
|
||||
flake8>=6.0.0
|
||||
141
scripts/meaning-kernels/test_extraction.py
Executable file
141
scripts/meaning-kernels/test_extraction.py
Executable file
@@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Improved test script for meaning kernel extraction pipeline.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
def create_test_image_with_text():
|
||||
"""Create a test image with text."""
|
||||
try:
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
# Create image
|
||||
img = Image.new('RGB', (800, 600), color='white')
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
# Draw some content
|
||||
try:
|
||||
font = ImageFont.truetype("Arial", 20)
|
||||
except:
|
||||
font = ImageFont.load_default()
|
||||
|
||||
# Draw text
|
||||
text_lines = [
|
||||
"Research Diagram: Knowledge Extraction Pipeline",
|
||||
"",
|
||||
"Input → Processing → Output",
|
||||
"",
|
||||
"Key Concepts:",
|
||||
"- Data ingestion",
|
||||
"- Feature extraction",
|
||||
"- Pattern recognition",
|
||||
"- Knowledge representation",
|
||||
"",
|
||||
"Philosophical aspects:",
|
||||
"- Truth and knowledge",
|
||||
"- Meaning and purpose",
|
||||
"- Reality and existence"
|
||||
]
|
||||
|
||||
y = 50
|
||||
for line in text_lines:
|
||||
draw.text((50, y), line, fill='black', font=font)
|
||||
y += 30
|
||||
|
||||
# Draw a simple flowchart
|
||||
draw.rectangle([300, 200, 500, 250], outline='blue', width=2)
|
||||
draw.text((320, 210), "Process", fill='blue', font=font)
|
||||
|
||||
draw.line([500, 225, 600, 225], fill='black', width=2)
|
||||
draw.polygon([600, 225, 590, 215, 590, 235], fill='black')
|
||||
|
||||
draw.rectangle([600, 200, 750, 250], outline='green', width=2)
|
||||
draw.text((620, 210), "Output", fill='green', font=font)
|
||||
|
||||
# Save to temp file
|
||||
temp_dir = Path(tempfile.mkdtemp())
|
||||
image_path = temp_dir / "test_diagram_with_text.png"
|
||||
img.save(image_path)
|
||||
|
||||
print(f"Created test image with text: {image_path}")
|
||||
return image_path
|
||||
|
||||
except ImportError as e:
|
||||
print(f"Cannot create test image: {e}")
|
||||
return None
|
||||
|
||||
def test_extraction():
    """Run the extraction pipeline end-to-end against a generated image.

    Returns True on success (or when the environment cannot create a
    test image), False when the extractor cannot be imported or the
    extraction itself raises.
    """
    print("Testing Improved Meaning Kernel Extraction Pipeline...")

    # Import check first: without the extractor module nothing else runs.
    try:
        from extract_meaning_kernels import MeaningKernelExtractor
    except ImportError as e:
        print(f"✗ Failed to import: {e}")
        return False
    print("✓ Successfully imported MeaningKernelExtractor")

    # A missing test image (no Pillow) is a skip, not a failure.
    test_image = create_test_image_with_text()
    if not test_image:
        print("Skipping test - cannot create test image")
        return True

    try:
        extractor = MeaningKernelExtractor()

        print("\nExtracting kernels from test image...")
        extracted = extractor.extract_from_image(test_image)
        print(f"✓ Extracted {len(extracted)} kernels")

        # Dump each kernel for manual inspection.
        for k in extracted:
            print(f"\nKernel: {k.kernel_id}")
            print(f"  Type: {k.kernel_type}")
            print(f"  Confidence: {k.confidence:.2f}")
            print(f"  Tags: {', '.join(k.tags)}")
            print(f"  Content: {k.content[:100]}...")

        print(f"\nStatistics:")
        for key, value in extractor.get_stats().items():
            print(f"  {key}: {value}")

        # Philosophical kernels depend on OCR being available.
        philosophical = [k for k in extracted if k.kernel_type == "philosophical"]
        if philosophical:
            print(f"\n✓ Found {len(philosophical)} philosophical kernel(s)")
        else:
            print("\n⚠ No philosophical kernels found (may need OCR dependencies)")

        return True

    except Exception as e:
        print(f"✗ Extraction test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
if __name__ == "__main__":
    print("Improved Meaning Kernel Extraction Pipeline Test")
    print("=" * 50)

    ok = test_extraction()

    print("\n" + "=" * 50)
    # Exit 0 only when the pipeline test reported success.
    if not ok:
        print("✗ Some tests failed")
        sys.exit(1)
    print("✓ All tests passed!")
    sys.exit(0)
|
||||
Reference in New Issue
Block a user