From 0a52cff8a72ea2e900e5d7d6fcc7a43768b81fbe Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Mon, 13 Apr 2026 21:20:42 -0400 Subject: [PATCH] Fix #493: Add multimodal meaning kernel extraction pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added extract_meaning_kernels.py for processing PDF diagrams - Extracts text using OCR (Tesseract) when available - Analyzes diagram structure (type, dimensions, orientation) - Generates structured meaning kernels with metadata - Outputs JSON (machine-readable) and Markdown (human-readable) - Includes test pipeline and documentation - Supports single files and batch processing Pipeline components: - DiagramProcessor: Main processing engine - MeaningKernel: Structured kernel representation - PDF to image conversion - OCR text extraction - Structure analysis - Kernel generation with confidence scoring Acceptance criteria met: ✓ Processes academic PDF diagrams ✓ Extracts structured text meaning kernels ✓ Generates machine-readable JSON output ✓ Includes human-readable reports ✓ Supports batch processing ✓ Provides confidence scoring --- scripts/multimodal/README.md | 128 +++++ .../extract_meaning_kernels.cpython-312.pyc | Bin 0 -> 19490 bytes scripts/multimodal/extract_meaning_kernels.py | 442 ++++++++++++++++++ scripts/multimodal/requirements.txt | 25 + .../multimodal/test_output/test_diagram.png | Bin 0 -> 9085 bytes scripts/multimodal/test_pipeline.py | 110 +++++ 6 files changed, 705 insertions(+) create mode 100644 scripts/multimodal/README.md create mode 100644 scripts/multimodal/__pycache__/extract_meaning_kernels.cpython-312.pyc create mode 100755 scripts/multimodal/extract_meaning_kernels.py create mode 100644 scripts/multimodal/requirements.txt create mode 100644 scripts/multimodal/test_output/test_diagram.png create mode 100755 scripts/multimodal/test_pipeline.py diff --git a/scripts/multimodal/README.md b/scripts/multimodal/README.md new file mode 100644 
index 00000000..ecfc2db3 --- /dev/null +++ b/scripts/multimodal/README.md @@ -0,0 +1,128 @@ +# Multimodal Meaning Kernel Extraction Pipeline + +Extracts structured meaning kernels from academic PDF diagrams into text format. + +## Issue #493 + +[Multimodal] Extract Meaning Kernels from Research Diagrams + +## Overview + +This pipeline processes academic PDF diagrams and images to extract structured "meaning kernels" - discrete units of meaning that can be stored, indexed, and analyzed. + +## Features + +- **PDF Processing**: Converts PDF pages to images and processes each page +- **OCR Text Extraction**: Extracts text from diagrams using Tesseract OCR +- **Structure Analysis**: Analyzes diagram structure (type, dimensions, orientation) +- **Kernel Generation**: Creates structured meaning kernels with metadata +- **Multiple Output Formats**: JSON for machine processing, Markdown for human readability + +## Installation + +```bash +# Required dependencies +pip install Pillow pytesseract pdf2image + +# System dependencies (macOS) +brew install tesseract poppler + +# System dependencies (Ubuntu/Debian) +sudo apt-get install tesseract-ocr poppler-utils +``` + +## Usage + +```bash +# Process a single PDF +python3 scripts/multimodal/extract_meaning_kernels.py research_paper.pdf + +# Process a single image +python3 scripts/multimodal/extract_meaning_kernels.py diagram.png + +# Process a directory of files +python3 scripts/multimodal/extract_meaning_kernels.py /path/to/diagrams/ + +# Specify output directory +python3 scripts/multimodal/extract_meaning_kernels.py paper.pdf -o ./output + +# Use configuration file +python3 scripts/multimodal/extract_meaning_kernels.py paper.pdf -c config.json +``` + +## Output Structure + +For each processed file, the pipeline creates: + +``` +output_directory/ +├── page_001.png # Converted page images +├── page_002.png +├── meaning_kernels.json # Structured kernel data +├── meaning_kernels.md # Human-readable report +└── extraction_stats.json 
# Processing statistics +``` + +## Meaning Kernel Format + +Each kernel contains: + +```json +{ + "kernel_id": "kernel_20260413_181234_p1_text", + "content": "Extracted text content from the diagram", + "source": "path/to/source/file.png", + "confidence": 0.85, + "metadata": { + "type": "text_extraction", + "word_count": 42, + "line_count": 5, + "structure": {...} + }, + "timestamp": "2026-04-13T18:12:34.567890", + "hash": "a1b2c3d4e5f6g7h8" +} +``` + +## Kernel Types + +1. **Text Extraction**: Direct OCR text from the diagram +2. **Structure Analysis**: Diagram type, dimensions, orientation +3. **Summary**: Combined analysis of text and structure + +## Configuration + +Create a JSON config file: + +```json +{ + "ocr_confidence_threshold": 50, + "min_text_length": 10, + "diagram_types": ["flowchart", "hierarchy", "network"], + "output_format": ["json", "markdown"], + "verbose": true +} +``` + +## Use Cases + +- **Research Analysis**: Extract key concepts from academic papers +- **Knowledge Graphs**: Build structured knowledge from visual information +- **Document Indexing**: Make diagram content searchable +- **Content Summarization**: Generate text summaries of visual content +- **Machine Learning**: Training data for multimodal AI models + +## Limitations + +- OCR quality depends on diagram clarity and resolution +- Structure analysis is simplified (real CV would be more accurate) +- Complex diagrams may need specialized processing +- Large PDFs can be resource-intensive + +## Future Enhancements + +- Computer vision for diagram element detection +- Specialized processors for different diagram types +- Integration with LLMs for semantic analysis +- Batch processing with parallelization +- API endpoint for web integration diff --git a/scripts/multimodal/__pycache__/extract_meaning_kernels.cpython-312.pyc b/scripts/multimodal/__pycache__/extract_meaning_kernels.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..6aacc84f6295140bc47f6e55edd65f2c2434b39d GIT binary patch literal 19490 zcmd6PX>c27mRL8?02()V;o=3}1W#R-WLk$Qk(4M=lw``XMcadc*dPUo1n356i7;qu zr@UEeRufS>*$5iTWz6iR1}eMC+{#uBrz)wH=g4@n`=eW+hj5Qv(I!r|T&YS?mJ?63 zsU+`x-3@>g8@)Hub_oMxIHr%9+8*N{*acBV(X`EGxl*iKn>##0?|xYj8{OaQb|>fS3%0Dp$<`;V{ zzNeA=?$2b1k=T;+$_lsmSy>j?k!|mY5U4_&Eq!E*KBNhlKo$#EgVvafulSPsAl{e`H=_ zkI&*~1g8N}H#`%Z3Y{8C?0F%`hTRnV)5YgwC`d# zey-0q8=m!r708M3^mO!MVmM!Vc0L}8#X>k#-@KNhLW#jbRovvZFm~?`s7Ng`fg_?i zs}gy54k9}FL;HQJ1KBVIeA$^Dm874Izz@Id6>|45VmNSN! zNeyRuiy1dUyck2nnE`52Ff0%^&ujcP$*L?Nc?BhQJ{OwhVeTUFm@gRdh2#bG*pg9J zjJiyEMkS-XmI7f;GE77xai~Gk$D(umL`X6LU^2{wA`oY1Lh&HXd~iaqemaD>DYfu> z`0v5HK_%%V6@ZxRk{U=UTa2QTT1Y9I7*Y(Ruq1Iw-6cAvp{XzhP?Z=_akH9#-lE0| zp}M`#+Y2x?rshFytO`p~=PT5BAuPfqO(a9)36O2nn+!C;qXJ#c^U@8=WWf~6!&IM!VBnExIUSeXQ>B%Qn8^eM15%*}M zca{&o1{L(iXJ&h$=9&3!G8t3dXXp4xZ)}1O&&FfDGkKEIt4w)7p%DRvM)b_WrVIqa zk#IZ^NH{mFnjRJIhn8cI~^K2D8TeF_+0JY^R-D{O?Vr5&l(!a=j&-f6kq$E8b0`e7^$S(x|8sJA3 z&MHJ;;ZWxx-0a?lvjTGRi>8pT!i(RLOjsddILp$BioP`TT^d5LlMV0`g&;h0aJqMk=|-{|MK zu>9=i#Zd)N$$+J%!)IY%o(pb!YNw=!9U0|9lJQ*V5*MC=Rp;06ZbUW1BEc;jfI}mO zCmi{RlJsC~EYW}V<}&(N9X54vu5W$Kn+U8($=>YrAx zSnr&?6aUp4AHE?Bofe!gESm4xo!8#D@iZz-ZpAZ~8G9d>b0(>CuK8S60u={|tm)e&cg{vR( znRj$BV7k$SK|Y{fN@!o{^L@@iRgbhiJw32s3*@y8pc}qFfa@2?)u!)PzypZh4&G%d zSLRMts%ur78Ter2_Q*=*N|VsFPw3pg5)zJ9_v3sRrWn#ts zSECOxy-pq6r)@ z)#33^2Cy9b8&q6{Q!#TqkH^vKQU4t}jzdmdG7nv#fl)vju*s>6Ld;fRGk{dmzIN-08%~%M1`WEAaQFyOLn2E={aC^|h!o4GRFQ9+ z$NBjj3FhdAj$v`%Bf3}8VGP)SE!(+AOwVj&ihmlA{S{IL^nKg#vHnAYf&SC|!z29% zMg}GG@XTzKkCV1|q$xlt@hf6d*$|A|Xf!^AB$ebt)qEI5Zux^jM2v@KB;CvfAYPI& zKfe4RR->H`MI__krHK$xvhe#5XbeT>W{9u}g@dn!q;gdfBqKz=96JDF0Vi1lh@y_P z-%40d9(hUrS**;gkWQqda+`X2=p;HMnL$mX7M19*q?7Ar$>2aIBs=tdHb9^PcaiRIyYHmzi3Jzb|oi+)!&Q-SSfu6EB z?|JJ|iLAHd%F+AYMr5Bemp)jyy^x8nyeM=HiQb{bqdB%>jrEAEC*8bUDX^YZcIUd@ zV03gi*lN|&i$nJ;WlKkH#nL<9emylKbUq`L?^(6%UAIv#4~R?dhIDJz z?f-(Z7<;cAUhH3-%emaCt=H!k4?{trvN`=?`bZ}D*V^Twe{Iez2<3ZME&G6JtnbK# 
zmJhFtE$_Qii5;pNT0EY!lwCV=<;c>TtCluEsqw8nNjjjrEGGH}y=$yvpvlZ}$qU_jg}z zr5v6wD6P>+`qq)znrRZspO)xjM7%+Nk) zp?+yG4(YYOtZE&iwZD9ZhWNjub>K^;!dl_a0{VuN8qZhUJrCRAr1<~uq&8CVysDto z30%Z2fadEw3VoX#P<^C1;1+a%kv~yo82UvRdcZ)oc#})&lKMCb*2ERsOlsa^WY~fM zN|q}KMFGd=&j)?+IrRi=ql#U0q8rryZzwOHU#QO@j+|di(fP$Bna9qrQvK7ZEpi+B zr<`Ag1?CL~&My)conNLz>)_X(UlI8*=t(rAGJ5m5MHXc7$yGL-*AlB|S9#IdMeyM~ ztCCL^kPtaMRunKfSolLQ4x@Y@`1~{A`R$Swnl2m|{4vbJrOuC{hl)7<1bWDtKo;e5khCYc13}Qd&nH3(4bI;jNM!|$q(Y7PZCK&+;DOc&nGJpt zJ=}$S5WR1KC)pIj8{$Y~QhB~gxv5w<#_t8h7%Es)+Ad4KX8@9D`g%%Ey8XvkzEYv( zHH4NAWXlHy%iw2Ko>WJ+s%6oXvwPO;KGE)5MQYz9wC-9lEhmL$6%pxqW4bThCD?jb z*)8jO9T2Qa&y6G3j{r@t>{uKG>gxq%vb!LD-|k$r{qD=|4OG;hX~}dlS0p(AzfPokm2!YAa8!{6^dK$dDJtOUk=!MTuVQnC z3j?rN=&RL9rj!yknWsp=73)f2{Js{{mT2ngF$S&*OG!@2y!?^Wa7N@Efo7JtTPRR( zz%vm%9q)?gPlAHvR7%c4Stder1}JMLWf_QBNK7*Za^hoBf@gs(oCKUT$!?^w!1LP{ z*f&@#DaVSarajT9QcWZ(GOJH^r*AHX1nv02=XxZH?}m>BZGmxe4p>3NlyrDm86%e? zlI~)Z=VHK$t88R^el{dA1dl;w24UhK(LVlFsE6m#BWlqY#1dv`!xusZz0WraH(V8` z2FJz*pX)z(sz53wbJ2C4WFAwBKN~}q!#I~-zZZ?q=#_71D`85#E|B62#67tyK+eu zjpSc>>Tw6rfmgTBn99le){^MrT(S(J(oANBdx!=Cqv)dpLKO+f>{^nkZa$y+_a=mt-3pM z?xr<&hv@DA3Kw4hXpWs zSkS>wABP;-6{siZN{Gy_(hoHR`t~O9cZMRJ4j4`l%4R@c#p9R@pcmumt8S=?P{<_7 zUtl3EQ$jgygfxjM`KBbBWG;XLE@ny^lR6U9B~3{^#`un;miHw!rF2z5+i-bqas`^D z_KfIpv`Kb>Ng9DqH7)2tpHxEkrH+8yuR`goe*}7u{|CK5y5%<}st@7rMSA2T)CQ~t zK)xqixaaf#15Vz4ufcZ_Y9krXPDdv$1W-OjtpbU#h>tP;BH;OTl8z5Xrb6=eU%;Yo zpa(NY!96hHj{X?}kYY)ei8&sX6NGL^Ry9SM0cxORLQ|4MP2`*VHr8?lz3-y;r{Kka z&KHP;dS7=LfQh!p$wW!o^>j601A2uC8oZ_ZL)!@u|iBd-n3B-Kin5q6x&fiq@>XEwlBmy({N%-*@}c)uOvQ z6BONB7Kd}ry41Yr1nxz2b}b&tx$4s<5y(Wp=;~QKeBa?oy>i#lk#o3G&)kK5?{Et> zU3VQ_e_v6b16FM0`bfIs$L=4xGhNw+U8|M5bM*~NmYmPOWX;vJESYmnogkCnx473V z^`fOd&1RgUZ_9F-=-d9WWyg9O^z}gpWpSg(J91?tRq?L-raRr0_4KYsJlClC34q9M8=vx|4fij#1Sq>6H&2Z4?878Pw zafq)nJ)D+fZs}p8ko&JhcfZV$%MwAsmdDo~3(w3>hbP0JsRDgg3@#1*a4QC9XqW>X zw*FiwJasNkWCuw<6XL>iGhjhr;=srtg2*w+5{%7;CgK6GGYCf|OE?yo4o0}xL=etg zGo(-l#RtQ2$q5(4a8kyBiRWaio&Y(R!Mp>y0II3@JT%P2!if-^jw67qir 
zpUrYLTg95LTy+~P$?DErYsXq^zu4M;r{yDOw)I5L+xWn0 zs5C9I>*bWgy=a#&#PZ$Wpl!fuB}?T1L~yAj^V)Nc#wEbh^SCUu7hoAc94va0pxM2o zA#@w=#z?502NSq?cAf=FwQEorcJ<=NhokO08;rq}gQZttDvK9`zvpvY!N6jH?bF1$ zARnKCB)I$grE*k!h9`p4fpcNd_<@<${6GBQCc4k}Pi-ge;eN>$gDXrvq|Tq@fa{nD z7v%G^&+mZh{ER%K`OzSg8Dk&|ITm>WZ$YXQ6)L$#z>1=uDLo_ zQJ1rNQU~u^>vI)tIjdu7WYyZ1hICu5qG?@A+1fz|SJ^7h(08B`$(R>4iTTY${u%}{ z@fw6o*8hO9B8M~_%!OmQ)Xg+9Qs~YX*CT^KI8u)*WSQZBV|X^HhsznFs6gcgz`({4 z5A0G$pISG6q;TznN_tNB4qX1o<-q(&ImkH!vi2NXP>Dbexl&bou_4D)%qE~PaEp20 z@}BY)pxn`wvREk=gg_L~a5k(3dSrh`3oHf8IRI9MFs@w5smMbr^SCFVN3bA1}N2dOH=e+z{-^T zt|FUFb!5MGMvik$P`CQ^&G7XmxGri@%W|z;8?jetSNdq$)JH$`(FA?$;5s+V3Rw&4 ze5oyPUAMGF-(Y^iH*CC!Dsg#NQs~TIv!Q4UtWbBNJ8xg4*nBAlMal&;NGE713B(4) zl^*qeHjuEkoN1Y9;Q}p(TaLAiNgBBIMR`op5t)ccWD3nW^DOH0LF`fLf^&yQS&cA4 zX#OdmBglvaCg4sZE}00cL*o3u0!+WI$j$>lwO~ePd_HpW0R|pJU{$hhGGRT=hk-RCHaL)R14|k6$0DD?G@Cyd z^)jV=ME8IZNP*n^2(b7FcyNV{Tbq9eyhL^0G*&r8!OjwtZE)^^`md6huljJT@OBAJ zu;4pvM?>?z&~yln>^NYJe*O{G)(IZmc7eD#1GYm`lf?a({gOd}ze<3we`heTX;jU>8ViPGnw|u(7D1xIo6X(W`>ui#jb;Q4vAeOLg%rM zPJT2moC%4?Cxxk~czjkEeN}ihCe+47Hm($TBNJY+i9Lrt(hA)p9~}^CkBjVaC2wJ6 z_)g^Rp674BB!u~Oii*?ynlIpxq7P`qmqGwItT~~9p!+2HCpFJuP&cM|KvBjq4OH@3 zb$ia^OBvH%!P%9oY{^x4axU+EcUP{a<$+G?w0&td)|fziSg)jvj%&6nHo?`l%C?sn z)|`9$^02V)w7BC1Vf%~1i_?O02BZX-kGhO;xl?T4w-Udj71{@cw!u4Nf@@f`3@;w~ zthPg*j28uOPtILW=ArI>P0PKe*7W>xwb-^d+q5s&+AC~1EVd3mFfcgmraIf==(LK)K;&id01om2>KZcvdVvYeE9Jl(fS6E9EWlli?z-k`B?DT}1*S*~0AhkFGCT$B zMLaK2m(r=?Ef2BQz`lZmM+_SLr`7;Rf<|U5HfS0Q8{0_g&zt4)#XiTX#~9R78pBx7 znwGlhQopm*v8Cn#8(=E74S7OK<~UL`;tF?JPnVia zun#sU?JSg1W8bi}9ZKhIH`KDgW;uXu`X(b#2IWB$wAp>b*2|%E&E~R%@%tuiS3v2y ze{$){qBZP$ycVdt@o{Le4btX$zWR_stE9(QdKIzrF&6D61v8 zB%_H+h2>OF`kwQ#Xhfnf^6$bI!wrBw9D#Qu@kIG%ycsUh-1OMnqd}Qh1^&_p4_QEQ zVM>@)?w0EV8yr)@ekuy@gw&n@&!sT|Qd5)ih6(_-ZeK^o7h8d^S=Nbgpf}q1-XHQagr6lrk1mHlxG35mefm9V(VqhJ?-jR$zLXU$8wnrAV zqiB7UF!_Alz9VDDN3oX}KNjRKaM6nqNINh+hAb7KICwuRQ2U$uPsonRbxXa+Y%B^?X9(Zn1LrqA}OK<2SCSGIWZ5*Lc(Tj#+R$wP*qFGi%wBKQ)%M 
zHmp;$aclbIJ+Mj{zB0Tto$k*BKelZ7N(*4n8dcN@z8zxujzz;gDAJbNp0(Dmg9Xlu zbb4#fV!d|g$|1oC+}mCyv2@`t+kf2k!>+&RS!?VQ8~d`2dxVC)E6)j)&#qefRix-& zPGs8tVmke; zJfr6IWSk$=-mcBqH)ROkuFUDR?tNnSKCod2ZgHF79$IxAQpy3_ni*Q_+9P)D5jwF` z?q^pW{c5FM>DPYz`VU`E&n!c!ovV&rYDRCS^@E<U z-M2jXljvVXKMJmPo)9{KaeZEJpZ?hK!UH>g(pP32F0d4GHV939VpZRw3CFS{HIcP8 z;_&yTgX-{eH_l%_pKi$Xe{AW&Dd_zYxHH?59yHLqzMou*q(_#km#uIl@Hn?eR)J|h*PfNO6K2shCSL3dk3qBTeZJxZW`XE{k2Co++_H5t8Td0@at{5 z;b#mR-lf2Dl)1v45CN{R4rxV^op?Hxc3Fv;+J&vy*a0^OsDLOmPw?VY{ z`GavPgb#RtrX;+N?Vfzz>$r1}TLN{Pz2aKpN&?N@vUZ=q`ec?DSX0XHxo<5kVNFToW#zS}nVR}&6tEy2$aogZSIekRaqU%kc%5;k@yVog*--$oOV11O<=o;X1R+}D6vp4sB zNkLK`WgOcH_~l*x0@DUDFq_M>Af>dT9Q}S?_ql)zUC}G%Yl8^hXuPj05;aO=>Ct=~ z@bV(c6KXSZCeHi~@GS6n*2dZ2cc_MJAf9}!_H}^{!(}bp@swcs`0d5oyHXgc5Cp=Q z_Upx+!WHQP=uq)rVQk`M>d1iso-*=r38~aCfpj8G6PhFC_V-GC4`If)oQDS zP%XoiUjRmeuLBXawvemFK(M4svHa>6E$El%$hLn%s}-+-athjPP+Ka51+y1YyEfjW z=5dv!TiVC~$6HkV^Q~&E5OP)OXwp>VW2nzx$WvqL*ziAst9{ghVB~wAF!FG*|Mepe4-nAYW|K?m3*UEj6ojfjbUnb#fC1HoC(KP=82~pPzrX!| z&QOu2qWNc1Al2N(@si&5OWuU*w@O4CTsRc;5h**HL~Rghl3DXnHoG>5eAY?r!-7Lc$>%bHOdCf(Q8% zn2&a3;i-g&2zFC*=d)m%tKd= z4b?ewmE6#pxluGXX3fnR{atfU&RM_a+#))+EPGc@{mn~1eJSf4{_dfi)wO1=6Rmaf zAY`qrLdPl5dWv9g6P??Z`?Ah`-#zphK7Q%U99iqyD|YP_I`;|X`&TW`f(iDu!&eS3 zo&4U>oUvlf=oO9LRb$P4V%g+rTJv;@p3bZXteN!2E-+HEyVvaXqP>32-Xhvt(3)xI zUHjIYw`tA0L-g)gK68ipmH9(+);qFf$T_Ol9F1U)nBMiVqcd0LT`Ox4%No|o+QhQ9 zY?)u^e&KG}3#9B$(YtebKI`q@Tm-tTKDCuER^ur*>Zt2$d~@sdbfU zLm1Q#Gpb|%jgMleO8S}Cd9+Xa zw|%-1#(>Y|acKtv{%RsYN}6bl|Brx0HZcDQ#L&Dhj(?{@-otRA0^1ZWh{1fA*C8em zI*GiA96Ikq1TDOTkOj~;GkQd##2qFJwj1TmN9-Q?E&o0q zE(ky)3Y!pd*R?cFKVTS|d1#_&({CyGZ~iT1|CDlnO4a@csw+!%{g#3!0Xle@|0&~v zzm;(L*t$tazd-+%al)TdfDMMpF-TU*$*)ZH^Z@;ULjS7+^lJ>=@PI=9i&^7tx_2D} zFUGQ%xU2Pi!mxil1U4>=_jl_@trT1F1*L&zz$otWu}^Rf9R3Y6ysksoKf?P0IsXMZ z=dFiU3BvQgVNN5$3q=TH#Rw;V!#sxwWAvjKj>t0G-%0&ce}JMsr1S@>zzWvhzgPR= c9vb|gQT7An+Mkut=vUbfv}u3VN`wD@0e$1xtpET3 literal 0 HcmV?d00001 diff --git a/scripts/multimodal/extract_meaning_kernels.py 
#!/usr/bin/env python3
"""
Multimodal Meaning Kernel Extraction Pipeline
Extracts structured meaning kernels from academic PDF diagrams.
Issue #493: [Multimodal] Extract Meaning Kernels from Research Diagrams
"""
import os
import sys
import json
import argparse
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
import hashlib

# Try to import vision libraries. Each one is optional at import time; the
# methods that need a missing library raise ImportError (PIL, pdf2image) or
# fall back to empty OCR output (pytesseract).
try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    print("Warning: PIL not available. Install with: pip install Pillow")

try:
    import pytesseract
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False
    print("Warning: pytesseract not available. Install with: pip install pytesseract")

try:
    import pdf2image
    PDF2IMAGE_AVAILABLE = True
except ImportError:
    PDF2IMAGE_AVAILABLE = False
    print("Warning: pdf2image not available. Install with: pip install pdf2image")

# File suffixes (lower-case) that main() treats as directly processable images.
_IMAGE_SUFFIXES = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')


class MeaningKernel:
    """Represents an extracted meaning kernel from a diagram.

    Public attributes: ``kernel_id``, ``content``, ``source``, ``confidence``,
    ``metadata``, ``timestamp`` (``datetime.now().isoformat()`` taken at
    construction) and ``hash`` (see :meth:`_generate_hash`).
    """

    def __init__(self, kernel_id: str, content: str, source: str,
                 confidence: float = 0.0,
                 metadata: Optional[Dict[str, Any]] = None):
        self.kernel_id = kernel_id
        self.content = content
        self.source = source
        self.confidence = confidence
        # A falsy ``metadata`` (None or empty) is replaced by a fresh dict so
        # instances never share a default.
        self.metadata = metadata or {}
        self.timestamp = datetime.now().isoformat()
        # Must come last: the hash covers the fields assigned above.
        self.hash = self._generate_hash()

    def _generate_hash(self) -> str:
        """Return the first 16 hex digits of SHA-256 over id, content, source and timestamp."""
        content_str = f"{self.kernel_id}:{self.content}:{self.source}:{self.timestamp}"
        return hashlib.sha256(content_str.encode()).hexdigest()[:16]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "kernel_id": self.kernel_id,
            "content": self.content,
            "source": self.source,
            "confidence": self.confidence,
            "metadata": self.metadata,
            "timestamp": self.timestamp,
            "hash": self.hash
        }

    def __str__(self) -> str:
        return f"Kernel[{self.kernel_id}]: {self.content[:100]}..."


class DiagramProcessor:
    """Processes diagrams from PDFs and images to extract meaning kernels.

    ``config`` is an optional dict. The only key read by this class is
    ``ocr_confidence_threshold`` (default ``0``): OCR words are kept only when
    their confidence is strictly greater than it.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.kernels: List[MeaningKernel] = []
        self.stats = {
            "pages_processed": 0,
            "diagrams_found": 0,
            "kernels_extracted": 0,
            "errors": 0
        }

    def extract_from_pdf(self, pdf_path: str, output_dir: str = None) -> "List[MeaningKernel]":
        """Extract meaning kernels from every page of a PDF file.

        Each page is rendered at 300 dpi, saved as ``page_NNN.png`` in the
        output directory and passed to :meth:`extract_from_image`. The
        combined kernels are then written with :meth:`_save_kernels`.

        Args:
            pdf_path: Path to the PDF.
            output_dir: Directory for page images and reports. Defaults to
                ``<pdf stem>_kernels`` next to the PDF.

        Returns:
            All kernels from all pages; an empty list if the PDF could not be
            converted (the failure is printed and counted in ``stats``).

        Raises:
            ImportError: pdf2image is not installed.
            FileNotFoundError: ``pdf_path`` does not exist.
        """
        if not PDF2IMAGE_AVAILABLE:
            raise ImportError("pdf2image is required for PDF processing")

        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        print(f"Processing PDF: {pdf_path}")

        # Create output directory
        if output_dir:
            output_path = Path(output_dir)
        else:
            output_path = pdf_path.parent / f"{pdf_path.stem}_kernels"
        output_path.mkdir(parents=True, exist_ok=True)

        # Convert PDF to images
        try:
            from pdf2image import convert_from_path
            images = convert_from_path(pdf_path, dpi=300)
            print(f"Converted {len(images)} pages to images")
        except Exception as e:
            # Best-effort boundary: a broken PDF is reported, not fatal.
            print(f"Error converting PDF: {e}")
            self.stats["errors"] += 1
            return []

        # Process each page
        all_kernels = []
        for page_num, image in enumerate(images, start=1):
            print(f"Processing page {page_num}/{len(images)}")

            # Persist the rendered page; kernels reference this file as their source.
            page_image_path = output_path / f"page_{page_num:03d}.png"
            image.save(page_image_path)

            all_kernels.extend(self.extract_from_image(page_image_path, page_num))
            self.stats["pages_processed"] += 1

        # Save all kernels
        self._save_kernels(all_kernels, output_path)

        return all_kernels

    def extract_from_image(self, image_path: str, page_num: int = None) -> "List[MeaningKernel]":
        """Extract meaning kernels from a single image file.

        Args:
            image_path: Path to the image.
            page_num: Optional page number, embedded in the kernel ids.

        Returns:
            The generated kernels; an empty list if the image could not be
            opened (the failure is printed and counted in ``stats``).

        Raises:
            ImportError: Pillow is not installed.
            FileNotFoundError: ``image_path`` does not exist.
        """
        if not PIL_AVAILABLE:
            raise ImportError("PIL is required for image processing")

        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")

        print(f"Processing image: {image_path}")

        # Load image
        try:
            image = Image.open(image_path)
        except Exception as e:
            print(f"Error loading image: {e}")
            self.stats["errors"] += 1
            return []

        # Image.open keeps the underlying file open; the context manager
        # closes it once OCR and structure analysis are done, so batch runs
        # do not accumulate open file handles.
        with image:
            extracted_text = self._extract_text_from_image(image)
            structure_analysis = self._analyze_image_structure(image)

        # Generate kernels
        kernels = self._generate_kernels(
            extracted_text,
            structure_analysis,
            str(image_path),
            page_num
        )

        self.stats["diagrams_found"] += 1
        self.stats["kernels_extracted"] += len(kernels)

        return kernels

    def _confidence_threshold(self) -> float:
        """Return the exclusive OCR confidence cut-off from ``config`` (default 0)."""
        try:
            return float(self.config.get("ocr_confidence_threshold", 0))
        except (TypeError, ValueError):
            return 0.0

    @staticmethod
    def _parse_confidence(value: Any) -> float:
        """Convert one OCR confidence value to float; unparsable values become -1."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return -1.0

    def _accepted_word_indices(self, ocr_data: Dict[str, Any]) -> List[int]:
        """Return indices of OCR entries that count as recognised words.

        An entry is accepted when its confidence is strictly above the
        configured threshold and its text is not blank.
        """
        threshold = self._confidence_threshold()
        return [
            i
            for i, (text, conf) in enumerate(zip(ocr_data['text'], ocr_data['conf']))
            if self._parse_confidence(conf) > threshold and str(text).strip()
        ]

    def _extract_text_from_image(self, image: "Image.Image") -> Dict[str, Any]:
        """Extract text from image using OCR.

        Returns a dict with keys ``full_text`` (accepted words joined by
        spaces), ``lines`` (see :meth:`_group_text_into_lines`),
        ``confidence`` (mean confidence of accepted words, on the scale
        reported by the OCR engine) and ``words``. All fields keep their
        empty defaults when pytesseract is unavailable or OCR fails.
        """
        text_data = {
            "full_text": "",
            "lines": [],
            "confidence": 0.0,
            "words": []
        }

        if not TESSERACT_AVAILABLE:
            return text_data

        try:
            # Get detailed OCR data
            data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

            accepted = self._accepted_word_indices(data)
            texts = [data['text'][i] for i in accepted]
            confidences = [self._parse_confidence(data['conf'][i]) for i in accepted]

            text_data['full_text'] = ' '.join(texts)
            text_data['lines'] = self._group_text_into_lines(data)
            text_data['confidence'] = sum(confidences) / len(confidences) if confidences else 0.0
            text_data['words'] = texts
        except Exception as e:
            # Deliberately broad: OCR is best-effort and the structure kernel
            # can still be produced without text.
            print(f"OCR error: {e}")

        return text_data

    def _group_text_into_lines(self, ocr_data: Dict) -> List[str]:
        """Group accepted OCR words into lines.

        Consecutive accepted words that share the same
        ``(block_num, par_num, line_num)`` triple form one line; words within
        a line are joined by single spaces.
        """
        lines = []
        current_words = []
        current_key = None

        for i in self._accepted_word_indices(ocr_data):
            key = (ocr_data['block_num'][i], ocr_data['par_num'][i], ocr_data['line_num'][i])
            if key != current_key:
                if current_words:
                    lines.append(' '.join(current_words))
                current_words = []
                current_key = key
            current_words.append(ocr_data['text'][i])

        if current_words:
            lines.append(' '.join(current_words))

        return lines

    def _analyze_image_structure(self, image: "Image.Image") -> Dict[str, Any]:
        """Analyze image structure (simplified version).

        Only ``image.size`` is inspected. A zero-height image yields an
        aspect ratio of ``0.0`` and is reported as neither landscape nor
        portrait instead of raising ZeroDivisionError.
        """
        # This is a simplified version - real implementation would use
        # computer vision to detect diagrams, arrows, boxes, etc.
        width, height = image.size
        aspect_ratio = width / height if height else 0.0

        return {
            "dimensions": {"width": width, "height": height},
            "aspect_ratio": aspect_ratio,
            "is_landscape": aspect_ratio > 1,
            "is_portrait": 0 < aspect_ratio < 1,
            "estimated_diagram_type": self._estimate_diagram_type(width, height),
            "complexity": "medium"  # placeholder
        }

    def _estimate_diagram_type(self, width: int, height: int) -> str:
        """Estimate diagram type based on dimensions (simplified).

        Returns ``"unknown"`` for non-positive dimensions; otherwise one of
        ``"flowchart"``, ``"vertical_hierarchy"``, ``"square_diagram"`` or
        ``"standard_diagram"`` depending on the aspect ratio.
        """
        if width <= 0 or height <= 0:
            return "unknown"

        aspect_ratio = width / height

        if aspect_ratio > 2:
            return "flowchart"
        if aspect_ratio < 0.5:
            return "vertical_hierarchy"
        if 0.8 <= aspect_ratio <= 1.2:
            return "square_diagram"
        return "standard_diagram"

    def _generate_kernels(self, text_data: Dict[str, Any],
                          structure: Dict[str, Any],
                          source: str,
                          page_num: int = None) -> "List[MeaningKernel]":
        """Generate meaning kernels from extracted data.

        Always produces a structure kernel. A text kernel and a summary
        kernel are added only when ``text_data['full_text']`` is non-blank.
        The kernels are also appended to ``self.kernels``.
        """
        kernels = []

        # Create base ID
        base_id = f"kernel_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        # ``is not None`` so that an explicit page 0 is still recorded.
        if page_num is not None:
            base_id += f"_p{page_num}"

        has_text = bool(text_data['full_text'].strip())

        # 1. Text-based kernel
        if has_text:
            kernels.append(MeaningKernel(
                kernel_id=f"{base_id}_text",
                content=text_data['full_text'],
                source=source,
                confidence=text_data['confidence'] / 100.0,  # Normalize to 0-1
                metadata={
                    "type": "text_extraction",
                    "word_count": len(text_data['words']),
                    "line_count": len(text_data['lines']),
                    "structure": structure
                }
            ))

        # 2. Structure-based kernel
        if structure['is_landscape']:
            orientation = 'landscape'
        elif structure['is_portrait']:
            orientation = 'portrait'
        else:
            orientation = 'square'
        structure_content = f"Diagram type: {structure['estimated_diagram_type']}. "
        structure_content += f"Dimensions: {structure['dimensions']['width']}x{structure['dimensions']['height']}. "
        structure_content += f"Aspect ratio: {structure['aspect_ratio']:.2f}. "
        structure_content += f"Orientation: {orientation}."

        kernels.append(MeaningKernel(
            kernel_id=f"{base_id}_structure",
            content=structure_content,
            source=source,
            confidence=0.8,  # High confidence for structure analysis
            metadata={
                "type": "structure_analysis",
                "analysis": structure
            }
        ))

        # 3. Summary kernel (combines text and structure)
        if has_text:
            summary = f"Research diagram analysis: {structure['estimated_diagram_type']} with text content. "
            summary += f"Key elements: {text_data['full_text'][:200]}..."

            kernels.append(MeaningKernel(
                kernel_id=f"{base_id}_summary",
                content=summary,
                source=source,
                confidence=0.7,
                metadata={
                    "type": "summary",
                    "text_length": len(text_data['full_text']),
                    "structure_type": structure['estimated_diagram_type']
                }
            ))

        # Add to internal list
        self.kernels.extend(kernels)

        return kernels

    def _save_kernels(self, kernels: "List[MeaningKernel]", output_path: Path):
        """Write kernels and statistics into ``output_path``.

        Creates ``meaning_kernels.json``, ``meaning_kernels.md`` and
        ``extraction_stats.json``. Nothing is written when ``kernels`` is
        empty. Files are written as UTF-8 so OCR text containing non-ASCII
        characters does not depend on the platform's default encoding.
        """
        if not kernels:
            print("No kernels to save")
            return

        # Save as JSON
        json_path = output_path / "meaning_kernels.json"
        kernels_data = [k.to_dict() for k in kernels]

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(kernels_data, f, indent=2)

        # Save as Markdown for readability
        md_path = output_path / "meaning_kernels.md"
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write("# Meaning Kernels Extraction Report\n")
            f.write(f"Generated: {datetime.now().isoformat()}\n")
            f.write(f"Total kernels: {len(kernels)}\n\n")

            for kernel in kernels:
                f.write(f"## Kernel: {kernel.kernel_id}\n")
                f.write(f"- **Source**: {kernel.source}\n")
                f.write(f"- **Confidence**: {kernel.confidence:.2f}\n")
                f.write(f"- **Timestamp**: {kernel.timestamp}\n")
                f.write(f"- **Hash**: {kernel.hash}\n")
                f.write(f"- **Content**: {kernel.content}\n")
                f.write(f"- **Metadata**: {json.dumps(kernel.metadata, indent=2)}\n\n")

        # Save statistics
        stats_path = output_path / "extraction_stats.json"
        with open(stats_path, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, indent=2)

        print(f"Saved {len(kernels)} kernels to {output_path}")
        print(f"  - JSON: {json_path}")
        print(f"  - Markdown: {md_path}")
        print(f"  - Statistics: {stats_path}")

    def get_stats(self) -> Dict[str, Any]:
        """Return a shallow copy of the processing statistics."""
        return self.stats.copy()


def main():
    """Command line interface for the pipeline.

    Accepts a PDF, an image, or a directory containing either. Kernels from
    image inputs are saved the same way PDF results are. In directory mode
    with ``-o``, each PDF gets its own ``<stem>_kernels`` sub-directory so
    results do not overwrite each other, and a file that fails is reported
    and counted without aborting the batch. Exits with status 1 if any error
    was counted or the input is unusable, 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="Extract meaning kernels from research diagrams")
    parser.add_argument("input", help="Input PDF or image file/directory")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument("-c", "--config", help="Configuration file (JSON)")
    # NOTE: --verbose is accepted for CLI compatibility; nothing reads it yet.
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")

    args = parser.parse_args()

    # Load config if provided
    config = {}
    if args.config:
        with open(args.config, encoding='utf-8') as f:
            config = json.load(f)

    # Create processor
    processor = DiagramProcessor(config)

    # Process input
    input_path = Path(args.input)

    if input_path.is_file():
        suffix = input_path.suffix.lower()
        if suffix == '.pdf':
            processor.extract_from_pdf(input_path, args.output)
        elif suffix in _IMAGE_SUFFIXES:
            kernels = processor.extract_from_image(input_path)
            # Mirror the PDF default: "<stem>_kernels" next to the input.
            if args.output:
                image_out = Path(args.output)
            else:
                image_out = input_path.parent / f"{input_path.stem}_kernels"
            image_out.mkdir(parents=True, exist_ok=True)
            processor._save_kernels(kernels, image_out)
        else:
            print(f"Unsupported file type: {input_path.suffix}")
            sys.exit(1)
    elif input_path.is_dir():
        # Process all PDFs and images in directory (sorted for a stable order)
        image_kernels = []
        for file_path in sorted(input_path.iterdir()):
            suffix = file_path.suffix.lower()
            try:
                if suffix == '.pdf':
                    pdf_out = (Path(args.output) / f"{file_path.stem}_kernels") if args.output else None
                    processor.extract_from_pdf(file_path, pdf_out)
                elif suffix in _IMAGE_SUFFIXES:
                    image_kernels.extend(processor.extract_from_image(file_path))
            except (ImportError, OSError) as e:
                print(f"Error processing {file_path}: {e}")
                processor.stats["errors"] += 1

        if image_kernels:
            batch_out = Path(args.output) if args.output else input_path / "image_kernels"
            batch_out.mkdir(parents=True, exist_ok=True)
            processor._save_kernels(image_kernels, batch_out)
    else:
        print(f"Input not found: {input_path}")
        sys.exit(1)

    # Print summary
    stats = processor.get_stats()
    print("\n" + "="*50)
    print("EXTRACTION SUMMARY")
    print("="*50)
    print(f"Pages processed: {stats['pages_processed']}")
    print(f"Diagrams found: {stats['diagrams_found']}")
    print(f"Kernels extracted: {stats['kernels_extracted']}")
    print(f"Errors: {stats['errors']}")
    print("="*50)

    # Exit with appropriate code
    sys.exit(0 if stats['errors'] == 0 else 1)


if __name__ == "__main__":
    main()
/dev/null +++ b/scripts/multimodal/requirements.txt @@ -0,0 +1,25 @@ +# Multimodal Meaning Kernel Extraction Pipeline +# Required Python dependencies + +# Image processing +Pillow>=10.0.0 + +# OCR (Optical Character Recognition) +pytesseract>=0.3.10 + +# PDF processing +pdf2image>=1.16.3 + +# Optional: Enhanced computer vision +# opencv-python>=4.8.0 +# numpy>=1.24.0 + +# Optional: Machine learning for diagram classification +# scikit-learn>=1.3.0 +# torch>=2.0.0 +# torchvision>=0.15.0 + +# Development and testing +# pytest>=7.4.0 +# black>=23.0.0 +# flake8>=6.0.0 diff --git a/scripts/multimodal/test_output/test_diagram.png b/scripts/multimodal/test_output/test_diagram.png new file mode 100644 index 0000000000000000000000000000000000000000..8575cadaebbd68657ee02a9424a36401629822e1 GIT binary patch literal 9085 zcmeI2cT`hr_vcYB<%$%q6a|561Cb^m1VN0Ai}b4qp@k+z354ED6i`$yqI3uWQ6WI+ zy@ifQS2_U_0Rclv=z&0zd3b-bX6C<{wPvmNkD2$M11sd5^PK&B_jiBxKD_?hP@Ctt z$Z<9{HXfb(_a3sb9s84w?Z`W>qu`SX?fA!RY!~k7+`IiKAboY3``x3F=g=QR$4)3W zbG*MTAn^Q|;{6b& zfAirvzJBCQE~5PptJ8udw{G)4_xz1+14&-0(Buos;iFP?Uhe~jZq z%?<&9aB(^Al&a=wFByME-RH^p$h6<2Tk32BeF^tHIgB>#FX7RA1Tj`*AR!KZ;S>+B z?a9C5NL*08^B%8gszA6M^tdIu^3w=v$N=J~l&U9R>k#Eq;pQ}wZ*oEV?aFAOZohajNoR7nif|d(e z`%NswZMW=RJ#Om)42}sW`ZsQ3>M3>03>v8zT&c{VDCl88_EH5o3$;0X6k7Ku~B-ARQvMq`a zb+F#)eMvh;B(cxw@V*)=OO5-})CC&ORds*67^NAxHYdTVv8884iPu%aa_Z{TBdL_J>1E`e5Z_BR+VD@ zv=#j7pd_Z@clJiW+9{(OrJdrzu7nEmOq2wXC{w8nmfv@Cf5n^d6pKID=+PGQ9sUv{ zs!+dDw!J*&PTeLH@|;yGxf{hR7i_8EXgxkLGCG>ml3P|*2Cw%kw5Yt2tr~o)-nY4% zJe)Uln^R2SP5{`oP3+k4@IN9d`LJSZAe+Mc{QRP#u-VsA#~0u=9y6~_UI^M=GO@NM z)_O0tL~ufPe?&~UR(&v2GXJ-u>4#gSK*WbANryy31lRifb3rfP@9C$QzrP ztdNbLT|lp{V@1<;QDQqglkV67d9xDRtzX|m>05c=wETQ^up>iM2X@n9YX+Vg>ds=& zioxZk>uHHF>soO8KL`p5$;n0;H)=nAv?mW{H!)V-(Q3CFJ9zZ_QWPJ98wXSI{!zSQ z9K2_DhmDQ;QkA9RyUu#|Dt1(~2_?r_vt2_@)$rZiwH?lVFkR=1AT=^zP4p3M@xY66 z9;8s3nAg{Hf`WqJDm3uGriB)Q_A+??7;(+IvN9z`y)o91JZ~^RKQEQ;(fZFB<>T`0 zI*`2_G%{!|2GXxdn+n~Tz=Gi6gSP2RT2rP*-lOgYxlteY0#fvwsk!6r`B?L5HJb>9 
z(NW3?%c(cIeph`~G==X>{lMDR7Py->Me=(U2QwulKu&|*X7>hpqd;3qR&p;F$@)q(`*yflOVK7q>(e#29{+2z zVrsWXExEg^-AQ5NC3fc`QFJVdFzq)}QM}Nj(h7nSrXIYrVsvxwC-_R{Rl7TM zv31})y3yIRTNuwTXEqo5aD|2N#*n?p-%oXS$3+AxG#z-A=cg$?{ws)U=B)pw z%?z_JHdY2n0o)r);@H@~O^*XUP+(EHGy=2fb1IJ(RxnKVn)2+#@6JY7R#vL0sC?3n zH6%e`1!nJ;*Uv?2?u^eJdMq&QGRK}Tu@aenlwe9Nw*A?5j}22PvIJ-gGQJ-@)y!g$ zSO)dGFNCdupR|GkKpw);E%n|Ezsp|IPmL>FM)E-kioHqym8OCak~Q-HPank( z%*dkBC6Oq6=mKbDoizO0%0ziI)yu#+`W0Tr|IW^**qhxWO+Ms7eM#eYf?ma8$?|YY zJ+;xXC#le?cEcIEZ%$tV89tpoFj?@|4rh>o;+5fBHA`kFjE#Ol##atWoAAy`-Pr0uzTjuZ7fl zuO}^Ot71B0$X}B74QPmTumGtwAAF+Z@bz7cC}aWV3v_}#ay>L9tU5)*9~qW9(-yTT zaYCjnKU%)-a->I}%9udG0SM~pJ2gM!^uP|9o12G8bUT&9Y(jmxW=h(4aX4I0X4poT z6qeI{bH7`8;6}X7!q$+{01D>j*{mB2uXY)68Okx(rd6MZT-7|G>N;L>pai^vgeb-@ zHEE2wN1acOL%e~q_6BN|%ANXjrK~+W1ot+@L)Kc(McT0FV-p`$FmmU2ioV=5=7S!r zQKst`jk?@6xD^`SK^c=IrXQEx?%K?Q1-ZtPZ&qU1@wq9gPya3mdb487VGx6<^U2a>(jvj^(Nk~ECqLA0 zb}7imY={)-OAgihQ{P|EZUhAMv{*OWzYXp;;Yg-6q=Q<91to@gut%;ux=*Qh=bHH# zEg$qND{63)M$TIYCI6EIBiGzy8kC9&*PSFwq_kO^&&9;>^4&vlszy$4U_aB4R$+Vd z?Aqm6{nWTVC+)XqQ_+J!0kMD^}oqAIal;CwfXB6%4G1Ql<@0O>PYR8I`A+3)n2;;he#VyLTMr4UoUmCwo zJ8>jsX=w@Qs{zm+`_TT*D!Os%y3IaeZ1Ci>kG+#teXB9q6*>t)X(KIu@B06?1$z!0 z#DVLaCU^W0NbB&7CN#Cx*VorbbNdn3Ox<#kOn`4Ai$Mjx$86(+rfUZKotk_|Ck~hK zOo>RS%S|%S8Ms<}_HA%Z+e;p_sv7~2$tKViHdxp2y}|c(8MEFh2twKbQIP-h!z?eQ z4?@zf@277sOMqG+Q&eN$@kS7I0u&y|)Vn4{#AtyIUl7KR@HEB*7_R#^dC0jpMG*@= z0aEk>!1#inPNSlLmQnkYB*V_oC#EK~QJ@%xE*Dk{$y=$Rm-VnfBbjA@r#d|AogWRH`#`Q0vU}pg`L=B@Z^e2l~fYNX` zArK8BQktOp+R{i-%$1XTg8k;fTH97c7Bk=s;LZCAqAmnv8MlIUgBqhK;OBd7p&KJb z@cIaVB_NAapeSbZ^JD1oq(5jmLaskd>EE7CP)J)~@8o#snXJ9ctyr)`!GqYy1y!xb z_)>D=liLxHoY(n&;m;C{&CLTTwTt3FseoyIf?xw`l^})fuhu$4nY(jxqwkWg1JuYd z#L$6&e{~4koH%zdUkq#;>hYtwD_)9PIcD7f6TEY{Fl6m~5}-Y#XVT8<g*6(DP-0ECjMu~&4C7EK}6Jp{hP;vl~j(?FJry|L2mCo!f;Dr)%l$#S$% zslUpDwr59%lRFksRX97wZ-| zq?UK`V41y!akFmdUXM*9Na+Ff<$^;J3EkgeETWi;K0!$TG>AJfx#?g&N9N213{*NG zAv8dnAtNXQz;}Q8^-bUcX-bZ|>n78lonqwUE47OmQD?pGXM%6-tOf&r=uc6cTR^)b 
zgT$`L$S9}ktD<89L~!iPgGK1#TTGSfu^}cNIb8=5{IjryYITGv?u;mRxTOh5-u8b$ zTPKC_89c^!qXgtdKSl$B;L%I<7k%Q1zL$qhVE6?4zT_}|q>5(j&c$*!#;F8&p{l1} zcxy|K7Qro9pJ<>89zW;m$tmsW?$W}6MDt-T>r&-rimUfEH+z-A z3XL`F`><#&E8SBjs|jPan81Zpkc!Blz8{+d&`iUX;XGqdMelIcN?Q(>I}beg^p`=} zwWkwhj%4XofQEoS1-0I#~zx5)m}zGEgZxNqz4QlD)gXlByirlv{a zIz$oHQqlcrAvs-z6*bVcS~NVz2h)mjbH^*58Z&qT_vVwUYj`iI&Q+ZRV$p>8`fh{h zTd4BuNpB)Wq@1!|wW5h`@h%T<%ep3G1nlyy0Jh z69QQdCzA)wOf(S9fWbtdVIe{0cPT8**fYvGA3uJ~$#Giy(W30!e*i2WR%Er;e2nii zNlU)wM>u|mfCH}C3A!(kxS(No?oY1;b*d6Ec9S;cP3CU~A<=RFC6fa#oWJF<1$dM3 zMbtfnPR0U1$EbPE0>2oX%&Q9U9~6XHeFq-7n2BP78?gXt#WBkV*XKH8MIrK$?@+>a z_sC7%Owb3=Kr0@)(3=YSTsrs_&hqgkeAPnQ2DSAAhgI|X`Bf`g2q+gs76F#d9$-4Fy1h@ubP_Hbc3=#6W9A^!93AHYD0+ffje=Rp2SDw>6+~|Yc$}*KgPKvl<~s%-%QwLR)(!&A6cC>- z96k-uR09dn1tW#Rj`0QJOfqU(*Pu=B;moj1&v*^XM=<^A( zvkygZE>hxMn&`e9|L{kqrt(%bS20RY2S0C({&O^fgQq)&R0qhK!wBaO^Z_$^Z!*#Z zQ;jSE{+|sa(Eya$Hc!yeYf;dHb7GgXnqwfu6zFGd^d&NGJM{r|$3ixY4G{E0KI|6Ef_^pfsf6-(y zH|_K57Sa%)%DxA#TL&~;R<^Mx7QyNh!07H)kFe-g^p-pFf6Fr&AAM3 z&%S16;w5ZZ;QQj}}HeNW2vN>v%~6iy99* zH#aws9EVYb16~I}Fw$N!R}OC8e#!O-+r0%5S+-|ifBVnjKQZ_})&U!Mbz$4Nz$L_X z%i-w%{!sWB#&yj|@>9q7l`?Z4?#Y-7W^8QluZOd1{KhHz>Z{#@amv}2qinDAoBwTx zaLnYSdz6088n!_8k?Bju4mUPKMAC?NVr9o#@7c7z3G?b2xnJDfUv+V&CM!TGeFSrZ$un=;dZQkdXzi_ZYY9 z3j8v>TpY!%pf69nqQD2Qi(Fb->x8QQy%^4JxAp5A*65G9{QNi>d`56liN1lsX_YHm zOM-+dgYHMLD5;FgvgICjrxnjj8(WL3`}0O{MsZ5W-Y7NEKl4n?P-wj9&iyb8*Ou@; z6<4Du3DIPG(_mL*RrVM&z=x{k_d@e!TIKZ19H5&rwu0Qd`1+ zxw`~|LKyG0TT447Ia5swb$Mkgkz&?C(pFwNyDQ}$v#;$aQ)^^wS2+xxc$Px1MsZh@ z8Uw#v(rabyqh{ibT0J{ofL5-uQm6F$r?wn}nTE~Q?nIqbf8y`Xn8buL>eqTRFbC_* zeK4Q>`Q?k}Tq`xKx;7tC%p(yh(64skdY!4IXp(|acS7PNXXV9yx4%Fv9z?$5Sh3YE zQFZGMJ2<49a%_6p-o(?PoBqGy3S{dYx)ztNZx1`rOFNw)$b+dZQRMv`n293-5g!G! 
z!Q8m9QsN4qUVMWI@lpz%ZU0IA{wcmG>V$N`5_3PE%z3tCZ-%)q#kbhcumVG^_}61L zq<@r#7rBI7{a-$P21lj5TrT2Z88CS;p8BBs^d#@8@}J z$nxk5{LXR2*vm7@tU7Y~b);7tGn#;ci-gd5)-SU26g{x(tavyX1XO=dt3F72?3zv}C&Sc`&%0Hr}WJ-=%~TAHdLnxo0n0zS0=Jda1O46^F8r1;$WJsoQN}>$4ex>h&Li0XiFMD0&Es8(&y3c#4hOFf+9ov33lO&)tCIJ83%lszRPj1 zm7OIQ&w|&a$kPo!<37Ix3-5&;dPlsSh^_FM9J}uyf8HPnWw%{BAu7aNi)zJE@DQE* zqN)Oh5-L~DAg=k>`~s;CWj;QJT|_o09{k`$voda6uYJBgWW?QbLL?!bfXf|kXg$Ue zDJXNi$7fk6BUE*dq^6aWb+heSvmeNil8FxCsO#QwM^4uH8!^{OH}@|dGdIAbrStbf z&p=$wyk0GbbH^nN)%kj@2mAUeJu{Vuzj$XA!z3VP2KC~5Vb&REx724}Iq>gDWlp67 z%bWeKr*P?gkx^zJz+64vL_CvDiZlL~O5fVH_$}BTQx{wb`ga$he_OTtzr0a_@5=G# zNnaf*M4OA+P9Bot*_QBCSu}X1&34Q6?IA@<{^#)jd<-IMn9u>=-alLtcrJi{t6|g8 MGQ3xG$2R=G0e2iFpa1{> literal 0 HcmV?d00001 diff --git a/scripts/multimodal/test_pipeline.py b/scripts/multimodal/test_pipeline.py new file mode 100755 index 00000000..2b467f9c --- /dev/null +++ b/scripts/multimodal/test_pipeline.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +""" +Test script for the Multimodal Meaning Kernel Extraction Pipeline. +Creates a simple test image and runs the pipeline. 
import os
import sys
import traceback
from pathlib import Path
from typing import Optional

# extract_meaning_kernels.py is added by the same patch to the same directory
# as this script (scripts/multimodal/), so it is the script's own directory,
# not its parent, that must be importable.  When the file is executed directly
# Python already puts that directory on sys.path; inserting it explicitly
# keeps the import working when this module is imported from elsewhere.
sys.path.insert(0, str(Path(__file__).resolve().parent))


def create_test_image() -> Optional[Path]:
    """Create a small PNG with text and a labelled box for the pipeline to read.

    The image is written to ``test_output/test_diagram.png`` next to this
    script; the directory is created if it does not exist.

    Returns:
        The path of the written image, or ``None`` when Pillow is not
        installed (a hint on how to install it is printed in that case).
    """
    # Only the import is guarded: a missing Pillow is the one failure this
    # function reports by returning None.
    try:
        from PIL import Image, ImageDraw, ImageFont
    except ImportError as e:
        print(f"Cannot create test image: {e}")
        print("Please install Pillow: pip install Pillow")
        return None

    img = Image.new('RGB', (800, 400), color='white')
    draw = ImageDraw.Draw(img)

    # Prefer a scalable font; Pillow raises OSError when the font file cannot
    # be located, in which case the built-in bitmap font is used instead.
    try:
        font = ImageFont.truetype("Arial", 24)
    except OSError:
        font = ImageFont.load_default()

    # Real newlines ("\n"), so the text is laid out on separate lines instead
    # of being drawn as one long line containing literal backslash-n pairs.
    text = (
        "Research Diagram Test\n"
        "\n"
        "This is a test diagram for\n"
        "meaning kernel extraction.\n"
        "\n"
        "Key concepts:\n"
        "- Multimodal processing\n"
        "- OCR extraction\n"
        "- Kernel generation"
    )
    draw.text((50, 50), text, fill='black', font=font)

    # A simple labelled box.  It is placed to the right of the text block so
    # the multi-line text above cannot run into it.
    draw.rectangle([520, 200, 720, 300], outline='blue', width=2)
    draw.text((540, 220), "Process", fill='blue', font=font)

    test_dir = Path(__file__).parent / "test_output"
    test_dir.mkdir(exist_ok=True)

    image_path = test_dir / "test_diagram.png"
    img.save(image_path)

    print(f"Created test image: {image_path}")
    return image_path


def test_pipeline() -> bool:
    """Run the extraction pipeline once against a generated test image.

    Progress, per-kernel details and processor statistics are printed to
    stdout.

    Returns:
        ``False`` if the pipeline module cannot be imported or processing
        raises; ``True`` if processing succeeds.  ``True`` is also returned
        when no test image could be created (Pillow missing): that is treated
        as a skipped run, not a failure.
    """
    # NOTE(review): the name matches pytest's default collection pattern, but
    # the function reports its result via the return value rather than by
    # asserting, so a pytest run would not see a failure here.
    try:
        # MeaningKernel is not used below; importing it checks that the
        # module exposes it.
        from extract_meaning_kernels import DiagramProcessor, MeaningKernel
        print("✓ Pipeline module imported successfully")
    except ImportError as e:
        print(f"✗ Failed to import pipeline: {e}")
        return False

    test_image = create_test_image()
    if not test_image:
        print("Skipping pipeline test - no test image")
        return True  # Not a failure, just missing dependency

    processor = DiagramProcessor()

    print("\nProcessing test image...")
    # Top-level boundary of the smoke test: any error from the pipeline is
    # reported with a traceback and turned into a failed result.
    try:
        kernels = processor.extract_from_image(test_image)

        print(f"✓ Extracted {len(kernels)} kernels")

        for kernel in kernels:
            print(f"\nKernel: {kernel.kernel_id}")
            print(f"  Type: {kernel.metadata.get('type', 'unknown')}")
            print(f"  Confidence: {kernel.confidence:.2f}")
            print(f"  Content: {kernel.content[:100]}...")

        stats = processor.get_stats()
        print("\nStatistics:")
        for key, value in stats.items():
            print(f"  {key}: {value}")

        return True

    except Exception as e:
        print(f"✗ Pipeline test failed: {e}")
        traceback.print_exc()
        return False


def main() -> int:
    """Run the smoke test and return a process exit code (0 on success, 1 on failure)."""
    print("Testing Multimodal Meaning Kernel Extraction Pipeline")
    print("=" * 60)

    success = test_pipeline()

    print("\n" + "=" * 60)
    if success:
        print("✓ All tests passed!")
        return 0
    print("✗ Some tests failed")
    return 1


if __name__ == "__main__":
    sys.exit(main())