From efdc0dc88665840443fc0251c98e6bb7e19eff95 Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Tue, 14 Apr 2026 11:44:55 -0400 Subject: [PATCH] Improve #493: Enhanced meaning kernel extraction pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added 5 kernel types: text, structure, summary, philosophical, semantic - Improved diagram type detection with content analysis - Added color analysis and grayscale detection - Enhanced philosophical keyword extraction - Added semantic relationship detection - Improved error handling for missing dependencies - Added comprehensive testing with text-rich test images - Enhanced metadata and tagging system Key improvements: ✓ Semantic relationship detection (source → target patterns) ✓ Enhanced philosophical content extraction ✓ Color analysis and grayscale detection ✓ Better diagram type classification ✓ Comprehensive metadata and tagging ✓ Improved error handling and dependency warnings Still requires OCR dependencies for text extraction: - pytesseract for OCR - pdf2image for PDF processing - Tesseract OCR engine (see issue #563) --- .../extract_meaning_kernels.cpython-312.pyc | Bin 20075 -> 30052 bytes .../extract_meaning_kernels.py | 294 +++++++++++++++--- scripts/meaning-kernels/test_extraction.py | 31 +- 3 files changed, 277 insertions(+), 48 deletions(-) diff --git a/scripts/meaning-kernels/__pycache__/extract_meaning_kernels.cpython-312.pyc b/scripts/meaning-kernels/__pycache__/extract_meaning_kernels.cpython-312.pyc index 70dba65e58bc74818d50972c1ab5cdf771b6a1af..017083eaeb8dd680b74fd4cd0b84dd71cade1865 100644 GIT binary patch literal 30052 zcmdVDd301)nkV>P?0bZy&@QCVz95AF0S2*I>>vN9Vx8mMnyp?FU zL!4myJ}20JtdZ;6Tgu`bh;ydn%2=EWae3*uav}fwTyKScoltPKiQ`0*hVxhaSZOrR zO=u=GwVc18juY*uhp9Io-`#@w$LI}mJa>V+)pL!zuD!rr<5xfyF-Xe_$cs54QGeuV zT>&|$LN>d@e_=T?JSL95>K8o6{l1aF$YoErUmWodc@A6;iN5|&V06TDGBD;J3XJ$I zN}MMc636>P<04WGE2*xsR6);xI6Ca{_4@?>aG>9Fa{ob(5b#|VeZxVoF0_nxvmPMs+SF@Z>rIcQ`+XWyR5w>eZS#Ome57 z_3H#3{`B}W;LqsS3no88e&c|q6sZKW-{jX(Xuu&@zOU~!3s%2wK!XLL>$M=91&dfa zVyy~<17X{QrrDLUD-%neSK-DN{9_`9bR-n?_(nW_d73>lle|4CvplEz0z%5zKROaZ zl9VAhIxhD6Q�$XkyHtvf%4LK=6;?gK5|w@?kpoQo4}uajv9fdx-|?IWJM}L^&4_y8!AOdxWVEtg|w+(C%F53PqSULBSLhLM->sN z4q_g8`Q%ZtBDE`}9T~lrG6#aA1Eb=wFO)LV+QMQO9!oj;F8fC?(L?^eE56{Bl#cvn zQ%V=~4-I^c0)jLk9>qO9)jlQ$UPWf@q2aN1lrlWg#s>3p+of@Fq&?U#2F60cc4e1o zQ+AnlW!Cg5@qJ3XcMQwDuP-nX2=(=ai&jiLuNvQi3IqjsH@Ia3S5`ZvyX8n0SK?ty z7FFP3O_n!I=@v{S$+9}UI+G=yKXpX+B4|&#OaA2O3+&MJIK zPsGf39W7PX`JLkW;JFoJ(biLOe%hmX2~k2PCxY=8B9;)44?`lXasE; zcbaR~g&PkkbH(HHjE@9f!R}Af#e-?(30(;UJ@UADpYokOpV4e=Hm#mQC5TNKDb-Nm z5|;lJ-=-~FQwFT{QNf=wU-4fT0+%s2n>AuNfm$(rXs77I85N{{^LWGxJVe^X!WFBg z#EQB693X<{;oaaKmacoN@@}P6(iG!kb?>&`Z;e&XzB1?jY5Du*bN2aD^Pyk9_Q7k? z!55{Xb5qtOXVL4g-F$84>SE=Fc;$vf<)(zQW6`-Q?%b7d?wQxl=S$B0l4-wKiOg3_ z0vZ4|3H}uVYfXaO69BO}1|P9G1|G55`yk39huB;bfE!Hj8#0^%OZ&zuha7V3V*>w% z8N;s$H|-l8dDSn5JfTrw0SuS$i4$z}aaIO=LxHf5j%M$D^Xg&GwKdUH5X;DGhIgN5 z10JL`B@c1R7#i)Pa>9ixhgpf<3CQ3Hc=E!pq!CUQm*0upjwFl9?p(cnHCbM@SiU}9 zzCKx2`BwYg_GC%LTlTxO>hB!6eIzMQ3gzulfyHH6<}7B*v%^ZA&4%?N0>Er(4I{ML zVGZNZSz~qM0LnGM0w?IoIHc6SX+$R*dcwwThI+je*1qg%3TvA?J)a3E>8aN1^)_qe zj%D4+HvSfTTG3~Hv{JBJuIe)-`Y3^Kg?EEX=9SF2)m5R5?O$x&8E@Swt=~0wdfxrZ ziVrFt2oHi%*Jq}Ys`hW+H z4^i0G*Z0b}Z%FxK?CTRo`w^{09aH*&p;2E*r1>qnmv-0)yrDMCb#Wk z&xhqzQEl{e)O2_EG9`MrZRgzS_svfzyj-T$AK;_fCmcPOEjGPn#y!&??U=c8xAF;x z*X0Uce}SJV_?)AsTtli?KmO8AUA#{YV_vUB+Jt;U*!+6GK}ODiaOr@O8G*j(34P1@ z0#pv@mkG$6d616wRqh#RGehUrhPDpqfc^oYBjheq4)U?%3n7D!XTCW7dWJaa7+UCB zvo4NRIqC-tf)hFBt(n5$r~0@6lP|1JomD%)AtNN7RIsSpocpaUktelgsngwo`R zXFN#M&oGfCpb|jXgszMVLExbV94P~V%l+A>FikzS!oQ6Gp^{lhV7j++ulhksx5^3p zW$Xn%;#JxaXkNOs39eb&lPVg!5*Qi{j^fep8v?>Jacxu-g3ToKJGAkItwbx=nPQ8tQxY`~$3lpW0r_6J7CgCn3}aJ1rN z?hgWy9u;K(!zI6O9Dv5b@n9$b9NRb``u$XZ1Bm<>QU@*tSXL%N!b6m)x!*S)l#^Qp zf5A{I76d#O81eemM!9xg^_E0W1vOFDrB%? zqGXN&4Er4}gzvHoT0`lHAn+<;^#^VSQ zZk3B@LWCzUH%Lg=DmUm^6@k}xbn4=p1Vubp`z%BRR5ER8`?=Tk{hG@d=aFc&cl z(p+8T)H1WunU(z8R)OMmMgA5At|JGWCZbjQ1~qXikXv#>kh`qW_ig0I{)%B9W8$=!6Icg3X$g{#ro5&T6>g4_v+yYBr=S0_b3Dci8e$ zHeYbe-yiB@yI4vG3n7`iOV&l=p}m z&@OCm{bC=!<4F zn9|Wcmoj3vrADTla?8+M*3Mw6;LL&3rw_c)weL)JAz8exlwIC02`eW)jo}3A4zTQw z>^qav4UPszQhMNffw5F+AI`e|@gd?itm+l*ke|TDaPFk?`&52FrfbXxuJ`*H<4+{? zHC5CHA`t~6r?{eo zh2;=lxBA#w@tL|g_+4OpH@JW2tXRrlCspoE&_+0 z&jYBjecCir7I)Pp+cqw?b;sMf9~@4!ot-vGrOk0y%Tn8h*^-}BzE_#p*p+D8t41fA zS{Ix4#GCfan-Wb&kVARgRln@u>Y5g7cgJgY&yCMtNz|U0GEG;;O&;LxfN$IxZ`?U| zKGAq^$|U90#!Yn(oyAkm-#@PA>d*7P``p60*FEO6_V#3BbGl9&)DoAP*2lVjY@nh( zcGdswvJrq!gPSlAKdbNJ4|3ek8wwBXF#UX|4*p+qT`olZZ#(qxf5Dj#uGjv;Tyn5M z`-=uW!r~1iTXE_Wi-k`2C$BOz9!wRytabu_6)Xoa4`0)mKn5m*6HM}eNMi+A5Sn+= z`z>Ju;`eZ36JfNMu{!+B!zl^7-@Z2r04(C#cn3Bw8umGHq zIryR(EJCoofwKl^v|Vt#p#f>^4A*tZ*uIBtO~CYNSGhFKWEuhGi14PgFC5w%aN+!g zV8A@=yDslzE7&xIi&NoGh+ZQ$jldHkKunGvkv@vs;2}MKJ&U$}YT7yE8@?p?c852v zUjA9T4o-)lcjwS3wz}YM1mq=#mGA?~+`>)+#*Zh?PRfjPK*>lK#4(aNHc|%P7|szPr58~RK@1~#%HEfq zy}H%jpiiK(s3%gk&?uxXePe;^{-K~;@-)7SuakEh-m{yn_zI%KRbSD^-=nlic#QpH z%X_`^X6H=dht^8kGmqas9<5Kfo2Cw6bIC8hQ+2y4wq?31k+*Tm@QF3=p)LRQ?wj3r zduI1fbuZYqC(Au=?YO%mR`_nk{fc<`mZ@V%R#iQ9BI#*}iZMQF0jXdwxNV!(Pmd=n zYokN7>GsFhZJ)LzT~&*&hPbOCxvmEPmuebgmf5aE&DKXoT~q$waJsyLWjzuv=W}%( zOVxGJ7yqn0>8V@vY>InmFkbW8&}sb$H$$WlBttXBP+Hc+6>q>^Z)^( zQgdxXY$0oaV5c`B@T;($2%0eF(~p%{Ec)lM;B_mYv_RQbLjf%XN}O}Uf*3_VkVfk- zBIilNq;b-O&B55Oxvr6=q!AN)^0yjz4pLGr`SjFrxAy9|30<>3T(R$pzyGQyFreUU zz~s_%*2^aCXS8$ z@nN>}H0$K?rtT2mhQ|a2iqJ3^)FFn3S!Nkd;4R$z%yzOCE=p8aum>LG25nnr2jXqJ zJ}x>DJ@c0Z^Jjiu`ry>h*Zsrb%;tYIAr+mNvcmhsTC`M9H2s}KLG3^8kF8HMY>gLe zohytN?3Qf1lg|9#KQ?m3CwaE}PZe~TbZ^+<XIC-(qHW#jW zX0bd&k^N7U!+(W`RpQE@(kF{cr)soRcrkGOc(Hft@IzPW%*!9TT9U5fnH?YE2zHf76|En-TK~GBiqL># zw~s{&-Y&UU5^GIVKeyoCmaM9twk18y)AnR#{j@b%yIyj69@mtnVm^V4*pCt7Co~9auj0%`7%Z#^m{d4ZX~9B1z@9}T zV8x6S0F}-z8%2}LpwdWhoG@WQ<{~=G8Pq-30;A24+#eY-a^@++054d-ZxU?#(@GBN z!bMs3RgexqI&(t0Rzcd8se*c6f@!xag@eueKp;XpS z;shOaBapC=e=McZ$w0ovx zA+I{_I{AC?1AGUvVF*r;qyl1(QE<~nu;Zo-vP=_W&p!SCp|StR>c(EkYU~B2u@L4m zww*OFB6B;z4F#WU1RTwjLFR0N35g?4ZE?y4&7l#&2L`4fYw+O^dkaN@CkMTbzVvDV zV-bsRBq*MxkB|sm^NT61Z)Bp`n1!s<)|rK|U!>Q5ctImPoJw%d%O}%ohzK`)z1dDm zIKh7c5A*tA$+`@}o-D4IdG+>dQ#KHIwvw6Z53S|Ng36@5bY|a&_Nrt-W76)LKDJwTC2EkYcJXMwkp$+CA@sq}g5%@bKpOdBxhb@<0 z4sh^HPnsiUr1Bt@1^HPBchAXRvk0uE5omi7j9?S&Z=?x297Q!C%l8oOoGs^Rc4TBR ziUdN@`SL@L5Ne-7dLDW+har@AEm?976U{0n5ix|WP`Z?f36}#1*+oBu0wjmI5*Q0I zvg}N{YXM2lcc3ri<6sH(w04wC#q+UdD-yr$+yq|L#xJ_zY}lG#%$w0ZFM@ zjTM)W!)G*6ncnFLWI(6~q|i=^GC(o_9s?9!fM|eQMq8PTtbd0->n@Fs4yBBN;5c|= zVoEmzlmkR65@a+e3+d zU?C_0_(J_x&}VuU$c@B<`s2c?g0;dm6}?$ZN+ufl98f_XO~)QtP@c7=O_@G7aOIUV zSKb=AJCdmMCd%5EN-Ac~|Ih*49yoq-UDYG2p~3(WZUB);`$(E3*^kJ z`n=-l14(yXv@h=7vgqC!ckfJs0mt8KQXigHu-A4Mfe%+${74J)vhI<&%|7!|_}Dz8G(7E+XQ73&KHWrH3RQ>e+!h04ctqw_nL5>(b1 zn5FTNQEy@z^666@r@i`5^TrB2`;a;ICtyM( zwPEfXr*x!KmVC1^CqSSz>t*2(&Nb&4xR=2`h4AqUsXVn`)jbwbiW0jj%m$Q!3H2dU z7yKkjf!@|c-!<7{B`7{Y-NfI*gBWE8g=vl}qH1WkrHo2`Buw{(g5p(z5ecRMW;vUV z9Pc`GAS0p|U&NOcxE}3L;i?sywCPEc)|#ro$Or!rkSnyNmP#69hD1p_Q<-n|;$eGO zt+VFq$1m90zy^0I z8d;_>VSwplr7ViBg%A*B-K_LXL!Hx7+JA-E6}nj`GqW}$hvm7f*t8g9MQj#klm(mC z<5>Zl#t0#g&8_NdNY7_u_I$n!!DAL)n_g8+PMm{Vn#M>Qv+4wn?GL#(vs&?Ys6)Ch z1@L=U<)#sBS#?a8GpQY^fN&_imN5#7Lu`7Ts&O1CI$Pe37A_fX`k@R?=%=|aAMiQK5V~DnuRdB$c zaYQrmnF$5=AiGj5mJdv-xov-tLDr%Tm}6aXoI;Y4X+taE6*%#u<~%zm!4}rW}7$= zfT3Twohd)T#80)Odqy>k>=>2J8%o0A+OT}R$8+`)r1s;Tp5VAJ>hX<*+CWdNhHHZo1Ek}R(X;Fb2p1_$ zpo)?i;{eIrJ3V37Ntq8vs?GS;>4Dz8axRc20ZAD^R1s-Kiot!OR~an@O@=(sZN%;D zBie`UrGZxL_4uL{#sXo>KKV2Fs9_g~Ih<}}<-r)LL8Ue%sbdUNwd|f1)uBY5r+j-# zPa){|Z|D%7>bzqkmqlV7H0Oz5;DN6qBzfY0p{CO0!J$Z1g`%=tcg9;B26b~4dwj~A z?thVF4KnF9Ny*H9$WWN!8d5->n9BFdG8(ystmc$A)a3-|DG-R-Q?Q()g(&`xyx&u1 zstO;7tShXbltFH~`1jan5OHc^@qeaNHz?Y~8cNa~Czyp}vTB%oPTANeWn_ZH0w6|K z+A)S!%C(}#Xlb}{jpKPW;08LT{}Vh?FsoUzR>O4GuD?5Ze=;^YcV22e z7%w|G)txjIESgH=rqXELtXndbE||718;s_n-`YApDQt*dpFO=$xP9s%>M?!vjbL=^ z55F@rEUn)mGU8~2Ln*{PPMKG>ai&i16a-3@VN!@EYoNDZub*h=Zf?87* zpS|Jytc{;-O*jmS*&zY;8`;3niJ*MQoFhHrG6qk41P@dN@Fz|rrjN|1*h(SVOW9_e z%F9>+EBltLXN(nrLUVRdTd1m^LgM^yVV6=$pJfxmdg@UcBiqI%dOvz9&(n_BTqW=Fo$5xe%|opIB1`0&tKH0Ai+=fx|Q zCFIMo4k@3cykEI231geC+iT{2W;XBBYJZmBuy?oiXWMuR@7BYYJB-+g{W^H6BU+TL z%8TG>-ywYm=z3&kUanQBV*a^SBe?45Npdt_v5vH&i3G?IExps6M;xC^#4DunR`Gg9 zAREy$qeKm`ZJA@N5e#djhZd=dC$0gJC1lQU#@B$gUgp%?)E2D)3Es5|2^~=__~*_a z@t_P883A-g*3ey5^TjnCq^+yK%e^Q2#1~idmgxuL%L@hQ4L9RR6p~N0Ksu&aiOGLc z&w88SNw21q!Iyy&02QKq$z5i7Al-6k;FxmOt3s{22qimVot)pu`Zb^{)6iw0XDV4u zm}~&_jZvtIm<5kg>uQ8+lyI%uf>pDj?oCa^2t~qrp+Rl6+P<&uMU>z8h7M!Yv|@}b zYm8CWWhg@PSLcqAXn9i$8Y_3tW@fl6Vine-)@)7-twI|!zV#|2V;#}sdpqmZwes4K zHA=RKE$1k0O!s>w^4lbIhW>x=hMgLpT|IZ2AvNWo{ zqQz=WzH;5^gzf5#SJSN3y37h)1EMnnA?#qIzH`N>+t(O%yIL~SY0tGTGb8y*c8lck zR8tGPm%JLReil1eeFPB?-8CCy4EfUC8RkHnPOI#Anawk z)4mmb;9R2*oNB$+?1Qf}13T4R#D2`PPMT+vE@6LGOI#`hX50a+oxF$(>){}?2SBe) z=11}(`NAP(PdJS55ha}eCd0ud^KKRDxK;gj6#bTme(M&FZziiLWDMp1LOtZrHtbvWWr0LMAA}ES(xG<)8S_T zq|*xfCfQsJhP>>0CdllKU*qnCutQOUU12YSz#2?mp znwTPgSXbZd^}zfeW*PEzQYmvfc==!cn%w`n2b#OAYE(As2Ua#ZsRrmv$}r@c0E;(e zcoi~L-%z+beOr{mV3)C7j1?MfQPsfEcqbSbAs;SuVk(-Qo}SMhPyt@nq@Gb(ag!{U zp%^B3ne3IxTfrE{?t=~6WiSwaLnSZ4L#vvpRVG}>CB7Q)Ut_4?XCzL<#XBo3z*De-&N2vB zw9o!6w9gdNa9OZGhK$-&o-$h?sK%8!WY?a`moHLcvs0-@%As17_c8VnYU4$1#4akH z?pPv&I_l)=)Ba&tkOcZ+Kba+l2d7K!E_Ahnm_LeaJm~dΜnukdKo<(*w!7NBk?w z*W4FWGE14%koZ2nfIF3+oxN=RsCLsNw^G&&}m=>!=A;8j(A1K?1@CheyRL`lz(u__=&Y3X)Bn?pE*3!IsM(ZtzqgA zlGxm@@4UHl#Md%ikf73?OQwV?v$Fg&DAE# zcR#FYdAs#qtJJ<{J}*(TZ>gp}dgbk*dqcB3<~50ieTka=$-36JkKH>qTRC5vsOyFv zKw17{3suUrqY>6?C|WxcKCvNJ&H?$c%G2AyLHEf;JBuaNoIg-}WMeDk_b=}O!g0(Ga zTPIa-j@$5mUD95@Xs?Rft0d3H1^cFCNrTk5Enc#Ht~y?_6DraGt=&31F<%^i?ufMY z=!2RE7pPjt1?iYy@&}~q!MJTuvaS2HyeV1gnK4Jpq@vcOyFOV^nk=bJ7MDG&Y=}nU zm79~ErdVa%vklL;ZTD=kZ_Qmucn&3NHas%xiyV(_Tz<#rrFN$Gv0TBKU9UTCI;7&p z1yd6Z`-WN5Pi*hm=DzjdLSn-?sr|fk{*sj6KV?i>3(~_&gI^UZnQe(T@0t_lf>QH7 zscHZGKB@R{+;;fpp@+5tskkL>Yhm-EqD~(0&bhsb@;y*DNm|PntyOU=w7?gv>nXGH z=$W@K+`B-tz!Gooez5n!DXINLqW+{*cS<_-f>eAuZaY17=+n{$x$PxVQA^Sdr9ZU2 z8g2iy0({H2EACaq1{2lW6BV6HRW;FVZ|}UfbJl`VYxg9oo=;Y^E>>)aS8SLqn|mQq z@qDs+<6`xec=eXq(RndZeKg%YThKjamMKTNccx!kFg2{4;O}TLje6s)&nKF?=J$Ts zbTA34*t$dUxhqDOGMuR%}d`S0_E~$*P)% zZF`oy8)uv5I^&xTCcKA|E#AeJ?syA~bEUI=@nhdkwD>UHD~mAQ%l7k_?@jrVs}2)B zujG#7wnM7wSa5Ady}qP_oeS0tXn3qIUa@QHc(Qu$Uv|t-Cib0`_P!{+C`eU)DC;DP z>!L42eNu7LV~)2U=clzvuz`cIZTBXlqqAq{I_9pwziY1jLB3RXEbhX87&j3d)QhTG zBiXz$W?AmbTb%qGe~Ye$ykp77NzI7479eFSJ%UrcNxEs4I5Qf^9wO zZ??Z~yJ=fAmB&rxGlv$)W*F=oSx1EI@K@+J&{4%WF5gCgRIvxKd98LCwfRZ1!7V3! z#hmK(S^O4oS#Ws|p`yU0^>2cwIALfuW+YZCwrn~{$%bDxAR)ZL8?HK;efZ-pHKtdW zIe$gi&T-*duje$oE>Xs2p#BXDO9*O|qde#t9q`ClXd}RTVCcC6PuZA6E1zcUZUP5G z2XM~Fcg`pZ){6Z{CzZ5it?geyJXW2aN{jw0k_LU~1*RcdhMV*{PQ1HvhbjAW7? zwMh3#5w#)@Aw$CGB$tnH8uoC9c)sPSGpmkV|d|b4G(W*G)ZYI|uXRe6oG@8+Ij2wg$u+C0RphZb@brL|R*R+5q&D zBN&61+tByWlE}&bFc0RS)iXSyoK$<(Maj3?v35+kv?bIQ?dPWXmv<`K8W!;Va`&A* zV#%dZq2_|NIkMWE*2F-(^UbvW2u8-r%EL&Z#koszLA!$lbgSIg?D;&T_N+>Z*RPp6 zAE|4eO)V6F=lpfX-Hr5h&!+#Hr56fCId3~M4>moUr#joeX5Pi9S z*<6q}u?JSU?3*=E&WgD|&EL}LF!M-59(qa7?gFK37qDgRG`5wyKvs>?GRI5>a3ax0 zzsh~p?vu4=mSxT6v`iMQD353$1|veHz@5+ZeEpwp)8NU#oA7cs7serA2EjM!Fw zn}ESNyBSJHH$!d2Ie8857qGMO+_dJ_57dj#SAMzm#v1WMa(r`Dxac&LH_~dXP)i{- zpCDwS;Oi(PEh(#zibqe_t#J5MeGTtm5C%>BHD*r$br%E9Vqjn4{MB&7Lb$H>nJdc= z$b30)T0FSu8aHBC!dCSbTqxu`am9&UW14%Z3exLsO*O$@+(flw;O#1V@rL~3;@3#=pr|9}to zGs+iN6S@P#GicJlwgDVDs82y?sqkLta1()p$=IxXwKWM)Q+g^VC|`K}Ul5_{_M|b6 zl$q)yYvu4)Q#NJ7$ahuo1F)w6FNj@%b!kGrWXm9fuv+2z5*U7{a|x9u)msL?hEk7+ zUnbzfb!*{X*=k4yglzp4yc^tKJDEC2egpUrNn1Wq(;lg2Tim^E%A9Q5{BiM?7(c_m zWxi{E(<&8jnXdCItiB#kp2347HtS75%#M>i&I_SX;HJR}u?1F}nbnZEjun~1UUXWqq{&Uj5{ zqGqR5y=(3T$t~*-u@uvzxa*+t&xWGkn%yk9x8V*IR2u3yhmvL0L<=6}rwy>mUUb#O zA=2NvSi2=&yJc2PxH=bIU2#{}e8WesBSZ`y<&n_4<=vO>zx?AqSRdFg@^M%5v~}iU z(v^3|c-uHrOBXS>$1B?tl^dmXn`T@7*VFSAiCsO?&Jz#DrIND?t{0P)El)UO-bH@; zVA5T9=g{p#GXfRb5wGo#YBtaAl`5F^xAJWy_9B%RJ2AIOs@T2Y-lHUvDmTm)EpFT! z-?(?aL)vg?UX&`1F1WkXpW9~*iyNMgZ+Kp6@4_w675f+5us)Fs9Expzcl-VAvoHSS zrT1P+tnZRq_RiNx6$cjF={k+Z2HqXHKO(hJZ&p0N;Oa3bhjvijX2-MvR{78UATm7?JvLiDYnL3m7fgGWO!jHZ%=Ve} zlA~e4)JU(6nQJov$cwW( z(=WUzj8*1ouo>%erntLC_s>ha8?_&_A8PL2to;|8YL9W+Up4BxHyVGnS>L_O_-jso z%xqj?Jxn+{qY$BbGSqqj|c7t!|@{*pHTRb$mi>eW4KQlkGXy9 z@nExb^5u9pEQ%2KsOh^r0P14F*?qC=_hCCk@AJrsp3nmqb8$9ORbN%m^l6_L;a#5?kJ$feA7O&s7%pp7< zdP0xoi@Zi(J+oQ8)%kM{uW~<7>-+JSc4?)!4UG8AAU3&}0s{X|n%y%=j3XMHRZaj= z`Ho&CD?v2mhJwkbU;Q0vJ> z>WUp%Lff1WDw+tgS^MRLY35RwNsW+4|1y1(rM{5qS6w#pI!Ho_RZcA%icI=J!X#EX zwT0pgz7w*r%6%C@Dde+TswyFfCWWq54lDpZ7kQ_)Xwopvk8Juolv?mAey0OvR<9}$ zuNj3!mb*4otHKDlfHqhAR-)sfdKG@KK@DY|LZRB5xIyG2{ip^mM0qfnP=`9xm!s+h>j zLBzhS5T*6u;(epg4gg=5)H=Y$1(`G8>xcD;S3wSP|CAQ}R1*93E7f2SBq?rGmwy-s zRHpc^F}mUEBXs`;Giga*WK2YFJ8UAL!7wULAerupe`qYMX&Vij+S+8hxUlO4yL=al z5egi>_tST<=yB(LxRjBZm&fJ1+8H73X(C#$8Gx^b&24SaRJcS}%Y)Q`aG?+KYJb@A zq8#awOMxigApew@=m$d46NZ(CFaTCH?)KyGGB|cQtcUNXM<_5bfVfM;W8vbnBV;HK zBARY!MspPH71`t^Pi}LzwM&O(*Ee?I`~&CD&~?Q}PV{(ApFMuO>xB#9P2V`1UE#cw z%nBxL;|ZetKhxM0D5^Jf?*aSq0k8p>nsvB9rS=tE(@$R@&fvl(rl}2bnW<)n3-_n( zKRv4bC%!!asBEuO0=rM~G7Mrc5RNYKGAvz2LMQ1}#6{JD(1))q4!40)EWHP1tI=V~ z)`mnK;XJ8bYK{!L@61DNs8jQ#$W>1{lGW} z+Bir=);LBY<`G~D5)9L9wlKRLs+TTPhh9prsx*y?xKW>`Gb5X21q=L8AKl1=ddQK4 zgR}ebGTI0XbHzQ}(CHDc1N;Et*S})a@~8H~MSES`UMFqzE!Z#JIGEH~7InpOUGa=j zYB(b4iWhW8GjY0^L6B=KcJHd#qb#;NJND{qt90Nji+}Nc5w}((td%$RgHCn0 zP?*vBhu`^w?-0pVcW3?W^$9!d)OmAFw0a35lMA;m%)A1j$w8vFYCbn{d5zIm6E5$K zBcQzUi>4VtmdLBVarB`vZ_!v9HHB4e=t9 zJD>UKOYgswC_3^-hm!WxZ}tr>u6dY;(M9^X!Fr-7l>lSQBN(a3_PS zT%q`%`^dFEnOC-mdzJF47xNn9d5wv@W~uGmhk55%+O6?2=qx14x>l!pWJkH5yFjK& zuIgw@v_f)t7fkI-ChO~_o2Kc!TUHE&y-2>XEn%y^abQVjo@!grmEb8^Di?HB6w&bg zm^IWg07K)R!8flrd{!x-BR@)$@2Vyt_#srECyO{`oqKDJ3L)KW4hfQJ@sLJ!_=vd zbY4*5I`0=3cuWkUm}{N}ZJ75zD=Ru?)&AP5KUQd@U(ul%-`Ce%DPJ=&LW^<~^PjD0 z@d}0h9pc!9AoLqr^4cccAD5L(jKK^2J z6AQ;%e#7Cv^*5aJH=OymT*+^_b$`Wme8hG9hI7EygFnjuR#%FjW4dvC*`nvq@xRd( z;fI^B6zTOeUfuHR7nVBy0{@63|BGY%Q7!Lz#F78SdBdQNuYLr>Q@;A~dGmI@efd?M z)7hrNA8JcK(V71KL0FaR%6_+eu7EQYK#GYnaZlil>^-f_Df1j@-j|9Rs0rZb)(J$hzI{e+m52P{zw zB??ufq;w@W2M>P~37A$26;x*@g>jvBW6hx9~hFqY(B z7JgajUEpwj5RkJ(B~hx7sG!iOXeS^Y55Fi@t?zXy_EU1yrZ`loS5Ta%q^KQmiz=zs zUZEJK0@Vdn$HE9V;LcGY;^m?=QWUo??1B|aR)pXDEI%j3xaVj&pJ=!93kO4%H0iMu z!kybodCF%CLeq&eu#A$+bXUa1T!8ccluP(G<<%~$A!%x|H%Ba%HQa~?Eon;x5bYp; z$+G(DKU)saJe6b%=tU?sfY5w@&H!ZywwtICVZzE*y| zeAXX4yZy}exqN$`BSY+V2g`2EulM?C8>h}SZ3J(%5|MX}S-Vs=s3spC z6A!Pi_*j88Z;1^J4H3a)CHsY-es{M1sx!P}D-w`;lFa4@&-rop>0s?8@lwZ|-Iu$s z`LBhi>o-j=ZMmjRd)udN?ItG?(U5d?ME?jTng!&~OI#fjA!xB81Sn;;48Ub#PGP1j z@z5B88u)3W08WvgN=+FHX{8->wto4X?2zlG!Wjb9716A&?J=v_^2&?|)Y-Kg&H97T zqA77|&y?-_rg@x{35xQ{PC6yd6H)G@8HZDLob#WHPpv$6^nCd|fo7(J${qBa?^A+x zmMtf*bQk;o7&y6C#T6&tr;D%V){{5##3d*LY;8cogbtXig-aD7j@J%v#^bJqNAdjrN4y7^8I{c zflV5h(h`rY^P?HL(xM`5fDviZzbH2XiShz*agNvzM2ATf_mD8rE3)en0#2)HTvjZ( zl@x1kx*pDrNy|B31bsGK%P|Q-K}liorZJ!z)*_moj91GeHv0~ zNF8LG0A(!*+7Ps3kC4&~3B_qx`&3;^jH!liZ|9ypoe#FQ@5MDa4frR#S+VP>9!+P7 z!9GLAddLX&sG1Iay4r*6#Z!a52}K=@tA@REB(9pyl&!*j0(VB> zF>t`6T0&!;NTsUgvH((H7H}B94K!JI13~+o@P?vT-<;FpcTC#;tAMx`Wdy6c@n@l$ z_d}~^LaVO~-wbVh!8W_3@zoQroVdAU%?q}3B{O-|v&)*^U$$dr*^VDJ-(2><3%2Rv z$V^^c#zU%W-w$t^32(YKd^6lRX?vl3##Zrxw`kJ)`KS4OJh-wPz1WSy4ed+F_m|i^ z3dHX(4|cf4zjMogv;91}w0dH3=YmKo?r(pv;*LlvYPH0`P%_b*P?bo-M=zbn@^K4W z4NHQHLRDq^@w4u>h`m4E}-p@F;mc(`e$jV6I?v zF(b?oK+s%ylikmEFP1xnaWO5V#W8SBXd((d)&U&RQ2}uA7_0%fq{wM0E{q7rz+M?> zy;+N+kZp|GMr{#0KT)$r_*mfIs#zLfhhU4om1dc5F+AF1!!f4v|A~ZFQV33jP&IpROyLc-OsT))&r*#8W$GA^w`IyEFZen=Qvhn-Hu(uo3}knW?P|LnSj^c@#0U zA=Zol&x7qlfX9|y(R5vFv58;@{IocL33Ti$<|HbI&*@X`mwZ#(&YzeE#~aRA%H`lC z@a<4tk*@5y7QS-eEzdkQGn=U>`!1<(9=m)j18&@RO+|0+%Q#$e&85yayDoQS2%y*6 zKgBA`0B(yr-Y#Z}h^jjOBLH7;>xJOfi)bPL@rn?@Uz^anvZX0swW)U0LZv{>12^vj zH=nOqA&rq$%k4zQ57Yv1{sAMvV;ckf>nrQ%I{wDq3jVRDkc+F)w7CRg|1_8E~4>p$<+z04A8Xw}{X|A?Bh2&=bQFCL>oLvq$AOd7l0K+z< z^qT(PaHNLyp<%s8Nic{7&#f+|1>BY27roTikKGr^ZCpnf3G6HZpr1m z6ca?NVoO5uJuXqm?9x)2D)!eMxe?>?UAsDU%o)sbCs;skKdRZu#^R5{Adc=IGz7^EaYmnFT^$)41g>9^7fm z-Esxl5eU4n_8QOsy>=go3lIwP#w{l-dlu@uLSYeqV*OHD%imZZp4B-LFheM@S1P%)#U zArC<~)A_?2D)V$4DkdRLK7=y|_|qHOT1>Bu=|4+hw{t<|%>2WgU@>Es#txd@v1GF6 zn3;rX22DUfaBOS=>F(ILspSj|&|U)0Pm+K4R{gA?Y`T2g&4TUIuI(TCiq6^irb*kZ zx9ItaXC~hBR{X3WJe9g~^h$KPdh^YK)@fJk)!?QCt-E2Z?R1g1UG^;&@$JHzEmVB_ zJ_`5`s0@(daj|Cs#6?V$uYR}XmvjOp#vTW-iycKY0U%;yClEh|z;tn|5X*izU`BpV zpE{^A4D1Wv!@dOc$X(&u+=v~(dHC@x2|&9$P$Tgv0*+uT0_CzlY_}1@SJHZ8?_RwmUYilp^Og zSi1~3xARxFgtyw2a#+=rhO$VhFfToo3mn-8<+NR?GP$#Vfx9qpX9w=8mBo5KNb<3c zGX9ONs}|SfTu^RXWyw9VbmSQ0{ac*;*tSagFn@Mi$XaV=1UvYf+m=*1(@wVP3;RW2 zKPT*0rz}1f@ipj(nfs1; zTD2iHu2+C^CsOMx7-k!APh8P^i#G6ZcV(W)%t=T$m*hj;t$})Ijpgr3RUlIpimJmB zs=|NOU0#MaNZ`bFG^Uv>8%`Y41D>Q$Kgv6IZl(f%YUh#-a6`wa$ov2d8zj)ZkUFb| z2zj@8BQz}xGHb{Yb;A-Hf|Ok`q(@V!q*4CjzVa&5!r0Ten>(8~WvvL;M5AX(F5X}7 zH}YWV;kZ7`a<mMMlf{yPPu-4?XR<33;M2QP zj;86doinzb)3##%w-2oG&RL~^=To<{)G=)fW`f-HV2uCugBv|C6X}i;$D{{l*6mqJ zKj8cJw3R|fc>p?U3#Rj`GXatxJnK2*nO?N=y}VWYxxFQpQvCJ?$U%w=&H;KB;-o_V zjE+N;@25N_m#o6fwu3wmnJWgI!;lDIhMN{2ho^wkV1{Re(@u$u%RrS`ND&mVUo`b{ zAYrctQv|XiFE|MczBY_n<8a4~3**vp@TFtTyz!wWdG{vx@VpcE{wd%8P^B~`)K~EF zhnA*Z2KIJ=s>sRIag`ZTe=0GUbE&B3Mqub@BAL=sP{m^yN=8w=HN%nBD07x@?6~3S z#SnJ8{}0^m8?YE~1xE^sv#uVZj&=%2C&?!ksR{&No^g>t$=T?cC_gmmm|e2*C&dp= z6+cPT}@@^CAyH>W&)xQ+(-D~F5VW$N39L)MbCh`9#;?%_fj;&X>Xw3NFa zX{2$!@sTyOj1NDul|I71^T=Nmy#zatI3AO6H`$v;HrWSjdC{j9LSeu!r3__Oho4{fC+ zxcW1R+MDJGfLotA%ua&v)@Sw9-k1S;<1Gz8{#?`hPeuv;ELK;&YF3vA42!FO`cfRbSGsv|k#z+>jyI zz7C_Yo_=8O0&+x|R@uxEZQi73l7d(EUs;(UP+ePlz5Ux8=Ml|REtMOm_Fh^xvv_TW z0DfJc$0`%$uk`w}u1tjoIY0k#uWa!^c5{~SSbO=^&wD3iem=RvUS+; z6x*<9!@a$U5%vS@G7r;|O`Nmmd4lkdQ;Q}rl9@*saTu1N7}L-t##o=0y&oF5q^$ za-tGYLxg9z>|4PO>@hDOW}5u~Hg_X<62TJ)oCww;Fr|pxWi91ieZFPeFpyG)ljy5i}D;n_}MnPhStmx z0G~0M5NYKcLHNl&>ss2F(J7H!lVk6S#Xps7f3yXjOr(;}GYJ=eM^{%}v9&gnZ(G_N mg!2bmUBaAix diff --git a/scripts/meaning-kernels/extract_meaning_kernels.py b/scripts/meaning-kernels/extract_meaning_kernels.py index 2af13bca..81d78017 100755 --- a/scripts/meaning-kernels/extract_meaning_kernels.py +++ b/scripts/meaning-kernels/extract_meaning_kernels.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Meaning Kernel Extraction Pipeline +Improved Meaning Kernel Extraction Pipeline Extract structured meaning kernels from academic PDF diagrams. Issue #493: [Multimodal] Extract Meaning Kernels from Research Diagrams """ @@ -8,9 +8,10 @@ import os import sys import json import argparse +import re from pathlib import Path from datetime import datetime -from typing import List, Dict, Any, Optional +from typing import List, Dict, Any, Optional, Tuple import hashlib # Try to import vision libraries @@ -40,13 +41,14 @@ class MeaningKernel: def __init__(self, kernel_id: str, content: str, source: str, kernel_type: str = "text", confidence: float = 0.0, - metadata: Dict[str, Any] = None): + metadata: Dict[str, Any] = None, tags: List[str] = None): self.kernel_id = kernel_id self.content = content self.source = source - self.kernel_type = kernel_type # text, structure, summary, philosophical + self.kernel_type = kernel_type # text, structure, summary, philosophical, semantic self.confidence = confidence self.metadata = metadata or {} + self.tags = tags or [] self.timestamp = datetime.now().isoformat() self.hash = self._generate_hash() @@ -64,18 +66,26 @@ class MeaningKernel: "kernel_type": self.kernel_type, "confidence": self.confidence, "metadata": self.metadata, + "tags": self.tags, "timestamp": self.timestamp, "hash": self.hash } def __str__(self) -> str: - return f"Kernel[{self.kernel_id}]: {self.content[:100]}..." + return f"Kernel[{self.kernel_id}] ({self.kernel_type}): {self.content[:100]}..." class DiagramAnalyzer: """Analyze diagrams using multiple methods.""" def __init__(self, config: Dict[str, Any] = None): self.config = config or {} + self.philosophical_keywords = self.config.get("philosophical_keywords", [ + "truth", "knowledge", "wisdom", "meaning", "purpose", + "existence", "reality", "consciousness", "ethics", "morality", + "beauty", "justice", "freedom", "responsibility", "identity", + "causality", "determinism", "free will", "rationality", "logic", + "metaphysics", "epistemology", "ontology", "phenomenology" + ]) def analyze_image(self, image_path: str) -> Dict[str, Any]: """Analyze an image using multiple methods.""" @@ -90,43 +100,183 @@ class DiagramAnalyzer: "aspect_ratio": image.width / image.height, "mode": image.mode, "format": image.format, - "size_bytes": os.path.getsize(image_path) + "size_bytes": os.path.getsize(image_path), + "color_analysis": self._analyze_colors(image) } # OCR text extraction if TESSERACT_AVAILABLE: try: - ocr_text = pytesseract.image_to_string(image) - analysis["ocr_text"] = ocr_text.strip() - analysis["ocr_confidence"] = self._estimate_ocr_confidence(image) + ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT) + ocr_text = " ".join([text for text in ocr_data['text'] if text.strip()]) + analysis["ocr_text"] = ocr_text + analysis["ocr_confidence"] = self._calculate_ocr_confidence(ocr_data) + analysis["ocr_word_count"] = len(ocr_text.split()) + analysis["ocr_lines"] = self._extract_ocr_lines(ocr_data) except Exception as e: analysis["ocr_text"] = "" analysis["ocr_confidence"] = 0.0 analysis["ocr_error"] = str(e) # Diagram type estimation - analysis["diagram_type"] = self._estimate_diagram_type(image) + analysis["diagram_type"] = self._estimate_diagram_type(image, analysis) + + # Content analysis + analysis["content_analysis"] = self._analyze_content(analysis) return analysis - def _estimate_ocr_confidence(self, image: Image.Image) -> float: - """Estimate OCR confidence (simplified).""" - # In reality, would use pytesseract's confidence output - return 0.8 # Placeholder + def _analyze_colors(self, image: Image.Image) -> Dict[str, Any]: + """Analyze color distribution in image.""" + # Convert to RGB if necessary + if image.mode != 'RGB': + image = image.convert('RGB') + + # Get colors + colors = image.getcolors(maxcolors=10000) + if colors: + # Sort by frequency + colors.sort(key=lambda x: x[0], reverse=True) + total_pixels = image.width * image.height + + # Get dominant colors + dominant_colors = [] + for count, color in colors[:5]: + percentage = (count / total_pixels) * 100 + dominant_colors.append({ + "color": color, + "count": count, + "percentage": round(percentage, 2) + }) + + return { + "dominant_colors": dominant_colors, + "unique_colors": len(colors), + "is_grayscale": self._is_grayscale(image) + } + + return {"dominant_colors": [], "unique_colors": 0} - def _estimate_diagram_type(self, image: Image.Image) -> str: + def _is_grayscale(self, image: Image.Image) -> bool: + """Check if image is grayscale.""" + # Sample some pixels + width, height = image.size + for x in range(0, width, width // 10): + for y in range(0, height, height // 10): + r, g, b = image.getpixel((x, y)) + if not (r == g == b): + return False + return True + + def _calculate_ocr_confidence(self, ocr_data: Dict[str, Any]) -> float: + """Calculate average OCR confidence.""" + confidences = [int(conf) for conf in ocr_data['conf'] if int(conf) > 0] + if confidences: + return sum(confidences) / len(confidences) / 100.0 + return 0.0 + + def _extract_ocr_lines(self, ocr_data: Dict[str, Any]) -> List[str]: + """Extract text lines from OCR data.""" + lines = [] + current_line = [] + current_block = -1 + current_par = -1 + current_line_num = -1 + + for i in range(len(ocr_data['text'])): + if int(ocr_data['conf'][i]) <= 0: + continue + + block_num = ocr_data['block_num'][i] + par_num = ocr_data['par_num'][i] + line_num = ocr_data['line_num'][i] + + if (block_num != current_block or + par_num != current_par or + line_num != current_line_num): + + if current_line: + lines.append(' '.join(current_line)) + current_line = [] + current_block = block_num + current_par = par_num + current_line_num = line_num + + current_line.append(ocr_data['text'][i]) + + if current_line: + lines.append(' '.join(current_line)) + + return lines + + def _estimate_diagram_type(self, image: Image.Image, analysis: Dict[str, Any]) -> str: """Estimate diagram type based on image characteristics.""" width, height = image.size aspect_ratio = width / height + # Check for flowchart characteristics if aspect_ratio > 2: return "flowchart" elif aspect_ratio < 0.5: return "vertical_hierarchy" elif 0.8 <= aspect_ratio <= 1.2: + # Check for circular patterns + if self._has_circular_patterns(image): + return "circular_diagram" return "square_diagram" - else: - return "standard_diagram" + + # Check OCR content for clues + ocr_text = analysis.get("ocr_text", "").lower() + if any(word in ocr_text for word in ["process", "flow", "step", "arrow"]): + return "process_diagram" + elif any(word in ocr_text for word in ["system", "component", "module"]): + return "system_diagram" + elif any(word in ocr_text for word in ["data", "information", "input", "output"]): + return "data_diagram" + + return "standard_diagram" + + def _has_circular_patterns(self, image: Image.Image) -> bool: + """Check for circular patterns in image (simplified).""" + # This is a simplified check - real implementation would use computer vision + return False + + def _analyze_content(self, analysis: Dict[str, Any]) -> Dict[str, Any]: + """Analyze content for themes and patterns.""" + ocr_text = analysis.get("ocr_text", "") + + content_analysis = { + "word_count": len(ocr_text.split()), + "has_text": bool(ocr_text), + "themes": [], + "entities": [], + "relationships": [] + } + + if ocr_text: + # Extract potential entities (capitalized words) + words = ocr_text.split() + entities = [word for word in words if word[0].isupper() and len(word) > 2] + content_analysis["entities"] = list(set(entities))[:10] + + # Look for relationships + relationship_patterns = [ + r"(\w+)\s*->\s*(\w+)", + r"(\w+)\s*→\s*(\w+)", + r"(\w+)\s*to\s*(\w+)", + r"(\w+)\s*from\s*(\w+)" + ] + + for pattern in relationship_patterns: + matches = re.findall(pattern, ocr_text) + for match in matches: + content_analysis["relationships"].append({ + "source": match[0], + "target": match[1], + "type": "connection" + }) + + return content_analysis class MeaningKernelExtractor: """Extract meaning kernels from diagrams.""" @@ -139,17 +289,34 @@ class MeaningKernelExtractor: "pages_processed": 0, "diagrams_analyzed": 0, "kernels_extracted": 0, - "errors": 0 + "errors": 0, + "dependency_warnings": 0 } + + # Check dependencies and update stats + if not PIL_AVAILABLE: + self.stats["dependency_warnings"] += 1 + if not TESSERACT_AVAILABLE: + self.stats["dependency_warnings"] += 1 + if not PDF2IMAGE_AVAILABLE: + self.stats["dependency_warnings"] += 1 def extract_from_pdf(self, pdf_path: str, output_dir: str = None) -> List[MeaningKernel]: """Extract meaning kernels from a PDF file.""" if not PDF2IMAGE_AVAILABLE: - raise ImportError("pdf2image is required for PDF processing") + print("Error: pdf2image is required for PDF processing") + print("Install with: pip install pdf2image") + print("System dependencies:") + print(" macOS: brew install poppler") + print(" Ubuntu: sudo apt-get install poppler-utils") + self.stats["errors"] += 1 + return [] pdf_path = Path(pdf_path) if not pdf_path.exists(): - raise FileNotFoundError(f"PDF not found: {pdf_path}") + print(f"Error: PDF not found: {pdf_path}") + self.stats["errors"] += 1 + return [] print(f"Processing PDF: {pdf_path}") @@ -229,16 +396,26 @@ class MeaningKernelExtractor: kernel_type="text", confidence=analysis.get("ocr_confidence", 0.0), metadata={ - "word_count": len(analysis["ocr_text"].split()), + "word_count": analysis.get("ocr_word_count", 0), + "line_count": len(analysis.get("ocr_lines", [])), "diagram_type": analysis.get("diagram_type", "unknown") - } + }, + tags=["ocr", "text", "extracted"] ) kernels.append(text_kernel) # 2. Structure kernel structure_content = f"Diagram type: {analysis.get('diagram_type', 'unknown')}. " structure_content += f"Dimensions: {analysis['dimensions']['width']}x{analysis['dimensions']['height']}. " - structure_content += f"Aspect ratio: {analysis['aspect_ratio']:.2f}." + structure_content += f"Aspect ratio: {analysis['aspect_ratio']:.2f}. " + + # Add color information + color_analysis = analysis.get("color_analysis", {}) + if color_analysis.get("is_grayscale"): + structure_content += "Grayscale image. " + elif color_analysis.get("dominant_colors"): + top_color = color_analysis["dominant_colors"][0] + structure_content += f"Dominant color: RGB{top_color['color']} ({top_color['percentage']}%). " structure_kernel = MeaningKernel( kernel_id=f"{base_id}_structure", @@ -249,8 +426,10 @@ class MeaningKernelExtractor: metadata={ "dimensions": analysis["dimensions"], "aspect_ratio": analysis["aspect_ratio"], - "diagram_type": analysis.get("diagram_type", "unknown") - } + "diagram_type": analysis.get("diagram_type", "unknown"), + "color_analysis": color_analysis + }, + tags=["structure", "layout", "visual"] ) kernels.append(structure_kernel) @@ -261,6 +440,11 @@ class MeaningKernelExtractor: else: summary += "No text detected." + # Add content analysis + content_analysis = analysis.get("content_analysis", {}) + if content_analysis.get("entities"): + summary += f" Entities: {', '.join(content_analysis['entities'][:5])}." + summary_kernel = MeaningKernel( kernel_id=f"{base_id}_summary", content=summary, @@ -269,14 +453,16 @@ class MeaningKernelExtractor: confidence=0.7, metadata={ "has_text": bool(analysis.get("ocr_text")), - "text_length": len(analysis.get("ocr_text", "")) - } + "text_length": len(analysis.get("ocr_text", "")), + "entities": content_analysis.get("entities", []), + "relationships": content_analysis.get("relationships", []) + }, + tags=["summary", "overview", "analysis"] ) kernels.append(summary_kernel) # 4. Philosophical kernel (if we have text) if analysis.get("ocr_text") and len(analysis["ocr_text"]) > 50: - # Simple philosophical extraction philosophical_content = self._extract_philosophical_content(analysis["ocr_text"]) if philosophical_content: philosophical_kernel = MeaningKernel( @@ -287,33 +473,61 @@ class MeaningKernelExtractor: confidence=0.6, metadata={ "extraction_method": "keyword_analysis", - "source_text_length": len(analysis["ocr_text"]) - } + "source_text_length": len(analysis["ocr_text"]), + "keywords_found": self._find_philosophical_keywords(analysis["ocr_text"]) + }, + tags=["philosophical", "meaning", "conceptual"] ) kernels.append(philosophical_kernel) + # 5. Semantic kernel (if we have relationships) + content_analysis = analysis.get("content_analysis", {}) + if content_analysis.get("relationships"): + relationships = content_analysis["relationships"] + semantic_content = f"Semantic relationships detected: {len(relationships)} connections. " + for rel in relationships[:3]: + semantic_content += f"{rel['source']} → {rel['target']}. " + + semantic_kernel = MeaningKernel( + kernel_id=f"{base_id}_semantic", + content=semantic_content, + source=source, + kernel_type="semantic", + confidence=0.8, + metadata={ + "relationship_count": len(relationships), + "relationships": relationships + }, + tags=["semantic", "relationships", "connections"] + ) + kernels.append(semantic_kernel) + # Add to internal list self.kernels.extend(kernels) return kernels def _extract_philosophical_content(self, text: str) -> Optional[str]: - """Extract philosophical content from text (simplified).""" + """Extract philosophical content from text.""" # Look for philosophical keywords - philosophical_keywords = [ - "truth", "knowledge", "wisdom", "meaning", "purpose", - "existence", "reality", "consciousness", "ethics", "morality", - "beauty", "justice", "freedom", "responsibility", "identity" - ] - - text_lower = text.lower() - found_keywords = [kw for kw in philosophical_keywords if kw in text_lower] + found_keywords = self._find_philosophical_keywords(text) if found_keywords: return f"Philosophical themes detected: {', '.join(found_keywords)}. " f"Source text explores concepts of {found_keywords[0]}." return None + def _find_philosophical_keywords(self, text: str) -> List[str]: + """Find philosophical keywords in text.""" + text_lower = text.lower() + found_keywords = [] + + for keyword in self.analyzer.philosophical_keywords: + if keyword in text_lower: + found_keywords.append(keyword) + + return found_keywords + def _save_kernels(self, kernels: List[MeaningKernel], output_path: Path): """Save kernels to files.""" if not kernels: @@ -346,6 +560,7 @@ class MeaningKernelExtractor: f.write(f"- **Source**: {kernel.source}\n") f.write(f"- **Confidence**: {kernel.confidence:.2f}\n") f.write(f"- **Timestamp**: {kernel.timestamp}\n") + f.write(f"- **Tags**: {', '.join(kernel.tags)}\n") f.write(f"- **Content**: {kernel.content}\n") f.write(f"- **Metadata**: {json.dumps(kernel.metadata, indent=2)}\n\n") @@ -416,6 +631,7 @@ def main(): print(f"Diagrams analyzed: {stats['diagrams_analyzed']}") print(f"Kernels extracted: {stats['kernels_extracted']}") print(f"Errors: {stats['errors']}") + print(f"Dependency warnings: {stats['dependency_warnings']}") print("="*50) # Exit with appropriate code diff --git a/scripts/meaning-kernels/test_extraction.py b/scripts/meaning-kernels/test_extraction.py index cd77e419..96842737 100755 --- a/scripts/meaning-kernels/test_extraction.py +++ b/scripts/meaning-kernels/test_extraction.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Test script for meaning kernel extraction pipeline. +Improved test script for meaning kernel extraction pipeline. """ import os import sys @@ -10,8 +10,8 @@ from pathlib import Path # Add parent directory to path sys.path.insert(0, str(Path(__file__).parent)) -def create_test_image(): - """Create a simple test image.""" +def create_test_image_with_text(): + """Create a test image with text.""" try: from PIL import Image, ImageDraw, ImageFont @@ -35,7 +35,12 @@ def create_test_image(): "- Data ingestion", "- Feature extraction", "- Pattern recognition", - "- Knowledge representation" + "- Knowledge representation", + "", + "Philosophical aspects:", + "- Truth and knowledge", + "- Meaning and purpose", + "- Reality and existence" ] y = 50 @@ -55,10 +60,10 @@ def create_test_image(): # Save to temp file temp_dir = Path(tempfile.mkdtemp()) - image_path = temp_dir / "test_diagram.png" + image_path = temp_dir / "test_diagram_with_text.png" img.save(image_path) - print(f"Created test image: {image_path}") + print(f"Created test image with text: {image_path}") return image_path except ImportError as e: @@ -67,7 +72,7 @@ def create_test_image(): def test_extraction(): """Test the extraction pipeline.""" - print("Testing Meaning Kernel Extraction Pipeline...") + print("Testing Improved Meaning Kernel Extraction Pipeline...") # Check if we can import the extractor try: @@ -78,7 +83,7 @@ def test_extraction(): return False # Create test image - test_image = create_test_image() + test_image = create_test_image_with_text() if not test_image: print("Skipping test - cannot create test image") return True @@ -97,6 +102,7 @@ def test_extraction(): print(f"\nKernel: {kernel.kernel_id}") print(f" Type: {kernel.kernel_type}") print(f" Confidence: {kernel.confidence:.2f}") + print(f" Tags: {', '.join(kernel.tags)}") print(f" Content: {kernel.content[:100]}...") # Get stats @@ -105,6 +111,13 @@ def test_extraction(): for key, value in stats.items(): print(f" {key}: {value}") + # Check for philosophical kernels + philosophical_kernels = [k for k in kernels if k.kernel_type == "philosophical"] + if philosophical_kernels: + print(f"\n✓ Found {len(philosophical_kernels)} philosophical kernel(s)") + else: + print("\n⚠ No philosophical kernels found (may need OCR dependencies)") + return True except Exception as e: @@ -114,7 +127,7 @@ def test_extraction(): return False if __name__ == "__main__": - print("Meaning Kernel Extraction Pipeline Test") + print("Improved Meaning Kernel Extraction Pipeline Test") print("=" * 50) success = test_extraction()