From f2dda1eef257b961c4688016c07162498a9beea1 Mon Sep 17 00:00:00 2001 From: Anjan Biswas Date: Sun, 10 Sep 2023 15:34:20 -0700 Subject: [PATCH 1/4] Added paragraphs and tables for Azure Doc Intelligence --- libs/langchain/colored-1.4.4-py3-none-any.whl | Bin 0 -> 14249 bytes .../langchain/document_loaders/parsers/pdf.py | 53 +++++++++++++++++- 2 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 libs/langchain/colored-1.4.4-py3-none-any.whl diff --git a/libs/langchain/colored-1.4.4-py3-none-any.whl b/libs/langchain/colored-1.4.4-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..86141567069deca0d55dda7256d22c9659fbdc80 GIT binary patch literal 14249 zcmaKT1y~$Qvo`MT?(PuW-Q6X)ySqCChedFMeB?W&&H+NtVkWjSyN3=j|y7!b=qYYpWUj_n-a(Fah_fnw@t=jdW?#%yS4?O^R@ zXvpN`tsl1u7s`SldG&&UYw-AOfyaXt$~P^)Wpm7eoC8zb??IzA7-n>Gbo4#CF;1x;6Jxngc&GoZ4e_onN?O<-Y7vs5& zVxzw>ouKkRtf2=S5tRb1m<0;*zgc5qY-$U%gffm5IfMlx?BV^3B@Ub<1yxoe8nwH- zvv{Pco_#T+HT*2fdsoSZ$HS$6di9sB)xxCC9iO1f<88?zh#PmL3I37*)So>+#(6|w zU~Mt%GOgp>Ni;00p%pMacSm$j3D^PtFrp}K9XUHNkMEltcfHj}mhSbuW&ynEKi9=t= z)RP;F?r*jo8{iX3(wjLv=U?11WA~k~V>Y2U&m)T3Fz* z8@%6@aH$ys7!17E{oZfng?PB5mhe-La>vdcziJ=2rq&`O9aZT1zLmW0;m)pCI8g0B!K?nn?RYOD zeyF=-9#B2bGkiTcR#b4im5fz>NLEu;heeCOKt?w2H+X*F8Uv^t8Dy-=v4&mPyN3Lk$K(xUx?0$Z1} zcHDw5m@sDn(z#jtXn+ipk5RBDMl7io?nKAc6n)eCt2KJjv=M1In|dCoI3A}--hj9Z zwNu`J1RZBUUcr5PtV`pRF!-|P0Ejt4n>hcu?Ds5Z`XK~vMp(%M)^BlPY7_7gxN!ae z*5(PL)oc^1a8-v8JBa}3?<;SKpqle!LO6CDHMlmc5Mr22X0p8G>c+yIbYkkpMkdk& ztZ=x1RVv6<#Lt8gR=o;=>yDNUvLQ1+B$ zMMFu8ruI!f3v4x?Xvh3;eo@p6h3+Ka^lDJTMSOGn4{nbp9} zeL`6R*Mr5u0&K^e+|eRL0Z@$qAB&sy)xI;DQ6?5QQ8Sh#7B^NhmYlhVBv1E`Oy|Wkwm*0XkB7kdY0$#~?a;IvZ134tnm_@a%}AFcCgAYqswfD%KH} z=D3&zo@1~rSXg9UW5T)7%120V**Ca`HY6iu2>Z#2DpM!T2y4`iPnSPnqxj8d_CVSJ zeq*zYgOo}*?R#xC+VB~-OUtbXHT5gF(+1RJZV$2GdXnOweMS-)Hzj^fbJvH1?Rtz9 zFmn7rj;Ka&WF6!J$)Mw2|1CN1HQ!w{2avoua{odT=V!cA2-&1nY{u(2rB@Xyki(Ti zKbsDFnjLKW^LrF-bK1_#ylVt~%@ziNjgw#$se<$fNDUi_`5Di*rc#OA>rIW(kT4vv z)Yhwr_E%7`-_EoWinY(7rEpf*u~HQFt6S?EiF~T2R+^%uIat!;PSSC^Hbt4q-8?!P z1On=#{lv{0in`FHu(T~0$}MqaLRPdTHhM~VTI!3F+0*cJ>DaNTy*x&&cnZBTPSUeK z`P=$4QM?;8%0S0p4ZJkk3I&;avQgq>;m?ttp1R>&-G5*k#Pa-_mw)h9IbtI*UQo_h zG@ zssji^J$H3xwspf9k~9A?r)v!IwO@l*PqF}4(I8EuSvr!qQQgYbiq~%HDEy09X7SjH z4gH*Wx90HBj)ra}+mA~P1~#R%G=I`-sewsedzX)X2Ahju1o%A`NY1~l zAiP?%=>6q05CRV*Ph*ySDY>$OkM44dOsbu*8k(&zxP!GJ$&3sxF!|*6B|bT7E3$90 zgy}1?^CD~}qsKvgYnjO(I5N-u7TVok#a3BhVX(8REWMF77y&w}X0Xe%ZfqKrC&fK@ zG}(yC#xm{-lb7x`@+>grE8}^v!K+jg8ti5X|77o;)m(`%md#Z|i9$sQYwZB8% zLba(Rw#o`kwmYMW71Mhprn5x$fC&)p$jBk>?QneCS#eWF1~|!o62|7RTTlXw za(?bLt*0nyZdeSewZFlC>P0*ylLYEiY=2%DPsFdRT(1RxE#Tr(?ct z(|H&ZIB+N#+UL)r?r>lqi7dHsU)k!k{F7BXdLa(5D|^f;IhNp|k!7Z07a6{6`eW}o z1A2J!nCEwCf|fB2Kc(?JCnmyIhp#bUKg0o?edTx4GGk6+Co|lO-3*9vT8wpg^34i$ zJ;JgEzmmTcA83ftmXv&F5BH{(m_twIH07YG(}gQ%?SZrR`vy7p30UK9(rq(xI^?w& zyIAq761Ph2=8KjD1!bKVtGFKY&6i~niV_tohCpH2qJDZ}K(3mD`dCigp#V>E4WMn=46lf;U#2xy15lpChTi7wnOFaldrAEpp||TpWQOiWzFg;Re}6IO_FTpz_p5S4&s zCcIB>kK%iX)A#dlx~~?}$kCUs-k+}1lf91VQe)Hy$~joBBGu6koc#96XLANWvi&bo za5od&_HlFkx!!9{G`EErHZSjr7-F$h4BF!9EQUz>Ec2C<*3S|pnNw&nilG2D$bIbI zakT+w8*;DmG`u_5lV;VMObCMh&W#q^oj3QVWd=Y8_RL%U3x;6Y z>znBT35}gr&O+Yk8v)sVkKC+r!kf=afww33mx$~)IWND*eP5%37DzBMP^`)JXX`&g zCq(Ww`_)#>J$#w6!;_Zdn6y+5_akS7qPtb9CD>?kl&g+lm^6|rw0oCb+pH9N-Q$)8 zK3V8*%G9A@PYo~T9rgyuemj1%P5A9awesR0I?R$6b2HP$OtoJ`mHJ(sG6p+qv9#Zl zPsuAQg?v&<)>|2WA?tW-ycIv2JgYt+`!N*MW3!G{5x|v7B2&VZV$H;^Gg$WnzNsX; zhToHaJDQ<=z`62<$$52dq4Q%38B*zdzYi7OK%F zN1_-=UJ|s6baxpln=bA9k1BZ@>~s^){p&(X`xRw|HZ^U=WvQl;C@m?dk#kC>{74lx zEybpUdM2A7b1?cuStn306zi~~#q<=*8uW5PHAVwZkQ1VUR$a~L|1 z@+ywTMir$5#vOr_Tl>$#GF^C^6RgL0I15?fDy8cukcvKfQKFOj{z@VNB`QA|k|T4Y zOp!HZGqVH$lPnV{)mK#R-y0TZ>FdXyp^9uuLz(1L9cbS_TR+1yk%iiPbuq(_$i9jp zaDR=eGG0KJ_!e{4Ox`zyo8P2|d?V@T>V^ldIoQ>~q0ABQ-;2RhEhm@Bee^^4(!IbA zjTXf>&00?-*IQa#K;WBk2D(>1ABHi`31hluXM0DVhkQ~O=8qaNr9J(D$p%kU%P}fQ zT&YXYkEHzH{!gEUf*H`Z%x+vu&Tja4-Vn2w%8(E5&z^amKf&MY)W>$|Aa17hXh7Zc z6QOxrP`l_t-fNP@i%0BR2-b!w$arV0G{a8}apuIV=RiDb{#KuwUkqKj6%ffRA^Z@fBodBgtt{~Q70?DJ z#v>89N0dth*X-Zb(`2;I9Va9BzJGz28p}?(bK9EPt;uaL;K=YkPH^l!`KT-`mfL*j zeJ^-f5~>-fBShC?!RIeEG3AJ%`7I``xfz}*#EF$boDS~9JF13>e|+qf!651{eQ0jN zbgPd#d`QyX=PzfdhxCeMaiLpGpIxlh5tS@HjaY(xTbx1n%v&79Mmh;co{|@vWIPFD zbT5C4hI?~FFBRoO zM4*{W@A4!OYsx*?^a5u=d@-a@+5Gt`+JP=B6sI2k2{tP&7&4NkunuF=O^k>bn8-c? z-;vV=1aOQ2CgCyoqbBsS7>Jxf4erQp?a~x)eDP?p`0YhnL9|SUjm2PUM_9Iyf^L2F zOoRN7T>lK?GsowMWgtL60)a^u)IY;`S6~$HuchF)&VtZ$tY!AIpufJbC5Qbah|o+g zriWXTjt?=`nB{)fUVs6=)uaFY&2BtdTGl~nw`o#JHG$0ce(&aF`f6MWxorGpe*38| zFkw3|*TV8PCLQ~BTBwgdySFD-NFbm``{9QjC!hbT^M~cT#d`qf)%(tC{ zq7YZ|ttvhuGo15+=4XQT_a0MxgC+T|GT$boBzi|rJ;-&6k^rNOU<>p?8iC;Lm|^6~ zII&`+6nKz>8Sy#>u8Ah|2G&((o`^0y;_$)mp7^YGBR>$ND@hUnxBhzem{MIv=wAH6 z)KJ(b5pv;@f-K@vD@@Hg7v=^Ix*MbXmsM~VFX2lu8!NPhS3E*fgi z_n+@Xh2%#~g*u3llA=VcUtkncHyV>%paYHr6mM_h%btvv2P0{+P;|7-^Z zX>O#awwSP}ZA34xIb5R%15RS!ArTD@tMK9;yDBi?s?A z`oM33yNdDqjKeq#bP4=-eCCoXa~g-q=yBRFuTvTsgjPK_(UzW?Gtiu#$4HU9zEo(A z;5!c@*z#};E?~blIYFmFX^29Ss2S*~%AdUGD8!b=b7J)Nmf5@Jbx%_r+29q<`}^B2`kU?Cq+lJS+j z70>mGA$^nPr0mw`E5Mmn-d8hbpeB}Au&EC020`xuoye_D>y4qnAg!G5!$;z{Y*Ku`~)D?E>@CX-kZwKkt5xDzzRp`>>?xz&pO>+7IE*m~ubr)k*M0 zr4jX)e0|N%53S1XmBOm-b;{b>#JF|a@NLRGrUviO2^3$rhN`G~6D?Ga7MKR_xCu&+ z77F}pyac23Q@k!+V=ek~n{fS_hAIw4<2beU=qc#4Z|6e)^gEWX1HC0cfAeQX)B>30 z{a-SoBN-@BDV?3%g<=C(UuV*q!>wRGHl;c)E;nD8*33eibQs-7?qJ@g17oRBi`Oym zZqqs$jjqWi)5B26s#WS8CLkq3*(H#i-NfOe%?X&{Uork$jYol zslWf?9=&*4kL;t&2cw*b$F1v-dF)I=E%B2PFVo#wHT(!3H$<_WRq2sMDJqBd>5qiy zuN5?QQD_hlO*{|~q<{0IR_0zngW@vv9rro$!mr-w+kZ$TSENK0)5uEBOIXefVD)5! zG1k+7ttOi|zP7Yxj~dN>7^oH^A@jL591VLBsqi1q99*X!of0Fb-<}wEtO_;1XgR;m z)giyErndWfQfo$9GzA(&$!%|@=1y2BIr{sq3Dh91e>-6y8vd{t^sj9?3vcfoip+Tp z1SxH3e3?Ltd}@IQwZMHYxa8diuvPG#%-_vAqhDw$@nLZmCo_Iftj}2Wmpixs1%adj zJE+(JRHg?Zak;w!$dCjoc_#`6l)v^sCKK5q+g_HSVgr!mPDqM7w8Xzub0`zYM}$M^LdKn}Hu0SYZptbuX}dXrW6w{8#=cW{YQ?RH;=Vhk+M0E!**!E>zX)-zep zNBM|%9Qo(Qa3gYt7FYlQK>m3F7$q_rI3==bNHR2!`92~^2^6pbni6>~s4)@@AfT;$ z&tfh&o&yKqFGm3I)4<9S0e}$M(Bw#E`2}l2tjcI!9#AtH>eB<5#Dd)NluyDDA%%)a zpty8jcxz@vWmKrY2O)4s24GypZUsA}ZUT-A01YU>F#-Ua0sgWH00j`h$Uh$lX^cbz2&m^D6l670 zz7!?{3#$3AQI#U2VyQs$#(<;Dy1t#eE~YV8ax6h3*ZI4Jg?E~KIQAErSkzCh1xke@z2e>N(f^|VCN2qNvyPm;JD*;)TWZdETvqDa zM!2$e#BM4$PJKM7aeRp5u0t*h(C>2866*aRxeRa$e1Cj@i$r#9*z9gLX~HHJ^vAng zbu(GN{KmcaOXVOY_+=VoxG1If-Qn@w@##AsW^d)AMps=*sG*Xfcn|i4UeD?9y^~i5 z!EvFG|LS?3L?DX(+iAoGkzgBjL*IGY#R-8!xAj%tl|-9!sreB>?n|q~PyxC(DX-_W zMw8oY2Aap+rKX^{ao?YI);8;+#nh5X?*asyd5@Mf+`-X zd8NO!3bq^@oSz1?V5K)HX~*1Jcy6>hDB60Syl%%_?LR*}7>8}^mU*pm1}Kie>f9Aw z+?gL!U9RM6UNmeI+?#Y%Oq)DO_&IP*pKT!$M>zfV^A7M>v)``^?7Mu~_x^S5ba}Qo zzS|<;?htaN6}c()wlfBIMg%l$T<)rjF}6X-&x7ct>2Z4a>GkkfDKE^BscWq!e*6~= zQhLft2f(tSW|Mtl6Y(QEpnST<3Tf}Pk>^XmW@jRGx?7WB%hUsKq0D0isLMJI_ zU5!^wTlzhg8qep`HMO%V0!#0V*AB1Tm=Zk?uCM-q=u4ZTAFSAo?Kv?pcY}V2lY{Xr zFf~l$OK~kVp5vuhj_|*zwxZj9>0&dtWBxjLo$R4(z`4Xb7}$*3n=(Y)?;Q2M>B+_8 z`Y|Km`@-!vg2`JG|9!U)%tE_`*`em0I0i$^lj|&T%YOPQqA$K9udFXVI=!vC|J%iO z%A{L#p!r9IOs&)lbJ0URHCWyoc`*Z+3j!Xn8tv@G#b_ zWbuq1ZdMBG(X`s~ogR88Iza}O=6=ci%s z;40}`6Qss5lB73r$7Hs#g~L?W=K~ZhuWgSWV(#mVrS|O75iNx{Icu6H=)cE{t&vWP znO(%(2@C{;2^i~<|2jnUe{!jV~CModCcO@hhI%S}U7F5xQ+O7AVb zBu@+4n&5((BNBIv2G|eouccgIRPAKkjS59?4>_c(=@_uyMBBE>E8|U{%1HYk>b3)f zZKkNW_jjjP!YtTweyONidvb2!^MQImUHtxqt5};Ea%u82XUWLdDsm*p4Pij^lfX9# zz6AJt9K27NClH9uKO|CAa8SOl3x38gbM6cs@msT$w%#hYKQ(_{zW96DOI<$q2>GWxmE!tEf(xOZ_6|bt zM$q9c_8C+r$FiQb*O~n1qF&30Asp7!LTssce&1J5h*0Qu?KYI;`G-AXW#N$mCN7(4 z3TPEE%5mzX8w}0rkq5en9%Y>lwIK9l%u|pJW z(M1L>{FC)lI0=7MTmoJ54A<;PA*hD}RJ=x+h0?vpC` zddCD(r>SlI;+wMULNs#7iyRn{+W*J8hk|9QjuKRoD+ z0iUByxj{fk{=*HcAfYZIE}||{qxbS(ZyV?bLRN0YUc@lK`b7cO$3O42E7Uq9S8Oi- z=7l4!AIhpr1`yu;>e~jR_(g{EGDr6*mcmc_oXJCof#lJ44#If`qHH^+-S+vHQv~;I&pag= zEDBmbz7Iyo#ajx+(?G{i2APV)QLO_fLv`m(T$Z9~e-8Rk3P%eO1L`J&uR2)=FAq&O zAgZ^c-gpb{E-5HD*N`3L)+!bhjx!qQ>@RpZIi%P*g>4$BS4B{-4M7nsl(Ky>uS^wR zvhPBXjwKAR@ebIcr47*6UXM;OeKgZFoTW^jpU$+F{hQ_v6=W)dI&MytxhRTO+Gd_2 zzZd-cDra*NdR>=NPItOj>dl`V@F+OCzt{U-u6U!@?bj{hm_=AY6xP=vT6dEHfFSCy z!IVE}g=>uUXh*oYAPPy2V~LGG7CYg4QF(N{JrL+zOJdj9S(eBYZ+`2H8TXl!7&yU} zfy;DxI(^-{TO988-j=uGBvRfiH7M&3#mzY22R}}^B~yTnF;d@fdn}KVne;hKeC9|o z>!_-2xr(uwJ0JHE?ASsLDzE*4XoDR3y~->SmA0l(l2b-aPPNhk~@NO}O(2n2J= zkbpSRD;)rJ$yZouMK7cn+P-Q^Y_{m>el!-$oH(W#1x{oqO5qe6s1Ll6!IMx`=IUTs zH!vQV_vVmz<#33!P)-)nSl?Wh-KI|)1bRC^9gXMB9wR?!hgqZEe{0jPJJNamy$N;1 z7A*s6mV%k0_Wpb}vZ>OrzXa(Hpx{aT*9{Cs+Oak2hVcSg*NJe~J( zoNmE2OV#;R%V=58i)s?d9CE^kn8+R^T8UyETu3b8>0tG&N-O57_LobvG$ABqu=2CJ z(hZCWV@MPX?AkI42ugW>H_BKEK-4;2l*IC;+PacrgR5B3G@vlp7mg_2VL-=NIdp2_ zh^=HI8$n>&GDu4ldy;oq+KGaZ2&aO%Z6f|( zdRNM5dAA`oZk-G6oZPHW1C*h1Xt1F2EAS*Y$Z=X+amn8n$rp_1VqT$KHnGHK~Ok~SFe2Hh3Ld1*ln=Pvh$pTLi=D5e>}&9UaW!IiE5z`yMudJ zQ6-YNhm$f2R_+-@O%pCR*oDy}QLl3fBS)#!;tombe`B(CYjYi{{JQ?lf+xgCJ_*_r zT0tfFxO($J z{cwXivaK}1W2jzMa^+;zpo6M4?&rBDUO#^pac>a&g>ZybP4#jB8qiveZfUlZ)h4k5 zgZi@&j^<;BMg`Y@2oJoDT`Dzp6s}H7UiJ;w5XUc5)=P;S zn;M$$H^*&9QTx?ZPB#jZ#O!y)yFA{LZp}B4pwtRNae9Hl6$ks&<>i@jSUifY`nV~$ zz>*wO5j}ysqQO4s;2cVbl3v>0h*KCND0lT> z<&fgDv^H-nuY#JcTglOo62zTEux*gm*|t_AWZKiiD76kN7Z{yawIURuz~@!UQ{AV^ zAr@Q-u<&IkRTI`@SR8+1OF(=zUWKx42(dzTHu3*PP5L?Ydm*U_rAd!zfh-UF;CX#n z8H+bX@Q(A^t#?TUo(}2MZjLqo8lIdotpu!DzFkI?l9(dl`LRnYgK97t>aD8}- z1m;C@`&YhTC$}Nt2tFfTdSz68#Wo4Nlx{i`{RrwtHt8mldhBnUd}8!AFo%`xTseFz zXSpYu-tV#Gf)=Fyq&OLRqa+pWb#u68%cqt5@pjYgFLA z!iI%D`yc)|Nk7Z7YcLy zoDX%vrOgzs<|Ud(nxP0Qt6y+Rh+Xat z!MRNOle!5P*% z@J}fNH9jL9HEi5&)U?M91EwNP)eZ|L>7`~nlI=CO<$C}vka5JcsE?A-SCL4<2wg6m zZIF$W3|GXzQa!J;#g3R#8z29-6Xg=;Q4Hn9!{n`rHam8I8yquyty4*R!q`T8q4BkrtoK)s!I@py6>O1WcW+lBw1pV(mknfRBWqXMWL4-; z+DtNJC%ja9Mvp=r@bAQ%2fVm(l+r#_a1~aJ`=7>Hox0@tkY{ynl)XolGCfoXvAO>1 z`^1N0WJQpUifcg79w7>Oo76bnptFi`ZZmy9qT5k%>irq>6H1?TU8uc7>t3Z!BB!PN z?br9#jKb)4kK2bzlq_|aaJUpL?@b76yqPX;^i?(D&XZeY?P%F@cJ1YNPIg{_&GdD; zC>i**V?yD}Z+!=z<S0TpALH+=_z-p6T78n~#0Tj~*~Ej+ z3hVxhLS4a+;)mu{B6q=@yUkxAvcAYN{dLp>5-tPDv@>qy$K!z zWsj>q6p?;KVmKbEK&xG>QY#q3{z|OBV(NXGrEo5-pmG2PvPCg7Z@>Mv>&FakQ{Op| z%ewMI>@XUdL1~NO3aL8qM=*P|4U?YHNsna9Y&*m5 zYEu+q_F0+t#K#D6?2`&^y=xXK+;xn8?sEi){ks>hrrh96y`99d~4=K^Mt; z;=*R$VLamwd_TGwhWXYU5gNzMVGY_1%UWJNs)q&Gwl6h1Oc1*oNfw5O zJ@1TVc@vIaqrQGO4~Umc>jNFv2GIfLl`=k`%3>7Ur8v)#rvkugg)9u)vfkvo{3MXD zH6%XjFJQ6-I5F$1SLLja2zXuZUHne1xaWiysMdptpV|3NJ`M<+u&a}RU7zcNa4Q`4i& zvI@I@l13Q(@pBAVmLxOjSZmNmbl^cHE^>7zJv_KNr#kB659?py2 zXj*^I+x?cLmly^qTAlsZP33cYrO$%#d@;L_yO3|n-Vs^QrTronTdE7>ck>F#dqt0U z;a*uxXqa2iluyYf2acg%_dy~IX+4iD1*L!QW#$sVSk7GsnDsj>x~2mX7^kUeSF>E> zE$b@1L{Y!NvVru~+=4IKAxkT@$xtI@33|(Bk@GR~&mN0Gzr4sMvniD7#()|J6*_rV zyj~*56TMJcl^>Srcww$jl=obU3o6})Q_G!SIjuW#XV%}Rf6#vpKZb@RlA9oGlpri> z6mTlLuKeZ%-m|14c+Vdx&MuclL&OWtRtWLP=Sq}*?@U;!QPR*AfDor94lbYOg-&I# zKs`2Qb2?WeYwnK$<(oC&S^YAueTnI(B{EGe_u}%nug$x0zH`Q%@KDV3B_=20EZ517 zzW?V>0{2g)G-zFvSx&?@?YB&|wOj~o$(l;`*b$Wl5vPG(D&E%O96|nqoVr%^d-jA3 zmt`(FjU0zQtxR^G(QQNT5up2@pOzY63tTPV;)I2zf;taGHc`gt#=**wx2@hUJueiJ z+nQ0k(W}_tJ{AZfP1p9|9f%!`p=uWDY#rLxzJP+kbtEga6w7FRP6HB=~bF&ws-~K#GH>{*B;& z7WVuJ{&UgH-{1mZ9{XRt|Gk9fPyC;6>->$k0=~TS5By*6?)(Y=bJG29_zvdZ@c)^D z{}cS@{Py4AMy$WV|2^CNC-%>5{ohy;;P3rkdjBhT|L^GkEqDDlTABF&qyJMT`%l6@ zbBliyf|C8;g#XGk{>k>IhyI%lmi7N;`!9d}C)J-`^>3>D|3LK*ANwcepT6~9l+D2x c|AF$qJ*~1FB-CH+5rCTj&^K$c|8?~L05&TMs{jB1 literal 0 HcmV?d00001 diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index 2ec7a684be61d..3b1cb8e7d9d23 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -270,18 +270,65 @@ def __init__(self, client: Any, model: str): self.model = model def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: - for p in result.pages: - content = " ".join([line.content for line in p.lines]) + page_content_dict = dict() + for paragraph in result.paragraphs: + page_number = paragraph.bounding_regions[0].page_number + + if page_number not in page_content_dict: + page_content_dict[page_number] = "" + + page_content_dict[page_number] += paragraph.content + "\n\n" + + for page, content in page_content_dict.items(): d = Document( page_content=content, metadata={ "source": blob.source, - "page": p.page_number, + "page": page, + "type": "TEXT", }, ) yield d + if self.model in ["prebuilt-document", "prebuilt-layout", "prebuilt-invoice"]: + for table_idx, table in enumerate(result.tables): + page_num = table.bounding_regions[0].page_number + headers = list() + rows = dict() + + for cell in table.cells: + if cell.kind == "columnHeader": + headers.append(cell.content) + elif cell.kind == "content": + if cell.row_index not in rows: + rows[cell.row_index] = list() + rows[cell.row_index].append(cell.content) + + if headers: + hd = Document( + page_content=",".join(headers), + metadata={ + "source": blob.source, + "page": page_num, + "type": "TABLE_HEADER", + "table_index": table_idx, + }, + ) + yield hd + + for _, row_cells in sorted(rows.items()): + rd = Document( + page_content=",".join(row_cells), + metadata={ + "source": blob.source, + "page": page_num, + "type": "TABLE_ROW", + "table_index": table_idx, + }, + ) + yield rd + def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Lazily parse the blob.""" From 91e8edce63ddb34bccaa67b6f695567181676f43 Mon Sep 17 00:00:00 2001 From: Anjan Biswas Date: Sun, 10 Sep 2023 15:37:48 -0700 Subject: [PATCH 2/4] Enhance: Added paragraphs and tables for Azure Doc Intelligence --- libs/langchain/langchain/document_loaders/parsers/pdf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index 3b1cb8e7d9d23..56d387bbaa13e 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -263,7 +263,8 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: class DocumentIntelligenceParser(BaseBlobParser): """Loads a PDF with Azure Document Intelligence - (formerly Forms Recognizer) and chunks at character level.""" + (formerly Forms Recognizer). Returns Document + with paragraphs, table headers, and rows.""" def __init__(self, client: Any, model: str): self.client = client From e788e814006065ad18fa6bbd801fc14561e6770c Mon Sep 17 00:00:00 2001 From: Anjan Biswas Date: Mon, 11 Sep 2023 00:57:49 -0700 Subject: [PATCH 3/4] Enhance: Added paragraph support --- .../langchain/document_loaders/parsers/pdf.py | 49 ++++++++++++------- .../langchain/document_loaders/pdf.py | 18 ++++++- 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index 56d387bbaa13e..d972ed9e72fef 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -263,12 +263,13 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: class DocumentIntelligenceParser(BaseBlobParser): """Loads a PDF with Azure Document Intelligence - (formerly Forms Recognizer). Returns Document - with paragraphs, table headers, and rows.""" + (formerly Forms Recognizer). Returns Document with + pages or paragraphs, table headers, and rows.""" - def __init__(self, client: Any, model: str): + def __init__(self, client: Any, model: str, split_mode: str): self.client = client self.model = model + self.split_mode = split_mode def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: page_content_dict = dict() @@ -276,21 +277,33 @@ def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: for paragraph in result.paragraphs: page_number = paragraph.bounding_regions[0].page_number - if page_number not in page_content_dict: - page_content_dict[page_number] = "" - - page_content_dict[page_number] += paragraph.content + "\n\n" - - for page, content in page_content_dict.items(): - d = Document( - page_content=content, - metadata={ - "source": blob.source, - "page": page, - "type": "TEXT", - }, - ) - yield d + if self.split_mode == "page": + if page_number not in page_content_dict: + page_content_dict[page_number] = str() + + page_content_dict[page_number] += paragraph.content + "\n\n" + elif self.split_mode == "paragraph": + d = Document( + page_content=paragraph.content, + metadata={ + "source": blob.source, + "page": page_number, + "type": "PARAGRAPH", + }, + ) + yield d + + if self.split_mode == "page": + for page, content in page_content_dict.items(): + d = Document( + page_content=content.strip(), + metadata={ + "source": blob.source, + "page": page, + "type": "PAGE", + }, + ) + yield d if self.model in ["prebuilt-document", "prebuilt-layout", "prebuilt-invoice"]: for table_idx, table in enumerate(result.tables): diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index 801a426a76b54..452257f0683eb 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -608,7 +608,11 @@ class DocumentIntelligenceLoader(BasePDFLoader): """Loads a PDF with Azure Document Intelligence""" def __init__( - self, file_path: str, client: Any, model: str = "prebuilt-document" + self, + file_path: str, + client: Any, + model: str = "prebuilt-document", + split_mode: str = "page", ) -> None: """ Initialize the object for file processing with Azure Document Intelligence @@ -627,6 +631,8 @@ def __init__( A DocumentAnalysisClient to perform the analysis of the blob model : str The model name or ID to be used for form recognition in Azure. + split_mode : str + Whether to split by `paragraph` or `page`. Defaults to `page`. Examples: --------- @@ -634,11 +640,19 @@ def __init__( ... file_path="path/to/file", ... client=client, ... model="prebuilt-document" + ... split_mode="page | paragraph" ... ) """ - self.parser = DocumentIntelligenceParser(client=client, model=model) super().__init__(file_path) + if split_mode not in ["page", "paragraph"]: + raise ValueError( + f"Invalid split option {split_mode}, " + "valid values are `page` or `paragraph`." + ) + self.parser = DocumentIntelligenceParser( + client=client, model=model, split_mode=split_mode + ) def load(self) -> List[Document]: """Load given path as pages.""" From c97141309583fc70d6447b9fd216e4b48f09722e Mon Sep 17 00:00:00 2001 From: Anjan Biswas Date: Mon, 11 Sep 2023 02:43:37 -0700 Subject: [PATCH 4/4] Enhance: Added safe CSV for TABLES --- .../langchain/document_loaders/parsers/pdf.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index d972ed9e72fef..45a9f2f370d8c 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -306,10 +306,13 @@ def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: yield d if self.model in ["prebuilt-document", "prebuilt-layout", "prebuilt-invoice"]: + import csv # noqa: F401 + from io import StringIO # noqa: F401 + for table_idx, table in enumerate(result.tables): page_num = table.bounding_regions[0].page_number - headers = list() - rows = dict() + headers: list[str] = list() + rows: dict[int, list[str]] = dict() for cell in table.cells: if cell.kind == "columnHeader": @@ -320,8 +323,11 @@ def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: rows[cell.row_index].append(cell.content) if headers: + h_op = StringIO() + csv.writer(h_op, quoting=csv.QUOTE_MINIMAL).writerow(headers) + header_string = h_op.getvalue().strip() hd = Document( - page_content=",".join(headers), + page_content=header_string, metadata={ "source": blob.source, "page": page_num, @@ -332,8 +338,11 @@ def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: yield hd for _, row_cells in sorted(rows.items()): + r_op = StringIO() + csv.writer(r_op, quoting=csv.QUOTE_MINIMAL).writerow(row_cells) + row_string = r_op.getvalue().strip() rd = Document( - page_content=",".join(row_cells), + page_content=row_string, metadata={ "source": blob.source, "page": page_num,