From ec26f9f754afc6d37d16378e9d5aa9efa4de7c9b Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Thu, 11 Apr 2024 12:41:38 +0000 Subject: [PATCH 01/34] Init commit for tutorial notebook Signed-off-by: Nicole Luo --- .../config/heuristic_filter_non-en.yaml | 83 + .../single_node_tutorial/image/jaccard.png | Bin 0 -> 14952 bytes .../image/zeroshot_ablations.png | Bin 0 -> 84269 bytes .../single_gpu_tutorial.ipynb | 3425 +++++++++++++++++ 4 files changed, 3508 insertions(+) create mode 100755 tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml create mode 100755 tutorials/single_node_tutorial/image/jaccard.png create mode 100755 tutorials/single_node_tutorial/image/zeroshot_ablations.png create mode 100755 tutorials/single_node_tutorial/single_gpu_tutorial.ipynb diff --git a/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml b/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml new file mode 100755 index 000000000..50d435e2e --- /dev/null +++ b/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml @@ -0,0 +1,83 @@ +input_field: text +filters: + # The filters below define a chain of heuristic filters to be applied to each document in a corpus. + # This particular cascade of filters is intended to filter generic non-English data that use spaces for separating words. + # The filter listed at the top will be applied first, and the following filters will be applied in + # the order they appear in this file. Each filter can be removed and re-ordered as desired. + - name: nemo_curator.filters.heuristic_filter.SymbolsToWordsFilter + log_score: True + params: + max_symbol_to_word_ratio: 0.1 + + - name: nemo_curator.filters.heuristic_filter.NumbersFilter + log_score: True + params: + max_number_to_text_ratio: 0.15 + - name: nemo_curator.filters.heuristic_filter.UrlsFilter + log_score: True + params: + max_url_to_text_ratio: 0.2 + - name: nemo_curator.filters.heuristic_filter.WhiteSpaceFilter + log_score: True + params: + max_white_space_ratio: 0.25 + - name: nemo_curator.filters.heuristic_filter.ParenthesesFilter + log_score: True + params: + max_parentheses_ratio: 0.1 + - name: nemo_curator.filters.heuristic_filter.BoilerPlateStringFilter + log_score: True + params: + remove_if_at_top_or_bottom: True + max_boilerplate_string_ratio: 0.4 + - name: nemo_curator.filters.heuristic_filter.RepeatedLinesFilter + log_score: True + params: + max_repeated_line_fraction: 0.7 + - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsFilter + log_score: True + params: + max_repeated_paragraphs_ratio: 0.7 + - name: nemo_curator.filters.heuristic_filter.RepeatedLinesByCharFilter + params: + max_repeated_lines_char_ratio: 0.8 + - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsByCharFilter + log_score: True + params: + max_repeated_paragraphs_char_ratio: 0.8 + - name: nemo_curator.filters.heuristic_filter.WordCountFilter + log_score: True + params: + min_words: 50 + max_words: 100000 + # NOTE: This filter tends to remove many documents and will need to + # be tuned per language +# - name: nemo_curator.filters.heuristic_filter.PunctuationFilter +# params: +# max_num_sentences_without_endmark_ratio: 0.85 +# - name: nemo_curator.filters.heuristic_filter.MeanWordLengthFilter +# params: +# max_mean_word_length: 10 +# min_mean_word_length: 3 +# - name: nemo_curator.filters.heuristic_filter.LongWordFilter +# params: +# max_word_length: 1000 +# - name: nemo_curator.filters.heuristic_filter.EllipsisFilter +# params: +# 
+#      max_num_lines_ending_with_ellipsis_ratio: 0.3
+  # Top N-Gram filters for N-grams 2, 3, and 4
+  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
+    log_score: True
+    params:
+      n: 2
+      max_repeating_ngram_ratio: 0.2
+  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
+    log_score: True
+    params:
+      n: 3
+      max_repeating_ngram_ratio: 0.18
+  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
+    log_score: True
+    params:
+      n: 4
+      max_repeating_ngram_ratio: 0.16
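For orientation, the YAML above only declares the filter cascade; the tutorial notebook added by this commit is what actually runs it. Below is a minimal sketch of how a config like config/heuristic_filter_non-en.yaml is typically consumed with NeMo Curator. The import paths (DocumentDataset, build_filter_pipeline), the call signatures, and the input/output directory names are assumptions based on NeMo Curator conventions, not anything this patch defines.

# Hedged sketch: applying the heuristic filter cascade declared in
# config/heuristic_filter_non-en.yaml. Assumed API: nemo_curator.datasets.DocumentDataset
# and nemo_curator.utils.config_utils.build_filter_pipeline; directory names are hypothetical.
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.config_utils import build_filter_pipeline

# JSONL documents whose records carry a "text" field, matching
# `input_field: text` at the top of the YAML config.
dataset = DocumentDataset.read_json("non_en_jsonl_dir/", add_filename=True)

# Build the filters in the order they appear in the YAML file and chain them,
# so the filter listed at the top of the config is applied first.
filter_pipeline = build_filter_pipeline("config/heuristic_filter_non-en.yaml")

# Run the cascade and keep only the documents that pass every filter.
high_quality = filter_pipeline(dataset)
high_quality.to_json("non_en_jsonl_filtered/", write_to_filename=True)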
diff --git a/tutorials/single_node_tutorial/image/jaccard.png b/tutorials/single_node_tutorial/image/jaccard.png
new file mode 100755
index 0000000000000000000000000000000000000000..bc281639b0f96c94c66eeb09067782fda6019d79
GIT binary patch
literal 14952
[binary PNG data for image/jaccard.png omitted]

diff --git a/tutorials/single_node_tutorial/image/zeroshot_ablations.png b/tutorials/single_node_tutorial/image/zeroshot_ablations.png
new file mode 100755
index 0000000000000000000000000000000000000000..10be349440829113b3b7f5be11c66a595aa1e359
GIT binary patch
literal 84269
[binary PNG data for image/zeroshot_ablations.png omitted]
zkNAw1wEipqNIoT~s*DH4LjqXb^UuIpI4xo0HG~x);R$hZX;gnJR^9v2t@Wcj^)&sL zP}HNx!edEp-`Xx*xDXP6(G-@@4;11eCV7_Y?zDc73$L&I$Aa3sx2`8TD8X+ML_>~< z(8J(l^6ydtWTowe2@)yp4hAw|zWS|yUD*s|mBzVV|0j{A+tf$@z?c8L2>$v1ciK$r j1LGh6cZ{H#7RmYesncK2G`b>IO6ls%*S<7s`Og0V{53x@ literal 0 HcmV?d00001 diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb new file mode 100755 index 000000000..f0fada829 --- /dev/null +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -0,0 +1,3425 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1c1a4119", + "metadata": {}, + "source": [ + "# Nemo Curator pipeline example\n", + "\n", + "## NeMo Curator introduction\n", + "The NeMo Curator is a Python library that consists of a collection of scalable data-mining modules for curating natural language processing (NLP) data for training large language models (LLMs). The modules within the NeMo Data Curator enable NLP researchers to mine high-quality text at scale from massive uncurated web corpora. \n", + "\n", + "NeMo Curator includes the following modules to perform data curation:\n", + "- Data download and Extraction\n", + "- Language identification and separation\n", + "- Text reformatting and cleaning\n", + "- Quality filtering\n", + "- Document-level deduplication\n", + "- Multilingual downstream-task decontamination\n", + "- Distributed Data Classification\n", + "\n", + "NeMo Curator team has perform ablation experiments using Common Crawl dataset to train a 357M GPT-style model to assess the effect of different curation stage on model performance. \n", + "\n", + "![alt text](./image/zeroshot_ablations.png)\n", + "\n", + "For the latest NeMo Data Curator user guide, please refer to https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/index.html " + ] + }, + { + "cell_type": "markdown", + "id": "be41377f", + "metadata": {}, + "source": [ + "## About this notebook\n", + "\n", + "\n", + "This notebook will use **Thai Wikipedia dataset** as example to demonstrate a typical data curation pipeline using NeMo Curator. After running through this script, user will be able to know how to use NDC to download wikipedia data, perform language separation using fasttext, perform GPU based exact deduplication and fuzzy deduplication and use CPU based heuristic filtering. \n", + "\n", + "Step description:\n", + "1. Download and extract data\n", + "2. Language detection and separation\n", + "3. GPU based deduplication\n", + " 1. Exact deduplication\n", + " 2. Fuzzy deduplication\n", + "4. Heuristic filtering\n", + "\n", + "What is not included:\n", + "1. Customized downloading\n", + "2. Classifier filtering\n", + "3. Downstream-task deduplication\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "8860c239", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "### System Requirements\n", + "Here is the hardware setting for this notebook\n", + "\n", + "**GPU**: NVIDIA A10 24G. 
\n", + "\n", + "**CUDA & Nvidia Drivers**: CUDA 12.2 with Driver 535.154.05\n", + "\n", + "**OS**: ubuntu 22.04\n", + "\n", + "### Getting NeMo FrameWork Training Container\n", + "- Get access to the container via https://developer.nvidia.com/nemo-framework\n", + "- Set your docker credentials \n", + " ```bash\n", + " docker login nvcr.io\n", + "\n", + " Username: $oauthtoken\n", + " Password: \n", + "- Get NeMo NeMo FrameWork Training Container\n", + " ```bash\n", + " docker pull nvcr.io/ea-bignlp/ga-participants/nemofw-training:24.01\n" + ] + }, + { + "cell_type": "markdown", + "id": "ff6bff1b", + "metadata": {}, + "source": [ + "## 0. Env Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24dce020", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", + "Collecting jsonlines\n", + " Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)\n", + "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines) (23.2.0)\n", + "Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", + "Installing collected packages: jsonlines\n", + "Successfully installed jsonlines-4.0.0\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install jsonlines" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6831f331", + "metadata": {}, + "outputs": [], + "source": [ + "import argparse\n", + "\n", + "from nemo_curator.utils.distributed_utils import get_client,get_num_workers\n", + "from nemo_curator.utils.script_utils import add_distributed_args\n", + "from nemo_curator.utils.file_utils import get_all_files_paths_under, separate_by_metadata\n", + "from nemo_curator.utils.distributed_utils import read_data, write_to_disk\n", + "from nemo_curator.gpu_deduplication.utils import (create_logger, parse_nc_args, performance_report_if, enable_spilling)\n", + "from nemo_curator.datasets import DocumentDataset\n", + "\n", + "import os\n", + "import sys\n", + "import pandas as pd\n", + "import time\n", + "import cudf\n", + "import dask_cudf\n", + "import numpy as np\n", + "from dask.distributed import Client, LocalCluster\n", + "import jsonlines" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e28739b3", + "metadata": {}, + "outputs": [], + "source": [ + "def pre_imports():\n", + " import cudf \n", + "\n", + "def load_dataset(input_data_dir, file_type='jsonl'):\n", + " files = list(get_all_files_paths_under(input_data_dir))\n", + " raw_data = read_data(files, file_type=file_type, backend=\"pandas\", add_filename=True)\n", + " dataset = DocumentDataset(raw_data)\n", + "\n", + " return dataset\n", + "\n", + "def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)):\n", + " return 
add_distributed_args(parser)\n", + "\n", + "def check_jsonl_file(file_dir):\n", + " for file in os.listdir(file_dir):\n", + " if 'jsonl' not in file:\n", + " continue\n", + " with open(os.path.join(file_dir,file), 'r', encoding='utf-8') as f:\n", + " first_line = f.readline()\n", + " print(first_line)\n", + " break\n", + "\n", + "def extract_lines_with_id(file_path,target_list):\n", + " with jsonlines.open(file_path) as reader:\n", + " for obj in reader:\n", + " if obj.get('id') in target_list:\n", + " yield obj" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d279329f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/nluo_data/NeMo-Curator/tutorials/single_node_tutorial\n" + ] + } + ], + "source": [ + "cur_dir = os.getcwd()\n", + "print(cur_dir)\n", + "data_dir = f\"{cur_dir}/workspace/\"" + ] + }, + { + "cell_type": "markdown", + "id": "f3f452a3", + "metadata": {}, + "source": [ + "## 1. Download\n", + "In this example, Thai wikipedia data will be downloaded.\n", + "\n", + "Here is what happens when function `download_wikipedia()` is called:\n", + "1. Run `get_wikipedia_urls()` to obtain a list of urls to download .bz2 files for Thai wikipedia data. In this module, we use the base link and the language from user input to formulate a repo links for downloadable wikipedia .bz2 dump files. The formulated link will be `https://dumps.wikimedia.org/wiki`. All the links will be stored in a .txt file. Argument for this function includes:\n", + " - `dump_dates`: A date in the string format of 'YYYYMMDD'. It determines which wikipedia snapshot will be downloaded. If not specified, the `latest` snapshot will be downloaded\n", + " - `language`: language code of the desired language in lower case. Default value is `en`\n", + "\n", + "2. \n", + " Run `download_and_extract()` to download and extract contents based on the url list obtained from `get_wikipedia_urls`. User will need to define `downloader`, `extractor` and `iterator` for the dataset. \n", + " In this case, `WikipediaDownloader`,`WikipediaIterator` and `WikipediaExtractor` are used.\n", + " - `WikipediaDownloader`: Downloads wikipedia dumps file to local folder.\n", + " - `WikipediaIterator`: Extracts the .bz2 files and useful content from the base html content.\n", + " - `WikipediaExtractor`: Performs further task specific html content cleaning such as removing media files, removing references/tables etc. and finally yield pure text data which will be store in .jsonl format. \n", + " Please refer to `./NeMo-Curator/nemo_curator/download/wikipedia.py` for detail implementation.\n", + " \n", + " Argument for this function includes:\n", + " - `output_path`: Output path for downloaded and extracted dataset\n", + " - `output_type`: Type of output file. Default is .jsonl. User might choose other types such as parquet. In this example, .jsonl will be used\n", + " - `language`: See above\n", + " - `dump_date`: See above\n", + " - `raw_download_dir`: Output path for intermediate downloaded .bz2 file. If not specified, will be downloaded to `output_path`\n", + " - `keep_raw_download`: Whether to keep downloaded .bz2 files after extraction. Default is not to keep.\n", + " - `force_download`: Whether to restart downloading process if the target .bz2 files are detected under the `raw_download_dir` \n", + " - `url_limit`: Number of .bz2 files to be downloaded.\n", + "\n", + "The resultant .jsonl for Thai wikipedia will contain the following keys:\n", + "1. text\n", + "2. 
title\n", + "3. id\n", + "4. url\n", + "5. language\n", + "6. source_id\n", + "7. file_name" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1773cda2", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator.download import download_wikipedia" + ] + }, + { + "cell_type": "markdown", + "id": "d711a8f8", + "metadata": {}, + "source": [ + " Start a CPU based Dask cluster. Please modify `n_workers` and `memory_limit` according to your hardware specification. To process TH wikipedia data, it's advised to have `memory_limit` greater than 12GB" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "56ec66e0", + "metadata": {}, + "outputs": [], + "source": [ + "cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", + "client = Client(cluster)" + ] + }, + { + "cell_type": "markdown", + "id": "f794b51c", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a90f3505", + "metadata": {}, + "outputs": [], + "source": [ + "#Output\n", + "download_base_directory= os.path.join(data_dir,\"wiki_downloads\")\n", + "download_output_directory = os.path.join(download_base_directory,\"data\")\n", + "\n", + "#Relevant parameter\n", + "dump_date = \"20240201\"\n", + "language = 'th'\n", + "url_limit = 1" + ] + }, + { + "cell_type": "markdown", + "id": "5628356b", + "metadata": {}, + "source": [ + "Download TH wikipedia data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b591b9f2", + "metadata": {}, + "outputs": [], + "source": [ + "res = download_wikipedia(download_output_directory,\n", + " language=language, \n", + " dump_date=dump_date,\n", + " url_limit=url_limit).df.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "2aae29dd", + "metadata": {}, + "source": [ + "Verify result" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "169fadb9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "downloads thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n", + "162164 /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/wiki_downloads/data/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n" + ] + } + ], + "source": [ + "! ls {download_output_directory}\n", + "! 
wc -l {download_output_directory}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f2bcb168", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"id\":\"1\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\",\"language\":\"th\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\"}\n", + "\n" + ] + } + ], + "source": [ + "check_jsonl_file(download_output_directory)" + ] + }, + { + "cell_type": "markdown", + "id": "44fa2d13", + "metadata": {}, + "source": [ + "**[Optional]**Close the Dask cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "590c489c", + "metadata": {}, + "outputs": [], + "source": [ + "# client.cluster.close()\n", + "# client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "5ba566fc", + "metadata": {}, + "source": [ + "## 2.Language separation and unicode fixing\n", + "\n", + "**Note**: In order to be run on interactive python. Please comment `from.code import *` and the related imports in `./nemo_curator/filters/__init__.py`" + ] + }, + { + "cell_type": "markdown", + "id": "f742b881", + "metadata": {}, + "source": [ + "In this section, we will be using a language classification model by fasttext to separate the TH wikipedia dataset based on the document major languages, and we will also fix the unicode in the documents. Detailed steps are:\n", + "\n", + "1. Download fasttext model for text language detection\n", + "2. Construct a filter which uses the downloaded fasttext model to produce a language label to each document. \n", + "3. Separate each document by the language label. This will create sub-folders for each languages under the output path and the documents under the same language will be output to a .jsonl file in the corresponding sub-folder.\n", + "4. Load .jsonl file in the folder of desirable language. In this example, `TH` folder will be loaded.\n", + "5. Apply `UnicodeReformatter` to the data and output the result in .jsonl format. \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "71a6e4a2", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import ScoreFilter,Modify\n", + "from nemo_curator.filters import FastTextLangId\n", + "from nemo_curator.modifiers import UnicodeReformatter" + ] + }, + { + "cell_type": "markdown", + "id": "4916079c", + "metadata": {}, + "source": [ + "**[Optional]**8Start a cpu based Dask cluster." 
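+    ,
+    "\n",
+    "A minimal sketch for checking whether the client from the download step is still usable before starting a new one (an assumption-laden check: `client` may no longer exist or may already be closed):\n",
+    "\n",
+    "```python\n",
+    "try:\n",
+    "    # get_num_workers() is imported from nemo_curator at the top of this notebook\n",
+    "    print(f'Reusing existing Dask client with {get_num_workers(client)} workers')\n",
+    "except Exception:\n",
+    "    print('No usable client found; start a fresh CPU cluster in the next cell.')\n",
+    "```"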
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "23a63375", + "metadata": {}, + "outputs": [], + "source": [ + "# cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", + "# client = Client(cluster)" + ] + }, + { + "cell_type": "markdown", + "id": "957d7357", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "6270de3f", + "metadata": {}, + "outputs": [], + "source": [ + "# Input path\n", + "multilingual_data_path = download_output_directory\n", + "\n", + "# Output path\n", + "language_base_output_path = os.path.join(data_dir,\"language_sep\")\n", + "language_data_output_path = os.path.join(language_base_output_path,\"data\")\n", + "language_separated_output_path = os.path.join(language_data_output_path,\"language\")\n", + "lang_sep_cleaned_data_output_path = os.path.join(language_data_output_path,\"cleaned\")\n", + "\n", + "# Fasttext model path\n", + "model_path = language_base_output_path\n", + "\n", + "# Define desired language\n", + "target_language = \"TH\"\n", + "\n", + "# Define key in output .jsonl files to store the language information\n", + "language_field = \"language\"" + ] + }, + { + "cell_type": "markdown", + "id": "598cff2d", + "metadata": {}, + "source": [ + "Download fasttext model" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0c7cc007", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-03-22 08:40:55-- https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin\n", + "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.227.74.12, 13.227.74.118, 13.227.74.9, ...\n", + "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.227.74.12|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 131266198 (125M) [application/octet-stream]\n", + "Saving to: ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin’\n", + "\n", + "lid.176.bin 100%[===================>] 125.18M 220MB/s in 0.6s \n", + "\n", + "2024-03-22 08:40:56 (220 MB/s) - ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin’ saved [131266198/131266198]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P {model_path}" + ] + }, + { + "cell_type": "markdown", + "id": "d875771b", + "metadata": {}, + "source": [ + "Apply fasttext model to separate documents by their languages" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c959800c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken for splitting language:147.80864667892456\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "\n", + "# Load dataset \n", + "multilingual_dataset = load_dataset(multilingual_data_path)\n", + "\n", + "#Define Language separation pipeline\n", + "lang_filter = FastTextLangId(os.path.join(model_path,'lid.176.bin'))\n", + "language_id_pipeline = ScoreFilter(lang_filter, score_field=language_field, score_type='object')\n", + "filtered_dataset = language_id_pipeline(multilingual_dataset)\n", + "\n", + "# The language separation pipeline will produce a result looks like ['EN',0.96873], we only want to keep the 'EN' label and drop the detailed classifier score\n", + "filtered_dataset.df[language_field] = filtered_dataset.df[language_field].apply(lambda score: score[1],meta = (language_field, 'object'))\n", + "\n", + "# Split the dataset to corresponding language sub-folders\n", + "language_stats = separate_by_metadata(filtered_dataset.df, language_separated_output_path, metadata_field=language_field).compute()\n", + "\n", + "print(f\"Time taken for splitting language:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "bd54a24a", + "metadata": {}, + "source": [ + "Load `UnicodeReformatter` to reformat any unicode appeared in the desired language dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "0c09bc28", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Writing to disk complete for 1 partitions\n", + "Time taken for fixing unicode:444.5816135406494\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "\n", + "# Read the language specific data and fix the unicode in it\n", + "lang_data_path = os.path.join(language_separated_output_path, target_language)\n", + "lang_data = load_dataset(lang_data_path)\n", + "\n", + "cleaner = Modify(UnicodeReformatter())\n", + "cleaned_data = cleaner(lang_data)\n", + "\n", + "# Write the cleaned_data\n", + "write_to_disk(cleaned_data.df, lang_sep_cleaned_data_output_path, write_to_filename=True, output_type='jsonl')\n", + "\n", + "print(f\"Time taken for fixing unicode:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "00c6e5a1", + "metadata": {}, + "source": [ + "Verify the result. 
We can see that some documents has been removed from TH wikipedia dataset since the number of lines in this output file is less than the original file (no. of lines = 162164)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "b2b34d46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n", + "161748 /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/data/cleaned/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n" + ] + } + ], + "source": [ + "! ls {lang_sep_cleaned_data_output_path}\n", + "! wc -l {lang_sep_cleaned_data_output_path}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl" + ] + }, + { + "cell_type": "markdown", + "id": "39d539a2", + "metadata": {}, + "source": [ + "Furthur verify by loading documents that has been identified as other language, such as 'EN'. We can see from output that the removed document is indeed in English and contains very little or even no Thai." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ace3c5b", + "metadata": {}, + "outputs": [], + "source": [ + "check_jsonl_file(os.path.join(language_separated_output_path,'EN'))" + ] + }, + { + "cell_type": "markdown", + "id": "9b817bf7", + "metadata": {}, + "source": [ + "**[Optional]**Close the Dask cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "id": "bf05b6c2", + "metadata": {}, + "outputs": [], + "source": [ + "# client.cluster.close()\n", + "# client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "cc8b6aef", + "metadata": {}, + "source": [ + "## 3.Add ID\n", + "TH wikipedia data do have `id` field, but the `id` field contains number only. It will be better if we unified the `id` field and transform it to the format of `_`. In this way, when handling multiple dataset, we will able to know which document from which dataset has been removed. This `id` will be useful when we are running deduplication and heuristic filtering. The function we will be using is `AddID()`. Arguments for this function include:\n", + "- `id_field`: fields will be added to input .json file. If the key already exists in the .jsonl, it's value will be replaced.\n", + "- `id_prefix`: prefix used in ID. Default is 'doc-id'\n", + "- `start_index`: starting index in ID. Default is 0" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "fe9e6eef", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import AddId" + ] + }, + { + "cell_type": "markdown", + "id": "232c01a5", + "metadata": {}, + "source": [ + "**[Optional]**If there is no running Dask cluster, start CPU based Dask cluster." 
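+    ,
+    "\n",
+    "With a cluster in place, here is a tiny, self-contained sketch of the ID format that `AddId` (described above) produces. The two-row toy dataframe and the printed values are illustrative assumptions, not taken from this notebook's dataset:\n",
+    "\n",
+    "```python\n",
+    "import pandas as pd\n",
+    "import dask.dataframe as dd\n",
+    "from nemo_curator import AddId\n",
+    "from nemo_curator.datasets import DocumentDataset\n",
+    "\n",
+    "# Two toy documents; any existing 'id' value would be overwritten by AddId\n",
+    "toy = DocumentDataset(dd.from_pandas(pd.DataFrame({'text': ['first doc', 'second doc']}), npartitions=1))\n",
+    "toy_with_id = AddId(id_field='id', id_prefix='TH_wiki', start_index=0)(toy)\n",
+    "print(toy_with_id.df.compute()['id'].tolist())\n",
+    "# IDs follow the prefix-plus-zero-padded-index pattern, e.g. 'TH_wiki-0000000000'\n",
+    "```"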
+ ] + }, + { + "cell_type": "code", + "execution_count": 155, + "id": "f3f483eb", + "metadata": {}, + "outputs": [], + "source": [ + "# cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", + "# client = Client(cluster)" + ] + }, + { + "cell_type": "markdown", + "id": "2be65a51", + "metadata": {}, + "source": [ + "Define relevant parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "054019a5", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "add_id_input_data_dir = lang_sep_cleaned_data_output_path\n", + "\n", + "#Output\n", + "added_id_output_path = os.path.join(data_dir,\"add_id/cleaned\")\n", + "\n", + "#Format of output ID will be _, Define prefix here\n", + "add_ID_id_prefix=\"TH_wiki\"" + ] + }, + { + "cell_type": "markdown", + "id": "80f9591c", + "metadata": {}, + "source": [ + "Adding ID to dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "e8fd7e09", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Writing to disk complete for 1 partitions\n", + "Time taken for add ID:56.01176333427429\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "# Read input files\n", + "dataset = load_dataset(add_id_input_data_dir)\n", + "\n", + "# Run AddID() on the input dataset\n", + "add_id = AddId(id_field='id',id_prefix=add_ID_id_prefix,start_index=0)\n", + "id_dataset = add_id(dataset)\n", + "\n", + "#Output files\n", + "write_to_disk(id_dataset.df, output_file_dir=added_id_output_path, write_to_filename=True, output_type='jsonl')\n", + "\n", + "print(f\"Time taken for add ID:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "50016a50", + "metadata": {}, + "source": [ + "Verify the result. From the output, we can see that the `id` value has been changed to `TH_wiki-0000000000` " + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "27a634e9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\",\"id\":\"TH_wiki-0000000000\",\"language\":\"TH\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\"}\n", + "\n" + ] + } + ], + "source": [ + "check_jsonl_file(added_id_output_path)" + ] + }, + { + "cell_type": "markdown", + "id": "e7084fed", + "metadata": {}, + "source": [ + "Close Dask cluster. 
This cell needs to be run as we are starting a new GPU Dask cluster in the following task" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "16399469", + "metadata": {}, + "outputs": [], + "source": [ + "client.cluster.close()\n", + "client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "cb227709", + "metadata": {}, + "source": [ + "## 4.Exact Dedplication\n", + "\n", + "In exact deduplication, the document text is hashed into unique string using certain hashing algorithm, such as 'md5'. The documents with exact hashed values are having identical text. We will output the `ID` of duplicated documents for removal later. The function used is `ExactDuplicates()`. Arguments for this function include:\n", + "- `id_field`: Key in input file for identifying document ID\n", + "- `text_field`: Key in input file which contains document text.\n", + "- `hash_method`: Hashing algorithm used. Default is `md5`\n", + "- `cache_dir`: If specified, the duplicated document IDs will be output to the `cache_dir`. Otherwise, the IDs will not be saved\n", + "\n", + "Also, we are going to use GPU dask cluster to accelerate computation for deduplication (both exact and fuzzy)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "8fa6c3af", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator.modules import ExactDuplicates" + ] + }, + { + "cell_type": "markdown", + "id": "aa70fd06", + "metadata": {}, + "source": [ + "Start a GPU based Dask cluster. Since GPU based Dask cluster involves setting several arguments, we will use the `get_client()` wrapper function to quickly set up. Please make sure the `device` in `args` is `gpu`" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "7e9530f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Namespace(scheduler_address=None, scheduler_file=None, n_workers=20, threads_per_worker=1, rmm_pool_size=None, protocol='tcp', nvlink_only=False, files_per_partition=2, num_files=-1, device='gpu', set_torch_to_use_rmm=False)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sys.argv=['','--device','gpu']\n", + "parser = argparse.ArgumentParser()\n", + "args = attach_args(parser).parse_args()\n", + "args.set_torch_to_use_rmm = False\n", + "args" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f71ab145", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of dask worker:1\n" + ] + }, + { + "data": { + "text/plain": [ + "{'tcp://127.0.0.1:37795': None}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client = get_client(args, args.device)\n", + "print(f\"Number of dask worker:{get_num_workers(client)}\")\n", + "client.run(pre_imports)" + ] + }, + { + "cell_type": "markdown", + "id": "4ef57149", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "26e6927e", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "exact_dedup_input_dataset_dir = added_id_output_path\n", + "\n", + "#Output\n", + "exact_dedup_base_output_path = os.path.join(data_dir,\"exact_dedup\")\n", + "exact_dedup_log_dir = os.path.join(exact_dedup_base_output_path,'log')\n", + "exact_dedup_output_dir = os.path.join(exact_dedup_base_output_path,'data')\n", + "\n", + "#Parameters for ExactDuplicates()\n", + "exact_dedup_dataset_id_field 
= \"id\"\n", + "exact_dedup_dataset_text_field = \"text\" \n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "b9a75a74", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {exact_dedup_log_dir}\n", + "!mkdir -p {exact_dedup_output_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "a9fc0bd2", + "metadata": {}, + "source": [ + "Apply exact deduplication" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "daf8f324", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Number of exact duplicated file:53\n", + "Time taken for exact duplicate:3.0404415130615234\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "# Read input dataset\n", + "input_dataset = DocumentDataset.read_json(exact_dedup_input_dataset_dir, backend='cudf')\n", + "\n", + "#Run exact deduplication to the input\n", + "exact_dup = ExactDuplicates(\n", + " logger=exact_dedup_log_dir,\n", + " id_field=exact_dedup_dataset_id_field,\n", + " text_field=exact_dedup_dataset_text_field,\n", + " hash_method=\"md5\",\n", + " cache_dir=exact_dedup_output_dir #Duplicated document ID list is output to the cache_dir\n", + ")\n", + "duplicates = exact_dup(dataset=input_dataset)\n", + "\n", + "print(f\"Number of exact duplicated file:{len(duplicates)}\")\n", + "\n", + "print(f\"Time taken for exact duplicate:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "517c60e4", + "metadata": {}, + "source": [ + "Verify the output duplicated ID. We can group by the `_hashes` to get the list of duplicated documents having the same _hashes and use `extract_lines_with_id()` to verify that those documents are indeed exact duplicates. Please note that the `id` might changes, therefore, please replace the `target_list` when necessary" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "2f3c67f8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of exact duplicated document:53\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_hashes
0TH_wiki-00000212111708cb56ec582f78716f0864dca9382d
1TH_wiki-00000212131708cb56ec582f78716f0864dca9382d
2TH_wiki-0000105191e77a248506ef16737288fae5759db33a
3TH_wiki-00001051922e386f5c3af70f43874618988d4842b2
4TH_wiki-00001051932e386f5c3af70f43874618988d4842b2
\n", + "
" + ], + "text/plain": [ + " id _hashes\n", + "0 TH_wiki-0000021211 1708cb56ec582f78716f0864dca9382d\n", + "1 TH_wiki-0000021213 1708cb56ec582f78716f0864dca9382d\n", + "2 TH_wiki-0000105191 e77a248506ef16737288fae5759db33a\n", + "3 TH_wiki-0000105192 2e386f5c3af70f43874618988d4842b2\n", + "4 TH_wiki-0000105193 2e386f5c3af70f43874618988d4842b2" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exact_dedup_res = pd.read_parquet(os.path.join(exact_dedup_output_dir,\"_exact_duplicates.parquet\"))\n", + "print(f\"Number of exact duplicated document:{len(exact_dedup_res)}\")\n", + "exact_dedup_res.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "7ed7d4de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_hashesid
00b908a91cdf0544c1ef3015cff4ee07eTH_wiki-0000157216 TH_wiki-0000066307
115f35c239b6579b4642f7656e64576acTH_wiki-0000098621 TH_wiki-0000074714 TH_wiki-...
21708cb56ec582f78716f0864dca9382dTH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-...
32e386f5c3af70f43874618988d4842b2TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-...
43e6e96a80410d5a191d098f464e66f86TH_wiki-0000122055 TH_wiki-0000116550
\n", + "
" + ], + "text/plain": [ + " _hashes \\\n", + "0 0b908a91cdf0544c1ef3015cff4ee07e \n", + "1 15f35c239b6579b4642f7656e64576ac \n", + "2 1708cb56ec582f78716f0864dca9382d \n", + "3 2e386f5c3af70f43874618988d4842b2 \n", + "4 3e6e96a80410d5a191d098f464e66f86 \n", + "\n", + " id \n", + "0 TH_wiki-0000157216 TH_wiki-0000066307 \n", + "1 TH_wiki-0000098621 TH_wiki-0000074714 TH_wiki-... \n", + "2 TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-... \n", + "3 TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-... \n", + "4 TH_wiki-0000122055 TH_wiki-0000116550 " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exact_dedup_res.groupby('_hashes')['id'].agg(lambda x: ' '.join(x)).reset_index().head()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "3051ed4b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'filename': 'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl', 'id': 'TH_wiki-0000066307', 'language': 'TH', 'source_id': 'thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2', 'text': '\\n\\nแหล่งข้อมูลอื่น \\n\\nสงขลา\\n \\nรายชื่อเกี่ยวกับจังหวัดสงขลา', 'title': 'รายชื่อโบราณสถานในจังหวัดสงขลา', 'url': 'https://th.wikipedia.org/wiki/%E0%B8%A3%E0%B8%B2%E0%B8%A2%E0%B8%8A%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B9%82%E0%B8%9A%E0%B8%A3%E0%B8%B2%E0%B8%93%E0%B8%AA%E0%B8%96%E0%B8%B2%E0%B8%99%E0%B9%83%E0%B8%99%E0%B8%88%E0%B8%B1%E0%B8%87%E0%B8%AB%E0%B8%A7%E0%B8%B1%E0%B8%94%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2'}\n", + "{'filename': 'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl', 'id': 'TH_wiki-0000157216', 'language': 'TH', 'source_id': 'thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2', 'text': '\\n\\nแหล่งข้อมูลอื่น \\n\\nสงขลา\\n \\nรายชื่อเกี่ยวกับจังหวัดสงขลา', 'title': 'รายชื่อโบราณสถานในจังหวัดสงขลา (อำเภอเมืองสงขลาและสิงหนคร)', 'url': 'https://th.wikipedia.org/wiki/%E0%B8%A3%E0%B8%B2%E0%B8%A2%E0%B8%8A%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B9%82%E0%B8%9A%E0%B8%A3%E0%B8%B2%E0%B8%93%E0%B8%AA%E0%B8%96%E0%B8%B2%E0%B8%99%E0%B9%83%E0%B8%99%E0%B8%88%E0%B8%B1%E0%B8%87%E0%B8%AB%E0%B8%A7%E0%B8%B1%E0%B8%94%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2%20%28%E0%B8%AD%E0%B8%B3%E0%B9%80%E0%B8%A0%E0%B8%AD%E0%B9%80%E0%B8%A1%E0%B8%B7%E0%B8%AD%E0%B8%87%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2%E0%B9%81%E0%B8%A5%E0%B8%B0%E0%B8%AA%E0%B8%B4%E0%B8%87%E0%B8%AB%E0%B8%99%E0%B8%84%E0%B8%A3%29'}\n" + ] + } + ], + "source": [ + "target_list = ['TH_wiki-0000157216', 'TH_wiki-0000066307']\n", + "for line in extract_lines_with_id(os.path.join(exact_dedup_input_dataset_dir,'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl'),target_list):\n", + " print(line)" + ] + }, + { + "cell_type": "markdown", + "id": "ec31440b", + "metadata": {}, + "source": [ + "**[Optional]** You might choose to close Dask cluster here" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "2ee05303", + "metadata": {}, + "outputs": [], + "source": [ + "# client.cluster.close()\n", + "# client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "710e8540", + "metadata": {}, + "source": [ + "## 5. Fuzzy Deduplication\n", + "Fuzzy deduplication involves 5 intermediate steps to generate duplicates. Refer to https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/gpudeduplication.html for details\n", + "\n", + "Fuzzy deduplication in this example is a GPU implementation of MinhashLSH algorithm. 
This algorithm measures similarity based on text statistics rather than on semantic meaning. A few concepts are worth introducing before diving into fuzzy deduplication.\n",
+    "1. Jaccard similarity: Jaccard similarity is a common metric for the similarity between two sets. It is calculated by dividing the number of elements shared by the two sets (the intersection) by the number of unique elements across both sets (the union). For text documents, we transform each document into a set of n-grams: if two documents share a large number of n-grams, they are most likely similar. \n",
+    "\n",
+    "    ![alt text](./image/jaccard.png)\n",
+    "\n",
+    "2. Complexity of the problem: To find all similar document pairs in a dataset, we would need to compute pair-wise Jaccard similarity across the whole dataset, making the complexity $O(N^2)$.\n",
+    "\n",
+    "The MinhashLSH algorithm is a technique for quickly estimating the similarity between sets, such as documents represented as sets of shingles (n-grams). It finds Jaccard-similar pairs in the corpus in a much more computationally efficient way. At a high level, the algorithm has the following steps:\n",
+    "1. Compute a minhash signature for each document\n",
+    "2. Run Locality Sensitive Hashing (LSH) on the minhashes, which assigns each document to multiple buckets. Documents that share a bucket are deemed similar.\n",
+    "3. Run pair-wise Jaccard similarity within each bucket to remove the false positives introduced by LSH\n",
+    "4. Based on the Jaccard similarities, build a graph and run a connected components algorithm. Each connected component is a group of similar documents, and the IDs within each group are output for duplicate removal.\n",
+    "For a more detailed explanation, please refer to https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/cpudeduplication.html.\n",
+    "\n",
+    "The GPU implementation of MinhashLSH used here consists of 5 steps:\n",
+    "1. Minhash computation\n",
+    "2. Bucket computation\n",
+    "3. Jaccard shuffle for load balancing in a distributed system\n",
+    "4. Jaccard similarity computation\n",
+    "5. Connected components"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c4b99c5e",
+   "metadata": {},
+   "source": [
+    "**If there is no running Dask cluster, start a GPU Dask cluster here**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "115ff2dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'tcp://127.0.0.1:33223': None}"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# sys.argv=['','--device','gpu']\n",
+    "# parser = argparse.ArgumentParser()\n",
+    "# args = attach_args(parser).parse_args()\n",
+    "# args.set_torch_to_use_rmm = False\n",
+    "\n",
+    "# client = get_client(args, args.device)\n",
+    "# get_num_workers(client)\n",
+    "# client.run(pre_imports)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1979977d",
+   "metadata": {},
+   "source": [
+    "### 5.1 Minhash\n",
+    "\n",
+    "Run `MinHash()` in this section. The output of MinHash is a parquet file containing each document ID and its MinHash signature, an array of 260 32-bit integers. To obtain these signatures we go through the following steps:\n",
+    "1. Generate a set of n-gram components of a document. 
For example, doc = `Nemo Curator is a data curation tool`, a 3-gram set of this document will be `['Nemo Curator is','Curator is a','is a data','a data curation','data curation tool']`\n", + "2. Hashed each n-gram into numerical values\n", + "3. Generate a random hash function $H_1()$ which will hash each numeric n-gram into a 32-bit integer and take the minimum integer to use as minhash value for $H_1()$\n", + "4. Repeat step 2 and 3 with hash function $H_x()$ until desired minhash length is reached. Minhash value of each iteration will be append together to form the final minhash array. \n", + "\n", + "Arguments include:\n", + "- `seed`:Random seed used for initializing the hash functions used to compute the MinHashes. It's advised to keep this value the same for different experiment for reproducibility\n", + "- `num_hashes`:Length of each minhash array. Default is 260. Longer minhash length will have better estimate of actual Jaccard similarity, but require more computational power\n", + "- `char_ngrams`:n-gram length\n", + "- `use_64bit_hash`:Whether to use 64bit or 32bit hash function\n", + "- `id_field`: Key in input file for identifying document ID\n", + "- `text_field`: Key in input file which contains document text.\n", + "- `cache_dir`: If specified, the intermediate result will be output to the `cache_dir`. \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "f9b2a642", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import MinHash" + ] + }, + { + "cell_type": "markdown", + "id": "4c152974", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "117a569d", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "minhash_data_path = added_id_output_path\n", + "#Output\n", + "minshah_base_output_path = os.path.join(data_dir,\"fuzzy/minhash\")\n", + "minshah_log_dir = os.path.join(minshah_base_output_path,'log')\n", + "minshah_output_dir = os.path.join(minshah_base_output_path,'data')\n", + "#Specify dataset name\n", + "dataset_name = 'TH_wikipedia'\n", + "\n", + "#Relevant parameters\n", + "minhash_id_field = 'id'\n", + "minhash_text_field = 'text'\n", + "seed = 10\n", + "minhash_length = 260\n", + "char_ngram = 5\n", + "use_64bit_hash = False\n", + "files_per_partition = 2\n", + "\n", + "!mkdir -p {minshah_log_dir}\n", + "!mkdir -p {minshah_output_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "73c1ad41", + "metadata": {}, + "source": [ + "Run MinHash" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "a17954eb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing minhashes for /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/add_id/cleaned\n", + "Reading 1 files\n", + "Time taken for MinHash:7.543871879577637\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "print(f\"Computing minhashes for {minhash_data_path}\")\n", + "\n", + "# Load data. 
Only the [minhash_id_field, text_field] columns are needed\n", + "files = get_all_files_paths_under(root=minhash_data_path, recurse_subdirectories=False)\n", + "files = [f for f in files if f.endswith(\".jsonl\")]\n", + "df = read_data(\n", + " files,\n", + " file_type=\"jsonl\",\n", + " backend=\"cudf\",\n", + " files_per_partition=files_per_partition,\n", + " add_filename=False,\n", + ")[[minhash_id_field, minhash_text_field]]\n", + "\n", + "# Run MinHash() on input data\n", + "minhasher = MinHash(\n", + " seed=seed,\n", + " num_hashes=minhash_length,\n", + " char_ngrams=char_ngram,\n", + " use_64bit_hash=use_64bit_hash,\n", + " logger=minshah_log_dir,\n", + " id_field=minhash_id_field,\n", + " text_field=minhash_text_field,\n", + " cache_dir=minshah_output_dir\n", + ")\n", + "res = minhasher(DocumentDataset(df)).df\n", + "\n", + "print(f\"Time taken for MinHash:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "19cddba5", + "metadata": {}, + "source": [ + "Verify result" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "df83eec5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_minhash_signature
0TH_wiki-0000000000[11565725, 19782487, 9831980, 5480992, 2306475...
1TH_wiki-0000000001[407876, 107572, 824528, 346831, 216554, 10963...
2TH_wiki-0000000002[727721, 694551, 233868, 346831, 216554, 77001...
3TH_wiki-0000000003[1149282, 931656, 2515604, 1428622, 4964646, 4...
4TH_wiki-0000000004[1559901, 11771639, 487706, 826569, 1203860, 5...
\n", + "
" + ], + "text/plain": [ + " id _minhash_signature\n", + "0 TH_wiki-0000000000 [11565725, 19782487, 9831980, 5480992, 2306475...\n", + "1 TH_wiki-0000000001 [407876, 107572, 824528, 346831, 216554, 10963...\n", + "2 TH_wiki-0000000002 [727721, 694551, 233868, 346831, 216554, 77001...\n", + "3 TH_wiki-0000000003 [1149282, 931656, 2515604, 1428622, 4964646, 4...\n", + "4 TH_wiki-0000000004 [1559901, 11771639, 487706, 826569, 1203860, 5..." + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "minhash_res = pd.read_parquet(os.path.join(minshah_output_dir, \"_minhashes.parquet\"))\n", + "minhash_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "998ab08a", + "metadata": {}, + "source": [ + "### 5.2 LSH\n", + "`LSH()` implements LSH algorithm which includes the following steps:\n", + "1. Divide the minhash array into `X` different portions. \n", + "2. For each portions, hash the minhash values into buckets. One document will be assigned to `X` buckets.\n", + "3. Documents within the same bucket will be deemed similar. Since every document will be assigned `X` buckets and as long as two documents share 1 or more buckets they are deemed similar, the result of LSH will have more false positive as compared to false negative. The false positive cases will be filtered in following modules, namely jaccard compute.\n", + "\n", + "Arguments include:\n", + "- `minhash_length`:Length of minhash signature. Must bu consistent with `MinHash()`\n", + "- `num_buckets`: Number of buckets\n", + "- `buckets_per_shuffle`: Number of buckets to shuffle concurrently\n", + "- `id_field`: Key in input file for identifying document ID\n", + "- `minhash_field`: Key in input file for identifying document MinHash signature \n", + "- `cache_dir`:If specified, the intermediate result will be output to the `cache_dir`.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "138544a5", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import LSH\n", + "from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import \\\n", + " convert_str_id_to_int" + ] + }, + { + "cell_type": "markdown", + "id": "178fd0e4", + "metadata": {}, + "source": [ + "Define parameter" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "21d2a261", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "lsh_input_data_path = minshah_output_dir\n", + "\n", + "#Output\n", + "lsh_base_output_path = os.path.join(data_dir,\"fuzzy/lsh\")\n", + "lsh_log_dir = os.path.join(lsh_base_output_path,'log')\n", + "lsh_output_dir = os.path.join(lsh_base_output_path,'data')\n", + "\n", + "#Relevant parameters\n", + "lsh_id_field = 'id'\n", + "minhash_field = '_minhash_signature'\n", + "minhash_length=260\n", + "num_bands=20\n", + "buckets_per_shuffle=1\n", + "\n", + "!mkdir -p {lsh_log_dir}\n", + "!mkdir -p {lsh_output_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "a18708d2", + "metadata": {}, + "source": [ + "Run LSH" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "9eebeb10", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. 
Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken for LSH:20.533941984176636\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "\n", + "#Load MinHash output\n", + "df = dask_cudf.read_parquet(lsh_input_data_path, blocksize=\"2GB\", aggregate_files=True, backend = \"cudf\")\n", + "df = df.map_partitions(\n", + " convert_str_id_to_int,\n", + " id_column=lsh_id_field,\n", + " meta=cudf.DataFrame(\n", + " {minhash_field: [[1, 2, 3]], \"doc_id\": [1], \"dataset_id\": np.uint32(1)}\n", + " ),\n", + ")\n", + "\n", + "#Run LSH()\n", + "lsh = LSH(\n", + " cache_dir=lsh_output_dir,\n", + " minhash_length=minhash_length,\n", + " num_buckets=num_bands,\n", + " buckets_per_shuffle=buckets_per_shuffle,\n", + " id_fields=[\"dataset_id\", \"doc_id\"],\n", + " minhash_field=minhash_field,\n", + " logger=lsh_log_dir,\n", + ")\n", + "res = lsh(DocumentDataset(df))\n", + "\n", + "t1 = time.time()\n", + "print(f\"Time taken for LSH:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "813603e2", + "metadata": {}, + "source": [ + "Verify result" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "c47da6b9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
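To make the banding step described in Section 5.2 concrete, below is a minimal CPU-only sketch of the idea. It is an illustration under assumptions (the `band_buckets` helper and the MD5-based hashing are made up here, not NeMo Curator's GPU implementation): the 260-value signature is split into `num_bands = 20` bands of 13 minhashes, each band is hashed to a bucket key, and any two documents that collide in at least one band become duplicate candidates, which is why LSH favors false positives over false negatives.

```python
# Illustrative banding sketch; helper names and hashing scheme are assumptions,
# not the implementation used by nemo_curator's LSH().
import hashlib
from collections import defaultdict

def band_buckets(signature, num_bands=20):
    """Split a minhash signature into equal bands and hash each band to a bucket key."""
    assert len(signature) % num_bands == 0
    rows_per_band = len(signature) // num_bands  # 260 / 20 = 13 minhashes per band
    keys = []
    for band in range(num_bands):
        chunk = tuple(signature[band * rows_per_band : (band + 1) * rows_per_band])
        keys.append(hashlib.md5(repr((band, chunk)).encode()).hexdigest())
    return keys

# Two documents become candidates if they collide in at least one band.
toy_signatures = {
    "doc_a": list(range(260)),      # identical to doc_b -> collides in every band
    "doc_b": list(range(260)),
    "doc_c": list(range(1, 261)),   # differs in every position -> no collisions
}
bucket_index = defaultdict(set)
for doc_id, sig in toy_signatures.items():
    for key in band_buckets(sig):
        bucket_index[key].add(doc_id)

candidate_pairs = {frozenset(ids) for ids in bucket_index.values() if len(ids) > 1}
print(candidate_pairs)  # {frozenset({'doc_a', 'doc_b'})}
```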
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dataset_iddoc_id_bucket_id
0169236187812469296
1169236187885282385
21692361878156638529
31692361878160566540
41692361878160567540
\n", + "
" + ], + "text/plain": [ + " dataset_id doc_id _bucket_id\n", + "0 1692361878 124692 96\n", + "1 1692361878 85282 385\n", + "2 1692361878 156638 529\n", + "3 1692361878 160566 540\n", + "4 1692361878 160567 540" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lsh_res = pd.read_parquet(os.path.join(lsh_output_dir, \"_buckets.parquet\"))\n", + "lsh_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "07bade4a", + "metadata": {}, + "source": [ + "### 5.3 Jaccard Shuffle\n", + "In this section, we will be using `_MapBucket()` and `_Shuffle()`.\n", + "\n", + "For `_MapBucket()`, it is designed to take input text data in .jsonl format and bucket information which is output of LSH, map the documents to their respective buckets, and write the resulting DataFrame containing the anchor documents and their associated bucket information to a Parquet file.Arguments include:\n", + "- `id_field`: Key in input .jsonl file for identifying document ID\n", + "- `text_field`: Key in input .jsonl file which contains document text.\n", + "- `bucket_field`: Key in input _buckets.parquet which contains `bucket_id`.\n", + "- `num_anchors`: Number of anchors (document in the same buckets) to be output\n", + "\n", + "\n", + "For `_Shuffle()`, it perform a shuffling operation on the documents based on their bucket assignments, output in .parquet format. This shuffling operation is a crucial step in the deduplication process, as it helps distribute similar documents across different partitions or workers, enabling efficient parallel processing and deduplication in subsequent steps. Arguments include:\n", + "- `id_fields`: Columns in `_buckets.parquet` that maps to original `id` in .jsonl data file. In this example, it is `[\"dataset_id\", \"doc_id\"]`\n", + "- `text_field`: Key in input .jsonl file which contains document text.\n", + "- `int_to_str_id`: Key in input .jsonl file for identifying document ID\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "565253ae", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator.utils.fuzzy_dedup_utils.io_utils import (\n", + " get_bucket_ddf_from_parquet_path,\n", + " get_text_ddf_from_json_path_with_blocksize,\n", + ")\n", + "from nemo_curator.modules.fuzzy_dedup import _MapBuckets,_Shuffle" + ] + }, + { + "cell_type": "markdown", + "id": "70387977", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "5cff7d76", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "input_data_paths = [minhash_data_path]\n", + "input_bucket_path = lsh_output_dir\n", + "\n", + "#Output\n", + "jaccard_shuffle_base_output_path = os.path.join(data_dir,\"fuzzy/jaccard_shuffle\")\n", + "output_anchor_docs_with_bk_path = os.path.join(jaccard_shuffle_base_output_path, \"anchor_docs_with_bk.parquet\")\n", + "input_anchor_docs_with_bk_dir = output_anchor_docs_with_bk_path\n", + "output_shuffled_docs_path = os.path.join(jaccard_shuffle_base_output_path, \"shuffled_docs.parquet\")\n", + "\n", + "#Relevant parameter for _MapBucket()\n", + "text_ddf_blocksize = 256\n", + "bucket_mapping_ddf_blocksize = 256\n", + "num_files = None\n", + "shuffle_type ='tasks'\n", + "input_bucket_field = '_bucket_id'\n", + "input_id_field = 'id'\n", + "input_text_field = 'text'\n", + "\n", + "#Relevant parameter for _Shuffle()\n", + "shuffle_id_fields=[\"dataset_id\", \"doc_id\"]\n", + "int_to_str_id='id'\n", + "\n", + "!mkdir -p 
{jaccard_shuffle_base_output_path}" + ] + }, + { + "cell_type": "markdown", + "id": "699a53f1", + "metadata": {}, + "source": [ + "Run Jaccard map bucket" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "0a6e5a84", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of files being read for jaccard calculation = 1\n", + "Number of ddf_bk partitions = 1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken for Bucket Mapping:2.1162023544311523 s\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "num_workers = get_num_workers(client)\n", + "\n", + "# Read .jsonl input data\n", + "ddf_text = get_text_ddf_from_json_path_with_blocksize(\n", + " input_data_paths=input_data_paths,\n", + " num_files=num_files,\n", + " blocksize=text_ddf_blocksize,\n", + " id_column=input_id_field,\n", + " text_column=input_text_field,\n", + ")\n", + "# Read \"_buckets.parquet\"\n", + "ddf_bk = get_bucket_ddf_from_parquet_path(input_bucket_path=input_bucket_path, num_workers=num_workers)\n", + "\n", + "#Run _MapBuckets()\n", + "map_buckets = _MapBuckets(id_fields=shuffle_id_fields, bucket_field=input_bucket_field)\n", + "ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors(documents_df=ddf_text, buckets_df=ddf_bk, shuffle_type=shuffle_type)\n", + "\n", + "#Write to disk\n", + "ddf_anchor_docs_with_bk.to_parquet(output_anchor_docs_with_bk_path, write_index=False)\n", + "\n", + "print(f\"Time taken for Bucket Mapping:{time.time()-t0} s\")" + ] + }, + { + "cell_type": "markdown", + "id": "96246266", + "metadata": {}, + "source": [ + "Verify results " + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "09e65f8b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dataset_iddoc_idanchor_1_dataset_idanchor_1_doc_idanchor_0_dataset_idanchor_0_doc_id_output_partition_id
01692361878138220169236187814525616923618781436720
11692361878505091692361878505091692361878504570
21692361878939891692361878938461692361878938070
31692361878204481692361878200901692361878204440
41692361878939911692361878939271692361878936970
\n", + "
" + ], + "text/plain": [ + " dataset_id doc_id anchor_1_dataset_id anchor_1_doc_id \\\n", + "0 1692361878 138220 1692361878 145256 \n", + "1 1692361878 50509 1692361878 50509 \n", + "2 1692361878 93989 1692361878 93846 \n", + "3 1692361878 20448 1692361878 20090 \n", + "4 1692361878 93991 1692361878 93927 \n", + "\n", + " anchor_0_dataset_id anchor_0_doc_id _output_partition_id \n", + "0 1692361878 143672 0 \n", + "1 1692361878 50457 0 \n", + "2 1692361878 93807 0 \n", + "3 1692361878 20444 0 \n", + "4 1692361878 93697 0 " + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "map_bucket_res = pd.read_parquet(output_anchor_docs_with_bk_path)\n", + "map_bucket_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "35bb1e86", + "metadata": {}, + "source": [ + "**[Optional]**Remove previous Jaccard Shuffle results. Run only when there are files under the Jaccard Shuffle output path" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "da7dcc10", + "metadata": {}, + "outputs": [], + "source": [ + "#!rm -r {output_shuffled_docs_path}" + ] + }, + { + "cell_type": "markdown", + "id": "24c2b39d", + "metadata": {}, + "source": [ + "Run Jaccard Shuffle" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "a9dcf646", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text_text_bytesidanchor_0_idanchor_1_id
0พุทธศักราช 676 ใกล้เคียงกับ\\n เมษายน ค.ศ. 133 ...2631692361878-70321692361878-70321692361878-7052
1พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...2171692361878-90821692361878-88051692361878-9071
2พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...2171692361878-90821692361878-90281692361878-9045
3พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...2171692361878-90821692361878-90721692361878-9082
4ประเทศฮังการี เข้าร่วมแข่งขันกีฬาโอลิมปิกฤดูร้...20391692361878-490911692361878-490931692361878-49087
\n", + "" + ], + "text/plain": [ + " text _text_bytes \\\n", + "0 พุทธศักราช 676 ใกล้เคียงกับ\\n เมษายน ค.ศ. 133 ... 263 \n", + "1 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", + "2 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", + "3 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", + "4 ประเทศฮังการี เข้าร่วมแข่งขันกีฬาโอลิมปิกฤดูร้... 2039 \n", + "\n", + " id anchor_0_id anchor_1_id \n", + "0 1692361878-7032 1692361878-7032 1692361878-7052 \n", + "1 1692361878-9082 1692361878-8805 1692361878-9071 \n", + "2 1692361878-9082 1692361878-9028 1692361878-9045 \n", + "3 1692361878-9082 1692361878-9072 1692361878-9082 \n", + "4 1692361878-49091 1692361878-49093 1692361878-49087 " + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard_shuffle_res = pd.read_parquet(os.path.join(output_shuffled_docs_path,\"_output_partition_id=0/batch_1_1.parquet\"))\n", + "jaccard_shuffle_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ffb70238", + "metadata": {}, + "source": [ + "### 5.4 Jaccard Compute\n", + "We will be using `JaccardSimilarity()`.This is to computes the Jaccard similarity between document pairs. Result is a parquet dataset consisting of document id pair along with their Jaccard similarity score. To compute Jaccard similarity between two documents, we first convert the document into sets of n-grams and then compute the Jaccard similarity of the two sets.\n", + "\n", + "Arguments include:\n", + "- `id_field`: Column in input .parquet file identifying document ID\n", + "- `text_field`: Column in input .parquet file identifying document text\n", + "- `anchor_id_fields`: Column in input .parquet file identifying anchors. This can be generated by specifying number of anchor used in `_MapBucket` whose default value is 2\n", + "- `ngram_width`: n-gram used" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "06346b88", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity" + ] + }, + { + "cell_type": "markdown", + "id": "d71f440f", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "457ae138", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "shuffled_docs_path = output_shuffled_docs_path\n", + "\n", + "#Output\n", + "jaccard_compute_base_output_path = os.path.join(data_dir,\"fuzzy/jaccard_compute\")\n", + "jaccard_compute_output_results_path = os.path.join(jaccard_compute_base_output_path, \"jaccard_similarity_results.parquet\")\n", + "\n", + "#Relevant parameters\n", + "input_id_field = 'id'\n", + "input_text_field = 'text'\n", + "ngram_size = 5\n", + "num_anchors = 2\n", + "\n", + "!mkdir -p {jaccard_compute_base_output_path}" + ] + }, + { + "cell_type": "markdown", + "id": "619bf820", + "metadata": {}, + "source": [ + "Run Jaccard Compute" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "2f094db1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running jaccard compute script\n", + "Time taken for Jaccard Computing: 0.8689384460449219\n" + ] + } + ], + "source": [ + "enable_spilling()\n", + "client.run(enable_spilling)\n", + "\n", + "print(\"Running jaccard compute script\", flush=True)\n", + "t0 = time.time()\n", + "\n", + "jaccard = JaccardSimilarity(\n", + " id_field=input_id_field,\n", + " text_field=input_text_field,\n", + " 
anchor_id_fields=[f\"anchor_{i}_{input_id_field}\" for i in range(num_anchors)],\n", + " ngram_width=ngram_size,\n", + ")\n", + "\n", + "#Load and run Jaccard compute\n", + "result_df = jaccard.jaccard_compute(shuffled_docs_path)\n", + "\n", + "result_df.to_parquet(jaccard_compute_output_results_path, write_index=False, write_metadata_file=False)\n", + "\n", + "print(f\"Time taken for Jaccard Computing: {time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b31e619c", + "metadata": {}, + "source": [ + "Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected as a pair of similar documents is likely to share numerous same buckets." + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "ae2efe3e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_xid_yjaccard
01692361878-1275211692361878-1275170.755481
11692361878-1275211692361878-1275170.755481
21692361878-459341692361878-459400.922061
31692361878-459341692361878-459400.922061
41692361878-459341692361878-459400.922061
\n", + "
" + ], + "text/plain": [ + " id_x id_y jaccard\n", + "0 1692361878-127521 1692361878-127517 0.755481\n", + "1 1692361878-127521 1692361878-127517 0.755481\n", + "2 1692361878-45934 1692361878-45940 0.922061\n", + "3 1692361878-45934 1692361878-45940 0.922061\n", + "4 1692361878-45934 1692361878-45940 0.922061" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard_compute_res = pd.read_parquet(jaccard_compute_output_results_path)\n", + "jaccard_compute_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "834f1831", + "metadata": {}, + "source": [ + "### 5.5 Connected Components\n", + "This section uses `ConnectedComponents()`.This section takes a dataset consisting of document pairs and their corresponding jaccard similarity to construct a non-directed graph. A edge will be form between documents whose Jaccard similarity is higher than the threshold (0.8 in this example). It will then identify the connected components in this graph. Documents within the same connected components are deemed duplicated\n", + "\n", + "Arguments include:\n", + "- `cache_dir`:Output path for intermediate results\n", + "- `jaccard_pairs_path`:Input path for `jaccard_similarity_results.parquet`\n", + "- `id_column`:prefix of ID column in `jaccard_similarity_results.parquet`\n", + "- `jaccard_threshold`:Threshold to determine if an edge exists between two documents" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "5756fde8", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator.modules.fuzzy_dedup import ConnectedComponents" + ] + }, + { + "cell_type": "markdown", + "id": "217957d6", + "metadata": {}, + "source": [ + "Define parameter" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "72a1952e", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "jaccard_pairs_path = jaccard_compute_output_results_path\n", + "\n", + "#Output\n", + "connected_component_base_output_path = os.path.join(data_dir,\"fuzzy/cc\")\n", + "connected_component_output_path = os.path.join(connected_component_base_output_path, \"connected_components.parquet\")\n", + "connected_component_cache_dir = os.path.join(connected_component_base_output_path, \"cache\")\n", + "\n", + "#Relevant parameter\n", + "input_id_field = 'id'\n", + "jaccard_threshold = 0.8\n", + "\n", + "!mkdir -p {connected_component_base_output_path}" + ] + }, + { + "cell_type": "markdown", + "id": "c53b3a8c", + "metadata": {}, + "source": [ + "Run Connected Component" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "46578e2b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. 
Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "batch_id = 0/1, time = 0.3100006580352783\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# of groups 5465\n", + "# of docs removed 3079\n", + "assert num_nodes:8544==labels_df:8544 passed\n", + "Time taken for Connected Component: 11.238884925842285 s\n" + ] + } + ], + "source": [ + "client.run(enable_spilling)\n", + "\n", + "t0 = time.time()\n", + " \n", + "components_stage = ConnectedComponents(\n", + " cache_dir=connected_component_cache_dir,\n", + " jaccard_pairs_path=jaccard_pairs_path,\n", + " id_column=input_id_field,\n", + " convert_str_ids=True,\n", + " jaccard_threshold=jaccard_threshold,\n", + ")\n", + "\n", + "#Load and run connected component\n", + "components_stage.cc_workflow(output_path=connected_component_output_path)\n", + "print(f\"Time taken for Connected Component: {time.time()-t0} s\")" + ] + }, + { + "cell_type": "markdown", + "id": "6827158e", + "metadata": {}, + "source": [ + "Verify the result of `Connected Components`" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "2bcfc470", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
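To build intuition for what `ConnectedComponents()` produces before inspecting its output below, here is a small CPU-only union-find sketch over a toy set of pairs. The `id_x`/`id_y`/`jaccard` column names mirror the Jaccard compute output above, but the data and the union-find helpers are made up for illustration; the actual module performs this grouping with GPU-accelerated graph routines.

```python
# Toy union-find over Jaccard pairs; an illustration only, not the GPU code path.
import pandas as pd

pairs = pd.DataFrame({
    "id_x": ["d1", "d2", "d4"],
    "id_y": ["d2", "d3", "d5"],
    "jaccard": [0.92, 0.85, 0.30],
})

parent = {}

def find(x):
    parent.setdefault(x, x)
    while parent[x] != x:
        parent[x] = parent[parent[x]]  # path compression
        x = parent[x]
    return x

def union(a, b):
    parent[find(a)] = find(b)

for row in pairs.itertuples(index=False):
    # Only pairs above the Jaccard threshold contribute an edge.
    if row.jaccard >= 0.8:
        union(row.id_x, row.id_y)
    else:
        find(row.id_x), find(row.id_y)  # register the nodes as singletons

groups = {}
for doc in parent:
    groups.setdefault(find(doc), []).append(doc)
print(groups)  # d1, d2, d3 end up in one duplicate group; d4 and d5 stay alone
```

Documents that end up under the same root here correspond to documents sharing one `group` value in the `connected_components.parquet` output.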
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dataset_iddoc_idgroup
016923618781369993837
11692361878853183838
21692361878706701196
31692361878134587138
416923618781361251320
\n", + "
" + ], + "text/plain": [ + " dataset_id doc_id group\n", + "0 1692361878 136999 3837\n", + "1 1692361878 85318 3838\n", + "2 1692361878 70670 1196\n", + "3 1692361878 134587 138\n", + "4 1692361878 136125 1320" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cc_compute_res = pd.read_parquet(connected_component_output_path)\n", + "cc_compute_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "aa1ee07d", + "metadata": {}, + "source": [ + "Let's check if the output fuzzy duplicated documents within the same group are similar. Please note that the `group` id in your output might be different from the notebook output." + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "f1f10a1c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
groupdoc_id
0121134756, 134762, 134748, 134742, 134740, 134750...
1138134587, 134908, 135024, 135029, 135019, 134566...
2323134794, 134780, 134793, 134785, 134798, 134781...
3344136092, 136103, 136090, 136093, 136100, 136089...
442894120, 94084, 94059, 94128, 94130, 94056, 9413...
.........
54608539125651
54618540125971
5462854184926
5463854240115
5464854350282
\n", + "

5465 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " group doc_id\n", + "0 121 134756, 134762, 134748, 134742, 134740, 134750...\n", + "1 138 134587, 134908, 135024, 135029, 135019, 134566...\n", + "2 323 134794, 134780, 134793, 134785, 134798, 134781...\n", + "3 344 136092, 136103, 136090, 136093, 136100, 136089...\n", + "4 428 94120, 94084, 94059, 94128, 94130, 94056, 9413...\n", + "... ... ...\n", + "5460 8539 125651\n", + "5461 8540 125971\n", + "5462 8541 84926\n", + "5463 8542 40115\n", + "5464 8543 50282\n", + "\n", + "[5465 rows x 2 columns]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cc_compute_res['doc_id'] = cc_compute_res['doc_id'].astype(str)\n", + "cc_compute_res.groupby('group')['doc_id'].agg(lambda x: ', '.join(x)).reset_index()" + ] + }, + { + "cell_type": "markdown", + "id": "f621c2cb", + "metadata": {}, + "source": [ + "Change the `group` number if necessary. By running the code below, we can obtain a list of near duplicated documents." + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "bd79a7f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dataset_iddoc_idgroup
141692361878121545735
661692361878121487735
2131692361878121541735
2911692361878121539735
4221692361878121524735
\n", + "
" + ], + "text/plain": [ + " dataset_id doc_id group\n", + "14 1692361878 121545 735\n", + "66 1692361878 121487 735\n", + "213 1692361878 121541 735\n", + "291 1692361878 121539 735\n", + "422 1692361878 121524 735" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cc_compute_res[cc_compute_res['group']==735].head()" + ] + }, + { + "cell_type": "markdown", + "id": "e7c02f4b", + "metadata": {}, + "source": [ + "Print the text of near duplicated document. Please replace the `id` if necessary, `id` should be in the format of `_`" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "dd0b2e33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['ประเทศสวิตเซอร์แลนด์ ได้เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวชนฤดูหนาว ครั้งที่ 3 ค.ศ. 2020 (พ.ศ. 2563) ณ เมืองโลซาน ประเทศสวิตเซอร์แลนด์ ระหว่างวันที่ 9 - 22 มกราคม พ.ศ. 2563 คณะกรรมการโอลิมปิกแห่งชาติสวิตเซอร์แลนด์ได้ส่งทีมนักกีฬาเข้าแข่งขันทั้งหมด 56 คน แบ่งเป็นเป็นชาย 32 คนและหญิง 56 คน เข้าร่วมการแข่งขันใน 15 ชนิดกีฬา\\n\\nจำนวนผู้เข้าแข่งขัน\\n\\nผลการแข่งขัน\\n\\nสเกตลีลา\\n\\nสเกตความเร็ว\\n\\nสเกตความเร็วระยะสั้น\\n\\nฮอกกี้น้ำแข็ง\\n\\nเคอร์ลิง\\n\\nสกีลงเขา\\n\\nสกีข้ามทุ่ง\\n\\nสกีกระโดดไกล\\n\\nสกีนอร์ดิกผสม\\n\\nสกีลีลา\\n\\nสกีปีนเขา\\n\\nสโนว์บอร์ด\\n\\nทวิกีฬาฤดูหนาว\\n\\nบอบสเล\\n\\nสเกเลตัน\\n\\nอ้างอิง\\n\\nแหล่งข้อมูลอื่น \\n เว็บไซต์อย่างเป็นทางการ \\n\\nประเทศสวิตเซอร์แลนด์ในโอลิมปิกเยาวชน\\nประเทศที่เข้าร่วมแข่งขันโอลิมปิกเยาวชนฤดูหนาว 2020',\n", + " 'ประเทศบัลแกเรีย ได้เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวชนฤดูหนาว ครั้งที่ 3 ค.ศ. 2020 (พ.ศ. 2563) ณ เมืองโลซาน ประเทศสวิตเซอร์แลนด์ ระหว่างวันที่ 9 - 22 มกราคม พ.ศ. 2563 คณะกรรมการโอลิมปิกแห่งชาติบัลแกเรียได้ส่งทีมนักกีฬาเข้าแข่งขันทั้งหมด 18 คน แบ่งเป็นเป็นชาย 11 คนและหญิง 7 คน เข้าร่วมการแข่งขันใน 8 ชนิดกีฬา\\n\\nจำนวนผู้เข้าแข่งขัน\\n\\nผลการแข่งขัน\\n\\nสเกตลีลา\\n\\nสเกตความเร็ว\\n\\nสเกตความเร็วระยะสั้น\\n\\nฮอกกี้น้ำแข็ง\\n\\nเคอร์ลิง\\n\\nสกีลงเขา\\n\\nสกีข้ามทุ่ง\\n\\nสกีกระโดดไกล\\n\\nสกีนอร์ดิกผสม\\n\\nสกีลีลา\\n\\nสกีปีนเขา\\n\\nสโนว์บอร์ด\\n\\nทวิกีฬาฤดูหนาว\\n\\nลูช\\n\\nบอบสเล\\n\\nสเกเลตัน\\n\\nอ้างอิง\\n\\nแหล่งข้อมูลอื่น \\n เว็บไซต์อย่างเป็นทางการ \\n\\nประเทศบัลแกเรียในโอลิมปิกเยาวชน\\nประเทศที่เข้าร่วมแข่งขันโอลิมปิกเยาวชนฤดูหนาว 2020'],\n", + " dtype=object)" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard_shuffle_res[jaccard_shuffle_res['id'].isin(['1692361878-121545','1692361878-121487'])]['text'].unique()" + ] + }, + { + "cell_type": "markdown", + "id": "c3f8d12f", + "metadata": {}, + "source": [ + "Below is the English translation of the output above. We can see that the two documents are indeed very similar to each other.\n", + "- `Text 1`:\n", + "```\n", + "Switzerland participated in the 3rd Youth Olympic Winter Games in 2020 (B.E. 2563) in Lausanne, Switzerland from January 9 - 22, 2563. 
The Swiss Olympic Committee sent a total of 56 athletes, consisting of 32 men and 56 women, to compete in 15 sports.\n", + "Number of Competitors:\n", + "Competition Results:\n", + "Figure Skating\n", + "Speed Skating\n", + "Short Track Speed Skating\n", + "Ice Hockey\n", + "Curling\n", + "Alpine Skiing\n", + "Cross-Country Skiing\n", + "Ski Jumping\n", + "Nordic Combined\n", + "Freestyle Skiing\n", + "Ski Mountaineering\n", + "Snowboard\n", + "Biathlon\n", + "Bobsleigh\n", + "Skeleton\n", + "References:\n", + "Other Resources:\n", + "Official Website\n", + "Switzerland at the Youth Olympics\n", + "Countries at the 2020 Youth Winter Olympics\n", + "```\n", + "- `Text 2`:\n", + "```\n", + "Bulgaria participated in the 3rd Youth Olympic Winter Games in 2020 (B.E. 2563) in Lausanne, Switzerland from January 9 - 22, 2563. The Bulgarian Olympic Committee sent a total of 18 athletes, consisting of 11 men and 7 women, to compete in 8 sports.\n", + "Number of Competitors:\n", + "Competition Results:\n", + "Figure Skating\n", + "Speed Skating\n", + "Short Track Speed Skating\n", + "Ice Hockey\n", + "Curling\n", + "Alpine Skiing\n", + "Cross-Country Skiing\n", + "Ski Jumping\n", + "Nordic Combined\n", + "Freestyle Skiing\n", + "Ski Mountaineering\n", + "Snowboard\n", + "Biathlon\n", + "Luge\n", + "Bobsleigh\n", + "Skeleton\n", + "References:\n", + "Other Resources:\n", + "Official Website\n", + "Bulgaria at the Youth Olympics\n", + "Countries at the 2020 Youth Winter Olympics\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "70ca66df", + "metadata": {}, + "source": [ + "## 6. Remove duplicates\n", + "\n", + "Now we have duplicated document IDs output by both exact deduplication and fuzzy deduplication. We will run this section to remove those documents. This is done be loading the output .parquet files and the unicode fixed input dataset in .jsonl as DataFrame. Then use DataFrame operation to remove the duplicated documents." + ] + }, + { + "cell_type": "markdown", + "id": "93d031ec", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "911be9d9", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "dataset_dir = added_id_output_path\n", + "\n", + "#Output\n", + "dudped_output_dir = os.path.join(data_dir,\"remove_duplicate/result.parquet\")\n", + "\n", + "#Relevant parameter\n", + "input_id_field = 'id'\n", + "id_prefix = add_ID_id_prefix\n", + "\n", + "!mkdir -p {dudped_output_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "969f6543", + "metadata": {}, + "source": [ + "We will first process the result of exact deduplication. Since result of exact deduplication contains original ID used in input dataset, it is more straightforward to deal with." 
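As a toy, CPU-only illustration of the DataFrame operations described above: the column names `id` and `_hashes` match the `_exact_duplicates.parquet` output, but the values are invented, and the following cells perform the same steps with Dask on the tutorial data.

```python
# Miniature version of the removal logic; values are invented for illustration.
import pandas as pd

docs = pd.DataFrame({"id": ["TH_wiki-0000000000", "TH_wiki-0000000001", "TH_wiki-0000000002"],
                     "text": ["same text", "same text", "unique text"]})
exact_duplicates = pd.DataFrame({"id": ["TH_wiki-0000000000", "TH_wiki-0000000001"],
                                 "_hashes": ["abc123", "abc123"]})

# Keep the first document of every hash group and mark the rest for removal.
docs_to_remove = exact_duplicates[exact_duplicates["_hashes"].duplicated(keep="first")]

deduped = docs[~docs["id"].isin(docs_to_remove["id"])]
print(deduped["id"].tolist())  # ['TH_wiki-0000000000', 'TH_wiki-0000000002']
```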
+ ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "bbbfdbb3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Reading 1 files\n" + ] + } + ], + "source": [ + "#Load .jsonl dataset\n", + "input_dataset = DocumentDataset.read_json(dataset_dir, backend='cudf')\n", + "\n", + "#Load exact deduplicate result and extract list of duplicated document ID\n", + "exact_duplicates = DocumentDataset.read_parquet(os.path.join(exact_dedup_output_dir,\"_exact_duplicates.parquet\"), backend='cudf')\n", + "exact_docs_to_remove = exact_duplicates.df.map_partitions(\n", + " lambda x: x[x._hashes.duplicated(keep=\"first\")]\n", + ")\n", + "\n", + "#Remove the duplicated document from input dataset\n", + "result = input_dataset.df[\n", + " ~input_dataset.df[input_id_field].isin(exact_docs_to_remove[input_id_field].compute())\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "8b97567d", + "metadata": {}, + "source": [ + "For result of fuzzy deduplication, we need to first reconstructed document ID by combining `dataset_id` and `doc_id`, then use the reconstructed `ID` for removal" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "513cf7a0", + "metadata": {}, + "outputs": [], + "source": [ + "#List of id_prefix used in Add ID\n", + "base_ids = [id_prefix]\n", + "\n", + "#Obtain a mapping between `dataset_id` and `id_prefix`\n", + "df = cudf.DataFrame()\n", + "df['base_id'] = [base_id for base_id in base_ids]\n", + "df['dataset_id'] = df['base_id'].hash_values()\n", + "df_pd = df.to_pandas()\n", + "mapping = {\n", + " hashed_id: base_id\n", + " for base_id, hashed_id in zip(df_pd['base_id'], df_pd['dataset_id'])\n", + "}\n", + "\n", + "#Load result of fuzzy deduplication\n", + "fuzzy_duplicates = pd.read_parquet(connected_component_output_path)\n", + "#Reconstruct the original document ID\n", + "fuzzy_duplicates['id']=fuzzy_duplicates.apply(lambda x: f\"{mapping[x['dataset_id']]}-{x['doc_id']:010d}\", axis=1)\n", + "#Generate list of near duplicate document ID\n", + "fuzzy_docs_to_remove = fuzzy_duplicates.drop_duplicates(subset=['group'], keep='first')" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "dc7d647c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing to disk complete for 1 partitions\n" + ] + } + ], + "source": [ + "#Remove near duplicates\n", + "result = result[~result[input_id_field].isin(fuzzy_docs_to_remove[input_id_field])]\n", + "\n", + "#Save final result to local\n", + "write_to_disk(result, dudped_output_dir, output_type=\"parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "b47a967f", + "metadata": {}, + "source": [ + "Verify the result of duplicate removal. 
We can see that the number of documents in the resulting dataset is smaller than in the original dataset (length = 161748)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "id": "5e8097b1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Length of duplicate removed dataset:156257\n"
+ ]
+ }
+ ],
+ "source": [
+ "res = pd.read_parquet(dudped_output_dir)\n",
+ "print(f\"Length of duplicate removed dataset:{len(res)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "85caf66f",
+ "metadata": {},
+ "source": [
+ "Close the GPU Dask Cluster"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "cd91f5fe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "client.cluster.close()\n",
+ "client.shutdown()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1c6cee97",
+ "metadata": {},
+ "source": [
+ "## 7. Heuristic Filtering\n",
+ "\n",
+ "In this section, we will apply multiple heuristic filters to the dataset and record, for each filter, the heuristic scores of the documents as well as the documents it removes. Each heuristic filter calculates a quality score based on user-defined heuristics/algorithms and classifies a document as high quality or low quality by comparing the score against a user-defined threshold.\n",
+ "\n",
+ "Sample lists of heuristic filters can be found in `./config/`:\n",
+ "- `heuristic_filter_en.yaml`: Sample heuristic filter list for English datasets\n",
+ "- `heuristic_filter_non-en.yaml`: Sample heuristic filter list for non-English datasets\n",
+ "- `heuristic_filter_code.yaml`: Sample heuristic filter list for code datasets\n",
+ "Please adjust the sample lists (e.g. remove/add filters or change filter thresholds) based on your own use case. In this example, `heuristic_filter_non-en.yaml` will be used.\n",
+ "\n",
+ "For the detailed implementation and description of each heuristic filter, please refer to `./NeMo-Curator/nemo_curator/filters/heuristic_filter.py`. To implement customized heuristic filters, users should follow the sample implementations, write their own filters, and update the .yaml files accordingly.\n",
+ "\n",
+ "To analyze the impact of each filter on the dataset, users should set `log_score: True` for the filters in the corresponding config .yaml file. This will output the quality scores to a separate .txt file for each individual filter. With the quality scores and filter thresholds, users can compute quality score distributions and other statistics to assess the effectiveness of each filter.\n",
+ "\n",
+ "In this example, in order to get a comprehensive output for each filter, we iterate through every filter using a for loop and save the intermediate results. This process involves extensive I/O operations and is less efficient. Alternatively, after loading the input dataset and the filter pipeline, users can simply call `filter_pipeline(dataset)` to obtain the final filtered result."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "id": "1ddff58c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nemo_curator.utils.config_utils import build_filter_pipeline\n",
+ "from nemo_curator import Score, Filter, ScoreFilter\n",
+ "from nemo_curator.utils.file_utils import get_batched_files,expand_outdir_and_mkdir"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a728a161",
+ "metadata": {},
+ "source": [
+ "**[Optional]** The following cell suppresses warnings from Dask."
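Tying back to the customization note above, a custom filter is, in rough terms, a class with a scoring method and a keep/discard decision. The sketch below is a hedged example only: the `DocumentFilter` base class and the `keep_document(self, score)` signature follow the filters shipped under `nemo_curator/filters/`, while the digit-ratio heuristic, the class name, and the threshold are invented for illustration.

```python
# Hypothetical custom heuristic filter, modeled on the bundled filters.
from nemo_curator.filters.doc_filter import DocumentFilter


class DigitRatioFilter(DocumentFilter):
    """Flag documents whose character stream is dominated by digits."""

    def __init__(self, max_digit_ratio=0.3):
        super().__init__()
        self._max_digit_ratio = max_digit_ratio
        self._name = "digit_ratio"

    def score_document(self, text):
        if not text:
            return 1.0  # treat empty documents as low quality
        return sum(ch.isdigit() for ch in text) / len(text)

    def keep_document(self, score):
        return score <= self._max_digit_ratio
```

Registered under its module path in a config .yaml (with `log_score: True` and `params`, mirroring the entries in `heuristic_filter_non-en.yaml`), such a filter would then be picked up by `build_filter_pipeline` like any bundled filter.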
+ ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "e5114945", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "# Disable the metadata warning\n", + "warnings.filterwarnings(\"ignore\",module=\"dask.dataframe.core\")" + ] + }, + { + "cell_type": "markdown", + "id": "6243a7cb", + "metadata": {}, + "source": [ + "Create a CPU Dask Cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "fa752ded", + "metadata": {}, + "outputs": [], + "source": [ + "cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", + "client = Client(cluster)" + ] + }, + { + "cell_type": "markdown", + "id": "c3dda877", + "metadata": {}, + "source": [ + "Define some helper functions" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "a8abf841", + "metadata": {}, + "outputs": [], + "source": [ + "def get_dataframe_complement(original_df, filtered_df):\n", + " def partition_complement(part_original_df, partition_info=None):\n", + " if not partition_info:\n", + " return part_original_df\n", + " part_filtered_df = filtered_df.get_partition(partition_info[\"number\"])\n", + " complement_mask = ~part_original_df.index.isin(part_filtered_df.index.persist())\n", + " complement_df = part_original_df[complement_mask]\n", + " return complement_df\n", + "\n", + " return original_df.map_partitions(partition_complement)\n", + "\n", + "def write_scores(df, output_dir):\n", + " for column in df.columns:\n", + " output_path = os.path.join(output_dir, f\"{column}.txt\")\n", + " df[column].to_csv(output_path, single_file=True, encoding=\"utf-8\", header=False, index=False, mode=\"a\")\n", + "\n", + "def get_score_fields(pipeline):\n", + " score_fields = []\n", + " for nc_module in pipeline.modules:\n", + " if isinstance(nc_module, Score) or isinstance(nc_module, ScoreFilter):\n", + " if nc_module.score_field:\n", + " score_fields.append(nc_module.score_field)\n", + " return score_fields" + ] + }, + { + "cell_type": "markdown", + "id": "04e6b0f8", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "55e43a6c", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "HF_input_data_dir = dudped_output_dir\n", + "input_file_type = 'parquet'\n", + "batch_size = 1\n", + "\n", + "#Output\n", + "HF_base_output_path = os.path.join(data_dir,'heuristic_filtering')\n", + "kept_document_dir = os.path.join(HF_base_output_path,'data','hq.parquet')\n", + "removed_document_dir = os.path.join(HF_base_output_path,'data','lq.parquet')\n", + "output_document_score_dir = os.path.join(HF_base_output_path,'data','score')\n", + "output_file_type = 'parquet'\n", + "\n", + "#Relevant parameters\n", + "filter_config_file = './config/heuristic_filter_non-en.yaml'\n", + "input_id_field = 'id'\n", + "\n", + "#Set to False if do not want to save intermediate results\n", + "is_cache = True\n", + "\n", + "!mkdir -p {kept_document_dir}\n", + "!mkdir -p {removed_document_dir}\n", + "!mkdir -p {output_document_score_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "4c5f6c8e", + "metadata": {}, + "source": [ + "Run heuristic filtering" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "f6f50332", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Saving data for symbol_to_word\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for numbers_ratio\n", + "Writing to disk 
complete for 1 partitions\n", + "Saving data for urls_ratio\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for word_count\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeating_top_2grams\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeating_top_3grams\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeating_top_4grams\n", + "Writing to disk complete for 1 partitions\n", + "Writing to disk complete for 1 partitions\n", + "Time taken for Heuristic filtering: 729.7436628341675 s\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "\n", + "#Load filters from config\n", + "filter_pipeline = build_filter_pipeline(filter_config_file)\n", + "score_fields = get_score_fields(filter_pipeline)\n", + "\n", + "# Load dataset\n", + "dataset = load_dataset(HF_input_data_dir,file_type='parquet')\n", + "\n", + "\n", + "# Iterate through filters. For each filter, the low quality document will be removed from the dataset and output to corresponding folder for analysis\n", + "# Output of previous filter will be input of the next filter\n", + "if is_cache:\n", + " curr_dataset = prev_dataset = dataset\n", + " for filter_module in filter_pipeline.modules:\n", + " #Apply filter\n", + " curr_dataset = filter_module(curr_dataset).persist()\n", + "\n", + " #Output filtered document\n", + " print(f\"Saving data for {filter_module.filter_obj._name}\")\n", + " removed_df = get_dataframe_complement(prev_dataset.df, curr_dataset.df)\n", + " removed_filter_dir = os.path.join(removed_document_dir, filter_module.filter_obj._name)\n", + " expand_outdir_and_mkdir(removed_filter_dir)\n", + " write_to_disk(removed_df, removed_filter_dir, write_to_filename=True, output_type=output_file_type)\n", + " prev_dataset = curr_dataset\n", + " filtered_dataset = curr_dataset\n", + "else:\n", + " filtered_dataset = filter_pipeline(dataset)\n", + "\n", + "# Write scores of retained doucment to separate directory\n", + "output_df = filtered_dataset.df[[input_id_field, *score_fields]]\n", + "write_scores(output_df, output_document_score_dir)\n", + "\n", + "# Remove scores from dataset df\n", + "filtered_dataset = DocumentDataset(filtered_dataset.df.drop(columns=score_fields))\n", + "\n", + "# Output filtered dataset\n", + "write_to_disk(filtered_dataset.df, kept_document_dir, write_to_filename=True, output_type=output_file_type)\n", + "\n", + "print(f\"Time taken for Heuristic filtering: {time.time()-t0} s\")" + ] + }, + { + "cell_type": "markdown", + "id": "b19731f5", + "metadata": {}, + "source": [ + "Verify the result." 
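In addition to verifying the kept documents below, the logged scores can be inspected to sanity-check thresholds. A rough sketch, assuming the score files were written as above and that `word_count` is one of the logged score fields (adjust the file name to whatever columns appear in the score directory):

```python
# Exploratory sketch for score analysis; the exact score file names depend on
# the score fields written above, so treat "word_count.txt" as an assumption.
import os
import pandas as pd

score_file = os.path.join(output_document_score_dir, "word_count.txt")
scores = pd.read_csv(score_file, header=None, names=["word_count"])

print(scores["word_count"].describe())      # distribution of the logged scores
print((scores["word_count"] < 100).mean())  # share a stricter 100-word minimum would drop
```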
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f945362", + "metadata": {}, + "outputs": [], + "source": [ + "res = pd.read_parquet(kept_document_dir)\n", + "print(f\"Dataset size after heuristic filtering:{len(res)}\")\n", + "res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "cb52fe04", + "metadata": {}, + "source": [ + "Close the CPU Dask Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "aaa9823a", + "metadata": {}, + "outputs": [], + "source": [ + "client.cluster.close()\n", + "client.shutdown()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94f6e74e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 462a1a39bf59435afa99de171cf56ea21ebba56a Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 19 Apr 2024 16:22:18 -0700 Subject: [PATCH 02/34] Fix metadata inference with pandas and dask (#35) * Fix metadata inference with pandas and dask Signed-off-by: Ryan Wolf * Fix datatypes for task decontamination Signed-off-by: Ryan Wolf * Use targetted import Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- nemo_curator/modules/filter.py | 9 ++++++++- nemo_curator/modules/task.py | 12 +++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/nemo_curator/modules/filter.py b/nemo_curator/modules/filter.py index 07f8cb634..7053f26fe 100644 --- a/nemo_curator/modules/filter.py +++ b/nemo_curator/modules/filter.py @@ -11,12 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import pandas as pd +from dask.dataframe.extensions import make_array_nonempty from dask.typing import no_default from nemo_curator.datasets import DocumentDataset from nemo_curator.utils.module_utils import is_batched +# Override so that pd.NA is not passed during the metadata inference +make_array_nonempty.register( + pd.StringDtype, + lambda x: pd.array(["a", "b"], dtype=x), +) + class Score: def __init__(self, score_fn, score_field, text_field="text", score_type=None): diff --git a/nemo_curator/modules/task.py b/nemo_curator/modules/task.py index a7d9ae722..2571b6a8c 100644 --- a/nemo_curator/modules/task.py +++ b/nemo_curator/modules/task.py @@ -302,6 +302,8 @@ def _threshold_ngram_count(self, matched_ngrams: dict) -> set: return filtered_ngrams def _remove_ngrams_partition(self, partition, task_ngrams, ngrams_freq_sorted): + text_type = partition[self.text_field].dtype + document_fn = partial( self._remove_ngrams, task_ngrams=task_ngrams, @@ -318,7 +320,15 @@ def _remove_ngrams_partition(self, partition, task_ngrams, ngrams_freq_sorted): partition[self.text_field] = split_text filtered_partition = partition[valid_documents_mask] - return filtered_partition.explode(self.text_field, ignore_index=True) + exploded_partition = filtered_partition.explode( + self.text_field, ignore_index=True + ) + # After exploding, the string datatype can become an "object" type + exploded_partition[self.text_field] = exploded_partition[ + self.text_field + ].astype(text_type) + + return exploded_partition def _remove_ngrams(self, document, task_ngrams, ngrams_freq_sorted): """ From f2970765be4c0a68d11ea126f357668f7096543f Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 22 Apr 2024 13:26:37 -0700 Subject: [PATCH 03/34] Disable PyTorch Compile Multiprocessing (#34) * Move tokenizer import Signed-off-by: Ryan Wolf * Reduce inductor threads Signed-off-by: Ryan Wolf * Change env int to string Signed-off-by: Ryan Wolf * Change location of env var Signed-off-by: Ryan Wolf * Add comment linking issue Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- nemo_curator/filters/code.py | 3 ++- nemo_curator/modules/__init__.py | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/nemo_curator/filters/code.py b/nemo_curator/filters/code.py index 9a209ec47..56e4ea69a 100644 --- a/nemo_curator/filters/code.py +++ b/nemo_curator/filters/code.py @@ -18,7 +18,6 @@ import numpy as np from bs4 import BeautifulSoup from comment_parser import comment_parser -from nemo.collections.common.tokenizers import SentencePieceTokenizer from nemo_curator.filters.doc_filter import DocumentFilter, import_filter from nemo_curator.utils.constants import regex_alpha, regex_alphanum @@ -104,6 +103,8 @@ def keep_document(self, score): class TokenizerFertilityFilter(DocumentFilter): def __init__(self, path_to_tokenizer=None, min_char_to_token_ratio=2.5): + from nemo.collections.common.tokenizers import SentencePieceTokenizer + if path_to_tokenizer is None: raise ValueError( "Must provide a valid path to a SentencePiece " "tokenizer" diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index d845441f3..d7c099803 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os + +# Disables multiprocessing in torch.compile calls. 
+# Without this, Dasks multiprocessing combined with PyTorch's +# gives errors like "daemonic processes are not allowed to have children" +# See https://github.com/NVIDIA/NeMo-Curator/issues/31 +os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" from .add_id import AddId from .exact_dedup import ExactDuplicates From dbe76060c5b0b77880c8131112c001f35d7d32c1 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 23 Apr 2024 11:34:56 -0700 Subject: [PATCH 04/34] Improve speed of AddId module (#36) * Add fast id method Signed-off-by: Ryan Wolf * Add type conversion Signed-off-by: Ryan Wolf * Fix off by one errors in tests Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- nemo_curator/modules/add_id.py | 45 ++++++++++++++++++++++++--- nemo_curator/scripts/add_id.py | 6 ++-- nemo_curator/utils/module_utils.py | 5 +++ tests/test_add_id.py | 50 ++++++++++++++++++++++++++---- 4 files changed, 94 insertions(+), 12 deletions(-) diff --git a/nemo_curator/modules/add_id.py b/nemo_curator/modules/add_id.py index e8f30739b..83da7bd25 100644 --- a/nemo_curator/modules/add_id.py +++ b/nemo_curator/modules/add_id.py @@ -12,22 +12,58 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + import dask.dataframe as dd import numpy as np from dask import delayed from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.module_utils import count_digits class AddId: - def __init__(self, id_field, id_prefix="doc_id", start_index=0) -> None: + def __init__( + self, id_field, id_prefix: str = "doc_id", start_index: Optional[int] = None + ) -> None: self.id_field = id_field self.id_prefix = id_prefix self.start_index = start_index def __call__(self, dataset: DocumentDataset) -> DocumentDataset: + if self.start_index is None: + return self._add_id_fast(dataset) + else: + return self._add_id_ordered(dataset) + + def _add_id_fast(self, dataset: DocumentDataset) -> DocumentDataset: + meta = dataset.df.dtypes.to_dict() + meta[self.id_field] = "string" + + partition_zero_padding = count_digits(dataset.df.npartitions) + id_df = dataset.df.map_partitions( + self._add_id_fast_partition, + partition_zero_padding, + meta=meta, + ) + + return DocumentDataset(id_df) + + def _add_id_fast_partition(self, partition, global_padding, partition_info=None): + local_padding = count_digits(len(partition)) + global_id = partition_info["number"] + + id_column = [ + f"{self.id_prefix}-{local_id:0{local_padding}d}{global_id:0{global_padding}d}" + for local_id in range(len(partition)) + ] + partition[self.id_field] = id_column + + return partition + + def _add_id_ordered(self, dataset: DocumentDataset) -> DocumentDataset: original_meta = dataset.df.dtypes.to_dict() - original_meta[self.id_field] = "object" + original_meta[self.id_field] = "string" delayed_dataset = dataset.df.to_delayed() parition_lengths = [0] @@ -38,7 +74,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset: delayed_id_dataset = [] for i, partition in enumerate(delayed_dataset): delayed_id_dataset.append( - delayed(self._add_id_to_partition)(partition, lower_id_bounds[i]) + delayed(self._add_id_ordered_partition)(partition, lower_id_bounds[i]) ) id_dataset = DocumentDataset( @@ -47,11 +83,12 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset: return id_dataset - def _add_id_to_partition(self, partition, partition_start_id): + def _add_id_ordered_partition(self, partition, partition_start_id): id_column = [ 
f"{self.id_prefix}-{int(i + self.start_index):010d}" for i in range(partition_start_id, len(partition) + partition_start_id) ] partition[self.id_field] = id_column + partition[self.id_field] = partition[self.id_field].astype("string") return partition diff --git a/nemo_curator/scripts/add_id.py b/nemo_curator/scripts/add_id.py index 4e49663aa..3e91e8062 100644 --- a/nemo_curator/scripts/add_id.py +++ b/nemo_curator/scripts/add_id.py @@ -79,8 +79,10 @@ def attach_args( parser.add_argument( "--starting-index", type=int, - default=0, - help="Starting index from which to start indexing the documents", + default=None, + help="If supplied, determines the starting index from which to start " + "indexing the documents. By default, it is unspecified, and uses an id" + " scheme that is fast to calculate and is not guaranteed to be ordered.", ) parser.add_argument( "--output-data-dir", diff --git a/nemo_curator/utils/module_utils.py b/nemo_curator/utils/module_utils.py index dc4a693d2..388a949f6 100644 --- a/nemo_curator/utils/module_utils.py +++ b/nemo_curator/utils/module_utils.py @@ -11,7 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import math def is_batched(function): return hasattr(function, "batched") and function.batched + + +def count_digits(num): + return math.floor(math.log10(num)) + 1 diff --git a/tests/test_add_id.py b/tests/test_add_id.py index 458b4868d..42a8575e5 100644 --- a/tests/test_add_id.py +++ b/tests/test_add_id.py @@ -16,7 +16,7 @@ import pandas as pd import pytest -import nemo_curator +import nemo_curator as nc from nemo_curator.datasets import DocumentDataset @@ -41,10 +41,10 @@ def two_partition_dataset(): ) -class TestPrepareTaskData: +class TestAddId: def test_basic_id(self, single_partition_dataset): id_field = "id" - add_id = nemo_curator.AddId(id_field) + add_id = nc.AddId(id_field, start_index=0) id_dataset = add_id(single_partition_dataset) actual_ids = id_dataset.df[id_field].compute() expected_ids = pd.Series( @@ -63,7 +63,7 @@ def test_basic_id(self, single_partition_dataset): def test_two_partitions(self, two_partition_dataset): id_field = "id" - add_id = nemo_curator.AddId(id_field) + add_id = nc.AddId(id_field, start_index=0) id_dataset = add_id(two_partition_dataset) actual_ids = id_dataset.df[id_field].compute() expected_ids = pd.Series( @@ -83,7 +83,7 @@ def test_two_partitions(self, two_partition_dataset): def test_id_prefix(self, two_partition_dataset): id_field = "id" id_prefix = "my_id" - add_id = nemo_curator.AddId(id_field, id_prefix=id_prefix) + add_id = nc.AddId(id_field, id_prefix=id_prefix, start_index=0) id_dataset = add_id(two_partition_dataset) actual_ids = id_dataset.df[id_field].compute() expected_ids = pd.Series( @@ -103,7 +103,7 @@ def test_id_prefix(self, two_partition_dataset): def test_start_index(self, two_partition_dataset): id_field = "id" start_index = 13 - add_id = nemo_curator.AddId(id_field, start_index=start_index) + add_id = nc.AddId(id_field, start_index=start_index) id_dataset = add_id(two_partition_dataset) actual_ids = id_dataset.df[id_field].compute() expected_ids = pd.Series( @@ -119,3 +119,41 @@ def test_start_index(self, two_partition_dataset): assert all( expected_ids == actual_ids ), f"Expected: {expected_ids}, got: {actual_ids}" + + def test_fast_id_single_partition(self, single_partition_dataset): + id_field = "id" + add_id = nc.AddId(id_field) + id_dataset = 
add_id(single_partition_dataset) + actual_ids = id_dataset.df[id_field].compute() + expected_ids = pd.Series( + [ + "doc_id-00", + "doc_id-10", + "doc_id-20", + "doc_id-30", + "doc_id-40", + ] + ) + + assert all( + expected_ids == actual_ids + ), f"Expected: {expected_ids}, got: {actual_ids}" + + def test_fast_id_two_partitions(self, two_partition_dataset): + id_field = "id" + add_id = nc.AddId(id_field) + id_dataset = add_id(two_partition_dataset) + actual_ids = id_dataset.df[id_field].compute() + expected_ids = pd.Series( + [ + "doc_id-00", + "doc_id-10", + "doc_id-20", + "doc_id-01", + "doc_id-11", + ] + ) + + assert all( + expected_ids == actual_ids + ), f"Expected: {expected_ids}, got: {actual_ids}" From 417e874bc42a32f80f77c58d8e792e93c7ef49f5 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Tue, 23 Apr 2024 13:40:00 -0700 Subject: [PATCH 05/34] Make GPU dependencies optional (#27) * Move GPU imports and make them optional Signed-off-by: Ayush Dattagupta * Move gpu dependencies to a seperate install Signed-off-by: Ayush Dattagupta * Remove unused import Signed-off-by: Ayush Dattagupta * Switch to placeholder import that raises on usage Signed-off-by: Ayush Dattagupta * Remove deprecated utils usage Signed-off-by: Ayush Dattagupta * Add cuML attribution Signed-off-by: Ayush Dattagupta * Safe import tests, improve install instruction, update gha workflow Signed-off-by: Ayush Dattagupta * Fix pytests due to loc bug Signed-off-by: Ayush Dattagupta * update install instructions Signed-off-by: Ayush Dattagupta * Raise on non module-not-found errors, update logging Signed-off-by: Ayush Dattagupta * Update logging to not change root logger Signed-off-by: Ayush Dattagupta --------- Signed-off-by: Ayush Dattagupta Signed-off-by: Nicole Luo --- .github/workflows/test.yml | 5 +- README.md | 14 +- nemo_curator/datasets/doc_dataset.py | 6 +- nemo_curator/gpu_deduplication/utils.py | 76 ---- nemo_curator/modules/__init__.py | 7 +- nemo_curator/modules/exact_dedup.py | 3 +- nemo_curator/modules/fuzzy_dedup.py | 15 +- nemo_curator/scripts/compute_minhashes.py | 9 +- nemo_curator/scripts/connected_components.py | 7 +- nemo_curator/scripts/find_exact_duplicates.py | 3 +- nemo_curator/scripts/jaccard_compute.py | 8 +- nemo_curator/scripts/jaccard_shuffle.py | 9 +- nemo_curator/scripts/map_buckets.py | 9 +- nemo_curator/scripts/minhash_lsh.py | 3 +- nemo_curator/utils/distributed_utils.py | 46 ++- nemo_curator/utils/gpu_utils.py | 3 + nemo_curator/utils/import_utils.py | 384 ++++++++++++++++++ setup.py | 13 +- tests/test_filters.py | 8 +- tests/test_fuzzy_dedup.py | 6 +- 20 files changed, 493 insertions(+), 141 deletions(-) create mode 100644 nemo_curator/utils/import_utils.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d179a2a57..baa968f47 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,9 +40,8 @@ jobs: # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94 run: | pip install wheel cython - pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com . + pip install --no-cache-dir . 
pip install pytest - name: Run tests - # TODO: Remove env variable when gpu dependencies are optional run: | - RAPIDS_NO_INITIALIZE=1 python -m pytest -v --cpu + python -m pytest -v --cpu diff --git a/README.md b/README.md index eb8c37abe..a17a573eb 100644 --- a/README.md +++ b/README.md @@ -37,12 +37,20 @@ These modules are designed to be flexible and allow for reordering with few exce ## Installation -NeMo Curator currently requires Python 3.10 and a GPU with CUDA 12 or above installed in order to be used. +NeMo Curator currently requires Python 3.10 and the GPU accelerated modules require CUDA 12 or above installed in order to be used. -NeMo Curator can be installed manually by cloning the repository and installing as follows: +NeMo Curator can be installed manually by cloning the repository and installing as follows - + +For CPU only modules: +``` +pip install . ``` -pip install --extra-index-url https://pypi.nvidia.com . + +For CPU + CUDA accelerated modules ``` +pip install --extra-index-url https://pypi.nvidia.com ".[cuda12x]" +``` + ### NeMo Framework Container NeMo Curator is available in the [NeMo Framework Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo). The NeMo Framework Container provides an end-to-end platform for development of custom generative AI models anywhere. The latest release of NeMo Curator comes preinstalled in the container. diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py index af45f290c..37592b188 100644 --- a/nemo_curator/datasets/doc_dataset.py +++ b/nemo_curator/datasets/doc_dataset.py @@ -13,7 +13,6 @@ # limitations under the License. import dask.dataframe as dd -import dask_cudf from nemo_curator.utils.distributed_utils import read_data, write_to_disk from nemo_curator.utils.file_utils import get_all_files_paths_under @@ -182,10 +181,7 @@ def _read_json_or_parquet( ) dfs.append(df) - if backend == "cudf": - raw_data = dask_cudf.concat(dfs, ignore_unknown_divisions=True) - else: - raw_data = dd.concat(dfs, ignore_unknown_divisions=True) + raw_data = dd.concat(dfs, ignore_unknown_divisions=True) elif isinstance(input_files, str): # Single file diff --git a/nemo_curator/gpu_deduplication/utils.py b/nemo_curator/gpu_deduplication/utils.py index ed69477be..f6faefe77 100644 --- a/nemo_curator/gpu_deduplication/utils.py +++ b/nemo_curator/gpu_deduplication/utils.py @@ -13,84 +13,8 @@ # limitations under the License. 
import argparse -import logging -import os -import socket -from contextlib import nullcontext from time import time -import cudf -from dask_cuda import LocalCUDACluster -from distributed import Client, performance_report - - -def create_logger(rank, log_file, name="logger", log_level=logging.INFO): - # Create the logger - logger = logging.getLogger(name) - logger.setLevel(log_level) - - myhost = socket.gethostname() - - extra = {"host": myhost, "rank": rank} - formatter = logging.Formatter( - "%(asctime)s | %(host)s | Rank %(rank)s | %(message)s" - ) - - # File handler for output - file_handler = logging.FileHandler(log_file, mode="a") - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - logger = logging.LoggerAdapter(logger, extra) - - return logger - - -# TODO: Remove below to use nemo_curator.distributed_utils.get_client -def get_client(args) -> Client: - if args.scheduler_address: - if args.scheduler_file: - raise ValueError( - "Only one of scheduler_address or scheduler_file can be provided" - ) - else: - return Client(address=args.scheduler_address, timeout="30s") - elif args.scheduler_file: - return Client(scheduler_file=args.scheduler_file, timeout="30s") - else: - extra_kwargs = ( - { - "enable_tcp_over_ucx": True, - "enable_nvlink": True, - "enable_infiniband": False, - "enable_rdmacm": False, - } - if args.nvlink_only and args.protocol == "ucx" - else {} - ) - - cluster = LocalCUDACluster( - rmm_pool_size=args.rmm_pool_size, - protocol=args.protocol, - rmm_async=True, - **extra_kwargs, - ) - return Client(cluster) - - -def performance_report_if(path=None, report_name="dask-profile.html"): - if path is not None: - return performance_report(os.path.join(path, report_name)) - else: - return nullcontext() - - -# TODO: Remove below to use nemo_curator.distributed_utils._enable_spilling -def enable_spilling(): - """ - Enables spilling to host memory for cudf - """ - cudf.set_option("spill", True) - def get_num_workers(client): """ diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index d7c099803..434ebecf4 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -19,14 +19,19 @@ # See https://github.com/NVIDIA/NeMo-Curator/issues/31 os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" +from nemo_curator.utils.import_utils import gpu_only_import_from + from .add_id import AddId from .exact_dedup import ExactDuplicates from .filter import Filter, Score, ScoreFilter -from .fuzzy_dedup import LSH, MinHash from .meta import Sequential from .modify import Modify from .task import TaskDecontamination +# GPU packages +LSH = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "LSH") +MinHash = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "MinHash") + # Pytorch related imports must come after all imports that require cugraph, # because of context cleanup issues b/w pytorch and cugraph # See this issue: https://github.com/rapidsai/cugraph/issues/2718 diff --git a/nemo_curator/modules/exact_dedup.py b/nemo_curator/modules/exact_dedup.py index 5d960ac6e..2831f516f 100644 --- a/nemo_curator/modules/exact_dedup.py +++ b/nemo_curator/modules/exact_dedup.py @@ -28,7 +28,8 @@ from nemo_curator._compat import DASK_P2P_ERROR from nemo_curator.datasets import DocumentDataset -from nemo_curator.gpu_deduplication.utils import create_logger, performance_report_if +from nemo_curator.log import create_logger +from nemo_curator.utils.distributed_utils import performance_report_if from nemo_curator.utils.gpu_utils 
import is_cudf_type diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index 3b0576058..b51499678 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -22,12 +22,12 @@ from typing import List, Tuple, Union import cudf -import cugraph import cugraph.dask as dcg import cugraph.dask.comms.comms as Comms import cupy as cp import dask_cudf import numpy as np +from cugraph import MultiGraph from dask import dataframe as dd from dask.dataframe.shuffle import shuffle as dd_shuffle from dask.utils import M @@ -39,12 +39,13 @@ filter_text_rows_by_bucket_batch, merge_left_to_shuffled_right, ) -from nemo_curator.gpu_deduplication.utils import create_logger, performance_report_if -from nemo_curator.utils.distributed_utils import get_current_client, get_num_workers -from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import ( - convert_str_id_to_int, - int_ids_to_str, +from nemo_curator.log import create_logger +from nemo_curator.utils.distributed_utils import ( + get_current_client, + get_num_workers, + performance_report_if, ) +from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import int_ids_to_str from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( aggregated_anchor_docs_with_bk_read, get_restart_offsets, @@ -1120,7 +1121,7 @@ def _run_connected_components( df = df[[self.left_id, self.right_id]].astype(np.int64) df = dask_cudf.concat([df, self_edge_df]) - G = cugraph.MultiGraph(directed=False) + G = MultiGraph(directed=False) G.from_dask_cudf_edgelist( df, source=self.left_id, destination=self.right_id, renumber=False ) diff --git a/nemo_curator/scripts/compute_minhashes.py b/nemo_curator/scripts/compute_minhashes.py index c7a7e68b2..044653ceb 100644 --- a/nemo_curator/scripts/compute_minhashes.py +++ b/nemo_curator/scripts/compute_minhashes.py @@ -18,12 +18,13 @@ from nemo_curator import MinHash from nemo_curator.datasets import DocumentDataset from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep -from nemo_curator.gpu_deduplication.utils import ( - create_logger, - parse_nc_args, +from nemo_curator.gpu_deduplication.utils import parse_nc_args +from nemo_curator.log import create_logger +from nemo_curator.utils.distributed_utils import ( + get_client, performance_report_if, + read_data, ) -from nemo_curator.utils.distributed_utils import get_client, read_data from nemo_curator.utils.file_utils import get_all_files_paths_under diff --git a/nemo_curator/scripts/connected_components.py b/nemo_curator/scripts/connected_components.py index 1ab1282af..c04f0349d 100644 --- a/nemo_curator/scripts/connected_components.py +++ b/nemo_curator/scripts/connected_components.py @@ -15,7 +15,7 @@ import os import time -from nemo_curator.gpu_deduplication.utils import enable_spilling, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.modules.fuzzy_dedup import ConnectedComponents from nemo_curator.utils.distributed_utils import get_client @@ -32,9 +32,10 @@ def main(args): st = time.time() output_path = os.path.join(args.output_dir, "connected_components.parquet") args.set_torch_to_use_rmm = False + args.enable_spilling = True + client = get_client(args, cluster_type="gpu") - enable_spilling() - client.run(enable_spilling) + components_stage = ConnectedComponents( cache_dir=args.cache_dir, jaccard_pairs_path=args.jaccard_pairs_path, diff --git a/nemo_curator/scripts/find_exact_duplicates.py b/nemo_curator/scripts/find_exact_duplicates.py index 7da01ea8e..16173861d 
100644 --- a/nemo_curator/scripts/find_exact_duplicates.py +++ b/nemo_curator/scripts/find_exact_duplicates.py @@ -19,7 +19,8 @@ from nemo_curator.datasets import DocumentDataset from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep -from nemo_curator.gpu_deduplication.utils import create_logger, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args +from nemo_curator.log import create_logger from nemo_curator.modules import ExactDuplicates from nemo_curator.utils.distributed_utils import get_client, read_data from nemo_curator.utils.file_utils import get_all_files_paths_under diff --git a/nemo_curator/scripts/jaccard_compute.py b/nemo_curator/scripts/jaccard_compute.py index f59157164..d16e95654 100644 --- a/nemo_curator/scripts/jaccard_compute.py +++ b/nemo_curator/scripts/jaccard_compute.py @@ -15,13 +15,13 @@ import os import time -from nemo_curator.gpu_deduplication.utils import enable_spilling, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity from nemo_curator.utils.distributed_utils import get_client, get_num_workers def main(args): - description = """Computes the Jaccard similarity between document pairs + """Computes the Jaccard similarity between document pairs from partitioned parquet dataset. Result is a parquet dataset consiting of document id pair along with their Jaccard similarity score. """ @@ -30,9 +30,9 @@ def main(args): output_final_results_path = os.path.join( OUTPUT_PATH, "jaccard_similarity_results.parquet" ) + args.enable_spilling = True client = get_client(args, "gpu") - enable_spilling() - client.run(enable_spilling) + print(f"Num Workers = {get_num_workers(client)}", flush=True) print("Connected to dask cluster", flush=True) print("Running jaccard compute script", flush=True) diff --git a/nemo_curator/scripts/jaccard_shuffle.py b/nemo_curator/scripts/jaccard_shuffle.py index dc5d20f9b..c01935a61 100644 --- a/nemo_curator/scripts/jaccard_shuffle.py +++ b/nemo_curator/scripts/jaccard_shuffle.py @@ -15,12 +15,9 @@ import os import time -from nemo_curator.gpu_deduplication.utils import ( - get_client, - get_num_workers, - parse_nc_args, -) +from nemo_curator.gpu_deduplication.utils import get_num_workers, parse_nc_args from nemo_curator.modules.fuzzy_dedup import _Shuffle +from nemo_curator.utils.distributed_utils import get_client from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( get_text_ddf_from_json_path_with_blocksize, ) @@ -38,7 +35,7 @@ def main(args): OUTPUT_PATH = args.output_dir output_shuffled_docs_path = os.path.join(OUTPUT_PATH, "shuffled_docs.parquet") - client = get_client(args) + client = get_client(args, "gpu") client.run(func) print(f"Num Workers = {get_num_workers(client)}", flush=True) print("Connected to dask cluster", flush=True) diff --git a/nemo_curator/scripts/map_buckets.py b/nemo_curator/scripts/map_buckets.py index 522e4f417..9e3f71a51 100644 --- a/nemo_curator/scripts/map_buckets.py +++ b/nemo_curator/scripts/map_buckets.py @@ -15,12 +15,9 @@ import os import time -from nemo_curator.gpu_deduplication.utils import ( - get_client, - get_num_workers, - parse_nc_args, -) +from nemo_curator.gpu_deduplication.utils import get_num_workers, parse_nc_args from nemo_curator.modules.fuzzy_dedup import _MapBuckets +from nemo_curator.utils.distributed_utils import get_client from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( get_bucket_ddf_from_parquet_path, get_text_ddf_from_json_path_with_blocksize, 
@@ -157,7 +154,7 @@ def main(args): output_anchor_docs_with_bk_path = os.path.join( OUTPUT_PATH, "anchor_docs_with_bk.parquet" ) - client = get_client(args) + client = get_client(args, "gpu") print(f"Num Workers = {get_num_workers(client)}", flush=True) print("Connected to dask cluster", flush=True) print("Running jaccard map buckets script", flush=True) diff --git a/nemo_curator/scripts/minhash_lsh.py b/nemo_curator/scripts/minhash_lsh.py index fb2c6a90d..ec206dc10 100644 --- a/nemo_curator/scripts/minhash_lsh.py +++ b/nemo_curator/scripts/minhash_lsh.py @@ -24,7 +24,8 @@ from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( convert_str_id_to_int, ) -from nemo_curator.gpu_deduplication.utils import create_logger, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args +from nemo_curator.log import create_logger from nemo_curator.utils.distributed_utils import get_client diff --git a/nemo_curator/utils/distributed_utils.py b/nemo_curator/utils/distributed_utils.py index 71fa1cdca..2d7dc9213 100644 --- a/nemo_curator/utils/distributed_utils.py +++ b/nemo_curator/utils/distributed_utils.py @@ -11,20 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import os os.environ["RAPIDS_NO_INITIALIZE"] = "1" import warnings +from contextlib import nullcontext from pathlib import Path from typing import Union -import cudf import dask.dataframe as dd -import dask_cudf import pandas as pd -from dask.distributed import Client, LocalCluster, get_worker -from dask_cuda import LocalCUDACluster +from dask.distributed import Client, LocalCluster, get_worker, performance_report + +from nemo_curator.utils.gpu_utils import GPU_INSTALL_STRING, is_cudf_type +from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from + +cudf = gpu_only_import("cudf") +LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster") class DotDict: @@ -48,7 +53,6 @@ def start_dask_gpu_local_cluster(args) -> Client: GPUs present on the machine. """ - # Setting conservative defaults # which should work across most systems nvlink_only = getattr(args, "nvlink_only", False) @@ -166,6 +170,8 @@ def _enable_spilling(): i.e., computing on objects that occupy more memory than is available on the GPU. """ + import cudf + cudf.set_option("spill", True) @@ -265,6 +271,10 @@ def read_data( A Dask-cuDF or a Dask-pandas DataFrame. """ + if backend == "cudf": + # Try using cuDF. If not availible will throw an error. 
+ test_obj = cudf.Series + if file_type == "pickle": df = read_pandas_pickle(input_files[0], add_filename=add_filename) df = dd.from_pandas(df, npartitions=16) @@ -369,10 +379,12 @@ def single_partition_write_with_filename(df, output_file_dir, output_type="jsonl warnings.warn(f"Empty partition found") empty_partition = False - if isinstance(df, pd.DataFrame): - success_ser = pd.Series([empty_partition]) - else: + if is_cudf_type(df): + import cudf + success_ser = cudf.Series([empty_partition]) + else: + success_ser = pd.Series([empty_partition]) if empty_partition: filename = df.filename.iloc[0] @@ -425,10 +437,13 @@ def write_to_disk(df, output_file_dir, write_to_filename=False, output_type="jso ) if write_to_filename: - if isinstance(df, dd.DataFrame): - output_meta = pd.Series([True], dtype="bool") - else: + if is_cudf_type(df): + import cudf + output_meta = cudf.Series([True]) + else: + output_meta = pd.Series([True], dtype="bool") + os.makedirs(output_file_dir, exist_ok=True) output = df.map_partitions( single_partition_write_with_filename, @@ -440,7 +455,7 @@ def write_to_disk(df, output_file_dir, write_to_filename=False, output_type="jso output = output.compute() else: if output_type == "jsonl": - if isinstance(df, dask_cudf.DataFrame): + if is_cudf_type(df): # See open issue here: https://github.com/rapidsai/cudf/issues/15211 # df.to_json(output_file_dir, orient="records", lines=True, engine="cudf", force_ascii=False) df.to_json( @@ -521,3 +536,10 @@ def get_current_client(): return Client.current() except ValueError: return None + + +def performance_report_if(path=None, report_name="dask-profile.html"): + if path is not None: + return performance_report(os.path.join(path, report_name)) + else: + return nullcontext() diff --git a/nemo_curator/utils/gpu_utils.py b/nemo_curator/utils/gpu_utils.py index de1c23dfe..86ba888fc 100644 --- a/nemo_curator/utils/gpu_utils.py +++ b/nemo_curator/utils/gpu_utils.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +GPU_INSTALL_STRING = """Install GPU packages via `pip install --extra-index-url https://pypi.nvidia.com nemo_curator[cuda12x]` +or use `pip install --extra-index-url https://pypi.nvidia.com ".[cuda12x]"` if installing from source""" + def is_cudf_type(obj): """ diff --git a/nemo_curator/utils/import_utils.py b/nemo_curator/utils/import_utils.py new file mode 100644 index 000000000..ea78e4597 --- /dev/null +++ b/nemo_curator/utils/import_utils.py @@ -0,0 +1,384 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This file is adapted from cuML's safe_imports module: +# https://github.com/rapidsai/cuml/blob/e93166ea0dddfa8ef2f68c6335012af4420bc8ac/python/cuml/internals/safe_imports.py + + +import importlib +import logging +import traceback +from contextlib import contextmanager + +from nemo_curator.utils.gpu_utils import GPU_INSTALL_STRING + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(logging.StreamHandler()) + + +class UnavailableError(Exception): + """Error thrown if a symbol is unavailable due to an issue importing it""" + + +@contextmanager +def null_decorator(*args, **kwargs): + if len(kwargs) == 0 and len(args) == 1 and callable(args[0]): + return args[0] + else: + + def inner(func): + return func + + return inner + + +class UnavailableMeta(type): + """A metaclass for generating placeholder objects for unavailable symbols + + This metaclass allows errors to be deferred from import time to the time + that a symbol is actually used in order to streamline the usage of optional + dependencies. This is particularly useful for attempted imports of GPU-only + modules which will only be invoked if GPU-only functionality is + specifically used. + + If an attempt to import a symbol fails, this metaclass is used to generate + a class which stands in for that symbol. Any attempt to call the symbol + (instantiate the class) or access its attributes will throw an + UnavailableError exception. Furthermore, this class can be used in + e.g. isinstance checks, since it will (correctly) fail to match any + instance it is compared against. + + In addition to calls and attribute access, a number of dunder methods are + implemented so that other common usages of imported symbols (e.g. + arithmetic) throw an UnavailableError, but this is not guaranteed for + all possible uses. In such cases, other exception types (typically + TypeErrors) will be thrown instead. 
+ """ + + def __new__(meta, name, bases, dct): + if dct.get("_msg", None) is None: + dct["_msg"] = f"{name} could not be imported" + name = f"MISSING{name}" + return super(UnavailableMeta, meta).__new__(meta, name, bases, dct) + + def __call__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __getattr__(cls, name): + raise UnavailableError(cls._msg) + + def __eq__(cls, other): + raise UnavailableError(cls._msg) + + def __lt__(cls, other): + raise UnavailableError(cls._msg) + + def __gt__(cls, other): + raise UnavailableError(cls._msg) + + def __ne__(cls, other): + raise UnavailableError(cls._msg) + + def __abs__(cls, other): + raise UnavailableError(cls._msg) + + def __add__(cls, other): + raise UnavailableError(cls._msg) + + def __radd__(cls, other): + raise UnavailableError(cls._msg) + + def __iadd__(cls, other): + raise UnavailableError(cls._msg) + + def __floordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __rfloordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __ifloordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __lshift__(cls, other): + raise UnavailableError(cls._msg) + + def __rlshift__(cls, other): + raise UnavailableError(cls._msg) + + def __mul__(cls, other): + raise UnavailableError(cls._msg) + + def __rmul__(cls, other): + raise UnavailableError(cls._msg) + + def __imul__(cls, other): + raise UnavailableError(cls._msg) + + def __ilshift__(cls, other): + raise UnavailableError(cls._msg) + + def __pow__(cls, other): + raise UnavailableError(cls._msg) + + def __rpow__(cls, other): + raise UnavailableError(cls._msg) + + def __ipow__(cls, other): + raise UnavailableError(cls._msg) + + def __rshift__(cls, other): + raise UnavailableError(cls._msg) + + def __rrshift__(cls, other): + raise UnavailableError(cls._msg) + + def __irshift__(cls, other): + raise UnavailableError(cls._msg) + + def __sub__(cls, other): + raise UnavailableError(cls._msg) + + def __rsub__(cls, other): + raise UnavailableError(cls._msg) + + def __isub__(cls, other): + raise UnavailableError(cls._msg) + + def __truediv__(cls, other): + raise UnavailableError(cls._msg) + + def __rtruediv__(cls, other): + raise UnavailableError(cls._msg) + + def __itruediv__(cls, other): + raise UnavailableError(cls._msg) + + def __divmod__(cls, other): + raise UnavailableError(cls._msg) + + def __rdivmod__(cls, other): + raise UnavailableError(cls._msg) + + def __neg__(cls): + raise UnavailableError(cls._msg) + + def __invert__(cls): + raise UnavailableError(cls._msg) + + def __hash__(cls): + raise UnavailableError(cls._msg) + + def __index__(cls): + raise UnavailableError(cls._msg) + + def __iter__(cls): + raise UnavailableError(cls._msg) + + def __delitem__(cls, name): + raise UnavailableError(cls._msg) + + def __setitem__(cls, name, value): + raise UnavailableError(cls._msg) + + def __enter__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __get__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __delete__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __len__(cls): + raise UnavailableError(cls._msg) + + +def is_unavailable(obj): + """Helper to check if given symbol is actually a placeholder""" + return type(obj) is UnavailableMeta + + +class UnavailableNullContext: + """A placeholder class for unavailable context managers + + This context manager will return a value which will throw an + UnavailableError if used in any way, but the context manager itself can be + safely invoked. 
+ """ + + def __init__(self, *args, **kwargs): + pass + + def __enter__(self): + return UnavailableMeta( + "MissingContextValue", + (), + {"_msg": "Attempted to make use of placeholder context return value."}, + ) + + def __exit__(self, *args, **kwargs): + pass + + +def safe_import(module, *, msg=None, alt=None): + """A function used to import modules that may not be available + + This function will attempt to import a module with the given name, but it + will not throw an ModuleNotFoundError if the module is not found. Instead, it will + return a placeholder object which will raise an exception only if used. + + Parameters + ---------- + module: str + The name of the module to import. + msg: str or None + An optional error message to be displayed if this module is used + after a failed import. + alt: object + An optional module to be used in place of the given module if it + fails to import + + Returns + ------- + object + The imported module, the given alternate, or a class derived from + UnavailableMeta. + """ + try: + return importlib.import_module(module) + except ModuleNotFoundError: + exception_text = traceback.format_exc() + logger.debug(f"Import of {module} failed with: {exception_text}") + except Exception: + exception_text = traceback.format_exc() + raise + if msg is None: + msg = f"{module} could not be imported" + if alt is None: + return UnavailableMeta(module.rsplit(".")[-1], (), {"_msg": msg}) + else: + return alt + + +def safe_import_from(module, symbol, *, msg=None, alt=None): + """A function used to import symbols from modules that may not be available + + This function will attempt to import a symbol with the given name from + the given module, but it will not throw an ImportError if the symbol is not + found. Instead, it will return a placeholder object which will raise an + exception only if used. + + Parameters + ---------- + module: str + The name of the module in which the symbol is defined. + symbol: str + The name of the symbol to import. + msg: str or None + An optional error message to be displayed if this symbol is used + after a failed import. + alt: object + An optional object to be used in place of the given symbol if it fails + to import + + Returns + ------- + object + The imported symbol, the given alternate, or a class derived from + UnavailableMeta. + """ + try: + imported_module = importlib.import_module(module) + return getattr(imported_module, symbol) + except ModuleNotFoundError: + exception_text = traceback.format_exc() + logger.debug(f"Import of {module} failed with: {exception_text}") + except AttributeError: + exception_text = traceback.format_exc() + logger.info(f"Import of {symbol} from {module} failed with: {exception_text}") + except Exception: + exception_text = traceback.format_exc() + raise + if msg is None: + msg = f"{module}.{symbol} could not be imported" + if alt is None: + return UnavailableMeta(symbol, (), {"_msg": msg}) + else: + return alt + + +def gpu_only_import(module, *, alt=None): + """A function used to import modules required only in GPU installs + + This function will attempt to import a module with the given name. + This function will attempt to import a symbol with the given name from + the given module, but it will not throw an ImportError if the symbol is not + found. Instead, it will return a placeholder object which will raise an + exception only if used with instructions on installing a GPU build. + + Parameters + ---------- + module: str + The name of the module to import. 
+ alt: object + An optional module to be used in place of the given module if it + fails to import in a non-GPU-enabled install + + Returns + ------- + object + The imported module, the given alternate, or a class derived from + UnavailableMeta. + """ + + return safe_import( + module, + msg=f"{module} is not installed in non GPU-enabled installations. {GPU_INSTALL_STRING}", + alt=alt, + ) + + +def gpu_only_import_from(module, symbol, *, alt=None): + """A function used to import symbols required only in GPU installs + + This function will attempt to import a module with the given name. + This function will attempt to import a symbol with the given name from + the given module, but it will not throw an ImportError if the symbol is not + found. Instead, it will return a placeholder object which will raise an + exception only if used with instructions on installing a GPU build. + + Parameters + ---------- + module: str + The name of the module to import. + symbol: str + The name of the symbol to import. + alt: object + An optional object to be used in place of the given symbol if it fails + to import in a non-GPU-enabled install + + Returns + ------- + object + The imported symbol, the given alternate, or a class derived from + UnavailableMeta. + """ + return safe_import_from( + module, + symbol, + msg=f"{module}.{symbol} is not installed in non GPU-enabled installations. {GPU_INSTALL_STRING}", + alt=alt, + ) diff --git a/setup.py b/setup.py index b47ef5c95..8fc60e926 100644 --- a/setup.py +++ b/setup.py @@ -55,10 +55,6 @@ "comment_parser", "beautifulsoup4", "mwparserfromhell @ git+https://github.com/earwig/mwparserfromhell.git@0f89f44", - "cudf-cu12>=24.2", - "dask-cudf-cu12>=24.2", - "cugraph-cu12>=24.2", - "dask-cuda>=24.2", "spacy>=3.6.0, <4.0.0", "presidio-analyzer==2.2.351", "presidio-anonymizer==2.2.351", @@ -68,6 +64,15 @@ # due to this: https://github.com/miso-belica/jusText/issues/47 "lxml[html_clean]", ], + extras_require={ + "cuda12x": [ + "cudf-cu12>=24.2", + "dask-cudf-cu12>=24.2", + "cugraph-cu12>=24.2", + "dask-cuda>=24.2", + "spacy[cuda12x]>=3.6.0, <4.0.0", + ] + }, entry_points={ "console_scripts": [ "get_common_crawl_urls=nemo_curator.scripts.get_common_crawl_urls:console_script", diff --git a/tests/test_filters.py b/tests/test_filters.py index 11bf57388..4ab11c21a 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -149,7 +149,9 @@ def test_retain_score_filter(self, letter_count_data): filtered_data = filter_step(letter_count_data) expected_indices = [2, 3] - expected_data = DocumentDataset(letter_count_data.df.loc[expected_indices]) + # Compute before loc due to https://github.com/dask/dask-expr/issues/1036 + expected_data = letter_count_data.df.compute().loc[expected_indices] + expected_data = DocumentDataset(dd.from_pandas(expected_data, 2)) expected_data.df[score_field] = pd.Series([5, 7], index=expected_data.df.index) assert all_equal( expected_data, filtered_data @@ -168,7 +170,9 @@ def test_filter(self, letter_count_data): filtered_data = filter_step(scored_data) expected_indices = [2, 3] - expected_data = letter_count_data.df.loc[expected_indices] + # Compute before loc due to https://github.com/dask/dask-expr/issues/1036 + expected_data = letter_count_data.df.compute().loc[expected_indices] + expected_data = dd.from_pandas(expected_data, 2) expected_data[score_field] = pd.Series([5, 7], index=expected_data.index) expected_data = DocumentDataset(expected_data) assert all_equal( diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py index 
3c6a32754..a1acb901f 100644 --- a/tests/test_fuzzy_dedup.py +++ b/tests/test_fuzzy_dedup.py @@ -16,14 +16,16 @@ from itertools import combinations from typing import Iterable -import cudf -import dask_cudf import numpy as np import pytest from dask.dataframe.utils import assert_eq from nemo_curator.datasets import DocumentDataset from nemo_curator.modules import LSH, MinHash +from nemo_curator.utils.import_utils import gpu_only_import + +cudf = gpu_only_import("cudf") +dask_cudf = gpu_only_import("dask_cudf") @pytest.fixture From 6d992924a835709b5cbbbd75a47b5bf1eb18d953 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Tue, 23 Apr 2024 14:31:24 -0700 Subject: [PATCH 06/34] Fix failing GPU tests with latest pandas bump (#41) Signed-off-by: Ayush Dattagupta Signed-off-by: Nicole Luo --- tests/test_fuzzy_dedup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py index a1acb901f..f0ded450e 100644 --- a/tests/test_fuzzy_dedup.py +++ b/tests/test_fuzzy_dedup.py @@ -114,7 +114,7 @@ def test_minhash_approximation( tuple(zip(minhash_signatures, strings)) ): true_jaccard = jaccard_index(str1, str2, char_ngrams) - minhash_approximation = minhash_overlap(sig1, sig2) + minhash_approximation = minhash_overlap(np.array(sig1), np.array(sig2)) assert abs(true_jaccard - minhash_approximation) < THRESHOLD def test_minhash_cache(self, fuzzy_dedup_data, tmpdir): @@ -172,7 +172,9 @@ def test_multiple_id_cols(self, tmpdir): ) buckets = lsh(self.dataset) buckets_df = buckets.df.compute().to_pandas() - buckets_df["new_id"] = list(zip(buckets_df.dataset_id, buckets_df.id)) + buckets_df["new_id"] = list( + map(list, zip(buckets_df.dataset_id, buckets_df.id)) + ) docs_list = buckets_df.groupby("_bucket_id").new_id.apply(list) expected_df = cudf.Series( [[(1, 1), (1, 2)], [(1, 2), (2, 3)], [(3, 4), (4, 5)]], name="new_id" From dff70cc7c890f886b3ac9ae3a4402611af3abddc Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 23 Apr 2024 16:32:22 -0700 Subject: [PATCH 07/34] Adds Nemo Curator K8s example (#40) * [K8s]: Adds a helper script to create a dask cluster on k8s and includes instructions for how to a Curator workload on k8s Signed-off-by: Terry Kong * black formatting Signed-off-by: Terry Kong * big_english -> my_dataset Signed-off-by: Terry Kong * 24.01 -> 24.03 default container Signed-off-by: Terry Kong * Add help kwarg to all flags Signed-off-by: Terry Kong * Clarify why venv is needed Signed-off-by: Terry Kong * fix precommit failures Signed-off-by: Terry Kong --------- Signed-off-by: Terry Kong Signed-off-by: Nicole Luo --- docs/user-guide/CPUvsGPU.rst | 8 + docs/user-guide/KubernetesCurator.rst | 386 ++++++++++++++++++++++++++ docs/user-guide/index.rst | 4 + examples/k8s/create_dask_cluster.py | 134 +++++++++ 4 files changed, 532 insertions(+) create mode 100644 docs/user-guide/KubernetesCurator.rst create mode 100644 examples/k8s/create_dask_cluster.py diff --git a/docs/user-guide/CPUvsGPU.rst b/docs/user-guide/CPUvsGPU.rst index 5fd901d19..fa5ea6aa8 100644 --- a/docs/user-guide/CPUvsGPU.rst +++ b/docs/user-guide/CPUvsGPU.rst @@ -96,3 +96,11 @@ Every SLURM cluster is different, so make sure you understand how your SLURM clu Our Python examples are designed to work such that they can be run locally on their own, or easily substituted into the ``start-slurm.sh`` to run on multiple nodes. You can adapt your scripts easily too by simply following the pattern of adding ``get_client`` with ``add_distributed_args``. 
+ +----------------------------------------- +Dask with K8s +----------------------------------------- + +We also provide an example guide for how to get started with NeMo Curator on a Kubernetes cluster. + +Please visit :ref:`curator_kubernetes` for more information. diff --git a/docs/user-guide/KubernetesCurator.rst b/docs/user-guide/KubernetesCurator.rst new file mode 100644 index 000000000..c7f727df0 --- /dev/null +++ b/docs/user-guide/KubernetesCurator.rst @@ -0,0 +1,386 @@ +.. _curator_kubernetes: + +====================================== +Running NeMo Curator on Kubernetes +====================================== +The following example demonstrates how to run the NeMo Curator with NVIDIA GPUs on a Kubernetes cluster, +with PersistentVolumeClaims as the storage option. + +.. note:: + This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use. + +Prerequisuites +-------------- +* Kubernetes cluster + * `GPU operator `__ + * `Dask Operator `__ +* `kubectl `__: the Kubernetes Cluster CLI + * Please reach out to your Kubernetes cluster admin for how to setup your ``kubectl`` KUBECONFIG +* `ReadWriteMany `__ `StorageClass `__ (setup by Kubernetes cluster admin) + +Storage +------- +To run NeMo Curator, we need to setup storage to upload and store the input +files, as well as any processed outputs. + +Here is an example of how to create a dynamic PV from a StorageClass setup +by your cluster admin. Replace ``STORAGE_CLASS=<...>`` with the name of +your StorageClass. + +This example requests ``150Gi`` of space. Adjust that number for your +workloads and be aware that not all storage provisioners support volume +resizing. + +.. code-block:: bash + + STORAGE_CLASS=<...> + PVC_NAME=nemo-workspace + + kubectl apply -f - <`` with your NGC secret +key. Note that if you have any special characters in your key you might need to wrap +the key in single quotes (``'``) so it can be parsed correctly by k8s):: + + kubectl create secret docker-registry ngc-registry --docker-server=nvcr.io --docker-username=\$oauthtoken --docker-password= + +Setup Python Environment +------------------------ + +The environment to run the provided scripts in this example does not need the full +``nemo_curator`` package, so you can create a virtual environment with just the +required packages as follows: + +.. code-block:: bash + + python3 -m venv venv + source venv/bin/activate + + pip install 'dask_kubernetes>=2024.4.1' + +Upload Data to PVC +------------------ + +To copy into the ``nemo-workspace`` PVC, we will do so with ``kubectl exec``. You may also +use ``kubectl cp``, but ``exec`` has fewer surprises regarding compressed files: + +.. code-block:: bash + + # Replace <...> with a path on your local machine + LOCAL_WORKSPACE=<...> + + # This copies $LOCAL_WORKSPACE/my_dataset to /my_dataset within the PVC. + # Change foobar to the directory or file you wish to upload. + ( cd $LOCAL_WORKSPACE; tar cf - my_dataset | kubectl exec -i nemo-workspace-busybox -- tar xf - -C /nemo-workspace ) + +.. note:: + See :ref:`data-curator-download` for an example of how to download local data that can be uploaded to the PVC + with the above instruction. + +Create a Dask Cluster +--------------------- + +Use the ``create_dask_cluster.py`` to create a CPU or GPU dask cluster. + +.. note:: + If you are creating another Dask cluster with the same ``--name ``, first delete it via:: + + kubectl delete daskcluster + +.. 
code-block:: bash + + # Creates a CPU Dask cluster with 1 worker + python create_dask_cluster.py \ + --name rapids-dask \ + --n_workers 1 \ + --image nvcr.io/nvidian/bignlp-train:nemofw-nightly \ + --image_pull_secret ngc-registry \ + --pvcs nemo-workspace:/nemo-workspace + + #╭───────────────────── Creating KubeCluster 'rapids-dask' ─────────────────────╮ + #│ │ + #│ DaskCluster Running │ + #│ Scheduler Pod Running │ + #│ Scheduler Service Created │ + #│ Default Worker Group Created │ + #│ │ + #│ ⠧ Getting dashboard URL │ + #╰──────────────────────────────────────────────────────────────────────────────╯ + #cluster = KubeCluster(rapids-dask, 'tcp://localhost:61757', workers=2, threads=510, memory=3.94 TiB) + + # Creates a GPU Dask cluster with 2 workers with 1 GPU each + python create_dask_cluster.py \ + --name rapids-dask \ + --n_workers 2 \ + --n_gpus_per_worker 1 \ + --image nvcr.io/nvidian/bignlp-train:nemofw-nightly \ + --image_pull_secret ngc-registry \ + --pvcs nemo-workspace:/nemo-workspace + +After creating a cluster, you should be able to proceed after confirming the scheduler and the workers are all ``Running``: + +.. code-block:: bash + + # Set DASK_CLUSTER_NAME to the value of --name + DASK_CLUSTER_NAME=rapids-dask + kubectl get pods -l "dask.org/cluster-name=$DASK_CLUSTER_NAME" + + # NAME READY STATUS RESTARTS AGE + # rapids-dask-default-worker-587238cf2c-7d685f4d75-k6rnq 1/1 Running 0 57m + # rapids-dask-default-worker-f8ff963886-5577fff76b-qmvcd 1/1 Running 3 (52m ago) 57m + # rapids-dask-scheduler-654799869d-9bw4z 1/1 Running 0 57m + +(Opt #1) Running Existing Module +-------------------------------- + +Here is an example of running the existing `gpu_exact_dedup` Curator module. The arguments and script name +will need to be changed according to the module you wish to run: + +.. code-block:: bash + + # Set DASK_CLUSTER_NAME to the value of --name + DASK_CLUSTER_NAME=rapids-dask + SCHEDULER_POD=$(kubectl get pods -l "dask.org/cluster-name=$DASK_CLUSTER_NAME,dask.org/component=scheduler" -o name) + # Starts an interactive shell session in the scheduler pod + kubectl exec -it $SCHEDULER_POD -- bash + + ######################## + # Inside SCHEDULER_POD # + ######################## + # Run the following inside the interactive shell to launch script in the background and + # tee the logs to the /nemo-workspace PVC that was mounted in for persistence. + # The command line flags will need to be replaced with whatever the module script accepts. + # Recall that the PVC is mounted at /nemo-workspace, so any outputs should be written + # to somewhere under /nemo-workspace. + + mkdir -p /nemo-workspace/curator/{output,log,profile} + # Write logs to script.log and to a log file with a date suffix + LOGS="/nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.$(date +%y_%m_%d-%H-%M-%S)" + ( + echo "Writing to: $LOGS" + gpu_exact_dedup \ + --input-data-dirs /nemo-workspace/my_dataset \ + --output-dir /nemo-workspace/curator/output \ + --hash-method md5 \ + --log-dir /nemo-workspace/curator/log \ + --num-files -1 \ + --files-per-partition 1 \ + --profile-path /nemo-workspace/curator/profile \ + --log-frequency 250 \ + --scheduler-address localhost:8786 \ + 2>&1 + echo "Finished!" + ) | tee $LOGS & + + # At this point, feel free to disconnect the shell via Ctrl+D or simply + exit + +At this point you can tail the logs and look for ``Finished!`` in ``/nemo-workspace/curator/script.log``: + +.. 
code-block:: bash + + # Command will follow the logs of the running module (Press ctrl+C to close) + kubectl exec -it $SCHEDULER_POD -- tail -f /nemo-workspace/curator/script.log + + # Writing to: /nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.24_03_27-15-52-31 + # Computing hashes for /nemo-workspace/my_dataset + # adlr_id _hashes + # 0 cc-2023-14-0397113620 91b77eae49c10a65d485ac8ca18d6c43 + # 1 cc-2023-14-0397113621 a266f0794cc8ffbd431823e6930e4f80 + # 2 cc-2023-14-0397113622 baee533e2eddae764de2cd6faaa1286c + # 3 cc-2023-14-0397113623 87dd52a468448b99078f97e76f528eab + # 4 cc-2023-14-0397113624 a17664daf4f24be58e0e3a3dcf81124a + # Finished! + + +(Opt #2) Running Custom Module +------------------------------ + +In this example, we'll demonstrate how to run a NeMo Curator module that you have defined locally. + +Since your curator module may depend on version of the Curator that differs from what is in the +container, we will need to build a custom image with your code installed: + +.. code-block:: bash + + # Clone your repo. This example uses the official repo + git clone https://github.com/NVIDIA/NeMo-Curator.git NeMo-Curator-dev + + # Checkout specific ref. This example uses a commit in the main branch + git -C NeMo-Curator-dev checkout fc167a6edffd38a55c333742972a5a25b901cb26 + + # Example NeMo base image. Change it according to your requirements + BASE_IMAGE=nvcr.io/nvidian/bignlp-train:nemofw-nightly + docker build -t nemo-curator-custom ./NeMo-Curator-dev -f - </: accordingly + docker tag nemo-curator-custom /: + docker push /: + +.. note:: + When using a custom image, you will likely need to create a different secret unless you pushed to a public registry: + + .. code-block:: bash + + # Fill in // + kubectl create secret docker-registry my-private-registry --docker-server= --docker-username= --docker-password= + + And with this new secret, you create your new dask cluster: + + .. code-block:: bash + + # Fill in // + python create_dask_cluster.py \ + --name rapids-dask \ + --n_workers 2 \ + --n_gpus_per_worker 1 \ + --image /: \ + --image_pull_secret my-private-registry \ + --pvcs nemo-workspace:/nemo-workspace + +After the Dask cluster is deployed, you can proceed to run your module. In this example we'll use +the ``NeMo-Curator/nemo_curator/scripts/find_exact_duplicates.py`` module, but you can find other templates +in `NeMo-Curator/examples `__: + +.. code-block:: bash + + # Set DASK_CLUSTER_NAME to the value of --name + DASK_CLUSTER_NAME=rapids-dask + SCHEDULER_POD=$(kubectl get pods -l "dask.org/cluster-name=$DASK_CLUSTER_NAME,dask.org/component=scheduler" -o name) + # Starts an interactive shell session in the scheduler pod + kubectl exec -it $SCHEDULER_POD -- bash + + ######################## + # Inside SCHEDULER_POD # + ######################## + # Run the following inside the interactive shell to launch script in the background and + # tee the logs to the /nemo-workspace PVC that was mounted in for persistence. + # The command line flags will need to be replaced with whatever the module script accepts. + # Recall that the PVC is mounted at /nemo-workspace, so any outputs should be written + # to somewhere under /nemo-workspace. 
+ + mkdir -p /nemo-workspace/curator/{output,log,profile} + # Append logs to script.log and write to a log file with a date suffix + LOGS="/nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.$(date +%y_%m_%d-%H-%M-%S)" + ( + echo "Writing to: $LOGS" + # Recall that /NeMo-Curator-dev was copied and installed in the Dockerfile above + python3 -u /NeMo-Curator-dev/nemo_curator/scripts/find_exact_duplicates.py \ + --input-data-dirs /nemo-workspace/my_dataset \ + --output-dir /nemo-workspace/curator/output \ + --hash-method md5 \ + --log-dir /nemo-workspace/curator/log \ + --files-per-partition 1 \ + --profile-path /nemo-workspace/curator/profile \ + --log-frequency 250 \ + --scheduler-address localhost:8786 \ + 2>&1 + echo "Finished!" + ) | tee $LOGS & + + # At this point, feel free to disconnect the shell via Ctrl+D or simply + exit + +At this point you can tail the logs and look for ``Finished!`` in ``/nemo-workspace/curator/script.log``: + +.. code-block:: bash + + # Command will follow the logs of the running module (Press ctrl+C to close) + kubectl exec -it $SCHEDULER_POD -- tail -f /nemo-workspace/curator/script.log + + # Writing to: /nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.24_03_27-20-52-07 + # Reading 2 files + # /NeMo-Curator-dev/nemo_curator/modules/exact_dedup.py:157: UserWarning: Output path f/nemo-workspace/curator/output/_exact_duplicates.parquet already exists and will be overwritten + # warnings.warn( + # Finished! + +Deleting Cluster +---------------- +After you have finished using the created dask cluster, you can delete it to release the resources: + +.. code-block:: bash + + # Where is the flag passed to create_dask_cluster.py. Example: `--name ` + kubectl delete daskcluster + +Download Data from PVC +---------------------- + +To download data from your PVC, you can use the ``nemo-workspace-busybox`` Pod created earlier: + +.. code-block:: bash + + # Replace <...> with a path on your local machine + LOCAL_WORKSPACE=<...> + + # Tar will fail if LOCAL_WORKSPACE doesn't exist + mkdir -p $LOCAL_WORKSPACE + + # Copy file in PVC at /nemo-workspace/foobar.txt to local file-system at $LOCAL_WORKSPACE/nemo-workspace/foobar.txt + kubectl exec nemo-workspace-busybox -- tar cf - /nemo-workspace/foobar.txt | tar xf - -C $LOCAL_WORKSPACE + + # Copy directory in PVC /nemo-workspace/fizzbuzz to local file-system at $LOCAL_WORKSPACE/fizzbuzz + kubectl exec nemo-workspace-busybox -- tar cf - /nemo-workspace/fizzbuzz | tar xf - -C $LOCAL_WORKSPACE diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index 278e47ab3..7ba84c03e 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -27,6 +27,9 @@ :ref:`Personally Identifiable Information Identification and Removal ` The purpose of the personally identifiable information (PII) redaction tool is to help scrub sensitive data out of training datasets +:ref:`curator-kubernetes` + Demonstration of how to run the NeMo Curator on a Dask Cluster deployed on top of Kubernetes + .. 
toctree:: :maxdepth: 4 :titlesonly: @@ -41,3 +44,4 @@ TaskDecontamination.rst PersonalIdentifiableInformationIdentificationAndRemoval.rst DistributedDataClassification.rst + KubernetesCurator.rst diff --git a/examples/k8s/create_dask_cluster.py b/examples/k8s/create_dask_cluster.py new file mode 100644 index 000000000..28be575eb --- /dev/null +++ b/examples/k8s/create_dask_cluster.py @@ -0,0 +1,134 @@ +import argparse + +from dask_kubernetes.operator.kubecluster import KubeCluster, make_cluster_spec + + +def create_cluster( + name: str, + n_workers: int, + n_gpus_per_worker: int, + n_cpus_per_worker: int, + image: str, + image_pull_secret: str, + pvcs: dict[str, str], +): + dask_worker_command = "dask-worker" + if n_gpus_per_worker and n_gpus_per_worker > 0: + dask_worker_command = "dask-cuda-worker" + + custom_cluster_spec = make_cluster_spec( + name=name, + worker_command=dask_worker_command, + n_workers=n_workers, + image=image, + ) + scheduler_spec = custom_cluster_spec["spec"]["scheduler"]["spec"] + worker_spec = custom_cluster_spec["spec"]["worker"]["spec"] + if image_pull_secret: + scheduler_spec["imagePullSecrets"] = [{"name": image_pull_secret}] + worker_spec["imagePullSecrets"] = [{"name": image_pull_secret}] + + obj_vols = [] + obj_vol_mounts = [] + for pvc_name, mount_path in pvcs.items(): + obj_vols.append( + { + "name": pvc_name, + "persistentVolumeClaim": { + "claimName": pvc_name, + }, + } + ) + obj_vol_mounts.append( + { + "name": pvc_name, + "mountPath": mount_path, + } + ) + + scheduler_spec["volumes"] = obj_vols + for ctr in scheduler_spec["containers"]: + ctr["volumeMounts"] = obj_vol_mounts + + worker_spec["volumes"] = obj_vols + for ctr in worker_spec["containers"]: + ctr["volumeMounts"] = obj_vol_mounts + # Resources are added to only the worker, since the scheduler doesn't need GPUs + if n_gpus_per_worker or n_cpus_per_worker: + if not ctr["resources"]: + ctr["resources"] = {"limits": {}} + if n_gpus_per_worker: + ctr["resources"]["limits"]["nvidia.com/gpu"] = str(n_gpus_per_worker) + if n_cpus_per_worker: + ctr["resources"]["limits"]["cpu"] = str(n_cpus_per_worker) + + cluster = KubeCluster( + custom_cluster_spec=custom_cluster_spec, shutdown_on_close=False + ) + print(f"{cluster = }") + + +if __name__ == "__main__": + + def parse_pvcs(specs: str) -> dict[str, str]: + name_to_path = {} + for pvc in specs.split(","): + # Can be empty + if not pvc: + continue + name, _, path = pvc.partition(":") + name_to_path[name] = path + return name_to_path + + parser = argparse.ArgumentParser() + parser.add_argument( + "-n", + "--name", + type=str, + default="rapids-dask", + help="The name of the DaskCluster which you would be able to inspect via `kubectl describe daskcluster `.", + ) + parser.add_argument( + "-w", "--n_workers", type=int, default=2, help="Number of workers" + ) + parser.add_argument( + "-g", + "--n_gpus_per_worker", + type=int, + default=None, + help="Number of GPUs per worker. If not specified, the Dask Cluster defaults to a CPU cluster.", + ) + parser.add_argument( + "-c", + "--n_cpus_per_worker", + type=int, + default=None, + help="Number of CPUs per worker. 
Provide this flag if you want to limit your CPU resources and K8s will throttle the workers to make sure this limit is satisfied.", + ) + parser.add_argument( + "-i", + "--image", + type=str, + default="nvcr.io/nvidia/nemo:24.03.framework", + help="The image used for the Dask Cluster scheduler and workers.", + ) + parser.add_argument( + "-s", + "--image_pull_secret", + type=str, + default=None, + help="If --image is from a private registry, specify the appropriate pull secret you created to allow these to be pulled.", + ) + parser.add_argument( + "-p", + "--pvcs", + type=parse_pvcs, + default="", + help="Comma sep PVC specificiation of $pvc_name_1:$mount_path_1,$pvc_name_2:$mount_path_2. Example: foo:/foo,bar:/bar mounts pvcs named foo and bar to /foo and /bar respectively.", + ) + + args = parser.parse_args() + + create_cluster( + **vars(args), + ) From f2b3904e4b0c275ecfe4cd4e50f628d3d2126133 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Tue, 30 Apr 2024 08:29:19 -0700 Subject: [PATCH 08/34] Move common dedup utils and remove unused code (#42) * Refactor common utils and remove unused code Signed-off-by: Ayush Dattagupta * More cleanup Signed-off-by: Ayush Dattagupta * More updates/shuffling Signed-off-by: Ayush Dattagupta * Move gpu_dedup scripts into subfolder Signed-off-by: Ayush Dattagupta * Remove gpu_deduplication subfolder Signed-off-by: Ayush Dattagupta * Add readme to fuzzy dedup scripts section Signed-off-by: Ayush Dattagupta * Fix typo and relative links Signed-off-by: Ayush Dattagupta * Remove legacy script entrypoints Signed-off-by: Ayush Dattagupta * Remove legacy scripts and add init file Signed-off-by: Ayush Dattagupta * Update GpuDeduplication.rst Signed-off-by: Ayush Dattagupta --------- Signed-off-by: Ayush Dattagupta Signed-off-by: Nicole Luo --- docs/user-guide/GpuDeduplication.rst | 120 +++++- examples/gpu_deduplication_example/README.md | 3 + nemo_curator/gpu_deduplication/__init__.py | 13 - .../gpu_deduplication/connected_component.py | 290 ------------- nemo_curator/gpu_deduplication/ioutils.py | 116 ----- .../gpu_deduplication/jaccard_compute.py | 154 ------- .../gpu_deduplication/jaccard_map_buckets.py | 197 --------- .../gpu_deduplication/jaccard_shuffle.py | 399 ------------------ .../jaccard_utils/__init__.py | 13 - .../jaccard_utils/batch_shuffle_utils.py | 130 ------ .../jaccard_utils/doc_id_mapping.py | 60 --- .../jaccard_utils/get_anchor_utils.py | 55 --- .../jaccard_utils/get_output_map_utils.py | 149 ------- .../jaccard_utils/io_utils.py | 185 -------- .../jaccard_utils/jaccard_similarity_utils.py | 103 ----- .../gpu_deduplication/prepare_fuzzy_ids.py | 95 ----- nemo_curator/gpu_deduplication/utils.py | 155 ------- .../verify_all_pairs_jaccard.py | 172 -------- .../write_deduped_result_with_text.py | 83 ---- nemo_curator/modules/fuzzy_dedup.py | 10 +- nemo_curator/scripts/find_exact_duplicates.py | 6 +- .../scripts/fuzzy_deduplication/README.md | 99 +++++ .../scripts/fuzzy_deduplication/__init__.py | 0 .../compute_minhashes.py | 12 +- .../connected_components.py | 4 +- .../jaccard_compute.py | 4 +- .../jaccard_shuffle.py | 6 +- .../{ => fuzzy_deduplication}/map_buckets.py | 6 +- .../{ => fuzzy_deduplication}/minhash_lsh.py | 14 +- .../utils/fuzzy_dedup_utils/io_utils.py | 7 + .../fuzzy_dedup_utils}/merge_utils.py | 4 +- nemo_curator/utils/script_utils.py | 64 ++- setup.py | 17 +- 33 files changed, 302 insertions(+), 2443 deletions(-) delete mode 100644 nemo_curator/gpu_deduplication/__init__.py delete mode 100644 
nemo_curator/gpu_deduplication/connected_component.py delete mode 100644 nemo_curator/gpu_deduplication/ioutils.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_compute.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_map_buckets.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_shuffle.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/__init__.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/batch_shuffle_utils.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/doc_id_mapping.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/get_anchor_utils.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/get_output_map_utils.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/io_utils.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/jaccard_similarity_utils.py delete mode 100644 nemo_curator/gpu_deduplication/prepare_fuzzy_ids.py delete mode 100644 nemo_curator/gpu_deduplication/utils.py delete mode 100644 nemo_curator/gpu_deduplication/verify_all_pairs_jaccard.py delete mode 100644 nemo_curator/gpu_deduplication/write_deduped_result_with_text.py create mode 100644 nemo_curator/scripts/fuzzy_deduplication/README.md create mode 100644 nemo_curator/scripts/fuzzy_deduplication/__init__.py rename nemo_curator/scripts/{ => fuzzy_deduplication}/compute_minhashes.py (94%) rename nemo_curator/scripts/{ => fuzzy_deduplication}/connected_components.py (95%) rename nemo_curator/scripts/{ => fuzzy_deduplication}/jaccard_compute.py (95%) rename nemo_curator/scripts/{ => fuzzy_deduplication}/jaccard_shuffle.py (95%) rename nemo_curator/scripts/{ => fuzzy_deduplication}/map_buckets.py (96%) rename nemo_curator/scripts/{ => fuzzy_deduplication}/minhash_lsh.py (91%) rename nemo_curator/{gpu_deduplication/jaccard_utils => utils/fuzzy_dedup_utils}/merge_utils.py (98%) diff --git a/docs/user-guide/GpuDeduplication.rst b/docs/user-guide/GpuDeduplication.rst index d8b54811b..61eff2b5a 100644 --- a/docs/user-guide/GpuDeduplication.rst +++ b/docs/user-guide/GpuDeduplication.rst @@ -58,24 +58,108 @@ steps (all scripts are included in the :code:`nemo_curator/scripts/` subdirector 2. Output: _exact_duplicates.parquet. List of exact duplicates and the document hash. * Fuzzy Dedup - 1. Minhashes (Compute minhashes) - 1. Input: Data Directories - 2. Output: minhashes.parquet for each data dir. - 2. Buckets (Minhash Buckets/LSH) - 1. Input: Minhash directories - 2. Output: _buckets.parquet - 3. Map Buckets - 1. Input: Buckets.parquet + Data Dirs - 2. Output: anchor_docs_with_bk.parquet - 4. Jaccard Shuffle - 1. Input: anchor_docs_with_bk.parquet + Data Dirs - 2. Output: shuffled_docs.parquet - 5. Jaccard compute - 1. Input: Shuffled docs.parquet - 2. Output: jaccard_similarity_results.parquet - 6. Connected Components - 1. Input: jaccard_similarity_results.parquet - 2. Output: connected_components.parquet + + 1. Compute Minhashes + - Input: Data Directories + - Output: minhashes.parquet for each data dir. + - Example call: + + .. 
code-block:: bash + + # same as `python compute_minhashes.py` + gpu_compute_minhashes \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --output-minhash-dir /path/to/output_minhashes \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + --minhash-length number_of_hashes \ + --char-ngram char_ngram_size \ + --hash-bytes 4(or 8 byte hashes) \ + --seed 42 \ + --log-dir ./ + # --scheduler-file /path/to/file.json + + + 2. Buckets (Minhash Buckets) + - Input: Minhash directories + - Output: Buckets.parquet + - Example call: + + .. code-block:: bash + + # same as `python minhash_lsh.py` + minhash_buckets \ + --input-data-dirs /path/to/output_minhashes/dir1 /path/to/output_minhashes/dir2 \ + --output-bucket-dir /path/to/dedup_output \ + --input-minhash-field _minhash_signature \ + --input-json-id-field id_column_name \ + --minhash-length number_of_hashes \ + --num-bands num_bands \ + --buckets-per-shuffle 1 `#Value b/w [1-num_bands]. Higher is better but might lead to oom` \ + --log-dir ./ + # --scheduler-file /path/to/file.json + + 3. Jaccard Map Buckets + - Input: Buckets.parquet + Data Dir + - Output: anchor_docs_with_bk.parquet + - Example call: + + .. code-block:: bash + + # same as `python map_buckets.py` + jaccard_map_buckets \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --input-bucket-dir /path/to/dedup_output/_buckets.parquet \ + --output-dir /path/to/dedup_output \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + # --scheduler-file /path/to/file.json + + 4. Jaccard Shuffle + - Input: anchor_docs_with_bk.parquet + Data Dir + - Output: shuffled_docs.parquet + - Example call: + + .. code-block:: bash + + # same as `python jaccard_shuffle.py` + jaccard_shuffle \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --input-bucket-mapping-dir /path/to/dedup_output/anchor_docs_with_bk.parquet \ + --output-dir /path/to/dedup_output \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + # --scheduler-file /path/to/file.json + + 5. Jaccard compute + - Input: Shuffled docs.parquet + - Output: jaccard_similarity_results.parquet + - Example call: + + .. code-block:: bash + + # same as `python jaccard_compute.py` + jaccard_compute \ + --shuffled-docs-path /path/to/dedup_output/shuffled_docs.parquet \ + --output-dir /path/to/dedup_output \ + --ngram-size char_ngram_size_for_similarity \ + # --scheduler-file /path/to/file.json + + 6. Connected Components + - Input: jaccard_similarity_results.parquet + - Output: connected_components.parquet + - Example call: + + .. code-block:: bash + + # same as `python connected_components.py` + gpu_connected_component \ + --jaccard-pairs_path /path/to/dedup_output/jaccard_similarity_results.parquet \ + --output-dir /path/to/dedup_output \ + --cache-dir /path/to/cc_cache \ + --jaccard-threshold 0.8 + # --scheduler-file /path/to/file.json + In addition to the scripts, there are examples in the `examples` directory that showcase using the python module directly in your own code. 
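As a rough illustration of consuming the final output of step 6, the sketch below builds a removal list from `connected_components.parquet`. It is only a sketch: the paths are placeholders, and the column names (`dataset_id`, `doc_id`, `group`) follow the schema written by the legacy connected-components implementation removed elsewhere in this change, so verify them against your actual output.

   .. code-block:: python

      import cudf

      # Placeholder path: substitute the --output-dir used in the steps above.
      cc = cudf.read_parquet("/path/to/dedup_output/connected_components.parquet")

      # Keep one representative document per connected component ("group") and
      # mark every other member of that component for removal.
      cc["rank_in_group"] = cc.groupby("group").cumcount()
      docs_to_remove = cc[cc["rank_in_group"] > 0][["dataset_id", "doc_id"]]
      docs_to_remove.to_parquet("/path/to/dedup_output/docs_to_remove.parquet")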
It also has examples on how to remove documents from the corpus using the list of duplicate IDs generated from exact or fuzzy diff --git a/examples/gpu_deduplication_example/README.md b/examples/gpu_deduplication_example/README.md index 9a5e64c15..2f294e1f6 100644 --- a/examples/gpu_deduplication_example/README.md +++ b/examples/gpu_deduplication_example/README.md @@ -1,5 +1,8 @@ ### Deduplication Steps +> [!CAUTION] +> The examples references here are outdated and will be replaced with an example using the Python API directly. For more details on the scripts refer to [nemo_curator/scripts/fuzzy_deduplication](/nemo_curator/scripts/fuzzy_deduplication) + 1. Exact dedup 1. Input: Data directories 2. Output: exact_duplicates.parquet. List of exact duplicates and the document hash. diff --git a/nemo_curator/gpu_deduplication/__init__.py b/nemo_curator/gpu_deduplication/__init__.py deleted file mode 100644 index d9155f923..000000000 --- a/nemo_curator/gpu_deduplication/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/nemo_curator/gpu_deduplication/connected_component.py b/nemo_curator/gpu_deduplication/connected_component.py deleted file mode 100644 index 211fb4515..000000000 --- a/nemo_curator/gpu_deduplication/connected_component.py +++ /dev/null @@ -1,290 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from time import time - -import cudf -import cugraph -import cugraph.dask as dcg -import cugraph.dask.comms.comms as Comms -import cupy -import dask_cudf -import numpy as np -from dask.dataframe.shuffle import shuffle as dd_shuffle -from dask.utils import M - -from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( - convert_str_pair_adlr_ids_to_int, -) -from nemo_curator.gpu_deduplication.utils import ( - enable_spilling, - get_client, - get_num_workers, - parse_nc_args, - timer, -) - - -def sort_adlr_id(df): - x = df[["adlr_id_x", "adlr_id_y"]].values - x = cupy.sort(x, axis=1) - df["adlr_id_x"] = x[:, 0] - df["adlr_id_y"] = x[:, 1] - for i in ["adlr_id_x", "adlr_id_y"]: - df[i] = df[i].astype("uint64") - return df - - -def thresholding(df, threshold=0.8): - mask = df.jaccard > threshold - df.loc[mask, "jaccard"] = np.int8(1) - df.loc[~mask, "jaccard"] = np.int8(0) - return df - - -@timer -def run_connected_components(jaccard_pairs_path, adlr_id_path, output_path): - Comms.initialize(p2p=True) - df = dask_cudf.read_parquet( - jaccard_pairs_path, blocksize="1GB", aggregate_files=True - ) - df = df[df["jaccard"] == 1].reset_index(drop=True) - - labels_df = dask_cudf.read_parquet(adlr_id_path) - num_nodes = len(labels_df) - - self_edge_df = labels_df[["uid"]].rename(columns={"uid": "adlr_id_x"}) - self_edge_df["adlr_id_y"] = self_edge_df["adlr_id_x"] - - df = df[["adlr_id_x", "adlr_id_y"]].astype(np.int64) - df = dask_cudf.concat([df, self_edge_df]) - - G = cugraph.MultiGraph(directed=False) - G.from_dask_cudf_edgelist( - df, source="adlr_id_x", destination="adlr_id_y", renumber=False - ) - result = dcg.weakly_connected_components(G) - del G - max_partitions = min(32, result.npartitions) - n_components = len(result[["labels"]].drop_duplicates(split_out=max_partitions)) - num_labels = len(result) - print("# of groups", n_components) - print("# of docs removed", num_labels - n_components) - labels_df = labels_df.merge( - result, left_on=["uid"], right_on=["vertex"], how="inner" - ) - labels_df = labels_df[["dataset_id", "doc_id", "labels"]] - labels_df = labels_df.rename(columns={"labels": "group"}) - labels_df = labels_df.persist() - # Doing an inner merge above - # should not change any rows - - assert num_nodes == len(labels_df) - print(f"assert num_nodes:{num_nodes}==labels_df:{len(labels_df)} passed") - labels_df.to_parquet(output_path, write_index=False) - Comms.destroy() - - -def attach_args(parser=None): - description = """Computes connected component""" - if not parser: - parser = parse_nc_args(description=description) - - parser.add_argument( - "--jaccard-pairs-path", - type=str, - help="The directory containing the jaccard results", - ) - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results to", - ) - parser.add_argument( - "--cache-dir", - type=str, - help="The cache directory to write intermediate results to", - ) - return parser - - -def delete_cache_data(path): - if "cache" not in path: - return - cmd = f"rm -rf {path}" - print(cmd) - os.system(cmd) - - -def write_output(ddf, output_path): - if not isinstance(output_path, str): - assert TypeError(f"output_path should be str. 
got {type(output_path)}") - print(f"write {output_path} ...") - ddf.to_parquet(output_path, write_index=False) - - -def get_unique_ids_per_partition(df): - unique_df_ls = [] - for tag in ["x", "y"]: - subset_df = df[[f"dataset_id_{tag}", f"doc_id_{tag}"]].drop_duplicates() - subset_df = subset_df.rename( - columns={f"dataset_id_{tag}": "dataset_id", f"doc_id_{tag}": "doc_id"} - ) - unique_df_ls.append(subset_df) - unique_df = cudf.concat(unique_df_ls, ignore_index=True) - unique_df = unique_df.drop_duplicates() - return unique_df - - -@timer -def write_dedup_parsed_adlr_id(args): - dedup_parsed_adlr_id_path = f"{args.cache_dir}/dedup_parsed_adlr_id.parquet" - ddf = dask_cudf.read_parquet( - args.jaccard_pairs_path, - columns=["adlr_id_x", "adlr_id_y"], - blocksize="1GB", - aggregate_files=True, - ) - ddf = ddf.map_partitions( - convert_str_pair_adlr_ids_to_int, - meta={ - "dataset_id_x": "uint32", - "doc_id_x": "int64", - "dataset_id_y": "uint32", - "doc_id_y": "int64", - }, - ) - - unique_docs = ddf.map_partitions(get_unique_ids_per_partition) - unique_docs = unique_docs.drop_duplicates(split_out=ddf.npartitions // 4) - unique_docs["uid"] = np.uint64(1) - unique_docs["uid"] = unique_docs["uid"].cumsum() - unique_docs["uid"] = unique_docs["uid"] - 1 - write_output(unique_docs, dedup_parsed_adlr_id_path) - return dedup_parsed_adlr_id_path - - -def batched_merge_and_write(ddf, ddf_adlr_id, output_path, batch_size=32): - total_batches = (ddf.npartitions + batch_size - 1) // batch_size - for batch_id, offset in enumerate(range(0, ddf.npartitions, batch_size)): - st = time() - subset_ddf = ddf.partitions[offset : offset + batch_size] - for tag in ["x", "y"]: - subset_ddf = subset_ddf.merge( - ddf_adlr_id, - left_on=[f"dataset_id_{tag}", f"doc_id_{tag}"], - right_on=["dataset_id", "doc_id"], - how="inner", - broadcast=True, - ) - subset_ddf = subset_ddf.rename(columns={"uid": f"adlr_id_{tag}"}) - subset_ddf = subset_ddf.drop(columns=[f"dataset_id_{tag}", f"doc_id_{tag}"]) - - subset_ddf = subset_ddf[["adlr_id_x", "adlr_id_y", "jaccard"]] - output_batch_path = os.path.join(output_path, f"{batch_id}.parquet") - subset_ddf.to_parquet(output_batch_path, write_index=False) - - et = time() - print(f"batch_id = {batch_id}/{total_batches}, time = {et - st}", flush=True) - - -@timer -def write_encoded_jaccard_pair(args, client): - dedup_parsed_adlr_id_path = f"{args.cache_dir}/dedup_parsed_adlr_id.parquet" - output_path = f"{args.cache_dir}/encoded_jaccard_pair/" - ddf_adlr_id = dask_cudf.read_parquet( - dedup_parsed_adlr_id_path, blocksize="2GB", aggregate_files=True - ) - ddf_adlr_id = ddf_adlr_id.persist() - len(ddf_adlr_id) - ddf = dask_cudf.read_parquet( - args.jaccard_pairs_path, - blocksize="256MB", - aggregate_files=True, - ) - ddf = ddf.map_partitions( - convert_str_pair_adlr_ids_to_int, - meta={ - "jaccard": "float32", - "dataset_id_x": "uint32", - "doc_id_x": "int64", - "dataset_id_y": "uint32", - "doc_id_y": "int64", - }, - ) - num_workers = get_num_workers(client) - batched_merge_and_write(ddf, ddf_adlr_id, output_path, num_workers) - - -@timer -def write_dedup_encoded_jaccard_pair(args, client): - input_path = f"{args.cache_dir}/encoded_jaccard_pair" - output_path = f"{args.cache_dir}/final_dedup_encoded_jaccard_pair.parquet" - - ddf = dask_cudf.read_parquet(input_path, blocksize="512MB", aggregate_files=True) - meta = {"adlr_id_x": "uint64", "adlr_id_y": "uint64", "jaccard": "float32"} - ddf = ddf.map_partitions(sort_adlr_id, meta=meta) - ddf = ddf.map_partitions(thresholding, meta=meta) - 
ddf = ddf.map_partitions( - M.drop_duplicates, - meta=ddf._meta, - enforce_metadata=False, - transform_divisions=False, - align_dataframes=False, - ) - ddf = dd_shuffle( - ddf, - ["adlr_id_x", "doc_id"], - ignore_index=True, - shuffle="tasks", - ) - ddf = ddf.map_partitions( - M.drop_duplicates, - meta=ddf._meta, - enforce_metadata=False, - transform_divisions=False, - align_dataframes=False, - ) - - write_output(ddf, output_path) - return output_path - - -def main(args): - description = """Takes a dataset consisting of document pairs - and their corresponding jaccard similarity to compute connected - components of docuements across pairs to find similar docuemnt - after applying a given threshold. The result is a dataset - consisting of all documents that are similar (above the threshold) - and the component they belong to.""" - start = time() - output_path = os.path.join(args.output_dir, "connected_components.parquet") - - client = get_client(args) - enable_spilling() - client.run(enable_spilling) - adlr_id_path = write_dedup_parsed_adlr_id(args) - write_encoded_jaccard_pair(args, client) - jaccard_pairs_path = write_dedup_encoded_jaccard_pair(args, client) - run_connected_components(jaccard_pairs_path, adlr_id_path, output_path) - print(f"All done in {time()-start:.1f} seconds") - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/ioutils.py b/nemo_curator/gpu_deduplication/ioutils.py deleted file mode 100644 index 7ac253c04..000000000 --- a/nemo_curator/gpu_deduplication/ioutils.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Sequence - -import cudf -import dask_cudf -from dask import dataframe as dd -from tqdm import tqdm - - -# TODO: -# Combine this with -# nemo_curator.distributed_utils.read_cudf_jsonl -def read_json_func(files, engine="cudf", include_path_column=False, columns=None): - """ - Reads multiple Json Lines files into a cuDF - dataframe with an additional `path` column denoting the path - of the input file. - """ - if not include_path_column: - if columns: - return cudf.read_json(files, engine="cudf", lines=True)[columns] - else: - return cudf.read_json(files, engine="cudf", lines=True) - - dfs = [] - for file in files: - if columns: - df = cudf.read_json(file, engine=engine, lines=True)[columns] - else: - df = cudf.read_json(file, engine=engine, lines=True) - df["path"] = file - dfs.append(df) - return cudf.concat(dfs, ignore_index=True) - - -def bucketed_read(files, func=read_json_func, b_size=2, meta=None, **kwargs): - """ - Read files with `b_size` number of files per bucket. 
- Users can specify their own read - """ - filepaths = [ - files[i : i + b_size] for i in range(0, len(files), b_size) # noqa: E203 - ] - if meta: - return dd.from_map(func, filepaths, meta=meta, **kwargs) - else: - return dd.from_map(func, filepaths, **kwargs) - - -# TODO: Remove this function -def regular_read_json(files, include_path_column=False): - return dask_cudf.read_json( - files, engine="cudf", lines=True, include_path_column=include_path_column - ) - - -def batched_writing( - dask_df: dask_cudf.DataFrame, - output_path: str, - partition_on: Sequence[str], - parts_ber_batch: int = 32, -): - """ - Write a dask dataframe to parquet in batches. - This allows us to do batched exectution and prevent OOMs - Args: - dask_df: dask dataframe to write - output_path: path to write to - partition_on: columns to partition on - parts_ber_batch: number of partitions per batch - """ - - total_partitions = dask_df.npartitions - for batch_id, part_offset in tqdm( - enumerate(range(0, dask_df.npartitions, parts_ber_batch)) - ): - print(f"\nStarted processing batch in = {batch_id}", flush=True) - df = dask_df.partitions[part_offset : part_offset + parts_ber_batch] - if partition_on: - df.to_parquet( - output_path, - partition_on=partition_on, - name_function=lambda x: f"batch_{batch_id}_part_{x}.parquet", - write_metadata_file=False, - ) - else: - df.to_parquet( - output_path, - name_function=lambda x: f"batch_{batch_id}_part_{x}.parquet", - write_metadata_file=False, - ) - print( - f"Part {part_offset+parts_ber_batch}/{total_partitions} completed", - flush=True, - ) - - -def strip_trailing_sep(path: str): - """ - Strips a path string of trailing path seperators like `/` if any. - """ - return path.rstrip(os.path.sep) diff --git a/nemo_curator/gpu_deduplication/jaccard_compute.py b/nemo_curator/gpu_deduplication/jaccard_compute.py deleted file mode 100644 index f90e6c444..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_compute.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import time - -import cudf -import dask.dataframe as dd -import numpy as np - -from nemo_curator.gpu_deduplication.jaccard_utils.jaccard_similarity_utils import ( - compute_jaccard_and_create_pair_df, -) -from nemo_curator.gpu_deduplication.utils import ( - enable_spilling, - get_client, - get_num_workers, - parse_nc_args, -) - - -def create_bins(path_dicts, max_size): - path_dicts.sort(key=lambda x: x["str_bytes"], reverse=True) - bins, bin_sizes = [], [] - for path_d in path_dicts: - new_path, new_size = path_d["path"], path_d["str_bytes"] - for i, bin_size in enumerate(bin_sizes): - if bin_size + new_size <= max_size: - bins[i].append(new_path) - bin_sizes[i] += new_size - new_size = 0 - break - if new_size: - bins.append([new_path]) - bin_sizes.append(new_size) - return bins - - -def get_anchor_docs_and_string_size(path): - df = cudf.read_parquet(path) - str_bytes = df["text"].str.byte_count().sum() - is_anchor_flag = (df["adlr_id"] == df["anchor_1_adlr_id"]) | ( - df["adlr_id"] == df["anchor_0_adlr_id"] - ) - anchor_df = df[is_anchor_flag].reset_index(drop=True) - return anchor_df, {"path": path, "str_bytes": str_bytes} - - -def compute_jaccard_on_1_partition(path): - try: - df = cudf.read_parquet(path) - pair_df = compute_jaccard_and_create_pair_df(df) - except OverflowError: - paths = [entry.path for entry in os.scandir(os.path.join(path))] - anchor_df_str_size_ls = [ - get_anchor_docs_and_string_size(path) for path in paths - ] - anchor_df = cudf.concat( - [anchor_doc for anchor_doc, _ in anchor_df_str_size_ls], ignore_index=True - ).drop_duplicates() - df_str_size = [str_size for _, str_size in anchor_df_str_size_ls] - paths = create_bins(df_str_size, np.iinfo(np.int32).max // 10) - pair_dfs = [] - for path in paths: - print(path) - df = cudf.read_parquet(path).reset_index(drop=True) - df = cudf.concat([df, anchor_df], ignore_index=True) - pair_df = compute_jaccard_and_create_pair_df(df) - pair_dfs.append(pair_df) - pair_df = cudf.concat(pair_dfs, ignore_index=True) - return pair_df - - -def run_jaccard_compute(shuffled_docs_path, output_final_results_path): - print("Starting Jaccard Computation", flush=True) - st = time.time() - paths = [ - entry.path - for entry in os.scandir(shuffled_docs_path) - if not entry.path.endswith(".txt") - ] - meta_df = cudf.DataFrame( - { - "adlr_id_x": ["x"], - "adlr_id_y": ["y"], - "jaccard": np.float32([0.0]), - } - ) - result_df = dd.from_map( - compute_jaccard_on_1_partition, paths, meta=meta_df - ).reset_index(drop=True) - - result_df.to_parquet( - output_final_results_path, - write_index=False, - write_metadata_file=False, - ) - print(f"Jaccard Computing+Writing time: {time.time() - st:.1f} seconds") - - -def main(args): - description = """Computes the Jaccard similarity between document pairs - from partitioned parquet dataset. Result is a parquet dataset consiting of - document id pair along with their Jaccard similarity score. 
- """ - OUTPUT_PATH = args.output_dir - shuffled_docs_path = args.shuffled_docs_path - output_final_results_path = os.path.join(OUTPUT_PATH, "dedup_final_results.parquet") - client = get_client(args) - enable_spilling() - client.run(enable_spilling) - print(f"Num Workers = {get_num_workers(client)}", flush=True) - print("Connected to dask cluster", flush=True) - print("Running jaccard compute script", flush=True) - - # Run actual computation - run_jaccard_compute(shuffled_docs_path, output_final_results_path) - - -def attach_args(parser=None): - description = """Computes jaccard similarity""" - if not parser: - parser = parse_nc_args(description=description) - - parser.add_argument( - "--shuffled-docs-path", - type=str, - help="The directory containing the shuffled documents", - ) - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results to", - ) - return parser - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/jaccard_map_buckets.py b/nemo_curator/gpu_deduplication/jaccard_map_buckets.py deleted file mode 100644 index aa60787d4..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_map_buckets.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import time - -from dask.dataframe.shuffle import shuffle as dd_shuffle -from dask.utils import M - -from nemo_curator.gpu_deduplication.jaccard_utils.get_anchor_utils import ( - add_anchor_docs, -) -from nemo_curator.gpu_deduplication.jaccard_utils.get_output_map_utils import ( - get_output_map_based_on_str_bytes, -) -from nemo_curator.gpu_deduplication.jaccard_utils.io_utils import ( - get_bucket_ddf_from_parquet_path, - get_text_ddf_from_json_path_with_blocksize, -) -from nemo_curator.gpu_deduplication.utils import ( - get_client, - get_num_workers, - parse_nc_args, -) - - -def get_anchor_and_output_map_info( - input_data_paths, - input_bucket_path, - text_ddf_blocksize, - num_files, - num_workers, - shuffle_type, -): - """ - Get anchor docs with bucket info - Args: - input_data_paths: list of paths to input data - input_bucket_path: path to input buckets - text_ddf_blocksize: blocksize for text ddf - num_files: number of files to read - num_workers: number of workers - shuffle_type: type of shuffle to use - Returns: - ddf_anchor_docs_with_bk - """ - ddf_text = get_text_ddf_from_json_path_with_blocksize( - input_data_paths=input_data_paths, - num_files=num_files, - blocksize=text_ddf_blocksize, - ) - ddf_bk = get_bucket_ddf_from_parquet_path( - input_bucket_path=input_bucket_path, num_workers=num_workers - ) - output_map_df = get_output_map_based_on_str_bytes(ddf_bk=ddf_bk, ddf_text=ddf_text) - ddf_anchor_docs_with_bk = ddf_bk.map_partitions(add_anchor_docs) - print("output_map_df is based on string bytes", flush=True) - ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.merge( - output_map_df, on=["bucket"] - ) - # Bucket is no longer needed - ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.drop(columns=["bucket"]) - # Below removes any duplicates lying around after dropping buckets - ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.map_partitions( - M.drop_duplicates, - meta=ddf_anchor_docs_with_bk._meta, - enforce_metadata=False, - transform_divisions=False, - align_dataframes=False, - ) - ddf_anchor_docs_with_bk = dd_shuffle( - ddf_anchor_docs_with_bk, - ["dataset_id", "doc_id"], - ignore_index=True, - shuffle=shuffle_type, - ).map_partitions( - M.drop_duplicates, - meta=ddf_anchor_docs_with_bk._meta, - enforce_metadata=False, - transform_divisions=False, - align_dataframes=False, - ) - del output_map_df - return ddf_anchor_docs_with_bk - - -def attach_args(parser=None): - description = """Takes the buckets generated from minhashes and uses - document length information to create a coarse mapping of mapping multiple - buckets to a logical partition by using a modified bin packing algorithm. 
- """ - if not parser: - parser = parse_nc_args(description=description) - parser.add_argument( - "--input-bucket-dir", - type=str, - help="The directory containing bucket information files", - ) - parser.add_argument( - "--text-ddf-blocksize", - type=int, - default=256, - help="The block size for chunking jsonl files for text ddf in mb", - ) - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results in", - ) - parser.add_argument( - "--shuffle-type", - type=str, - default="tasks", - help="Type of shuffle to use before writing to parquet", - ) - return parser - - -def jaccard_get_output_map_workflow( - client, - input_data_paths, - input_bucket_path, - output_anchor_docs_with_bk_path, - text_ddf_blocksize, - num_files, - shuffle_type, -): - """ - Workflow for jaccard shuffle - Args: - client: dask client - input_data_paths: list of paths to input data - input_bucket_path: path to input buckets - output_anchor_docs_with_bk_path: path to save anchor docs with bucket info - text_ddf_blocksize: blocksize for text ddf - num_files: number of files to read - parts_per_worker: number of parts per worker - shuffle_type: type of shuffle to use before writing to parquet - """ - num_workers = get_num_workers(client) - ddf_anchor_docs_with_bk = get_anchor_and_output_map_info( - input_data_paths, - input_bucket_path, - text_ddf_blocksize, - num_files, - num_workers, - shuffle_type, - ) - ddf_anchor_docs_with_bk.to_parquet( - output_anchor_docs_with_bk_path, - write_index=False, - ) - - -def main(args): - input_data_paths = args.input_data_dirs - input_bucket_path = args.input_bucket_dir - OUTPUT_PATH = args.output_dir - output_anchor_docs_with_bk_path = os.path.join( - OUTPUT_PATH, "anchor_docs_with_bk.parquet" - ) - client = get_client(args) - print(f"Num Workers = {get_num_workers(client)}", flush=True) - print("Connected to dask cluster", flush=True) - print("Running jaccard map buckets script", flush=True) - print(f"Args = {args}") - st = time.time() - jaccard_get_output_map_workflow( - client, - input_data_paths, - input_bucket_path, - output_anchor_docs_with_bk_path, - args.text_ddf_blocksize, - args.num_files, - args.shuffle_type, - ) - et = time.time() - print(f"Bucket Mapping time taken = {et-st} s") - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/jaccard_shuffle.py b/nemo_curator/gpu_deduplication/jaccard_shuffle.py deleted file mode 100644 index 846d30c4d..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_shuffle.py +++ /dev/null @@ -1,399 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import time - -import cudf -from tqdm import tqdm - -from nemo_curator.gpu_deduplication.jaccard_utils.batch_shuffle_utils import ( - text_bytes_aware_shuffle, -) -from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( - combine_back_adlr_ids, -) -from nemo_curator.gpu_deduplication.jaccard_utils.io_utils import ( - aggregated_anchor_docs_with_bk_read, - get_restart_offsets, - get_text_ddf_from_json_path_with_blocksize, - update_restart_offsets, -) -from nemo_curator.gpu_deduplication.jaccard_utils.merge_utils import ( - extract_partitioning_index, - filter_text_rows_by_bucket_batch, - merge_left_to_shuffled_right, -) -from nemo_curator.gpu_deduplication.utils import ( - get_client, - get_num_workers, - parse_nc_args, - performance_report_if, -) - - -def write_partitioned_file(df, output_path, partition_on, batch_id): - if len(df) == 0: - return cudf.Series([True]) - - cudf.io.parquet.write_to_dataset( - df, - output_path, - partition_cols=[partition_on], - filename=f"batch_{batch_id}.parquet", - ) - return cudf.Series([True]) - - -def batched_merge_and_write( - left_df, - right_df, - merge_on, - partition_on, - output_path, - parts_per_text_batch, - parts_per_bucket_batch, - bk_mapping, - num_workers=None, -): - - total_text_partitions = left_df.npartitions - total_bucket_partitions = right_df.npartitions - - # Extract global partitioning index - left_df, global_partitioning_index = extract_partitioning_index( - left_df, - merge_on, - bk_mapping, - parts_per_bucket_batch, - total_bucket_partitions, - ) - - # Set start offsets - bucket_part_start_offset, text_part_start_offset = get_restart_offsets(output_path) - - # Set end offsets - # NOTE: These end offsets are always set to the end - # of the data. However, we may want to be able to set - # both the start and end offsets from the command line - # in the future. - bucket_part_end_offset = total_bucket_partitions - text_part_end_offset = total_text_partitions - - # Check that offsets are valid - assert bucket_part_start_offset % parts_per_bucket_batch == 0 - assert bucket_part_end_offset > bucket_part_start_offset - assert text_part_end_offset > text_part_start_offset - - # Initialize "retry" variables - # - # - retry_count: The number of successive batches that - # we have already performed at a reduced batch size. - # - retry_threshold: The number of successive batches - # for which we should keep the batch size low - # before attempting the default batch size again. - # Every time we return to the default batch size - # and immediately fail, retry_threshold will double. 
- parts_per_text_batch_retry = None - retry_count, retry_threshold = 0, 1 - - print( - f"Starting at bucket-map partition {bucket_part_start_offset}" - f" and text-df partition {text_part_start_offset}", - flush=True, - ) - - for bucket_part_offset in tqdm( - range(bucket_part_start_offset, bucket_part_end_offset, parts_per_bucket_batch) - ): - - # Outer loop over batches of "bucket-map" partitions - end_bucket_offset = min( - bucket_part_offset + parts_per_bucket_batch, bucket_part_end_offset - ) - print( - f"\nStarted processing bucket-map partitions {bucket_part_offset} " - f"through {end_bucket_offset} of {bucket_part_end_offset}", - flush=True, - ) - st_bucket = time.time() - - # Select our bucket-mapping batch - subset_bucket_df = right_df.partitions[bucket_part_offset:end_bucket_offset] - subset_bucket_df = subset_bucket_df.persist() - - # Filter out rows of left_df that we know cannot - # align with any rows of subset_bucket_df - left_df_use = filter_text_rows_by_bucket_batch( - left_df, - global_partitioning_index, - bucket_part_offset, - bucket_part_end_offset, - total_bucket_partitions, - ) - - text_part_offset = text_part_start_offset - while text_part_offset < text_part_end_offset: - - # Check if we are "retrying" with a smaller "parts_per_text_batch" - if parts_per_text_batch_retry: - parts_per_text_batch_use = parts_per_text_batch_retry - else: - st_text = time.time() - parts_per_text_batch_use = parts_per_text_batch - print(f"Using {parts_per_text_batch_use} text partitions.", flush=True) - - # Select partitions for our text batch - end_text_offset = min( - text_part_offset + parts_per_text_batch_use, text_part_end_offset - ) - subset_text_df = left_df_use.partitions[text_part_offset:end_text_offset] - - try: - # NOTE: If we have more text-df partitions than bucket-map - # partitions, we are more likely to see an OverflowError - output_df = text_bytes_aware_shuffle( - merge_left_to_shuffled_right( - subset_text_df, - subset_bucket_df, - merge_on, - ), - partition_on, - num_workers=num_workers, - ) - except OverflowError as err: - # We encountered an overflow error! - # Let's try again with less text data - parts_per_text_batch_retry = int(parts_per_text_batch_use / 2) - if parts_per_text_batch_retry < 1: - raise err - print( - f"\nWe encountered an OverflowError and will retry " - f"the current batch with {parts_per_text_batch_retry} " - f"text partitions instead of {parts_per_text_batch_use}.", - flush=True, - ) - continue - - output_df = output_df.map_partitions(combine_back_adlr_ids) - batch_label = f"{end_bucket_offset}_{end_text_offset}" - written_files = output_df.map_partitions( - write_partitioned_file, - output_path, - partition_on, - batch_label, - meta=cudf.Series([True]), - ) - written_files = written_files.compute() - update_restart_offsets(output_path, bucket_part_offset, end_text_offset) - del output_df - - print( - "Text-df partition ", - f"{end_text_offset}/{text_part_end_offset} " - f"completed in {time.time()-st_text}", - flush=True, - ) - - # Update loop control-flow variables - if parts_per_text_batch_use == parts_per_text_batch: - # We succeeded at the default batch size. 
- # Reset the retry count - retry_count, retry_threshold = 0, 1 - else: - # We succeeded at a lower batch size - retry_count += 1 - if retry_count >= retry_threshold: - # Go back to the default text-batch size, - # but increase the retry_threshold in - # case we fail again - parts_per_text_batch_retry = None - retry_count, retry_threshold = 0, min(retry_threshold * 2, 16) - text_part_offset += parts_per_text_batch_use - - update_restart_offsets(output_path, end_bucket_offset, end_text_offset) - print( - "Bucket partition ", - f"{end_bucket_offset}/{bucket_part_end_offset} " - f"completed in {time.time()-st_bucket}", - flush=True, - ) - - # Need to reset text_part_start_offset to 0 after - # a single bucket-batch pass (only matters if we are - # breaking the bucket-mapping df into multiple batches) - text_part_start_offset = 0 - - -def jaccard_shuffling_workflow( - client, - input_data_paths, - input_anchor_docs_with_bk_dir, - output_shuffled_docs_path, - text_ddf_blocksize, - bucket_mapping_ddf_blocksize, - num_files, - parts_per_worker, - profile_path, - bucket_parts_per_worker, -): - """' - Args: - client: dask client - input_data_paths: paths to input data - input_anchor_docs_with_bk_dir: path to input anchor docs with buckets - output_shuffled_docs_path: path to output shuffled docs - text_ddf_blocksize: block size for chunking jsonl files for text ddf - bucket_mapping_ddf_blocksize: block size for chunking parquet files - for anchor_docs_with_bk ddf - num_files: number of files to process - parts_per_worker: parts per worker to process in a batch - profile_path: dask profile path - bucket_parts_per_worker: bucket parts per worker to process in a batch - """ - # Part1. Reading+Shuffling Data - # Read Text from Data from jsonl files - - text_ddf = get_text_ddf_from_json_path_with_blocksize( - input_data_paths=input_data_paths, - num_files=num_files, - blocksize=text_ddf_blocksize, - ) - print( - "Graph creation for get_text_ddf_from_json_path_with_blocksize" " complete.", - flush=True, - ) - print(f"text_ddf.npartitions = {text_ddf.npartitions}", flush=True) - st = time.time() - ddf_anchor_docs_with_bk, bk_mapping = aggregated_anchor_docs_with_bk_read( - input_anchor_docs_with_bk_dir, - blocksize=bucket_mapping_ddf_blocksize, - ) - print("Getting ddf_anchor_docs_with_bk completed") - print( - f"ddf_anchor_docs_with_bk.npartitions = {ddf_anchor_docs_with_bk.npartitions}", - flush=True, - ) - st = time.time() - num_workers = get_num_workers(client) - parts_per_batch = num_workers * parts_per_worker - print(f"parts_per_batch = {parts_per_batch}") - parts_per_bucket_batch = num_workers * bucket_parts_per_worker - print(f"parts_per_bucket_batch = {parts_per_bucket_batch}") - dask_profile_name = f"blocksize-{text_ddf_blocksize}" - dask_profile_name = dask_profile_name + f"parts_per_batch-{parts_per_batch}" - dask_profile_name = ( - dask_profile_name + f"-parts_per_bucket_batch-{parts_per_bucket_batch}" - ) - dask_profile_name = dask_profile_name + f"-jaccard-n_input_files-{num_files}.html" - - text_ddf = text_ddf[["dataset_id", "doc_id", "text"]] - - with performance_report_if(profile_path, dask_profile_name): - # Merge and write the dataframes - batched_merge_and_write( - text_ddf, - ddf_anchor_docs_with_bk, - output_path=output_shuffled_docs_path, - merge_on=["dataset_id", "doc_id"], - partition_on="output_partition_id", - parts_per_text_batch=parts_per_batch, - parts_per_bucket_batch=parts_per_bucket_batch, - bk_mapping=bk_mapping, - num_workers=num_workers, - ) - print(f"Writing+Shuffling 
data took = {time.time()-st} s", flush=True) - - -def main(args): - input_data_paths = args.input_data_dirs - input_anchor_docs_with_bk_dir = args.input_bucket_mapping_dir - OUTPUT_PATH = args.output_dir - output_anchor_docs_with_bk_path = os.path.join( - OUTPUT_PATH, "anchor_docs_with_bk.parquet" - ) - output_shuffled_docs_path = os.path.join(OUTPUT_PATH, "shuffled_docs.parquet") - client = get_client(args) - print(f"Num Workers = {get_num_workers(client)}", flush=True) - print("Connected to dask cluster", flush=True) - print("Running jaccard shuffle script", flush=True) - print(f"Args = {args}") - st = time.time() - jaccard_shuffling_workflow( - client=client, - input_data_paths=input_data_paths, - input_anchor_docs_with_bk_dir=input_anchor_docs_with_bk_dir, - output_shuffled_docs_path=output_shuffled_docs_path, - text_ddf_blocksize=args.text_ddf_blocksize, - bucket_mapping_ddf_blocksize=args.bucket_mapping_ddf_blocksize, - num_files=args.num_files, - parts_per_worker=args.parts_per_worker, - profile_path=args.profile_path, - bucket_parts_per_worker=args.bucket_parts_per_worker, - ) - et = time.time() - print(f"Jaccard Shuffle E2E time taken = {et-st} s") - - -def attach_args(parser=None): - description = """Shuffles input text documents based on the given bucket - map. The output is a partitioned parquet dataset with the documents - shuffled by buckets - """ - if not parser: - parser = parse_nc_args(description=description) - - parser.add_argument( - "--input-bucket-mapping-dir", - type=str, - help="The directory containing anchor docs with bk files", - ) - parser.add_argument( - "--text-ddf-blocksize", - type=int, - default=256, - help="The block size for chunking jsonl files for text ddf in mb", - ) - parser.add_argument( - "--bucket-mapping-ddf-blocksize", - type=int, - default=256, - help="The block size for for anchor_docs_with_bk ddf in mb", - ) - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results in", - ) - parser.add_argument( - "--parts-per-worker", - default=2, - type=int, - help="The number of parts to process per worker per batch", - ) - parser.add_argument( - "--bucket-parts-per-worker", - default=8, - type=int, - help="The number of bucket parts to process per worker per batch", - ) - return parser - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/__init__.py b/nemo_curator/gpu_deduplication/jaccard_utils/__init__.py deleted file mode 100644 index d9155f923..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
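The batch-size handling in `batched_merge_and_write` (from the `jaccard_shuffle.py` module removed above) is easy to lose among the merge and shuffle details. The following is a minimal sketch of just that retry scheme; `process_batch` is a hypothetical stand-in for the per-batch merge, shuffle, and write work, and the constants mirror the removed code.

```python
def merge_in_batches(num_parts: int, default_batch_size: int, process_batch) -> None:
    """Sketch of the OverflowError retry scheme used by batched_merge_and_write.

    `process_batch(start, end)` is a hypothetical callable doing the real
    merge + shuffle + write for text partitions [start, end).
    """
    retry_batch_size = None  # reduced size while in "retry" mode
    retry_count, retry_threshold = 0, 1
    offset = 0
    while offset < num_parts:
        batch_size = retry_batch_size or default_batch_size
        end = min(offset + batch_size, num_parts)
        try:
            process_batch(offset, end)
        except OverflowError:
            # Too much text in one batch: halve the batch size and retry the
            # same offset without advancing.
            retry_batch_size = batch_size // 2
            if retry_batch_size < 1:
                raise
            continue
        if batch_size == default_batch_size:
            # Success at the default size: leave retry mode entirely.
            retry_count, retry_threshold = 0, 1
        else:
            # Success at a reduced size: only return to the default size after
            # `retry_threshold` consecutive successes, and double the threshold
            # (capped at 16) so that repeated failures back off for longer.
            retry_count += 1
            if retry_count >= retry_threshold:
                retry_batch_size = None
                retry_count, retry_threshold = 0, min(retry_threshold * 2, 16)
        offset += batch_size
```

The effect is that one oversized batch only slows down its own stretch of the data, while sustained success at the reduced size eventually restores the default batch size.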
diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/batch_shuffle_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/batch_shuffle_utils.py deleted file mode 100644 index 755112d0c..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/batch_shuffle_utils.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cudf -import dask_cuda -import numpy as np -from dask import config -from dask.dataframe.shuffle import rearrange_by_column -from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle -from packaging.version import Version - -from nemo_curator.gpu_deduplication.jaccard_utils.get_output_map_utils import ( - build_partition, - get_agg_text_bytes_df, -) - -USE_EXCOMMS = Version(dask_cuda.__version__) >= Version("23.10") - - -def rearange_by_column_direct( - df, - col, - npartitions, - ignore_index, - excomms_default=USE_EXCOMMS, -): - # Execute a "direct" shuffle operation without staging - if config.get("explicit-comms", excomms_default): - # Use explicit comms unless the user has - # disabled it with the dask config system, - # or we are using an older version of dask-cuda - return explicit_comms_shuffle( - df, - [col], - npartitions=npartitions, - ignore_index=ignore_index, - ) - else: - return rearrange_by_column( - df, - col=col, - shuffle="tasks", - # Prevent staged shuffling by setting max_branch - # to the number of input partitions + 1 - max_branch=npartitions + 1, - npartitions=npartitions, - ignore_index=ignore_index, - ) - - -def get_shuffle_part_ids_df(agg_df, partition_on, num_workers=None): - sizes = agg_df[f"{partition_on}_text_bytes"].values - max_text_bytes_per_part = int(np.iinfo(np.int32).max // 1.2) - - # Adjust max_text_bytes_per_part if the number of output - # partitions is small compared to the number of workers. 
- # Sometimes we just have very few output partitions to - # deal with, and just need a larger batch - npartitions_min = int(num_workers * 0.8) - while True: - output_ar = build_partition(sizes.get(), max_text_bytes_per_part) - if output_ar.max() > npartitions_min or max_text_bytes_per_part < 2**24: - break - max_text_bytes_per_part = int(max_text_bytes_per_part // 2.0) - - df = cudf.DataFrame() - df[partition_on] = agg_df[partition_on] - df["_partitions"] = output_ar - return df - - -def get_shuffle_partition_info(df, partition_on, num_workers=None): - df["text_bytes"] = df["text"].map_partitions(lambda s: s.str.byte_count()) - agg_df = get_agg_text_bytes_df(df, partition_on, 1) - del df - - agg_df = agg_df.reset_index(drop=True) - shuffle_part_ids = agg_df.map_partitions( - get_shuffle_part_ids_df, partition_on, num_workers=num_workers - ).persist() - return shuffle_part_ids - - -def text_bytes_aware_shuffle(df, partition_on, num_workers=None): - """ - This shuffle takes into account the text bytes of each partition - and tries to make sure that the output partitions do not exceed - the char limit of cuDF - - Args: - df: dask_cudf dataframe - partition_on: column name to partition on - - - Returns: - dask_cudf dataframe with _partitions columns - """ - print("Starting text bytes aware shuffle", flush=True) - df = df.persist() - shuffle_part_ids = get_shuffle_partition_info( - df, partition_on, num_workers=num_workers - ) - n_output_partitions = shuffle_part_ids["_partitions"].max().compute() + 1 - n_output_partitions = int(n_output_partitions) - df = df.merge(shuffle_part_ids, on=partition_on, how="inner").persist() - - df = ( - rearange_by_column_direct( - df, - col="_partitions", - npartitions=n_output_partitions, - ignore_index=True, - excomms_default=True, - ) - .drop(columns=["_partitions"]) - .persist() - ) - print(f"Will write {len(df)} rows to disk", flush=True) - return df diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/doc_id_mapping.py b/nemo_curator/gpu_deduplication/jaccard_utils/doc_id_mapping.py deleted file mode 100644 index e29c626fe..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/doc_id_mapping.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -def convert_str_id_to_int(df, id_column="id"): - """ - Converts the legacy id format "dataset_name-0000034" - type of ID into 2 int based ID's - """ - dx = df[id_column].str.rsplit("-", n=1, expand=True) - df["doc_id"] = dx[1].astype("int64").values - df["dataset_id"] = dx[0].hash_values() - df.drop(columns=[id_column], inplace=True) - return df - - -def convert_str_pair_adlr_ids_to_int(df): - assert "adlr_id_x" in df.columns - assert "adlr_id_y" in df.columns - - for tag in ["x", "y"]: - dx = df[f"adlr_id_{tag}"].str.rsplit("-", n=1, expand=True) - df[f"dataset_id_{tag}"] = dx[0].astype("uint32").values - df[f"doc_id_{tag}"] = dx[1].astype("int64").values - # See the above convert_adlr_id_to_int function - df = df.drop(columns=[f"adlr_id_{tag}"]) - return df - - -def combine_back_adlr_ids(df): - df["adlr_id"] = df["dataset_id"].astype(str) + "-" + df["doc_id"].astype(str) - df.drop(columns=["dataset_id", "doc_id"], inplace=True) - - if "anchor_0_dataset_id" in df.columns: - df["anchor_0_adlr_id"] = ( - df["anchor_0_dataset_id"].astype(str) - + "-" - + df["anchor_0_doc_id"].astype(str) - ) - df.drop(columns=["anchor_0_dataset_id", "anchor_0_doc_id"], inplace=True) - - if "anchor_1_dataset_id" in df.columns: - df["anchor_1_adlr_id"] = ( - df["anchor_1_dataset_id"].astype(str) - + "-" - + df["anchor_1_doc_id"].astype(str) - ) - df.drop(columns=["anchor_1_dataset_id", "anchor_1_doc_id"], inplace=True) - return df diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/get_anchor_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/get_anchor_utils.py deleted file mode 100644 index ea734dede..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/get_anchor_utils.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def random_select_anchor(df_bk, n=2): - """ - Randomly select `n` anchors from each bucket. - """ - df_bk = df_bk.copy() - df_bk["hash"] = df_bk[["doc_id", "dataset_id"]].hash_values() - df_bk = df_bk.sort_values(["bucket", "hash"]) - df_bk["order_in_bucket"] = df_bk.groupby("bucket").cumcount() - df_bk["is_anchor"] = df_bk["order_in_bucket"] < n - for i in range(0, n): - df_bk[f"is_anchor_id_{i}"] = df_bk["order_in_bucket"] == i - df_bk = df_bk.drop(columns=["hash", "order_in_bucket"], axis=1) - df_bk = df_bk.reset_index(drop=True) - df_bk = df_bk[df_bk.is_anchor] - return df_bk - - -def add_anchor_docs(df_bk): - """ - Get anchor documents for each bucket. 
- """ - num_anchors = 2 - df_anchor_bk = random_select_anchor(df_bk=df_bk, n=num_anchors) - df_anchor_bk_0 = df_anchor_bk[df_anchor_bk["is_anchor_id_0"]][ - ["bucket", "dataset_id", "doc_id"] - ].reset_index(drop=True) - df_anchor_bk_0 = df_anchor_bk_0.rename( - columns={"doc_id": "anchor_0_doc_id", "dataset_id": "anchor_0_dataset_id"} - ) - - df_anchor_bk_1 = df_anchor_bk[df_anchor_bk["is_anchor_id_1"]][ - ["bucket", "dataset_id", "doc_id"] - ].reset_index(drop=True) - df_anchor_bk_1 = df_anchor_bk_1.rename( - columns={"doc_id": "anchor_1_doc_id", "dataset_id": "anchor_1_dataset_id"} - ) - - df_anchor_docs = df_anchor_bk_1.merge(df_anchor_bk_0, on=["bucket"], how="inner") - df_anchor_docs_with_bk = df_bk.merge(df_anchor_docs, on=["bucket"], how="inner") - return df_anchor_docs_with_bk diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/get_output_map_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/get_output_map_utils.py deleted file mode 100644 index bdbdedc6f..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/get_output_map_utils.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cudf -import dask_cudf -import numba -import numpy as np - -from nemo_curator._compat import DASK_SHUFFLE_METHOD_ARG - - -# next-fit-descending bin packing -# https://en.wikipedia.org/wiki/Next-fit-decreasing_bin_packing -@numba.jit(nopython=True) -def build_partition(sizes: np.ndarray, max_size): - i: int = 0 - count: int = 0 - current: int = 0 - size: int = 0 - partition = np.empty(sizes.shape, dtype=np.int32) - for i in range(len(sizes)): - size = sizes[i] - if current + size < max_size: - partition[i] = count - current += size - else: - count += 1 - current = size - partition[i] = count - return partition - - -def update_id(df, lower_bound): - df["output_partition_id"] += lower_bound - return df - - -def get_output_part_ids_with_approx_equal_sum( - bucket_text_bytes_df, max_text_bytes_per_part: int -): - """' - Create a output_series that maps the ser.index into `nparts` - so that the total sum of bucket_val_counts_df - for each output id are all most equal and - less than max_text_bytes_per_part - This is used downstream for creating equal output_ids - """ - sizes = bucket_text_bytes_df["bucket_text_bytes"].values - bucket_output_ar = build_partition(sizes.get(), max_text_bytes_per_part) - df = cudf.DataFrame() - df["bucket"] = bucket_text_bytes_df["bucket"] - df["output_partition_id"] = bucket_output_ar - return df - - -def get_agg_text_bytes_df(df, agg_column, n_partitions, shuffle=False): - shuffle_arg = "shuffle_method" if DASK_SHUFFLE_METHOD_ARG else "shuffle" - agg_df = ( - df[[agg_column, "text_bytes"]] - .groupby([agg_column]) - .agg({"text_bytes": "sum"}, split_out=n_partitions, **{shuffle_arg: shuffle}) - ) - agg_df = agg_df.rename(columns={"text_bytes": f"{agg_column}_text_bytes"}) - agg_df = agg_df.reset_index(drop=False) - # Doing a per partition sort - # seems to 
cause issues with - # jaccard shuffle (Overflow errors) - # which are caught and then - # retried with text_bytes_aware_merge - agg_df = agg_df.persist() - agg_df = agg_df.sort_values( - by=[f"{agg_column}_text_bytes"], ascending=False, ignore_index=True - ) - agg_df = agg_df.persist() - # Added length to force computation - # after persist - print(f"Agg_df computed of length = {len(agg_df)}", flush=True) - return agg_df - - -def get_output_map_from_text_bytes_per_bucket(ddf_bk_text_bytes): - # String bytes limit for cuDF - max_text_bytes_per_part = int(np.iinfo(np.int32).max // 1.2) - print(f"max_text_bytes_per_part = {max_text_bytes_per_part}") - - # Increasing in an attempt to prevent hitting - # ulimits - output_map_df_meta = cudf.DataFrame({"bucket": [0], "output_partition_id": [1]}) - output_map_df_meta["bucket"] = output_map_df_meta["bucket"].astype(np.uint64) - output_map_df_meta["output_partition_id"] = output_map_df_meta[ - "output_partition_id" - ].astype(np.int32) - output_map_df = ddf_bk_text_bytes.map_partitions( - get_output_part_ids_with_approx_equal_sum, - max_text_bytes_per_part, - meta=output_map_df_meta, - ) - output_map_df = output_map_df.persist() - print(f"Step 1 of output_map_df of len: {len(output_map_df)} computed") - lower_bounds = ( - output_map_df["output_partition_id"] - .map_partitions(lambda s: (s.max() + 1)) - .compute() - ) - lower_bounds = np.cumsum(lower_bounds) - - updated_parts = [ - output_map_df.get_partition(i).map_partitions(update_id, lower_bounds[i - 1]) - for i in range(1, len(lower_bounds)) - ] - updated_parts.append(output_map_df.get_partition(0)) - output_map_df = dask_cudf.concat(updated_parts) - output_map_df = output_map_df.persist() - print(f"All steps of output_map_df of len: {len(output_map_df)} computed") - return output_map_df - - -def get_output_map_based_on_str_bytes(ddf_bk, ddf_text): - """ - Add output_partition_id to ddf_bk - """ - print("Getting text bytes", flush=True) - ddf_text["text_bytes"] = ddf_text["text"].map_partitions( - lambda s: s.str.byte_count() - ) - n_partitions = ddf_bk.npartitions - ddf_text = ddf_text.drop(columns=["text"]).repartition(npartitions=n_partitions) - ddf_bk = ddf_bk.merge(ddf_text).repartition(npartitions=n_partitions) - del ddf_text - ddf_bk_text_bytes = get_agg_text_bytes_df( - ddf_bk, - agg_column="bucket", - n_partitions=n_partitions, - shuffle=True, - ) - del ddf_bk - output_map_df = get_output_map_from_text_bytes_per_bucket(ddf_bk_text_bytes) - return output_map_df diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/io_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/io_utils.py deleted file mode 100644 index a24b99dd5..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/io_utils.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from glob import glob - -import cudf -import dask_cudf -import numpy as np -from dask import dataframe as dd - -from nemo_curator.gpu_deduplication.ioutils import bucketed_read, read_json_func -from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( - convert_adlr_id_to_int, -) - - -def get_bucket_ddf_from_parquet_path(input_bucket_path, num_workers): - # Read parquet-formatted parquet files - ddf_bk = dask_cudf.read_parquet( - input_bucket_path, - blocksize="512MiB", - aggregate_files=True, - ) - # Repartition to ensure we at least have num_workers partitions - npartitions = max(ddf_bk.npartitions, num_workers) - ddf_bk = ddf_bk.repartition(npartitions=npartitions) - print(f"Number of ddf_bk partitions = {ddf_bk.npartitions}", flush=True) - return ddf_bk - - -def aggregated_anchor_docs_with_bk_read(path, blocksize): - from dask.utils import natural_sort_key - from pyarrow.dataset import dataset - - ds = dataset( - sorted(glob(f"{path}/*.parquet"), key=natural_sort_key), - format="parquet", - ) - chunks = chunk_files(ds.get_fragments(), blocksize) - - # Record mapping between file indices and partition indices. - # We need to do this, because our anchor_docs_with_bk data - # should be shuffled on disk. - assert len(chunks) - part_id = np.repeat( - np.arange(len(chunks), dtype="int32"), - np.fromiter(map(len, chunks), dtype="int32"), - ) - file_id = np.arange(len(part_id), dtype="int32") - mapping_df = cudf.DataFrame({"file_id": file_id, "part_id": part_id}) - - meta = cudf.DataFrame.from_arrow(ds.schema.empty_table()) - return dd.from_map(cudf.read_parquet, chunks, meta=meta), mapping_df - - -def get_text_ddf_from_json_path(input_data_paths, num_files, files_per_input_partition): - data_paths = [ - entry.path for data_path in input_data_paths for entry in os.scandir(data_path) - ] - data_paths = [f for f in data_paths if f.endswith(".jsonl")] - if num_files != -1: - data_paths = data_paths[:num_files] - meta_df = cudf.DataFrame( - { - "text": ["x"], - "adlr_id": ["x"], - } - ) - print( - f"Number of files being read for jaccard shuffling= {len(data_paths)}", - flush=True, - ) - - text_ddf = bucketed_read( - data_paths, - b_size=files_per_input_partition, - columns=list(meta_df.columns), - meta=meta_df, - ) - text_ddf = text_ddf.map_partitions( - convert_adlr_id_to_int, - meta=cudf.DataFrame({"text": ["a"], "doc_id": [0], "dataset_id": np.uint32(1)}), - ) - return text_ddf - - -def get_file_size(file_path): - return os.path.getsize(file_path) - - -def get_frag_size(frag): - # Pyarrow dataset fragment - return sum(rg.total_byte_size for rg in frag.row_groups) - - -def chunk_files(file_list, max_size_mb): - """ - Chunk files into lists of files that are less than max_size_mb - """ - - max_size_bytes = max_size_mb * 1024 * 1024 - chunks = [] - current_chunk = [] - current_size = 0 - - for frag_or_path in file_list: - if isinstance(frag_or_path, str): - file_path = frag_or_path - file_size = get_file_size(file_path) - else: - file_path = frag_or_path.path - file_size = get_frag_size(frag_or_path) - - if current_size + file_size <= max_size_bytes: - current_chunk.append(file_path) - current_size += file_size - else: - # Handle case when the first - # file is larger than max_size_mb - if current_chunk: - chunks.append(current_chunk) - current_chunk = [file_path] - current_size = file_size - - if current_chunk: - chunks.append(current_chunk) - - return chunks - - -def get_text_ddf_from_json_path_with_blocksize(input_data_paths, num_files, blocksize): - data_paths = 
[ - entry.path for data_path in input_data_paths for entry in os.scandir(data_path) - ] - data_paths = [f for f in data_paths if f.endswith(".jsonl")] - data_paths.sort() - if num_files != -1: - data_paths = data_paths[:num_files] - meta_df = cudf.DataFrame( - { - "text": ["x"], - "adlr_id": ["x"], - } - ) - print( - f"Number of files being read for jaccard calculation = {len(data_paths)}", - flush=True, - ) - filepaths_ls = chunk_files(data_paths, blocksize) - text_ddf = dd.from_map( - read_json_func, filepaths_ls, columns=list(meta_df.columns), meta=meta_df - ) - text_ddf = text_ddf.map_partitions( - convert_adlr_id_to_int, - meta=cudf.DataFrame({"text": ["a"], "doc_id": [0], "dataset_id": np.uint32(1)}), - ) - return text_ddf - - -def get_restart_offsets(output_path): - bucket_offset, text_offset = 0, 0 - fn = f"{output_path}/_restart_offset.txt" - if os.path.exists(fn): - with open(fn, "r") as f: - offsets = f.readline().strip("\n").split(",") - bucket_offset = int(offsets[0]) - text_offset = int(offsets[1]) - return bucket_offset, text_offset - - -def update_restart_offsets(output_path, bucket_offset, text_offset): - with open(f"{output_path}/_restart_offset.txt", "w") as f: - f.write(f"{bucket_offset},{text_offset}\n") diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/jaccard_similarity_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/jaccard_similarity_utils.py deleted file mode 100644 index be3f4d7b9..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/jaccard_similarity_utils.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
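The `get_restart_offsets` / `update_restart_offsets` pair above persists shuffle progress as a tiny `_restart_offset.txt` checkpoint so an interrupted jaccard shuffle can resume. A self-contained sketch of that round trip, with standalone copies of the two helpers and hypothetical offsets:

```python
# Standalone copies of the two checkpoint helpers above, exercised with
# hypothetical offsets in a temporary directory.
import os
import tempfile

def update_restart_offsets(output_path, bucket_offset, text_offset):
    with open(f"{output_path}/_restart_offset.txt", "w") as f:
        f.write(f"{bucket_offset},{text_offset}\n")

def get_restart_offsets(output_path):
    bucket_offset, text_offset = 0, 0
    fn = f"{output_path}/_restart_offset.txt"
    if os.path.exists(fn):
        with open(fn, "r") as f:
            offsets = f.readline().strip("\n").split(",")
            bucket_offset, text_offset = int(offsets[0]), int(offsets[1])
    return bucket_offset, text_offset

with tempfile.TemporaryDirectory() as tmp:
    print(get_restart_offsets(tmp))   # (0, 0) before any checkpoint exists
    update_restart_offsets(tmp, bucket_offset=3, text_offset=48)
    print(get_restart_offsets(tmp))   # (3, 48) after a partial run
```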
- -import cudf -import numpy as np - - -def compute_jaccard_partition(df): - df["jaccard"] = df["text_x"].str.jaccard_index(df["text_y"], width=5) - df.drop(columns=["text_x", "text_y"], inplace=True) - return df - - -def get_max_num_rows_to_process_once(df): - nbytes = df["text"].str.byte_count().sum() - # Number of exmploded bytes - exploded_bytes = nbytes * 5 * 2 - max_chars_allowed = 2_147_483_647 - byte_ratio = int(exploded_bytes) // max_chars_allowed - if byte_ratio > 1: - nrows_at_once = len(df) // byte_ratio - else: - nrows_at_once = len(df) - - nrows_at_once = max(1, nrows_at_once) - return nrows_at_once - - -def create_empty_jaccard_result(): - df = cudf.DataFrame() - df["adlr_id_x"] = "x" - df["adlr_id_y"] = "y" - df["jaccard"] = np.empty(shape=0, dtype=np.float32) - return df - - -def compute_jaccard_pair(docs_df, anchor_df): - nrows_at_once = get_max_num_rows_to_process_once(docs_df) - result_ls = [] - for i in range(0, docs_df.shape[0], nrows_at_once): - pair_df = docs_df[i : i + nrows_at_once] - pair_df = pair_df.merge(anchor_df, on="anchor_adlr_id") - pair_df = pair_df.rename( - columns={"adlr_id": "adlr_id_x", "anchor_adlr_id": "adlr_id_y"} - ) - mask = pair_df.adlr_id_x != pair_df.adlr_id_y - pair_df = pair_df[mask].reset_index(drop=True) - if len(pair_df) == 0: - result_df = create_empty_jaccard_result() - else: - result_df = compute_jaccard_partition(pair_df) - result_ls.append(result_df) - if len(result_ls) == 0: - return create_empty_jaccard_result() - df_pair = cudf.concat(result_ls) - return df_pair - - -def get_anchor_df(df, anchor_col): - anchor_df = df[df["adlr_id"] == df[anchor_col]] - anchor_df = anchor_df.reset_index(drop=True) - anchor_df = anchor_df[[anchor_col, "text"]] - anchor_df = anchor_df.rename(columns={anchor_col: "anchor_adlr_id"}) - return anchor_df - - -def compute_jaccard_and_create_pair_df(df): - df = df.drop_duplicates( - subset=["adlr_id", "anchor_1_adlr_id", "anchor_0_adlr_id"], ignore_index=True - ) - anchor_columns = ["anchor_0_adlr_id", "anchor_1_adlr_id"] - result_ls = [] - try: - for anchor_col in anchor_columns: - doc_df = df[["adlr_id", "text", anchor_col]] - doc_df = doc_df.rename(columns={anchor_col: "anchor_adlr_id"}) - doc_df = doc_df[doc_df["adlr_id"] != doc_df["anchor_adlr_id"]] - anchor_df = get_anchor_df(df, anchor_col) - result_df = compute_jaccard_pair(doc_df, anchor_df) - result_ls.append(result_df) - - return cudf.concat(result_ls) - except OverflowError as e: - print( - "Failed with OverflowError in compute_jaccard_and_create_pair_df", - flush=True, - ) - print(df, flush=True) - print("--" * 30) - print("Error") - print("---" * 30) - raise e diff --git a/nemo_curator/gpu_deduplication/prepare_fuzzy_ids.py b/nemo_curator/gpu_deduplication/prepare_fuzzy_ids.py deleted file mode 100644 index b06601b8d..000000000 --- a/nemo_curator/gpu_deduplication/prepare_fuzzy_ids.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
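`compute_jaccard_partition` above relies on cuDF's `Series.str.jaccard_index(..., width=5)`, i.e. Jaccard similarity over sets of character 5-grams. A pure-Python sketch of that metric, for illustration only (it is not the cuDF implementation):

```python
# Pure-Python illustration of character 5-gram Jaccard similarity, the metric
# computed on the GPU by cudf's Series.str.jaccard_index(other, width=5).
def char_ngrams(text: str, n: int = 5) -> set:
    if len(text) < n:
        return {text}
    return {text[i : i + n] for i in range(len(text) - n + 1)}

def jaccard(a: str, b: str, n: int = 5) -> float:
    x, y = char_ngrams(a, n), char_ngrams(b, n)
    return len(x & y) / len(x | y)

print(jaccard("the quick brown fox jumps", "the quick brown fox leaps"))  # high
print(jaccard("the quick brown fox jumps", "lorem ipsum dolor sit amet"))  # ~0.0
```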
- -import argparse -import json - -import cudf -from dask import dataframe as dd -from dask.distributed import Client - - -def main(args): - # Create the ID mapping - df = cudf.DataFrame() - df["base_id"] = [base_id for base_id in args.base_ids.split(",")] - df["dataset_id"] = df["base_id"].hash_values() - df_pd = df.to_pandas() - - output_dict = { - hashed_id: base_id - for base_id, hashed_id in zip(df_pd["base_id"], df_pd["dataset_id"]) - } - - # Write out the mapping to disk - with open(args.output_id_mapping, "w") as output_file: - json.dump(output_dict, output_file) - - # Index the parquet files by group - client = Client() - ddf = dd.read_parquet(args.path_to_connected_components) - ddf = ddf.set_index("group") - ddf.to_parquet(args.output_indexed_connected_components) - - -def attach_args( - parser=argparse.ArgumentParser( - """ -Prepares the output connected components from dedup for -extraction to .txt and .jsonl files - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) -): - parser.add_argument( - "--base-ids", - type=str, - default="doc_id", - help="A comma-delimited list of base-ids that were used for " - "different datasets during dedup. For example, " - "if you were deduplicating Wikipedia and Common Crawl, you might " - "have adlr_ids such has wiki-000001 and cc-000001. " - "The base-ids in this case would be 'wiki,cc'", - ) - parser.add_argument( - "--path-to-connected-components", - type=str, - default=None, - help="Path to the connected components that is created " - "at the last step of the fuzzy dedup.", - ) - parser.add_argument( - "--output-indexed-connected-components", - type=str, - default=None, - help="Path to the output connected components " - "that have been prepared for " - "extraction to .txt and .jsonl files", - ) - parser.add_argument( - "--output-id-mapping", - type=str, - default="mapping.json", - help="A mapping between each of the strings specified " - "in '--base-ids' and their respective hashes", - ) - return parser - - -if __name__ == "__main__": - main(attach_args().parse_args()) - - -def console_script(): - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/utils.py b/nemo_curator/gpu_deduplication/utils.py deleted file mode 100644 index f6faefe77..000000000 --- a/nemo_curator/gpu_deduplication/utils.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
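The `mapping.json` written by `prepare_fuzzy_ids` above is simply `{hashed_dataset_id: base_id}`. The sketch below only illustrates that shape; it substitutes `zlib.crc32` for cuDF's `hash_values`, so the hash values produced by the real script will differ.

```python
# Shape of the id mapping written above. zlib.crc32 stands in for cudf's
# hash_values purely for illustration; real values will differ.
import json
from zlib import crc32

base_ids = ["wiki", "cc"]  # e.g. --base-ids wiki,cc
mapping = {crc32(base_id.encode()): base_id for base_id in base_ids}

# json.dumps serializes the integer keys as strings:
# {"<hash of 'wiki'>": "wiki", "<hash of 'cc'>": "cc"}
print(json.dumps(mapping, indent=2))
```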
- -import argparse -from time import time - - -def get_num_workers(client): - """ - Returns the number of workers in the cluster - """ - worker_list = list(client.scheduler_info()["workers"].keys()) - return len(worker_list) - - -def get_list_of_lists(lst, nchunks): - """ - Splits a list into nchunks lists - """ - return [lst[i::nchunks] for i in range(nchunks)] - - -def parse_nc_args( - description="Default gpu dedup nemo_curator argument parser", -) -> argparse.ArgumentParser: - """ - Adds default set of arguments that are common to multiple stages - of the pipeline - """ - parser = argparse.ArgumentParser( - description, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "--input-data-dirs", - type=str, - nargs="+", - default=None, - required=False, - help="Input directories consisting of .jsonl files that are accessible " - "to all nodes. This path must be accessible by all machines in the cluster", - ) - parser.add_argument( - "--scheduler-address", - type=str, - default=None, - help="Address to the scheduler of a created dask cluster. If not provided" - "a single node LocalCUDACluster will be started.", - ) - parser.add_argument( - "--scheduler-file", - type=str, - default=None, - help="Path to the scheduler file of a created dask cluster. If not provided" - " a single node LocalCUDACluster will be started.", - ) - parser.add_argument( - "--rmm-pool-size", - type=str, - default=None, - help="Initial pool size to use for the RMM Pool Memory allocator" - "Note: This only applies to the localCUDACluster. If providing an user created " - "cluster refer to" - "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-rmm-pool-size", # noqa: E501 - ) - parser.add_argument( - "--protocol", - type=str, - default="tcp", - help="Protcol to use for dask cluster" - "Note: This only applies to the localCUDACluster. If providing an user created " - "cluster refer to" - "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-protocol", # noqa: E501 - ) - parser.add_argument( - "--nvlink-only", - action="store_true", - help="Start a local cluster with only NVLink enabled." - "Only applicable when protocol=ucx and no scheduler file/address is specified", - ) - parser.add_argument( - "--input-json-text-field", - type=str, - default="text", - help="The name of the field within each json object of the jsonl " - "file that contains the text from which minhashes will be computed. ", - ) - parser.add_argument( - "--input-json-id-field", - type=str, - default="adlr_id", - help="The name of the field within each json object of the jsonl " - "file that assigns a unqiue ID to each document. " - "Can be created by running the script " - "'./prospector/add_id.py' which adds the field 'adlr_id' " - "to the documents in a distributed fashion", - ) - parser.add_argument( - "--log-dir", - type=str, - default="./logs/", - help="The output log directory where node and local", - ) - parser.add_argument( - "--files-per-partition", - type=int, - default=2, - help="Number of jsonl files to combine into single partition", - ) - parser.add_argument( - "--num-files", - type=int, - default=None, - help="Upper limit on the number of json files to process", - ) - parser.add_argument( - "--log-frequency", - type=int, - default=500, - help="The frequency with which to write log messages when " - "computing MinHashses. 
By default a log message will " - "be written every 500 partitions", - ) - parser.add_argument( - "--profile-path", - type=str, - default=None, - help="Path to save dask profile", - ) - return parser - - -def timer(func): - - def wrapper(*args, **kw): - print(f"function {func.__name__} started...") - start = time() - res = func(*args, **kw) - duration = time() - start - timing = f"function {func.__name__} finished in {duration:.1f} seconds" - print(timing) - return res - - return wrapper diff --git a/nemo_curator/gpu_deduplication/verify_all_pairs_jaccard.py b/nemo_curator/gpu_deduplication/verify_all_pairs_jaccard.py deleted file mode 100644 index ae7e6c656..000000000 --- a/nemo_curator/gpu_deduplication/verify_all_pairs_jaccard.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from functools import partial -from time import time - -import cudf -import dask_cudf - -from nemo_curator.gpu_deduplication.jaccard_utils.jaccard_similarity_utils import ( - compute_jaccard_partition, - create_empty_jaccard_result, -) -from nemo_curator.gpu_deduplication.utils import get_client, parse_nc_args - - -def num_ngram(ds): - return ds.str.character_ngrams(5, True).list.unique().list.len() - - -def write_eligible_pairs(dedup_with_text_path, cache_dir): - df = cudf.read_parquet(dedup_with_text_path) - df["num_ngram"] = num_ngram(df["text"]) - df.drop(columns="text", inplace=True) - df["group"] = 0 - B = 8_000 - rm = 0 - for s in range(0, df.shape[0], B): - e = min(s + B, df.shape[0]) - da = df.iloc[s:e] - db = da.merge(df, on="group") - mask = db["adlr_id_x"] < db["adlr_id_y"] - db = db[mask] - mask = (db["num_ngram_x"] < db["num_ngram_y"] * 0.8) | ( - db["num_ngram_y"] < db["num_ngram_x"] * 0.8 - ) - print(db.shape, mask.sum()) - rm += mask.sum() - db = db[~mask] - db.drop(columns=["group", "num_ngram_x", "num_ngram_y"], inplace=True) - db.to_parquet(f"{cache_dir}/pair_{s}.parquet") - del da, db - print("total pairs removed", rm) - - -def merge_text(df, dedup_with_text_path): - dg = cudf.read_parquet(dedup_with_text_path) - for i in "xy": - df = df.merge(dg, left_on=f"adlr_id_{i}", right_on="adlr_id") - df.drop(columns="adlr_id", inplace=True) - return df - - -def get_max_num_rows_to_process_once(df): - nbytes = max( - df["text_x"].str.byte_count().sum(), df["text_y"].str.byte_count().sum() - ) - - # TODO: fix below - # to 4x - exploded_bytes = nbytes * 5 * 4 - max_chars_allowed = 2_147_483_647 - byte_ratio = int(exploded_bytes) // max_chars_allowed - if byte_ratio > 1: - nrows_at_once = len(df) // byte_ratio - else: - nrows_at_once = len(df) - - nrows_at_once = max(1, nrows_at_once) - return nrows_at_once - - -def compute_jaccard_pair(docs_df): - nrows_at_once = get_max_num_rows_to_process_once(docs_df) - result_ls = [] - for i in range(0, docs_df.shape[0], nrows_at_once): - pair_df = docs_df[i : i + nrows_at_once] - if len(pair_df) == 0: - result_df = create_empty_jaccard_result() - else: - result_df = 
compute_jaccard_partition(pair_df) - result_ls.append(result_df) - if len(result_ls) == 0: - return create_empty_jaccard_result() - df_pair = cudf.concat(result_ls) - return df_pair - - -def run_verify_all_pairs_jaccard(dedup_with_text_path, cache_dir, output_dir): - ddf = dask_cudf.read_parquet(f"{cache_dir}/pair_*.parquet") - ddf = ddf.repartition(npartitions=2048) - - meta_df = cudf.DataFrame( - { - "adlr_id_x": [0], - "adlr_id_y": [0], - "text_x": ["x"], - "text_y": ["x"], - } - ) - - ddf = ddf.map_partitions( - partial(merge_text, dedup_with_text_path=dedup_with_text_path), meta=meta_df - ) - - meta_df = cudf.DataFrame( - { - "adlr_id_x": [0], - "adlr_id_y": [0], - "jaccard": [1.0], - } - ) - - ddf = ddf.map_partitions(compute_jaccard_pair, meta=meta_df) - mask = ddf["jaccard"] > 0.8 - dup_pairs = ddf[mask].compute() - print("# of duplicated pairs with jaccard>0.8", dup_pairs.shape[0]) - dup_pairs.to_parquet(f"{output_dir}/duplicated_pairs.parquet") - - -def main(args): - start = time() - description = """Verify correctness of deduped results by calculating all pairs""" - dedup_with_text_path = f"{args.output_dir}/dedup_with_text.parquet" - - write_eligible_pairs(dedup_with_text_path, args.cache_dir) - client = get_client(args) - - # Run actual computation - run_verify_all_pairs_jaccard( - dedup_with_text_path, - args.cache_dir, - args.output_dir, - ) - print(f"All done in {time()-start:.1f} seconds") - - -def attach_args(parser=None): - description = """verify all pairs jaccard""" - if not parser: - parser = parse_nc_args(description=description) - - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results to", - ) - parser.add_argument( - "--cache-dir", - type=str, - help="The cache directory to write intermediate results to", - ) - return parser - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/write_deduped_result_with_text.py b/nemo_curator/gpu_deduplication/write_deduped_result_with_text.py deleted file mode 100644 index 155c56bc2..000000000 --- a/nemo_curator/gpu_deduplication/write_deduped_result_with_text.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
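The `num_ngram` pre-filter in `write_eligible_pairs` above is safe to apply because the Jaccard score is bounded by the ratio of the two documents' unique 5-gram counts: |A ∩ B| ≤ min(|A|, |B|) and |A ∪ B| ≥ max(|A|, |B|). A pair where one count falls below 0.8× the other can therefore never clear the 0.8 threshold tested at the end. A small sketch of that bound with hypothetical counts:

```python
# Why write_eligible_pairs can drop pairs whose unique 5-gram counts differ by
# more than 20%: the Jaccard score is bounded above by min/max of those counts,
# so such pairs can never exceed the 0.8 threshold checked later.
def jaccard_upper_bound(num_ngram_x: int, num_ngram_y: int) -> float:
    return min(num_ngram_x, num_ngram_y) / max(num_ngram_x, num_ngram_y)

print(jaccard_upper_bound(700, 1000))   # 0.7 -> pair can be pruned
print(jaccard_upper_bound(900, 1000))   # 0.9 -> pair must be verified
```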
- -from functools import partial - -import cudf - -from nemo_curator.gpu_deduplication.jaccard_utils.io_utils import ( - get_text_ddf_from_json_path, -) -from nemo_curator.gpu_deduplication.utils import parse_nc_args - - -def merge_text_partition(df, connected_components_path): - res = cudf.read_parquet(connected_components_path).drop(columns="dataset_id") - res = res.drop_duplicates("group") - res = res.drop(columns=["group"]) - df = res.merge(df, on="doc_id", how="left") - df = df.rename(columns={"doc_id": "adlr_id"}) - return df.drop(columns="dataset_id") - - -def write_result_text_parquet(original_path, output_dir): - ddf = get_text_ddf_from_json_path( - original_path, num_files=-1, files_per_input_partition=10 - ) - - connected_components_path = f"{output_dir}/connected_components.parquet" - print(ddf.head()) - merge_func = partial( - merge_text_partition, connected_components_path=connected_components_path - ) - ddf = ddf.map_partitions(merge_func, meta={"adlr_id": "uint32", "text": "O"}) - - mask = ddf.text.isnull() - ddf = ddf[~mask] - - df = ddf.compute() - df = df.reset_index(drop=True) - df.to_parquet(f"{output_dir}/dedup_with_text.parquet") - - -def main(args): - write_result_text_parquet( - original_path=[args.original_path], output_dir=args.output_dir - ) - - -def attach_args(parser=None): - description = """verify all pairs jaccard""" - if not parser: - parser = parse_nc_args(description=description) - - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results to", - ) - parser.add_argument( - "--original-path", - type=str, - help="The path of original jsonl files", - ) - return parser - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - args = attach_args().parse_args() diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index b51499678..ac72e53d9 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -34,11 +34,6 @@ from tqdm import tqdm from nemo_curator.datasets import DocumentDataset -from nemo_curator.gpu_deduplication.jaccard_utils.merge_utils import ( - extract_partitioning_index, - filter_text_rows_by_bucket_batch, - merge_left_to_shuffled_right, -) from nemo_curator.log import create_logger from nemo_curator.utils.distributed_utils import ( get_current_client, @@ -51,6 +46,11 @@ get_restart_offsets, update_restart_offsets, ) +from nemo_curator.utils.fuzzy_dedup_utils.merge_utils import ( + extract_partitioning_index, + filter_text_rows_by_bucket_batch, + merge_left_to_shuffled_right, +) from nemo_curator.utils.fuzzy_dedup_utils.output_map_utils import ( build_partition, get_agg_text_bytes_df, diff --git a/nemo_curator/scripts/find_exact_duplicates.py b/nemo_curator/scripts/find_exact_duplicates.py index 16173861d..af1f127a4 100644 --- a/nemo_curator/scripts/find_exact_duplicates.py +++ b/nemo_curator/scripts/find_exact_duplicates.py @@ -18,12 +18,12 @@ import dask_cudf from nemo_curator.datasets import DocumentDataset -from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep -from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.log import create_logger from nemo_curator.modules import ExactDuplicates from nemo_curator.utils.distributed_utils import get_client, read_data from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.fuzzy_dedup_utils.io_utils import strip_trailing_sep +from nemo_curator.utils.script_utils import 
parse_gpu_dedup_args def pre_imports(): @@ -88,7 +88,7 @@ def attach_args(parser=None): description = """Compute Exact duplicates in a given dataset. """ if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--hash-method", type=str, diff --git a/nemo_curator/scripts/fuzzy_deduplication/README.md b/nemo_curator/scripts/fuzzy_deduplication/README.md new file mode 100644 index 000000000..f5a43f405 --- /dev/null +++ b/nemo_curator/scripts/fuzzy_deduplication/README.md @@ -0,0 +1,99 @@ +## Fuzzy Deduplication Steps +This directory consists of scripts that can be invoked directly via the command line for finding fuzzy duplicates from a group of Jsonl files consisting of text & unique ID's that are specifically formatted using the `add_id` script included as a part of NeMo-Curator. + +> [!IMPORTANT] +> The scripts are helper utilities that wrap the fuzzy_dedup API for handling multiple jsonl directories and the id format generated by [add_id](../add_id.py). For most cases we recommend working with the fuzzy_deduplication API directly. + +### Usage +1. Compute Minhashes + - Input: Data Directories + - Output: minhashes.parquet for each data dir. + - Example call: + ```bash + # same as `python compute_minhashes.py` + gpu_compute_minhashes \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --output-minhash-dir /path/to/output_minhashes \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + --minhash-length number_of_hashes \ + --char-ngram char_ngram_size \ + --hash-bytes 4(or 8 byte hashes) \ + --seed 42 \ + --log-dir ./ + # --scheduler-file /path/to/file.json + ``` +2. Buckets (Minhash Buckets) + - Input: Minhash directories + - Output: Buckets.parquet + - Example call: + ```bash + # same as `python minhash_lsh.py` + minhash_buckets \ + --input-data-dirs /path/to/output_minhashes/dir1 /path/to/output_minhashes/dir2 \ + --output-bucket-dir /path/to/dedup_output \ + --input-minhash-field _minhash_signature \ + --input-json-id-field id_column_name \ + --minhash-length number_of_hashes \ + --num-bands num_bands \ + --buckets-per-shuffle 1 `#Value b/w [1-num_bands]. Higher is better but might lead to oom` \ + --log-dir ./ + # --scheduler-file /path/to/file.json + ``` +3. Jaccard Map Buckets + - Input: Buckets.parquet + Data Dir + - Output: anchor_docs_with_bk.parquet + - Example call: + ```bash + # same as `python map_buckets.py` + jaccard_map_buckets \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --input-bucket-dir /path/to/dedup_output/_buckets.parquet \ + --output-dir /path/to/dedup_output \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + # --scheduler-file /path/to/file.json + ``` +4. Jaccard Shuffle + - Input: anchor_docs_with_bk.parquet + Data Dir + - Output: shuffled_docs.parquet + - Example call: + ```bash + # same as `python jaccard_shuffle.py` + jaccard_shuffle \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --input-bucket-mapping-dir /path/to/dedup_output/anchor_docs_with_bk.parquet \ + --output-dir /path/to/dedup_output \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + # --scheduler-file /path/to/file.json + ``` +5. 
Jaccard compute + - Input: Shuffled docs.parquet + - Output: jaccard_similarity_results.parquet + - Example call: + ```bash + # same as `python jaccard_compute.py` + jaccard_compute \ + --shuffled-docs-path /path/to/dedup_output/shuffled_docs.parquet \ + --output-dir /path/to/dedup_output \ + --ngram-size char_ngram_size_for_similarity \ + # --scheduler-file /path/to/file.json + ``` +6. Connected Components + - Input: jaccard_similarity_results.parquet + - Output: connected_components.parquet + - Example call: + ```bash + # same as `python connected_components.py` + gpu_connected_component \ + --jaccard-pairs_path /path/to/dedup_output/jaccard_similarity_results.parquet \ + --output-dir /path/to/dedup_output \ + --cache-dir /path/to/cc_cache \ + --jaccard-threshold 0.8 + # --scheduler-file /path/to/file.json + ``` + +> [!TIP] +> When using these scripts in a multi-node environment (like Slurm, K8's etc.) it is recommended to start up a Dask cluster prior to execution and connect to the existing cluster via the `--scheduler-address` or `--scheduler-file` flag. +> Use the `--help` flag to view all possible CLI options for the scripts and details on what they do. diff --git a/nemo_curator/scripts/fuzzy_deduplication/__init__.py b/nemo_curator/scripts/fuzzy_deduplication/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nemo_curator/scripts/compute_minhashes.py b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py similarity index 94% rename from nemo_curator/scripts/compute_minhashes.py rename to nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py index 044653ceb..832c7c505 100644 --- a/nemo_curator/scripts/compute_minhashes.py +++ b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py @@ -17,8 +17,6 @@ from nemo_curator import MinHash from nemo_curator.datasets import DocumentDataset -from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep -from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.log import create_logger from nemo_curator.utils.distributed_utils import ( get_client, @@ -26,6 +24,8 @@ read_data, ) from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.fuzzy_dedup_utils.io_utils import strip_trailing_sep +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def pre_imports(): @@ -111,7 +111,7 @@ def attach_args(parser=None): -minhash signatures is created. 
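For picking `--minhash-length` and `--num-bands` (and the rows per band they imply) in the steps above, the standard MinHash-LSH estimate is a useful guide: with `b` bands of `r` hashes each, two documents whose true Jaccard similarity is `s` share at least one bucket with probability `1 - (1 - s^r)^b`. A small helper for experimenting, with hypothetical settings rather than script defaults:

```python
# Standard MinHash-LSH collision probability: with num_bands bands of
# rows_per_band hashes each, documents of true Jaccard similarity s land in a
# common bucket with probability 1 - (1 - s**rows_per_band)**num_bands.
# The settings below are hypothetical, not defaults of the scripts above.
def lsh_collision_probability(s: float, num_bands: int, rows_per_band: int) -> float:
    return 1.0 - (1.0 - s**rows_per_band) ** num_bands

for s in (0.5, 0.7, 0.8, 0.9):
    print(s, round(lsh_collision_probability(s, num_bands=20, rows_per_band=13), 4))
```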
This dataframe is written to file after processing """ if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--minhash-length", @@ -149,12 +149,6 @@ def attach_args(parser=None): "Each file is a parquet file that contains two series, the document ids, " "and a series of lists, each list denoting the minhash signature for that document id.", ) - parser.add_argument( - "--device", - type=str, - default="gpu", - help="Type of cluster to start up", - ) return parser diff --git a/nemo_curator/scripts/connected_components.py b/nemo_curator/scripts/fuzzy_deduplication/connected_components.py similarity index 95% rename from nemo_curator/scripts/connected_components.py rename to nemo_curator/scripts/fuzzy_deduplication/connected_components.py index c04f0349d..f232ad100 100644 --- a/nemo_curator/scripts/connected_components.py +++ b/nemo_curator/scripts/fuzzy_deduplication/connected_components.py @@ -15,9 +15,9 @@ import os import time -from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.modules.fuzzy_dedup import ConnectedComponents from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def main(args): @@ -51,7 +51,7 @@ def main(args): def attach_args(parser=None): description = """Computes connected component""" if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--jaccard-pairs-path", diff --git a/nemo_curator/scripts/jaccard_compute.py b/nemo_curator/scripts/fuzzy_deduplication/jaccard_compute.py similarity index 95% rename from nemo_curator/scripts/jaccard_compute.py rename to nemo_curator/scripts/fuzzy_deduplication/jaccard_compute.py index d16e95654..4691ef935 100644 --- a/nemo_curator/scripts/jaccard_compute.py +++ b/nemo_curator/scripts/fuzzy_deduplication/jaccard_compute.py @@ -15,9 +15,9 @@ import os import time -from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity from nemo_curator.utils.distributed_utils import get_client, get_num_workers +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def main(args): @@ -57,7 +57,7 @@ def main(args): def attach_args(parser=None): description = """Computes jaccard similarity""" if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--shuffled-docs-path", diff --git a/nemo_curator/scripts/jaccard_shuffle.py b/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py similarity index 95% rename from nemo_curator/scripts/jaccard_shuffle.py rename to nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py index c01935a61..f0bd555dc 100644 --- a/nemo_curator/scripts/jaccard_shuffle.py +++ b/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py @@ -15,12 +15,12 @@ import os import time -from nemo_curator.gpu_deduplication.utils import get_num_workers, parse_nc_args from nemo_curator.modules.fuzzy_dedup import _Shuffle -from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.distributed_utils import get_client, get_num_workers from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( get_text_ddf_from_json_path_with_blocksize, ) +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def func(): @@ -79,7 +79,7 @@ def attach_args(parser=None): shuffled by 
buckets """ if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--input-bucket-mapping-dir", diff --git a/nemo_curator/scripts/map_buckets.py b/nemo_curator/scripts/fuzzy_deduplication/map_buckets.py similarity index 96% rename from nemo_curator/scripts/map_buckets.py rename to nemo_curator/scripts/fuzzy_deduplication/map_buckets.py index 9e3f71a51..5640d9bd3 100644 --- a/nemo_curator/scripts/map_buckets.py +++ b/nemo_curator/scripts/fuzzy_deduplication/map_buckets.py @@ -15,13 +15,13 @@ import os import time -from nemo_curator.gpu_deduplication.utils import get_num_workers, parse_nc_args from nemo_curator.modules.fuzzy_dedup import _MapBuckets -from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.distributed_utils import get_client, get_num_workers from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( get_bucket_ddf_from_parquet_path, get_text_ddf_from_json_path_with_blocksize, ) +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def get_anchor_and_output_map_info( @@ -73,7 +73,7 @@ def attach_args(parser=None): buckets to a logical partition by using a modified bin packing algorithm. """ if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--input-bucket-dir", type=str, diff --git a/nemo_curator/scripts/minhash_lsh.py b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py similarity index 91% rename from nemo_curator/scripts/minhash_lsh.py rename to nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py index ec206dc10..a0484cf0d 100644 --- a/nemo_curator/scripts/minhash_lsh.py +++ b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py @@ -21,12 +21,10 @@ from nemo_curator import LSH from nemo_curator.datasets import DocumentDataset -from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( - convert_str_id_to_int, -) -from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.log import create_logger from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import convert_str_id_to_int +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def pre_imports(): @@ -85,7 +83,7 @@ def attach_args(parser=None): denoting the bucket id's that document belongs to. """ if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--minhash-length", @@ -111,12 +109,6 @@ def attach_args(parser=None): required=True, help="Number of buckets to shuffle per batch", ) - parser.add_argument( - "--device", - type=str, - default="gpu", - help="Type of cluster to start up", - ) parser.add_argument( "--output-bucket-dir", type=str, diff --git a/nemo_curator/utils/fuzzy_dedup_utils/io_utils.py b/nemo_curator/utils/fuzzy_dedup_utils/io_utils.py index cc6e0909f..105021bda 100644 --- a/nemo_curator/utils/fuzzy_dedup_utils/io_utils.py +++ b/nemo_curator/utils/fuzzy_dedup_utils/io_utils.py @@ -180,3 +180,10 @@ def get_frag_size(frag): def get_file_size(file_path): return os.path.getsize(file_path) + + +def strip_trailing_sep(path: str): + """ + Strips a path string of trailing path seperators like `/` if any. 
+ """ + return path.rstrip(os.path.sep) diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/merge_utils.py b/nemo_curator/utils/fuzzy_dedup_utils/merge_utils.py similarity index 98% rename from nemo_curator/gpu_deduplication/jaccard_utils/merge_utils.py rename to nemo_curator/utils/fuzzy_dedup_utils/merge_utils.py index 08fcea53f..a144b5602 100644 --- a/nemo_curator/gpu_deduplication/jaccard_utils/merge_utils.py +++ b/nemo_curator/utils/fuzzy_dedup_utils/merge_utils.py @@ -22,9 +22,7 @@ from dask.highlevelgraph import HighLevelGraph from dask.utils import M -from nemo_curator.gpu_deduplication.jaccard_utils.batch_shuffle_utils import ( - rearange_by_column_direct, -) +from nemo_curator.utils.fuzzy_dedup_utils.shuffle_utils import rearange_by_column_direct def _split_part(part, nsplits): diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py index 8da562d35..e2811dd1e 100644 --- a/nemo_curator/utils/script_utils.py +++ b/nemo_curator/utils/script_utils.py @@ -42,14 +42,14 @@ def add_distributed_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPa type=str, default=None, help="Address to the scheduler of a created dask cluster. If not provided" - "a single node LocalCUDACluster will be started.", + "a single node Cluster will be started.", ) parser.add_argument( "--scheduler-file", type=str, default=None, help="Path to the scheduler file of a created dask cluster. If not provided" - " a single node LocalCUDACluster will be started.", + " a single node Cluster will be started.", ) parser.add_argument( "--n-workers", @@ -68,7 +68,7 @@ def add_distributed_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPa type=str, default=None, help="Initial pool size to use for the RMM Pool Memory allocator" - "Note: This only applies to the localCUDACluster. If providing an user created " + "Note: This only applies to the LocalCUDACluster. If providing an user created " "cluster refer to" "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-rmm-pool-size", # noqa: E501 ) @@ -96,7 +96,7 @@ def add_distributed_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPa parser.add_argument( "--num-files", type=int, - default=-1, + default=None, help="Upper limit on the number of json files to process", ) parser.add_argument( @@ -109,6 +109,62 @@ def add_distributed_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPa return parser +def parse_gpu_dedup_args( + description="Default gpu dedup nemo_curator argument parser", +) -> argparse.ArgumentParser: + """ + Adds default set of arguments that are common to multiple stages + of the pipeline + """ + parser = argparse.ArgumentParser( + description, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser = add_distributed_args(parser) + + # Set default device to GPU for dedup + parser.set_defaults(device="gpu") + parser.add_argument( + "--input-data-dirs", + type=str, + nargs="+", + default=None, + required=False, + help="Input directories consisting of .jsonl files that are accessible " + "to all nodes. This path must be accessible by all machines in the cluster", + ) + parser.add_argument( + "--input-json-text-field", + type=str, + default="text", + help="The name of the field within each json object of the jsonl " + "file that contains the text from which minhashes will be computed. 
", + ) + parser.add_argument( + "--input-json-id-field", + type=str, + default="adlr_id", + help="The name of the field within each json object of the jsonl " + "file that assigns a unqiue ID to each document. " + "Can be created by running the script " + "'./prospector/add_id.py' which adds the field 'adlr_id' " + "to the documents in a distributed fashion", + ) + parser.add_argument( + "--log-dir", + type=str, + default="./logs/", + help="The output log directory where node and local", + ) + parser.add_argument( + "--profile-path", + type=str, + default=None, + help="Path to save dask profile", + ) + return parser + + def chunk_list(lst, nchnks): nitem = len(lst) splits = splitnum(nitem, nchnks) diff --git a/setup.py b/setup.py index 8fc60e926..91c32a296 100644 --- a/setup.py +++ b/setup.py @@ -89,18 +89,13 @@ "prepare_task_data=nemo_curator.scripts.prepare_task_data:console_script", "find_matching_ngrams=nemo_curator.scripts.find_matching_ngrams:console_script", "remove_matching_ngrams=nemo_curator.scripts.remove_matching_ngrams:console_script", - "gpu_compute_minhashes=nemo_curator.scripts.compute_minhashes:console_script", - "minhash_buckets=nemo_curator.scripts.minhash_lsh:console_script", - "jaccard_map_buckets=nemo_curator.scripts.map_buckets:console_script", - "jaccard_shuffle=nemo_curator.scripts.jaccard_shuffle:console_script", - "jaccard_compute=nemo_curator.scripts.jaccard_compute:console_script", - "gpu_connected_component=nemo_curator.scripts.connected_components:console_script", - "write_deduped_result_with_text=nemo_curator.gpu_deduplication.write_deduped_result_with_text:console_script", - "verify_all_pairs_jaccard=nemo_curator.gpu_deduplication.verify_all_pairs_jaccard:console_script", + "gpu_compute_minhashes=nemo_curator.scripts.fuzzy_deduplication.compute_minhashes:console_script", + "minhash_buckets=nemo_curator.scripts.fuzzy_deduplication.minhash_lsh:console_script", + "jaccard_map_buckets=nemo_curator.scripts.fuzzy_deduplication.map_buckets:console_script", + "jaccard_shuffle=nemo_curator.scripts.fuzzy_deduplication.jaccard_shuffle:console_script", + "jaccard_compute=nemo_curator.scripts.fuzzy_deduplication.jaccard_compute:console_script", + "gpu_connected_component=nemo_curator.scripts.fuzzy_deduplication.connected_components:console_script", "gpu_exact_dups=nemo_curator.scripts.find_exact_duplicates:console_script", - "prepare_fuzzy_ids=nemo_curator.gpu_deduplication.prepare_fuzzy_ids:console_script", - "create_list_of_duplicate_ids=nemo_curator.gpu_deduplication.create_list_of_duplicate_ids:console_script", - "remove_duplicates=nemo_curator.gpu_deduplication.remove_duplicates:console_script", "deidentify=nemo_curator.scripts.find_pii_and_deidentify:console_script", "generate_statistics=nemo_curator.distributed_data_classification.generate_statistics:console_script", "domain_classifier_inference=nemo_curator.distributed_data_classification.domain_classifier_inference:console_script", From b192e92e7966573d816e9f76a3ee9352e0f9f572 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 3 May 2024 08:38:47 -0700 Subject: [PATCH 09/34] Fix lang id example (#37) * Fix lang id example Signed-off-by: Ryan Wolf * Add classifier unit tests Signed-off-by: Ryan Wolf * Add test for failure Signed-off-by: Ryan Wolf * Remove failure test Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- .../identify_languages_and_fix_unicode.py | 2 +- nemo_curator/filters/classifier_filter.py | 6 ++ tests/test_filters.py | 72 ++++++++++++++++++- 3 files 
changed, 78 insertions(+), 2 deletions(-) diff --git a/examples/identify_languages_and_fix_unicode.py b/examples/identify_languages_and_fix_unicode.py index 933c6c231..a95dc6905 100644 --- a/examples/identify_languages_and_fix_unicode.py +++ b/examples/identify_languages_and_fix_unicode.py @@ -60,7 +60,7 @@ def main(args): # Remove the language score filtered_dataset.df[language_field] = filtered_dataset.df[language_field].apply( - lambda score: score[1] + lambda score: score[1], meta=(None, str) ) # Split the dataset by language diff --git a/nemo_curator/filters/classifier_filter.py b/nemo_curator/filters/classifier_filter.py index f32e2ff57..3ade004ec 100644 --- a/nemo_curator/filters/classifier_filter.py +++ b/nemo_curator/filters/classifier_filter.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import dask import fasttext import numpy as np import pandas as pd @@ -75,6 +76,11 @@ def __init__(self, model_path=None, min_langid_score=0.3): self._cutoff = min_langid_score self._name = "lang_id" + # Dask will automatically convert the list score type + # to a string without this option. + # See https://github.com/NVIDIA/NeMo-Curator/issues/33 + dask.config.set({"dataframe.convert-string": False}) + @batched def score_document(self, df): model_attr = f"{self._name}_{self._model_path}" diff --git a/tests/test_filters.py b/tests/test_filters.py index 4ab11c21a..50676f385 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -14,6 +14,8 @@ import os +import dask +import numpy as np import pandas as pd import pytest from dask import dataframe as dd @@ -508,7 +510,7 @@ def test_repeatedparagraphschar(self): def test_repeatingtopngrams(self): dataset = list_to_dataset( [ - "this is a totally fine sentence with no repeating ngrams so we are ok", + "this is a totally fine sentence with no repeat ngrams so we are ok", "a b . a b", "a a a a a a", "totally fine small dupe a b a b", @@ -756,3 +758,71 @@ def test_per_extension_filter(self): assert all_equal( expected_data, filtered_data ), f"Expected {expected_data} but got {filtered_data}" + + +class FakeQualityFilter(DocumentFilter): + """ + Emulates FastTextQualityFilter without a model + """ + + def __init__(self, alpha=3, seed=42): + super().__init__() + self._alpha = alpha + self._seed = np.random.seed(seed) + + @batched + def score_document(self, df): + return pd.Series(np.arange(len(df)) / len(df)) + + @batched + def keep_document(self, df): + return np.random.pareto(self._alpha, size=len(df)) > 1 - df + + +class FakeLangId(DocumentFilter): + """ + Emulates FastTextLangId without a model + """ + + def __init__(self, min_langid_score=0.3, convert_string=False): + super().__init__() + self._cutoff = min_langid_score + + # Dask will automatically convert the list score type + # to a string without this option. 
+ # See https://github.com/NVIDIA/NeMo-Curator/issues/33 + dask.config.set({"dataframe.convert-string": convert_string}) + + @batched + def score_document(self, df): + scores = [[0.5, "EN"], [0.7, "HI"], [0.2, "PT"]] + scores = scores * len(df) + scores = scores[: len(df)] + return pd.Series(scores) + + def keep_document(self, score): + return score[0] >= self._cutoff + + +class TestClassifierFilters: + def test_fake_quality_filter(self): + dataset = list_to_dataset(["a", "b", "c", "d"], npartitions=1) + filters = ScoreFilter(FakeQualityFilter()) + filtered_data = filters(dataset) + + expected_indices = [1, 2, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal( + expected_data, filtered_data + ), f"Expected {expected_data} but got {filtered_data}" + + def test_fake_langid_filter(self): + dataset = list_to_dataset(["a", "b", "c", "d"], npartitions=1) + filters = ScoreFilter(FakeLangId()) + filtered_data = filters(dataset) + + expected_indices = [0, 1, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal( + expected_data, filtered_data + ), f"Expected {expected_data} but got {filtered_data}" From 909f58d144f37669eb56567c027b87f45a59ef55 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 3 May 2024 15:30:48 -0700 Subject: [PATCH 10/34] Add dataset blending tool (#32) * Add initial dataset blending function Signed-off-by: Ryan Wolf * Add blend unit tests Signed-off-by: Ryan Wolf * Add self parameter Signed-off-by: Ryan Wolf * Fix return type of blend dataset Signed-off-by: Ryan Wolf * Fix blending tests Signed-off-by: Ryan Wolf * Change assert statement for very uneven blend Signed-off-by: Ryan Wolf * Fix key error Signed-off-by: Ryan Wolf * Add proper proportion blending test Signed-off-by: Ryan Wolf * Add four dataset blend and clarify docs Signed-off-by: Ryan Wolf * Add shuffle module Signed-off-by: Ryan Wolf * Add blend example and tests Signed-off-by: Ryan Wolf * Fix random method name Signed-off-by: Ryan Wolf * Wrap return type in DocumentDataset Signed-off-by: Ryan Wolf * Save result of column drop Signed-off-by: Ryan Wolf * Change equality check for shuffle tests Signed-off-by: Ryan Wolf * Fix expected order after shuffle Signed-off-by: Ryan Wolf * Add more documents to shuffle test Signed-off-by: Ryan Wolf * Add assert statement Signed-off-by: Ryan Wolf * Add within partition shuffle Signed-off-by: Ryan Wolf * Refactor add rand column for shuffle Signed-off-by: Ryan Wolf * Fix filename tests Signed-off-by: Ryan Wolf * Add determinism handling for shuffle Signed-off-by: Ryan Wolf * Change numpy random function Signed-off-by: Ryan Wolf * Fix tests with new random method Signed-off-by: Ryan Wolf * Remove length call from blending Signed-off-by: Ryan Wolf * Improve scaling of blending function Signed-off-by: Ryan Wolf * Fix blend tests Signed-off-by: Ryan Wolf * Add blending script Signed-off-by: Ryan Wolf * Add additional file paths call Signed-off-by: Ryan Wolf * Add documentation Signed-off-by: Ryan Wolf * Reformat docs Signed-off-by: Ryan Wolf * Remove backticks Signed-off-by: Ryan Wolf * Add context manager for shuffle tests Signed-off-by: Ryan Wolf * Add better deterministic shuffle path Signed-off-by: Ryan Wolf * Update documentation and reset index Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- docs/user-guide/DocumentDataset.rst | 84 +++++++++++ examples/blend_and_shuffle.py | 53 +++++++ nemo_curator/datasets/doc_dataset.py | 2 +- 
nemo_curator/modules/__init__.py | 3 + nemo_curator/modules/dataset_ops.py | 183 ++++++++++++++++++++++++ nemo_curator/scripts/blend_datasets.py | 138 ++++++++++++++++++ setup.py | 1 + tests/test_blend_datasets.py | 103 ++++++++++++++ tests/test_shuffle.py | 186 +++++++++++++++++++++++++ 9 files changed, 752 insertions(+), 1 deletion(-) create mode 100644 examples/blend_and_shuffle.py create mode 100644 nemo_curator/modules/dataset_ops.py create mode 100644 nemo_curator/scripts/blend_datasets.py create mode 100644 tests/test_blend_datasets.py create mode 100644 tests/test_shuffle.py diff --git a/docs/user-guide/DocumentDataset.rst b/docs/user-guide/DocumentDataset.rst index 351e41a95..0086314a9 100644 --- a/docs/user-guide/DocumentDataset.rst +++ b/docs/user-guide/DocumentDataset.rst @@ -137,3 +137,87 @@ In these cases, we recommend processing the input dataset in batches using a sim This will read in 64 shards at a time, process them, and write them back to disk. Like ``get_remaining_files``, it only includes files that are in the input directory and not in the output directory. + +############################ +Blending and Shuffling +############################ + +Blending data from multiple sources can be a great way of improving downstream model performance. +This blending can be done during model training itself (i.e., *online* blending) or it can be done before training (i.e., *offline* blending). +Online blending is useful for rapidly iterating in the training process. +Meanwhile, offline blending is useful if you want to distribute the dataset. +Online blending is currently possible in `NeMo via NVIDIA Megatron Core `_, and NeMo Curator offers a way to perform blending offline. + +Let's take a look at how datasets can be combined using ``nc.blend_datasets`` + +.. code-block:: python + + import nemo_curator as nc + + books = DocumentDataset.read_json("books_dataset/") + articles = DocumentDataset.read_json("articles_dataset/") + journals = DocumentDataset.read_json("journals_dataset/") + + datasets = [books, articles, journals] + target_samples = 1000 + weights = [5.0, 2.0, 1.0] + + blended_dataset = nc.blend_datasets(target_samples, datasets, weights) + + blended_dataset.to_json("blended_dataset/") + + +* ``datasets = [books, articles, journals]`` Here, we are choosing to blend three different datasets. + These datasets do not have to be in the same file format, or similar in size. + So long as they can be read in as a DocumentDataset, they will be fine. + The samples from each dataset are always drawn "in order". + The precise order depends on the format. + For sharded jsonl files, the entries at the beginning of the file with the first name in sorted order will be chosen first. +* ``target_samples = 1000`` This is the desired number of samples in the resulting dataset. + By sample, we mean document or just generally a single datapoint. + There may end up being more samples in the dataset depending on the weights. +* ``weights = [5.0, 2.0, 1.0]`` The relative number of samples that should be taken from each dataset. + Given these weights, the blended dataset will have five times as many samples from books as there are samples from journals. + Similarly, there will be two times as many samples from articles when compared to samples from journals. + Weights can be a list of non-negative real numbers. + ``nc.blend_datasets`` will do the normalization and combine the normalized weights with the target samples to determine + how many samples should be taken from each dataset. 
+ In the case of the books dataset, the following would be the calculation. + + .. math:: + + \lceil target\_samples \cdot w_i\rceil=\lceil 1000\cdot \frac{5}{8}\rceil=625 + If any datasets have fewer samples than the calculated weight, they will be oversampled to meet the quota. + For example, if the books dataset only had 500 documents in it, the first 125 would be repeated to achieve + the 625 samples. +* ``blended_dataset = nc.blend_datasets(target_samples, datasets, weights)`` We now call the function itself. + Afterwards, we are left with a blended dataset that we can operate on like any other dataset. + We can apply filters, deduplicate, or classify the documents. + +Because blending datasets involves combining data from multiple sources, the sharding of the original datasets +cannot be preserved. The options ``add_filename=True`` and ``write_to_filename=True`` for reading and writing +datasets are therefore incompatible with ``nc.blend_datasets``. + + +Shuffling can be another important aspect of dataset management. +NeMo Curator's ``nc.Shuffle`` allows users to reorder all entries in the dataset. + +Here is a small example on how this can be done: + +.. code-block:: python + + import nemo_curator as nc + + books = DocumentDataset.read_json("books_dataset/") + + shuffle = nc.Shuffle(seed=42) + + shuffled_books = shuffle(books) + + shuffled_books.to_json("shuffled_books/") + +* ``shuffle = nc.Shuffle(seed=42)`` This creates a shuffle operation that can be chained with + the various other modules in NeMo Curator. In this example, we fix the seed to be 42. + Setting the seed will guarantee determinism, but may be slightly slower (20-30% slower) + depending on the dataset size. +* ``shuffled_books = shuffle(books)`` The dataset has now been shuffled, and we can save it to the filesystem. diff --git a/examples/blend_and_shuffle.py b/examples/blend_and_shuffle.py new file mode 100644 index 000000000..e070d5d2a --- /dev/null +++ b/examples/blend_and_shuffle.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
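The per-dataset sample counts implied by those weights can be checked directly with the same ceiling formula; the snippet below uses the hypothetical 5 : 2 : 1 weights and 1000-sample target from the docs above.

```python
# Reproduces the sampling arithmetic from the blending docs above:
# ceil(target_samples * w_i / sum(w)) samples are drawn from dataset i.
import math

def samples_per_dataset(target_samples: int, weights: list[float]) -> list[int]:
    total = sum(weights)
    return [math.ceil(target_samples * w / total) for w in weights]

print(samples_per_dataset(1000, [5.0, 2.0, 1.0]))  # [625, 250, 125]
```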
+ +import argparse + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import add_distributed_args + + +def main(args): + # Params + dataset_paths = ["/path/to/first", "/path/to/second", "/path/to/third"] + dataset_weights = [5.0, 2.0, 1.0] + target_size = 1000 + output_path = "/path/to/output" + + # Set up Dask client + client = get_client(args, args.device) + + # Blend the datasets + datasets = [DocumentDataset.read_json(path) for path in dataset_paths] + blended_dataset = nc.blend_datasets(target_size, datasets, dataset_weights) + + shuffle = nc.Shuffle(seed=42) + blended_dataset = shuffle(blended_dataset) + + # Save the blend + blended_dataset.to_json(output_path) + + +def attach_args( + parser=argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ), +): + return add_distributed_args(parser) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py index 37592b188..a97aa1969 100644 --- a/nemo_curator/datasets/doc_dataset.py +++ b/nemo_curator/datasets/doc_dataset.py @@ -24,7 +24,7 @@ class DocumentDataset: Internally it may be distributed across multiple nodes, and may be on GPUs. """ - def __init__(self, dataset_df): + def __init__(self, dataset_df: dd.DataFrame): self.df = dataset_df def __len__(self): diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index 434ebecf4..0867942d8 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -22,6 +22,7 @@ from nemo_curator.utils.import_utils import gpu_only_import_from from .add_id import AddId +from .dataset_ops import blend_datasets, Shuffle from .exact_dedup import ExactDuplicates from .filter import Filter, Score, ScoreFilter from .meta import Sequential @@ -50,4 +51,6 @@ "Sequential", "TaskDecontamination", "AddId", + "blend_datasets", + "Shuffle", ] diff --git a/nemo_curator/modules/dataset_ops.py b/nemo_curator/modules/dataset_ops.py new file mode 100644 index 000000000..38589b1e9 --- /dev/null +++ b/nemo_curator/modules/dataset_ops.py @@ -0,0 +1,183 @@ +import math +from typing import Any, Callable, List, Optional + +import dask.dataframe as dd +import numpy as np + +from nemo_curator.datasets.doc_dataset import DocumentDataset + + +def default_filename(partition_num: int) -> str: + return f"file_{partition_num:010d}.jsonl" + + +class Shuffle: + def __init__( + self, + seed: Optional[int] = None, + npartitions: Optional[int] = None, + partition_to_filename: Callable[[int], str] = default_filename, + ) -> None: + """ + Randomly permutes the dataset. This will make the original "filename" column invalid, so if the column is present it will be overwritten. + Args: + seed: The random seed that will be used to determine which partition (file) each datapoint goes to. + Setting the seed will guarantee determinism, but may be slightly slower (20-30% slower) + depending on the dataset size. + npartitions: The output number of partitions to create in the dataset. + If None, it will retain the same number of partitions as the original dataset. + partition_to_filename: If the filename column is present, it will be overwritten. + Passing a function in through this argument allows the user to configure what the filename + will look like given the partition number. 
The default method names the partition + f'file_{partition_num:010d}.jsonl' and should be changed if the user is not using a .jsonl format. + """ + self.seed = seed + self.npartitions = npartitions + self.partition_to_filename = partition_to_filename + self.rand_col = "_shuffle_rand" + + def __call__(self, dataset: DocumentDataset) -> DocumentDataset: + if self.seed is None: + return self.shuffle_nondeterministic(dataset) + else: + return self.shuffle_deterministic(dataset) + + def shuffle_deterministic(self, dataset: DocumentDataset) -> DocumentDataset: + new_npartitions = ( + dataset.df.npartitions if self.npartitions is None else self.npartitions + ) + + dataset.df[self.rand_col] = dataset.df.map_partitions(self._add_rand_col) + + shuffled_df = dataset.df.set_index(self.rand_col, npartitions=new_npartitions) + shuffled_df = shuffled_df.reset_index(drop=True) + + if "filename" in shuffled_df: + shuffled_df["filename"] = shuffled_df.map_partitions(self._add_filename) + + return DocumentDataset(shuffled_df) + + def shuffle_nondeterministic(self, dataset: DocumentDataset) -> DocumentDataset: + new_npartitions = ( + dataset.df.npartitions if self.npartitions is None else self.npartitions + ) + + dataset.df[self.rand_col] = dataset.df.map_partitions(self._add_rand_col) + + shuffled_df = dataset.df.shuffle( + self.rand_col, npartitions=new_npartitions, ignore_index=True + ) + shuffled_df = shuffled_df.drop(columns=[self.rand_col]) + shuffled_df = shuffled_df.map_partitions(self._partition_shuffle) + + return DocumentDataset(shuffled_df) + + def _add_rand_col(self, partition, partition_info=None): + if partition_info is None: + partition_info = { + "number": 0, + } + + if self.seed is not None: + np.random.seed(self.seed + partition_info["number"]) + rand_col = np.random.randint(0, np.iinfo("int64").max, size=len(partition)) + + return rand_col + + def _partition_shuffle(self, partition, partition_info=None): + if partition_info is None: + return partition + + partition_num = partition_info["number"] + if self.seed is not None: + random_state = self.seed + partition_num + else: + random_state = None + + partition = partition.sample(frac=1, random_state=random_state).reset_index( + drop=True + ) + + if "filename" in partition: + filename = self.partition_to_filename(partition_num) + partition["filename"] = filename + + return partition + + def _add_filename(self, partition, partition_info=None): + if partition_info is None: + return ["filename"] * len(partition) + + filename = self.partition_to_filename(partition_info["number"]) + + return [filename for _ in range(len(partition))] + + +def blend_datasets( + target_size: int, datasets: List[DocumentDataset], sampling_weights: List[float] +) -> DocumentDataset: + """ + Combined multiple datasets into one with different amounts of each dataset + Args: + target_size: The number of documents the resulting dataset should have. + The actual size of the dataset may be slightly larger if the normalized weights do not allow + for even mixtures of the datasets. + datasets: A list of all datasets to combine together + sampling_weights: A list of weights to assign to each dataset in the input. Weights will be + normalized across the whole list as a part of the sampling process. For example, if the normalized + sampling weight for dataset 1 is 0.02, 2% ofthe total samples will be sampled from dataset 1. + There are guaranteed to be math.ceil(normalized_weight_i * target_size) elements from dataset i in + the final blend. 
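+    Example (illustrative, with hypothetical dataset names):
+        blend_datasets(100, [dataset_a, dataset_b], [3.0, 2.0]) draws
+        math.ceil(100 * 0.6) = 60 documents from dataset_a and
+        math.ceil(100 * 0.4) = 40 documents from dataset_b.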
+ """ + if len(datasets) != len(sampling_weights): + raise ValueError( + f"Different number of datasets and weights specified. {len(datasets)} datasets and {len(sampling_weights)}" + ) + + weight_sum = sum(sampling_weights) + sampling_weights = [weight / weight_sum for weight in sampling_weights] + num_documents_per_dataset = [ + math.ceil(weight * target_size) for weight in sampling_weights + ] + + blend_components = [] + for dataset, num_documents in zip(datasets, num_documents_per_dataset): + # Repeatedly sample from the dataset + while num_documents > 0: + sample = _partition_head(dataset.df, num_documents) + blend_components.append(sample) + num_documents -= len(sample) + + blended_dataset = dd.concat(blend_components) + + return DocumentDataset(blended_dataset) + + +def _partition_head(ddf: dd.DataFrame, n: int) -> dd.DataFrame: + """ + Returns the first n rows in a dataframe while preserving the partitions. + Meant as a replacement for ddf.head(npartitions=-1, compute=False) as it + uses too much memory at large scales + + Args: + ddf: The dataframe to get the first rows from + n: The number of rows to get + """ + original_meta = ddf.dtypes.to_dict() + partition_lengths = ddf.map_partitions(len) + num_partitions = 0 + total_size = 0 + last_length = 0 + for length in partition_lengths: + total_size += length + num_partitions += 1 + last_length = length + if total_size >= n: + break + + delayed_df = ddf.to_delayed() + excess_elems = max(0, total_size - n) + delayed_df = delayed_df[:num_partitions] + delayed_df[-1] = delayed_df[-1].head(last_length - excess_elems) + + return dd.from_delayed(delayed_df, meta=original_meta) diff --git a/nemo_curator/scripts/blend_datasets.py b/nemo_curator/scripts/blend_datasets.py new file mode 100644 index 000000000..4f0fc253a --- /dev/null +++ b/nemo_curator/scripts/blend_datasets.py @@ -0,0 +1,138 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
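+
+# (Editor's note) Illustrative sketch, not part of the original patch: nc.blend_datasets
+# (used below) fills each dataset's quota by repeatedly taking the head of that dataset,
+# so datasets smaller than their quota are oversampled. The same loop on a plain pandas
+# frame looks like this.
+def _editor_oversampling_sketch():
+    import pandas as pd
+
+    df = pd.DataFrame({"text": ["a", "b", "c"]})
+    quota, parts = 5, []
+    while quota > 0:
+        sample = df.head(quota)
+        parts.append(sample)
+        quota -= len(sample)
+    # Returns ['a', 'b', 'c', 'a', 'b']: the 3-row frame is reused to reach the quota of 5.
+    return pd.concat(parts)["text"].tolist()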
+ +import argparse + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk +from nemo_curator.utils.file_utils import ( + expand_outdir_and_mkdir, + get_all_files_paths_under, +) +from nemo_curator.utils.script_utils import add_distributed_args, attach_bool_arg + + +def main(args): + client = get_client(args, args.device) + + out_dir = expand_outdir_and_mkdir(args.output_data_dir) + + input_dirs = args.input_data_dirs.split(",") + weights = [float(weight) for weight in args.weights.split(",")] + + datasets = [ + DocumentDataset( + read_data( + get_all_files_paths_under(path), + file_type=args.input_file_type, + backend="pandas", + ) + ) + for path in input_dirs + ] + + output_dataset = nc.blend_datasets(args.target_samples, datasets, weights) + + if args.shuffle: + shuffle = nc.Shuffle(seed=args.seed) + output_dataset = shuffle(output_dataset) + + write_to_disk(output_dataset.df, out_dir, output_type=args.output_file_type) + + client.close() + + +def attach_args( + parser=argparse.ArgumentParser( + """ +Blends a collection of datasets together based on certain weights. + +It takes as input a comma-separated list of dataset directories, the +corresponding weights that should be associated with each datatset, +and the target number of samples to aggregate from across all the datasets. +The file shards of the resulting dataset are not guaranteed to be even +or reflect the original dataset(s). + +A blend is created from these datasets and saved to the specified output directory. +Optionally, the user can choose to shuffle this dataset as well. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) +): + parser.add_argument( + "--input-data-dirs", + type=str, + default=None, + help="Comma-separated list of directories consisting of dataset " + "files that are accessible to all nodes.", + ) + parser.add_argument( + "--weights", + type=str, + default=None, + help="Comma-separated list of floating-point weights corresponding " + "to each dataset passed in --input-data-dirs", + ) + parser.add_argument( + "--output-data-dir", + type=str, + default=None, + help="The output directory to where the blended dataset is" + "retained during filtering will be written. If this argument " + "is not specified, then the document scores from the " + "filter(s) will be written to the document meta data in place", + ) + parser.add_argument( + "--target-samples", + type=int, + default=10000, + help="The number of samples to be included in the output dataset." + " There may be more samples in order to accurately reflect the " + "weight balance, but there will never be less", + ) + attach_bool_arg( + parser, + "shuffle", + default=False, + help_str="Shuffles the dataset after blending", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="If specified, the random seed used for shuffling.", + ) + parser.add_argument( + "--input-file-type", + type=str, + default="jsonl", + help="File type of the dataset to be read in. Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + parser.add_argument( + "--output-file-type", + type=str, + default="jsonl", + help="File type the dataset will be written to. 
Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/setup.py b/setup.py index 91c32a296..357e33e51 100644 --- a/setup.py +++ b/setup.py @@ -102,6 +102,7 @@ "quality_classifier_multiple_models_inference=nemo_curator.distributed_data_classification.quality_classifier_multiple_models_inference:console_script", "quality_classifier_inference=nemo_curator.distributed_data_classification.quality_classifier_inference:console_script", "verify_results=nemo_curator.distributed_data_classification.verify_results:console_script", + "blend_datasets=nemo_curator.scripts.blend_datasets:console_script", ], }, ) diff --git a/tests/test_blend_datasets.py b/tests/test_blend_datasets.py new file mode 100644 index 000000000..7c7f7e28b --- /dev/null +++ b/tests/test_blend_datasets.py @@ -0,0 +1,103 @@ +import dask.dataframe as dd +import pandas as pd + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset + + +def list_to_dataset(documents, col_name="text", npartitions=2): + data = {col_name: documents} + pdf = pd.DataFrame(data) + + return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) + + +def all_equal(left_dataset, right_dataset): + left_result = left_dataset.df.compute() + right_result = right_dataset.df.compute() + + l_cols = set(left_result.columns) + r_cols = set(right_result.columns) + assert l_cols == r_cols + for col in left_result.columns: + left = left_result[col].reset_index(drop=True) + right = right_result[col].reset_index(drop=True) + assert all(left == right), f"Mismatch in {col} column.\n{left}\n{right}\n" + + +class TestBlending: + def test_blend_as_original(self): + first_dataset = list_to_dataset(["one", "two", "three"]) + result_dataset = nc.blend_datasets(len(first_dataset), [first_dataset], [1.0]) + all_equal(first_dataset, result_dataset) + + def test_equal_blend(self): + first_dataset = list_to_dataset(["a", "a"]) + second_dataset = list_to_dataset(["b", "b"]) + result_dataset = nc.blend_datasets( + 2, [first_dataset, second_dataset], [0.5, 0.5] + ) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 2 + assert counts["a"] == 1 + assert counts["b"] == 1 + + def test_equal_blend_with_weights(self): + first_dataset = list_to_dataset(["a", "a"]) + second_dataset = list_to_dataset(["b", "b"]) + result_dataset = nc.blend_datasets( + 2, [first_dataset, second_dataset], [2.0, 2.0] + ) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 2 + assert counts["a"] == 1 + assert counts["b"] == 1 + + def test_uneven_blend(self): + first_dataset = list_to_dataset(["a", "a"]) + second_dataset = list_to_dataset(["b", "b"]) + result_dataset = nc.blend_datasets( + 4, [first_dataset, second_dataset], [3.0, 1.0] + ) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 4 + assert counts["a"] == 3 + assert counts["b"] == 1 + + def test_very_uneven_blend(self): + first_dataset = list_to_dataset(["a", "a"]) + second_dataset = list_to_dataset(["b", "b"]) + result_dataset = nc.blend_datasets( + 4, [first_dataset, second_dataset], [1.0, 0.0] + ) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 4 + assert counts["a"] == 4 + assert "b" not in counts + + def test_proper_uneven_blend(self): + first_dataset = list_to_dataset(["a", "b", "c", "d"]) + 
second_dataset = list_to_dataset(["e", "f"]) + result_dataset = nc.blend_datasets( + 8, [first_dataset, second_dataset], [1.0, 0.0] + ) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 8 + assert counts["a"] == 2 + assert counts["b"] == 2 + assert counts["c"] == 2 + assert counts["d"] == 2 + + def test_four_dataset_blend(self): + datasets = [] + datasets.append(list_to_dataset(["a", "a"])) + datasets.append(list_to_dataset(["b", "b", "b"])) + datasets.append(list_to_dataset(["c"])) + datasets.append(list_to_dataset(["d", "d", "d", "d"])) + result_dataset = nc.blend_datasets(8, datasets, [1.0, 2.0, 3.0, 4.0]) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 10 + assert counts["a"] == 1 + assert counts["b"] == 2 + assert counts["c"] == 3 + assert counts["d"] == 4 diff --git a/tests/test_shuffle.py b/tests/test_shuffle.py new file mode 100644 index 000000000..a23d47906 --- /dev/null +++ b/tests/test_shuffle.py @@ -0,0 +1,186 @@ +import dask.dataframe as dd +import pandas as pd +from dask.distributed import Client, LocalCluster + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset + + +def list_to_dataset(documents, col_name="text", npartitions=2): + data = {col_name: documents} + pdf = pd.DataFrame(data) + + return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) + + +def all_equal(left_dataset, right_dataset): + left_result = left_dataset.df.compute() + right_result = right_dataset.df.compute() + + l_cols = set(left_result.columns) + r_cols = set(right_result.columns) + assert l_cols == r_cols + for col in left_result.columns: + left = left_result[col].reset_index(drop=True) + right = right_result[col].reset_index(drop=True) + assert all(left == right), f"Mismatch in {col} column.\n{left}\n{right}\n" + + +class TestShuffleNondeterministic: + def test_shuffle(self): + # Single threaded Dask is the only way to guarantee shuffle determinism + # Docs: https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.shuffle.html + with LocalCluster(n_workers=1, threads_per_worker=1) as cluster: + with Client(cluster): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"] + ) + expected_dataset = list_to_dataset( + ["two", "five", "three", "one", "four"] + ) + shuffle = nc.Shuffle(seed=42) + result_dataset = shuffle.shuffle_nondeterministic(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_new_partitions(self): + with LocalCluster(n_workers=1, threads_per_worker=1) as cluster: + with Client(cluster): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=3 + ) + expected_dataset = list_to_dataset( + ["two", "five", "three", "one", "four"], npartitions=3 + ) + shuffle = nc.Shuffle(seed=42, npartitions=2) + result_dataset = shuffle.shuffle_nondeterministic(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_filename(self): + with LocalCluster(n_workers=1, threads_per_worker=1) as cluster: + with Client(cluster): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=1 + ) + original_dataset.df["filename"] = "original.jsonl" + + expected_data = { + "text": ["one", "two", "three", "five", "four"], + "filename": [ + "file_0000000000.jsonl", + "file_0000000000.jsonl", + "file_0000000000.jsonl", + "file_0000000001.jsonl", + "file_0000000001.jsonl", + ], + } + pdf = pd.DataFrame(expected_data) + expected_dataset = 
DocumentDataset(dd.from_pandas(pdf, npartitions=2)) + + shuffle = nc.Shuffle(seed=42, npartitions=2) + result_dataset = shuffle.shuffle_nondeterministic(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_custom_filenames(self): + with LocalCluster(n_workers=1, threads_per_worker=1) as cluster: + with Client(cluster): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=1 + ) + original_dataset.df["filename"] = "original.jsonl" + + expected_data = { + "text": ["one", "two", "three", "five", "four"], + "filename": [ + "my_0.test", + "my_0.test", + "my_0.test", + "my_1.test", + "my_1.test", + ], + } + pdf = pd.DataFrame(expected_data) + expected_dataset = DocumentDataset(dd.from_pandas(pdf, npartitions=2)) + + def filename_fn(x): + return f"my_{x}.test" + + shuffle = nc.Shuffle( + seed=42, npartitions=2, partition_to_filename=filename_fn + ) + result_dataset = shuffle.shuffle_nondeterministic(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_shuffle_no_seed(self): + original_dataset = list_to_dataset(["one", "two", "three", "four", "five"]) + shuffle = nc.Shuffle() + result_dataset = shuffle(original_dataset) + assert len(result_dataset.df.compute()) == 5 + + +class TestShuffleDeterministic: + def test_shuffle(self): + original_dataset = list_to_dataset(["one", "two", "three", "four", "five"]) + expected_dataset = list_to_dataset(["five", "four", "three", "one", "two"]) + shuffle = nc.Shuffle(seed=42) + result_dataset = shuffle(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_new_partitions(self): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=3 + ) + expected_dataset = list_to_dataset( + ["four", "three", "five", "one", "two"], npartitions=3 + ) + shuffle = nc.Shuffle(seed=42, npartitions=2) + result_dataset = shuffle(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_filename(self): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=1 + ) + original_dataset.df["filename"] = "original.jsonl" + + expected_data = { + "text": ["four", "five", "three", "one", "two"], + "filename": [ + "file_0000000000.jsonl", + "file_0000000001.jsonl", + "file_0000000001.jsonl", + "file_0000000001.jsonl", + "file_0000000001.jsonl", + ], + } + pdf = pd.DataFrame(expected_data) + expected_dataset = DocumentDataset(dd.from_pandas(pdf, npartitions=2)) + + shuffle = nc.Shuffle(seed=42, npartitions=2) + result_dataset = shuffle(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_custom_filenames(self): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=1 + ) + original_dataset.df["filename"] = "original.jsonl" + + expected_data = { + "text": ["four", "five", "three", "one", "two"], + "filename": [ + "my_0.test", + "my_1.test", + "my_1.test", + "my_1.test", + "my_1.test", + ], + } + pdf = pd.DataFrame(expected_data) + expected_dataset = DocumentDataset(dd.from_pandas(pdf, npartitions=2)) + + def filename_fn(x): + return f"my_{x}.test" + + shuffle = nc.Shuffle(seed=42, npartitions=2, partition_to_filename=filename_fn) + result_dataset = shuffle(original_dataset) + all_equal(expected_dataset, result_dataset) From 0bab063159943951599d9fd8f23415875bb3be0c Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Fri, 3 May 2024 16:31:43 -0700 Subject: [PATCH 11/34] High level fuzzy duplicates module (#46) * Initial pass at fuzzy 
dedup api Signed-off-by: Ayush Dattagupta * Update deprecated shuffle arg Signed-off-by: Ayush Dattagupta * dask_cuda gpu only import Signed-off-by: Ayush Dattagupta * Move fuzzy_dedup imports to optional Signed-off-by: Ayush Dattagupta * more tests Signed-off-by: Ayush Dattagupta * Move FuzzyDeDupConfig to it's own class Signed-off-by: Ayush Dattagupta * Add example script and config file, fix typo Signed-off-by: Ayush Dattagupta * Remove slurm examples for gpu dedup Signed-off-by: Ayush Dattagupta * Add config module Signed-off-by: Ayush Dattagupta * Rename FuzzyDeDupConfig and minhash_length to FuzzyDuplicatesConfig, num_hashes Signed-off-by: Ayush Dattagupta * Add comments and update example Signed-off-by: Ayush Dattagupta * Write to same format as input in fuzzy dedup example Signed-off-by: Ayush Dattagupta --------- Signed-off-by: Ayush Dattagupta Signed-off-by: Nicole Luo --- config/fuzzy_dedup_config.yaml | 16 ++ examples/fuzzy_deduplication.py | 109 ++++++++++ examples/gpu_deduplication_example/README.md | 29 --- examples/gpu_deduplication_example/batch.sh | 38 ---- .../create-list-of-exact-duplicate-ids.sh | 53 ----- .../create-list-of-fuzzy-duplicate-ids.sh | 66 ------ .../remove-duplicates.sh | 52 ----- .../gpu_deduplication_example/run-buckets.sh | 29 --- examples/gpu_deduplication_example/run-cc.sh | 26 --- .../gpu_deduplication_example/run-jaccard.sh | 16 -- .../gpu_deduplication_example/run-minhash.sh | 42 ---- .../gpu_deduplication_example/run-shuffle.sh | 35 ---- .../gpu_deduplication_example/run-workflow.sh | 70 ------- nemo_curator/modules/__init__.py | 6 + nemo_curator/modules/config.py | 100 +++++++++ nemo_curator/modules/fuzzy_dedup.py | 182 ++++++++++++++-- .../fuzzy_deduplication/minhash_lsh.py | 2 +- tests/test_config.py | 81 ++++++++ tests/test_fuzzy_dedup.py | 195 +++++++++++++++++- 19 files changed, 670 insertions(+), 477 deletions(-) create mode 100644 config/fuzzy_dedup_config.yaml create mode 100644 examples/fuzzy_deduplication.py delete mode 100644 examples/gpu_deduplication_example/README.md delete mode 100644 examples/gpu_deduplication_example/batch.sh delete mode 100644 examples/gpu_deduplication_example/create-list-of-exact-duplicate-ids.sh delete mode 100644 examples/gpu_deduplication_example/create-list-of-fuzzy-duplicate-ids.sh delete mode 100644 examples/gpu_deduplication_example/remove-duplicates.sh delete mode 100644 examples/gpu_deduplication_example/run-buckets.sh delete mode 100644 examples/gpu_deduplication_example/run-cc.sh delete mode 100644 examples/gpu_deduplication_example/run-jaccard.sh delete mode 100644 examples/gpu_deduplication_example/run-minhash.sh delete mode 100644 examples/gpu_deduplication_example/run-shuffle.sh delete mode 100755 examples/gpu_deduplication_example/run-workflow.sh create mode 100644 nemo_curator/modules/config.py create mode 100644 tests/test_config.py diff --git a/config/fuzzy_dedup_config.yaml b/config/fuzzy_dedup_config.yaml new file mode 100644 index 000000000..a513a72f8 --- /dev/null +++ b/config/fuzzy_dedup_config.yaml @@ -0,0 +1,16 @@ +cache_dir: "./fuzzy_dedup_cache" +# Optional Params below with default values +# profile_dir: null +# id_field: "id" +# text_field: "text" + +# seed: 42 +# char_ngrams: 5 +# num_buckets: 20 +# hashes_per_bucket: 13 +# use_64_bit_hash: false +# buckets_per_shuffle: 1 + +# false_positive_check: True +# num_anchors: 2 +# jaccard_threshold: 0.8 diff --git a/examples/fuzzy_deduplication.py b/examples/fuzzy_deduplication.py new file mode 100644 index 000000000..d74fd775c --- 
/dev/null +++ b/examples/fuzzy_deduplication.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time + +import dask +from dask import dataframe as dd + +from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import get_client, write_to_disk +from nemo_curator.utils.script_utils import add_distributed_args + + +def pre_imports(): + import cudf # noqa: F401 + + +def main(args): + + dataset_dir = "/path/to/dataset" + log_dir = "./" + cache_dir = "./fuzzy_cache" + output_dir = "./output" + dataset_id_field = "id" + dataset_text_field = "text" + + filetype = "parquet" + + # Fuzzy dup calculation only supports the cuDF/GPU backend + backend = "cudf" + assert args.device == "gpu" + + with dask.config.set({"dataframe.backend": backend}): + client = get_client(args, args.device) + client.run(pre_imports) + + t0 = time.time() + if filetype == "parquet": + input_dataset = DocumentDataset( + dd.read_parquet( + dataset_dir, + columns=[dataset_id_field, dataset_text_field], + blocksize="256MiB", + aggregate_files=True, + ) + ) + elif filetype == "jsonl": + input_dataset = DocumentDataset.read_json( + dataset_dir, + backend=backend, + ) + + fuzzy_dedup_config = FuzzyDuplicatesConfig( + cache_dir=cache_dir, + id_field=dataset_id_field, + text_field=dataset_text_field, + seed=42, + char_ngrams=5, + num_buckets=20, + hashes_per_bucket=13, + use_64_bit_hash=False, + buckets_per_shuffle=5, + false_positive_check=True, + num_anchors=2, + jaccard_threshold=0.8, + ) + fuzzy_dup = FuzzyDuplicates(logger=log_dir, config=fuzzy_dedup_config) + duplicates = fuzzy_dup(dataset=input_dataset) + + # By default all duplicate id's and the group they belong to are included in the result + # keep 1 document from each group of duplcates and mark the others to remove + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html + docs_to_remove = duplicates.df.map_partitions( + lambda x: x[x.group.duplicated(keep="first")] + ) + + # When there are few duplicates we can compute the results to a list and use `isin`. 
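+    # (Editor's note) If the duplicate list is too large to collect with .compute(), a
+    # lazy alternative is an anti-join on the ID column (e.g., a left merge followed by
+    # filtering out the matched rows), at the cost of an extra shuffle.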
+ result = input_dataset.df[ + ~input_dataset.df[dataset_id_field].isin( + docs_to_remove[dataset_id_field].compute() + ) + ] + write_to_disk(result, output_dir, output_type=filetype) + print(time.time() - t0) + + +def attach_args( + parser=argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ), +): + return add_distributed_args(parser) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/examples/gpu_deduplication_example/README.md b/examples/gpu_deduplication_example/README.md deleted file mode 100644 index 2f294e1f6..000000000 --- a/examples/gpu_deduplication_example/README.md +++ /dev/null @@ -1,29 +0,0 @@ -### Deduplication Steps - -> [!CAUTION] -> The examples references here are outdated and will be replaced with an example using the Python API directly. For more details on the scripts refer to [nemo_curator/scripts/fuzzy_deduplication](/nemo_curator/scripts/fuzzy_deduplication) - -1. Exact dedup - 1. Input: Data directories - 2. Output: exact_duplicates.parquet. List of exact duplicates and the document hash. - -Fuzzy Dedup -1. Minhashes (Compute minhashes) - 1. Input: Data Directories - 2. Output: minhashes.parquet for each data dir. -2. Buckets (Minhash Buckets) - 1. Input: Minhash directories - 2. Output: Buckets.parquet -3. Jaccard Map Buckets + Jaccard shuffle - 1. Input: Buckets.parquet + Data Dir - 2. Output: Shuffled docs.parquet -4. Jaccard compute - 1. Input: Shuffled docs.parquet - 2. Output: dedup_final_results.parquet -5. Connected Components - 1. Input: Dedup_final_Results.parquet - 2. Output: connected_components.parquet - - -While calling the main `run-workflow.sh` script that points to these runscripts users can also set the relevant `LIBCUDF_CUFILE_POLICY`. -It is reccomended to set `LIBCUDF_CUFILE_POLICY=OFF` for all runs calling the script. diff --git a/examples/gpu_deduplication_example/batch.sh b/examples/gpu_deduplication_example/batch.sh deleted file mode 100644 index eca7145c8..000000000 --- a/examples/gpu_deduplication_example/batch.sh +++ /dev/null @@ -1,38 +0,0 @@ -#! /bin/bash - -#SBATCH --job-name=nemo-data-curator:gpu-deduplication -#SBATCH --nodes=8 -#SBATCH --exclusive -#SBATCH --time=04:00:00 - -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# -# This script can be used for running both exact and fuzzy document-level -# deduplication using Dask and cuDF -# - -base_dir=`pwd` # Assumes base dir is top-level dir of repo -RUNSCRIPT=${RUNSCRIPT:-${base_dir}/examples/gpu_deduplication_example/run-minhash.sh} -LIBCUDF_CUFILE_POLICY=${LIBCUDF_CUFILE_POLICY:-OFF} -echo $RUNSCRIPT - -docker_image='nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.03' -mounts="${base_dir}:${base_dir}" - -srun -l \ - --container-mounts=${mounts} \ - --container-image=${docker_image} \ - bash -c "echo ${RUNSCRIPT};echo ${LIBCUDF_CUFILE_POLICY}; LIBCUDF_CUFILE_POLICY=${LIBCUDF_CUFILE_POLICY} RUNSCRIPT=${RUNSCRIPT} bash ${base_dir}/examples/gpu_deduplication_example/run-workflow.sh" diff --git a/examples/gpu_deduplication_example/create-list-of-exact-duplicate-ids.sh b/examples/gpu_deduplication_example/create-list-of-exact-duplicate-ids.sh deleted file mode 100644 index 757629e33..000000000 --- a/examples/gpu_deduplication_example/create-list-of-exact-duplicate-ids.sh +++ /dev/null @@ -1,53 +0,0 @@ -#! /bin/bash - -#SBATCH --job-name=nemo-data-curator:create-exact-dup-id-list -#SBATCH --nodes=1 -#SBATCH --exclusive -#SBATCH --time=0:30:00 - -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -eux - -## Log and intermediate results dirs -base_dir=`pwd` -src_dir="${base_dir}/workspace/nemo-data-curator" -log_dir=${src_dir}/workspace/log/create_exact_dup_id_list -res_dir=${src_dir}/workspace/data/create_exact_dup_id_list -conf_dir=${src_dir}/workspace/config -mkdir -p ${log_dir} ${res_dir} ${conf_dir} - -## Container related variables -docker_image="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11" -mounts="${base_dir}:${base_dir}" - -## Set relevant filepath -input_id_list_dir= - -srun -l \ - --mpi=pmix \ - --output=${log_dir}/create_exact_dup_id_list_%j.out \ - --error=${log_dir}/create_exact_dup_id_list_%j.err \ - --container-image=${docker_image} \ - --container-mounts=${mounts} \ - create_list_of_duplicate_ids \ - --input-id-list-dir=${input_id_list_dir} \ - --input-bucket-key="_hashes" \ - --output-id-list-dir=${res_dir}/exact_dup_ids \ - --output-bucket-list-dir=${res_dir}/buckets \ - --log-dir=${log_dir}/create_exact_dup_id_list - -# Concatenate the extracted list of ids -cat ${res_dir}/exact_dup_ids/*.txt > ${res_dir}/exact_duplicate_id_list.txt diff --git a/examples/gpu_deduplication_example/create-list-of-fuzzy-duplicate-ids.sh b/examples/gpu_deduplication_example/create-list-of-fuzzy-duplicate-ids.sh deleted file mode 100644 index 70b0d13bd..000000000 --- a/examples/gpu_deduplication_example/create-list-of-fuzzy-duplicate-ids.sh +++ /dev/null @@ -1,66 +0,0 @@ -#! /bin/bash - -#SBATCH --job-name=nemo-data-curator:create-fuzzy-dup-id-list -#SBATCH --nodes=1 -#SBATCH --exclusive -#SBATCH --time=0:30:00 - -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -eux - -## Log and intermediate results dirs -base_dir=`pwd` -src_dir="${base_dir}/workspace/nemo-data-curator" -log_dir=${src_dir}/workspace/log/create_fuzzy_dup_id_list -res_dir=${src_dir}/workspace/data/create_fuzzy_dup_id_list -conf_dir=${src_dir}/workspace/config -mkdir -p ${log_dir} ${res_dir} ${conf_dir} - -## Container related variables -docker_image="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11" -mounts="${base_dir}:${base_dir}" - -## Set relevant filepath -input_id_list_dir= - -# Generate the mapping and prepare the connected components -srun -l \ - --nodes=1 \ - --output=${log_dir}/create_fuzzy_dup_id_list_%j.out \ - --error=${log_dir}/create_fuzzy_dup_id_list_%j.err \ - --container-image=${docker_image} \ - --container-mounts=${mounts} \ - prepare_fuzzy_ids \ - --path-to-connected-components=${input_id_list_dir} \ - --output-indexed-connected-components=${res_dir}/indexed_connected_components.parquet \ - --output-id-mapping=${res_dir}/mapping.json - -srun -l \ - --mpi=pmix \ - --output=${log_dir}/create_fuzzy_dup_id_list_%j.out \ - --error=${log_dir}/create_fuzzy_dup_id_list_%j.err \ - --container-image=${docker_image} \ - --container-mounts=${mounts} \ - create_list_of_duplicate_ids \ - --input-id-list-dir=${res_dir}/indexed_connected_components.parquet \ - --input-bucket-key="group" \ - --id-mapping=${res_dir}/mapping.json \ - --output-id-list-dir=${res_dir}/fuzzy_dup_ids \ - --output-bucket-list-dir=${res_dir}/buckets \ - --log-dir=${log_dir}/create_fuzzy_dup_id_list - -# Concatenate the extracted list of ids -cat ${res_dir}/fuzzy_dup_ids/*.txt > ${res_dir}/fuzzy_duplicate_id_list.txt diff --git a/examples/gpu_deduplication_example/remove-duplicates.sh b/examples/gpu_deduplication_example/remove-duplicates.sh deleted file mode 100644 index 275c9f153..000000000 --- a/examples/gpu_deduplication_example/remove-duplicates.sh +++ /dev/null @@ -1,52 +0,0 @@ -#! /bin/bash - -#SBATCH --job-name=nemo-data-curator:remove-duplicates -#SBATCH --nodes=10 -#SBATCH --exclusive -#SBATCH --time=01:00:00 - -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -set -eux - -## Log and intermediate results dirs -base_dir=`pwd` -src_dir="${base_dir}/workspace/nemo-data-curator" -log_dir=${src_dir}/workspace/log/remove_duplicates -res_dir=${src_dir}/workspace/data/remove_duplicates -conf_dir=${src_dir}/workspace/config -mkdir -p ${log_dir} ${res_dir} ${conf_dir} - -## Container related variables -docker_image="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11" -mounts="${base_dir}:${base_dir}" - -## Set relevant filepaths -input_data_dir="" -input_id_list="" -output_data_dir="" -fname=$(basename ${input_id_list}) -tag=$(basename $fname .txt) - -srun -l \ - --output=${log_dir}/remove_duplicates_${tag}_%j.out \ - --error=${log_dir}/remove_duplicates_${tag}_%j.err \ - --container-image=${docker_image} \ - --container-mounts=${mounts} \ - remove_duplicates \ - --input-data-dir=${input_data_dir} \ - --input-id-list=${input_id_list} \ - --output-deduped-dir=${output_data_dir}/all_deduped \ - --log-dir=${log_dir}/all_deduped_${tag} diff --git a/examples/gpu_deduplication_example/run-buckets.sh b/examples/gpu_deduplication_example/run-buckets.sh deleted file mode 100644 index 7ca1d1021..000000000 --- a/examples/gpu_deduplication_example/run-buckets.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -minhash_dir="/outputdir/minhashes" -datasets=$(ls ${minhash_dir}) -for dataset in $datasets; do - input_minhash_dirs="$input_minhash_dirs $minhash_dir/$dataset/minhashes.parquet" -done -output_dir="/outputdir" - -buckets_per_shuffle=1 - -mkdir -p $output_dir -echo $input_minhash_dirs - -# Remove old buckets -rm -r ${output_dir}/buckets.parquet - -python -u minhash_buckets.py \ - --input-data-dirs $input_minhash_dirs \ - --minhash-length 260 \ - --output-bucket-dir $output_dir/ \ - --log-dir $LOGDIR \ - --protocol ucx \ - --num-bands 20 \ - --buckets-per-shuffle=$buckets_per_shuffle \ - --split-out=512 \ - --scheduler-file $LOGDIR/scheduler.json - -echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-cc.sh b/examples/gpu_deduplication_example/run-cc.sh deleted file mode 100644 index ab0c62108..000000000 --- a/examples/gpu_deduplication_example/run-cc.sh +++ /dev/null @@ -1,26 +0,0 @@ - -base_dir="/outputdir" -cc_folder="CC" -output_dir="${base_dir}/${cc_folder}_output" -cache_dir="${base_dir}/${cc_folder}_cache" -jaccard_pairs_path="/outputdir/dedup_final_results.parquet" - - -echo "output_dir set to $output_dir" -echo "cache_dir set to $cache_dir" - -export RAPIDS_NO_INITIALIZE="1" -export CUDF_SPILL="1" - -### compute connected component -#rm -r $cache_dir -mkdir -p $output_dir $cache_dir - -python -u connected_component.py \ - --jaccard-pairs-path $jaccard_pairs_path \ - --output-dir $output_dir \ - --cache-dir $cache_dir \ - --log-dir $LOGDIR \ - --profile-path $PROFILESDIR \ - --num-files $NUM_FILES \ - --scheduler-file $LOGDIR/scheduler.json diff --git a/examples/gpu_deduplication_example/run-jaccard.sh b/examples/gpu_deduplication_example/run-jaccard.sh deleted file mode 100644 index 6ee51d302..000000000 --- a/examples/gpu_deduplication_example/run-jaccard.sh +++ /dev/null @@ -1,16 +0,0 @@ - -shuffled_docs_dir="/outputdir/shuffled_docs.parquet" -output_dir="/outputdir" - - -export CUDF_SPILL="1" - -python jaccard_compute.py \ - --shuffled-docs-path $shuffled_docs_dir \ - --output-dir $output_dir \ - --log-dir $LOGDIR \ - --num-files $NUM_FILES \ - --scheduler-file $LOGDIR/scheduler.json - - -echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-minhash.sh 
b/examples/gpu_deduplication_example/run-minhash.sh deleted file mode 100644 index 79e069cdb..000000000 --- a/examples/gpu_deduplication_example/run-minhash.sh +++ /dev/null @@ -1,42 +0,0 @@ -#! /bin/bash - -# Assumes each directory contains Jsonl files -input_data_dirs="/datadir/dataset1/ \ -/datadir/dataset2/ \ -/datadir/dataset3/" - -output_dir="/outputdir/minhashes" - -# NOTE: The script implicitly assumes that the last part -# of the input data paths is the dataset name and will choose -# output dir names as follows: -# /outputdir/minhashes/dataset1 -# /outputdir/minhashes/dataset2 -# /outputdir/minhashes/dataset3 -# This can cause issues if the last part of the -# dirname is the same across datasets - -mkdir -p $output_dir - -# Is a good number for files 200MB or lesser -# Use a smaller value for larger jsonl files -files_per_partition=20 - -mkdir -p $output_dir -echo $input_data_dirs - -python -u compute_minhashes.py \ - --input-data-dirs $input_data_dirs \ - --minhash-length 260 \ - --char-ngram 5 \ - --hash-bytes 4 \ - --seed 42 \ - --output-minhash-dir $output_dir \ - --log-dir $LOGDIR \ - --num-files $NUM_FILES \ - --files-per-partition $files_per_partition \ - --profile-path $PROFILESDIR \ - --log-frequency 250 \ - --scheduler-file $LOGDIR/scheduler.json - -echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-shuffle.sh b/examples/gpu_deduplication_example/run-shuffle.sh deleted file mode 100644 index e559dbbb1..000000000 --- a/examples/gpu_deduplication_example/run-shuffle.sh +++ /dev/null @@ -1,35 +0,0 @@ -input_data_dirs="/datadir/dataset1/ \ -/datadir/dataset2/ \ -/datadir/dataset3/" -buckets_dir="/outputdir/buckets.parquet" -output_dir="/outputdir" - - -export CUDF_SPILL="1" - -## Run jaccard Mapping -echo "Starting Jaccard mapping..." -python jaccard_map_buckets.py \ - --input-bucket-dir $buckets_dir \ - --input-data-dirs $input_data_dirs \ - --output-dir $output_dir \ - --log-dir $LOGDIR \ - --text-ddf-blocksize 512 \ - --num-files $NUM_FILES \ - --scheduler-file $LOGDIR/scheduler.json - -### Run jaccard Shuffle - -echo "Starting Jaccard Shuffle..." - -python jaccard_shuffle.py \ - --input-bucket-mapping-dir $output_dir/anchor_docs_with_bk.parquet \ - --input-data-dirs $input_data_dirs \ - --output-dir $output_dir \ - --text-ddf-blocksize 256 \ - --bucket-mapping-ddf-blocksize 512 \ - --num-files $NUM_FILES \ - --parts-per-worker 1 \ - --scheduler-file $LOGDIR/scheduler.json - -echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-workflow.sh b/examples/gpu_deduplication_example/run-workflow.sh deleted file mode 100755 index b7e1392f6..000000000 --- a/examples/gpu_deduplication_example/run-workflow.sh +++ /dev/null @@ -1,70 +0,0 @@ -#! /bin/bash - -echo "Starting Workflow..." 
-echo "Time Check: `date`" -if [[ -z "$SLURM_JOB_ID" ]]; then - TODAY="`date +"%Y_%m_%d"`" -else - TODAY="`date +"%Y_%m_%d"`-$SLURM_JOB_ID" -fi - -# Prepare output directory -export JOB_DIR=rapids-dedup-scripts/DEDUP-$TODAY -export FULL_OUTPUT_DIR=$HOME/$JOB_DIR -export LOGDIR=$FULL_OUTPUT_DIR/logs -export PROFILESDIR=$FULL_OUTPUT_DIR/profiles -# Take the default location within the container -RUNSCRIPT=${RUNSCRIPT:--/opt/nemo-data-curator/examples/gpu_deduplication_example/run-minhash.sh} -echo $RUNSCRIPT -mkdir -p $LOGDIR -mkdir -p $PROFILESDIR - -cd /opt/nemo-data-curator/nemo_curator/gpu_deduplication -#-----# - - -# Env vars -export RAPIDS_NO_INITIALIZE="1" -export CUDF_SPILL="1" - -export LIBCUDF_CUFILE_POLICY=${LIBCUDF_CUFILE_POLICY:-ALWAYS} - -# Network interface specific to the cluster being used -export INTERFACE=ibp12s0 -export PROTOCOL=ucx -echo $INTERFACE - -# This variable can be set to limit the number of jsonl files that -# are used in the dedup. Setting to -1 reads in all files -export NUM_FILES=-1 - -# Start the scheduler on the rank 0 node -if [[ -z "$SLURM_NODEID" ]] || [[ $SLURM_NODEID == 0 ]]; then - echo "Starting scheduler" - DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True \ - DASK_DISTRIBUTED__RMM__POOL_SIZE=1GB \ - dask scheduler \ - --scheduler-file $LOGDIR/scheduler.json \ - --protocol $PROTOCOL \ - --interface $INTERFACE >> $LOGDIR/scheduler.log 2>&1 & -fi -sleep 30 - -# Start the workers on each node -echo "Starting workers..." -dask-cuda-worker --scheduler-file $LOGDIR/scheduler.json --rmm-pool-size 72GiB --interface $INTERFACE --rmm-async >> $LOGDIR/worker_$HOSTNAME.log 2>&1 & - -sleep 60 - -if [[ -z "$SLURM_NODEID" ]] || [[ $SLURM_NODEID == 0 ]]; then - echo "Time Check: `date`" - bash $RUNSCRIPT - echo "Time Check: `date`" - touch $LOGDIR/done.txt -fi - -# All nodes wait until done -while [ ! -f $LOGDIR/done.txt ] -do - sleep 15 -done diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index 0867942d8..8b9613261 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -22,6 +22,7 @@ from nemo_curator.utils.import_utils import gpu_only_import_from from .add_id import AddId +from .config import FuzzyDuplicatesConfig from .dataset_ops import blend_datasets, Shuffle from .exact_dedup import ExactDuplicates from .filter import Filter, Score, ScoreFilter @@ -32,6 +33,9 @@ # GPU packages LSH = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "LSH") MinHash = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "MinHash") +FuzzyDuplicates = gpu_only_import_from( + "nemo_curator.modules.fuzzy_dedup", "FuzzyDuplicates" +) # Pytorch related imports must come after all imports that require cugraph, # because of context cleanup issues b/w pytorch and cugraph @@ -42,6 +46,8 @@ "DomainClassifier", "ExactDuplicates", "Filter", + "FuzzyDuplicatesConfig", + "FuzzyDuplicates", "LSH", "MinHash", "Modify", diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py new file mode 100644 index 000000000..45ea527f2 --- /dev/null +++ b/nemo_curator/modules/config.py @@ -0,0 +1,100 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from dataclasses import dataclass + +import yaml + + +@dataclass +class BaseConfig: + @classmethod + def from_yaml(cls, file_path: str): + with open(file_path, "r") as file: + yaml_dict = yaml.safe_load(file) + return cls(**yaml_dict) + + +@dataclass +class FuzzyDuplicatesConfig(BaseConfig): + """ + Configuration for MinHash based fuzzy duplicates detection. + Parameters + ---------- + seed: Seed for minhash permutations + char_ngrams: Size of Char ngram shingles used in minhash computation + num_buckets: Number of Bands or buckets to use during Locality Sensitive Hashing + hashes_per_bucket: Number of hashes per bucket/band. + use_64_bit_hash: Whether to use a 32bit or 64bit hash function for minhashing. + buckets_per_shuffle: Number of bands/buckets to shuffle concurrently. + Larger values process larger batches by processing multiple bands + but might lead to memory pressures and related errors. + id_field: Column in the Dataset denoting document ID. + text_field: Column in the Dataset denoting document content. + profile_dir: str, Default None + If specified directory to write dask profile + cache_dir: str, Default None + Location to store deduplcation intermediates such as minhashes/buckets etc. + false_positive_check: bool, + Whether to run a check to look for false positives within buckets. + Note: This is a computationally expensive step. + num_anchors: int + Number of documents per bucket to use as reference for computing jaccard + pairs within that bucket to identify false positives. + jaccard_threshold: float + The Jaccard similariy threshold to consider a document a near duplicate + during false positive evaluations. 
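+    Examples
+    --------
+    An illustrative construction using the same values as config/fuzzy_dedup_config.yaml::
+
+        config = FuzzyDuplicatesConfig(
+            cache_dir="./fuzzy_dedup_cache",
+            num_buckets=20,
+            hashes_per_bucket=13,
+            jaccard_threshold=0.8,
+        )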
+ """ + + # General config + cache_dir: str + profile_dir: str = None + id_field: str = "id" + text_field: str = "text" + + # Minhash + LSH Config + seed: int = 42 + char_ngrams: int = 5 + num_buckets: int = 20 + hashes_per_bucket: int = 13 + use_64_bit_hash: bool = False + buckets_per_shuffle: int = 1 + + false_positive_check: bool = True + # Only required for fp check + num_anchors: int = 2 + jaccard_threshold: float = 0.8 + + def __post_init__(self): + self.num_hashes = self.num_buckets * self.hashes_per_bucket + if self.cache_dir is None: + raise ValueError( + "Finding fuzzy duplicates requires a cache directory accessible via all workers to store intermediates" + ) + if not self.false_positive_check: + raise NotImplementedError( + "Skipping false positive checks is not supported at the moment" + ) + if self.num_anchors <= 0: + raise ValueError("Number of anchors must be greater than 0") + if self.num_anchors > 2: + warnings.warn( + "Using a higher number of anchor docs might lead to higher memory footprint and might impact performance", + category=UserWarning, + ) + if not 0 <= self.jaccard_threshold <= 1: + raise ValueError("Jaccard Threshold must be between [0,1]") + if self.buckets_per_shuffle <= 0: + raise ValueError("Buckets per shuffle must be greater than 0") diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index ac72e53d9..b61ccde72 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -35,6 +35,8 @@ from nemo_curator.datasets import DocumentDataset from nemo_curator.log import create_logger +from nemo_curator.modules.config import FuzzyDuplicatesConfig +from nemo_curator.modules.meta import Sequential from nemo_curator.utils.distributed_utils import ( get_current_client, get_num_workers, @@ -194,7 +196,7 @@ class LSH: def __init__( self, cache_dir: str, - minhash_length: int, + num_hashes: int, num_buckets: int, buckets_per_shuffle: int = 1, logger: Union[logging.LoggerAdapter, str] = "./", @@ -207,9 +209,9 @@ def __init__( ---------- cache_dir: str Needs to be specified, will compute & write duplicate id, bucket pairs to cache directory. - minhash_length: Length of minhash signature + num_hashes: Length of minhash signature num_buckets: Number of bands/buckets to create from the minhash signature. - Hashes_per_signature = minhash_length / num_buckets + Hashes_per_signature = num_hashes / num_buckets buckets_per_shuffle: Number of bands/buckets to shuffle concurrently. Larger values process larger batches by processing multiple bands but might lead to memory pressures and related errors. @@ -219,13 +221,13 @@ def __init__( profile_dir: str, Default None If specified directory to write dask profile """ - self.minhash_length = minhash_length + self.num_hashes = num_hashes self.num_buckets = num_buckets self.id_fields = [id_fields] if isinstance(id_fields, str) else id_fields self.minhash_field = minhash_field self.buckets_per_shuffle = buckets_per_shuffle self.bucket_ranges = self._generate_bucket_ranges( - self.num_buckets, self.minhash_length + self.num_buckets, self.num_hashes ) if cache_dir is None: @@ -245,15 +247,15 @@ def __init__( self._logger = logger def _generate_bucket_ranges( - self, num_buckets: int, minhash_length: int + self, num_buckets: int, num_hashes: int ) -> List[List[int]]: """ Generates a list of indices for the minhash ranges given num_bands & - minhash_length. - eg: num_bands=3, minhash_length=6 + num_hashes. 
+ eg: num_bands=3, num_hashes=6 [[0, 1], [2, 3], [4, 5]] """ - minhashes_per_bucket = minhash_length // num_buckets + minhashes_per_bucket = num_hashes // num_buckets bucket_ranges = [ list( @@ -308,7 +310,7 @@ def _minhash_to_bucket_meta( self, df: dask_cudf.DataFrame ) -> Tuple[cudf.DataFrame, int]: meta = df._meta_nonempty[self.id_fields] - meta[self.minhash_field] = [np.ones(self.minhash_length)] * len(meta) + meta[self.minhash_field] = [np.ones(self.num_hashes)] * len(meta) return self.minhash_to_buckets(meta, self.bucket_ranges) def lsh( @@ -325,7 +327,6 @@ def lsh( bucket_ranges=self.bucket_ranges, meta=meta, ) - bucket_start_id = 0 for i in range(0, self.num_buckets, self.buckets_per_shuffle): value_vars = [ @@ -382,6 +383,154 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset: return DocumentDataset(buckets_df) +class FuzzyDuplicates: + def __init__( + self, + config: FuzzyDuplicatesConfig, + logger: Union[logging.LoggerAdapter, str] = "./", + ): + """ + Parameters + ---------- + config: FuzzyDuplicatesConfig, + Config options for finding FuzzyDuplicates + logger: Existing logger to log to, or a path to a log directory. + + Returns + ------- + DocumentDataset containing IDs of all documents and the corresponding duplicate group + they belong to. Documents in the same group are near duplicates. + """ + if isinstance(logger, str): + self._logger = create_logger( + rank=0, + log_file=os.path.join(logger, "FuzzyDuplicates.log"), + name="FuzzyDuplicates", + ) + else: + self._logger = logger + + self.config = config + self.minhash = MinHash( + seed=self.config.seed, + num_hashes=self.config.num_hashes, + char_ngrams=self.config.char_ngrams, + use_64bit_hash=self.config.use_64_bit_hash, + logger=self._logger, + id_field=self.config.id_field, + text_field=self.config.text_field, + profile_dir=self.config.profile_dir, + cache_dir=self.config.cache_dir, + ) + self.lsh = LSH( + cache_dir=self.config.cache_dir, + num_hashes=self.config.num_hashes, + num_buckets=self.config.num_buckets, + buckets_per_shuffle=self.config.buckets_per_shuffle, + logger=self._logger, + id_fields=[self.config.id_field], + profile_dir=self.config.profile_dir, + ) + self.map_buckets = _MapBuckets( + id_fields=[self.config.id_field], + text_field=self.config.text_field, + logger=self._logger, + num_anchors=self.config.num_anchors, + ) + self.jaccard_shuffle = _Shuffle( + id_fields=[self.config.id_field], + text_field=self.config.text_field, + logger=self._logger, + profile_dir=self.config.profile_dir, + ) + self.jaccard_compute = JaccardSimilarity( + id_field=self.config.id_field, + text_field=self.config.text_field, + ngram_width=self.config.char_ngrams, + anchor_id_fields=[ + f"anchor_{i}_{self.config.id_field}" + for i in range(self.config.num_anchors) + ], + ) + self.connected_components = ConnectedComponents( + cache_dir=self.config.cache_dir, + jaccard_pairs_path=os.path.join( + self.config.cache_dir, "jaccard_similarity_results.parquet" + ), + id_column=self.config.id_field, + convert_str_ids=False, + jaccard_threshold=self.config.jaccard_threshold, + ) + + def __call__(self, dataset: DocumentDataset): + """ + Parameters + ---------- + dataset: DocumentDataset + The input datset to compute FuzzyDuplicates. Must contain a text and unique id field. + + Returns + ------- + DocumentDataset containing IDs of all documents and the corresponding duplicate group + they belong to. Documents in the same group are near duplicates. 
+        """
+        # Minhash + LSH
+        print("Stage1: Starting Minhash + LSH computation")
+        minhashLSH = Sequential([self.minhash, self.lsh])
+        buckets_df = minhashLSH(dataset)
+        print("Stage1: Minhash + LSH complete!")
+
+        # Map buckets to lower cardinality distribution
+        print("Stage2 (False Positive Check): Starting Map_Buckets")
+        ddf_mapped_buckets_w_anchors = self.map_buckets.map_buckets_with_anchors(
+            documents_df=dataset.df, buckets_df=buckets_df.df
+        )
+        mapped_buckets_w_anchors_path = os.path.join(
+            self.config.cache_dir, "anchor_docs_with_bk.parquet"
+        )
+        ddf_mapped_buckets_w_anchors.to_parquet(
+            mapped_buckets_w_anchors_path, write_index=False
+        )
+        print("Stage2 (False Positive Check): Map_Buckets Complete!")
+
+        # Shuffle documents based on mapped buckets
+        print("Stage3 (False Positive Check): Shuffle docs")
+        shuffled_docs_path = os.path.join(
+            self.config.cache_dir, "shuffled_docs.parquet"
+        )
+        self.jaccard_shuffle.shuffle_docs_on_buckets(
+            documents_df=dataset.df,
+            bucket_w_anchors_path=mapped_buckets_w_anchors_path,
+            output_shuffled_docs_path=shuffled_docs_path,
+            bucket_mapping_df_blocksize=256,
+            parts_per_worker=1,
+            bucket_parts_per_worker=8,
+        )
+        print("Stage3 (False Positive Check): Shuffle docs complete!")
+
+        # Jaccard comparison within buckets
+        print("Stage4 (False Positive Check): Jaccard Similarity in Buckets")
+        jaccard_pairs_path = os.path.join(
+            self.config.cache_dir, "jaccard_similarity_results.parquet"
+        )
+        jaccard_pairs_df = self.jaccard_compute.jaccard_compute(
+            shuffled_docs_path=shuffled_docs_path
+        )
+        jaccard_pairs_df.to_parquet(
+            jaccard_pairs_path,
+            write_index=False,
+            write_metadata_file=False,
+        )
+        print("Stage4 (False Positive Check): Jaccard Similarity in Buckets Complete!")
+
+        # Connected components across buckets
+        print("Stage5: Connected Components across buckets")
+        cc_path = os.path.join(self.config.cache_dir, "connected_components.parquet")
+        self.connected_components.cc_workflow(cc_path)
+        print("Stage5: Connected Components across buckets complete!")
+        return DocumentDataset(dask_cudf.read_parquet(cc_path, split_row_groups=False))
+
+
 class _MapBuckets:
     """
     buckets to a logical partition by using a modified bin packing algorithm.
@@ -508,6 +657,7 @@ def _get_output_map_based_on_str_bytes( """ Add output_partition_id to buckets_ddf """ + documents_df = documents_df.copy() documents_df[bytes_column] = documents_df[self.text_field].map_partitions( lambda s: s.str.byte_count() ) @@ -620,7 +770,7 @@ def map_buckets_with_anchors( ddf_anchor_docs_with_bk, self.id_fields, ignore_index=True, - shuffle=shuffle_type, + shuffle_method=shuffle_type, ).map_partitions( M.drop_duplicates, meta=ddf_anchor_docs_with_bk._meta, @@ -1195,7 +1345,7 @@ def _write_dedup_encoded_jaccard_pair(self, encoded_jaccard_pair_path): ddf, [self.left_id, self.right_id], ignore_index=True, - shuffle="tasks", + shuffle_method="tasks", ) ddf = ddf.map_partitions( M.drop_duplicates, @@ -1301,12 +1451,12 @@ def _batched_merge_and_write( how="inner", broadcast=True, ) + subset_ddf = subset_ddf.drop( + columns=pair_ids, + ) subset_ddf = subset_ddf.rename( columns={"uid": f"{self.id_column}_{tag}"} ) - subset_ddf = subset_ddf.drop( - columns=[f"dataset_id_{tag}", f"doc_id_{tag}"] - ) subset_ddf = subset_ddf[[self.left_id, self.right_id, "jaccard"]] output_batch_path = os.path.join(output_path, f"{batch_id}.parquet") diff --git a/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py index a0484cf0d..21dac27d7 100644 --- a/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py +++ b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py @@ -64,7 +64,7 @@ def main(args): ) lsh = LSH( cache_dir=args.output_bucket_dir, - minhash_length=args.minhash_length, + num_hashes=args.minhash_length, num_buckets=args.num_bands, buckets_per_shuffle=args.buckets_per_shuffle, id_fields=["dataset_id", "doc_id"], diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 000000000..fcb34d29a --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass + +import pytest +import yaml + +from nemo_curator.modules.config import BaseConfig + + +@dataclass +class CustomConfig(BaseConfig): + a: str + b: int + c: bool + d: float = 3.0 + + def __post_init__(self): + if self.d <= 0: + raise ValueError("d must be positive") + + +class TestConfig: + @pytest.fixture(autouse=True) + def config_params(self): + self.config_dict = {"a": "a", "b": 1, "c": True, "d": 4.0} + + def test_init(self): + config = CustomConfig(a="a", b=1, c=True) + assert config.a == "a" + assert config.b == 1 + assert config.c is True + assert config.d == 3.0 + + def test_from_yaml(self, tmpdir): + with open(tmpdir / "test_config.yaml", "w") as file: + yaml.dump(self.config_dict, file) + + config = CustomConfig.from_yaml(tmpdir / "test_config.yaml") + for key, value in self.config_dict.items(): + assert getattr(config, key) == value + + def test_from_yaml_raises(self, tmpdir): + config_dict = self.config_dict.copy() + config_dict["d"] = -1.0 + with open(tmpdir / "test_config.yaml", "w") as file: + yaml.dump(config_dict, file) + with pytest.raises(ValueError): + CustomConfig.from_yaml(tmpdir / "test_config.yaml") + + def test_from_yaml_missing_key(self, tmpdir): + config_dict = self.config_dict.copy() + del config_dict["a"] + with open(tmpdir / "test_config.yaml", "w") as file: + yaml.dump(config_dict, file) + with pytest.raises(TypeError): + CustomConfig.from_yaml(tmpdir / "test_config.yaml") + + def test_from_yaml_extra_key(self, tmpdir): + config_dict = self.config_dict.copy() + config_dict["e"] = "e" + with open(tmpdir / "test_config.yaml", "w") as file: + yaml.dump(config_dict, file) + with pytest.raises(TypeError): + CustomConfig.from_yaml(tmpdir / "test_config.yaml") + + def test_post_init_raises(self): + with pytest.raises(ValueError): + CustomConfig(a="a", b=1, c=True, d=-1.0) diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py index f0ded450e..e89f998e0 100644 --- a/tests/test_fuzzy_dedup.py +++ b/tests/test_fuzzy_dedup.py @@ -18,14 +18,17 @@ import numpy as np import pytest +import yaml from dask.dataframe.utils import assert_eq +from distributed import Client +from nemo_curator import LSH, FuzzyDuplicates, FuzzyDuplicatesConfig, MinHash from nemo_curator.datasets import DocumentDataset -from nemo_curator.modules import LSH, MinHash -from nemo_curator.utils.import_utils import gpu_only_import +from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from cudf = gpu_only_import("cudf") dask_cudf = gpu_only_import("dask_cudf") +LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster") @pytest.fixture @@ -46,6 +49,25 @@ def fuzzy_dedup_data(): return DocumentDataset(df) +@pytest.fixture +def large_fuzzy_dedup_data(): + df = cudf.DataFrame( + { + "id": np.arange(500), + "text": [ + "A test string", + "A different test string", + "A different object", + "The quick brown fox jumps over the lazy dog", + "The quick black cat jumps over the lazy dog", + ] + * 100, + } + ) + df = dask_cudf.from_cudf(df, 5).reset_index(drop=True) + return DocumentDataset(df) + + def minhash_overlap(minhash1: np.array, minhash2: np.array): assert len(minhash1) == len(minhash2) overlap = sum(minhash1 == minhash2) @@ -149,7 +171,7 @@ def minhash_data(self): def test_lsh(self, tmpdir, buckets_per_shuffle): lsh = LSH( cache_dir=tmpdir, - minhash_length=6, + num_hashes=6, num_buckets=3, buckets_per_shuffle=buckets_per_shuffle, minhash_field="minhash_sig", @@ -164,7 +186,7 @@ def test_lsh(self, tmpdir, 
buckets_per_shuffle): def test_multiple_id_cols(self, tmpdir): lsh = LSH( cache_dir=tmpdir, - minhash_length=6, + num_hashes=6, num_buckets=3, buckets_per_shuffle=1, id_fields=["id", "dataset_id"], @@ -180,3 +202,168 @@ def test_multiple_id_cols(self, tmpdir): [[(1, 1), (1, 2)], [(1, 2), (2, 3)], [(3, 4), (4, 5)]], name="new_id" ) assert_eq(expected_df, docs_list, check_index=False) + + +@pytest.mark.gpu +class TestFuzzyDuplicates: + @pytest.fixture(autouse=True, scope="class") + def gpu_client(self, request): + with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client: + request.cls.client = client + request.cls.cluster = cluster + yield + + @pytest.mark.parametrize("use_64_bit_hash", [False, True]) + @pytest.mark.parametrize( + "num_buckets,jaccard_threshold,duplicate_docs", + # Duplcated docs estimated from true_jaccard values + [ + (5, 0.5, [[4, -1]]), + (10, 0.39, [[4, -1], [1, 2]]), + (3, 0.3, [[4, -1], [1, 2, 300]]), + ], + ) + def test_fuzzy_dedup( + self, + fuzzy_dedup_data, + use_64_bit_hash, + num_buckets, + jaccard_threshold, + duplicate_docs, + tmpdir, + ): + print(self.client) + # Dedup might fail when indices per partition do not start from 0 + fuzzy_dedup_data.df = fuzzy_dedup_data.df.reset_index(drop=True) + config = FuzzyDuplicatesConfig( + cache_dir=tmpdir, + id_field="id", + text_field="text", + seed=42, + char_ngrams=5, + num_buckets=num_buckets, + hashes_per_bucket=1, + use_64_bit_hash=use_64_bit_hash, + buckets_per_shuffle=5, + false_positive_check=True, + num_anchors=2, + jaccard_threshold=jaccard_threshold, + ) + fuzzy_duplicates = FuzzyDuplicates(config=config) + result = fuzzy_duplicates(fuzzy_dedup_data) + result_df = result.df.compute() + # Drop non duplicated docs + result_df = result_df[result_df.group.duplicated(keep=False)] + result_df = result_df.groupby("group").id.collect() + # Sort to maintain uniform ordering + + result_df = result_df.list.sort_values() + result_df = result_df.sort_values() + expected_df = cudf.Series(duplicate_docs, name="id") + expected_df = expected_df.list.sort_values() + expected_df = expected_df.sort_values() + assert_eq(expected_df, result_df, check_index=False) + + @pytest.mark.xfail + def test_non_uniform_indices( + self, + tmpdir, + ): + print(self.client) + # Dedup might fail when indices per partition do not start from 0 + df = cudf.DataFrame( + { + "id": [1, 2, 300, 4, -1], + "text": [ + "A test string", + "A different test string", + "A different object", + "The quick brown fox jumps over the lazy dog", + "The quick black cat jumps over the lazy dog", + ], + } + ) + df = dask_cudf.from_cudf(df, 2) + data = DocumentDataset(df) + duplicate_docs = [[4, -1], [1, 2, 300]] + config = FuzzyDuplicatesConfig( + cache_dir=tmpdir, + id_field="id", + text_field="text", + seed=42, + char_ngrams=5, + num_buckets=10, + hashes_per_bucket=1, + use_64_bit_hash=False, + buckets_per_shuffle=5, + false_positive_check=True, + num_anchors=2, + jaccard_threshold=0.39, + ) + fuzzy_duplicates = FuzzyDuplicates(config=config) + result = fuzzy_duplicates(data) + result_df = result.df.compute() + # Drop non duplicated docs + result_df = result_df[result_df.group.duplicated(keep=False)] + result_df = result_df.groupby("group").id.collect() + # Sort to maintain uniform ordering + + result_df = result_df.list.sort_values() + result_df = result_df.sort_values() + expected_df = cudf.Series(duplicate_docs, name="id") + expected_df = expected_df.list.sort_values() + expected_df = expected_df.sort_values() + assert_eq(expected_df, result_df, 
check_index=False) + + @pytest.mark.parametrize("num_anchors", [1, 3, 10]) + def test_num_anchors(self, large_fuzzy_dedup_data, num_anchors, tmpdir): + config = FuzzyDuplicatesConfig( + cache_dir=tmpdir, + id_field="id", + text_field="text", + seed=42, + char_ngrams=5, + num_buckets=5, + hashes_per_bucket=1, + use_64_bit_hash=False, + buckets_per_shuffle=5, + false_positive_check=True, + num_anchors=num_anchors, + jaccard_threshold=0.39, + ) + fuzzy_duplicates = FuzzyDuplicates(config=config) + fuzzy_duplicates(large_fuzzy_dedup_data) + anchor_docs_df_cols = dask_cudf.read_parquet( + tmpdir / "anchor_docs_with_bk.parquet" + ).columns + assert all(f"anchor_{i}_id" in anchor_docs_df_cols for i in range(num_anchors)) + + +class TestFuzzyDuplicatesConfig: + def test_bad_inputs(self, tmpdir): + with pytest.raises(ValueError): + FuzzyDuplicatesConfig(cache_dir=tmpdir, num_anchors=0) + with pytest.warns( + UserWarning, match="Using a higher number of anchor docs might" + ): + FuzzyDuplicatesConfig(cache_dir=tmpdir, num_anchors=3) + with pytest.raises(ValueError): + FuzzyDuplicatesConfig(cache_dir=tmpdir, jaccard_threshold=1.2) + with pytest.raises(NotImplementedError): + FuzzyDuplicatesConfig(cache_dir=tmpdir, false_positive_check=False) + with pytest.raises(ValueError): + FuzzyDuplicatesConfig(cache_dir=tmpdir, buckets_per_shuffle=0) + + def test_from_yaml(self, tmpdir): + yaml_params = { + "cache_dir": "./", + "num_anchors": 2, + "jaccard_threshold": 0.8, + "false_positive_check": True, + "buckets_per_shuffle": 1, + } + with open(tmpdir / "config.yaml", "w") as f: + yaml.dump(yaml_params, f) + config = FuzzyDuplicatesConfig.from_yaml(tmpdir / "config.yaml") + for param in yaml_params: + assert getattr(config, param) == yaml_params[param] From 9849164cc22a1f1d3e091a0bf125b15c26e2ba8a Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 6 May 2024 15:02:23 -0700 Subject: [PATCH 12/34] Fix indexing in PII Modifier (#55) * Fix pii index issue Signed-off-by: Ryan Wolf * Add sequential wrapper Signed-off-by: Ryan Wolf * Fix pii tests Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- docs/user-guide/QualityFiltering.rst | 27 +++++++++ nemo_curator/filters/classifier_filter.py | 6 +- nemo_curator/modifiers/__init__.py | 2 + nemo_curator/modifiers/pii_modifier.py | 4 +- tests/test_filters.py | 17 ++++++ tests/test_pii_accuracy.py | 68 +++++++++++++++++++++++ 6 files changed, 119 insertions(+), 5 deletions(-) diff --git a/docs/user-guide/QualityFiltering.rst b/docs/user-guide/QualityFiltering.rst index 46a8c9d81..ba2c34ad6 100644 --- a/docs/user-guide/QualityFiltering.rst +++ b/docs/user-guide/QualityFiltering.rst @@ -153,6 +153,33 @@ Here is the ``WordCountFilter`` rewritten to use batches in the ``keep_document` pass_max = score <= self._max_words return pass_min & pass_max +When you use the ``batched`` decorator, the index of the series returned from the function must remain the same as the index that was passed in. +The index may not be continuous due to filters being applied prior to the current filter. +In the above code, the index will be the same automatically so no change is required. +However, when writing functions that transform the series into a different structure like a list, special care is needed. +The following code example demonstrates what this error may look like, and how to fix it. + +.. 
code-block:: python + + class BuggyLengthFilter(DocumentFilter): + + @batched + def score_document(self, documents: pd.Series): + scores = [] + for document in documents: + scores.append(len(document)) + + return pd.Series(scores) # Bad! Does not preserve the index + + class CorrectLengthFilter(DocumentFilter): + + @batched + def score_document(self, documents: pd.Series): + scores = [] + for document in documents: + scores.append(len(document)) + + return pd.Series(scores, index=documents.index) # Good! Preserves the index ----------------------------------------- diff --git a/nemo_curator/filters/classifier_filter.py b/nemo_curator/filters/classifier_filter.py index 3ade004ec..4f06c8b25 100644 --- a/nemo_curator/filters/classifier_filter.py +++ b/nemo_curator/filters/classifier_filter.py @@ -37,7 +37,7 @@ def __init__(self, model_path=None, label="__label__hq", alpha=3, seed=42): self._name = "fasttext_quality_filter" @batched - def score_document(self, df): + def score_document(self, df: pd.Series): model_attr = f"{self._name}_{self._model_path}" try: model = load_object_on_worker(model_attr, self._load_model, {}) @@ -56,7 +56,7 @@ def _score_document(text): return df.apply(_score_document) @batched - def keep_document(self, df): + def keep_document(self, df: pd.Series): return np.random.pareto(self._alpha, size=len(df)) > 1 - df def _load_model(self): @@ -82,7 +82,7 @@ def __init__(self, model_path=None, min_langid_score=0.3): dask.config.set({"dataframe.convert-string": False}) @batched - def score_document(self, df): + def score_document(self, df: pd.Series): model_attr = f"{self._name}_{self._model_path}" try: model = load_object_on_worker(model_attr, self._load_model, {}) diff --git a/nemo_curator/modifiers/__init__.py b/nemo_curator/modifiers/__init__.py index 4c05a31e7..f6511fdb0 100644 --- a/nemo_curator/modifiers/__init__.py +++ b/nemo_curator/modifiers/__init__.py @@ -15,6 +15,7 @@ from .c4 import BoilerPlateStringModifier from .doc_modifier import DocumentModifier from .fasttext import FastTextLabelModifier +from .pii_modifier import PiiModifier from .unicode_reformatter import UnicodeReformatter __all__ = [ @@ -22,4 +23,5 @@ "BoilerPlateStringModifier", "FastTextLabelModifier", "UnicodeReformatter", + "PiiModifier", ] diff --git a/nemo_curator/modifiers/pii_modifier.py b/nemo_curator/modifiers/pii_modifier.py index 23c713fbf..c2a398b48 100644 --- a/nemo_curator/modifiers/pii_modifier.py +++ b/nemo_curator/modifiers/pii_modifier.py @@ -85,8 +85,8 @@ def modify_document(self, text: pd.Series, partition_info: Dict = None): logging.error( f"Encountered error {str(e)} in partition {partition_info['number']}" ) - return pd.Series([True]) - output: pd.Series = pd.Series(output) + return pd.Series([True], index=text.index) + output: pd.Series = pd.Series(output, text.index) return output def load_deidentifier(self): diff --git a/tests/test_filters.py b/tests/test_filters.py index 50676f385..951c1977c 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -282,6 +282,23 @@ def test_score_type(self, letter_count_data): expected_scores == scores.compute() ), f"Expected {expected_scores} but got {scores}" + def test_chain_filter(self, letter_count_data): + letter_count_filter = LetterCountFilter(min_count=4) + length_filter = BatchedLengthFilter(min_length=8, max_length=11) + filters = Sequential( + [ + ScoreFilter(letter_count_filter, text_field="documents"), + ScoreFilter(length_filter, text_field="documents"), + ] + ) + filtered_data = filters(letter_count_data) + + 
expected_indices = [2] + expected_data = DocumentDataset(letter_count_data.df.loc[expected_indices]) + assert all_equal( + expected_data, filtered_data + ), f"Expected {expected_data} but got {filtered_data}" + class TestHeuristicFilters: def test_nonalpha(self): diff --git a/tests/test_pii_accuracy.py b/tests/test_pii_accuracy.py index 9431779a3..7e7d58663 100644 --- a/tests/test_pii_accuracy.py +++ b/tests/test_pii_accuracy.py @@ -16,9 +16,17 @@ import re from pathlib import Path +import pandas as pd import pytest +from dask import dataframe as dd +from dask.distributed import Client, LocalCluster +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset +from nemo_curator.filters import DocumentFilter +from nemo_curator.modifiers import PiiModifier from nemo_curator.pii.algorithm import PiiDeidentifier +from nemo_curator.utils.decorators import batched LOGGER = logging.getLogger(__name__) @@ -118,3 +126,63 @@ def test_batch_accuracy(self): match = all(compare_outputs(x, y) for x, y in zip(outputs, targets)) print("Matches:", "No" if not match else "Yes") assert match == True + + +class BatchedLengthFilter(DocumentFilter): + """ + Keeps documents of a given length + """ + + def __init__(self, min_length=5, max_length=10): + super().__init__() + self.min_length = min_length + self.max_length = max_length + + @batched + def score_document(self, df): + return df.str.len() + + @batched + def keep_document(self, scores): + min_threshold = self.min_length <= scores + max_threshold = scores <= self.max_length + return min_threshold & max_threshold + + +class TestPIIModule: + def test_filter_chain(self): + inputs = [ + "Alice goes on a walk", + "Bob goes on a walk", + "Someone named Charlie goes on a walk", + "A human walking is David", + "A human walking is Eliza", + ] + targets = [ + "***** goes on a walk", + "*** goes on a walk", + "A human walking is *****", + "A human walking is *****", + ] + input_df = pd.DataFrame({"text": inputs}) + target_df = pd.DataFrame({"text": targets}) + with LocalCluster(n_workers=1, threads_per_worker=1) as cluster: + with Client(cluster): + input_dataset = DocumentDataset(dd.from_pandas(input_df, npartitions=1)) + pipeline = nc.Sequential( + [ + nc.ScoreFilter( + BatchedLengthFilter(min_length=0, max_length=25) + ), + nc.Modify( + PiiModifier( + language="en", anonymize_action="mask", device="cpu" + ) + ), + ] + ) + output_dataset = pipeline(input_dataset) + + output_df = output_dataset.df.compute().reset_index(drop=True) + match = all(output_df["text"] == target_df["text"]) + assert match From 794a435c172577c31a54440a87f9b9236e5dc413 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 7 May 2024 14:46:28 -0700 Subject: [PATCH 13/34] Disable string conversion globally (#56) Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- config/fasttext_langid.yaml | 1 + nemo_curator/__init__.py | 8 ++++++++ nemo_curator/filters/classifier_filter.py | 5 ----- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/config/fasttext_langid.yaml b/config/fasttext_langid.yaml index 86b18761d..a1f4f3530 100644 --- a/config/fasttext_langid.yaml +++ b/config/fasttext_langid.yaml @@ -1,5 +1,6 @@ input_field: text filters: - name: nemo_curator.filters.classifier_filter.FastTextLangId + log_score: True params: model_path: diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py index 000e459a9..4645d55ef 100644 --- a/nemo_curator/__init__.py +++ b/nemo_curator/__init__.py @@ -12,4 +12,12 @@ # See the License for the specific language governing 
permissions and
 # limitations under the License.
 
+import dask
+
 from .modules import *
+
+# Dask will automatically convert the list score type
+# to a string without this option.
+# See https://github.com/NVIDIA/NeMo-Curator/issues/33
+# This also happens when reading and writing to files
+dask.config.set({"dataframe.convert-string": False})
diff --git a/nemo_curator/filters/classifier_filter.py b/nemo_curator/filters/classifier_filter.py
index 4f06c8b25..741df9640 100644
--- a/nemo_curator/filters/classifier_filter.py
+++ b/nemo_curator/filters/classifier_filter.py
@@ -76,11 +76,6 @@ def __init__(self, model_path=None, min_langid_score=0.3):
         self._cutoff = min_langid_score
         self._name = "lang_id"
 
-        # Dask will automatically convert the list score type
-        # to a string without this option.
-        # See https://github.com/NVIDIA/NeMo-Curator/issues/33
-        dask.config.set({"dataframe.convert-string": False})
-
     @batched
     def score_document(self, df: pd.Series):
         model_attr = f"{self._name}_{self._model_path}"

From 0f5a0298bda4aec735173d00dbcb765973be77de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Mart=C3=ADnez?= <26169771+miguelusque@users.noreply.github.com>
Date: Wed, 8 May 2024 18:01:56 +0200
Subject: [PATCH 14/34] Fix issue #43 (empty files creation) and improve
 reading/writing speed (#57)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit fixes issue #43 (empty files created when invoking the
reshard_jsonl method in nemo_curator.utils.file_utils.py) by checking the
size of each output file after it is written and deleting any file whose
size is zero.

In addition, there is no need to parse the content of each line into a JSON
object, since each line is already valid JSON. Removing that extra parsing
significantly speeds up this method.
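
A minimal sketch of the cleanup step described above (the helper name is
illustrative only; the actual change to reshard_jsonl appears in the diff
below):

    import os

    def remove_empty_shards(output_files):
        # Illustrative only: delete any output shard written with zero bytes.
        for path in output_files:
            if os.path.getsize(path) == 0:
                os.remove(path)
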
Signed-off-by: Miguel Martínez <26169771+miguelusque@users.noreply.github.com> Signed-off-by: Nicole Luo --- nemo_curator/utils/file_utils.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/nemo_curator/utils/file_utils.py b/nemo_curator/utils/file_utils.py index af3c2513d..3ec466b4c 100644 --- a/nemo_curator/utils/file_utils.py +++ b/nemo_curator/utils/file_utils.py @@ -181,9 +181,8 @@ def parse_str_of_num_bytes(s, return_str=False): def _save_jsonl(documents, output_path, start_index=0, max_index=10000, prefix=None): """Worker function to write out the data to jsonl files""" - def _output_json(document): - myjson = json.dumps(document, ensure_ascii=False) - return myjson.encode("utf-8") + def _encode_text(document): + return document.strip().encode("utf-8") def _name(start_index, npad, prefix, i): tag = str(start_index + i).rjust(npad, "0") @@ -195,11 +194,22 @@ def _name(start_index, npad, prefix, i): output_glob_string = os.path.join(output_path, "*.jsonl") - documents.map(_output_json).to_textfiles( + output_files = documents.map(_encode_text).to_textfiles( output_glob_string, name_function=name, ) + # Delete empty files generated due to empty partitions in the bag + for output_file in output_files: + try: + if os.path.getsize(output_file) == 0: + os.remove(output_file) + except Exception as exception: + print( + f"An exception occurred when trying to delete {output_file}.\n{exception}", + flush=True, + ) + def reshard_jsonl( input_dir, output_dir, output_file_size="100M", start_index=0, file_prefix="" @@ -212,7 +222,8 @@ def reshard_jsonl( output_dir: The output directory where the resharded jsonl files will be written output_file_size: Approximate size of output files. Must specify with a string and with the unit K, M or G for kilo, mega or gigabytes - start_index: Starting index for naming the output files + start_index: Starting index for naming the output files. Note: The indices may not + be continuous if the sharding process would output an empty file in its place file_prefix: Prefix to use to prepend to output file number """ @@ -222,7 +233,7 @@ def reshard_jsonl( input_files = list(get_all_files_paths_under(input_dir)) # Read in the dask bag - b = db.read_text(input_files, blocksize=blocksize).map(json.loads) + b = db.read_text(input_files, blocksize=blocksize) # Prepare the output output_dir = expand_outdir_and_mkdir(output_dir) From d4a2f0f1efb758e9c891bb26b7bb185a05c6bebd Mon Sep 17 00:00:00 2001 From: Mehran Maghoumi Date: Fri, 10 May 2024 10:25:40 -0700 Subject: [PATCH 15/34] [Tutorials] Add a tutorial for PEFT data curation (#45) This PR adds a new tutorial to demonstrate data curation for PEFT use-cases. 
Signed-off-by: Mehran Maghoumi Signed-off-by: Nicole Luo --- tutorials/peft-curation/README.md | 19 +++ tutorials/peft-curation/docbuilder.py | 113 ++++++++++++++++ tutorials/peft-curation/filters.py | 47 +++++++ tutorials/peft-curation/main.py | 179 ++++++++++++++++++++++++++ tutorials/peft-curation/modifiers.py | 68 ++++++++++ tutorials/tinystories/README.md | 2 +- tutorials/tinystories/main.py | 6 +- 7 files changed, 432 insertions(+), 2 deletions(-) create mode 100644 tutorials/peft-curation/README.md create mode 100644 tutorials/peft-curation/docbuilder.py create mode 100644 tutorials/peft-curation/filters.py create mode 100644 tutorials/peft-curation/main.py create mode 100644 tutorials/peft-curation/modifiers.py diff --git a/tutorials/peft-curation/README.md b/tutorials/peft-curation/README.md new file mode 100644 index 000000000..afa0d66a3 --- /dev/null +++ b/tutorials/peft-curation/README.md @@ -0,0 +1,19 @@ +# Curating Datasets for Parameter Efficient Fine-tuning + +This tutorial demonstrates the usage of NeMo Curator's Python API to curate a dataset for +parameter-efficient fine-tuning (PEFT). + +In this tutorial, we use the [Enron Emails dataset](https://huggingface.co/datasets/neelblabla/enron_labeled_emails_with_subjects-llama2-7b_finetuning), +which is a dataset of emails with corresponding classification labels for each email. Each email has +a subject, a body and a category (class label). We demonstrate various filtering and processing +operations that can be applied to each record. + +## Usage +After installing the NeMo Curator package, you can simply run the following command: +``` +python tutorials/peft-curation/main.py +``` + +By default, this tutorial will use at most 8 workers to run the curation pipeline. If you face any +out of memory issues, you can reduce the number of workers by supplying the `--n-workers=N` argument, +where `N` is the number of workers to spawn. diff --git a/tutorials/peft-curation/docbuilder.py b/tutorials/peft-curation/docbuilder.py new file mode 100644 index 000000000..3ae0840c9 --- /dev/null +++ b/tutorials/peft-curation/docbuilder.py @@ -0,0 +1,113 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import re +from typing import Dict + +import requests + +from nemo_curator.download.doc_builder import ( + DocumentDownloader, + DocumentExtractor, + DocumentIterator, +) + + +class EmailsDownloader(DocumentDownloader): + def __init__(self, download_dir: str): + super().__init__() + + if not os.path.isdir(download_dir): + os.makedirs(download_dir) + + self._download_dir = download_dir + print("Download directory: ", self._download_dir) + + def download(self, url: str) -> str: + filename = os.path.basename(url) + output_file = os.path.join(self._download_dir, filename) + + if os.path.exists(output_file): + print(f"File '{output_file}' already exists, skipping download.") + return output_file + + print(f"Downloading Enron emails dataset from '{url}'...") + response = requests.get(url) + + with open(output_file, "wb") as file: + file.write(response.content) + + return output_file + + +class EmailsIterator(DocumentIterator): + + def __init__(self): + super().__init__() + self._counter = -1 + self._extractor = EmailsExtractor() + # The regular expression pattern to extract each email. + self._pattern = re.compile(r"\".*?\"", re.DOTALL) + + def iterate(self, file_path): + self._counter = -1 + file_name = os.path.basename(file_path) + + with open(file_path, "r", encoding="utf-8") as file: + lines = file.readlines() + + # Ignore the first line which contains the header. + file_content = "".join(lines[1:]) + # Find all the emails in the file. + it = self._pattern.finditer(file_content) + + for email in it: + self._counter += 1 + content = email.group().strip('"').strip() + meta = { + "filename": file_name, + "id": f"email-{self._counter}", + } + extracted_content = self._extractor.extract(content) + + # Skip if no content extracted + if not extracted_content: + continue + + record = {**meta, **extracted_content} + yield record + + +class EmailsExtractor(DocumentExtractor): + def __init__(self): + super().__init__() + # The regular expression pattern to extract subject/body/label into groups. + self._pattern = re.compile( + r"Subject:: (.*?)\nBody:: (.*?)\n.*\[/INST\] (.*?) ", re.DOTALL + ) + + def extract(self, content: str) -> Dict[str, str]: + matches = self._pattern.findall(content) + + if not matches: + return None + + matches = matches[0] + + return { + "subject": matches[0].strip(), + "body": matches[1].strip(), + "category": matches[2].strip(), + } diff --git a/tutorials/peft-curation/filters.py b/tutorials/peft-curation/filters.py new file mode 100644 index 000000000..0ffcd5be7 --- /dev/null +++ b/tutorials/peft-curation/filters.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_curator.filters import DocumentFilter + + +class FilterEmailsWithLongBody(DocumentFilter): + """ + If the email is too long, discard. 
+ """ + + def __init__(self, max_length: int = 5000): + super().__init__() + self.max_length = max_length + + def score_document(self, text: str) -> bool: + return len(text) <= self.max_length + + def keep_document(self, score) -> bool: + return score + + +class FilterEmptyEmails(DocumentFilter): + """ + Detects empty emails (either empty body, or labeled as empty). Returns `True` for empty emails. + """ + + def score_document(self, text: str) -> bool: + return ( + not isinstance(text, str) # The text is not a string + or len(text.strip()) == 0 # The text is empty + or "Empty message" in text # The email is labeled as empty + ) + + def keep_document(self, score) -> bool: + return score diff --git a/tutorials/peft-curation/main.py b/tutorials/peft-curation/main.py new file mode 100644 index 000000000..9210d9f89 --- /dev/null +++ b/tutorials/peft-curation/main.py @@ -0,0 +1,179 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import os +from functools import partial +from typing import Any + +from docbuilder import EmailsDownloader, EmailsIterator +from filters import FilterEmailsWithLongBody, FilterEmptyEmails +from modifiers import AddPeriod, AddSystemPrompt + +from nemo_curator import ScoreFilter, Sequential +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modifiers.pii_modifier import PiiModifier +from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter +from nemo_curator.modules.modify import Modify +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import add_distributed_args + +SCRIPT_DIR_PATH = os.path.dirname(os.path.abspath(__file__)) +DATA_DIR = os.path.join(SCRIPT_DIR_PATH, "data") +DATASET_URL = "https://huggingface.co/datasets/neelblabla/enron_labeled_emails_with_subjects-llama2-7b_finetuning/raw/main/prompts_train.csv" + + +def download_and_convert_to_jsonl() -> str: + """ + Downloads the emails dataset and converts it to JSONL format. + + Returns: + str: The path to the JSONL file. + """ + + # Download the dataset in raw format and convert it to JSONL. + downloader = EmailsDownloader(DATA_DIR) + output_path = os.path.join(DATA_DIR, "emails.jsonl") + raw_fp = downloader.download(DATASET_URL) + + iterator = EmailsIterator() + + # Parse the raw data and write it to a JSONL file. + with open(output_path, "w") as f: + for record in iterator.iterate(raw_fp): + json_record = json.dumps(record, ensure_ascii=False) + f.write(json_record + "\n") + + return output_path + + +def redact_pii(dataset: DocumentDataset, text_field) -> DocumentDataset: + """ + Redacts personally identifiable information (PII) from a given dataset. + + Args: + dataset (DocumentDataset): The dataset containing documents with PII. + + Returns: + DocumentDataset: The redacted dataset with PII replaced by a generic value. 
+ """ + redactor = Modify( + PiiModifier( + supported_entities=[ + "ADDRESS", + "EMAIL_ADDRESS", + "LOCATION", + "PERSON", + "URL", + "PHONE_NUMBER", + ], + anonymize_action="replace", + device="cpu", + ), + text_field=text_field, + ) + return redactor(dataset) + + +def run_curation_pipeline(args: Any, jsonl_fp: str) -> str: + """ + Run the curation pipeline on the dataset. + + Args: + args (Any): Command-line arguments. + jsonl_fp (str): The path to the uncurated JSONL file. + + Returns: + str: The path to the curated JSONL file. + """ + client = get_client(args, args.device) + print(f" Running the curation pipeline on '{jsonl_fp}'...") + orig_dataset = DocumentDataset.read_json(jsonl_fp, add_filename=True) + dataset = orig_dataset + + redact_pii_subject = partial(redact_pii, text_field="subject") + redact_pii_body = partial(redact_pii, text_field="body") + + curation_steps = Sequential( + [ + # + # Unify the text encoding to Unicode. + # + Modify(UnicodeReformatter(), text_field="subject"), + Modify(UnicodeReformatter(), text_field="body"), + Modify(UnicodeReformatter(), text_field="category"), + # + # Filtering + # + # Filter out empty emails. + ScoreFilter( + FilterEmptyEmails(), text_field="subject", score_type=bool, invert=True + ), + ScoreFilter( + FilterEmptyEmails(), text_field="body", score_type=bool, invert=True + ), + ScoreFilter( + FilterEmptyEmails(), text_field="category", score_type=bool, invert=True + ), + # Filter out emails that are too long. + ScoreFilter(FilterEmailsWithLongBody(), text_field="body", score_type=bool), + # + # Redact personally identifiable information (PII). + # + redact_pii_subject, + redact_pii_body, + # + # Final modifications. + # + # Add system prompts to every email, which helps the model focus on the task. + Modify(AddSystemPrompt(), text_field="body"), + # Add a period to the end of each email category, which makes PEFT easier. + Modify(AddPeriod(), text_field="category"), + ] + ) + + dataset = curation_steps(dataset) + dataset = dataset.persist() + + print(f" Original dataset length: {len(orig_dataset.df)}") + print(f" After running the curation pipeline: {len(dataset.df)}") + print(f" Writing to '{jsonl_fp}'...") + out_path = os.path.join( + os.path.dirname(jsonl_fp), + "curated", + ) + os.makedirs(out_path, exist_ok=True) + dataset.to_json(out_path, write_to_filename=True) + client.close() + return os.path.join(out_path, os.path.basename(jsonl_fp)) + + +def main(): + parser = argparse.ArgumentParser() + parser = add_distributed_args(parser) + args = parser.parse_args() + # Limit the total number of workers to ensure we don't run out of memory. + args.n_workers = min(args.n_workers, 8) + + # Prepare the download and JSONL directories. + if not os.path.isdir(DATA_DIR): + os.makedirs(DATA_DIR) + + jsonl_fp = download_and_convert_to_jsonl() + run_curation_pipeline(args, jsonl_fp) + + +if __name__ == "__main__": + main() diff --git a/tutorials/peft-curation/modifiers.py b/tutorials/peft-curation/modifiers.py new file mode 100644 index 000000000..059036ee4 --- /dev/null +++ b/tutorials/peft-curation/modifiers.py @@ -0,0 +1,68 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_curator.modifiers import DocumentModifier + +# The system prompt template to be inserted into the documents. +SYS_PROMPT_TEMPLATE = """[INST] <> You are reviewing the contents of an email. Based on the content, please categorize this email into one of the following categories: +1. 'Company Business/Strategy.' +2. 'Purely Personal.' +3. 'Personal but in a professional context.' +4. 'Logistic Arrangements.' +5. 'Employment arrangements.' +6. 'Document editing/checking/collaboration.' +Please provide only one category (e.g., 'Purely Personal.'). <> + +Content:: +%s + +What should this email be categorized as? +[/INST] +Answer:: """ + + +class AddSystemPrompt(DocumentModifier): + """ + A simple modifier that adds system prompts to each document. + """ + + def modify_document(self, text: str) -> str: + """ + Inserts system prompts into the document. + + Args: + text (str): The text to be modified. + + Returns: + str: The modified text. + """ + return SYS_PROMPT_TEMPLATE % text + + +class AddPeriod(DocumentModifier): + """ + A simple modifier that adds a period to the end of each email category. + """ + + def modify_document(self, text: str) -> str: + """ + Adds a period to the end of each email category. + + Args: + text (str): The text to be modified. + + Returns: + str: The modified text. + """ + return text + "." diff --git a/tutorials/tinystories/README.md b/tutorials/tinystories/README.md index 47074cb3f..45bc3bf33 100644 --- a/tutorials/tinystories/README.md +++ b/tutorials/tinystories/README.md @@ -1,6 +1,6 @@ # TinyStories -This tutorial demonstrates the usage of NeMo Curator's Python API to curate the [TinyStories](https://arxiv.org/abs/2305.07759) dataset. TinyStories is a dataset of short stories generated by GPT-3.5 and GPT-4, featuring words that are undersood by 3 to 4-year olds. The small size of this dataset makes it ideal for creating and validating data curation pipelines on a local machine. +This tutorial demonstrates the usage of NeMo Curator's Python API to curate the [TinyStories](https://arxiv.org/abs/2305.07759) dataset. TinyStories is a dataset of short stories generated by GPT-3.5 and GPT-4, featuring words that are understood by 3 to 4-year olds. The small size of this dataset makes it ideal for creating and validating data curation pipelines on a local machine. For simplicity, this tutorial uses the validation split of this dataset, which contains around 22,000 samples. 
diff --git a/tutorials/tinystories/main.py b/tutorials/tinystories/main.py index fa4470c35..1fbbba35c 100644 --- a/tutorials/tinystories/main.py +++ b/tutorials/tinystories/main.py @@ -97,19 +97,23 @@ def filter_dataset(dataset: DocumentDataset) -> DocumentDataset: WordCountFilter(min_words=80), text_field="text", score_field="word_count", + score_type=int, ), - ScoreFilter(IncompleteStoryFilter(), text_field="text"), + ScoreFilter(IncompleteStoryFilter(), text_field="text", score_type=bool), ScoreFilter( RepeatingTopNGramsFilter(n=2, max_repeating_ngram_ratio=0.2), text_field="text", + score_type=float, ), ScoreFilter( RepeatingTopNGramsFilter(n=3, max_repeating_ngram_ratio=0.18), text_field="text", + score_type=float, ), ScoreFilter( RepeatingTopNGramsFilter(n=4, max_repeating_ngram_ratio=0.16), text_field="text", + score_type=float, ), ] ) From 8bea00b46c502285fa1db2fc005fb1f2fdde2808 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Mon, 13 May 2024 14:40:46 -0700 Subject: [PATCH 16/34] Only import PII constants during Curator import (#61) * Move PII constants to a seperate file that does not import presidio/spacy and other GPU dependencies Signed-off-by: Ayush Dattagupta * Add comment around import, move constant import to global scope Signed-off-by: Ayush Dattagupta --------- Signed-off-by: Ayush Dattagupta Signed-off-by: Nicole Luo --- nemo_curator/modifiers/pii_modifier.py | 4 ++-- nemo_curator/pii/algorithm.py | 26 +++++--------------------- nemo_curator/pii/constants.py | 20 ++++++++++++++++++++ tests/test_pii_accuracy.py | 1 - 4 files changed, 27 insertions(+), 24 deletions(-) create mode 100644 nemo_curator/pii/constants.py diff --git a/nemo_curator/modifiers/pii_modifier.py b/nemo_curator/modifiers/pii_modifier.py index c2a398b48..51ea5b6e2 100644 --- a/nemo_curator/modifiers/pii_modifier.py +++ b/nemo_curator/modifiers/pii_modifier.py @@ -17,7 +17,7 @@ import pandas as pd from nemo_curator.modifiers import DocumentModifier -from nemo_curator.pii.algorithm import DEFAULT_LANGUAGE +from nemo_curator.pii.constants import DEFAULT_LANGUAGE, DEFAULT_MAX_DOC_SIZE from nemo_curator.utils.decorators import batched from nemo_curator.utils.distributed_utils import load_object_on_worker @@ -97,7 +97,7 @@ def load_deidentifier(self): if self.device == "gpu": spacy.require_gpu() - from nemo_curator.pii.algorithm import DEFAULT_MAX_DOC_SIZE, PiiDeidentifier + from nemo_curator.pii.algorithm import PiiDeidentifier deidentifier: PiiDeidentifier = PiiDeidentifier( language=self.language, diff --git a/nemo_curator/pii/algorithm.py b/nemo_curator/pii/algorithm.py index 762214efb..2b5e16ed0 100644 --- a/nemo_curator/pii/algorithm.py +++ b/nemo_curator/pii/algorithm.py @@ -15,6 +15,10 @@ from pathlib import Path from typing import Any, List, Mapping, Union +# NOTE: Importing this module before cluster creation will create a primary CUDA context +# that leads to issues of all GPUs not being used when creating a cluster/client later on. +# Ensure that this module is always imported after cluster creation only when the algorithm +# needs to be executed. 
See: https://github.com/NVIDIA/NeMo-Curator/issues/64 import yaml from presidio_analyzer import AnalyzerEngine, RecognizerRegistry from presidio_analyzer.nlp_engine import NerModelConfiguration @@ -30,36 +34,16 @@ from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine from presidio_anonymizer.entities import OperatorConfig +from nemo_curator.pii.constants import DEFAULT_LANGUAGE, SUPPORTED_ENTITIES from nemo_curator.pii.custom_batch_analyzer_engine import CustomBatchAnalyzerEngine from nemo_curator.pii.custom_nlp_engine import CustomNlpEngine from nemo_curator.pii.recognizers.address_recognizer import AddressRecognizer __all__ = [ - "DEFAULT_LANGUAGE", - "SUPPORTED_ENTITIES", - "DEFAULT_MAX_DOC_SIZE", "PiiDeidentifier", ] -DEFAULT_LANGUAGE = "en" -SUPPORTED_ENTITIES = [ - "ADDRESS", - "CREDIT_CARD", - "EMAIL_ADDRESS", - "DATE_TIME", - "IP_ADDRESS", - "LOCATION", - "PERSON", - "URL", - "US_SSN", - "US_PASSPORT", - "US_DRIVER_LICENSE", - "PHONE_NUMBER", -] -DEFAULT_MAX_DOC_SIZE = 2000000 - - class PiiDeidentifier(object): """Cleans PII from an unstructured text""" diff --git a/nemo_curator/pii/constants.py b/nemo_curator/pii/constants.py new file mode 100644 index 000000000..fc8dcc545 --- /dev/null +++ b/nemo_curator/pii/constants.py @@ -0,0 +1,20 @@ +DEFAULT_LANGUAGE = "en" + +SUPPORTED_ENTITIES = [ + "ADDRESS", + "CREDIT_CARD", + "EMAIL_ADDRESS", + "DATE_TIME", + "IP_ADDRESS", + "LOCATION", + "PERSON", + "URL", + "US_SSN", + "US_PASSPORT", + "US_DRIVER_LICENSE", + "PHONE_NUMBER", +] + +DEFAULT_MAX_DOC_SIZE = 2000000 + +__all__ = ["DEFAULT_LANGUAGE", "SUPPORTED_ENTITIES", "DEFAULT_MAX_DOC_SIZE"] diff --git a/tests/test_pii_accuracy.py b/tests/test_pii_accuracy.py index 7e7d58663..850dafd54 100644 --- a/tests/test_pii_accuracy.py +++ b/tests/test_pii_accuracy.py @@ -17,7 +17,6 @@ from pathlib import Path import pandas as pd -import pytest from dask import dataframe as dd from dask.distributed import Client, LocalCluster From c66138a55839a2e2acef405f0a9c9a5582570974 Mon Sep 17 00:00:00 2001 From: Nicoel Luo Date: Wed, 15 May 2024 12:35:56 +0000 Subject: [PATCH 17/34] Deleting links Signed-off-by: Nicoel Luo Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index f0fada829..3868ebbff 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -21,9 +21,7 @@ "\n", "NeMo Curator team has perform ablation experiments using Common Crawl dataset to train a 357M GPT-style model to assess the effect of different curation stage on model performance. 
\n", "\n", - "![alt text](./image/zeroshot_ablations.png)\n", - "\n", - "For the latest NeMo Data Curator user guide, please refer to https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/index.html " + "![alt text](./image/zeroshot_ablations.png)\n" ] }, { From 148e1d494ac03a3390de5119f7eb3043f02adf54 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:21:42 +0800 Subject: [PATCH 18/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 3868ebbff..ce883dd34 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -5,7 +5,7 @@ "id": "1c1a4119", "metadata": {}, "source": [ - "# Nemo Curator pipeline example\n", + "# Nemo Curator Pipeline Example\n", "\n", "## NeMo Curator introduction\n", "The NeMo Curator is a Python library that consists of a collection of scalable data-mining modules for curating natural language processing (NLP) data for training large language models (LLMs). The modules within the NeMo Data Curator enable NLP researchers to mine high-quality text at scale from massive uncurated web corpora. \n", From 7e08c96daa5a6a16805b7eed76e42baa1bcda057 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:21:56 +0800 Subject: [PATCH 19/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index ce883dd34..0c1acdec2 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -7,7 +7,7 @@ "source": [ "# Nemo Curator Pipeline Example\n", "\n", - "## NeMo Curator introduction\n", + "## NeMo Curator Introduction\n", "The NeMo Curator is a Python library that consists of a collection of scalable data-mining modules for curating natural language processing (NLP) data for training large language models (LLMs). The modules within the NeMo Data Curator enable NLP researchers to mine high-quality text at scale from massive uncurated web corpora. 
\n", "\n", "NeMo Curator includes the following modules to perform data curation:\n", From 75f5dd7ec157d2723c3f8e095400089faa14af45 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:23:12 +0800 Subject: [PATCH 20/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 0c1acdec2..ba813d1bd 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -65,7 +65,7 @@ "\n", "**OS**: ubuntu 22.04\n", "\n", - "### Getting NeMo FrameWork Training Container\n", + "### Getting NeMo Framework Training Container\n", "- Get access to the container via https://developer.nvidia.com/nemo-framework\n", "- Set your docker credentials \n", " ```bash\n", From fcd82307a2fddda64788265f69b5f35bbe6eb4a5 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:23:20 +0800 Subject: [PATCH 21/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index ba813d1bd..44cb08b3f 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -2441,7 +2441,7 @@ "connected_component_output_path = os.path.join(connected_component_base_output_path, \"connected_components.parquet\")\n", "connected_component_cache_dir = os.path.join(connected_component_base_output_path, \"cache\")\n", "\n", - "#Relevant parameter\n", + "#Relevant parameters\n", "input_id_field = 'id'\n", "jaccard_threshold = 0.8\n", "\n", From 48af56133e5e02be92e8a65769e75f2503a49dc6 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:23:28 +0800 Subject: [PATCH 22/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 44cb08b3f..b53200123 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -2967,7 +2967,7 @@ "#Output\n", "dudped_output_dir = os.path.join(data_dir,\"remove_duplicate/result.parquet\")\n", "\n", - "#Relevant parameter\n", + "#Relevant parameters\n", "input_id_field = 'id'\n", "id_prefix = add_ID_id_prefix\n", "\n", From 49efc21066c547ef3b277e0caef725d6401bfdd5 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:23:41 +0800 
Subject: [PATCH 23/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index b53200123..7c3b7f1cb 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -286,7 +286,7 @@ "download_base_directory= os.path.join(data_dir,\"wiki_downloads\")\n", "download_output_directory = os.path.join(download_base_directory,\"data\")\n", "\n", - "#Relevant parameter\n", + "#Relevant parameters\n", "dump_date = \"20240201\"\n", "language = 'th'\n", "url_limit = 1" From 5826eb1051e0b63c84d514e039252c01dc850016 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:23:50 +0800 Subject: [PATCH 24/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 7c3b7f1cb..d07eb738f 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -45,7 +45,7 @@ "What is not included:\n", "1. Customized downloading\n", "2. Classifier filtering\n", - "3. Downstream-task deduplication\n", + "3. 
Downstream-task decontamination\n", "\n" ] }, From 30abf299edf69d7ef20a88fdf638bc8f02a83efd Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:24:00 +0800 Subject: [PATCH 25/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index d07eb738f..9b31c6753 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -73,7 +73,7 @@ "\n", " Username: $oauthtoken\n", " Password: \n", - "- Get NeMo NeMo FrameWork Training Container\n", + "- Get NeMo NeMo Framework Training Container\n", " ```bash\n", " docker pull nvcr.io/ea-bignlp/ga-participants/nemofw-training:24.01\n" ] From 43eae2717fb31441c03f00b93f5a7b5b66e46a33 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:24:11 +0800 Subject: [PATCH 26/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 9b31c6753..8b96f8e14 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -1513,7 +1513,7 @@ "3. Documents within the same bucket will be deemed similar. Since every document will be assigned `X` buckets and as long as two documents share 1 or more buckets they are deemed similar, the result of LSH will have more false positive as compared to false negative. The false positive cases will be filtered in following modules, namely jaccard compute.\n", "\n", "Arguments include:\n", - "- `minhash_length`:Length of minhash signature. Must bu consistent with `MinHash()`\n", + "- `minhash_length`:Length of minhash signature. 
Must be consistent with `MinHash()`\n", "- `num_buckets`: Number of buckets\n", "- `buckets_per_shuffle`: Number of buckets to shuffle concurrently\n", "- `id_field`: Key in input file for identifying document ID\n", From 87eefbdab23c1b37dc73f66aa7514a016ec6a2fa Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:24:37 +0800 Subject: [PATCH 27/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 8b96f8e14..a22fe8faa 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -1792,7 +1792,7 @@ "input_id_field = 'id'\n", "input_text_field = 'text'\n", "\n", - "#Relevant parameter for _Shuffle()\n", + "#Relevant parameters for _Shuffle()\n", "shuffle_id_fields=[\"dataset_id\", \"doc_id\"]\n", "int_to_str_id='id'\n", "\n", From 262d8e03a579e890119b621c0db0c2409f2d1655 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:25:03 +0800 Subject: [PATCH 28/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index a22fe8faa..4e8e21bc1 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -1783,7 +1783,7 @@ "input_anchor_docs_with_bk_dir = output_anchor_docs_with_bk_path\n", "output_shuffled_docs_path = os.path.join(jaccard_shuffle_base_output_path, \"shuffled_docs.parquet\")\n", "\n", - "#Relevant parameter for _MapBucket()\n", + "#Relevant parameters for _MapBucket()\n", "text_ddf_blocksize = 256\n", "bucket_mapping_ddf_blocksize = 256\n", "num_files = None\n", From 15db6f35bf0739c9e4e2bf45ea7532f18abfb539 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:25:21 +0800 Subject: [PATCH 29/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 4e8e21bc1..e839455f7 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -1731,7 +1731,7 @@ "### 5.3 Jaccard Shuffle\n", "In this section, we will be using `_MapBucket()` and `_Shuffle()`.\n", "\n", - "For `_MapBucket()`, it is designed to take input text data in .jsonl format and bucket information which is output of LSH, map the documents to their respective buckets, and write the 
resulting DataFrame containing the anchor documents and their associated bucket information to a Parquet file.Arguments include:\n", + "For `_MapBucket()`, it is designed to take input text data in jsonl format and bucket information which is output of LSH, map the documents to their respective buckets, and write the resulting DataFrame containing the anchor documents and their associated bucket information to a parquet file. Arguments include:\n", "- `id_field`: Key in input .jsonl file for identifying document ID\n", "- `text_field`: Key in input .jsonl file which contains document text.\n", "- `bucket_field`: Key in input _buckets.parquet which contains `bucket_id`.\n", From 84587b201da0c275c2b6aac2cf4dfb4050acb873 Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Fri, 17 May 2024 06:59:48 +0000 Subject: [PATCH 30/34] Fixed typo. Update content to lastest NeMo Curator version. Added fuzzy deduplication wrapper example Signed-off-by: Nicole Luo --- .../single_gpu_tutorial.ipynb | 1535 ++++++++++------- 1 file changed, 959 insertions(+), 576 deletions(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index e839455f7..006098375 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "1c1a4119", + "id": "e12a5ec6", "metadata": {}, "source": [ "# Nemo Curator Pipeline Example\n", @@ -18,6 +18,7 @@ "- Document-level deduplication\n", "- Multilingual downstream-task decontamination\n", "- Distributed Data Classification\n", + "- Personal identifiable information (PII) redaction\n", "\n", "NeMo Curator team has perform ablation experiments using Common Crawl dataset to train a 357M GPT-style model to assess the effect of different curation stage on model performance. \n", "\n", @@ -26,7 +27,7 @@ }, { "cell_type": "markdown", - "id": "be41377f", + "id": "58d062aa", "metadata": {}, "source": [ "## About this notebook\n", @@ -46,12 +47,14 @@ "1. Customized downloading\n", "2. Classifier filtering\n", "3. Downstream-task decontamination\n", + "4. Distributed data classification with PyTorch models\n", + "5. Personal identifiable information (PII) redaction\n", "\n" ] }, { "cell_type": "markdown", - "id": "8860c239", + "id": "a6e3492e", "metadata": {}, "source": [ "## Prerequisites\n", @@ -80,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "ff6bff1b", + "id": "01d4c35a", "metadata": {}, "source": [ "## 0. Env Setup" @@ -89,7 +92,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "24dce020", + "id": "8778a517", "metadata": {}, "outputs": [ { @@ -97,12 +100,7 @@ "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", - "Collecting jsonlines\n", - " Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)\n", - "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines) (23.2.0)\n", - "Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", - "Installing collected packages: jsonlines\n", - "Successfully installed jsonlines-4.0.0\n", + "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (2.0.0)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", @@ -116,8 +114,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "6831f331", + "execution_count": 1, + "id": "41d75988", "metadata": {}, "outputs": [], "source": [ @@ -126,8 +124,7 @@ "from nemo_curator.utils.distributed_utils import get_client,get_num_workers\n", "from nemo_curator.utils.script_utils import add_distributed_args\n", "from nemo_curator.utils.file_utils import get_all_files_paths_under, separate_by_metadata\n", - "from nemo_curator.utils.distributed_utils import read_data, write_to_disk\n", - "from nemo_curator.gpu_deduplication.utils import (create_logger, parse_nc_args, performance_report_if, enable_spilling)\n", + "from nemo_curator.utils.distributed_utils import read_data,write_to_disk\n", "from nemo_curator.datasets import DocumentDataset\n", "\n", "import os\n", @@ -136,28 +133,24 @@ "import time\n", "import cudf\n", "import dask_cudf\n", + "import dask\n", "import numpy as np\n", "from dask.distributed import Client, LocalCluster\n", - "import jsonlines" + "import jsonlines\n", + "\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "e28739b3", + "execution_count": 2, + "id": "0150b7e7", "metadata": {}, "outputs": [], "source": [ "def pre_imports():\n", " import cudf \n", "\n", - "def load_dataset(input_data_dir, file_type='jsonl'):\n", - " files = list(get_all_files_paths_under(input_data_dir))\n", - " raw_data = read_data(files, file_type=file_type, backend=\"pandas\", add_filename=True)\n", - " dataset = DocumentDataset(raw_data)\n", - "\n", - " return dataset\n", - "\n", "def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)):\n", " return add_distributed_args(parser)\n", "\n", @@ -179,8 +172,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "d279329f", + "execution_count": 3, + "id": "3d7e6547", "metadata": {}, "outputs": [ { @@ -199,7 +192,7 @@ }, { "cell_type": "markdown", - "id": "f3f452a3", + "id": "cf0aea31", "metadata": {}, "source": [ "## 1. Download\n", @@ -240,8 +233,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "1773cda2", + "execution_count": 4, + "id": "f41df88e", "metadata": {}, "outputs": [], "source": [ @@ -250,7 +243,7 @@ }, { "cell_type": "markdown", - "id": "d711a8f8", + "id": "b0f2d6d9", "metadata": {}, "source": [ " Start a CPU based Dask cluster. Please modify `n_workers` and `memory_limit` according to your hardware specification. 
To process TH wikipedia data, it's advised to have `memory_limit` greater than 12GB" @@ -258,8 +251,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "56ec66e0", + "execution_count": 5, + "id": "8742c111", "metadata": {}, "outputs": [], "source": [ @@ -269,7 +262,7 @@ }, { "cell_type": "markdown", - "id": "f794b51c", + "id": "f910ae71", "metadata": {}, "source": [ "Define parameters" @@ -278,7 +271,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "a90f3505", + "id": "c55bcfa8", "metadata": {}, "outputs": [], "source": [ @@ -294,7 +287,7 @@ }, { "cell_type": "markdown", - "id": "5628356b", + "id": "b11fdf43", "metadata": {}, "source": [ "Download TH wikipedia data" @@ -303,7 +296,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b591b9f2", + "id": "ff615514", "metadata": {}, "outputs": [], "source": [ @@ -315,7 +308,7 @@ }, { "cell_type": "markdown", - "id": "2aae29dd", + "id": "ff7ae4c0", "metadata": {}, "source": [ "Verify result" @@ -323,8 +316,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "169fadb9", + "execution_count": 26, + "id": "98564093", "metadata": {}, "outputs": [ { @@ -343,8 +336,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "f2bcb168", + "execution_count": 27, + "id": "ded3510b", "metadata": {}, "outputs": [ { @@ -362,7 +355,7 @@ }, { "cell_type": "markdown", - "id": "44fa2d13", + "id": "79b4a804", "metadata": {}, "source": [ "**[Optional]**Close the Dask cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again." @@ -370,8 +363,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "590c489c", + "execution_count": 28, + "id": "f1e8f645", "metadata": {}, "outputs": [], "source": [ @@ -381,17 +374,15 @@ }, { "cell_type": "markdown", - "id": "5ba566fc", + "id": "4db3267a", "metadata": {}, "source": [ - "## 2.Language separation and unicode fixing\n", - "\n", - "**Note**: In order to be run on interactive python. Please comment `from.code import *` and the related imports in `./nemo_curator/filters/__init__.py`" + "## 2.Language separation and unicode fixing" ] }, { "cell_type": "markdown", - "id": "f742b881", + "id": "228e3978", "metadata": {}, "source": [ "In this section, we will be using a language classification model by fasttext to separate the TH wikipedia dataset based on the document major languages, and we will also fix the unicode in the documents. Detailed steps are:\n", @@ -406,8 +397,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "71a6e4a2", + "execution_count": 7, + "id": "bd5d6920", "metadata": {}, "outputs": [], "source": [ @@ -418,16 +409,16 @@ }, { "cell_type": "markdown", - "id": "4916079c", + "id": "bd2923bb", "metadata": {}, "source": [ - "**[Optional]**8Start a cpu based Dask cluster." + "**[Optional]** Start a cpu based Dask cluster." 
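As a quick illustration of what the `FastTextLangId` filter is doing under the hood, the sketch below queries the same `lid.176.bin` model directly through the `fasttext` package. This is only a simplified example of the language-identification step, not the NeMo Curator implementation; the sample sentence is made up, and the model file is assumed to be the one downloaded in this section.

```python
# Minimal sketch: score one text with the fastText language-ID model.
# Assumes the `fasttext` package is installed and lid.176.bin sits in the
# current directory (the same model file downloaded in this section).
import fasttext

lid_model = fasttext.load_model("lid.176.bin")

sample = "สวัสดีครับ ยินดีต้อนรับสู่วิกิพีเดียภาษาไทย"  # hypothetical Thai sentence
labels, scores = lid_model.predict(sample.replace("\n", " "), k=1)

# Labels look like '__label__th'; strip the prefix and upper-case it to mimic
# the 'TH'/'EN' folder names produced by the language separation step.
language = labels[0].replace("__label__", "").upper()
print(language, float(scores[0]))
```

Documents whose top label is not `TH` are the ones that end up in the other language folders (for example `EN`) after the separation step runs.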
] }, { "cell_type": "code", - "execution_count": 17, - "id": "23a63375", + "execution_count": 8, + "id": "4375c02b", "metadata": {}, "outputs": [], "source": [ @@ -437,7 +428,7 @@ }, { "cell_type": "markdown", - "id": "957d7357", + "id": "2f834de0", "metadata": {}, "source": [ "Define parameters" @@ -445,13 +436,13 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "6270de3f", + "execution_count": 9, + "id": "3b3856c6", "metadata": {}, "outputs": [], "source": [ "# Input path\n", - "multilingual_data_path = download_output_directory\n", + "multilingual_data_path = f\"{download_output_directory}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\"\n", "\n", "# Output path\n", "language_base_output_path = os.path.join(data_dir,\"language_sep\")\n", @@ -471,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "598cff2d", + "id": "3b6f887f", "metadata": {}, "source": [ "Download fasttext model" @@ -479,24 +470,24 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "0c7cc007", + "execution_count": 10, + "id": "218c955e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2024-03-22 08:40:55-- https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin\n", - "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.227.74.12, 13.227.74.118, 13.227.74.9, ...\n", - "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.227.74.12|:443... connected.\n", + "--2024-05-17 03:17:09-- https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin\n", + "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 99.84.238.181, 99.84.238.154, 99.84.238.162, ...\n", + "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|99.84.238.181|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 131266198 (125M) [application/octet-stream]\n", - "Saving to: ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin’\n", + "Saving to: ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin.1’\n", "\n", - "lid.176.bin 100%[===================>] 125.18M 220MB/s in 0.6s \n", + "lid.176.bin.1 100%[===================>] 125.18M 184MB/s in 0.7s \n", "\n", - "2024-03-22 08:40:56 (220 MB/s) - ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin’ saved [131266198/131266198]\n", + "2024-05-17 03:17:10 (184 MB/s) - ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin.1’ saved [131266198/131266198]\n", "\n" ] } @@ -507,7 +498,7 @@ }, { "cell_type": "markdown", - "id": "d875771b", + "id": "c410253e", "metadata": {}, "source": [ "Apply fasttext model to separate documents by their languages" @@ -515,8 +506,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "id": "c959800c", + "execution_count": 11, + "id": "c9afe965", "metadata": {}, "outputs": [ { @@ -537,7 +528,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken for splitting language:147.80864667892456\n" + "Time taken for splitting language:140.04064464569092\n" ] } ], @@ -545,7 +536,7 @@ "t0 = time.time()\n", "\n", "# Load dataset \n", - "multilingual_dataset = load_dataset(multilingual_data_path)\n", + "multilingual_dataset = DocumentDataset.read_json(multilingual_data_path,add_filename=True)\n", "\n", "#Define Language separation pipeline\n", "lang_filter = FastTextLangId(os.path.join(model_path,'lid.176.bin'))\n", @@ -563,7 +554,7 @@ }, { "cell_type": "markdown", - "id": "bd54a24a", + "id": "31917e7b", "metadata": {}, "source": [ "Load `UnicodeReformatter` to reformat any unicode appeared in the desired language dataset" @@ -571,8 +562,8 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "0c09bc28", + "execution_count": 12, + "id": "55da5f12", "metadata": {}, "outputs": [ { @@ -581,7 +572,7 @@ "text": [ "Reading 1 files\n", "Writing to disk complete for 1 partitions\n", - "Time taken for fixing unicode:444.5816135406494\n" + "Time taken for fixing unicode:437.4811737537384\n" ] } ], @@ -590,20 +581,20 @@ "\n", "# Read the language specific data and fix the unicode in it\n", "lang_data_path = os.path.join(language_separated_output_path, target_language)\n", - "lang_data = load_dataset(lang_data_path)\n", + "lang_data = DocumentDataset.read_json(lang_data_path,add_filename=True)\n", "\n", "cleaner = Modify(UnicodeReformatter())\n", "cleaned_data = cleaner(lang_data)\n", "\n", "# Write the cleaned_data\n", - "write_to_disk(cleaned_data.df, lang_sep_cleaned_data_output_path, write_to_filename=True, output_type='jsonl')\n", + "cleaned_data.to_json(lang_sep_cleaned_data_output_path, write_to_filename=True)\n", "\n", "print(f\"Time taken for fixing unicode:{time.time()-t0}\")" ] }, { "cell_type": "markdown", - "id": "00c6e5a1", + "id": "bc214e82", "metadata": {}, "source": [ "Verify the result. We can see that some documents has been removed from TH wikipedia dataset since the number of lines in this output file is less than the original file (no. 
of lines = 162164)" @@ -611,8 +602,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "id": "b2b34d46", + "execution_count": 13, + "id": "6b6eb634", "metadata": {}, "outputs": [ { @@ -631,7 +622,7 @@ }, { "cell_type": "markdown", - "id": "39d539a2", + "id": "57e22770", "metadata": {}, "source": [ "Furthur verify by loading documents that has been identified as other language, such as 'EN'. We can see from output that the removed document is indeed in English and contains very little or even no Thai." @@ -639,26 +630,35 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "5ace3c5b", + "execution_count": 38, + "id": "79e32205", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\",\"id\":\"1\",\"language\":\"TH\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\"}\n", + "\n" + ] + } + ], "source": [ "check_jsonl_file(os.path.join(language_separated_output_path,'EN'))" ] }, { "cell_type": "markdown", - "id": "9b817bf7", + "id": "39020971", "metadata": {}, "source": [ - "**[Optional]**Close the Dask cluster." + "**[Optional]** Close the Dask cluster." ] }, { "cell_type": "code", - "execution_count": 153, - "id": "bf05b6c2", + "execution_count": 37, + "id": "64da23ec", "metadata": {}, "outputs": [], "source": [ @@ -668,20 +668,20 @@ }, { "cell_type": "markdown", - "id": "cc8b6aef", + "id": "6134eaf3", "metadata": {}, "source": [ "## 3.Add ID\n", - "TH wikipedia data do have `id` field, but the `id` field contains number only. It will be better if we unified the `id` field and transform it to the format of `_`. In this way, when handling multiple dataset, we will able to know which document from which dataset has been removed. This `id` will be useful when we are running deduplication and heuristic filtering. The function we will be using is `AddID()`. Arguments for this function include:\n", + "TH wikipedia data do have `id` field, but the `id` field contains number only. It will be better if we unified the `id` field and transform it to the format of `_`. In this way, when handling multiple dataset, we will be able to know which document from which dataset has been removed. This `id` will be useful when we are running deduplication and heuristic filtering. The function we will be using is `AddID()`. Arguments for this function include:\n", "- `id_field`: fields will be added to input .json file. If the key already exists in the .jsonl, it's value will be replaced.\n", - "- `id_prefix`: prefix used in ID. Default is 'doc-id'\n", - "- `start_index`: starting index in ID. Default is 0" + "- `id_prefix`: prefix used in ID. Default is 'doc_id'\n", + "- `start_index`: starting index in ID. Default is None. 
When set to None, an unordered ID scheme will be used for fast calculation. In this notebook, it's set to 0 for easier reference." ] }, { "cell_type": "code", - "execution_count": 24, - "id": "fe9e6eef", + "execution_count": 14, + "id": "5bed2e25", "metadata": {}, "outputs": [], "source": [ @@ -690,16 +690,16 @@ }, { "cell_type": "markdown", - "id": "232c01a5", + "id": "be1c546b", "metadata": {}, "source": [ - "**[Optional]**If there is no running Dask cluster, start CPU based Dask cluster." + "**[Optional]** If there is no running Dask cluster, start CPU based Dask cluster." ] }, { "cell_type": "code", - "execution_count": 155, - "id": "f3f483eb", + "execution_count": 15, + "id": "3a6349d9", "metadata": {}, "outputs": [], "source": [ @@ -709,7 +709,7 @@ }, { "cell_type": "markdown", - "id": "2be65a51", + "id": "503bfa4c", "metadata": {}, "source": [ "Define relevant parameters" @@ -717,8 +717,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "id": "054019a5", + "execution_count": 16, + "id": "a14c6ba3", "metadata": {}, "outputs": [], "source": [ @@ -734,7 +734,7 @@ }, { "cell_type": "markdown", - "id": "80f9591c", + "id": "b249dcf9", "metadata": {}, "source": [ "Adding ID to dataset" @@ -742,8 +742,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "id": "e8fd7e09", + "execution_count": 17, + "id": "d12bb962", "metadata": {}, "outputs": [ { @@ -752,28 +752,28 @@ "text": [ "Reading 1 files\n", "Writing to disk complete for 1 partitions\n", - "Time taken for add ID:56.01176333427429\n" + "Time taken for add ID:47.33783745765686\n" ] } ], "source": [ "t0 = time.time()\n", "# Read input files\n", - "dataset = load_dataset(add_id_input_data_dir)\n", + "dataset = DocumentDataset.read_json(add_id_input_data_dir,add_filename=True)\n", "\n", "# Run AddID() on the input dataset\n", "add_id = AddId(id_field='id',id_prefix=add_ID_id_prefix,start_index=0)\n", "id_dataset = add_id(dataset)\n", "\n", "#Output files\n", - "write_to_disk(id_dataset.df, output_file_dir=added_id_output_path, write_to_filename=True, output_type='jsonl')\n", + "id_dataset.to_json(added_id_output_path, write_to_filename=True)\n", "\n", "print(f\"Time taken for add ID:{time.time()-t0}\")" ] }, { "cell_type": "markdown", - "id": "50016a50", + "id": "ce2934df", "metadata": {}, "source": [ "Verify the result. From the output, we can see that the `id` value has been changed to `TH_wiki-0000000000` " @@ -781,8 +781,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "id": "27a634e9", + "execution_count": 18, + "id": "cd51cd14", "metadata": {}, "outputs": [ { @@ -800,7 +800,7 @@ }, { "cell_type": "markdown", - "id": "e7084fed", + "id": "f249ab8b", "metadata": {}, "source": [ "Close Dask cluster. This cell needs to be run as we are starting a new GPU Dask cluster in the following task" @@ -808,8 +808,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "id": "16399469", + "execution_count": 20, + "id": "62336143", "metadata": {}, "outputs": [], "source": [ @@ -819,7 +819,7 @@ }, { "cell_type": "markdown", - "id": "cb227709", + "id": "d6fb16b1", "metadata": {}, "source": [ "## 4.Exact Dedplication\n", @@ -835,8 +835,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "id": "8fa6c3af", + "execution_count": 21, + "id": "044f7eee", "metadata": {}, "outputs": [], "source": [ @@ -845,7 +845,7 @@ }, { "cell_type": "markdown", - "id": "aa70fd06", + "id": "6e5da88e", "metadata": {}, "source": [ "Start a GPU based Dask cluster. 
Since GPU based Dask cluster involves setting several arguments, we will use the `get_client()` wrapper function to quickly set up. Please make sure the `device` in `args` is `gpu`" @@ -853,17 +853,17 @@ }, { "cell_type": "code", - "execution_count": 31, - "id": "7e9530f6", + "execution_count": 22, + "id": "e4d6920d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Namespace(scheduler_address=None, scheduler_file=None, n_workers=20, threads_per_worker=1, rmm_pool_size=None, protocol='tcp', nvlink_only=False, files_per_partition=2, num_files=-1, device='gpu', set_torch_to_use_rmm=False)" + "Namespace(scheduler_address=None, scheduler_file=None, n_workers=20, threads_per_worker=1, rmm_pool_size=None, protocol='tcp', nvlink_only=False, files_per_partition=2, num_files=None, device='gpu', set_torch_to_use_rmm=False)" ] }, - "execution_count": 31, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -878,8 +878,8 @@ }, { "cell_type": "code", - "execution_count": 32, - "id": "f71ab145", + "execution_count": 23, + "id": "717b6cef", "metadata": {}, "outputs": [ { @@ -892,10 +892,10 @@ { "data": { "text/plain": [ - "{'tcp://127.0.0.1:37795': None}" + "{'tcp://127.0.0.1:42505': None}" ] }, - "execution_count": 32, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -908,7 +908,7 @@ }, { "cell_type": "markdown", - "id": "4ef57149", + "id": "f267e161", "metadata": {}, "source": [ "Define parameters" @@ -916,8 +916,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "id": "26e6927e", + "execution_count": 24, + "id": "d01e2f08", "metadata": {}, "outputs": [], "source": [ @@ -936,8 +936,8 @@ }, { "cell_type": "code", - "execution_count": 34, - "id": "b9a75a74", + "execution_count": 25, + "id": "6395ffde", "metadata": {}, "outputs": [], "source": [ @@ -947,7 +947,7 @@ }, { "cell_type": "markdown", - "id": "a9fc0bd2", + "id": "a654a16e", "metadata": {}, "source": [ "Apply exact deduplication" @@ -955,17 +955,31 @@ }, { "cell_type": "code", - "execution_count": 35, - "id": "daf8f324", + "execution_count": 26, + "id": "a5e0117c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Reading 1 files\n", + "Reading 1 files\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/exact_dedup.py:158: UserWarning: Output path f/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/exact_dedup/data/_exact_duplicates.parquet already exists and will be overwritten\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Number of exact duplicated file:53\n", - "Time taken for exact duplicate:3.0404415130615234\n" + "Time taken for exact duplicate:1.9629592895507812\n" ] } ], @@ -991,7 +1005,7 @@ }, { "cell_type": "markdown", - "id": "517c60e4", + "id": "7f8bdb88", "metadata": {}, "source": [ "Verify the output duplicated ID. We can group by the `_hashes` to get the list of duplicated documents having the same _hashes and use `extract_lines_with_id()` to verify that those documents are indeed exact duplicates. 
Please note that the `id` might changes, therefore, please replace the `target_list` when necessary" @@ -999,8 +1013,8 @@ }, { "cell_type": "code", - "execution_count": 36, - "id": "2f3c67f8", + "execution_count": 27, + "id": "e045d65a", "metadata": {}, "outputs": [ { @@ -1038,27 +1052,27 @@ " \n", " \n", " 0\n", - " TH_wiki-0000021211\n", + " TH_wiki-0000021096\n", " 1708cb56ec582f78716f0864dca9382d\n", " \n", " \n", " 1\n", - " TH_wiki-0000021213\n", + " TH_wiki-0000021100\n", " 1708cb56ec582f78716f0864dca9382d\n", " \n", " \n", " 2\n", - " TH_wiki-0000105191\n", - " e77a248506ef16737288fae5759db33a\n", + " TH_wiki-0000067251\n", + " edf8af427a33ed94150899970f39770f\n", " \n", " \n", " 3\n", - " TH_wiki-0000105192\n", - " 2e386f5c3af70f43874618988d4842b2\n", + " TH_wiki-0000105191\n", + " e77a248506ef16737288fae5759db33a\n", " \n", " \n", " 4\n", - " TH_wiki-0000105193\n", + " TH_wiki-0000105192\n", " 2e386f5c3af70f43874618988d4842b2\n", " \n", " \n", @@ -1067,14 +1081,14 @@ ], "text/plain": [ " id _hashes\n", - "0 TH_wiki-0000021211 1708cb56ec582f78716f0864dca9382d\n", - "1 TH_wiki-0000021213 1708cb56ec582f78716f0864dca9382d\n", - "2 TH_wiki-0000105191 e77a248506ef16737288fae5759db33a\n", - "3 TH_wiki-0000105192 2e386f5c3af70f43874618988d4842b2\n", - "4 TH_wiki-0000105193 2e386f5c3af70f43874618988d4842b2" + "0 TH_wiki-0000021096 1708cb56ec582f78716f0864dca9382d\n", + "1 TH_wiki-0000021100 1708cb56ec582f78716f0864dca9382d\n", + "2 TH_wiki-0000067251 edf8af427a33ed94150899970f39770f\n", + "3 TH_wiki-0000105191 e77a248506ef16737288fae5759db33a\n", + "4 TH_wiki-0000105192 2e386f5c3af70f43874618988d4842b2" ] }, - "execution_count": 36, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1087,8 +1101,8 @@ }, { "cell_type": "code", - "execution_count": 37, - "id": "7ed7d4de", + "execution_count": 28, + "id": "8754887e", "metadata": {}, "outputs": [ { @@ -1125,12 +1139,12 @@ " \n", " 1\n", " 15f35c239b6579b4642f7656e64576ac\n", - " TH_wiki-0000098621 TH_wiki-0000074714 TH_wiki-...\n", + " TH_wiki-0000074714 TH_wiki-0000074715 TH_wiki-...\n", " \n", " \n", " 2\n", " 1708cb56ec582f78716f0864dca9382d\n", - " TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-...\n", + " TH_wiki-0000021096 TH_wiki-0000021100 TH_wiki-...\n", " \n", " \n", " 3\n", @@ -1156,13 +1170,13 @@ "\n", " id \n", "0 TH_wiki-0000157216 TH_wiki-0000066307 \n", - "1 TH_wiki-0000098621 TH_wiki-0000074714 TH_wiki-... \n", - "2 TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-... \n", + "1 TH_wiki-0000074714 TH_wiki-0000074715 TH_wiki-... \n", + "2 TH_wiki-0000021096 TH_wiki-0000021100 TH_wiki-... \n", "3 TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-... \n", "4 TH_wiki-0000122055 TH_wiki-0000116550 " ] }, - "execution_count": 37, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1173,8 +1187,8 @@ }, { "cell_type": "code", - "execution_count": 39, - "id": "3051ed4b", + "execution_count": 29, + "id": "13712668", "metadata": {}, "outputs": [ { @@ -1194,7 +1208,7 @@ }, { "cell_type": "markdown", - "id": "ec31440b", + "id": "7a388445", "metadata": {}, "source": [ "**[Optional]** You might choose to close Dask cluster here" @@ -1202,8 +1216,8 @@ }, { "cell_type": "code", - "execution_count": 89, - "id": "2ee05303", + "execution_count": 31, + "id": "7875bf12", "metadata": {}, "outputs": [], "source": [ @@ -1213,7 +1227,7 @@ }, { "cell_type": "markdown", - "id": "710e8540", + "id": "20502f76", "metadata": {}, "source": [ "## 5. Fuzzy Deduplication\n", @@ -1238,12 +1252,14 @@ "2. 
Bucket computation\n", "3. Jaccard shuffle for load balancing in a distributed system\n", "4. Jaccard similarity computation\n", - "5. Connected component " + "5. Connected component \n", + "\n", + "In this section, we will firstly provide examples to each sub-steps for users to have a better understanding on what is going on under the hood. At the last sub section, we will provide example for the fuzzy deduplication wrapper." ] }, { "cell_type": "markdown", - "id": "c4b99c5e", + "id": "de98daed", "metadata": {}, "source": [ "**If there is not running Dask cluster, start a GPU Dask cluster here**" @@ -1251,17 +1267,17 @@ }, { "cell_type": "code", - "execution_count": 90, - "id": "115ff2dc", + "execution_count": 60, + "id": "0a84ae27", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'tcp://127.0.0.1:33223': None}" + "{'tcp://127.0.0.1:43209': None}" ] }, - "execution_count": 90, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } @@ -1279,7 +1295,7 @@ }, { "cell_type": "markdown", - "id": "1979977d", + "id": "5de7a035", "metadata": {}, "source": [ "### 5.1 Minhash\n", @@ -1303,8 +1319,8 @@ }, { "cell_type": "code", - "execution_count": 40, - "id": "f9b2a642", + "execution_count": 30, + "id": "bbc84690", "metadata": {}, "outputs": [], "source": [ @@ -1313,7 +1329,7 @@ }, { "cell_type": "markdown", - "id": "4c152974", + "id": "3b0beafe", "metadata": {}, "source": [ "Define parameters" @@ -1321,8 +1337,8 @@ }, { "cell_type": "code", - "execution_count": 41, - "id": "117a569d", + "execution_count": 31, + "id": "52f056f7", "metadata": {}, "outputs": [], "source": [ @@ -1350,7 +1366,7 @@ }, { "cell_type": "markdown", - "id": "73c1ad41", + "id": "aaefe7bd", "metadata": {}, "source": [ "Run MinHash" @@ -1358,8 +1374,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "id": "a17954eb", + "execution_count": 32, + "id": "da632a42", "metadata": {}, "outputs": [ { @@ -1367,19 +1383,23 @@ "output_type": "stream", "text": [ "Computing minhashes for /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/add_id/cleaned\n", - "Reading 1 files\n", - "Time taken for MinHash:7.543871879577637\n" + "Reading 1 files\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:175: UserWarning: Output path /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/fuzzy/minhash/data/_minhashes.parquet already exists and will be overwritten\n", " warnings.warn(\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken for MinHash:5.899524927139282\n" + ] } ], "source": [ @@ -1415,7 +1435,7 @@ }, { "cell_type": "markdown", - "id": "19cddba5", + "id": "9ad4ba59", "metadata": {}, "source": [ "Verify result" @@ -1423,8 +1443,8 @@ }, { "cell_type": "code", - "execution_count": 45, - "id": "df83eec5", + "execution_count": 33, + "id": "93220b5c", "metadata": {}, "outputs": [ { @@ -1491,7 +1511,7 @@ "4 TH_wiki-0000000004 [1559901, 11771639, 487706, 826569, 1203860, 5..." 
] }, - "execution_count": 45, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -1503,7 +1523,7 @@ }, { "cell_type": "markdown", - "id": "998ab08a", + "id": "b407928e", "metadata": {}, "source": [ "### 5.2 LSH\n", @@ -1524,28 +1544,28 @@ }, { "cell_type": "code", - "execution_count": 46, - "id": "138544a5", + "execution_count": 34, + "id": "f3801d7a", "metadata": {}, "outputs": [], "source": [ "from nemo_curator import LSH\n", - "from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import \\\n", + "from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import \\\n", " convert_str_id_to_int" ] }, { "cell_type": "markdown", - "id": "178fd0e4", + "id": "2a2c178a", "metadata": {}, "source": [ - "Define parameter" + "Define parameters" ] }, { "cell_type": "code", - "execution_count": 47, - "id": "21d2a261", + "execution_count": 35, + "id": "d52707b9", "metadata": {}, "outputs": [], "source": [ @@ -1570,7 +1590,7 @@ }, { "cell_type": "markdown", - "id": "a18708d2", + "id": "c59b4fe6", "metadata": {}, "source": [ "Run LSH" @@ -1578,17 +1598,15 @@ }, { "cell_type": "code", - "execution_count": 48, - "id": "9eebeb10", + "execution_count": 36, + "id": "71c0848f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:361: UserWarning: Output path /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/fuzzy/lsh/data/_buckets.parquet already exists and will be overwritten\n", " warnings.warn(\n" ] }, @@ -1596,7 +1614,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken for LSH:20.533941984176636\n" + "Time taken for LSH:18.237318754196167\n" ] } ], @@ -1616,7 +1634,7 @@ "#Run LSH()\n", "lsh = LSH(\n", " cache_dir=lsh_output_dir,\n", - " minhash_length=minhash_length,\n", + " num_hashes=minhash_length,\n", " num_buckets=num_bands,\n", " buckets_per_shuffle=buckets_per_shuffle,\n", " id_fields=[\"dataset_id\", \"doc_id\"],\n", @@ -1631,7 +1649,7 @@ }, { "cell_type": "markdown", - "id": "813603e2", + "id": "3789c538", "metadata": {}, "source": [ "Verify result" @@ -1639,8 +1657,8 @@ }, { "cell_type": "code", - "execution_count": 49, - "id": "c47da6b9", + "execution_count": 37, + "id": "d8663302", "metadata": {}, "outputs": [ { @@ -1673,32 +1691,32 @@ " \n", " 0\n", " 1692361878\n", - " 124692\n", - " 96\n", + " 124883\n", + " 38\n", " \n", " \n", " 1\n", " 1692361878\n", - " 85282\n", - " 385\n", + " 123211\n", + " 141\n", " \n", " \n", " 2\n", " 1692361878\n", - " 156638\n", - " 529\n", + " 124885\n", + " 38\n", " \n", " \n", " 3\n", " 1692361878\n", - " 160566\n", - " 540\n", + " 85294\n", + " 345\n", " \n", " \n", " 4\n", " 1692361878\n", - " 160567\n", - " 540\n", + " 124886\n", + " 38\n", " \n", " \n", "\n", @@ -1706,14 +1724,14 @@ ], "text/plain": [ " dataset_id doc_id _bucket_id\n", - "0 1692361878 124692 96\n", - "1 1692361878 85282 385\n", - "2 1692361878 156638 529\n", - "3 1692361878 160566 540\n", - "4 1692361878 160567 540" + "0 1692361878 124883 38\n", + "1 1692361878 123211 141\n", + "2 1692361878 124885 38\n", + "3 
1692361878 85294 345\n", + "4 1692361878 124886 38" ] }, - "execution_count": 49, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1725,7 +1743,7 @@ }, { "cell_type": "markdown", - "id": "07bade4a", + "id": "00f5567b", "metadata": {}, "source": [ "### 5.3 Jaccard Shuffle\n", @@ -1746,8 +1764,8 @@ }, { "cell_type": "code", - "execution_count": 50, - "id": "565253ae", + "execution_count": 38, + "id": "c5d458d1", "metadata": {}, "outputs": [], "source": [ @@ -1760,7 +1778,7 @@ }, { "cell_type": "markdown", - "id": "70387977", + "id": "e904bc34", "metadata": {}, "source": [ "Define parameters" @@ -1768,8 +1786,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "id": "5cff7d76", + "execution_count": 39, + "id": "170a44fd", "metadata": {}, "outputs": [], "source": [ @@ -1801,7 +1819,7 @@ }, { "cell_type": "markdown", - "id": "699a53f1", + "id": "333e91a8", "metadata": {}, "source": [ "Run Jaccard map bucket" @@ -1809,8 +1827,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "id": "0a6e5a84", + "execution_count": 40, + "id": "67b96227", "metadata": {}, "outputs": [ { @@ -1818,24 +1836,8 @@ "output_type": "stream", "text": [ "Number of files being read for jaccard calculation = 1\n", - "Number of ddf_bk partitions = 1\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Time taken for Bucket Mapping:2.1162023544311523 s\n" + "Number of ddf_bk partitions = 1\n", + "Time taken for Bucket Mapping:2.2563915252685547 s\n" ] } ], @@ -1866,16 +1868,16 @@ }, { "cell_type": "markdown", - "id": "96246266", + "id": "8f76b8ef", "metadata": {}, "source": [ - "Verify results " + "Verify result" ] }, { "cell_type": "code", - "execution_count": 53, - "id": "09e65f8b", + "execution_count": 41, + "id": "81c9c7c7", "metadata": {}, "outputs": [ { @@ -1912,51 +1914,51 @@ " \n", " 0\n", " 1692361878\n", - " 138220\n", + " 8895\n", " 1692361878\n", - " 145256\n", + " 8964\n", " 1692361878\n", - " 143672\n", + " 8895\n", " 0\n", " \n", " \n", " 1\n", " 1692361878\n", - " 50509\n", + " 127089\n", " 1692361878\n", - " 50509\n", + " 127220\n", " 1692361878\n", - " 50457\n", + " 127089\n", " 0\n", " \n", " \n", " 2\n", " 1692361878\n", - " 93989\n", + " 127090\n", " 1692361878\n", - " 93846\n", + " 127220\n", " 1692361878\n", - " 93807\n", + " 127089\n", " 0\n", " \n", " \n", " 3\n", " 1692361878\n", - " 20448\n", + " 151728\n", " 1692361878\n", - " 20090\n", + " 151728\n", " 1692361878\n", - " 20444\n", + " 151729\n", " 0\n", " \n", " \n", " 4\n", " 1692361878\n", - " 93991\n", + " 137262\n", " 1692361878\n", - " 93927\n", + " 137301\n", " 1692361878\n", - " 93697\n", + " 137262\n", " 0\n", " \n", " \n", @@ -1965,21 +1967,21 @@ ], "text/plain": [ " dataset_id doc_id anchor_1_dataset_id anchor_1_doc_id \\\n", - "0 1692361878 138220 1692361878 145256 \n", - "1 1692361878 50509 1692361878 50509 \n", - "2 1692361878 93989 1692361878 93846 \n", - "3 1692361878 20448 1692361878 20090 \n", - "4 1692361878 93991 1692361878 93927 \n", + "0 1692361878 8895 1692361878 8964 
\n", + "1 1692361878 127089 1692361878 127220 \n", + "2 1692361878 127090 1692361878 127220 \n", + "3 1692361878 151728 1692361878 151728 \n", + "4 1692361878 137262 1692361878 137301 \n", "\n", " anchor_0_dataset_id anchor_0_doc_id _output_partition_id \n", - "0 1692361878 143672 0 \n", - "1 1692361878 50457 0 \n", - "2 1692361878 93807 0 \n", - "3 1692361878 20444 0 \n", - "4 1692361878 93697 0 " + "0 1692361878 8895 0 \n", + "1 1692361878 127089 0 \n", + "2 1692361878 127089 0 \n", + "3 1692361878 151729 0 \n", + "4 1692361878 137262 0 " ] }, - "execution_count": 53, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -1991,16 +1993,16 @@ }, { "cell_type": "markdown", - "id": "35bb1e86", + "id": "b4896749", "metadata": {}, "source": [ - "**[Optional]**Remove previous Jaccard Shuffle results. Run only when there are files under the Jaccard Shuffle output path" + "**[Optional]** Remove previous Jaccard Shuffle results. Run only when there are files under the Jaccard Shuffle output path" ] }, { "cell_type": "code", - "execution_count": 88, - "id": "da7dcc10", + "execution_count": 43, + "id": "2d4dd55f", "metadata": {}, "outputs": [], "source": [ @@ -2009,7 +2011,7 @@ }, { "cell_type": "markdown", - "id": "24c2b39d", + "id": "f9b5ab9e", "metadata": {}, "source": [ "Run Jaccard Shuffle" @@ -2017,15 +2019,15 @@ }, { "cell_type": "code", - "execution_count": 54, - "id": "a9dcf646", + "execution_count": 44, + "id": "acccb80b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 0%| | 0/1 [00:00\n", " \n", " 0\n", - " พุทธศักราช 676 ใกล้เคียงกับ\\n เมษายน ค.ศ. 133 ...\n", - " 263\n", - " 1692361878-7032\n", - " 1692361878-7032\n", - " 1692361878-7052\n", + " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", + " 1894\n", + " 1692361878-127021\n", + " 1692361878-127021\n", + " 1692361878-126958\n", " \n", " \n", " 1\n", - " พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...\n", - " 217\n", - " 1692361878-9082\n", - " 1692361878-8805\n", - " 1692361878-9071\n", + " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", + " 1894\n", + " 1692361878-127021\n", + " 1692361878-127021\n", + " 1692361878-127017\n", " \n", " \n", " 2\n", - " พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...\n", - " 217\n", - " 1692361878-9082\n", - " 1692361878-9028\n", - " 1692361878-9045\n", + " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", + " 1894\n", + " 1692361878-127021\n", + " 1692361878-126928\n", + " 1692361878-126891\n", " \n", " \n", " 3\n", - " พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...\n", - " 217\n", - " 1692361878-9082\n", - " 1692361878-9072\n", - " 1692361878-9082\n", + " วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค...\n", + " 423\n", + " 1692361878-87271\n", + " 1692361878-87204\n", + " 1692361878-87271\n", " \n", " \n", " 4\n", - " ประเทศฮังการี เข้าร่วมแข่งขันกีฬาโอลิมปิกฤดูร้...\n", - " 2039\n", - " 1692361878-49091\n", - " 1692361878-49093\n", - " 1692361878-49087\n", + " วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค...\n", + " 423\n", + " 1692361878-87271\n", + " 1692361878-87267\n", + " 1692361878-87271\n", " \n", " \n", "\n", @@ -2174,21 +2176,21 @@ ], "text/plain": [ " text _text_bytes \\\n", - "0 พุทธศักราช 676 ใกล้เคียงกับ\\n เมษายน ค.ศ. 133 ... 263 \n", - "1 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", - "2 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", - "3 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", - "4 ประเทศฮังการี เข้าร่วมแข่งขันกีฬาโอลิมปิกฤดูร้... 
2039 \n", + "0 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", + "1 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", + "2 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", + "3 วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค... 423 \n", + "4 วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค... 423 \n", "\n", - " id anchor_0_id anchor_1_id \n", - "0 1692361878-7032 1692361878-7032 1692361878-7052 \n", - "1 1692361878-9082 1692361878-8805 1692361878-9071 \n", - "2 1692361878-9082 1692361878-9028 1692361878-9045 \n", - "3 1692361878-9082 1692361878-9072 1692361878-9082 \n", - "4 1692361878-49091 1692361878-49093 1692361878-49087 " + " id anchor_0_id anchor_1_id \n", + "0 1692361878-127021 1692361878-127021 1692361878-126958 \n", + "1 1692361878-127021 1692361878-127021 1692361878-127017 \n", + "2 1692361878-127021 1692361878-126928 1692361878-126891 \n", + "3 1692361878-87271 1692361878-87204 1692361878-87271 \n", + "4 1692361878-87271 1692361878-87267 1692361878-87271 " ] }, - "execution_count": 55, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -2200,7 +2202,7 @@ }, { "cell_type": "markdown", - "id": "ffb70238", + "id": "1a23a5c0", "metadata": {}, "source": [ "### 5.4 Jaccard Compute\n", @@ -2215,8 +2217,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "id": "06346b88", + "execution_count": 46, + "id": "6cfa08ea", "metadata": {}, "outputs": [], "source": [ @@ -2225,7 +2227,7 @@ }, { "cell_type": "markdown", - "id": "d71f440f", + "id": "389f305b", "metadata": {}, "source": [ "Define parameters" @@ -2233,8 +2235,8 @@ }, { "cell_type": "code", - "execution_count": 57, - "id": "457ae138", + "execution_count": 47, + "id": "c142a42a", "metadata": {}, "outputs": [], "source": [ @@ -2256,7 +2258,7 @@ }, { "cell_type": "markdown", - "id": "619bf820", + "id": "7a0f610f", "metadata": {}, "source": [ "Run Jaccard Compute" @@ -2264,8 +2266,8 @@ }, { "cell_type": "code", - "execution_count": 58, - "id": "2f094db1", + "execution_count": 48, + "id": "8ceae838", "metadata": {}, "outputs": [ { @@ -2273,13 +2275,13 @@ "output_type": "stream", "text": [ "Running jaccard compute script\n", - "Time taken for Jaccard Computing: 0.8689384460449219\n" + "Time taken for Jaccard Computing: 0.5923423767089844\n" ] } ], "source": [ - "enable_spilling()\n", - "client.run(enable_spilling)\n", + "# enable_spilling()\n", + "# client.run(enable_spilling)\n", "\n", "print(\"Running jaccard compute script\", flush=True)\n", "t0 = time.time()\n", @@ -2301,7 +2303,7 @@ }, { "cell_type": "markdown", - "id": "b31e619c", + "id": "ae06ad56", "metadata": {}, "source": [ "Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected as a pair of similar documents is likely to share numerous same buckets." 
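To make the `jaccard` scores above more concrete, the following is a plain-Python sketch of the same character n-gram Jaccard similarity (set intersection over union, with `n = 5` to match `ngram_size`). It only illustrates the formula; the actual computation in `jaccard_compute` runs on cuDF over the shuffled anchor/document pairs, and the two example strings here are invented.

```python
# Simplified CPU illustration of character 5-gram Jaccard similarity.
def char_ngrams(text: str, n: int = 5) -> set:
    # Shingle the string into overlapping character n-grams.
    return {text[i:i + n] for i in range(max(len(text) - n + 1, 1))}

def jaccard_similarity(a: str, b: str, n: int = 5) -> float:
    sa, sb = char_ngrams(a, n), char_ngrams(b, n)
    union = sa | sb
    return len(sa & sb) / len(union) if union else 0.0

doc_x = "Thailand national volleyball championship 1985 results"  # toy example
doc_y = "Thailand national volleyball championship 1986 results"  # near duplicate
print(round(jaccard_similarity(doc_x, doc_y), 3))  # high score, close to 1.0
```

Pairs that score low here are the LSH false positives mentioned earlier; only sufficiently similar pairs should remain linked when the connected-components step groups documents.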
@@ -2309,8 +2311,8 @@ }, { "cell_type": "code", - "execution_count": 59, - "id": "ae2efe3e", + "execution_count": 49, + "id": "686eb956", "metadata": {}, "outputs": [ { @@ -2342,33 +2344,33 @@ " \n", " \n", " 0\n", - " 1692361878-127521\n", - " 1692361878-127517\n", - " 0.755481\n", + " 1692361878-49094\n", + " 1692361878-49078\n", + " 0.784000\n", " \n", " \n", " 1\n", - " 1692361878-127521\n", - " 1692361878-127517\n", - " 0.755481\n", + " 1692361878-49094\n", + " 1692361878-49078\n", + " 0.784000\n", " \n", " \n", " 2\n", - " 1692361878-45934\n", - " 1692361878-45940\n", - " 0.922061\n", + " 1692361878-49094\n", + " 1692361878-49078\n", + " 0.784000\n", " \n", " \n", " 3\n", - " 1692361878-45934\n", - " 1692361878-45940\n", - " 0.922061\n", + " 1692361878-49094\n", + " 1692361878-49078\n", + " 0.784000\n", " \n", " \n", " 4\n", - " 1692361878-45934\n", - " 1692361878-45940\n", - " 0.922061\n", + " 1692361878-161128\n", + " 1692361878-161122\n", + " 0.890339\n", " \n", " \n", "\n", @@ -2376,14 +2378,14 @@ ], "text/plain": [ " id_x id_y jaccard\n", - "0 1692361878-127521 1692361878-127517 0.755481\n", - "1 1692361878-127521 1692361878-127517 0.755481\n", - "2 1692361878-45934 1692361878-45940 0.922061\n", - "3 1692361878-45934 1692361878-45940 0.922061\n", - "4 1692361878-45934 1692361878-45940 0.922061" + "0 1692361878-49094 1692361878-49078 0.784000\n", + "1 1692361878-49094 1692361878-49078 0.784000\n", + "2 1692361878-49094 1692361878-49078 0.784000\n", + "3 1692361878-49094 1692361878-49078 0.784000\n", + "4 1692361878-161128 1692361878-161122 0.890339" ] }, - "execution_count": 59, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -2395,7 +2397,7 @@ }, { "cell_type": "markdown", - "id": "834f1831", + "id": "63911051", "metadata": {}, "source": [ "### 5.5 Connected Components\n", @@ -2410,8 +2412,8 @@ }, { "cell_type": "code", - "execution_count": 60, - "id": "5756fde8", + "execution_count": 50, + "id": "5eae08f1", "metadata": {}, "outputs": [], "source": [ @@ -2420,16 +2422,16 @@ }, { "cell_type": "markdown", - "id": "217957d6", + "id": "ed713696", "metadata": {}, "source": [ - "Define parameter" + "Define parameters" ] }, { "cell_type": "code", - "execution_count": 61, - "id": "72a1952e", + "execution_count": 51, + "id": "a0881f12", "metadata": {}, "outputs": [], "source": [ @@ -2450,7 +2452,7 @@ }, { "cell_type": "markdown", - "id": "c53b3a8c", + "id": "4fba31d2", "metadata": {}, "source": [ "Run Connected Component" @@ -2458,62 +2460,24 @@ }, { "cell_type": "code", - "execution_count": 62, - "id": "46578e2b", + "execution_count": 52, + "id": "da4a8d4e", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n", - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. 
Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n", - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "batch_id = 0/1, time = 0.3100006580352783\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n", - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ + "batch_id = 0/1, time = 0.26957249641418457\n", "# of groups 5465\n", "# of docs removed 3079\n", "assert num_nodes:8544==labels_df:8544 passed\n", - "Time taken for Connected Component: 11.238884925842285 s\n" + "Time taken for Connected Component: 4.331223726272583 s\n" ] } ], "source": [ - "client.run(enable_spilling)\n", + "#client.run(enable_spilling)\n", "\n", "t0 = time.time()\n", " \n", @@ -2532,7 +2496,7 @@ }, { "cell_type": "markdown", - "id": "6827158e", + "id": "24b55482", "metadata": {}, "source": [ "Verify the result of `Connected Components`" @@ -2540,8 +2504,8 @@ }, { "cell_type": "code", - "execution_count": 63, - "id": "2bcfc470", + "execution_count": 53, + "id": "ecd69e7e", "metadata": {}, "outputs": [ { @@ -2574,32 +2538,32 @@ " \n", " 0\n", " 1692361878\n", - " 136999\n", - " 3837\n", + " 139585\n", + " 1936\n", " \n", " \n", " 1\n", " 1692361878\n", - " 85318\n", - " 3838\n", + " 8059\n", + " 5312\n", " \n", " \n", " 2\n", " 1692361878\n", - " 70670\n", - " 1196\n", + " 93474\n", + " 5313\n", " \n", " \n", " 3\n", " 1692361878\n", - " 134587\n", - " 138\n", + " 127790\n", + " 2774\n", " \n", " \n", " 4\n", " 1692361878\n", - " 136125\n", - " 1320\n", + " 49650\n", + " 1425\n", " \n", " \n", "\n", @@ -2607,14 +2571,14 @@ ], "text/plain": [ " dataset_id doc_id group\n", - "0 1692361878 136999 3837\n", - "1 1692361878 85318 3838\n", - "2 1692361878 70670 1196\n", - "3 1692361878 134587 138\n", - "4 1692361878 136125 1320" + "0 1692361878 139585 1936\n", + "1 1692361878 8059 5312\n", + "2 1692361878 93474 5313\n", + "3 1692361878 127790 2774\n", + "4 1692361878 49650 1425" ] }, - "execution_count": 63, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -2626,7 
+2590,7 @@ }, { "cell_type": "markdown", - "id": "aa1ee07d", + "id": "44834e54", "metadata": {}, "source": [ "Let's check if the output fuzzy duplicated documents within the same group are similar. Please note that the `group` id in your output might be different from the notebook output." @@ -2634,8 +2598,8 @@ }, { "cell_type": "code", - "execution_count": 64, - "id": "f1f10a1c", + "execution_count": 54, + "id": "6c404c89", "metadata": {}, "outputs": [ { @@ -2666,28 +2630,28 @@ " \n", " \n", " 0\n", - " 121\n", - " 134756, 134762, 134748, 134742, 134740, 134750...\n", + " 75\n", + " 160982, 161038, 161124, 161109, 161121, 160991...\n", " \n", " \n", " 1\n", - " 138\n", - " 134587, 134908, 135024, 135029, 135019, 134566...\n", + " 112\n", + " 122007, 122124, 122020, 122282, 122010, 122134...\n", " \n", " \n", " 2\n", - " 323\n", - " 134794, 134780, 134793, 134785, 134798, 134781...\n", + " 151\n", + " 134584, 135030, 134908, 134891, 135029, 135020...\n", " \n", " \n", " 3\n", - " 344\n", - " 136092, 136103, 136090, 136093, 136100, 136089...\n", + " 321\n", + " 94082, 94114, 94126, 94057, 94121, 94132, 9411...\n", " \n", " \n", " 4\n", - " 428\n", - " 94120, 94084, 94059, 94128, 94130, 94056, 9413...\n", + " 339\n", + " 116230, 116237, 116223, 116236, 116176, 116204...\n", " \n", " \n", " ...\n", @@ -2697,27 +2661,27 @@ " \n", " 5460\n", " 8539\n", - " 125651\n", + " 120646\n", " \n", " \n", " 5461\n", " 8540\n", - " 125971\n", + " 158174\n", " \n", " \n", " 5462\n", " 8541\n", - " 84926\n", + " 132405\n", " \n", " \n", " 5463\n", " 8542\n", - " 40115\n", + " 49199\n", " \n", " \n", " 5464\n", " 8543\n", - " 50282\n", + " 160924\n", " \n", " \n", "\n", @@ -2726,22 +2690,22 @@ ], "text/plain": [ " group doc_id\n", - "0 121 134756, 134762, 134748, 134742, 134740, 134750...\n", - "1 138 134587, 134908, 135024, 135029, 135019, 134566...\n", - "2 323 134794, 134780, 134793, 134785, 134798, 134781...\n", - "3 344 136092, 136103, 136090, 136093, 136100, 136089...\n", - "4 428 94120, 94084, 94059, 94128, 94130, 94056, 9413...\n", + "0 75 160982, 161038, 161124, 161109, 161121, 160991...\n", + "1 112 122007, 122124, 122020, 122282, 122010, 122134...\n", + "2 151 134584, 135030, 134908, 134891, 135029, 135020...\n", + "3 321 94082, 94114, 94126, 94057, 94121, 94132, 9411...\n", + "4 339 116230, 116237, 116223, 116236, 116176, 116204...\n", "... ... ...\n", - "5460 8539 125651\n", - "5461 8540 125971\n", - "5462 8541 84926\n", - "5463 8542 40115\n", - "5464 8543 50282\n", + "5460 8539 120646\n", + "5461 8540 158174\n", + "5462 8541 132405\n", + "5463 8542 49199\n", + "5464 8543 160924\n", "\n", "[5465 rows x 2 columns]" ] }, - "execution_count": 64, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -2753,7 +2717,7 @@ }, { "cell_type": "markdown", - "id": "f621c2cb", + "id": "b4cd941d", "metadata": {}, "source": [ "Change the `group` number if necessary. By running the code below, we can obtain a list of near duplicated documents." 
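If you need to turn these groups into an actual removal list, one possible convention is sketched below: keep a single document per `group` (here, the smallest `doc_id`) and flag the rest as fuzzy duplicates. This is an illustrative policy built on the `cc_compute_res` dataframe loaded above, not a step mandated by NeMo Curator.

```python
# Keep one representative document per connected component and collect the rest.
keep_one_per_group = (
    cc_compute_res.sort_values("doc_id")
                  .drop_duplicates(subset=["group"], keep="first")
)
fuzzy_duplicates_to_remove = cc_compute_res[
    ~cc_compute_res["doc_id"].isin(keep_one_per_group["doc_id"])
]
print(f"Documents flagged for removal: {len(fuzzy_duplicates_to_remove)}")
```

With the counts printed earlier (8544 labeled documents in 5465 groups), this policy flags 8544 - 5465 = 3079 documents, which matches the `# of docs removed` line in the connected-components output.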
@@ -2761,8 +2725,8 @@ }, { "cell_type": "code", - "execution_count": 72, - "id": "bd79a7f7", + "execution_count": 55, + "id": "09b3fd0b", "metadata": {}, "outputs": [ { @@ -2793,34 +2757,34 @@ " \n", " \n", " \n", - " 14\n", + " 420\n", " 1692361878\n", - " 121545\n", - " 735\n", + " 122007\n", + " 112\n", " \n", " \n", - " 66\n", + " 425\n", " 1692361878\n", - " 121487\n", - " 735\n", + " 122124\n", + " 112\n", " \n", " \n", - " 213\n", + " 689\n", " 1692361878\n", - " 121541\n", - " 735\n", + " 122020\n", + " 112\n", " \n", " \n", - " 291\n", + " 764\n", " 1692361878\n", - " 121539\n", - " 735\n", + " 122282\n", + " 112\n", " \n", " \n", - " 422\n", + " 952\n", " 1692361878\n", - " 121524\n", - " 735\n", + " 122010\n", + " 112\n", " \n", " \n", "\n", @@ -2828,25 +2792,25 @@ ], "text/plain": [ " dataset_id doc_id group\n", - "14 1692361878 121545 735\n", - "66 1692361878 121487 735\n", - "213 1692361878 121541 735\n", - "291 1692361878 121539 735\n", - "422 1692361878 121524 735" + "420 1692361878 122007 112\n", + "425 1692361878 122124 112\n", + "689 1692361878 122020 112\n", + "764 1692361878 122282 112\n", + "952 1692361878 122010 112" ] }, - "execution_count": 72, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "cc_compute_res[cc_compute_res['group']==735].head()" + "cc_compute_res[cc_compute_res['group']==112].head()" ] }, { "cell_type": "markdown", - "id": "e7c02f4b", + "id": "8b0de04f", "metadata": {}, "source": [ "Print the text of near duplicated document. Please replace the `id` if necessary, `id` should be in the format of `_`" @@ -2855,7 +2819,7 @@ { "cell_type": "code", "execution_count": 73, - "id": "dd0b2e33", + "id": "fbf88107", "metadata": {}, "outputs": [ { @@ -2877,7 +2841,7 @@ }, { "cell_type": "markdown", - "id": "c3f8d12f", + "id": "fd33ac1d", "metadata": {}, "source": [ "Below is the English translation of the output above. 
We can see that the two documents are indeed very similar to each other.\n", @@ -2938,7 +2902,256 @@ }, { "cell_type": "markdown", - "id": "70ca66df", + "id": "68cfec8a", + "metadata": {}, + "source": [ + "### 5.6 Fuzzy deduplication wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "fe7de030", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "fe8794b8", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "fuzzy_dedup_data_path = added_id_output_path\n", + "#Output\n", + "fuzzy_dedup_base_output_path = os.path.join(data_dir,\"fuzzy_wrapper\")\n", + "fuzzy_dedup_log_dir = os.path.join(fuzzy_dedup_base_output_path,'log')\n", + "fuzzy_dedup_cache_dir = os.path.join(fuzzy_dedup_base_output_path,'cache')\n", + "fuzzy_dedup_output_dir = os.path.join(fuzzy_dedup_base_output_path,'data')\n", + "#Specify dataset name\n", + "dataset_name = 'TH_wikipedia'\n", + "\n", + "#Relevant parameters\n", + "id_field = 'id'\n", + "text_field = 'text'\n", + "filetype = \"parquet\"\n", + "\n", + "!mkdir -p {fuzzy_dedup_base_output_path}\n", + "!mkdir -p {fuzzy_dedup_log_dir}\n", + "!mkdir -p {fuzzy_dedup_cache_dir}\n", + "!mkdir -p {fuzzy_dedup_output_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "0aa0b60c", + "metadata": {}, + "source": [ + "**[Optional]** If the cache folder is not empty, please CLEAR the folder before proceeding" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "83705eaa", + "metadata": {}, + "outputs": [], + "source": [ + "#!rm -r {fuzzy_dedup_cache_dir}" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "72494e54", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Stage1: Starting Minhash + LSH computation\n", + "Stage1: Minhash + LSH complete!\n", + "Stage2 (False Postive Check): Starting Map_Buckets\n", + "Stage2 (False Postive Check): Map_Buckets Complete!\n", + "Stage3 (False Postive Check): Shuffle docs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgroup
0TH_wiki-0000134798736
1TH_wiki-00001162261526
2TH_wiki-00001267962934
3TH_wiki-0000138218156
4TH_wiki-00000854372722
\n", + "" + ], + "text/plain": [ + " id group\n", + "0 TH_wiki-0000134798 736\n", + "1 TH_wiki-0000116226 1526\n", + "2 TH_wiki-0000126796 2934\n", + "3 TH_wiki-0000138218 156\n", + "4 TH_wiki-0000085437 2722" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fuzzy_dedup_res = pd.read_parquet(fuzzy_dedup_output_dir)\n", + "fuzzy_dedup_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "b9dfbdde", "metadata": {}, "source": [ "## 6. Remove duplicates\n", @@ -2948,7 +3161,7 @@ }, { "cell_type": "markdown", - "id": "93d031ec", + "id": "bb722fd2", "metadata": {}, "source": [ "Define parameters" @@ -2956,8 +3169,8 @@ }, { "cell_type": "code", - "execution_count": 80, - "id": "911be9d9", + "execution_count": 81, + "id": "5a4b97b7", "metadata": {}, "outputs": [], "source": [ @@ -2976,7 +3189,7 @@ }, { "cell_type": "markdown", - "id": "969f6543", + "id": "d3962deb", "metadata": {}, "source": [ "We will first process the result of exact deduplication. Since result of exact deduplication contains original ID used in input dataset, it is more straightforward to deal with." @@ -2984,8 +3197,8 @@ }, { "cell_type": "code", - "execution_count": 81, - "id": "bbbfdbb3", + "execution_count": 82, + "id": "a29d720d", "metadata": {}, "outputs": [ { @@ -3015,65 +3228,80 @@ }, { "cell_type": "markdown", - "id": "8b97567d", + "id": "b4c1c057", "metadata": {}, "source": [ "For result of fuzzy deduplication, we need to first reconstructed document ID by combining `dataset_id` and `doc_id`, then use the reconstructed `ID` for removal" ] }, + { + "cell_type": "markdown", + "id": "1c0f1ee6", + "metadata": {}, + "source": [ + "**[Optional]** Uncomment the cell to use result from step by step fuzzy deduplication" + ] + }, { "cell_type": "code", - "execution_count": 82, - "id": "513cf7a0", + "execution_count": 83, + "id": "1ff911ad", "metadata": {}, "outputs": [], "source": [ - "#List of id_prefix used in Add ID\n", - "base_ids = [id_prefix]\n", - "\n", - "#Obtain a mapping between `dataset_id` and `id_prefix`\n", - "df = cudf.DataFrame()\n", - "df['base_id'] = [base_id for base_id in base_ids]\n", - "df['dataset_id'] = df['base_id'].hash_values()\n", - "df_pd = df.to_pandas()\n", - "mapping = {\n", - " hashed_id: base_id\n", - " for base_id, hashed_id in zip(df_pd['base_id'], df_pd['dataset_id'])\n", - "}\n", - "\n", - "#Load result of fuzzy deduplication\n", - "fuzzy_duplicates = pd.read_parquet(connected_component_output_path)\n", - "#Reconstruct the original document ID\n", - "fuzzy_duplicates['id']=fuzzy_duplicates.apply(lambda x: f\"{mapping[x['dataset_id']]}-{x['doc_id']:010d}\", axis=1)\n", + "# #List of id_prefix used in Add ID\n", + "# base_ids = [id_prefix]\n", + "\n", + "# #Obtain a mapping between `dataset_id` and `id_prefix`\n", + "# df = cudf.DataFrame()\n", + "# df['base_id'] = [base_id for base_id in base_ids]\n", + "# df['dataset_id'] = df['base_id'].hash_values()\n", + "# df_pd = df.to_pandas()\n", + "# mapping = {\n", + "# hashed_id: base_id\n", + "# for base_id, hashed_id in zip(df_pd['base_id'], df_pd['dataset_id'])\n", + "# }\n", + "\n", + "# #Load result of fuzzy deduplication \n", + "# fuzzy_duplicates = pd.read_parquet(connected_component_output_path)\n", + "# #Reconstruct the original document ID\n", + "# fuzzy_duplicates['id']=fuzzy_duplicates.apply(lambda x: f\"{mapping[x['dataset_id']]}-{x['doc_id']:010d}\", axis=1)\n", + "\n", + "# #Generate list of near duplicate document ID\n", + "# fuzzy_docs_to_remove = 
fuzzy_duplicates.drop_duplicates(subset=['group'], keep='first')" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "2220d8fc", + "metadata": {}, + "outputs": [], + "source": [ + "#Loads result from fuzzy dedup wrapper\n", + "fuzzy_duplicates = pd.read_parquet(fuzzy_dedup_output_dir)\n", + "\n", "#Generate list of near duplicate document ID\n", "fuzzy_docs_to_remove = fuzzy_duplicates.drop_duplicates(subset=['group'], keep='first')" ] }, { "cell_type": "code", - "execution_count": 83, - "id": "dc7d647c", + "execution_count": 85, + "id": "08143e1e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Writing to disk complete for 1 partitions\n" - ] - } - ], + "outputs": [], "source": [ "#Remove near duplicates\n", "result = result[~result[input_id_field].isin(fuzzy_docs_to_remove[input_id_field])]\n", "\n", "#Save final result to local\n", - "write_to_disk(result, dudped_output_dir, output_type=\"parquet\")" + "result.to_parquet(dudped_output_dir, write_to_filename=True)" ] }, { "cell_type": "markdown", - "id": "b47a967f", + "id": "a5008578", "metadata": {}, "source": [ "Verify the result of duplicate removal. We can see that the number of document in resultant document is less than the original dataset (length = 161748)" @@ -3081,15 +3309,15 @@ }, { "cell_type": "code", - "execution_count": 84, - "id": "5e8097b1", + "execution_count": 86, + "id": "a692c916", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Length of duplicate removed dataset:156257\n" + "Length of duplicate removed dataset:156265\n" ] } ], @@ -3100,16 +3328,16 @@ }, { "cell_type": "markdown", - "id": "85caf66f", + "id": "24440f5f", "metadata": {}, "source": [ - "Close the GPU Dask Cluster" + "Close the GPU Dask Cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again." ] }, { "cell_type": "code", - "execution_count": 85, - "id": "cd91f5fe", + "execution_count": 88, + "id": "656a24f2", "metadata": {}, "outputs": [], "source": [ @@ -3119,7 +3347,7 @@ }, { "cell_type": "markdown", - "id": "1c6cee97", + "id": "3a00f6ea", "metadata": {}, "source": [ "## 7. Heuristic Fitlering\n", @@ -3141,8 +3369,8 @@ }, { "cell_type": "code", - "execution_count": 86, - "id": "1ddff58c", + "execution_count": 89, + "id": "41f7cdf4", "metadata": {}, "outputs": [], "source": [ @@ -3153,16 +3381,16 @@ }, { "cell_type": "markdown", - "id": "a728a161", + "id": "f5ed694b", "metadata": {}, "source": [ - "**[Optional]**The following cell is to remove warning from dask." + "**[Optional]** The following cell is to remove warning from dask." ] }, { "cell_type": "code", - "execution_count": 87, - "id": "e5114945", + "execution_count": 90, + "id": "39aab4d9", "metadata": {}, "outputs": [], "source": [ @@ -3174,7 +3402,7 @@ }, { "cell_type": "markdown", - "id": "6243a7cb", + "id": "3c196329", "metadata": {}, "source": [ "Create a CPU Dask Cluster." 
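Before running the full pipeline below, it may help to see what the filter cascade defined in `config/heuristic_filter_non-en.yaml` amounts to conceptually. The sketch below is illustrative only, not the NeMo Curator implementation: each filter scores every document, documents that fail the threshold are set aside per filter for later analysis, and the survivors flow into the next filter. The two toy scoring functions and their thresholds are simplified stand-ins for the library's filters.

```python
import pandas as pd

def word_count(text: str) -> int:
    return len(text.split())

def symbol_to_word_ratio(text: str, symbols=("#", "...", "…")) -> float:
    words = text.split() or [""]
    return sum(word in symbols for word in words) / len(words)

# (score column, scoring function, keep-document predicate) -- order matters, like the YAML.
cascade = [
    ("word_count", word_count, lambda s: 50 <= s <= 100_000),
    ("symbol_to_word_ratio", symbol_to_word_ratio, lambda s: s <= 0.1),
]

df = pd.DataFrame({"id": ["doc_a", "doc_b"], "text": ["too short", "คำ " * 60]})
removed = {}
for name, score_fn, keep in cascade:
    df[name] = df["text"].apply(score_fn)
    mask = df[name].apply(keep)
    removed[name] = df[~mask]      # kept on the side for analysis, as the notebook does
    df = df[mask]                  # survivors continue down the cascade

print(f"kept {len(df)} document(s);",
      "removed per filter:", {name: len(dropped) for name, dropped in removed.items()})
```

In the actual run below, the pipeline is loaded from the YAML config and executed on the CPU Dask cluster, with per-filter scores optionally logged alongside the documents.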
@@ -3182,8 +3410,8 @@ }, { "cell_type": "code", - "execution_count": 88, - "id": "fa752ded", + "execution_count": 91, + "id": "1ffd3928", "metadata": {}, "outputs": [], "source": [ @@ -3193,7 +3421,7 @@ }, { "cell_type": "markdown", - "id": "c3dda877", + "id": "4a514d3c", "metadata": {}, "source": [ "Define some helper functions" @@ -3201,8 +3429,8 @@ }, { "cell_type": "code", - "execution_count": 89, - "id": "a8abf841", + "execution_count": 92, + "id": "d4aaccc4", "metadata": {}, "outputs": [], "source": [ @@ -3233,7 +3461,7 @@ }, { "cell_type": "markdown", - "id": "04e6b0f8", + "id": "5d43a755", "metadata": {}, "source": [ "Define parameters" @@ -3241,8 +3469,8 @@ }, { "cell_type": "code", - "execution_count": 90, - "id": "55e43a6c", + "execution_count": 93, + "id": "4f8b0336", "metadata": {}, "outputs": [], "source": [ @@ -3272,7 +3500,7 @@ }, { "cell_type": "markdown", - "id": "4c5f6c8e", + "id": "bddd9dd9", "metadata": {}, "source": [ "Run heuristic filtering" @@ -3280,8 +3508,8 @@ }, { "cell_type": "code", - "execution_count": 91, - "id": "f6f50332", + "execution_count": 94, + "id": "1df83255", "metadata": {}, "outputs": [ { @@ -3295,6 +3523,34 @@ "Writing to disk complete for 1 partitions\n", "Saving data for urls_ratio\n", "Writing to disk complete for 1 partitions\n", + "Saving data for white_space\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for parentheses_ratio\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for boilerplate_string_ratio\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeated_lines\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeated_paragraphs\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeated_lines_char\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeated_paragraphs_char\n", + "Writing to disk complete for 1 partitions\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/nemo_curator/utils/distributed_utils.py:379: UserWarning: Empty partition found\n", + " warnings.warn(f\"Empty partition found\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Saving data for word_count\n", "Writing to disk complete for 1 partitions\n", "Saving data for repeating_top_2grams\n", @@ -3304,7 +3560,7 @@ "Saving data for repeating_top_4grams\n", "Writing to disk complete for 1 partitions\n", "Writing to disk complete for 1 partitions\n", - "Time taken for Heuristic filtering: 729.7436628341675 s\n" + "Time taken for Heuristic filtering: 1120.5212895870209 s\n" ] } ], @@ -3316,7 +3572,7 @@ "score_fields = get_score_fields(filter_pipeline)\n", "\n", "# Load dataset\n", - "dataset = load_dataset(HF_input_data_dir,file_type='parquet')\n", + "dataset = DocumentDataset.read_parquet(HF_input_data_dir, backend='pandas', add_filename=True)\n", "\n", "\n", "# Iterate through filters. 
For each filter, the low quality document will be removed from the dataset and output to corresponding folder for analysis\n", @@ -3346,14 +3602,14 @@ "filtered_dataset = DocumentDataset(filtered_dataset.df.drop(columns=score_fields))\n", "\n", "# Output filtered dataset\n", - "write_to_disk(filtered_dataset.df, kept_document_dir, write_to_filename=True, output_type=output_file_type)\n", + "filtered_dataset.to_parquet(kept_document_dir, write_to_filename=True)\n", "\n", "print(f\"Time taken for Heuristic filtering: {time.time()-t0} s\")" ] }, { "cell_type": "markdown", - "id": "b19731f5", + "id": "0fab7ee5", "metadata": {}, "source": [ "Verify the result." @@ -3361,10 +3617,137 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "8f945362", + "execution_count": 95, + "id": "65160254", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset size after heuristic filtering:192786\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenameidlanguagesource_idtexttitleurl
1part.0.parquetTH_wiki-0000000001THthwiki-20240201-thwiki-20240201-pages-articles...ดาราศาสตร์ คือวิชาวิทยาศาสตร์ที่ศึกษาวัตถุในท้...ดาราศาสตร์https://th.wikipedia.org/wiki/%E0%B8%94%E0%B8%...
2part.0.parquetTH_wiki-0000000002THthwiki-20240201-thwiki-20240201-pages-articles...ภูมิศาสตร์ (, แปลว่า \"การพรรณนาเกี่ยวกับโลก\")...ภูมิศาสตร์https://th.wikipedia.org/wiki/%E0%B8%A0%E0%B8%...
3part.0.parquetTH_wiki-0000000003THthwiki-20240201-thwiki-20240201-pages-articles...พันทิป.คอม หรือพันทิป ก่อตั้งขึ้นเมื่อวันที่ 7...พันทิป.คอมhttps://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%...
4part.0.parquetTH_wiki-0000000004THthwiki-20240201-thwiki-20240201-pages-articles...พันธุ์ทิพย์พลาซ่า () เป็นศูนย์การค้าเกี่ยวกับเ...พันธุ์ทิพย์พลาซ่าhttps://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%...
5part.0.parquetTH_wiki-0000000005THthwiki-20240201-thwiki-20240201-pages-articles...วิทยาการคอมพิวเตอร์ศึกษาเกี่ยวกับโครงสร้างพื้น...วิทยาการคอมพิวเตอร์https://th.wikipedia.org/wiki/%E0%B8%A7%E0%B8%...
\n", + "
" + ], + "text/plain": [ + " filename id language \\\n", + "1 part.0.parquet TH_wiki-0000000001 TH \n", + "2 part.0.parquet TH_wiki-0000000002 TH \n", + "3 part.0.parquet TH_wiki-0000000003 TH \n", + "4 part.0.parquet TH_wiki-0000000004 TH \n", + "5 part.0.parquet TH_wiki-0000000005 TH \n", + "\n", + " source_id \\\n", + "1 thwiki-20240201-thwiki-20240201-pages-articles... \n", + "2 thwiki-20240201-thwiki-20240201-pages-articles... \n", + "3 thwiki-20240201-thwiki-20240201-pages-articles... \n", + "4 thwiki-20240201-thwiki-20240201-pages-articles... \n", + "5 thwiki-20240201-thwiki-20240201-pages-articles... \n", + "\n", + " text title \\\n", + "1 ดาราศาสตร์ คือวิชาวิทยาศาสตร์ที่ศึกษาวัตถุในท้... ดาราศาสตร์ \n", + "2 ภูมิศาสตร์ (, แปลว่า \"การพรรณนาเกี่ยวกับโลก\")... ภูมิศาสตร์ \n", + "3 พันทิป.คอม หรือพันทิป ก่อตั้งขึ้นเมื่อวันที่ 7... พันทิป.คอม \n", + "4 พันธุ์ทิพย์พลาซ่า () เป็นศูนย์การค้าเกี่ยวกับเ... พันธุ์ทิพย์พลาซ่า \n", + "5 วิทยาการคอมพิวเตอร์ศึกษาเกี่ยวกับโครงสร้างพื้น... วิทยาการคอมพิวเตอร์ \n", + "\n", + " url \n", + "1 https://th.wikipedia.org/wiki/%E0%B8%94%E0%B8%... \n", + "2 https://th.wikipedia.org/wiki/%E0%B8%A0%E0%B8%... \n", + "3 https://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%... \n", + "4 https://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%... \n", + "5 https://th.wikipedia.org/wiki/%E0%B8%A7%E0%B8%... " + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "res = pd.read_parquet(kept_document_dir)\n", "print(f\"Dataset size after heuristic filtering:{len(res)}\")\n", @@ -3373,7 +3756,7 @@ }, { "cell_type": "markdown", - "id": "cb52fe04", + "id": "412bd6d2", "metadata": {}, "source": [ "Close the CPU Dask Cluster" @@ -3381,8 +3764,8 @@ }, { "cell_type": "code", - "execution_count": 94, - "id": "aaa9823a", + "execution_count": 96, + "id": "e6129857", "metadata": {}, "outputs": [], "source": [ @@ -3393,7 +3776,7 @@ { "cell_type": "code", "execution_count": null, - "id": "94f6e74e", + "id": "4679d955", "metadata": {}, "outputs": [], "source": [] From 4b024cb7a3bef5f45da5b463d14bcf952fceecb0 Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Mon, 20 May 2024 07:00:32 +0000 Subject: [PATCH 31/34] Fixing Style Signed-off-by: Nicole Luo --- .pre-commit-config.yaml | 0 .../config/heuristic_filter_non-en.yaml | 17 ++++++++--------- 2 files changed, 8 insertions(+), 9 deletions(-) mode change 100644 => 100755 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 diff --git a/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml b/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml index 50d435e2e..4c1b80905 100755 --- a/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml +++ b/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml @@ -1,33 +1,32 @@ input_field: text filters: # The filters below define a chain of heuristic filters to be applied to each document in a corpus. - # This particular cascade of filters is intended to filter generic non-English data that use spaces for separating words. + # This particular cascade of filters is intended to filter generic non-English data that use spaces for separating words. # The filter listed at the top will be applied first, and the following filters will be applied in # the order they appear in this file. Each filter can be removed and re-ordered as desired. 
- name: nemo_curator.filters.heuristic_filter.SymbolsToWordsFilter log_score: True params: max_symbol_to_word_ratio: 0.1 - - name: nemo_curator.filters.heuristic_filter.NumbersFilter log_score: True params: max_number_to_text_ratio: 0.15 - name: nemo_curator.filters.heuristic_filter.UrlsFilter log_score: True - params: + params: max_url_to_text_ratio: 0.2 - name: nemo_curator.filters.heuristic_filter.WhiteSpaceFilter log_score: True - params: + params: max_white_space_ratio: 0.25 - name: nemo_curator.filters.heuristic_filter.ParenthesesFilter log_score: True - params: + params: max_parentheses_ratio: 0.1 - name: nemo_curator.filters.heuristic_filter.BoilerPlateStringFilter log_score: True - params: + params: remove_if_at_top_or_bottom: True max_boilerplate_string_ratio: 0.4 - name: nemo_curator.filters.heuristic_filter.RepeatedLinesFilter @@ -50,17 +49,17 @@ filters: params: min_words: 50 max_words: 100000 - # NOTE: This filter tends to remove many documents and will need to + # NOTE: This filter tends to remove many documents and will need to # be tuned per language # - name: nemo_curator.filters.heuristic_filter.PunctuationFilter # params: # max_num_sentences_without_endmark_ratio: 0.85 # - name: nemo_curator.filters.heuristic_filter.MeanWordLengthFilter # params: -# max_mean_word_length: 10 +# max_mean_word_length: 10 # min_mean_word_length: 3 # - name: nemo_curator.filters.heuristic_filter.LongWordFilter -# params: +# params: # max_word_length: 1000 # - name: nemo_curator.filters.heuristic_filter.EllipsisFilter # params: From 0a50fd433c3c020508cf08670986b7ca1daf583c Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Mon, 20 May 2024 07:01:10 +0000 Subject: [PATCH 32/34] Updating container version Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 006098375..3ceecad2c 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -78,7 +78,7 @@ " Password: \n", "- Get NeMo NeMo Framework Training Container\n", " ```bash\n", - " docker pull nvcr.io/ea-bignlp/ga-participants/nemofw-training:24.01\n" + " docker pull docker pull nvcr.io/nvidia/nemo:dev.framework\n" ] }, { From 2a9052c7a7c3684d6b1f7b4f5770de6e5f09bd75 Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Mon, 20 May 2024 08:31:02 +0000 Subject: [PATCH 33/34] Fixing style Signed-off-by: Nicole Luo --- tests/test_fuzzy_dedup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py index 022940b2d..1c952d27d 100644 --- a/tests/test_fuzzy_dedup.py +++ b/tests/test_fuzzy_dedup.py @@ -440,4 +440,4 @@ def add_partition_info(df, partition_info=None): ) # Check that the real and expected partitions match - assert (check["file_id"] == check["expected_file_id"]).all() \ No newline at end of file + assert (check["file_id"] == check["expected_file_id"]).all() From 11e4eba48c4a8ce83f9ac9ff453f92b8aae4fca3 Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Fri, 24 May 2024 03:45:27 +0000 Subject: [PATCH 34/34] Update get_client() according to latest version; Update log path for map_bucket section Signed-off-by: Nicole Luo --- .../single_gpu_tutorial.ipynb | 798 +++++++++--------- 1 file changed, 389 insertions(+), 409 deletions(-) diff --git 
a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 3ceecad2c..0653279b8 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "e12a5ec6", + "id": "9bd01afc", "metadata": {}, "source": [ "# Nemo Curator Pipeline Example\n", @@ -27,7 +27,7 @@ }, { "cell_type": "markdown", - "id": "58d062aa", + "id": "7b1808ea", "metadata": {}, "source": [ "## About this notebook\n", @@ -48,13 +48,13 @@ "2. Classifier filtering\n", "3. Downstream-task decontamination\n", "4. Distributed data classification with PyTorch models\n", - "5. Personal identifiable information (PII) redaction\n", + "5. Personal identifiable information (PII) redaction \n", "\n" ] }, { "cell_type": "markdown", - "id": "a6e3492e", + "id": "78537bd7", "metadata": {}, "source": [ "## Prerequisites\n", @@ -83,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "01d4c35a", + "id": "062b5423", "metadata": {}, "source": [ "## 0. Env Setup" @@ -92,7 +92,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "8778a517", + "id": "8add9bbd", "metadata": {}, "outputs": [ { @@ -100,11 +100,10 @@ "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", - "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (2.0.0)\n", + "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (4.0.0)\n", + "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines) (23.2.0)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n" + "\u001b[0m" ] } ], @@ -114,8 +113,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "41d75988", + "execution_count": 2, + "id": "9940c70d", "metadata": {}, "outputs": [], "source": [ @@ -143,8 +142,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "0150b7e7", + "execution_count": 3, + "id": "fd8a381d", "metadata": {}, "outputs": [], "source": [ @@ -172,15 +171,15 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "3d7e6547", + "execution_count": 4, + "id": "589ff257", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/nluo_data/NeMo-Curator/tutorials/single_node_tutorial\n" + "/work_dir/tutorials/single_node_tutorial\n" ] } ], @@ -192,7 +191,7 @@ }, { "cell_type": "markdown", - "id": "cf0aea31", + "id": "662d505f", "metadata": {}, "source": [ "## 1. Download\n", @@ -233,8 +232,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "f41df88e", + "execution_count": 5, + "id": "adb59379", "metadata": {}, "outputs": [], "source": [ @@ -243,7 +242,7 @@ }, { "cell_type": "markdown", - "id": "b0f2d6d9", + "id": "9b56f12a", "metadata": {}, "source": [ " Start a CPU based Dask cluster. 
Please modify `n_workers` and `memory_limit` according to your hardware specification. To process TH wikipedia data, it's advised to have `memory_limit` greater than 12GB" @@ -252,7 +251,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "8742c111", + "id": "e822b5ac", "metadata": {}, "outputs": [], "source": [ @@ -262,7 +261,7 @@ }, { "cell_type": "markdown", - "id": "f910ae71", + "id": "e90cc8b1", "metadata": {}, "source": [ "Define parameters" @@ -271,7 +270,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "c55bcfa8", + "id": "9a03b463", "metadata": {}, "outputs": [], "source": [ @@ -287,7 +286,7 @@ }, { "cell_type": "markdown", - "id": "b11fdf43", + "id": "f41734a1", "metadata": {}, "source": [ "Download TH wikipedia data" @@ -296,7 +295,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ff615514", + "id": "a45965a7", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +307,7 @@ }, { "cell_type": "markdown", - "id": "ff7ae4c0", + "id": "22b7d5b3", "metadata": {}, "source": [ "Verify result" @@ -317,7 +316,7 @@ { "cell_type": "code", "execution_count": 26, - "id": "98564093", + "id": "45a69041", "metadata": {}, "outputs": [ { @@ -337,7 +336,7 @@ { "cell_type": "code", "execution_count": 27, - "id": "ded3510b", + "id": "53bdccfd", "metadata": {}, "outputs": [ { @@ -355,7 +354,7 @@ }, { "cell_type": "markdown", - "id": "79b4a804", + "id": "c5f58643", "metadata": {}, "source": [ "**[Optional]**Close the Dask cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again." @@ -364,7 +363,7 @@ { "cell_type": "code", "execution_count": 28, - "id": "f1e8f645", + "id": "0669a830", "metadata": {}, "outputs": [], "source": [ @@ -374,7 +373,7 @@ }, { "cell_type": "markdown", - "id": "4db3267a", + "id": "43334988", "metadata": {}, "source": [ "## 2.Language separation and unicode fixing" @@ -382,7 +381,7 @@ }, { "cell_type": "markdown", - "id": "228e3978", + "id": "86ccdc1f", "metadata": {}, "source": [ "In this section, we will be using a language classification model by fasttext to separate the TH wikipedia dataset based on the document major languages, and we will also fix the unicode in the documents. Detailed steps are:\n", @@ -398,7 +397,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "bd5d6920", + "id": "1e9198e8", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +408,7 @@ }, { "cell_type": "markdown", - "id": "bd2923bb", + "id": "76e46d2a", "metadata": {}, "source": [ "**[Optional]** Start a cpu based Dask cluster." 
@@ -418,7 +417,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "4375c02b", + "id": "da3aed8a", "metadata": {}, "outputs": [], "source": [ @@ -428,7 +427,7 @@ }, { "cell_type": "markdown", - "id": "2f834de0", + "id": "4a72479c", "metadata": {}, "source": [ "Define parameters" @@ -436,8 +435,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "3b3856c6", + "execution_count": 7, + "id": "13b9d2b1", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +461,7 @@ }, { "cell_type": "markdown", - "id": "3b6f887f", + "id": "8df0322a", "metadata": {}, "source": [ "Download fasttext model" @@ -471,7 +470,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "218c955e", + "id": "2666727d", "metadata": {}, "outputs": [ { @@ -498,7 +497,7 @@ }, { "cell_type": "markdown", - "id": "c410253e", + "id": "58452516", "metadata": {}, "source": [ "Apply fasttext model to separate documents by their languages" @@ -507,7 +506,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "c9afe965", + "id": "d8b8c491", "metadata": {}, "outputs": [ { @@ -554,7 +553,7 @@ }, { "cell_type": "markdown", - "id": "31917e7b", + "id": "d443a5d1", "metadata": {}, "source": [ "Load `UnicodeReformatter` to reformat any unicode appeared in the desired language dataset" @@ -563,7 +562,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "55da5f12", + "id": "272a5f67", "metadata": {}, "outputs": [ { @@ -594,7 +593,7 @@ }, { "cell_type": "markdown", - "id": "bc214e82", + "id": "9bd57a53", "metadata": {}, "source": [ "Verify the result. We can see that some documents has been removed from TH wikipedia dataset since the number of lines in this output file is less than the original file (no. of lines = 162164)" @@ -603,7 +602,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "6b6eb634", + "id": "e3329c83", "metadata": {}, "outputs": [ { @@ -622,7 +621,7 @@ }, { "cell_type": "markdown", - "id": "57e22770", + "id": "0b6cbc26", "metadata": {}, "source": [ "Furthur verify by loading documents that has been identified as other language, such as 'EN'. We can see from output that the removed document is indeed in English and contains very little or even no Thai." @@ -631,7 +630,7 @@ { "cell_type": "code", "execution_count": 38, - "id": "79e32205", + "id": "050d944c", "metadata": {}, "outputs": [ { @@ -649,7 +648,7 @@ }, { "cell_type": "markdown", - "id": "39020971", + "id": "7d17f010", "metadata": {}, "source": [ "**[Optional]** Close the Dask cluster." @@ -658,7 +657,7 @@ { "cell_type": "code", "execution_count": 37, - "id": "64da23ec", + "id": "7e64cc35", "metadata": {}, "outputs": [], "source": [ @@ -668,7 +667,7 @@ }, { "cell_type": "markdown", - "id": "6134eaf3", + "id": "1d46cece", "metadata": {}, "source": [ "## 3.Add ID\n", @@ -681,7 +680,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "5bed2e25", + "id": "5f788b91", "metadata": {}, "outputs": [], "source": [ @@ -690,7 +689,7 @@ }, { "cell_type": "markdown", - "id": "be1c546b", + "id": "cd17be33", "metadata": {}, "source": [ "**[Optional]** If there is no running Dask cluster, start CPU based Dask cluster." 
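As a mental model for what this step produces, the sketch below shows the id scheme on a toy DataFrame. It is illustrative only (the actual `AddId` module runs on Dask and handles partitioning); the `TH_wiki` prefix and the 10-digit zero-padded counter match the ids seen later in this notebook, e.g. `TH_wiki-0000000001`.

```python
import pandas as pd

def add_sequential_id(df: pd.DataFrame, prefix: str = "TH_wiki", start: int = 0) -> pd.DataFrame:
    # Stable, human-readable ids let the dedup stages refer to documents unambiguously.
    out = df.reset_index(drop=True).copy()
    out["id"] = [f"{prefix}-{start + i:010d}" for i in range(len(out))]
    return out

corpus = pd.DataFrame({"text": ["เอกสารแรก", "เอกสารที่สอง"]})
print(add_sequential_id(corpus)["id"].tolist())
# ['TH_wiki-0000000000', 'TH_wiki-0000000001']
```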
@@ -699,7 +698,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "3a6349d9", + "id": "5ba1d54a", "metadata": {}, "outputs": [], "source": [ @@ -709,7 +708,7 @@ }, { "cell_type": "markdown", - "id": "503bfa4c", + "id": "12f59d5e", "metadata": {}, "source": [ "Define relevant parameters" @@ -717,8 +716,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "a14c6ba3", + "execution_count": 8, + "id": "843eba7f", "metadata": {}, "outputs": [], "source": [ @@ -734,7 +733,7 @@ }, { "cell_type": "markdown", - "id": "b249dcf9", + "id": "e7a8307c", "metadata": {}, "source": [ "Adding ID to dataset" @@ -743,7 +742,7 @@ { "cell_type": "code", "execution_count": 17, - "id": "d12bb962", + "id": "b7a91bf1", "metadata": {}, "outputs": [ { @@ -773,7 +772,7 @@ }, { "cell_type": "markdown", - "id": "ce2934df", + "id": "e92b5dab", "metadata": {}, "source": [ "Verify the result. From the output, we can see that the `id` value has been changed to `TH_wiki-0000000000` " @@ -782,7 +781,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "cd51cd14", + "id": "e585cedd", "metadata": {}, "outputs": [ { @@ -800,7 +799,7 @@ }, { "cell_type": "markdown", - "id": "f249ab8b", + "id": "0cbddf6e", "metadata": {}, "source": [ "Close Dask cluster. This cell needs to be run as we are starting a new GPU Dask cluster in the following task" @@ -809,7 +808,7 @@ { "cell_type": "code", "execution_count": 20, - "id": "62336143", + "id": "4daa1f2a", "metadata": {}, "outputs": [], "source": [ @@ -819,7 +818,7 @@ }, { "cell_type": "markdown", - "id": "d6fb16b1", + "id": "1baf027e", "metadata": {}, "source": [ "## 4.Exact Dedplication\n", @@ -835,8 +834,8 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "044f7eee", + "execution_count": 7, + "id": "3f7ba34c", "metadata": {}, "outputs": [], "source": [ @@ -845,41 +844,16 @@ }, { "cell_type": "markdown", - "id": "6e5da88e", + "id": "e268cfca", "metadata": {}, "source": [ - "Start a GPU based Dask cluster. Since GPU based Dask cluster involves setting several arguments, we will use the `get_client()` wrapper function to quickly set up. Please make sure the `device` in `args` is `gpu`" + "Start a GPU based Dask cluster. Since GPU based Dask cluster involves setting several arguments, we will use the `get_client()` wrapper function to quickly set up. 
" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "e4d6920d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Namespace(scheduler_address=None, scheduler_file=None, n_workers=20, threads_per_worker=1, rmm_pool_size=None, protocol='tcp', nvlink_only=False, files_per_partition=2, num_files=None, device='gpu', set_torch_to_use_rmm=False)" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sys.argv=['','--device','gpu']\n", - "parser = argparse.ArgumentParser()\n", - "args = attach_args(parser).parse_args()\n", - "args.set_torch_to_use_rmm = False\n", - "args" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "717b6cef", + "execution_count": 9, + "id": "4b73e5f9", "metadata": {}, "outputs": [ { @@ -892,23 +866,44 @@ { "data": { "text/plain": [ - "{'tcp://127.0.0.1:42505': None}" + "{'tcp://127.0.0.1:36179': None}" ] }, - "execution_count": 23, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "client = get_client(args, args.device)\n", + "client = get_client(cluster_type = 'gpu', set_torch_to_use_rmm=False)\n", "print(f\"Number of dask worker:{get_num_workers(client)}\")\n", "client.run(pre_imports)" ] }, { "cell_type": "markdown", - "id": "f267e161", + "id": "0fc99440", + "metadata": {}, + "source": [ + "If you encounter the following error\n", + "`get_client() missing 1 required positional argument: 'args'`:\n", + "\n", + "This is probably because the `nemo_curator` library is not updated to the newer version. Please run the following line in the terminal, following instruction in our [GitHub](https://github.com/nicoleeeluo/NeMo-Curator/tree/main) repo, and restart the notebook. Intermediate result of the previous section has been saved to local, you can start from this section after updating." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a590c78a", + "metadata": {}, + "outputs": [], + "source": [ + "#pip install --extra-index-url https://pypi.nvidia.com \".[cuda12x]\"" + ] + }, + { + "cell_type": "markdown", + "id": "0151abe0", "metadata": {}, "source": [ "Define parameters" @@ -916,8 +911,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "id": "d01e2f08", + "execution_count": 10, + "id": "54b627a4", "metadata": {}, "outputs": [], "source": [ @@ -936,8 +931,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "id": "6395ffde", + "execution_count": 11, + "id": "6ede2e41", "metadata": {}, "outputs": [], "source": [ @@ -947,7 +942,7 @@ }, { "cell_type": "markdown", - "id": "a654a16e", + "id": "1882204a", "metadata": {}, "source": [ "Apply exact deduplication" @@ -955,8 +950,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "id": "a5e0117c", + "execution_count": 12, + "id": "dfaaa765", "metadata": {}, "outputs": [ { @@ -970,7 +965,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/exact_dedup.py:158: UserWarning: Output path f/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/exact_dedup/data/_exact_duplicates.parquet already exists and will be overwritten\n", + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/exact_dedup.py:158: UserWarning: Output path f/work_dir/tutorials/single_node_tutorial/workspace/exact_dedup/data/_exact_duplicates.parquet already exists and will be overwritten\n", " warnings.warn(\n" ] }, @@ -979,7 +974,7 @@ "output_type": "stream", "text": [ "Number of exact duplicated file:53\n", - "Time taken for exact duplicate:1.9629592895507812\n" + "Time taken for exact duplicate:1.9788782596588135\n" ] } ], @@ -1005,7 +1000,7 @@ }, { "cell_type": "markdown", - "id": "7f8bdb88", + "id": "e68f0399", "metadata": {}, "source": [ "Verify the output duplicated ID. We can group by the `_hashes` to get the list of duplicated documents having the same _hashes and use `extract_lines_with_id()` to verify that those documents are indeed exact duplicates. 
Please note that the `id` might changes, therefore, please replace the `target_list` when necessary" @@ -1013,8 +1008,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "id": "e045d65a", + "execution_count": 15, + "id": "28d8bb0b", "metadata": {}, "outputs": [ { @@ -1052,27 +1047,27 @@ " \n", " \n", " 0\n", - " TH_wiki-0000021096\n", - " 1708cb56ec582f78716f0864dca9382d\n", + " TH_wiki-0000122055\n", + " 3e6e96a80410d5a191d098f464e66f86\n", " \n", " \n", " 1\n", - " TH_wiki-0000021100\n", - " 1708cb56ec582f78716f0864dca9382d\n", + " TH_wiki-0000105191\n", + " e77a248506ef16737288fae5759db33a\n", " \n", " \n", " 2\n", - " TH_wiki-0000067251\n", - " edf8af427a33ed94150899970f39770f\n", + " TH_wiki-0000105192\n", + " 2e386f5c3af70f43874618988d4842b2\n", " \n", " \n", " 3\n", - " TH_wiki-0000105191\n", - " e77a248506ef16737288fae5759db33a\n", + " TH_wiki-0000105193\n", + " 2e386f5c3af70f43874618988d4842b2\n", " \n", " \n", " 4\n", - " TH_wiki-0000105192\n", + " TH_wiki-0000105194\n", " 2e386f5c3af70f43874618988d4842b2\n", " \n", " \n", @@ -1081,14 +1076,14 @@ ], "text/plain": [ " id _hashes\n", - "0 TH_wiki-0000021096 1708cb56ec582f78716f0864dca9382d\n", - "1 TH_wiki-0000021100 1708cb56ec582f78716f0864dca9382d\n", - "2 TH_wiki-0000067251 edf8af427a33ed94150899970f39770f\n", - "3 TH_wiki-0000105191 e77a248506ef16737288fae5759db33a\n", - "4 TH_wiki-0000105192 2e386f5c3af70f43874618988d4842b2" + "0 TH_wiki-0000122055 3e6e96a80410d5a191d098f464e66f86\n", + "1 TH_wiki-0000105191 e77a248506ef16737288fae5759db33a\n", + "2 TH_wiki-0000105192 2e386f5c3af70f43874618988d4842b2\n", + "3 TH_wiki-0000105193 2e386f5c3af70f43874618988d4842b2\n", + "4 TH_wiki-0000105194 2e386f5c3af70f43874618988d4842b2" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1101,8 +1096,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "id": "8754887e", + "execution_count": 16, + "id": "fca41870", "metadata": {}, "outputs": [ { @@ -1144,7 +1139,7 @@ " \n", " 2\n", " 1708cb56ec582f78716f0864dca9382d\n", - " TH_wiki-0000021096 TH_wiki-0000021100 TH_wiki-...\n", + " TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-...\n", " \n", " \n", " 3\n", @@ -1171,12 +1166,12 @@ " id \n", "0 TH_wiki-0000157216 TH_wiki-0000066307 \n", "1 TH_wiki-0000074714 TH_wiki-0000074715 TH_wiki-... \n", - "2 TH_wiki-0000021096 TH_wiki-0000021100 TH_wiki-... \n", + "2 TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-... \n", "3 TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-... \n", "4 TH_wiki-0000122055 TH_wiki-0000116550 " ] }, - "execution_count": 28, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1187,8 +1182,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "id": "13712668", + "execution_count": 17, + "id": "8c9624ac", "metadata": {}, "outputs": [ { @@ -1208,7 +1203,7 @@ }, { "cell_type": "markdown", - "id": "7a388445", + "id": "4013203c", "metadata": {}, "source": [ "**[Optional]** You might choose to close Dask cluster here" @@ -1216,8 +1211,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "id": "7875bf12", + "execution_count": 13, + "id": "5ef2f05e", "metadata": {}, "outputs": [], "source": [ @@ -1227,7 +1222,7 @@ }, { "cell_type": "markdown", - "id": "20502f76", + "id": "7a2feadc", "metadata": {}, "source": [ "## 5. 
Fuzzy Deduplication\n", @@ -1259,7 +1254,7 @@ }, { "cell_type": "markdown", - "id": "de98daed", + "id": "ffca14ad", "metadata": {}, "source": [ "**If there is not running Dask cluster, start a GPU Dask cluster here**" @@ -1267,35 +1262,19 @@ }, { "cell_type": "code", - "execution_count": 60, - "id": "0a84ae27", + "execution_count": null, + "id": "e00ba2fd", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'tcp://127.0.0.1:43209': None}" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# sys.argv=['','--device','gpu']\n", - "# parser = argparse.ArgumentParser()\n", - "# args = attach_args(parser).parse_args()\n", - "# args.set_torch_to_use_rmm = False\n", - "\n", - "# client = get_client(args, args.device)\n", - "# get_num_workers(client)\n", + "# client = get_client(cluster_type = 'gpu', set_torch_to_use_rmm=False)\n", + "# print(f\"Number of dask worker:{get_num_workers(client)}\")\n", "# client.run(pre_imports)" ] }, { "cell_type": "markdown", - "id": "5de7a035", + "id": "5df73743", "metadata": {}, "source": [ "### 5.1 Minhash\n", @@ -1319,8 +1298,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "id": "bbc84690", + "execution_count": 11, + "id": "1fc5bff3", "metadata": {}, "outputs": [], "source": [ @@ -1329,7 +1308,7 @@ }, { "cell_type": "markdown", - "id": "3b0beafe", + "id": "7bf9cc8d", "metadata": {}, "source": [ "Define parameters" @@ -1337,8 +1316,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "id": "52f056f7", + "execution_count": 12, + "id": "d600d1b8", "metadata": {}, "outputs": [], "source": [ @@ -1366,7 +1345,7 @@ }, { "cell_type": "markdown", - "id": "aaefe7bd", + "id": "1c31ddf4", "metadata": {}, "source": [ "Run MinHash" @@ -1374,15 +1353,15 @@ }, { "cell_type": "code", - "execution_count": 32, - "id": "da632a42", + "execution_count": 13, + "id": "88540950", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Computing minhashes for /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/add_id/cleaned\n", + "Computing minhashes for /work_dir/tutorials/single_node_tutorial/workspace/add_id/cleaned\n", "Reading 1 files\n" ] }, @@ -1390,7 +1369,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:175: UserWarning: Output path /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/fuzzy/minhash/data/_minhashes.parquet already exists and will be overwritten\n", + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:175: UserWarning: Output path /work_dir/tutorials/single_node_tutorial/workspace/fuzzy/minhash/data/_minhashes.parquet already exists and will be overwritten\n", " warnings.warn(\n" ] }, @@ -1398,7 +1377,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken for MinHash:5.899524927139282\n" + "Time taken for MinHash:6.340771198272705\n" ] } ], @@ -1435,7 +1414,7 @@ }, { "cell_type": "markdown", - "id": "9ad4ba59", + "id": "158bf3ab", "metadata": {}, "source": [ "Verify result" @@ -1443,8 +1422,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "id": "93220b5c", + "execution_count": 14, + "id": "10b5eb55", "metadata": {}, "outputs": [ { @@ -1511,7 +1490,7 @@ "4 TH_wiki-0000000004 [1559901, 11771639, 487706, 826569, 1203860, 5..." 
] }, - "execution_count": 33, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1523,7 +1502,7 @@ }, { "cell_type": "markdown", - "id": "b407928e", + "id": "0bce0f80", "metadata": {}, "source": [ "### 5.2 LSH\n", @@ -1544,8 +1523,8 @@ }, { "cell_type": "code", - "execution_count": 34, - "id": "f3801d7a", + "execution_count": 15, + "id": "645b8a53", "metadata": {}, "outputs": [], "source": [ @@ -1556,7 +1535,7 @@ }, { "cell_type": "markdown", - "id": "2a2c178a", + "id": "110db216", "metadata": {}, "source": [ "Define parameters" @@ -1564,8 +1543,8 @@ }, { "cell_type": "code", - "execution_count": 35, - "id": "d52707b9", + "execution_count": 16, + "id": "738ab265", "metadata": {}, "outputs": [], "source": [ @@ -1590,7 +1569,7 @@ }, { "cell_type": "markdown", - "id": "c59b4fe6", + "id": "a5250a2a", "metadata": {}, "source": [ "Run LSH" @@ -1598,15 +1577,15 @@ }, { "cell_type": "code", - "execution_count": 36, - "id": "71c0848f", + "execution_count": 17, + "id": "1ef61e2b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:361: UserWarning: Output path /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/fuzzy/lsh/data/_buckets.parquet already exists and will be overwritten\n", + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:361: UserWarning: Output path /work_dir/tutorials/single_node_tutorial/workspace/fuzzy/lsh/data/_buckets.parquet already exists and will be overwritten\n", " warnings.warn(\n" ] }, @@ -1614,7 +1593,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken for LSH:18.237318754196167\n" + "Time taken for LSH:19.37230634689331\n" ] } ], @@ -1649,7 +1628,7 @@ }, { "cell_type": "markdown", - "id": "3789c538", + "id": "ad2e3b60", "metadata": {}, "source": [ "Verify result" @@ -1657,8 +1636,8 @@ }, { "cell_type": "code", - "execution_count": 37, - "id": "d8663302", + "execution_count": 18, + "id": "9d0449c6", "metadata": {}, "outputs": [ { @@ -1691,32 +1670,32 @@ " \n", " 0\n", " 1692361878\n", - " 124883\n", - " 38\n", + " 123547\n", + " 210\n", " \n", " \n", " 1\n", " 1692361878\n", - " 123211\n", - " 141\n", + " 93844\n", + " 120\n", " \n", " \n", " 2\n", " 1692361878\n", - " 124885\n", - " 38\n", + " 66564\n", + " 86\n", " \n", " \n", " 3\n", " 1692361878\n", - " 85294\n", - " 345\n", + " 93845\n", + " 120\n", " \n", " \n", " 4\n", " 1692361878\n", - " 124886\n", - " 38\n", + " 66565\n", + " 86\n", " \n", " \n", "\n", @@ -1724,14 +1703,14 @@ ], "text/plain": [ " dataset_id doc_id _bucket_id\n", - "0 1692361878 124883 38\n", - "1 1692361878 123211 141\n", - "2 1692361878 124885 38\n", - "3 1692361878 85294 345\n", - "4 1692361878 124886 38" + "0 1692361878 123547 210\n", + "1 1692361878 93844 120\n", + "2 1692361878 66564 86\n", + "3 1692361878 93845 120\n", + "4 1692361878 66565 86" ] }, - "execution_count": 37, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1743,7 +1722,7 @@ }, { "cell_type": "markdown", - "id": "00f5567b", + "id": "f952f074", "metadata": {}, "source": [ "### 5.3 Jaccard Shuffle\n", @@ -1764,8 +1743,8 @@ }, { "cell_type": "code", - "execution_count": 38, - "id": "c5d458d1", + "execution_count": 19, + "id": "707ea54d", "metadata": {}, "outputs": [], "source": [ @@ -1778,7 +1757,7 @@ }, { "cell_type": "markdown", - "id": "e904bc34", + "id": "8f2e321d", "metadata": {}, "source": [ "Define parameters" @@ -1786,8 +1765,8 @@ }, { 
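Those repeated rows are harmless: before the similarity graph is built, each (`id_x`, `id_y`) pair only needs a single score. The sketch below illustrates that collapse on the rows shown above; the 0.8 cut-off is an illustrative threshold for what counts as a near-duplicate edge, not necessarily the exact value the connected-components step uses.

```python
import pandas as pd

# Rows copied from the verification output above.
pairs = pd.DataFrame({
    "id_x": ["1692361878-136568"] * 4 + ["1692361878-92875"],
    "id_y": ["1692361878-136566"] * 4 + ["1692361878-87743"],
    "jaccard": [0.754448] * 4 + [0.828794],
})

edges = (
    pairs.groupby(["id_x", "id_y"], as_index=False)["jaccard"].max()  # one score per pair
         .query("jaccard >= 0.8")                                     # keep likely duplicates
)
print(f"{len(pairs)} raw rows -> {len(edges)} edge(s) for the similarity graph")
print(edges)
```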
"cell_type": "code", - "execution_count": 39, - "id": "170a44fd", + "execution_count": 25, + "id": "70e2dff9", "metadata": {}, "outputs": [], "source": [ @@ -1799,6 +1778,7 @@ "jaccard_shuffle_base_output_path = os.path.join(data_dir,\"fuzzy/jaccard_shuffle\")\n", "output_anchor_docs_with_bk_path = os.path.join(jaccard_shuffle_base_output_path, \"anchor_docs_with_bk.parquet\")\n", "input_anchor_docs_with_bk_dir = output_anchor_docs_with_bk_path\n", + "jaccard_shuffle_log_path = os.path.join(jaccard_shuffle_base_output_path,\"log\")\n", "output_shuffled_docs_path = os.path.join(jaccard_shuffle_base_output_path, \"shuffled_docs.parquet\")\n", "\n", "#Relevant parameters for _MapBucket()\n", @@ -1814,12 +1794,13 @@ "shuffle_id_fields=[\"dataset_id\", \"doc_id\"]\n", "int_to_str_id='id'\n", "\n", - "!mkdir -p {jaccard_shuffle_base_output_path}" + "!mkdir -p {jaccard_shuffle_base_output_path}\n", + "!mkdir -p {jaccard_shuffle_log_path}" ] }, { "cell_type": "markdown", - "id": "333e91a8", + "id": "d0f19efa", "metadata": {}, "source": [ "Run Jaccard map bucket" @@ -1827,8 +1808,8 @@ }, { "cell_type": "code", - "execution_count": 40, - "id": "67b96227", + "execution_count": 26, + "id": "b2850b0a", "metadata": {}, "outputs": [ { @@ -1837,7 +1818,7 @@ "text": [ "Number of files being read for jaccard calculation = 1\n", "Number of ddf_bk partitions = 1\n", - "Time taken for Bucket Mapping:2.2563915252685547 s\n" + "Time taken for Bucket Mapping:1.239295244216919 s\n" ] } ], @@ -1857,7 +1838,7 @@ "ddf_bk = get_bucket_ddf_from_parquet_path(input_bucket_path=input_bucket_path, num_workers=num_workers)\n", "\n", "#Run _MapBuckets()\n", - "map_buckets = _MapBuckets(id_fields=shuffle_id_fields, bucket_field=input_bucket_field)\n", + "map_buckets = _MapBuckets(id_fields=shuffle_id_fields, bucket_field=input_bucket_field, logger=jaccard_shuffle_log_path)\n", "ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors(documents_df=ddf_text, buckets_df=ddf_bk, shuffle_type=shuffle_type)\n", "\n", "#Write to disk\n", @@ -1868,7 +1849,7 @@ }, { "cell_type": "markdown", - "id": "8f76b8ef", + "id": "a1533a15", "metadata": {}, "source": [ "Verify result" @@ -1876,8 +1857,8 @@ }, { "cell_type": "code", - "execution_count": 41, - "id": "81c9c7c7", + "execution_count": 27, + "id": "d74012c3", "metadata": {}, "outputs": [ { @@ -1914,51 +1895,51 @@ " \n", " 0\n", " 1692361878\n", - " 8895\n", + " 127258\n", " 1692361878\n", - " 8964\n", + " 127781\n", " 1692361878\n", - " 8895\n", + " 126955\n", " 0\n", " \n", " \n", " 1\n", " 1692361878\n", - " 127089\n", + " 85383\n", " 1692361878\n", - " 127220\n", + " 85364\n", " 1692361878\n", - " 127089\n", + " 85374\n", " 0\n", " \n", " \n", " 2\n", " 1692361878\n", - " 127090\n", + " 45030\n", " 1692361878\n", - " 127220\n", + " 85200\n", " 1692361878\n", - " 127089\n", + " 45030\n", " 0\n", " \n", " \n", " 3\n", " 1692361878\n", - " 151728\n", + " 127259\n", " 1692361878\n", - " 151728\n", + " 127781\n", " 1692361878\n", - " 151729\n", + " 126955\n", " 0\n", " \n", " \n", " 4\n", " 1692361878\n", - " 137262\n", + " 127968\n", " 1692361878\n", - " 137301\n", + " 127961\n", " 1692361878\n", - " 137262\n", + " 127996\n", " 0\n", " \n", " \n", @@ -1967,21 +1948,21 @@ ], "text/plain": [ " dataset_id doc_id anchor_1_dataset_id anchor_1_doc_id \\\n", - "0 1692361878 8895 1692361878 8964 \n", - "1 1692361878 127089 1692361878 127220 \n", - "2 1692361878 127090 1692361878 127220 \n", - "3 1692361878 151728 1692361878 151728 \n", - "4 1692361878 137262 1692361878 137301 \n", + "0 
1692361878 127258 1692361878 127781 \n", + "1 1692361878 85383 1692361878 85364 \n", + "2 1692361878 45030 1692361878 85200 \n", + "3 1692361878 127259 1692361878 127781 \n", + "4 1692361878 127968 1692361878 127961 \n", "\n", " anchor_0_dataset_id anchor_0_doc_id _output_partition_id \n", - "0 1692361878 8895 0 \n", - "1 1692361878 127089 0 \n", - "2 1692361878 127089 0 \n", - "3 1692361878 151729 0 \n", - "4 1692361878 137262 0 " + "0 1692361878 126955 0 \n", + "1 1692361878 85374 0 \n", + "2 1692361878 45030 0 \n", + "3 1692361878 126955 0 \n", + "4 1692361878 127996 0 " ] }, - "execution_count": 41, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1993,7 +1974,7 @@ }, { "cell_type": "markdown", - "id": "b4896749", + "id": "1487b1ad", "metadata": {}, "source": [ "**[Optional]** Remove previous Jaccard Shuffle results. Run only when there are files under the Jaccard Shuffle output path" @@ -2001,8 +1982,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "id": "2d4dd55f", + "execution_count": 30, + "id": "b414f703", "metadata": {}, "outputs": [], "source": [ @@ -2011,7 +1992,7 @@ }, { "cell_type": "markdown", - "id": "f9b5ab9e", + "id": "f33a6782", "metadata": {}, "source": [ "Run Jaccard Shuffle" @@ -2019,15 +2000,15 @@ }, { "cell_type": "code", - "execution_count": 44, - "id": "acccb80b", + "execution_count": 31, + "id": "86d1b3e5", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 0%| | 0/1 [00:00\n", " \n", " 0\n", - " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", - " 1894\n", - " 1692361878-127021\n", - " 1692361878-127021\n", - " 1692361878-126958\n", + " การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...\n", + " 1457\n", + " 1692361878-135417\n", + " 1692361878-135463\n", + " 1692361878-135417\n", " \n", " \n", " 1\n", - " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", - " 1894\n", - " 1692361878-127021\n", - " 1692361878-127021\n", - " 1692361878-127017\n", + " การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...\n", + " 1457\n", + " 1692361878-135417\n", + " 1692361878-135392\n", + " 1692361878-135447\n", " \n", " \n", " 2\n", - " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", - " 1894\n", - " 1692361878-127021\n", - " 1692361878-126928\n", - " 1692361878-126891\n", + " สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...\n", + " 1262\n", + " 1692361878-83363\n", + " 1692361878-94231\n", + " 1692361878-83363\n", " \n", " \n", " 3\n", - " วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค...\n", - " 423\n", - " 1692361878-87271\n", - " 1692361878-87204\n", - " 1692361878-87271\n", + " สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...\n", + " 1262\n", + " 1692361878-83363\n", + " 1692361878-94905\n", + " 1692361878-83363\n", " \n", " \n", " 4\n", - " วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค...\n", - " 423\n", - " 1692361878-87271\n", - " 1692361878-87267\n", - " 1692361878-87271\n", + " สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...\n", + " 1262\n", + " 1692361878-83363\n", + " 1692361878-94906\n", + " 1692361878-94905\n", " \n", " \n", "\n", @@ -2176,21 +2158,21 @@ ], "text/plain": [ " text _text_bytes \\\n", - "0 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", - "1 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", - "2 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", - "3 วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค... 423 \n", - "4 วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค... 423 \n", + "0 การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด... 
1457 \n", + "1 การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด... 1457 \n", + "2 สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค... 1262 \n", + "3 สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค... 1262 \n", + "4 สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค... 1262 \n", "\n", " id anchor_0_id anchor_1_id \n", - "0 1692361878-127021 1692361878-127021 1692361878-126958 \n", - "1 1692361878-127021 1692361878-127021 1692361878-127017 \n", - "2 1692361878-127021 1692361878-126928 1692361878-126891 \n", - "3 1692361878-87271 1692361878-87204 1692361878-87271 \n", - "4 1692361878-87271 1692361878-87267 1692361878-87271 " + "0 1692361878-135417 1692361878-135463 1692361878-135417 \n", + "1 1692361878-135417 1692361878-135392 1692361878-135447 \n", + "2 1692361878-83363 1692361878-94231 1692361878-83363 \n", + "3 1692361878-83363 1692361878-94905 1692361878-83363 \n", + "4 1692361878-83363 1692361878-94906 1692361878-94905 " ] }, - "execution_count": 45, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -2202,7 +2184,7 @@ }, { "cell_type": "markdown", - "id": "1a23a5c0", + "id": "b8644e51", "metadata": {}, "source": [ "### 5.4 Jaccard Compute\n", @@ -2217,8 +2199,8 @@ }, { "cell_type": "code", - "execution_count": 46, - "id": "6cfa08ea", + "execution_count": 33, + "id": "b1a532a2", "metadata": {}, "outputs": [], "source": [ @@ -2227,7 +2209,7 @@ }, { "cell_type": "markdown", - "id": "389f305b", + "id": "c9e65975", "metadata": {}, "source": [ "Define parameters" @@ -2235,8 +2217,8 @@ }, { "cell_type": "code", - "execution_count": 47, - "id": "c142a42a", + "execution_count": 34, + "id": "291d3aaa", "metadata": {}, "outputs": [], "source": [ @@ -2258,7 +2240,7 @@ }, { "cell_type": "markdown", - "id": "7a0f610f", + "id": "9341b58c", "metadata": {}, "source": [ "Run Jaccard Compute" @@ -2266,8 +2248,8 @@ }, { "cell_type": "code", - "execution_count": 48, - "id": "8ceae838", + "execution_count": 35, + "id": "9b1b9bdd", "metadata": {}, "outputs": [ { @@ -2275,7 +2257,7 @@ "output_type": "stream", "text": [ "Running jaccard compute script\n", - "Time taken for Jaccard Computing: 0.5923423767089844\n" + "Time taken for Jaccard Computing: 0.735356330871582\n" ] } ], @@ -2303,7 +2285,7 @@ }, { "cell_type": "markdown", - "id": "ae06ad56", + "id": "bb740d30", "metadata": {}, "source": [ "Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected as a pair of similar documents is likely to share numerous same buckets." 
@@ -2311,8 +2293,8 @@ }, { "cell_type": "code", - "execution_count": 49, - "id": "686eb956", + "execution_count": 36, + "id": "a41d1f09", "metadata": {}, "outputs": [ { @@ -2344,33 +2326,33 @@ " \n", " \n", " 0\n", - " 1692361878-49094\n", - " 1692361878-49078\n", - " 0.784000\n", + " 1692361878-136568\n", + " 1692361878-136566\n", + " 0.754448\n", " \n", " \n", " 1\n", - " 1692361878-49094\n", - " 1692361878-49078\n", - " 0.784000\n", + " 1692361878-136568\n", + " 1692361878-136566\n", + " 0.754448\n", " \n", " \n", " 2\n", - " 1692361878-49094\n", - " 1692361878-49078\n", - " 0.784000\n", + " 1692361878-136568\n", + " 1692361878-136566\n", + " 0.754448\n", " \n", " \n", " 3\n", - " 1692361878-49094\n", - " 1692361878-49078\n", - " 0.784000\n", + " 1692361878-136568\n", + " 1692361878-136566\n", + " 0.754448\n", " \n", " \n", " 4\n", - " 1692361878-161128\n", - " 1692361878-161122\n", - " 0.890339\n", + " 1692361878-92875\n", + " 1692361878-87743\n", + " 0.828794\n", " \n", " \n", "\n", @@ -2378,14 +2360,14 @@ ], "text/plain": [ " id_x id_y jaccard\n", - "0 1692361878-49094 1692361878-49078 0.784000\n", - "1 1692361878-49094 1692361878-49078 0.784000\n", - "2 1692361878-49094 1692361878-49078 0.784000\n", - "3 1692361878-49094 1692361878-49078 0.784000\n", - "4 1692361878-161128 1692361878-161122 0.890339" + "0 1692361878-136568 1692361878-136566 0.754448\n", + "1 1692361878-136568 1692361878-136566 0.754448\n", + "2 1692361878-136568 1692361878-136566 0.754448\n", + "3 1692361878-136568 1692361878-136566 0.754448\n", + "4 1692361878-92875 1692361878-87743 0.828794" ] }, - "execution_count": 49, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2397,7 +2379,7 @@ }, { "cell_type": "markdown", - "id": "63911051", + "id": "a505402e", "metadata": {}, "source": [ "### 5.5 Connected Components\n", @@ -2412,8 +2394,8 @@ }, { "cell_type": "code", - "execution_count": 50, - "id": "5eae08f1", + "execution_count": 37, + "id": "3bff521b", "metadata": {}, "outputs": [], "source": [ @@ -2422,7 +2404,7 @@ }, { "cell_type": "markdown", - "id": "ed713696", + "id": "d8afed6a", "metadata": {}, "source": [ "Define parameters" @@ -2430,8 +2412,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "id": "a0881f12", + "execution_count": 38, + "id": "b40735dd", "metadata": {}, "outputs": [], "source": [ @@ -2452,7 +2434,7 @@ }, { "cell_type": "markdown", - "id": "4fba31d2", + "id": "33d8957f", "metadata": {}, "source": [ "Run Connected Component" @@ -2460,25 +2442,23 @@ }, { "cell_type": "code", - "execution_count": 52, - "id": "da4a8d4e", + "execution_count": 39, + "id": "fe62dd51", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "batch_id = 0/1, time = 0.26957249641418457\n", + "batch_id = 0/1, time = 0.29015278816223145\n", "# of groups 5465\n", "# of docs removed 3079\n", "assert num_nodes:8544==labels_df:8544 passed\n", - "Time taken for Connected Component: 4.331223726272583 s\n" + "Time taken for Connected Component: 4.489336729049683 s\n" ] } ], "source": [ - "#client.run(enable_spilling)\n", - "\n", "t0 = time.time()\n", " \n", "components_stage = ConnectedComponents(\n", @@ -2496,7 +2476,7 @@ }, { "cell_type": "markdown", - "id": "24b55482", + "id": "669495ee", "metadata": {}, "source": [ "Verify the result of `Connected Components`" @@ -2504,8 +2484,8 @@ }, { "cell_type": "code", - "execution_count": 53, - "id": "ecd69e7e", + "execution_count": 40, + "id": "efbd6973", "metadata": {}, "outputs": [ { @@ -2538,32 +2518,32 @@ " \n", " 
0\n", " 1692361878\n", - " 139585\n", - " 1936\n", + " 122282\n", + " 903\n", " \n", " \n", " 1\n", " 1692361878\n", - " 8059\n", - " 5312\n", + " 139772\n", + " 1952\n", " \n", " \n", " 2\n", " 1692361878\n", - " 93474\n", - " 5313\n", + " 93927\n", + " 112\n", " \n", " \n", " 3\n", " 1692361878\n", - " 127790\n", - " 2774\n", + " 121450\n", + " 2046\n", " \n", " \n", " 4\n", " 1692361878\n", - " 49650\n", - " 1425\n", + " 85288\n", + " 3030\n", " \n", " \n", "\n", @@ -2571,14 +2551,14 @@ ], "text/plain": [ " dataset_id doc_id group\n", - "0 1692361878 139585 1936\n", - "1 1692361878 8059 5312\n", - "2 1692361878 93474 5313\n", - "3 1692361878 127790 2774\n", - "4 1692361878 49650 1425" + "0 1692361878 122282 903\n", + "1 1692361878 139772 1952\n", + "2 1692361878 93927 112\n", + "3 1692361878 121450 2046\n", + "4 1692361878 85288 3030" ] }, - "execution_count": 53, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -2590,7 +2570,7 @@ }, { "cell_type": "markdown", - "id": "44834e54", + "id": "0c3e2bdc", "metadata": {}, "source": [ "Let's check if the output fuzzy duplicated documents within the same group are similar. Please note that the `group` id in your output might be different from the notebook output." @@ -2599,7 +2579,7 @@ { "cell_type": "code", "execution_count": 54, - "id": "6c404c89", + "id": "d8fa1e8e", "metadata": {}, "outputs": [ { @@ -2717,7 +2697,7 @@ }, { "cell_type": "markdown", - "id": "b4cd941d", + "id": "f34b8140", "metadata": {}, "source": [ "Change the `group` number if necessary. By running the code below, we can obtain a list of near duplicated documents." @@ -2726,7 +2706,7 @@ { "cell_type": "code", "execution_count": 55, - "id": "09b3fd0b", + "id": "fd01f5fe", "metadata": {}, "outputs": [ { @@ -2810,7 +2790,7 @@ }, { "cell_type": "markdown", - "id": "8b0de04f", + "id": "99a8d732", "metadata": {}, "source": [ "Print the text of near duplicated document. Please replace the `id` if necessary, `id` should be in the format of `_`" @@ -2819,7 +2799,7 @@ { "cell_type": "code", "execution_count": 73, - "id": "fbf88107", + "id": "68883f58", "metadata": {}, "outputs": [ { @@ -2841,7 +2821,7 @@ }, { "cell_type": "markdown", - "id": "fd33ac1d", + "id": "3b6578b4", "metadata": {}, "source": [ "Below is the English translation of the output above. 
We can see that the two documents are indeed very similar to each other.\n", @@ -2902,7 +2882,7 @@ }, { "cell_type": "markdown", - "id": "68cfec8a", + "id": "f36436f3", "metadata": {}, "source": [ "### 5.6 Fuzzy deduplication wrapper" @@ -2911,7 +2891,7 @@ { "cell_type": "code", "execution_count": 56, - "id": "fe7de030", + "id": "eb52ec06", "metadata": {}, "outputs": [], "source": [ @@ -2921,7 +2901,7 @@ { "cell_type": "code", "execution_count": 57, - "id": "fe8794b8", + "id": "625c1828", "metadata": {}, "outputs": [], "source": [ @@ -2948,7 +2928,7 @@ }, { "cell_type": "markdown", - "id": "0aa0b60c", + "id": "cb76d8e5", "metadata": {}, "source": [ "**[Optional]** If the cache folder is not empty, please CLEAR the folder before proceeding" @@ -2957,7 +2937,7 @@ { "cell_type": "code", "execution_count": 59, - "id": "83705eaa", + "id": "e7fb4c4c", "metadata": {}, "outputs": [], "source": [ @@ -2967,7 +2947,7 @@ { "cell_type": "code", "execution_count": 60, - "id": "72494e54", + "id": "2368443f", "metadata": {}, "outputs": [ { @@ -3072,7 +3052,7 @@ { "cell_type": "code", "execution_count": 61, - "id": "00a8530a", + "id": "14bfe3bc", "metadata": {}, "outputs": [ { @@ -3151,7 +3131,7 @@ }, { "cell_type": "markdown", - "id": "b9dfbdde", + "id": "d2726cf9", "metadata": {}, "source": [ "## 6. Remove duplicates\n", @@ -3161,7 +3141,7 @@ }, { "cell_type": "markdown", - "id": "bb722fd2", + "id": "e4dd78db", "metadata": {}, "source": [ "Define parameters" @@ -3170,7 +3150,7 @@ { "cell_type": "code", "execution_count": 81, - "id": "5a4b97b7", + "id": "0027c8d2", "metadata": {}, "outputs": [], "source": [ @@ -3189,7 +3169,7 @@ }, { "cell_type": "markdown", - "id": "d3962deb", + "id": "a373860d", "metadata": {}, "source": [ "We will first process the result of exact deduplication. Since result of exact deduplication contains original ID used in input dataset, it is more straightforward to deal with." @@ -3198,7 +3178,7 @@ { "cell_type": "code", "execution_count": 82, - "id": "a29d720d", + "id": "f59e92c3", "metadata": {}, "outputs": [ { @@ -3228,7 +3208,7 @@ }, { "cell_type": "markdown", - "id": "b4c1c057", + "id": "f55d6737", "metadata": {}, "source": [ "For result of fuzzy deduplication, we need to first reconstructed document ID by combining `dataset_id` and `doc_id`, then use the reconstructed `ID` for removal" @@ -3236,7 +3216,7 @@ }, { "cell_type": "markdown", - "id": "1c0f1ee6", + "id": "3b9c122d", "metadata": {}, "source": [ "**[Optional]** Uncomment the cell to use result from step by step fuzzy deduplication" @@ -3245,7 +3225,7 @@ { "cell_type": "code", "execution_count": 83, - "id": "1ff911ad", + "id": "c6a1bb0a", "metadata": {}, "outputs": [], "source": [ @@ -3274,7 +3254,7 @@ { "cell_type": "code", "execution_count": 84, - "id": "2220d8fc", + "id": "746d3673", "metadata": {}, "outputs": [], "source": [ @@ -3288,7 +3268,7 @@ { "cell_type": "code", "execution_count": 85, - "id": "08143e1e", + "id": "62b34838", "metadata": {}, "outputs": [], "source": [ @@ -3301,7 +3281,7 @@ }, { "cell_type": "markdown", - "id": "a5008578", + "id": "edfa52ce", "metadata": {}, "source": [ "Verify the result of duplicate removal. 
We can see that the number of document in resultant document is less than the original dataset (length = 161748)" @@ -3310,7 +3290,7 @@ { "cell_type": "code", "execution_count": 86, - "id": "a692c916", + "id": "78eee9b3", "metadata": {}, "outputs": [ { @@ -3328,7 +3308,7 @@ }, { "cell_type": "markdown", - "id": "24440f5f", + "id": "15e07a32", "metadata": {}, "source": [ "Close the GPU Dask Cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again." @@ -3337,7 +3317,7 @@ { "cell_type": "code", "execution_count": 88, - "id": "656a24f2", + "id": "8e807bd7", "metadata": {}, "outputs": [], "source": [ @@ -3347,7 +3327,7 @@ }, { "cell_type": "markdown", - "id": "3a00f6ea", + "id": "a416a293", "metadata": {}, "source": [ "## 7. Heuristic Fitlering\n", @@ -3370,7 +3350,7 @@ { "cell_type": "code", "execution_count": 89, - "id": "41f7cdf4", + "id": "b988ad1e", "metadata": {}, "outputs": [], "source": [ @@ -3381,7 +3361,7 @@ }, { "cell_type": "markdown", - "id": "f5ed694b", + "id": "097a1b48", "metadata": {}, "source": [ "**[Optional]** The following cell is to remove warning from dask." @@ -3390,7 +3370,7 @@ { "cell_type": "code", "execution_count": 90, - "id": "39aab4d9", + "id": "44552288", "metadata": {}, "outputs": [], "source": [ @@ -3402,7 +3382,7 @@ }, { "cell_type": "markdown", - "id": "3c196329", + "id": "9a59699d", "metadata": {}, "source": [ "Create a CPU Dask Cluster." @@ -3411,7 +3391,7 @@ { "cell_type": "code", "execution_count": 91, - "id": "1ffd3928", + "id": "b8f80ab3", "metadata": {}, "outputs": [], "source": [ @@ -3421,7 +3401,7 @@ }, { "cell_type": "markdown", - "id": "4a514d3c", + "id": "a7702918", "metadata": {}, "source": [ "Define some helper functions" @@ -3430,7 +3410,7 @@ { "cell_type": "code", "execution_count": 92, - "id": "d4aaccc4", + "id": "6f2e7523", "metadata": {}, "outputs": [], "source": [ @@ -3461,7 +3441,7 @@ }, { "cell_type": "markdown", - "id": "5d43a755", + "id": "227fa8b0", "metadata": {}, "source": [ "Define parameters" @@ -3470,7 +3450,7 @@ { "cell_type": "code", "execution_count": 93, - "id": "4f8b0336", + "id": "a894f90f", "metadata": {}, "outputs": [], "source": [ @@ -3500,7 +3480,7 @@ }, { "cell_type": "markdown", - "id": "bddd9dd9", + "id": "ccea406e", "metadata": {}, "source": [ "Run heuristic filtering" @@ -3509,7 +3489,7 @@ { "cell_type": "code", "execution_count": 94, - "id": "1df83255", + "id": "03b3da27", "metadata": {}, "outputs": [ { @@ -3609,7 +3589,7 @@ }, { "cell_type": "markdown", - "id": "0fab7ee5", + "id": "a53b04e9", "metadata": {}, "source": [ "Verify the result." @@ -3618,7 +3598,7 @@ { "cell_type": "code", "execution_count": 95, - "id": "65160254", + "id": "07475373", "metadata": {}, "outputs": [ { @@ -3756,7 +3736,7 @@ }, { "cell_type": "markdown", - "id": "412bd6d2", + "id": "24e8b173", "metadata": {}, "source": [ "Close the CPU Dask Cluster" @@ -3765,7 +3745,7 @@ { "cell_type": "code", "execution_count": 96, - "id": "e6129857", + "id": "12508f5e", "metadata": {}, "outputs": [], "source": [ @@ -3776,7 +3756,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4679d955", + "id": "83e4aed1", "metadata": {}, "outputs": [], "source": []
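+  {
+   "cell_type": "markdown",
+   "id": "dedup-removal-sketch-md",
+   "metadata": {},
+   "source": [
+    "To make the removal step of Section 6 concrete, the next cell is a toy pandas sketch: it reconstructs each document id by joining `dataset_id` and `doc_id` with a hyphen, keeps one representative per duplicate `group` from the connected-components output, and drops the rest from the corpus.\n",
+    "The tiny dataframes and the choice to keep the first member of each group are illustrative assumptions only; they are not the notebook's exact removal code."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dedup-removal-sketch-code",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Toy stand-in for the connected-components result (dataset_id, doc_id, group).\n",
+    "cc = pd.DataFrame({\n",
+    "    'dataset_id': ['1692361878'] * 4,\n",
+    "    'doc_id': ['101', '102', '103', '104'],\n",
+    "    'group': [7, 7, 7, 42],  # docs 101-103 form one near-duplicate group\n",
+    "})\n",
+    "\n",
+    "# Reconstruct the id used by the original dataset: dataset_id-doc_id.\n",
+    "cc['id'] = cc['dataset_id'] + '-' + cc['doc_id']\n",
+    "\n",
+    "# Keep one representative per group; the remaining ids are flagged for removal.\n",
+    "dup_ids = cc.loc[cc.duplicated(subset='group', keep='first'), 'id']\n",
+    "\n",
+    "# Toy corpus; in the notebook this is the full Dask dataframe of documents.\n",
+    "corpus = pd.DataFrame({\n",
+    "    'id': ['1692361878-101', '1692361878-102', '1692361878-103', '1692361878-104'],\n",
+    "    'text': ['doc a', 'doc a (near copy)', 'doc a (near copy)', 'doc b'],\n",
+    "})\n",
+    "deduped = corpus[~corpus['id'].isin(dup_ids)]\n",
+    "print(len(corpus), '->', len(deduped))  # 4 -> 2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "heuristic-cascade-sketch-md",
+   "metadata": {},
+   "source": [
+    "Similarly, the cell below is a simplified, pure-Python sketch of the idea behind the heuristic filter cascade of Section 7: each filter scores a document, and the document is kept only if every score passes its threshold.\n",
+    "The two filters and their thresholds (symbol-to-word ratio at most 0.1, at least 50 words) are crude illustrative assumptions, not NeMo Curator's filter implementations; they only show the control flow of a filter chain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "heuristic-cascade-sketch-code",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Conceptual sketch of a heuristic filter cascade (illustrative only).\n",
+    "\n",
+    "def symbol_to_word_ratio(text):\n",
+    "    # Rough ratio of non-alphanumeric, non-space characters to words.\n",
+    "    words = max(len(text.split()), 1)\n",
+    "    symbols = sum(not ch.isalnum() and not ch.isspace() for ch in text)\n",
+    "    return symbols / words\n",
+    "\n",
+    "def word_count(text):\n",
+    "    return len(text.split())\n",
+    "\n",
+    "# (score function, acceptance test) pairs, applied in order.\n",
+    "FILTERS = [\n",
+    "    (symbol_to_word_ratio, lambda score: score <= 0.1),  # drop symbol-heavy docs\n",
+    "    (word_count, lambda score: score >= 50),             # drop very short docs\n",
+    "]\n",
+    "\n",
+    "def keep_document(text):\n",
+    "    return all(accept(score_fn(text)) for score_fn, accept in FILTERS)\n",
+    "\n",
+    "docs = ['@@ ### $$$ %%', 'word ' * 80]\n",
+    "print([keep_document(d) for d in docs])  # expected: [False, True]"
+   ]
+  },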