From ec26f9f754afc6d37d16378e9d5aa9efa4de7c9b Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Thu, 11 Apr 2024 12:41:38 +0000 Subject: [PATCH 01/34] Init commit for tutorial notebook Signed-off-by: Nicole Luo --- .../config/heuristic_filter_non-en.yaml | 83 + .../single_node_tutorial/image/jaccard.png | Bin 0 -> 14952 bytes .../image/zeroshot_ablations.png | Bin 0 -> 84269 bytes .../single_gpu_tutorial.ipynb | 3425 +++++++++++++++++ 4 files changed, 3508 insertions(+) create mode 100755 tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml create mode 100755 tutorials/single_node_tutorial/image/jaccard.png create mode 100755 tutorials/single_node_tutorial/image/zeroshot_ablations.png create mode 100755 tutorials/single_node_tutorial/single_gpu_tutorial.ipynb diff --git a/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml b/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml new file mode 100755 index 000000000..50d435e2e --- /dev/null +++ b/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml @@ -0,0 +1,83 @@ +input_field: text +filters: + # The filters below define a chain of heuristic filters to be applied to each document in a corpus. + # This particular cascade of filters is intended to filter generic non-English data that use spaces for separating words. + # The filter listed at the top will be applied first, and the following filters will be applied in + # the order they appear in this file. Each filter can be removed and re-ordered as desired. + - name: nemo_curator.filters.heuristic_filter.SymbolsToWordsFilter + log_score: True + params: + max_symbol_to_word_ratio: 0.1 + + - name: nemo_curator.filters.heuristic_filter.NumbersFilter + log_score: True + params: + max_number_to_text_ratio: 0.15 + - name: nemo_curator.filters.heuristic_filter.UrlsFilter + log_score: True + params: + max_url_to_text_ratio: 0.2 + - name: nemo_curator.filters.heuristic_filter.WhiteSpaceFilter + log_score: True + params: + max_white_space_ratio: 0.25 + - name: nemo_curator.filters.heuristic_filter.ParenthesesFilter + log_score: True + params: + max_parentheses_ratio: 0.1 + - name: nemo_curator.filters.heuristic_filter.BoilerPlateStringFilter + log_score: True + params: + remove_if_at_top_or_bottom: True + max_boilerplate_string_ratio: 0.4 + - name: nemo_curator.filters.heuristic_filter.RepeatedLinesFilter + log_score: True + params: + max_repeated_line_fraction: 0.7 + - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsFilter + log_score: True + params: + max_repeated_paragraphs_ratio: 0.7 + - name: nemo_curator.filters.heuristic_filter.RepeatedLinesByCharFilter + params: + max_repeated_lines_char_ratio: 0.8 + - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsByCharFilter + log_score: True + params: + max_repeated_paragraphs_char_ratio: 0.8 + - name: nemo_curator.filters.heuristic_filter.WordCountFilter + log_score: True + params: + min_words: 50 + max_words: 100000 + # NOTE: This filter tends to remove many documents and will need to + # be tuned per language +# - name: nemo_curator.filters.heuristic_filter.PunctuationFilter +# params: +# max_num_sentences_without_endmark_ratio: 0.85 +# - name: nemo_curator.filters.heuristic_filter.MeanWordLengthFilter +# params: +# max_mean_word_length: 10 +# min_mean_word_length: 3 +# - name: nemo_curator.filters.heuristic_filter.LongWordFilter +# params: +# max_word_length: 1000 +# - name: nemo_curator.filters.heuristic_filter.EllipsisFilter +# params: +# 
+#      max_num_lines_ending_with_ellipsis_ratio: 0.3
+  # Top N-Gram filters for N-grams 2, 3, and 4
+  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
+    log_score: True
+    params:
+      n: 2
+      max_repeating_ngram_ratio: 0.2
+  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
+    log_score: True
+    params:
+      n: 3
+      max_repeating_ngram_ratio: 0.18
+  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
+    log_score: True
+    params:
+      n: 4
+      max_repeating_ngram_ratio: 0.16
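For orientation, the YAML above only declares the filter cascade; the tutorial notebook added by this commit is what actually runs it. Below is a minimal sketch of how a config like config/heuristic_filter_non-en.yaml is typically consumed with NeMo Curator. The import paths (DocumentDataset, build_filter_pipeline), the call signatures, and the input/output directory names are assumptions based on NeMo Curator conventions, not anything this patch defines.

# Hedged sketch: applying the heuristic filter cascade declared in
# config/heuristic_filter_non-en.yaml. Assumed API: nemo_curator.datasets.DocumentDataset
# and nemo_curator.utils.config_utils.build_filter_pipeline; directory names are hypothetical.
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.config_utils import build_filter_pipeline

# JSONL documents whose records carry a "text" field, matching
# `input_field: text` at the top of the YAML config.
dataset = DocumentDataset.read_json("non_en_jsonl_dir/", add_filename=True)

# Build the filters in the order they appear in the YAML file and chain them,
# so the filter listed at the top of the config is applied first.
filter_pipeline = build_filter_pipeline("config/heuristic_filter_non-en.yaml")

# Run the cascade and keep only the documents that pass every filter.
high_quality = filter_pipeline(dataset)
high_quality.to_json("non_en_jsonl_filtered/", write_to_filename=True)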
diff --git a/tutorials/single_node_tutorial/image/jaccard.png b/tutorials/single_node_tutorial/image/jaccard.png
new file mode 100755
index 0000000000000000000000000000000000000000..bc281639b0f96c94c66eeb09067782fda6019d79
GIT binary patch
literal 14952
[binary PNG data for image/jaccard.png omitted]

diff --git a/tutorials/single_node_tutorial/image/zeroshot_ablations.png b/tutorials/single_node_tutorial/image/zeroshot_ablations.png
new file mode 100755
index 0000000000000000000000000000000000000000..10be349440829113b3b7f5be11c66a595aa1e359
GIT binary patch
literal 84269
[binary PNG data for image/zeroshot_ablations.png omitted]
zkNAw1wEipqNIoT~s*DH4LjqXb^UuIpI4xo0HG~x);R$hZX;gnJR^9v2t@Wcj^)&sL zP}HNx!edEp-`Xx*xDXP6(G-@@4;11eCV7_Y?zDc73$L&I$Aa3sx2`8TD8X+ML_>~< z(8J(l^6ydtWTowe2@)yp4hAw|zWS|yUD*s|mBzVV|0j{A+tf$@z?c8L2>$v1ciK$r j1LGh6cZ{H#7RmYesncK2G`b>IO6ls%*S<7s`Og0V{53x@ literal 0 HcmV?d00001 diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb new file mode 100755 index 000000000..f0fada829 --- /dev/null +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -0,0 +1,3425 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1c1a4119", + "metadata": {}, + "source": [ + "# Nemo Curator pipeline example\n", + "\n", + "## NeMo Curator introduction\n", + "The NeMo Curator is a Python library that consists of a collection of scalable data-mining modules for curating natural language processing (NLP) data for training large language models (LLMs). The modules within the NeMo Data Curator enable NLP researchers to mine high-quality text at scale from massive uncurated web corpora. \n", + "\n", + "NeMo Curator includes the following modules to perform data curation:\n", + "- Data download and Extraction\n", + "- Language identification and separation\n", + "- Text reformatting and cleaning\n", + "- Quality filtering\n", + "- Document-level deduplication\n", + "- Multilingual downstream-task decontamination\n", + "- Distributed Data Classification\n", + "\n", + "NeMo Curator team has perform ablation experiments using Common Crawl dataset to train a 357M GPT-style model to assess the effect of different curation stage on model performance. \n", + "\n", + "![alt text](./image/zeroshot_ablations.png)\n", + "\n", + "For the latest NeMo Data Curator user guide, please refer to https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/index.html " + ] + }, + { + "cell_type": "markdown", + "id": "be41377f", + "metadata": {}, + "source": [ + "## About this notebook\n", + "\n", + "\n", + "This notebook will use **Thai Wikipedia dataset** as example to demonstrate a typical data curation pipeline using NeMo Curator. After running through this script, user will be able to know how to use NDC to download wikipedia data, perform language separation using fasttext, perform GPU based exact deduplication and fuzzy deduplication and use CPU based heuristic filtering. \n", + "\n", + "Step description:\n", + "1. Download and extract data\n", + "2. Language detection and separation\n", + "3. GPU based deduplication\n", + " 1. Exact deduplication\n", + " 2. Fuzzy deduplication\n", + "4. Heuristic filtering\n", + "\n", + "What is not included:\n", + "1. Customized downloading\n", + "2. Classifier filtering\n", + "3. Downstream-task deduplication\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "8860c239", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "### System Requirements\n", + "Here is the hardware setting for this notebook\n", + "\n", + "**GPU**: NVIDIA A10 24G. 
\n", + "\n", + "**CUDA & Nvidia Drivers**: CUDA 12.2 with Driver 535.154.05\n", + "\n", + "**OS**: ubuntu 22.04\n", + "\n", + "### Getting NeMo FrameWork Training Container\n", + "- Get access to the container via https://developer.nvidia.com/nemo-framework\n", + "- Set your docker credentials \n", + " ```bash\n", + " docker login nvcr.io\n", + "\n", + " Username: $oauthtoken\n", + " Password: \n", + "- Get NeMo NeMo FrameWork Training Container\n", + " ```bash\n", + " docker pull nvcr.io/ea-bignlp/ga-participants/nemofw-training:24.01\n" + ] + }, + { + "cell_type": "markdown", + "id": "ff6bff1b", + "metadata": {}, + "source": [ + "## 0. Env Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24dce020", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", + "Collecting jsonlines\n", + " Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)\n", + "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines) (23.2.0)\n", + "Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", + "Installing collected packages: jsonlines\n", + "Successfully installed jsonlines-4.0.0\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install jsonlines" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6831f331", + "metadata": {}, + "outputs": [], + "source": [ + "import argparse\n", + "\n", + "from nemo_curator.utils.distributed_utils import get_client,get_num_workers\n", + "from nemo_curator.utils.script_utils import add_distributed_args\n", + "from nemo_curator.utils.file_utils import get_all_files_paths_under, separate_by_metadata\n", + "from nemo_curator.utils.distributed_utils import read_data, write_to_disk\n", + "from nemo_curator.gpu_deduplication.utils import (create_logger, parse_nc_args, performance_report_if, enable_spilling)\n", + "from nemo_curator.datasets import DocumentDataset\n", + "\n", + "import os\n", + "import sys\n", + "import pandas as pd\n", + "import time\n", + "import cudf\n", + "import dask_cudf\n", + "import numpy as np\n", + "from dask.distributed import Client, LocalCluster\n", + "import jsonlines" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e28739b3", + "metadata": {}, + "outputs": [], + "source": [ + "def pre_imports():\n", + " import cudf \n", + "\n", + "def load_dataset(input_data_dir, file_type='jsonl'):\n", + " files = list(get_all_files_paths_under(input_data_dir))\n", + " raw_data = read_data(files, file_type=file_type, backend=\"pandas\", add_filename=True)\n", + " dataset = DocumentDataset(raw_data)\n", + "\n", + " return dataset\n", + "\n", + "def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)):\n", + " return 
add_distributed_args(parser)\n", + "\n", + "def check_jsonl_file(file_dir):\n", + " for file in os.listdir(file_dir):\n", + " if 'jsonl' not in file:\n", + " continue\n", + " with open(os.path.join(file_dir,file), 'r', encoding='utf-8') as f:\n", + " first_line = f.readline()\n", + " print(first_line)\n", + " break\n", + "\n", + "def extract_lines_with_id(file_path,target_list):\n", + " with jsonlines.open(file_path) as reader:\n", + " for obj in reader:\n", + " if obj.get('id') in target_list:\n", + " yield obj" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d279329f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/nluo_data/NeMo-Curator/tutorials/single_node_tutorial\n" + ] + } + ], + "source": [ + "cur_dir = os.getcwd()\n", + "print(cur_dir)\n", + "data_dir = f\"{cur_dir}/workspace/\"" + ] + }, + { + "cell_type": "markdown", + "id": "f3f452a3", + "metadata": {}, + "source": [ + "## 1. Download\n", + "In this example, Thai wikipedia data will be downloaded.\n", + "\n", + "Here is what happens when function `download_wikipedia()` is called:\n", + "1. Run `get_wikipedia_urls()` to obtain a list of urls to download .bz2 files for Thai wikipedia data. In this module, we use the base link and the language from user input to formulate a repo links for downloadable wikipedia .bz2 dump files. The formulated link will be `https://dumps.wikimedia.org/wiki`. All the links will be stored in a .txt file. Argument for this function includes:\n", + " - `dump_dates`: A date in the string format of 'YYYYMMDD'. It determines which wikipedia snapshot will be downloaded. If not specified, the `latest` snapshot will be downloaded\n", + " - `language`: language code of the desired language in lower case. Default value is `en`\n", + "\n", + "2. \n", + " Run `download_and_extract()` to download and extract contents based on the url list obtained from `get_wikipedia_urls`. User will need to define `downloader`, `extractor` and `iterator` for the dataset. \n", + " In this case, `WikipediaDownloader`,`WikipediaIterator` and `WikipediaExtractor` are used.\n", + " - `WikipediaDownloader`: Downloads wikipedia dumps file to local folder.\n", + " - `WikipediaIterator`: Extracts the .bz2 files and useful content from the base html content.\n", + " - `WikipediaExtractor`: Performs further task specific html content cleaning such as removing media files, removing references/tables etc. and finally yield pure text data which will be store in .jsonl format. \n", + " Please refer to `./NeMo-Curator/nemo_curator/download/wikipedia.py` for detail implementation.\n", + " \n", + " Argument for this function includes:\n", + " - `output_path`: Output path for downloaded and extracted dataset\n", + " - `output_type`: Type of output file. Default is .jsonl. User might choose other types such as parquet. In this example, .jsonl will be used\n", + " - `language`: See above\n", + " - `dump_date`: See above\n", + " - `raw_download_dir`: Output path for intermediate downloaded .bz2 file. If not specified, will be downloaded to `output_path`\n", + " - `keep_raw_download`: Whether to keep downloaded .bz2 files after extraction. Default is not to keep.\n", + " - `force_download`: Whether to restart downloading process if the target .bz2 files are detected under the `raw_download_dir` \n", + " - `url_limit`: Number of .bz2 files to be downloaded.\n", + "\n", + "The resultant .jsonl for Thai wikipedia will contain the following keys:\n", + "1. text\n", + "2. 
title\n", + "3. id\n", + "4. url\n", + "5. language\n", + "6. source_id\n", + "7. file_name" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1773cda2", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator.download import download_wikipedia" + ] + }, + { + "cell_type": "markdown", + "id": "d711a8f8", + "metadata": {}, + "source": [ + " Start a CPU based Dask cluster. Please modify `n_workers` and `memory_limit` according to your hardware specification. To process TH wikipedia data, it's advised to have `memory_limit` greater than 12GB" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "56ec66e0", + "metadata": {}, + "outputs": [], + "source": [ + "cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", + "client = Client(cluster)" + ] + }, + { + "cell_type": "markdown", + "id": "f794b51c", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a90f3505", + "metadata": {}, + "outputs": [], + "source": [ + "#Output\n", + "download_base_directory= os.path.join(data_dir,\"wiki_downloads\")\n", + "download_output_directory = os.path.join(download_base_directory,\"data\")\n", + "\n", + "#Relevant parameter\n", + "dump_date = \"20240201\"\n", + "language = 'th'\n", + "url_limit = 1" + ] + }, + { + "cell_type": "markdown", + "id": "5628356b", + "metadata": {}, + "source": [ + "Download TH wikipedia data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b591b9f2", + "metadata": {}, + "outputs": [], + "source": [ + "res = download_wikipedia(download_output_directory,\n", + " language=language, \n", + " dump_date=dump_date,\n", + " url_limit=url_limit).df.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "2aae29dd", + "metadata": {}, + "source": [ + "Verify result" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "169fadb9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "downloads thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n", + "162164 /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/wiki_downloads/data/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n" + ] + } + ], + "source": [ + "! ls {download_output_directory}\n", + "! 
wc -l {download_output_directory}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f2bcb168", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"id\":\"1\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\",\"language\":\"th\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\"}\n", + "\n" + ] + } + ], + "source": [ + "check_jsonl_file(download_output_directory)" + ] + }, + { + "cell_type": "markdown", + "id": "44fa2d13", + "metadata": {}, + "source": [ + "**[Optional]**Close the Dask cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "590c489c", + "metadata": {}, + "outputs": [], + "source": [ + "# client.cluster.close()\n", + "# client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "5ba566fc", + "metadata": {}, + "source": [ + "## 2.Language separation and unicode fixing\n", + "\n", + "**Note**: In order to be run on interactive python. Please comment `from.code import *` and the related imports in `./nemo_curator/filters/__init__.py`" + ] + }, + { + "cell_type": "markdown", + "id": "f742b881", + "metadata": {}, + "source": [ + "In this section, we will be using a language classification model by fasttext to separate the TH wikipedia dataset based on the document major languages, and we will also fix the unicode in the documents. Detailed steps are:\n", + "\n", + "1. Download fasttext model for text language detection\n", + "2. Construct a filter which uses the downloaded fasttext model to produce a language label to each document. \n", + "3. Separate each document by the language label. This will create sub-folders for each languages under the output path and the documents under the same language will be output to a .jsonl file in the corresponding sub-folder.\n", + "4. Load .jsonl file in the folder of desirable language. In this example, `TH` folder will be loaded.\n", + "5. Apply `UnicodeReformatter` to the data and output the result in .jsonl format. \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "71a6e4a2", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import ScoreFilter,Modify\n", + "from nemo_curator.filters import FastTextLangId\n", + "from nemo_curator.modifiers import UnicodeReformatter" + ] + }, + { + "cell_type": "markdown", + "id": "4916079c", + "metadata": {}, + "source": [ + "**[Optional]**8Start a cpu based Dask cluster." 
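+    ,
+    "\n",
+    "A minimal sketch for checking whether the client from the download step is still usable before starting a new one (an assumption-laden check: `client` may no longer exist or may already be closed):\n",
+    "\n",
+    "```python\n",
+    "try:\n",
+    "    # get_num_workers() is imported from nemo_curator at the top of this notebook\n",
+    "    print(f'Reusing existing Dask client with {get_num_workers(client)} workers')\n",
+    "except Exception:\n",
+    "    print('No usable client found; start a fresh CPU cluster in the next cell.')\n",
+    "```"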
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "23a63375", + "metadata": {}, + "outputs": [], + "source": [ + "# cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", + "# client = Client(cluster)" + ] + }, + { + "cell_type": "markdown", + "id": "957d7357", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "6270de3f", + "metadata": {}, + "outputs": [], + "source": [ + "# Input path\n", + "multilingual_data_path = download_output_directory\n", + "\n", + "# Output path\n", + "language_base_output_path = os.path.join(data_dir,\"language_sep\")\n", + "language_data_output_path = os.path.join(language_base_output_path,\"data\")\n", + "language_separated_output_path = os.path.join(language_data_output_path,\"language\")\n", + "lang_sep_cleaned_data_output_path = os.path.join(language_data_output_path,\"cleaned\")\n", + "\n", + "# Fasttext model path\n", + "model_path = language_base_output_path\n", + "\n", + "# Define desired language\n", + "target_language = \"TH\"\n", + "\n", + "# Define key in output .jsonl files to store the language information\n", + "language_field = \"language\"" + ] + }, + { + "cell_type": "markdown", + "id": "598cff2d", + "metadata": {}, + "source": [ + "Download fasttext model" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0c7cc007", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-03-22 08:40:55-- https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin\n", + "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.227.74.12, 13.227.74.118, 13.227.74.9, ...\n", + "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.227.74.12|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 131266198 (125M) [application/octet-stream]\n", + "Saving to: ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin’\n", + "\n", + "lid.176.bin 100%[===================>] 125.18M 220MB/s in 0.6s \n", + "\n", + "2024-03-22 08:40:56 (220 MB/s) - ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin’ saved [131266198/131266198]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P {model_path}" + ] + }, + { + "cell_type": "markdown", + "id": "d875771b", + "metadata": {}, + "source": [ + "Apply fasttext model to separate documents by their languages" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c959800c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken for splitting language:147.80864667892456\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "\n", + "# Load dataset \n", + "multilingual_dataset = load_dataset(multilingual_data_path)\n", + "\n", + "#Define Language separation pipeline\n", + "lang_filter = FastTextLangId(os.path.join(model_path,'lid.176.bin'))\n", + "language_id_pipeline = ScoreFilter(lang_filter, score_field=language_field, score_type='object')\n", + "filtered_dataset = language_id_pipeline(multilingual_dataset)\n", + "\n", + "# The language separation pipeline will produce a result looks like ['EN',0.96873], we only want to keep the 'EN' label and drop the detailed classifier score\n", + "filtered_dataset.df[language_field] = filtered_dataset.df[language_field].apply(lambda score: score[1],meta = (language_field, 'object'))\n", + "\n", + "# Split the dataset to corresponding language sub-folders\n", + "language_stats = separate_by_metadata(filtered_dataset.df, language_separated_output_path, metadata_field=language_field).compute()\n", + "\n", + "print(f\"Time taken for splitting language:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "bd54a24a", + "metadata": {}, + "source": [ + "Load `UnicodeReformatter` to reformat any unicode appeared in the desired language dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "0c09bc28", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Writing to disk complete for 1 partitions\n", + "Time taken for fixing unicode:444.5816135406494\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "\n", + "# Read the language specific data and fix the unicode in it\n", + "lang_data_path = os.path.join(language_separated_output_path, target_language)\n", + "lang_data = load_dataset(lang_data_path)\n", + "\n", + "cleaner = Modify(UnicodeReformatter())\n", + "cleaned_data = cleaner(lang_data)\n", + "\n", + "# Write the cleaned_data\n", + "write_to_disk(cleaned_data.df, lang_sep_cleaned_data_output_path, write_to_filename=True, output_type='jsonl')\n", + "\n", + "print(f\"Time taken for fixing unicode:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "00c6e5a1", + "metadata": {}, + "source": [ + "Verify the result. 
We can see that some documents has been removed from TH wikipedia dataset since the number of lines in this output file is less than the original file (no. of lines = 162164)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "b2b34d46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n", + "161748 /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/data/cleaned/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n" + ] + } + ], + "source": [ + "! ls {lang_sep_cleaned_data_output_path}\n", + "! wc -l {lang_sep_cleaned_data_output_path}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl" + ] + }, + { + "cell_type": "markdown", + "id": "39d539a2", + "metadata": {}, + "source": [ + "Furthur verify by loading documents that has been identified as other language, such as 'EN'. We can see from output that the removed document is indeed in English and contains very little or even no Thai." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ace3c5b", + "metadata": {}, + "outputs": [], + "source": [ + "check_jsonl_file(os.path.join(language_separated_output_path,'EN'))" + ] + }, + { + "cell_type": "markdown", + "id": "9b817bf7", + "metadata": {}, + "source": [ + "**[Optional]**Close the Dask cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "id": "bf05b6c2", + "metadata": {}, + "outputs": [], + "source": [ + "# client.cluster.close()\n", + "# client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "cc8b6aef", + "metadata": {}, + "source": [ + "## 3.Add ID\n", + "TH wikipedia data do have `id` field, but the `id` field contains number only. It will be better if we unified the `id` field and transform it to the format of `_`. In this way, when handling multiple dataset, we will able to know which document from which dataset has been removed. This `id` will be useful when we are running deduplication and heuristic filtering. The function we will be using is `AddID()`. Arguments for this function include:\n", + "- `id_field`: fields will be added to input .json file. If the key already exists in the .jsonl, it's value will be replaced.\n", + "- `id_prefix`: prefix used in ID. Default is 'doc-id'\n", + "- `start_index`: starting index in ID. Default is 0" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "fe9e6eef", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import AddId" + ] + }, + { + "cell_type": "markdown", + "id": "232c01a5", + "metadata": {}, + "source": [ + "**[Optional]**If there is no running Dask cluster, start CPU based Dask cluster." 
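+    ,
+    "\n",
+    "With a cluster in place, here is a tiny, self-contained sketch of the ID format that `AddId` (described above) produces. The two-row toy dataframe and the printed values are illustrative assumptions, not taken from this notebook's dataset:\n",
+    "\n",
+    "```python\n",
+    "import pandas as pd\n",
+    "import dask.dataframe as dd\n",
+    "from nemo_curator import AddId\n",
+    "from nemo_curator.datasets import DocumentDataset\n",
+    "\n",
+    "# Two toy documents; any existing 'id' value would be overwritten by AddId\n",
+    "toy = DocumentDataset(dd.from_pandas(pd.DataFrame({'text': ['first doc', 'second doc']}), npartitions=1))\n",
+    "toy_with_id = AddId(id_field='id', id_prefix='TH_wiki', start_index=0)(toy)\n",
+    "print(toy_with_id.df.compute()['id'].tolist())\n",
+    "# IDs follow the prefix-plus-zero-padded-index pattern, e.g. 'TH_wiki-0000000000'\n",
+    "```"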
+ ] + }, + { + "cell_type": "code", + "execution_count": 155, + "id": "f3f483eb", + "metadata": {}, + "outputs": [], + "source": [ + "# cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", + "# client = Client(cluster)" + ] + }, + { + "cell_type": "markdown", + "id": "2be65a51", + "metadata": {}, + "source": [ + "Define relevant parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "054019a5", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "add_id_input_data_dir = lang_sep_cleaned_data_output_path\n", + "\n", + "#Output\n", + "added_id_output_path = os.path.join(data_dir,\"add_id/cleaned\")\n", + "\n", + "#Format of output ID will be _, Define prefix here\n", + "add_ID_id_prefix=\"TH_wiki\"" + ] + }, + { + "cell_type": "markdown", + "id": "80f9591c", + "metadata": {}, + "source": [ + "Adding ID to dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "e8fd7e09", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Writing to disk complete for 1 partitions\n", + "Time taken for add ID:56.01176333427429\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "# Read input files\n", + "dataset = load_dataset(add_id_input_data_dir)\n", + "\n", + "# Run AddID() on the input dataset\n", + "add_id = AddId(id_field='id',id_prefix=add_ID_id_prefix,start_index=0)\n", + "id_dataset = add_id(dataset)\n", + "\n", + "#Output files\n", + "write_to_disk(id_dataset.df, output_file_dir=added_id_output_path, write_to_filename=True, output_type='jsonl')\n", + "\n", + "print(f\"Time taken for add ID:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "50016a50", + "metadata": {}, + "source": [ + "Verify the result. From the output, we can see that the `id` value has been changed to `TH_wiki-0000000000` " + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "27a634e9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\",\"id\":\"TH_wiki-0000000000\",\"language\":\"TH\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\"}\n", + "\n" + ] + } + ], + "source": [ + "check_jsonl_file(added_id_output_path)" + ] + }, + { + "cell_type": "markdown", + "id": "e7084fed", + "metadata": {}, + "source": [ + "Close Dask cluster. 
This cell needs to be run as we are starting a new GPU Dask cluster in the following task" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "16399469", + "metadata": {}, + "outputs": [], + "source": [ + "client.cluster.close()\n", + "client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "cb227709", + "metadata": {}, + "source": [ + "## 4.Exact Dedplication\n", + "\n", + "In exact deduplication, the document text is hashed into unique string using certain hashing algorithm, such as 'md5'. The documents with exact hashed values are having identical text. We will output the `ID` of duplicated documents for removal later. The function used is `ExactDuplicates()`. Arguments for this function include:\n", + "- `id_field`: Key in input file for identifying document ID\n", + "- `text_field`: Key in input file which contains document text.\n", + "- `hash_method`: Hashing algorithm used. Default is `md5`\n", + "- `cache_dir`: If specified, the duplicated document IDs will be output to the `cache_dir`. Otherwise, the IDs will not be saved\n", + "\n", + "Also, we are going to use GPU dask cluster to accelerate computation for deduplication (both exact and fuzzy)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "8fa6c3af", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator.modules import ExactDuplicates" + ] + }, + { + "cell_type": "markdown", + "id": "aa70fd06", + "metadata": {}, + "source": [ + "Start a GPU based Dask cluster. Since GPU based Dask cluster involves setting several arguments, we will use the `get_client()` wrapper function to quickly set up. Please make sure the `device` in `args` is `gpu`" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "7e9530f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Namespace(scheduler_address=None, scheduler_file=None, n_workers=20, threads_per_worker=1, rmm_pool_size=None, protocol='tcp', nvlink_only=False, files_per_partition=2, num_files=-1, device='gpu', set_torch_to_use_rmm=False)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sys.argv=['','--device','gpu']\n", + "parser = argparse.ArgumentParser()\n", + "args = attach_args(parser).parse_args()\n", + "args.set_torch_to_use_rmm = False\n", + "args" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f71ab145", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of dask worker:1\n" + ] + }, + { + "data": { + "text/plain": [ + "{'tcp://127.0.0.1:37795': None}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client = get_client(args, args.device)\n", + "print(f\"Number of dask worker:{get_num_workers(client)}\")\n", + "client.run(pre_imports)" + ] + }, + { + "cell_type": "markdown", + "id": "4ef57149", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "26e6927e", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "exact_dedup_input_dataset_dir = added_id_output_path\n", + "\n", + "#Output\n", + "exact_dedup_base_output_path = os.path.join(data_dir,\"exact_dedup\")\n", + "exact_dedup_log_dir = os.path.join(exact_dedup_base_output_path,'log')\n", + "exact_dedup_output_dir = os.path.join(exact_dedup_base_output_path,'data')\n", + "\n", + "#Parameters for ExactDuplicates()\n", + "exact_dedup_dataset_id_field 
= \"id\"\n", + "exact_dedup_dataset_text_field = \"text\" \n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "b9a75a74", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {exact_dedup_log_dir}\n", + "!mkdir -p {exact_dedup_output_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "a9fc0bd2", + "metadata": {}, + "source": [ + "Apply exact deduplication" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "daf8f324", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Number of exact duplicated file:53\n", + "Time taken for exact duplicate:3.0404415130615234\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "# Read input dataset\n", + "input_dataset = DocumentDataset.read_json(exact_dedup_input_dataset_dir, backend='cudf')\n", + "\n", + "#Run exact deduplication to the input\n", + "exact_dup = ExactDuplicates(\n", + " logger=exact_dedup_log_dir,\n", + " id_field=exact_dedup_dataset_id_field,\n", + " text_field=exact_dedup_dataset_text_field,\n", + " hash_method=\"md5\",\n", + " cache_dir=exact_dedup_output_dir #Duplicated document ID list is output to the cache_dir\n", + ")\n", + "duplicates = exact_dup(dataset=input_dataset)\n", + "\n", + "print(f\"Number of exact duplicated file:{len(duplicates)}\")\n", + "\n", + "print(f\"Time taken for exact duplicate:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "517c60e4", + "metadata": {}, + "source": [ + "Verify the output duplicated ID. We can group by the `_hashes` to get the list of duplicated documents having the same _hashes and use `extract_lines_with_id()` to verify that those documents are indeed exact duplicates. Please note that the `id` might changes, therefore, please replace the `target_list` when necessary" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "2f3c67f8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of exact duplicated document:53\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_hashes
0TH_wiki-00000212111708cb56ec582f78716f0864dca9382d
1TH_wiki-00000212131708cb56ec582f78716f0864dca9382d
2TH_wiki-0000105191e77a248506ef16737288fae5759db33a
3TH_wiki-00001051922e386f5c3af70f43874618988d4842b2
4TH_wiki-00001051932e386f5c3af70f43874618988d4842b2
\n", + "
" + ], + "text/plain": [ + " id _hashes\n", + "0 TH_wiki-0000021211 1708cb56ec582f78716f0864dca9382d\n", + "1 TH_wiki-0000021213 1708cb56ec582f78716f0864dca9382d\n", + "2 TH_wiki-0000105191 e77a248506ef16737288fae5759db33a\n", + "3 TH_wiki-0000105192 2e386f5c3af70f43874618988d4842b2\n", + "4 TH_wiki-0000105193 2e386f5c3af70f43874618988d4842b2" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exact_dedup_res = pd.read_parquet(os.path.join(exact_dedup_output_dir,\"_exact_duplicates.parquet\"))\n", + "print(f\"Number of exact duplicated document:{len(exact_dedup_res)}\")\n", + "exact_dedup_res.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "7ed7d4de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_hashesid
00b908a91cdf0544c1ef3015cff4ee07eTH_wiki-0000157216 TH_wiki-0000066307
115f35c239b6579b4642f7656e64576acTH_wiki-0000098621 TH_wiki-0000074714 TH_wiki-...
21708cb56ec582f78716f0864dca9382dTH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-...
32e386f5c3af70f43874618988d4842b2TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-...
43e6e96a80410d5a191d098f464e66f86TH_wiki-0000122055 TH_wiki-0000116550
\n", + "
" + ], + "text/plain": [ + " _hashes \\\n", + "0 0b908a91cdf0544c1ef3015cff4ee07e \n", + "1 15f35c239b6579b4642f7656e64576ac \n", + "2 1708cb56ec582f78716f0864dca9382d \n", + "3 2e386f5c3af70f43874618988d4842b2 \n", + "4 3e6e96a80410d5a191d098f464e66f86 \n", + "\n", + " id \n", + "0 TH_wiki-0000157216 TH_wiki-0000066307 \n", + "1 TH_wiki-0000098621 TH_wiki-0000074714 TH_wiki-... \n", + "2 TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-... \n", + "3 TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-... \n", + "4 TH_wiki-0000122055 TH_wiki-0000116550 " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exact_dedup_res.groupby('_hashes')['id'].agg(lambda x: ' '.join(x)).reset_index().head()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "3051ed4b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'filename': 'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl', 'id': 'TH_wiki-0000066307', 'language': 'TH', 'source_id': 'thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2', 'text': '\\n\\nแหล่งข้อมูลอื่น \\n\\nสงขลา\\n \\nรายชื่อเกี่ยวกับจังหวัดสงขลา', 'title': 'รายชื่อโบราณสถานในจังหวัดสงขลา', 'url': 'https://th.wikipedia.org/wiki/%E0%B8%A3%E0%B8%B2%E0%B8%A2%E0%B8%8A%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B9%82%E0%B8%9A%E0%B8%A3%E0%B8%B2%E0%B8%93%E0%B8%AA%E0%B8%96%E0%B8%B2%E0%B8%99%E0%B9%83%E0%B8%99%E0%B8%88%E0%B8%B1%E0%B8%87%E0%B8%AB%E0%B8%A7%E0%B8%B1%E0%B8%94%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2'}\n", + "{'filename': 'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl', 'id': 'TH_wiki-0000157216', 'language': 'TH', 'source_id': 'thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2', 'text': '\\n\\nแหล่งข้อมูลอื่น \\n\\nสงขลา\\n \\nรายชื่อเกี่ยวกับจังหวัดสงขลา', 'title': 'รายชื่อโบราณสถานในจังหวัดสงขลา (อำเภอเมืองสงขลาและสิงหนคร)', 'url': 'https://th.wikipedia.org/wiki/%E0%B8%A3%E0%B8%B2%E0%B8%A2%E0%B8%8A%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B9%82%E0%B8%9A%E0%B8%A3%E0%B8%B2%E0%B8%93%E0%B8%AA%E0%B8%96%E0%B8%B2%E0%B8%99%E0%B9%83%E0%B8%99%E0%B8%88%E0%B8%B1%E0%B8%87%E0%B8%AB%E0%B8%A7%E0%B8%B1%E0%B8%94%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2%20%28%E0%B8%AD%E0%B8%B3%E0%B9%80%E0%B8%A0%E0%B8%AD%E0%B9%80%E0%B8%A1%E0%B8%B7%E0%B8%AD%E0%B8%87%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2%E0%B9%81%E0%B8%A5%E0%B8%B0%E0%B8%AA%E0%B8%B4%E0%B8%87%E0%B8%AB%E0%B8%99%E0%B8%84%E0%B8%A3%29'}\n" + ] + } + ], + "source": [ + "target_list = ['TH_wiki-0000157216', 'TH_wiki-0000066307']\n", + "for line in extract_lines_with_id(os.path.join(exact_dedup_input_dataset_dir,'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl'),target_list):\n", + " print(line)" + ] + }, + { + "cell_type": "markdown", + "id": "ec31440b", + "metadata": {}, + "source": [ + "**[Optional]** You might choose to close Dask cluster here" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "2ee05303", + "metadata": {}, + "outputs": [], + "source": [ + "# client.cluster.close()\n", + "# client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "710e8540", + "metadata": {}, + "source": [ + "## 5. Fuzzy Deduplication\n", + "Fuzzy deduplication involves 5 intermediate steps to generate duplicates. Refer to https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/gpudeduplication.html for details\n", + "\n", + "Fuzzy deduplication in this example is a GPU implementation of MinhashLSH algorithm. 
This algorithm measures similarity based on text statistics rather than on semantic meaning. A few concepts are worth introducing before diving into fuzzy deduplication.\n",
+    "1. Jaccard similarity: Jaccard similarity is a common metric for the similarity between two sets. It is calculated by dividing the number of elements shared by the two sets (the intersection) by the number of unique elements across both sets (the union). For text documents, we transform each document into a set of n-grams: if two documents share a large number of n-grams, they are most likely similar. \n",
+    "\n",
+    "    ![alt text](./image/jaccard.png)\n",
+    "\n",
+    "2. Complexity of the problem: To find all similar document pairs in a dataset, we would need to compute pair-wise Jaccard similarity across the whole dataset, making the complexity $O(N^2)$.\n",
+    "\n",
+    "The MinhashLSH algorithm is a technique for quickly estimating the similarity between sets, such as documents represented as sets of shingles (n-grams). It finds Jaccard-similar pairs in the corpus in a much more computationally efficient way. At a high level, the algorithm has the following steps:\n",
+    "1. Compute a minhash signature for each document\n",
+    "2. Run Locality Sensitive Hashing (LSH) on the minhashes, which assigns each document to multiple buckets. Documents that share a bucket are deemed similar.\n",
+    "3. Run pair-wise Jaccard similarity within each bucket to remove the false positives introduced by LSH\n",
+    "4. Based on the Jaccard similarities, build a graph and run a connected components algorithm. Each connected component is a group of similar documents, and the IDs within each group are output for duplicate removal.\n",
+    "For a more detailed explanation, please refer to https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/cpudeduplication.html.\n",
+    "\n",
+    "The GPU implementation of MinhashLSH used here consists of 5 steps:\n",
+    "1. Minhash computation\n",
+    "2. Bucket computation\n",
+    "3. Jaccard shuffle for load balancing in a distributed system\n",
+    "4. Jaccard similarity computation\n",
+    "5. Connected components"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c4b99c5e",
+   "metadata": {},
+   "source": [
+    "**If there is no running Dask cluster, start a GPU Dask cluster here**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "115ff2dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'tcp://127.0.0.1:33223': None}"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# sys.argv=['','--device','gpu']\n",
+    "# parser = argparse.ArgumentParser()\n",
+    "# args = attach_args(parser).parse_args()\n",
+    "# args.set_torch_to_use_rmm = False\n",
+    "\n",
+    "# client = get_client(args, args.device)\n",
+    "# get_num_workers(client)\n",
+    "# client.run(pre_imports)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1979977d",
+   "metadata": {},
+   "source": [
+    "### 5.1 Minhash\n",
+    "\n",
+    "Run `MinHash()` in this section. The output of MinHash is a parquet file containing each document ID and its MinHash signature, an array of 260 32-bit integers. To obtain these signatures we go through the following steps:\n",
+    "1. Generate a set of n-gram components of a document. 
For example, doc = `Nemo Curator is a data curation tool`, a 3-gram set of this document will be `['Nemo Curator is','Curator is a','is a data','a data curation','data curation tool']`\n", + "2. Hashed each n-gram into numerical values\n", + "3. Generate a random hash function $H_1()$ which will hash each numeric n-gram into a 32-bit integer and take the minimum integer to use as minhash value for $H_1()$\n", + "4. Repeat step 2 and 3 with hash function $H_x()$ until desired minhash length is reached. Minhash value of each iteration will be append together to form the final minhash array. \n", + "\n", + "Arguments include:\n", + "- `seed`:Random seed used for initializing the hash functions used to compute the MinHashes. It's advised to keep this value the same for different experiment for reproducibility\n", + "- `num_hashes`:Length of each minhash array. Default is 260. Longer minhash length will have better estimate of actual Jaccard similarity, but require more computational power\n", + "- `char_ngrams`:n-gram length\n", + "- `use_64bit_hash`:Whether to use 64bit or 32bit hash function\n", + "- `id_field`: Key in input file for identifying document ID\n", + "- `text_field`: Key in input file which contains document text.\n", + "- `cache_dir`: If specified, the intermediate result will be output to the `cache_dir`. \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "f9b2a642", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import MinHash" + ] + }, + { + "cell_type": "markdown", + "id": "4c152974", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "117a569d", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "minhash_data_path = added_id_output_path\n", + "#Output\n", + "minshah_base_output_path = os.path.join(data_dir,\"fuzzy/minhash\")\n", + "minshah_log_dir = os.path.join(minshah_base_output_path,'log')\n", + "minshah_output_dir = os.path.join(minshah_base_output_path,'data')\n", + "#Specify dataset name\n", + "dataset_name = 'TH_wikipedia'\n", + "\n", + "#Relevant parameters\n", + "minhash_id_field = 'id'\n", + "minhash_text_field = 'text'\n", + "seed = 10\n", + "minhash_length = 260\n", + "char_ngram = 5\n", + "use_64bit_hash = False\n", + "files_per_partition = 2\n", + "\n", + "!mkdir -p {minshah_log_dir}\n", + "!mkdir -p {minshah_output_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "73c1ad41", + "metadata": {}, + "source": [ + "Run MinHash" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "a17954eb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing minhashes for /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/add_id/cleaned\n", + "Reading 1 files\n", + "Time taken for MinHash:7.543871879577637\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "print(f\"Computing minhashes for {minhash_data_path}\")\n", + "\n", + "# Load data. 
Only the [minhash_id_field, text_field] columns are needed\n", + "files = get_all_files_paths_under(root=minhash_data_path, recurse_subdirectories=False)\n", + "files = [f for f in files if f.endswith(\".jsonl\")]\n", + "df = read_data(\n", + " files,\n", + " file_type=\"jsonl\",\n", + " backend=\"cudf\",\n", + " files_per_partition=files_per_partition,\n", + " add_filename=False,\n", + ")[[minhash_id_field, minhash_text_field]]\n", + "\n", + "# Run MinHash() on input data\n", + "minhasher = MinHash(\n", + " seed=seed,\n", + " num_hashes=minhash_length,\n", + " char_ngrams=char_ngram,\n", + " use_64bit_hash=use_64bit_hash,\n", + " logger=minshah_log_dir,\n", + " id_field=minhash_id_field,\n", + " text_field=minhash_text_field,\n", + " cache_dir=minshah_output_dir\n", + ")\n", + "res = minhasher(DocumentDataset(df)).df\n", + "\n", + "print(f\"Time taken for MinHash:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "19cddba5", + "metadata": {}, + "source": [ + "Verify result" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "df83eec5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_minhash_signature
0TH_wiki-0000000000[11565725, 19782487, 9831980, 5480992, 2306475...
1TH_wiki-0000000001[407876, 107572, 824528, 346831, 216554, 10963...
2TH_wiki-0000000002[727721, 694551, 233868, 346831, 216554, 77001...
3TH_wiki-0000000003[1149282, 931656, 2515604, 1428622, 4964646, 4...
4TH_wiki-0000000004[1559901, 11771639, 487706, 826569, 1203860, 5...
\n", + "
" + ], + "text/plain": [ + " id _minhash_signature\n", + "0 TH_wiki-0000000000 [11565725, 19782487, 9831980, 5480992, 2306475...\n", + "1 TH_wiki-0000000001 [407876, 107572, 824528, 346831, 216554, 10963...\n", + "2 TH_wiki-0000000002 [727721, 694551, 233868, 346831, 216554, 77001...\n", + "3 TH_wiki-0000000003 [1149282, 931656, 2515604, 1428622, 4964646, 4...\n", + "4 TH_wiki-0000000004 [1559901, 11771639, 487706, 826569, 1203860, 5..." + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "minhash_res = pd.read_parquet(os.path.join(minshah_output_dir, \"_minhashes.parquet\"))\n", + "minhash_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "998ab08a", + "metadata": {}, + "source": [ + "### 5.2 LSH\n", + "`LSH()` implements LSH algorithm which includes the following steps:\n", + "1. Divide the minhash array into `X` different portions. \n", + "2. For each portions, hash the minhash values into buckets. One document will be assigned to `X` buckets.\n", + "3. Documents within the same bucket will be deemed similar. Since every document will be assigned `X` buckets and as long as two documents share 1 or more buckets they are deemed similar, the result of LSH will have more false positive as compared to false negative. The false positive cases will be filtered in following modules, namely jaccard compute.\n", + "\n", + "Arguments include:\n", + "- `minhash_length`:Length of minhash signature. Must bu consistent with `MinHash()`\n", + "- `num_buckets`: Number of buckets\n", + "- `buckets_per_shuffle`: Number of buckets to shuffle concurrently\n", + "- `id_field`: Key in input file for identifying document ID\n", + "- `minhash_field`: Key in input file for identifying document MinHash signature \n", + "- `cache_dir`:If specified, the intermediate result will be output to the `cache_dir`.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "138544a5", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import LSH\n", + "from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import \\\n", + " convert_str_id_to_int" + ] + }, + { + "cell_type": "markdown", + "id": "178fd0e4", + "metadata": {}, + "source": [ + "Define parameter" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "21d2a261", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "lsh_input_data_path = minshah_output_dir\n", + "\n", + "#Output\n", + "lsh_base_output_path = os.path.join(data_dir,\"fuzzy/lsh\")\n", + "lsh_log_dir = os.path.join(lsh_base_output_path,'log')\n", + "lsh_output_dir = os.path.join(lsh_base_output_path,'data')\n", + "\n", + "#Relevant parameters\n", + "lsh_id_field = 'id'\n", + "minhash_field = '_minhash_signature'\n", + "minhash_length=260\n", + "num_bands=20\n", + "buckets_per_shuffle=1\n", + "\n", + "!mkdir -p {lsh_log_dir}\n", + "!mkdir -p {lsh_output_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "a18708d2", + "metadata": {}, + "source": [ + "Run LSH" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "9eebeb10", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. 
Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken for LSH:20.533941984176636\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "\n", + "#Load MinHash output\n", + "df = dask_cudf.read_parquet(lsh_input_data_path, blocksize=\"2GB\", aggregate_files=True, backend = \"cudf\")\n", + "df = df.map_partitions(\n", + " convert_str_id_to_int,\n", + " id_column=lsh_id_field,\n", + " meta=cudf.DataFrame(\n", + " {minhash_field: [[1, 2, 3]], \"doc_id\": [1], \"dataset_id\": np.uint32(1)}\n", + " ),\n", + ")\n", + "\n", + "#Run LSH()\n", + "lsh = LSH(\n", + " cache_dir=lsh_output_dir,\n", + " minhash_length=minhash_length,\n", + " num_buckets=num_bands,\n", + " buckets_per_shuffle=buckets_per_shuffle,\n", + " id_fields=[\"dataset_id\", \"doc_id\"],\n", + " minhash_field=minhash_field,\n", + " logger=lsh_log_dir,\n", + ")\n", + "res = lsh(DocumentDataset(df))\n", + "\n", + "t1 = time.time()\n", + "print(f\"Time taken for LSH:{time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "813603e2", + "metadata": {}, + "source": [ + "Verify result" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "c47da6b9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
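To make the banding step described in Section 5.2 concrete, below is a minimal CPU-only sketch of the idea. It is an illustration under assumptions (the `band_buckets` helper and the MD5-based hashing are made up here, not NeMo Curator's GPU implementation): the 260-value signature is split into `num_bands = 20` bands of 13 minhashes, each band is hashed to a bucket key, and any two documents that collide in at least one band become duplicate candidates, which is why LSH favors false positives over false negatives.

```python
# Illustrative banding sketch; helper names and hashing scheme are assumptions,
# not the implementation used by nemo_curator's LSH().
import hashlib
from collections import defaultdict

def band_buckets(signature, num_bands=20):
    """Split a minhash signature into equal bands and hash each band to a bucket key."""
    assert len(signature) % num_bands == 0
    rows_per_band = len(signature) // num_bands  # 260 / 20 = 13 minhashes per band
    keys = []
    for band in range(num_bands):
        chunk = tuple(signature[band * rows_per_band : (band + 1) * rows_per_band])
        keys.append(hashlib.md5(repr((band, chunk)).encode()).hexdigest())
    return keys

# Two documents become candidates if they collide in at least one band.
toy_signatures = {
    "doc_a": list(range(260)),      # identical to doc_b -> collides in every band
    "doc_b": list(range(260)),
    "doc_c": list(range(1, 261)),   # differs in every position -> no collisions
}
bucket_index = defaultdict(set)
for doc_id, sig in toy_signatures.items():
    for key in band_buckets(sig):
        bucket_index[key].add(doc_id)

candidate_pairs = {frozenset(ids) for ids in bucket_index.values() if len(ids) > 1}
print(candidate_pairs)  # {frozenset({'doc_a', 'doc_b'})}
```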
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dataset_iddoc_id_bucket_id
0169236187812469296
1169236187885282385
21692361878156638529
31692361878160566540
41692361878160567540
\n", + "
" + ], + "text/plain": [ + " dataset_id doc_id _bucket_id\n", + "0 1692361878 124692 96\n", + "1 1692361878 85282 385\n", + "2 1692361878 156638 529\n", + "3 1692361878 160566 540\n", + "4 1692361878 160567 540" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lsh_res = pd.read_parquet(os.path.join(lsh_output_dir, \"_buckets.parquet\"))\n", + "lsh_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "07bade4a", + "metadata": {}, + "source": [ + "### 5.3 Jaccard Shuffle\n", + "In this section, we will be using `_MapBucket()` and `_Shuffle()`.\n", + "\n", + "For `_MapBucket()`, it is designed to take input text data in .jsonl format and bucket information which is output of LSH, map the documents to their respective buckets, and write the resulting DataFrame containing the anchor documents and their associated bucket information to a Parquet file.Arguments include:\n", + "- `id_field`: Key in input .jsonl file for identifying document ID\n", + "- `text_field`: Key in input .jsonl file which contains document text.\n", + "- `bucket_field`: Key in input _buckets.parquet which contains `bucket_id`.\n", + "- `num_anchors`: Number of anchors (document in the same buckets) to be output\n", + "\n", + "\n", + "For `_Shuffle()`, it perform a shuffling operation on the documents based on their bucket assignments, output in .parquet format. This shuffling operation is a crucial step in the deduplication process, as it helps distribute similar documents across different partitions or workers, enabling efficient parallel processing and deduplication in subsequent steps. Arguments include:\n", + "- `id_fields`: Columns in `_buckets.parquet` that maps to original `id` in .jsonl data file. In this example, it is `[\"dataset_id\", \"doc_id\"]`\n", + "- `text_field`: Key in input .jsonl file which contains document text.\n", + "- `int_to_str_id`: Key in input .jsonl file for identifying document ID\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "565253ae", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator.utils.fuzzy_dedup_utils.io_utils import (\n", + " get_bucket_ddf_from_parquet_path,\n", + " get_text_ddf_from_json_path_with_blocksize,\n", + ")\n", + "from nemo_curator.modules.fuzzy_dedup import _MapBuckets,_Shuffle" + ] + }, + { + "cell_type": "markdown", + "id": "70387977", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "5cff7d76", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "input_data_paths = [minhash_data_path]\n", + "input_bucket_path = lsh_output_dir\n", + "\n", + "#Output\n", + "jaccard_shuffle_base_output_path = os.path.join(data_dir,\"fuzzy/jaccard_shuffle\")\n", + "output_anchor_docs_with_bk_path = os.path.join(jaccard_shuffle_base_output_path, \"anchor_docs_with_bk.parquet\")\n", + "input_anchor_docs_with_bk_dir = output_anchor_docs_with_bk_path\n", + "output_shuffled_docs_path = os.path.join(jaccard_shuffle_base_output_path, \"shuffled_docs.parquet\")\n", + "\n", + "#Relevant parameter for _MapBucket()\n", + "text_ddf_blocksize = 256\n", + "bucket_mapping_ddf_blocksize = 256\n", + "num_files = None\n", + "shuffle_type ='tasks'\n", + "input_bucket_field = '_bucket_id'\n", + "input_id_field = 'id'\n", + "input_text_field = 'text'\n", + "\n", + "#Relevant parameter for _Shuffle()\n", + "shuffle_id_fields=[\"dataset_id\", \"doc_id\"]\n", + "int_to_str_id='id'\n", + "\n", + "!mkdir -p 
{jaccard_shuffle_base_output_path}" + ] + }, + { + "cell_type": "markdown", + "id": "699a53f1", + "metadata": {}, + "source": [ + "Run Jaccard map bucket" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "0a6e5a84", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of files being read for jaccard calculation = 1\n", + "Number of ddf_bk partitions = 1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken for Bucket Mapping:2.1162023544311523 s\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "num_workers = get_num_workers(client)\n", + "\n", + "# Read .jsonl input data\n", + "ddf_text = get_text_ddf_from_json_path_with_blocksize(\n", + " input_data_paths=input_data_paths,\n", + " num_files=num_files,\n", + " blocksize=text_ddf_blocksize,\n", + " id_column=input_id_field,\n", + " text_column=input_text_field,\n", + ")\n", + "# Read \"_buckets.parquet\"\n", + "ddf_bk = get_bucket_ddf_from_parquet_path(input_bucket_path=input_bucket_path, num_workers=num_workers)\n", + "\n", + "#Run _MapBuckets()\n", + "map_buckets = _MapBuckets(id_fields=shuffle_id_fields, bucket_field=input_bucket_field)\n", + "ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors(documents_df=ddf_text, buckets_df=ddf_bk, shuffle_type=shuffle_type)\n", + "\n", + "#Write to disk\n", + "ddf_anchor_docs_with_bk.to_parquet(output_anchor_docs_with_bk_path, write_index=False)\n", + "\n", + "print(f\"Time taken for Bucket Mapping:{time.time()-t0} s\")" + ] + }, + { + "cell_type": "markdown", + "id": "96246266", + "metadata": {}, + "source": [ + "Verify results " + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "09e65f8b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dataset_iddoc_idanchor_1_dataset_idanchor_1_doc_idanchor_0_dataset_idanchor_0_doc_id_output_partition_id
01692361878138220169236187814525616923618781436720
11692361878505091692361878505091692361878504570
21692361878939891692361878938461692361878938070
31692361878204481692361878200901692361878204440
41692361878939911692361878939271692361878936970
\n", + "
" + ], + "text/plain": [ + " dataset_id doc_id anchor_1_dataset_id anchor_1_doc_id \\\n", + "0 1692361878 138220 1692361878 145256 \n", + "1 1692361878 50509 1692361878 50509 \n", + "2 1692361878 93989 1692361878 93846 \n", + "3 1692361878 20448 1692361878 20090 \n", + "4 1692361878 93991 1692361878 93927 \n", + "\n", + " anchor_0_dataset_id anchor_0_doc_id _output_partition_id \n", + "0 1692361878 143672 0 \n", + "1 1692361878 50457 0 \n", + "2 1692361878 93807 0 \n", + "3 1692361878 20444 0 \n", + "4 1692361878 93697 0 " + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "map_bucket_res = pd.read_parquet(output_anchor_docs_with_bk_path)\n", + "map_bucket_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "35bb1e86", + "metadata": {}, + "source": [ + "**[Optional]**Remove previous Jaccard Shuffle results. Run only when there are files under the Jaccard Shuffle output path" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "da7dcc10", + "metadata": {}, + "outputs": [], + "source": [ + "#!rm -r {output_shuffled_docs_path}" + ] + }, + { + "cell_type": "markdown", + "id": "24c2b39d", + "metadata": {}, + "source": [ + "Run Jaccard Shuffle" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "a9dcf646", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text_text_bytesidanchor_0_idanchor_1_id
0พุทธศักราช 676 ใกล้เคียงกับ\\n เมษายน ค.ศ. 133 ...2631692361878-70321692361878-70321692361878-7052
1พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...2171692361878-90821692361878-88051692361878-9071
2พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...2171692361878-90821692361878-90281692361878-9045
3พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...2171692361878-90821692361878-90721692361878-9082
4ประเทศฮังการี เข้าร่วมแข่งขันกีฬาโอลิมปิกฤดูร้...20391692361878-490911692361878-490931692361878-49087
\n", + "" + ], + "text/plain": [ + " text _text_bytes \\\n", + "0 พุทธศักราช 676 ใกล้เคียงกับ\\n เมษายน ค.ศ. 133 ... 263 \n", + "1 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", + "2 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", + "3 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", + "4 ประเทศฮังการี เข้าร่วมแข่งขันกีฬาโอลิมปิกฤดูร้... 2039 \n", + "\n", + " id anchor_0_id anchor_1_id \n", + "0 1692361878-7032 1692361878-7032 1692361878-7052 \n", + "1 1692361878-9082 1692361878-8805 1692361878-9071 \n", + "2 1692361878-9082 1692361878-9028 1692361878-9045 \n", + "3 1692361878-9082 1692361878-9072 1692361878-9082 \n", + "4 1692361878-49091 1692361878-49093 1692361878-49087 " + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard_shuffle_res = pd.read_parquet(os.path.join(output_shuffled_docs_path,\"_output_partition_id=0/batch_1_1.parquet\"))\n", + "jaccard_shuffle_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ffb70238", + "metadata": {}, + "source": [ + "### 5.4 Jaccard Compute\n", + "We will be using `JaccardSimilarity()`.This is to computes the Jaccard similarity between document pairs. Result is a parquet dataset consisting of document id pair along with their Jaccard similarity score. To compute Jaccard similarity between two documents, we first convert the document into sets of n-grams and then compute the Jaccard similarity of the two sets.\n", + "\n", + "Arguments include:\n", + "- `id_field`: Column in input .parquet file identifying document ID\n", + "- `text_field`: Column in input .parquet file identifying document text\n", + "- `anchor_id_fields`: Column in input .parquet file identifying anchors. This can be generated by specifying number of anchor used in `_MapBucket` whose default value is 2\n", + "- `ngram_width`: n-gram used" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "06346b88", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity" + ] + }, + { + "cell_type": "markdown", + "id": "d71f440f", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "457ae138", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "shuffled_docs_path = output_shuffled_docs_path\n", + "\n", + "#Output\n", + "jaccard_compute_base_output_path = os.path.join(data_dir,\"fuzzy/jaccard_compute\")\n", + "jaccard_compute_output_results_path = os.path.join(jaccard_compute_base_output_path, \"jaccard_similarity_results.parquet\")\n", + "\n", + "#Relevant parameters\n", + "input_id_field = 'id'\n", + "input_text_field = 'text'\n", + "ngram_size = 5\n", + "num_anchors = 2\n", + "\n", + "!mkdir -p {jaccard_compute_base_output_path}" + ] + }, + { + "cell_type": "markdown", + "id": "619bf820", + "metadata": {}, + "source": [ + "Run Jaccard Compute" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "2f094db1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running jaccard compute script\n", + "Time taken for Jaccard Computing: 0.8689384460449219\n" + ] + } + ], + "source": [ + "enable_spilling()\n", + "client.run(enable_spilling)\n", + "\n", + "print(\"Running jaccard compute script\", flush=True)\n", + "t0 = time.time()\n", + "\n", + "jaccard = JaccardSimilarity(\n", + " id_field=input_id_field,\n", + " text_field=input_text_field,\n", + " 
anchor_id_fields=[f\"anchor_{i}_{input_id_field}\" for i in range(num_anchors)],\n", + " ngram_width=ngram_size,\n", + ")\n", + "\n", + "#Load and run Jaccard compute\n", + "result_df = jaccard.jaccard_compute(shuffled_docs_path)\n", + "\n", + "result_df.to_parquet(jaccard_compute_output_results_path, write_index=False, write_metadata_file=False)\n", + "\n", + "print(f\"Time taken for Jaccard Computing: {time.time()-t0}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b31e619c", + "metadata": {}, + "source": [ + "Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected as a pair of similar documents is likely to share numerous same buckets." + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "ae2efe3e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_xid_yjaccard
01692361878-1275211692361878-1275170.755481
11692361878-1275211692361878-1275170.755481
21692361878-459341692361878-459400.922061
31692361878-459341692361878-459400.922061
41692361878-459341692361878-459400.922061
\n", + "
" + ], + "text/plain": [ + " id_x id_y jaccard\n", + "0 1692361878-127521 1692361878-127517 0.755481\n", + "1 1692361878-127521 1692361878-127517 0.755481\n", + "2 1692361878-45934 1692361878-45940 0.922061\n", + "3 1692361878-45934 1692361878-45940 0.922061\n", + "4 1692361878-45934 1692361878-45940 0.922061" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard_compute_res = pd.read_parquet(jaccard_compute_output_results_path)\n", + "jaccard_compute_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "834f1831", + "metadata": {}, + "source": [ + "### 5.5 Connected Components\n", + "This section uses `ConnectedComponents()`.This section takes a dataset consisting of document pairs and their corresponding jaccard similarity to construct a non-directed graph. A edge will be form between documents whose Jaccard similarity is higher than the threshold (0.8 in this example). It will then identify the connected components in this graph. Documents within the same connected components are deemed duplicated\n", + "\n", + "Arguments include:\n", + "- `cache_dir`:Output path for intermediate results\n", + "- `jaccard_pairs_path`:Input path for `jaccard_similarity_results.parquet`\n", + "- `id_column`:prefix of ID column in `jaccard_similarity_results.parquet`\n", + "- `jaccard_threshold`:Threshold to determine if an edge exists between two documents" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "5756fde8", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator.modules.fuzzy_dedup import ConnectedComponents" + ] + }, + { + "cell_type": "markdown", + "id": "217957d6", + "metadata": {}, + "source": [ + "Define parameter" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "72a1952e", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "jaccard_pairs_path = jaccard_compute_output_results_path\n", + "\n", + "#Output\n", + "connected_component_base_output_path = os.path.join(data_dir,\"fuzzy/cc\")\n", + "connected_component_output_path = os.path.join(connected_component_base_output_path, \"connected_components.parquet\")\n", + "connected_component_cache_dir = os.path.join(connected_component_base_output_path, \"cache\")\n", + "\n", + "#Relevant parameter\n", + "input_id_field = 'id'\n", + "jaccard_threshold = 0.8\n", + "\n", + "!mkdir -p {connected_component_base_output_path}" + ] + }, + { + "cell_type": "markdown", + "id": "c53b3a8c", + "metadata": {}, + "source": [ + "Run Connected Component" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "46578e2b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. 
Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "batch_id = 0/1, time = 0.3100006580352783\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", + "\n", + "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# of groups 5465\n", + "# of docs removed 3079\n", + "assert num_nodes:8544==labels_df:8544 passed\n", + "Time taken for Connected Component: 11.238884925842285 s\n" + ] + } + ], + "source": [ + "client.run(enable_spilling)\n", + "\n", + "t0 = time.time()\n", + " \n", + "components_stage = ConnectedComponents(\n", + " cache_dir=connected_component_cache_dir,\n", + " jaccard_pairs_path=jaccard_pairs_path,\n", + " id_column=input_id_field,\n", + " convert_str_ids=True,\n", + " jaccard_threshold=jaccard_threshold,\n", + ")\n", + "\n", + "#Load and run connected component\n", + "components_stage.cc_workflow(output_path=connected_component_output_path)\n", + "print(f\"Time taken for Connected Component: {time.time()-t0} s\")" + ] + }, + { + "cell_type": "markdown", + "id": "6827158e", + "metadata": {}, + "source": [ + "Verify the result of `Connected Components`" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "2bcfc470", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
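To build intuition for what `ConnectedComponents()` produces before inspecting its output below, here is a small CPU-only union-find sketch over a toy set of pairs. The `id_x`/`id_y`/`jaccard` column names mirror the Jaccard compute output above, but the data and the union-find helpers are made up for illustration; the actual module performs this grouping with GPU-accelerated graph routines.

```python
# Toy union-find over Jaccard pairs; an illustration only, not the GPU code path.
import pandas as pd

pairs = pd.DataFrame({
    "id_x": ["d1", "d2", "d4"],
    "id_y": ["d2", "d3", "d5"],
    "jaccard": [0.92, 0.85, 0.30],
})

parent = {}

def find(x):
    parent.setdefault(x, x)
    while parent[x] != x:
        parent[x] = parent[parent[x]]  # path compression
        x = parent[x]
    return x

def union(a, b):
    parent[find(a)] = find(b)

for row in pairs.itertuples(index=False):
    # Only pairs above the Jaccard threshold contribute an edge.
    if row.jaccard >= 0.8:
        union(row.id_x, row.id_y)
    else:
        find(row.id_x), find(row.id_y)  # register the nodes as singletons

groups = {}
for doc in parent:
    groups.setdefault(find(doc), []).append(doc)
print(groups)  # d1, d2, d3 end up in one duplicate group; d4 and d5 stay alone
```

Documents that end up under the same root here correspond to documents sharing one `group` value in the `connected_components.parquet` output.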
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dataset_iddoc_idgroup
016923618781369993837
11692361878853183838
21692361878706701196
31692361878134587138
416923618781361251320
\n", + "
" + ], + "text/plain": [ + " dataset_id doc_id group\n", + "0 1692361878 136999 3837\n", + "1 1692361878 85318 3838\n", + "2 1692361878 70670 1196\n", + "3 1692361878 134587 138\n", + "4 1692361878 136125 1320" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cc_compute_res = pd.read_parquet(connected_component_output_path)\n", + "cc_compute_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "aa1ee07d", + "metadata": {}, + "source": [ + "Let's check if the output fuzzy duplicated documents within the same group are similar. Please note that the `group` id in your output might be different from the notebook output." + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "f1f10a1c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
groupdoc_id
0121134756, 134762, 134748, 134742, 134740, 134750...
1138134587, 134908, 135024, 135029, 135019, 134566...
2323134794, 134780, 134793, 134785, 134798, 134781...
3344136092, 136103, 136090, 136093, 136100, 136089...
442894120, 94084, 94059, 94128, 94130, 94056, 9413...
.........
54608539125651
54618540125971
5462854184926
5463854240115
5464854350282
\n", + "

5465 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " group doc_id\n", + "0 121 134756, 134762, 134748, 134742, 134740, 134750...\n", + "1 138 134587, 134908, 135024, 135029, 135019, 134566...\n", + "2 323 134794, 134780, 134793, 134785, 134798, 134781...\n", + "3 344 136092, 136103, 136090, 136093, 136100, 136089...\n", + "4 428 94120, 94084, 94059, 94128, 94130, 94056, 9413...\n", + "... ... ...\n", + "5460 8539 125651\n", + "5461 8540 125971\n", + "5462 8541 84926\n", + "5463 8542 40115\n", + "5464 8543 50282\n", + "\n", + "[5465 rows x 2 columns]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cc_compute_res['doc_id'] = cc_compute_res['doc_id'].astype(str)\n", + "cc_compute_res.groupby('group')['doc_id'].agg(lambda x: ', '.join(x)).reset_index()" + ] + }, + { + "cell_type": "markdown", + "id": "f621c2cb", + "metadata": {}, + "source": [ + "Change the `group` number if necessary. By running the code below, we can obtain a list of near duplicated documents." + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "bd79a7f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dataset_iddoc_idgroup
141692361878121545735
661692361878121487735
2131692361878121541735
2911692361878121539735
4221692361878121524735
\n", + "
" + ], + "text/plain": [ + " dataset_id doc_id group\n", + "14 1692361878 121545 735\n", + "66 1692361878 121487 735\n", + "213 1692361878 121541 735\n", + "291 1692361878 121539 735\n", + "422 1692361878 121524 735" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cc_compute_res[cc_compute_res['group']==735].head()" + ] + }, + { + "cell_type": "markdown", + "id": "e7c02f4b", + "metadata": {}, + "source": [ + "Print the text of near duplicated document. Please replace the `id` if necessary, `id` should be in the format of `_`" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "dd0b2e33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['ประเทศสวิตเซอร์แลนด์ ได้เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวชนฤดูหนาว ครั้งที่ 3 ค.ศ. 2020 (พ.ศ. 2563) ณ เมืองโลซาน ประเทศสวิตเซอร์แลนด์ ระหว่างวันที่ 9 - 22 มกราคม พ.ศ. 2563 คณะกรรมการโอลิมปิกแห่งชาติสวิตเซอร์แลนด์ได้ส่งทีมนักกีฬาเข้าแข่งขันทั้งหมด 56 คน แบ่งเป็นเป็นชาย 32 คนและหญิง 56 คน เข้าร่วมการแข่งขันใน 15 ชนิดกีฬา\\n\\nจำนวนผู้เข้าแข่งขัน\\n\\nผลการแข่งขัน\\n\\nสเกตลีลา\\n\\nสเกตความเร็ว\\n\\nสเกตความเร็วระยะสั้น\\n\\nฮอกกี้น้ำแข็ง\\n\\nเคอร์ลิง\\n\\nสกีลงเขา\\n\\nสกีข้ามทุ่ง\\n\\nสกีกระโดดไกล\\n\\nสกีนอร์ดิกผสม\\n\\nสกีลีลา\\n\\nสกีปีนเขา\\n\\nสโนว์บอร์ด\\n\\nทวิกีฬาฤดูหนาว\\n\\nบอบสเล\\n\\nสเกเลตัน\\n\\nอ้างอิง\\n\\nแหล่งข้อมูลอื่น \\n เว็บไซต์อย่างเป็นทางการ \\n\\nประเทศสวิตเซอร์แลนด์ในโอลิมปิกเยาวชน\\nประเทศที่เข้าร่วมแข่งขันโอลิมปิกเยาวชนฤดูหนาว 2020',\n", + " 'ประเทศบัลแกเรีย ได้เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวชนฤดูหนาว ครั้งที่ 3 ค.ศ. 2020 (พ.ศ. 2563) ณ เมืองโลซาน ประเทศสวิตเซอร์แลนด์ ระหว่างวันที่ 9 - 22 มกราคม พ.ศ. 2563 คณะกรรมการโอลิมปิกแห่งชาติบัลแกเรียได้ส่งทีมนักกีฬาเข้าแข่งขันทั้งหมด 18 คน แบ่งเป็นเป็นชาย 11 คนและหญิง 7 คน เข้าร่วมการแข่งขันใน 8 ชนิดกีฬา\\n\\nจำนวนผู้เข้าแข่งขัน\\n\\nผลการแข่งขัน\\n\\nสเกตลีลา\\n\\nสเกตความเร็ว\\n\\nสเกตความเร็วระยะสั้น\\n\\nฮอกกี้น้ำแข็ง\\n\\nเคอร์ลิง\\n\\nสกีลงเขา\\n\\nสกีข้ามทุ่ง\\n\\nสกีกระโดดไกล\\n\\nสกีนอร์ดิกผสม\\n\\nสกีลีลา\\n\\nสกีปีนเขา\\n\\nสโนว์บอร์ด\\n\\nทวิกีฬาฤดูหนาว\\n\\nลูช\\n\\nบอบสเล\\n\\nสเกเลตัน\\n\\nอ้างอิง\\n\\nแหล่งข้อมูลอื่น \\n เว็บไซต์อย่างเป็นทางการ \\n\\nประเทศบัลแกเรียในโอลิมปิกเยาวชน\\nประเทศที่เข้าร่วมแข่งขันโอลิมปิกเยาวชนฤดูหนาว 2020'],\n", + " dtype=object)" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard_shuffle_res[jaccard_shuffle_res['id'].isin(['1692361878-121545','1692361878-121487'])]['text'].unique()" + ] + }, + { + "cell_type": "markdown", + "id": "c3f8d12f", + "metadata": {}, + "source": [ + "Below is the English translation of the output above. We can see that the two documents are indeed very similar to each other.\n", + "- `Text 1`:\n", + "```\n", + "Switzerland participated in the 3rd Youth Olympic Winter Games in 2020 (B.E. 2563) in Lausanne, Switzerland from January 9 - 22, 2563. 
The Swiss Olympic Committee sent a total of 56 athletes, consisting of 32 men and 56 women, to compete in 15 sports.\n", + "Number of Competitors:\n", + "Competition Results:\n", + "Figure Skating\n", + "Speed Skating\n", + "Short Track Speed Skating\n", + "Ice Hockey\n", + "Curling\n", + "Alpine Skiing\n", + "Cross-Country Skiing\n", + "Ski Jumping\n", + "Nordic Combined\n", + "Freestyle Skiing\n", + "Ski Mountaineering\n", + "Snowboard\n", + "Biathlon\n", + "Bobsleigh\n", + "Skeleton\n", + "References:\n", + "Other Resources:\n", + "Official Website\n", + "Switzerland at the Youth Olympics\n", + "Countries at the 2020 Youth Winter Olympics\n", + "```\n", + "- `Text 2`:\n", + "```\n", + "Bulgaria participated in the 3rd Youth Olympic Winter Games in 2020 (B.E. 2563) in Lausanne, Switzerland from January 9 - 22, 2563. The Bulgarian Olympic Committee sent a total of 18 athletes, consisting of 11 men and 7 women, to compete in 8 sports.\n", + "Number of Competitors:\n", + "Competition Results:\n", + "Figure Skating\n", + "Speed Skating\n", + "Short Track Speed Skating\n", + "Ice Hockey\n", + "Curling\n", + "Alpine Skiing\n", + "Cross-Country Skiing\n", + "Ski Jumping\n", + "Nordic Combined\n", + "Freestyle Skiing\n", + "Ski Mountaineering\n", + "Snowboard\n", + "Biathlon\n", + "Luge\n", + "Bobsleigh\n", + "Skeleton\n", + "References:\n", + "Other Resources:\n", + "Official Website\n", + "Bulgaria at the Youth Olympics\n", + "Countries at the 2020 Youth Winter Olympics\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "70ca66df", + "metadata": {}, + "source": [ + "## 6. Remove duplicates\n", + "\n", + "Now we have duplicated document IDs output by both exact deduplication and fuzzy deduplication. We will run this section to remove those documents. This is done be loading the output .parquet files and the unicode fixed input dataset in .jsonl as DataFrame. Then use DataFrame operation to remove the duplicated documents." + ] + }, + { + "cell_type": "markdown", + "id": "93d031ec", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "911be9d9", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "dataset_dir = added_id_output_path\n", + "\n", + "#Output\n", + "dudped_output_dir = os.path.join(data_dir,\"remove_duplicate/result.parquet\")\n", + "\n", + "#Relevant parameter\n", + "input_id_field = 'id'\n", + "id_prefix = add_ID_id_prefix\n", + "\n", + "!mkdir -p {dudped_output_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "969f6543", + "metadata": {}, + "source": [ + "We will first process the result of exact deduplication. Since result of exact deduplication contains original ID used in input dataset, it is more straightforward to deal with." 
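As a toy, CPU-only illustration of the DataFrame operations described above: the column names `id` and `_hashes` match the `_exact_duplicates.parquet` output, but the values are invented, and the following cells perform the same steps with Dask on the tutorial data.

```python
# Miniature version of the removal logic; values are invented for illustration.
import pandas as pd

docs = pd.DataFrame({"id": ["TH_wiki-0000000000", "TH_wiki-0000000001", "TH_wiki-0000000002"],
                     "text": ["same text", "same text", "unique text"]})
exact_duplicates = pd.DataFrame({"id": ["TH_wiki-0000000000", "TH_wiki-0000000001"],
                                 "_hashes": ["abc123", "abc123"]})

# Keep the first document of every hash group and mark the rest for removal.
docs_to_remove = exact_duplicates[exact_duplicates["_hashes"].duplicated(keep="first")]

deduped = docs[~docs["id"].isin(docs_to_remove["id"])]
print(deduped["id"].tolist())  # ['TH_wiki-0000000000', 'TH_wiki-0000000002']
```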
+ ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "bbbfdbb3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Reading 1 files\n" + ] + } + ], + "source": [ + "#Load .jsonl dataset\n", + "input_dataset = DocumentDataset.read_json(dataset_dir, backend='cudf')\n", + "\n", + "#Load exact deduplicate result and extract list of duplicated document ID\n", + "exact_duplicates = DocumentDataset.read_parquet(os.path.join(exact_dedup_output_dir,\"_exact_duplicates.parquet\"), backend='cudf')\n", + "exact_docs_to_remove = exact_duplicates.df.map_partitions(\n", + " lambda x: x[x._hashes.duplicated(keep=\"first\")]\n", + ")\n", + "\n", + "#Remove the duplicated document from input dataset\n", + "result = input_dataset.df[\n", + " ~input_dataset.df[input_id_field].isin(exact_docs_to_remove[input_id_field].compute())\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "8b97567d", + "metadata": {}, + "source": [ + "For result of fuzzy deduplication, we need to first reconstructed document ID by combining `dataset_id` and `doc_id`, then use the reconstructed `ID` for removal" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "513cf7a0", + "metadata": {}, + "outputs": [], + "source": [ + "#List of id_prefix used in Add ID\n", + "base_ids = [id_prefix]\n", + "\n", + "#Obtain a mapping between `dataset_id` and `id_prefix`\n", + "df = cudf.DataFrame()\n", + "df['base_id'] = [base_id for base_id in base_ids]\n", + "df['dataset_id'] = df['base_id'].hash_values()\n", + "df_pd = df.to_pandas()\n", + "mapping = {\n", + " hashed_id: base_id\n", + " for base_id, hashed_id in zip(df_pd['base_id'], df_pd['dataset_id'])\n", + "}\n", + "\n", + "#Load result of fuzzy deduplication\n", + "fuzzy_duplicates = pd.read_parquet(connected_component_output_path)\n", + "#Reconstruct the original document ID\n", + "fuzzy_duplicates['id']=fuzzy_duplicates.apply(lambda x: f\"{mapping[x['dataset_id']]}-{x['doc_id']:010d}\", axis=1)\n", + "#Generate list of near duplicate document ID\n", + "fuzzy_docs_to_remove = fuzzy_duplicates.drop_duplicates(subset=['group'], keep='first')" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "dc7d647c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing to disk complete for 1 partitions\n" + ] + } + ], + "source": [ + "#Remove near duplicates\n", + "result = result[~result[input_id_field].isin(fuzzy_docs_to_remove[input_id_field])]\n", + "\n", + "#Save final result to local\n", + "write_to_disk(result, dudped_output_dir, output_type=\"parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "b47a967f", + "metadata": {}, + "source": [ + "Verify the result of duplicate removal. 
We can see that the number of documents in the resulting dataset is smaller than in the original dataset (length = 161748)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "id": "5e8097b1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Length of duplicate removed dataset:156257\n"
+ ]
+ }
+ ],
+ "source": [
+ "res = pd.read_parquet(dudped_output_dir)\n",
+ "print(f\"Length of duplicate removed dataset:{len(res)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "85caf66f",
+ "metadata": {},
+ "source": [
+ "Close the GPU Dask Cluster"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "cd91f5fe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "client.cluster.close()\n",
+ "client.shutdown()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1c6cee97",
+ "metadata": {},
+ "source": [
+ "## 7. Heuristic Filtering\n",
+ "\n",
+ "In this section, we will apply multiple heuristic filters to the dataset and record, for each filter, the heuristic scores of the documents as well as the documents it removes. Each heuristic filter calculates a quality score based on user-defined heuristics/algorithms and classifies a document as high quality or low quality by comparing the score against a user-defined threshold.\n",
+ "\n",
+ "Sample lists of heuristic filters can be found in `./config/`:\n",
+ "- `heuristic_filter_en.yaml`: Sample heuristic filter list for English datasets\n",
+ "- `heuristic_filter_non-en.yaml`: Sample heuristic filter list for non-English datasets\n",
+ "- `heuristic_filter_code.yaml`: Sample heuristic filter list for code datasets\n",
+ "Please adjust the sample lists (e.g. remove/add filters or change filter thresholds) based on your own use case. In this example, `heuristic_filter_non-en.yaml` will be used.\n",
+ "\n",
+ "For the detailed implementation and description of each heuristic filter, please refer to `./NeMo-Curator/nemo_curator/filters/heuristic_filter.py`. To implement customized heuristic filters, users should follow the sample implementations, write their own filters, and update the .yaml files accordingly.\n",
+ "\n",
+ "To analyze the impact of each filter on the dataset, users should set `log_score: True` for the filters in the corresponding config .yaml file. This will output the quality scores to a separate .txt file for each individual filter. With the quality scores and filter thresholds, users can compute quality score distributions and other statistics to assess the effectiveness of each filter.\n",
+ "\n",
+ "In this example, in order to get a comprehensive output for each filter, we iterate through every filter using a for loop and save the intermediate results. This process involves extensive I/O operations and is less efficient. Alternatively, after loading the input dataset and the filter pipeline, users can simply call `filter_pipeline(dataset)` to obtain the final filtered result."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "id": "1ddff58c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nemo_curator.utils.config_utils import build_filter_pipeline\n",
+ "from nemo_curator import Score, Filter, ScoreFilter\n",
+ "from nemo_curator.utils.file_utils import get_batched_files,expand_outdir_and_mkdir"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a728a161",
+ "metadata": {},
+ "source": [
+ "**[Optional]** The following cell suppresses warnings from Dask."
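Tying back to the customization note above, a custom filter is, in rough terms, a class with a scoring method and a keep/discard decision. The sketch below is a hedged example only: the `DocumentFilter` base class and the `keep_document(self, score)` signature follow the filters shipped under `nemo_curator/filters/`, while the digit-ratio heuristic, the class name, and the threshold are invented for illustration.

```python
# Hypothetical custom heuristic filter, modeled on the bundled filters.
from nemo_curator.filters.doc_filter import DocumentFilter


class DigitRatioFilter(DocumentFilter):
    """Flag documents whose character stream is dominated by digits."""

    def __init__(self, max_digit_ratio=0.3):
        super().__init__()
        self._max_digit_ratio = max_digit_ratio
        self._name = "digit_ratio"

    def score_document(self, text):
        if not text:
            return 1.0  # treat empty documents as low quality
        return sum(ch.isdigit() for ch in text) / len(text)

    def keep_document(self, score):
        return score <= self._max_digit_ratio
```

Registered under its module path in a config .yaml (with `log_score: True` and `params`, mirroring the entries in `heuristic_filter_non-en.yaml`), such a filter would then be picked up by `build_filter_pipeline` like any bundled filter.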
+ ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "e5114945", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "# Disable the metadata warning\n", + "warnings.filterwarnings(\"ignore\",module=\"dask.dataframe.core\")" + ] + }, + { + "cell_type": "markdown", + "id": "6243a7cb", + "metadata": {}, + "source": [ + "Create a CPU Dask Cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "fa752ded", + "metadata": {}, + "outputs": [], + "source": [ + "cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", + "client = Client(cluster)" + ] + }, + { + "cell_type": "markdown", + "id": "c3dda877", + "metadata": {}, + "source": [ + "Define some helper functions" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "a8abf841", + "metadata": {}, + "outputs": [], + "source": [ + "def get_dataframe_complement(original_df, filtered_df):\n", + " def partition_complement(part_original_df, partition_info=None):\n", + " if not partition_info:\n", + " return part_original_df\n", + " part_filtered_df = filtered_df.get_partition(partition_info[\"number\"])\n", + " complement_mask = ~part_original_df.index.isin(part_filtered_df.index.persist())\n", + " complement_df = part_original_df[complement_mask]\n", + " return complement_df\n", + "\n", + " return original_df.map_partitions(partition_complement)\n", + "\n", + "def write_scores(df, output_dir):\n", + " for column in df.columns:\n", + " output_path = os.path.join(output_dir, f\"{column}.txt\")\n", + " df[column].to_csv(output_path, single_file=True, encoding=\"utf-8\", header=False, index=False, mode=\"a\")\n", + "\n", + "def get_score_fields(pipeline):\n", + " score_fields = []\n", + " for nc_module in pipeline.modules:\n", + " if isinstance(nc_module, Score) or isinstance(nc_module, ScoreFilter):\n", + " if nc_module.score_field:\n", + " score_fields.append(nc_module.score_field)\n", + " return score_fields" + ] + }, + { + "cell_type": "markdown", + "id": "04e6b0f8", + "metadata": {}, + "source": [ + "Define parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "55e43a6c", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "HF_input_data_dir = dudped_output_dir\n", + "input_file_type = 'parquet'\n", + "batch_size = 1\n", + "\n", + "#Output\n", + "HF_base_output_path = os.path.join(data_dir,'heuristic_filtering')\n", + "kept_document_dir = os.path.join(HF_base_output_path,'data','hq.parquet')\n", + "removed_document_dir = os.path.join(HF_base_output_path,'data','lq.parquet')\n", + "output_document_score_dir = os.path.join(HF_base_output_path,'data','score')\n", + "output_file_type = 'parquet'\n", + "\n", + "#Relevant parameters\n", + "filter_config_file = './config/heuristic_filter_non-en.yaml'\n", + "input_id_field = 'id'\n", + "\n", + "#Set to False if do not want to save intermediate results\n", + "is_cache = True\n", + "\n", + "!mkdir -p {kept_document_dir}\n", + "!mkdir -p {removed_document_dir}\n", + "!mkdir -p {output_document_score_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "4c5f6c8e", + "metadata": {}, + "source": [ + "Run heuristic filtering" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "f6f50332", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Saving data for symbol_to_word\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for numbers_ratio\n", + "Writing to disk 
complete for 1 partitions\n", + "Saving data for urls_ratio\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for word_count\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeating_top_2grams\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeating_top_3grams\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeating_top_4grams\n", + "Writing to disk complete for 1 partitions\n", + "Writing to disk complete for 1 partitions\n", + "Time taken for Heuristic filtering: 729.7436628341675 s\n" + ] + } + ], + "source": [ + "t0 = time.time()\n", + "\n", + "#Load filters from config\n", + "filter_pipeline = build_filter_pipeline(filter_config_file)\n", + "score_fields = get_score_fields(filter_pipeline)\n", + "\n", + "# Load dataset\n", + "dataset = load_dataset(HF_input_data_dir,file_type='parquet')\n", + "\n", + "\n", + "# Iterate through filters. For each filter, the low quality document will be removed from the dataset and output to corresponding folder for analysis\n", + "# Output of previous filter will be input of the next filter\n", + "if is_cache:\n", + " curr_dataset = prev_dataset = dataset\n", + " for filter_module in filter_pipeline.modules:\n", + " #Apply filter\n", + " curr_dataset = filter_module(curr_dataset).persist()\n", + "\n", + " #Output filtered document\n", + " print(f\"Saving data for {filter_module.filter_obj._name}\")\n", + " removed_df = get_dataframe_complement(prev_dataset.df, curr_dataset.df)\n", + " removed_filter_dir = os.path.join(removed_document_dir, filter_module.filter_obj._name)\n", + " expand_outdir_and_mkdir(removed_filter_dir)\n", + " write_to_disk(removed_df, removed_filter_dir, write_to_filename=True, output_type=output_file_type)\n", + " prev_dataset = curr_dataset\n", + " filtered_dataset = curr_dataset\n", + "else:\n", + " filtered_dataset = filter_pipeline(dataset)\n", + "\n", + "# Write scores of retained doucment to separate directory\n", + "output_df = filtered_dataset.df[[input_id_field, *score_fields]]\n", + "write_scores(output_df, output_document_score_dir)\n", + "\n", + "# Remove scores from dataset df\n", + "filtered_dataset = DocumentDataset(filtered_dataset.df.drop(columns=score_fields))\n", + "\n", + "# Output filtered dataset\n", + "write_to_disk(filtered_dataset.df, kept_document_dir, write_to_filename=True, output_type=output_file_type)\n", + "\n", + "print(f\"Time taken for Heuristic filtering: {time.time()-t0} s\")" + ] + }, + { + "cell_type": "markdown", + "id": "b19731f5", + "metadata": {}, + "source": [ + "Verify the result." 
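In addition to verifying the kept documents below, the logged scores can be inspected to sanity-check thresholds. A rough sketch, assuming the score files were written as above and that `word_count` is one of the logged score fields (adjust the file name to whatever columns appear in the score directory):

```python
# Exploratory sketch for score analysis; the exact score file names depend on
# the score fields written above, so treat "word_count.txt" as an assumption.
import os
import pandas as pd

score_file = os.path.join(output_document_score_dir, "word_count.txt")
scores = pd.read_csv(score_file, header=None, names=["word_count"])

print(scores["word_count"].describe())      # distribution of the logged scores
print((scores["word_count"] < 100).mean())  # share a stricter 100-word minimum would drop
```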
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f945362", + "metadata": {}, + "outputs": [], + "source": [ + "res = pd.read_parquet(kept_document_dir)\n", + "print(f\"Dataset size after heuristic filtering:{len(res)}\")\n", + "res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "cb52fe04", + "metadata": {}, + "source": [ + "Close the CPU Dask Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "aaa9823a", + "metadata": {}, + "outputs": [], + "source": [ + "client.cluster.close()\n", + "client.shutdown()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94f6e74e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 462a1a39bf59435afa99de171cf56ea21ebba56a Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 19 Apr 2024 16:22:18 -0700 Subject: [PATCH 02/34] Fix metadata inference with pandas and dask (#35) * Fix metadata inference with pandas and dask Signed-off-by: Ryan Wolf * Fix datatypes for task decontamination Signed-off-by: Ryan Wolf * Use targetted import Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- nemo_curator/modules/filter.py | 9 ++++++++- nemo_curator/modules/task.py | 12 +++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/nemo_curator/modules/filter.py b/nemo_curator/modules/filter.py index 07f8cb634..7053f26fe 100644 --- a/nemo_curator/modules/filter.py +++ b/nemo_curator/modules/filter.py @@ -11,12 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import pandas as pd +from dask.dataframe.extensions import make_array_nonempty from dask.typing import no_default from nemo_curator.datasets import DocumentDataset from nemo_curator.utils.module_utils import is_batched +# Override so that pd.NA is not passed during the metadata inference +make_array_nonempty.register( + pd.StringDtype, + lambda x: pd.array(["a", "b"], dtype=x), +) + class Score: def __init__(self, score_fn, score_field, text_field="text", score_type=None): diff --git a/nemo_curator/modules/task.py b/nemo_curator/modules/task.py index a7d9ae722..2571b6a8c 100644 --- a/nemo_curator/modules/task.py +++ b/nemo_curator/modules/task.py @@ -302,6 +302,8 @@ def _threshold_ngram_count(self, matched_ngrams: dict) -> set: return filtered_ngrams def _remove_ngrams_partition(self, partition, task_ngrams, ngrams_freq_sorted): + text_type = partition[self.text_field].dtype + document_fn = partial( self._remove_ngrams, task_ngrams=task_ngrams, @@ -318,7 +320,15 @@ def _remove_ngrams_partition(self, partition, task_ngrams, ngrams_freq_sorted): partition[self.text_field] = split_text filtered_partition = partition[valid_documents_mask] - return filtered_partition.explode(self.text_field, ignore_index=True) + exploded_partition = filtered_partition.explode( + self.text_field, ignore_index=True + ) + # After exploding, the string datatype can become an "object" type + exploded_partition[self.text_field] = exploded_partition[ + self.text_field + ].astype(text_type) + + return exploded_partition def _remove_ngrams(self, document, task_ngrams, ngrams_freq_sorted): """ From f2970765be4c0a68d11ea126f357668f7096543f Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 22 Apr 2024 13:26:37 -0700 Subject: [PATCH 03/34] Disable PyTorch Compile Multiprocessing (#34) * Move tokenizer import Signed-off-by: Ryan Wolf * Reduce inductor threads Signed-off-by: Ryan Wolf * Change env int to string Signed-off-by: Ryan Wolf * Change location of env var Signed-off-by: Ryan Wolf * Add comment linking issue Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- nemo_curator/filters/code.py | 3 ++- nemo_curator/modules/__init__.py | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/nemo_curator/filters/code.py b/nemo_curator/filters/code.py index 9a209ec47..56e4ea69a 100644 --- a/nemo_curator/filters/code.py +++ b/nemo_curator/filters/code.py @@ -18,7 +18,6 @@ import numpy as np from bs4 import BeautifulSoup from comment_parser import comment_parser -from nemo.collections.common.tokenizers import SentencePieceTokenizer from nemo_curator.filters.doc_filter import DocumentFilter, import_filter from nemo_curator.utils.constants import regex_alpha, regex_alphanum @@ -104,6 +103,8 @@ def keep_document(self, score): class TokenizerFertilityFilter(DocumentFilter): def __init__(self, path_to_tokenizer=None, min_char_to_token_ratio=2.5): + from nemo.collections.common.tokenizers import SentencePieceTokenizer + if path_to_tokenizer is None: raise ValueError( "Must provide a valid path to a SentencePiece " "tokenizer" diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index d845441f3..d7c099803 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os + +# Disables multiprocessing in torch.compile calls. 
+# Without this, Dasks multiprocessing combined with PyTorch's +# gives errors like "daemonic processes are not allowed to have children" +# See https://github.com/NVIDIA/NeMo-Curator/issues/31 +os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" from .add_id import AddId from .exact_dedup import ExactDuplicates From dbe76060c5b0b77880c8131112c001f35d7d32c1 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 23 Apr 2024 11:34:56 -0700 Subject: [PATCH 04/34] Improve speed of AddId module (#36) * Add fast id method Signed-off-by: Ryan Wolf * Add type conversion Signed-off-by: Ryan Wolf * Fix off by one errors in tests Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- nemo_curator/modules/add_id.py | 45 ++++++++++++++++++++++++--- nemo_curator/scripts/add_id.py | 6 ++-- nemo_curator/utils/module_utils.py | 5 +++ tests/test_add_id.py | 50 ++++++++++++++++++++++++++---- 4 files changed, 94 insertions(+), 12 deletions(-) diff --git a/nemo_curator/modules/add_id.py b/nemo_curator/modules/add_id.py index e8f30739b..83da7bd25 100644 --- a/nemo_curator/modules/add_id.py +++ b/nemo_curator/modules/add_id.py @@ -12,22 +12,58 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + import dask.dataframe as dd import numpy as np from dask import delayed from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.module_utils import count_digits class AddId: - def __init__(self, id_field, id_prefix="doc_id", start_index=0) -> None: + def __init__( + self, id_field, id_prefix: str = "doc_id", start_index: Optional[int] = None + ) -> None: self.id_field = id_field self.id_prefix = id_prefix self.start_index = start_index def __call__(self, dataset: DocumentDataset) -> DocumentDataset: + if self.start_index is None: + return self._add_id_fast(dataset) + else: + return self._add_id_ordered(dataset) + + def _add_id_fast(self, dataset: DocumentDataset) -> DocumentDataset: + meta = dataset.df.dtypes.to_dict() + meta[self.id_field] = "string" + + partition_zero_padding = count_digits(dataset.df.npartitions) + id_df = dataset.df.map_partitions( + self._add_id_fast_partition, + partition_zero_padding, + meta=meta, + ) + + return DocumentDataset(id_df) + + def _add_id_fast_partition(self, partition, global_padding, partition_info=None): + local_padding = count_digits(len(partition)) + global_id = partition_info["number"] + + id_column = [ + f"{self.id_prefix}-{local_id:0{local_padding}d}{global_id:0{global_padding}d}" + for local_id in range(len(partition)) + ] + partition[self.id_field] = id_column + + return partition + + def _add_id_ordered(self, dataset: DocumentDataset) -> DocumentDataset: original_meta = dataset.df.dtypes.to_dict() - original_meta[self.id_field] = "object" + original_meta[self.id_field] = "string" delayed_dataset = dataset.df.to_delayed() parition_lengths = [0] @@ -38,7 +74,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset: delayed_id_dataset = [] for i, partition in enumerate(delayed_dataset): delayed_id_dataset.append( - delayed(self._add_id_to_partition)(partition, lower_id_bounds[i]) + delayed(self._add_id_ordered_partition)(partition, lower_id_bounds[i]) ) id_dataset = DocumentDataset( @@ -47,11 +83,12 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset: return id_dataset - def _add_id_to_partition(self, partition, partition_start_id): + def _add_id_ordered_partition(self, partition, partition_start_id): id_column = [ 
f"{self.id_prefix}-{int(i + self.start_index):010d}" for i in range(partition_start_id, len(partition) + partition_start_id) ] partition[self.id_field] = id_column + partition[self.id_field] = partition[self.id_field].astype("string") return partition diff --git a/nemo_curator/scripts/add_id.py b/nemo_curator/scripts/add_id.py index 4e49663aa..3e91e8062 100644 --- a/nemo_curator/scripts/add_id.py +++ b/nemo_curator/scripts/add_id.py @@ -79,8 +79,10 @@ def attach_args( parser.add_argument( "--starting-index", type=int, - default=0, - help="Starting index from which to start indexing the documents", + default=None, + help="If supplied, determines the starting index from which to start " + "indexing the documents. By default, it is unspecified, and uses an id" + " scheme that is fast to calculate and is not guaranteed to be ordered.", ) parser.add_argument( "--output-data-dir", diff --git a/nemo_curator/utils/module_utils.py b/nemo_curator/utils/module_utils.py index dc4a693d2..388a949f6 100644 --- a/nemo_curator/utils/module_utils.py +++ b/nemo_curator/utils/module_utils.py @@ -11,7 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import math def is_batched(function): return hasattr(function, "batched") and function.batched + + +def count_digits(num): + return math.floor(math.log10(num)) + 1 diff --git a/tests/test_add_id.py b/tests/test_add_id.py index 458b4868d..42a8575e5 100644 --- a/tests/test_add_id.py +++ b/tests/test_add_id.py @@ -16,7 +16,7 @@ import pandas as pd import pytest -import nemo_curator +import nemo_curator as nc from nemo_curator.datasets import DocumentDataset @@ -41,10 +41,10 @@ def two_partition_dataset(): ) -class TestPrepareTaskData: +class TestAddId: def test_basic_id(self, single_partition_dataset): id_field = "id" - add_id = nemo_curator.AddId(id_field) + add_id = nc.AddId(id_field, start_index=0) id_dataset = add_id(single_partition_dataset) actual_ids = id_dataset.df[id_field].compute() expected_ids = pd.Series( @@ -63,7 +63,7 @@ def test_basic_id(self, single_partition_dataset): def test_two_partitions(self, two_partition_dataset): id_field = "id" - add_id = nemo_curator.AddId(id_field) + add_id = nc.AddId(id_field, start_index=0) id_dataset = add_id(two_partition_dataset) actual_ids = id_dataset.df[id_field].compute() expected_ids = pd.Series( @@ -83,7 +83,7 @@ def test_two_partitions(self, two_partition_dataset): def test_id_prefix(self, two_partition_dataset): id_field = "id" id_prefix = "my_id" - add_id = nemo_curator.AddId(id_field, id_prefix=id_prefix) + add_id = nc.AddId(id_field, id_prefix=id_prefix, start_index=0) id_dataset = add_id(two_partition_dataset) actual_ids = id_dataset.df[id_field].compute() expected_ids = pd.Series( @@ -103,7 +103,7 @@ def test_id_prefix(self, two_partition_dataset): def test_start_index(self, two_partition_dataset): id_field = "id" start_index = 13 - add_id = nemo_curator.AddId(id_field, start_index=start_index) + add_id = nc.AddId(id_field, start_index=start_index) id_dataset = add_id(two_partition_dataset) actual_ids = id_dataset.df[id_field].compute() expected_ids = pd.Series( @@ -119,3 +119,41 @@ def test_start_index(self, two_partition_dataset): assert all( expected_ids == actual_ids ), f"Expected: {expected_ids}, got: {actual_ids}" + + def test_fast_id_single_partition(self, single_partition_dataset): + id_field = "id" + add_id = nc.AddId(id_field) + id_dataset = 
add_id(single_partition_dataset) + actual_ids = id_dataset.df[id_field].compute() + expected_ids = pd.Series( + [ + "doc_id-00", + "doc_id-10", + "doc_id-20", + "doc_id-30", + "doc_id-40", + ] + ) + + assert all( + expected_ids == actual_ids + ), f"Expected: {expected_ids}, got: {actual_ids}" + + def test_fast_id_two_partitions(self, two_partition_dataset): + id_field = "id" + add_id = nc.AddId(id_field) + id_dataset = add_id(two_partition_dataset) + actual_ids = id_dataset.df[id_field].compute() + expected_ids = pd.Series( + [ + "doc_id-00", + "doc_id-10", + "doc_id-20", + "doc_id-01", + "doc_id-11", + ] + ) + + assert all( + expected_ids == actual_ids + ), f"Expected: {expected_ids}, got: {actual_ids}" From 417e874bc42a32f80f77c58d8e792e93c7ef49f5 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Tue, 23 Apr 2024 13:40:00 -0700 Subject: [PATCH 05/34] Make GPU dependencies optional (#27) * Move GPU imports and make them optional Signed-off-by: Ayush Dattagupta * Move gpu dependencies to a seperate install Signed-off-by: Ayush Dattagupta * Remove unused import Signed-off-by: Ayush Dattagupta * Switch to placeholder import that raises on usage Signed-off-by: Ayush Dattagupta * Remove deprecated utils usage Signed-off-by: Ayush Dattagupta * Add cuML attribution Signed-off-by: Ayush Dattagupta * Safe import tests, improve install instruction, update gha workflow Signed-off-by: Ayush Dattagupta * Fix pytests due to loc bug Signed-off-by: Ayush Dattagupta * update install instructions Signed-off-by: Ayush Dattagupta * Raise on non module-not-found errors, update logging Signed-off-by: Ayush Dattagupta * Update logging to not change root logger Signed-off-by: Ayush Dattagupta --------- Signed-off-by: Ayush Dattagupta Signed-off-by: Nicole Luo --- .github/workflows/test.yml | 5 +- README.md | 14 +- nemo_curator/datasets/doc_dataset.py | 6 +- nemo_curator/gpu_deduplication/utils.py | 76 ---- nemo_curator/modules/__init__.py | 7 +- nemo_curator/modules/exact_dedup.py | 3 +- nemo_curator/modules/fuzzy_dedup.py | 15 +- nemo_curator/scripts/compute_minhashes.py | 9 +- nemo_curator/scripts/connected_components.py | 7 +- nemo_curator/scripts/find_exact_duplicates.py | 3 +- nemo_curator/scripts/jaccard_compute.py | 8 +- nemo_curator/scripts/jaccard_shuffle.py | 9 +- nemo_curator/scripts/map_buckets.py | 9 +- nemo_curator/scripts/minhash_lsh.py | 3 +- nemo_curator/utils/distributed_utils.py | 46 ++- nemo_curator/utils/gpu_utils.py | 3 + nemo_curator/utils/import_utils.py | 384 ++++++++++++++++++ setup.py | 13 +- tests/test_filters.py | 8 +- tests/test_fuzzy_dedup.py | 6 +- 20 files changed, 493 insertions(+), 141 deletions(-) create mode 100644 nemo_curator/utils/import_utils.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d179a2a57..baa968f47 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,9 +40,8 @@ jobs: # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94 run: | pip install wheel cython - pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com . + pip install --no-cache-dir . 
pip install pytest - name: Run tests - # TODO: Remove env variable when gpu dependencies are optional run: | - RAPIDS_NO_INITIALIZE=1 python -m pytest -v --cpu + python -m pytest -v --cpu diff --git a/README.md b/README.md index eb8c37abe..a17a573eb 100644 --- a/README.md +++ b/README.md @@ -37,12 +37,20 @@ These modules are designed to be flexible and allow for reordering with few exce ## Installation -NeMo Curator currently requires Python 3.10 and a GPU with CUDA 12 or above installed in order to be used. +NeMo Curator currently requires Python 3.10 and the GPU accelerated modules require CUDA 12 or above installed in order to be used. -NeMo Curator can be installed manually by cloning the repository and installing as follows: +NeMo Curator can be installed manually by cloning the repository and installing as follows - + +For CPU only modules: +``` +pip install . ``` -pip install --extra-index-url https://pypi.nvidia.com . + +For CPU + CUDA accelerated modules ``` +pip install --extra-index-url https://pypi.nvidia.com ".[cuda12x]" +``` + ### NeMo Framework Container NeMo Curator is available in the [NeMo Framework Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo). The NeMo Framework Container provides an end-to-end platform for development of custom generative AI models anywhere. The latest release of NeMo Curator comes preinstalled in the container. diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py index af45f290c..37592b188 100644 --- a/nemo_curator/datasets/doc_dataset.py +++ b/nemo_curator/datasets/doc_dataset.py @@ -13,7 +13,6 @@ # limitations under the License. import dask.dataframe as dd -import dask_cudf from nemo_curator.utils.distributed_utils import read_data, write_to_disk from nemo_curator.utils.file_utils import get_all_files_paths_under @@ -182,10 +181,7 @@ def _read_json_or_parquet( ) dfs.append(df) - if backend == "cudf": - raw_data = dask_cudf.concat(dfs, ignore_unknown_divisions=True) - else: - raw_data = dd.concat(dfs, ignore_unknown_divisions=True) + raw_data = dd.concat(dfs, ignore_unknown_divisions=True) elif isinstance(input_files, str): # Single file diff --git a/nemo_curator/gpu_deduplication/utils.py b/nemo_curator/gpu_deduplication/utils.py index ed69477be..f6faefe77 100644 --- a/nemo_curator/gpu_deduplication/utils.py +++ b/nemo_curator/gpu_deduplication/utils.py @@ -13,84 +13,8 @@ # limitations under the License. 
import argparse -import logging -import os -import socket -from contextlib import nullcontext from time import time -import cudf -from dask_cuda import LocalCUDACluster -from distributed import Client, performance_report - - -def create_logger(rank, log_file, name="logger", log_level=logging.INFO): - # Create the logger - logger = logging.getLogger(name) - logger.setLevel(log_level) - - myhost = socket.gethostname() - - extra = {"host": myhost, "rank": rank} - formatter = logging.Formatter( - "%(asctime)s | %(host)s | Rank %(rank)s | %(message)s" - ) - - # File handler for output - file_handler = logging.FileHandler(log_file, mode="a") - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - logger = logging.LoggerAdapter(logger, extra) - - return logger - - -# TODO: Remove below to use nemo_curator.distributed_utils.get_client -def get_client(args) -> Client: - if args.scheduler_address: - if args.scheduler_file: - raise ValueError( - "Only one of scheduler_address or scheduler_file can be provided" - ) - else: - return Client(address=args.scheduler_address, timeout="30s") - elif args.scheduler_file: - return Client(scheduler_file=args.scheduler_file, timeout="30s") - else: - extra_kwargs = ( - { - "enable_tcp_over_ucx": True, - "enable_nvlink": True, - "enable_infiniband": False, - "enable_rdmacm": False, - } - if args.nvlink_only and args.protocol == "ucx" - else {} - ) - - cluster = LocalCUDACluster( - rmm_pool_size=args.rmm_pool_size, - protocol=args.protocol, - rmm_async=True, - **extra_kwargs, - ) - return Client(cluster) - - -def performance_report_if(path=None, report_name="dask-profile.html"): - if path is not None: - return performance_report(os.path.join(path, report_name)) - else: - return nullcontext() - - -# TODO: Remove below to use nemo_curator.distributed_utils._enable_spilling -def enable_spilling(): - """ - Enables spilling to host memory for cudf - """ - cudf.set_option("spill", True) - def get_num_workers(client): """ diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index d7c099803..434ebecf4 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -19,14 +19,19 @@ # See https://github.com/NVIDIA/NeMo-Curator/issues/31 os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" +from nemo_curator.utils.import_utils import gpu_only_import_from + from .add_id import AddId from .exact_dedup import ExactDuplicates from .filter import Filter, Score, ScoreFilter -from .fuzzy_dedup import LSH, MinHash from .meta import Sequential from .modify import Modify from .task import TaskDecontamination +# GPU packages +LSH = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "LSH") +MinHash = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "MinHash") + # Pytorch related imports must come after all imports that require cugraph, # because of context cleanup issues b/w pytorch and cugraph # See this issue: https://github.com/rapidsai/cugraph/issues/2718 diff --git a/nemo_curator/modules/exact_dedup.py b/nemo_curator/modules/exact_dedup.py index 5d960ac6e..2831f516f 100644 --- a/nemo_curator/modules/exact_dedup.py +++ b/nemo_curator/modules/exact_dedup.py @@ -28,7 +28,8 @@ from nemo_curator._compat import DASK_P2P_ERROR from nemo_curator.datasets import DocumentDataset -from nemo_curator.gpu_deduplication.utils import create_logger, performance_report_if +from nemo_curator.log import create_logger +from nemo_curator.utils.distributed_utils import performance_report_if from nemo_curator.utils.gpu_utils 
import is_cudf_type diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index 3b0576058..b51499678 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -22,12 +22,12 @@ from typing import List, Tuple, Union import cudf -import cugraph import cugraph.dask as dcg import cugraph.dask.comms.comms as Comms import cupy as cp import dask_cudf import numpy as np +from cugraph import MultiGraph from dask import dataframe as dd from dask.dataframe.shuffle import shuffle as dd_shuffle from dask.utils import M @@ -39,12 +39,13 @@ filter_text_rows_by_bucket_batch, merge_left_to_shuffled_right, ) -from nemo_curator.gpu_deduplication.utils import create_logger, performance_report_if -from nemo_curator.utils.distributed_utils import get_current_client, get_num_workers -from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import ( - convert_str_id_to_int, - int_ids_to_str, +from nemo_curator.log import create_logger +from nemo_curator.utils.distributed_utils import ( + get_current_client, + get_num_workers, + performance_report_if, ) +from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import int_ids_to_str from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( aggregated_anchor_docs_with_bk_read, get_restart_offsets, @@ -1120,7 +1121,7 @@ def _run_connected_components( df = df[[self.left_id, self.right_id]].astype(np.int64) df = dask_cudf.concat([df, self_edge_df]) - G = cugraph.MultiGraph(directed=False) + G = MultiGraph(directed=False) G.from_dask_cudf_edgelist( df, source=self.left_id, destination=self.right_id, renumber=False ) diff --git a/nemo_curator/scripts/compute_minhashes.py b/nemo_curator/scripts/compute_minhashes.py index c7a7e68b2..044653ceb 100644 --- a/nemo_curator/scripts/compute_minhashes.py +++ b/nemo_curator/scripts/compute_minhashes.py @@ -18,12 +18,13 @@ from nemo_curator import MinHash from nemo_curator.datasets import DocumentDataset from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep -from nemo_curator.gpu_deduplication.utils import ( - create_logger, - parse_nc_args, +from nemo_curator.gpu_deduplication.utils import parse_nc_args +from nemo_curator.log import create_logger +from nemo_curator.utils.distributed_utils import ( + get_client, performance_report_if, + read_data, ) -from nemo_curator.utils.distributed_utils import get_client, read_data from nemo_curator.utils.file_utils import get_all_files_paths_under diff --git a/nemo_curator/scripts/connected_components.py b/nemo_curator/scripts/connected_components.py index 1ab1282af..c04f0349d 100644 --- a/nemo_curator/scripts/connected_components.py +++ b/nemo_curator/scripts/connected_components.py @@ -15,7 +15,7 @@ import os import time -from nemo_curator.gpu_deduplication.utils import enable_spilling, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.modules.fuzzy_dedup import ConnectedComponents from nemo_curator.utils.distributed_utils import get_client @@ -32,9 +32,10 @@ def main(args): st = time.time() output_path = os.path.join(args.output_dir, "connected_components.parquet") args.set_torch_to_use_rmm = False + args.enable_spilling = True + client = get_client(args, cluster_type="gpu") - enable_spilling() - client.run(enable_spilling) + components_stage = ConnectedComponents( cache_dir=args.cache_dir, jaccard_pairs_path=args.jaccard_pairs_path, diff --git a/nemo_curator/scripts/find_exact_duplicates.py b/nemo_curator/scripts/find_exact_duplicates.py index 7da01ea8e..16173861d 
100644 --- a/nemo_curator/scripts/find_exact_duplicates.py +++ b/nemo_curator/scripts/find_exact_duplicates.py @@ -19,7 +19,8 @@ from nemo_curator.datasets import DocumentDataset from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep -from nemo_curator.gpu_deduplication.utils import create_logger, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args +from nemo_curator.log import create_logger from nemo_curator.modules import ExactDuplicates from nemo_curator.utils.distributed_utils import get_client, read_data from nemo_curator.utils.file_utils import get_all_files_paths_under diff --git a/nemo_curator/scripts/jaccard_compute.py b/nemo_curator/scripts/jaccard_compute.py index f59157164..d16e95654 100644 --- a/nemo_curator/scripts/jaccard_compute.py +++ b/nemo_curator/scripts/jaccard_compute.py @@ -15,13 +15,13 @@ import os import time -from nemo_curator.gpu_deduplication.utils import enable_spilling, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity from nemo_curator.utils.distributed_utils import get_client, get_num_workers def main(args): - description = """Computes the Jaccard similarity between document pairs + """Computes the Jaccard similarity between document pairs from partitioned parquet dataset. Result is a parquet dataset consiting of document id pair along with their Jaccard similarity score. """ @@ -30,9 +30,9 @@ def main(args): output_final_results_path = os.path.join( OUTPUT_PATH, "jaccard_similarity_results.parquet" ) + args.enable_spilling = True client = get_client(args, "gpu") - enable_spilling() - client.run(enable_spilling) + print(f"Num Workers = {get_num_workers(client)}", flush=True) print("Connected to dask cluster", flush=True) print("Running jaccard compute script", flush=True) diff --git a/nemo_curator/scripts/jaccard_shuffle.py b/nemo_curator/scripts/jaccard_shuffle.py index dc5d20f9b..c01935a61 100644 --- a/nemo_curator/scripts/jaccard_shuffle.py +++ b/nemo_curator/scripts/jaccard_shuffle.py @@ -15,12 +15,9 @@ import os import time -from nemo_curator.gpu_deduplication.utils import ( - get_client, - get_num_workers, - parse_nc_args, -) +from nemo_curator.gpu_deduplication.utils import get_num_workers, parse_nc_args from nemo_curator.modules.fuzzy_dedup import _Shuffle +from nemo_curator.utils.distributed_utils import get_client from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( get_text_ddf_from_json_path_with_blocksize, ) @@ -38,7 +35,7 @@ def main(args): OUTPUT_PATH = args.output_dir output_shuffled_docs_path = os.path.join(OUTPUT_PATH, "shuffled_docs.parquet") - client = get_client(args) + client = get_client(args, "gpu") client.run(func) print(f"Num Workers = {get_num_workers(client)}", flush=True) print("Connected to dask cluster", flush=True) diff --git a/nemo_curator/scripts/map_buckets.py b/nemo_curator/scripts/map_buckets.py index 522e4f417..9e3f71a51 100644 --- a/nemo_curator/scripts/map_buckets.py +++ b/nemo_curator/scripts/map_buckets.py @@ -15,12 +15,9 @@ import os import time -from nemo_curator.gpu_deduplication.utils import ( - get_client, - get_num_workers, - parse_nc_args, -) +from nemo_curator.gpu_deduplication.utils import get_num_workers, parse_nc_args from nemo_curator.modules.fuzzy_dedup import _MapBuckets +from nemo_curator.utils.distributed_utils import get_client from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( get_bucket_ddf_from_parquet_path, get_text_ddf_from_json_path_with_blocksize, 
@@ -157,7 +154,7 @@ def main(args): output_anchor_docs_with_bk_path = os.path.join( OUTPUT_PATH, "anchor_docs_with_bk.parquet" ) - client = get_client(args) + client = get_client(args, "gpu") print(f"Num Workers = {get_num_workers(client)}", flush=True) print("Connected to dask cluster", flush=True) print("Running jaccard map buckets script", flush=True) diff --git a/nemo_curator/scripts/minhash_lsh.py b/nemo_curator/scripts/minhash_lsh.py index fb2c6a90d..ec206dc10 100644 --- a/nemo_curator/scripts/minhash_lsh.py +++ b/nemo_curator/scripts/minhash_lsh.py @@ -24,7 +24,8 @@ from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( convert_str_id_to_int, ) -from nemo_curator.gpu_deduplication.utils import create_logger, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args +from nemo_curator.log import create_logger from nemo_curator.utils.distributed_utils import get_client diff --git a/nemo_curator/utils/distributed_utils.py b/nemo_curator/utils/distributed_utils.py index 71fa1cdca..2d7dc9213 100644 --- a/nemo_curator/utils/distributed_utils.py +++ b/nemo_curator/utils/distributed_utils.py @@ -11,20 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import os os.environ["RAPIDS_NO_INITIALIZE"] = "1" import warnings +from contextlib import nullcontext from pathlib import Path from typing import Union -import cudf import dask.dataframe as dd -import dask_cudf import pandas as pd -from dask.distributed import Client, LocalCluster, get_worker -from dask_cuda import LocalCUDACluster +from dask.distributed import Client, LocalCluster, get_worker, performance_report + +from nemo_curator.utils.gpu_utils import GPU_INSTALL_STRING, is_cudf_type +from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from + +cudf = gpu_only_import("cudf") +LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster") class DotDict: @@ -48,7 +53,6 @@ def start_dask_gpu_local_cluster(args) -> Client: GPUs present on the machine. """ - # Setting conservative defaults # which should work across most systems nvlink_only = getattr(args, "nvlink_only", False) @@ -166,6 +170,8 @@ def _enable_spilling(): i.e., computing on objects that occupy more memory than is available on the GPU. """ + import cudf + cudf.set_option("spill", True) @@ -265,6 +271,10 @@ def read_data( A Dask-cuDF or a Dask-pandas DataFrame. """ + if backend == "cudf": + # Try using cuDF. If not availible will throw an error. 
+ test_obj = cudf.Series + if file_type == "pickle": df = read_pandas_pickle(input_files[0], add_filename=add_filename) df = dd.from_pandas(df, npartitions=16) @@ -369,10 +379,12 @@ def single_partition_write_with_filename(df, output_file_dir, output_type="jsonl warnings.warn(f"Empty partition found") empty_partition = False - if isinstance(df, pd.DataFrame): - success_ser = pd.Series([empty_partition]) - else: + if is_cudf_type(df): + import cudf + success_ser = cudf.Series([empty_partition]) + else: + success_ser = pd.Series([empty_partition]) if empty_partition: filename = df.filename.iloc[0] @@ -425,10 +437,13 @@ def write_to_disk(df, output_file_dir, write_to_filename=False, output_type="jso ) if write_to_filename: - if isinstance(df, dd.DataFrame): - output_meta = pd.Series([True], dtype="bool") - else: + if is_cudf_type(df): + import cudf + output_meta = cudf.Series([True]) + else: + output_meta = pd.Series([True], dtype="bool") + os.makedirs(output_file_dir, exist_ok=True) output = df.map_partitions( single_partition_write_with_filename, @@ -440,7 +455,7 @@ def write_to_disk(df, output_file_dir, write_to_filename=False, output_type="jso output = output.compute() else: if output_type == "jsonl": - if isinstance(df, dask_cudf.DataFrame): + if is_cudf_type(df): # See open issue here: https://github.com/rapidsai/cudf/issues/15211 # df.to_json(output_file_dir, orient="records", lines=True, engine="cudf", force_ascii=False) df.to_json( @@ -521,3 +536,10 @@ def get_current_client(): return Client.current() except ValueError: return None + + +def performance_report_if(path=None, report_name="dask-profile.html"): + if path is not None: + return performance_report(os.path.join(path, report_name)) + else: + return nullcontext() diff --git a/nemo_curator/utils/gpu_utils.py b/nemo_curator/utils/gpu_utils.py index de1c23dfe..86ba888fc 100644 --- a/nemo_curator/utils/gpu_utils.py +++ b/nemo_curator/utils/gpu_utils.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +GPU_INSTALL_STRING = """Install GPU packages via `pip install --extra-index-url https://pypi.nvidia.com nemo_curator[cuda12x]` +or use `pip install --extra-index-url https://pypi.nvidia.com ".[cuda12x]"` if installing from source""" + def is_cudf_type(obj): """ diff --git a/nemo_curator/utils/import_utils.py b/nemo_curator/utils/import_utils.py new file mode 100644 index 000000000..ea78e4597 --- /dev/null +++ b/nemo_curator/utils/import_utils.py @@ -0,0 +1,384 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This file is adapted from cuML's safe_imports module: +# https://github.com/rapidsai/cuml/blob/e93166ea0dddfa8ef2f68c6335012af4420bc8ac/python/cuml/internals/safe_imports.py + + +import importlib +import logging +import traceback +from contextlib import contextmanager + +from nemo_curator.utils.gpu_utils import GPU_INSTALL_STRING + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(logging.StreamHandler()) + + +class UnavailableError(Exception): + """Error thrown if a symbol is unavailable due to an issue importing it""" + + +@contextmanager +def null_decorator(*args, **kwargs): + if len(kwargs) == 0 and len(args) == 1 and callable(args[0]): + return args[0] + else: + + def inner(func): + return func + + return inner + + +class UnavailableMeta(type): + """A metaclass for generating placeholder objects for unavailable symbols + + This metaclass allows errors to be deferred from import time to the time + that a symbol is actually used in order to streamline the usage of optional + dependencies. This is particularly useful for attempted imports of GPU-only + modules which will only be invoked if GPU-only functionality is + specifically used. + + If an attempt to import a symbol fails, this metaclass is used to generate + a class which stands in for that symbol. Any attempt to call the symbol + (instantiate the class) or access its attributes will throw an + UnavailableError exception. Furthermore, this class can be used in + e.g. isinstance checks, since it will (correctly) fail to match any + instance it is compared against. + + In addition to calls and attribute access, a number of dunder methods are + implemented so that other common usages of imported symbols (e.g. + arithmetic) throw an UnavailableError, but this is not guaranteed for + all possible uses. In such cases, other exception types (typically + TypeErrors) will be thrown instead. 
+ """ + + def __new__(meta, name, bases, dct): + if dct.get("_msg", None) is None: + dct["_msg"] = f"{name} could not be imported" + name = f"MISSING{name}" + return super(UnavailableMeta, meta).__new__(meta, name, bases, dct) + + def __call__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __getattr__(cls, name): + raise UnavailableError(cls._msg) + + def __eq__(cls, other): + raise UnavailableError(cls._msg) + + def __lt__(cls, other): + raise UnavailableError(cls._msg) + + def __gt__(cls, other): + raise UnavailableError(cls._msg) + + def __ne__(cls, other): + raise UnavailableError(cls._msg) + + def __abs__(cls, other): + raise UnavailableError(cls._msg) + + def __add__(cls, other): + raise UnavailableError(cls._msg) + + def __radd__(cls, other): + raise UnavailableError(cls._msg) + + def __iadd__(cls, other): + raise UnavailableError(cls._msg) + + def __floordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __rfloordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __ifloordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __lshift__(cls, other): + raise UnavailableError(cls._msg) + + def __rlshift__(cls, other): + raise UnavailableError(cls._msg) + + def __mul__(cls, other): + raise UnavailableError(cls._msg) + + def __rmul__(cls, other): + raise UnavailableError(cls._msg) + + def __imul__(cls, other): + raise UnavailableError(cls._msg) + + def __ilshift__(cls, other): + raise UnavailableError(cls._msg) + + def __pow__(cls, other): + raise UnavailableError(cls._msg) + + def __rpow__(cls, other): + raise UnavailableError(cls._msg) + + def __ipow__(cls, other): + raise UnavailableError(cls._msg) + + def __rshift__(cls, other): + raise UnavailableError(cls._msg) + + def __rrshift__(cls, other): + raise UnavailableError(cls._msg) + + def __irshift__(cls, other): + raise UnavailableError(cls._msg) + + def __sub__(cls, other): + raise UnavailableError(cls._msg) + + def __rsub__(cls, other): + raise UnavailableError(cls._msg) + + def __isub__(cls, other): + raise UnavailableError(cls._msg) + + def __truediv__(cls, other): + raise UnavailableError(cls._msg) + + def __rtruediv__(cls, other): + raise UnavailableError(cls._msg) + + def __itruediv__(cls, other): + raise UnavailableError(cls._msg) + + def __divmod__(cls, other): + raise UnavailableError(cls._msg) + + def __rdivmod__(cls, other): + raise UnavailableError(cls._msg) + + def __neg__(cls): + raise UnavailableError(cls._msg) + + def __invert__(cls): + raise UnavailableError(cls._msg) + + def __hash__(cls): + raise UnavailableError(cls._msg) + + def __index__(cls): + raise UnavailableError(cls._msg) + + def __iter__(cls): + raise UnavailableError(cls._msg) + + def __delitem__(cls, name): + raise UnavailableError(cls._msg) + + def __setitem__(cls, name, value): + raise UnavailableError(cls._msg) + + def __enter__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __get__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __delete__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __len__(cls): + raise UnavailableError(cls._msg) + + +def is_unavailable(obj): + """Helper to check if given symbol is actually a placeholder""" + return type(obj) is UnavailableMeta + + +class UnavailableNullContext: + """A placeholder class for unavailable context managers + + This context manager will return a value which will throw an + UnavailableError if used in any way, but the context manager itself can be + safely invoked. 
+ """ + + def __init__(self, *args, **kwargs): + pass + + def __enter__(self): + return UnavailableMeta( + "MissingContextValue", + (), + {"_msg": "Attempted to make use of placeholder context return value."}, + ) + + def __exit__(self, *args, **kwargs): + pass + + +def safe_import(module, *, msg=None, alt=None): + """A function used to import modules that may not be available + + This function will attempt to import a module with the given name, but it + will not throw an ModuleNotFoundError if the module is not found. Instead, it will + return a placeholder object which will raise an exception only if used. + + Parameters + ---------- + module: str + The name of the module to import. + msg: str or None + An optional error message to be displayed if this module is used + after a failed import. + alt: object + An optional module to be used in place of the given module if it + fails to import + + Returns + ------- + object + The imported module, the given alternate, or a class derived from + UnavailableMeta. + """ + try: + return importlib.import_module(module) + except ModuleNotFoundError: + exception_text = traceback.format_exc() + logger.debug(f"Import of {module} failed with: {exception_text}") + except Exception: + exception_text = traceback.format_exc() + raise + if msg is None: + msg = f"{module} could not be imported" + if alt is None: + return UnavailableMeta(module.rsplit(".")[-1], (), {"_msg": msg}) + else: + return alt + + +def safe_import_from(module, symbol, *, msg=None, alt=None): + """A function used to import symbols from modules that may not be available + + This function will attempt to import a symbol with the given name from + the given module, but it will not throw an ImportError if the symbol is not + found. Instead, it will return a placeholder object which will raise an + exception only if used. + + Parameters + ---------- + module: str + The name of the module in which the symbol is defined. + symbol: str + The name of the symbol to import. + msg: str or None + An optional error message to be displayed if this symbol is used + after a failed import. + alt: object + An optional object to be used in place of the given symbol if it fails + to import + + Returns + ------- + object + The imported symbol, the given alternate, or a class derived from + UnavailableMeta. + """ + try: + imported_module = importlib.import_module(module) + return getattr(imported_module, symbol) + except ModuleNotFoundError: + exception_text = traceback.format_exc() + logger.debug(f"Import of {module} failed with: {exception_text}") + except AttributeError: + exception_text = traceback.format_exc() + logger.info(f"Import of {symbol} from {module} failed with: {exception_text}") + except Exception: + exception_text = traceback.format_exc() + raise + if msg is None: + msg = f"{module}.{symbol} could not be imported" + if alt is None: + return UnavailableMeta(symbol, (), {"_msg": msg}) + else: + return alt + + +def gpu_only_import(module, *, alt=None): + """A function used to import modules required only in GPU installs + + This function will attempt to import a module with the given name. + This function will attempt to import a symbol with the given name from + the given module, but it will not throw an ImportError if the symbol is not + found. Instead, it will return a placeholder object which will raise an + exception only if used with instructions on installing a GPU build. + + Parameters + ---------- + module: str + The name of the module to import. 
+ alt: object + An optional module to be used in place of the given module if it + fails to import in a non-GPU-enabled install + + Returns + ------- + object + The imported module, the given alternate, or a class derived from + UnavailableMeta. + """ + + return safe_import( + module, + msg=f"{module} is not installed in non GPU-enabled installations. {GPU_INSTALL_STRING}", + alt=alt, + ) + + +def gpu_only_import_from(module, symbol, *, alt=None): + """A function used to import symbols required only in GPU installs + + This function will attempt to import a module with the given name. + This function will attempt to import a symbol with the given name from + the given module, but it will not throw an ImportError if the symbol is not + found. Instead, it will return a placeholder object which will raise an + exception only if used with instructions on installing a GPU build. + + Parameters + ---------- + module: str + The name of the module to import. + symbol: str + The name of the symbol to import. + alt: object + An optional object to be used in place of the given symbol if it fails + to import in a non-GPU-enabled install + + Returns + ------- + object + The imported symbol, the given alternate, or a class derived from + UnavailableMeta. + """ + return safe_import_from( + module, + symbol, + msg=f"{module}.{symbol} is not installed in non GPU-enabled installations. {GPU_INSTALL_STRING}", + alt=alt, + ) diff --git a/setup.py b/setup.py index b47ef5c95..8fc60e926 100644 --- a/setup.py +++ b/setup.py @@ -55,10 +55,6 @@ "comment_parser", "beautifulsoup4", "mwparserfromhell @ git+https://github.com/earwig/mwparserfromhell.git@0f89f44", - "cudf-cu12>=24.2", - "dask-cudf-cu12>=24.2", - "cugraph-cu12>=24.2", - "dask-cuda>=24.2", "spacy>=3.6.0, <4.0.0", "presidio-analyzer==2.2.351", "presidio-anonymizer==2.2.351", @@ -68,6 +64,15 @@ # due to this: https://github.com/miso-belica/jusText/issues/47 "lxml[html_clean]", ], + extras_require={ + "cuda12x": [ + "cudf-cu12>=24.2", + "dask-cudf-cu12>=24.2", + "cugraph-cu12>=24.2", + "dask-cuda>=24.2", + "spacy[cuda12x]>=3.6.0, <4.0.0", + ] + }, entry_points={ "console_scripts": [ "get_common_crawl_urls=nemo_curator.scripts.get_common_crawl_urls:console_script", diff --git a/tests/test_filters.py b/tests/test_filters.py index 11bf57388..4ab11c21a 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -149,7 +149,9 @@ def test_retain_score_filter(self, letter_count_data): filtered_data = filter_step(letter_count_data) expected_indices = [2, 3] - expected_data = DocumentDataset(letter_count_data.df.loc[expected_indices]) + # Compute before loc due to https://github.com/dask/dask-expr/issues/1036 + expected_data = letter_count_data.df.compute().loc[expected_indices] + expected_data = DocumentDataset(dd.from_pandas(expected_data, 2)) expected_data.df[score_field] = pd.Series([5, 7], index=expected_data.df.index) assert all_equal( expected_data, filtered_data @@ -168,7 +170,9 @@ def test_filter(self, letter_count_data): filtered_data = filter_step(scored_data) expected_indices = [2, 3] - expected_data = letter_count_data.df.loc[expected_indices] + # Compute before loc due to https://github.com/dask/dask-expr/issues/1036 + expected_data = letter_count_data.df.compute().loc[expected_indices] + expected_data = dd.from_pandas(expected_data, 2) expected_data[score_field] = pd.Series([5, 7], index=expected_data.index) expected_data = DocumentDataset(expected_data) assert all_equal( diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py index 
3c6a32754..a1acb901f 100644 --- a/tests/test_fuzzy_dedup.py +++ b/tests/test_fuzzy_dedup.py @@ -16,14 +16,16 @@ from itertools import combinations from typing import Iterable -import cudf -import dask_cudf import numpy as np import pytest from dask.dataframe.utils import assert_eq from nemo_curator.datasets import DocumentDataset from nemo_curator.modules import LSH, MinHash +from nemo_curator.utils.import_utils import gpu_only_import + +cudf = gpu_only_import("cudf") +dask_cudf = gpu_only_import("dask_cudf") @pytest.fixture From 6d992924a835709b5cbbbd75a47b5bf1eb18d953 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Tue, 23 Apr 2024 14:31:24 -0700 Subject: [PATCH 06/34] Fix failing GPU tests with latest pandas bump (#41) Signed-off-by: Ayush Dattagupta Signed-off-by: Nicole Luo --- tests/test_fuzzy_dedup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py index a1acb901f..f0ded450e 100644 --- a/tests/test_fuzzy_dedup.py +++ b/tests/test_fuzzy_dedup.py @@ -114,7 +114,7 @@ def test_minhash_approximation( tuple(zip(minhash_signatures, strings)) ): true_jaccard = jaccard_index(str1, str2, char_ngrams) - minhash_approximation = minhash_overlap(sig1, sig2) + minhash_approximation = minhash_overlap(np.array(sig1), np.array(sig2)) assert abs(true_jaccard - minhash_approximation) < THRESHOLD def test_minhash_cache(self, fuzzy_dedup_data, tmpdir): @@ -172,7 +172,9 @@ def test_multiple_id_cols(self, tmpdir): ) buckets = lsh(self.dataset) buckets_df = buckets.df.compute().to_pandas() - buckets_df["new_id"] = list(zip(buckets_df.dataset_id, buckets_df.id)) + buckets_df["new_id"] = list( + map(list, zip(buckets_df.dataset_id, buckets_df.id)) + ) docs_list = buckets_df.groupby("_bucket_id").new_id.apply(list) expected_df = cudf.Series( [[(1, 1), (1, 2)], [(1, 2), (2, 3)], [(3, 4), (4, 5)]], name="new_id" From dff70cc7c890f886b3ac9ae3a4402611af3abddc Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 23 Apr 2024 16:32:22 -0700 Subject: [PATCH 07/34] Adds Nemo Curator K8s example (#40) * [K8s]: Adds a helper script to create a dask cluster on k8s and includes instructions for how to a Curator workload on k8s Signed-off-by: Terry Kong * black formatting Signed-off-by: Terry Kong * big_english -> my_dataset Signed-off-by: Terry Kong * 24.01 -> 24.03 default container Signed-off-by: Terry Kong * Add help kwarg to all flags Signed-off-by: Terry Kong * Clarify why venv is needed Signed-off-by: Terry Kong * fix precommit failures Signed-off-by: Terry Kong --------- Signed-off-by: Terry Kong Signed-off-by: Nicole Luo --- docs/user-guide/CPUvsGPU.rst | 8 + docs/user-guide/KubernetesCurator.rst | 386 ++++++++++++++++++++++++++ docs/user-guide/index.rst | 4 + examples/k8s/create_dask_cluster.py | 134 +++++++++ 4 files changed, 532 insertions(+) create mode 100644 docs/user-guide/KubernetesCurator.rst create mode 100644 examples/k8s/create_dask_cluster.py diff --git a/docs/user-guide/CPUvsGPU.rst b/docs/user-guide/CPUvsGPU.rst index 5fd901d19..fa5ea6aa8 100644 --- a/docs/user-guide/CPUvsGPU.rst +++ b/docs/user-guide/CPUvsGPU.rst @@ -96,3 +96,11 @@ Every SLURM cluster is different, so make sure you understand how your SLURM clu Our Python examples are designed to work such that they can be run locally on their own, or easily substituted into the ``start-slurm.sh`` to run on multiple nodes. You can adapt your scripts easily too by simply following the pattern of adding ``get_client`` with ``add_distributed_args``. 
+ +----------------------------------------- +Dask with K8s +----------------------------------------- + +We also provide an example guide for how to get started with NeMo Curator on a Kubernetes cluster. + +Please visit :ref:`curator_kubernetes` for more information. diff --git a/docs/user-guide/KubernetesCurator.rst b/docs/user-guide/KubernetesCurator.rst new file mode 100644 index 000000000..c7f727df0 --- /dev/null +++ b/docs/user-guide/KubernetesCurator.rst @@ -0,0 +1,386 @@ +.. _curator_kubernetes: + +====================================== +Running NeMo Curator on Kubernetes +====================================== +The following example demonstrates how to run the NeMo Curator with NVIDIA GPUs on a Kubernetes cluster, +with PersistentVolumeClaims as the storage option. + +.. note:: + This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use. + +Prerequisuites +-------------- +* Kubernetes cluster + * `GPU operator `__ + * `Dask Operator `__ +* `kubectl `__: the Kubernetes Cluster CLI + * Please reach out to your Kubernetes cluster admin for how to setup your ``kubectl`` KUBECONFIG +* `ReadWriteMany `__ `StorageClass `__ (setup by Kubernetes cluster admin) + +Storage +------- +To run NeMo Curator, we need to setup storage to upload and store the input +files, as well as any processed outputs. + +Here is an example of how to create a dynamic PV from a StorageClass setup +by your cluster admin. Replace ``STORAGE_CLASS=<...>`` with the name of +your StorageClass. + +This example requests ``150Gi`` of space. Adjust that number for your +workloads and be aware that not all storage provisioners support volume +resizing. + +.. code-block:: bash + + STORAGE_CLASS=<...> + PVC_NAME=nemo-workspace + + kubectl apply -f - <`` with your NGC secret +key. Note that if you have any special characters in your key you might need to wrap +the key in single quotes (``'``) so it can be parsed correctly by k8s):: + + kubectl create secret docker-registry ngc-registry --docker-server=nvcr.io --docker-username=\$oauthtoken --docker-password= + +Setup Python Environment +------------------------ + +The environment to run the provided scripts in this example does not need the full +``nemo_curator`` package, so you can create a virtual environment with just the +required packages as follows: + +.. code-block:: bash + + python3 -m venv venv + source venv/bin/activate + + pip install 'dask_kubernetes>=2024.4.1' + +Upload Data to PVC +------------------ + +To copy into the ``nemo-workspace`` PVC, we will do so with ``kubectl exec``. You may also +use ``kubectl cp``, but ``exec`` has fewer surprises regarding compressed files: + +.. code-block:: bash + + # Replace <...> with a path on your local machine + LOCAL_WORKSPACE=<...> + + # This copies $LOCAL_WORKSPACE/my_dataset to /my_dataset within the PVC. + # Change foobar to the directory or file you wish to upload. + ( cd $LOCAL_WORKSPACE; tar cf - my_dataset | kubectl exec -i nemo-workspace-busybox -- tar xf - -C /nemo-workspace ) + +.. note:: + See :ref:`data-curator-download` for an example of how to download local data that can be uploaded to the PVC + with the above instruction. + +Create a Dask Cluster +--------------------- + +Use the ``create_dask_cluster.py`` to create a CPU or GPU dask cluster. + +.. note:: + If you are creating another Dask cluster with the same ``--name ``, first delete it via:: + + kubectl delete daskcluster + +.. 
code-block:: bash + + # Creates a CPU Dask cluster with 1 worker + python create_dask_cluster.py \ + --name rapids-dask \ + --n_workers 1 \ + --image nvcr.io/nvidian/bignlp-train:nemofw-nightly \ + --image_pull_secret ngc-registry \ + --pvcs nemo-workspace:/nemo-workspace + + #╭───────────────────── Creating KubeCluster 'rapids-dask' ─────────────────────╮ + #│ │ + #│ DaskCluster Running │ + #│ Scheduler Pod Running │ + #│ Scheduler Service Created │ + #│ Default Worker Group Created │ + #│ │ + #│ ⠧ Getting dashboard URL │ + #╰──────────────────────────────────────────────────────────────────────────────╯ + #cluster = KubeCluster(rapids-dask, 'tcp://localhost:61757', workers=2, threads=510, memory=3.94 TiB) + + # Creates a GPU Dask cluster with 2 workers with 1 GPU each + python create_dask_cluster.py \ + --name rapids-dask \ + --n_workers 2 \ + --n_gpus_per_worker 1 \ + --image nvcr.io/nvidian/bignlp-train:nemofw-nightly \ + --image_pull_secret ngc-registry \ + --pvcs nemo-workspace:/nemo-workspace + +After creating a cluster, you should be able to proceed after confirming the scheduler and the workers are all ``Running``: + +.. code-block:: bash + + # Set DASK_CLUSTER_NAME to the value of --name + DASK_CLUSTER_NAME=rapids-dask + kubectl get pods -l "dask.org/cluster-name=$DASK_CLUSTER_NAME" + + # NAME READY STATUS RESTARTS AGE + # rapids-dask-default-worker-587238cf2c-7d685f4d75-k6rnq 1/1 Running 0 57m + # rapids-dask-default-worker-f8ff963886-5577fff76b-qmvcd 1/1 Running 3 (52m ago) 57m + # rapids-dask-scheduler-654799869d-9bw4z 1/1 Running 0 57m + +(Opt #1) Running Existing Module +-------------------------------- + +Here is an example of running the existing `gpu_exact_dedup` Curator module. The arguments and script name +will need to be changed according to the module you wish to run: + +.. code-block:: bash + + # Set DASK_CLUSTER_NAME to the value of --name + DASK_CLUSTER_NAME=rapids-dask + SCHEDULER_POD=$(kubectl get pods -l "dask.org/cluster-name=$DASK_CLUSTER_NAME,dask.org/component=scheduler" -o name) + # Starts an interactive shell session in the scheduler pod + kubectl exec -it $SCHEDULER_POD -- bash + + ######################## + # Inside SCHEDULER_POD # + ######################## + # Run the following inside the interactive shell to launch script in the background and + # tee the logs to the /nemo-workspace PVC that was mounted in for persistence. + # The command line flags will need to be replaced with whatever the module script accepts. + # Recall that the PVC is mounted at /nemo-workspace, so any outputs should be written + # to somewhere under /nemo-workspace. + + mkdir -p /nemo-workspace/curator/{output,log,profile} + # Write logs to script.log and to a log file with a date suffix + LOGS="/nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.$(date +%y_%m_%d-%H-%M-%S)" + ( + echo "Writing to: $LOGS" + gpu_exact_dedup \ + --input-data-dirs /nemo-workspace/my_dataset \ + --output-dir /nemo-workspace/curator/output \ + --hash-method md5 \ + --log-dir /nemo-workspace/curator/log \ + --num-files -1 \ + --files-per-partition 1 \ + --profile-path /nemo-workspace/curator/profile \ + --log-frequency 250 \ + --scheduler-address localhost:8786 \ + 2>&1 + echo "Finished!" + ) | tee $LOGS & + + # At this point, feel free to disconnect the shell via Ctrl+D or simply + exit + +At this point you can tail the logs and look for ``Finished!`` in ``/nemo-workspace/curator/script.log``: + +.. 
code-block:: bash + + # Command will follow the logs of the running module (Press ctrl+C to close) + kubectl exec -it $SCHEDULER_POD -- tail -f /nemo-workspace/curator/script.log + + # Writing to: /nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.24_03_27-15-52-31 + # Computing hashes for /nemo-workspace/my_dataset + # adlr_id _hashes + # 0 cc-2023-14-0397113620 91b77eae49c10a65d485ac8ca18d6c43 + # 1 cc-2023-14-0397113621 a266f0794cc8ffbd431823e6930e4f80 + # 2 cc-2023-14-0397113622 baee533e2eddae764de2cd6faaa1286c + # 3 cc-2023-14-0397113623 87dd52a468448b99078f97e76f528eab + # 4 cc-2023-14-0397113624 a17664daf4f24be58e0e3a3dcf81124a + # Finished! + + +(Opt #2) Running Custom Module +------------------------------ + +In this example, we'll demonstrate how to run a NeMo Curator module that you have defined locally. + +Since your curator module may depend on version of the Curator that differs from what is in the +container, we will need to build a custom image with your code installed: + +.. code-block:: bash + + # Clone your repo. This example uses the official repo + git clone https://github.com/NVIDIA/NeMo-Curator.git NeMo-Curator-dev + + # Checkout specific ref. This example uses a commit in the main branch + git -C NeMo-Curator-dev checkout fc167a6edffd38a55c333742972a5a25b901cb26 + + # Example NeMo base image. Change it according to your requirements + BASE_IMAGE=nvcr.io/nvidian/bignlp-train:nemofw-nightly + docker build -t nemo-curator-custom ./NeMo-Curator-dev -f - </: accordingly + docker tag nemo-curator-custom /: + docker push /: + +.. note:: + When using a custom image, you will likely need to create a different secret unless you pushed to a public registry: + + .. code-block:: bash + + # Fill in // + kubectl create secret docker-registry my-private-registry --docker-server= --docker-username= --docker-password= + + And with this new secret, you create your new dask cluster: + + .. code-block:: bash + + # Fill in // + python create_dask_cluster.py \ + --name rapids-dask \ + --n_workers 2 \ + --n_gpus_per_worker 1 \ + --image /: \ + --image_pull_secret my-private-registry \ + --pvcs nemo-workspace:/nemo-workspace + +After the Dask cluster is deployed, you can proceed to run your module. In this example we'll use +the ``NeMo-Curator/nemo_curator/scripts/find_exact_duplicates.py`` module, but you can find other templates +in `NeMo-Curator/examples `__: + +.. code-block:: bash + + # Set DASK_CLUSTER_NAME to the value of --name + DASK_CLUSTER_NAME=rapids-dask + SCHEDULER_POD=$(kubectl get pods -l "dask.org/cluster-name=$DASK_CLUSTER_NAME,dask.org/component=scheduler" -o name) + # Starts an interactive shell session in the scheduler pod + kubectl exec -it $SCHEDULER_POD -- bash + + ######################## + # Inside SCHEDULER_POD # + ######################## + # Run the following inside the interactive shell to launch script in the background and + # tee the logs to the /nemo-workspace PVC that was mounted in for persistence. + # The command line flags will need to be replaced with whatever the module script accepts. + # Recall that the PVC is mounted at /nemo-workspace, so any outputs should be written + # to somewhere under /nemo-workspace. 
+ + mkdir -p /nemo-workspace/curator/{output,log,profile} + # Append logs to script.log and write to a log file with a date suffix + LOGS="/nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.$(date +%y_%m_%d-%H-%M-%S)" + ( + echo "Writing to: $LOGS" + # Recall that /NeMo-Curator-dev was copied and installed in the Dockerfile above + python3 -u /NeMo-Curator-dev/nemo_curator/scripts/find_exact_duplicates.py \ + --input-data-dirs /nemo-workspace/my_dataset \ + --output-dir /nemo-workspace/curator/output \ + --hash-method md5 \ + --log-dir /nemo-workspace/curator/log \ + --files-per-partition 1 \ + --profile-path /nemo-workspace/curator/profile \ + --log-frequency 250 \ + --scheduler-address localhost:8786 \ + 2>&1 + echo "Finished!" + ) | tee $LOGS & + + # At this point, feel free to disconnect the shell via Ctrl+D or simply + exit + +At this point you can tail the logs and look for ``Finished!`` in ``/nemo-workspace/curator/script.log``: + +.. code-block:: bash + + # Command will follow the logs of the running module (Press ctrl+C to close) + kubectl exec -it $SCHEDULER_POD -- tail -f /nemo-workspace/curator/script.log + + # Writing to: /nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.24_03_27-20-52-07 + # Reading 2 files + # /NeMo-Curator-dev/nemo_curator/modules/exact_dedup.py:157: UserWarning: Output path f/nemo-workspace/curator/output/_exact_duplicates.parquet already exists and will be overwritten + # warnings.warn( + # Finished! + +Deleting Cluster +---------------- +After you have finished using the created dask cluster, you can delete it to release the resources: + +.. code-block:: bash + + # Where is the flag passed to create_dask_cluster.py. Example: `--name ` + kubectl delete daskcluster + +Download Data from PVC +---------------------- + +To download data from your PVC, you can use the ``nemo-workspace-busybox`` Pod created earlier: + +.. code-block:: bash + + # Replace <...> with a path on your local machine + LOCAL_WORKSPACE=<...> + + # Tar will fail if LOCAL_WORKSPACE doesn't exist + mkdir -p $LOCAL_WORKSPACE + + # Copy file in PVC at /nemo-workspace/foobar.txt to local file-system at $LOCAL_WORKSPACE/nemo-workspace/foobar.txt + kubectl exec nemo-workspace-busybox -- tar cf - /nemo-workspace/foobar.txt | tar xf - -C $LOCAL_WORKSPACE + + # Copy directory in PVC /nemo-workspace/fizzbuzz to local file-system at $LOCAL_WORKSPACE/fizzbuzz + kubectl exec nemo-workspace-busybox -- tar cf - /nemo-workspace/fizzbuzz | tar xf - -C $LOCAL_WORKSPACE diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index 278e47ab3..7ba84c03e 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -27,6 +27,9 @@ :ref:`Personally Identifiable Information Identification and Removal ` The purpose of the personally identifiable information (PII) redaction tool is to help scrub sensitive data out of training datasets +:ref:`curator-kubernetes` + Demonstration of how to run the NeMo Curator on a Dask Cluster deployed on top of Kubernetes + .. 
toctree:: :maxdepth: 4 :titlesonly: @@ -41,3 +44,4 @@ TaskDecontamination.rst PersonalIdentifiableInformationIdentificationAndRemoval.rst DistributedDataClassification.rst + KubernetesCurator.rst diff --git a/examples/k8s/create_dask_cluster.py b/examples/k8s/create_dask_cluster.py new file mode 100644 index 000000000..28be575eb --- /dev/null +++ b/examples/k8s/create_dask_cluster.py @@ -0,0 +1,134 @@ +import argparse + +from dask_kubernetes.operator.kubecluster import KubeCluster, make_cluster_spec + + +def create_cluster( + name: str, + n_workers: int, + n_gpus_per_worker: int, + n_cpus_per_worker: int, + image: str, + image_pull_secret: str, + pvcs: dict[str, str], +): + dask_worker_command = "dask-worker" + if n_gpus_per_worker and n_gpus_per_worker > 0: + dask_worker_command = "dask-cuda-worker" + + custom_cluster_spec = make_cluster_spec( + name=name, + worker_command=dask_worker_command, + n_workers=n_workers, + image=image, + ) + scheduler_spec = custom_cluster_spec["spec"]["scheduler"]["spec"] + worker_spec = custom_cluster_spec["spec"]["worker"]["spec"] + if image_pull_secret: + scheduler_spec["imagePullSecrets"] = [{"name": image_pull_secret}] + worker_spec["imagePullSecrets"] = [{"name": image_pull_secret}] + + obj_vols = [] + obj_vol_mounts = [] + for pvc_name, mount_path in pvcs.items(): + obj_vols.append( + { + "name": pvc_name, + "persistentVolumeClaim": { + "claimName": pvc_name, + }, + } + ) + obj_vol_mounts.append( + { + "name": pvc_name, + "mountPath": mount_path, + } + ) + + scheduler_spec["volumes"] = obj_vols + for ctr in scheduler_spec["containers"]: + ctr["volumeMounts"] = obj_vol_mounts + + worker_spec["volumes"] = obj_vols + for ctr in worker_spec["containers"]: + ctr["volumeMounts"] = obj_vol_mounts + # Resources are added to only the worker, since the scheduler doesn't need GPUs + if n_gpus_per_worker or n_cpus_per_worker: + if not ctr["resources"]: + ctr["resources"] = {"limits": {}} + if n_gpus_per_worker: + ctr["resources"]["limits"]["nvidia.com/gpu"] = str(n_gpus_per_worker) + if n_cpus_per_worker: + ctr["resources"]["limits"]["cpu"] = str(n_cpus_per_worker) + + cluster = KubeCluster( + custom_cluster_spec=custom_cluster_spec, shutdown_on_close=False + ) + print(f"{cluster = }") + + +if __name__ == "__main__": + + def parse_pvcs(specs: str) -> dict[str, str]: + name_to_path = {} + for pvc in specs.split(","): + # Can be empty + if not pvc: + continue + name, _, path = pvc.partition(":") + name_to_path[name] = path + return name_to_path + + parser = argparse.ArgumentParser() + parser.add_argument( + "-n", + "--name", + type=str, + default="rapids-dask", + help="The name of the DaskCluster which you would be able to inspect via `kubectl describe daskcluster `.", + ) + parser.add_argument( + "-w", "--n_workers", type=int, default=2, help="Number of workers" + ) + parser.add_argument( + "-g", + "--n_gpus_per_worker", + type=int, + default=None, + help="Number of GPUs per worker. If not specified, the Dask Cluster defaults to a CPU cluster.", + ) + parser.add_argument( + "-c", + "--n_cpus_per_worker", + type=int, + default=None, + help="Number of CPUs per worker. 
Provide this flag if you want to limit your CPU resources and K8s will throttle the workers to make sure this limit is satisfied.", + ) + parser.add_argument( + "-i", + "--image", + type=str, + default="nvcr.io/nvidia/nemo:24.03.framework", + help="The image used for the Dask Cluster scheduler and workers.", + ) + parser.add_argument( + "-s", + "--image_pull_secret", + type=str, + default=None, + help="If --image is from a private registry, specify the appropriate pull secret you created to allow these to be pulled.", + ) + parser.add_argument( + "-p", + "--pvcs", + type=parse_pvcs, + default="", + help="Comma sep PVC specificiation of $pvc_name_1:$mount_path_1,$pvc_name_2:$mount_path_2. Example: foo:/foo,bar:/bar mounts pvcs named foo and bar to /foo and /bar respectively.", + ) + + args = parser.parse_args() + + create_cluster( + **vars(args), + ) From f2b3904e4b0c275ecfe4cd4e50f628d3d2126133 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Tue, 30 Apr 2024 08:29:19 -0700 Subject: [PATCH 08/34] Move common dedup utils and remove unused code (#42) * Refactor common utils and remove unused code Signed-off-by: Ayush Dattagupta * More cleanup Signed-off-by: Ayush Dattagupta * More updates/shuffling Signed-off-by: Ayush Dattagupta * Move gpu_dedup scripts into subfolder Signed-off-by: Ayush Dattagupta * Remove gpu_deduplication subfolder Signed-off-by: Ayush Dattagupta * Add readme to fuzzy dedup scripts section Signed-off-by: Ayush Dattagupta * Fix typo and relative links Signed-off-by: Ayush Dattagupta * Remove legacy script entrypoints Signed-off-by: Ayush Dattagupta * Remove legacy scripts and add init file Signed-off-by: Ayush Dattagupta * Update GpuDeduplication.rst Signed-off-by: Ayush Dattagupta --------- Signed-off-by: Ayush Dattagupta Signed-off-by: Nicole Luo --- docs/user-guide/GpuDeduplication.rst | 120 +++++- examples/gpu_deduplication_example/README.md | 3 + nemo_curator/gpu_deduplication/__init__.py | 13 - .../gpu_deduplication/connected_component.py | 290 ------------- nemo_curator/gpu_deduplication/ioutils.py | 116 ----- .../gpu_deduplication/jaccard_compute.py | 154 ------- .../gpu_deduplication/jaccard_map_buckets.py | 197 --------- .../gpu_deduplication/jaccard_shuffle.py | 399 ------------------ .../jaccard_utils/__init__.py | 13 - .../jaccard_utils/batch_shuffle_utils.py | 130 ------ .../jaccard_utils/doc_id_mapping.py | 60 --- .../jaccard_utils/get_anchor_utils.py | 55 --- .../jaccard_utils/get_output_map_utils.py | 149 ------- .../jaccard_utils/io_utils.py | 185 -------- .../jaccard_utils/jaccard_similarity_utils.py | 103 ----- .../gpu_deduplication/prepare_fuzzy_ids.py | 95 ----- nemo_curator/gpu_deduplication/utils.py | 155 ------- .../verify_all_pairs_jaccard.py | 172 -------- .../write_deduped_result_with_text.py | 83 ---- nemo_curator/modules/fuzzy_dedup.py | 10 +- nemo_curator/scripts/find_exact_duplicates.py | 6 +- .../scripts/fuzzy_deduplication/README.md | 99 +++++ .../scripts/fuzzy_deduplication/__init__.py | 0 .../compute_minhashes.py | 12 +- .../connected_components.py | 4 +- .../jaccard_compute.py | 4 +- .../jaccard_shuffle.py | 6 +- .../{ => fuzzy_deduplication}/map_buckets.py | 6 +- .../{ => fuzzy_deduplication}/minhash_lsh.py | 14 +- .../utils/fuzzy_dedup_utils/io_utils.py | 7 + .../fuzzy_dedup_utils}/merge_utils.py | 4 +- nemo_curator/utils/script_utils.py | 64 ++- setup.py | 17 +- 33 files changed, 302 insertions(+), 2443 deletions(-) delete mode 100644 nemo_curator/gpu_deduplication/__init__.py delete mode 100644 
nemo_curator/gpu_deduplication/connected_component.py delete mode 100644 nemo_curator/gpu_deduplication/ioutils.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_compute.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_map_buckets.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_shuffle.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/__init__.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/batch_shuffle_utils.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/doc_id_mapping.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/get_anchor_utils.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/get_output_map_utils.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/io_utils.py delete mode 100644 nemo_curator/gpu_deduplication/jaccard_utils/jaccard_similarity_utils.py delete mode 100644 nemo_curator/gpu_deduplication/prepare_fuzzy_ids.py delete mode 100644 nemo_curator/gpu_deduplication/utils.py delete mode 100644 nemo_curator/gpu_deduplication/verify_all_pairs_jaccard.py delete mode 100644 nemo_curator/gpu_deduplication/write_deduped_result_with_text.py create mode 100644 nemo_curator/scripts/fuzzy_deduplication/README.md create mode 100644 nemo_curator/scripts/fuzzy_deduplication/__init__.py rename nemo_curator/scripts/{ => fuzzy_deduplication}/compute_minhashes.py (94%) rename nemo_curator/scripts/{ => fuzzy_deduplication}/connected_components.py (95%) rename nemo_curator/scripts/{ => fuzzy_deduplication}/jaccard_compute.py (95%) rename nemo_curator/scripts/{ => fuzzy_deduplication}/jaccard_shuffle.py (95%) rename nemo_curator/scripts/{ => fuzzy_deduplication}/map_buckets.py (96%) rename nemo_curator/scripts/{ => fuzzy_deduplication}/minhash_lsh.py (91%) rename nemo_curator/{gpu_deduplication/jaccard_utils => utils/fuzzy_dedup_utils}/merge_utils.py (98%) diff --git a/docs/user-guide/GpuDeduplication.rst b/docs/user-guide/GpuDeduplication.rst index d8b54811b..61eff2b5a 100644 --- a/docs/user-guide/GpuDeduplication.rst +++ b/docs/user-guide/GpuDeduplication.rst @@ -58,24 +58,108 @@ steps (all scripts are included in the :code:`nemo_curator/scripts/` subdirector 2. Output: _exact_duplicates.parquet. List of exact duplicates and the document hash. * Fuzzy Dedup - 1. Minhashes (Compute minhashes) - 1. Input: Data Directories - 2. Output: minhashes.parquet for each data dir. - 2. Buckets (Minhash Buckets/LSH) - 1. Input: Minhash directories - 2. Output: _buckets.parquet - 3. Map Buckets - 1. Input: Buckets.parquet + Data Dirs - 2. Output: anchor_docs_with_bk.parquet - 4. Jaccard Shuffle - 1. Input: anchor_docs_with_bk.parquet + Data Dirs - 2. Output: shuffled_docs.parquet - 5. Jaccard compute - 1. Input: Shuffled docs.parquet - 2. Output: jaccard_similarity_results.parquet - 6. Connected Components - 1. Input: jaccard_similarity_results.parquet - 2. Output: connected_components.parquet + + 1. Compute Minhashes + - Input: Data Directories + - Output: minhashes.parquet for each data dir. + - Example call: + + .. 
code-block:: bash + + # same as `python compute_minhashes.py` + gpu_compute_minhashes \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --output-minhash-dir /path/to/output_minhashes \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + --minhash-length number_of_hashes \ + --char-ngram char_ngram_size \ + --hash-bytes 4(or 8 byte hashes) \ + --seed 42 \ + --log-dir ./ + # --scheduler-file /path/to/file.json + + + 2. Buckets (Minhash Buckets) + - Input: Minhash directories + - Output: Buckets.parquet + - Example call: + + .. code-block:: bash + + # same as `python minhash_lsh.py` + minhash_buckets \ + --input-data-dirs /path/to/output_minhashes/dir1 /path/to/output_minhashes/dir2 \ + --output-bucket-dir /path/to/dedup_output \ + --input-minhash-field _minhash_signature \ + --input-json-id-field id_column_name \ + --minhash-length number_of_hashes \ + --num-bands num_bands \ + --buckets-per-shuffle 1 `#Value b/w [1-num_bands]. Higher is better but might lead to oom` \ + --log-dir ./ + # --scheduler-file /path/to/file.json + + 3. Jaccard Map Buckets + - Input: Buckets.parquet + Data Dir + - Output: anchor_docs_with_bk.parquet + - Example call: + + .. code-block:: bash + + # same as `python map_buckets.py` + jaccard_map_buckets \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --input-bucket-dir /path/to/dedup_output/_buckets.parquet \ + --output-dir /path/to/dedup_output \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + # --scheduler-file /path/to/file.json + + 4. Jaccard Shuffle + - Input: anchor_docs_with_bk.parquet + Data Dir + - Output: shuffled_docs.parquet + - Example call: + + .. code-block:: bash + + # same as `python jaccard_shuffle.py` + jaccard_shuffle \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --input-bucket-mapping-dir /path/to/dedup_output/anchor_docs_with_bk.parquet \ + --output-dir /path/to/dedup_output \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + # --scheduler-file /path/to/file.json + + 5. Jaccard compute + - Input: Shuffled docs.parquet + - Output: jaccard_similarity_results.parquet + - Example call: + + .. code-block:: bash + + # same as `python jaccard_compute.py` + jaccard_compute \ + --shuffled-docs-path /path/to/dedup_output/shuffled_docs.parquet \ + --output-dir /path/to/dedup_output \ + --ngram-size char_ngram_size_for_similarity \ + # --scheduler-file /path/to/file.json + + 6. Connected Components + - Input: jaccard_similarity_results.parquet + - Output: connected_components.parquet + - Example call: + + .. code-block:: bash + + # same as `python connected_components.py` + gpu_connected_component \ + --jaccard-pairs_path /path/to/dedup_output/jaccard_similarity_results.parquet \ + --output-dir /path/to/dedup_output \ + --cache-dir /path/to/cc_cache \ + --jaccard-threshold 0.8 + # --scheduler-file /path/to/file.json + In addition to the scripts, there are examples in the `examples` directory that showcase using the python module directly in your own code. 
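As a rough illustration of consuming the final output of step 6, the sketch below builds a removal list from `connected_components.parquet`. It is only a sketch: the paths are placeholders, and the column names (`dataset_id`, `doc_id`, `group`) follow the schema written by the legacy connected-components implementation removed elsewhere in this change, so verify them against your actual output.

   .. code-block:: python

      import cudf

      # Placeholder path: substitute the --output-dir used in the steps above.
      cc = cudf.read_parquet("/path/to/dedup_output/connected_components.parquet")

      # Keep one representative document per connected component ("group") and
      # mark every other member of that component for removal.
      cc["rank_in_group"] = cc.groupby("group").cumcount()
      docs_to_remove = cc[cc["rank_in_group"] > 0][["dataset_id", "doc_id"]]
      docs_to_remove.to_parquet("/path/to/dedup_output/docs_to_remove.parquet")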
It also has examples on how to remove documents from the corpus using the list of duplicate IDs generated from exact or fuzzy diff --git a/examples/gpu_deduplication_example/README.md b/examples/gpu_deduplication_example/README.md index 9a5e64c15..2f294e1f6 100644 --- a/examples/gpu_deduplication_example/README.md +++ b/examples/gpu_deduplication_example/README.md @@ -1,5 +1,8 @@ ### Deduplication Steps +> [!CAUTION] +> The examples references here are outdated and will be replaced with an example using the Python API directly. For more details on the scripts refer to [nemo_curator/scripts/fuzzy_deduplication](/nemo_curator/scripts/fuzzy_deduplication) + 1. Exact dedup 1. Input: Data directories 2. Output: exact_duplicates.parquet. List of exact duplicates and the document hash. diff --git a/nemo_curator/gpu_deduplication/__init__.py b/nemo_curator/gpu_deduplication/__init__.py deleted file mode 100644 index d9155f923..000000000 --- a/nemo_curator/gpu_deduplication/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/nemo_curator/gpu_deduplication/connected_component.py b/nemo_curator/gpu_deduplication/connected_component.py deleted file mode 100644 index 211fb4515..000000000 --- a/nemo_curator/gpu_deduplication/connected_component.py +++ /dev/null @@ -1,290 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from time import time - -import cudf -import cugraph -import cugraph.dask as dcg -import cugraph.dask.comms.comms as Comms -import cupy -import dask_cudf -import numpy as np -from dask.dataframe.shuffle import shuffle as dd_shuffle -from dask.utils import M - -from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( - convert_str_pair_adlr_ids_to_int, -) -from nemo_curator.gpu_deduplication.utils import ( - enable_spilling, - get_client, - get_num_workers, - parse_nc_args, - timer, -) - - -def sort_adlr_id(df): - x = df[["adlr_id_x", "adlr_id_y"]].values - x = cupy.sort(x, axis=1) - df["adlr_id_x"] = x[:, 0] - df["adlr_id_y"] = x[:, 1] - for i in ["adlr_id_x", "adlr_id_y"]: - df[i] = df[i].astype("uint64") - return df - - -def thresholding(df, threshold=0.8): - mask = df.jaccard > threshold - df.loc[mask, "jaccard"] = np.int8(1) - df.loc[~mask, "jaccard"] = np.int8(0) - return df - - -@timer -def run_connected_components(jaccard_pairs_path, adlr_id_path, output_path): - Comms.initialize(p2p=True) - df = dask_cudf.read_parquet( - jaccard_pairs_path, blocksize="1GB", aggregate_files=True - ) - df = df[df["jaccard"] == 1].reset_index(drop=True) - - labels_df = dask_cudf.read_parquet(adlr_id_path) - num_nodes = len(labels_df) - - self_edge_df = labels_df[["uid"]].rename(columns={"uid": "adlr_id_x"}) - self_edge_df["adlr_id_y"] = self_edge_df["adlr_id_x"] - - df = df[["adlr_id_x", "adlr_id_y"]].astype(np.int64) - df = dask_cudf.concat([df, self_edge_df]) - - G = cugraph.MultiGraph(directed=False) - G.from_dask_cudf_edgelist( - df, source="adlr_id_x", destination="adlr_id_y", renumber=False - ) - result = dcg.weakly_connected_components(G) - del G - max_partitions = min(32, result.npartitions) - n_components = len(result[["labels"]].drop_duplicates(split_out=max_partitions)) - num_labels = len(result) - print("# of groups", n_components) - print("# of docs removed", num_labels - n_components) - labels_df = labels_df.merge( - result, left_on=["uid"], right_on=["vertex"], how="inner" - ) - labels_df = labels_df[["dataset_id", "doc_id", "labels"]] - labels_df = labels_df.rename(columns={"labels": "group"}) - labels_df = labels_df.persist() - # Doing an inner merge above - # should not change any rows - - assert num_nodes == len(labels_df) - print(f"assert num_nodes:{num_nodes}==labels_df:{len(labels_df)} passed") - labels_df.to_parquet(output_path, write_index=False) - Comms.destroy() - - -def attach_args(parser=None): - description = """Computes connected component""" - if not parser: - parser = parse_nc_args(description=description) - - parser.add_argument( - "--jaccard-pairs-path", - type=str, - help="The directory containing the jaccard results", - ) - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results to", - ) - parser.add_argument( - "--cache-dir", - type=str, - help="The cache directory to write intermediate results to", - ) - return parser - - -def delete_cache_data(path): - if "cache" not in path: - return - cmd = f"rm -rf {path}" - print(cmd) - os.system(cmd) - - -def write_output(ddf, output_path): - if not isinstance(output_path, str): - assert TypeError(f"output_path should be str. 
got {type(output_path)}") - print(f"write {output_path} ...") - ddf.to_parquet(output_path, write_index=False) - - -def get_unique_ids_per_partition(df): - unique_df_ls = [] - for tag in ["x", "y"]: - subset_df = df[[f"dataset_id_{tag}", f"doc_id_{tag}"]].drop_duplicates() - subset_df = subset_df.rename( - columns={f"dataset_id_{tag}": "dataset_id", f"doc_id_{tag}": "doc_id"} - ) - unique_df_ls.append(subset_df) - unique_df = cudf.concat(unique_df_ls, ignore_index=True) - unique_df = unique_df.drop_duplicates() - return unique_df - - -@timer -def write_dedup_parsed_adlr_id(args): - dedup_parsed_adlr_id_path = f"{args.cache_dir}/dedup_parsed_adlr_id.parquet" - ddf = dask_cudf.read_parquet( - args.jaccard_pairs_path, - columns=["adlr_id_x", "adlr_id_y"], - blocksize="1GB", - aggregate_files=True, - ) - ddf = ddf.map_partitions( - convert_str_pair_adlr_ids_to_int, - meta={ - "dataset_id_x": "uint32", - "doc_id_x": "int64", - "dataset_id_y": "uint32", - "doc_id_y": "int64", - }, - ) - - unique_docs = ddf.map_partitions(get_unique_ids_per_partition) - unique_docs = unique_docs.drop_duplicates(split_out=ddf.npartitions // 4) - unique_docs["uid"] = np.uint64(1) - unique_docs["uid"] = unique_docs["uid"].cumsum() - unique_docs["uid"] = unique_docs["uid"] - 1 - write_output(unique_docs, dedup_parsed_adlr_id_path) - return dedup_parsed_adlr_id_path - - -def batched_merge_and_write(ddf, ddf_adlr_id, output_path, batch_size=32): - total_batches = (ddf.npartitions + batch_size - 1) // batch_size - for batch_id, offset in enumerate(range(0, ddf.npartitions, batch_size)): - st = time() - subset_ddf = ddf.partitions[offset : offset + batch_size] - for tag in ["x", "y"]: - subset_ddf = subset_ddf.merge( - ddf_adlr_id, - left_on=[f"dataset_id_{tag}", f"doc_id_{tag}"], - right_on=["dataset_id", "doc_id"], - how="inner", - broadcast=True, - ) - subset_ddf = subset_ddf.rename(columns={"uid": f"adlr_id_{tag}"}) - subset_ddf = subset_ddf.drop(columns=[f"dataset_id_{tag}", f"doc_id_{tag}"]) - - subset_ddf = subset_ddf[["adlr_id_x", "adlr_id_y", "jaccard"]] - output_batch_path = os.path.join(output_path, f"{batch_id}.parquet") - subset_ddf.to_parquet(output_batch_path, write_index=False) - - et = time() - print(f"batch_id = {batch_id}/{total_batches}, time = {et - st}", flush=True) - - -@timer -def write_encoded_jaccard_pair(args, client): - dedup_parsed_adlr_id_path = f"{args.cache_dir}/dedup_parsed_adlr_id.parquet" - output_path = f"{args.cache_dir}/encoded_jaccard_pair/" - ddf_adlr_id = dask_cudf.read_parquet( - dedup_parsed_adlr_id_path, blocksize="2GB", aggregate_files=True - ) - ddf_adlr_id = ddf_adlr_id.persist() - len(ddf_adlr_id) - ddf = dask_cudf.read_parquet( - args.jaccard_pairs_path, - blocksize="256MB", - aggregate_files=True, - ) - ddf = ddf.map_partitions( - convert_str_pair_adlr_ids_to_int, - meta={ - "jaccard": "float32", - "dataset_id_x": "uint32", - "doc_id_x": "int64", - "dataset_id_y": "uint32", - "doc_id_y": "int64", - }, - ) - num_workers = get_num_workers(client) - batched_merge_and_write(ddf, ddf_adlr_id, output_path, num_workers) - - -@timer -def write_dedup_encoded_jaccard_pair(args, client): - input_path = f"{args.cache_dir}/encoded_jaccard_pair" - output_path = f"{args.cache_dir}/final_dedup_encoded_jaccard_pair.parquet" - - ddf = dask_cudf.read_parquet(input_path, blocksize="512MB", aggregate_files=True) - meta = {"adlr_id_x": "uint64", "adlr_id_y": "uint64", "jaccard": "float32"} - ddf = ddf.map_partitions(sort_adlr_id, meta=meta) - ddf = ddf.map_partitions(thresholding, meta=meta) - 
ddf = ddf.map_partitions( - M.drop_duplicates, - meta=ddf._meta, - enforce_metadata=False, - transform_divisions=False, - align_dataframes=False, - ) - ddf = dd_shuffle( - ddf, - ["adlr_id_x", "doc_id"], - ignore_index=True, - shuffle="tasks", - ) - ddf = ddf.map_partitions( - M.drop_duplicates, - meta=ddf._meta, - enforce_metadata=False, - transform_divisions=False, - align_dataframes=False, - ) - - write_output(ddf, output_path) - return output_path - - -def main(args): - description = """Takes a dataset consisting of document pairs - and their corresponding jaccard similarity to compute connected - components of docuements across pairs to find similar docuemnt - after applying a given threshold. The result is a dataset - consisting of all documents that are similar (above the threshold) - and the component they belong to.""" - start = time() - output_path = os.path.join(args.output_dir, "connected_components.parquet") - - client = get_client(args) - enable_spilling() - client.run(enable_spilling) - adlr_id_path = write_dedup_parsed_adlr_id(args) - write_encoded_jaccard_pair(args, client) - jaccard_pairs_path = write_dedup_encoded_jaccard_pair(args, client) - run_connected_components(jaccard_pairs_path, adlr_id_path, output_path) - print(f"All done in {time()-start:.1f} seconds") - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/ioutils.py b/nemo_curator/gpu_deduplication/ioutils.py deleted file mode 100644 index 7ac253c04..000000000 --- a/nemo_curator/gpu_deduplication/ioutils.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Sequence - -import cudf -import dask_cudf -from dask import dataframe as dd -from tqdm import tqdm - - -# TODO: -# Combine this with -# nemo_curator.distributed_utils.read_cudf_jsonl -def read_json_func(files, engine="cudf", include_path_column=False, columns=None): - """ - Reads multiple Json Lines files into a cuDF - dataframe with an additional `path` column denoting the path - of the input file. - """ - if not include_path_column: - if columns: - return cudf.read_json(files, engine="cudf", lines=True)[columns] - else: - return cudf.read_json(files, engine="cudf", lines=True) - - dfs = [] - for file in files: - if columns: - df = cudf.read_json(file, engine=engine, lines=True)[columns] - else: - df = cudf.read_json(file, engine=engine, lines=True) - df["path"] = file - dfs.append(df) - return cudf.concat(dfs, ignore_index=True) - - -def bucketed_read(files, func=read_json_func, b_size=2, meta=None, **kwargs): - """ - Read files with `b_size` number of files per bucket. 
- Users can specify their own read - """ - filepaths = [ - files[i : i + b_size] for i in range(0, len(files), b_size) # noqa: E203 - ] - if meta: - return dd.from_map(func, filepaths, meta=meta, **kwargs) - else: - return dd.from_map(func, filepaths, **kwargs) - - -# TODO: Remove this function -def regular_read_json(files, include_path_column=False): - return dask_cudf.read_json( - files, engine="cudf", lines=True, include_path_column=include_path_column - ) - - -def batched_writing( - dask_df: dask_cudf.DataFrame, - output_path: str, - partition_on: Sequence[str], - parts_ber_batch: int = 32, -): - """ - Write a dask dataframe to parquet in batches. - This allows us to do batched exectution and prevent OOMs - Args: - dask_df: dask dataframe to write - output_path: path to write to - partition_on: columns to partition on - parts_ber_batch: number of partitions per batch - """ - - total_partitions = dask_df.npartitions - for batch_id, part_offset in tqdm( - enumerate(range(0, dask_df.npartitions, parts_ber_batch)) - ): - print(f"\nStarted processing batch in = {batch_id}", flush=True) - df = dask_df.partitions[part_offset : part_offset + parts_ber_batch] - if partition_on: - df.to_parquet( - output_path, - partition_on=partition_on, - name_function=lambda x: f"batch_{batch_id}_part_{x}.parquet", - write_metadata_file=False, - ) - else: - df.to_parquet( - output_path, - name_function=lambda x: f"batch_{batch_id}_part_{x}.parquet", - write_metadata_file=False, - ) - print( - f"Part {part_offset+parts_ber_batch}/{total_partitions} completed", - flush=True, - ) - - -def strip_trailing_sep(path: str): - """ - Strips a path string of trailing path seperators like `/` if any. - """ - return path.rstrip(os.path.sep) diff --git a/nemo_curator/gpu_deduplication/jaccard_compute.py b/nemo_curator/gpu_deduplication/jaccard_compute.py deleted file mode 100644 index f90e6c444..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_compute.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import time - -import cudf -import dask.dataframe as dd -import numpy as np - -from nemo_curator.gpu_deduplication.jaccard_utils.jaccard_similarity_utils import ( - compute_jaccard_and_create_pair_df, -) -from nemo_curator.gpu_deduplication.utils import ( - enable_spilling, - get_client, - get_num_workers, - parse_nc_args, -) - - -def create_bins(path_dicts, max_size): - path_dicts.sort(key=lambda x: x["str_bytes"], reverse=True) - bins, bin_sizes = [], [] - for path_d in path_dicts: - new_path, new_size = path_d["path"], path_d["str_bytes"] - for i, bin_size in enumerate(bin_sizes): - if bin_size + new_size <= max_size: - bins[i].append(new_path) - bin_sizes[i] += new_size - new_size = 0 - break - if new_size: - bins.append([new_path]) - bin_sizes.append(new_size) - return bins - - -def get_anchor_docs_and_string_size(path): - df = cudf.read_parquet(path) - str_bytes = df["text"].str.byte_count().sum() - is_anchor_flag = (df["adlr_id"] == df["anchor_1_adlr_id"]) | ( - df["adlr_id"] == df["anchor_0_adlr_id"] - ) - anchor_df = df[is_anchor_flag].reset_index(drop=True) - return anchor_df, {"path": path, "str_bytes": str_bytes} - - -def compute_jaccard_on_1_partition(path): - try: - df = cudf.read_parquet(path) - pair_df = compute_jaccard_and_create_pair_df(df) - except OverflowError: - paths = [entry.path for entry in os.scandir(os.path.join(path))] - anchor_df_str_size_ls = [ - get_anchor_docs_and_string_size(path) for path in paths - ] - anchor_df = cudf.concat( - [anchor_doc for anchor_doc, _ in anchor_df_str_size_ls], ignore_index=True - ).drop_duplicates() - df_str_size = [str_size for _, str_size in anchor_df_str_size_ls] - paths = create_bins(df_str_size, np.iinfo(np.int32).max // 10) - pair_dfs = [] - for path in paths: - print(path) - df = cudf.read_parquet(path).reset_index(drop=True) - df = cudf.concat([df, anchor_df], ignore_index=True) - pair_df = compute_jaccard_and_create_pair_df(df) - pair_dfs.append(pair_df) - pair_df = cudf.concat(pair_dfs, ignore_index=True) - return pair_df - - -def run_jaccard_compute(shuffled_docs_path, output_final_results_path): - print("Starting Jaccard Computation", flush=True) - st = time.time() - paths = [ - entry.path - for entry in os.scandir(shuffled_docs_path) - if not entry.path.endswith(".txt") - ] - meta_df = cudf.DataFrame( - { - "adlr_id_x": ["x"], - "adlr_id_y": ["y"], - "jaccard": np.float32([0.0]), - } - ) - result_df = dd.from_map( - compute_jaccard_on_1_partition, paths, meta=meta_df - ).reset_index(drop=True) - - result_df.to_parquet( - output_final_results_path, - write_index=False, - write_metadata_file=False, - ) - print(f"Jaccard Computing+Writing time: {time.time() - st:.1f} seconds") - - -def main(args): - description = """Computes the Jaccard similarity between document pairs - from partitioned parquet dataset. Result is a parquet dataset consiting of - document id pair along with their Jaccard similarity score. 
- """ - OUTPUT_PATH = args.output_dir - shuffled_docs_path = args.shuffled_docs_path - output_final_results_path = os.path.join(OUTPUT_PATH, "dedup_final_results.parquet") - client = get_client(args) - enable_spilling() - client.run(enable_spilling) - print(f"Num Workers = {get_num_workers(client)}", flush=True) - print("Connected to dask cluster", flush=True) - print("Running jaccard compute script", flush=True) - - # Run actual computation - run_jaccard_compute(shuffled_docs_path, output_final_results_path) - - -def attach_args(parser=None): - description = """Computes jaccard similarity""" - if not parser: - parser = parse_nc_args(description=description) - - parser.add_argument( - "--shuffled-docs-path", - type=str, - help="The directory containing the shuffled documents", - ) - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results to", - ) - return parser - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/jaccard_map_buckets.py b/nemo_curator/gpu_deduplication/jaccard_map_buckets.py deleted file mode 100644 index aa60787d4..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_map_buckets.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import time - -from dask.dataframe.shuffle import shuffle as dd_shuffle -from dask.utils import M - -from nemo_curator.gpu_deduplication.jaccard_utils.get_anchor_utils import ( - add_anchor_docs, -) -from nemo_curator.gpu_deduplication.jaccard_utils.get_output_map_utils import ( - get_output_map_based_on_str_bytes, -) -from nemo_curator.gpu_deduplication.jaccard_utils.io_utils import ( - get_bucket_ddf_from_parquet_path, - get_text_ddf_from_json_path_with_blocksize, -) -from nemo_curator.gpu_deduplication.utils import ( - get_client, - get_num_workers, - parse_nc_args, -) - - -def get_anchor_and_output_map_info( - input_data_paths, - input_bucket_path, - text_ddf_blocksize, - num_files, - num_workers, - shuffle_type, -): - """ - Get anchor docs with bucket info - Args: - input_data_paths: list of paths to input data - input_bucket_path: path to input buckets - text_ddf_blocksize: blocksize for text ddf - num_files: number of files to read - num_workers: number of workers - shuffle_type: type of shuffle to use - Returns: - ddf_anchor_docs_with_bk - """ - ddf_text = get_text_ddf_from_json_path_with_blocksize( - input_data_paths=input_data_paths, - num_files=num_files, - blocksize=text_ddf_blocksize, - ) - ddf_bk = get_bucket_ddf_from_parquet_path( - input_bucket_path=input_bucket_path, num_workers=num_workers - ) - output_map_df = get_output_map_based_on_str_bytes(ddf_bk=ddf_bk, ddf_text=ddf_text) - ddf_anchor_docs_with_bk = ddf_bk.map_partitions(add_anchor_docs) - print("output_map_df is based on string bytes", flush=True) - ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.merge( - output_map_df, on=["bucket"] - ) - # Bucket is no longer needed - ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.drop(columns=["bucket"]) - # Below removes any duplicates lying around after dropping buckets - ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.map_partitions( - M.drop_duplicates, - meta=ddf_anchor_docs_with_bk._meta, - enforce_metadata=False, - transform_divisions=False, - align_dataframes=False, - ) - ddf_anchor_docs_with_bk = dd_shuffle( - ddf_anchor_docs_with_bk, - ["dataset_id", "doc_id"], - ignore_index=True, - shuffle=shuffle_type, - ).map_partitions( - M.drop_duplicates, - meta=ddf_anchor_docs_with_bk._meta, - enforce_metadata=False, - transform_divisions=False, - align_dataframes=False, - ) - del output_map_df - return ddf_anchor_docs_with_bk - - -def attach_args(parser=None): - description = """Takes the buckets generated from minhashes and uses - document length information to create a coarse mapping of mapping multiple - buckets to a logical partition by using a modified bin packing algorithm. 
- """ - if not parser: - parser = parse_nc_args(description=description) - parser.add_argument( - "--input-bucket-dir", - type=str, - help="The directory containing bucket information files", - ) - parser.add_argument( - "--text-ddf-blocksize", - type=int, - default=256, - help="The block size for chunking jsonl files for text ddf in mb", - ) - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results in", - ) - parser.add_argument( - "--shuffle-type", - type=str, - default="tasks", - help="Type of shuffle to use before writing to parquet", - ) - return parser - - -def jaccard_get_output_map_workflow( - client, - input_data_paths, - input_bucket_path, - output_anchor_docs_with_bk_path, - text_ddf_blocksize, - num_files, - shuffle_type, -): - """ - Workflow for jaccard shuffle - Args: - client: dask client - input_data_paths: list of paths to input data - input_bucket_path: path to input buckets - output_anchor_docs_with_bk_path: path to save anchor docs with bucket info - text_ddf_blocksize: blocksize for text ddf - num_files: number of files to read - parts_per_worker: number of parts per worker - shuffle_type: type of shuffle to use before writing to parquet - """ - num_workers = get_num_workers(client) - ddf_anchor_docs_with_bk = get_anchor_and_output_map_info( - input_data_paths, - input_bucket_path, - text_ddf_blocksize, - num_files, - num_workers, - shuffle_type, - ) - ddf_anchor_docs_with_bk.to_parquet( - output_anchor_docs_with_bk_path, - write_index=False, - ) - - -def main(args): - input_data_paths = args.input_data_dirs - input_bucket_path = args.input_bucket_dir - OUTPUT_PATH = args.output_dir - output_anchor_docs_with_bk_path = os.path.join( - OUTPUT_PATH, "anchor_docs_with_bk.parquet" - ) - client = get_client(args) - print(f"Num Workers = {get_num_workers(client)}", flush=True) - print("Connected to dask cluster", flush=True) - print("Running jaccard map buckets script", flush=True) - print(f"Args = {args}") - st = time.time() - jaccard_get_output_map_workflow( - client, - input_data_paths, - input_bucket_path, - output_anchor_docs_with_bk_path, - args.text_ddf_blocksize, - args.num_files, - args.shuffle_type, - ) - et = time.time() - print(f"Bucket Mapping time taken = {et-st} s") - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/jaccard_shuffle.py b/nemo_curator/gpu_deduplication/jaccard_shuffle.py deleted file mode 100644 index 846d30c4d..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_shuffle.py +++ /dev/null @@ -1,399 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import time - -import cudf -from tqdm import tqdm - -from nemo_curator.gpu_deduplication.jaccard_utils.batch_shuffle_utils import ( - text_bytes_aware_shuffle, -) -from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( - combine_back_adlr_ids, -) -from nemo_curator.gpu_deduplication.jaccard_utils.io_utils import ( - aggregated_anchor_docs_with_bk_read, - get_restart_offsets, - get_text_ddf_from_json_path_with_blocksize, - update_restart_offsets, -) -from nemo_curator.gpu_deduplication.jaccard_utils.merge_utils import ( - extract_partitioning_index, - filter_text_rows_by_bucket_batch, - merge_left_to_shuffled_right, -) -from nemo_curator.gpu_deduplication.utils import ( - get_client, - get_num_workers, - parse_nc_args, - performance_report_if, -) - - -def write_partitioned_file(df, output_path, partition_on, batch_id): - if len(df) == 0: - return cudf.Series([True]) - - cudf.io.parquet.write_to_dataset( - df, - output_path, - partition_cols=[partition_on], - filename=f"batch_{batch_id}.parquet", - ) - return cudf.Series([True]) - - -def batched_merge_and_write( - left_df, - right_df, - merge_on, - partition_on, - output_path, - parts_per_text_batch, - parts_per_bucket_batch, - bk_mapping, - num_workers=None, -): - - total_text_partitions = left_df.npartitions - total_bucket_partitions = right_df.npartitions - - # Extract global partitioning index - left_df, global_partitioning_index = extract_partitioning_index( - left_df, - merge_on, - bk_mapping, - parts_per_bucket_batch, - total_bucket_partitions, - ) - - # Set start offsets - bucket_part_start_offset, text_part_start_offset = get_restart_offsets(output_path) - - # Set end offsets - # NOTE: These end offsets are always set to the end - # of the data. However, we may want to be able to set - # both the start and end offsets from the command line - # in the future. - bucket_part_end_offset = total_bucket_partitions - text_part_end_offset = total_text_partitions - - # Check that offsets are valid - assert bucket_part_start_offset % parts_per_bucket_batch == 0 - assert bucket_part_end_offset > bucket_part_start_offset - assert text_part_end_offset > text_part_start_offset - - # Initialize "retry" variables - # - # - retry_count: The number of successive batches that - # we have already performed at a reduced batch size. - # - retry_threshold: The number of successive batches - # for which we should keep the batch size low - # before attempting the default batch size again. - # Every time we return to the default batch size - # and immediately fail, retry_threshold will double. 
- parts_per_text_batch_retry = None - retry_count, retry_threshold = 0, 1 - - print( - f"Starting at bucket-map partition {bucket_part_start_offset}" - f" and text-df partition {text_part_start_offset}", - flush=True, - ) - - for bucket_part_offset in tqdm( - range(bucket_part_start_offset, bucket_part_end_offset, parts_per_bucket_batch) - ): - - # Outer loop over batches of "bucket-map" partitions - end_bucket_offset = min( - bucket_part_offset + parts_per_bucket_batch, bucket_part_end_offset - ) - print( - f"\nStarted processing bucket-map partitions {bucket_part_offset} " - f"through {end_bucket_offset} of {bucket_part_end_offset}", - flush=True, - ) - st_bucket = time.time() - - # Select our bucket-mapping batch - subset_bucket_df = right_df.partitions[bucket_part_offset:end_bucket_offset] - subset_bucket_df = subset_bucket_df.persist() - - # Filter out rows of left_df that we know cannot - # align with any rows of subset_bucket_df - left_df_use = filter_text_rows_by_bucket_batch( - left_df, - global_partitioning_index, - bucket_part_offset, - bucket_part_end_offset, - total_bucket_partitions, - ) - - text_part_offset = text_part_start_offset - while text_part_offset < text_part_end_offset: - - # Check if we are "retrying" with a smaller "parts_per_text_batch" - if parts_per_text_batch_retry: - parts_per_text_batch_use = parts_per_text_batch_retry - else: - st_text = time.time() - parts_per_text_batch_use = parts_per_text_batch - print(f"Using {parts_per_text_batch_use} text partitions.", flush=True) - - # Select partitions for our text batch - end_text_offset = min( - text_part_offset + parts_per_text_batch_use, text_part_end_offset - ) - subset_text_df = left_df_use.partitions[text_part_offset:end_text_offset] - - try: - # NOTE: If we have more text-df partitions than bucket-map - # partitions, we are more likely to see an OverflowError - output_df = text_bytes_aware_shuffle( - merge_left_to_shuffled_right( - subset_text_df, - subset_bucket_df, - merge_on, - ), - partition_on, - num_workers=num_workers, - ) - except OverflowError as err: - # We encountered an overflow error! - # Let's try again with less text data - parts_per_text_batch_retry = int(parts_per_text_batch_use / 2) - if parts_per_text_batch_retry < 1: - raise err - print( - f"\nWe encountered an OverflowError and will retry " - f"the current batch with {parts_per_text_batch_retry} " - f"text partitions instead of {parts_per_text_batch_use}.", - flush=True, - ) - continue - - output_df = output_df.map_partitions(combine_back_adlr_ids) - batch_label = f"{end_bucket_offset}_{end_text_offset}" - written_files = output_df.map_partitions( - write_partitioned_file, - output_path, - partition_on, - batch_label, - meta=cudf.Series([True]), - ) - written_files = written_files.compute() - update_restart_offsets(output_path, bucket_part_offset, end_text_offset) - del output_df - - print( - "Text-df partition ", - f"{end_text_offset}/{text_part_end_offset} " - f"completed in {time.time()-st_text}", - flush=True, - ) - - # Update loop control-flow variables - if parts_per_text_batch_use == parts_per_text_batch: - # We succeeded at the default batch size. 
- # Reset the retry count - retry_count, retry_threshold = 0, 1 - else: - # We succeeded at a lower batch size - retry_count += 1 - if retry_count >= retry_threshold: - # Go back to the default text-batch size, - # but increase the retry_threshold in - # case we fail again - parts_per_text_batch_retry = None - retry_count, retry_threshold = 0, min(retry_threshold * 2, 16) - text_part_offset += parts_per_text_batch_use - - update_restart_offsets(output_path, end_bucket_offset, end_text_offset) - print( - "Bucket partition ", - f"{end_bucket_offset}/{bucket_part_end_offset} " - f"completed in {time.time()-st_bucket}", - flush=True, - ) - - # Need to reset text_part_start_offset to 0 after - # a single bucket-batch pass (only matters if we are - # breaking the bucket-mapping df into multiple batches) - text_part_start_offset = 0 - - -def jaccard_shuffling_workflow( - client, - input_data_paths, - input_anchor_docs_with_bk_dir, - output_shuffled_docs_path, - text_ddf_blocksize, - bucket_mapping_ddf_blocksize, - num_files, - parts_per_worker, - profile_path, - bucket_parts_per_worker, -): - """' - Args: - client: dask client - input_data_paths: paths to input data - input_anchor_docs_with_bk_dir: path to input anchor docs with buckets - output_shuffled_docs_path: path to output shuffled docs - text_ddf_blocksize: block size for chunking jsonl files for text ddf - bucket_mapping_ddf_blocksize: block size for chunking parquet files - for anchor_docs_with_bk ddf - num_files: number of files to process - parts_per_worker: parts per worker to process in a batch - profile_path: dask profile path - bucket_parts_per_worker: bucket parts per worker to process in a batch - """ - # Part1. Reading+Shuffling Data - # Read Text from Data from jsonl files - - text_ddf = get_text_ddf_from_json_path_with_blocksize( - input_data_paths=input_data_paths, - num_files=num_files, - blocksize=text_ddf_blocksize, - ) - print( - "Graph creation for get_text_ddf_from_json_path_with_blocksize" " complete.", - flush=True, - ) - print(f"text_ddf.npartitions = {text_ddf.npartitions}", flush=True) - st = time.time() - ddf_anchor_docs_with_bk, bk_mapping = aggregated_anchor_docs_with_bk_read( - input_anchor_docs_with_bk_dir, - blocksize=bucket_mapping_ddf_blocksize, - ) - print("Getting ddf_anchor_docs_with_bk completed") - print( - f"ddf_anchor_docs_with_bk.npartitions = {ddf_anchor_docs_with_bk.npartitions}", - flush=True, - ) - st = time.time() - num_workers = get_num_workers(client) - parts_per_batch = num_workers * parts_per_worker - print(f"parts_per_batch = {parts_per_batch}") - parts_per_bucket_batch = num_workers * bucket_parts_per_worker - print(f"parts_per_bucket_batch = {parts_per_bucket_batch}") - dask_profile_name = f"blocksize-{text_ddf_blocksize}" - dask_profile_name = dask_profile_name + f"parts_per_batch-{parts_per_batch}" - dask_profile_name = ( - dask_profile_name + f"-parts_per_bucket_batch-{parts_per_bucket_batch}" - ) - dask_profile_name = dask_profile_name + f"-jaccard-n_input_files-{num_files}.html" - - text_ddf = text_ddf[["dataset_id", "doc_id", "text"]] - - with performance_report_if(profile_path, dask_profile_name): - # Merge and write the dataframes - batched_merge_and_write( - text_ddf, - ddf_anchor_docs_with_bk, - output_path=output_shuffled_docs_path, - merge_on=["dataset_id", "doc_id"], - partition_on="output_partition_id", - parts_per_text_batch=parts_per_batch, - parts_per_bucket_batch=parts_per_bucket_batch, - bk_mapping=bk_mapping, - num_workers=num_workers, - ) - print(f"Writing+Shuffling 
data took = {time.time()-st} s", flush=True) - - -def main(args): - input_data_paths = args.input_data_dirs - input_anchor_docs_with_bk_dir = args.input_bucket_mapping_dir - OUTPUT_PATH = args.output_dir - output_anchor_docs_with_bk_path = os.path.join( - OUTPUT_PATH, "anchor_docs_with_bk.parquet" - ) - output_shuffled_docs_path = os.path.join(OUTPUT_PATH, "shuffled_docs.parquet") - client = get_client(args) - print(f"Num Workers = {get_num_workers(client)}", flush=True) - print("Connected to dask cluster", flush=True) - print("Running jaccard shuffle script", flush=True) - print(f"Args = {args}") - st = time.time() - jaccard_shuffling_workflow( - client=client, - input_data_paths=input_data_paths, - input_anchor_docs_with_bk_dir=input_anchor_docs_with_bk_dir, - output_shuffled_docs_path=output_shuffled_docs_path, - text_ddf_blocksize=args.text_ddf_blocksize, - bucket_mapping_ddf_blocksize=args.bucket_mapping_ddf_blocksize, - num_files=args.num_files, - parts_per_worker=args.parts_per_worker, - profile_path=args.profile_path, - bucket_parts_per_worker=args.bucket_parts_per_worker, - ) - et = time.time() - print(f"Jaccard Shuffle E2E time taken = {et-st} s") - - -def attach_args(parser=None): - description = """Shuffles input text documents based on the given bucket - map. The output is a partitioned parquet dataset with the documents - shuffled by buckets - """ - if not parser: - parser = parse_nc_args(description=description) - - parser.add_argument( - "--input-bucket-mapping-dir", - type=str, - help="The directory containing anchor docs with bk files", - ) - parser.add_argument( - "--text-ddf-blocksize", - type=int, - default=256, - help="The block size for chunking jsonl files for text ddf in mb", - ) - parser.add_argument( - "--bucket-mapping-ddf-blocksize", - type=int, - default=256, - help="The block size for for anchor_docs_with_bk ddf in mb", - ) - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results in", - ) - parser.add_argument( - "--parts-per-worker", - default=2, - type=int, - help="The number of parts to process per worker per batch", - ) - parser.add_argument( - "--bucket-parts-per-worker", - default=8, - type=int, - help="The number of bucket parts to process per worker per batch", - ) - return parser - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/__init__.py b/nemo_curator/gpu_deduplication/jaccard_utils/__init__.py deleted file mode 100644 index d9155f923..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
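The batch-size handling in `batched_merge_and_write` (from the `jaccard_shuffle.py` module removed above) is easy to lose among the merge and shuffle details. The following is a minimal sketch of just that retry scheme; `process_batch` is a hypothetical stand-in for the per-batch merge, shuffle, and write work, and the constants mirror the removed code.

```python
def merge_in_batches(num_parts: int, default_batch_size: int, process_batch) -> None:
    """Sketch of the OverflowError retry scheme used by batched_merge_and_write.

    `process_batch(start, end)` is a hypothetical callable doing the real
    merge + shuffle + write for text partitions [start, end).
    """
    retry_batch_size = None  # reduced size while in "retry" mode
    retry_count, retry_threshold = 0, 1
    offset = 0
    while offset < num_parts:
        batch_size = retry_batch_size or default_batch_size
        end = min(offset + batch_size, num_parts)
        try:
            process_batch(offset, end)
        except OverflowError:
            # Too much text in one batch: halve the batch size and retry the
            # same offset without advancing.
            retry_batch_size = batch_size // 2
            if retry_batch_size < 1:
                raise
            continue
        if batch_size == default_batch_size:
            # Success at the default size: leave retry mode entirely.
            retry_count, retry_threshold = 0, 1
        else:
            # Success at a reduced size: only return to the default size after
            # `retry_threshold` consecutive successes, and double the threshold
            # (capped at 16) so that repeated failures back off for longer.
            retry_count += 1
            if retry_count >= retry_threshold:
                retry_batch_size = None
                retry_count, retry_threshold = 0, min(retry_threshold * 2, 16)
        offset += batch_size
```

The effect is that one oversized batch only slows down its own stretch of the data, while sustained success at the reduced size eventually restores the default batch size.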
diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/batch_shuffle_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/batch_shuffle_utils.py deleted file mode 100644 index 755112d0c..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/batch_shuffle_utils.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cudf -import dask_cuda -import numpy as np -from dask import config -from dask.dataframe.shuffle import rearrange_by_column -from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle -from packaging.version import Version - -from nemo_curator.gpu_deduplication.jaccard_utils.get_output_map_utils import ( - build_partition, - get_agg_text_bytes_df, -) - -USE_EXCOMMS = Version(dask_cuda.__version__) >= Version("23.10") - - -def rearange_by_column_direct( - df, - col, - npartitions, - ignore_index, - excomms_default=USE_EXCOMMS, -): - # Execute a "direct" shuffle operation without staging - if config.get("explicit-comms", excomms_default): - # Use explicit comms unless the user has - # disabled it with the dask config system, - # or we are using an older version of dask-cuda - return explicit_comms_shuffle( - df, - [col], - npartitions=npartitions, - ignore_index=ignore_index, - ) - else: - return rearrange_by_column( - df, - col=col, - shuffle="tasks", - # Prevent staged shuffling by setting max_branch - # to the number of input partitions + 1 - max_branch=npartitions + 1, - npartitions=npartitions, - ignore_index=ignore_index, - ) - - -def get_shuffle_part_ids_df(agg_df, partition_on, num_workers=None): - sizes = agg_df[f"{partition_on}_text_bytes"].values - max_text_bytes_per_part = int(np.iinfo(np.int32).max // 1.2) - - # Adjust max_text_bytes_per_part if the number of output - # partitions is small compared to the number of workers. 
- # Sometimes we just have very few output partitions to - # deal with, and just need a larger batch - npartitions_min = int(num_workers * 0.8) - while True: - output_ar = build_partition(sizes.get(), max_text_bytes_per_part) - if output_ar.max() > npartitions_min or max_text_bytes_per_part < 2**24: - break - max_text_bytes_per_part = int(max_text_bytes_per_part // 2.0) - - df = cudf.DataFrame() - df[partition_on] = agg_df[partition_on] - df["_partitions"] = output_ar - return df - - -def get_shuffle_partition_info(df, partition_on, num_workers=None): - df["text_bytes"] = df["text"].map_partitions(lambda s: s.str.byte_count()) - agg_df = get_agg_text_bytes_df(df, partition_on, 1) - del df - - agg_df = agg_df.reset_index(drop=True) - shuffle_part_ids = agg_df.map_partitions( - get_shuffle_part_ids_df, partition_on, num_workers=num_workers - ).persist() - return shuffle_part_ids - - -def text_bytes_aware_shuffle(df, partition_on, num_workers=None): - """ - This shuffle takes into account the text bytes of each partition - and tries to make sure that the output partitions do not exceed - the char limit of cuDF - - Args: - df: dask_cudf dataframe - partition_on: column name to partition on - - - Returns: - dask_cudf dataframe with _partitions columns - """ - print("Starting text bytes aware shuffle", flush=True) - df = df.persist() - shuffle_part_ids = get_shuffle_partition_info( - df, partition_on, num_workers=num_workers - ) - n_output_partitions = shuffle_part_ids["_partitions"].max().compute() + 1 - n_output_partitions = int(n_output_partitions) - df = df.merge(shuffle_part_ids, on=partition_on, how="inner").persist() - - df = ( - rearange_by_column_direct( - df, - col="_partitions", - npartitions=n_output_partitions, - ignore_index=True, - excomms_default=True, - ) - .drop(columns=["_partitions"]) - .persist() - ) - print(f"Will write {len(df)} rows to disk", flush=True) - return df diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/doc_id_mapping.py b/nemo_curator/gpu_deduplication/jaccard_utils/doc_id_mapping.py deleted file mode 100644 index e29c626fe..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/doc_id_mapping.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -def convert_str_id_to_int(df, id_column="id"): - """ - Converts the legacy id format "dataset_name-0000034" - type of ID into 2 int based ID's - """ - dx = df[id_column].str.rsplit("-", n=1, expand=True) - df["doc_id"] = dx[1].astype("int64").values - df["dataset_id"] = dx[0].hash_values() - df.drop(columns=[id_column], inplace=True) - return df - - -def convert_str_pair_adlr_ids_to_int(df): - assert "adlr_id_x" in df.columns - assert "adlr_id_y" in df.columns - - for tag in ["x", "y"]: - dx = df[f"adlr_id_{tag}"].str.rsplit("-", n=1, expand=True) - df[f"dataset_id_{tag}"] = dx[0].astype("uint32").values - df[f"doc_id_{tag}"] = dx[1].astype("int64").values - # See the above convert_adlr_id_to_int function - df = df.drop(columns=[f"adlr_id_{tag}"]) - return df - - -def combine_back_adlr_ids(df): - df["adlr_id"] = df["dataset_id"].astype(str) + "-" + df["doc_id"].astype(str) - df.drop(columns=["dataset_id", "doc_id"], inplace=True) - - if "anchor_0_dataset_id" in df.columns: - df["anchor_0_adlr_id"] = ( - df["anchor_0_dataset_id"].astype(str) - + "-" - + df["anchor_0_doc_id"].astype(str) - ) - df.drop(columns=["anchor_0_dataset_id", "anchor_0_doc_id"], inplace=True) - - if "anchor_1_dataset_id" in df.columns: - df["anchor_1_adlr_id"] = ( - df["anchor_1_dataset_id"].astype(str) - + "-" - + df["anchor_1_doc_id"].astype(str) - ) - df.drop(columns=["anchor_1_dataset_id", "anchor_1_doc_id"], inplace=True) - return df diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/get_anchor_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/get_anchor_utils.py deleted file mode 100644 index ea734dede..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/get_anchor_utils.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def random_select_anchor(df_bk, n=2): - """ - Randomly select `n` anchors from each bucket. - """ - df_bk = df_bk.copy() - df_bk["hash"] = df_bk[["doc_id", "dataset_id"]].hash_values() - df_bk = df_bk.sort_values(["bucket", "hash"]) - df_bk["order_in_bucket"] = df_bk.groupby("bucket").cumcount() - df_bk["is_anchor"] = df_bk["order_in_bucket"] < n - for i in range(0, n): - df_bk[f"is_anchor_id_{i}"] = df_bk["order_in_bucket"] == i - df_bk = df_bk.drop(columns=["hash", "order_in_bucket"], axis=1) - df_bk = df_bk.reset_index(drop=True) - df_bk = df_bk[df_bk.is_anchor] - return df_bk - - -def add_anchor_docs(df_bk): - """ - Get anchor documents for each bucket. 
- """ - num_anchors = 2 - df_anchor_bk = random_select_anchor(df_bk=df_bk, n=num_anchors) - df_anchor_bk_0 = df_anchor_bk[df_anchor_bk["is_anchor_id_0"]][ - ["bucket", "dataset_id", "doc_id"] - ].reset_index(drop=True) - df_anchor_bk_0 = df_anchor_bk_0.rename( - columns={"doc_id": "anchor_0_doc_id", "dataset_id": "anchor_0_dataset_id"} - ) - - df_anchor_bk_1 = df_anchor_bk[df_anchor_bk["is_anchor_id_1"]][ - ["bucket", "dataset_id", "doc_id"] - ].reset_index(drop=True) - df_anchor_bk_1 = df_anchor_bk_1.rename( - columns={"doc_id": "anchor_1_doc_id", "dataset_id": "anchor_1_dataset_id"} - ) - - df_anchor_docs = df_anchor_bk_1.merge(df_anchor_bk_0, on=["bucket"], how="inner") - df_anchor_docs_with_bk = df_bk.merge(df_anchor_docs, on=["bucket"], how="inner") - return df_anchor_docs_with_bk diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/get_output_map_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/get_output_map_utils.py deleted file mode 100644 index bdbdedc6f..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/get_output_map_utils.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cudf -import dask_cudf -import numba -import numpy as np - -from nemo_curator._compat import DASK_SHUFFLE_METHOD_ARG - - -# next-fit-descending bin packing -# https://en.wikipedia.org/wiki/Next-fit-decreasing_bin_packing -@numba.jit(nopython=True) -def build_partition(sizes: np.ndarray, max_size): - i: int = 0 - count: int = 0 - current: int = 0 - size: int = 0 - partition = np.empty(sizes.shape, dtype=np.int32) - for i in range(len(sizes)): - size = sizes[i] - if current + size < max_size: - partition[i] = count - current += size - else: - count += 1 - current = size - partition[i] = count - return partition - - -def update_id(df, lower_bound): - df["output_partition_id"] += lower_bound - return df - - -def get_output_part_ids_with_approx_equal_sum( - bucket_text_bytes_df, max_text_bytes_per_part: int -): - """' - Create a output_series that maps the ser.index into `nparts` - so that the total sum of bucket_val_counts_df - for each output id are all most equal and - less than max_text_bytes_per_part - This is used downstream for creating equal output_ids - """ - sizes = bucket_text_bytes_df["bucket_text_bytes"].values - bucket_output_ar = build_partition(sizes.get(), max_text_bytes_per_part) - df = cudf.DataFrame() - df["bucket"] = bucket_text_bytes_df["bucket"] - df["output_partition_id"] = bucket_output_ar - return df - - -def get_agg_text_bytes_df(df, agg_column, n_partitions, shuffle=False): - shuffle_arg = "shuffle_method" if DASK_SHUFFLE_METHOD_ARG else "shuffle" - agg_df = ( - df[[agg_column, "text_bytes"]] - .groupby([agg_column]) - .agg({"text_bytes": "sum"}, split_out=n_partitions, **{shuffle_arg: shuffle}) - ) - agg_df = agg_df.rename(columns={"text_bytes": f"{agg_column}_text_bytes"}) - agg_df = agg_df.reset_index(drop=False) - # Doing a per partition sort - # seems to 
cause issues with - # jaccard shuffle (Overflow errors) - # which are caught and then - # retried with text_bytes_aware_merge - agg_df = agg_df.persist() - agg_df = agg_df.sort_values( - by=[f"{agg_column}_text_bytes"], ascending=False, ignore_index=True - ) - agg_df = agg_df.persist() - # Added length to force computation - # after persist - print(f"Agg_df computed of length = {len(agg_df)}", flush=True) - return agg_df - - -def get_output_map_from_text_bytes_per_bucket(ddf_bk_text_bytes): - # String bytes limit for cuDF - max_text_bytes_per_part = int(np.iinfo(np.int32).max // 1.2) - print(f"max_text_bytes_per_part = {max_text_bytes_per_part}") - - # Increasing in an attempt to prevent hitting - # ulimits - output_map_df_meta = cudf.DataFrame({"bucket": [0], "output_partition_id": [1]}) - output_map_df_meta["bucket"] = output_map_df_meta["bucket"].astype(np.uint64) - output_map_df_meta["output_partition_id"] = output_map_df_meta[ - "output_partition_id" - ].astype(np.int32) - output_map_df = ddf_bk_text_bytes.map_partitions( - get_output_part_ids_with_approx_equal_sum, - max_text_bytes_per_part, - meta=output_map_df_meta, - ) - output_map_df = output_map_df.persist() - print(f"Step 1 of output_map_df of len: {len(output_map_df)} computed") - lower_bounds = ( - output_map_df["output_partition_id"] - .map_partitions(lambda s: (s.max() + 1)) - .compute() - ) - lower_bounds = np.cumsum(lower_bounds) - - updated_parts = [ - output_map_df.get_partition(i).map_partitions(update_id, lower_bounds[i - 1]) - for i in range(1, len(lower_bounds)) - ] - updated_parts.append(output_map_df.get_partition(0)) - output_map_df = dask_cudf.concat(updated_parts) - output_map_df = output_map_df.persist() - print(f"All steps of output_map_df of len: {len(output_map_df)} computed") - return output_map_df - - -def get_output_map_based_on_str_bytes(ddf_bk, ddf_text): - """ - Add output_partition_id to ddf_bk - """ - print("Getting text bytes", flush=True) - ddf_text["text_bytes"] = ddf_text["text"].map_partitions( - lambda s: s.str.byte_count() - ) - n_partitions = ddf_bk.npartitions - ddf_text = ddf_text.drop(columns=["text"]).repartition(npartitions=n_partitions) - ddf_bk = ddf_bk.merge(ddf_text).repartition(npartitions=n_partitions) - del ddf_text - ddf_bk_text_bytes = get_agg_text_bytes_df( - ddf_bk, - agg_column="bucket", - n_partitions=n_partitions, - shuffle=True, - ) - del ddf_bk - output_map_df = get_output_map_from_text_bytes_per_bucket(ddf_bk_text_bytes) - return output_map_df diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/io_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/io_utils.py deleted file mode 100644 index a24b99dd5..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/io_utils.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from glob import glob - -import cudf -import dask_cudf -import numpy as np -from dask import dataframe as dd - -from nemo_curator.gpu_deduplication.ioutils import bucketed_read, read_json_func -from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( - convert_adlr_id_to_int, -) - - -def get_bucket_ddf_from_parquet_path(input_bucket_path, num_workers): - # Read parquet-formatted parquet files - ddf_bk = dask_cudf.read_parquet( - input_bucket_path, - blocksize="512MiB", - aggregate_files=True, - ) - # Repartition to ensure we at least have num_workers partitions - npartitions = max(ddf_bk.npartitions, num_workers) - ddf_bk = ddf_bk.repartition(npartitions=npartitions) - print(f"Number of ddf_bk partitions = {ddf_bk.npartitions}", flush=True) - return ddf_bk - - -def aggregated_anchor_docs_with_bk_read(path, blocksize): - from dask.utils import natural_sort_key - from pyarrow.dataset import dataset - - ds = dataset( - sorted(glob(f"{path}/*.parquet"), key=natural_sort_key), - format="parquet", - ) - chunks = chunk_files(ds.get_fragments(), blocksize) - - # Record mapping between file indices and partition indices. - # We need to do this, because our anchor_docs_with_bk data - # should be shuffled on disk. - assert len(chunks) - part_id = np.repeat( - np.arange(len(chunks), dtype="int32"), - np.fromiter(map(len, chunks), dtype="int32"), - ) - file_id = np.arange(len(part_id), dtype="int32") - mapping_df = cudf.DataFrame({"file_id": file_id, "part_id": part_id}) - - meta = cudf.DataFrame.from_arrow(ds.schema.empty_table()) - return dd.from_map(cudf.read_parquet, chunks, meta=meta), mapping_df - - -def get_text_ddf_from_json_path(input_data_paths, num_files, files_per_input_partition): - data_paths = [ - entry.path for data_path in input_data_paths for entry in os.scandir(data_path) - ] - data_paths = [f for f in data_paths if f.endswith(".jsonl")] - if num_files != -1: - data_paths = data_paths[:num_files] - meta_df = cudf.DataFrame( - { - "text": ["x"], - "adlr_id": ["x"], - } - ) - print( - f"Number of files being read for jaccard shuffling= {len(data_paths)}", - flush=True, - ) - - text_ddf = bucketed_read( - data_paths, - b_size=files_per_input_partition, - columns=list(meta_df.columns), - meta=meta_df, - ) - text_ddf = text_ddf.map_partitions( - convert_adlr_id_to_int, - meta=cudf.DataFrame({"text": ["a"], "doc_id": [0], "dataset_id": np.uint32(1)}), - ) - return text_ddf - - -def get_file_size(file_path): - return os.path.getsize(file_path) - - -def get_frag_size(frag): - # Pyarrow dataset fragment - return sum(rg.total_byte_size for rg in frag.row_groups) - - -def chunk_files(file_list, max_size_mb): - """ - Chunk files into lists of files that are less than max_size_mb - """ - - max_size_bytes = max_size_mb * 1024 * 1024 - chunks = [] - current_chunk = [] - current_size = 0 - - for frag_or_path in file_list: - if isinstance(frag_or_path, str): - file_path = frag_or_path - file_size = get_file_size(file_path) - else: - file_path = frag_or_path.path - file_size = get_frag_size(frag_or_path) - - if current_size + file_size <= max_size_bytes: - current_chunk.append(file_path) - current_size += file_size - else: - # Handle case when the first - # file is larger than max_size_mb - if current_chunk: - chunks.append(current_chunk) - current_chunk = [file_path] - current_size = file_size - - if current_chunk: - chunks.append(current_chunk) - - return chunks - - -def get_text_ddf_from_json_path_with_blocksize(input_data_paths, num_files, blocksize): - data_paths = 
[ - entry.path for data_path in input_data_paths for entry in os.scandir(data_path) - ] - data_paths = [f for f in data_paths if f.endswith(".jsonl")] - data_paths.sort() - if num_files != -1: - data_paths = data_paths[:num_files] - meta_df = cudf.DataFrame( - { - "text": ["x"], - "adlr_id": ["x"], - } - ) - print( - f"Number of files being read for jaccard calculation = {len(data_paths)}", - flush=True, - ) - filepaths_ls = chunk_files(data_paths, blocksize) - text_ddf = dd.from_map( - read_json_func, filepaths_ls, columns=list(meta_df.columns), meta=meta_df - ) - text_ddf = text_ddf.map_partitions( - convert_adlr_id_to_int, - meta=cudf.DataFrame({"text": ["a"], "doc_id": [0], "dataset_id": np.uint32(1)}), - ) - return text_ddf - - -def get_restart_offsets(output_path): - bucket_offset, text_offset = 0, 0 - fn = f"{output_path}/_restart_offset.txt" - if os.path.exists(fn): - with open(fn, "r") as f: - offsets = f.readline().strip("\n").split(",") - bucket_offset = int(offsets[0]) - text_offset = int(offsets[1]) - return bucket_offset, text_offset - - -def update_restart_offsets(output_path, bucket_offset, text_offset): - with open(f"{output_path}/_restart_offset.txt", "w") as f: - f.write(f"{bucket_offset},{text_offset}\n") diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/jaccard_similarity_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/jaccard_similarity_utils.py deleted file mode 100644 index be3f4d7b9..000000000 --- a/nemo_curator/gpu_deduplication/jaccard_utils/jaccard_similarity_utils.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
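The `get_restart_offsets` / `update_restart_offsets` pair above persists shuffle progress as a tiny `_restart_offset.txt` checkpoint so an interrupted jaccard shuffle can resume. A self-contained sketch of that round trip, with standalone copies of the two helpers and hypothetical offsets:

```python
# Standalone copies of the two checkpoint helpers above, exercised with
# hypothetical offsets in a temporary directory.
import os
import tempfile

def update_restart_offsets(output_path, bucket_offset, text_offset):
    with open(f"{output_path}/_restart_offset.txt", "w") as f:
        f.write(f"{bucket_offset},{text_offset}\n")

def get_restart_offsets(output_path):
    bucket_offset, text_offset = 0, 0
    fn = f"{output_path}/_restart_offset.txt"
    if os.path.exists(fn):
        with open(fn, "r") as f:
            offsets = f.readline().strip("\n").split(",")
            bucket_offset, text_offset = int(offsets[0]), int(offsets[1])
    return bucket_offset, text_offset

with tempfile.TemporaryDirectory() as tmp:
    print(get_restart_offsets(tmp))   # (0, 0) before any checkpoint exists
    update_restart_offsets(tmp, bucket_offset=3, text_offset=48)
    print(get_restart_offsets(tmp))   # (3, 48) after a partial run
```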
- -import cudf -import numpy as np - - -def compute_jaccard_partition(df): - df["jaccard"] = df["text_x"].str.jaccard_index(df["text_y"], width=5) - df.drop(columns=["text_x", "text_y"], inplace=True) - return df - - -def get_max_num_rows_to_process_once(df): - nbytes = df["text"].str.byte_count().sum() - # Number of exmploded bytes - exploded_bytes = nbytes * 5 * 2 - max_chars_allowed = 2_147_483_647 - byte_ratio = int(exploded_bytes) // max_chars_allowed - if byte_ratio > 1: - nrows_at_once = len(df) // byte_ratio - else: - nrows_at_once = len(df) - - nrows_at_once = max(1, nrows_at_once) - return nrows_at_once - - -def create_empty_jaccard_result(): - df = cudf.DataFrame() - df["adlr_id_x"] = "x" - df["adlr_id_y"] = "y" - df["jaccard"] = np.empty(shape=0, dtype=np.float32) - return df - - -def compute_jaccard_pair(docs_df, anchor_df): - nrows_at_once = get_max_num_rows_to_process_once(docs_df) - result_ls = [] - for i in range(0, docs_df.shape[0], nrows_at_once): - pair_df = docs_df[i : i + nrows_at_once] - pair_df = pair_df.merge(anchor_df, on="anchor_adlr_id") - pair_df = pair_df.rename( - columns={"adlr_id": "adlr_id_x", "anchor_adlr_id": "adlr_id_y"} - ) - mask = pair_df.adlr_id_x != pair_df.adlr_id_y - pair_df = pair_df[mask].reset_index(drop=True) - if len(pair_df) == 0: - result_df = create_empty_jaccard_result() - else: - result_df = compute_jaccard_partition(pair_df) - result_ls.append(result_df) - if len(result_ls) == 0: - return create_empty_jaccard_result() - df_pair = cudf.concat(result_ls) - return df_pair - - -def get_anchor_df(df, anchor_col): - anchor_df = df[df["adlr_id"] == df[anchor_col]] - anchor_df = anchor_df.reset_index(drop=True) - anchor_df = anchor_df[[anchor_col, "text"]] - anchor_df = anchor_df.rename(columns={anchor_col: "anchor_adlr_id"}) - return anchor_df - - -def compute_jaccard_and_create_pair_df(df): - df = df.drop_duplicates( - subset=["adlr_id", "anchor_1_adlr_id", "anchor_0_adlr_id"], ignore_index=True - ) - anchor_columns = ["anchor_0_adlr_id", "anchor_1_adlr_id"] - result_ls = [] - try: - for anchor_col in anchor_columns: - doc_df = df[["adlr_id", "text", anchor_col]] - doc_df = doc_df.rename(columns={anchor_col: "anchor_adlr_id"}) - doc_df = doc_df[doc_df["adlr_id"] != doc_df["anchor_adlr_id"]] - anchor_df = get_anchor_df(df, anchor_col) - result_df = compute_jaccard_pair(doc_df, anchor_df) - result_ls.append(result_df) - - return cudf.concat(result_ls) - except OverflowError as e: - print( - "Failed with OverflowError in compute_jaccard_and_create_pair_df", - flush=True, - ) - print(df, flush=True) - print("--" * 30) - print("Error") - print("---" * 30) - raise e diff --git a/nemo_curator/gpu_deduplication/prepare_fuzzy_ids.py b/nemo_curator/gpu_deduplication/prepare_fuzzy_ids.py deleted file mode 100644 index b06601b8d..000000000 --- a/nemo_curator/gpu_deduplication/prepare_fuzzy_ids.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
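`compute_jaccard_partition` above relies on cuDF's `Series.str.jaccard_index(..., width=5)`, i.e. Jaccard similarity over sets of character 5-grams. A pure-Python sketch of that metric, for illustration only (it is not the cuDF implementation):

```python
# Pure-Python illustration of character 5-gram Jaccard similarity, the metric
# computed on the GPU by cudf's Series.str.jaccard_index(other, width=5).
def char_ngrams(text: str, n: int = 5) -> set:
    if len(text) < n:
        return {text}
    return {text[i : i + n] for i in range(len(text) - n + 1)}

def jaccard(a: str, b: str, n: int = 5) -> float:
    x, y = char_ngrams(a, n), char_ngrams(b, n)
    return len(x & y) / len(x | y)

print(jaccard("the quick brown fox jumps", "the quick brown fox leaps"))  # high
print(jaccard("the quick brown fox jumps", "lorem ipsum dolor sit amet"))  # ~0.0
```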
- -import argparse -import json - -import cudf -from dask import dataframe as dd -from dask.distributed import Client - - -def main(args): - # Create the ID mapping - df = cudf.DataFrame() - df["base_id"] = [base_id for base_id in args.base_ids.split(",")] - df["dataset_id"] = df["base_id"].hash_values() - df_pd = df.to_pandas() - - output_dict = { - hashed_id: base_id - for base_id, hashed_id in zip(df_pd["base_id"], df_pd["dataset_id"]) - } - - # Write out the mapping to disk - with open(args.output_id_mapping, "w") as output_file: - json.dump(output_dict, output_file) - - # Index the parquet files by group - client = Client() - ddf = dd.read_parquet(args.path_to_connected_components) - ddf = ddf.set_index("group") - ddf.to_parquet(args.output_indexed_connected_components) - - -def attach_args( - parser=argparse.ArgumentParser( - """ -Prepares the output connected components from dedup for -extraction to .txt and .jsonl files - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) -): - parser.add_argument( - "--base-ids", - type=str, - default="doc_id", - help="A comma-delimited list of base-ids that were used for " - "different datasets during dedup. For example, " - "if you were deduplicating Wikipedia and Common Crawl, you might " - "have adlr_ids such has wiki-000001 and cc-000001. " - "The base-ids in this case would be 'wiki,cc'", - ) - parser.add_argument( - "--path-to-connected-components", - type=str, - default=None, - help="Path to the connected components that is created " - "at the last step of the fuzzy dedup.", - ) - parser.add_argument( - "--output-indexed-connected-components", - type=str, - default=None, - help="Path to the output connected components " - "that have been prepared for " - "extraction to .txt and .jsonl files", - ) - parser.add_argument( - "--output-id-mapping", - type=str, - default="mapping.json", - help="A mapping between each of the strings specified " - "in '--base-ids' and their respective hashes", - ) - return parser - - -if __name__ == "__main__": - main(attach_args().parse_args()) - - -def console_script(): - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/utils.py b/nemo_curator/gpu_deduplication/utils.py deleted file mode 100644 index f6faefe77..000000000 --- a/nemo_curator/gpu_deduplication/utils.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
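The `mapping.json` written by `prepare_fuzzy_ids` above is simply `{hashed_dataset_id: base_id}`. The sketch below only illustrates that shape; it substitutes `zlib.crc32` for cuDF's `hash_values`, so the hash values produced by the real script will differ.

```python
# Shape of the id mapping written above. zlib.crc32 stands in for cudf's
# hash_values purely for illustration; real values will differ.
import json
from zlib import crc32

base_ids = ["wiki", "cc"]  # e.g. --base-ids wiki,cc
mapping = {crc32(base_id.encode()): base_id for base_id in base_ids}

# json.dumps serializes the integer keys as strings:
# {"<hash of 'wiki'>": "wiki", "<hash of 'cc'>": "cc"}
print(json.dumps(mapping, indent=2))
```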
- -import argparse -from time import time - - -def get_num_workers(client): - """ - Returns the number of workers in the cluster - """ - worker_list = list(client.scheduler_info()["workers"].keys()) - return len(worker_list) - - -def get_list_of_lists(lst, nchunks): - """ - Splits a list into nchunks lists - """ - return [lst[i::nchunks] for i in range(nchunks)] - - -def parse_nc_args( - description="Default gpu dedup nemo_curator argument parser", -) -> argparse.ArgumentParser: - """ - Adds default set of arguments that are common to multiple stages - of the pipeline - """ - parser = argparse.ArgumentParser( - description, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "--input-data-dirs", - type=str, - nargs="+", - default=None, - required=False, - help="Input directories consisting of .jsonl files that are accessible " - "to all nodes. This path must be accessible by all machines in the cluster", - ) - parser.add_argument( - "--scheduler-address", - type=str, - default=None, - help="Address to the scheduler of a created dask cluster. If not provided" - "a single node LocalCUDACluster will be started.", - ) - parser.add_argument( - "--scheduler-file", - type=str, - default=None, - help="Path to the scheduler file of a created dask cluster. If not provided" - " a single node LocalCUDACluster will be started.", - ) - parser.add_argument( - "--rmm-pool-size", - type=str, - default=None, - help="Initial pool size to use for the RMM Pool Memory allocator" - "Note: This only applies to the localCUDACluster. If providing an user created " - "cluster refer to" - "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-rmm-pool-size", # noqa: E501 - ) - parser.add_argument( - "--protocol", - type=str, - default="tcp", - help="Protcol to use for dask cluster" - "Note: This only applies to the localCUDACluster. If providing an user created " - "cluster refer to" - "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-protocol", # noqa: E501 - ) - parser.add_argument( - "--nvlink-only", - action="store_true", - help="Start a local cluster with only NVLink enabled." - "Only applicable when protocol=ucx and no scheduler file/address is specified", - ) - parser.add_argument( - "--input-json-text-field", - type=str, - default="text", - help="The name of the field within each json object of the jsonl " - "file that contains the text from which minhashes will be computed. ", - ) - parser.add_argument( - "--input-json-id-field", - type=str, - default="adlr_id", - help="The name of the field within each json object of the jsonl " - "file that assigns a unqiue ID to each document. " - "Can be created by running the script " - "'./prospector/add_id.py' which adds the field 'adlr_id' " - "to the documents in a distributed fashion", - ) - parser.add_argument( - "--log-dir", - type=str, - default="./logs/", - help="The output log directory where node and local", - ) - parser.add_argument( - "--files-per-partition", - type=int, - default=2, - help="Number of jsonl files to combine into single partition", - ) - parser.add_argument( - "--num-files", - type=int, - default=None, - help="Upper limit on the number of json files to process", - ) - parser.add_argument( - "--log-frequency", - type=int, - default=500, - help="The frequency with which to write log messages when " - "computing MinHashses. 
By default a log message will " - "be written every 500 partitions", - ) - parser.add_argument( - "--profile-path", - type=str, - default=None, - help="Path to save dask profile", - ) - return parser - - -def timer(func): - - def wrapper(*args, **kw): - print(f"function {func.__name__} started...") - start = time() - res = func(*args, **kw) - duration = time() - start - timing = f"function {func.__name__} finished in {duration:.1f} seconds" - print(timing) - return res - - return wrapper diff --git a/nemo_curator/gpu_deduplication/verify_all_pairs_jaccard.py b/nemo_curator/gpu_deduplication/verify_all_pairs_jaccard.py deleted file mode 100644 index ae7e6c656..000000000 --- a/nemo_curator/gpu_deduplication/verify_all_pairs_jaccard.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from functools import partial -from time import time - -import cudf -import dask_cudf - -from nemo_curator.gpu_deduplication.jaccard_utils.jaccard_similarity_utils import ( - compute_jaccard_partition, - create_empty_jaccard_result, -) -from nemo_curator.gpu_deduplication.utils import get_client, parse_nc_args - - -def num_ngram(ds): - return ds.str.character_ngrams(5, True).list.unique().list.len() - - -def write_eligible_pairs(dedup_with_text_path, cache_dir): - df = cudf.read_parquet(dedup_with_text_path) - df["num_ngram"] = num_ngram(df["text"]) - df.drop(columns="text", inplace=True) - df["group"] = 0 - B = 8_000 - rm = 0 - for s in range(0, df.shape[0], B): - e = min(s + B, df.shape[0]) - da = df.iloc[s:e] - db = da.merge(df, on="group") - mask = db["adlr_id_x"] < db["adlr_id_y"] - db = db[mask] - mask = (db["num_ngram_x"] < db["num_ngram_y"] * 0.8) | ( - db["num_ngram_y"] < db["num_ngram_x"] * 0.8 - ) - print(db.shape, mask.sum()) - rm += mask.sum() - db = db[~mask] - db.drop(columns=["group", "num_ngram_x", "num_ngram_y"], inplace=True) - db.to_parquet(f"{cache_dir}/pair_{s}.parquet") - del da, db - print("total pairs removed", rm) - - -def merge_text(df, dedup_with_text_path): - dg = cudf.read_parquet(dedup_with_text_path) - for i in "xy": - df = df.merge(dg, left_on=f"adlr_id_{i}", right_on="adlr_id") - df.drop(columns="adlr_id", inplace=True) - return df - - -def get_max_num_rows_to_process_once(df): - nbytes = max( - df["text_x"].str.byte_count().sum(), df["text_y"].str.byte_count().sum() - ) - - # TODO: fix below - # to 4x - exploded_bytes = nbytes * 5 * 4 - max_chars_allowed = 2_147_483_647 - byte_ratio = int(exploded_bytes) // max_chars_allowed - if byte_ratio > 1: - nrows_at_once = len(df) // byte_ratio - else: - nrows_at_once = len(df) - - nrows_at_once = max(1, nrows_at_once) - return nrows_at_once - - -def compute_jaccard_pair(docs_df): - nrows_at_once = get_max_num_rows_to_process_once(docs_df) - result_ls = [] - for i in range(0, docs_df.shape[0], nrows_at_once): - pair_df = docs_df[i : i + nrows_at_once] - if len(pair_df) == 0: - result_df = create_empty_jaccard_result() - else: - result_df = 
compute_jaccard_partition(pair_df) - result_ls.append(result_df) - if len(result_ls) == 0: - return create_empty_jaccard_result() - df_pair = cudf.concat(result_ls) - return df_pair - - -def run_verify_all_pairs_jaccard(dedup_with_text_path, cache_dir, output_dir): - ddf = dask_cudf.read_parquet(f"{cache_dir}/pair_*.parquet") - ddf = ddf.repartition(npartitions=2048) - - meta_df = cudf.DataFrame( - { - "adlr_id_x": [0], - "adlr_id_y": [0], - "text_x": ["x"], - "text_y": ["x"], - } - ) - - ddf = ddf.map_partitions( - partial(merge_text, dedup_with_text_path=dedup_with_text_path), meta=meta_df - ) - - meta_df = cudf.DataFrame( - { - "adlr_id_x": [0], - "adlr_id_y": [0], - "jaccard": [1.0], - } - ) - - ddf = ddf.map_partitions(compute_jaccard_pair, meta=meta_df) - mask = ddf["jaccard"] > 0.8 - dup_pairs = ddf[mask].compute() - print("# of duplicated pairs with jaccard>0.8", dup_pairs.shape[0]) - dup_pairs.to_parquet(f"{output_dir}/duplicated_pairs.parquet") - - -def main(args): - start = time() - description = """Verify correctness of deduped results by calculating all pairs""" - dedup_with_text_path = f"{args.output_dir}/dedup_with_text.parquet" - - write_eligible_pairs(dedup_with_text_path, args.cache_dir) - client = get_client(args) - - # Run actual computation - run_verify_all_pairs_jaccard( - dedup_with_text_path, - args.cache_dir, - args.output_dir, - ) - print(f"All done in {time()-start:.1f} seconds") - - -def attach_args(parser=None): - description = """verify all pairs jaccard""" - if not parser: - parser = parse_nc_args(description=description) - - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results to", - ) - parser.add_argument( - "--cache-dir", - type=str, - help="The cache directory to write intermediate results to", - ) - return parser - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/write_deduped_result_with_text.py b/nemo_curator/gpu_deduplication/write_deduped_result_with_text.py deleted file mode 100644 index 155c56bc2..000000000 --- a/nemo_curator/gpu_deduplication/write_deduped_result_with_text.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
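The `num_ngram` pre-filter in `write_eligible_pairs` above is safe to apply because the Jaccard score is bounded by the ratio of the two documents' unique 5-gram counts: |A ∩ B| ≤ min(|A|, |B|) and |A ∪ B| ≥ max(|A|, |B|). A pair where one count falls below 0.8× the other can therefore never clear the 0.8 threshold tested at the end. A small sketch of that bound with hypothetical counts:

```python
# Why write_eligible_pairs can drop pairs whose unique 5-gram counts differ by
# more than 20%: the Jaccard score is bounded above by min/max of those counts,
# so such pairs can never exceed the 0.8 threshold checked later.
def jaccard_upper_bound(num_ngram_x: int, num_ngram_y: int) -> float:
    return min(num_ngram_x, num_ngram_y) / max(num_ngram_x, num_ngram_y)

print(jaccard_upper_bound(700, 1000))   # 0.7 -> pair can be pruned
print(jaccard_upper_bound(900, 1000))   # 0.9 -> pair must be verified
```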
- -from functools import partial - -import cudf - -from nemo_curator.gpu_deduplication.jaccard_utils.io_utils import ( - get_text_ddf_from_json_path, -) -from nemo_curator.gpu_deduplication.utils import parse_nc_args - - -def merge_text_partition(df, connected_components_path): - res = cudf.read_parquet(connected_components_path).drop(columns="dataset_id") - res = res.drop_duplicates("group") - res = res.drop(columns=["group"]) - df = res.merge(df, on="doc_id", how="left") - df = df.rename(columns={"doc_id": "adlr_id"}) - return df.drop(columns="dataset_id") - - -def write_result_text_parquet(original_path, output_dir): - ddf = get_text_ddf_from_json_path( - original_path, num_files=-1, files_per_input_partition=10 - ) - - connected_components_path = f"{output_dir}/connected_components.parquet" - print(ddf.head()) - merge_func = partial( - merge_text_partition, connected_components_path=connected_components_path - ) - ddf = ddf.map_partitions(merge_func, meta={"adlr_id": "uint32", "text": "O"}) - - mask = ddf.text.isnull() - ddf = ddf[~mask] - - df = ddf.compute() - df = df.reset_index(drop=True) - df.to_parquet(f"{output_dir}/dedup_with_text.parquet") - - -def main(args): - write_result_text_parquet( - original_path=[args.original_path], output_dir=args.output_dir - ) - - -def attach_args(parser=None): - description = """verify all pairs jaccard""" - if not parser: - parser = parse_nc_args(description=description) - - parser.add_argument( - "--output-dir", - type=str, - help="The output directory to write results to", - ) - parser.add_argument( - "--original-path", - type=str, - help="The path of original jsonl files", - ) - return parser - - -def console_script(): - main(attach_args().parse_args()) - - -if __name__ == "__main__": - args = attach_args().parse_args() diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index b51499678..ac72e53d9 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -34,11 +34,6 @@ from tqdm import tqdm from nemo_curator.datasets import DocumentDataset -from nemo_curator.gpu_deduplication.jaccard_utils.merge_utils import ( - extract_partitioning_index, - filter_text_rows_by_bucket_batch, - merge_left_to_shuffled_right, -) from nemo_curator.log import create_logger from nemo_curator.utils.distributed_utils import ( get_current_client, @@ -51,6 +46,11 @@ get_restart_offsets, update_restart_offsets, ) +from nemo_curator.utils.fuzzy_dedup_utils.merge_utils import ( + extract_partitioning_index, + filter_text_rows_by_bucket_batch, + merge_left_to_shuffled_right, +) from nemo_curator.utils.fuzzy_dedup_utils.output_map_utils import ( build_partition, get_agg_text_bytes_df, diff --git a/nemo_curator/scripts/find_exact_duplicates.py b/nemo_curator/scripts/find_exact_duplicates.py index 16173861d..af1f127a4 100644 --- a/nemo_curator/scripts/find_exact_duplicates.py +++ b/nemo_curator/scripts/find_exact_duplicates.py @@ -18,12 +18,12 @@ import dask_cudf from nemo_curator.datasets import DocumentDataset -from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep -from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.log import create_logger from nemo_curator.modules import ExactDuplicates from nemo_curator.utils.distributed_utils import get_client, read_data from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.fuzzy_dedup_utils.io_utils import strip_trailing_sep +from nemo_curator.utils.script_utils import 
parse_gpu_dedup_args def pre_imports(): @@ -88,7 +88,7 @@ def attach_args(parser=None): description = """Compute Exact duplicates in a given dataset. """ if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--hash-method", type=str, diff --git a/nemo_curator/scripts/fuzzy_deduplication/README.md b/nemo_curator/scripts/fuzzy_deduplication/README.md new file mode 100644 index 000000000..f5a43f405 --- /dev/null +++ b/nemo_curator/scripts/fuzzy_deduplication/README.md @@ -0,0 +1,99 @@ +## Fuzzy Deduplication Steps +This directory consists of scripts that can be invoked directly via the command line for finding fuzzy duplicates from a group of Jsonl files consisting of text & unique ID's that are specifically formatted using the `add_id` script included as a part of NeMo-Curator. + +> [!IMPORTANT] +> The scripts are helper utilities that wrap the fuzzy_dedup API for handling multiple jsonl directories and the id format generated by [add_id](../add_id.py). For most cases we recommend working with the fuzzy_deduplication API directly. + +### Usage +1. Compute Minhashes + - Input: Data Directories + - Output: minhashes.parquet for each data dir. + - Example call: + ```bash + # same as `python compute_minhashes.py` + gpu_compute_minhashes \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --output-minhash-dir /path/to/output_minhashes \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + --minhash-length number_of_hashes \ + --char-ngram char_ngram_size \ + --hash-bytes 4(or 8 byte hashes) \ + --seed 42 \ + --log-dir ./ + # --scheduler-file /path/to/file.json + ``` +2. Buckets (Minhash Buckets) + - Input: Minhash directories + - Output: Buckets.parquet + - Example call: + ```bash + # same as `python minhash_lsh.py` + minhash_buckets \ + --input-data-dirs /path/to/output_minhashes/dir1 /path/to/output_minhashes/dir2 \ + --output-bucket-dir /path/to/dedup_output \ + --input-minhash-field _minhash_signature \ + --input-json-id-field id_column_name \ + --minhash-length number_of_hashes \ + --num-bands num_bands \ + --buckets-per-shuffle 1 `#Value b/w [1-num_bands]. Higher is better but might lead to oom` \ + --log-dir ./ + # --scheduler-file /path/to/file.json + ``` +3. Jaccard Map Buckets + - Input: Buckets.parquet + Data Dir + - Output: anchor_docs_with_bk.parquet + - Example call: + ```bash + # same as `python map_buckets.py` + jaccard_map_buckets \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --input-bucket-dir /path/to/dedup_output/_buckets.parquet \ + --output-dir /path/to/dedup_output \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + # --scheduler-file /path/to/file.json + ``` +4. Jaccard Shuffle + - Input: anchor_docs_with_bk.parquet + Data Dir + - Output: shuffled_docs.parquet + - Example call: + ```bash + # same as `python jaccard_shuffle.py` + jaccard_shuffle \ + --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ + --input-bucket-mapping-dir /path/to/dedup_output/anchor_docs_with_bk.parquet \ + --output-dir /path/to/dedup_output \ + --input-json-text-field text_column_name \ + --input-json-id-field id_column_name \ + # --scheduler-file /path/to/file.json + ``` +5. 
Jaccard compute + - Input: Shuffled docs.parquet + - Output: jaccard_similarity_results.parquet + - Example call: + ```bash + # same as `python jaccard_compute.py` + jaccard_compute \ + --shuffled-docs-path /path/to/dedup_output/shuffled_docs.parquet \ + --output-dir /path/to/dedup_output \ + --ngram-size char_ngram_size_for_similarity \ + # --scheduler-file /path/to/file.json + ``` +6. Connected Components + - Input: jaccard_similarity_results.parquet + - Output: connected_components.parquet + - Example call: + ```bash + # same as `python connected_components.py` + gpu_connected_component \ + --jaccard-pairs_path /path/to/dedup_output/jaccard_similarity_results.parquet \ + --output-dir /path/to/dedup_output \ + --cache-dir /path/to/cc_cache \ + --jaccard-threshold 0.8 + # --scheduler-file /path/to/file.json + ``` + +> [!TIP] +> When using these scripts in a multi-node environment (like Slurm, K8's etc.) it is recommended to start up a Dask cluster prior to execution and connect to the existing cluster via the `--scheduler-address` or `--scheduler-file` flag. +> Use the `--help` flag to view all possible CLI options for the scripts and details on what they do. diff --git a/nemo_curator/scripts/fuzzy_deduplication/__init__.py b/nemo_curator/scripts/fuzzy_deduplication/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nemo_curator/scripts/compute_minhashes.py b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py similarity index 94% rename from nemo_curator/scripts/compute_minhashes.py rename to nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py index 044653ceb..832c7c505 100644 --- a/nemo_curator/scripts/compute_minhashes.py +++ b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py @@ -17,8 +17,6 @@ from nemo_curator import MinHash from nemo_curator.datasets import DocumentDataset -from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep -from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.log import create_logger from nemo_curator.utils.distributed_utils import ( get_client, @@ -26,6 +24,8 @@ read_data, ) from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.fuzzy_dedup_utils.io_utils import strip_trailing_sep +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def pre_imports(): @@ -111,7 +111,7 @@ def attach_args(parser=None): -minhash signatures is created. 
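For picking `--minhash-length` and `--num-bands` (and the rows per band they imply) in the steps above, the standard MinHash-LSH estimate is a useful guide: with `b` bands of `r` hashes each, two documents whose true Jaccard similarity is `s` share at least one bucket with probability `1 - (1 - s^r)^b`. A small helper for experimenting, with hypothetical settings rather than script defaults:

```python
# Standard MinHash-LSH collision probability: with num_bands bands of
# rows_per_band hashes each, documents of true Jaccard similarity s land in a
# common bucket with probability 1 - (1 - s**rows_per_band)**num_bands.
# The settings below are hypothetical, not defaults of the scripts above.
def lsh_collision_probability(s: float, num_bands: int, rows_per_band: int) -> float:
    return 1.0 - (1.0 - s**rows_per_band) ** num_bands

for s in (0.5, 0.7, 0.8, 0.9):
    print(s, round(lsh_collision_probability(s, num_bands=20, rows_per_band=13), 4))
```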
This dataframe is written to file after processing """ if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--minhash-length", @@ -149,12 +149,6 @@ def attach_args(parser=None): "Each file is a parquet file that contains two series, the document ids, " "and a series of lists, each list denoting the minhash signature for that document id.", ) - parser.add_argument( - "--device", - type=str, - default="gpu", - help="Type of cluster to start up", - ) return parser diff --git a/nemo_curator/scripts/connected_components.py b/nemo_curator/scripts/fuzzy_deduplication/connected_components.py similarity index 95% rename from nemo_curator/scripts/connected_components.py rename to nemo_curator/scripts/fuzzy_deduplication/connected_components.py index c04f0349d..f232ad100 100644 --- a/nemo_curator/scripts/connected_components.py +++ b/nemo_curator/scripts/fuzzy_deduplication/connected_components.py @@ -15,9 +15,9 @@ import os import time -from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.modules.fuzzy_dedup import ConnectedComponents from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def main(args): @@ -51,7 +51,7 @@ def main(args): def attach_args(parser=None): description = """Computes connected component""" if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--jaccard-pairs-path", diff --git a/nemo_curator/scripts/jaccard_compute.py b/nemo_curator/scripts/fuzzy_deduplication/jaccard_compute.py similarity index 95% rename from nemo_curator/scripts/jaccard_compute.py rename to nemo_curator/scripts/fuzzy_deduplication/jaccard_compute.py index d16e95654..4691ef935 100644 --- a/nemo_curator/scripts/jaccard_compute.py +++ b/nemo_curator/scripts/fuzzy_deduplication/jaccard_compute.py @@ -15,9 +15,9 @@ import os import time -from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity from nemo_curator.utils.distributed_utils import get_client, get_num_workers +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def main(args): @@ -57,7 +57,7 @@ def main(args): def attach_args(parser=None): description = """Computes jaccard similarity""" if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--shuffled-docs-path", diff --git a/nemo_curator/scripts/jaccard_shuffle.py b/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py similarity index 95% rename from nemo_curator/scripts/jaccard_shuffle.py rename to nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py index c01935a61..f0bd555dc 100644 --- a/nemo_curator/scripts/jaccard_shuffle.py +++ b/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py @@ -15,12 +15,12 @@ import os import time -from nemo_curator.gpu_deduplication.utils import get_num_workers, parse_nc_args from nemo_curator.modules.fuzzy_dedup import _Shuffle -from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.distributed_utils import get_client, get_num_workers from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( get_text_ddf_from_json_path_with_blocksize, ) +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def func(): @@ -79,7 +79,7 @@ def attach_args(parser=None): shuffled by 
buckets """ if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--input-bucket-mapping-dir", diff --git a/nemo_curator/scripts/map_buckets.py b/nemo_curator/scripts/fuzzy_deduplication/map_buckets.py similarity index 96% rename from nemo_curator/scripts/map_buckets.py rename to nemo_curator/scripts/fuzzy_deduplication/map_buckets.py index 9e3f71a51..5640d9bd3 100644 --- a/nemo_curator/scripts/map_buckets.py +++ b/nemo_curator/scripts/fuzzy_deduplication/map_buckets.py @@ -15,13 +15,13 @@ import os import time -from nemo_curator.gpu_deduplication.utils import get_num_workers, parse_nc_args from nemo_curator.modules.fuzzy_dedup import _MapBuckets -from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.distributed_utils import get_client, get_num_workers from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( get_bucket_ddf_from_parquet_path, get_text_ddf_from_json_path_with_blocksize, ) +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def get_anchor_and_output_map_info( @@ -73,7 +73,7 @@ def attach_args(parser=None): buckets to a logical partition by using a modified bin packing algorithm. """ if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--input-bucket-dir", type=str, diff --git a/nemo_curator/scripts/minhash_lsh.py b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py similarity index 91% rename from nemo_curator/scripts/minhash_lsh.py rename to nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py index ec206dc10..a0484cf0d 100644 --- a/nemo_curator/scripts/minhash_lsh.py +++ b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py @@ -21,12 +21,10 @@ from nemo_curator import LSH from nemo_curator.datasets import DocumentDataset -from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( - convert_str_id_to_int, -) -from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.log import create_logger from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import convert_str_id_to_int +from nemo_curator.utils.script_utils import parse_gpu_dedup_args def pre_imports(): @@ -85,7 +83,7 @@ def attach_args(parser=None): denoting the bucket id's that document belongs to. """ if not parser: - parser = parse_nc_args(description=description) + parser = parse_gpu_dedup_args(description=description) parser.add_argument( "--minhash-length", @@ -111,12 +109,6 @@ def attach_args(parser=None): required=True, help="Number of buckets to shuffle per batch", ) - parser.add_argument( - "--device", - type=str, - default="gpu", - help="Type of cluster to start up", - ) parser.add_argument( "--output-bucket-dir", type=str, diff --git a/nemo_curator/utils/fuzzy_dedup_utils/io_utils.py b/nemo_curator/utils/fuzzy_dedup_utils/io_utils.py index cc6e0909f..105021bda 100644 --- a/nemo_curator/utils/fuzzy_dedup_utils/io_utils.py +++ b/nemo_curator/utils/fuzzy_dedup_utils/io_utils.py @@ -180,3 +180,10 @@ def get_frag_size(frag): def get_file_size(file_path): return os.path.getsize(file_path) + + +def strip_trailing_sep(path: str): + """ + Strips a path string of trailing path seperators like `/` if any. 
+ """ + return path.rstrip(os.path.sep) diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/merge_utils.py b/nemo_curator/utils/fuzzy_dedup_utils/merge_utils.py similarity index 98% rename from nemo_curator/gpu_deduplication/jaccard_utils/merge_utils.py rename to nemo_curator/utils/fuzzy_dedup_utils/merge_utils.py index 08fcea53f..a144b5602 100644 --- a/nemo_curator/gpu_deduplication/jaccard_utils/merge_utils.py +++ b/nemo_curator/utils/fuzzy_dedup_utils/merge_utils.py @@ -22,9 +22,7 @@ from dask.highlevelgraph import HighLevelGraph from dask.utils import M -from nemo_curator.gpu_deduplication.jaccard_utils.batch_shuffle_utils import ( - rearange_by_column_direct, -) +from nemo_curator.utils.fuzzy_dedup_utils.shuffle_utils import rearange_by_column_direct def _split_part(part, nsplits): diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py index 8da562d35..e2811dd1e 100644 --- a/nemo_curator/utils/script_utils.py +++ b/nemo_curator/utils/script_utils.py @@ -42,14 +42,14 @@ def add_distributed_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPa type=str, default=None, help="Address to the scheduler of a created dask cluster. If not provided" - "a single node LocalCUDACluster will be started.", + "a single node Cluster will be started.", ) parser.add_argument( "--scheduler-file", type=str, default=None, help="Path to the scheduler file of a created dask cluster. If not provided" - " a single node LocalCUDACluster will be started.", + " a single node Cluster will be started.", ) parser.add_argument( "--n-workers", @@ -68,7 +68,7 @@ def add_distributed_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPa type=str, default=None, help="Initial pool size to use for the RMM Pool Memory allocator" - "Note: This only applies to the localCUDACluster. If providing an user created " + "Note: This only applies to the LocalCUDACluster. If providing an user created " "cluster refer to" "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-rmm-pool-size", # noqa: E501 ) @@ -96,7 +96,7 @@ def add_distributed_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPa parser.add_argument( "--num-files", type=int, - default=-1, + default=None, help="Upper limit on the number of json files to process", ) parser.add_argument( @@ -109,6 +109,62 @@ def add_distributed_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPa return parser +def parse_gpu_dedup_args( + description="Default gpu dedup nemo_curator argument parser", +) -> argparse.ArgumentParser: + """ + Adds default set of arguments that are common to multiple stages + of the pipeline + """ + parser = argparse.ArgumentParser( + description, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser = add_distributed_args(parser) + + # Set default device to GPU for dedup + parser.set_defaults(device="gpu") + parser.add_argument( + "--input-data-dirs", + type=str, + nargs="+", + default=None, + required=False, + help="Input directories consisting of .jsonl files that are accessible " + "to all nodes. This path must be accessible by all machines in the cluster", + ) + parser.add_argument( + "--input-json-text-field", + type=str, + default="text", + help="The name of the field within each json object of the jsonl " + "file that contains the text from which minhashes will be computed. 
", + ) + parser.add_argument( + "--input-json-id-field", + type=str, + default="adlr_id", + help="The name of the field within each json object of the jsonl " + "file that assigns a unqiue ID to each document. " + "Can be created by running the script " + "'./prospector/add_id.py' which adds the field 'adlr_id' " + "to the documents in a distributed fashion", + ) + parser.add_argument( + "--log-dir", + type=str, + default="./logs/", + help="The output log directory where node and local", + ) + parser.add_argument( + "--profile-path", + type=str, + default=None, + help="Path to save dask profile", + ) + return parser + + def chunk_list(lst, nchnks): nitem = len(lst) splits = splitnum(nitem, nchnks) diff --git a/setup.py b/setup.py index 8fc60e926..91c32a296 100644 --- a/setup.py +++ b/setup.py @@ -89,18 +89,13 @@ "prepare_task_data=nemo_curator.scripts.prepare_task_data:console_script", "find_matching_ngrams=nemo_curator.scripts.find_matching_ngrams:console_script", "remove_matching_ngrams=nemo_curator.scripts.remove_matching_ngrams:console_script", - "gpu_compute_minhashes=nemo_curator.scripts.compute_minhashes:console_script", - "minhash_buckets=nemo_curator.scripts.minhash_lsh:console_script", - "jaccard_map_buckets=nemo_curator.scripts.map_buckets:console_script", - "jaccard_shuffle=nemo_curator.scripts.jaccard_shuffle:console_script", - "jaccard_compute=nemo_curator.scripts.jaccard_compute:console_script", - "gpu_connected_component=nemo_curator.scripts.connected_components:console_script", - "write_deduped_result_with_text=nemo_curator.gpu_deduplication.write_deduped_result_with_text:console_script", - "verify_all_pairs_jaccard=nemo_curator.gpu_deduplication.verify_all_pairs_jaccard:console_script", + "gpu_compute_minhashes=nemo_curator.scripts.fuzzy_deduplication.compute_minhashes:console_script", + "minhash_buckets=nemo_curator.scripts.fuzzy_deduplication.minhash_lsh:console_script", + "jaccard_map_buckets=nemo_curator.scripts.fuzzy_deduplication.map_buckets:console_script", + "jaccard_shuffle=nemo_curator.scripts.fuzzy_deduplication.jaccard_shuffle:console_script", + "jaccard_compute=nemo_curator.scripts.fuzzy_deduplication.jaccard_compute:console_script", + "gpu_connected_component=nemo_curator.scripts.fuzzy_deduplication.connected_components:console_script", "gpu_exact_dups=nemo_curator.scripts.find_exact_duplicates:console_script", - "prepare_fuzzy_ids=nemo_curator.gpu_deduplication.prepare_fuzzy_ids:console_script", - "create_list_of_duplicate_ids=nemo_curator.gpu_deduplication.create_list_of_duplicate_ids:console_script", - "remove_duplicates=nemo_curator.gpu_deduplication.remove_duplicates:console_script", "deidentify=nemo_curator.scripts.find_pii_and_deidentify:console_script", "generate_statistics=nemo_curator.distributed_data_classification.generate_statistics:console_script", "domain_classifier_inference=nemo_curator.distributed_data_classification.domain_classifier_inference:console_script", From b192e92e7966573d816e9f76a3ee9352e0f9f572 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 3 May 2024 08:38:47 -0700 Subject: [PATCH 09/34] Fix lang id example (#37) * Fix lang id example Signed-off-by: Ryan Wolf * Add classifier unit tests Signed-off-by: Ryan Wolf * Add test for failure Signed-off-by: Ryan Wolf * Remove failure test Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- .../identify_languages_and_fix_unicode.py | 2 +- nemo_curator/filters/classifier_filter.py | 6 ++ tests/test_filters.py | 72 ++++++++++++++++++- 3 files 
changed, 78 insertions(+), 2 deletions(-) diff --git a/examples/identify_languages_and_fix_unicode.py b/examples/identify_languages_and_fix_unicode.py index 933c6c231..a95dc6905 100644 --- a/examples/identify_languages_and_fix_unicode.py +++ b/examples/identify_languages_and_fix_unicode.py @@ -60,7 +60,7 @@ def main(args): # Remove the language score filtered_dataset.df[language_field] = filtered_dataset.df[language_field].apply( - lambda score: score[1] + lambda score: score[1], meta=(None, str) ) # Split the dataset by language diff --git a/nemo_curator/filters/classifier_filter.py b/nemo_curator/filters/classifier_filter.py index f32e2ff57..3ade004ec 100644 --- a/nemo_curator/filters/classifier_filter.py +++ b/nemo_curator/filters/classifier_filter.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import dask import fasttext import numpy as np import pandas as pd @@ -75,6 +76,11 @@ def __init__(self, model_path=None, min_langid_score=0.3): self._cutoff = min_langid_score self._name = "lang_id" + # Dask will automatically convert the list score type + # to a string without this option. + # See https://github.com/NVIDIA/NeMo-Curator/issues/33 + dask.config.set({"dataframe.convert-string": False}) + @batched def score_document(self, df): model_attr = f"{self._name}_{self._model_path}" diff --git a/tests/test_filters.py b/tests/test_filters.py index 4ab11c21a..50676f385 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -14,6 +14,8 @@ import os +import dask +import numpy as np import pandas as pd import pytest from dask import dataframe as dd @@ -508,7 +510,7 @@ def test_repeatedparagraphschar(self): def test_repeatingtopngrams(self): dataset = list_to_dataset( [ - "this is a totally fine sentence with no repeating ngrams so we are ok", + "this is a totally fine sentence with no repeat ngrams so we are ok", "a b . a b", "a a a a a a", "totally fine small dupe a b a b", @@ -756,3 +758,71 @@ def test_per_extension_filter(self): assert all_equal( expected_data, filtered_data ), f"Expected {expected_data} but got {filtered_data}" + + +class FakeQualityFilter(DocumentFilter): + """ + Emulates FastTextQualityFilter without a model + """ + + def __init__(self, alpha=3, seed=42): + super().__init__() + self._alpha = alpha + self._seed = np.random.seed(seed) + + @batched + def score_document(self, df): + return pd.Series(np.arange(len(df)) / len(df)) + + @batched + def keep_document(self, df): + return np.random.pareto(self._alpha, size=len(df)) > 1 - df + + +class FakeLangId(DocumentFilter): + """ + Emulates FastTextLangId without a model + """ + + def __init__(self, min_langid_score=0.3, convert_string=False): + super().__init__() + self._cutoff = min_langid_score + + # Dask will automatically convert the list score type + # to a string without this option. 
+ # See https://github.com/NVIDIA/NeMo-Curator/issues/33 + dask.config.set({"dataframe.convert-string": convert_string}) + + @batched + def score_document(self, df): + scores = [[0.5, "EN"], [0.7, "HI"], [0.2, "PT"]] + scores = scores * len(df) + scores = scores[: len(df)] + return pd.Series(scores) + + def keep_document(self, score): + return score[0] >= self._cutoff + + +class TestClassifierFilters: + def test_fake_quality_filter(self): + dataset = list_to_dataset(["a", "b", "c", "d"], npartitions=1) + filters = ScoreFilter(FakeQualityFilter()) + filtered_data = filters(dataset) + + expected_indices = [1, 2, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal( + expected_data, filtered_data + ), f"Expected {expected_data} but got {filtered_data}" + + def test_fake_langid_filter(self): + dataset = list_to_dataset(["a", "b", "c", "d"], npartitions=1) + filters = ScoreFilter(FakeLangId()) + filtered_data = filters(dataset) + + expected_indices = [0, 1, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal( + expected_data, filtered_data + ), f"Expected {expected_data} but got {filtered_data}" From 909f58d144f37669eb56567c027b87f45a59ef55 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 3 May 2024 15:30:48 -0700 Subject: [PATCH 10/34] Add dataset blending tool (#32) * Add initial dataset blending function Signed-off-by: Ryan Wolf * Add blend unit tests Signed-off-by: Ryan Wolf * Add self parameter Signed-off-by: Ryan Wolf * Fix return type of blend dataset Signed-off-by: Ryan Wolf * Fix blending tests Signed-off-by: Ryan Wolf * Change assert statement for very uneven blend Signed-off-by: Ryan Wolf * Fix key error Signed-off-by: Ryan Wolf * Add proper proportion blending test Signed-off-by: Ryan Wolf * Add four dataset blend and clarify docs Signed-off-by: Ryan Wolf * Add shuffle module Signed-off-by: Ryan Wolf * Add blend example and tests Signed-off-by: Ryan Wolf * Fix random method name Signed-off-by: Ryan Wolf * Wrap return type in DocumentDataset Signed-off-by: Ryan Wolf * Save result of column drop Signed-off-by: Ryan Wolf * Change equality check for shuffle tests Signed-off-by: Ryan Wolf * Fix expected order after shuffle Signed-off-by: Ryan Wolf * Add more documents to shuffle test Signed-off-by: Ryan Wolf * Add assert statement Signed-off-by: Ryan Wolf * Add within partition shuffle Signed-off-by: Ryan Wolf * Refactor add rand column for shuffle Signed-off-by: Ryan Wolf * Fix filename tests Signed-off-by: Ryan Wolf * Add determinism handling for shuffle Signed-off-by: Ryan Wolf * Change numpy random function Signed-off-by: Ryan Wolf * Fix tests with new random method Signed-off-by: Ryan Wolf * Remove length call from blending Signed-off-by: Ryan Wolf * Improve scaling of blending function Signed-off-by: Ryan Wolf * Fix blend tests Signed-off-by: Ryan Wolf * Add blending script Signed-off-by: Ryan Wolf * Add additional file paths call Signed-off-by: Ryan Wolf * Add documentation Signed-off-by: Ryan Wolf * Reformat docs Signed-off-by: Ryan Wolf * Remove backticks Signed-off-by: Ryan Wolf * Add context manager for shuffle tests Signed-off-by: Ryan Wolf * Add better deterministic shuffle path Signed-off-by: Ryan Wolf * Update documentation and reset index Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- docs/user-guide/DocumentDataset.rst | 84 +++++++++++ examples/blend_and_shuffle.py | 53 +++++++ nemo_curator/datasets/doc_dataset.py | 2 +- 
nemo_curator/modules/__init__.py | 3 + nemo_curator/modules/dataset_ops.py | 183 ++++++++++++++++++++++++ nemo_curator/scripts/blend_datasets.py | 138 ++++++++++++++++++ setup.py | 1 + tests/test_blend_datasets.py | 103 ++++++++++++++ tests/test_shuffle.py | 186 +++++++++++++++++++++++++ 9 files changed, 752 insertions(+), 1 deletion(-) create mode 100644 examples/blend_and_shuffle.py create mode 100644 nemo_curator/modules/dataset_ops.py create mode 100644 nemo_curator/scripts/blend_datasets.py create mode 100644 tests/test_blend_datasets.py create mode 100644 tests/test_shuffle.py diff --git a/docs/user-guide/DocumentDataset.rst b/docs/user-guide/DocumentDataset.rst index 351e41a95..0086314a9 100644 --- a/docs/user-guide/DocumentDataset.rst +++ b/docs/user-guide/DocumentDataset.rst @@ -137,3 +137,87 @@ In these cases, we recommend processing the input dataset in batches using a sim This will read in 64 shards at a time, process them, and write them back to disk. Like ``get_remaining_files``, it only includes files that are in the input directory and not in the output directory. + +############################ +Blending and Shuffling +############################ + +Blending data from multiple sources can be a great way of improving downstream model performance. +This blending can be done during model training itself (i.e., *online* blending) or it can be done before training (i.e., *offline* blending). +Online blending is useful for rapidly iterating in the training process. +Meanwhile, offline blending is useful if you want to distribute the dataset. +Online blending is currently possible in `NeMo via NVIDIA Megatron Core `_, and NeMo Curator offers a way to perform blending offline. + +Let's take a look at how datasets can be combined using ``nc.blend_datasets`` + +.. code-block:: python + + import nemo_curator as nc + + books = DocumentDataset.read_json("books_dataset/") + articles = DocumentDataset.read_json("articles_dataset/") + journals = DocumentDataset.read_json("journals_dataset/") + + datasets = [books, articles, journals] + target_samples = 1000 + weights = [5.0, 2.0, 1.0] + + blended_dataset = nc.blend_datasets(target_samples, datasets, weights) + + blended_dataset.to_json("blended_dataset/") + + +* ``datasets = [books, articles, journals]`` Here, we are choosing to blend three different datasets. + These datasets do not have to be in the same file format, or similar in size. + So long as they can be read in as a DocumentDataset, they will be fine. + The samples from each dataset are always drawn "in order". + The precise order depends on the format. + For sharded jsonl files, the entries at the beginning of the file with the first name in sorted order will be chosen first. +* ``target_samples = 1000`` This is the desired number of samples in the resulting dataset. + By sample, we mean document or just generally a single datapoint. + There may end up being more samples in the dataset depending on the weights. +* ``weights = [5.0, 2.0, 1.0]`` The relative number of samples that should be taken from each dataset. + Given these weights, the blended dataset will have five times as many samples from books as there are samples from journals. + Similarly, there will be two times as many samples from articles when compared to samples from journals. + Weights can be a list of non-negative real numbers. + ``nc.blend_datasets`` will do the normalization and combine the normalized weights with the target samples to determine + how many samples should be taken from each dataset. 
+ In the case of the books dataset, the following would be the calculation. + + .. math:: + + \lceil target\_samples \cdot w_i\rceil=\lceil 1000\cdot \frac{5}{8}\rceil=625 + If any datasets have fewer samples than the calculated weight, they will be oversampled to meet the quota. + For example, if the books dataset only had 500 documents in it, the first 125 would be repeated to achieve + the 625 samples. +* ``blended_dataset = nc.blend_datasets(target_samples, datasets, weights)`` We now call the function itself. + Afterwards, we are left with a blended dataset that we can operate on like any other dataset. + We can apply filters, deduplicate, or classify the documents. + +Because blending datasets involves combining data from multiple sources, the sharding of the original datasets +cannot be preserved. The options ``add_filename=True`` and ``write_to_filename=True`` for reading and writing +datasets are therefore incompatible with ``nc.blend_datasets``. + + +Shuffling can be another important aspect of dataset management. +NeMo Curator's ``nc.Shuffle`` allows users to reorder all entries in the dataset. + +Here is a small example on how this can be done: + +.. code-block:: python + + import nemo_curator as nc + + books = DocumentDataset.read_json("books_dataset/") + + shuffle = nc.Shuffle(seed=42) + + shuffled_books = shuffle(books) + + shuffled_books.to_json("shuffled_books/") + +* ``shuffle = nc.Shuffle(seed=42)`` This creates a shuffle operation that can be chained with + the various other modules in NeMo Curator. In this example, we fix the seed to be 42. + Setting the seed will guarantee determinism, but may be slightly slower (20-30% slower) + depending on the dataset size. +* ``shuffled_books = shuffle(books)`` The dataset has now been shuffled, and we can save it to the filesystem. diff --git a/examples/blend_and_shuffle.py b/examples/blend_and_shuffle.py new file mode 100644 index 000000000..e070d5d2a --- /dev/null +++ b/examples/blend_and_shuffle.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
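The per-dataset sample counts implied by those weights can be checked directly with the same ceiling formula; the snippet below uses the hypothetical 5 : 2 : 1 weights and 1000-sample target from the docs above.

```python
# Reproduces the sampling arithmetic from the blending docs above:
# ceil(target_samples * w_i / sum(w)) samples are drawn from dataset i.
import math

def samples_per_dataset(target_samples: int, weights: list[float]) -> list[int]:
    total = sum(weights)
    return [math.ceil(target_samples * w / total) for w in weights]

print(samples_per_dataset(1000, [5.0, 2.0, 1.0]))  # [625, 250, 125]
```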
+ +import argparse + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import add_distributed_args + + +def main(args): + # Params + dataset_paths = ["/path/to/first", "/path/to/second", "/path/to/third"] + dataset_weights = [5.0, 2.0, 1.0] + target_size = 1000 + output_path = "/path/to/output" + + # Set up Dask client + client = get_client(args, args.device) + + # Blend the datasets + datasets = [DocumentDataset.read_json(path) for path in dataset_paths] + blended_dataset = nc.blend_datasets(target_size, datasets, dataset_weights) + + shuffle = nc.Shuffle(seed=42) + blended_dataset = shuffle(blended_dataset) + + # Save the blend + blended_dataset.to_json(output_path) + + +def attach_args( + parser=argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ), +): + return add_distributed_args(parser) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py index 37592b188..a97aa1969 100644 --- a/nemo_curator/datasets/doc_dataset.py +++ b/nemo_curator/datasets/doc_dataset.py @@ -24,7 +24,7 @@ class DocumentDataset: Internally it may be distributed across multiple nodes, and may be on GPUs. """ - def __init__(self, dataset_df): + def __init__(self, dataset_df: dd.DataFrame): self.df = dataset_df def __len__(self): diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index 434ebecf4..0867942d8 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -22,6 +22,7 @@ from nemo_curator.utils.import_utils import gpu_only_import_from from .add_id import AddId +from .dataset_ops import blend_datasets, Shuffle from .exact_dedup import ExactDuplicates from .filter import Filter, Score, ScoreFilter from .meta import Sequential @@ -50,4 +51,6 @@ "Sequential", "TaskDecontamination", "AddId", + "blend_datasets", + "Shuffle", ] diff --git a/nemo_curator/modules/dataset_ops.py b/nemo_curator/modules/dataset_ops.py new file mode 100644 index 000000000..38589b1e9 --- /dev/null +++ b/nemo_curator/modules/dataset_ops.py @@ -0,0 +1,183 @@ +import math +from typing import Any, Callable, List, Optional + +import dask.dataframe as dd +import numpy as np + +from nemo_curator.datasets.doc_dataset import DocumentDataset + + +def default_filename(partition_num: int) -> str: + return f"file_{partition_num:010d}.jsonl" + + +class Shuffle: + def __init__( + self, + seed: Optional[int] = None, + npartitions: Optional[int] = None, + partition_to_filename: Callable[[int], str] = default_filename, + ) -> None: + """ + Randomly permutes the dataset. This will make the original "filename" column invalid, so if the column is present it will be overwritten. + Args: + seed: The random seed that will be used to determine which partition (file) each datapoint goes to. + Setting the seed will guarantee determinism, but may be slightly slower (20-30% slower) + depending on the dataset size. + npartitions: The output number of partitions to create in the dataset. + If None, it will retain the same number of partitions as the original dataset. + partition_to_filename: If the filename column is present, it will be overwritten. + Passing a function in through this argument allows the user to configure what the filename + will look like given the partition number. 
The default method names the partition + f'file_{partition_num:010d}.jsonl' and should be changed if the user is not using a .jsonl format. + """ + self.seed = seed + self.npartitions = npartitions + self.partition_to_filename = partition_to_filename + self.rand_col = "_shuffle_rand" + + def __call__(self, dataset: DocumentDataset) -> DocumentDataset: + if self.seed is None: + return self.shuffle_nondeterministic(dataset) + else: + return self.shuffle_deterministic(dataset) + + def shuffle_deterministic(self, dataset: DocumentDataset) -> DocumentDataset: + new_npartitions = ( + dataset.df.npartitions if self.npartitions is None else self.npartitions + ) + + dataset.df[self.rand_col] = dataset.df.map_partitions(self._add_rand_col) + + shuffled_df = dataset.df.set_index(self.rand_col, npartitions=new_npartitions) + shuffled_df = shuffled_df.reset_index(drop=True) + + if "filename" in shuffled_df: + shuffled_df["filename"] = shuffled_df.map_partitions(self._add_filename) + + return DocumentDataset(shuffled_df) + + def shuffle_nondeterministic(self, dataset: DocumentDataset) -> DocumentDataset: + new_npartitions = ( + dataset.df.npartitions if self.npartitions is None else self.npartitions + ) + + dataset.df[self.rand_col] = dataset.df.map_partitions(self._add_rand_col) + + shuffled_df = dataset.df.shuffle( + self.rand_col, npartitions=new_npartitions, ignore_index=True + ) + shuffled_df = shuffled_df.drop(columns=[self.rand_col]) + shuffled_df = shuffled_df.map_partitions(self._partition_shuffle) + + return DocumentDataset(shuffled_df) + + def _add_rand_col(self, partition, partition_info=None): + if partition_info is None: + partition_info = { + "number": 0, + } + + if self.seed is not None: + np.random.seed(self.seed + partition_info["number"]) + rand_col = np.random.randint(0, np.iinfo("int64").max, size=len(partition)) + + return rand_col + + def _partition_shuffle(self, partition, partition_info=None): + if partition_info is None: + return partition + + partition_num = partition_info["number"] + if self.seed is not None: + random_state = self.seed + partition_num + else: + random_state = None + + partition = partition.sample(frac=1, random_state=random_state).reset_index( + drop=True + ) + + if "filename" in partition: + filename = self.partition_to_filename(partition_num) + partition["filename"] = filename + + return partition + + def _add_filename(self, partition, partition_info=None): + if partition_info is None: + return ["filename"] * len(partition) + + filename = self.partition_to_filename(partition_info["number"]) + + return [filename for _ in range(len(partition))] + + +def blend_datasets( + target_size: int, datasets: List[DocumentDataset], sampling_weights: List[float] +) -> DocumentDataset: + """ + Combined multiple datasets into one with different amounts of each dataset + Args: + target_size: The number of documents the resulting dataset should have. + The actual size of the dataset may be slightly larger if the normalized weights do not allow + for even mixtures of the datasets. + datasets: A list of all datasets to combine together + sampling_weights: A list of weights to assign to each dataset in the input. Weights will be + normalized across the whole list as a part of the sampling process. For example, if the normalized + sampling weight for dataset 1 is 0.02, 2% ofthe total samples will be sampled from dataset 1. + There are guaranteed to be math.ceil(normalized_weight_i * target_size) elements from dataset i in + the final blend. 
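+    Example (illustrative, with hypothetical dataset names):
+        blend_datasets(100, [dataset_a, dataset_b], [3.0, 2.0]) draws
+        math.ceil(100 * 0.6) = 60 documents from dataset_a and
+        math.ceil(100 * 0.4) = 40 documents from dataset_b.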
+ """ + if len(datasets) != len(sampling_weights): + raise ValueError( + f"Different number of datasets and weights specified. {len(datasets)} datasets and {len(sampling_weights)}" + ) + + weight_sum = sum(sampling_weights) + sampling_weights = [weight / weight_sum for weight in sampling_weights] + num_documents_per_dataset = [ + math.ceil(weight * target_size) for weight in sampling_weights + ] + + blend_components = [] + for dataset, num_documents in zip(datasets, num_documents_per_dataset): + # Repeatedly sample from the dataset + while num_documents > 0: + sample = _partition_head(dataset.df, num_documents) + blend_components.append(sample) + num_documents -= len(sample) + + blended_dataset = dd.concat(blend_components) + + return DocumentDataset(blended_dataset) + + +def _partition_head(ddf: dd.DataFrame, n: int) -> dd.DataFrame: + """ + Returns the first n rows in a dataframe while preserving the partitions. + Meant as a replacement for ddf.head(npartitions=-1, compute=False) as it + uses too much memory at large scales + + Args: + ddf: The dataframe to get the first rows from + n: The number of rows to get + """ + original_meta = ddf.dtypes.to_dict() + partition_lengths = ddf.map_partitions(len) + num_partitions = 0 + total_size = 0 + last_length = 0 + for length in partition_lengths: + total_size += length + num_partitions += 1 + last_length = length + if total_size >= n: + break + + delayed_df = ddf.to_delayed() + excess_elems = max(0, total_size - n) + delayed_df = delayed_df[:num_partitions] + delayed_df[-1] = delayed_df[-1].head(last_length - excess_elems) + + return dd.from_delayed(delayed_df, meta=original_meta) diff --git a/nemo_curator/scripts/blend_datasets.py b/nemo_curator/scripts/blend_datasets.py new file mode 100644 index 000000000..4f0fc253a --- /dev/null +++ b/nemo_curator/scripts/blend_datasets.py @@ -0,0 +1,138 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
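+
+# (Editor's note) Illustrative sketch, not part of the original patch: nc.blend_datasets
+# (used below) fills each dataset's quota by repeatedly taking the head of that dataset,
+# so datasets smaller than their quota are oversampled. The same loop on a plain pandas
+# frame looks like this.
+def _editor_oversampling_sketch():
+    import pandas as pd
+
+    df = pd.DataFrame({"text": ["a", "b", "c"]})
+    quota, parts = 5, []
+    while quota > 0:
+        sample = df.head(quota)
+        parts.append(sample)
+        quota -= len(sample)
+    # Returns ['a', 'b', 'c', 'a', 'b']: the 3-row frame is reused to reach the quota of 5.
+    return pd.concat(parts)["text"].tolist()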
+ +import argparse + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk +from nemo_curator.utils.file_utils import ( + expand_outdir_and_mkdir, + get_all_files_paths_under, +) +from nemo_curator.utils.script_utils import add_distributed_args, attach_bool_arg + + +def main(args): + client = get_client(args, args.device) + + out_dir = expand_outdir_and_mkdir(args.output_data_dir) + + input_dirs = args.input_data_dirs.split(",") + weights = [float(weight) for weight in args.weights.split(",")] + + datasets = [ + DocumentDataset( + read_data( + get_all_files_paths_under(path), + file_type=args.input_file_type, + backend="pandas", + ) + ) + for path in input_dirs + ] + + output_dataset = nc.blend_datasets(args.target_samples, datasets, weights) + + if args.shuffle: + shuffle = nc.Shuffle(seed=args.seed) + output_dataset = shuffle(output_dataset) + + write_to_disk(output_dataset.df, out_dir, output_type=args.output_file_type) + + client.close() + + +def attach_args( + parser=argparse.ArgumentParser( + """ +Blends a collection of datasets together based on certain weights. + +It takes as input a comma-separated list of dataset directories, the +corresponding weights that should be associated with each datatset, +and the target number of samples to aggregate from across all the datasets. +The file shards of the resulting dataset are not guaranteed to be even +or reflect the original dataset(s). + +A blend is created from these datasets and saved to the specified output directory. +Optionally, the user can choose to shuffle this dataset as well. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) +): + parser.add_argument( + "--input-data-dirs", + type=str, + default=None, + help="Comma-separated list of directories consisting of dataset " + "files that are accessible to all nodes.", + ) + parser.add_argument( + "--weights", + type=str, + default=None, + help="Comma-separated list of floating-point weights corresponding " + "to each dataset passed in --input-data-dirs", + ) + parser.add_argument( + "--output-data-dir", + type=str, + default=None, + help="The output directory to where the blended dataset is" + "retained during filtering will be written. If this argument " + "is not specified, then the document scores from the " + "filter(s) will be written to the document meta data in place", + ) + parser.add_argument( + "--target-samples", + type=int, + default=10000, + help="The number of samples to be included in the output dataset." + " There may be more samples in order to accurately reflect the " + "weight balance, but there will never be less", + ) + attach_bool_arg( + parser, + "shuffle", + default=False, + help_str="Shuffles the dataset after blending", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="If specified, the random seed used for shuffling.", + ) + parser.add_argument( + "--input-file-type", + type=str, + default="jsonl", + help="File type of the dataset to be read in. Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + parser.add_argument( + "--output-file-type", + type=str, + default="jsonl", + help="File type the dataset will be written to. 
Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/setup.py b/setup.py index 91c32a296..357e33e51 100644 --- a/setup.py +++ b/setup.py @@ -102,6 +102,7 @@ "quality_classifier_multiple_models_inference=nemo_curator.distributed_data_classification.quality_classifier_multiple_models_inference:console_script", "quality_classifier_inference=nemo_curator.distributed_data_classification.quality_classifier_inference:console_script", "verify_results=nemo_curator.distributed_data_classification.verify_results:console_script", + "blend_datasets=nemo_curator.scripts.blend_datasets:console_script", ], }, ) diff --git a/tests/test_blend_datasets.py b/tests/test_blend_datasets.py new file mode 100644 index 000000000..7c7f7e28b --- /dev/null +++ b/tests/test_blend_datasets.py @@ -0,0 +1,103 @@ +import dask.dataframe as dd +import pandas as pd + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset + + +def list_to_dataset(documents, col_name="text", npartitions=2): + data = {col_name: documents} + pdf = pd.DataFrame(data) + + return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) + + +def all_equal(left_dataset, right_dataset): + left_result = left_dataset.df.compute() + right_result = right_dataset.df.compute() + + l_cols = set(left_result.columns) + r_cols = set(right_result.columns) + assert l_cols == r_cols + for col in left_result.columns: + left = left_result[col].reset_index(drop=True) + right = right_result[col].reset_index(drop=True) + assert all(left == right), f"Mismatch in {col} column.\n{left}\n{right}\n" + + +class TestBlending: + def test_blend_as_original(self): + first_dataset = list_to_dataset(["one", "two", "three"]) + result_dataset = nc.blend_datasets(len(first_dataset), [first_dataset], [1.0]) + all_equal(first_dataset, result_dataset) + + def test_equal_blend(self): + first_dataset = list_to_dataset(["a", "a"]) + second_dataset = list_to_dataset(["b", "b"]) + result_dataset = nc.blend_datasets( + 2, [first_dataset, second_dataset], [0.5, 0.5] + ) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 2 + assert counts["a"] == 1 + assert counts["b"] == 1 + + def test_equal_blend_with_weights(self): + first_dataset = list_to_dataset(["a", "a"]) + second_dataset = list_to_dataset(["b", "b"]) + result_dataset = nc.blend_datasets( + 2, [first_dataset, second_dataset], [2.0, 2.0] + ) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 2 + assert counts["a"] == 1 + assert counts["b"] == 1 + + def test_uneven_blend(self): + first_dataset = list_to_dataset(["a", "a"]) + second_dataset = list_to_dataset(["b", "b"]) + result_dataset = nc.blend_datasets( + 4, [first_dataset, second_dataset], [3.0, 1.0] + ) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 4 + assert counts["a"] == 3 + assert counts["b"] == 1 + + def test_very_uneven_blend(self): + first_dataset = list_to_dataset(["a", "a"]) + second_dataset = list_to_dataset(["b", "b"]) + result_dataset = nc.blend_datasets( + 4, [first_dataset, second_dataset], [1.0, 0.0] + ) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 4 + assert counts["a"] == 4 + assert "b" not in counts + + def test_proper_uneven_blend(self): + first_dataset = list_to_dataset(["a", "b", "c", "d"]) + 
second_dataset = list_to_dataset(["e", "f"]) + result_dataset = nc.blend_datasets( + 8, [first_dataset, second_dataset], [1.0, 0.0] + ) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 8 + assert counts["a"] == 2 + assert counts["b"] == 2 + assert counts["c"] == 2 + assert counts["d"] == 2 + + def test_four_dataset_blend(self): + datasets = [] + datasets.append(list_to_dataset(["a", "a"])) + datasets.append(list_to_dataset(["b", "b", "b"])) + datasets.append(list_to_dataset(["c"])) + datasets.append(list_to_dataset(["d", "d", "d", "d"])) + result_dataset = nc.blend_datasets(8, datasets, [1.0, 2.0, 3.0, 4.0]) + counts = result_dataset.df["text"].value_counts().compute() + assert len(result_dataset) == 10 + assert counts["a"] == 1 + assert counts["b"] == 2 + assert counts["c"] == 3 + assert counts["d"] == 4 diff --git a/tests/test_shuffle.py b/tests/test_shuffle.py new file mode 100644 index 000000000..a23d47906 --- /dev/null +++ b/tests/test_shuffle.py @@ -0,0 +1,186 @@ +import dask.dataframe as dd +import pandas as pd +from dask.distributed import Client, LocalCluster + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset + + +def list_to_dataset(documents, col_name="text", npartitions=2): + data = {col_name: documents} + pdf = pd.DataFrame(data) + + return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) + + +def all_equal(left_dataset, right_dataset): + left_result = left_dataset.df.compute() + right_result = right_dataset.df.compute() + + l_cols = set(left_result.columns) + r_cols = set(right_result.columns) + assert l_cols == r_cols + for col in left_result.columns: + left = left_result[col].reset_index(drop=True) + right = right_result[col].reset_index(drop=True) + assert all(left == right), f"Mismatch in {col} column.\n{left}\n{right}\n" + + +class TestShuffleNondeterministic: + def test_shuffle(self): + # Single threaded Dask is the only way to guarantee shuffle determinism + # Docs: https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.shuffle.html + with LocalCluster(n_workers=1, threads_per_worker=1) as cluster: + with Client(cluster): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"] + ) + expected_dataset = list_to_dataset( + ["two", "five", "three", "one", "four"] + ) + shuffle = nc.Shuffle(seed=42) + result_dataset = shuffle.shuffle_nondeterministic(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_new_partitions(self): + with LocalCluster(n_workers=1, threads_per_worker=1) as cluster: + with Client(cluster): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=3 + ) + expected_dataset = list_to_dataset( + ["two", "five", "three", "one", "four"], npartitions=3 + ) + shuffle = nc.Shuffle(seed=42, npartitions=2) + result_dataset = shuffle.shuffle_nondeterministic(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_filename(self): + with LocalCluster(n_workers=1, threads_per_worker=1) as cluster: + with Client(cluster): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=1 + ) + original_dataset.df["filename"] = "original.jsonl" + + expected_data = { + "text": ["one", "two", "three", "five", "four"], + "filename": [ + "file_0000000000.jsonl", + "file_0000000000.jsonl", + "file_0000000000.jsonl", + "file_0000000001.jsonl", + "file_0000000001.jsonl", + ], + } + pdf = pd.DataFrame(expected_data) + expected_dataset = 
DocumentDataset(dd.from_pandas(pdf, npartitions=2)) + + shuffle = nc.Shuffle(seed=42, npartitions=2) + result_dataset = shuffle.shuffle_nondeterministic(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_custom_filenames(self): + with LocalCluster(n_workers=1, threads_per_worker=1) as cluster: + with Client(cluster): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=1 + ) + original_dataset.df["filename"] = "original.jsonl" + + expected_data = { + "text": ["one", "two", "three", "five", "four"], + "filename": [ + "my_0.test", + "my_0.test", + "my_0.test", + "my_1.test", + "my_1.test", + ], + } + pdf = pd.DataFrame(expected_data) + expected_dataset = DocumentDataset(dd.from_pandas(pdf, npartitions=2)) + + def filename_fn(x): + return f"my_{x}.test" + + shuffle = nc.Shuffle( + seed=42, npartitions=2, partition_to_filename=filename_fn + ) + result_dataset = shuffle.shuffle_nondeterministic(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_shuffle_no_seed(self): + original_dataset = list_to_dataset(["one", "two", "three", "four", "five"]) + shuffle = nc.Shuffle() + result_dataset = shuffle(original_dataset) + assert len(result_dataset.df.compute()) == 5 + + +class TestShuffleDeterministic: + def test_shuffle(self): + original_dataset = list_to_dataset(["one", "two", "three", "four", "five"]) + expected_dataset = list_to_dataset(["five", "four", "three", "one", "two"]) + shuffle = nc.Shuffle(seed=42) + result_dataset = shuffle(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_new_partitions(self): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=3 + ) + expected_dataset = list_to_dataset( + ["four", "three", "five", "one", "two"], npartitions=3 + ) + shuffle = nc.Shuffle(seed=42, npartitions=2) + result_dataset = shuffle(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_filename(self): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=1 + ) + original_dataset.df["filename"] = "original.jsonl" + + expected_data = { + "text": ["four", "five", "three", "one", "two"], + "filename": [ + "file_0000000000.jsonl", + "file_0000000001.jsonl", + "file_0000000001.jsonl", + "file_0000000001.jsonl", + "file_0000000001.jsonl", + ], + } + pdf = pd.DataFrame(expected_data) + expected_dataset = DocumentDataset(dd.from_pandas(pdf, npartitions=2)) + + shuffle = nc.Shuffle(seed=42, npartitions=2) + result_dataset = shuffle(original_dataset) + all_equal(expected_dataset, result_dataset) + + def test_custom_filenames(self): + original_dataset = list_to_dataset( + ["one", "two", "three", "four", "five"], npartitions=1 + ) + original_dataset.df["filename"] = "original.jsonl" + + expected_data = { + "text": ["four", "five", "three", "one", "two"], + "filename": [ + "my_0.test", + "my_1.test", + "my_1.test", + "my_1.test", + "my_1.test", + ], + } + pdf = pd.DataFrame(expected_data) + expected_dataset = DocumentDataset(dd.from_pandas(pdf, npartitions=2)) + + def filename_fn(x): + return f"my_{x}.test" + + shuffle = nc.Shuffle(seed=42, npartitions=2, partition_to_filename=filename_fn) + result_dataset = shuffle(original_dataset) + all_equal(expected_dataset, result_dataset) From 0bab063159943951599d9fd8f23415875bb3be0c Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Fri, 3 May 2024 16:31:43 -0700 Subject: [PATCH 11/34] High level fuzzy duplicates module (#46) * Initial pass at fuzzy 
dedup api Signed-off-by: Ayush Dattagupta * Update deprecated shuffle arg Signed-off-by: Ayush Dattagupta * dask_cuda gpu only import Signed-off-by: Ayush Dattagupta * Move fuzzy_dedup imports to optional Signed-off-by: Ayush Dattagupta * more tests Signed-off-by: Ayush Dattagupta * Move FuzzyDeDupConfig to it's own class Signed-off-by: Ayush Dattagupta * Add example script and config file, fix typo Signed-off-by: Ayush Dattagupta * Remove slurm examples for gpu dedup Signed-off-by: Ayush Dattagupta * Add config module Signed-off-by: Ayush Dattagupta * Rename FuzzyDeDupConfig and minhash_length to FuzzyDuplicatesConfig, num_hashes Signed-off-by: Ayush Dattagupta * Add comments and update example Signed-off-by: Ayush Dattagupta * Write to same format as input in fuzzy dedup example Signed-off-by: Ayush Dattagupta --------- Signed-off-by: Ayush Dattagupta Signed-off-by: Nicole Luo --- config/fuzzy_dedup_config.yaml | 16 ++ examples/fuzzy_deduplication.py | 109 ++++++++++ examples/gpu_deduplication_example/README.md | 29 --- examples/gpu_deduplication_example/batch.sh | 38 ---- .../create-list-of-exact-duplicate-ids.sh | 53 ----- .../create-list-of-fuzzy-duplicate-ids.sh | 66 ------ .../remove-duplicates.sh | 52 ----- .../gpu_deduplication_example/run-buckets.sh | 29 --- examples/gpu_deduplication_example/run-cc.sh | 26 --- .../gpu_deduplication_example/run-jaccard.sh | 16 -- .../gpu_deduplication_example/run-minhash.sh | 42 ---- .../gpu_deduplication_example/run-shuffle.sh | 35 ---- .../gpu_deduplication_example/run-workflow.sh | 70 ------- nemo_curator/modules/__init__.py | 6 + nemo_curator/modules/config.py | 100 +++++++++ nemo_curator/modules/fuzzy_dedup.py | 182 ++++++++++++++-- .../fuzzy_deduplication/minhash_lsh.py | 2 +- tests/test_config.py | 81 ++++++++ tests/test_fuzzy_dedup.py | 195 +++++++++++++++++- 19 files changed, 670 insertions(+), 477 deletions(-) create mode 100644 config/fuzzy_dedup_config.yaml create mode 100644 examples/fuzzy_deduplication.py delete mode 100644 examples/gpu_deduplication_example/README.md delete mode 100644 examples/gpu_deduplication_example/batch.sh delete mode 100644 examples/gpu_deduplication_example/create-list-of-exact-duplicate-ids.sh delete mode 100644 examples/gpu_deduplication_example/create-list-of-fuzzy-duplicate-ids.sh delete mode 100644 examples/gpu_deduplication_example/remove-duplicates.sh delete mode 100644 examples/gpu_deduplication_example/run-buckets.sh delete mode 100644 examples/gpu_deduplication_example/run-cc.sh delete mode 100644 examples/gpu_deduplication_example/run-jaccard.sh delete mode 100644 examples/gpu_deduplication_example/run-minhash.sh delete mode 100644 examples/gpu_deduplication_example/run-shuffle.sh delete mode 100755 examples/gpu_deduplication_example/run-workflow.sh create mode 100644 nemo_curator/modules/config.py create mode 100644 tests/test_config.py diff --git a/config/fuzzy_dedup_config.yaml b/config/fuzzy_dedup_config.yaml new file mode 100644 index 000000000..a513a72f8 --- /dev/null +++ b/config/fuzzy_dedup_config.yaml @@ -0,0 +1,16 @@ +cache_dir: "./fuzzy_dedup_cache" +# Optional Params below with default values +# profile_dir: null +# id_field: "id" +# text_field: "text" + +# seed: 42 +# char_ngrams: 5 +# num_buckets: 20 +# hashes_per_bucket: 13 +# use_64_bit_hash: false +# buckets_per_shuffle: 1 + +# false_positive_check: True +# num_anchors: 2 +# jaccard_threshold: 0.8 diff --git a/examples/fuzzy_deduplication.py b/examples/fuzzy_deduplication.py new file mode 100644 index 000000000..d74fd775c --- 
/dev/null +++ b/examples/fuzzy_deduplication.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time + +import dask +from dask import dataframe as dd + +from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import get_client, write_to_disk +from nemo_curator.utils.script_utils import add_distributed_args + + +def pre_imports(): + import cudf # noqa: F401 + + +def main(args): + + dataset_dir = "/path/to/dataset" + log_dir = "./" + cache_dir = "./fuzzy_cache" + output_dir = "./output" + dataset_id_field = "id" + dataset_text_field = "text" + + filetype = "parquet" + + # Fuzzy dup calculation only supports the cuDF/GPU backend + backend = "cudf" + assert args.device == "gpu" + + with dask.config.set({"dataframe.backend": backend}): + client = get_client(args, args.device) + client.run(pre_imports) + + t0 = time.time() + if filetype == "parquet": + input_dataset = DocumentDataset( + dd.read_parquet( + dataset_dir, + columns=[dataset_id_field, dataset_text_field], + blocksize="256MiB", + aggregate_files=True, + ) + ) + elif filetype == "jsonl": + input_dataset = DocumentDataset.read_json( + dataset_dir, + backend=backend, + ) + + fuzzy_dedup_config = FuzzyDuplicatesConfig( + cache_dir=cache_dir, + id_field=dataset_id_field, + text_field=dataset_text_field, + seed=42, + char_ngrams=5, + num_buckets=20, + hashes_per_bucket=13, + use_64_bit_hash=False, + buckets_per_shuffle=5, + false_positive_check=True, + num_anchors=2, + jaccard_threshold=0.8, + ) + fuzzy_dup = FuzzyDuplicates(logger=log_dir, config=fuzzy_dedup_config) + duplicates = fuzzy_dup(dataset=input_dataset) + + # By default all duplicate id's and the group they belong to are included in the result + # keep 1 document from each group of duplcates and mark the others to remove + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html + docs_to_remove = duplicates.df.map_partitions( + lambda x: x[x.group.duplicated(keep="first")] + ) + + # When there are few duplicates we can compute the results to a list and use `isin`. 
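+    # (Editor's note) If the duplicate list is too large to collect with .compute(), a
+    # lazy alternative is an anti-join on the ID column (e.g., a left merge followed by
+    # filtering out the matched rows), at the cost of an extra shuffle.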
+ result = input_dataset.df[ + ~input_dataset.df[dataset_id_field].isin( + docs_to_remove[dataset_id_field].compute() + ) + ] + write_to_disk(result, output_dir, output_type=filetype) + print(time.time() - t0) + + +def attach_args( + parser=argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ), +): + return add_distributed_args(parser) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/examples/gpu_deduplication_example/README.md b/examples/gpu_deduplication_example/README.md deleted file mode 100644 index 2f294e1f6..000000000 --- a/examples/gpu_deduplication_example/README.md +++ /dev/null @@ -1,29 +0,0 @@ -### Deduplication Steps - -> [!CAUTION] -> The examples references here are outdated and will be replaced with an example using the Python API directly. For more details on the scripts refer to [nemo_curator/scripts/fuzzy_deduplication](/nemo_curator/scripts/fuzzy_deduplication) - -1. Exact dedup - 1. Input: Data directories - 2. Output: exact_duplicates.parquet. List of exact duplicates and the document hash. - -Fuzzy Dedup -1. Minhashes (Compute minhashes) - 1. Input: Data Directories - 2. Output: minhashes.parquet for each data dir. -2. Buckets (Minhash Buckets) - 1. Input: Minhash directories - 2. Output: Buckets.parquet -3. Jaccard Map Buckets + Jaccard shuffle - 1. Input: Buckets.parquet + Data Dir - 2. Output: Shuffled docs.parquet -4. Jaccard compute - 1. Input: Shuffled docs.parquet - 2. Output: dedup_final_results.parquet -5. Connected Components - 1. Input: Dedup_final_Results.parquet - 2. Output: connected_components.parquet - - -While calling the main `run-workflow.sh` script that points to these runscripts users can also set the relevant `LIBCUDF_CUFILE_POLICY`. -It is reccomended to set `LIBCUDF_CUFILE_POLICY=OFF` for all runs calling the script. diff --git a/examples/gpu_deduplication_example/batch.sh b/examples/gpu_deduplication_example/batch.sh deleted file mode 100644 index eca7145c8..000000000 --- a/examples/gpu_deduplication_example/batch.sh +++ /dev/null @@ -1,38 +0,0 @@ -#! /bin/bash - -#SBATCH --job-name=nemo-data-curator:gpu-deduplication -#SBATCH --nodes=8 -#SBATCH --exclusive -#SBATCH --time=04:00:00 - -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# -# This script can be used for running both exact and fuzzy document-level -# deduplication using Dask and cuDF -# - -base_dir=`pwd` # Assumes base dir is top-level dir of repo -RUNSCRIPT=${RUNSCRIPT:-${base_dir}/examples/gpu_deduplication_example/run-minhash.sh} -LIBCUDF_CUFILE_POLICY=${LIBCUDF_CUFILE_POLICY:-OFF} -echo $RUNSCRIPT - -docker_image='nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.03' -mounts="${base_dir}:${base_dir}" - -srun -l \ - --container-mounts=${mounts} \ - --container-image=${docker_image} \ - bash -c "echo ${RUNSCRIPT};echo ${LIBCUDF_CUFILE_POLICY}; LIBCUDF_CUFILE_POLICY=${LIBCUDF_CUFILE_POLICY} RUNSCRIPT=${RUNSCRIPT} bash ${base_dir}/examples/gpu_deduplication_example/run-workflow.sh" diff --git a/examples/gpu_deduplication_example/create-list-of-exact-duplicate-ids.sh b/examples/gpu_deduplication_example/create-list-of-exact-duplicate-ids.sh deleted file mode 100644 index 757629e33..000000000 --- a/examples/gpu_deduplication_example/create-list-of-exact-duplicate-ids.sh +++ /dev/null @@ -1,53 +0,0 @@ -#! /bin/bash - -#SBATCH --job-name=nemo-data-curator:create-exact-dup-id-list -#SBATCH --nodes=1 -#SBATCH --exclusive -#SBATCH --time=0:30:00 - -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -eux - -## Log and intermediate results dirs -base_dir=`pwd` -src_dir="${base_dir}/workspace/nemo-data-curator" -log_dir=${src_dir}/workspace/log/create_exact_dup_id_list -res_dir=${src_dir}/workspace/data/create_exact_dup_id_list -conf_dir=${src_dir}/workspace/config -mkdir -p ${log_dir} ${res_dir} ${conf_dir} - -## Container related variables -docker_image="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11" -mounts="${base_dir}:${base_dir}" - -## Set relevant filepath -input_id_list_dir= - -srun -l \ - --mpi=pmix \ - --output=${log_dir}/create_exact_dup_id_list_%j.out \ - --error=${log_dir}/create_exact_dup_id_list_%j.err \ - --container-image=${docker_image} \ - --container-mounts=${mounts} \ - create_list_of_duplicate_ids \ - --input-id-list-dir=${input_id_list_dir} \ - --input-bucket-key="_hashes" \ - --output-id-list-dir=${res_dir}/exact_dup_ids \ - --output-bucket-list-dir=${res_dir}/buckets \ - --log-dir=${log_dir}/create_exact_dup_id_list - -# Concatenate the extracted list of ids -cat ${res_dir}/exact_dup_ids/*.txt > ${res_dir}/exact_duplicate_id_list.txt diff --git a/examples/gpu_deduplication_example/create-list-of-fuzzy-duplicate-ids.sh b/examples/gpu_deduplication_example/create-list-of-fuzzy-duplicate-ids.sh deleted file mode 100644 index 70b0d13bd..000000000 --- a/examples/gpu_deduplication_example/create-list-of-fuzzy-duplicate-ids.sh +++ /dev/null @@ -1,66 +0,0 @@ -#! /bin/bash - -#SBATCH --job-name=nemo-data-curator:create-fuzzy-dup-id-list -#SBATCH --nodes=1 -#SBATCH --exclusive -#SBATCH --time=0:30:00 - -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -eux - -## Log and intermediate results dirs -base_dir=`pwd` -src_dir="${base_dir}/workspace/nemo-data-curator" -log_dir=${src_dir}/workspace/log/create_fuzzy_dup_id_list -res_dir=${src_dir}/workspace/data/create_fuzzy_dup_id_list -conf_dir=${src_dir}/workspace/config -mkdir -p ${log_dir} ${res_dir} ${conf_dir} - -## Container related variables -docker_image="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11" -mounts="${base_dir}:${base_dir}" - -## Set relevant filepath -input_id_list_dir= - -# Generate the mapping and prepare the connected components -srun -l \ - --nodes=1 \ - --output=${log_dir}/create_fuzzy_dup_id_list_%j.out \ - --error=${log_dir}/create_fuzzy_dup_id_list_%j.err \ - --container-image=${docker_image} \ - --container-mounts=${mounts} \ - prepare_fuzzy_ids \ - --path-to-connected-components=${input_id_list_dir} \ - --output-indexed-connected-components=${res_dir}/indexed_connected_components.parquet \ - --output-id-mapping=${res_dir}/mapping.json - -srun -l \ - --mpi=pmix \ - --output=${log_dir}/create_fuzzy_dup_id_list_%j.out \ - --error=${log_dir}/create_fuzzy_dup_id_list_%j.err \ - --container-image=${docker_image} \ - --container-mounts=${mounts} \ - create_list_of_duplicate_ids \ - --input-id-list-dir=${res_dir}/indexed_connected_components.parquet \ - --input-bucket-key="group" \ - --id-mapping=${res_dir}/mapping.json \ - --output-id-list-dir=${res_dir}/fuzzy_dup_ids \ - --output-bucket-list-dir=${res_dir}/buckets \ - --log-dir=${log_dir}/create_fuzzy_dup_id_list - -# Concatenate the extracted list of ids -cat ${res_dir}/fuzzy_dup_ids/*.txt > ${res_dir}/fuzzy_duplicate_id_list.txt diff --git a/examples/gpu_deduplication_example/remove-duplicates.sh b/examples/gpu_deduplication_example/remove-duplicates.sh deleted file mode 100644 index 275c9f153..000000000 --- a/examples/gpu_deduplication_example/remove-duplicates.sh +++ /dev/null @@ -1,52 +0,0 @@ -#! /bin/bash - -#SBATCH --job-name=nemo-data-curator:remove-duplicates -#SBATCH --nodes=10 -#SBATCH --exclusive -#SBATCH --time=01:00:00 - -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -set -eux - -## Log and intermediate results dirs -base_dir=`pwd` -src_dir="${base_dir}/workspace/nemo-data-curator" -log_dir=${src_dir}/workspace/log/remove_duplicates -res_dir=${src_dir}/workspace/data/remove_duplicates -conf_dir=${src_dir}/workspace/config -mkdir -p ${log_dir} ${res_dir} ${conf_dir} - -## Container related variables -docker_image="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11" -mounts="${base_dir}:${base_dir}" - -## Set relevant filepaths -input_data_dir="" -input_id_list="" -output_data_dir="" -fname=$(basename ${input_id_list}) -tag=$(basename $fname .txt) - -srun -l \ - --output=${log_dir}/remove_duplicates_${tag}_%j.out \ - --error=${log_dir}/remove_duplicates_${tag}_%j.err \ - --container-image=${docker_image} \ - --container-mounts=${mounts} \ - remove_duplicates \ - --input-data-dir=${input_data_dir} \ - --input-id-list=${input_id_list} \ - --output-deduped-dir=${output_data_dir}/all_deduped \ - --log-dir=${log_dir}/all_deduped_${tag} diff --git a/examples/gpu_deduplication_example/run-buckets.sh b/examples/gpu_deduplication_example/run-buckets.sh deleted file mode 100644 index 7ca1d1021..000000000 --- a/examples/gpu_deduplication_example/run-buckets.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -minhash_dir="/outputdir/minhashes" -datasets=$(ls ${minhash_dir}) -for dataset in $datasets; do - input_minhash_dirs="$input_minhash_dirs $minhash_dir/$dataset/minhashes.parquet" -done -output_dir="/outputdir" - -buckets_per_shuffle=1 - -mkdir -p $output_dir -echo $input_minhash_dirs - -# Remove old buckets -rm -r ${output_dir}/buckets.parquet - -python -u minhash_buckets.py \ - --input-data-dirs $input_minhash_dirs \ - --minhash-length 260 \ - --output-bucket-dir $output_dir/ \ - --log-dir $LOGDIR \ - --protocol ucx \ - --num-bands 20 \ - --buckets-per-shuffle=$buckets_per_shuffle \ - --split-out=512 \ - --scheduler-file $LOGDIR/scheduler.json - -echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-cc.sh b/examples/gpu_deduplication_example/run-cc.sh deleted file mode 100644 index ab0c62108..000000000 --- a/examples/gpu_deduplication_example/run-cc.sh +++ /dev/null @@ -1,26 +0,0 @@ - -base_dir="/outputdir" -cc_folder="CC" -output_dir="${base_dir}/${cc_folder}_output" -cache_dir="${base_dir}/${cc_folder}_cache" -jaccard_pairs_path="/outputdir/dedup_final_results.parquet" - - -echo "output_dir set to $output_dir" -echo "cache_dir set to $cache_dir" - -export RAPIDS_NO_INITIALIZE="1" -export CUDF_SPILL="1" - -### compute connected component -#rm -r $cache_dir -mkdir -p $output_dir $cache_dir - -python -u connected_component.py \ - --jaccard-pairs-path $jaccard_pairs_path \ - --output-dir $output_dir \ - --cache-dir $cache_dir \ - --log-dir $LOGDIR \ - --profile-path $PROFILESDIR \ - --num-files $NUM_FILES \ - --scheduler-file $LOGDIR/scheduler.json diff --git a/examples/gpu_deduplication_example/run-jaccard.sh b/examples/gpu_deduplication_example/run-jaccard.sh deleted file mode 100644 index 6ee51d302..000000000 --- a/examples/gpu_deduplication_example/run-jaccard.sh +++ /dev/null @@ -1,16 +0,0 @@ - -shuffled_docs_dir="/outputdir/shuffled_docs.parquet" -output_dir="/outputdir" - - -export CUDF_SPILL="1" - -python jaccard_compute.py \ - --shuffled-docs-path $shuffled_docs_dir \ - --output-dir $output_dir \ - --log-dir $LOGDIR \ - --num-files $NUM_FILES \ - --scheduler-file $LOGDIR/scheduler.json - - -echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-minhash.sh 
b/examples/gpu_deduplication_example/run-minhash.sh deleted file mode 100644 index 79e069cdb..000000000 --- a/examples/gpu_deduplication_example/run-minhash.sh +++ /dev/null @@ -1,42 +0,0 @@ -#! /bin/bash - -# Assumes each directory contains Jsonl files -input_data_dirs="/datadir/dataset1/ \ -/datadir/dataset2/ \ -/datadir/dataset3/" - -output_dir="/outputdir/minhashes" - -# NOTE: The script implicitly assumes that the last part -# of the input data paths is the dataset name and will choose -# output dir names as follows: -# /outputdir/minhashes/dataset1 -# /outputdir/minhashes/dataset2 -# /outputdir/minhashes/dataset3 -# This can cause issues if the last part of the -# dirname is the same across datasets - -mkdir -p $output_dir - -# Is a good number for files 200MB or lesser -# Use a smaller value for larger jsonl files -files_per_partition=20 - -mkdir -p $output_dir -echo $input_data_dirs - -python -u compute_minhashes.py \ - --input-data-dirs $input_data_dirs \ - --minhash-length 260 \ - --char-ngram 5 \ - --hash-bytes 4 \ - --seed 42 \ - --output-minhash-dir $output_dir \ - --log-dir $LOGDIR \ - --num-files $NUM_FILES \ - --files-per-partition $files_per_partition \ - --profile-path $PROFILESDIR \ - --log-frequency 250 \ - --scheduler-file $LOGDIR/scheduler.json - -echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-shuffle.sh b/examples/gpu_deduplication_example/run-shuffle.sh deleted file mode 100644 index e559dbbb1..000000000 --- a/examples/gpu_deduplication_example/run-shuffle.sh +++ /dev/null @@ -1,35 +0,0 @@ -input_data_dirs="/datadir/dataset1/ \ -/datadir/dataset2/ \ -/datadir/dataset3/" -buckets_dir="/outputdir/buckets.parquet" -output_dir="/outputdir" - - -export CUDF_SPILL="1" - -## Run jaccard Mapping -echo "Starting Jaccard mapping..." -python jaccard_map_buckets.py \ - --input-bucket-dir $buckets_dir \ - --input-data-dirs $input_data_dirs \ - --output-dir $output_dir \ - --log-dir $LOGDIR \ - --text-ddf-blocksize 512 \ - --num-files $NUM_FILES \ - --scheduler-file $LOGDIR/scheduler.json - -### Run jaccard Shuffle - -echo "Starting Jaccard Shuffle..." - -python jaccard_shuffle.py \ - --input-bucket-mapping-dir $output_dir/anchor_docs_with_bk.parquet \ - --input-data-dirs $input_data_dirs \ - --output-dir $output_dir \ - --text-ddf-blocksize 256 \ - --bucket-mapping-ddf-blocksize 512 \ - --num-files $NUM_FILES \ - --parts-per-worker 1 \ - --scheduler-file $LOGDIR/scheduler.json - -echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-workflow.sh b/examples/gpu_deduplication_example/run-workflow.sh deleted file mode 100755 index b7e1392f6..000000000 --- a/examples/gpu_deduplication_example/run-workflow.sh +++ /dev/null @@ -1,70 +0,0 @@ -#! /bin/bash - -echo "Starting Workflow..." 
-echo "Time Check: `date`" -if [[ -z "$SLURM_JOB_ID" ]]; then - TODAY="`date +"%Y_%m_%d"`" -else - TODAY="`date +"%Y_%m_%d"`-$SLURM_JOB_ID" -fi - -# Prepare output directory -export JOB_DIR=rapids-dedup-scripts/DEDUP-$TODAY -export FULL_OUTPUT_DIR=$HOME/$JOB_DIR -export LOGDIR=$FULL_OUTPUT_DIR/logs -export PROFILESDIR=$FULL_OUTPUT_DIR/profiles -# Take the default location within the container -RUNSCRIPT=${RUNSCRIPT:--/opt/nemo-data-curator/examples/gpu_deduplication_example/run-minhash.sh} -echo $RUNSCRIPT -mkdir -p $LOGDIR -mkdir -p $PROFILESDIR - -cd /opt/nemo-data-curator/nemo_curator/gpu_deduplication -#-----# - - -# Env vars -export RAPIDS_NO_INITIALIZE="1" -export CUDF_SPILL="1" - -export LIBCUDF_CUFILE_POLICY=${LIBCUDF_CUFILE_POLICY:-ALWAYS} - -# Network interface specific to the cluster being used -export INTERFACE=ibp12s0 -export PROTOCOL=ucx -echo $INTERFACE - -# This variable can be set to limit the number of jsonl files that -# are used in the dedup. Setting to -1 reads in all files -export NUM_FILES=-1 - -# Start the scheduler on the rank 0 node -if [[ -z "$SLURM_NODEID" ]] || [[ $SLURM_NODEID == 0 ]]; then - echo "Starting scheduler" - DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True \ - DASK_DISTRIBUTED__RMM__POOL_SIZE=1GB \ - dask scheduler \ - --scheduler-file $LOGDIR/scheduler.json \ - --protocol $PROTOCOL \ - --interface $INTERFACE >> $LOGDIR/scheduler.log 2>&1 & -fi -sleep 30 - -# Start the workers on each node -echo "Starting workers..." -dask-cuda-worker --scheduler-file $LOGDIR/scheduler.json --rmm-pool-size 72GiB --interface $INTERFACE --rmm-async >> $LOGDIR/worker_$HOSTNAME.log 2>&1 & - -sleep 60 - -if [[ -z "$SLURM_NODEID" ]] || [[ $SLURM_NODEID == 0 ]]; then - echo "Time Check: `date`" - bash $RUNSCRIPT - echo "Time Check: `date`" - touch $LOGDIR/done.txt -fi - -# All nodes wait until done -while [ ! -f $LOGDIR/done.txt ] -do - sleep 15 -done diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index 0867942d8..8b9613261 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -22,6 +22,7 @@ from nemo_curator.utils.import_utils import gpu_only_import_from from .add_id import AddId +from .config import FuzzyDuplicatesConfig from .dataset_ops import blend_datasets, Shuffle from .exact_dedup import ExactDuplicates from .filter import Filter, Score, ScoreFilter @@ -32,6 +33,9 @@ # GPU packages LSH = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "LSH") MinHash = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "MinHash") +FuzzyDuplicates = gpu_only_import_from( + "nemo_curator.modules.fuzzy_dedup", "FuzzyDuplicates" +) # Pytorch related imports must come after all imports that require cugraph, # because of context cleanup issues b/w pytorch and cugraph @@ -42,6 +46,8 @@ "DomainClassifier", "ExactDuplicates", "Filter", + "FuzzyDuplicatesConfig", + "FuzzyDuplicates", "LSH", "MinHash", "Modify", diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py new file mode 100644 index 000000000..45ea527f2 --- /dev/null +++ b/nemo_curator/modules/config.py @@ -0,0 +1,100 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from dataclasses import dataclass + +import yaml + + +@dataclass +class BaseConfig: + @classmethod + def from_yaml(cls, file_path: str): + with open(file_path, "r") as file: + yaml_dict = yaml.safe_load(file) + return cls(**yaml_dict) + + +@dataclass +class FuzzyDuplicatesConfig(BaseConfig): + """ + Configuration for MinHash based fuzzy duplicates detection. + Parameters + ---------- + seed: Seed for minhash permutations + char_ngrams: Size of Char ngram shingles used in minhash computation + num_buckets: Number of Bands or buckets to use during Locality Sensitive Hashing + hashes_per_bucket: Number of hashes per bucket/band. + use_64_bit_hash: Whether to use a 32bit or 64bit hash function for minhashing. + buckets_per_shuffle: Number of bands/buckets to shuffle concurrently. + Larger values process larger batches by processing multiple bands + but might lead to memory pressures and related errors. + id_field: Column in the Dataset denoting document ID. + text_field: Column in the Dataset denoting document content. + profile_dir: str, Default None + If specified directory to write dask profile + cache_dir: str, Default None + Location to store deduplcation intermediates such as minhashes/buckets etc. + false_positive_check: bool, + Whether to run a check to look for false positives within buckets. + Note: This is a computationally expensive step. + num_anchors: int + Number of documents per bucket to use as reference for computing jaccard + pairs within that bucket to identify false positives. + jaccard_threshold: float + The Jaccard similariy threshold to consider a document a near duplicate + during false positive evaluations. 
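+    Examples
+    --------
+    An illustrative construction using the same values as config/fuzzy_dedup_config.yaml::
+
+        config = FuzzyDuplicatesConfig(
+            cache_dir="./fuzzy_dedup_cache",
+            num_buckets=20,
+            hashes_per_bucket=13,
+            jaccard_threshold=0.8,
+        )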
+ """ + + # General config + cache_dir: str + profile_dir: str = None + id_field: str = "id" + text_field: str = "text" + + # Minhash + LSH Config + seed: int = 42 + char_ngrams: int = 5 + num_buckets: int = 20 + hashes_per_bucket: int = 13 + use_64_bit_hash: bool = False + buckets_per_shuffle: int = 1 + + false_positive_check: bool = True + # Only required for fp check + num_anchors: int = 2 + jaccard_threshold: float = 0.8 + + def __post_init__(self): + self.num_hashes = self.num_buckets * self.hashes_per_bucket + if self.cache_dir is None: + raise ValueError( + "Finding fuzzy duplicates requires a cache directory accessible via all workers to store intermediates" + ) + if not self.false_positive_check: + raise NotImplementedError( + "Skipping false positive checks is not supported at the moment" + ) + if self.num_anchors <= 0: + raise ValueError("Number of anchors must be greater than 0") + if self.num_anchors > 2: + warnings.warn( + "Using a higher number of anchor docs might lead to higher memory footprint and might impact performance", + category=UserWarning, + ) + if not 0 <= self.jaccard_threshold <= 1: + raise ValueError("Jaccard Threshold must be between [0,1]") + if self.buckets_per_shuffle <= 0: + raise ValueError("Buckets per shuffle must be greater than 0") diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index ac72e53d9..b61ccde72 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -35,6 +35,8 @@ from nemo_curator.datasets import DocumentDataset from nemo_curator.log import create_logger +from nemo_curator.modules.config import FuzzyDuplicatesConfig +from nemo_curator.modules.meta import Sequential from nemo_curator.utils.distributed_utils import ( get_current_client, get_num_workers, @@ -194,7 +196,7 @@ class LSH: def __init__( self, cache_dir: str, - minhash_length: int, + num_hashes: int, num_buckets: int, buckets_per_shuffle: int = 1, logger: Union[logging.LoggerAdapter, str] = "./", @@ -207,9 +209,9 @@ def __init__( ---------- cache_dir: str Needs to be specified, will compute & write duplicate id, bucket pairs to cache directory. - minhash_length: Length of minhash signature + num_hashes: Length of minhash signature num_buckets: Number of bands/buckets to create from the minhash signature. - Hashes_per_signature = minhash_length / num_buckets + Hashes_per_signature = num_hashes / num_buckets buckets_per_shuffle: Number of bands/buckets to shuffle concurrently. Larger values process larger batches by processing multiple bands but might lead to memory pressures and related errors. @@ -219,13 +221,13 @@ def __init__( profile_dir: str, Default None If specified directory to write dask profile """ - self.minhash_length = minhash_length + self.num_hashes = num_hashes self.num_buckets = num_buckets self.id_fields = [id_fields] if isinstance(id_fields, str) else id_fields self.minhash_field = minhash_field self.buckets_per_shuffle = buckets_per_shuffle self.bucket_ranges = self._generate_bucket_ranges( - self.num_buckets, self.minhash_length + self.num_buckets, self.num_hashes ) if cache_dir is None: @@ -245,15 +247,15 @@ def __init__( self._logger = logger def _generate_bucket_ranges( - self, num_buckets: int, minhash_length: int + self, num_buckets: int, num_hashes: int ) -> List[List[int]]: """ Generates a list of indices for the minhash ranges given num_bands & - minhash_length. - eg: num_bands=3, minhash_length=6 + num_hashes. 
+ eg: num_bands=3, num_hashes=6 [[0, 1], [2, 3], [4, 5]] """ - minhashes_per_bucket = minhash_length // num_buckets + minhashes_per_bucket = num_hashes // num_buckets bucket_ranges = [ list( @@ -308,7 +310,7 @@ def _minhash_to_bucket_meta( self, df: dask_cudf.DataFrame ) -> Tuple[cudf.DataFrame, int]: meta = df._meta_nonempty[self.id_fields] - meta[self.minhash_field] = [np.ones(self.minhash_length)] * len(meta) + meta[self.minhash_field] = [np.ones(self.num_hashes)] * len(meta) return self.minhash_to_buckets(meta, self.bucket_ranges) def lsh( @@ -325,7 +327,6 @@ def lsh( bucket_ranges=self.bucket_ranges, meta=meta, ) - bucket_start_id = 0 for i in range(0, self.num_buckets, self.buckets_per_shuffle): value_vars = [ @@ -382,6 +383,154 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset: return DocumentDataset(buckets_df) +class FuzzyDuplicates: + def __init__( + self, + config: FuzzyDuplicatesConfig, + logger: Union[logging.LoggerAdapter, str] = "./", + ): + """ + Parameters + ---------- + config: FuzzyDuplicatesConfig, + Config options for finding FuzzyDuplicates + logger: Existing logger to log to, or a path to a log directory. + + Returns + ------- + DocumentDataset containing IDs of all documents and the corresponding duplicate group + they belong to. Documents in the same group are near duplicates. + """ + if isinstance(logger, str): + self._logger = create_logger( + rank=0, + log_file=os.path.join(logger, "FuzzyDuplicates.log"), + name="FuzzyDuplicates", + ) + else: + self._logger = logger + + self.config = config + self.minhash = MinHash( + seed=self.config.seed, + num_hashes=self.config.num_hashes, + char_ngrams=self.config.char_ngrams, + use_64bit_hash=self.config.use_64_bit_hash, + logger=self._logger, + id_field=self.config.id_field, + text_field=self.config.text_field, + profile_dir=self.config.profile_dir, + cache_dir=self.config.cache_dir, + ) + self.lsh = LSH( + cache_dir=self.config.cache_dir, + num_hashes=self.config.num_hashes, + num_buckets=self.config.num_buckets, + buckets_per_shuffle=self.config.buckets_per_shuffle, + logger=self._logger, + id_fields=[self.config.id_field], + profile_dir=self.config.profile_dir, + ) + self.map_buckets = _MapBuckets( + id_fields=[self.config.id_field], + text_field=self.config.text_field, + logger=self._logger, + num_anchors=self.config.num_anchors, + ) + self.jaccard_shuffle = _Shuffle( + id_fields=[self.config.id_field], + text_field=self.config.text_field, + logger=self._logger, + profile_dir=self.config.profile_dir, + ) + self.jaccard_compute = JaccardSimilarity( + id_field=self.config.id_field, + text_field=self.config.text_field, + ngram_width=self.config.char_ngrams, + anchor_id_fields=[ + f"anchor_{i}_{self.config.id_field}" + for i in range(self.config.num_anchors) + ], + ) + self.connected_components = ConnectedComponents( + cache_dir=self.config.cache_dir, + jaccard_pairs_path=os.path.join( + self.config.cache_dir, "jaccard_similarity_results.parquet" + ), + id_column=self.config.id_field, + convert_str_ids=False, + jaccard_threshold=self.config.jaccard_threshold, + ) + + def __call__(self, dataset: DocumentDataset): + """ + Parameters + ---------- + dataset: DocumentDataset + The input datset to compute FuzzyDuplicates. Must contain a text and unique id field. + + Returns + ------- + DocumentDataset containing IDs of all documents and the corresponding duplicate group + they belong to. Documents in the same group are near duplicates. 
+        """
+        # Minhash + LSH
+        print("Stage1: Starting Minhash + LSH computation")
+        minhashLSH = Sequential([self.minhash, self.lsh])
+        buckets_df = minhashLSH(dataset)
+        print("Stage1: Minhash + LSH complete!")
+
+        # Map buckets to lower cardinality distribution
+        print("Stage2 (False Positive Check): Starting Map_Buckets")
+        ddf_mapped_buckets_w_anchors = self.map_buckets.map_buckets_with_anchors(
+            documents_df=dataset.df, buckets_df=buckets_df.df
+        )
+        mapped_buckets_w_anchors_path = os.path.join(
+            self.config.cache_dir, "anchor_docs_with_bk.parquet"
+        )
+        ddf_mapped_buckets_w_anchors.to_parquet(
+            mapped_buckets_w_anchors_path, write_index=False
+        )
+        print("Stage2 (False Positive Check): Map_Buckets Complete!")
+
+        # Shuffle documents based on mapped buckets
+        print("Stage3 (False Positive Check): Shuffle docs")
+        shuffled_docs_path = os.path.join(
+            self.config.cache_dir, "shuffled_docs.parquet"
+        )
+        self.jaccard_shuffle.shuffle_docs_on_buckets(
+            documents_df=dataset.df,
+            bucket_w_anchors_path=mapped_buckets_w_anchors_path,
+            output_shuffled_docs_path=shuffled_docs_path,
+            bucket_mapping_df_blocksize=256,
+            parts_per_worker=1,
+            bucket_parts_per_worker=8,
+        )
+        print("Stage3 (False Positive Check): Shuffle docs complete!")
+
+        # Jaccard comparison within buckets
+        print("Stage4 (False Positive Check): Jaccard Similarity in Buckets")
+        jaccard_pairs_path = os.path.join(
+            self.config.cache_dir, "jaccard_similarity_results.parquet"
+        )
+        jaccard_pairs_df = self.jaccard_compute.jaccard_compute(
+            shuffled_docs_path=shuffled_docs_path
+        )
+        jaccard_pairs_df.to_parquet(
+            jaccard_pairs_path,
+            write_index=False,
+            write_metadata_file=False,
+        )
+        print("Stage4 (False Positive Check): Jaccard Similarity in Buckets Complete!")
+
+        # Connected components across buckets
+        print("Stage5: Connected Components across buckets")
+        cc_path = os.path.join(self.config.cache_dir, "connected_components.parquet")
+        self.connected_components.cc_workflow(cc_path)
+        print("Stage5: Connected Components across buckets complete!")
+        return DocumentDataset(dask_cudf.read_parquet(cc_path, split_row_groups=False))
+
+
 class _MapBuckets:
     """
     buckets to a logical partition by using a modified bin packing algorithm.
@@ -508,6 +657,7 @@ def _get_output_map_based_on_str_bytes( """ Add output_partition_id to buckets_ddf """ + documents_df = documents_df.copy() documents_df[bytes_column] = documents_df[self.text_field].map_partitions( lambda s: s.str.byte_count() ) @@ -620,7 +770,7 @@ def map_buckets_with_anchors( ddf_anchor_docs_with_bk, self.id_fields, ignore_index=True, - shuffle=shuffle_type, + shuffle_method=shuffle_type, ).map_partitions( M.drop_duplicates, meta=ddf_anchor_docs_with_bk._meta, @@ -1195,7 +1345,7 @@ def _write_dedup_encoded_jaccard_pair(self, encoded_jaccard_pair_path): ddf, [self.left_id, self.right_id], ignore_index=True, - shuffle="tasks", + shuffle_method="tasks", ) ddf = ddf.map_partitions( M.drop_duplicates, @@ -1301,12 +1451,12 @@ def _batched_merge_and_write( how="inner", broadcast=True, ) + subset_ddf = subset_ddf.drop( + columns=pair_ids, + ) subset_ddf = subset_ddf.rename( columns={"uid": f"{self.id_column}_{tag}"} ) - subset_ddf = subset_ddf.drop( - columns=[f"dataset_id_{tag}", f"doc_id_{tag}"] - ) subset_ddf = subset_ddf[[self.left_id, self.right_id, "jaccard"]] output_batch_path = os.path.join(output_path, f"{batch_id}.parquet") diff --git a/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py index a0484cf0d..21dac27d7 100644 --- a/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py +++ b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py @@ -64,7 +64,7 @@ def main(args): ) lsh = LSH( cache_dir=args.output_bucket_dir, - minhash_length=args.minhash_length, + num_hashes=args.minhash_length, num_buckets=args.num_bands, buckets_per_shuffle=args.buckets_per_shuffle, id_fields=["dataset_id", "doc_id"], diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 000000000..fcb34d29a --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass + +import pytest +import yaml + +from nemo_curator.modules.config import BaseConfig + + +@dataclass +class CustomConfig(BaseConfig): + a: str + b: int + c: bool + d: float = 3.0 + + def __post_init__(self): + if self.d <= 0: + raise ValueError("d must be positive") + + +class TestConfig: + @pytest.fixture(autouse=True) + def config_params(self): + self.config_dict = {"a": "a", "b": 1, "c": True, "d": 4.0} + + def test_init(self): + config = CustomConfig(a="a", b=1, c=True) + assert config.a == "a" + assert config.b == 1 + assert config.c is True + assert config.d == 3.0 + + def test_from_yaml(self, tmpdir): + with open(tmpdir / "test_config.yaml", "w") as file: + yaml.dump(self.config_dict, file) + + config = CustomConfig.from_yaml(tmpdir / "test_config.yaml") + for key, value in self.config_dict.items(): + assert getattr(config, key) == value + + def test_from_yaml_raises(self, tmpdir): + config_dict = self.config_dict.copy() + config_dict["d"] = -1.0 + with open(tmpdir / "test_config.yaml", "w") as file: + yaml.dump(config_dict, file) + with pytest.raises(ValueError): + CustomConfig.from_yaml(tmpdir / "test_config.yaml") + + def test_from_yaml_missing_key(self, tmpdir): + config_dict = self.config_dict.copy() + del config_dict["a"] + with open(tmpdir / "test_config.yaml", "w") as file: + yaml.dump(config_dict, file) + with pytest.raises(TypeError): + CustomConfig.from_yaml(tmpdir / "test_config.yaml") + + def test_from_yaml_extra_key(self, tmpdir): + config_dict = self.config_dict.copy() + config_dict["e"] = "e" + with open(tmpdir / "test_config.yaml", "w") as file: + yaml.dump(config_dict, file) + with pytest.raises(TypeError): + CustomConfig.from_yaml(tmpdir / "test_config.yaml") + + def test_post_init_raises(self): + with pytest.raises(ValueError): + CustomConfig(a="a", b=1, c=True, d=-1.0) diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py index f0ded450e..e89f998e0 100644 --- a/tests/test_fuzzy_dedup.py +++ b/tests/test_fuzzy_dedup.py @@ -18,14 +18,17 @@ import numpy as np import pytest +import yaml from dask.dataframe.utils import assert_eq +from distributed import Client +from nemo_curator import LSH, FuzzyDuplicates, FuzzyDuplicatesConfig, MinHash from nemo_curator.datasets import DocumentDataset -from nemo_curator.modules import LSH, MinHash -from nemo_curator.utils.import_utils import gpu_only_import +from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from cudf = gpu_only_import("cudf") dask_cudf = gpu_only_import("dask_cudf") +LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster") @pytest.fixture @@ -46,6 +49,25 @@ def fuzzy_dedup_data(): return DocumentDataset(df) +@pytest.fixture +def large_fuzzy_dedup_data(): + df = cudf.DataFrame( + { + "id": np.arange(500), + "text": [ + "A test string", + "A different test string", + "A different object", + "The quick brown fox jumps over the lazy dog", + "The quick black cat jumps over the lazy dog", + ] + * 100, + } + ) + df = dask_cudf.from_cudf(df, 5).reset_index(drop=True) + return DocumentDataset(df) + + def minhash_overlap(minhash1: np.array, minhash2: np.array): assert len(minhash1) == len(minhash2) overlap = sum(minhash1 == minhash2) @@ -149,7 +171,7 @@ def minhash_data(self): def test_lsh(self, tmpdir, buckets_per_shuffle): lsh = LSH( cache_dir=tmpdir, - minhash_length=6, + num_hashes=6, num_buckets=3, buckets_per_shuffle=buckets_per_shuffle, minhash_field="minhash_sig", @@ -164,7 +186,7 @@ def test_lsh(self, tmpdir, 
buckets_per_shuffle): def test_multiple_id_cols(self, tmpdir): lsh = LSH( cache_dir=tmpdir, - minhash_length=6, + num_hashes=6, num_buckets=3, buckets_per_shuffle=1, id_fields=["id", "dataset_id"], @@ -180,3 +202,168 @@ def test_multiple_id_cols(self, tmpdir): [[(1, 1), (1, 2)], [(1, 2), (2, 3)], [(3, 4), (4, 5)]], name="new_id" ) assert_eq(expected_df, docs_list, check_index=False) + + +@pytest.mark.gpu +class TestFuzzyDuplicates: + @pytest.fixture(autouse=True, scope="class") + def gpu_client(self, request): + with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client: + request.cls.client = client + request.cls.cluster = cluster + yield + + @pytest.mark.parametrize("use_64_bit_hash", [False, True]) + @pytest.mark.parametrize( + "num_buckets,jaccard_threshold,duplicate_docs", + # Duplcated docs estimated from true_jaccard values + [ + (5, 0.5, [[4, -1]]), + (10, 0.39, [[4, -1], [1, 2]]), + (3, 0.3, [[4, -1], [1, 2, 300]]), + ], + ) + def test_fuzzy_dedup( + self, + fuzzy_dedup_data, + use_64_bit_hash, + num_buckets, + jaccard_threshold, + duplicate_docs, + tmpdir, + ): + print(self.client) + # Dedup might fail when indices per partition do not start from 0 + fuzzy_dedup_data.df = fuzzy_dedup_data.df.reset_index(drop=True) + config = FuzzyDuplicatesConfig( + cache_dir=tmpdir, + id_field="id", + text_field="text", + seed=42, + char_ngrams=5, + num_buckets=num_buckets, + hashes_per_bucket=1, + use_64_bit_hash=use_64_bit_hash, + buckets_per_shuffle=5, + false_positive_check=True, + num_anchors=2, + jaccard_threshold=jaccard_threshold, + ) + fuzzy_duplicates = FuzzyDuplicates(config=config) + result = fuzzy_duplicates(fuzzy_dedup_data) + result_df = result.df.compute() + # Drop non duplicated docs + result_df = result_df[result_df.group.duplicated(keep=False)] + result_df = result_df.groupby("group").id.collect() + # Sort to maintain uniform ordering + + result_df = result_df.list.sort_values() + result_df = result_df.sort_values() + expected_df = cudf.Series(duplicate_docs, name="id") + expected_df = expected_df.list.sort_values() + expected_df = expected_df.sort_values() + assert_eq(expected_df, result_df, check_index=False) + + @pytest.mark.xfail + def test_non_uniform_indices( + self, + tmpdir, + ): + print(self.client) + # Dedup might fail when indices per partition do not start from 0 + df = cudf.DataFrame( + { + "id": [1, 2, 300, 4, -1], + "text": [ + "A test string", + "A different test string", + "A different object", + "The quick brown fox jumps over the lazy dog", + "The quick black cat jumps over the lazy dog", + ], + } + ) + df = dask_cudf.from_cudf(df, 2) + data = DocumentDataset(df) + duplicate_docs = [[4, -1], [1, 2, 300]] + config = FuzzyDuplicatesConfig( + cache_dir=tmpdir, + id_field="id", + text_field="text", + seed=42, + char_ngrams=5, + num_buckets=10, + hashes_per_bucket=1, + use_64_bit_hash=False, + buckets_per_shuffle=5, + false_positive_check=True, + num_anchors=2, + jaccard_threshold=0.39, + ) + fuzzy_duplicates = FuzzyDuplicates(config=config) + result = fuzzy_duplicates(data) + result_df = result.df.compute() + # Drop non duplicated docs + result_df = result_df[result_df.group.duplicated(keep=False)] + result_df = result_df.groupby("group").id.collect() + # Sort to maintain uniform ordering + + result_df = result_df.list.sort_values() + result_df = result_df.sort_values() + expected_df = cudf.Series(duplicate_docs, name="id") + expected_df = expected_df.list.sort_values() + expected_df = expected_df.sort_values() + assert_eq(expected_df, result_df, 
check_index=False) + + @pytest.mark.parametrize("num_anchors", [1, 3, 10]) + def test_num_anchors(self, large_fuzzy_dedup_data, num_anchors, tmpdir): + config = FuzzyDuplicatesConfig( + cache_dir=tmpdir, + id_field="id", + text_field="text", + seed=42, + char_ngrams=5, + num_buckets=5, + hashes_per_bucket=1, + use_64_bit_hash=False, + buckets_per_shuffle=5, + false_positive_check=True, + num_anchors=num_anchors, + jaccard_threshold=0.39, + ) + fuzzy_duplicates = FuzzyDuplicates(config=config) + fuzzy_duplicates(large_fuzzy_dedup_data) + anchor_docs_df_cols = dask_cudf.read_parquet( + tmpdir / "anchor_docs_with_bk.parquet" + ).columns + assert all(f"anchor_{i}_id" in anchor_docs_df_cols for i in range(num_anchors)) + + +class TestFuzzyDuplicatesConfig: + def test_bad_inputs(self, tmpdir): + with pytest.raises(ValueError): + FuzzyDuplicatesConfig(cache_dir=tmpdir, num_anchors=0) + with pytest.warns( + UserWarning, match="Using a higher number of anchor docs might" + ): + FuzzyDuplicatesConfig(cache_dir=tmpdir, num_anchors=3) + with pytest.raises(ValueError): + FuzzyDuplicatesConfig(cache_dir=tmpdir, jaccard_threshold=1.2) + with pytest.raises(NotImplementedError): + FuzzyDuplicatesConfig(cache_dir=tmpdir, false_positive_check=False) + with pytest.raises(ValueError): + FuzzyDuplicatesConfig(cache_dir=tmpdir, buckets_per_shuffle=0) + + def test_from_yaml(self, tmpdir): + yaml_params = { + "cache_dir": "./", + "num_anchors": 2, + "jaccard_threshold": 0.8, + "false_positive_check": True, + "buckets_per_shuffle": 1, + } + with open(tmpdir / "config.yaml", "w") as f: + yaml.dump(yaml_params, f) + config = FuzzyDuplicatesConfig.from_yaml(tmpdir / "config.yaml") + for param in yaml_params: + assert getattr(config, param) == yaml_params[param] From 9849164cc22a1f1d3e091a0bf125b15c26e2ba8a Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 6 May 2024 15:02:23 -0700 Subject: [PATCH 12/34] Fix indexing in PII Modifier (#55) * Fix pii index issue Signed-off-by: Ryan Wolf * Add sequential wrapper Signed-off-by: Ryan Wolf * Fix pii tests Signed-off-by: Ryan Wolf --------- Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- docs/user-guide/QualityFiltering.rst | 27 +++++++++ nemo_curator/filters/classifier_filter.py | 6 +- nemo_curator/modifiers/__init__.py | 2 + nemo_curator/modifiers/pii_modifier.py | 4 +- tests/test_filters.py | 17 ++++++ tests/test_pii_accuracy.py | 68 +++++++++++++++++++++++ 6 files changed, 119 insertions(+), 5 deletions(-) diff --git a/docs/user-guide/QualityFiltering.rst b/docs/user-guide/QualityFiltering.rst index 46a8c9d81..ba2c34ad6 100644 --- a/docs/user-guide/QualityFiltering.rst +++ b/docs/user-guide/QualityFiltering.rst @@ -153,6 +153,33 @@ Here is the ``WordCountFilter`` rewritten to use batches in the ``keep_document` pass_max = score <= self._max_words return pass_min & pass_max +When you use the ``batched`` decorator, the index of the series returned from the function must remain the same as the index that was passed in. +The index may not be continuous due to filters being applied prior to the current filter. +In the above code, the index will be the same automatically so no change is required. +However, when writing functions that transform the series into a different structure like a list, special care is needed. +The following code example demonstrates what this error may look like, and how to fix it. + +.. 
code-block:: python + + class BuggyLengthFilter(DocumentFilter): + + @batched + def score_document(self, documents: pd.Series): + scores = [] + for document in documents: + scores.append(len(document)) + + return pd.Series(scores) # Bad! Does not preserve the index + + class CorrectLengthFilter(DocumentFilter): + + @batched + def score_document(self, documents: pd.Series): + scores = [] + for document in documents: + scores.append(len(document)) + + return pd.Series(scores, index=documents.index) # Good! Preserves the index ----------------------------------------- diff --git a/nemo_curator/filters/classifier_filter.py b/nemo_curator/filters/classifier_filter.py index 3ade004ec..4f06c8b25 100644 --- a/nemo_curator/filters/classifier_filter.py +++ b/nemo_curator/filters/classifier_filter.py @@ -37,7 +37,7 @@ def __init__(self, model_path=None, label="__label__hq", alpha=3, seed=42): self._name = "fasttext_quality_filter" @batched - def score_document(self, df): + def score_document(self, df: pd.Series): model_attr = f"{self._name}_{self._model_path}" try: model = load_object_on_worker(model_attr, self._load_model, {}) @@ -56,7 +56,7 @@ def _score_document(text): return df.apply(_score_document) @batched - def keep_document(self, df): + def keep_document(self, df: pd.Series): return np.random.pareto(self._alpha, size=len(df)) > 1 - df def _load_model(self): @@ -82,7 +82,7 @@ def __init__(self, model_path=None, min_langid_score=0.3): dask.config.set({"dataframe.convert-string": False}) @batched - def score_document(self, df): + def score_document(self, df: pd.Series): model_attr = f"{self._name}_{self._model_path}" try: model = load_object_on_worker(model_attr, self._load_model, {}) diff --git a/nemo_curator/modifiers/__init__.py b/nemo_curator/modifiers/__init__.py index 4c05a31e7..f6511fdb0 100644 --- a/nemo_curator/modifiers/__init__.py +++ b/nemo_curator/modifiers/__init__.py @@ -15,6 +15,7 @@ from .c4 import BoilerPlateStringModifier from .doc_modifier import DocumentModifier from .fasttext import FastTextLabelModifier +from .pii_modifier import PiiModifier from .unicode_reformatter import UnicodeReformatter __all__ = [ @@ -22,4 +23,5 @@ "BoilerPlateStringModifier", "FastTextLabelModifier", "UnicodeReformatter", + "PiiModifier", ] diff --git a/nemo_curator/modifiers/pii_modifier.py b/nemo_curator/modifiers/pii_modifier.py index 23c713fbf..c2a398b48 100644 --- a/nemo_curator/modifiers/pii_modifier.py +++ b/nemo_curator/modifiers/pii_modifier.py @@ -85,8 +85,8 @@ def modify_document(self, text: pd.Series, partition_info: Dict = None): logging.error( f"Encountered error {str(e)} in partition {partition_info['number']}" ) - return pd.Series([True]) - output: pd.Series = pd.Series(output) + return pd.Series([True], index=text.index) + output: pd.Series = pd.Series(output, text.index) return output def load_deidentifier(self): diff --git a/tests/test_filters.py b/tests/test_filters.py index 50676f385..951c1977c 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -282,6 +282,23 @@ def test_score_type(self, letter_count_data): expected_scores == scores.compute() ), f"Expected {expected_scores} but got {scores}" + def test_chain_filter(self, letter_count_data): + letter_count_filter = LetterCountFilter(min_count=4) + length_filter = BatchedLengthFilter(min_length=8, max_length=11) + filters = Sequential( + [ + ScoreFilter(letter_count_filter, text_field="documents"), + ScoreFilter(length_filter, text_field="documents"), + ] + ) + filtered_data = filters(letter_count_data) + + 
expected_indices = [2] + expected_data = DocumentDataset(letter_count_data.df.loc[expected_indices]) + assert all_equal( + expected_data, filtered_data + ), f"Expected {expected_data} but got {filtered_data}" + class TestHeuristicFilters: def test_nonalpha(self): diff --git a/tests/test_pii_accuracy.py b/tests/test_pii_accuracy.py index 9431779a3..7e7d58663 100644 --- a/tests/test_pii_accuracy.py +++ b/tests/test_pii_accuracy.py @@ -16,9 +16,17 @@ import re from pathlib import Path +import pandas as pd import pytest +from dask import dataframe as dd +from dask.distributed import Client, LocalCluster +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset +from nemo_curator.filters import DocumentFilter +from nemo_curator.modifiers import PiiModifier from nemo_curator.pii.algorithm import PiiDeidentifier +from nemo_curator.utils.decorators import batched LOGGER = logging.getLogger(__name__) @@ -118,3 +126,63 @@ def test_batch_accuracy(self): match = all(compare_outputs(x, y) for x, y in zip(outputs, targets)) print("Matches:", "No" if not match else "Yes") assert match == True + + +class BatchedLengthFilter(DocumentFilter): + """ + Keeps documents of a given length + """ + + def __init__(self, min_length=5, max_length=10): + super().__init__() + self.min_length = min_length + self.max_length = max_length + + @batched + def score_document(self, df): + return df.str.len() + + @batched + def keep_document(self, scores): + min_threshold = self.min_length <= scores + max_threshold = scores <= self.max_length + return min_threshold & max_threshold + + +class TestPIIModule: + def test_filter_chain(self): + inputs = [ + "Alice goes on a walk", + "Bob goes on a walk", + "Someone named Charlie goes on a walk", + "A human walking is David", + "A human walking is Eliza", + ] + targets = [ + "***** goes on a walk", + "*** goes on a walk", + "A human walking is *****", + "A human walking is *****", + ] + input_df = pd.DataFrame({"text": inputs}) + target_df = pd.DataFrame({"text": targets}) + with LocalCluster(n_workers=1, threads_per_worker=1) as cluster: + with Client(cluster): + input_dataset = DocumentDataset(dd.from_pandas(input_df, npartitions=1)) + pipeline = nc.Sequential( + [ + nc.ScoreFilter( + BatchedLengthFilter(min_length=0, max_length=25) + ), + nc.Modify( + PiiModifier( + language="en", anonymize_action="mask", device="cpu" + ) + ), + ] + ) + output_dataset = pipeline(input_dataset) + + output_df = output_dataset.df.compute().reset_index(drop=True) + match = all(output_df["text"] == target_df["text"]) + assert match From 794a435c172577c31a54440a87f9b9236e5dc413 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 7 May 2024 14:46:28 -0700 Subject: [PATCH 13/34] Disable string conversion globally (#56) Signed-off-by: Ryan Wolf Signed-off-by: Nicole Luo --- config/fasttext_langid.yaml | 1 + nemo_curator/__init__.py | 8 ++++++++ nemo_curator/filters/classifier_filter.py | 5 ----- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/config/fasttext_langid.yaml b/config/fasttext_langid.yaml index 86b18761d..a1f4f3530 100644 --- a/config/fasttext_langid.yaml +++ b/config/fasttext_langid.yaml @@ -1,5 +1,6 @@ input_field: text filters: - name: nemo_curator.filters.classifier_filter.FastTextLangId + log_score: True params: model_path: diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py index 000e459a9..4645d55ef 100644 --- a/nemo_curator/__init__.py +++ b/nemo_curator/__init__.py @@ -12,4 +12,12 @@ # See the License for the specific language governing 
permissions and
 # limitations under the License.
 
+import dask
+
 from .modules import *
+
+# Dask will automatically convert the list score type
+# to a string without this option.
+# See https://github.com/NVIDIA/NeMo-Curator/issues/33
+# This also happens when reading and writing to files
+dask.config.set({"dataframe.convert-string": False})
diff --git a/nemo_curator/filters/classifier_filter.py b/nemo_curator/filters/classifier_filter.py
index 4f06c8b25..741df9640 100644
--- a/nemo_curator/filters/classifier_filter.py
+++ b/nemo_curator/filters/classifier_filter.py
@@ -76,11 +76,6 @@ def __init__(self, model_path=None, min_langid_score=0.3):
         self._cutoff = min_langid_score
         self._name = "lang_id"
 
-        # Dask will automatically convert the list score type
-        # to a string without this option.
-        # See https://github.com/NVIDIA/NeMo-Curator/issues/33
-        dask.config.set({"dataframe.convert-string": False})
-
     @batched
     def score_document(self, df: pd.Series):
         model_attr = f"{self._name}_{self._model_path}"

From 0f5a0298bda4aec735173d00dbcb765973be77de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Mart=C3=ADnez?= <26169771+miguelusque@users.noreply.github.com>
Date: Wed, 8 May 2024 18:01:56 +0200
Subject: [PATCH 14/34] Fix issue #43 (empty files creation) and improve
 reading/writing speed (#57)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit fixes issue #43 (empty files created when invoking the
reshard_jsonl method in nemo_curator.utils.file_utils.py) by checking the
size of each output file after it is written and deleting any file whose
size is zero.

In addition, there is no need to parse the content of each line into a JSON
object, since each line is already valid JSON. Removing that extra parsing
significantly speeds up this method.
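
A minimal sketch of the cleanup step described above (the helper name is
illustrative only; the actual change to reshard_jsonl appears in the diff
below):

    import os

    def remove_empty_shards(output_files):
        # Illustrative only: delete any output shard written with zero bytes.
        for path in output_files:
            if os.path.getsize(path) == 0:
                os.remove(path)
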
Signed-off-by: Miguel Martínez <26169771+miguelusque@users.noreply.github.com> Signed-off-by: Nicole Luo --- nemo_curator/utils/file_utils.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/nemo_curator/utils/file_utils.py b/nemo_curator/utils/file_utils.py index af3c2513d..3ec466b4c 100644 --- a/nemo_curator/utils/file_utils.py +++ b/nemo_curator/utils/file_utils.py @@ -181,9 +181,8 @@ def parse_str_of_num_bytes(s, return_str=False): def _save_jsonl(documents, output_path, start_index=0, max_index=10000, prefix=None): """Worker function to write out the data to jsonl files""" - def _output_json(document): - myjson = json.dumps(document, ensure_ascii=False) - return myjson.encode("utf-8") + def _encode_text(document): + return document.strip().encode("utf-8") def _name(start_index, npad, prefix, i): tag = str(start_index + i).rjust(npad, "0") @@ -195,11 +194,22 @@ def _name(start_index, npad, prefix, i): output_glob_string = os.path.join(output_path, "*.jsonl") - documents.map(_output_json).to_textfiles( + output_files = documents.map(_encode_text).to_textfiles( output_glob_string, name_function=name, ) + # Delete empty files generated due to empty partitions in the bag + for output_file in output_files: + try: + if os.path.getsize(output_file) == 0: + os.remove(output_file) + except Exception as exception: + print( + f"An exception occurred when trying to delete {output_file}.\n{exception}", + flush=True, + ) + def reshard_jsonl( input_dir, output_dir, output_file_size="100M", start_index=0, file_prefix="" @@ -212,7 +222,8 @@ def reshard_jsonl( output_dir: The output directory where the resharded jsonl files will be written output_file_size: Approximate size of output files. Must specify with a string and with the unit K, M or G for kilo, mega or gigabytes - start_index: Starting index for naming the output files + start_index: Starting index for naming the output files. Note: The indices may not + be continuous if the sharding process would output an empty file in its place file_prefix: Prefix to use to prepend to output file number """ @@ -222,7 +233,7 @@ def reshard_jsonl( input_files = list(get_all_files_paths_under(input_dir)) # Read in the dask bag - b = db.read_text(input_files, blocksize=blocksize).map(json.loads) + b = db.read_text(input_files, blocksize=blocksize) # Prepare the output output_dir = expand_outdir_and_mkdir(output_dir) From d4a2f0f1efb758e9c891bb26b7bb185a05c6bebd Mon Sep 17 00:00:00 2001 From: Mehran Maghoumi Date: Fri, 10 May 2024 10:25:40 -0700 Subject: [PATCH 15/34] [Tutorials] Add a tutorial for PEFT data curation (#45) This PR adds a new tutorial to demonstrate data curation for PEFT use-cases. 
Signed-off-by: Mehran Maghoumi Signed-off-by: Nicole Luo --- tutorials/peft-curation/README.md | 19 +++ tutorials/peft-curation/docbuilder.py | 113 ++++++++++++++++ tutorials/peft-curation/filters.py | 47 +++++++ tutorials/peft-curation/main.py | 179 ++++++++++++++++++++++++++ tutorials/peft-curation/modifiers.py | 68 ++++++++++ tutorials/tinystories/README.md | 2 +- tutorials/tinystories/main.py | 6 +- 7 files changed, 432 insertions(+), 2 deletions(-) create mode 100644 tutorials/peft-curation/README.md create mode 100644 tutorials/peft-curation/docbuilder.py create mode 100644 tutorials/peft-curation/filters.py create mode 100644 tutorials/peft-curation/main.py create mode 100644 tutorials/peft-curation/modifiers.py diff --git a/tutorials/peft-curation/README.md b/tutorials/peft-curation/README.md new file mode 100644 index 000000000..afa0d66a3 --- /dev/null +++ b/tutorials/peft-curation/README.md @@ -0,0 +1,19 @@ +# Curating Datasets for Parameter Efficient Fine-tuning + +This tutorial demonstrates the usage of NeMo Curator's Python API to curate a dataset for +parameter-efficient fine-tuning (PEFT). + +In this tutorial, we use the [Enron Emails dataset](https://huggingface.co/datasets/neelblabla/enron_labeled_emails_with_subjects-llama2-7b_finetuning), +which is a dataset of emails with corresponding classification labels for each email. Each email has +a subject, a body and a category (class label). We demonstrate various filtering and processing +operations that can be applied to each record. + +## Usage +After installing the NeMo Curator package, you can simply run the following command: +``` +python tutorials/peft-curation/main.py +``` + +By default, this tutorial will use at most 8 workers to run the curation pipeline. If you face any +out of memory issues, you can reduce the number of workers by supplying the `--n-workers=N` argument, +where `N` is the number of workers to spawn. diff --git a/tutorials/peft-curation/docbuilder.py b/tutorials/peft-curation/docbuilder.py new file mode 100644 index 000000000..3ae0840c9 --- /dev/null +++ b/tutorials/peft-curation/docbuilder.py @@ -0,0 +1,113 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import re +from typing import Dict + +import requests + +from nemo_curator.download.doc_builder import ( + DocumentDownloader, + DocumentExtractor, + DocumentIterator, +) + + +class EmailsDownloader(DocumentDownloader): + def __init__(self, download_dir: str): + super().__init__() + + if not os.path.isdir(download_dir): + os.makedirs(download_dir) + + self._download_dir = download_dir + print("Download directory: ", self._download_dir) + + def download(self, url: str) -> str: + filename = os.path.basename(url) + output_file = os.path.join(self._download_dir, filename) + + if os.path.exists(output_file): + print(f"File '{output_file}' already exists, skipping download.") + return output_file + + print(f"Downloading Enron emails dataset from '{url}'...") + response = requests.get(url) + + with open(output_file, "wb") as file: + file.write(response.content) + + return output_file + + +class EmailsIterator(DocumentIterator): + + def __init__(self): + super().__init__() + self._counter = -1 + self._extractor = EmailsExtractor() + # The regular expression pattern to extract each email. + self._pattern = re.compile(r"\".*?\"", re.DOTALL) + + def iterate(self, file_path): + self._counter = -1 + file_name = os.path.basename(file_path) + + with open(file_path, "r", encoding="utf-8") as file: + lines = file.readlines() + + # Ignore the first line which contains the header. + file_content = "".join(lines[1:]) + # Find all the emails in the file. + it = self._pattern.finditer(file_content) + + for email in it: + self._counter += 1 + content = email.group().strip('"').strip() + meta = { + "filename": file_name, + "id": f"email-{self._counter}", + } + extracted_content = self._extractor.extract(content) + + # Skip if no content extracted + if not extracted_content: + continue + + record = {**meta, **extracted_content} + yield record + + +class EmailsExtractor(DocumentExtractor): + def __init__(self): + super().__init__() + # The regular expression pattern to extract subject/body/label into groups. + self._pattern = re.compile( + r"Subject:: (.*?)\nBody:: (.*?)\n.*\[/INST\] (.*?) ", re.DOTALL + ) + + def extract(self, content: str) -> Dict[str, str]: + matches = self._pattern.findall(content) + + if not matches: + return None + + matches = matches[0] + + return { + "subject": matches[0].strip(), + "body": matches[1].strip(), + "category": matches[2].strip(), + } diff --git a/tutorials/peft-curation/filters.py b/tutorials/peft-curation/filters.py new file mode 100644 index 000000000..0ffcd5be7 --- /dev/null +++ b/tutorials/peft-curation/filters.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_curator.filters import DocumentFilter + + +class FilterEmailsWithLongBody(DocumentFilter): + """ + If the email is too long, discard. 
+ """ + + def __init__(self, max_length: int = 5000): + super().__init__() + self.max_length = max_length + + def score_document(self, text: str) -> bool: + return len(text) <= self.max_length + + def keep_document(self, score) -> bool: + return score + + +class FilterEmptyEmails(DocumentFilter): + """ + Detects empty emails (either empty body, or labeled as empty). Returns `True` for empty emails. + """ + + def score_document(self, text: str) -> bool: + return ( + not isinstance(text, str) # The text is not a string + or len(text.strip()) == 0 # The text is empty + or "Empty message" in text # The email is labeled as empty + ) + + def keep_document(self, score) -> bool: + return score diff --git a/tutorials/peft-curation/main.py b/tutorials/peft-curation/main.py new file mode 100644 index 000000000..9210d9f89 --- /dev/null +++ b/tutorials/peft-curation/main.py @@ -0,0 +1,179 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import os +from functools import partial +from typing import Any + +from docbuilder import EmailsDownloader, EmailsIterator +from filters import FilterEmailsWithLongBody, FilterEmptyEmails +from modifiers import AddPeriod, AddSystemPrompt + +from nemo_curator import ScoreFilter, Sequential +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modifiers.pii_modifier import PiiModifier +from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter +from nemo_curator.modules.modify import Modify +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import add_distributed_args + +SCRIPT_DIR_PATH = os.path.dirname(os.path.abspath(__file__)) +DATA_DIR = os.path.join(SCRIPT_DIR_PATH, "data") +DATASET_URL = "https://huggingface.co/datasets/neelblabla/enron_labeled_emails_with_subjects-llama2-7b_finetuning/raw/main/prompts_train.csv" + + +def download_and_convert_to_jsonl() -> str: + """ + Downloads the emails dataset and converts it to JSONL format. + + Returns: + str: The path to the JSONL file. + """ + + # Download the dataset in raw format and convert it to JSONL. + downloader = EmailsDownloader(DATA_DIR) + output_path = os.path.join(DATA_DIR, "emails.jsonl") + raw_fp = downloader.download(DATASET_URL) + + iterator = EmailsIterator() + + # Parse the raw data and write it to a JSONL file. + with open(output_path, "w") as f: + for record in iterator.iterate(raw_fp): + json_record = json.dumps(record, ensure_ascii=False) + f.write(json_record + "\n") + + return output_path + + +def redact_pii(dataset: DocumentDataset, text_field) -> DocumentDataset: + """ + Redacts personally identifiable information (PII) from a given dataset. + + Args: + dataset (DocumentDataset): The dataset containing documents with PII. + + Returns: + DocumentDataset: The redacted dataset with PII replaced by a generic value. 
+ """ + redactor = Modify( + PiiModifier( + supported_entities=[ + "ADDRESS", + "EMAIL_ADDRESS", + "LOCATION", + "PERSON", + "URL", + "PHONE_NUMBER", + ], + anonymize_action="replace", + device="cpu", + ), + text_field=text_field, + ) + return redactor(dataset) + + +def run_curation_pipeline(args: Any, jsonl_fp: str) -> str: + """ + Run the curation pipeline on the dataset. + + Args: + args (Any): Command-line arguments. + jsonl_fp (str): The path to the uncurated JSONL file. + + Returns: + str: The path to the curated JSONL file. + """ + client = get_client(args, args.device) + print(f" Running the curation pipeline on '{jsonl_fp}'...") + orig_dataset = DocumentDataset.read_json(jsonl_fp, add_filename=True) + dataset = orig_dataset + + redact_pii_subject = partial(redact_pii, text_field="subject") + redact_pii_body = partial(redact_pii, text_field="body") + + curation_steps = Sequential( + [ + # + # Unify the text encoding to Unicode. + # + Modify(UnicodeReformatter(), text_field="subject"), + Modify(UnicodeReformatter(), text_field="body"), + Modify(UnicodeReformatter(), text_field="category"), + # + # Filtering + # + # Filter out empty emails. + ScoreFilter( + FilterEmptyEmails(), text_field="subject", score_type=bool, invert=True + ), + ScoreFilter( + FilterEmptyEmails(), text_field="body", score_type=bool, invert=True + ), + ScoreFilter( + FilterEmptyEmails(), text_field="category", score_type=bool, invert=True + ), + # Filter out emails that are too long. + ScoreFilter(FilterEmailsWithLongBody(), text_field="body", score_type=bool), + # + # Redact personally identifiable information (PII). + # + redact_pii_subject, + redact_pii_body, + # + # Final modifications. + # + # Add system prompts to every email, which helps the model focus on the task. + Modify(AddSystemPrompt(), text_field="body"), + # Add a period to the end of each email category, which makes PEFT easier. + Modify(AddPeriod(), text_field="category"), + ] + ) + + dataset = curation_steps(dataset) + dataset = dataset.persist() + + print(f" Original dataset length: {len(orig_dataset.df)}") + print(f" After running the curation pipeline: {len(dataset.df)}") + print(f" Writing to '{jsonl_fp}'...") + out_path = os.path.join( + os.path.dirname(jsonl_fp), + "curated", + ) + os.makedirs(out_path, exist_ok=True) + dataset.to_json(out_path, write_to_filename=True) + client.close() + return os.path.join(out_path, os.path.basename(jsonl_fp)) + + +def main(): + parser = argparse.ArgumentParser() + parser = add_distributed_args(parser) + args = parser.parse_args() + # Limit the total number of workers to ensure we don't run out of memory. + args.n_workers = min(args.n_workers, 8) + + # Prepare the download and JSONL directories. + if not os.path.isdir(DATA_DIR): + os.makedirs(DATA_DIR) + + jsonl_fp = download_and_convert_to_jsonl() + run_curation_pipeline(args, jsonl_fp) + + +if __name__ == "__main__": + main() diff --git a/tutorials/peft-curation/modifiers.py b/tutorials/peft-curation/modifiers.py new file mode 100644 index 000000000..059036ee4 --- /dev/null +++ b/tutorials/peft-curation/modifiers.py @@ -0,0 +1,68 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_curator.modifiers import DocumentModifier + +# The system prompt template to be inserted into the documents. +SYS_PROMPT_TEMPLATE = """[INST] <> You are reviewing the contents of an email. Based on the content, please categorize this email into one of the following categories: +1. 'Company Business/Strategy.' +2. 'Purely Personal.' +3. 'Personal but in a professional context.' +4. 'Logistic Arrangements.' +5. 'Employment arrangements.' +6. 'Document editing/checking/collaboration.' +Please provide only one category (e.g., 'Purely Personal.'). <> + +Content:: +%s + +What should this email be categorized as? +[/INST] +Answer:: """ + + +class AddSystemPrompt(DocumentModifier): + """ + A simple modifier that adds system prompts to each document. + """ + + def modify_document(self, text: str) -> str: + """ + Inserts system prompts into the document. + + Args: + text (str): The text to be modified. + + Returns: + str: The modified text. + """ + return SYS_PROMPT_TEMPLATE % text + + +class AddPeriod(DocumentModifier): + """ + A simple modifier that adds a period to the end of each email category. + """ + + def modify_document(self, text: str) -> str: + """ + Adds a period to the end of each email category. + + Args: + text (str): The text to be modified. + + Returns: + str: The modified text. + """ + return text + "." diff --git a/tutorials/tinystories/README.md b/tutorials/tinystories/README.md index 47074cb3f..45bc3bf33 100644 --- a/tutorials/tinystories/README.md +++ b/tutorials/tinystories/README.md @@ -1,6 +1,6 @@ # TinyStories -This tutorial demonstrates the usage of NeMo Curator's Python API to curate the [TinyStories](https://arxiv.org/abs/2305.07759) dataset. TinyStories is a dataset of short stories generated by GPT-3.5 and GPT-4, featuring words that are undersood by 3 to 4-year olds. The small size of this dataset makes it ideal for creating and validating data curation pipelines on a local machine. +This tutorial demonstrates the usage of NeMo Curator's Python API to curate the [TinyStories](https://arxiv.org/abs/2305.07759) dataset. TinyStories is a dataset of short stories generated by GPT-3.5 and GPT-4, featuring words that are understood by 3 to 4-year olds. The small size of this dataset makes it ideal for creating and validating data curation pipelines on a local machine. For simplicity, this tutorial uses the validation split of this dataset, which contains around 22,000 samples. 
diff --git a/tutorials/tinystories/main.py b/tutorials/tinystories/main.py index fa4470c35..1fbbba35c 100644 --- a/tutorials/tinystories/main.py +++ b/tutorials/tinystories/main.py @@ -97,19 +97,23 @@ def filter_dataset(dataset: DocumentDataset) -> DocumentDataset: WordCountFilter(min_words=80), text_field="text", score_field="word_count", + score_type=int, ), - ScoreFilter(IncompleteStoryFilter(), text_field="text"), + ScoreFilter(IncompleteStoryFilter(), text_field="text", score_type=bool), ScoreFilter( RepeatingTopNGramsFilter(n=2, max_repeating_ngram_ratio=0.2), text_field="text", + score_type=float, ), ScoreFilter( RepeatingTopNGramsFilter(n=3, max_repeating_ngram_ratio=0.18), text_field="text", + score_type=float, ), ScoreFilter( RepeatingTopNGramsFilter(n=4, max_repeating_ngram_ratio=0.16), text_field="text", + score_type=float, ), ] ) From 8bea00b46c502285fa1db2fc005fb1f2fdde2808 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Mon, 13 May 2024 14:40:46 -0700 Subject: [PATCH 16/34] Only import PII constants during Curator import (#61) * Move PII constants to a seperate file that does not import presidio/spacy and other GPU dependencies Signed-off-by: Ayush Dattagupta * Add comment around import, move constant import to global scope Signed-off-by: Ayush Dattagupta --------- Signed-off-by: Ayush Dattagupta Signed-off-by: Nicole Luo --- nemo_curator/modifiers/pii_modifier.py | 4 ++-- nemo_curator/pii/algorithm.py | 26 +++++--------------------- nemo_curator/pii/constants.py | 20 ++++++++++++++++++++ tests/test_pii_accuracy.py | 1 - 4 files changed, 27 insertions(+), 24 deletions(-) create mode 100644 nemo_curator/pii/constants.py diff --git a/nemo_curator/modifiers/pii_modifier.py b/nemo_curator/modifiers/pii_modifier.py index c2a398b48..51ea5b6e2 100644 --- a/nemo_curator/modifiers/pii_modifier.py +++ b/nemo_curator/modifiers/pii_modifier.py @@ -17,7 +17,7 @@ import pandas as pd from nemo_curator.modifiers import DocumentModifier -from nemo_curator.pii.algorithm import DEFAULT_LANGUAGE +from nemo_curator.pii.constants import DEFAULT_LANGUAGE, DEFAULT_MAX_DOC_SIZE from nemo_curator.utils.decorators import batched from nemo_curator.utils.distributed_utils import load_object_on_worker @@ -97,7 +97,7 @@ def load_deidentifier(self): if self.device == "gpu": spacy.require_gpu() - from nemo_curator.pii.algorithm import DEFAULT_MAX_DOC_SIZE, PiiDeidentifier + from nemo_curator.pii.algorithm import PiiDeidentifier deidentifier: PiiDeidentifier = PiiDeidentifier( language=self.language, diff --git a/nemo_curator/pii/algorithm.py b/nemo_curator/pii/algorithm.py index 762214efb..2b5e16ed0 100644 --- a/nemo_curator/pii/algorithm.py +++ b/nemo_curator/pii/algorithm.py @@ -15,6 +15,10 @@ from pathlib import Path from typing import Any, List, Mapping, Union +# NOTE: Importing this module before cluster creation will create a primary CUDA context +# that leads to issues of all GPUs not being used when creating a cluster/client later on. +# Ensure that this module is always imported after cluster creation only when the algorithm +# needs to be executed. 
See: https://github.com/NVIDIA/NeMo-Curator/issues/64 import yaml from presidio_analyzer import AnalyzerEngine, RecognizerRegistry from presidio_analyzer.nlp_engine import NerModelConfiguration @@ -30,36 +34,16 @@ from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine from presidio_anonymizer.entities import OperatorConfig +from nemo_curator.pii.constants import DEFAULT_LANGUAGE, SUPPORTED_ENTITIES from nemo_curator.pii.custom_batch_analyzer_engine import CustomBatchAnalyzerEngine from nemo_curator.pii.custom_nlp_engine import CustomNlpEngine from nemo_curator.pii.recognizers.address_recognizer import AddressRecognizer __all__ = [ - "DEFAULT_LANGUAGE", - "SUPPORTED_ENTITIES", - "DEFAULT_MAX_DOC_SIZE", "PiiDeidentifier", ] -DEFAULT_LANGUAGE = "en" -SUPPORTED_ENTITIES = [ - "ADDRESS", - "CREDIT_CARD", - "EMAIL_ADDRESS", - "DATE_TIME", - "IP_ADDRESS", - "LOCATION", - "PERSON", - "URL", - "US_SSN", - "US_PASSPORT", - "US_DRIVER_LICENSE", - "PHONE_NUMBER", -] -DEFAULT_MAX_DOC_SIZE = 2000000 - - class PiiDeidentifier(object): """Cleans PII from an unstructured text""" diff --git a/nemo_curator/pii/constants.py b/nemo_curator/pii/constants.py new file mode 100644 index 000000000..fc8dcc545 --- /dev/null +++ b/nemo_curator/pii/constants.py @@ -0,0 +1,20 @@ +DEFAULT_LANGUAGE = "en" + +SUPPORTED_ENTITIES = [ + "ADDRESS", + "CREDIT_CARD", + "EMAIL_ADDRESS", + "DATE_TIME", + "IP_ADDRESS", + "LOCATION", + "PERSON", + "URL", + "US_SSN", + "US_PASSPORT", + "US_DRIVER_LICENSE", + "PHONE_NUMBER", +] + +DEFAULT_MAX_DOC_SIZE = 2000000 + +__all__ = ["DEFAULT_LANGUAGE", "SUPPORTED_ENTITIES", "DEFAULT_MAX_DOC_SIZE"] diff --git a/tests/test_pii_accuracy.py b/tests/test_pii_accuracy.py index 7e7d58663..850dafd54 100644 --- a/tests/test_pii_accuracy.py +++ b/tests/test_pii_accuracy.py @@ -17,7 +17,6 @@ from pathlib import Path import pandas as pd -import pytest from dask import dataframe as dd from dask.distributed import Client, LocalCluster From c66138a55839a2e2acef405f0a9c9a5582570974 Mon Sep 17 00:00:00 2001 From: Nicoel Luo Date: Wed, 15 May 2024 12:35:56 +0000 Subject: [PATCH 17/34] Deleting links Signed-off-by: Nicoel Luo Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index f0fada829..3868ebbff 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -21,9 +21,7 @@ "\n", "NeMo Curator team has perform ablation experiments using Common Crawl dataset to train a 357M GPT-style model to assess the effect of different curation stage on model performance. 
\n", "\n", - "![alt text](./image/zeroshot_ablations.png)\n", - "\n", - "For the latest NeMo Data Curator user guide, please refer to https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/index.html " + "![alt text](./image/zeroshot_ablations.png)\n" ] }, { From 148e1d494ac03a3390de5119f7eb3043f02adf54 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:21:42 +0800 Subject: [PATCH 18/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 3868ebbff..ce883dd34 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -5,7 +5,7 @@ "id": "1c1a4119", "metadata": {}, "source": [ - "# Nemo Curator pipeline example\n", + "# Nemo Curator Pipeline Example\n", "\n", "## NeMo Curator introduction\n", "The NeMo Curator is a Python library that consists of a collection of scalable data-mining modules for curating natural language processing (NLP) data for training large language models (LLMs). The modules within the NeMo Data Curator enable NLP researchers to mine high-quality text at scale from massive uncurated web corpora. \n", From 7e08c96daa5a6a16805b7eed76e42baa1bcda057 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:21:56 +0800 Subject: [PATCH 19/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index ce883dd34..0c1acdec2 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -7,7 +7,7 @@ "source": [ "# Nemo Curator Pipeline Example\n", "\n", - "## NeMo Curator introduction\n", + "## NeMo Curator Introduction\n", "The NeMo Curator is a Python library that consists of a collection of scalable data-mining modules for curating natural language processing (NLP) data for training large language models (LLMs). The modules within the NeMo Data Curator enable NLP researchers to mine high-quality text at scale from massive uncurated web corpora. 
\n", "\n", "NeMo Curator includes the following modules to perform data curation:\n", From 75f5dd7ec157d2723c3f8e095400089faa14af45 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:23:12 +0800 Subject: [PATCH 20/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 0c1acdec2..ba813d1bd 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -65,7 +65,7 @@ "\n", "**OS**: ubuntu 22.04\n", "\n", - "### Getting NeMo FrameWork Training Container\n", + "### Getting NeMo Framework Training Container\n", "- Get access to the container via https://developer.nvidia.com/nemo-framework\n", "- Set your docker credentials \n", " ```bash\n", From fcd82307a2fddda64788265f69b5f35bbe6eb4a5 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:23:20 +0800 Subject: [PATCH 21/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index ba813d1bd..44cb08b3f 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -2441,7 +2441,7 @@ "connected_component_output_path = os.path.join(connected_component_base_output_path, \"connected_components.parquet\")\n", "connected_component_cache_dir = os.path.join(connected_component_base_output_path, \"cache\")\n", "\n", - "#Relevant parameter\n", + "#Relevant parameters\n", "input_id_field = 'id'\n", "jaccard_threshold = 0.8\n", "\n", From 48af56133e5e02be92e8a65769e75f2503a49dc6 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:23:28 +0800 Subject: [PATCH 22/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 44cb08b3f..b53200123 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -2967,7 +2967,7 @@ "#Output\n", "dudped_output_dir = os.path.join(data_dir,\"remove_duplicate/result.parquet\")\n", "\n", - "#Relevant parameter\n", + "#Relevant parameters\n", "input_id_field = 'id'\n", "id_prefix = add_ID_id_prefix\n", "\n", From 49efc21066c547ef3b277e0caef725d6401bfdd5 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:23:41 +0800 
Subject: [PATCH 23/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index b53200123..7c3b7f1cb 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -286,7 +286,7 @@ "download_base_directory= os.path.join(data_dir,\"wiki_downloads\")\n", "download_output_directory = os.path.join(download_base_directory,\"data\")\n", "\n", - "#Relevant parameter\n", + "#Relevant parameters\n", "dump_date = \"20240201\"\n", "language = 'th'\n", "url_limit = 1" From 5826eb1051e0b63c84d514e039252c01dc850016 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:23:50 +0800 Subject: [PATCH 24/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 7c3b7f1cb..d07eb738f 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -45,7 +45,7 @@ "What is not included:\n", "1. Customized downloading\n", "2. Classifier filtering\n", - "3. Downstream-task deduplication\n", + "3. 
Downstream-task decontamination\n", "\n" ] }, From 30abf299edf69d7ef20a88fdf638bc8f02a83efd Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:24:00 +0800 Subject: [PATCH 25/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index d07eb738f..9b31c6753 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -73,7 +73,7 @@ "\n", " Username: $oauthtoken\n", " Password: \n", - "- Get NeMo NeMo FrameWork Training Container\n", + "- Get NeMo NeMo Framework Training Container\n", " ```bash\n", " docker pull nvcr.io/ea-bignlp/ga-participants/nemofw-training:24.01\n" ] From 43eae2717fb31441c03f00b93f5a7b5b66e46a33 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:24:11 +0800 Subject: [PATCH 26/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 9b31c6753..8b96f8e14 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -1513,7 +1513,7 @@ "3. Documents within the same bucket will be deemed similar. Since every document will be assigned `X` buckets and as long as two documents share 1 or more buckets they are deemed similar, the result of LSH will have more false positive as compared to false negative. The false positive cases will be filtered in following modules, namely jaccard compute.\n", "\n", "Arguments include:\n", - "- `minhash_length`:Length of minhash signature. Must bu consistent with `MinHash()`\n", + "- `minhash_length`:Length of minhash signature. 
Must be consistent with `MinHash()`\n", "- `num_buckets`: Number of buckets\n", "- `buckets_per_shuffle`: Number of buckets to shuffle concurrently\n", "- `id_field`: Key in input file for identifying document ID\n", From 87eefbdab23c1b37dc73f66aa7514a016ec6a2fa Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:24:37 +0800 Subject: [PATCH 27/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 8b96f8e14..a22fe8faa 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -1792,7 +1792,7 @@ "input_id_field = 'id'\n", "input_text_field = 'text'\n", "\n", - "#Relevant parameter for _Shuffle()\n", + "#Relevant parameters for _Shuffle()\n", "shuffle_id_fields=[\"dataset_id\", \"doc_id\"]\n", "int_to_str_id='id'\n", "\n", From 262d8e03a579e890119b621c0db0c2409f2d1655 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:25:03 +0800 Subject: [PATCH 28/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index a22fe8faa..4e8e21bc1 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -1783,7 +1783,7 @@ "input_anchor_docs_with_bk_dir = output_anchor_docs_with_bk_path\n", "output_shuffled_docs_path = os.path.join(jaccard_shuffle_base_output_path, \"shuffled_docs.parquet\")\n", "\n", - "#Relevant parameter for _MapBucket()\n", + "#Relevant parameters for _MapBucket()\n", "text_ddf_blocksize = 256\n", "bucket_mapping_ddf_blocksize = 256\n", "num_files = None\n", From 15db6f35bf0739c9e4e2bf45ea7532f18abfb539 Mon Sep 17 00:00:00 2001 From: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Date: Thu, 16 May 2024 10:25:21 +0800 Subject: [PATCH 29/34] Update tutorials/single_node_tutorial/single_gpu_tutorial.ipynb Co-authored-by: Ryan Wolf Signed-off-by: nicoleeeluo <157772168+nicoleeeluo@users.noreply.github.com> Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 4e8e21bc1..e839455f7 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -1731,7 +1731,7 @@ "### 5.3 Jaccard Shuffle\n", "In this section, we will be using `_MapBucket()` and `_Shuffle()`.\n", "\n", - "For `_MapBucket()`, it is designed to take input text data in .jsonl format and bucket information which is output of LSH, map the documents to their respective buckets, and write the 
resulting DataFrame containing the anchor documents and their associated bucket information to a Parquet file.Arguments include:\n", + "For `_MapBucket()`, it is designed to take input text data in jsonl format and bucket information which is output of LSH, map the documents to their respective buckets, and write the resulting DataFrame containing the anchor documents and their associated bucket information to a parquet file. Arguments include:\n", "- `id_field`: Key in input .jsonl file for identifying document ID\n", "- `text_field`: Key in input .jsonl file which contains document text.\n", "- `bucket_field`: Key in input _buckets.parquet which contains `bucket_id`.\n", From 84587b201da0c275c2b6aac2cf4dfb4050acb873 Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Fri, 17 May 2024 06:59:48 +0000 Subject: [PATCH 30/34] Fixed typo. Update content to lastest NeMo Curator version. Added fuzzy deduplication wrapper example Signed-off-by: Nicole Luo --- .../single_gpu_tutorial.ipynb | 1535 ++++++++++------- 1 file changed, 959 insertions(+), 576 deletions(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index e839455f7..006098375 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "1c1a4119", + "id": "e12a5ec6", "metadata": {}, "source": [ "# Nemo Curator Pipeline Example\n", @@ -18,6 +18,7 @@ "- Document-level deduplication\n", "- Multilingual downstream-task decontamination\n", "- Distributed Data Classification\n", + "- Personal identifiable information (PII) redaction\n", "\n", "NeMo Curator team has perform ablation experiments using Common Crawl dataset to train a 357M GPT-style model to assess the effect of different curation stage on model performance. \n", "\n", @@ -26,7 +27,7 @@ }, { "cell_type": "markdown", - "id": "be41377f", + "id": "58d062aa", "metadata": {}, "source": [ "## About this notebook\n", @@ -46,12 +47,14 @@ "1. Customized downloading\n", "2. Classifier filtering\n", "3. Downstream-task decontamination\n", + "4. Distributed data classification with PyTorch models\n", + "5. Personal identifiable information (PII) redaction\n", "\n" ] }, { "cell_type": "markdown", - "id": "8860c239", + "id": "a6e3492e", "metadata": {}, "source": [ "## Prerequisites\n", @@ -80,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "ff6bff1b", + "id": "01d4c35a", "metadata": {}, "source": [ "## 0. Env Setup" @@ -89,7 +92,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "24dce020", + "id": "8778a517", "metadata": {}, "outputs": [ { @@ -97,12 +100,7 @@ "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", - "Collecting jsonlines\n", - " Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)\n", - "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines) (23.2.0)\n", - "Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", - "Installing collected packages: jsonlines\n", - "Successfully installed jsonlines-4.0.0\n", + "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (2.0.0)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", @@ -116,8 +114,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "6831f331", + "execution_count": 1, + "id": "41d75988", "metadata": {}, "outputs": [], "source": [ @@ -126,8 +124,7 @@ "from nemo_curator.utils.distributed_utils import get_client,get_num_workers\n", "from nemo_curator.utils.script_utils import add_distributed_args\n", "from nemo_curator.utils.file_utils import get_all_files_paths_under, separate_by_metadata\n", - "from nemo_curator.utils.distributed_utils import read_data, write_to_disk\n", - "from nemo_curator.gpu_deduplication.utils import (create_logger, parse_nc_args, performance_report_if, enable_spilling)\n", + "from nemo_curator.utils.distributed_utils import read_data,write_to_disk\n", "from nemo_curator.datasets import DocumentDataset\n", "\n", "import os\n", @@ -136,28 +133,24 @@ "import time\n", "import cudf\n", "import dask_cudf\n", + "import dask\n", "import numpy as np\n", "from dask.distributed import Client, LocalCluster\n", - "import jsonlines" + "import jsonlines\n", + "\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "e28739b3", + "execution_count": 2, + "id": "0150b7e7", "metadata": {}, "outputs": [], "source": [ "def pre_imports():\n", " import cudf \n", "\n", - "def load_dataset(input_data_dir, file_type='jsonl'):\n", - " files = list(get_all_files_paths_under(input_data_dir))\n", - " raw_data = read_data(files, file_type=file_type, backend=\"pandas\", add_filename=True)\n", - " dataset = DocumentDataset(raw_data)\n", - "\n", - " return dataset\n", - "\n", "def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)):\n", " return add_distributed_args(parser)\n", "\n", @@ -179,8 +172,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "d279329f", + "execution_count": 3, + "id": "3d7e6547", "metadata": {}, "outputs": [ { @@ -199,7 +192,7 @@ }, { "cell_type": "markdown", - "id": "f3f452a3", + "id": "cf0aea31", "metadata": {}, "source": [ "## 1. Download\n", @@ -240,8 +233,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "1773cda2", + "execution_count": 4, + "id": "f41df88e", "metadata": {}, "outputs": [], "source": [ @@ -250,7 +243,7 @@ }, { "cell_type": "markdown", - "id": "d711a8f8", + "id": "b0f2d6d9", "metadata": {}, "source": [ " Start a CPU based Dask cluster. Please modify `n_workers` and `memory_limit` according to your hardware specification. 
To process TH wikipedia data, it's advised to have `memory_limit` greater than 12GB" @@ -258,8 +251,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "56ec66e0", + "execution_count": 5, + "id": "8742c111", "metadata": {}, "outputs": [], "source": [ @@ -269,7 +262,7 @@ }, { "cell_type": "markdown", - "id": "f794b51c", + "id": "f910ae71", "metadata": {}, "source": [ "Define parameters" @@ -278,7 +271,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "a90f3505", + "id": "c55bcfa8", "metadata": {}, "outputs": [], "source": [ @@ -294,7 +287,7 @@ }, { "cell_type": "markdown", - "id": "5628356b", + "id": "b11fdf43", "metadata": {}, "source": [ "Download TH wikipedia data" @@ -303,7 +296,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b591b9f2", + "id": "ff615514", "metadata": {}, "outputs": [], "source": [ @@ -315,7 +308,7 @@ }, { "cell_type": "markdown", - "id": "2aae29dd", + "id": "ff7ae4c0", "metadata": {}, "source": [ "Verify result" @@ -323,8 +316,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "169fadb9", + "execution_count": 26, + "id": "98564093", "metadata": {}, "outputs": [ { @@ -343,8 +336,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "f2bcb168", + "execution_count": 27, + "id": "ded3510b", "metadata": {}, "outputs": [ { @@ -362,7 +355,7 @@ }, { "cell_type": "markdown", - "id": "44fa2d13", + "id": "79b4a804", "metadata": {}, "source": [ "**[Optional]**Close the Dask cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again." @@ -370,8 +363,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "590c489c", + "execution_count": 28, + "id": "f1e8f645", "metadata": {}, "outputs": [], "source": [ @@ -381,17 +374,15 @@ }, { "cell_type": "markdown", - "id": "5ba566fc", + "id": "4db3267a", "metadata": {}, "source": [ - "## 2.Language separation and unicode fixing\n", - "\n", - "**Note**: In order to be run on interactive python. Please comment `from.code import *` and the related imports in `./nemo_curator/filters/__init__.py`" + "## 2.Language separation and unicode fixing" ] }, { "cell_type": "markdown", - "id": "f742b881", + "id": "228e3978", "metadata": {}, "source": [ "In this section, we will be using a language classification model by fasttext to separate the TH wikipedia dataset based on the document major languages, and we will also fix the unicode in the documents. Detailed steps are:\n", @@ -406,8 +397,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "71a6e4a2", + "execution_count": 7, + "id": "bd5d6920", "metadata": {}, "outputs": [], "source": [ @@ -418,16 +409,16 @@ }, { "cell_type": "markdown", - "id": "4916079c", + "id": "bd2923bb", "metadata": {}, "source": [ - "**[Optional]**8Start a cpu based Dask cluster." + "**[Optional]** Start a cpu based Dask cluster." 
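As a quick illustration of what the `FastTextLangId` filter is doing under the hood, the sketch below queries the same `lid.176.bin` model directly through the `fasttext` package. This is only a simplified example of the language-identification step, not the NeMo Curator implementation; the sample sentence is made up, and the model file is assumed to be the one downloaded in this section.

```python
# Minimal sketch: score one text with the fastText language-ID model.
# Assumes the `fasttext` package is installed and lid.176.bin sits in the
# current directory (the same model file downloaded in this section).
import fasttext

lid_model = fasttext.load_model("lid.176.bin")

sample = "สวัสดีครับ ยินดีต้อนรับสู่วิกิพีเดียภาษาไทย"  # hypothetical Thai sentence
labels, scores = lid_model.predict(sample.replace("\n", " "), k=1)

# Labels look like '__label__th'; strip the prefix and upper-case it to mimic
# the 'TH'/'EN' folder names produced by the language separation step.
language = labels[0].replace("__label__", "").upper()
print(language, float(scores[0]))
```

Documents whose top label is not `TH` are the ones that end up in the other language folders (for example `EN`) after the separation step runs.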
] }, { "cell_type": "code", - "execution_count": 17, - "id": "23a63375", + "execution_count": 8, + "id": "4375c02b", "metadata": {}, "outputs": [], "source": [ @@ -437,7 +428,7 @@ }, { "cell_type": "markdown", - "id": "957d7357", + "id": "2f834de0", "metadata": {}, "source": [ "Define parameters" @@ -445,13 +436,13 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "6270de3f", + "execution_count": 9, + "id": "3b3856c6", "metadata": {}, "outputs": [], "source": [ "# Input path\n", - "multilingual_data_path = download_output_directory\n", + "multilingual_data_path = f\"{download_output_directory}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\"\n", "\n", "# Output path\n", "language_base_output_path = os.path.join(data_dir,\"language_sep\")\n", @@ -471,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "598cff2d", + "id": "3b6f887f", "metadata": {}, "source": [ "Download fasttext model" @@ -479,24 +470,24 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "0c7cc007", + "execution_count": 10, + "id": "218c955e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2024-03-22 08:40:55-- https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin\n", - "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.227.74.12, 13.227.74.118, 13.227.74.9, ...\n", - "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.227.74.12|:443... connected.\n", + "--2024-05-17 03:17:09-- https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin\n", + "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 99.84.238.181, 99.84.238.154, 99.84.238.162, ...\n", + "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|99.84.238.181|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 131266198 (125M) [application/octet-stream]\n", - "Saving to: ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin’\n", + "Saving to: ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin.1’\n", "\n", - "lid.176.bin 100%[===================>] 125.18M 220MB/s in 0.6s \n", + "lid.176.bin.1 100%[===================>] 125.18M 184MB/s in 0.7s \n", "\n", - "2024-03-22 08:40:56 (220 MB/s) - ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin’ saved [131266198/131266198]\n", + "2024-05-17 03:17:10 (184 MB/s) - ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin.1’ saved [131266198/131266198]\n", "\n" ] } @@ -507,7 +498,7 @@ }, { "cell_type": "markdown", - "id": "d875771b", + "id": "c410253e", "metadata": {}, "source": [ "Apply fasttext model to separate documents by their languages" @@ -515,8 +506,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "id": "c959800c", + "execution_count": 11, + "id": "c9afe965", "metadata": {}, "outputs": [ { @@ -537,7 +528,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken for splitting language:147.80864667892456\n" + "Time taken for splitting language:140.04064464569092\n" ] } ], @@ -545,7 +536,7 @@ "t0 = time.time()\n", "\n", "# Load dataset \n", - "multilingual_dataset = load_dataset(multilingual_data_path)\n", + "multilingual_dataset = DocumentDataset.read_json(multilingual_data_path,add_filename=True)\n", "\n", "#Define Language separation pipeline\n", "lang_filter = FastTextLangId(os.path.join(model_path,'lid.176.bin'))\n", @@ -563,7 +554,7 @@ }, { "cell_type": "markdown", - "id": "bd54a24a", + "id": "31917e7b", "metadata": {}, "source": [ "Load `UnicodeReformatter` to reformat any unicode appeared in the desired language dataset" @@ -571,8 +562,8 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "0c09bc28", + "execution_count": 12, + "id": "55da5f12", "metadata": {}, "outputs": [ { @@ -581,7 +572,7 @@ "text": [ "Reading 1 files\n", "Writing to disk complete for 1 partitions\n", - "Time taken for fixing unicode:444.5816135406494\n" + "Time taken for fixing unicode:437.4811737537384\n" ] } ], @@ -590,20 +581,20 @@ "\n", "# Read the language specific data and fix the unicode in it\n", "lang_data_path = os.path.join(language_separated_output_path, target_language)\n", - "lang_data = load_dataset(lang_data_path)\n", + "lang_data = DocumentDataset.read_json(lang_data_path,add_filename=True)\n", "\n", "cleaner = Modify(UnicodeReformatter())\n", "cleaned_data = cleaner(lang_data)\n", "\n", "# Write the cleaned_data\n", - "write_to_disk(cleaned_data.df, lang_sep_cleaned_data_output_path, write_to_filename=True, output_type='jsonl')\n", + "cleaned_data.to_json(lang_sep_cleaned_data_output_path, write_to_filename=True)\n", "\n", "print(f\"Time taken for fixing unicode:{time.time()-t0}\")" ] }, { "cell_type": "markdown", - "id": "00c6e5a1", + "id": "bc214e82", "metadata": {}, "source": [ "Verify the result. We can see that some documents has been removed from TH wikipedia dataset since the number of lines in this output file is less than the original file (no. 
of lines = 162164)" @@ -611,8 +602,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "id": "b2b34d46", + "execution_count": 13, + "id": "6b6eb634", "metadata": {}, "outputs": [ { @@ -631,7 +622,7 @@ }, { "cell_type": "markdown", - "id": "39d539a2", + "id": "57e22770", "metadata": {}, "source": [ "Furthur verify by loading documents that has been identified as other language, such as 'EN'. We can see from output that the removed document is indeed in English and contains very little or even no Thai." @@ -639,26 +630,35 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "5ace3c5b", + "execution_count": 38, + "id": "79e32205", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\",\"id\":\"1\",\"language\":\"TH\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\"}\n", + "\n" + ] + } + ], "source": [ "check_jsonl_file(os.path.join(language_separated_output_path,'EN'))" ] }, { "cell_type": "markdown", - "id": "9b817bf7", + "id": "39020971", "metadata": {}, "source": [ - "**[Optional]**Close the Dask cluster." + "**[Optional]** Close the Dask cluster." ] }, { "cell_type": "code", - "execution_count": 153, - "id": "bf05b6c2", + "execution_count": 37, + "id": "64da23ec", "metadata": {}, "outputs": [], "source": [ @@ -668,20 +668,20 @@ }, { "cell_type": "markdown", - "id": "cc8b6aef", + "id": "6134eaf3", "metadata": {}, "source": [ "## 3.Add ID\n", - "TH wikipedia data do have `id` field, but the `id` field contains number only. It will be better if we unified the `id` field and transform it to the format of `_`. In this way, when handling multiple dataset, we will able to know which document from which dataset has been removed. This `id` will be useful when we are running deduplication and heuristic filtering. The function we will be using is `AddID()`. Arguments for this function include:\n", + "TH wikipedia data do have `id` field, but the `id` field contains number only. It will be better if we unified the `id` field and transform it to the format of `_`. In this way, when handling multiple dataset, we will be able to know which document from which dataset has been removed. This `id` will be useful when we are running deduplication and heuristic filtering. The function we will be using is `AddID()`. Arguments for this function include:\n", "- `id_field`: fields will be added to input .json file. If the key already exists in the .jsonl, it's value will be replaced.\n", - "- `id_prefix`: prefix used in ID. Default is 'doc-id'\n", - "- `start_index`: starting index in ID. Default is 0" + "- `id_prefix`: prefix used in ID. Default is 'doc_id'\n", + "- `start_index`: starting index in ID. Default is None. 
When set to None, an unordered ID scheme will be used for fast calculation. In this notebook, it's set to 0 for easier reference." ] }, { "cell_type": "code", - "execution_count": 24, - "id": "fe9e6eef", + "execution_count": 14, + "id": "5bed2e25", "metadata": {}, "outputs": [], "source": [ @@ -690,16 +690,16 @@ }, { "cell_type": "markdown", - "id": "232c01a5", + "id": "be1c546b", "metadata": {}, "source": [ - "**[Optional]**If there is no running Dask cluster, start CPU based Dask cluster." + "**[Optional]** If there is no running Dask cluster, start CPU based Dask cluster." ] }, { "cell_type": "code", - "execution_count": 155, - "id": "f3f483eb", + "execution_count": 15, + "id": "3a6349d9", "metadata": {}, "outputs": [], "source": [ @@ -709,7 +709,7 @@ }, { "cell_type": "markdown", - "id": "2be65a51", + "id": "503bfa4c", "metadata": {}, "source": [ "Define relevant parameters" @@ -717,8 +717,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "id": "054019a5", + "execution_count": 16, + "id": "a14c6ba3", "metadata": {}, "outputs": [], "source": [ @@ -734,7 +734,7 @@ }, { "cell_type": "markdown", - "id": "80f9591c", + "id": "b249dcf9", "metadata": {}, "source": [ "Adding ID to dataset" @@ -742,8 +742,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "id": "e8fd7e09", + "execution_count": 17, + "id": "d12bb962", "metadata": {}, "outputs": [ { @@ -752,28 +752,28 @@ "text": [ "Reading 1 files\n", "Writing to disk complete for 1 partitions\n", - "Time taken for add ID:56.01176333427429\n" + "Time taken for add ID:47.33783745765686\n" ] } ], "source": [ "t0 = time.time()\n", "# Read input files\n", - "dataset = load_dataset(add_id_input_data_dir)\n", + "dataset = DocumentDataset.read_json(add_id_input_data_dir,add_filename=True)\n", "\n", "# Run AddID() on the input dataset\n", "add_id = AddId(id_field='id',id_prefix=add_ID_id_prefix,start_index=0)\n", "id_dataset = add_id(dataset)\n", "\n", "#Output files\n", - "write_to_disk(id_dataset.df, output_file_dir=added_id_output_path, write_to_filename=True, output_type='jsonl')\n", + "id_dataset.to_json(added_id_output_path, write_to_filename=True)\n", "\n", "print(f\"Time taken for add ID:{time.time()-t0}\")" ] }, { "cell_type": "markdown", - "id": "50016a50", + "id": "ce2934df", "metadata": {}, "source": [ "Verify the result. From the output, we can see that the `id` value has been changed to `TH_wiki-0000000000` " @@ -781,8 +781,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "id": "27a634e9", + "execution_count": 18, + "id": "cd51cd14", "metadata": {}, "outputs": [ { @@ -800,7 +800,7 @@ }, { "cell_type": "markdown", - "id": "e7084fed", + "id": "f249ab8b", "metadata": {}, "source": [ "Close Dask cluster. This cell needs to be run as we are starting a new GPU Dask cluster in the following task" @@ -808,8 +808,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "id": "16399469", + "execution_count": 20, + "id": "62336143", "metadata": {}, "outputs": [], "source": [ @@ -819,7 +819,7 @@ }, { "cell_type": "markdown", - "id": "cb227709", + "id": "d6fb16b1", "metadata": {}, "source": [ "## 4.Exact Dedplication\n", @@ -835,8 +835,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "id": "8fa6c3af", + "execution_count": 21, + "id": "044f7eee", "metadata": {}, "outputs": [], "source": [ @@ -845,7 +845,7 @@ }, { "cell_type": "markdown", - "id": "aa70fd06", + "id": "6e5da88e", "metadata": {}, "source": [ "Start a GPU based Dask cluster. 
Since GPU based Dask cluster involves setting several arguments, we will use the `get_client()` wrapper function to quickly set up. Please make sure the `device` in `args` is `gpu`" @@ -853,17 +853,17 @@ }, { "cell_type": "code", - "execution_count": 31, - "id": "7e9530f6", + "execution_count": 22, + "id": "e4d6920d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Namespace(scheduler_address=None, scheduler_file=None, n_workers=20, threads_per_worker=1, rmm_pool_size=None, protocol='tcp', nvlink_only=False, files_per_partition=2, num_files=-1, device='gpu', set_torch_to_use_rmm=False)" + "Namespace(scheduler_address=None, scheduler_file=None, n_workers=20, threads_per_worker=1, rmm_pool_size=None, protocol='tcp', nvlink_only=False, files_per_partition=2, num_files=None, device='gpu', set_torch_to_use_rmm=False)" ] }, - "execution_count": 31, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -878,8 +878,8 @@ }, { "cell_type": "code", - "execution_count": 32, - "id": "f71ab145", + "execution_count": 23, + "id": "717b6cef", "metadata": {}, "outputs": [ { @@ -892,10 +892,10 @@ { "data": { "text/plain": [ - "{'tcp://127.0.0.1:37795': None}" + "{'tcp://127.0.0.1:42505': None}" ] }, - "execution_count": 32, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -908,7 +908,7 @@ }, { "cell_type": "markdown", - "id": "4ef57149", + "id": "f267e161", "metadata": {}, "source": [ "Define parameters" @@ -916,8 +916,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "id": "26e6927e", + "execution_count": 24, + "id": "d01e2f08", "metadata": {}, "outputs": [], "source": [ @@ -936,8 +936,8 @@ }, { "cell_type": "code", - "execution_count": 34, - "id": "b9a75a74", + "execution_count": 25, + "id": "6395ffde", "metadata": {}, "outputs": [], "source": [ @@ -947,7 +947,7 @@ }, { "cell_type": "markdown", - "id": "a9fc0bd2", + "id": "a654a16e", "metadata": {}, "source": [ "Apply exact deduplication" @@ -955,17 +955,31 @@ }, { "cell_type": "code", - "execution_count": 35, - "id": "daf8f324", + "execution_count": 26, + "id": "a5e0117c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Reading 1 files\n", + "Reading 1 files\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/exact_dedup.py:158: UserWarning: Output path f/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/exact_dedup/data/_exact_duplicates.parquet already exists and will be overwritten\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Number of exact duplicated file:53\n", - "Time taken for exact duplicate:3.0404415130615234\n" + "Time taken for exact duplicate:1.9629592895507812\n" ] } ], @@ -991,7 +1005,7 @@ }, { "cell_type": "markdown", - "id": "517c60e4", + "id": "7f8bdb88", "metadata": {}, "source": [ "Verify the output duplicated ID. We can group by the `_hashes` to get the list of duplicated documents having the same _hashes and use `extract_lines_with_id()` to verify that those documents are indeed exact duplicates. 
Please note that the `id` might changes, therefore, please replace the `target_list` when necessary" @@ -999,8 +1013,8 @@ }, { "cell_type": "code", - "execution_count": 36, - "id": "2f3c67f8", + "execution_count": 27, + "id": "e045d65a", "metadata": {}, "outputs": [ { @@ -1038,27 +1052,27 @@ " \n", " \n", " 0\n", - " TH_wiki-0000021211\n", + " TH_wiki-0000021096\n", " 1708cb56ec582f78716f0864dca9382d\n", " \n", " \n", " 1\n", - " TH_wiki-0000021213\n", + " TH_wiki-0000021100\n", " 1708cb56ec582f78716f0864dca9382d\n", " \n", " \n", " 2\n", - " TH_wiki-0000105191\n", - " e77a248506ef16737288fae5759db33a\n", + " TH_wiki-0000067251\n", + " edf8af427a33ed94150899970f39770f\n", " \n", " \n", " 3\n", - " TH_wiki-0000105192\n", - " 2e386f5c3af70f43874618988d4842b2\n", + " TH_wiki-0000105191\n", + " e77a248506ef16737288fae5759db33a\n", " \n", " \n", " 4\n", - " TH_wiki-0000105193\n", + " TH_wiki-0000105192\n", " 2e386f5c3af70f43874618988d4842b2\n", " \n", " \n", @@ -1067,14 +1081,14 @@ ], "text/plain": [ " id _hashes\n", - "0 TH_wiki-0000021211 1708cb56ec582f78716f0864dca9382d\n", - "1 TH_wiki-0000021213 1708cb56ec582f78716f0864dca9382d\n", - "2 TH_wiki-0000105191 e77a248506ef16737288fae5759db33a\n", - "3 TH_wiki-0000105192 2e386f5c3af70f43874618988d4842b2\n", - "4 TH_wiki-0000105193 2e386f5c3af70f43874618988d4842b2" + "0 TH_wiki-0000021096 1708cb56ec582f78716f0864dca9382d\n", + "1 TH_wiki-0000021100 1708cb56ec582f78716f0864dca9382d\n", + "2 TH_wiki-0000067251 edf8af427a33ed94150899970f39770f\n", + "3 TH_wiki-0000105191 e77a248506ef16737288fae5759db33a\n", + "4 TH_wiki-0000105192 2e386f5c3af70f43874618988d4842b2" ] }, - "execution_count": 36, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1087,8 +1101,8 @@ }, { "cell_type": "code", - "execution_count": 37, - "id": "7ed7d4de", + "execution_count": 28, + "id": "8754887e", "metadata": {}, "outputs": [ { @@ -1125,12 +1139,12 @@ " \n", " 1\n", " 15f35c239b6579b4642f7656e64576ac\n", - " TH_wiki-0000098621 TH_wiki-0000074714 TH_wiki-...\n", + " TH_wiki-0000074714 TH_wiki-0000074715 TH_wiki-...\n", " \n", " \n", " 2\n", " 1708cb56ec582f78716f0864dca9382d\n", - " TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-...\n", + " TH_wiki-0000021096 TH_wiki-0000021100 TH_wiki-...\n", " \n", " \n", " 3\n", @@ -1156,13 +1170,13 @@ "\n", " id \n", "0 TH_wiki-0000157216 TH_wiki-0000066307 \n", - "1 TH_wiki-0000098621 TH_wiki-0000074714 TH_wiki-... \n", - "2 TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-... \n", + "1 TH_wiki-0000074714 TH_wiki-0000074715 TH_wiki-... \n", + "2 TH_wiki-0000021096 TH_wiki-0000021100 TH_wiki-... \n", "3 TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-... \n", "4 TH_wiki-0000122055 TH_wiki-0000116550 " ] }, - "execution_count": 37, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1173,8 +1187,8 @@ }, { "cell_type": "code", - "execution_count": 39, - "id": "3051ed4b", + "execution_count": 29, + "id": "13712668", "metadata": {}, "outputs": [ { @@ -1194,7 +1208,7 @@ }, { "cell_type": "markdown", - "id": "ec31440b", + "id": "7a388445", "metadata": {}, "source": [ "**[Optional]** You might choose to close Dask cluster here" @@ -1202,8 +1216,8 @@ }, { "cell_type": "code", - "execution_count": 89, - "id": "2ee05303", + "execution_count": 31, + "id": "7875bf12", "metadata": {}, "outputs": [], "source": [ @@ -1213,7 +1227,7 @@ }, { "cell_type": "markdown", - "id": "710e8540", + "id": "20502f76", "metadata": {}, "source": [ "## 5. Fuzzy Deduplication\n", @@ -1238,12 +1252,14 @@ "2. 
Bucket computation\n", "3. Jaccard shuffle for load balancing in a distributed system\n", "4. Jaccard similarity computation\n", - "5. Connected component " + "5. Connected component \n", + "\n", + "In this section, we will firstly provide examples to each sub-steps for users to have a better understanding on what is going on under the hood. At the last sub section, we will provide example for the fuzzy deduplication wrapper." ] }, { "cell_type": "markdown", - "id": "c4b99c5e", + "id": "de98daed", "metadata": {}, "source": [ "**If there is not running Dask cluster, start a GPU Dask cluster here**" @@ -1251,17 +1267,17 @@ }, { "cell_type": "code", - "execution_count": 90, - "id": "115ff2dc", + "execution_count": 60, + "id": "0a84ae27", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'tcp://127.0.0.1:33223': None}" + "{'tcp://127.0.0.1:43209': None}" ] }, - "execution_count": 90, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } @@ -1279,7 +1295,7 @@ }, { "cell_type": "markdown", - "id": "1979977d", + "id": "5de7a035", "metadata": {}, "source": [ "### 5.1 Minhash\n", @@ -1303,8 +1319,8 @@ }, { "cell_type": "code", - "execution_count": 40, - "id": "f9b2a642", + "execution_count": 30, + "id": "bbc84690", "metadata": {}, "outputs": [], "source": [ @@ -1313,7 +1329,7 @@ }, { "cell_type": "markdown", - "id": "4c152974", + "id": "3b0beafe", "metadata": {}, "source": [ "Define parameters" @@ -1321,8 +1337,8 @@ }, { "cell_type": "code", - "execution_count": 41, - "id": "117a569d", + "execution_count": 31, + "id": "52f056f7", "metadata": {}, "outputs": [], "source": [ @@ -1350,7 +1366,7 @@ }, { "cell_type": "markdown", - "id": "73c1ad41", + "id": "aaefe7bd", "metadata": {}, "source": [ "Run MinHash" @@ -1358,8 +1374,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "id": "a17954eb", + "execution_count": 32, + "id": "da632a42", "metadata": {}, "outputs": [ { @@ -1367,19 +1383,23 @@ "output_type": "stream", "text": [ "Computing minhashes for /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/add_id/cleaned\n", - "Reading 1 files\n", - "Time taken for MinHash:7.543871879577637\n" + "Reading 1 files\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:175: UserWarning: Output path /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/fuzzy/minhash/data/_minhashes.parquet already exists and will be overwritten\n", " warnings.warn(\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken for MinHash:5.899524927139282\n" + ] } ], "source": [ @@ -1415,7 +1435,7 @@ }, { "cell_type": "markdown", - "id": "19cddba5", + "id": "9ad4ba59", "metadata": {}, "source": [ "Verify result" @@ -1423,8 +1443,8 @@ }, { "cell_type": "code", - "execution_count": 45, - "id": "df83eec5", + "execution_count": 33, + "id": "93220b5c", "metadata": {}, "outputs": [ { @@ -1491,7 +1511,7 @@ "4 TH_wiki-0000000004 [1559901, 11771639, 487706, 826569, 1203860, 5..." 
] }, - "execution_count": 45, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -1503,7 +1523,7 @@ }, { "cell_type": "markdown", - "id": "998ab08a", + "id": "b407928e", "metadata": {}, "source": [ "### 5.2 LSH\n", @@ -1524,28 +1544,28 @@ }, { "cell_type": "code", - "execution_count": 46, - "id": "138544a5", + "execution_count": 34, + "id": "f3801d7a", "metadata": {}, "outputs": [], "source": [ "from nemo_curator import LSH\n", - "from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import \\\n", + "from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import \\\n", " convert_str_id_to_int" ] }, { "cell_type": "markdown", - "id": "178fd0e4", + "id": "2a2c178a", "metadata": {}, "source": [ - "Define parameter" + "Define parameters" ] }, { "cell_type": "code", - "execution_count": 47, - "id": "21d2a261", + "execution_count": 35, + "id": "d52707b9", "metadata": {}, "outputs": [], "source": [ @@ -1570,7 +1590,7 @@ }, { "cell_type": "markdown", - "id": "a18708d2", + "id": "c59b4fe6", "metadata": {}, "source": [ "Run LSH" @@ -1578,17 +1598,15 @@ }, { "cell_type": "code", - "execution_count": 48, - "id": "9eebeb10", + "execution_count": 36, + "id": "71c0848f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:361: UserWarning: Output path /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/fuzzy/lsh/data/_buckets.parquet already exists and will be overwritten\n", " warnings.warn(\n" ] }, @@ -1596,7 +1614,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken for LSH:20.533941984176636\n" + "Time taken for LSH:18.237318754196167\n" ] } ], @@ -1616,7 +1634,7 @@ "#Run LSH()\n", "lsh = LSH(\n", " cache_dir=lsh_output_dir,\n", - " minhash_length=minhash_length,\n", + " num_hashes=minhash_length,\n", " num_buckets=num_bands,\n", " buckets_per_shuffle=buckets_per_shuffle,\n", " id_fields=[\"dataset_id\", \"doc_id\"],\n", @@ -1631,7 +1649,7 @@ }, { "cell_type": "markdown", - "id": "813603e2", + "id": "3789c538", "metadata": {}, "source": [ "Verify result" @@ -1639,8 +1657,8 @@ }, { "cell_type": "code", - "execution_count": 49, - "id": "c47da6b9", + "execution_count": 37, + "id": "d8663302", "metadata": {}, "outputs": [ { @@ -1673,32 +1691,32 @@ " \n", " 0\n", " 1692361878\n", - " 124692\n", - " 96\n", + " 124883\n", + " 38\n", " \n", " \n", " 1\n", " 1692361878\n", - " 85282\n", - " 385\n", + " 123211\n", + " 141\n", " \n", " \n", " 2\n", " 1692361878\n", - " 156638\n", - " 529\n", + " 124885\n", + " 38\n", " \n", " \n", " 3\n", " 1692361878\n", - " 160566\n", - " 540\n", + " 85294\n", + " 345\n", " \n", " \n", " 4\n", " 1692361878\n", - " 160567\n", - " 540\n", + " 124886\n", + " 38\n", " \n", " \n", "\n", @@ -1706,14 +1724,14 @@ ], "text/plain": [ " dataset_id doc_id _bucket_id\n", - "0 1692361878 124692 96\n", - "1 1692361878 85282 385\n", - "2 1692361878 156638 529\n", - "3 1692361878 160566 540\n", - "4 1692361878 160567 540" + "0 1692361878 124883 38\n", + "1 1692361878 123211 141\n", + "2 1692361878 124885 38\n", + "3 
1692361878 85294 345\n", + "4 1692361878 124886 38" ] }, - "execution_count": 49, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1725,7 +1743,7 @@ }, { "cell_type": "markdown", - "id": "07bade4a", + "id": "00f5567b", "metadata": {}, "source": [ "### 5.3 Jaccard Shuffle\n", @@ -1746,8 +1764,8 @@ }, { "cell_type": "code", - "execution_count": 50, - "id": "565253ae", + "execution_count": 38, + "id": "c5d458d1", "metadata": {}, "outputs": [], "source": [ @@ -1760,7 +1778,7 @@ }, { "cell_type": "markdown", - "id": "70387977", + "id": "e904bc34", "metadata": {}, "source": [ "Define parameters" @@ -1768,8 +1786,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "id": "5cff7d76", + "execution_count": 39, + "id": "170a44fd", "metadata": {}, "outputs": [], "source": [ @@ -1801,7 +1819,7 @@ }, { "cell_type": "markdown", - "id": "699a53f1", + "id": "333e91a8", "metadata": {}, "source": [ "Run Jaccard map bucket" @@ -1809,8 +1827,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "id": "0a6e5a84", + "execution_count": 40, + "id": "67b96227", "metadata": {}, "outputs": [ { @@ -1818,24 +1836,8 @@ "output_type": "stream", "text": [ "Number of files being read for jaccard calculation = 1\n", - "Number of ddf_bk partitions = 1\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Time taken for Bucket Mapping:2.1162023544311523 s\n" + "Number of ddf_bk partitions = 1\n", + "Time taken for Bucket Mapping:2.2563915252685547 s\n" ] } ], @@ -1866,16 +1868,16 @@ }, { "cell_type": "markdown", - "id": "96246266", + "id": "8f76b8ef", "metadata": {}, "source": [ - "Verify results " + "Verify result" ] }, { "cell_type": "code", - "execution_count": 53, - "id": "09e65f8b", + "execution_count": 41, + "id": "81c9c7c7", "metadata": {}, "outputs": [ { @@ -1912,51 +1914,51 @@ " \n", " 0\n", " 1692361878\n", - " 138220\n", + " 8895\n", " 1692361878\n", - " 145256\n", + " 8964\n", " 1692361878\n", - " 143672\n", + " 8895\n", " 0\n", " \n", " \n", " 1\n", " 1692361878\n", - " 50509\n", + " 127089\n", " 1692361878\n", - " 50509\n", + " 127220\n", " 1692361878\n", - " 50457\n", + " 127089\n", " 0\n", " \n", " \n", " 2\n", " 1692361878\n", - " 93989\n", + " 127090\n", " 1692361878\n", - " 93846\n", + " 127220\n", " 1692361878\n", - " 93807\n", + " 127089\n", " 0\n", " \n", " \n", " 3\n", " 1692361878\n", - " 20448\n", + " 151728\n", " 1692361878\n", - " 20090\n", + " 151728\n", " 1692361878\n", - " 20444\n", + " 151729\n", " 0\n", " \n", " \n", " 4\n", " 1692361878\n", - " 93991\n", + " 137262\n", " 1692361878\n", - " 93927\n", + " 137301\n", " 1692361878\n", - " 93697\n", + " 137262\n", " 0\n", " \n", " \n", @@ -1965,21 +1967,21 @@ ], "text/plain": [ " dataset_id doc_id anchor_1_dataset_id anchor_1_doc_id \\\n", - "0 1692361878 138220 1692361878 145256 \n", - "1 1692361878 50509 1692361878 50509 \n", - "2 1692361878 93989 1692361878 93846 \n", - "3 1692361878 20448 1692361878 20090 \n", - "4 1692361878 93991 1692361878 93927 \n", + "0 1692361878 8895 1692361878 8964 
\n", + "1 1692361878 127089 1692361878 127220 \n", + "2 1692361878 127090 1692361878 127220 \n", + "3 1692361878 151728 1692361878 151728 \n", + "4 1692361878 137262 1692361878 137301 \n", "\n", " anchor_0_dataset_id anchor_0_doc_id _output_partition_id \n", - "0 1692361878 143672 0 \n", - "1 1692361878 50457 0 \n", - "2 1692361878 93807 0 \n", - "3 1692361878 20444 0 \n", - "4 1692361878 93697 0 " + "0 1692361878 8895 0 \n", + "1 1692361878 127089 0 \n", + "2 1692361878 127089 0 \n", + "3 1692361878 151729 0 \n", + "4 1692361878 137262 0 " ] }, - "execution_count": 53, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -1991,16 +1993,16 @@ }, { "cell_type": "markdown", - "id": "35bb1e86", + "id": "b4896749", "metadata": {}, "source": [ - "**[Optional]**Remove previous Jaccard Shuffle results. Run only when there are files under the Jaccard Shuffle output path" + "**[Optional]** Remove previous Jaccard Shuffle results. Run only when there are files under the Jaccard Shuffle output path" ] }, { "cell_type": "code", - "execution_count": 88, - "id": "da7dcc10", + "execution_count": 43, + "id": "2d4dd55f", "metadata": {}, "outputs": [], "source": [ @@ -2009,7 +2011,7 @@ }, { "cell_type": "markdown", - "id": "24c2b39d", + "id": "f9b5ab9e", "metadata": {}, "source": [ "Run Jaccard Shuffle" @@ -2017,15 +2019,15 @@ }, { "cell_type": "code", - "execution_count": 54, - "id": "a9dcf646", + "execution_count": 44, + "id": "acccb80b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 0%| | 0/1 [00:00\n", " \n", " 0\n", - " พุทธศักราช 676 ใกล้เคียงกับ\\n เมษายน ค.ศ. 133 ...\n", - " 263\n", - " 1692361878-7032\n", - " 1692361878-7032\n", - " 1692361878-7052\n", + " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", + " 1894\n", + " 1692361878-127021\n", + " 1692361878-127021\n", + " 1692361878-126958\n", " \n", " \n", " 1\n", - " พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...\n", - " 217\n", - " 1692361878-9082\n", - " 1692361878-8805\n", - " 1692361878-9071\n", + " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", + " 1894\n", + " 1692361878-127021\n", + " 1692361878-127021\n", + " 1692361878-127017\n", " \n", " \n", " 2\n", - " พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...\n", - " 217\n", - " 1692361878-9082\n", - " 1692361878-9028\n", - " 1692361878-9045\n", + " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", + " 1894\n", + " 1692361878-127021\n", + " 1692361878-126928\n", + " 1692361878-126891\n", " \n", " \n", " 3\n", - " พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5...\n", - " 217\n", - " 1692361878-9082\n", - " 1692361878-9072\n", - " 1692361878-9082\n", + " วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค...\n", + " 423\n", + " 1692361878-87271\n", + " 1692361878-87204\n", + " 1692361878-87271\n", " \n", " \n", " 4\n", - " ประเทศฮังการี เข้าร่วมแข่งขันกีฬาโอลิมปิกฤดูร้...\n", - " 2039\n", - " 1692361878-49091\n", - " 1692361878-49093\n", - " 1692361878-49087\n", + " วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค...\n", + " 423\n", + " 1692361878-87271\n", + " 1692361878-87267\n", + " 1692361878-87271\n", " \n", " \n", "\n", @@ -2174,21 +2176,21 @@ ], "text/plain": [ " text _text_bytes \\\n", - "0 พุทธศักราช 676 ใกล้เคียงกับ\\n เมษายน ค.ศ. 133 ... 263 \n", - "1 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", - "2 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", - "3 พุทธศักราช 41 ใกล้เคียงกับ ก่อน คริสต์ศักราช 5... 217 \n", - "4 ประเทศฮังการี เข้าร่วมแข่งขันกีฬาโอลิมปิกฤดูร้... 
2039 \n", + "0 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", + "1 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", + "2 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", + "3 วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค... 423 \n", + "4 วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค... 423 \n", "\n", - " id anchor_0_id anchor_1_id \n", - "0 1692361878-7032 1692361878-7032 1692361878-7052 \n", - "1 1692361878-9082 1692361878-8805 1692361878-9071 \n", - "2 1692361878-9082 1692361878-9028 1692361878-9045 \n", - "3 1692361878-9082 1692361878-9072 1692361878-9082 \n", - "4 1692361878-49091 1692361878-49093 1692361878-49087 " + " id anchor_0_id anchor_1_id \n", + "0 1692361878-127021 1692361878-127021 1692361878-126958 \n", + "1 1692361878-127021 1692361878-127021 1692361878-127017 \n", + "2 1692361878-127021 1692361878-126928 1692361878-126891 \n", + "3 1692361878-87271 1692361878-87204 1692361878-87271 \n", + "4 1692361878-87271 1692361878-87267 1692361878-87271 " ] }, - "execution_count": 55, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -2200,7 +2202,7 @@ }, { "cell_type": "markdown", - "id": "ffb70238", + "id": "1a23a5c0", "metadata": {}, "source": [ "### 5.4 Jaccard Compute\n", @@ -2215,8 +2217,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "id": "06346b88", + "execution_count": 46, + "id": "6cfa08ea", "metadata": {}, "outputs": [], "source": [ @@ -2225,7 +2227,7 @@ }, { "cell_type": "markdown", - "id": "d71f440f", + "id": "389f305b", "metadata": {}, "source": [ "Define parameters" @@ -2233,8 +2235,8 @@ }, { "cell_type": "code", - "execution_count": 57, - "id": "457ae138", + "execution_count": 47, + "id": "c142a42a", "metadata": {}, "outputs": [], "source": [ @@ -2256,7 +2258,7 @@ }, { "cell_type": "markdown", - "id": "619bf820", + "id": "7a0f610f", "metadata": {}, "source": [ "Run Jaccard Compute" @@ -2264,8 +2266,8 @@ }, { "cell_type": "code", - "execution_count": 58, - "id": "2f094db1", + "execution_count": 48, + "id": "8ceae838", "metadata": {}, "outputs": [ { @@ -2273,13 +2275,13 @@ "output_type": "stream", "text": [ "Running jaccard compute script\n", - "Time taken for Jaccard Computing: 0.8689384460449219\n" + "Time taken for Jaccard Computing: 0.5923423767089844\n" ] } ], "source": [ - "enable_spilling()\n", - "client.run(enable_spilling)\n", + "# enable_spilling()\n", + "# client.run(enable_spilling)\n", "\n", "print(\"Running jaccard compute script\", flush=True)\n", "t0 = time.time()\n", @@ -2301,7 +2303,7 @@ }, { "cell_type": "markdown", - "id": "b31e619c", + "id": "ae06ad56", "metadata": {}, "source": [ "Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected as a pair of similar documents is likely to share numerous same buckets." 
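To make the `jaccard` scores above more concrete, the following is a plain-Python sketch of the same character n-gram Jaccard similarity (set intersection over union, with `n = 5` to match `ngram_size`). It only illustrates the formula; the actual computation in `jaccard_compute` runs on cuDF over the shuffled anchor/document pairs, and the two example strings here are invented.

```python
# Simplified CPU illustration of character 5-gram Jaccard similarity.
def char_ngrams(text: str, n: int = 5) -> set:
    # Shingle the string into overlapping character n-grams.
    return {text[i:i + n] for i in range(max(len(text) - n + 1, 1))}

def jaccard_similarity(a: str, b: str, n: int = 5) -> float:
    sa, sb = char_ngrams(a, n), char_ngrams(b, n)
    union = sa | sb
    return len(sa & sb) / len(union) if union else 0.0

doc_x = "Thailand national volleyball championship 1985 results"  # toy example
doc_y = "Thailand national volleyball championship 1986 results"  # near duplicate
print(round(jaccard_similarity(doc_x, doc_y), 3))  # high score, close to 1.0
```

Pairs that score low here are the LSH false positives mentioned earlier; only sufficiently similar pairs should remain linked when the connected-components step groups documents.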
@@ -2309,8 +2311,8 @@ }, { "cell_type": "code", - "execution_count": 59, - "id": "ae2efe3e", + "execution_count": 49, + "id": "686eb956", "metadata": {}, "outputs": [ { @@ -2342,33 +2344,33 @@ " \n", " \n", " 0\n", - " 1692361878-127521\n", - " 1692361878-127517\n", - " 0.755481\n", + " 1692361878-49094\n", + " 1692361878-49078\n", + " 0.784000\n", " \n", " \n", " 1\n", - " 1692361878-127521\n", - " 1692361878-127517\n", - " 0.755481\n", + " 1692361878-49094\n", + " 1692361878-49078\n", + " 0.784000\n", " \n", " \n", " 2\n", - " 1692361878-45934\n", - " 1692361878-45940\n", - " 0.922061\n", + " 1692361878-49094\n", + " 1692361878-49078\n", + " 0.784000\n", " \n", " \n", " 3\n", - " 1692361878-45934\n", - " 1692361878-45940\n", - " 0.922061\n", + " 1692361878-49094\n", + " 1692361878-49078\n", + " 0.784000\n", " \n", " \n", " 4\n", - " 1692361878-45934\n", - " 1692361878-45940\n", - " 0.922061\n", + " 1692361878-161128\n", + " 1692361878-161122\n", + " 0.890339\n", " \n", " \n", "\n", @@ -2376,14 +2378,14 @@ ], "text/plain": [ " id_x id_y jaccard\n", - "0 1692361878-127521 1692361878-127517 0.755481\n", - "1 1692361878-127521 1692361878-127517 0.755481\n", - "2 1692361878-45934 1692361878-45940 0.922061\n", - "3 1692361878-45934 1692361878-45940 0.922061\n", - "4 1692361878-45934 1692361878-45940 0.922061" + "0 1692361878-49094 1692361878-49078 0.784000\n", + "1 1692361878-49094 1692361878-49078 0.784000\n", + "2 1692361878-49094 1692361878-49078 0.784000\n", + "3 1692361878-49094 1692361878-49078 0.784000\n", + "4 1692361878-161128 1692361878-161122 0.890339" ] }, - "execution_count": 59, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -2395,7 +2397,7 @@ }, { "cell_type": "markdown", - "id": "834f1831", + "id": "63911051", "metadata": {}, "source": [ "### 5.5 Connected Components\n", @@ -2410,8 +2412,8 @@ }, { "cell_type": "code", - "execution_count": 60, - "id": "5756fde8", + "execution_count": 50, + "id": "5eae08f1", "metadata": {}, "outputs": [], "source": [ @@ -2420,16 +2422,16 @@ }, { "cell_type": "markdown", - "id": "217957d6", + "id": "ed713696", "metadata": {}, "source": [ - "Define parameter" + "Define parameters" ] }, { "cell_type": "code", - "execution_count": 61, - "id": "72a1952e", + "execution_count": 51, + "id": "a0881f12", "metadata": {}, "outputs": [], "source": [ @@ -2450,7 +2452,7 @@ }, { "cell_type": "markdown", - "id": "c53b3a8c", + "id": "4fba31d2", "metadata": {}, "source": [ "Run Connected Component" @@ -2458,62 +2460,24 @@ }, { "cell_type": "code", - "execution_count": 62, - "id": "46578e2b", + "execution_count": 52, + "id": "da4a8d4e", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n", - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. 
Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n", - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "batch_id = 0/1, time = 0.3100006580352783\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n", - "/usr/local/lib/python3.10/dist-packages/dask/dataframe/io/parquet/core.py:421: FutureWarning: The `aggregate_files` argument will be deprecated in the future. Please consider using `from_map` to create a DataFrame collection with a custom file-to-partition mapping.\n", - "\n", - "If you strongly oppose the deprecation of `aggregate_files`, please comment at https://github.com/dask/dask/issues/9051\n", - " warnings.warn(\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ + "batch_id = 0/1, time = 0.26957249641418457\n", "# of groups 5465\n", "# of docs removed 3079\n", "assert num_nodes:8544==labels_df:8544 passed\n", - "Time taken for Connected Component: 11.238884925842285 s\n" + "Time taken for Connected Component: 4.331223726272583 s\n" ] } ], "source": [ - "client.run(enable_spilling)\n", + "#client.run(enable_spilling)\n", "\n", "t0 = time.time()\n", " \n", @@ -2532,7 +2496,7 @@ }, { "cell_type": "markdown", - "id": "6827158e", + "id": "24b55482", "metadata": {}, "source": [ "Verify the result of `Connected Components`" @@ -2540,8 +2504,8 @@ }, { "cell_type": "code", - "execution_count": 63, - "id": "2bcfc470", + "execution_count": 53, + "id": "ecd69e7e", "metadata": {}, "outputs": [ { @@ -2574,32 +2538,32 @@ " \n", " 0\n", " 1692361878\n", - " 136999\n", - " 3837\n", + " 139585\n", + " 1936\n", " \n", " \n", " 1\n", " 1692361878\n", - " 85318\n", - " 3838\n", + " 8059\n", + " 5312\n", " \n", " \n", " 2\n", " 1692361878\n", - " 70670\n", - " 1196\n", + " 93474\n", + " 5313\n", " \n", " \n", " 3\n", " 1692361878\n", - " 134587\n", - " 138\n", + " 127790\n", + " 2774\n", " \n", " \n", " 4\n", " 1692361878\n", - " 136125\n", - " 1320\n", + " 49650\n", + " 1425\n", " \n", " \n", "\n", @@ -2607,14 +2571,14 @@ ], "text/plain": [ " dataset_id doc_id group\n", - "0 1692361878 136999 3837\n", - "1 1692361878 85318 3838\n", - "2 1692361878 70670 1196\n", - "3 1692361878 134587 138\n", - "4 1692361878 136125 1320" + "0 1692361878 139585 1936\n", + "1 1692361878 8059 5312\n", + "2 1692361878 93474 5313\n", + "3 1692361878 127790 2774\n", + "4 1692361878 49650 1425" ] }, - "execution_count": 63, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -2626,7 
+2590,7 @@ }, { "cell_type": "markdown", - "id": "aa1ee07d", + "id": "44834e54", "metadata": {}, "source": [ "Let's check if the output fuzzy duplicated documents within the same group are similar. Please note that the `group` id in your output might be different from the notebook output." @@ -2634,8 +2598,8 @@ }, { "cell_type": "code", - "execution_count": 64, - "id": "f1f10a1c", + "execution_count": 54, + "id": "6c404c89", "metadata": {}, "outputs": [ { @@ -2666,28 +2630,28 @@ " \n", " \n", " 0\n", - " 121\n", - " 134756, 134762, 134748, 134742, 134740, 134750...\n", + " 75\n", + " 160982, 161038, 161124, 161109, 161121, 160991...\n", " \n", " \n", " 1\n", - " 138\n", - " 134587, 134908, 135024, 135029, 135019, 134566...\n", + " 112\n", + " 122007, 122124, 122020, 122282, 122010, 122134...\n", " \n", " \n", " 2\n", - " 323\n", - " 134794, 134780, 134793, 134785, 134798, 134781...\n", + " 151\n", + " 134584, 135030, 134908, 134891, 135029, 135020...\n", " \n", " \n", " 3\n", - " 344\n", - " 136092, 136103, 136090, 136093, 136100, 136089...\n", + " 321\n", + " 94082, 94114, 94126, 94057, 94121, 94132, 9411...\n", " \n", " \n", " 4\n", - " 428\n", - " 94120, 94084, 94059, 94128, 94130, 94056, 9413...\n", + " 339\n", + " 116230, 116237, 116223, 116236, 116176, 116204...\n", " \n", " \n", " ...\n", @@ -2697,27 +2661,27 @@ " \n", " 5460\n", " 8539\n", - " 125651\n", + " 120646\n", " \n", " \n", " 5461\n", " 8540\n", - " 125971\n", + " 158174\n", " \n", " \n", " 5462\n", " 8541\n", - " 84926\n", + " 132405\n", " \n", " \n", " 5463\n", " 8542\n", - " 40115\n", + " 49199\n", " \n", " \n", " 5464\n", " 8543\n", - " 50282\n", + " 160924\n", " \n", " \n", "\n", @@ -2726,22 +2690,22 @@ ], "text/plain": [ " group doc_id\n", - "0 121 134756, 134762, 134748, 134742, 134740, 134750...\n", - "1 138 134587, 134908, 135024, 135029, 135019, 134566...\n", - "2 323 134794, 134780, 134793, 134785, 134798, 134781...\n", - "3 344 136092, 136103, 136090, 136093, 136100, 136089...\n", - "4 428 94120, 94084, 94059, 94128, 94130, 94056, 9413...\n", + "0 75 160982, 161038, 161124, 161109, 161121, 160991...\n", + "1 112 122007, 122124, 122020, 122282, 122010, 122134...\n", + "2 151 134584, 135030, 134908, 134891, 135029, 135020...\n", + "3 321 94082, 94114, 94126, 94057, 94121, 94132, 9411...\n", + "4 339 116230, 116237, 116223, 116236, 116176, 116204...\n", "... ... ...\n", - "5460 8539 125651\n", - "5461 8540 125971\n", - "5462 8541 84926\n", - "5463 8542 40115\n", - "5464 8543 50282\n", + "5460 8539 120646\n", + "5461 8540 158174\n", + "5462 8541 132405\n", + "5463 8542 49199\n", + "5464 8543 160924\n", "\n", "[5465 rows x 2 columns]" ] }, - "execution_count": 64, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -2753,7 +2717,7 @@ }, { "cell_type": "markdown", - "id": "f621c2cb", + "id": "b4cd941d", "metadata": {}, "source": [ "Change the `group` number if necessary. By running the code below, we can obtain a list of near duplicated documents." 
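If you need to turn these groups into an actual removal list, one possible convention is sketched below: keep a single document per `group` (here, the smallest `doc_id`) and flag the rest as fuzzy duplicates. This is an illustrative policy built on the `cc_compute_res` dataframe loaded above, not a step mandated by NeMo Curator.

```python
# Keep one representative document per connected component and collect the rest.
keep_one_per_group = (
    cc_compute_res.sort_values("doc_id")
                  .drop_duplicates(subset=["group"], keep="first")
)
fuzzy_duplicates_to_remove = cc_compute_res[
    ~cc_compute_res["doc_id"].isin(keep_one_per_group["doc_id"])
]
print(f"Documents flagged for removal: {len(fuzzy_duplicates_to_remove)}")
```

With the counts printed earlier (8544 labeled documents in 5465 groups), this policy flags 8544 - 5465 = 3079 documents, which matches the `# of docs removed` line in the connected-components output.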
@@ -2761,8 +2725,8 @@ }, { "cell_type": "code", - "execution_count": 72, - "id": "bd79a7f7", + "execution_count": 55, + "id": "09b3fd0b", "metadata": {}, "outputs": [ { @@ -2793,34 +2757,34 @@ " \n", " \n", " \n", - " 14\n", + " 420\n", " 1692361878\n", - " 121545\n", - " 735\n", + " 122007\n", + " 112\n", " \n", " \n", - " 66\n", + " 425\n", " 1692361878\n", - " 121487\n", - " 735\n", + " 122124\n", + " 112\n", " \n", " \n", - " 213\n", + " 689\n", " 1692361878\n", - " 121541\n", - " 735\n", + " 122020\n", + " 112\n", " \n", " \n", - " 291\n", + " 764\n", " 1692361878\n", - " 121539\n", - " 735\n", + " 122282\n", + " 112\n", " \n", " \n", - " 422\n", + " 952\n", " 1692361878\n", - " 121524\n", - " 735\n", + " 122010\n", + " 112\n", " \n", " \n", "\n", @@ -2828,25 +2792,25 @@ ], "text/plain": [ " dataset_id doc_id group\n", - "14 1692361878 121545 735\n", - "66 1692361878 121487 735\n", - "213 1692361878 121541 735\n", - "291 1692361878 121539 735\n", - "422 1692361878 121524 735" + "420 1692361878 122007 112\n", + "425 1692361878 122124 112\n", + "689 1692361878 122020 112\n", + "764 1692361878 122282 112\n", + "952 1692361878 122010 112" ] }, - "execution_count": 72, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "cc_compute_res[cc_compute_res['group']==735].head()" + "cc_compute_res[cc_compute_res['group']==112].head()" ] }, { "cell_type": "markdown", - "id": "e7c02f4b", + "id": "8b0de04f", "metadata": {}, "source": [ "Print the text of near duplicated document. Please replace the `id` if necessary, `id` should be in the format of `_`" @@ -2855,7 +2819,7 @@ { "cell_type": "code", "execution_count": 73, - "id": "dd0b2e33", + "id": "fbf88107", "metadata": {}, "outputs": [ { @@ -2877,7 +2841,7 @@ }, { "cell_type": "markdown", - "id": "c3f8d12f", + "id": "fd33ac1d", "metadata": {}, "source": [ "Below is the English translation of the output above. 
We can see that the two documents are indeed very similar to each other.\n", @@ -2938,7 +2902,256 @@ }, { "cell_type": "markdown", - "id": "70ca66df", + "id": "68cfec8a", + "metadata": {}, + "source": [ + "### 5.6 Fuzzy deduplication wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "fe7de030", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "fe8794b8", + "metadata": {}, + "outputs": [], + "source": [ + "#Input\n", + "fuzzy_dedup_data_path = added_id_output_path\n", + "#Output\n", + "fuzzy_dedup_base_output_path = os.path.join(data_dir,\"fuzzy_wrapper\")\n", + "fuzzy_dedup_log_dir = os.path.join(fuzzy_dedup_base_output_path,'log')\n", + "fuzzy_dedup_cache_dir = os.path.join(fuzzy_dedup_base_output_path,'cache')\n", + "fuzzy_dedup_output_dir = os.path.join(fuzzy_dedup_base_output_path,'data')\n", + "#Specify dataset name\n", + "dataset_name = 'TH_wikipedia'\n", + "\n", + "#Relevant parameters\n", + "id_field = 'id'\n", + "text_field = 'text'\n", + "filetype = \"parquet\"\n", + "\n", + "!mkdir -p {fuzzy_dedup_base_output_path}\n", + "!mkdir -p {fuzzy_dedup_log_dir}\n", + "!mkdir -p {fuzzy_dedup_cache_dir}\n", + "!mkdir -p {fuzzy_dedup_output_dir}" + ] + }, + { + "cell_type": "markdown", + "id": "0aa0b60c", + "metadata": {}, + "source": [ + "**[Optional]** If the cache folder is not empty, please CLEAR the folder before proceeding" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "83705eaa", + "metadata": {}, + "outputs": [], + "source": [ + "#!rm -r {fuzzy_dedup_cache_dir}" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "72494e54", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n", + "Stage1: Starting Minhash + LSH computation\n", + "Stage1: Minhash + LSH complete!\n", + "Stage2 (False Postive Check): Starting Map_Buckets\n", + "Stage2 (False Postive Check): Map_Buckets Complete!\n", + "Stage3 (False Postive Check): Shuffle docs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgroup
0TH_wiki-0000134798736
1TH_wiki-00001162261526
2TH_wiki-00001267962934
3TH_wiki-0000138218156
4TH_wiki-00000854372722
\n", + "" + ], + "text/plain": [ + " id group\n", + "0 TH_wiki-0000134798 736\n", + "1 TH_wiki-0000116226 1526\n", + "2 TH_wiki-0000126796 2934\n", + "3 TH_wiki-0000138218 156\n", + "4 TH_wiki-0000085437 2722" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fuzzy_dedup_res = pd.read_parquet(fuzzy_dedup_output_dir)\n", + "fuzzy_dedup_res.head()" + ] + }, + { + "cell_type": "markdown", + "id": "b9dfbdde", "metadata": {}, "source": [ "## 6. Remove duplicates\n", @@ -2948,7 +3161,7 @@ }, { "cell_type": "markdown", - "id": "93d031ec", + "id": "bb722fd2", "metadata": {}, "source": [ "Define parameters" @@ -2956,8 +3169,8 @@ }, { "cell_type": "code", - "execution_count": 80, - "id": "911be9d9", + "execution_count": 81, + "id": "5a4b97b7", "metadata": {}, "outputs": [], "source": [ @@ -2976,7 +3189,7 @@ }, { "cell_type": "markdown", - "id": "969f6543", + "id": "d3962deb", "metadata": {}, "source": [ "We will first process the result of exact deduplication. Since result of exact deduplication contains original ID used in input dataset, it is more straightforward to deal with." @@ -2984,8 +3197,8 @@ }, { "cell_type": "code", - "execution_count": 81, - "id": "bbbfdbb3", + "execution_count": 82, + "id": "a29d720d", "metadata": {}, "outputs": [ { @@ -3015,65 +3228,80 @@ }, { "cell_type": "markdown", - "id": "8b97567d", + "id": "b4c1c057", "metadata": {}, "source": [ "For result of fuzzy deduplication, we need to first reconstructed document ID by combining `dataset_id` and `doc_id`, then use the reconstructed `ID` for removal" ] }, + { + "cell_type": "markdown", + "id": "1c0f1ee6", + "metadata": {}, + "source": [ + "**[Optional]** Uncomment the cell to use result from step by step fuzzy deduplication" + ] + }, { "cell_type": "code", - "execution_count": 82, - "id": "513cf7a0", + "execution_count": 83, + "id": "1ff911ad", "metadata": {}, "outputs": [], "source": [ - "#List of id_prefix used in Add ID\n", - "base_ids = [id_prefix]\n", - "\n", - "#Obtain a mapping between `dataset_id` and `id_prefix`\n", - "df = cudf.DataFrame()\n", - "df['base_id'] = [base_id for base_id in base_ids]\n", - "df['dataset_id'] = df['base_id'].hash_values()\n", - "df_pd = df.to_pandas()\n", - "mapping = {\n", - " hashed_id: base_id\n", - " for base_id, hashed_id in zip(df_pd['base_id'], df_pd['dataset_id'])\n", - "}\n", - "\n", - "#Load result of fuzzy deduplication\n", - "fuzzy_duplicates = pd.read_parquet(connected_component_output_path)\n", - "#Reconstruct the original document ID\n", - "fuzzy_duplicates['id']=fuzzy_duplicates.apply(lambda x: f\"{mapping[x['dataset_id']]}-{x['doc_id']:010d}\", axis=1)\n", + "# #List of id_prefix used in Add ID\n", + "# base_ids = [id_prefix]\n", + "\n", + "# #Obtain a mapping between `dataset_id` and `id_prefix`\n", + "# df = cudf.DataFrame()\n", + "# df['base_id'] = [base_id for base_id in base_ids]\n", + "# df['dataset_id'] = df['base_id'].hash_values()\n", + "# df_pd = df.to_pandas()\n", + "# mapping = {\n", + "# hashed_id: base_id\n", + "# for base_id, hashed_id in zip(df_pd['base_id'], df_pd['dataset_id'])\n", + "# }\n", + "\n", + "# #Load result of fuzzy deduplication \n", + "# fuzzy_duplicates = pd.read_parquet(connected_component_output_path)\n", + "# #Reconstruct the original document ID\n", + "# fuzzy_duplicates['id']=fuzzy_duplicates.apply(lambda x: f\"{mapping[x['dataset_id']]}-{x['doc_id']:010d}\", axis=1)\n", + "\n", + "# #Generate list of near duplicate document ID\n", + "# fuzzy_docs_to_remove = 
fuzzy_duplicates.drop_duplicates(subset=['group'], keep='first')" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "2220d8fc", + "metadata": {}, + "outputs": [], + "source": [ + "#Loads result from fuzzy dedup wrapper\n", + "fuzzy_duplicates = pd.read_parquet(fuzzy_dedup_output_dir)\n", + "\n", "#Generate list of near duplicate document ID\n", "fuzzy_docs_to_remove = fuzzy_duplicates.drop_duplicates(subset=['group'], keep='first')" ] }, { "cell_type": "code", - "execution_count": 83, - "id": "dc7d647c", + "execution_count": 85, + "id": "08143e1e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Writing to disk complete for 1 partitions\n" - ] - } - ], + "outputs": [], "source": [ "#Remove near duplicates\n", "result = result[~result[input_id_field].isin(fuzzy_docs_to_remove[input_id_field])]\n", "\n", "#Save final result to local\n", - "write_to_disk(result, dudped_output_dir, output_type=\"parquet\")" + "result.to_parquet(dudped_output_dir, write_to_filename=True)" ] }, { "cell_type": "markdown", - "id": "b47a967f", + "id": "a5008578", "metadata": {}, "source": [ "Verify the result of duplicate removal. We can see that the number of document in resultant document is less than the original dataset (length = 161748)" @@ -3081,15 +3309,15 @@ }, { "cell_type": "code", - "execution_count": 84, - "id": "5e8097b1", + "execution_count": 86, + "id": "a692c916", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Length of duplicate removed dataset:156257\n" + "Length of duplicate removed dataset:156265\n" ] } ], @@ -3100,16 +3328,16 @@ }, { "cell_type": "markdown", - "id": "85caf66f", + "id": "24440f5f", "metadata": {}, "source": [ - "Close the GPU Dask Cluster" + "Close the GPU Dask Cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again." ] }, { "cell_type": "code", - "execution_count": 85, - "id": "cd91f5fe", + "execution_count": 88, + "id": "656a24f2", "metadata": {}, "outputs": [], "source": [ @@ -3119,7 +3347,7 @@ }, { "cell_type": "markdown", - "id": "1c6cee97", + "id": "3a00f6ea", "metadata": {}, "source": [ "## 7. Heuristic Fitlering\n", @@ -3141,8 +3369,8 @@ }, { "cell_type": "code", - "execution_count": 86, - "id": "1ddff58c", + "execution_count": 89, + "id": "41f7cdf4", "metadata": {}, "outputs": [], "source": [ @@ -3153,16 +3381,16 @@ }, { "cell_type": "markdown", - "id": "a728a161", + "id": "f5ed694b", "metadata": {}, "source": [ - "**[Optional]**The following cell is to remove warning from dask." + "**[Optional]** The following cell is to remove warning from dask." ] }, { "cell_type": "code", - "execution_count": 87, - "id": "e5114945", + "execution_count": 90, + "id": "39aab4d9", "metadata": {}, "outputs": [], "source": [ @@ -3174,7 +3402,7 @@ }, { "cell_type": "markdown", - "id": "6243a7cb", + "id": "3c196329", "metadata": {}, "source": [ "Create a CPU Dask Cluster." 
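Before running the full pipeline below, it may help to see what the filter cascade defined in `config/heuristic_filter_non-en.yaml` amounts to conceptually. The sketch below is illustrative only, not the NeMo Curator implementation: each filter scores every document, documents that fail the threshold are set aside per filter for later analysis, and the survivors flow into the next filter. The two toy scoring functions and their thresholds are simplified stand-ins for the library's filters.

```python
import pandas as pd

def word_count(text: str) -> int:
    return len(text.split())

def symbol_to_word_ratio(text: str, symbols=("#", "...", "…")) -> float:
    words = text.split() or [""]
    return sum(word in symbols for word in words) / len(words)

# (score column, scoring function, keep-document predicate) -- order matters, like the YAML.
cascade = [
    ("word_count", word_count, lambda s: 50 <= s <= 100_000),
    ("symbol_to_word_ratio", symbol_to_word_ratio, lambda s: s <= 0.1),
]

df = pd.DataFrame({"id": ["doc_a", "doc_b"], "text": ["too short", "คำ " * 60]})
removed = {}
for name, score_fn, keep in cascade:
    df[name] = df["text"].apply(score_fn)
    mask = df[name].apply(keep)
    removed[name] = df[~mask]      # kept on the side for analysis, as the notebook does
    df = df[mask]                  # survivors continue down the cascade

print(f"kept {len(df)} document(s);",
      "removed per filter:", {name: len(dropped) for name, dropped in removed.items()})
```

In the actual run below, the pipeline is loaded from the YAML config and executed on the CPU Dask cluster, with per-filter scores optionally logged alongside the documents.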
@@ -3182,8 +3410,8 @@ }, { "cell_type": "code", - "execution_count": 88, - "id": "fa752ded", + "execution_count": 91, + "id": "1ffd3928", "metadata": {}, "outputs": [], "source": [ @@ -3193,7 +3421,7 @@ }, { "cell_type": "markdown", - "id": "c3dda877", + "id": "4a514d3c", "metadata": {}, "source": [ "Define some helper functions" @@ -3201,8 +3429,8 @@ }, { "cell_type": "code", - "execution_count": 89, - "id": "a8abf841", + "execution_count": 92, + "id": "d4aaccc4", "metadata": {}, "outputs": [], "source": [ @@ -3233,7 +3461,7 @@ }, { "cell_type": "markdown", - "id": "04e6b0f8", + "id": "5d43a755", "metadata": {}, "source": [ "Define parameters" @@ -3241,8 +3469,8 @@ }, { "cell_type": "code", - "execution_count": 90, - "id": "55e43a6c", + "execution_count": 93, + "id": "4f8b0336", "metadata": {}, "outputs": [], "source": [ @@ -3272,7 +3500,7 @@ }, { "cell_type": "markdown", - "id": "4c5f6c8e", + "id": "bddd9dd9", "metadata": {}, "source": [ "Run heuristic filtering" @@ -3280,8 +3508,8 @@ }, { "cell_type": "code", - "execution_count": 91, - "id": "f6f50332", + "execution_count": 94, + "id": "1df83255", "metadata": {}, "outputs": [ { @@ -3295,6 +3523,34 @@ "Writing to disk complete for 1 partitions\n", "Saving data for urls_ratio\n", "Writing to disk complete for 1 partitions\n", + "Saving data for white_space\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for parentheses_ratio\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for boilerplate_string_ratio\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeated_lines\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeated_paragraphs\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeated_lines_char\n", + "Writing to disk complete for 1 partitions\n", + "Saving data for repeated_paragraphs_char\n", + "Writing to disk complete for 1 partitions\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/nemo_curator/utils/distributed_utils.py:379: UserWarning: Empty partition found\n", + " warnings.warn(f\"Empty partition found\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Saving data for word_count\n", "Writing to disk complete for 1 partitions\n", "Saving data for repeating_top_2grams\n", @@ -3304,7 +3560,7 @@ "Saving data for repeating_top_4grams\n", "Writing to disk complete for 1 partitions\n", "Writing to disk complete for 1 partitions\n", - "Time taken for Heuristic filtering: 729.7436628341675 s\n" + "Time taken for Heuristic filtering: 1120.5212895870209 s\n" ] } ], @@ -3316,7 +3572,7 @@ "score_fields = get_score_fields(filter_pipeline)\n", "\n", "# Load dataset\n", - "dataset = load_dataset(HF_input_data_dir,file_type='parquet')\n", + "dataset = DocumentDataset.read_parquet(HF_input_data_dir, backend='pandas', add_filename=True)\n", "\n", "\n", "# Iterate through filters. 
For each filter, the low quality document will be removed from the dataset and output to corresponding folder for analysis\n", @@ -3346,14 +3602,14 @@ "filtered_dataset = DocumentDataset(filtered_dataset.df.drop(columns=score_fields))\n", "\n", "# Output filtered dataset\n", - "write_to_disk(filtered_dataset.df, kept_document_dir, write_to_filename=True, output_type=output_file_type)\n", + "filtered_dataset.to_parquet(kept_document_dir, write_to_filename=True)\n", "\n", "print(f\"Time taken for Heuristic filtering: {time.time()-t0} s\")" ] }, { "cell_type": "markdown", - "id": "b19731f5", + "id": "0fab7ee5", "metadata": {}, "source": [ "Verify the result." @@ -3361,10 +3617,137 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "8f945362", + "execution_count": 95, + "id": "65160254", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset size after heuristic filtering:192786\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenameidlanguagesource_idtexttitleurl
1part.0.parquetTH_wiki-0000000001THthwiki-20240201-thwiki-20240201-pages-articles...ดาราศาสตร์ คือวิชาวิทยาศาสตร์ที่ศึกษาวัตถุในท้...ดาราศาสตร์https://th.wikipedia.org/wiki/%E0%B8%94%E0%B8%...
2part.0.parquetTH_wiki-0000000002THthwiki-20240201-thwiki-20240201-pages-articles...ภูมิศาสตร์ (, แปลว่า \"การพรรณนาเกี่ยวกับโลก\")...ภูมิศาสตร์https://th.wikipedia.org/wiki/%E0%B8%A0%E0%B8%...
3part.0.parquetTH_wiki-0000000003THthwiki-20240201-thwiki-20240201-pages-articles...พันทิป.คอม หรือพันทิป ก่อตั้งขึ้นเมื่อวันที่ 7...พันทิป.คอมhttps://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%...
4part.0.parquetTH_wiki-0000000004THthwiki-20240201-thwiki-20240201-pages-articles...พันธุ์ทิพย์พลาซ่า () เป็นศูนย์การค้าเกี่ยวกับเ...พันธุ์ทิพย์พลาซ่าhttps://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%...
5part.0.parquetTH_wiki-0000000005THthwiki-20240201-thwiki-20240201-pages-articles...วิทยาการคอมพิวเตอร์ศึกษาเกี่ยวกับโครงสร้างพื้น...วิทยาการคอมพิวเตอร์https://th.wikipedia.org/wiki/%E0%B8%A7%E0%B8%...
\n", + "
" + ], + "text/plain": [ + " filename id language \\\n", + "1 part.0.parquet TH_wiki-0000000001 TH \n", + "2 part.0.parquet TH_wiki-0000000002 TH \n", + "3 part.0.parquet TH_wiki-0000000003 TH \n", + "4 part.0.parquet TH_wiki-0000000004 TH \n", + "5 part.0.parquet TH_wiki-0000000005 TH \n", + "\n", + " source_id \\\n", + "1 thwiki-20240201-thwiki-20240201-pages-articles... \n", + "2 thwiki-20240201-thwiki-20240201-pages-articles... \n", + "3 thwiki-20240201-thwiki-20240201-pages-articles... \n", + "4 thwiki-20240201-thwiki-20240201-pages-articles... \n", + "5 thwiki-20240201-thwiki-20240201-pages-articles... \n", + "\n", + " text title \\\n", + "1 ดาราศาสตร์ คือวิชาวิทยาศาสตร์ที่ศึกษาวัตถุในท้... ดาราศาสตร์ \n", + "2 ภูมิศาสตร์ (, แปลว่า \"การพรรณนาเกี่ยวกับโลก\")... ภูมิศาสตร์ \n", + "3 พันทิป.คอม หรือพันทิป ก่อตั้งขึ้นเมื่อวันที่ 7... พันทิป.คอม \n", + "4 พันธุ์ทิพย์พลาซ่า () เป็นศูนย์การค้าเกี่ยวกับเ... พันธุ์ทิพย์พลาซ่า \n", + "5 วิทยาการคอมพิวเตอร์ศึกษาเกี่ยวกับโครงสร้างพื้น... วิทยาการคอมพิวเตอร์ \n", + "\n", + " url \n", + "1 https://th.wikipedia.org/wiki/%E0%B8%94%E0%B8%... \n", + "2 https://th.wikipedia.org/wiki/%E0%B8%A0%E0%B8%... \n", + "3 https://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%... \n", + "4 https://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%... \n", + "5 https://th.wikipedia.org/wiki/%E0%B8%A7%E0%B8%... " + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "res = pd.read_parquet(kept_document_dir)\n", "print(f\"Dataset size after heuristic filtering:{len(res)}\")\n", @@ -3373,7 +3756,7 @@ }, { "cell_type": "markdown", - "id": "cb52fe04", + "id": "412bd6d2", "metadata": {}, "source": [ "Close the CPU Dask Cluster" @@ -3381,8 +3764,8 @@ }, { "cell_type": "code", - "execution_count": 94, - "id": "aaa9823a", + "execution_count": 96, + "id": "e6129857", "metadata": {}, "outputs": [], "source": [ @@ -3393,7 +3776,7 @@ { "cell_type": "code", "execution_count": null, - "id": "94f6e74e", + "id": "4679d955", "metadata": {}, "outputs": [], "source": [] From 4b024cb7a3bef5f45da5b463d14bcf952fceecb0 Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Mon, 20 May 2024 07:00:32 +0000 Subject: [PATCH 31/34] Fixing Style Signed-off-by: Nicole Luo --- .pre-commit-config.yaml | 0 .../config/heuristic_filter_non-en.yaml | 17 ++++++++--------- 2 files changed, 8 insertions(+), 9 deletions(-) mode change 100644 => 100755 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 diff --git a/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml b/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml index 50d435e2e..4c1b80905 100755 --- a/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml +++ b/tutorials/single_node_tutorial/config/heuristic_filter_non-en.yaml @@ -1,33 +1,32 @@ input_field: text filters: # The filters below define a chain of heuristic filters to be applied to each document in a corpus. - # This particular cascade of filters is intended to filter generic non-English data that use spaces for separating words. + # This particular cascade of filters is intended to filter generic non-English data that use spaces for separating words. # The filter listed at the top will be applied first, and the following filters will be applied in # the order they appear in this file. Each filter can be removed and re-ordered as desired. 
- name: nemo_curator.filters.heuristic_filter.SymbolsToWordsFilter log_score: True params: max_symbol_to_word_ratio: 0.1 - - name: nemo_curator.filters.heuristic_filter.NumbersFilter log_score: True params: max_number_to_text_ratio: 0.15 - name: nemo_curator.filters.heuristic_filter.UrlsFilter log_score: True - params: + params: max_url_to_text_ratio: 0.2 - name: nemo_curator.filters.heuristic_filter.WhiteSpaceFilter log_score: True - params: + params: max_white_space_ratio: 0.25 - name: nemo_curator.filters.heuristic_filter.ParenthesesFilter log_score: True - params: + params: max_parentheses_ratio: 0.1 - name: nemo_curator.filters.heuristic_filter.BoilerPlateStringFilter log_score: True - params: + params: remove_if_at_top_or_bottom: True max_boilerplate_string_ratio: 0.4 - name: nemo_curator.filters.heuristic_filter.RepeatedLinesFilter @@ -50,17 +49,17 @@ filters: params: min_words: 50 max_words: 100000 - # NOTE: This filter tends to remove many documents and will need to + # NOTE: This filter tends to remove many documents and will need to # be tuned per language # - name: nemo_curator.filters.heuristic_filter.PunctuationFilter # params: # max_num_sentences_without_endmark_ratio: 0.85 # - name: nemo_curator.filters.heuristic_filter.MeanWordLengthFilter # params: -# max_mean_word_length: 10 +# max_mean_word_length: 10 # min_mean_word_length: 3 # - name: nemo_curator.filters.heuristic_filter.LongWordFilter -# params: +# params: # max_word_length: 1000 # - name: nemo_curator.filters.heuristic_filter.EllipsisFilter # params: From 0a50fd433c3c020508cf08670986b7ca1daf583c Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Mon, 20 May 2024 07:01:10 +0000 Subject: [PATCH 32/34] Updating container version Signed-off-by: Nicole Luo --- tutorials/single_node_tutorial/single_gpu_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 006098375..3ceecad2c 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -78,7 +78,7 @@ " Password: \n", "- Get NeMo NeMo Framework Training Container\n", " ```bash\n", - " docker pull nvcr.io/ea-bignlp/ga-participants/nemofw-training:24.01\n" + " docker pull docker pull nvcr.io/nvidia/nemo:dev.framework\n" ] }, { From 2a9052c7a7c3684d6b1f7b4f5770de6e5f09bd75 Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Mon, 20 May 2024 08:31:02 +0000 Subject: [PATCH 33/34] Fixing style Signed-off-by: Nicole Luo --- tests/test_fuzzy_dedup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py index 022940b2d..1c952d27d 100644 --- a/tests/test_fuzzy_dedup.py +++ b/tests/test_fuzzy_dedup.py @@ -440,4 +440,4 @@ def add_partition_info(df, partition_info=None): ) # Check that the real and expected partitions match - assert (check["file_id"] == check["expected_file_id"]).all() \ No newline at end of file + assert (check["file_id"] == check["expected_file_id"]).all() From 11e4eba48c4a8ce83f9ac9ff453f92b8aae4fca3 Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Fri, 24 May 2024 03:45:27 +0000 Subject: [PATCH 34/34] Update get_client() according to latest version; Update log path for map_bucket section Signed-off-by: Nicole Luo --- .../single_gpu_tutorial.ipynb | 798 +++++++++--------- 1 file changed, 389 insertions(+), 409 deletions(-) diff --git 
a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 3ceecad2c..0653279b8 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "e12a5ec6", + "id": "9bd01afc", "metadata": {}, "source": [ "# Nemo Curator Pipeline Example\n", @@ -27,7 +27,7 @@ }, { "cell_type": "markdown", - "id": "58d062aa", + "id": "7b1808ea", "metadata": {}, "source": [ "## About this notebook\n", @@ -48,13 +48,13 @@ "2. Classifier filtering\n", "3. Downstream-task decontamination\n", "4. Distributed data classification with PyTorch models\n", - "5. Personal identifiable information (PII) redaction\n", + "5. Personal identifiable information (PII) redaction \n", "\n" ] }, { "cell_type": "markdown", - "id": "a6e3492e", + "id": "78537bd7", "metadata": {}, "source": [ "## Prerequisites\n", @@ -83,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "01d4c35a", + "id": "062b5423", "metadata": {}, "source": [ "## 0. Env Setup" @@ -92,7 +92,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "8778a517", + "id": "8add9bbd", "metadata": {}, "outputs": [ { @@ -100,11 +100,10 @@ "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", - "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (2.0.0)\n", + "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (4.0.0)\n", + "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines) (23.2.0)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n" + "\u001b[0m" ] } ], @@ -114,8 +113,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "41d75988", + "execution_count": 2, + "id": "9940c70d", "metadata": {}, "outputs": [], "source": [ @@ -143,8 +142,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "0150b7e7", + "execution_count": 3, + "id": "fd8a381d", "metadata": {}, "outputs": [], "source": [ @@ -172,15 +171,15 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "3d7e6547", + "execution_count": 4, + "id": "589ff257", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/nluo_data/NeMo-Curator/tutorials/single_node_tutorial\n" + "/work_dir/tutorials/single_node_tutorial\n" ] } ], @@ -192,7 +191,7 @@ }, { "cell_type": "markdown", - "id": "cf0aea31", + "id": "662d505f", "metadata": {}, "source": [ "## 1. Download\n", @@ -233,8 +232,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "f41df88e", + "execution_count": 5, + "id": "adb59379", "metadata": {}, "outputs": [], "source": [ @@ -243,7 +242,7 @@ }, { "cell_type": "markdown", - "id": "b0f2d6d9", + "id": "9b56f12a", "metadata": {}, "source": [ " Start a CPU based Dask cluster. 
Please modify `n_workers` and `memory_limit` according to your hardware specification. To process TH wikipedia data, it's advised to have `memory_limit` greater than 12GB" @@ -252,7 +251,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "8742c111", + "id": "e822b5ac", "metadata": {}, "outputs": [], "source": [ @@ -262,7 +261,7 @@ }, { "cell_type": "markdown", - "id": "f910ae71", + "id": "e90cc8b1", "metadata": {}, "source": [ "Define parameters" @@ -271,7 +270,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "c55bcfa8", + "id": "9a03b463", "metadata": {}, "outputs": [], "source": [ @@ -287,7 +286,7 @@ }, { "cell_type": "markdown", - "id": "b11fdf43", + "id": "f41734a1", "metadata": {}, "source": [ "Download TH wikipedia data" @@ -296,7 +295,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ff615514", + "id": "a45965a7", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +307,7 @@ }, { "cell_type": "markdown", - "id": "ff7ae4c0", + "id": "22b7d5b3", "metadata": {}, "source": [ "Verify result" @@ -317,7 +316,7 @@ { "cell_type": "code", "execution_count": 26, - "id": "98564093", + "id": "45a69041", "metadata": {}, "outputs": [ { @@ -337,7 +336,7 @@ { "cell_type": "code", "execution_count": 27, - "id": "ded3510b", + "id": "53bdccfd", "metadata": {}, "outputs": [ { @@ -355,7 +354,7 @@ }, { "cell_type": "markdown", - "id": "79b4a804", + "id": "c5f58643", "metadata": {}, "source": [ "**[Optional]**Close the Dask cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again." @@ -364,7 +363,7 @@ { "cell_type": "code", "execution_count": 28, - "id": "f1e8f645", + "id": "0669a830", "metadata": {}, "outputs": [], "source": [ @@ -374,7 +373,7 @@ }, { "cell_type": "markdown", - "id": "4db3267a", + "id": "43334988", "metadata": {}, "source": [ "## 2.Language separation and unicode fixing" @@ -382,7 +381,7 @@ }, { "cell_type": "markdown", - "id": "228e3978", + "id": "86ccdc1f", "metadata": {}, "source": [ "In this section, we will be using a language classification model by fasttext to separate the TH wikipedia dataset based on the document major languages, and we will also fix the unicode in the documents. Detailed steps are:\n", @@ -398,7 +397,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "bd5d6920", + "id": "1e9198e8", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +408,7 @@ }, { "cell_type": "markdown", - "id": "bd2923bb", + "id": "76e46d2a", "metadata": {}, "source": [ "**[Optional]** Start a cpu based Dask cluster." 
@@ -418,7 +417,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "4375c02b", + "id": "da3aed8a", "metadata": {}, "outputs": [], "source": [ @@ -428,7 +427,7 @@ }, { "cell_type": "markdown", - "id": "2f834de0", + "id": "4a72479c", "metadata": {}, "source": [ "Define parameters" @@ -436,8 +435,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "3b3856c6", + "execution_count": 7, + "id": "13b9d2b1", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +461,7 @@ }, { "cell_type": "markdown", - "id": "3b6f887f", + "id": "8df0322a", "metadata": {}, "source": [ "Download fasttext model" @@ -471,7 +470,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "218c955e", + "id": "2666727d", "metadata": {}, "outputs": [ { @@ -498,7 +497,7 @@ }, { "cell_type": "markdown", - "id": "c410253e", + "id": "58452516", "metadata": {}, "source": [ "Apply fasttext model to separate documents by their languages" @@ -507,7 +506,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "c9afe965", + "id": "d8b8c491", "metadata": {}, "outputs": [ { @@ -554,7 +553,7 @@ }, { "cell_type": "markdown", - "id": "31917e7b", + "id": "d443a5d1", "metadata": {}, "source": [ "Load `UnicodeReformatter` to reformat any unicode appeared in the desired language dataset" @@ -563,7 +562,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "55da5f12", + "id": "272a5f67", "metadata": {}, "outputs": [ { @@ -594,7 +593,7 @@ }, { "cell_type": "markdown", - "id": "bc214e82", + "id": "9bd57a53", "metadata": {}, "source": [ "Verify the result. We can see that some documents has been removed from TH wikipedia dataset since the number of lines in this output file is less than the original file (no. of lines = 162164)" @@ -603,7 +602,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "6b6eb634", + "id": "e3329c83", "metadata": {}, "outputs": [ { @@ -622,7 +621,7 @@ }, { "cell_type": "markdown", - "id": "57e22770", + "id": "0b6cbc26", "metadata": {}, "source": [ "Furthur verify by loading documents that has been identified as other language, such as 'EN'. We can see from output that the removed document is indeed in English and contains very little or even no Thai." @@ -631,7 +630,7 @@ { "cell_type": "code", "execution_count": 38, - "id": "79e32205", + "id": "050d944c", "metadata": {}, "outputs": [ { @@ -649,7 +648,7 @@ }, { "cell_type": "markdown", - "id": "39020971", + "id": "7d17f010", "metadata": {}, "source": [ "**[Optional]** Close the Dask cluster." @@ -658,7 +657,7 @@ { "cell_type": "code", "execution_count": 37, - "id": "64da23ec", + "id": "7e64cc35", "metadata": {}, "outputs": [], "source": [ @@ -668,7 +667,7 @@ }, { "cell_type": "markdown", - "id": "6134eaf3", + "id": "1d46cece", "metadata": {}, "source": [ "## 3.Add ID\n", @@ -681,7 +680,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "5bed2e25", + "id": "5f788b91", "metadata": {}, "outputs": [], "source": [ @@ -690,7 +689,7 @@ }, { "cell_type": "markdown", - "id": "be1c546b", + "id": "cd17be33", "metadata": {}, "source": [ "**[Optional]** If there is no running Dask cluster, start CPU based Dask cluster." 
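As a mental model for what this step produces, the sketch below shows the id scheme on a toy DataFrame. It is illustrative only (the actual `AddId` module runs on Dask and handles partitioning); the `TH_wiki` prefix and the 10-digit zero-padded counter match the ids seen later in this notebook, e.g. `TH_wiki-0000000001`.

```python
import pandas as pd

def add_sequential_id(df: pd.DataFrame, prefix: str = "TH_wiki", start: int = 0) -> pd.DataFrame:
    # Stable, human-readable ids let the dedup stages refer to documents unambiguously.
    out = df.reset_index(drop=True).copy()
    out["id"] = [f"{prefix}-{start + i:010d}" for i in range(len(out))]
    return out

corpus = pd.DataFrame({"text": ["เอกสารแรก", "เอกสารที่สอง"]})
print(add_sequential_id(corpus)["id"].tolist())
# ['TH_wiki-0000000000', 'TH_wiki-0000000001']
```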
@@ -699,7 +698,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "3a6349d9", + "id": "5ba1d54a", "metadata": {}, "outputs": [], "source": [ @@ -709,7 +708,7 @@ }, { "cell_type": "markdown", - "id": "503bfa4c", + "id": "12f59d5e", "metadata": {}, "source": [ "Define relevant parameters" @@ -717,8 +716,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "a14c6ba3", + "execution_count": 8, + "id": "843eba7f", "metadata": {}, "outputs": [], "source": [ @@ -734,7 +733,7 @@ }, { "cell_type": "markdown", - "id": "b249dcf9", + "id": "e7a8307c", "metadata": {}, "source": [ "Adding ID to dataset" @@ -743,7 +742,7 @@ { "cell_type": "code", "execution_count": 17, - "id": "d12bb962", + "id": "b7a91bf1", "metadata": {}, "outputs": [ { @@ -773,7 +772,7 @@ }, { "cell_type": "markdown", - "id": "ce2934df", + "id": "e92b5dab", "metadata": {}, "source": [ "Verify the result. From the output, we can see that the `id` value has been changed to `TH_wiki-0000000000` " @@ -782,7 +781,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "cd51cd14", + "id": "e585cedd", "metadata": {}, "outputs": [ { @@ -800,7 +799,7 @@ }, { "cell_type": "markdown", - "id": "f249ab8b", + "id": "0cbddf6e", "metadata": {}, "source": [ "Close Dask cluster. This cell needs to be run as we are starting a new GPU Dask cluster in the following task" @@ -809,7 +808,7 @@ { "cell_type": "code", "execution_count": 20, - "id": "62336143", + "id": "4daa1f2a", "metadata": {}, "outputs": [], "source": [ @@ -819,7 +818,7 @@ }, { "cell_type": "markdown", - "id": "d6fb16b1", + "id": "1baf027e", "metadata": {}, "source": [ "## 4.Exact Dedplication\n", @@ -835,8 +834,8 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "044f7eee", + "execution_count": 7, + "id": "3f7ba34c", "metadata": {}, "outputs": [], "source": [ @@ -845,41 +844,16 @@ }, { "cell_type": "markdown", - "id": "6e5da88e", + "id": "e268cfca", "metadata": {}, "source": [ - "Start a GPU based Dask cluster. Since GPU based Dask cluster involves setting several arguments, we will use the `get_client()` wrapper function to quickly set up. Please make sure the `device` in `args` is `gpu`" + "Start a GPU based Dask cluster. Since GPU based Dask cluster involves setting several arguments, we will use the `get_client()` wrapper function to quickly set up. 
" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "e4d6920d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Namespace(scheduler_address=None, scheduler_file=None, n_workers=20, threads_per_worker=1, rmm_pool_size=None, protocol='tcp', nvlink_only=False, files_per_partition=2, num_files=None, device='gpu', set_torch_to_use_rmm=False)" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sys.argv=['','--device','gpu']\n", - "parser = argparse.ArgumentParser()\n", - "args = attach_args(parser).parse_args()\n", - "args.set_torch_to_use_rmm = False\n", - "args" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "717b6cef", + "execution_count": 9, + "id": "4b73e5f9", "metadata": {}, "outputs": [ { @@ -892,23 +866,44 @@ { "data": { "text/plain": [ - "{'tcp://127.0.0.1:42505': None}" + "{'tcp://127.0.0.1:36179': None}" ] }, - "execution_count": 23, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "client = get_client(args, args.device)\n", + "client = get_client(cluster_type = 'gpu', set_torch_to_use_rmm=False)\n", "print(f\"Number of dask worker:{get_num_workers(client)}\")\n", "client.run(pre_imports)" ] }, { "cell_type": "markdown", - "id": "f267e161", + "id": "0fc99440", + "metadata": {}, + "source": [ + "If you encounter the following error\n", + "`get_client() missing 1 required positional argument: 'args'`:\n", + "\n", + "This is probably because the `nemo_curator` library is not updated to the newer version. Please run the following line in the terminal, following instruction in our [GitHub](https://github.com/nicoleeeluo/NeMo-Curator/tree/main) repo, and restart the notebook. Intermediate result of the previous section has been saved to local, you can start from this section after updating." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a590c78a", + "metadata": {}, + "outputs": [], + "source": [ + "#pip install --extra-index-url https://pypi.nvidia.com \".[cuda12x]\"" + ] + }, + { + "cell_type": "markdown", + "id": "0151abe0", "metadata": {}, "source": [ "Define parameters" @@ -916,8 +911,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "id": "d01e2f08", + "execution_count": 10, + "id": "54b627a4", "metadata": {}, "outputs": [], "source": [ @@ -936,8 +931,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "id": "6395ffde", + "execution_count": 11, + "id": "6ede2e41", "metadata": {}, "outputs": [], "source": [ @@ -947,7 +942,7 @@ }, { "cell_type": "markdown", - "id": "a654a16e", + "id": "1882204a", "metadata": {}, "source": [ "Apply exact deduplication" @@ -955,8 +950,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "id": "a5e0117c", + "execution_count": 12, + "id": "dfaaa765", "metadata": {}, "outputs": [ { @@ -970,7 +965,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/exact_dedup.py:158: UserWarning: Output path f/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/exact_dedup/data/_exact_duplicates.parquet already exists and will be overwritten\n", + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/exact_dedup.py:158: UserWarning: Output path f/work_dir/tutorials/single_node_tutorial/workspace/exact_dedup/data/_exact_duplicates.parquet already exists and will be overwritten\n", " warnings.warn(\n" ] }, @@ -979,7 +974,7 @@ "output_type": "stream", "text": [ "Number of exact duplicated file:53\n", - "Time taken for exact duplicate:1.9629592895507812\n" + "Time taken for exact duplicate:1.9788782596588135\n" ] } ], @@ -1005,7 +1000,7 @@ }, { "cell_type": "markdown", - "id": "7f8bdb88", + "id": "e68f0399", "metadata": {}, "source": [ "Verify the output duplicated ID. We can group by the `_hashes` to get the list of duplicated documents having the same _hashes and use `extract_lines_with_id()` to verify that those documents are indeed exact duplicates. 
Please note that the `id` might changes, therefore, please replace the `target_list` when necessary" @@ -1013,8 +1008,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "id": "e045d65a", + "execution_count": 15, + "id": "28d8bb0b", "metadata": {}, "outputs": [ { @@ -1052,27 +1047,27 @@ " \n", " \n", " 0\n", - " TH_wiki-0000021096\n", - " 1708cb56ec582f78716f0864dca9382d\n", + " TH_wiki-0000122055\n", + " 3e6e96a80410d5a191d098f464e66f86\n", " \n", " \n", " 1\n", - " TH_wiki-0000021100\n", - " 1708cb56ec582f78716f0864dca9382d\n", + " TH_wiki-0000105191\n", + " e77a248506ef16737288fae5759db33a\n", " \n", " \n", " 2\n", - " TH_wiki-0000067251\n", - " edf8af427a33ed94150899970f39770f\n", + " TH_wiki-0000105192\n", + " 2e386f5c3af70f43874618988d4842b2\n", " \n", " \n", " 3\n", - " TH_wiki-0000105191\n", - " e77a248506ef16737288fae5759db33a\n", + " TH_wiki-0000105193\n", + " 2e386f5c3af70f43874618988d4842b2\n", " \n", " \n", " 4\n", - " TH_wiki-0000105192\n", + " TH_wiki-0000105194\n", " 2e386f5c3af70f43874618988d4842b2\n", " \n", " \n", @@ -1081,14 +1076,14 @@ ], "text/plain": [ " id _hashes\n", - "0 TH_wiki-0000021096 1708cb56ec582f78716f0864dca9382d\n", - "1 TH_wiki-0000021100 1708cb56ec582f78716f0864dca9382d\n", - "2 TH_wiki-0000067251 edf8af427a33ed94150899970f39770f\n", - "3 TH_wiki-0000105191 e77a248506ef16737288fae5759db33a\n", - "4 TH_wiki-0000105192 2e386f5c3af70f43874618988d4842b2" + "0 TH_wiki-0000122055 3e6e96a80410d5a191d098f464e66f86\n", + "1 TH_wiki-0000105191 e77a248506ef16737288fae5759db33a\n", + "2 TH_wiki-0000105192 2e386f5c3af70f43874618988d4842b2\n", + "3 TH_wiki-0000105193 2e386f5c3af70f43874618988d4842b2\n", + "4 TH_wiki-0000105194 2e386f5c3af70f43874618988d4842b2" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1101,8 +1096,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "id": "8754887e", + "execution_count": 16, + "id": "fca41870", "metadata": {}, "outputs": [ { @@ -1144,7 +1139,7 @@ " \n", " 2\n", " 1708cb56ec582f78716f0864dca9382d\n", - " TH_wiki-0000021096 TH_wiki-0000021100 TH_wiki-...\n", + " TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-...\n", " \n", " \n", " 3\n", @@ -1171,12 +1166,12 @@ " id \n", "0 TH_wiki-0000157216 TH_wiki-0000066307 \n", "1 TH_wiki-0000074714 TH_wiki-0000074715 TH_wiki-... \n", - "2 TH_wiki-0000021096 TH_wiki-0000021100 TH_wiki-... \n", + "2 TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-... \n", "3 TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-... \n", "4 TH_wiki-0000122055 TH_wiki-0000116550 " ] }, - "execution_count": 28, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1187,8 +1182,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "id": "13712668", + "execution_count": 17, + "id": "8c9624ac", "metadata": {}, "outputs": [ { @@ -1208,7 +1203,7 @@ }, { "cell_type": "markdown", - "id": "7a388445", + "id": "4013203c", "metadata": {}, "source": [ "**[Optional]** You might choose to close Dask cluster here" @@ -1216,8 +1211,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "id": "7875bf12", + "execution_count": 13, + "id": "5ef2f05e", "metadata": {}, "outputs": [], "source": [ @@ -1227,7 +1222,7 @@ }, { "cell_type": "markdown", - "id": "20502f76", + "id": "7a2feadc", "metadata": {}, "source": [ "## 5. 
Fuzzy Deduplication\n", @@ -1259,7 +1254,7 @@ }, { "cell_type": "markdown", - "id": "de98daed", + "id": "ffca14ad", "metadata": {}, "source": [ "**If there is not running Dask cluster, start a GPU Dask cluster here**" @@ -1267,35 +1262,19 @@ }, { "cell_type": "code", - "execution_count": 60, - "id": "0a84ae27", + "execution_count": null, + "id": "e00ba2fd", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'tcp://127.0.0.1:43209': None}" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# sys.argv=['','--device','gpu']\n", - "# parser = argparse.ArgumentParser()\n", - "# args = attach_args(parser).parse_args()\n", - "# args.set_torch_to_use_rmm = False\n", - "\n", - "# client = get_client(args, args.device)\n", - "# get_num_workers(client)\n", + "# client = get_client(cluster_type = 'gpu', set_torch_to_use_rmm=False)\n", + "# print(f\"Number of dask worker:{get_num_workers(client)}\")\n", "# client.run(pre_imports)" ] }, { "cell_type": "markdown", - "id": "5de7a035", + "id": "5df73743", "metadata": {}, "source": [ "### 5.1 Minhash\n", @@ -1319,8 +1298,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "id": "bbc84690", + "execution_count": 11, + "id": "1fc5bff3", "metadata": {}, "outputs": [], "source": [ @@ -1329,7 +1308,7 @@ }, { "cell_type": "markdown", - "id": "3b0beafe", + "id": "7bf9cc8d", "metadata": {}, "source": [ "Define parameters" @@ -1337,8 +1316,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "id": "52f056f7", + "execution_count": 12, + "id": "d600d1b8", "metadata": {}, "outputs": [], "source": [ @@ -1366,7 +1345,7 @@ }, { "cell_type": "markdown", - "id": "aaefe7bd", + "id": "1c31ddf4", "metadata": {}, "source": [ "Run MinHash" @@ -1374,15 +1353,15 @@ }, { "cell_type": "code", - "execution_count": 32, - "id": "da632a42", + "execution_count": 13, + "id": "88540950", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Computing minhashes for /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/add_id/cleaned\n", + "Computing minhashes for /work_dir/tutorials/single_node_tutorial/workspace/add_id/cleaned\n", "Reading 1 files\n" ] }, @@ -1390,7 +1369,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:175: UserWarning: Output path /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/fuzzy/minhash/data/_minhashes.parquet already exists and will be overwritten\n", + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:175: UserWarning: Output path /work_dir/tutorials/single_node_tutorial/workspace/fuzzy/minhash/data/_minhashes.parquet already exists and will be overwritten\n", " warnings.warn(\n" ] }, @@ -1398,7 +1377,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken for MinHash:5.899524927139282\n" + "Time taken for MinHash:6.340771198272705\n" ] } ], @@ -1435,7 +1414,7 @@ }, { "cell_type": "markdown", - "id": "9ad4ba59", + "id": "158bf3ab", "metadata": {}, "source": [ "Verify result" @@ -1443,8 +1422,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "id": "93220b5c", + "execution_count": 14, + "id": "10b5eb55", "metadata": {}, "outputs": [ { @@ -1511,7 +1490,7 @@ "4 TH_wiki-0000000004 [1559901, 11771639, 487706, 826569, 1203860, 5..." 
] }, - "execution_count": 33, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1523,7 +1502,7 @@ }, { "cell_type": "markdown", - "id": "b407928e", + "id": "0bce0f80", "metadata": {}, "source": [ "### 5.2 LSH\n", @@ -1544,8 +1523,8 @@ }, { "cell_type": "code", - "execution_count": 34, - "id": "f3801d7a", + "execution_count": 15, + "id": "645b8a53", "metadata": {}, "outputs": [], "source": [ @@ -1556,7 +1535,7 @@ }, { "cell_type": "markdown", - "id": "2a2c178a", + "id": "110db216", "metadata": {}, "source": [ "Define parameters" @@ -1564,8 +1543,8 @@ }, { "cell_type": "code", - "execution_count": 35, - "id": "d52707b9", + "execution_count": 16, + "id": "738ab265", "metadata": {}, "outputs": [], "source": [ @@ -1590,7 +1569,7 @@ }, { "cell_type": "markdown", - "id": "c59b4fe6", + "id": "a5250a2a", "metadata": {}, "source": [ "Run LSH" @@ -1598,15 +1577,15 @@ }, { "cell_type": "code", - "execution_count": 36, - "id": "71c0848f", + "execution_count": 17, + "id": "1ef61e2b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:361: UserWarning: Output path /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/fuzzy/lsh/data/_buckets.parquet already exists and will be overwritten\n", + "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:361: UserWarning: Output path /work_dir/tutorials/single_node_tutorial/workspace/fuzzy/lsh/data/_buckets.parquet already exists and will be overwritten\n", " warnings.warn(\n" ] }, @@ -1614,7 +1593,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken for LSH:18.237318754196167\n" + "Time taken for LSH:19.37230634689331\n" ] } ], @@ -1649,7 +1628,7 @@ }, { "cell_type": "markdown", - "id": "3789c538", + "id": "ad2e3b60", "metadata": {}, "source": [ "Verify result" @@ -1657,8 +1636,8 @@ }, { "cell_type": "code", - "execution_count": 37, - "id": "d8663302", + "execution_count": 18, + "id": "9d0449c6", "metadata": {}, "outputs": [ { @@ -1691,32 +1670,32 @@ " \n", " 0\n", " 1692361878\n", - " 124883\n", - " 38\n", + " 123547\n", + " 210\n", " \n", " \n", " 1\n", " 1692361878\n", - " 123211\n", - " 141\n", + " 93844\n", + " 120\n", " \n", " \n", " 2\n", " 1692361878\n", - " 124885\n", - " 38\n", + " 66564\n", + " 86\n", " \n", " \n", " 3\n", " 1692361878\n", - " 85294\n", - " 345\n", + " 93845\n", + " 120\n", " \n", " \n", " 4\n", " 1692361878\n", - " 124886\n", - " 38\n", + " 66565\n", + " 86\n", " \n", " \n", "\n", @@ -1724,14 +1703,14 @@ ], "text/plain": [ " dataset_id doc_id _bucket_id\n", - "0 1692361878 124883 38\n", - "1 1692361878 123211 141\n", - "2 1692361878 124885 38\n", - "3 1692361878 85294 345\n", - "4 1692361878 124886 38" + "0 1692361878 123547 210\n", + "1 1692361878 93844 120\n", + "2 1692361878 66564 86\n", + "3 1692361878 93845 120\n", + "4 1692361878 66565 86" ] }, - "execution_count": 37, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1743,7 +1722,7 @@ }, { "cell_type": "markdown", - "id": "00f5567b", + "id": "f952f074", "metadata": {}, "source": [ "### 5.3 Jaccard Shuffle\n", @@ -1764,8 +1743,8 @@ }, { "cell_type": "code", - "execution_count": 38, - "id": "c5d458d1", + "execution_count": 19, + "id": "707ea54d", "metadata": {}, "outputs": [], "source": [ @@ -1778,7 +1757,7 @@ }, { "cell_type": "markdown", - "id": "e904bc34", + "id": "8f2e321d", "metadata": {}, "source": [ "Define parameters" @@ -1786,8 +1765,8 @@ }, { 
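Those repeated rows are harmless: before the similarity graph is built, each (`id_x`, `id_y`) pair only needs a single score. The sketch below illustrates that collapse on the rows shown above; the 0.8 cut-off is an illustrative threshold for what counts as a near-duplicate edge, not necessarily the exact value the connected-components step uses.

```python
import pandas as pd

# Rows copied from the verification output above.
pairs = pd.DataFrame({
    "id_x": ["1692361878-136568"] * 4 + ["1692361878-92875"],
    "id_y": ["1692361878-136566"] * 4 + ["1692361878-87743"],
    "jaccard": [0.754448] * 4 + [0.828794],
})

edges = (
    pairs.groupby(["id_x", "id_y"], as_index=False)["jaccard"].max()  # one score per pair
         .query("jaccard >= 0.8")                                     # keep likely duplicates
)
print(f"{len(pairs)} raw rows -> {len(edges)} edge(s) for the similarity graph")
print(edges)
```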
"cell_type": "code", - "execution_count": 39, - "id": "170a44fd", + "execution_count": 25, + "id": "70e2dff9", "metadata": {}, "outputs": [], "source": [ @@ -1799,6 +1778,7 @@ "jaccard_shuffle_base_output_path = os.path.join(data_dir,\"fuzzy/jaccard_shuffle\")\n", "output_anchor_docs_with_bk_path = os.path.join(jaccard_shuffle_base_output_path, \"anchor_docs_with_bk.parquet\")\n", "input_anchor_docs_with_bk_dir = output_anchor_docs_with_bk_path\n", + "jaccard_shuffle_log_path = os.path.join(jaccard_shuffle_base_output_path,\"log\")\n", "output_shuffled_docs_path = os.path.join(jaccard_shuffle_base_output_path, \"shuffled_docs.parquet\")\n", "\n", "#Relevant parameters for _MapBucket()\n", @@ -1814,12 +1794,13 @@ "shuffle_id_fields=[\"dataset_id\", \"doc_id\"]\n", "int_to_str_id='id'\n", "\n", - "!mkdir -p {jaccard_shuffle_base_output_path}" + "!mkdir -p {jaccard_shuffle_base_output_path}\n", + "!mkdir -p {jaccard_shuffle_log_path}" ] }, { "cell_type": "markdown", - "id": "333e91a8", + "id": "d0f19efa", "metadata": {}, "source": [ "Run Jaccard map bucket" @@ -1827,8 +1808,8 @@ }, { "cell_type": "code", - "execution_count": 40, - "id": "67b96227", + "execution_count": 26, + "id": "b2850b0a", "metadata": {}, "outputs": [ { @@ -1837,7 +1818,7 @@ "text": [ "Number of files being read for jaccard calculation = 1\n", "Number of ddf_bk partitions = 1\n", - "Time taken for Bucket Mapping:2.2563915252685547 s\n" + "Time taken for Bucket Mapping:1.239295244216919 s\n" ] } ], @@ -1857,7 +1838,7 @@ "ddf_bk = get_bucket_ddf_from_parquet_path(input_bucket_path=input_bucket_path, num_workers=num_workers)\n", "\n", "#Run _MapBuckets()\n", - "map_buckets = _MapBuckets(id_fields=shuffle_id_fields, bucket_field=input_bucket_field)\n", + "map_buckets = _MapBuckets(id_fields=shuffle_id_fields, bucket_field=input_bucket_field, logger=jaccard_shuffle_log_path)\n", "ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors(documents_df=ddf_text, buckets_df=ddf_bk, shuffle_type=shuffle_type)\n", "\n", "#Write to disk\n", @@ -1868,7 +1849,7 @@ }, { "cell_type": "markdown", - "id": "8f76b8ef", + "id": "a1533a15", "metadata": {}, "source": [ "Verify result" @@ -1876,8 +1857,8 @@ }, { "cell_type": "code", - "execution_count": 41, - "id": "81c9c7c7", + "execution_count": 27, + "id": "d74012c3", "metadata": {}, "outputs": [ { @@ -1914,51 +1895,51 @@ " \n", " 0\n", " 1692361878\n", - " 8895\n", + " 127258\n", " 1692361878\n", - " 8964\n", + " 127781\n", " 1692361878\n", - " 8895\n", + " 126955\n", " 0\n", " \n", " \n", " 1\n", " 1692361878\n", - " 127089\n", + " 85383\n", " 1692361878\n", - " 127220\n", + " 85364\n", " 1692361878\n", - " 127089\n", + " 85374\n", " 0\n", " \n", " \n", " 2\n", " 1692361878\n", - " 127090\n", + " 45030\n", " 1692361878\n", - " 127220\n", + " 85200\n", " 1692361878\n", - " 127089\n", + " 45030\n", " 0\n", " \n", " \n", " 3\n", " 1692361878\n", - " 151728\n", + " 127259\n", " 1692361878\n", - " 151728\n", + " 127781\n", " 1692361878\n", - " 151729\n", + " 126955\n", " 0\n", " \n", " \n", " 4\n", " 1692361878\n", - " 137262\n", + " 127968\n", " 1692361878\n", - " 137301\n", + " 127961\n", " 1692361878\n", - " 137262\n", + " 127996\n", " 0\n", " \n", " \n", @@ -1967,21 +1948,21 @@ ], "text/plain": [ " dataset_id doc_id anchor_1_dataset_id anchor_1_doc_id \\\n", - "0 1692361878 8895 1692361878 8964 \n", - "1 1692361878 127089 1692361878 127220 \n", - "2 1692361878 127090 1692361878 127220 \n", - "3 1692361878 151728 1692361878 151728 \n", - "4 1692361878 137262 1692361878 137301 \n", + "0 
1692361878 127258 1692361878 127781 \n", + "1 1692361878 85383 1692361878 85364 \n", + "2 1692361878 45030 1692361878 85200 \n", + "3 1692361878 127259 1692361878 127781 \n", + "4 1692361878 127968 1692361878 127961 \n", "\n", " anchor_0_dataset_id anchor_0_doc_id _output_partition_id \n", - "0 1692361878 8895 0 \n", - "1 1692361878 127089 0 \n", - "2 1692361878 127089 0 \n", - "3 1692361878 151729 0 \n", - "4 1692361878 137262 0 " + "0 1692361878 126955 0 \n", + "1 1692361878 85374 0 \n", + "2 1692361878 45030 0 \n", + "3 1692361878 126955 0 \n", + "4 1692361878 127996 0 " ] }, - "execution_count": 41, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1993,7 +1974,7 @@ }, { "cell_type": "markdown", - "id": "b4896749", + "id": "1487b1ad", "metadata": {}, "source": [ "**[Optional]** Remove previous Jaccard Shuffle results. Run only when there are files under the Jaccard Shuffle output path" @@ -2001,8 +1982,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "id": "2d4dd55f", + "execution_count": 30, + "id": "b414f703", "metadata": {}, "outputs": [], "source": [ @@ -2011,7 +1992,7 @@ }, { "cell_type": "markdown", - "id": "f9b5ab9e", + "id": "f33a6782", "metadata": {}, "source": [ "Run Jaccard Shuffle" @@ -2019,15 +2000,15 @@ }, { "cell_type": "code", - "execution_count": 44, - "id": "acccb80b", + "execution_count": 31, + "id": "86d1b3e5", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 0%| | 0/1 [00:00\n", " \n", " 0\n", - " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", - " 1894\n", - " 1692361878-127021\n", - " 1692361878-127021\n", - " 1692361878-126958\n", + " การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...\n", + " 1457\n", + " 1692361878-135417\n", + " 1692361878-135463\n", + " 1692361878-135417\n", " \n", " \n", " 1\n", - " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", - " 1894\n", - " 1692361878-127021\n", - " 1692361878-127021\n", - " 1692361878-127017\n", + " การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...\n", + " 1457\n", + " 1692361878-135417\n", + " 1692361878-135392\n", + " 1692361878-135447\n", " \n", " \n", " 2\n", - " ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช...\n", - " 1894\n", - " 1692361878-127021\n", - " 1692361878-126928\n", - " 1692361878-126891\n", + " สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...\n", + " 1262\n", + " 1692361878-83363\n", + " 1692361878-94231\n", + " 1692361878-83363\n", " \n", " \n", " 3\n", - " วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค...\n", - " 423\n", - " 1692361878-87271\n", - " 1692361878-87204\n", - " 1692361878-87271\n", + " สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...\n", + " 1262\n", + " 1692361878-83363\n", + " 1692361878-94905\n", + " 1692361878-83363\n", " \n", " \n", " 4\n", - " วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค...\n", - " 423\n", - " 1692361878-87271\n", - " 1692361878-87267\n", - " 1692361878-87271\n", + " สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...\n", + " 1262\n", + " 1692361878-83363\n", + " 1692361878-94906\n", + " 1692361878-94905\n", " \n", " \n", "\n", @@ -2176,21 +2158,21 @@ ], "text/plain": [ " text _text_bytes \\\n", - "0 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", - "1 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", - "2 ประเทศยูกันดา เข้าร่วมแข่งขันกีฬาโอลิมปิกเยาวช... 1894 \n", - "3 วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค... 423 \n", - "4 วอลเลย์บอลหญิงชิงแชมป์อเมริกาใต้ 1985 () เป็นค... 423 \n", + "0 การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด... 
1457 \n", + "1 การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด... 1457 \n", + "2 สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค... 1262 \n", + "3 สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค... 1262 \n", + "4 สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค... 1262 \n", "\n", " id anchor_0_id anchor_1_id \n", - "0 1692361878-127021 1692361878-127021 1692361878-126958 \n", - "1 1692361878-127021 1692361878-127021 1692361878-127017 \n", - "2 1692361878-127021 1692361878-126928 1692361878-126891 \n", - "3 1692361878-87271 1692361878-87204 1692361878-87271 \n", - "4 1692361878-87271 1692361878-87267 1692361878-87271 " + "0 1692361878-135417 1692361878-135463 1692361878-135417 \n", + "1 1692361878-135417 1692361878-135392 1692361878-135447 \n", + "2 1692361878-83363 1692361878-94231 1692361878-83363 \n", + "3 1692361878-83363 1692361878-94905 1692361878-83363 \n", + "4 1692361878-83363 1692361878-94906 1692361878-94905 " ] }, - "execution_count": 45, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -2202,7 +2184,7 @@ }, { "cell_type": "markdown", - "id": "1a23a5c0", + "id": "b8644e51", "metadata": {}, "source": [ "### 5.4 Jaccard Compute\n", @@ -2217,8 +2199,8 @@ }, { "cell_type": "code", - "execution_count": 46, - "id": "6cfa08ea", + "execution_count": 33, + "id": "b1a532a2", "metadata": {}, "outputs": [], "source": [ @@ -2227,7 +2209,7 @@ }, { "cell_type": "markdown", - "id": "389f305b", + "id": "c9e65975", "metadata": {}, "source": [ "Define parameters" @@ -2235,8 +2217,8 @@ }, { "cell_type": "code", - "execution_count": 47, - "id": "c142a42a", + "execution_count": 34, + "id": "291d3aaa", "metadata": {}, "outputs": [], "source": [ @@ -2258,7 +2240,7 @@ }, { "cell_type": "markdown", - "id": "7a0f610f", + "id": "9341b58c", "metadata": {}, "source": [ "Run Jaccard Compute" @@ -2266,8 +2248,8 @@ }, { "cell_type": "code", - "execution_count": 48, - "id": "8ceae838", + "execution_count": 35, + "id": "9b1b9bdd", "metadata": {}, "outputs": [ { @@ -2275,7 +2257,7 @@ "output_type": "stream", "text": [ "Running jaccard compute script\n", - "Time taken for Jaccard Computing: 0.5923423767089844\n" + "Time taken for Jaccard Computing: 0.735356330871582\n" ] } ], @@ -2303,7 +2285,7 @@ }, { "cell_type": "markdown", - "id": "ae06ad56", + "id": "bb740d30", "metadata": {}, "source": [ "Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected as a pair of similar documents is likely to share numerous same buckets." 
@@ -2311,8 +2293,8 @@ }, { "cell_type": "code", - "execution_count": 49, - "id": "686eb956", + "execution_count": 36, + "id": "a41d1f09", "metadata": {}, "outputs": [ { @@ -2344,33 +2326,33 @@ " \n", " \n", " 0\n", - " 1692361878-49094\n", - " 1692361878-49078\n", - " 0.784000\n", + " 1692361878-136568\n", + " 1692361878-136566\n", + " 0.754448\n", " \n", " \n", " 1\n", - " 1692361878-49094\n", - " 1692361878-49078\n", - " 0.784000\n", + " 1692361878-136568\n", + " 1692361878-136566\n", + " 0.754448\n", " \n", " \n", " 2\n", - " 1692361878-49094\n", - " 1692361878-49078\n", - " 0.784000\n", + " 1692361878-136568\n", + " 1692361878-136566\n", + " 0.754448\n", " \n", " \n", " 3\n", - " 1692361878-49094\n", - " 1692361878-49078\n", - " 0.784000\n", + " 1692361878-136568\n", + " 1692361878-136566\n", + " 0.754448\n", " \n", " \n", " 4\n", - " 1692361878-161128\n", - " 1692361878-161122\n", - " 0.890339\n", + " 1692361878-92875\n", + " 1692361878-87743\n", + " 0.828794\n", " \n", " \n", "\n", @@ -2378,14 +2360,14 @@ ], "text/plain": [ " id_x id_y jaccard\n", - "0 1692361878-49094 1692361878-49078 0.784000\n", - "1 1692361878-49094 1692361878-49078 0.784000\n", - "2 1692361878-49094 1692361878-49078 0.784000\n", - "3 1692361878-49094 1692361878-49078 0.784000\n", - "4 1692361878-161128 1692361878-161122 0.890339" + "0 1692361878-136568 1692361878-136566 0.754448\n", + "1 1692361878-136568 1692361878-136566 0.754448\n", + "2 1692361878-136568 1692361878-136566 0.754448\n", + "3 1692361878-136568 1692361878-136566 0.754448\n", + "4 1692361878-92875 1692361878-87743 0.828794" ] }, - "execution_count": 49, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2397,7 +2379,7 @@ }, { "cell_type": "markdown", - "id": "63911051", + "id": "a505402e", "metadata": {}, "source": [ "### 5.5 Connected Components\n", @@ -2412,8 +2394,8 @@ }, { "cell_type": "code", - "execution_count": 50, - "id": "5eae08f1", + "execution_count": 37, + "id": "3bff521b", "metadata": {}, "outputs": [], "source": [ @@ -2422,7 +2404,7 @@ }, { "cell_type": "markdown", - "id": "ed713696", + "id": "d8afed6a", "metadata": {}, "source": [ "Define parameters" @@ -2430,8 +2412,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "id": "a0881f12", + "execution_count": 38, + "id": "b40735dd", "metadata": {}, "outputs": [], "source": [ @@ -2452,7 +2434,7 @@ }, { "cell_type": "markdown", - "id": "4fba31d2", + "id": "33d8957f", "metadata": {}, "source": [ "Run Connected Component" @@ -2460,25 +2442,23 @@ }, { "cell_type": "code", - "execution_count": 52, - "id": "da4a8d4e", + "execution_count": 39, + "id": "fe62dd51", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "batch_id = 0/1, time = 0.26957249641418457\n", + "batch_id = 0/1, time = 0.29015278816223145\n", "# of groups 5465\n", "# of docs removed 3079\n", "assert num_nodes:8544==labels_df:8544 passed\n", - "Time taken for Connected Component: 4.331223726272583 s\n" + "Time taken for Connected Component: 4.489336729049683 s\n" ] } ], "source": [ - "#client.run(enable_spilling)\n", - "\n", "t0 = time.time()\n", " \n", "components_stage = ConnectedComponents(\n", @@ -2496,7 +2476,7 @@ }, { "cell_type": "markdown", - "id": "24b55482", + "id": "669495ee", "metadata": {}, "source": [ "Verify the result of `Connected Components`" @@ -2504,8 +2484,8 @@ }, { "cell_type": "code", - "execution_count": 53, - "id": "ecd69e7e", + "execution_count": 40, + "id": "efbd6973", "metadata": {}, "outputs": [ { @@ -2538,32 +2518,32 @@ " \n", " 
0\n", " 1692361878\n", - " 139585\n", - " 1936\n", + " 122282\n", + " 903\n", " \n", " \n", " 1\n", " 1692361878\n", - " 8059\n", - " 5312\n", + " 139772\n", + " 1952\n", " \n", " \n", " 2\n", " 1692361878\n", - " 93474\n", - " 5313\n", + " 93927\n", + " 112\n", " \n", " \n", " 3\n", " 1692361878\n", - " 127790\n", - " 2774\n", + " 121450\n", + " 2046\n", " \n", " \n", " 4\n", " 1692361878\n", - " 49650\n", - " 1425\n", + " 85288\n", + " 3030\n", " \n", " \n", "\n", @@ -2571,14 +2551,14 @@ ], "text/plain": [ " dataset_id doc_id group\n", - "0 1692361878 139585 1936\n", - "1 1692361878 8059 5312\n", - "2 1692361878 93474 5313\n", - "3 1692361878 127790 2774\n", - "4 1692361878 49650 1425" + "0 1692361878 122282 903\n", + "1 1692361878 139772 1952\n", + "2 1692361878 93927 112\n", + "3 1692361878 121450 2046\n", + "4 1692361878 85288 3030" ] }, - "execution_count": 53, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -2590,7 +2570,7 @@ }, { "cell_type": "markdown", - "id": "44834e54", + "id": "0c3e2bdc", "metadata": {}, "source": [ "Let's check if the output fuzzy duplicated documents within the same group are similar. Please note that the `group` id in your output might be different from the notebook output." @@ -2599,7 +2579,7 @@ { "cell_type": "code", "execution_count": 54, - "id": "6c404c89", + "id": "d8fa1e8e", "metadata": {}, "outputs": [ { @@ -2717,7 +2697,7 @@ }, { "cell_type": "markdown", - "id": "b4cd941d", + "id": "f34b8140", "metadata": {}, "source": [ "Change the `group` number if necessary. By running the code below, we can obtain a list of near duplicated documents." @@ -2726,7 +2706,7 @@ { "cell_type": "code", "execution_count": 55, - "id": "09b3fd0b", + "id": "fd01f5fe", "metadata": {}, "outputs": [ { @@ -2810,7 +2790,7 @@ }, { "cell_type": "markdown", - "id": "8b0de04f", + "id": "99a8d732", "metadata": {}, "source": [ "Print the text of near duplicated document. Please replace the `id` if necessary, `id` should be in the format of `_`" @@ -2819,7 +2799,7 @@ { "cell_type": "code", "execution_count": 73, - "id": "fbf88107", + "id": "68883f58", "metadata": {}, "outputs": [ { @@ -2841,7 +2821,7 @@ }, { "cell_type": "markdown", - "id": "fd33ac1d", + "id": "3b6578b4", "metadata": {}, "source": [ "Below is the English translation of the output above. 
We can see that the two documents are indeed very similar to each other.\n", @@ -2902,7 +2882,7 @@ }, { "cell_type": "markdown", - "id": "68cfec8a", + "id": "f36436f3", "metadata": {}, "source": [ "### 5.6 Fuzzy deduplication wrapper" @@ -2911,7 +2891,7 @@ { "cell_type": "code", "execution_count": 56, - "id": "fe7de030", + "id": "eb52ec06", "metadata": {}, "outputs": [], "source": [ @@ -2921,7 +2901,7 @@ { "cell_type": "code", "execution_count": 57, - "id": "fe8794b8", + "id": "625c1828", "metadata": {}, "outputs": [], "source": [ @@ -2948,7 +2928,7 @@ }, { "cell_type": "markdown", - "id": "0aa0b60c", + "id": "cb76d8e5", "metadata": {}, "source": [ "**[Optional]** If the cache folder is not empty, please CLEAR the folder before proceeding" @@ -2957,7 +2937,7 @@ { "cell_type": "code", "execution_count": 59, - "id": "83705eaa", + "id": "e7fb4c4c", "metadata": {}, "outputs": [], "source": [ @@ -2967,7 +2947,7 @@ { "cell_type": "code", "execution_count": 60, - "id": "72494e54", + "id": "2368443f", "metadata": {}, "outputs": [ { @@ -3072,7 +3052,7 @@ { "cell_type": "code", "execution_count": 61, - "id": "00a8530a", + "id": "14bfe3bc", "metadata": {}, "outputs": [ { @@ -3151,7 +3131,7 @@ }, { "cell_type": "markdown", - "id": "b9dfbdde", + "id": "d2726cf9", "metadata": {}, "source": [ "## 6. Remove duplicates\n", @@ -3161,7 +3141,7 @@ }, { "cell_type": "markdown", - "id": "bb722fd2", + "id": "e4dd78db", "metadata": {}, "source": [ "Define parameters" @@ -3170,7 +3150,7 @@ { "cell_type": "code", "execution_count": 81, - "id": "5a4b97b7", + "id": "0027c8d2", "metadata": {}, "outputs": [], "source": [ @@ -3189,7 +3169,7 @@ }, { "cell_type": "markdown", - "id": "d3962deb", + "id": "a373860d", "metadata": {}, "source": [ "We will first process the result of exact deduplication. Since result of exact deduplication contains original ID used in input dataset, it is more straightforward to deal with." @@ -3198,7 +3178,7 @@ { "cell_type": "code", "execution_count": 82, - "id": "a29d720d", + "id": "f59e92c3", "metadata": {}, "outputs": [ { @@ -3228,7 +3208,7 @@ }, { "cell_type": "markdown", - "id": "b4c1c057", + "id": "f55d6737", "metadata": {}, "source": [ "For result of fuzzy deduplication, we need to first reconstructed document ID by combining `dataset_id` and `doc_id`, then use the reconstructed `ID` for removal" @@ -3236,7 +3216,7 @@ }, { "cell_type": "markdown", - "id": "1c0f1ee6", + "id": "3b9c122d", "metadata": {}, "source": [ "**[Optional]** Uncomment the cell to use result from step by step fuzzy deduplication" @@ -3245,7 +3225,7 @@ { "cell_type": "code", "execution_count": 83, - "id": "1ff911ad", + "id": "c6a1bb0a", "metadata": {}, "outputs": [], "source": [ @@ -3274,7 +3254,7 @@ { "cell_type": "code", "execution_count": 84, - "id": "2220d8fc", + "id": "746d3673", "metadata": {}, "outputs": [], "source": [ @@ -3288,7 +3268,7 @@ { "cell_type": "code", "execution_count": 85, - "id": "08143e1e", + "id": "62b34838", "metadata": {}, "outputs": [], "source": [ @@ -3301,7 +3281,7 @@ }, { "cell_type": "markdown", - "id": "a5008578", + "id": "edfa52ce", "metadata": {}, "source": [ "Verify the result of duplicate removal. 
We can see that the number of document in resultant document is less than the original dataset (length = 161748)" @@ -3310,7 +3290,7 @@ { "cell_type": "code", "execution_count": 86, - "id": "a692c916", + "id": "78eee9b3", "metadata": {}, "outputs": [ { @@ -3328,7 +3308,7 @@ }, { "cell_type": "markdown", - "id": "24440f5f", + "id": "15e07a32", "metadata": {}, "source": [ "Close the GPU Dask Cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again." @@ -3337,7 +3317,7 @@ { "cell_type": "code", "execution_count": 88, - "id": "656a24f2", + "id": "8e807bd7", "metadata": {}, "outputs": [], "source": [ @@ -3347,7 +3327,7 @@ }, { "cell_type": "markdown", - "id": "3a00f6ea", + "id": "a416a293", "metadata": {}, "source": [ "## 7. Heuristic Fitlering\n", @@ -3370,7 +3350,7 @@ { "cell_type": "code", "execution_count": 89, - "id": "41f7cdf4", + "id": "b988ad1e", "metadata": {}, "outputs": [], "source": [ @@ -3381,7 +3361,7 @@ }, { "cell_type": "markdown", - "id": "f5ed694b", + "id": "097a1b48", "metadata": {}, "source": [ "**[Optional]** The following cell is to remove warning from dask." @@ -3390,7 +3370,7 @@ { "cell_type": "code", "execution_count": 90, - "id": "39aab4d9", + "id": "44552288", "metadata": {}, "outputs": [], "source": [ @@ -3402,7 +3382,7 @@ }, { "cell_type": "markdown", - "id": "3c196329", + "id": "9a59699d", "metadata": {}, "source": [ "Create a CPU Dask Cluster." @@ -3411,7 +3391,7 @@ { "cell_type": "code", "execution_count": 91, - "id": "1ffd3928", + "id": "b8f80ab3", "metadata": {}, "outputs": [], "source": [ @@ -3421,7 +3401,7 @@ }, { "cell_type": "markdown", - "id": "4a514d3c", + "id": "a7702918", "metadata": {}, "source": [ "Define some helper functions" @@ -3430,7 +3410,7 @@ { "cell_type": "code", "execution_count": 92, - "id": "d4aaccc4", + "id": "6f2e7523", "metadata": {}, "outputs": [], "source": [ @@ -3461,7 +3441,7 @@ }, { "cell_type": "markdown", - "id": "5d43a755", + "id": "227fa8b0", "metadata": {}, "source": [ "Define parameters" @@ -3470,7 +3450,7 @@ { "cell_type": "code", "execution_count": 93, - "id": "4f8b0336", + "id": "a894f90f", "metadata": {}, "outputs": [], "source": [ @@ -3500,7 +3480,7 @@ }, { "cell_type": "markdown", - "id": "bddd9dd9", + "id": "ccea406e", "metadata": {}, "source": [ "Run heuristic filtering" @@ -3509,7 +3489,7 @@ { "cell_type": "code", "execution_count": 94, - "id": "1df83255", + "id": "03b3da27", "metadata": {}, "outputs": [ { @@ -3609,7 +3589,7 @@ }, { "cell_type": "markdown", - "id": "0fab7ee5", + "id": "a53b04e9", "metadata": {}, "source": [ "Verify the result." @@ -3618,7 +3598,7 @@ { "cell_type": "code", "execution_count": 95, - "id": "65160254", + "id": "07475373", "metadata": {}, "outputs": [ { @@ -3756,7 +3736,7 @@ }, { "cell_type": "markdown", - "id": "412bd6d2", + "id": "24e8b173", "metadata": {}, "source": [ "Close the CPU Dask Cluster" @@ -3765,7 +3745,7 @@ { "cell_type": "code", "execution_count": 96, - "id": "e6129857", + "id": "12508f5e", "metadata": {}, "outputs": [], "source": [ @@ -3776,7 +3756,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4679d955", + "id": "83e4aed1", "metadata": {}, "outputs": [], "source": []
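+  {
+   "cell_type": "markdown",
+   "id": "dedup-removal-sketch-md",
+   "metadata": {},
+   "source": [
+    "To make the removal step of Section 6 concrete, the next cell is a toy pandas sketch: it reconstructs each document id by joining `dataset_id` and `doc_id` with a hyphen, keeps one representative per duplicate `group` from the connected-components output, and drops the rest from the corpus.\n",
+    "The tiny dataframes and the choice to keep the first member of each group are illustrative assumptions only; they are not the notebook's exact removal code."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dedup-removal-sketch-code",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Toy stand-in for the connected-components result (dataset_id, doc_id, group).\n",
+    "cc = pd.DataFrame({\n",
+    "    'dataset_id': ['1692361878'] * 4,\n",
+    "    'doc_id': ['101', '102', '103', '104'],\n",
+    "    'group': [7, 7, 7, 42],  # docs 101-103 form one near-duplicate group\n",
+    "})\n",
+    "\n",
+    "# Reconstruct the id used by the original dataset: dataset_id-doc_id.\n",
+    "cc['id'] = cc['dataset_id'] + '-' + cc['doc_id']\n",
+    "\n",
+    "# Keep one representative per group; the remaining ids are flagged for removal.\n",
+    "dup_ids = cc.loc[cc.duplicated(subset='group', keep='first'), 'id']\n",
+    "\n",
+    "# Toy corpus; in the notebook this is the full Dask dataframe of documents.\n",
+    "corpus = pd.DataFrame({\n",
+    "    'id': ['1692361878-101', '1692361878-102', '1692361878-103', '1692361878-104'],\n",
+    "    'text': ['doc a', 'doc a (near copy)', 'doc a (near copy)', 'doc b'],\n",
+    "})\n",
+    "deduped = corpus[~corpus['id'].isin(dup_ids)]\n",
+    "print(len(corpus), '->', len(deduped))  # 4 -> 2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "heuristic-cascade-sketch-md",
+   "metadata": {},
+   "source": [
+    "Similarly, the cell below is a simplified, pure-Python sketch of the idea behind the heuristic filter cascade of Section 7: each filter scores a document, and the document is kept only if every score passes its threshold.\n",
+    "The two filters and their thresholds (symbol-to-word ratio at most 0.1, at least 50 words) are crude illustrative assumptions, not NeMo Curator's filter implementations; they only show the control flow of a filter chain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "heuristic-cascade-sketch-code",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Conceptual sketch of a heuristic filter cascade (illustrative only).\n",
+    "\n",
+    "def symbol_to_word_ratio(text):\n",
+    "    # Rough ratio of non-alphanumeric, non-space characters to words.\n",
+    "    words = max(len(text.split()), 1)\n",
+    "    symbols = sum(not ch.isalnum() and not ch.isspace() for ch in text)\n",
+    "    return symbols / words\n",
+    "\n",
+    "def word_count(text):\n",
+    "    return len(text.split())\n",
+    "\n",
+    "# (score function, acceptance test) pairs, applied in order.\n",
+    "FILTERS = [\n",
+    "    (symbol_to_word_ratio, lambda score: score <= 0.1),  # drop symbol-heavy docs\n",
+    "    (word_count, lambda score: score >= 50),             # drop very short docs\n",
+    "]\n",
+    "\n",
+    "def keep_document(text):\n",
+    "    return all(accept(score_fn(text)) for score_fn, accept in FILTERS)\n",
+    "\n",
+    "docs = ['@@ ### $$$ %%', 'word ' * 80]\n",
+    "print([keep_document(d) for d in docs])  # expected: [False, True]"
+   ]
+  },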