From da45ba2866288df10d601907637db077ecfe92a6 Mon Sep 17 00:00:00 2001
From: Nicole Luo <nluo@nvidia.com>
Date: Thu, 15 Aug 2024 01:21:15 +0000
Subject: [PATCH] Fix bug #183 #185

Signed-off-by: Nicole Luo <nluo@nvidia.com>
---
 .../single_gpu_tutorial.ipynb                 | 2142 ++++-------------
 1 file changed, 447 insertions(+), 1695 deletions(-)
 mode change 100755 => 100644 tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
old mode 100755
new mode 100644
index 36dc4f84e..c285f0a43
--- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
+++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
@@ -78,7 +78,7 @@
     "    Password: <Your NGC Key>\n",
     "- Get NeMo NeMo Framework Training Container\n",
     "    ```bash\n",
-    "    docker pull docker pull nvcr.io/nvidia/nemo:dev.framework\n"
+    "    docker pull nvcr.io/nvidia/nemo:dev\n"
    ]
   },
   {
@@ -91,34 +91,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "8add9bbd",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
-      "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (4.0.0)\n",
-      "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines) (23.2.0)\n",
-      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
-      "\u001b[0m"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "!pip install jsonlines"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
+   "id": "97079227-d9c3-40d2-b939-64b221b86990",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "%env DASK_DATAFRAME__QUERY_PLANNING False\n",
+    "%env CUDA_VISIBLE_DEVICES 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "id": "9940c70d",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "import argparse\n",
+    "import os\n",
     "\n",
     "from nemo_curator.utils.distributed_utils import get_client,get_num_workers\n",
     "from nemo_curator.utils.script_utils import ArgumentHelper\n",
@@ -126,7 +132,6 @@
     "from nemo_curator.utils.distributed_utils import read_data,write_to_disk\n",
     "from nemo_curator.datasets import DocumentDataset\n",
     "\n",
-    "import os\n",
     "import sys\n",
     "import pandas as pd\n",
     "import time\n",
@@ -135,16 +140,16 @@
     "import dask\n",
     "import numpy as np\n",
     "from dask.distributed import Client, LocalCluster\n",
-    "import jsonlines\n",
-    "\n",
-    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
+    "import jsonlines\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "fd8a381d",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "def pre_imports():\n",
@@ -166,23 +171,23 @@
     "    with jsonlines.open(file_path) as reader:\n",
     "        for obj in reader:\n",
     "            if obj.get('id') in target_list:\n",
-    "                yield obj"
+    "                yield obj\n",
+    "\n",
+    "def get_base_dataset_file_name(download_folder):\n",
+    "    files = os.listdir(download_folder)\n",
+    "    for file in files:\n",
+    "        if file.startswith('thwiki') and file.endswith('.jsonl'):\n",
+    "            return file"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "589ff257",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/work_dir/tutorials/single_node_tutorial\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "cur_dir = os.getcwd()\n",
     "print(cur_dir)\n",
@@ -232,9 +237,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "adb59379",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from nemo_curator.download import download_wikipedia"
@@ -250,9 +257,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "e822b5ac",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n",
@@ -269,9 +278,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "9a03b463",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Output\n",
@@ -279,7 +290,6 @@
     "download_output_directory = os.path.join(download_base_directory,\"data\")\n",
     "\n",
     "#Relevant parameters\n",
-    "dump_date = \"20240201\"\n",
     "language = 'th'\n",
     "url_limit = 1"
    ]
@@ -296,12 +306,13 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "a45965a7",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "res = download_wikipedia(download_output_directory,\n",
     "                   language=language, \n",
-    "                   dump_date=dump_date,\n",
     "                   url_limit=url_limit).df.compute()"
    ]
   },
@@ -310,59 +321,56 @@
    "id": "22b7d5b3",
    "metadata": {},
    "source": [
-    "Verify result"
+    "**[Optional]** Verify result"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
    "id": "45a69041",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "downloads  thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n",
-      "162164 /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/wiki_downloads/data/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "! ls {download_output_directory}\n",
-    "! wc -l  {download_output_directory}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl"
+    "# List all the files in the output directory.\n",
+    "# ! ls {download_output_directory}\n",
+    "\n",
+    "# Please replace your dataset file name accordingly.\n",
+    "# ! wc -l  {download_output_directory}/{YOUR DATASET FILE NAME}.jsonl"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
    "id": "53bdccfd",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"id\":\"1\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\",\"language\":\"th\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\"}\n",
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "check_jsonl_file(download_output_directory)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "024d7a31-20a3-4d2a-9b89-2f85638fa0da",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!rm -r {download_output_directory}/downloads"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "c5f58643",
    "metadata": {},
    "source": [
-    "**[Optional]**Close the Dask cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again."
+    "**[Optional]** Close the Dask cluster. You might encounter an error such as `Caught signal 11`. It's OK, just rerun the cell again."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": null,
    "id": "0669a830",
    "metadata": {},
    "outputs": [],
@@ -396,9 +404,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "1e9198e8",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from nemo_curator import ScoreFilter,Modify\n",
@@ -416,9 +426,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "id": "da3aed8a",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "# cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n",
@@ -435,13 +447,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "13b9d2b1",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "# Input path\n",
-    "multilingual_data_path = f\"{download_output_directory}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\"\n",
+    "multilingual_data_path = download_output_directory\n",
     "\n",
     "# Output path\n",
     "language_base_output_path = os.path.join(data_dir,\"language_sep\")\n",
@@ -469,28 +483,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "id": "2666727d",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "--2024-05-17 03:17:09--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin\n",
-      "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 99.84.238.181, 99.84.238.154, 99.84.238.162, ...\n",
-      "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|99.84.238.181|:443... connected.\n",
-      "HTTP request sent, awaiting response... 200 OK\n",
-      "Length: 131266198 (125M) [application/octet-stream]\n",
-      "Saving to: ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin.1’\n",
-      "\n",
-      "lid.176.bin.1       100%[===================>] 125.18M   184MB/s    in 0.7s    \n",
-      "\n",
-      "2024-05-17 03:17:10 (184 MB/s) - ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin.1’ saved [131266198/131266198]\n",
-      "\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P {model_path}"
    ]
@@ -505,32 +503,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "d8b8c491",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Reading 1 files\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Time taken for splitting language:140.04064464569092\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "t0 = time.time()\n",
     "\n",
@@ -561,20 +539,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "id": "272a5f67",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Reading 1 files\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Time taken for fixing unicode:437.4811737537384\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "t0 = time.time()\n",
     "\n",
@@ -596,27 +566,23 @@
    "id": "9bd57a53",
    "metadata": {},
    "source": [
-    "Verify the result. We can see that some documents has been removed from TH wikipedia dataset since the number of lines in this output file is less than the original file (no. of lines = 162164)"
+    "**[Optional]** Verify the result. We can see that some documents have been removed from the TH Wikipedia dataset, since the number of lines in this output file is less than in the original file."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "id": "e3329c83",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n",
-      "161748 /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/data/cleaned/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "! ls {lang_sep_cleaned_data_output_path}\n",
-    "! wc -l  {lang_sep_cleaned_data_output_path}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl"
+    "# List all the files in the output directory.\n",
+    "# ! ls {lang_sep_cleaned_data_output_path}\n",
+    "\n",
+    "# Please replace your dataset file name accordingly.\n",
+    "# ! wc -l  {lang_sep_cleaned_data_output_path}/{YOUR DATASET FILE NAME}.jsonl"
    ]
   },
   {
@@ -629,19 +595,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": null,
    "id": "050d944c",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\",\"id\":\"1\",\"language\":\"TH\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\"}\n",
-      "\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "check_jsonl_file(os.path.join(language_separated_output_path,'EN'))"
    ]
@@ -656,9 +615,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": null,
    "id": "7e64cc35",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "# client.cluster.close()\n",
@@ -679,9 +640,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "id": "5f788b91",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from nemo_curator import AddId"
@@ -697,9 +660,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "id": "5ba1d54a",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "# cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n",
@@ -716,9 +681,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "id": "843eba7f",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Input\n",
@@ -741,20 +708,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "id": "b7a91bf1",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Reading 1 files\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Time taken for add ID:47.33783745765686\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "t0 = time.time()\n",
     "# Read input files\n",
@@ -780,19 +739,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "id": "e585cedd",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\",\"id\":\"TH_wiki-0000000000\",\"language\":\"TH\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\"}\n",
-      "\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "check_jsonl_file(added_id_output_path)"
    ]
@@ -807,9 +759,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
    "id": "4daa1f2a",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "client.cluster.close()\n",
@@ -834,9 +788,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "3f7ba34c",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from nemo_curator.modules import ExactDuplicates"
@@ -852,28 +808,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "id": "4b73e5f9",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Number of dask worker:1\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "{'tcp://127.0.0.1:36179': None}"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "client = get_client(cluster_type = 'gpu', set_torch_to_use_rmm=False)\n",
     "print(f\"Number of dask worker:{get_num_workers(client)}\")\n",
@@ -893,9 +833,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "id": "a590c78a",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#pip install --extra-index-url https://pypi.nvidia.com \".[cuda12x]\""
@@ -911,9 +853,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "id": "54b627a4",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Input\n",
@@ -931,9 +875,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "6ede2e41",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "!mkdir -p {exact_dedup_log_dir}\n",
@@ -950,34 +896,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "id": "dfaaa765",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Reading 1 files\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/exact_dedup.py:158: UserWarning: Output path f/work_dir/tutorials/single_node_tutorial/workspace/exact_dedup/data/_exact_duplicates.parquet already exists and will be overwritten\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Number of exact duplicated file:53\n",
-      "Time taken for exact duplicate:1.9788782596588135\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "t0 = time.time()\n",
     "# Read input dataset\n",
@@ -1003,91 +927,17 @@
    "id": "e68f0399",
    "metadata": {},
    "source": [
-    "Verify the output duplicated ID. We can group by the `_hashes` to get the list of duplicated documents having the same _hashes and use `extract_lines_with_id()` to verify that those documents are indeed exact duplicates. Please note that the `id` might changes, therefore, please replace the `target_list` when necessary"
+    "**[Optional]** Verify the output duplicated IDs. We can group by `_hashes` to get the list of duplicated documents sharing the same `_hashes` value and use `extract_lines_with_id()` to verify that those documents are indeed exact duplicates. Please note that the `id` values might change; therefore, please replace the `target_list` when necessary."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "id": "28d8bb0b",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Number of exact duplicated document:53\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>_hashes</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>TH_wiki-0000122055</td>\n",
-       "      <td>3e6e96a80410d5a191d098f464e66f86</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>TH_wiki-0000105191</td>\n",
-       "      <td>e77a248506ef16737288fae5759db33a</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>TH_wiki-0000105192</td>\n",
-       "      <td>2e386f5c3af70f43874618988d4842b2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>TH_wiki-0000105193</td>\n",
-       "      <td>2e386f5c3af70f43874618988d4842b2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>TH_wiki-0000105194</td>\n",
-       "      <td>2e386f5c3af70f43874618988d4842b2</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                   id                           _hashes\n",
-       "0  TH_wiki-0000122055  3e6e96a80410d5a191d098f464e66f86\n",
-       "1  TH_wiki-0000105191  e77a248506ef16737288fae5759db33a\n",
-       "2  TH_wiki-0000105192  2e386f5c3af70f43874618988d4842b2\n",
-       "3  TH_wiki-0000105193  2e386f5c3af70f43874618988d4842b2\n",
-       "4  TH_wiki-0000105194  2e386f5c3af70f43874618988d4842b2"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "exact_dedup_res = pd.read_parquet(os.path.join(exact_dedup_output_dir,\"_exact_duplicates.parquet\"))\n",
     "print(f\"Number of exact duplicated document:{len(exact_dedup_res)}\")\n",
@@ -1096,109 +946,36 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "id": "fca41870",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>_hashes</th>\n",
-       "      <th>id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0b908a91cdf0544c1ef3015cff4ee07e</td>\n",
-       "      <td>TH_wiki-0000157216 TH_wiki-0000066307</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>15f35c239b6579b4642f7656e64576ac</td>\n",
-       "      <td>TH_wiki-0000074714 TH_wiki-0000074715 TH_wiki-...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1708cb56ec582f78716f0864dca9382d</td>\n",
-       "      <td>TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>2e386f5c3af70f43874618988d4842b2</td>\n",
-       "      <td>TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>3e6e96a80410d5a191d098f464e66f86</td>\n",
-       "      <td>TH_wiki-0000122055 TH_wiki-0000116550</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                            _hashes  \\\n",
-       "0  0b908a91cdf0544c1ef3015cff4ee07e   \n",
-       "1  15f35c239b6579b4642f7656e64576ac   \n",
-       "2  1708cb56ec582f78716f0864dca9382d   \n",
-       "3  2e386f5c3af70f43874618988d4842b2   \n",
-       "4  3e6e96a80410d5a191d098f464e66f86   \n",
-       "\n",
-       "                                                  id  \n",
-       "0              TH_wiki-0000157216 TH_wiki-0000066307  \n",
-       "1  TH_wiki-0000074714 TH_wiki-0000074715 TH_wiki-...  \n",
-       "2  TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-...  \n",
-       "3  TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-...  \n",
-       "4              TH_wiki-0000122055 TH_wiki-0000116550  "
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "exact_dedup_res.groupby('_hashes')['id'].agg(lambda x: ' '.join(x)).reset_index().head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "597d04a4-0b82-43f9-9f61-61729e768ab1",
+   "metadata": {},
+   "source": [
+    "Using the duplicated IDs shown above, check the contents to see whether they are exact duplicates."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "id": "8c9624ac",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'filename': 'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl', 'id': 'TH_wiki-0000066307', 'language': 'TH', 'source_id': 'thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2', 'text': '\\n\\nแหล่งข้อมูลอื่น \\n\\nสงขลา\\n \\nรายชื่อเกี่ยวกับจังหวัดสงขลา', 'title': 'รายชื่อโบราณสถานในจังหวัดสงขลา', 'url': 'https://th.wikipedia.org/wiki/%E0%B8%A3%E0%B8%B2%E0%B8%A2%E0%B8%8A%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B9%82%E0%B8%9A%E0%B8%A3%E0%B8%B2%E0%B8%93%E0%B8%AA%E0%B8%96%E0%B8%B2%E0%B8%99%E0%B9%83%E0%B8%99%E0%B8%88%E0%B8%B1%E0%B8%87%E0%B8%AB%E0%B8%A7%E0%B8%B1%E0%B8%94%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2'}\n",
-      "{'filename': 'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl', 'id': 'TH_wiki-0000157216', 'language': 'TH', 'source_id': 'thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2', 'text': '\\n\\nแหล่งข้อมูลอื่น \\n\\nสงขลา\\n \\nรายชื่อเกี่ยวกับจังหวัดสงขลา', 'title': 'รายชื่อโบราณสถานในจังหวัดสงขลา (อำเภอเมืองสงขลาและสิงหนคร)', 'url': 'https://th.wikipedia.org/wiki/%E0%B8%A3%E0%B8%B2%E0%B8%A2%E0%B8%8A%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B9%82%E0%B8%9A%E0%B8%A3%E0%B8%B2%E0%B8%93%E0%B8%AA%E0%B8%96%E0%B8%B2%E0%B8%99%E0%B9%83%E0%B8%99%E0%B8%88%E0%B8%B1%E0%B8%87%E0%B8%AB%E0%B8%A7%E0%B8%B1%E0%B8%94%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2%20%28%E0%B8%AD%E0%B8%B3%E0%B9%80%E0%B8%A0%E0%B8%AD%E0%B9%80%E0%B8%A1%E0%B8%B7%E0%B8%AD%E0%B8%87%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2%E0%B9%81%E0%B8%A5%E0%B8%B0%E0%B8%AA%E0%B8%B4%E0%B8%87%E0%B8%AB%E0%B8%99%E0%B8%84%E0%B8%A3%29'}\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "target_list = ['TH_wiki-0000157216', 'TH_wiki-0000066307']\n",
-    "for line in extract_lines_with_id(os.path.join(exact_dedup_input_dataset_dir,'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl'),target_list):\n",
-    "    print(line)"
+    "# target_list = ['TH_wiki-0000157216', 'TH_wiki-0000066307']\n",
+    "# for line in extract_lines_with_id(os.path.join(exact_dedup_input_dataset_dir,'{YOUR DATASET FILE NAME}'),target_list):\n",
+    "#     print(line)"
    ]
   },
   {
@@ -1211,9 +988,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "id": "5ef2f05e",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "# client.cluster.close()\n",
@@ -1298,9 +1077,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "1fc5bff3",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from nemo_curator import MinHash"
@@ -1316,9 +1097,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "id": "d600d1b8",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Input\n",
@@ -1353,34 +1136,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "id": "88540950",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Computing minhashes for /work_dir/tutorials/single_node_tutorial/workspace/add_id/cleaned\n",
-      "Reading 1 files\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:175: UserWarning: Output path /work_dir/tutorials/single_node_tutorial/workspace/fuzzy/minhash/data/_minhashes.parquet already exists and will be overwritten\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Time taken for MinHash:6.340771198272705\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "t0 = time.time()\n",
     "print(f\"Computing minhashes for {minhash_data_path}\")\n",
@@ -1417,87 +1178,20 @@
    "id": "158bf3ab",
    "metadata": {},
    "source": [
-    "Verify result"
+    "**[Optional]** Verify result"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "id": "10b5eb55",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>_minhash_signature</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>TH_wiki-0000000000</td>\n",
-       "      <td>[11565725, 19782487, 9831980, 5480992, 2306475...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>TH_wiki-0000000001</td>\n",
-       "      <td>[407876, 107572, 824528, 346831, 216554, 10963...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>TH_wiki-0000000002</td>\n",
-       "      <td>[727721, 694551, 233868, 346831, 216554, 77001...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>TH_wiki-0000000003</td>\n",
-       "      <td>[1149282, 931656, 2515604, 1428622, 4964646, 4...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>TH_wiki-0000000004</td>\n",
-       "      <td>[1559901, 11771639, 487706, 826569, 1203860, 5...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                   id                                 _minhash_signature\n",
-       "0  TH_wiki-0000000000  [11565725, 19782487, 9831980, 5480992, 2306475...\n",
-       "1  TH_wiki-0000000001  [407876, 107572, 824528, 346831, 216554, 10963...\n",
-       "2  TH_wiki-0000000002  [727721, 694551, 233868, 346831, 216554, 77001...\n",
-       "3  TH_wiki-0000000003  [1149282, 931656, 2515604, 1428622, 4964646, 4...\n",
-       "4  TH_wiki-0000000004  [1559901, 11771639, 487706, 826569, 1203860, 5..."
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "minhash_res = pd.read_parquet(os.path.join(minshah_output_dir, \"_minhashes.parquet\"))\n",
-    "minhash_res.head()"
+    "# minhash_res = pd.read_parquet(os.path.join(minshah_output_dir, \"_minhashes.parquet\"))\n",
+    "# minhash_res.head()"
    ]
   },
   {
@@ -1523,9 +1217,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "id": "645b8a53",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from nemo_curator import LSH\n",
@@ -1543,9 +1239,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "id": "738ab265",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Input\n",
@@ -1577,26 +1275,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "id": "1ef61e2b",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:361: UserWarning: Output path /work_dir/tutorials/single_node_tutorial/workspace/fuzzy/lsh/data/_buckets.parquet already exists and will be overwritten\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Time taken for LSH:19.37230634689331\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "t0 = time.time()\n",
     "\n",
@@ -1631,93 +1315,20 @@
    "id": "ad2e3b60",
    "metadata": {},
    "source": [
-    "Verify result"
+    "**[Optional]** Verify result"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "id": "9d0449c6",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>dataset_id</th>\n",
-       "      <th>doc_id</th>\n",
-       "      <th>_bucket_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>123547</td>\n",
-       "      <td>210</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>93844</td>\n",
-       "      <td>120</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>66564</td>\n",
-       "      <td>86</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>93845</td>\n",
-       "      <td>120</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>66565</td>\n",
-       "      <td>86</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   dataset_id  doc_id  _bucket_id\n",
-       "0  1692361878  123547         210\n",
-       "1  1692361878   93844         120\n",
-       "2  1692361878   66564          86\n",
-       "3  1692361878   93845         120\n",
-       "4  1692361878   66565          86"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "lsh_res = pd.read_parquet(os.path.join(lsh_output_dir, \"_buckets.parquet\"))\n",
-    "lsh_res.head()"
+    "# lsh_res = pd.read_parquet(os.path.join(lsh_output_dir, \"_buckets.parquet\"))\n",
+    "# lsh_res.head()"
    ]
   },
   {
@@ -1743,9 +1354,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
    "id": "707ea54d",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from nemo_curator.utils.fuzzy_dedup_utils.io_utils import (\n",
@@ -1765,9 +1378,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
    "id": "70e2dff9",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Input\n",
@@ -1808,20 +1423,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
    "id": "b2850b0a",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Number of files being read for jaccard calculation = 1\n",
-      "Number of ddf_bk partitions = 1\n",
-      "Time taken for Bucket Mapping:1.239295244216919 s\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "t0 = time.time()\n",
     "num_workers = get_num_workers(client)\n",
@@ -1852,124 +1459,20 @@
    "id": "a1533a15",
    "metadata": {},
    "source": [
-    "Verify result"
+    "**[Optional]** Verify result"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
    "id": "d74012c3",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>dataset_id</th>\n",
-       "      <th>doc_id</th>\n",
-       "      <th>anchor_1_dataset_id</th>\n",
-       "      <th>anchor_1_doc_id</th>\n",
-       "      <th>anchor_0_dataset_id</th>\n",
-       "      <th>anchor_0_doc_id</th>\n",
-       "      <th>_output_partition_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>127258</td>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>127781</td>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>126955</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>85383</td>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>85364</td>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>85374</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>45030</td>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>85200</td>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>45030</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>127259</td>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>127781</td>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>126955</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>127968</td>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>127961</td>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>127996</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   dataset_id  doc_id  anchor_1_dataset_id  anchor_1_doc_id  \\\n",
-       "0  1692361878  127258           1692361878           127781   \n",
-       "1  1692361878   85383           1692361878            85364   \n",
-       "2  1692361878   45030           1692361878            85200   \n",
-       "3  1692361878  127259           1692361878           127781   \n",
-       "4  1692361878  127968           1692361878           127961   \n",
-       "\n",
-       "   anchor_0_dataset_id  anchor_0_doc_id  _output_partition_id  \n",
-       "0           1692361878           126955                     0  \n",
-       "1           1692361878            85374                     0  \n",
-       "2           1692361878            45030                     0  \n",
-       "3           1692361878           126955                     0  \n",
-       "4           1692361878           127996                     0  "
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "map_bucket_res = pd.read_parquet(output_anchor_docs_with_bk_path)\n",
-    "map_bucket_res.head()"
+    "# map_bucket_res = pd.read_parquet(output_anchor_docs_with_bk_path)\n",
+    "# map_bucket_res.head()"
    ]
   },
   {
@@ -1982,7 +1485,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": null,
    "id": "b414f703",
    "metadata": {},
    "outputs": [],
@@ -2000,52 +1503,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": null,
    "id": "86d1b3e5",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "  0%|                                                                                                                                                                                                                                                                                | 0/1 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Started processing bucket-map partitions 0 through 1 of 1\n",
-      "Using 1 text partitions.\n",
-      "Starting text bytes aware shuffle\n",
-      "Will write 30596 rows to disk\n",
-      "Text-df partition  1/1 completed in 2.4342942237854004\n",
-      "Bucket partition  1/1 completed in 2.4410006999969482\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.45s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Time taken for Jaccard Shuffle = 2.4802186489105225 s\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "t0 = time.time()\n",
     "\n",
@@ -2074,112 +1537,20 @@
    "id": "86b06cb5",
    "metadata": {},
    "source": [
-    "Verify result"
+    "**[Optional]** Verify result"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 32,
-   "id": "1b51a5fb",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>text</th>\n",
-       "      <th>_text_bytes</th>\n",
-       "      <th>id</th>\n",
-       "      <th>anchor_0_id</th>\n",
-       "      <th>anchor_1_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...</td>\n",
-       "      <td>1457</td>\n",
-       "      <td>1692361878-135417</td>\n",
-       "      <td>1692361878-135463</td>\n",
-       "      <td>1692361878-135417</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...</td>\n",
-       "      <td>1457</td>\n",
-       "      <td>1692361878-135417</td>\n",
-       "      <td>1692361878-135392</td>\n",
-       "      <td>1692361878-135447</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...</td>\n",
-       "      <td>1262</td>\n",
-       "      <td>1692361878-83363</td>\n",
-       "      <td>1692361878-94231</td>\n",
-       "      <td>1692361878-83363</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...</td>\n",
-       "      <td>1262</td>\n",
-       "      <td>1692361878-83363</td>\n",
-       "      <td>1692361878-94905</td>\n",
-       "      <td>1692361878-83363</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...</td>\n",
-       "      <td>1262</td>\n",
-       "      <td>1692361878-83363</td>\n",
-       "      <td>1692361878-94906</td>\n",
-       "      <td>1692361878-94905</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                text  _text_bytes  \\\n",
-       "0  การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...         1457   \n",
-       "1  การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...         1457   \n",
-       "2  สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...         1262   \n",
-       "3  สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...         1262   \n",
-       "4  สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...         1262   \n",
-       "\n",
-       "                  id        anchor_0_id        anchor_1_id  \n",
-       "0  1692361878-135417  1692361878-135463  1692361878-135417  \n",
-       "1  1692361878-135417  1692361878-135392  1692361878-135447  \n",
-       "2   1692361878-83363   1692361878-94231   1692361878-83363  \n",
-       "3   1692361878-83363   1692361878-94905   1692361878-83363  \n",
-       "4   1692361878-83363   1692361878-94906   1692361878-94905  "
-      ]
-     },
-     "execution_count": 32,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b51a5fb",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "jaccard_shuffle_res = pd.read_parquet(os.path.join(output_shuffled_docs_path,\"_output_partition_id=0/batch_1_1.parquet\"))\n",
-    "jaccard_shuffle_res.head()"
+    "# jaccard_shuffle_res = pd.read_parquet(os.path.join(output_shuffled_docs_path,\"_output_partition_id=0/batch_1_1.parquet\"))\n",
+    "# jaccard_shuffle_res.head()"
    ]
   },
   {
@@ -2199,9 +1570,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": null,
    "id": "b1a532a2",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity"
@@ -2217,9 +1590,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": null,
    "id": "291d3aaa",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Input\n",
@@ -2248,19 +1623,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": null,
    "id": "9b1b9bdd",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Running jaccard compute script\n",
-      "Time taken for Jaccard Computing: 0.735356330871582\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "# enable_spilling()\n",
     "# client.run(enable_spilling)\n",
@@ -2288,93 +1656,20 @@
    "id": "bb740d30",
    "metadata": {},
    "source": [
-    "Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected as a pair of similar documents is likely to share numerous same buckets."
+    "**[Optional]** Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected, as a pair of similar documents is likely to share many of the same buckets."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": null,
    "id": "a41d1f09",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id_x</th>\n",
-       "      <th>id_y</th>\n",
-       "      <th>jaccard</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1692361878-136568</td>\n",
-       "      <td>1692361878-136566</td>\n",
-       "      <td>0.754448</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1692361878-136568</td>\n",
-       "      <td>1692361878-136566</td>\n",
-       "      <td>0.754448</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1692361878-136568</td>\n",
-       "      <td>1692361878-136566</td>\n",
-       "      <td>0.754448</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>1692361878-136568</td>\n",
-       "      <td>1692361878-136566</td>\n",
-       "      <td>0.754448</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>1692361878-92875</td>\n",
-       "      <td>1692361878-87743</td>\n",
-       "      <td>0.828794</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                id_x               id_y   jaccard\n",
-       "0  1692361878-136568  1692361878-136566  0.754448\n",
-       "1  1692361878-136568  1692361878-136566  0.754448\n",
-       "2  1692361878-136568  1692361878-136566  0.754448\n",
-       "3  1692361878-136568  1692361878-136566  0.754448\n",
-       "4   1692361878-92875   1692361878-87743  0.828794"
-      ]
-     },
-     "execution_count": 36,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "jaccard_compute_res = pd.read_parquet(jaccard_compute_output_results_path)\n",
-    "jaccard_compute_res.head()"
+    "# jaccard_compute_res = pd.read_parquet(jaccard_compute_output_results_path)\n",
+    "# jaccard_compute_res.head()"
    ]
   },
   {
@@ -2394,9 +1689,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": null,
    "id": "3bff521b",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from nemo_curator.modules.fuzzy_dedup import ConnectedComponents"
@@ -2412,9 +1709,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": null,
    "id": "b40735dd",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Input\n",
@@ -2442,22 +1741,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": null,
    "id": "fe62dd51",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "batch_id = 0/1, time = 0.29015278816223145\n",
-      "# of groups 5465\n",
-      "# of docs removed 3079\n",
-      "assert num_nodes:8544==labels_df:8544 passed\n",
-      "Time taken for Connected Component: 4.489336729049683 s\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "t0 = time.time()\n",
     "    \n",
@@ -2479,93 +1768,20 @@
    "id": "669495ee",
    "metadata": {},
    "source": [
-    "Verify the result of `Connected Components`"
+    "**[Optional]** Run the following cells to verify the result of `Connected Components`"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": null,
    "id": "efbd6973",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>dataset_id</th>\n",
-       "      <th>doc_id</th>\n",
-       "      <th>group</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>122282</td>\n",
-       "      <td>903</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>139772</td>\n",
-       "      <td>1952</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>93927</td>\n",
-       "      <td>112</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>121450</td>\n",
-       "      <td>2046</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>85288</td>\n",
-       "      <td>3030</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   dataset_id  doc_id  group\n",
-       "0  1692361878  122282    903\n",
-       "1  1692361878  139772   1952\n",
-       "2  1692361878   93927    112\n",
-       "3  1692361878  121450   2046\n",
-       "4  1692361878   85288   3030"
-      ]
-     },
-     "execution_count": 40,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "cc_compute_res = pd.read_parquet(connected_component_output_path)\n",
-    "cc_compute_res.head()"
+    "# cc_compute_res = pd.read_parquet(connected_component_output_path)\n",
+    "# cc_compute_res.head()"
    ]
   },
   {
@@ -2578,121 +1794,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": null,
    "id": "d8fa1e8e",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>group</th>\n",
-       "      <th>doc_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>75</td>\n",
-       "      <td>160982, 161038, 161124, 161109, 161121, 160991...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>112</td>\n",
-       "      <td>122007, 122124, 122020, 122282, 122010, 122134...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>151</td>\n",
-       "      <td>134584, 135030, 134908, 134891, 135029, 135020...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>321</td>\n",
-       "      <td>94082, 94114, 94126, 94057, 94121, 94132, 9411...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>339</td>\n",
-       "      <td>116230, 116237, 116223, 116236, 116176, 116204...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5460</th>\n",
-       "      <td>8539</td>\n",
-       "      <td>120646</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5461</th>\n",
-       "      <td>8540</td>\n",
-       "      <td>158174</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5462</th>\n",
-       "      <td>8541</td>\n",
-       "      <td>132405</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5463</th>\n",
-       "      <td>8542</td>\n",
-       "      <td>49199</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5464</th>\n",
-       "      <td>8543</td>\n",
-       "      <td>160924</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>5465 rows × 2 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "      group                                             doc_id\n",
-       "0        75  160982, 161038, 161124, 161109, 161121, 160991...\n",
-       "1       112  122007, 122124, 122020, 122282, 122010, 122134...\n",
-       "2       151  134584, 135030, 134908, 134891, 135029, 135020...\n",
-       "3       321  94082, 94114, 94126, 94057, 94121, 94132, 9411...\n",
-       "4       339  116230, 116237, 116223, 116236, 116176, 116204...\n",
-       "...     ...                                                ...\n",
-       "5460   8539                                             120646\n",
-       "5461   8540                                             158174\n",
-       "5462   8541                                             132405\n",
-       "5463   8542                                              49199\n",
-       "5464   8543                                             160924\n",
-       "\n",
-       "[5465 rows x 2 columns]"
-      ]
-     },
-     "execution_count": 54,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "cc_compute_res['doc_id'] = cc_compute_res['doc_id'].astype(str)\n",
-    "cc_compute_res.groupby('group')['doc_id'].agg(lambda x: ', '.join(x)).reset_index()"
+    "# cc_compute_res['doc_id'] = cc_compute_res['doc_id'].astype(str)\n",
+    "# cc_compute_res.groupby('group')['doc_id'].agg(lambda x: ', '.join(x)).reset_index()"
    ]
   },
   {
@@ -2705,87 +1815,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": null,
    "id": "fd01f5fe",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>dataset_id</th>\n",
-       "      <th>doc_id</th>\n",
-       "      <th>group</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>420</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>122007</td>\n",
-       "      <td>112</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>425</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>122124</td>\n",
-       "      <td>112</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>689</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>122020</td>\n",
-       "      <td>112</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>764</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>122282</td>\n",
-       "      <td>112</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>952</th>\n",
-       "      <td>1692361878</td>\n",
-       "      <td>122010</td>\n",
-       "      <td>112</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     dataset_id  doc_id  group\n",
-       "420  1692361878  122007    112\n",
-       "425  1692361878  122124    112\n",
-       "689  1692361878  122020    112\n",
-       "764  1692361878  122282    112\n",
-       "952  1692361878  122010    112"
-      ]
-     },
-     "execution_count": 55,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "cc_compute_res[cc_compute_res['group']==112].head()"
+    "# Replace ??? with the group number you want to check\n",
+    "# cc_compute_res[cc_compute_res['group']==???].head()"
    ]
   },
   {
@@ -2816,7 +1854,10 @@
     }
    ],
    "source": [
-    "jaccard_shuffle_res[jaccard_shuffle_res['id'].isin(['1692361878-121545','1692361878-121487'])]['text'].unique()"
+    "# Replace 'ID1' and 'ID2' with IDs you want to check\n",
+    "# The output is an example of fuzzy duplicates \n",
+    "\n",
+    "# jaccard_shuffle_res[jaccard_shuffle_res['id'].isin(['ID1','ID2'])]['text'].unique()"
    ]
   },
   {
@@ -2890,9 +1931,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": null,
    "id": "eb52ec06",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig"
@@ -2900,9 +1943,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": null,
    "id": "625c1828",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Input\n",
@@ -2936,9 +1981,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": null,
    "id": "e7fb4c4c",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#!rm -r {fuzzy_dedup_cache_dir}"
@@ -2946,80 +1993,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": null,
    "id": "2368443f",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Reading 1 files\n",
-      "Stage1: Starting Minhash + LSH computation\n",
-      "Stage1: Minhash + LSH complete!\n",
-      "Stage2 (False Postive Check): Starting Map_Buckets\n",
-      "Stage2 (False Postive Check): Map_Buckets Complete!\n",
-      "Stage3 (False Postive Check): Shuffle docs\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "  0%|                                                                                                                                                                                                                                                                                                                                                       | 0/1 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Started processing bucket-map partitions 0 through 1 of 1\n",
-      "Using 1 text partitions.\n",
-      "Starting text bytes aware shuffle\n",
-      "Will write 32059 rows to disk\n",
-      "Text-df partition  1/1 completed in 2.764477491378784\n",
-      "Bucket partition  1/1 completed in 2.783641815185547\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.79s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Stage3 (False Postive Check): Shuffle docs complete!\n",
-      "Stage4 (False Postive Check): Jaccard Similarity in Buckets\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Stage4 (False Postive Check): Jaccard Similarity in Buckets Complete!\n",
-      "Stage5: Connected Components across buckets\n",
-      "batch_id = 0/1, time = 0.2485034465789795\n",
-      "# of groups 5458\n",
-      "# of docs removed 3086\n",
-      "assert num_nodes:8544==labels_df:8544 passed\n",
-      "Stage5: Connected Components across buckets complete!\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Time taken for Connected Component: 20.06704068183899 s\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "with dask.config.set({\"dataframe.backend\": 'cudf'}):\n",
     "        \n",
@@ -3051,79 +2030,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": null,
    "id": "14bfe3bc",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>group</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>TH_wiki-0000134798</td>\n",
-       "      <td>736</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>TH_wiki-0000116226</td>\n",
-       "      <td>1526</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>TH_wiki-0000126796</td>\n",
-       "      <td>2934</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>TH_wiki-0000138218</td>\n",
-       "      <td>156</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>TH_wiki-0000085437</td>\n",
-       "      <td>2722</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                   id  group\n",
-       "0  TH_wiki-0000134798    736\n",
-       "1  TH_wiki-0000116226   1526\n",
-       "2  TH_wiki-0000126796   2934\n",
-       "3  TH_wiki-0000138218    156\n",
-       "4  TH_wiki-0000085437   2722"
-      ]
-     },
-     "execution_count": 61,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "fuzzy_dedup_res = pd.read_parquet(fuzzy_dedup_output_dir)\n",
     "fuzzy_dedup_res.head()"
@@ -3149,9 +2061,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": null,
    "id": "0027c8d2",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Input\n",
@@ -3177,19 +2091,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 82,
+   "execution_count": null,
    "id": "f59e92c3",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Reading 1 files\n",
-      "Reading 1 files\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "#Load .jsonl dataset\n",
     "input_dataset = DocumentDataset.read_json(dataset_dir, backend='cudf')\n",
@@ -3224,7 +2131,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
+   "execution_count": null,
    "id": "c6a1bb0a",
    "metadata": {},
    "outputs": [],
@@ -3253,9 +2160,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 84,
+   "execution_count": null,
    "id": "746d3673",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Loads result from fuzzy dedup wrapper\n",
@@ -3267,9 +2176,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": null,
    "id": "62b34838",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Remove near duplicates\n",
@@ -3284,23 +2195,17 @@
    "id": "edfa52ce",
    "metadata": {},
    "source": [
-    "Verify the result of duplicate removal. We can see that the number of document in resultant document is less than the original dataset (length = 161748)"
+    "Verify the result of duplicate removal. We can see that the number of documents in the resulting dataset is less than in the original dataset."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 86,
+   "execution_count": null,
    "id": "78eee9b3",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Length of duplicate removed dataset:156265\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "res = pd.read_parquet(dudped_output_dir)\n",
     "print(f\"Length of duplicate removed dataset:{len(res)}\")"
@@ -3316,9 +2221,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 88,
+   "execution_count": null,
    "id": "8e807bd7",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "client.cluster.close()\n",
@@ -3349,9 +2256,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 89,
+   "execution_count": null,
    "id": "b988ad1e",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from nemo_curator.utils.config_utils import build_filter_pipeline\n",
@@ -3369,9 +2278,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 90,
+   "execution_count": null,
    "id": "44552288",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "import warnings\n",
@@ -3390,10 +2301,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": null,
    "id": "b8f80ab3",
-   "metadata": {},
-   "outputs": [],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-08-12 06:36:51,616 - distributed.scheduler - WARNING - Removing worker 'tcp://127.0.0.1:32917' caused the cluster to lose already computed task(s), which will be recomputed elsewhere: {('getitem-5b69d236ac9974e9fb86010ffc64382a', 0), ('getitem-1a1421e1fc0bebcfdb81496a35f59d59', 0), ('getitem-a531838794cbb6793b5455275c088d56', 0), ('getitem-5a479f5a8ba45819d7bc110e6f66c5cf', 0), ('getitem-20cb1fb330d399835eab7d541c90d9ad', 0), ('getitem-ea8820d11bd559a47001726946b401f1', 0), ('getitem-dc3a1400f3d825aa608fea3f19009402', 0), ('getitem-fc7ee0a305222d3cbc86116635f8f1b7', 0), ('getitem-5a35ddcf8be5c285f2cc9e07ba4168d6', 0), ('getitem-9f6e0b039afa9a3a892b2eee42fff9ff', 0), ('getitem-aef58cc24b78e9deb456d9854d8056db', 0), ('getitem-cf46f299cd36329b1ec712d5fd751b3a', 0), ('getitem-36157dd00770b4907cf863f121981541', 0), ('getitem-2d4c129c73f6e4bd0add5175ea806475', 0)} (stimulus_id='handle-worker-cleanup-1723444611.6157267')\n"
+     ]
+    }
+   ],
    "source": [
     "cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n",
     "client = Client(cluster)"
@@ -3409,9 +2330,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 92,
+   "execution_count": null,
    "id": "6f2e7523",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "def get_dataframe_complement(original_df, filtered_df):\n",
@@ -3449,9 +2372,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 93,
+   "execution_count": null,
    "id": "a894f90f",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Input\n",
@@ -3488,62 +2413,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 94,
+   "execution_count": null,
    "id": "03b3da27",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Reading 1 files\n",
-      "Saving data for symbol_to_word\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for numbers_ratio\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for urls_ratio\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for white_space\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for parentheses_ratio\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for boilerplate_string_ratio\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for repeated_lines\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for repeated_paragraphs\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for repeated_lines_char\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for repeated_paragraphs_char\n",
-      "Writing to disk complete for 1 partitions\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/lib/python3.10/dist-packages/nemo_curator/utils/distributed_utils.py:379: UserWarning: Empty partition found\n",
-      "  warnings.warn(f\"Empty partition found\")\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Saving data for word_count\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for repeating_top_2grams\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for repeating_top_3grams\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Saving data for repeating_top_4grams\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Writing to disk complete for 1 partitions\n",
-      "Time taken for Heuristic filtering: 1120.5212895870209 s\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "t0 = time.time()\n",
     "\n",
@@ -3592,146 +2467,21 @@
    "id": "a53b04e9",
    "metadata": {},
    "source": [
-    "Verify the result."
+    "**[Optional]** Verify the result."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 95,
+   "execution_count": null,
    "id": "07475373",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dataset size after heuristic filtering:192786\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>filename</th>\n",
-       "      <th>id</th>\n",
-       "      <th>language</th>\n",
-       "      <th>source_id</th>\n",
-       "      <th>text</th>\n",
-       "      <th>title</th>\n",
-       "      <th>url</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>part.0.parquet</td>\n",
-       "      <td>TH_wiki-0000000001</td>\n",
-       "      <td>TH</td>\n",
-       "      <td>thwiki-20240201-thwiki-20240201-pages-articles...</td>\n",
-       "      <td>ดาราศาสตร์ คือวิชาวิทยาศาสตร์ที่ศึกษาวัตถุในท้...</td>\n",
-       "      <td>ดาราศาสตร์</td>\n",
-       "      <td>https://th.wikipedia.org/wiki/%E0%B8%94%E0%B8%...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>part.0.parquet</td>\n",
-       "      <td>TH_wiki-0000000002</td>\n",
-       "      <td>TH</td>\n",
-       "      <td>thwiki-20240201-thwiki-20240201-pages-articles...</td>\n",
-       "      <td>ภูมิศาสตร์ (,  แปลว่า \"การพรรณนาเกี่ยวกับโลก\")...</td>\n",
-       "      <td>ภูมิศาสตร์</td>\n",
-       "      <td>https://th.wikipedia.org/wiki/%E0%B8%A0%E0%B8%...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>part.0.parquet</td>\n",
-       "      <td>TH_wiki-0000000003</td>\n",
-       "      <td>TH</td>\n",
-       "      <td>thwiki-20240201-thwiki-20240201-pages-articles...</td>\n",
-       "      <td>พันทิป.คอม หรือพันทิป ก่อตั้งขึ้นเมื่อวันที่ 7...</td>\n",
-       "      <td>พันทิป.คอม</td>\n",
-       "      <td>https://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>part.0.parquet</td>\n",
-       "      <td>TH_wiki-0000000004</td>\n",
-       "      <td>TH</td>\n",
-       "      <td>thwiki-20240201-thwiki-20240201-pages-articles...</td>\n",
-       "      <td>พันธุ์ทิพย์พลาซ่า () เป็นศูนย์การค้าเกี่ยวกับเ...</td>\n",
-       "      <td>พันธุ์ทิพย์พลาซ่า</td>\n",
-       "      <td>https://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>part.0.parquet</td>\n",
-       "      <td>TH_wiki-0000000005</td>\n",
-       "      <td>TH</td>\n",
-       "      <td>thwiki-20240201-thwiki-20240201-pages-articles...</td>\n",
-       "      <td>วิทยาการคอมพิวเตอร์ศึกษาเกี่ยวกับโครงสร้างพื้น...</td>\n",
-       "      <td>วิทยาการคอมพิวเตอร์</td>\n",
-       "      <td>https://th.wikipedia.org/wiki/%E0%B8%A7%E0%B8%...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         filename                  id language  \\\n",
-       "1  part.0.parquet  TH_wiki-0000000001       TH   \n",
-       "2  part.0.parquet  TH_wiki-0000000002       TH   \n",
-       "3  part.0.parquet  TH_wiki-0000000003       TH   \n",
-       "4  part.0.parquet  TH_wiki-0000000004       TH   \n",
-       "5  part.0.parquet  TH_wiki-0000000005       TH   \n",
-       "\n",
-       "                                           source_id  \\\n",
-       "1  thwiki-20240201-thwiki-20240201-pages-articles...   \n",
-       "2  thwiki-20240201-thwiki-20240201-pages-articles...   \n",
-       "3  thwiki-20240201-thwiki-20240201-pages-articles...   \n",
-       "4  thwiki-20240201-thwiki-20240201-pages-articles...   \n",
-       "5  thwiki-20240201-thwiki-20240201-pages-articles...   \n",
-       "\n",
-       "                                                text                title  \\\n",
-       "1  ดาราศาสตร์ คือวิชาวิทยาศาสตร์ที่ศึกษาวัตถุในท้...           ดาราศาสตร์   \n",
-       "2  ภูมิศาสตร์ (,  แปลว่า \"การพรรณนาเกี่ยวกับโลก\")...           ภูมิศาสตร์   \n",
-       "3  พันทิป.คอม หรือพันทิป ก่อตั้งขึ้นเมื่อวันที่ 7...           พันทิป.คอม   \n",
-       "4  พันธุ์ทิพย์พลาซ่า () เป็นศูนย์การค้าเกี่ยวกับเ...    พันธุ์ทิพย์พลาซ่า   \n",
-       "5  วิทยาการคอมพิวเตอร์ศึกษาเกี่ยวกับโครงสร้างพื้น...  วิทยาการคอมพิวเตอร์   \n",
-       "\n",
-       "                                                 url  \n",
-       "1  https://th.wikipedia.org/wiki/%E0%B8%94%E0%B8%...  \n",
-       "2  https://th.wikipedia.org/wiki/%E0%B8%A0%E0%B8%...  \n",
-       "3  https://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%...  \n",
-       "4  https://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%...  \n",
-       "5  https://th.wikipedia.org/wiki/%E0%B8%A7%E0%B8%...  "
-      ]
-     },
-     "execution_count": 95,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "res = pd.read_parquet(kept_document_dir)\n",
-    "print(f\"Dataset size after heuristic filtering:{len(res)}\")\n",
-    "res.head()"
+    "# res = pd.read_parquet(kept_document_dir)\n",
+    "# print(f\"Dataset size after heuristic filtering:{len(res)}\")\n",
+    "# res.head()"
    ]
   },
   {
@@ -3744,9 +2494,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 96,
+   "execution_count": 64,
    "id": "12508f5e",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "client.cluster.close()\n",

	id	_hashes
0	TH_wiki-0000122055	3e6e96a80410d5a191d098f464e66f86
1	TH_wiki-0000105191	e77a248506ef16737288fae5759db33a
2	TH_wiki-0000105192	2e386f5c3af70f43874618988d4842b2
3	TH_wiki-0000105193	2e386f5c3af70f43874618988d4842b2
4	TH_wiki-0000105194	2e386f5c3af70f43874618988d4842b2
	_hashes	id
0	0b908a91cdf0544c1ef3015cff4ee07e	TH_wiki-0000157216 TH_wiki-0000066307
1	15f35c239b6579b4642f7656e64576ac	TH_wiki-0000074714 TH_wiki-0000074715 TH_wiki-...
2	1708cb56ec582f78716f0864dca9382d	TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-...
3	2e386f5c3af70f43874618988d4842b2	TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-...
4	3e6e96a80410d5a191d098f464e66f86	TH_wiki-0000122055 TH_wiki-0000116550
	id	_minhash_signature
0	TH_wiki-0000000000	[11565725, 19782487, 9831980, 5480992, 2306475...
1	TH_wiki-0000000001	[407876, 107572, 824528, 346831, 216554, 10963...
2	TH_wiki-0000000002	[727721, 694551, 233868, 346831, 216554, 77001...
3	TH_wiki-0000000003	[1149282, 931656, 2515604, 1428622, 4964646, 4...
4	TH_wiki-0000000004	[1559901, 11771639, 487706, 826569, 1203860, 5...
	dataset_id	doc_id	_bucket_id
0	1692361878	123547	210
1	1692361878	93844	120
2	1692361878	66564	86
3	1692361878	93845	120
4	1692361878	66565	86
	dataset_id	doc_id	anchor_1_dataset_id	anchor_1_doc_id	anchor_0_dataset_id	anchor_0_doc_id
0	1692361878	127258	1692361878	127781	1692361878	126955
1	1692361878	85383	1692361878	85364	1692361878	85374
2	1692361878	45030	1692361878	85200	1692361878	45030
3	1692361878	127259	1692361878	127781	1692361878	126955
4	1692361878	127968	1692361878	127961	1692361878	127996
	text	_text_bytes	id	anchor_0_id	anchor_1_id
0	การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...	1457	1692361878-135417	1692361878-135463	1692361878-135417
1	การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...	1457	1692361878-135417	1692361878-135392	1692361878-135447
2	สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...	1262	1692361878-83363	1692361878-94231	1692361878-83363
3	สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...	1262	1692361878-83363	1692361878-94905	1692361878-83363
4	สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...	1262	1692361878-83363	1692361878-94906	1692361878-94905
	id_x	id_y	jaccard
0	1692361878-136568	1692361878-136566	0.754448
1	1692361878-136568	1692361878-136566	0.754448
2	1692361878-136568	1692361878-136566	0.754448
3	1692361878-136568	1692361878-136566	0.754448
4	1692361878-92875	1692361878-87743	0.828794
	dataset_id	doc_id	group
0	1692361878	122282	903
1	1692361878	139772	1952
2	1692361878	93927	112
3	1692361878	121450	2046
4	1692361878	85288	3030
	group	doc_id
0	75	160982, 161038, 161124, 161109, 161121, 160991...
1	112	122007, 122124, 122020, 122282, 122010, 122134...
2	151	134584, 135030, 134908, 134891, 135029, 135020...
3	321	94082, 94114, 94126, 94057, 94121, 94132, 9411...
4	339	116230, 116237, 116223, 116236, 116176, 116204...
...	...	...
5460	8539	120646
5461	8540	158174
5462	8541	132405
5463	8542	49199
5464	8543	160924
	dataset_id	doc_id	group
420	1692361878	122007	112
425	1692361878	122124	112
689	1692361878	122020	112
764	1692361878	122282	112
952	1692361878	122010	112