From da45ba2866288df10d601907637db077ecfe92a6 Mon Sep 17 00:00:00 2001 From: Nicole Luo Date: Thu, 15 Aug 2024 01:21:15 +0000 Subject: [PATCH] Fix bug #183 #185 Signed-off-by: Nicole Luo --- .../single_gpu_tutorial.ipynb | 2142 ++++------------- 1 file changed, 447 insertions(+), 1695 deletions(-) mode change 100755 => 100644 tutorials/single_node_tutorial/single_gpu_tutorial.ipynb diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb old mode 100755 new mode 100644 index 36dc4f84e..c285f0a43 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -78,7 +78,7 @@ " Password: \n", "- Get NeMo NeMo Framework Training Container\n", " ```bash\n", - " docker pull docker pull nvcr.io/nvidia/nemo:dev.framework\n" + " docker pull nvcr.io/nvidia/nemo:dev\n" ] }, { @@ -91,34 +91,40 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "8add9bbd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", - "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (4.0.0)\n", - "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines) (23.2.0)\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "!pip install jsonlines" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, + "id": "97079227-d9c3-40d2-b939-64b221b86990", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%env DASK_DATAFRAME__QUERY_PLANNING False\n", + "%env CUDA_VISIBLE_DEVICES 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "9940c70d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import argparse\n", + "import os\n", "\n", "from nemo_curator.utils.distributed_utils import get_client,get_num_workers\n", "from nemo_curator.utils.script_utils import ArgumentHelper\n", @@ -126,7 +132,6 @@ "from nemo_curator.utils.distributed_utils import read_data,write_to_disk\n", "from nemo_curator.datasets import DocumentDataset\n", "\n", - "import os\n", "import sys\n", "import pandas as pd\n", "import time\n", @@ -135,16 +140,16 @@ "import dask\n", "import numpy as np\n", "from dask.distributed import Client, LocalCluster\n", - "import jsonlines\n", - "\n", - "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" + "import jsonlines\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "fd8a381d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def pre_imports():\n", @@ -166,23 +171,23 @@ " with jsonlines.open(file_path) as reader:\n", " for obj in reader:\n", " if obj.get('id') in target_list:\n", - " yield obj" + " yield obj\n", + "\n", + "def get_base_dataset_file_name(download_folder):\n", + " files = os.listdir(download_folder)\n", + " for file in files:\n", + " if file.startswith('thwiki') and file.endswith(''):\n", + " return file" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "589ff257", - "metadata": {}, 
- "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/work_dir/tutorials/single_node_tutorial\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "cur_dir = os.getcwd()\n", "print(cur_dir)\n", @@ -232,9 +237,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "adb59379", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from nemo_curator.download import download_wikipedia" @@ -250,9 +257,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "e822b5ac", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", @@ -269,9 +278,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "9a03b463", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Output\n", @@ -279,7 +290,6 @@ "download_output_directory = os.path.join(download_base_directory,\"data\")\n", "\n", "#Relevant parameters\n", - "dump_date = \"20240201\"\n", "language = 'th'\n", "url_limit = 1" ] @@ -296,12 +306,13 @@ "cell_type": "code", "execution_count": null, "id": "a45965a7", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "res = download_wikipedia(download_output_directory,\n", " language=language, \n", - " dump_date=dump_date,\n", " url_limit=url_limit).df.compute()" ] }, @@ -310,59 +321,56 @@ "id": "22b7d5b3", "metadata": {}, "source": [ - "Verify result" + "**[Optional]** Verify result" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "45a69041", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloads thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n", - "162164 
/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/wiki_downloads/data/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n" - ] - } - ], + "outputs": [], "source": [ - "! ls {download_output_directory}\n", - "! wc -l {download_output_directory}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl" + "# List all the file in the output directory.\n", + "# ! ls {download_output_directory}\n", + "\n", + "# Please replace your dataset file name accordingly.\n", + "# ! wc -l {download_output_directory}/{YOUR DATASET FILE NAME}.jsonl" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "53bdccfd", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"id\":\"1\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\",\"language\":\"th\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\"}\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "check_jsonl_file(download_output_directory)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "024d7a31-20a3-4d2a-9b89-2f85638fa0da", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!rm -r {download_output_directory}/downloads" + ] + }, { 
"cell_type": "markdown", "id": "c5f58643", "metadata": {}, "source": [ - "**[Optional]**Close the Dask cluster.You might encounter error such as `Caught signal 11`.It's OK, just rerun the cell again." + "**[Optional]** Close the Dask cluster. You might encounter an error such as `Caught signal 11`. It's OK, just rerun the cell again." ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "0669a830", "metadata": {}, "outputs": [], @@ -396,9 +404,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "1e9198e8", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from nemo_curator import ScoreFilter,Modify\n", @@ -416,9 +426,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "da3aed8a", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", @@ -435,13 +447,15 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "13b9d2b1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Input path\n", - "multilingual_data_path = f\"{download_output_directory}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\"\n", + "multilingual_data_path = download_output_directory\n", "\n", "# Output path\n", "language_base_output_path = os.path.join(data_dir,\"language_sep\")\n", @@ -469,28 +483,12 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "2666727d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2024-05-17 03:17:09-- https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin\n", - "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 99.84.238.181, 99.84.238.154, 99.84.238.162, ...\n", - "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|99.84.238.181|:443... 
connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 131266198 (125M) [application/octet-stream]\n", - "Saving to: ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin.1’\n", - "\n", - "lid.176.bin.1 100%[===================>] 125.18M 184MB/s in 0.7s \n", - "\n", - "2024-05-17 03:17:10 (184 MB/s) - ‘/nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/lid.176.bin.1’ saved [131266198/131266198]\n", - "\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P {model_path}" ] @@ -505,32 +503,12 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "d8b8c491", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 1 files\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Time taken for splitting language:140.04064464569092\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "t0 = time.time()\n", "\n", @@ -561,20 +539,12 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "272a5f67", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 1 files\n", - "Writing to disk complete for 1 partitions\n", - "Time taken for fixing unicode:437.4811737537384\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "t0 = time.time()\n", "\n", @@ -596,27 +566,23 @@ "id": "9bd57a53", "metadata": {}, "source": [ - "Verify the result. 
We can see that some documents has been removed from TH wikipedia dataset since the number of lines in this output file is less than the original file (no. of lines = 162164)" + "**[Optional]** Verify the result. We can see that some documents have been removed from the TH Wikipedia dataset, since the number of lines in this output file is less than in the original file." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "e3329c83", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n", - "161748 /nluo_data/NeMo-Curator/tutorials/single_node_tutorial/workspace/language_sep/data/cleaned/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "! ls {lang_sep_cleaned_data_output_path}\n", - "! wc -l {lang_sep_cleaned_data_output_path}/thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl" + "# List all the files in the output directory.\n", + "# ! ls {lang_sep_cleaned_data_output_path}\n", + "\n", + "# Please replace your dataset file name accordingly.\n", + "# ! 
wc -l {lang_sep_cleaned_data_output_path}/{YOUR DATASET FILE NAME}.jsonl" ] }, { @@ -629,19 +595,12 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "050d944c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\",\"id\":\"1\",\"language\":\"TH\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\"}\n", - "\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "check_jsonl_file(os.path.join(language_separated_output_path,'EN'))" ] @@ -656,9 +615,11 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "7e64cc35", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# client.cluster.close()\n", @@ -679,9 +640,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "5f788b91", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from nemo_curator import AddId" @@ -697,9 +660,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "5ba1d54a", - "metadata": {}, + "metadata": { + "tags": [] + }, 
"outputs": [], "source": [ "# cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", @@ -716,9 +681,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "843eba7f", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Input\n", @@ -741,20 +708,12 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "b7a91bf1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 1 files\n", - "Writing to disk complete for 1 partitions\n", - "Time taken for add ID:47.33783745765686\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "t0 = time.time()\n", "# Read input files\n", @@ -780,19 +739,12 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "e585cedd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"filename\":\"thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl\",\"id\":\"TH_wiki-0000000000\",\"language\":\"TH\",\"source_id\":\"thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2\",\"text\":\"–\\n\\nป้ายบอกทาง \\n ศาลาประชาคม – กระดานข่าว โครงการ ทรัพยากรและกิจกรรมซึ่งครอบคลุมวิกิพีเดียอย่างกว้างขวาง\\n แผนกช่วยเหลือ – ถามข้อสงสัยเกี่ยวกับการใช้งานวิกิพีเดีย\\n ปุจฉา-วิสัชนา – ถามข้อสงสัยทั่วไปที่คุณอยากรู้\\n ข่าวไซต์ – ประกาศ อัพเดต บทความและข้อมูลข่าวเกี่ยวกับวิกิพีเดียและมูลนิธิวิกิมีเดีย\\n สภากาแฟ – สำหรับอภิปรายเกี่ยวกับวิกิพีเดีย รวมถึงรายงานปัญหาเทคนิคและเสนอนโยบาย\\n Local Embassy – For Wikipedia-related discussion in languages other than Thai.\\n สร้างบทความใหม่ – บทช่วยสอนสำหรับเตรียมพร้อมสร้างบทความแรกของคุณ\\n\\nภาษาอื่น \\n\\n \",\"title\":\"หน้าหลัก\",\"url\":\"https:\\/\\/th.wikipedia.org\\/wiki\\/%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%81\"}\n", - "\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], 
"source": [ "check_jsonl_file(added_id_output_path)" ] @@ -807,9 +759,11 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "4daa1f2a", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "client.cluster.close()\n", @@ -834,9 +788,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "3f7ba34c", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from nemo_curator.modules import ExactDuplicates" @@ -852,28 +808,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "4b73e5f9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of dask worker:1\n" - ] - }, - { - "data": { - "text/plain": [ - "{'tcp://127.0.0.1:36179': None}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "client = get_client(cluster_type = 'gpu', set_torch_to_use_rmm=False)\n", "print(f\"Number of dask worker:{get_num_workers(client)}\")\n", @@ -893,9 +833,11 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "a590c78a", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#pip install --extra-index-url https://pypi.nvidia.com \".[cuda12x]\"" @@ -911,9 +853,11 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "54b627a4", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Input\n", @@ -931,9 +875,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "6ede2e41", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "!mkdir -p {exact_dedup_log_dir}\n", @@ -950,34 +896,12 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "dfaaa765", - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 1 files\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/exact_dedup.py:158: UserWarning: Output path f/work_dir/tutorials/single_node_tutorial/workspace/exact_dedup/data/_exact_duplicates.parquet already exists and will be overwritten\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of exact duplicated file:53\n", - "Time taken for exact duplicate:1.9788782596588135\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "t0 = time.time()\n", "# Read input dataset\n", @@ -1003,91 +927,17 @@ "id": "e68f0399", "metadata": {}, "source": [ - "Verify the output duplicated ID. We can group by the `_hashes` to get the list of duplicated documents having the same _hashes and use `extract_lines_with_id()` to verify that those documents are indeed exact duplicates. Please note that the `id` might changes, therefore, please replace the `target_list` when necessary" + "**[Optional]** Verify the output duplicated IDs. We can group by `_hashes` to get the list of duplicated documents having the same `_hashes` and use `extract_lines_with_id()` to verify that those documents are indeed exact duplicates. Please note that the `id` values might change; therefore, please replace the `target_list` when necessary." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "28d8bb0b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of exact duplicated document:53\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_hashes
0TH_wiki-00001220553e6e96a80410d5a191d098f464e66f86
1TH_wiki-0000105191e77a248506ef16737288fae5759db33a
2TH_wiki-00001051922e386f5c3af70f43874618988d4842b2
3TH_wiki-00001051932e386f5c3af70f43874618988d4842b2
4TH_wiki-00001051942e386f5c3af70f43874618988d4842b2
\n", - "
" - ], - "text/plain": [ - " id _hashes\n", - "0 TH_wiki-0000122055 3e6e96a80410d5a191d098f464e66f86\n", - "1 TH_wiki-0000105191 e77a248506ef16737288fae5759db33a\n", - "2 TH_wiki-0000105192 2e386f5c3af70f43874618988d4842b2\n", - "3 TH_wiki-0000105193 2e386f5c3af70f43874618988d4842b2\n", - "4 TH_wiki-0000105194 2e386f5c3af70f43874618988d4842b2" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "exact_dedup_res = pd.read_parquet(os.path.join(exact_dedup_output_dir,\"_exact_duplicates.parquet\"))\n", "print(f\"Number of exact duplicated document:{len(exact_dedup_res)}\")\n", @@ -1096,109 +946,36 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "fca41870", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
_hashesid
00b908a91cdf0544c1ef3015cff4ee07eTH_wiki-0000157216 TH_wiki-0000066307
115f35c239b6579b4642f7656e64576acTH_wiki-0000074714 TH_wiki-0000074715 TH_wiki-...
21708cb56ec582f78716f0864dca9382dTH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-...
32e386f5c3af70f43874618988d4842b2TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-...
43e6e96a80410d5a191d098f464e66f86TH_wiki-0000122055 TH_wiki-0000116550
\n", - "
" - ], - "text/plain": [ - " _hashes \\\n", - "0 0b908a91cdf0544c1ef3015cff4ee07e \n", - "1 15f35c239b6579b4642f7656e64576ac \n", - "2 1708cb56ec582f78716f0864dca9382d \n", - "3 2e386f5c3af70f43874618988d4842b2 \n", - "4 3e6e96a80410d5a191d098f464e66f86 \n", - "\n", - " id \n", - "0 TH_wiki-0000157216 TH_wiki-0000066307 \n", - "1 TH_wiki-0000074714 TH_wiki-0000074715 TH_wiki-... \n", - "2 TH_wiki-0000021211 TH_wiki-0000021213 TH_wiki-... \n", - "3 TH_wiki-0000105192 TH_wiki-0000105193 TH_wiki-... \n", - "4 TH_wiki-0000122055 TH_wiki-0000116550 " - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "exact_dedup_res.groupby('_hashes')['id'].agg(lambda x: ' '.join(x)).reset_index().head()" ] }, + { + "cell_type": "markdown", + "id": "597d04a4-0b82-43f9-9f61-61729e768ab1", + "metadata": {}, + "source": [ + "Using the duplicated id shown above, check the content to see if it's exact duplicates" + ] + }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "8c9624ac", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'filename': 'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl', 'id': 'TH_wiki-0000066307', 'language': 'TH', 'source_id': 'thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2', 'text': '\\n\\nแหล่งข้อมูลอื่น \\n\\nสงขลา\\n \\nรายชื่อเกี่ยวกับจังหวัดสงขลา', 'title': 'รายชื่อโบราณสถานในจังหวัดสงขลา', 'url': 'https://th.wikipedia.org/wiki/%E0%B8%A3%E0%B8%B2%E0%B8%A2%E0%B8%8A%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B9%82%E0%B8%9A%E0%B8%A3%E0%B8%B2%E0%B8%93%E0%B8%AA%E0%B8%96%E0%B8%B2%E0%B8%99%E0%B9%83%E0%B8%99%E0%B8%88%E0%B8%B1%E0%B8%87%E0%B8%AB%E0%B8%A7%E0%B8%B1%E0%B8%94%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2'}\n", - "{'filename': 'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl', 'id': 'TH_wiki-0000157216', 'language': 'TH', 'source_id': 
'thwiki-20240201-thwiki-20240201-pages-articles-multistream.xml.bz2', 'text': '\\n\\nแหล่งข้อมูลอื่น \\n\\nสงขลา\\n \\nรายชื่อเกี่ยวกับจังหวัดสงขลา', 'title': 'รายชื่อโบราณสถานในจังหวัดสงขลา (อำเภอเมืองสงขลาและสิงหนคร)', 'url': 'https://th.wikipedia.org/wiki/%E0%B8%A3%E0%B8%B2%E0%B8%A2%E0%B8%8A%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B9%82%E0%B8%9A%E0%B8%A3%E0%B8%B2%E0%B8%93%E0%B8%AA%E0%B8%96%E0%B8%B2%E0%B8%99%E0%B9%83%E0%B8%99%E0%B8%88%E0%B8%B1%E0%B8%87%E0%B8%AB%E0%B8%A7%E0%B8%B1%E0%B8%94%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2%20%28%E0%B8%AD%E0%B8%B3%E0%B9%80%E0%B8%A0%E0%B8%AD%E0%B9%80%E0%B8%A1%E0%B8%B7%E0%B8%AD%E0%B8%87%E0%B8%AA%E0%B8%87%E0%B8%82%E0%B8%A5%E0%B8%B2%E0%B9%81%E0%B8%A5%E0%B8%B0%E0%B8%AA%E0%B8%B4%E0%B8%87%E0%B8%AB%E0%B8%99%E0%B8%84%E0%B8%A3%29'}\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "target_list = ['TH_wiki-0000157216', 'TH_wiki-0000066307']\n", - "for line in extract_lines_with_id(os.path.join(exact_dedup_input_dataset_dir,'thwiki-20240201-pages-articles-multistream.xml.bz2.jsonl'),target_list):\n", - " print(line)" + "# target_list = ['TH_wiki-0000157216', 'TH_wiki-0000066307']\n", + "# for line in extract_lines_with_id(os.path.join(exact_dedup_input_dataset_dir,'{YOUR DATASET FILE NAME}'),target_list):\n", + "# print(line)" ] }, { @@ -1211,9 +988,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "5ef2f05e", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# client.cluster.close()\n", @@ -1298,9 +1077,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "1fc5bff3", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from nemo_curator import MinHash" @@ -1316,9 +1097,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "d600d1b8", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Input\n", @@ -1353,34 +1136,12 
@@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "88540950", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Computing minhashes for /work_dir/tutorials/single_node_tutorial/workspace/add_id/cleaned\n", - "Reading 1 files\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:175: UserWarning: Output path /work_dir/tutorials/single_node_tutorial/workspace/fuzzy/minhash/data/_minhashes.parquet already exists and will be overwritten\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Time taken for MinHash:6.340771198272705\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "t0 = time.time()\n", "print(f\"Computing minhashes for {minhash_data_path}\")\n", @@ -1417,87 +1178,20 @@ "id": "158bf3ab", "metadata": {}, "source": [ - "Verify result" + "**[Optional]** Verify result" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "10b5eb55", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_minhash_signature
0TH_wiki-0000000000[11565725, 19782487, 9831980, 5480992, 2306475...
1TH_wiki-0000000001[407876, 107572, 824528, 346831, 216554, 10963...
2TH_wiki-0000000002[727721, 694551, 233868, 346831, 216554, 77001...
3TH_wiki-0000000003[1149282, 931656, 2515604, 1428622, 4964646, 4...
4TH_wiki-0000000004[1559901, 11771639, 487706, 826569, 1203860, 5...
\n", - "
" - ], - "text/plain": [ - " id _minhash_signature\n", - "0 TH_wiki-0000000000 [11565725, 19782487, 9831980, 5480992, 2306475...\n", - "1 TH_wiki-0000000001 [407876, 107572, 824528, 346831, 216554, 10963...\n", - "2 TH_wiki-0000000002 [727721, 694551, 233868, 346831, 216554, 77001...\n", - "3 TH_wiki-0000000003 [1149282, 931656, 2515604, 1428622, 4964646, 4...\n", - "4 TH_wiki-0000000004 [1559901, 11771639, 487706, 826569, 1203860, 5..." - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "minhash_res = pd.read_parquet(os.path.join(minshah_output_dir, \"_minhashes.parquet\"))\n", - "minhash_res.head()" + "# minhash_res = pd.read_parquet(os.path.join(minshah_output_dir, \"_minhashes.parquet\"))\n", + "# minhash_res.head()" ] }, { @@ -1523,9 +1217,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "645b8a53", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from nemo_curator import LSH\n", @@ -1543,9 +1239,11 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "738ab265", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Input\n", @@ -1577,26 +1275,12 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "1ef61e2b", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/nemo_curator/modules/fuzzy_dedup.py:361: UserWarning: Output path /work_dir/tutorials/single_node_tutorial/workspace/fuzzy/lsh/data/_buckets.parquet already exists and will be overwritten\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Time taken for LSH:19.37230634689331\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "t0 = time.time()\n", "\n", @@ -1631,93 
+1315,20 @@ "id": "ad2e3b60", "metadata": {}, "source": [ - "Verify result" + "**[Optional]** Verify result" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "9d0449c6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
dataset_iddoc_id_bucket_id
01692361878123547210
1169236187893844120
216923618786656486
3169236187893845120
416923618786656586
\n", - "
" - ], - "text/plain": [ - " dataset_id doc_id _bucket_id\n", - "0 1692361878 123547 210\n", - "1 1692361878 93844 120\n", - "2 1692361878 66564 86\n", - "3 1692361878 93845 120\n", - "4 1692361878 66565 86" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "lsh_res = pd.read_parquet(os.path.join(lsh_output_dir, \"_buckets.parquet\"))\n", - "lsh_res.head()" + "# lsh_res = pd.read_parquet(os.path.join(lsh_output_dir, \"_buckets.parquet\"))\n", + "# lsh_res.head()" ] }, { @@ -1743,9 +1354,11 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "707ea54d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from nemo_curator.utils.fuzzy_dedup_utils.io_utils import (\n", @@ -1765,9 +1378,11 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "70e2dff9", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Input\n", @@ -1808,20 +1423,12 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "b2850b0a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of files being read for jaccard calculation = 1\n", - "Number of ddf_bk partitions = 1\n", - "Time taken for Bucket Mapping:1.239295244216919 s\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "t0 = time.time()\n", "num_workers = get_num_workers(client)\n", @@ -1852,124 +1459,20 @@ "id": "a1533a15", "metadata": {}, "source": [ - "Verify result" + "**[Optional]** Verify result" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "d74012c3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
dataset_iddoc_idanchor_1_dataset_idanchor_1_doc_idanchor_0_dataset_idanchor_0_doc_id_output_partition_id
01692361878127258169236187812778116923618781269550
11692361878853831692361878853641692361878853740
21692361878450301692361878852001692361878450300
31692361878127259169236187812778116923618781269550
41692361878127968169236187812796116923618781279960
\n", - "
" - ], - "text/plain": [ - " dataset_id doc_id anchor_1_dataset_id anchor_1_doc_id \\\n", - "0 1692361878 127258 1692361878 127781 \n", - "1 1692361878 85383 1692361878 85364 \n", - "2 1692361878 45030 1692361878 85200 \n", - "3 1692361878 127259 1692361878 127781 \n", - "4 1692361878 127968 1692361878 127961 \n", - "\n", - " anchor_0_dataset_id anchor_0_doc_id _output_partition_id \n", - "0 1692361878 126955 0 \n", - "1 1692361878 85374 0 \n", - "2 1692361878 45030 0 \n", - "3 1692361878 126955 0 \n", - "4 1692361878 127996 0 " - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "map_bucket_res = pd.read_parquet(output_anchor_docs_with_bk_path)\n", - "map_bucket_res.head()" + "# map_bucket_res = pd.read_parquet(output_anchor_docs_with_bk_path)\n", + "# map_bucket_res.head()" ] }, { @@ -1982,7 +1485,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "b414f703", "metadata": {}, "outputs": [], @@ -2000,52 +1503,12 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "86d1b3e5", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/1 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
text_text_bytesidanchor_0_idanchor_1_id
0การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...14571692361878-1354171692361878-1354631692361878-135417
1การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด...14571692361878-1354171692361878-1353921692361878-135447
2สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...12621692361878-833631692361878-942311692361878-83363
3สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...12621692361878-833631692361878-949051692361878-83363
4สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค...12621692361878-833631692361878-949061692361878-94905
\n", - "" - ], - "text/plain": [ - " text _text_bytes \\\n", - "0 การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด... 1457 \n", - "1 การแข่งขันกีฬากรีฑาในโอลิมปิกฤดูร้อน 2020 – เด... 1457 \n", - "2 สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค... 1262 \n", - "3 สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค... 1262 \n", - "4 สุริยุปราคาบางส่วนจะเกิดขึ้นในวันที่ 13 กรกฎาค... 1262 \n", - "\n", - " id anchor_0_id anchor_1_id \n", - "0 1692361878-135417 1692361878-135463 1692361878-135417 \n", - "1 1692361878-135417 1692361878-135392 1692361878-135447 \n", - "2 1692361878-83363 1692361878-94231 1692361878-83363 \n", - "3 1692361878-83363 1692361878-94905 1692361878-83363 \n", - "4 1692361878-83363 1692361878-94906 1692361878-94905 " - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], + "cell_type": "code", + "execution_count": null, + "id": "1b51a5fb", + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "jaccard_shuffle_res = pd.read_parquet(os.path.join(output_shuffled_docs_path,\"_output_partition_id=0/batch_1_1.parquet\"))\n", - "jaccard_shuffle_res.head()" + "# jaccard_shuffle_res = pd.read_parquet(os.path.join(output_shuffled_docs_path,\"_output_partition_id=0/batch_1_1.parquet\"))\n", + "# jaccard_shuffle_res.head()" ] }, { @@ -2199,9 +1570,11 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "b1a532a2", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity" @@ -2217,9 +1590,11 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "291d3aaa", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Input\n", @@ -2248,19 +1623,12 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "9b1b9bdd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"Running jaccard compute script\n", - "Time taken for Jaccard Computing: 0.735356330871582\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "# enable_spilling()\n", "# client.run(enable_spilling)\n", @@ -2288,93 +1656,20 @@ "id": "bb740d30", "metadata": {}, "source": [ - "Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected as a pair of similar documents is likely to share numerous same buckets." + "**[Optional]** Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected, as a pair of similar documents is likely to share many of the same buckets." ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "a41d1f09", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_xid_yjaccard
01692361878-1365681692361878-1365660.754448
11692361878-1365681692361878-1365660.754448
21692361878-1365681692361878-1365660.754448
31692361878-1365681692361878-1365660.754448
41692361878-928751692361878-877430.828794
\n", - "
" - ], - "text/plain": [ - " id_x id_y jaccard\n", - "0 1692361878-136568 1692361878-136566 0.754448\n", - "1 1692361878-136568 1692361878-136566 0.754448\n", - "2 1692361878-136568 1692361878-136566 0.754448\n", - "3 1692361878-136568 1692361878-136566 0.754448\n", - "4 1692361878-92875 1692361878-87743 0.828794" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "jaccard_compute_res = pd.read_parquet(jaccard_compute_output_results_path)\n", - "jaccard_compute_res.head()" + "# jaccard_compute_res = pd.read_parquet(jaccard_compute_output_results_path)\n", + "# jaccard_compute_res.head()" ] }, { @@ -2394,9 +1689,11 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "3bff521b", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from nemo_curator.modules.fuzzy_dedup import ConnectedComponents" @@ -2412,9 +1709,11 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "b40735dd", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Input\n", @@ -2442,22 +1741,12 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "fe62dd51", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "batch_id = 0/1, time = 0.29015278816223145\n", - "# of groups 5465\n", - "# of docs removed 3079\n", - "assert num_nodes:8544==labels_df:8544 passed\n", - "Time taken for Connected Component: 4.489336729049683 s\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "t0 = time.time()\n", " \n", @@ -2479,93 +1768,20 @@ "id": "669495ee", "metadata": {}, "source": [ - "Verify the result of `Connected Components`" + "**[Optional]** Run the following cells to verify the result of `Connected Components`" ] }, { "cell_type": "code", - "execution_count": 40, + 
"execution_count": null, "id": "efbd6973", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
dataset_iddoc_idgroup
01692361878122282903
116923618781397721952
2169236187893927112
316923618781214502046
41692361878852883030
\n", - "
" - ], - "text/plain": [ - " dataset_id doc_id group\n", - "0 1692361878 122282 903\n", - "1 1692361878 139772 1952\n", - "2 1692361878 93927 112\n", - "3 1692361878 121450 2046\n", - "4 1692361878 85288 3030" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "cc_compute_res = pd.read_parquet(connected_component_output_path)\n", - "cc_compute_res.head()" + "# cc_compute_res = pd.read_parquet(connected_component_output_path)\n", + "# cc_compute_res.head()" ] }, { @@ -2578,121 +1794,15 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": null, "id": "d8fa1e8e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
groupdoc_id
075160982, 161038, 161124, 161109, 161121, 160991...
1112122007, 122124, 122020, 122282, 122010, 122134...
2151134584, 135030, 134908, 134891, 135029, 135020...
332194082, 94114, 94126, 94057, 94121, 94132, 9411...
4339116230, 116237, 116223, 116236, 116176, 116204...
.........
54608539120646
54618540158174
54628541132405
5463854249199
54648543160924
\n", - "

5465 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " group doc_id\n", - "0 75 160982, 161038, 161124, 161109, 161121, 160991...\n", - "1 112 122007, 122124, 122020, 122282, 122010, 122134...\n", - "2 151 134584, 135030, 134908, 134891, 135029, 135020...\n", - "3 321 94082, 94114, 94126, 94057, 94121, 94132, 9411...\n", - "4 339 116230, 116237, 116223, 116236, 116176, 116204...\n", - "... ... ...\n", - "5460 8539 120646\n", - "5461 8540 158174\n", - "5462 8541 132405\n", - "5463 8542 49199\n", - "5464 8543 160924\n", - "\n", - "[5465 rows x 2 columns]" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "cc_compute_res['doc_id'] = cc_compute_res['doc_id'].astype(str)\n", - "cc_compute_res.groupby('group')['doc_id'].agg(lambda x: ', '.join(x)).reset_index()" + "# cc_compute_res['doc_id'] = cc_compute_res['doc_id'].astype(str)\n", + "# cc_compute_res.groupby('group')['doc_id'].agg(lambda x: ', '.join(x)).reset_index()" ] }, { @@ -2705,87 +1815,15 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "id": "fd01f5fe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
dataset_iddoc_idgroup
4201692361878122007112
4251692361878122124112
6891692361878122020112
7641692361878122282112
9521692361878122010112
\n", - "
" - ], - "text/plain": [ - " dataset_id doc_id group\n", - "420 1692361878 122007 112\n", - "425 1692361878 122124 112\n", - "689 1692361878 122020 112\n", - "764 1692361878 122282 112\n", - "952 1692361878 122010 112" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "cc_compute_res[cc_compute_res['group']==112].head()" + "#Replace ??? with the group number you want to check\n", + "# cc_compute_res[cc_compute_res['group']==???].head()" ] }, { @@ -2816,7 +1854,10 @@ } ], "source": [ - "jaccard_shuffle_res[jaccard_shuffle_res['id'].isin(['1692361878-121545','1692361878-121487'])]['text'].unique()" + "# Replace 'ID1' and 'ID2' with IDs you want to check\n", + "# The output is an example of fuzzy duplicates \n", + "\n", + "# jaccard_shuffle_res[jaccard_shuffle_res['id'].isin(['ID1','ID2'])]['text'].unique()" ] }, { @@ -2890,9 +1931,11 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": null, "id": "eb52ec06", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig" @@ -2900,9 +1943,11 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": null, "id": "625c1828", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Input\n", @@ -2936,9 +1981,11 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": null, "id": "e7fb4c4c", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#!rm -r {fuzzy_dedup_cache_dir}" @@ -2946,80 +1993,12 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "id": "2368443f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 1 files\n", - "Stage1: Starting Minhash + LSH computation\n", - "Stage1: Minhash + LSH complete!\n", - "Stage2 (False Postive 
Check): Starting Map_Buckets\n", - "Stage2 (False Postive Check): Map_Buckets Complete!\n", - "Stage3 (False Postive Check): Shuffle docs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/1 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idgroup
0TH_wiki-0000134798736
1TH_wiki-00001162261526
2TH_wiki-00001267962934
3TH_wiki-0000138218156
4TH_wiki-00000854372722
\n", - "" - ], - "text/plain": [ - " id group\n", - "0 TH_wiki-0000134798 736\n", - "1 TH_wiki-0000116226 1526\n", - "2 TH_wiki-0000126796 2934\n", - "3 TH_wiki-0000138218 156\n", - "4 TH_wiki-0000085437 2722" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "fuzzy_dedup_res = pd.read_parquet(fuzzy_dedup_output_dir)\n", "fuzzy_dedup_res.head()" @@ -3149,9 +2061,11 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": null, "id": "0027c8d2", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Input\n", @@ -3177,19 +2091,12 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": null, "id": "f59e92c3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 1 files\n", - "Reading 1 files\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "#Load .jsonl dataset\n", "input_dataset = DocumentDataset.read_json(dataset_dir, backend='cudf')\n", @@ -3224,7 +2131,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": null, "id": "c6a1bb0a", "metadata": {}, "outputs": [], @@ -3253,9 +2160,11 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": null, "id": "746d3673", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Loads result from fuzzy dedup wrapper\n", @@ -3267,9 +2176,11 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": null, "id": "62b34838", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Remove near duplicates\n", @@ -3284,23 +2195,17 @@ "id": "edfa52ce", "metadata": {}, "source": [ - "Verify the result of duplicate removal. We can see that the number of document in resultant document is less than the original dataset (length = 161748)" + "Verify the result of duplicate removal. 
We can see that the number of documents in the resulting dataset is less than in the original dataset " ] }, { "cell_type": "code", - "execution_count": 86, + "execution_count": null, "id": "78eee9b3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length of duplicate removed dataset:156265\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "res = pd.read_parquet(dudped_output_dir)\n", "print(f\"Length of duplicate removed dataset:{len(res)}\")" @@ -3316,9 +2221,11 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": null, "id": "8e807bd7", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "client.cluster.close()\n", @@ -3349,9 +2256,11 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": null, "id": "b988ad1e", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from nemo_curator.utils.config_utils import build_filter_pipeline\n", @@ -3369,9 +2278,11 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": null, "id": "44552288", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import warnings\n", @@ -3390,10 +2301,20 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": null, "id": "b8f80ab3", - "metadata": {}, - "outputs": [], + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-08-12 06:36:51,616 - distributed.scheduler - WARNING - Removing worker 'tcp://127.0.0.1:32917' caused the cluster to lose already computed task(s), which will be recomputed elsewhere: {('getitem-5b69d236ac9974e9fb86010ffc64382a', 0), ('getitem-1a1421e1fc0bebcfdb81496a35f59d59', 0), ('getitem-a531838794cbb6793b5455275c088d56', 0), ('getitem-5a479f5a8ba45819d7bc110e6f66c5cf', 0), ('getitem-20cb1fb330d399835eab7d541c90d9ad', 0), ('getitem-ea8820d11bd559a47001726946b401f1', 0), 
('getitem-dc3a1400f3d825aa608fea3f19009402', 0), ('getitem-fc7ee0a305222d3cbc86116635f8f1b7', 0), ('getitem-5a35ddcf8be5c285f2cc9e07ba4168d6', 0), ('getitem-9f6e0b039afa9a3a892b2eee42fff9ff', 0), ('getitem-aef58cc24b78e9deb456d9854d8056db', 0), ('getitem-cf46f299cd36329b1ec712d5fd751b3a', 0), ('getitem-36157dd00770b4907cf863f121981541', 0), ('getitem-2d4c129c73f6e4bd0add5175ea806475', 0)} (stimulus_id='handle-worker-cleanup-1723444611.6157267')\n" + ] + } + ], "source": [ "cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", "client = Client(cluster)" @@ -3409,9 +2330,11 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": null, "id": "6f2e7523", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def get_dataframe_complement(original_df, filtered_df):\n", @@ -3449,9 +2372,11 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": null, "id": "a894f90f", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#Input\n", @@ -3488,62 +2413,12 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": null, "id": "03b3da27", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 1 files\n", - "Saving data for symbol_to_word\n", - "Writing to disk complete for 1 partitions\n", - "Saving data for numbers_ratio\n", - "Writing to disk complete for 1 partitions\n", - "Saving data for urls_ratio\n", - "Writing to disk complete for 1 partitions\n", - "Saving data for white_space\n", - "Writing to disk complete for 1 partitions\n", - "Saving data for parentheses_ratio\n", - "Writing to disk complete for 1 partitions\n", - "Saving data for boilerplate_string_ratio\n", - "Writing to disk complete for 1 partitions\n", - "Saving data for repeated_lines\n", - "Writing to disk complete for 1 partitions\n", - "Saving data for repeated_paragraphs\n", - "Writing to disk complete for 1 partitions\n", - 
"Saving data for repeated_lines_char\n", - "Writing to disk complete for 1 partitions\n", - "Saving data for repeated_paragraphs_char\n", - "Writing to disk complete for 1 partitions\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/nemo_curator/utils/distributed_utils.py:379: UserWarning: Empty partition found\n", - " warnings.warn(f\"Empty partition found\")\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving data for word_count\n", - "Writing to disk complete for 1 partitions\n", - "Saving data for repeating_top_2grams\n", - "Writing to disk complete for 1 partitions\n", - "Saving data for repeating_top_3grams\n", - "Writing to disk complete for 1 partitions\n", - "Saving data for repeating_top_4grams\n", - "Writing to disk complete for 1 partitions\n", - "Writing to disk complete for 1 partitions\n", - "Time taken for Heuristic filtering: 1120.5212895870209 s\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "t0 = time.time()\n", "\n", @@ -3592,146 +2467,21 @@ "id": "a53b04e9", "metadata": {}, "source": [ - "Verify the result." + "**[Optional]** Verify the result." ] }, { "cell_type": "code", - "execution_count": 95, + "execution_count": null, "id": "07475373", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset size after heuristic filtering:192786\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenameidlanguagesource_idtexttitleurl
1part.0.parquetTH_wiki-0000000001THthwiki-20240201-thwiki-20240201-pages-articles...ดาราศาสตร์ คือวิชาวิทยาศาสตร์ที่ศึกษาวัตถุในท้...ดาราศาสตร์https://th.wikipedia.org/wiki/%E0%B8%94%E0%B8%...
2part.0.parquetTH_wiki-0000000002THthwiki-20240201-thwiki-20240201-pages-articles...ภูมิศาสตร์ (, แปลว่า \"การพรรณนาเกี่ยวกับโลก\")...ภูมิศาสตร์https://th.wikipedia.org/wiki/%E0%B8%A0%E0%B8%...
3part.0.parquetTH_wiki-0000000003THthwiki-20240201-thwiki-20240201-pages-articles...พันทิป.คอม หรือพันทิป ก่อตั้งขึ้นเมื่อวันที่ 7...พันทิป.คอมhttps://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%...
4part.0.parquetTH_wiki-0000000004THthwiki-20240201-thwiki-20240201-pages-articles...พันธุ์ทิพย์พลาซ่า () เป็นศูนย์การค้าเกี่ยวกับเ...พันธุ์ทิพย์พลาซ่าhttps://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%...
5part.0.parquetTH_wiki-0000000005THthwiki-20240201-thwiki-20240201-pages-articles...วิทยาการคอมพิวเตอร์ศึกษาเกี่ยวกับโครงสร้างพื้น...วิทยาการคอมพิวเตอร์https://th.wikipedia.org/wiki/%E0%B8%A7%E0%B8%...
\n", - "
" - ], - "text/plain": [ - " filename id language \\\n", - "1 part.0.parquet TH_wiki-0000000001 TH \n", - "2 part.0.parquet TH_wiki-0000000002 TH \n", - "3 part.0.parquet TH_wiki-0000000003 TH \n", - "4 part.0.parquet TH_wiki-0000000004 TH \n", - "5 part.0.parquet TH_wiki-0000000005 TH \n", - "\n", - " source_id \\\n", - "1 thwiki-20240201-thwiki-20240201-pages-articles... \n", - "2 thwiki-20240201-thwiki-20240201-pages-articles... \n", - "3 thwiki-20240201-thwiki-20240201-pages-articles... \n", - "4 thwiki-20240201-thwiki-20240201-pages-articles... \n", - "5 thwiki-20240201-thwiki-20240201-pages-articles... \n", - "\n", - " text title \\\n", - "1 ดาราศาสตร์ คือวิชาวิทยาศาสตร์ที่ศึกษาวัตถุในท้... ดาราศาสตร์ \n", - "2 ภูมิศาสตร์ (, แปลว่า \"การพรรณนาเกี่ยวกับโลก\")... ภูมิศาสตร์ \n", - "3 พันทิป.คอม หรือพันทิป ก่อตั้งขึ้นเมื่อวันที่ 7... พันทิป.คอม \n", - "4 พันธุ์ทิพย์พลาซ่า () เป็นศูนย์การค้าเกี่ยวกับเ... พันธุ์ทิพย์พลาซ่า \n", - "5 วิทยาการคอมพิวเตอร์ศึกษาเกี่ยวกับโครงสร้างพื้น... วิทยาการคอมพิวเตอร์ \n", - "\n", - " url \n", - "1 https://th.wikipedia.org/wiki/%E0%B8%94%E0%B8%... \n", - "2 https://th.wikipedia.org/wiki/%E0%B8%A0%E0%B8%... \n", - "3 https://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%... \n", - "4 https://th.wikipedia.org/wiki/%E0%B8%9E%E0%B8%... \n", - "5 https://th.wikipedia.org/wiki/%E0%B8%A7%E0%B8%... " - ] - }, - "execution_count": 95, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "res = pd.read_parquet(kept_document_dir)\n", - "print(f\"Dataset size after heuristic filtering:{len(res)}\")\n", - "res.head()" + "# res = pd.read_parquet(kept_document_dir)\n", + "# print(f\"Dataset size after heuristic filtering:{len(res)}\")\n", + "# res.head()" ] }, { @@ -3744,9 +2494,11 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 64, "id": "12508f5e", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "client.cluster.close()\n",