diff --git a/examples/distributed_data_classification_examples/test_domain.ipynb b/examples/distributed_data_classification_examples/test_domain.ipynb index 45bedbd85..be36c450b 100644 --- a/examples/distributed_data_classification_examples/test_domain.ipynb +++ b/examples/distributed_data_classification_examples/test_domain.ipynb @@ -15,13 +15,14 @@ ], "source": [ "%env PYTHONWARNINGS=ignore\n", + "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -79,20 +80,26 @@ "\n", "model_file_name = \"/home/nfs/syurick/LLM_domain_classifier_inference/GoogleDebertaAgree_v3b_bce_maxlen512_bs64_noRef_best.pth\"\n", "# Input can be a string or list\n", - "input_file_path = \"/home/nfs/syurick/LLM_domain_classifier_inference/4360_results_jsonl_dir/\"\n", - "output_file_path = \"/raid/vjawa/output_file.json\"\n" + "input_file_path = \"/raid/vjawa/prospector-lm/subset_CC-MAIN-2023-14_english\"\n", + "output_file_path = \"/raid/vjawa/output_subset_CC-MAIN-2023-14_english\"\n" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Reading 16 files\n", + "Reading 50 files\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Starting domain classifier inference\n" ] }, @@ -100,31 +107,159 @@ "name": "stderr", "output_type": "stream", "text": [ - "GPU: 0, Part: 14: 100%|██████████| 937/937 [00:14<00:00, 64.41it/s] \n", - "GPU: 0, Part: 13: 100%|██████████| 937/937 [00:16<00:00, 57.34it/s]\n", - "GPU: 0, Part: 12: 100%|██████████| 937/937 [00:14<00:00, 63.48it/s] \n", - "GPU: 0, Part: 5: 100%|██████████| 938/938 [00:14<00:00, 62.78it/s] \n", - "GPU: 0, Part: 9: 100%|██████████| 937/937 [00:15<00:00, 60.63it/s]\n", - "GPU: 0, Part: 10: 100%|██████████| 937/937 [00:15<00:00, 61.35it/s]\n", - "GPU: 0, Part: 6: 100%|██████████| 938/938 [00:15<00:00, 59.92it/s]\n", - "GPU: 0, Part: 4: 100%|██████████| 938/938 [00:15<00:00, 61.30it/s]\n", - "GPU: 0, Part: 15: 100%|██████████| 937/937 [00:15<00:00, 61.20it/s]\n", - "GPU: 0, Part: 0: 100%|██████████| 938/938 [00:15<00:00, 61.44it/s]\n", - "GPU: 0, Part: 2: 100%|██████████| 938/938 [00:15<00:00, 61.10it/s]\n", - "GPU: 0, Part: 1: 100%|██████████| 938/938 [00:15<00:00, 60.69it/s]\n", - "GPU: 0, Part: 8: 100%|██████████| 937/937 [00:15<00:00, 60.11it/s]\n", - "GPU: 0, Part: 7: 100%|██████████| 937/937 [00:15<00:00, 60.54it/s]\n", - "GPU: 0, Part: 3: 100%|██████████| 938/938 [00:15<00:00, 60.71it/s]\n", - "GPU: 0, Part: 11: 100%|██████████| 937/937 [00:15<00:00, 60.47it/s]\n" + "GPU: 0, Part: 9: 100%|██████████| 9995/9995 [00:42<00:00, 233.16it/s]/s]\n", + "GPU: 0, Part: 44: 100%|██████████| 9793/9793 [00:41<00:00, 235.01it/s]\n", + "GPU: 0, Part: 7: 100%|██████████| 9956/9956 [00:42<00:00, 233.63it/s]]\n", + "GPU: 0, Part: 8: 100%|██████████| 10093/10093 [00:44<00:00, 226.58it/s]\n", + "GPU: 0, Part: 47: 100%|██████████| 10100/10100 [00:43<00:00, 234.62it/s]\n", + "GPU: 0, Part: 6: 100%|██████████| 10088/10088 [00:42<00:00, 236.62it/s]\n", + "GPU: 0, Part: 48: 100%|██████████| 10021/10021 [00:42<00:00, 235.82it/s]\n", + "GPU: 0, Part: 49: 100%|██████████| 10200/10200 [00:42<00:00, 238.71it/s]\n", + "GPU: 0, Part: 4: 100%|██████████| 9747/9747 [00:42<00:00, 227.39it/s]]\n", + "GPU: 0, Part: 40: 100%|██████████| 9999/9999 [00:42<00:00, 236.56it/s]\n", + "GPU: 0, Part: 46: 100%|██████████| 9994/9994 [00:43<00:00, 230.24it/s]\n", + "GPU: 0, Part: 41: 100%|██████████| 9938/9938 [00:43<00:00, 228.66it/s]\n", + "GPU: 0, Part: 45: 100%|██████████| 9832/9832 [00:43<00:00, 225.21it/s]\n", + "GPU: 0, Part: 42: 100%|██████████| 9985/9985 [00:42<00:00, 232.65it/s]\n", + "GPU: 0, Part: 5: 100%|██████████| 9873/9873 [00:43<00:00, 225.99it/s]\n", + "GPU: 0, Part: 43: 100%|██████████| 9933/9933 [00:44<00:00, 221.88it/s]\n", + "GPU: 0, Part: 39: 100%|██████████| 10075/10075 [00:31<00:00, 316.80it/s]\n", + "GPU: 0, Part: 3: 100%|██████████| 9714/9714 [00:33<00:00, 293.11it/s]s]\n", + "GPU: 0, Part: 26: 100%|██████████| 10090/10090 [00:31<00:00, 316.52it/s]\n", + "GPU: 0, Part: 36: 100%|██████████| 10019/10019 [00:32<00:00, 307.36it/s]\n", + "GPU: 0, Part: 33: 100%|██████████| 10187/10187 [00:32<00:00, 313.55it/s]\n", + "GPU: 0, Part: 27: 100%|██████████| 9941/9941 [00:32<00:00, 310.53it/s]\n", + "GPU: 0, Part: 28: 100%|██████████| 10095/10095 [00:32<00:00, 315.00it/s]\n", + "GPU: 0, Part: 31: 100%|██████████| 10150/10150 [00:32<00:00, 311.38it/s]\n", + "GPU: 0, Part: 34: 100%|██████████| 9934/9934 [00:33<00:00, 294.76it/s]s]\n", + "GPU: 0, Part: 37: 100%|██████████| 9939/9939 [00:33<00:00, 293.81it/s]\n", + "GPU: 0, Part: 32: 100%|██████████| 9959/9959 [00:33<00:00, 294.59it/s]s]\n", + "GPU: 0, Part: 25: 100%|██████████| 10204/10204 [00:34<00:00, 298.84it/s]\n", + "GPU: 0, Part: 35: 100%|██████████| 9861/9861 [00:33<00:00, 292.10it/s]s]\n", + "GPU: 0, Part: 30: 100%|██████████| 10031/10031 [00:34<00:00, 294.82it/s]\n", + "GPU: 0, Part: 38: 100%|██████████| 9988/9988 [00:35<00:00, 284.55it/s]\n", + "GPU: 0, Part: 29: 100%|██████████| 10027/10027 [00:34<00:00, 291.28it/s]\n", + "GPU: 0, Part: 24: 100%|██████████| 9799/9799 [00:31<00:00, 312.35it/s]]\n", + "GPU: 0, Part: 23: 100%|██████████| 9905/9905 [00:32<00:00, 306.16it/s]]\n", + "GPU: 0, Part: 22: 100%|██████████| 9658/9658 [00:30<00:00, 316.38it/s]]\n", + "GPU: 0, Part: 17: 100%|██████████| 9986/9986 [00:31<00:00, 320.21it/s]]]\n", + "GPU: 0, Part: 12: 100%|██████████| 10014/10014 [00:31<00:00, 322.65it/s]\n", + "GPU: 0, Part: 20: 100%|██████████| 9421/9421 [00:31<00:00, 300.02it/s]]\n", + "GPU: 0, Part: 18: 100%|██████████| 9658/9658 [00:32<00:00, 298.53it/s]]\n", + "GPU: 0, Part: 21: 100%|██████████| 9559/9559 [00:30<00:00, 308.45it/s]]\n", + "GPU: 0, Part: 19: 100%|██████████| 9664/9664 [00:31<00:00, 304.16it/s]s]\n", + "GPU: 0, Part: 15: 100%|██████████| 9860/9860 [00:31<00:00, 315.41it/s]]]\n", + "GPU: 0, Part: 14: 100%|██████████| 10159/10159 [00:32<00:00, 314.31it/s]\n", + "GPU: 0, Part: 13: 100%|██████████| 10260/10260 [00:34<00:00, 300.39it/s]\n", + "GPU: 0, Part: 10: 100%|██████████| 10061/10061 [00:31<00:00, 314.94it/s]\n", + "GPU: 0, Part: 16: 100%|██████████| 10009/10009 [00:34<00:00, 292.21it/s]\n", + "GPU: 0, Part: 11: 100%|██████████| 10284/10284 [00:34<00:00, 298.47it/s]\n", + "GPU: 0, Part: 2: 100%|██████████| 9684/9684 [00:35<00:00, 273.30it/s]\n", + "GPU: 0, Part: 1: 100%|██████████| 9698/9698 [00:30<00:00, 315.30it/s]\n", + "GPU: 0, Part: 0: 100%|██████████| 9729/9729 [00:31<00:00, 309.69it/s]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-05-09 01:05:38,916 - tornado.application - ERROR - Uncaught exception GET /status/ws (10.2.226.46)\n", + "HTTPServerRequest(protocol='http', host='10.120.104.12:8787', method='GET', uri='/status/ws', version='HTTP/1.1', remote_ip='10.2.226.46')\n", + "Traceback (most recent call last):\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/web.py\", line 1790, in _execute\n", + " result = await result\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 273, in get\n", + " await self.ws_connection.accept_connection(self)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 863, in accept_connection\n", + " await self._accept_connection(handler)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 946, in _accept_connection\n", + " await self._receive_frame_loop()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 1105, in _receive_frame_loop\n", + " self.handler.on_ws_connection_close(self.close_code, self.close_reason)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 571, in on_ws_connection_close\n", + " self.on_connection_close()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 563, in on_connection_close\n", + " self.on_close()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/views/ws.py\", line 308, in on_close\n", + " self.connection.session.notify_connection_lost()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/connection.py\", line 65, in session\n", + " assert self._session is not None\n", + "AssertionError\n", + "2024-05-09 01:05:43,132 - bokeh.server.protocol_handler - ERROR - error handling message\n", + " message: Message 'PATCH-DOC' content: {'events': [{'kind': 'ModelChanged', 'model': {'id': 'p9330'}, 'attr': 'inner_width', 'new': 834}, {'kind': 'ModelChanged', 'model': {'id': 'p9330'}, 'attr': 'inner_height', 'new': 863}, {'kind': 'ModelChanged', 'model': {'id': 'p9330'}, 'attr': 'outer_width', 'new': 854}, {'kind': 'ModelChanged', 'model': {'id': 'p9330'}, 'attr': 'outer_height', 'new': 895}]} \n", + " error: AssertionError()\n", + "Traceback (most recent call last):\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/protocol_handler.py\", line 97, in handle\n", + " work = await handler(message, connection)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/session.py\", line 295, in patch\n", + " return connection.session._handle_patch(message, connection)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/connection.py\", line 65, in session\n", + " assert self._session is not None\n", + "AssertionError\n", + "2024-05-09 01:05:44,167 - bokeh.server.protocol_handler - ERROR - error handling message\n", + " message: Message 'PATCH-DOC' content: {'events': [{'kind': 'MessageSent', 'msg_type': 'bokeh_event', 'msg_data': {'type': 'event', 'name': 'document_ready', 'values': {'type': 'map'}}}]} \n", + " error: AssertionError()\n", + "Traceback (most recent call last):\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/protocol_handler.py\", line 97, in handle\n", + " work = await handler(message, connection)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/session.py\", line 295, in patch\n", + " return connection.session._handle_patch(message, connection)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/connection.py\", line 65, in session\n", + " assert self._session is not None\n", + "AssertionError\n", + "2024-05-09 01:05:44,171 - tornado.application - ERROR - Uncaught exception GET /gpu/ws (10.2.226.46)\n", + "HTTPServerRequest(protocol='http', host='10.120.104.12:8787', method='GET', uri='/gpu/ws', version='HTTP/1.1', remote_ip='10.2.226.46')\n", + "Traceback (most recent call last):\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/web.py\", line 1790, in _execute\n", + " result = await result\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 273, in get\n", + " await self.ws_connection.accept_connection(self)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 863, in accept_connection\n", + " await self._accept_connection(handler)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 946, in _accept_connection\n", + " await self._receive_frame_loop()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 1105, in _receive_frame_loop\n", + " self.handler.on_ws_connection_close(self.close_code, self.close_reason)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 571, in on_ws_connection_close\n", + " self.on_connection_close()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 563, in on_connection_close\n", + " self.on_close()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/views/ws.py\", line 308, in on_close\n", + " self.connection.session.notify_connection_lost()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/connection.py\", line 65, in session\n", + " assert self._session is not None\n", + "AssertionError\n", + "2024-05-09 01:05:55,279 - tornado.application - ERROR - Uncaught exception GET /status/ws (10.2.226.46)\n", + "HTTPServerRequest(protocol='http', host='10.120.104.12:8787', method='GET', uri='/status/ws', version='HTTP/1.1', remote_ip='10.2.226.46')\n", + "Traceback (most recent call last):\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/web.py\", line 1790, in _execute\n", + " result = await result\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 273, in get\n", + " await self.ws_connection.accept_connection(self)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 863, in accept_connection\n", + " await self._accept_connection(handler)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 946, in _accept_connection\n", + " await self._receive_frame_loop()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 1105, in _receive_frame_loop\n", + " self.handler.on_ws_connection_close(self.close_code, self.close_reason)\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 571, in on_ws_connection_close\n", + " self.on_connection_close()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/tornado/websocket.py\", line 563, in on_connection_close\n", + " self.on_close()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/views/ws.py\", line 308, in on_close\n", + " self.connection.session.notify_connection_lost()\n", + " File \"/datasets/vjawa/miniconda3/envs/NeMo-Curator-env-2/lib/python3.10/site-packages/bokeh/server/connection.py\", line 65, in session\n", + " assert self._session is not None\n", + "AssertionError\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Writing to disk complete for 16 partitions\n", - "CPU times: user 11.4 s, sys: 11.4 s, total: 22.7 s\n", - "Wall time: 49.7 s\n" + "Writing to disk complete for 50 partitions\n", + "CPU times: user 39.5 s, sys: 27.2 s, total: 1min 6s\n", + "Wall time: 3min 14s\n" ] } ], @@ -138,7 +273,7 @@ "domain_classifier = DomainClassifier(\n", " model_file_name=model_file_name,\n", " labels=labels,\n", - " batch_size=256,\n", + " batch_size=1024,\n", ")\n", "result_dataset = domain_classifier(dataset=input_dataset)\n", "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=True)" @@ -146,14 +281,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Reading 16 files\n" + "Reading 50 files\n" ] } ], @@ -163,7 +298,27 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/raid/vjawa/output_subset_CC-MAIN-2023-14_english'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_file_path" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -190,16 +345,16 @@ " <th></th>\n", " <th>adlr_id</th>\n", " <th>filename</th>\n", - " <th>id</th>\n", " <th>labels</th>\n", - " <th>pred</th>\n", + " <th>langid_score</th>\n", + " <th>language</th>\n", " <th>source_id</th>\n", - " <th>split_id</th>\n", " <th>text</th>\n", " <th>url</th>\n", + " <th>warc_id</th>\n", " </tr>\n", " <tr>\n", - " <th>npartitions=16</th>\n", + " <th>npartitions=50</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", @@ -217,7 +372,7 @@ " <td>object</td>\n", " <td>object</td>\n", " <td>object</td>\n", - " <td>object</td>\n", + " <td>float64</td>\n", " <td>object</td>\n", " <td>object</td>\n", " <td>object</td>\n", @@ -278,10 +433,10 @@ "<div>Dask Name: read_single_partition, 1 graph layer</div>" ], "text/plain": [ - "<dask_cudf.DataFrame | 16 tasks | 16 npartitions>" + "<dask_cudf.DataFrame | 50 tasks | 50 npartitions>" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -292,7 +447,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -318,203 +473,215 @@ " <th></th>\n", " <th>adlr_id</th>\n", " <th>filename</th>\n", - " <th>id</th>\n", " <th>labels</th>\n", - " <th>pred</th>\n", + " <th>langid_score</th>\n", + " <th>language</th>\n", " <th>source_id</th>\n", - " <th>split_id</th>\n", " <th>text</th>\n", " <th>url</th>\n", + " <th>warc_id</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>cc-2022-40-0431053204</td>\n", - " <td>00.jsonl</td>\n", - " <td>a8083fe4-525d-4888-8513-b91f43bd8ee1</td>\n", - " <td>Online_Communities</td>\n", - " <td>Online_Communities</td>\n", - " <td>crawl-data-CC-MAIN-2022-40-segments-1664030336...</td>\n", - " <td>lambada-0003225258-0000</td>\n", - " <td>Having been a community leader—and member—for ...</td>\n", - " <td>https://lisalarter.com/7-tips-for-building-ste...</td>\n", + " <td>cc-2023-14-0001622299</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Jobs_and_Education</td>\n", + " <td>0.946693</td>\n", + " <td>EN</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Neighborhood Street Fund Application: plans fo...</td>\n", + " <td>http://12thaveseattle.com/blog/2013/02/11/12th...</td>\n", + " <td>6dd74af8-669e-4aaf-b5f8-e2a44f03574b</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>cc-2022-40-0510168267</td>\n", - " <td>00.jsonl</td>\n", - " <td>559febdc-cb7f-4217-897a-c8dac325123b</td>\n", - " <td>Finance</td>\n", - " <td>Finance</td>\n", - " <td>crawl-data-CC-MAIN-2022-40-segments-1664030337...</td>\n", - " <td>lambada-0003918122-0000</td>\n", - " <td>Zelle is a way of sending money to almost anyo...</td>\n", - " <td>https://oregonmassageandwellnessclinic.com/app...</td>\n", + " <td>cc-2023-14-0001622300</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Computers_and_Electronics</td>\n", + " <td>0.918942</td>\n", + " <td>EN</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Main navigation\\n\\nProject Assistance, Managem...</td>\n", + " <td>http://1kenthomas.com/slides/drupal-project-as...</td>\n", + " <td>eb065ae1-4737-4557-b040-96a1ecf67db4</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>cc-2022-40-0695312978</td>\n", - " <td>00.jsonl</td>\n", - " <td>b1ec1a9f-693e-4672-b485-54f48a3dfdb6</td>\n", - " <td>Arts_and_Entertainment</td>\n", - " <td>Arts_and_Entertainment</td>\n", - " <td>crawl-data-CC-MAIN-2022-40-segments-1664030337...</td>\n", - " <td>lambada-0005286343-0000</td>\n", - " <td>Nicole Scherzinger and Enrique Lglesias Get St...</td>\n", - " <td>https://menzmag.com/entertainment/celebrity-go...</td>\n", + " <td>cc-2023-14-0001622301</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Autos_and_Vehicles</td>\n", + " <td>0.937426</td>\n", + " <td>EN</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>RENDI AUTO\\n\\nBMW X5\\n\\nPeriod\\n\\n-\\n\\nName\\n\\...</td>\n", + " <td>http://1rendiauto.ee/en/autod-2/?auto_id=13</td>\n", + " <td>9f69f52e-350d-4677-8609-d7f3c8d759c6</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>cc-2022-40-0318121708</td>\n", - " <td>00.jsonl</td>\n", - " <td>f1217f04-58d3-4c88-8d33-250401b219f6</td>\n", - " <td>Internet_and_Telecom</td>\n", - " <td>Internet_and_Telecom</td>\n", - " <td>crawl-data-CC-MAIN-2022-40-segments-1664030335...</td>\n", - " <td>lambada-0002386272-0000</td>\n", - " <td>Thanksgiving 2021 WhatsApp Status Video to Dow...</td>\n", - " <td>https://nonstop-news.com/lifestyle/thanksgivin...</td>\n", + " <td>cc-2023-14-0001622302</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Sensitive_Subjects</td>\n", + " <td>0.976790</td>\n", + " <td>EN</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Now based on multiple underwater UFO encounter...</td>\n", + " <td>http://24newstodays.com/2022/12/23/weird-encou...</td>\n", + " <td>256174a9-506e-4b23-9580-de10e3ab4590</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>cc-2022-40-0602859436</td>\n", - " <td>00.jsonl</td>\n", - " <td>d255ebe4-0601-469b-a5d3-c4102d83dabd</td>\n", - " <td>Games</td>\n", - " <td>Games</td>\n", - " <td>crawl-data-CC-MAIN-2022-40-segments-1664030337...</td>\n", - " <td>lambada-0004541139-0000</td>\n", - " <td>Lakeside Inn And Casino Lake Tahoe – Online si...</td>\n", - " <td>https://psplondon.com/lakeside-inn-and-casino-...</td>\n", + " <td>cc-2023-14-0001622303</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Jobs_and_Education</td>\n", + " <td>0.906984</td>\n", + " <td>EN</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>We are commited to providing the highest level...</td>\n", + " <td>http://434caaeea2929142-u.edu-newsletters.com/...</td>\n", + " <td>f87fd3c6-1450-4260-93e5-bed50ea608e4</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", - " <td>cc-2022-40-0025406361</td>\n", - " <td>00.jsonl</td>\n", - " <td>5d598bfa-ca17-4203-800c-4d02072c3b87</td>\n", - " <td>Books_and_Literature</td>\n", - " <td>Books_and_Literature</td>\n", - " <td>crawl-data-CC-MAIN-2022-40-segments-1664030334...</td>\n", - " <td>lambada-0000190248-0000</td>\n", - " <td>A THOUSAND WORDS - Alex Waterhouse-Hayward's b...</td>\n", - " <td>http://blog.alexwaterhousehayward.com/2006/03/...</td>\n", + " <td>cc-2023-14-0001622304</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Sports</td>\n", + " <td>0.977782</td>\n", + " <td>EN</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>football\\n\\nThe football season in Bulgaria ha...</td>\n", + " <td>http://4liberty.eu/tag/football/</td>\n", + " <td>933d5938-d5b7-4745-ba69-db153fb25173</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", - " <td>cc-2022-40-0605292636</td>\n", - " <td>00.jsonl</td>\n", - " <td>42ced198-6cdb-4ef2-bf4d-dc1254da0da6</td>\n", - " <td>Shopping</td>\n", - " <td>Beauty_and_Fitness</td>\n", - " <td>crawl-data-CC-MAIN-2022-40-segments-1664030337...</td>\n", - " <td>lambada-0004601177-0000</td>\n", - " <td>Search our store\\n\\nCLOCKWORK ORANGE OUTFIT\\n\\...</td>\n", - " <td>https://dressx.com/products/clockwork-orange-o...</td>\n", + " <td>cc-2023-14-0001622305</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Pets_and_Animals</td>\n", + " <td>0.942719</td>\n", + " <td>EN</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>As leaders of the No Kill Movement on Maui, we...</td>\n", + " <td>http://9thlifehawaii.org/site/Spay-amp-Neuter-...</td>\n", + " <td>c1a83c12-0fbb-4fc9-8c68-4a109d71048e</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", - " <td>cc-2022-40-0270701137</td>\n", - " <td>00.jsonl</td>\n", - " <td>a0bbffa6-670d-43e4-8027-9fc0862df95f</td>\n", - " <td>News</td>\n", - " <td>News</td>\n", - " <td>crawl-data-CC-MAIN-2022-40-segments-1664030335...</td>\n", - " <td>lambada-0002122651-0000</td>\n", - " <td>The Democrat Police State Imposes its Tyranny\\...</td>\n", - " <td>https://www.paulcraigroberts.org/2022/08/13/th...</td>\n", + " <td>cc-2023-14-0001622306</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Internet_and_Telecom</td>\n", + " <td>0.948170</td>\n", + " <td>EN</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>The IP address for this domain may have change...</td>\n", + " <td>http://a1levelingcleveland.com/cgi-sys/default...</td>\n", + " <td>48829d73-dc4b-4423-8dd5-db32ac6a4349</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", - " <td>cc-2022-40-0130518751</td>\n", - " <td>00.jsonl</td>\n", - " <td>80948f1a-0970-4bc4-879a-22725a388d62</td>\n", - " <td>Games</td>\n", - " <td>Games</td>\n", - " <td>crawl-data-CC-MAIN-2022-40-segments-1664030334...</td>\n", - " <td>lambada-0000961821-0000</td>\n", - " <td>How to Play the Lottery Online\\n\\nThe lottery ...</td>\n", - " <td>https://moellerdog.com/index.php/2022/09/16/ho...</td>\n", + " <td>cc-2023-14-0001622307</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Food_and_Drink</td>\n", + " <td>0.971278</td>\n", + " <td>EN</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>A\\nvegetarian since the age of 15, Abbey Levin...</td>\n", + " <td>http://abbeysvegetarianrecipes.com/abbey.html</td>\n", + " <td>223a469b-6001-4820-aa2d-7eadb6afa6c4</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", - " <td>cc-2022-40-0430464926</td>\n", - " <td>00.jsonl</td>\n", - " <td>234d085c-f735-4b2e-bcfd-edc65fb4ed22</td>\n", - " <td>Beauty_and_Fitness</td>\n", - " <td>Beauty_and_Fitness</td>\n", - " <td>crawl-data-CC-MAIN-2022-40-segments-1664030336...</td>\n", - " <td>lambada-0003227706-0000</td>\n", - " <td>LASER LIPOSUCTION\\n\\nLaser Liposuction works b...</td>\n", - " <td>https://shapechicagoland.com/services/body-tre...</td>\n", + " <td>cc-2023-14-0001622308</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Online_Communities</td>\n", + " <td>0.864449</td>\n", + " <td>EN</td>\n", + " <td>crawl-data-CC-MAIN-2023-14-segments-1679296943...</td>\n", + " <td>Search This Blog\\n\\nDisclaimer\\n\\nThe owners, ...</td>\n", + " <td>http://abeckslife.blogspot.com/2011/12/woodwri...</td>\n", + " <td>9c57d2eb-dd01-4cc6-a473-7ae30ba26e3e</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " adlr_id filename id \\\n", - "0 cc-2022-40-0431053204 00.jsonl a8083fe4-525d-4888-8513-b91f43bd8ee1 \n", - "1 cc-2022-40-0510168267 00.jsonl 559febdc-cb7f-4217-897a-c8dac325123b \n", - "2 cc-2022-40-0695312978 00.jsonl b1ec1a9f-693e-4672-b485-54f48a3dfdb6 \n", - "3 cc-2022-40-0318121708 00.jsonl f1217f04-58d3-4c88-8d33-250401b219f6 \n", - "4 cc-2022-40-0602859436 00.jsonl d255ebe4-0601-469b-a5d3-c4102d83dabd \n", - "5 cc-2022-40-0025406361 00.jsonl 5d598bfa-ca17-4203-800c-4d02072c3b87 \n", - "6 cc-2022-40-0605292636 00.jsonl 42ced198-6cdb-4ef2-bf4d-dc1254da0da6 \n", - "7 cc-2022-40-0270701137 00.jsonl a0bbffa6-670d-43e4-8027-9fc0862df95f \n", - "8 cc-2022-40-0130518751 00.jsonl 80948f1a-0970-4bc4-879a-22725a388d62 \n", - "9 cc-2022-40-0430464926 00.jsonl 234d085c-f735-4b2e-bcfd-edc65fb4ed22 \n", + " adlr_id filename \\\n", + "0 cc-2023-14-0001622299 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "1 cc-2023-14-0001622300 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "2 cc-2023-14-0001622301 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "3 cc-2023-14-0001622302 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "4 cc-2023-14-0001622303 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "5 cc-2023-14-0001622304 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "6 cc-2023-14-0001622305 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "7 cc-2023-14-0001622306 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "8 cc-2023-14-0001622307 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "9 cc-2023-14-0001622308 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", "\n", - " labels pred \\\n", - "0 Online_Communities Online_Communities \n", - "1 Finance Finance \n", - "2 Arts_and_Entertainment Arts_and_Entertainment \n", - "3 Internet_and_Telecom Internet_and_Telecom \n", - "4 Games Games \n", - "5 Books_and_Literature Books_and_Literature \n", - "6 Shopping Beauty_and_Fitness \n", - "7 News News \n", - "8 Games Games \n", - "9 Beauty_and_Fitness Beauty_and_Fitness \n", + " labels langid_score language \\\n", + "0 Jobs_and_Education 0.946693 EN \n", + "1 Computers_and_Electronics 0.918942 EN \n", + "2 Autos_and_Vehicles 0.937426 EN \n", + "3 Sensitive_Subjects 0.976790 EN \n", + "4 Jobs_and_Education 0.906984 EN \n", + "5 Sports 0.977782 EN \n", + "6 Pets_and_Animals 0.942719 EN \n", + "7 Internet_and_Telecom 0.948170 EN \n", + "8 Food_and_Drink 0.971278 EN \n", + "9 Online_Communities 0.864449 EN \n", "\n", - " source_id split_id \\\n", - "0 crawl-data-CC-MAIN-2022-40-segments-1664030336... lambada-0003225258-0000 \n", - "1 crawl-data-CC-MAIN-2022-40-segments-1664030337... lambada-0003918122-0000 \n", - "2 crawl-data-CC-MAIN-2022-40-segments-1664030337... lambada-0005286343-0000 \n", - "3 crawl-data-CC-MAIN-2022-40-segments-1664030335... lambada-0002386272-0000 \n", - "4 crawl-data-CC-MAIN-2022-40-segments-1664030337... lambada-0004541139-0000 \n", - "5 crawl-data-CC-MAIN-2022-40-segments-1664030334... lambada-0000190248-0000 \n", - "6 crawl-data-CC-MAIN-2022-40-segments-1664030337... lambada-0004601177-0000 \n", - "7 crawl-data-CC-MAIN-2022-40-segments-1664030335... lambada-0002122651-0000 \n", - "8 crawl-data-CC-MAIN-2022-40-segments-1664030334... lambada-0000961821-0000 \n", - "9 crawl-data-CC-MAIN-2022-40-segments-1664030336... lambada-0003227706-0000 \n", + " source_id \\\n", + "0 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "1 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "2 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "3 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "4 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "5 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "6 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "7 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "8 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", + "9 crawl-data-CC-MAIN-2023-14-segments-1679296943... \n", "\n", " text \\\n", - "0 Having been a community leader—and member—for ... \n", - "1 Zelle is a way of sending money to almost anyo... \n", - "2 Nicole Scherzinger and Enrique Lglesias Get St... \n", - "3 Thanksgiving 2021 WhatsApp Status Video to Dow... \n", - "4 Lakeside Inn And Casino Lake Tahoe – Online si... \n", - "5 A THOUSAND WORDS - Alex Waterhouse-Hayward's b... \n", - "6 Search our store\\n\\nCLOCKWORK ORANGE OUTFIT\\n\\... \n", - "7 The Democrat Police State Imposes its Tyranny\\... \n", - "8 How to Play the Lottery Online\\n\\nThe lottery ... \n", - "9 LASER LIPOSUCTION\\n\\nLaser Liposuction works b... \n", + "0 Neighborhood Street Fund Application: plans fo... \n", + "1 Main navigation\\n\\nProject Assistance, Managem... \n", + "2 RENDI AUTO\\n\\nBMW X5\\n\\nPeriod\\n\\n-\\n\\nName\\n\\... \n", + "3 Now based on multiple underwater UFO encounter... \n", + "4 We are commited to providing the highest level... \n", + "5 football\\n\\nThe football season in Bulgaria ha... \n", + "6 As leaders of the No Kill Movement on Maui, we... \n", + "7 The IP address for this domain may have change... \n", + "8 A\\nvegetarian since the age of 15, Abbey Levin... \n", + "9 Search This Blog\\n\\nDisclaimer\\n\\nThe owners, ... \n", + "\n", + " url \\\n", + "0 http://12thaveseattle.com/blog/2013/02/11/12th... \n", + "1 http://1kenthomas.com/slides/drupal-project-as... \n", + "2 http://1rendiauto.ee/en/autod-2/?auto_id=13 \n", + "3 http://24newstodays.com/2022/12/23/weird-encou... \n", + "4 http://434caaeea2929142-u.edu-newsletters.com/... \n", + "5 http://4liberty.eu/tag/football/ \n", + "6 http://9thlifehawaii.org/site/Spay-amp-Neuter-... \n", + "7 http://a1levelingcleveland.com/cgi-sys/default... \n", + "8 http://abbeysvegetarianrecipes.com/abbey.html \n", + "9 http://abeckslife.blogspot.com/2011/12/woodwri... \n", "\n", - " url \n", - "0 https://lisalarter.com/7-tips-for-building-ste... \n", - "1 https://oregonmassageandwellnessclinic.com/app... \n", - "2 https://menzmag.com/entertainment/celebrity-go... \n", - "3 https://nonstop-news.com/lifestyle/thanksgivin... \n", - "4 https://psplondon.com/lakeside-inn-and-casino-... \n", - "5 http://blog.alexwaterhousehayward.com/2006/03/... \n", - "6 https://dressx.com/products/clockwork-orange-o... \n", - "7 https://www.paulcraigroberts.org/2022/08/13/th... \n", - "8 https://moellerdog.com/index.php/2022/09/16/ho... \n", - "9 https://shapechicagoland.com/services/body-tre... " + " warc_id \n", + "0 6dd74af8-669e-4aaf-b5f8-e2a44f03574b \n", + "1 eb065ae1-4737-4557-b040-96a1ecf67db4 \n", + "2 9f69f52e-350d-4677-8609-d7f3c8d759c6 \n", + "3 256174a9-506e-4b23-9580-de10e3ab4590 \n", + "4 f87fd3c6-1450-4260-93e5-bed50ea608e4 \n", + "5 933d5938-d5b7-4745-ba69-db153fb25173 \n", + "6 c1a83c12-0fbb-4fc9-8c68-4a109d71048e \n", + "7 48829d73-dc4b-4423-8dd5-db32ac6a4349 \n", + "8 223a469b-6001-4820-aa2d-7eadb6afa6c4 \n", + "9 9c57d2eb-dd01-4cc6-a473-7ae30ba26e3e " ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" }