From 95de32397509a4a908e0998c7460a02be0875cae Mon Sep 17 00:00:00 2001 From: RodionfromHSE Date: Thu, 26 Oct 2023 22:00:16 +0200 Subject: [PATCH] fix download script --- notebooks/data/dataset_download.ipynb | 95 ++++++++------------------- 1 file changed, 29 insertions(+), 66 deletions(-) diff --git a/notebooks/data/dataset_download.ipynb b/notebooks/data/dataset_download.ipynb index f2243ef..f703887 100644 --- a/notebooks/data/dataset_download.ipynb +++ b/notebooks/data/dataset_download.ipynb @@ -22,49 +22,16 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "52792it [04:47, 39.63it/s] " - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/user010/Desktop/Programming/ML/En2RuTranslator/notebooks/data/dataset_download.ipynb Cell 2\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 14\u001b[0m bar\u001b[39m.\u001b[39mupdate(\u001b[39m1\u001b[39m)\n\u001b[1;32m 16\u001b[0m \u001b[39mreturn\u001b[39;00m samples\n\u001b[0;32m---> 18\u001b[0m train_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39;49mn_train, split\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mtrain\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 19\u001b[0m valid_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39mn_valid, split\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mvalidation\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 20\u001b[0m test_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39mn_test, split\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m'\u001b[39m)\n", - "\u001b[1;32m/Users/user010/Desktop/Programming/ML/En2RuTranslator/notebooks/data/dataset_download.ipynb Cell 2\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 9\u001b[0m samples \u001b[39m=\u001b[39m []\n\u001b[1;32m 10\u001b[0m bar \u001b[39m=\u001b[39m tqdm(total\u001b[39m=\u001b[39mn)\n\u001b[0;32m---> 11\u001b[0m \u001b[39mfor\u001b[39;49;00m sample \u001b[39min\u001b[39;49;00m dataset:\n\u001b[1;32m 12\u001b[0m \u001b[39mif\u001b[39;49;00m config\u001b[39m.\u001b[39;49mmin_chars_len \u001b[39m<\u001b[39;49m\u001b[39m=\u001b[39;49m \u001b[39mlen\u001b[39;49m(sample[\u001b[39m'\u001b[39;49m\u001b[39mtext\u001b[39;49m\u001b[39m'\u001b[39;49m]) \u001b[39m<\u001b[39;49m\u001b[39m=\u001b[39;49m config\u001b[39m.\u001b[39;49mmax_chars_len:\n\u001b[1;32m 13\u001b[0m samples\u001b[39m.\u001b[39;49mappend(sample)\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/datasets/iterable_dataset.py:1379\u001b[0m, in \u001b[0;36mIterableDataset.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1376\u001b[0m \u001b[39myield\u001b[39;00m formatter\u001b[39m.\u001b[39mformat_row(pa_table)\n\u001b[1;32m 1377\u001b[0m \u001b[39mreturn\u001b[39;00m\n\u001b[0;32m-> 1379\u001b[0m \u001b[39mfor\u001b[39;49;00m key, example \u001b[39min\u001b[39;49;00m ex_iterable:\n\u001b[1;32m 1380\u001b[0m \u001b[39mif\u001b[39;49;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfeatures:\n\u001b[1;32m 1381\u001b[0m \u001b[39m# `IterableDataset` automatically fills missing columns with None.\u001b[39;49;00m\n\u001b[1;32m 1382\u001b[0m \u001b[39m# This is done with `_apply_feature_types_on_example`.\u001b[39;49;00m\n\u001b[1;32m 1383\u001b[0m example \u001b[39m=\u001b[39;49m _apply_feature_types_on_example(\n\u001b[1;32m 1384\u001b[0m example, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfeatures, token_per_repo_id\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_token_per_repo_id\n\u001b[1;32m 1385\u001b[0m )\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/datasets/iterable_dataset.py:281\u001b[0m, in \u001b[0;36mArrowExamplesIterable.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__iter__\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m 280\u001b[0m formatter \u001b[39m=\u001b[39m PythonFormatter()\n\u001b[0;32m--> 281\u001b[0m \u001b[39mfor\u001b[39;49;00m key, pa_table \u001b[39min\u001b[39;49;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgenerate_tables_fn(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mkwargs):\n\u001b[1;32m 282\u001b[0m \u001b[39mfor\u001b[39;49;00m pa_subtable \u001b[39min\u001b[39;49;00m pa_table\u001b[39m.\u001b[39;49mto_reader(max_chunksize\u001b[39m=\u001b[39;49mconfig\u001b[39m.\u001b[39;49mARROW_READER_BATCH_SIZE_IN_DATASET_ITER):\n\u001b[1;32m 283\u001b[0m formatted_batch \u001b[39m=\u001b[39;49m formatter\u001b[39m.\u001b[39;49mformat_batch(pa_subtable)\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/datasets/packaged_modules/json/json.py:107\u001b[0m, in \u001b[0;36mJson._generate_tables\u001b[0;34m(self, files)\u001b[0m\n\u001b[1;32m 103\u001b[0m encoding_errors \u001b[39m=\u001b[39m (\n\u001b[1;32m 104\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39mencoding_errors \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39mencoding_errors \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mstrict\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 105\u001b[0m )\n\u001b[1;32m 106\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[0;32m--> 107\u001b[0m batch \u001b[39m=\u001b[39m f\u001b[39m.\u001b[39;49mread(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mconfig\u001b[39m.\u001b[39;49mchunksize)\n\u001b[1;32m 108\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m batch:\n\u001b[1;32m 109\u001b[0m \u001b[39mbreak\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/datasets/download/streaming_download_manager.py:333\u001b[0m, in \u001b[0;36m_add_retries_to_file_obj_read_method..read_with_retries\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[39mfor\u001b[39;00m retry \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39m1\u001b[39m, max_retries \u001b[39m+\u001b[39m \u001b[39m1\u001b[39m):\n\u001b[1;32m 332\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 333\u001b[0m out \u001b[39m=\u001b[39m read(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 334\u001b[0m \u001b[39mbreak\u001b[39;00m\n\u001b[1;32m 335\u001b[0m \u001b[39mexcept\u001b[39;00m (ClientError, \u001b[39mTimeoutError\u001b[39;00m) \u001b[39mas\u001b[39;00m err:\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/fsspec/spec.py:1856\u001b[0m, in \u001b[0;36mAbstractBufferedFile.read\u001b[0;34m(self, length)\u001b[0m\n\u001b[1;32m 1853\u001b[0m \u001b[39mif\u001b[39;00m length \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 1854\u001b[0m \u001b[39m# don't even bother calling fetch\u001b[39;00m\n\u001b[1;32m 1855\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m-> 1856\u001b[0m out \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcache\u001b[39m.\u001b[39;49m_fetch(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mloc, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mloc \u001b[39m+\u001b[39;49m length)\n\u001b[1;32m 1857\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mloc \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39mlen\u001b[39m(out)\n\u001b[1;32m 1858\u001b[0m \u001b[39mreturn\u001b[39;00m out\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/fsspec/caching.py:189\u001b[0m, in \u001b[0;36mReadAheadCache._fetch\u001b[0;34m(self, start, end)\u001b[0m\n\u001b[1;32m 187\u001b[0m part \u001b[39m=\u001b[39m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 188\u001b[0m end \u001b[39m=\u001b[39m \u001b[39mmin\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msize, end \u001b[39m+\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mblocksize)\n\u001b[0;32m--> 189\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcache \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfetcher(start, end) \u001b[39m# new block replaces old\u001b[39;00m\n\u001b[1;32m 190\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstart \u001b[39m=\u001b[39m start\n\u001b[1;32m 191\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mend \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstart \u001b[39m+\u001b[39m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcache)\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py:422\u001b[0m, in \u001b[0;36mHfFileSystemFile._fetch_range\u001b[0;34m(self, start, end)\u001b[0m\n\u001b[1;32m 415\u001b[0m headers \u001b[39m=\u001b[39m {\n\u001b[1;32m 416\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mrange\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mbytes=\u001b[39m\u001b[39m{\u001b[39;00mstart\u001b[39m}\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m{\u001b[39;00mend\u001b[39m \u001b[39m\u001b[39m-\u001b[39m\u001b[39m \u001b[39m\u001b[39m1\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 417\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfs\u001b[39m.\u001b[39m_api\u001b[39m.\u001b[39m_build_hf_headers(),\n\u001b[1;32m 418\u001b[0m }\n\u001b[1;32m 419\u001b[0m url \u001b[39m=\u001b[39m (\n\u001b[1;32m 420\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfs\u001b[39m.\u001b[39mendpoint\u001b[39m}\u001b[39;00m\u001b[39m/\u001b[39m\u001b[39m{\u001b[39;00mREPO_TYPES_URL_PREFIXES\u001b[39m.\u001b[39mget(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolved_path\u001b[39m.\u001b[39mrepo_type,\u001b[39m \u001b[39m\u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)\u001b[39m \u001b[39m\u001b[39m+\u001b[39m\u001b[39m \u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolved_path\u001b[39m.\u001b[39mrepo_id\u001b[39m}\u001b[39;00m\u001b[39m/resolve/\u001b[39m\u001b[39m{\u001b[39;00msafe_quote(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolved_path\u001b[39m.\u001b[39mrevision)\u001b[39m}\u001b[39;00m\u001b[39m/\u001b[39m\u001b[39m{\u001b[39;00msafe_quote(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolved_path\u001b[39m.\u001b[39mpath_in_repo)\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m 421\u001b[0m )\n\u001b[0;32m--> 422\u001b[0m r \u001b[39m=\u001b[39m http_backoff(\u001b[39m\"\u001b[39;49m\u001b[39mGET\u001b[39;49m\u001b[39m\"\u001b[39;49m, url, headers\u001b[39m=\u001b[39;49mheaders)\n\u001b[1;32m 423\u001b[0m hf_raise_for_status(r)\n\u001b[1;32m 424\u001b[0m \u001b[39mreturn\u001b[39;00m r\u001b[39m.\u001b[39mcontent\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/huggingface_hub/utils/_http.py:258\u001b[0m, in \u001b[0;36mhttp_backoff\u001b[0;34m(method, url, max_retries, base_wait_time, max_wait_time, retry_on_exceptions, retry_on_status_codes, **kwargs)\u001b[0m\n\u001b[1;32m 255\u001b[0m kwargs[\u001b[39m\"\u001b[39m\u001b[39mdata\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mseek(io_obj_initial_pos)\n\u001b[1;32m 257\u001b[0m \u001b[39m# Perform request and return if status_code is not in the retry list.\u001b[39;00m\n\u001b[0;32m--> 258\u001b[0m response \u001b[39m=\u001b[39m session\u001b[39m.\u001b[39;49mrequest(method\u001b[39m=\u001b[39;49mmethod, url\u001b[39m=\u001b[39;49murl, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 259\u001b[0m \u001b[39mif\u001b[39;00m response\u001b[39m.\u001b[39mstatus_code \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m retry_on_status_codes:\n\u001b[1;32m 260\u001b[0m \u001b[39mreturn\u001b[39;00m response\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 584\u001b[0m send_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 585\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtimeout\u001b[39m\u001b[39m\"\u001b[39m: timeout,\n\u001b[1;32m 586\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mallow_redirects\u001b[39m\u001b[39m\"\u001b[39m: allow_redirects,\n\u001b[1;32m 587\u001b[0m }\n\u001b[1;32m 588\u001b[0m send_kwargs\u001b[39m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msend(prep, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49msend_kwargs)\n\u001b[1;32m 591\u001b[0m \u001b[39mreturn\u001b[39;00m resp\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/sessions.py:725\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 722\u001b[0m \u001b[39mif\u001b[39;00m allow_redirects:\n\u001b[1;32m 723\u001b[0m \u001b[39m# Redirect resolving generator.\u001b[39;00m\n\u001b[1;32m 724\u001b[0m gen \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolve_redirects(r, request, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m--> 725\u001b[0m history \u001b[39m=\u001b[39m [resp \u001b[39mfor\u001b[39;49;00m resp \u001b[39min\u001b[39;49;00m gen]\n\u001b[1;32m 726\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 727\u001b[0m history \u001b[39m=\u001b[39m []\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/sessions.py:725\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 722\u001b[0m \u001b[39mif\u001b[39;00m allow_redirects:\n\u001b[1;32m 723\u001b[0m \u001b[39m# Redirect resolving generator.\u001b[39;00m\n\u001b[1;32m 724\u001b[0m gen \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolve_redirects(r, request, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m--> 725\u001b[0m history \u001b[39m=\u001b[39m [resp \u001b[39mfor\u001b[39;49;00m resp \u001b[39min\u001b[39;49;00m gen]\n\u001b[1;32m 726\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 727\u001b[0m history \u001b[39m=\u001b[39m []\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/sessions.py:266\u001b[0m, in \u001b[0;36mSessionRedirectMixin.resolve_redirects\u001b[0;34m(self, resp, req, stream, timeout, verify, cert, proxies, yield_requests, **adapter_kwargs)\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[39myield\u001b[39;00m req\n\u001b[1;32m 264\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 266\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msend(\n\u001b[1;32m 267\u001b[0m req,\n\u001b[1;32m 268\u001b[0m stream\u001b[39m=\u001b[39;49mstream,\n\u001b[1;32m 269\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout,\n\u001b[1;32m 270\u001b[0m verify\u001b[39m=\u001b[39;49mverify,\n\u001b[1;32m 271\u001b[0m cert\u001b[39m=\u001b[39;49mcert,\n\u001b[1;32m 272\u001b[0m proxies\u001b[39m=\u001b[39;49mproxies,\n\u001b[1;32m 273\u001b[0m allow_redirects\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m 274\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49madapter_kwargs,\n\u001b[1;32m 275\u001b[0m )\n\u001b[1;32m 277\u001b[0m extract_cookies_to_jar(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcookies, prepared_request, resp\u001b[39m.\u001b[39mraw)\n\u001b[1;32m 279\u001b[0m \u001b[39m# extract redirect url, if any, for the next loop\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/sessions.py:747\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 744\u001b[0m \u001b[39mpass\u001b[39;00m\n\u001b[1;32m 746\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m stream:\n\u001b[0;32m--> 747\u001b[0m r\u001b[39m.\u001b[39;49mcontent\n\u001b[1;32m 749\u001b[0m \u001b[39mreturn\u001b[39;00m r\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/models.py:899\u001b[0m, in \u001b[0;36mResponse.content\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 897\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_content \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 898\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 899\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_content \u001b[39m=\u001b[39m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mjoin(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39miter_content(CONTENT_CHUNK_SIZE)) \u001b[39mor\u001b[39;00m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 901\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_content_consumed \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 902\u001b[0m \u001b[39m# don't need to release the connection; that's been handled by urllib3\u001b[39;00m\n\u001b[1;32m 903\u001b[0m \u001b[39m# since we exhausted the data.\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/models.py:816\u001b[0m, in \u001b[0;36mResponse.iter_content..generate\u001b[0;34m()\u001b[0m\n\u001b[1;32m 814\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mraw, \u001b[39m\"\u001b[39m\u001b[39mstream\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m 815\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 816\u001b[0m \u001b[39myield from\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mraw\u001b[39m.\u001b[39mstream(chunk_size, decode_content\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m 817\u001b[0m \u001b[39mexcept\u001b[39;00m ProtocolError \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 818\u001b[0m \u001b[39mraise\u001b[39;00m ChunkedEncodingError(e)\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/urllib3/response.py:936\u001b[0m, in \u001b[0;36mHTTPResponse.stream\u001b[0;34m(self, amt, decode_content)\u001b[0m\n\u001b[1;32m 934\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 935\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mnot\u001b[39;00m is_fp_closed(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_fp) \u001b[39mor\u001b[39;00m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_decoded_buffer) \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m--> 936\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mread(amt\u001b[39m=\u001b[39;49mamt, decode_content\u001b[39m=\u001b[39;49mdecode_content)\n\u001b[1;32m 938\u001b[0m \u001b[39mif\u001b[39;00m data:\n\u001b[1;32m 939\u001b[0m \u001b[39myield\u001b[39;00m data\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/urllib3/response.py:879\u001b[0m, in \u001b[0;36mHTTPResponse.read\u001b[0;34m(self, amt, decode_content, cache_content)\u001b[0m\n\u001b[1;32m 876\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_decoded_buffer) \u001b[39m>\u001b[39m\u001b[39m=\u001b[39m amt:\n\u001b[1;32m 877\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_decoded_buffer\u001b[39m.\u001b[39mget(amt)\n\u001b[0;32m--> 879\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_raw_read(amt)\n\u001b[1;32m 881\u001b[0m flush_decoder \u001b[39m=\u001b[39m amt \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mor\u001b[39;00m (amt \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m data)\n\u001b[1;32m 883\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m data \u001b[39mand\u001b[39;00m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_decoded_buffer) \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/urllib3/response.py:814\u001b[0m, in \u001b[0;36mHTTPResponse._raw_read\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 811\u001b[0m fp_closed \u001b[39m=\u001b[39m \u001b[39mgetattr\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_fp, \u001b[39m\"\u001b[39m\u001b[39mclosed\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 813\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_error_catcher():\n\u001b[0;32m--> 814\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_fp_read(amt) \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m fp_closed \u001b[39melse\u001b[39;00m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 815\u001b[0m \u001b[39mif\u001b[39;00m amt \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m amt \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m data:\n\u001b[1;32m 816\u001b[0m \u001b[39m# Platform-specific: Buggy versions of Python.\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[39m# Close the connection when no data is returned\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 822\u001b[0m \u001b[39m# not properly close the connection in all cases. There is\u001b[39;00m\n\u001b[1;32m 823\u001b[0m \u001b[39m# no harm in redundantly calling close.\u001b[39;00m\n\u001b[1;32m 824\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_fp\u001b[39m.\u001b[39mclose()\n", - "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/urllib3/response.py:799\u001b[0m, in \u001b[0;36mHTTPResponse._fp_read\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 796\u001b[0m \u001b[39mreturn\u001b[39;00m buffer\u001b[39m.\u001b[39mgetvalue()\n\u001b[1;32m 797\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 798\u001b[0m \u001b[39m# StringIO doesn't like amt=None\u001b[39;00m\n\u001b[0;32m--> 799\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_fp\u001b[39m.\u001b[39;49mread(amt) \u001b[39mif\u001b[39;00m amt \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_fp\u001b[39m.\u001b[39mread()\n", - "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python3.11/http/client.py:466\u001b[0m, in \u001b[0;36mHTTPResponse.read\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 463\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlength \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m amt \u001b[39m>\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlength:\n\u001b[1;32m 464\u001b[0m \u001b[39m# clip the read to the \"end of response\"\u001b[39;00m\n\u001b[1;32m 465\u001b[0m amt \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlength\n\u001b[0;32m--> 466\u001b[0m s \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfp\u001b[39m.\u001b[39mread(amt)\n\u001b[1;32m 467\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m s \u001b[39mand\u001b[39;00m amt:\n\u001b[1;32m 468\u001b[0m \u001b[39m# Ideally, we would raise IncompleteRead if the content-length\u001b[39;00m\n\u001b[1;32m 469\u001b[0m \u001b[39m# wasn't satisfied, but it might break compatibility.\u001b[39;00m\n\u001b[1;32m 470\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_close_conn()\n", - "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python3.11/socket.py:706\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 704\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m 705\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 706\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv_into(b)\n\u001b[1;32m 707\u001b[0m \u001b[39mexcept\u001b[39;00m timeout:\n\u001b[1;32m 708\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_timeout_occurred \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n", - "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python3.11/ssl.py:1311\u001b[0m, in \u001b[0;36mSSLSocket.recv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1307\u001b[0m \u001b[39mif\u001b[39;00m flags \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 1308\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 1309\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m\n\u001b[1;32m 1310\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m)\n\u001b[0;32m-> 1311\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mread(nbytes, buffer)\n\u001b[1;32m 1312\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1313\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39mrecv_into(buffer, nbytes, flags)\n", - "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python3.11/ssl.py:1167\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1165\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 1166\u001b[0m \u001b[39mif\u001b[39;00m buffer \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m-> 1167\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sslobj\u001b[39m.\u001b[39;49mread(\u001b[39mlen\u001b[39;49m, buffer)\n\u001b[1;32m 1168\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1169\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sslobj\u001b[39m.\u001b[39mread(\u001b[39mlen\u001b[39m)\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + "100%|██████████| 10000/10000 [00:15<00:00, 664.22it/s]\n", + "100%|██████████| 1000/1000 [00:01<00:00, 877.92it/s]\n", + "100%|██████████| 1000/1000 [00:01<00:00, 754.20it/s]\n" ] } ], @@ -95,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -112,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -135,16 +102,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "69" + "9178" ] }, - "execution_count": 17, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -155,23 +122,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Overlap: 4\n", - "Before: 10000, 1065\n", - "After: 65, 49\n", - "Overlap: 0\n", - "Before: 65, 1023\n", - "After: 65, 25\n", - "Overlap: 4\n", - "Before: 49, 25\n", - "After: 45, 21\n", - "Train: 65, Valid: 45, Test: 21\n" + "Overlap: 72\n", + "Overlap: 24\n", + "Overlap: 65\n", + "Train: 9082, Valid: 702, Test: 568\n" ] } ], @@ -194,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -206,23 +167,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.25, 0.5, 0.75 quantile: [290. 369. 460.]\n", - "Max len: 508\n", - "Min len: 145\n", - "Example: We will now see how the negative entropy \\(h\\) yields\n", - "explicit steps that effectively enforce the simplicial constraints.\n", - "Indeed, after a short classical calculation (see, for instance [1]}),\n", - "it is possible to show that the update (REF ) is equivalent to the\n", - "following one: for all \\(j\\) in \\(\\lbrace 1, \\ldots , d\\rbrace \\) ,\n", - "\\( \\mathbf {z}_j^{k+1} = \\frac{\\mathbf {z}_j^k e^{-\\eta _k \\nabla f(\\mathbf {z}^k)_j}}{\\sum _{l=1}^d \\mathbf {z}_l^k e^{-\\eta _k \\nabla f(\\mathbf {z}^k)_l}},\\) \n", - "\n" + "0.25, 0.5, 0.75 quantile: [264. 358. 439.]\n", + "Max len: 512\n", + "Min len: 128\n", + "Example: Regarding the computational complexity of the algorithm, line REF is\n", + "computed in \\(\\mathcal {O}(|V(G)| + |E(G)|)\\) time [1]}.\n", + "Moreover, the complete bipartite decomposition of \\(G\\) is computed\n", + "in \\(\\mathcal {O}(|V(G)|)\\) time [2]}. To conclude,\n", + "the paths at lines REF , REF , REF and REF \n", + "are computed in constant time, as each \\(K_i\\) is complete bipartite. Therefore,\n", + "the complexity of Algorithm REF is \\(\\mathcal {O}(|V(G)| + |E(G)|)\\) .\n", + "\\(\\Box \\)\n", + "
\n" ] } ], @@ -240,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [