From 865d18874ec5e16d90be5a2a9c3272a0f4ebc405 Mon Sep 17 00:00:00 2001 From: RodionfromHSE Date: Thu, 26 Oct 2023 21:41:48 +0200 Subject: [PATCH] fix download script --- notebooks/data/dataset_download.ipynb | 160 ++++++++++++++++++++++---- 1 file changed, 135 insertions(+), 25 deletions(-) diff --git a/notebooks/data/dataset_download.ipynb b/notebooks/data/dataset_download.ipynb index f0bb5a3..f2243ef 100644 --- a/notebooks/data/dataset_download.ipynb +++ b/notebooks/data/dataset_download.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -22,18 +22,49 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": {}, "outputs": [ { - "ename": "TypeError", - "evalue": "take_n_samples() got an unexpected keyword argument 'split'", + "name": "stderr", + "output_type": "stream", + "text": [ + "52792it [04:47, 39.63it/s] " + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/user010/Desktop/Programming/ML/En2RuTranslator/notebooks/data/dataset_download.ipynb Cell 2\u001b[0m line \u001b[0;36m2\n\u001b[1;32m 16\u001b[0m bar\u001b[39m.\u001b[39mupdate(\u001b[39mlen\u001b[39m(new_samples))\n\u001b[1;32m 18\u001b[0m \u001b[39mreturn\u001b[39;00m samples\n\u001b[0;32m---> 20\u001b[0m train_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39;49mn_train, split\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mtrain\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 21\u001b[0m valid_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39mn_valid, split\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mvalidation\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 22\u001b[0m test_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39mn_test, split\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m'\u001b[39m)\n", - "\u001b[0;31mTypeError\u001b[0m: take_n_samples() got an unexpected keyword argument 'split'" + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/user010/Desktop/Programming/ML/En2RuTranslator/notebooks/data/dataset_download.ipynb Cell 2\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 14\u001b[0m bar\u001b[39m.\u001b[39mupdate(\u001b[39m1\u001b[39m)\n\u001b[1;32m 16\u001b[0m \u001b[39mreturn\u001b[39;00m samples\n\u001b[0;32m---> 18\u001b[0m train_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39;49mn_train, split\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mtrain\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 19\u001b[0m valid_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39mn_valid, split\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mvalidation\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 20\u001b[0m test_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39mn_test, split\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m'\u001b[39m)\n", + "\u001b[1;32m/Users/user010/Desktop/Programming/ML/En2RuTranslator/notebooks/data/dataset_download.ipynb Cell 2\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 9\u001b[0m samples \u001b[39m=\u001b[39m []\n\u001b[1;32m 10\u001b[0m bar \u001b[39m=\u001b[39m tqdm(total\u001b[39m=\u001b[39mn)\n\u001b[0;32m---> 11\u001b[0m \u001b[39mfor\u001b[39;49;00m sample \u001b[39min\u001b[39;49;00m dataset:\n\u001b[1;32m 12\u001b[0m \u001b[39mif\u001b[39;49;00m config\u001b[39m.\u001b[39;49mmin_chars_len \u001b[39m<\u001b[39;49m\u001b[39m=\u001b[39;49m \u001b[39mlen\u001b[39;49m(sample[\u001b[39m'\u001b[39;49m\u001b[39mtext\u001b[39;49m\u001b[39m'\u001b[39;49m]) \u001b[39m<\u001b[39;49m\u001b[39m=\u001b[39;49m config\u001b[39m.\u001b[39;49mmax_chars_len:\n\u001b[1;32m 13\u001b[0m samples\u001b[39m.\u001b[39;49mappend(sample)\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/datasets/iterable_dataset.py:1379\u001b[0m, in \u001b[0;36mIterableDataset.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1376\u001b[0m \u001b[39myield\u001b[39;00m formatter\u001b[39m.\u001b[39mformat_row(pa_table)\n\u001b[1;32m 1377\u001b[0m \u001b[39mreturn\u001b[39;00m\n\u001b[0;32m-> 1379\u001b[0m \u001b[39mfor\u001b[39;49;00m key, example \u001b[39min\u001b[39;49;00m ex_iterable:\n\u001b[1;32m 1380\u001b[0m \u001b[39mif\u001b[39;49;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfeatures:\n\u001b[1;32m 1381\u001b[0m \u001b[39m# `IterableDataset` automatically fills missing columns with None.\u001b[39;49;00m\n\u001b[1;32m 1382\u001b[0m \u001b[39m# This is done with `_apply_feature_types_on_example`.\u001b[39;49;00m\n\u001b[1;32m 1383\u001b[0m example \u001b[39m=\u001b[39;49m _apply_feature_types_on_example(\n\u001b[1;32m 1384\u001b[0m example, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfeatures, token_per_repo_id\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_token_per_repo_id\n\u001b[1;32m 1385\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/datasets/iterable_dataset.py:281\u001b[0m, in \u001b[0;36mArrowExamplesIterable.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__iter__\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m 280\u001b[0m formatter \u001b[39m=\u001b[39m PythonFormatter()\n\u001b[0;32m--> 281\u001b[0m \u001b[39mfor\u001b[39;49;00m key, pa_table \u001b[39min\u001b[39;49;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgenerate_tables_fn(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mkwargs):\n\u001b[1;32m 282\u001b[0m \u001b[39mfor\u001b[39;49;00m pa_subtable \u001b[39min\u001b[39;49;00m pa_table\u001b[39m.\u001b[39;49mto_reader(max_chunksize\u001b[39m=\u001b[39;49mconfig\u001b[39m.\u001b[39;49mARROW_READER_BATCH_SIZE_IN_DATASET_ITER):\n\u001b[1;32m 283\u001b[0m formatted_batch \u001b[39m=\u001b[39;49m formatter\u001b[39m.\u001b[39;49mformat_batch(pa_subtable)\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/datasets/packaged_modules/json/json.py:107\u001b[0m, in \u001b[0;36mJson._generate_tables\u001b[0;34m(self, files)\u001b[0m\n\u001b[1;32m 103\u001b[0m encoding_errors \u001b[39m=\u001b[39m (\n\u001b[1;32m 104\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39mencoding_errors \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39mencoding_errors \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mstrict\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 105\u001b[0m )\n\u001b[1;32m 106\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[0;32m--> 107\u001b[0m batch \u001b[39m=\u001b[39m f\u001b[39m.\u001b[39;49mread(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mconfig\u001b[39m.\u001b[39;49mchunksize)\n\u001b[1;32m 108\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m batch:\n\u001b[1;32m 109\u001b[0m \u001b[39mbreak\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/datasets/download/streaming_download_manager.py:333\u001b[0m, in \u001b[0;36m_add_retries_to_file_obj_read_method..read_with_retries\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[39mfor\u001b[39;00m retry \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39m1\u001b[39m, max_retries \u001b[39m+\u001b[39m \u001b[39m1\u001b[39m):\n\u001b[1;32m 332\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 333\u001b[0m out \u001b[39m=\u001b[39m read(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 334\u001b[0m \u001b[39mbreak\u001b[39;00m\n\u001b[1;32m 335\u001b[0m \u001b[39mexcept\u001b[39;00m (ClientError, \u001b[39mTimeoutError\u001b[39;00m) \u001b[39mas\u001b[39;00m err:\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/fsspec/spec.py:1856\u001b[0m, in \u001b[0;36mAbstractBufferedFile.read\u001b[0;34m(self, length)\u001b[0m\n\u001b[1;32m 1853\u001b[0m \u001b[39mif\u001b[39;00m length \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 1854\u001b[0m \u001b[39m# don't even bother calling fetch\u001b[39;00m\n\u001b[1;32m 1855\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m-> 1856\u001b[0m out \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcache\u001b[39m.\u001b[39;49m_fetch(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mloc, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mloc \u001b[39m+\u001b[39;49m length)\n\u001b[1;32m 1857\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mloc \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39mlen\u001b[39m(out)\n\u001b[1;32m 1858\u001b[0m \u001b[39mreturn\u001b[39;00m out\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/fsspec/caching.py:189\u001b[0m, in \u001b[0;36mReadAheadCache._fetch\u001b[0;34m(self, start, end)\u001b[0m\n\u001b[1;32m 187\u001b[0m part \u001b[39m=\u001b[39m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 188\u001b[0m end \u001b[39m=\u001b[39m \u001b[39mmin\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msize, end \u001b[39m+\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mblocksize)\n\u001b[0;32m--> 189\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcache \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfetcher(start, end) \u001b[39m# new block replaces old\u001b[39;00m\n\u001b[1;32m 190\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstart \u001b[39m=\u001b[39m start\n\u001b[1;32m 191\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mend \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstart \u001b[39m+\u001b[39m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcache)\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py:422\u001b[0m, in \u001b[0;36mHfFileSystemFile._fetch_range\u001b[0;34m(self, start, end)\u001b[0m\n\u001b[1;32m 415\u001b[0m headers \u001b[39m=\u001b[39m {\n\u001b[1;32m 416\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mrange\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mbytes=\u001b[39m\u001b[39m{\u001b[39;00mstart\u001b[39m}\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m{\u001b[39;00mend\u001b[39m \u001b[39m\u001b[39m-\u001b[39m\u001b[39m \u001b[39m\u001b[39m1\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 417\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfs\u001b[39m.\u001b[39m_api\u001b[39m.\u001b[39m_build_hf_headers(),\n\u001b[1;32m 418\u001b[0m }\n\u001b[1;32m 419\u001b[0m url \u001b[39m=\u001b[39m (\n\u001b[1;32m 420\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfs\u001b[39m.\u001b[39mendpoint\u001b[39m}\u001b[39;00m\u001b[39m/\u001b[39m\u001b[39m{\u001b[39;00mREPO_TYPES_URL_PREFIXES\u001b[39m.\u001b[39mget(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolved_path\u001b[39m.\u001b[39mrepo_type,\u001b[39m \u001b[39m\u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)\u001b[39m \u001b[39m\u001b[39m+\u001b[39m\u001b[39m \u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolved_path\u001b[39m.\u001b[39mrepo_id\u001b[39m}\u001b[39;00m\u001b[39m/resolve/\u001b[39m\u001b[39m{\u001b[39;00msafe_quote(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolved_path\u001b[39m.\u001b[39mrevision)\u001b[39m}\u001b[39;00m\u001b[39m/\u001b[39m\u001b[39m{\u001b[39;00msafe_quote(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolved_path\u001b[39m.\u001b[39mpath_in_repo)\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m 421\u001b[0m )\n\u001b[0;32m--> 422\u001b[0m r \u001b[39m=\u001b[39m http_backoff(\u001b[39m\"\u001b[39;49m\u001b[39mGET\u001b[39;49m\u001b[39m\"\u001b[39;49m, url, headers\u001b[39m=\u001b[39;49mheaders)\n\u001b[1;32m 423\u001b[0m hf_raise_for_status(r)\n\u001b[1;32m 424\u001b[0m \u001b[39mreturn\u001b[39;00m r\u001b[39m.\u001b[39mcontent\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/huggingface_hub/utils/_http.py:258\u001b[0m, in \u001b[0;36mhttp_backoff\u001b[0;34m(method, url, max_retries, base_wait_time, max_wait_time, retry_on_exceptions, retry_on_status_codes, **kwargs)\u001b[0m\n\u001b[1;32m 255\u001b[0m kwargs[\u001b[39m\"\u001b[39m\u001b[39mdata\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mseek(io_obj_initial_pos)\n\u001b[1;32m 257\u001b[0m \u001b[39m# Perform request and return if status_code is not in the retry list.\u001b[39;00m\n\u001b[0;32m--> 258\u001b[0m response \u001b[39m=\u001b[39m session\u001b[39m.\u001b[39;49mrequest(method\u001b[39m=\u001b[39;49mmethod, url\u001b[39m=\u001b[39;49murl, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 259\u001b[0m \u001b[39mif\u001b[39;00m response\u001b[39m.\u001b[39mstatus_code \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m retry_on_status_codes:\n\u001b[1;32m 260\u001b[0m \u001b[39mreturn\u001b[39;00m response\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 584\u001b[0m send_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 585\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtimeout\u001b[39m\u001b[39m\"\u001b[39m: timeout,\n\u001b[1;32m 586\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mallow_redirects\u001b[39m\u001b[39m\"\u001b[39m: allow_redirects,\n\u001b[1;32m 587\u001b[0m }\n\u001b[1;32m 588\u001b[0m send_kwargs\u001b[39m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msend(prep, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49msend_kwargs)\n\u001b[1;32m 591\u001b[0m \u001b[39mreturn\u001b[39;00m resp\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/sessions.py:725\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 722\u001b[0m \u001b[39mif\u001b[39;00m allow_redirects:\n\u001b[1;32m 723\u001b[0m \u001b[39m# Redirect resolving generator.\u001b[39;00m\n\u001b[1;32m 724\u001b[0m gen \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolve_redirects(r, request, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m--> 725\u001b[0m history \u001b[39m=\u001b[39m [resp \u001b[39mfor\u001b[39;49;00m resp \u001b[39min\u001b[39;49;00m gen]\n\u001b[1;32m 726\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 727\u001b[0m history \u001b[39m=\u001b[39m []\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/sessions.py:725\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 722\u001b[0m \u001b[39mif\u001b[39;00m allow_redirects:\n\u001b[1;32m 723\u001b[0m \u001b[39m# Redirect resolving generator.\u001b[39;00m\n\u001b[1;32m 724\u001b[0m gen \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresolve_redirects(r, request, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m--> 725\u001b[0m history \u001b[39m=\u001b[39m [resp \u001b[39mfor\u001b[39;49;00m resp \u001b[39min\u001b[39;49;00m gen]\n\u001b[1;32m 726\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 727\u001b[0m history \u001b[39m=\u001b[39m []\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/sessions.py:266\u001b[0m, in \u001b[0;36mSessionRedirectMixin.resolve_redirects\u001b[0;34m(self, resp, req, stream, timeout, verify, cert, proxies, yield_requests, **adapter_kwargs)\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[39myield\u001b[39;00m req\n\u001b[1;32m 264\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 266\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msend(\n\u001b[1;32m 267\u001b[0m req,\n\u001b[1;32m 268\u001b[0m stream\u001b[39m=\u001b[39;49mstream,\n\u001b[1;32m 269\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout,\n\u001b[1;32m 270\u001b[0m verify\u001b[39m=\u001b[39;49mverify,\n\u001b[1;32m 271\u001b[0m cert\u001b[39m=\u001b[39;49mcert,\n\u001b[1;32m 272\u001b[0m proxies\u001b[39m=\u001b[39;49mproxies,\n\u001b[1;32m 273\u001b[0m allow_redirects\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m 274\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49madapter_kwargs,\n\u001b[1;32m 275\u001b[0m )\n\u001b[1;32m 277\u001b[0m extract_cookies_to_jar(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcookies, prepared_request, resp\u001b[39m.\u001b[39mraw)\n\u001b[1;32m 279\u001b[0m \u001b[39m# extract redirect url, if any, for the next loop\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/sessions.py:747\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 744\u001b[0m \u001b[39mpass\u001b[39;00m\n\u001b[1;32m 746\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m stream:\n\u001b[0;32m--> 747\u001b[0m r\u001b[39m.\u001b[39;49mcontent\n\u001b[1;32m 749\u001b[0m \u001b[39mreturn\u001b[39;00m r\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/models.py:899\u001b[0m, in \u001b[0;36mResponse.content\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 897\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_content \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 898\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 899\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_content \u001b[39m=\u001b[39m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mjoin(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39miter_content(CONTENT_CHUNK_SIZE)) \u001b[39mor\u001b[39;00m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 901\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_content_consumed \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 902\u001b[0m \u001b[39m# don't need to release the connection; that's been handled by urllib3\u001b[39;00m\n\u001b[1;32m 903\u001b[0m \u001b[39m# since we exhausted the data.\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/requests/models.py:816\u001b[0m, in \u001b[0;36mResponse.iter_content..generate\u001b[0;34m()\u001b[0m\n\u001b[1;32m 814\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mraw, \u001b[39m\"\u001b[39m\u001b[39mstream\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m 815\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 816\u001b[0m \u001b[39myield from\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mraw\u001b[39m.\u001b[39mstream(chunk_size, decode_content\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m 817\u001b[0m \u001b[39mexcept\u001b[39;00m ProtocolError \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 818\u001b[0m \u001b[39mraise\u001b[39;00m ChunkedEncodingError(e)\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/urllib3/response.py:936\u001b[0m, in \u001b[0;36mHTTPResponse.stream\u001b[0;34m(self, amt, decode_content)\u001b[0m\n\u001b[1;32m 934\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 935\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mnot\u001b[39;00m is_fp_closed(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_fp) \u001b[39mor\u001b[39;00m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_decoded_buffer) \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m--> 936\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mread(amt\u001b[39m=\u001b[39;49mamt, decode_content\u001b[39m=\u001b[39;49mdecode_content)\n\u001b[1;32m 938\u001b[0m \u001b[39mif\u001b[39;00m data:\n\u001b[1;32m 939\u001b[0m \u001b[39myield\u001b[39;00m data\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/urllib3/response.py:879\u001b[0m, in \u001b[0;36mHTTPResponse.read\u001b[0;34m(self, amt, decode_content, cache_content)\u001b[0m\n\u001b[1;32m 876\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_decoded_buffer) \u001b[39m>\u001b[39m\u001b[39m=\u001b[39m amt:\n\u001b[1;32m 877\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_decoded_buffer\u001b[39m.\u001b[39mget(amt)\n\u001b[0;32m--> 879\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_raw_read(amt)\n\u001b[1;32m 881\u001b[0m flush_decoder \u001b[39m=\u001b[39m amt \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mor\u001b[39;00m (amt \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m data)\n\u001b[1;32m 883\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m data \u001b[39mand\u001b[39;00m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_decoded_buffer) \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/urllib3/response.py:814\u001b[0m, in \u001b[0;36mHTTPResponse._raw_read\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 811\u001b[0m fp_closed \u001b[39m=\u001b[39m \u001b[39mgetattr\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_fp, \u001b[39m\"\u001b[39m\u001b[39mclosed\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 813\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_error_catcher():\n\u001b[0;32m--> 814\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_fp_read(amt) \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m fp_closed \u001b[39melse\u001b[39;00m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 815\u001b[0m \u001b[39mif\u001b[39;00m amt \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m amt \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m data:\n\u001b[1;32m 816\u001b[0m \u001b[39m# Platform-specific: Buggy versions of Python.\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[39m# Close the connection when no data is returned\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 822\u001b[0m \u001b[39m# not properly close the connection in all cases. There is\u001b[39;00m\n\u001b[1;32m 823\u001b[0m \u001b[39m# no harm in redundantly calling close.\u001b[39;00m\n\u001b[1;32m 824\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_fp\u001b[39m.\u001b[39mclose()\n", + "File \u001b[0;32m~/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/urllib3/response.py:799\u001b[0m, in \u001b[0;36mHTTPResponse._fp_read\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 796\u001b[0m \u001b[39mreturn\u001b[39;00m buffer\u001b[39m.\u001b[39mgetvalue()\n\u001b[1;32m 797\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 798\u001b[0m \u001b[39m# StringIO doesn't like amt=None\u001b[39;00m\n\u001b[0;32m--> 799\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_fp\u001b[39m.\u001b[39;49mread(amt) \u001b[39mif\u001b[39;00m amt \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_fp\u001b[39m.\u001b[39mread()\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python3.11/http/client.py:466\u001b[0m, in \u001b[0;36mHTTPResponse.read\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 463\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlength \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m amt \u001b[39m>\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlength:\n\u001b[1;32m 464\u001b[0m \u001b[39m# clip the read to the \"end of response\"\u001b[39;00m\n\u001b[1;32m 465\u001b[0m amt \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlength\n\u001b[0;32m--> 466\u001b[0m s \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfp\u001b[39m.\u001b[39mread(amt)\n\u001b[1;32m 467\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m s \u001b[39mand\u001b[39;00m amt:\n\u001b[1;32m 468\u001b[0m \u001b[39m# Ideally, we would raise IncompleteRead if the content-length\u001b[39;00m\n\u001b[1;32m 469\u001b[0m \u001b[39m# wasn't satisfied, but it might break compatibility.\u001b[39;00m\n\u001b[1;32m 470\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_close_conn()\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python3.11/socket.py:706\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 704\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m 705\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 706\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv_into(b)\n\u001b[1;32m 707\u001b[0m \u001b[39mexcept\u001b[39;00m timeout:\n\u001b[1;32m 708\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_timeout_occurred \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python3.11/ssl.py:1311\u001b[0m, in \u001b[0;36mSSLSocket.recv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1307\u001b[0m \u001b[39mif\u001b[39;00m flags \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 1308\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 1309\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m\n\u001b[1;32m 1310\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m)\n\u001b[0;32m-> 1311\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mread(nbytes, buffer)\n\u001b[1;32m 1312\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1313\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39mrecv_into(buffer, nbytes, flags)\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python3.11/ssl.py:1167\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1165\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 1166\u001b[0m \u001b[39mif\u001b[39;00m buffer \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m-> 1167\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sslobj\u001b[39m.\u001b[39;49mread(\u001b[39mlen\u001b[39;49m, buffer)\n\u001b[1;32m 1168\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1169\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sslobj\u001b[39m.\u001b[39mread(\u001b[39mlen\u001b[39m)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], @@ -48,11 +79,12 @@ " dataset = load_dataset(config.dataset, split=split, streaming=True)\n", " samples = []\n", " bar = tqdm(total=n)\n", - " while len(samples) < n:\n", - " new_samples = dataset.take(batch_size)\n", - " new_samples = list(filter(lambda x: config.min_chars_len <= len(x['text']) <= config.max_chars_len, new_samples))\n", - " samples.extend(new_samples)\n", - " bar.update(len(new_samples))\n", + " for sample in dataset:\n", + " if config.min_chars_len <= len(sample['text']) <= config.max_chars_len:\n", + " samples.append(sample)\n", + " bar.update(1)\n", + " if len(samples) == n:\n", + " break\n", "\n", " return samples\n", "\n", @@ -65,16 +97,62 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train samples: 10000\n" + ] + } + ], + "source": [ + "print(f'Train samples: {len(train_samples)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train texts: 10000\n" + ] + } + ], "source": [ "def extract_texts(samples):\n", " return [sample['text'] for sample in samples]\n", "\n", "train_texts = extract_texts(train_samples)\n", + "print(f'Train texts: {len(train_texts)}')\n", "valid_texts = extract_texts(valid_samples)\n", "test_texts = extract_texts(test_samples)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "69" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(set(train_texts))" + ] + }, { "cell_type": "code", "execution_count": null, @@ -84,18 +162,46 @@ "name": "stdout", "output_type": "stream", "text": [ - "There is a strand of literature on continuous-action games on networks in which each player takes an action represented by a real value \\(x\\ge 0\\)  [1]}, [2]}. Typically, player \\(i\\) maximizes the following quadratic utility function\n", - "\\(u_i(x_i;{\\bf {x}}_{-i}) = \\alpha x_i - \\frac{1}{2}x_i^2 +\\gamma \\sum _{j\\ne i} \\mathcal {A}_{ij}x_ix_j,\\) \n", - "\n", - "There is a strand of literature on continuous-action games on networks in which each player takes an action represented by a real value \\(x\\ge 0\\)  [1]}, [2]}. Typically, player \\(i\\) maximizes the following quadratic utility function\n", - "\\(u_i(x_i;{\\bf {x}}_{-i}) = \\alpha x_i - \\frac{1}{2}x_i^2 +\\gamma \\sum _{j\\ne i} \\mathcal {A}_{ij}x_ix_j,\\) \n", - "\n" + "Overlap: 4\n", + "Before: 10000, 1065\n", + "After: 65, 49\n", + "Overlap: 0\n", + "Before: 65, 1023\n", + "After: 65, 25\n", + "Overlap: 4\n", + "Before: 49, 25\n", + "After: 45, 21\n", + "Train: 65, Valid: 45, Test: 21\n" ] } ], "source": [ - "print(valid_texts[0])\n", - "print(test_texts[0])" + "def remove_overlap(texts_a, texts_b):\n", + " overlap = list(set(texts_a) & set(texts_b))\n", + " print(f'Overlap: {len(overlap)}')\n", + "\n", + " # remove\n", + " texts_a = list(set(texts_a) - set(overlap))\n", + " texts_b = list(set(texts_b) - set(overlap))\n", + " return texts_a, texts_b\n", + "\n", + "train_texts, valid_texts = remove_overlap(train_texts, valid_texts)\n", + "train_texts, test_texts = remove_overlap(train_texts, test_texts)\n", + "valid_texts, test_texts = remove_overlap(valid_texts, test_texts)\n", + "\n", + "print(f\"Train: {len(train_texts)}, Valid: {len(valid_texts)}, Test: {len(test_texts)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# assert that texts don't overlap\n", + "assert len(set(train_texts) & set(valid_texts)) == 0\n", + "assert len(set(train_texts) & set(test_texts)) == 0\n", + "assert len(set(valid_texts) & set(test_texts)) == 0" ] }, { @@ -107,12 +213,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.25, 0.5, 0.75 quantile: [289. 382. 458.25]\n", + "0.25, 0.5, 0.75 quantile: [290. 369. 460.]\n", "Max len: 508\n", "Min len: 145\n", - "Example: Theorem B (Equivalent version of Beurling's Theorem, [1]}). \n", - "A closed subspace of \\(H^{2}\\) is shift-invariant iff it is invariant under multiplication by every bounded analytic function in \\(H^{\\infty }\\) .\n", - "\n", + "Example: We will now see how the negative entropy \\(h\\) yields\n", + "explicit steps that effectively enforce the simplicial constraints.\n", + "Indeed, after a short classical calculation (see, for instance [1]}),\n", + "it is possible to show that the update (REF ) is equivalent to the\n", + "following one: for all \\(j\\) in \\(\\lbrace 1, \\ldots , d\\rbrace \\) ,\n", + "\\( \\mathbf {z}_j^{k+1} = \\frac{\\mathbf {z}_j^k e^{-\\eta _k \\nabla f(\\mathbf {z}^k)_j}}{\\sum _{l=1}^d \\mathbf {z}_l^k e^{-\\eta _k \\nabla f(\\mathbf {z}^k)_l}},\\) \n", "\n" ] } @@ -138,6 +247,7 @@ "import os\n", "import json\n", "\n", + "os.makedirs(config.save_dir, exist_ok=True)\n", "for texts, split_name in [\n", " (train_texts, 'train'),\n", " (valid_texts, 'valid'),\n",