from omegaconf import OmegaConf

# Settings for the download notebook: the dataset identifier, how many
# samples to keep per split, the accepted text length in characters, and the
# directory the resulting JSON files are written to.
config = OmegaConf.create(
    dict(
        dataset='saier/unarxive_citrec',
        n_train=10_000,
        n_valid=1_000,
        n_test=1_000,
        max_chars_len=512,
        min_chars_len=128,
        save_dir='../../data/raw/unarxive_citrec/',
    )
)
def take_n_samples(n: int, split: str, batch_size: int = 250) -> list:
    """Collect up to ``n`` length-filtered samples from one split of the dataset.

    The split is opened in streaming mode and consumed in a single pass,
    ``batch_size`` rows at a time. A row is kept when the length of its
    ``'text'`` field lies within
    ``[config.min_chars_len, config.max_chars_len]``.

    Args:
        n: Maximum number of samples to return.
        split: Name of the dataset split to read (e.g. ``'train'``).
        batch_size: Number of rows pulled from the stream per iteration.

    Returns:
        A list of at most ``n`` samples. It is shorter than ``n`` only when
        the split runs out of rows before enough of them pass the filter.
    """
    from itertools import islice

    dataset = load_dataset(config.dataset, split=split, streaming=True)
    # Keep ONE iterator for the whole loop. Calling ``dataset.take(k)`` again
    # on every pass restarts from the first row, which re-adds the same rows
    # and never terminates when none of them pass the filter.
    stream = iter(dataset)
    samples = []
    with tqdm(total=n) as bar:
        while len(samples) < n:
            batch = list(islice(stream, batch_size))
            if not batch:
                # Stream exhausted: return what was found instead of spinning.
                break
            kept = [
                sample
                for sample in batch
                if config.min_chars_len <= len(sample['text']) <= config.max_chars_len
            ]
            # Never return more than the requested number of samples.
            kept = kept[:n - len(samples)]
            samples.extend(kept)
            bar.update(len(kept))

    return samples


def extract_texts(samples):
    """Return the ``'text'`` field of every sample, preserving order."""
    return [sample['text'] for sample in samples]
import json
import os

import numpy as np

# Quick look at the first validation and test texts.
print(valid_texts[0])
print(test_texts[0])

# Character-length statistics of the training texts.
train_lens = np.array([len(text) for text in train_texts])

# 0.25 quantile, 0.5 quantile, 0.75 quantile
print("0.25, 0.5, 0.75 quantile:", np.quantile(train_lens, [0.25, 0.5, 0.75]))
print("Max len:", np.max(train_lens))
print("Min len:", np.min(train_lens))
print("Example:", train_texts[np.random.randint(0, len(train_texts))])

# Make sure the target directory exists; open() does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(config.save_dir, exist_ok=True)

# Write each split to <save_dir>/<split_name>.json as a JSON list of strings.
for texts, split_name in [
    (train_texts, 'train'),
    (valid_texts, 'valid'),
    (test_texts, 'test'),
]:
    path = os.path.join(config.save_dir, split_name + '.json')
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is fixed to UTF-8 instead of relying on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(texts, f, indent=4, ensure_ascii=False)
class QuestionGenerator:
    """Send a fixed prompt to the OpenAI completion endpoint and parse the reply.

    Calling an instance sends the prompt (retrying on errors) and returns the
    parsed reply as a list of strings.
    """

    def __init__(self, query: str, max_queries: int = 3):
        """Store the prompt and the retry budget.

        Args:
            query: Prompt text sent to the model.
            max_queries: Maximum number of attempts made by ``send_query``.
        """
        self.query = query
        self.max_queries = max_queries

    def send_query(self):
        """Request a completion for ``self.query``.

        Makes up to ``self.max_queries`` attempts. Any exception raised by an
        attempt is printed and the next attempt is made.

        Returns:
            The response of the first successful attempt, or ``None`` when
            every attempt failed.
        """
        # NOTE(review): `openai.Completion` looks like the pre-1.0 client
        # interface -- confirm the pinned `openai` version.
        response = None
        for _ in range(self.max_queries):
            try:
                response = openai.Completion.create(
                    model="text-babbage-001",
                    prompt=self.query,
                    temperature=0.7,
                    max_tokens=100,
                    top_p=0.6,
                    frequency_penalty=0.5,
                    presence_penalty=0.0,
                )
                # Pause for a random 1-4 seconds after a successful request.
                time.sleep(np.random.randint(1, 5))
                break
            except Exception as e:
                # Best-effort retry: report the failure and try again.
                print('Error', e)

        return response

    def parse_response(self, response):
        """Split the first choice's text into lower-cased items.

        Args:
            response: Value returned by ``send_query``; may be ``None``.

        Returns:
            ``[]`` when ``response`` is ``None``; otherwise the stripped,
            lower-cased text of the first choice split on ``', '``.
        """
        if response is None:
            return []
        return response['choices'][0]['text'].strip().lower().split(', ')

    def __call__(self):
        """Send the prompt and return the parsed items."""
        response = self.send_query()
        # The original called `self.get_topics`, which is not defined on this
        # class and raised AttributeError; `parse_response` is the parser.
        samples = self.parse_response(response)
        return samples