From 58813d13378223da55ee859190bd5e9a72fdb858 Mon Sep 17 00:00:00 2001
From: RodionfromHSE <rodionkhvorostov@gmail.com>
Date: Thu, 26 Oct 2023 14:13:22 +0200
Subject: [PATCH] Raw version of data generation

---
 notebooks/data/dataset_download.ipynb   | 173 ++++++++++++++++++++++++
 notebooks/data/dataset_generation.ipynb | 136 +++++++++++++++++++
 2 files changed, 309 insertions(+)
 create mode 100644 notebooks/data/dataset_download.ipynb
 create mode 100644 notebooks/data/dataset_generation.ipynb
diff --git a/notebooks/data/dataset_download.ipynb b/notebooks/data/dataset_download.ipynb
new file mode 100644
index 0000000..f0bb5a3
--- /dev/null
+++ b/notebooks/data/dataset_download.ipynb
@@ -0,0 +1,173 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from omegaconf import OmegaConf\n",
+    "\n",
+    "# Download settings: source dataset, sample count per split, text-length bounds, output directory.\n",
+    "config = OmegaConf.create({\n",
+    "    'dataset': 'saier/unarxive_citrec',\n",
+    "    'n_train': 10_000,\n",
+    "    'n_valid': 1_000,\n",
+    "    'n_test': 1_000,\n",
+    "    'max_chars_len': 512,\n",
+    "    'min_chars_len': 128,\n",
+    "    'save_dir': '../../data/raw/unarxive_citrec/'\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "TypeError",
+     "evalue": "take_n_samples() got an unexpected keyword argument 'split'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[1;32m/Users/user010/Desktop/Programming/ML/En2RuTranslator/notebooks/data/dataset_download.ipynb Cell 2\u001b[0m line \u001b[0;36m2\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/user010/Desktop/Programming/ML/En2RuTranslator/notebooks/data/dataset_download.ipynb#W0sZmlsZQ%3D%3D?line=15'>16</a>\u001b[0m         bar\u001b[39m.\u001b[39mupdate(\u001b[39mlen\u001b[39m(new_samples))\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/user010/Desktop/Programming/ML/En2RuTranslator/notebooks/data/dataset_download.ipynb#W0sZmlsZQ%3D%3D?line=17'>18</a>\u001b[0m     \u001b[39mreturn\u001b[39;00m samples\n\u001b[0;32m---> <a href='vscode-notebook-cell:/Users/user010/Desktop/Programming/ML/En2RuTranslator/notebooks/data/dataset_download.ipynb#W0sZmlsZQ%3D%3D?line=19'>20</a>\u001b[0m train_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39;49mn_train, split\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mtrain\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/user010/Desktop/Programming/ML/En2RuTranslator/notebooks/data/dataset_download.ipynb#W0sZmlsZQ%3D%3D?line=20'>21</a>\u001b[0m valid_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39mn_valid, split\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mvalidation\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/user010/Desktop/Programming/ML/En2RuTranslator/notebooks/data/dataset_download.ipynb#W0sZmlsZQ%3D%3D?line=21'>22</a>\u001b[0m test_samples \u001b[39m=\u001b[39m take_n_samples(config\u001b[39m.\u001b[39mn_test, split\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m'\u001b[39m)\n",
+      "\u001b[0;31mTypeError\u001b[0m: take_n_samples() got an unexpected keyword argument 'split'"
+     ]
+    }
+   ],
+   "source": [
+    "from datasets import load_dataset\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "# Load the dataset in streaming mode\n",
+    "dataset = load_dataset(config.dataset, split='train', streaming=True)\n",
+    "\n",
+    "def take_n_samples(n: int, split: str, batch_size: int = 250) -> list:\n",
+    "    \"\"\"Return the first `n` samples of `split` whose text length is within the configured bounds.\n",
+    "\n",
+    "    The stream is read once, so fewer than `n` come back if it runs out; `batch_size` is unused (kept for compatibility).\n",
+    "    \"\"\"\n",
+    "    from itertools import islice\n",
+    "    dataset = load_dataset(config.dataset, split=split, streaming=True)\n",
+    "    def fits(x):\n",
+    "        return config.min_chars_len <= len(x['text']) <= config.max_chars_len\n",
+    "    # A looped dataset.take(batch_size) re-reads the same leading rows each time, so filter the stream itself.\n",
+    "    return list(tqdm(islice(filter(fits, dataset), n), total=n))\n",
+    "\n",
+    "train_samples = take_n_samples(config.n_train, split='train')\n",
+    "valid_samples = take_n_samples(config.n_valid, split='validation')\n",
+    "test_samples = take_n_samples(config.n_test, split='test')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_texts(samples):\n",
+    "    \"\"\"Collect the 'text' field of every sample, preserving order.\"\"\"\n",
+    "    return [record['text'] for record in samples]\n",
+    "train_texts = extract_texts(train_samples)\n",
+    "valid_texts = extract_texts(valid_samples)\n",
+    "test_texts = extract_texts(test_samples)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "There is a strand of literature on continuous-action games on networks in which each player takes an action represented by a real value \\(x\\ge 0\\)  [1]}, [2]}. Typically, player \\(i\\)  maximizes the following quadratic utility function\n",
+      "\\(u_i(x_i;{\\bf {x}}_{-i}) = \\alpha x_i - \\frac{1}{2}x_i^2 +\\gamma \\sum _{j\\ne i} \\mathcal {A}_{ij}x_ix_j,\\) \n",
+      "\n",
+      "There is a strand of literature on continuous-action games on networks in which each player takes an action represented by a real value \\(x\\ge 0\\)  [1]}, [2]}. Typically, player \\(i\\)  maximizes the following quadratic utility function\n",
+      "\\(u_i(x_i;{\\bf {x}}_{-i}) = \\alpha x_i - \\frac{1}{2}x_i^2 +\\gamma \\sum _{j\\ne i} \\mathcal {A}_{ij}x_ix_j,\\) \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(valid_texts[0])\n",
+    "print(test_texts[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.25, 0.5, 0.75 quantile: [289.   382.   458.25]\n",
+      "Max len: 508\n",
+      "Min len: 145\n",
+      "Example: Theorem B (Equivalent version of Beurling's Theorem, [1]}). \n",
+      "A closed subspace of \\(H^{2}\\)  is shift-invariant iff it is invariant under multiplication by every bounded analytic function in \\(H^{\\infty }\\) .\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "train_lens = np.array(list(map(len, train_texts)))\n",
+    "# Summarise training text lengths (in characters): quartiles, extremes, and one random example.\n",
+    "quartiles = np.quantile(train_lens, [0.25, 0.5, 0.75])\n",
+    "print(\"0.25, 0.5, 0.75 quantile:\", quartiles)\n",
+    "print(\"Max len:\", train_lens.max())\n",
+    "print(\"Min len:\", train_lens.min())\n",
+    "print(\"Example:\", train_texts[np.random.randint(len(train_texts))])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "\n",
+    "# Create the output directory first so open() does not fail when it is missing.\n",
+    "os.makedirs(config.save_dir, exist_ok=True)\n",
+    "splits = {'train': train_texts, 'valid': valid_texts, 'test': test_texts}\n",
+    "for split_name, texts in splits.items():\n",
+    "    path = os.path.join(config.save_dir, split_name + '.json')\n",
+    "    # ensure_ascii=False emits raw non-ASCII characters, so pin UTF-8 instead of the locale default.\n",
+    "    with open(path, 'w', encoding='utf-8') as f:\n",
+    "        json.dump(texts, f, indent=4, ensure_ascii=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/data/dataset_generation.ipynb b/notebooks/data/dataset_generation.ipynb
new file mode 100644
index 0000000..5b1124b
--- /dev/null
+++ b/notebooks/data/dataset_generation.ipynb
@@ -0,0 +1,136 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(False, False)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "PROMPT = \"\"\"\\\n",
+    "Ты профессиональный тестировщик больших языковых моделей.\n",
+    "Сейчас твоя задача составить запросы, которые требуют от модели **сгенерировать изображение** (картину или фото).\n",
+    "Эти запросы должны использовать **как явные инструкции, так и намёки**. Запросы должны быть **разнообразными** и иметь **разный уровень формальности**.\n",
+    "\n",
+    "Сгенирируй мне 10 таких запросов.\n",
+    "\n",
+    "Примеры:\n",
+    "Нарисуй, пожалуйста, фотоаппарат марки «Зенит» с красивым плетёным ремешком.\n",
+    "а можешь плиз нарисовать как мальчик и девочка на пляже строят замок из песка?\n",
+    "Изобрази мне кота Матроскина, который играет на гитаре.\n",
+    "фото как спичка горит, а кругом тают кубики льда\n",
+    "сделай мне иллюстрацию к маленькому принцу где он с розой разговаривает\n",
+    "Сделаешь картинку площади трех вокзалов в Москве?\n",
+    "хочу картинку с аниме девочкой\n",
+    "покажи мне портрет Иосифа Сталина\n",
+    "\n",
+    "Твои запросы:\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip3 install openai python-dotenv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dotenv import load_dotenv\n",
+    "import openai\n",
+    "import time\n",
+    "import numpy as np\n",
+    "import os\n",
+    "path_to_env = os.path.join('..', '.env')\n",
+    "load_dotenv()\n",
+    "\n",
+    "\n",
+    "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
+    "\n",
+    "class QuestionGenerator:\n",
+    "  \"\"\"Send a fixed prompt to the OpenAI completion endpoint and parse the reply.\"\"\"\n",
+    "  def __init__(self, query: str, max_queries: int = 3):\n",
+    "    self.query = query\n",
+    "    self.max_queries = max_queries  # maximum number of attempts made by send_query\n",
+    "\n",
+    "  def send_query(self):\n",
+    "    \"\"\"Return the first successful completion response, or None if every attempt raised.\"\"\"\n",
+    "    for _ in range(self.max_queries):\n",
+    "      try:\n",
+    "        response = openai.Completion.create(\n",
+    "          model=\"text-babbage-001\",\n",
+    "          prompt=self.query,\n",
+    "          temperature=0.7,\n",
+    "          max_tokens=100,\n",
+    "          top_p=0.6,\n",
+    "          frequency_penalty=0.5,\n",
+    "          presence_penalty=0.0\n",
+    "        )\n",
+    "      except Exception as e:\n",
+    "        print('Error', e)\n",
+    "        continue\n",
+    "      time.sleep(np.random.randint(1, 5))  # pause 1-4 s after a successful call\n",
+    "      return response\n",
+    "    return None\n",
+    "\n",
+    "  def parse_response(self, response):\n",
+    "    \"\"\"Split the first choice's stripped, lower-cased text on ', '; a None response gives [].\"\"\"\n",
+    "    if response is None:\n",
+    "      return []\n",
+    "    return response['choices'][0]['text'].strip().lower().split(', ')\n",
+    "\n",
+    "  def __call__(self):\n",
+    "    \"\"\"Send the query and return the parsed samples.\"\"\"\n",
+    "    return self.parse_response(self.send_query())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qg = QuestionGenerator(PROMPT)\n",
+    "qg()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}