diff --git a/README.md b/README.md index 87450fd..d561199 100644 --- a/README.md +++ b/README.md @@ -55,4 +55,61 @@ Project Organization -------- +## Installation + +```bash +pip3 install -r requirements.txt +``` + +## Dataset + +#### Training dataset +The training dataset is based on `saier/unarxive_citrec` [hf](https://huggingface.co/datasets/saier/unarxive_citrec). + +*Details*: + +Train size: 9082 +Valid size: 702 +Test size: 568 + +All the samples have length from `128` to `512` characters (TO-DO: characters -> tokens) +More in `notebooks/data/dataset_download.ipynb` + +After collecting the dataset, we carefully translated the samples from English to Russian using the OpenAI API. Details in `notebooks/data/dataset_translate.ipynb` + +#### Dataset for model comparison (EvalDataset) +This dataset is based on `turkic_xwmt`, `subset=ru-en`, `split=test` [hf](https://huggingface.co/datasets/turkic_xwmt). + +Dataset size: 1000 + +## Models comparison + +Models comparison is based on the BLEU score of the translated samples against reference translations by OpenAI. + +*Models*: +transformer-en-ru: `Helsinki-NLP/opus-mt-en-ru` [hf](https://huggingface.co/Helsinki-NLP/opus-mt-en-ru) +nllb-1.3B-distilled: `facebook/nllb-200-distilled-1.3B` [hf](https://huggingface.co/facebook/nllb-200-distilled-1.3B) + +**Results**: +transformer-en-ru BLEU: 2.58 +nllb-1.3B-distilled BLEU: 2.55 + +Even though the results aren't statistically significant, the transformer-en-ru model was chosen since it's faster and has a smaller size. +Details in `src/finetune/eval_bleu.py` + +## Model finetuning + +Simple seq2seq finetuning of the transformer-en-ru model. +Details in `notebooks/finetune/finetune.ipynb`. +Model on [hf](https://huggingface.co/under-tree/transformer-en-ru) + +**Fine-tuned model results:** +eval_loss: 0.656 +eval_bleu: 67.197 (suspiciously high) + + + + + +

Project based on the cookiecutter data science project template. #cookiecutterdatascience

diff --git a/notebooks/data/dataset_generation.ipynb b/notebooks/data/dataset_generation.ipynb deleted file mode 100644 index 5b1124b..0000000 --- a/notebooks/data/dataset_generation.ipynb +++ /dev/null @@ -1,136 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(False, False)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "PROMPT = \"\"\"\\\n", - "Ты профессиональный тестировщик больших языковых моделей.\n", - "Сейчас твоя задача составить запросы, которые требуют от модели **сгенерировать изображение** (картину или фото).\n", - "Эти запросы должны использовать **как явные инструкции, так и намёки**. Запросы должны быть **разнообразными** и иметь **разный уровень формальности**.\n", - "\n", - "Сгенирируй мне 10 таких запросов.\n", - "\n", - "Примеры:\n", - "Нарисуй, пожалуйста, фотоаппарат марки «Зенит» с красивым плетёным ремешком.\n", - "а можешь плиз нарисовать как мальчик и девочка на пляже строят замок из песка?\n", - "Изобрази мне кота Матроскина, который играет на гитаре.\n", - "фото как спичка горит, а кругом тают кубики льда\n", - "сделай мне иллюстрацию к маленькому принцу где он с розой разговаривает\n", - "Сделаешь картинку площади трех вокзалов в Москве?\n", - "хочу картинку с аниме девочкой\n", - "покажи мне портрет Иосифа Сталина\n", - "\n", - "Твои запросы:\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip3 install openai python-dotenv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dotenv import load_dotenv\n", - "import openai\n", - "import time\n", - "import numpy as np\n", - "import os\n", - "path_to_env = os.path.join('..', '.env')\n", - "load_dotenv()\n", - "\n", - "\n", - "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n", - "\n", 
- "class QuestionGenerator:\n", - " def __init__(self, query: str, max_queries: int = 3):\n", - " self.query = query\n", - " self.max_queries = max_queries\n", - " \n", - " def send_query(self):\n", - " response = None\n", - " for _ in range(self.max_queries):\n", - " try:\n", - " response = openai.Completion.create(\n", - " model=\"text-babbage-001\",\n", - " prompt=self.query,\n", - " temperature=0.7,\n", - " max_tokens=100,\n", - " top_p=0.6,\n", - " frequency_penalty=0.5,\n", - " presence_penalty=0.0\n", - " )\n", - " # random sleep seconds \n", - " time.sleep(np.random.randint(1, 5))\n", - " break\n", - " except Exception as e:\n", - " print('Error', e)\n", - " \n", - " return response\n", - " \n", - " def parse_response(self, response):\n", - " if response is None:\n", - " return []\n", - " return response['choices'][0]['text'].strip().lower().split(', ')\n", - " \n", - " def __call__(self):\n", - " response = self.send_query()\n", - " samples = self.get_topics(response)\n", - " return samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "qg = QuestionGenerator(PROMPT)\n", - "qg()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}