diff --git a/src/prod/example.ipynb b/src/prod/example.ipynb new file mode 100644 index 0000000..9323b07 --- /dev/null +++ b/src/prod/example.ipynb @@ -0,0 +1,280 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/user010/Desktop/Programming/ML/En2RuTranslator\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "\n", + "root_dir = os.path.abspath(os.path.join(os.getcwd(), '../..'))\n", + "print(root_dir)\n", + "assert os.path.exists(root_dir), f'Could not find root directory at {root_dir}'\n", + "sys.path.insert(0, root_dir)\n", + "\n", + "from custom_utils.config_handler import read_config, pprint_config" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"root\": \"/Users/user010/Desktop/Programming/ML/En2RuTranslator\",\n", + " \"params\": {\n", + " \"batch_size\": 16,\n", + " \"max_length\": 512,\n", + " \"train_args\": {\n", + " \"evaluation_strategy\": \"epoch\",\n", + " \"learning_rate\": 2e-05,\n", + " \"per_device_train_batch_size\": 16,\n", + " \"per_device_eval_batch_size\": 16,\n", + " \"weight_decay\": 0.01,\n", + " \"save_total_limit\": 3,\n", + " \"num_train_epochs\": 4,\n", + " \"predict_with_generate\": true\n", + " },\n", + " \"wandb_args\": {\n", + " \"report_to\": \"wandb\",\n", + " \"run_name\": \"finetune\"\n", + " }\n", + " },\n", + " \"dest_synonym_searcher\": {\n", + " \"path\": 
\"/Users/user010/Desktop/Programming/ML/En2RuTranslator/models/embs/cc.ru.100.bin\",\n", + " \"type\": \"fasttext\"\n", + " },\n", + " \"src_synonym_searcher\": {\n", + " \"path\": \"/Users/user010/Desktop/Programming/ML/En2RuTranslator/models/embs/cc.en.100.bin\",\n", + " \"type\": \"fasttext\"\n", + " },\n", + " \"translator\": {\n", + " \"name\": \"opus-distilled-en-ru\",\n", + " \"model_and_tokenizer_name\": \"under-tree/transformer-en-ru\",\n", + " \"output_dir\": \"/Users/user010/Desktop/Programming/ML/En2RuTranslator/models/opus-distilled-en-ru/finetuned\",\n", + " \"type\": \"seq2seq\"\n", + " },\n", + " \"attention_extractor\": {\n", + " \"type\": \"random\"\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "overrides = [\"setup=prod\"]\n", + "cfg = read_config(overrides=overrides)\n", + "pprint_config(cfg)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/user010/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/transformers/models/marian/tokenization_marian.py:197: UserWarning: Recommended: pip install sacremoses.\n", + " warnings.warn(\"Recommended: pip install sacremoses.\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4.22 s, sys: 2.59 s, total: 6.81 s\n", + "Wall time: 8.32 s\n" + ] + } + ], + "source": [ + "%%time\n", + "from translator_app import TranslatorApp\n", + "\n", + "app = TranslatorApp(cfg)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/user010/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:3848: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. 
You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 9.32 s, sys: 1.47 s, total: 10.8 s\n", + "Wall time: 11.3 s\n" + ] + } + ], + "source": [ + "%%time\n", + "text = \"Oh, dear, I'm afraid I've said too much. I'm a bad, bad blogger.\"\n", + "app_output = app(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### App Output\n", + "\n", + "* input words\n", + "* input tokens\n", + "* src synonyms\n", + "* output words\n", + "* output tokens\n", + "* dest synonyms\n", + "* translation \n", + "* attention*\n", + "\n", + "```python\n", + "{\n", + " 'translation': str,\n", + " 'input_tokens': list[str],\n", + " 'input_words': list[str],\n", + " 'output_tokens': list[str],\n", + " 'output_words': list[str],\n", + " 'src_synonyms': list[list[str]],\n", + " 'dest_synonyms': list[list[str]],\n", + " 'attention': list[list[float]]\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['oh', ',', 'dear', ',', 'i', \"'m\", 'afraid', 'i', \"'ve\", 'said', 'too', 'much', '.', 'i', \"'m\", 'a', 'bad', ',', 'bad', 'blogger', '.']\n" + ] + } + ], + "source": [ + "print(app_output['input_words'])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['▁Oh', ',', '▁dear', ',', '▁I', \"'\", 'm', '▁afraid', '▁I', \"'\", 've', '▁said', '▁too', '▁much', '.', '▁I', \"'\", 'm', '▁a', '▁bad', ',', '▁bad', '▁blogger', '.']\n" + ] + } + ], + "source": [ + "print(app_output['input_tokens'])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['dearest', 'dears', 'dear.And', 'darling', 'dearies']\n" + ] + } + ], + "source": [ + "dear_idx = app_output['input_words'].index('dear')\n", + "print(app_output['src_synonyms'][dear_idx])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "О, дорогая, боюсь, я слишком много сказал. Я плохой, плохой блоггер.\n" + ] + } + ], + "source": [ + "print(app_output['translation'])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}