-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
43e4365
commit ba04635
Showing
1 changed file
with
280 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,280 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"The autoreload extension is already loaded. To reload it, use:\n", | ||
" %reload_ext autoreload\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%load_ext autoreload\n", | ||
"%autoreload 2" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"/Users/user010/Desktop/Programming/ML/En2RuTranslator\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import os\n", | ||
"import sys\n", | ||
"\n", | ||
"root_dir = os.path.abspath(os.path.join(os.getcwd(), '../..'))\n", | ||
"print(root_dir)\n", | ||
"assert os.path.exists(root_dir), f'Could not find root directory at {root_dir}'\n", | ||
"sys.path.insert(0, root_dir)\n", | ||
"\n", | ||
"from custom_utils.config_handler import read_config, pprint_config" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"{\n", | ||
" \"root\": \"/Users/user010/Desktop/Programming/ML/En2RuTranslator\",\n", | ||
" \"params\": {\n", | ||
" \"batch_size\": 16,\n", | ||
" \"max_length\": 512,\n", | ||
" \"train_args\": {\n", | ||
" \"evaluation_strategy\": \"epoch\",\n", | ||
" \"learning_rate\": 2e-05,\n", | ||
" \"per_device_train_batch_size\": 16,\n", | ||
" \"per_device_eval_batch_size\": 16,\n", | ||
" \"weight_decay\": 0.01,\n", | ||
" \"save_total_limit\": 3,\n", | ||
" \"num_train_epochs\": 4,\n", | ||
" \"predict_with_generate\": true\n", | ||
" },\n", | ||
" \"wandb_args\": {\n", | ||
" \"report_to\": \"wandb\",\n", | ||
" \"run_name\": \"finetune\"\n", | ||
" }\n", | ||
" },\n", | ||
" \"dest_synonym_searcher\": {\n", | ||
" \"path\": \"/Users/user010/Desktop/Programming/ML/En2RuTranslator/models/embs/cc.ru.100.bin\",\n", | ||
" \"type\": \"fasttext\"\n", | ||
" },\n", | ||
" \"src_synonym_searcher\": {\n", | ||
" \"path\": \"/Users/user010/Desktop/Programming/ML/En2RuTranslator/models/embs/cc.en.100.bin\",\n", | ||
" \"type\": \"fasttext\"\n", | ||
" },\n", | ||
" \"translator\": {\n", | ||
" \"name\": \"opus-distilled-en-ru\",\n", | ||
" \"model_and_tokenizer_name\": \"under-tree/transformer-en-ru\",\n", | ||
" \"output_dir\": \"/Users/user010/Desktop/Programming/ML/En2RuTranslator/models/opus-distilled-en-ru/finetuned\",\n", | ||
" \"type\": \"seq2seq\"\n", | ||
" },\n", | ||
" \"attention_extractor\": {\n", | ||
" \"type\": \"random\"\n", | ||
" }\n", | ||
"}\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"overrides = [\"setup=prod\"]\n", | ||
"cfg = read_config(overrides=overrides)\n", | ||
"pprint_config(cfg)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/Users/user010/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/transformers/models/marian/tokenization_marian.py:197: UserWarning: Recommended: pip install sacremoses.\n", | ||
" warnings.warn(\"Recommended: pip install sacremoses.\")\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"CPU times: user 4.22 s, sys: 2.59 s, total: 6.81 s\n", | ||
"Wall time: 8.32 s\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%%time\n", | ||
"from translator_app import TranslatorApp\n", | ||
"\n", | ||
"app = TranslatorApp(cfg)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/Users/user010/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:3848: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n", | ||
" warnings.warn(\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"CPU times: user 9.32 s, sys: 1.47 s, total: 10.8 s\n", | ||
"Wall time: 11.3 s\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%%time\n", | ||
"text = \"Oh, dear, I'm afraid I've said too much. I'm a bad, bad blogger.\"\n", | ||
"app_output = app(text)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### App Output\n", | ||
"\n", | ||
"* input words\n", | ||
"* input tokens\n", | ||
"* src synonyms\n", | ||
"* output words\n", | ||
"* output tokens\n", | ||
"* dest synonyms\n", | ||
"* translation\n", | ||
"* attention*\n", | ||
"\n", | ||
"```python\n", | ||
"{\n", | ||
" 'translation': str,\n", | ||
" 'input_tokens': list[str],\n", | ||
" 'input_words': list[str],\n", | ||
" 'output_tokens': list[str],\n", | ||
" 'output_words': list[str],\n", | ||
" 'src_synonyms': list[list[str]],\n", | ||
" 'dest_synonyms': list[list[str]],\n", | ||
" 'attention': list[list[float]]\n", | ||
"}\n", | ||
"```" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 81, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['oh', ',', 'dear', ',', 'i', \"'m\", 'afraid', 'i', \"'ve\", 'said', 'too', 'much', '.', 'i', \"'m\", 'a', 'bad', ',', 'bad', 'blogger', '.']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(app_output['input_words'])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['▁Oh', ',', '▁dear', ',', '▁I', \"'\", 'm', '▁afraid', '▁I', \"'\", 've', '▁said', '▁too', '▁much', '.', '▁I', \"'\", 'm', '▁a', '▁bad', ',', '▁bad', '▁blogger', '.']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(app_output['input_tokens'])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['dearest', 'dears', 'dear.And', 'darling', 'dearies']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"dear_idx = app_output['input_words'].index('dear')\n", | ||
"print(app_output['src_synonyms'][dear_idx])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"О, дорогая, боюсь, я слишком много сказал. Я плохой, плохой блоггер.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(app_output['translation'])" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "venv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |